Adventures of the College
+# Pros
+url_matcher = re.compile(r'' +
+ tagre("a", "href", r'(http://[^"]+)') +
+ r'([^<]+)')
num_matcher = re.compile(r'Number of Days: (\d+)')
# names of comics to exclude
@@ -368,19 +380,18 @@ url_overrides = {
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
}
+
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
- data = getPageContent(url, session)
+ data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
url = match.group(1) + '/'
- name = unescape(match.group(2))
- name = asciify(name.replace('&', 'And').replace('@', 'At'))
- name = capfirst(name)
+ name = format_name(match.group(2))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
@@ -391,13 +402,13 @@ def handle_url(url, session, res):
end = match.end()
mo = num_matcher.search(data[end:])
if not mo:
- print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+ print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
continue
num = int(mo.group(1))
url = url_overrides.get(name, url)
try:
if "/d/" not in url:
- check_robotstxt(url+"d/", session)
+ check_robotstxt(url + "d/", session)
else:
check_robotstxt(url, session)
except IOError:
diff --git a/scripts/create-cbz.py b/scripts/create-cbz.py
index 0700f0488..fe3b53366 100755
--- a/scripts/create-cbz.py
+++ b/scripts/create-cbz.py
@@ -1,15 +1,20 @@
#!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Creates a CBZ file in the comic directory.
Uses an ordered symlink directory (see order-symlinks.py) if it exists,
else the plain files are used.
"""
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
import sys
import os
import zipfile
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.configuration import App
@@ -21,6 +26,7 @@ ImageExts = (
".png",
)
+
def is_image(filename):
"""Determine if given filename is an image."""
# note: isfile() also accepts symlinks
diff --git a/scripts/drunkduck.py b/scripts/drunkduck.py
index 147520d1f..b45b6f875 100755
--- a/scripts/drunkduck.py
+++ b/scripts/drunkduck.py
@@ -1,17 +1,25 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
-Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
+Script to get a list of drunkduck comics and save the info in a JSON file for
+further processing.
"""
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
import codecs
import re
import sys
import os
+
import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
+from dosagelib.util import tagre, get_page, unquote, unescape
+from scriptutil import (contains_case_insensitive, capfirst, save_result,
+ load_result, truncate_name, asciify)
json_file = __file__.replace(".py", ".json")
@@ -169,7 +177,7 @@ exclude_comics = [
def handle_url(url, session, url_matcher, num_matcher, res):
"""Parse one search result page."""
try:
- data = getPageContent(url, session)
+ data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
@@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res):
end = match.end(1)
mo = num_matcher.search(data[end:])
if not mo:
- print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+ print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
continue
num = int(mo.group(1))
res[name] = (path, num)
diff --git a/scripts/generate_json.sh b/scripts/generate_json.sh
index 5819e881c..c01833b26 100755
--- a/scripts/generate_json.sh
+++ b/scripts/generate_json.sh
@@ -6,7 +6,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
- list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+ list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
fi
for script in $list; do
echo "Executing ${script}.py"
diff --git a/scripts/gocomics.py b/scripts/gocomics.py
index ded106c77..5098e85d9 100755
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -1,15 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
-Script to get a list of gocomics and save the info in a JSON file for further processing.
+Script to get a list of gocomics and save the info in a JSON file for further
+processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
+
import requests
from lxml import html
diff --git a/scripts/hook-dosagelib.py b/scripts/hook-dosagelib.py
index 4969e2ea7..8870a276b 100644
--- a/scripts/hook-dosagelib.py
+++ b/scripts/hook-dosagelib.py
@@ -1,2 +1,8 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
from PyInstaller.utils.hooks import collect_submodules
hiddenimports = collect_submodules('dosagelib.plugins')
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 23e2f1cb4..ada3f92ca 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -1,28 +1,38 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of KeenSpot comics and save the info in a
JSON file for further processing.
"""
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
import codecs
import re
import sys
import os
+
import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+ truncate_name, format_name)
+
json_file = __file__.replace(".py", ".json")
+
url_matcher = re.compile(
tagre("td", "onmouseover", r'([^"]+)') +
tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
r"(?:)?([^<]+)(?:)?"
)
+
# names of comics to exclude
exclude_comics = [
"BrawlintheFamily", # non-standard navigation
@@ -47,23 +57,23 @@ exclude_comics = [
"YouDamnKid", # non-standard navigation
]
+
# links to last valid strips
url_overrides = {
}
+
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
- data = getPageContent(url, session)
+ data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
comicurl = match.group(2)
- name = unescape(match.group(3))
- name = asciify(name.replace('&', 'And').replace('@', 'At'))
- name = capfirst(name)
+ name = format_name(match.group(3))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
@@ -72,7 +82,7 @@ def handle_url(url, session, res):
continue
try:
if "/d/" not in comicurl:
- check_robotstxt(comicurl+"d/", session)
+ check_robotstxt(comicurl + "d/", session)
else:
check_robotstxt(comicurl, session)
except IOError:
diff --git a/scripts/mklanguages.py b/scripts/mklanguages.py
index 6e243060c..d59f83115 100755
--- a/scripts/mklanguages.py
+++ b/scripts/mklanguages.py
@@ -1,5 +1,11 @@
#!/usr/bin/python
-# update languages.py from pycountry
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+'''update languages.py from pycountry'''
+from __future__ import absolute_import, division, print_function
+
import os
import sys
import codecs
@@ -7,7 +13,8 @@ import codecs
basepath = os.path.dirname(os.path.dirname(__file__))
sys.path.append(basepath)
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scraperclasses # noqa
+
def main():
"""Update language information in dosagelib/languages.py."""
@@ -29,6 +36,7 @@ def get_used_languages():
lang[l] = scraperclass.language()
return lang
+
def write_languages(f, l):
"""Write language information."""
f.write("Languages = {%s" % os.linesep)
diff --git a/scripts/order-symlinks.py b/scripts/order-symlinks.py
index 413bfce8b..250c1cbdb 100755
--- a/scripts/order-symlinks.py
+++ b/scripts/order-symlinks.py
@@ -1,25 +1,32 @@
#!/usr/bin/env python
-# Copyright (C) 2013 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
This script takes the JSON file created by 'dosage -o json' and uses the
metadata to build a symlink farm in the deduced order of the comic. It created
those in a subdirectory called 'inorder'.
"""
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
import sys
import os
import codecs
import json
+
def jsonFn(d):
"""Get JSON filename."""
return os.path.join(d, 'dosage.json')
+
def loadJson(d):
"""Return JSON data."""
with codecs.open(jsonFn(d), 'r', 'utf-8') as f:
return json.load(f)
+
def prepare_output(d):
"""Clean pre-existing links in output directory."""
outDir = os.path.join(d, 'inorder')
@@ -31,6 +38,7 @@ def prepare_output(d):
os.remove(f)
return outDir
+
def create_symlinks(d):
"""Create new symbolic links in output directory."""
data = loadJson(d)
@@ -68,4 +76,3 @@ if __name__ == '__main__':
print("No JSON file found in '%s'." % (d))
else:
print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))
-
diff --git a/scripts/removeafter.py b/scripts/removeafter.py
index 190124696..010ef3cef 100755
--- a/scripts/removeafter.py
+++ b/scripts/removeafter.py
@@ -1,11 +1,15 @@
#!/usr/bin/env python
-# Copyright (C) 2012-2013 Bastian Kleineidam
-"""Remove all lines after a given marker line.
-"""
-from __future__ import print_function
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""Remove all lines after a given marker line."""
+from __future__ import absolute_import, division, print_function
+
import fileinput
import sys
+
def main(args):
"""Remove lines after marker."""
filename = args[0]
@@ -15,5 +19,6 @@ def main(args):
if line.startswith(marker):
break
+
if __name__ == '__main__':
main(sys.argv[1:])
diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py
index 63b872103..ca72d9ce9 100644
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -1,13 +1,15 @@
# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
+import re
import json
import codecs
-from dosagelib.util import unescape, asciify
+from dosagelib.util import unescape
def contains_case_insensitive(adict, akey):
@@ -42,6 +44,11 @@ def truncate_name(text):
return text[:50]
+def asciify(name):
+ """Remove non-ascii characters from string."""
+ return re.sub("[^0-9a-zA-Z_]", "", name)
+
+
def format_name(text):
"""Format a comic name."""
name = unescape(text)
diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index ff43ba155..9761aeba9 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -1,22 +1,34 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
-Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
+Script to get a list of smackjeeves.com comics and save the info in a JSON file
+for further processing.
"""
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
import codecs
import re
import sys
import os
-import urlparse
+try:
+ from urllib.parse import urljoin
+except ImportError:
+ from urlparse import urljoin
+
import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, tagre
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
+from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+
json_file = __file__.replace(".py", ".json")
+
# names of comics to exclude
exclude_comics = [
"4plyKamalsHead", # does not follow standard layout
@@ -98,6 +110,7 @@ exclude_comics = [
"WinterMelody", # missing images
]
+
# the latest URL of some comics repeats the previous URL
# flag this so the bounceStart uses the correct URL
repeat_comics = [
@@ -236,28 +249,32 @@ repeat_comics = [
"Zodiac",
]
+
# links to last valid strips
url_overrides = {
}
+
# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
- tagre("img", "title", r'([^"]+)'))
+page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
+ after="site_banner") +
+ tagre("img", "title", r'([^"]+)'))
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
+
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
- data = getPageContent(url, session)
+ data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in page_matcher.finditer(data):
page_url = match.group(1)
- page_url = urlparse.urljoin(url, page_url)
+ page_url = urljoin(url, page_url)
name = format_name(match.group(2))
if name in exclude_comics:
continue
@@ -269,13 +286,14 @@ def handle_url(url, session, res):
end = match.end()
mo = num_matcher.search(data[end:])
if not mo:
- print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
+ print("ERROR matching number:", repr(data[end:end + 300]),
+ file=sys.stderr)
continue
num = int(mo.group(1))
# search for url in extra page
print("Getting", page_url)
try:
- data2 = getPageContent(page_url, session)
+ data2 = get_page(page_url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
@@ -302,8 +320,8 @@ def get_results():
result_pages = 286
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
for i in range(0, result_pages):
- print(i+1, file=sys.stderr, end=" ")
- handle_url(base % (i*12), session, res)
+ print(i + 1, file=sys.stderr, end=" ")
+ handle_url(base % (i * 12), session, res)
save_result(res, json_file)
diff --git a/scripts/update_plugins.sh b/scripts/update_plugins.sh
index 338a895f0..4e133a59b 100755
--- a/scripts/update_plugins.sh
+++ b/scripts/update_plugins.sh
@@ -9,7 +9,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
- list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+ list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
fi
for script in $list; do
target="${d}/../dosagelib/plugins/${script}.py"
diff --git a/scripts/webcomicfactory.py b/scripts/webcomicfactory.py
index 4c0069618..66310996d 100755
--- a/scripts/webcomicfactory.py
+++ b/scripts/webcomicfactory.py
@@ -1,4 +1,7 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get WebComicFactory comics and save the info in a JSON file for
@@ -12,16 +15,17 @@ import os
import requests
from lxml import html
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
-from dosagelib.util import getPageContent
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
+from dosagelib.util import get_page
from scriptutil import (save_result, load_result, truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
def find_first(session, url):
+ print("Parsing", url, file=sys.stderr)
try:
- data = html.document_fromstring(getPageContent(url, session))
+ data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
@@ -38,8 +42,9 @@ def get_results():
res = {}
url = 'http://www.thewebcomicfactory.com/'
session = requests.Session()
+ print("Parsing", url, file=sys.stderr)
try:
- data = html.document_fromstring(getPageContent(url, session))
+ data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)