Clean up update helper scripts.

2016-04-13 00:52:16 +02:00 · 2016-04-13 00:52:16 +02:00 · 9028724a74
commit 9028724a74
parent 42e43fa4e6
17 changed files with 183 additions and 85 deletions
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -444,11 +444,6 @@ def rfc822date(indate):
    return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))


-def asciify(name):
-    """Remove non-ascii characters from string."""
-    return re.sub("[^0-9a-zA-Z_]", "", name)
-
-
 def unquote(text):
    """Replace all percent-encoded entities in text."""
    while '%' in text:
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -1,18 +1,25 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get arcamax comics and save the info in a JSON file for further
+processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
+
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name

 json_file = __file__.replace(".py", ".json")

@@ -28,15 +35,13 @@ def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
--- a/scripts/comicfury.py
+++ b/scripts/comicfury.py
@@ -1,19 +1,23 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get ComicFury comics and save the info in a JSON file for further
 processing.
 """
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import sys
 import os
+
 import requests
 from lxml import html

 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)
@@ -120,7 +124,7 @@ def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
--- a/scripts/comicgenesis.py
+++ b/scripts/comicgenesis.py
@@ -1,24 +1,36 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of ComicGenesis comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
+
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)

 json_file = __file__.replace(".py", ".json")

-# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
-url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
+# <div class="comictitle"><strong><a target="_blank"
+# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
+# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
+# Pros</a>
+url_matcher = re.compile(r'<div class="comictitle"><strong>' +
+                         tagre("a", "href", r'(http://[^"]+)') +
+                         r'([^<]+)</a>')
 num_matcher = re.compile(r'Number of Days: (\d+)')

 # names of comics to exclude
@@ -368,19 +380,18 @@ url_overrides = {
    "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }

+
 def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
--- a/scripts/create-cbz.py
+++ b/scripts/create-cbz.py
@@ -1,15 +1,20 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Creates a CBZ file in the comic directory.
 Uses an ordered symlink directory (see order-symlinks.py) if it exists,
 else the plain files are used.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import sys
 import os
 import zipfile
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.configuration import App


@@ -21,6 +26,7 @@ ImageExts = (
    ".png",
 )

+
 def is_image(filename):
    """Determine if given filename is an image."""
    # note: isfile() also accepts symlinks
--- a/scripts/drunkduck.py
+++ b/scripts/drunkduck.py
@@ -1,17 +1,25 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
+Script to get a list of drunkduck comics and save the info in a JSON file for
+further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
+
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import tagre, get_page, unquote, unescape
+from scriptutil import (contains_case_insensitive, capfirst, save_result,
+                        load_result, truncate_name, asciify)

 json_file = __file__.replace(".py", ".json")

@@ -169,7 +177,7 @@ exclude_comics = [
 def handle_url(url, session, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
--- a/scripts/generate_json.sh
+++ b/scripts/generate_json.sh
@@ -6,7 +6,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
  list="$*"
 else
-  list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+  list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
  echo "Executing ${script}.py"
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -1,15 +1,18 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of gocomics and save the info in a JSON file for further processing.
+Script to get a list of gocomics and save the info in a JSON file for further
+processing.
 """
 from __future__ import absolute_import, division, print_function

 import codecs
 import sys
 import os
+
 import requests
 from lxml import html

--- a/scripts/hook-dosagelib.py
+++ b/scripts/hook-dosagelib.py
@@ -1,2 +1,8 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
 from PyInstaller.utils.hooks import collect_submodules
 hiddenimports = collect_submodules('dosagelib.plugins')
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -1,28 +1,38 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of KeenSpot comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
+
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
+

 json_file = __file__.replace(".py", ".json")

+
 url_matcher = re.compile(
  tagre("td", "onmouseover", r'([^"]+)') +
  tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
  r"(?:<b>)?([^<]+)(?:</b>)?</a>"
 )

+
 # names of comics to exclude
 exclude_comics = [
    "BrawlintheFamily", # non-standard navigation
@@ -47,23 +57,23 @@ exclude_comics = [
    "YouDamnKid", # non-standard navigation
 ]

+
 # links to last valid strips
 url_overrides = {
 }

+
 def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(2)
-        name = unescape(match.group(3))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(3))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
--- a/scripts/mklanguages.py
+++ b/scripts/mklanguages.py
@@ -1,5 +1,11 @@
 #!/usr/bin/python
-# update languages.py from pycountry
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+'''update languages.py from pycountry'''
+from __future__ import absolute_import, division, print_function
+
 import os
 import sys
 import codecs
@@ -7,7 +13,8 @@ import codecs
 basepath = os.path.dirname(os.path.dirname(__file__))
 sys.path.append(basepath)

-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scraperclasses  # noqa
+

 def main():
    """Update language information in dosagelib/languages.py."""
@@ -29,6 +36,7 @@ def get_used_languages():
            lang[l] = scraperclass.language()
    return lang

+
 def write_languages(f, l):
    """Write language information."""
    f.write("Languages = {%s" % os.linesep)
--- a/scripts/order-symlinks.py
+++ b/scripts/order-symlinks.py
@@ -1,25 +1,32 @@
 #!/usr/bin/env python
-# Copyright (C) 2013 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 This script takes the JSON file created by 'dosage -o json' and uses the
 metadata to build a symlink farm in the deduced order of the comic. It created
 those in a subdirectory called 'inorder'.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import sys
 import os
 import codecs
 import json

+
 def jsonFn(d):
    """Get JSON filename."""
    return os.path.join(d, 'dosage.json')

+
 def loadJson(d):
    """Return JSON data."""
    with codecs.open(jsonFn(d), 'r', 'utf-8') as f:
        return json.load(f)

+
 def prepare_output(d):
    """Clean pre-existing links in output directory."""
    outDir = os.path.join(d, 'inorder')
@@ -31,6 +38,7 @@ def prepare_output(d):
            os.remove(f)
    return outDir

+
 def create_symlinks(d):
    """Create new symbolic links in output directory."""
    data = loadJson(d)
@@ -68,4 +76,3 @@ if __name__ == '__main__':
                print("No JSON file found in '%s'." % (d))
    else:
        print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))
-
--- a/scripts/removeafter.py
+++ b/scripts/removeafter.py
@@ -1,11 +1,15 @@
 #!/usr/bin/env python
-# Copyright (C) 2012-2013 Bastian Kleineidam
-"""Remove all lines after a given marker line.
-"""
-from __future__ import print_function
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""Remove all lines after a given marker line."""
+from __future__ import absolute_import, division, print_function
+
 import fileinput
 import sys

+
 def main(args):
    """Remove lines after marker."""
    filename = args[0]
@@ -15,5 +19,6 @@ def main(args):
        if line.startswith(marker):
            break

+
 if __name__ == '__main__':
    main(sys.argv[1:])
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher

 from __future__ import absolute_import, division, print_function

+import re
 import json
 import codecs

-from dosagelib.util import unescape, asciify
+from dosagelib.util import unescape


 def contains_case_insensitive(adict, akey):
@@ -42,6 +44,11 @@ def truncate_name(text):
    return text[:50]


+def asciify(name):
+    """Remove non-ascii characters from string."""
+    return re.sub("[^0-9a-zA-Z_]", "", name)
+
+
 def format_name(text):
    """Format a comic name."""
    name = unescape(text)
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -1,22 +1,34 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
+Script to get a list of smackjeeves.com comics and save the info in a JSON file
+for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
-import urlparse
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, tagre
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page, tagre
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name

+
 json_file = __file__.replace(".py", ".json")

+
 # names of comics to exclude
 exclude_comics = [
    "4plyKamalsHead", # does not follow standard layout
@@ -98,6 +110,7 @@ exclude_comics = [
    "WinterMelody", # missing images
 ]

+
 # the latest URL of some comics repeats the previous URL
 # flag this so the bounceStart uses the correct URL
 repeat_comics = [
@@ -236,28 +249,32 @@ repeat_comics = [
    "Zodiac",
 ]

+
 # links to last valid strips
 url_overrides = {
 }

+
 # HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
+page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
+                                after="site_banner") +
                          tagre("img", "title", r'([^"]+)'))
 url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
 num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))

+
 def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
-        page_url = urlparse.urljoin(url, page_url)
+        page_url = urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
@@ -269,13 +286,14 @@ def handle_url(url, session, res):
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
-            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR matching number:", repr(data[end:end + 300]),
+                  file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
-            data2 = getPageContent(page_url, session)
+            data2 = get_page(page_url, session).text
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
--- a/scripts/update_plugins.sh
+++ b/scripts/update_plugins.sh
@@ -9,7 +9,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
  list="$*"
 else
-  list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+  list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
  target="${d}/../dosagelib/plugins/${script}.py"
--- a/scripts/webcomicfactory.py
+++ b/scripts/webcomicfactory.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get WebComicFactory comics and save the info in a JSON file for
@@ -12,16 +15,17 @@ import os
 import requests
 from lxml import html

-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page
 from scriptutil import (save_result, load_result, truncate_name, format_name)

 json_file = __file__.replace(".py", ".json")


 def find_first(session, url):
+    print("Parsing", url, file=sys.stderr)
    try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
@@ -38,8 +42,9 @@ def get_results():
    res = {}
    url = 'http://www.thewebcomicfactory.com/'
    session = requests.Session()
+    print("Parsing", url, file=sys.stderr)
    try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)