Code cleanup.

commit afa93b498b
parent 389ae838b8

7 changed files with 39 additions and 79 deletions
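
The cleanup is the same across the six scraper scripts: each previously carried its own copy of save_result() and an inline json.load() in print_results(). Both move into scriptutil.py as shared save_result(res, json_file) and load_result(json_file) helpers, the now-unneeded import json lines are dropped from the scrapers, and the duplicate-name message is downgraded from WARN to INFO.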
scripts/creators.py

@@ -7,10 +7,9 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -37,30 +36,22 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = url


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
     handle_url('http://www.creators.com/comics/cat-seeall.html', res)
-    save_result(res)
+    save_result(res, json_file)


 def print_results(args):
     """Print comics."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, url in sorted(comics.items()):
+    for name, url in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         print("add(%r, %r)" % (str(name), str(url)))

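Every scraper script now follows the same shape as creators.py above: get_results() fills a dict and hands it to the shared save_result(), and print_results() reads it back through load_result(). A minimal sketch of that flow; the stubbed entry stands in for the real handle_url() scraping, and exclude_comics is left empty:

from __future__ import print_function
import sys

sys.path.append("..")  # so scriptutil is importable, as in the real scripts
from scriptutil import save_result, load_result

json_file = __file__.replace(".py", ".json")
exclude_comics = []  # per-script list of names to skip


def get_results():
    """Collect comics and persist them to the JSON file."""
    res = {}
    # the real script calls handle_url() here; stub one entry instead
    res["ExampleComic"] = "http://www.creators.com/comics/example.html"
    save_result(res, json_file)


def print_results(args):
    """Print an add() line for every collected comic."""
    for name, url in sorted(load_result(json_file).items()):
        if name in exclude_comics:
            continue
        print("add(%r, %r)" % (str(name), str(url)))


if __name__ == "__main__":
    get_results()
    print_results([])
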
scripts/drunkduck.py

@@ -7,10 +7,9 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -167,7 +166,7 @@ def handle_url(url, url_matcher, num_matcher, res):
         name = capfirst(asciify(path))
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         if name in exclude_comics:
             continue

@@ -181,12 +180,6 @@ def handle_url(url, url_matcher, num_matcher, res):
         res[name] = (path, num)


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="

@@ -200,14 +193,12 @@ def get_results():
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
         handle_url(base % i, href, num, res)
-    save_result(res)
+    save_result(res, json_file)


 def print_results(min_strips):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         path, num = entry

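One subtlety now that persistence goes through JSON: drunkduck stores tuple values (res[name] = (path, num)), but JSON has no tuple type, so save_result() serializes them as arrays and load_result() hands back lists. The later path, num = entry unpacking works on either, as this quick check with hypothetical values shows:

import json

# tuples are serialized as JSON arrays and come back as lists
data = {"SomeComic": ("somecomic", 42)}  # hypothetical (path, num) entry
loaded = json.loads(json.dumps(data, sort_keys=True))
path, num = loaded["SomeComic"]  # sequence unpacking works for lists too
assert (path, num) == ("somecomic", 42)
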
scripts/gocomics.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -59,17 +58,11 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = shortname


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}

@@ -77,7 +70,7 @@ def get_results():
     handle_url('http://www.gocomics.com/features', res)
     handle_url('http://www.gocomics.com/explore/editorial_list', res)
     handle_url('http://www.gocomics.com/explore/sherpa_list', res)
-    save_result(res)
+    save_result(res, json_file)


 def has_creators_comic(name):

@@ -91,9 +84,7 @@ def has_creators_comic(name):

 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, shortname in sorted(comics.items()):
+    for name, shortname in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         if has_creators_comic(name):

scripts/comicgenesis.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -379,7 +378,7 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         # find out how many images this comic has
         end = match.end()

@@ -391,12 +390,6 @@ def handle_url(url, res):
         res[name] = (url_overrides.get(name, url), num)


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}

@@ -404,7 +397,7 @@ def get_results():
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
         handle_url(base % c, res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -420,9 +413,7 @@ def has_comic(name):
 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
     min_comics = int(args[0])
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         url, num = entry

scripts/scriptutil.py

@@ -1,8 +1,9 @@
 # Copyright (C) 2012 Bastian Kleineidam
 import re
+import json

 def contains_case_insensitive(adict, akey):
     """Check if key is in adict. The search is case insensitive."""
     for key in adict:
         if key.lower() == akey.lower():
             return True

@@ -11,6 +12,7 @@ def contains_case_insensitive(adict, akey):

 _tagre = re.compile(r"<.+?>")
 def remove_html_tags(text):
     """Remove all HTML tags from text."""
     return _tagre.sub("", text)
+

@@ -23,6 +25,18 @@ def capfirst(text):

 _ws = re.compile(r"\s+")
 def compact_whitespace(text):
     """Compact all subsequent whitespace to a single space."""
     if not text:
         return text
     return _ws.sub(" ", text)
+
+
+def save_result(res, json_file):
+    """Save result to file."""
+    with open(json_file, 'wb') as f:
+        json.dump(res, f, sort_keys=True)
+
+
+def load_result(json_file):
+    with open(json_file, "rb") as f:
+        return json.load(f)

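The two helpers added above are the whole shared persistence layer. Repeated standalone here with a round trip, so the behavior can be checked without the rest of the repo; the binary 'wb'/'rb' modes assume Python 2, which is what these scripts target (note the urlparse import and __future__ print_function elsewhere in the diff):

import json


def save_result(res, json_file):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def load_result(json_file):
    with open(json_file, "rb") as f:
        return json.load(f)


# round trip, mirroring what get_results()/print_results() now do
res = {"FooComic": "http://example.com/foo"}  # hypothetical entry
save_result(res, "demo.json")
assert load_result("demo.json") == res
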
scripts/smackjeeves.py

@@ -8,11 +8,10 @@ import re
 import sys
 import os
 import urlparse
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace
+from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -231,7 +230,7 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         # find out how many images this comic has
         end = match.end()

@@ -270,12 +269,6 @@ def handle_url(url, res):
 ]


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"

@@ -287,7 +280,7 @@ def get_results():
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
         handle_url(base % (i*12), res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -302,9 +295,7 @@ def has_comic(name):
 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
     min_comics = int(args[0])
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         url, num, desc, adult, bounce = entry

scripts/universal.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -55,23 +54,17 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = shortname


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
     handle_url('http://www.universaluclick.com/comics/list', res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -86,9 +79,7 @@ def has_comic(name):

 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, shortname in sorted(comics.items()):
+    for name, shortname in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         if has_comic(name):

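A closing design note: because the new helpers open their files in binary mode, they are Python 2 specific; under Python 3, json.dump() writes str and wants a text-mode handle. A hypothetical Python 3 port of the same helpers would differ only in the open() modes:

import json


def save_result(res, json_file):
    """Save result to file (text mode, as Python 3's json expects)."""
    with open(json_file, 'w') as f:
        json.dump(res, f, sort_keys=True)


def load_result(json_file):
    with open(json_file) as f:
        return json.load(f)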