Refresh GoComics list from online directory.

2016-04-12 00:36:33 +02:00 · 2016-04-12 00:36:33 +02:00 · 443ab119e9
commit 443ab119e9
parent 0e385a3697
2 changed files with 886 additions and 451 deletions
--- a/dosagelib/plugins/gocomics.py
+++ b/dosagelib/plugins/gocomics.py
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@ -1,36 +1,37 @@
 #!/usr/bin/env python
-# Copyright (C) 2012-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2013-2014 Bastian Kleineidam
+# Copyright (C) 2016 Tobias Gruetzmacher
 """
 Script to get a list of gocomics and save the info in a JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
-import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import tagre, getPageContent, asciify, unescape
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from lxml import html
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page
+from scriptutil import contains_case_insensitive, format_name, save_result, load_result, truncate_name

 json_file = __file__.replace(".py", ".json")

-#<a href="/shortname" class="alpha_list updated">name</a>
-url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
-
 # names of comics to exclude
 exclude_comics = [
-        "Angryprogrammer", # unavailable
-        "Complex", # "coming soon"
-        "Guinness", # "coming soon"
-        "Jabberwoncky", # "coming soon"
-        "KickyBrand", # unavailable
-        "Penmanship", # unavailable
-        "RandysRationale", # "coming soon"
-        "SaturdayMorningBreakfastCereal", # duplicate
-        "SignsOfOurTimes", # "coming soon"
-        "TheGagwriter", # "coming soon"
-        "Yaoyao", # "coming soon"
+        # "coming soon"
+        "Angryprogrammer",
+        "Guinness",
+        "Jabberwoncky",
+        "RandysRationale",
+        "SignsOfOurTimes",
+        "TheGagwriter",
+        "Yaoyao",
+
+        # duplicate
+        "SaturdayMorningBreakfastCereal",
 ]


@ -38,27 +39,24 @@ def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = html.document_fromstring(get_page(url, session).text)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
-    for match in url_matcher.finditer(data):
-        shortname = match.group(1)
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
-        if name in exclude_comics:
-            continue
+
+    for comiclink in data.cssselect('a.alpha_list'):
+        link = comiclink.attrib['href']
+        name = format_name(comiclink.text)
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
-        res[name] = shortname
+        res[name] = link


 def get_results():
    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
+    # store info in a dictionary {name -> uri}
    res = {}
    session = requests.Session()
    handle_url('http://www.gocomics.com/features', session, res)
@ -68,17 +66,22 @@ def get_results():
    save_result(res, json_file)


+def first_lower(x):
+    return x[0].lower()
+
+
 def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    min_comics, filename = args
    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, shortname in sorted(load_result(json_file).items()):
+        data = load_result(json_file)
+        for name, uri in sorted(data.items(), key=first_lower):
            if name in exclude_comics:
                print("Excluded " + name)
                continue
-            fp.write(u"add(%r, %r)\n" % (
-              str(truncate_name(name)), str(shortname))
-            )
+            fp.write(u"\n\nclass GC%s(_GoComics%s):\n    path = %r\n" % (
+                truncate_name(name), 'Es' if 'espanol/' in uri else '',
+                uri[1:]))


 if __name__ == '__main__':