Move WebcomicFactory into its own module.

Also, add an updater script for it.
2016-04-03 21:31:56 +02:00 · 2016-04-03 21:31:56 +02:00 · 2c6decb7f5
commit 2c6decb7f5
parent bb1f20d867
4 changed files with 228 additions and 38 deletions
--- a/dosagelib/plugins/webcomicfactory.py
+++ b/dosagelib/plugins/webcomicfactory.py
@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
+from .common import _WordPressScraper
+
+
+class _WebcomicFactory(_WordPressScraper):
+    """Shared base class for the comics hosted on thewebcomicfactory.com.
+
+    The generated subclasses in this module only set ``firstStripUrl``;
+    everything else comes from this class and ``_WordPressScraper``.
+    """
+    # XPath for <a> elements whose class list contains "comic-nav-last".
+    latestSearch = '//a[contains(concat(" ", @class, " "), " comic-nav-last ")]'
+
+    @classmethod
+    def starter(cls):
+        """Look up the starting URL from the comic's first strip page.
+
+        This is basically helpers.indirectStarter, but dynamically selecting
+        the right parameters: the page at ``firstStripUrl`` is fetched and
+        searched with ``latestSearch``.
+
+        NOTE(review): the result is whatever ``fetchUrl`` returns, presumably
+        the URL of the most recent strip -- confirm against the base class.
+        """
+        data = cls.getPage(cls.firstStripUrl)
+        return cls.fetchUrl(cls.firstStripUrl, data, cls.latestSearch)
+
+
+# do not edit anything below since these entries are generated from
+# scripts/update_plugins.sh
+# DO NOT REMOVE
+
+
+class AsTheMayoTurns(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/as-the-mayo-turns/'
+
+
+class ComicBookMafia(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/comic-book-mafia/'
+
+
+class Dealers(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/dealers-1-1998-was-the-year/'
+
+
+class DigitalHobo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/digital-hobo-1-its-a-living-kinda/'
+
+
+class ECoastVsWCoast(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/east-coast-vs-west-coast-greetings-from-the-coasts/'
+
+
+class GunCulture(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/gun-culture/'
+
+
+class IHateMyKids(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/i-hate-my-kids/'
+
+
+class InARelationship(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/in-a-relationship-3/'
+
+
+class IntergalacticMedicalDoctor(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/intergalactic-medical-doctor/'
+
+
+class JSchoolgirlsInLove(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/japanese-schoolgirls-in-love-1/'
+
+
+class KingdomOfTheDwarves(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/kingdom-of-the-dwarves/'
+
+
+class LesterCrenshawIsDead(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/lester-crenshaw-is-dead/'
+
+
+class Millennials(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/millennials/'
+
+
+class MiserableComedians(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/miserable-comedians-1-funny-because-its-sad/'
+
+
+class OldeTymeGamer(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/olde-tyme-gamer-playing-injured/'
+
+
+class PinJunkies(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/pin-junkies/'
+
+
+class PostApocalypticNick(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/post-apocalyptic-nick/'
+
+
+class RealTalk(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/real-talk-people-who-cut-in-line/'
+
+
+class SoManyNightmares(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/so-many-nightmares-freedom-nightmare/'
+
+
+class SportsGuys(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/sports-guys/'
+
+
+class TalesOfPizza(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tales-of-pizza-bad-tipper/'
+
+
+class TAndA(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-webcomic-factory-premiere-t-and-a/'
+
+
+class TheAntiwarComic(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-antiwar-comic-the-party/'
+
+
+class TheGentlemensClub(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-gentlemens-club/'
+
+
+class TheHorrorOfColony6(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-horror-of-colony-6-page-1/'
+
+
+class TheKingsOfViralVideo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-kings-of-viral-video-premiere/'
+
+
+class TheSharonAndTonyExperiment(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-sharon-and-tony-experiment/'
+
+
+class TonyDestructo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tony-destructo/'
+
+
+class WeirdBikerTales(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/weird-biker-tales-the-last-outlaw/'
+
+
+class WillysSpaceDive(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/willys-space-dive/'
--- a/dosagelib/plugins/wordpress.py
+++ b/dosagelib/plugins/wordpress.py
@ -37,39 +37,3 @@ for (name, linkNumber) in [
    add(name, 'http://www.flowerlarkstudios.com/',
        starter=indirectStarter('http://www.flowerlarkstudios.com/',
                                '(//div[@id="sidebar-left"]//a)[' + str(linkNumber) + ']'))
-
-# all comics on the webcomic factory
-
-for (name, url) in [
-    ('AntiwarComic', 'the-antiwar-comic-the-party/'),
-    ('AstheMayoTurns', 'as-the-mayo-turns/'),
-    ('ComicBookMafia', 'comic-book-mafia/'),
-    ('Dealers', 'dealers-1-1998-was-the-year/'),
-    ('DigitalHobo', 'digital-hobo-1-its-a-living-kinda/'),
-    ('EastCoastVsWestCoast', 'east-coast-vs-west-coast-greetings-from-the-coasts/'),
-    ('GunCulture', 'gun-culture/'),
-    ('IHateMyKids', 'i-hate-my-kids/'),
-    ('InARelationship', 'in-a-relationship-3/'),
-    ('JapaneseSchoolgirlsinLove', 'japanese-schoolgirls-in-love-1/'),
-    ('KingdomoftheDwarves', 'kingdom-of-the-dwarves/'),
-    ('LesterCrenshawisDead', 'lester-crenshaw-is-dead/'),
-    ('Millennials', 'millennials/'),
-    ('MiserableComedians', 'miserable-comedians-1-funny-because-its-sad/'),
-    ('OldeTymeGamer', 'olde-tyme-gamer-playing-injured/'),
-    ('PinJunkies', 'pin-junkies/'),
-    ('PostApocalypticNick', 'post-apocalyptic-nick/'),
-    ('RealTalk', 'real-talk-people-who-cut-in-line/'),
-    ('SoManyNightmares', 'so-many-nightmares-freedom-nightmare/'),
-    ('SportsGuys', 'sports-guys/'),
-    ('TalesOfPizza', 'tales-of-pizza-bad-tipper/'),
-    ('TheGentlemensClub', 'the-gentlemens-club/'),
-    ('TheHorrorOfColony6', 'the-horror-of-colony-6-page-1/'),
-    ('TheKingsofViralVideo', 'the-kings-of-viral-video-premiere/'),
-    ('TheSharonandTonyExperiment', 'the-sharon-and-tony-experiment/'),
-    ('TonyDestructo', 'tony-destructo/'),
-    ('WeirdBikerTales', 'weird-biker-tales-the-last-outlaw/'),
-    ('WillysSpaceDive', 'willys-space-dive/')
-]:
-    add(name, 'http://www.thewebcomicfactory.com',
-        starter=indirectStarter('http://www.thewebcomicfactory.com/comic/' + url,
-                                "//a[contains(concat(' ', text(), ' '), ' Last ')]"))
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
 import json
 import codecs

@ -42,6 +45,6 @@ def truncate_name(text):
 def format_name(text):
    """Format a comic name."""
    name = unescape(text)
+    name = "".join(capfirst(x) for x in name.split(" "))
    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
-    name = capfirst(name)
    return name
--- a/scripts/webcomicfactory.py
+++ b/scripts/webcomicfactory.py
@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""
+Script to get WebComicFactory comics and save the info in a JSON file for
+further processing.
+"""
+from __future__ import absolute_import, division, print_function
+
+import codecs
+import sys
+import os
+import requests
+from lxml import html
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import getPageContent
+from scriptutil import (save_result, load_result, truncate_name, format_name)
+
+# Result file path: this script's own path with ".py" swapped for ".json".
+# Note that str.replace substitutes every ".py" occurrence in the path.
+json_file = __file__.replace(".py", ".json")
+
+
+def find_first(session, url):
+    """Return the link to the first strip as advertised on page ``url``.
+
+    The page is downloaded with ``getPageContent`` using ``session`` and
+    searched for ``a`` elements with the class ``comic-nav-first``. The
+    ``href`` of the first match is returned.
+
+    ``url`` itself is returned unchanged when the download raises
+    ``IOError`` (the error is printed to stderr) or when the page has no
+    such link (an INFO message is printed to stdout).
+    """
+    try:
+        data = html.document_fromstring(getPageContent(url, session))
+        # Resolve relative links against the page URL before reading hrefs.
+        data.make_links_absolute(url)
+    except IOError as msg:
+        print("ERROR:", msg, file=sys.stderr)
+        return url
+    firstlinks = data.cssselect('a.comic-nav-first')
+    if not firstlinks:
+        print("INFO No first link on »%s«, already first page?" % (url))
+        return url
+    return firstlinks[0].attrib['href']
+
+
+def get_results():
+    """Parse start page for supported comics.
+
+    Every ``div.ceo_thumbnail_widget`` on the start page is treated as one
+    comic: its first ``h2`` supplies the name (run through ``format_name``)
+    and its first ``a`` supplies the link. Links containing
+    ``comic-color-key`` are skipped. For the remaining ones ``find_first``
+    looks up the first strip, and the resulting name -> URL mapping is
+    written to ``json_file`` with ``save_result``.
+
+    Returns an empty dict when the start page cannot be fetched
+    (``IOError``); otherwise nothing is returned explicitly.
+    """
+    res = {}
+    url = 'http://www.thewebcomicfactory.com/'
+    # One session is shared by the start page request and all find_first calls.
+    session = requests.Session()
+    try:
+        data = html.document_fromstring(getPageContent(url, session))
+        data.make_links_absolute(url)
+    except IOError as msg:
+        print("ERROR:", msg, file=sys.stderr)
+        return {}
+
+    for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
+        # Both lookups take the first match and fail with IndexError if the
+        # widget has no h2 or no link.
+        comicname = comicdiv.cssselect('h2')[0]
+        comiclink = comicdiv.cssselect('a')[0]
+        comicurl = comiclink.attrib['href']
+        name = format_name(comicname.text)
+        # Widgets linking to the "comic-color-key" page are not recorded.
+        if 'comic-color-key' in comicurl:
+            continue
+        comicurl = find_first(session, comicurl)
+        res[name] = comicurl
+
+    save_result(res, json_file)
+
+
+def first_lower(x):
+    """Return the first item of ``x`` in lower case.
+
+    Used as the sort key in ``print_results``, where ``x`` is a
+    ``(name, url)`` pair, so the entries are ordered by name ignoring case.
+    """
+    return x[0].lower()
+
+
+def print_results(args):
+    """Print all comics.
+
+    Appends one ``_WebcomicFactory`` subclass definition per comic stored in
+    ``json_file`` to the given file, ordered by ``first_lower``.
+
+    ``args`` must contain exactly two items, ``min_comics`` and ``filename``;
+    ``min_comics`` is unpacked but not used by this function.
+    """
+    min_comics, filename = args
+    # Append mode: existing content of the target file is kept.
+    with codecs.open(filename, 'a', 'utf-8') as fp:
+        data = load_result(json_file)
+        for name, url in sorted(data.items(), key=first_lower):
+            # %r on str(url) writes the URL as a quoted string literal.
+            fp.write(u"\n\nclass %s(_WebcomicFactory):\n    firstStripUrl = %r\n" % (
+                     truncate_name(name), str(url)))
+
+
+# With command line arguments the saved results are written out as plugin
+# classes; without arguments the JSON file is (re)created from the website.
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        print_results(sys.argv[1:])
+    else:
+        get_results()