From 2c6decb7f56a5e17ce2c92f57e47d053a64ec647 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Sun, 3 Apr 2016 21:31:56 +0200
Subject: [PATCH] Move WebcomicFactory in its own module.

Also, add an updater script for it.
---
 dosagelib/plugins/webcomicfactory.py | 144 +++++++++++++++++++++++++++
 dosagelib/plugins/wordpress.py       |  36 -------
 scripts/scriptutil.py                |   7 +-
 scripts/webcomicfactory.py           |  90 +++++++++++++++++
 4 files changed, 239 insertions(+), 38 deletions(-)
 create mode 100644 dosagelib/plugins/webcomicfactory.py
 create mode 100755 scripts/webcomicfactory.py

diff --git a/dosagelib/plugins/webcomicfactory.py b/dosagelib/plugins/webcomicfactory.py
new file mode 100644
index 000000000..763a9480f
--- /dev/null
+++ b/dosagelib/plugins/webcomicfactory.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
+from .common import _WordPressScraper
+
+
+class _WebcomicFactory(_WordPressScraper):
+    latestSearch = '//a[contains(concat(" ", @class, " "), " comic-nav-last ")]'
+
+    @classmethod
+    def starter(cls):
+        """this is basically helpers.indirectStarter, but dynamically selecting
+        the right parameters."""
+        data = cls.getPage(cls.firstStripUrl)
+        return cls.fetchUrl(cls.firstStripUrl, data, cls.latestSearch)
+
+
+# do not edit anything below since these entries are generated from
+# scripts/update_plugins.sh
+# DO NOT REMOVE
+
+
+class AsTheMayoTurns(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/as-the-mayo-turns/'
+
+
+class ComicBookMafia(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/comic-book-mafia/'
+
+
+class Dealers(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/dealers-1-1998-was-the-year/'
+
+
+class DigitalHobo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/digital-hobo-1-its-a-living-kinda/'
+
+
+class ECoastVsWCoast(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/east-coast-vs-west-coast-greetings-from-the-coasts/'
+
+
+class GunCulture(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/gun-culture/'
+
+
+class IHateMyKids(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/i-hate-my-kids/'
+
+
+class InARelationship(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/in-a-relationship-3/'
+
+
+class IntergalacticMedicalDoctor(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/intergalactic-medical-doctor/'
+
+
+class JSchoolgirlsInLove(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/japanese-schoolgirls-in-love-1/'
+
+
+class KingdomOfTheDwarves(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/kingdom-of-the-dwarves/'
+
+
+class LesterCrenshawIsDead(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/lester-crenshaw-is-dead/'
+
+
+class Millennials(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/millennials/'
+
+
+class MiserableComedians(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/miserable-comedians-1-funny-because-its-sad/'
+
+
+class OldeTymeGamer(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/olde-tyme-gamer-playing-injured/'
+
+
+class PinJunkies(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/pin-junkies/'
+
+
+class PostApocalypticNick(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/post-apocalyptic-nick/'
+
+
+class RealTalk(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/real-talk-people-who-cut-in-line/'
+
+
+class SoManyNightmares(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/so-many-nightmares-freedom-nightmare/'
+
+
+class SportsGuys(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/sports-guys/'
+
+
+class TalesOfPizza(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tales-of-pizza-bad-tipper/'
+
+
+class TAndA(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-webcomic-factory-premiere-t-and-a/'
+
+
+class TheAntiwarComic(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-antiwar-comic-the-party/'
+
+
+class TheGentlemensClub(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-gentlemens-club/'
+
+
+class TheHorrorOfColony6(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-horror-of-colony-6-page-1/'
+
+
+class TheKingsOfViralVideo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-kings-of-viral-video-premiere/'
+
+
+class TheSharonAndTonyExperiment(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-sharon-and-tony-experiment/'
+
+
+class TonyDestructo(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tony-destructo/'
+
+
+class WeirdBikerTales(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/weird-biker-tales-the-last-outlaw/'
+
+
+class WillysSpaceDive(_WebcomicFactory):
+    firstStripUrl = 'http://www.thewebcomicfactory.com/comic/willys-space-dive/'
diff --git a/dosagelib/plugins/wordpress.py b/dosagelib/plugins/wordpress.py
index 2374c9d4e..a5c277ecc 100644
--- a/dosagelib/plugins/wordpress.py
+++ b/dosagelib/plugins/wordpress.py
@@ -37,39 +37,3 @@ for (name, linkNumber) in [
     add(name, 'http://www.flowerlarkstudios.com/',
         starter=indirectStarter('http://www.flowerlarkstudios.com/',
                                 '(//div[@id="sidebar-left"]//a)[' + str(linkNumber) + ']'))
-
-# all comics on the webcomic factory
-
-for (name, url) in [
-    ('AntiwarComic', 'the-antiwar-comic-the-party/'),
-    ('AstheMayoTurns', 'as-the-mayo-turns/'),
-    ('ComicBookMafia', 'comic-book-mafia/'),
-    ('Dealers', 'dealers-1-1998-was-the-year/'),
-    ('DigitalHobo', 'digital-hobo-1-its-a-living-kinda/'),
-    ('EastCoastVsWestCoast', 'east-coast-vs-west-coast-greetings-from-the-coasts/'),
-    ('GunCulture', 'gun-culture/'),
-    ('IHateMyKids', 'i-hate-my-kids/'),
-    ('InARelationship', 'in-a-relationship-3/'),
-    ('JapaneseSchoolgirlsinLove', 'japanese-schoolgirls-in-love-1/'),
-    ('KingdomoftheDwarves', 'kingdom-of-the-dwarves/'),
-    ('LesterCrenshawisDead', 'lester-crenshaw-is-dead/'),
-    ('Millennials', 'millennials/'),
-    ('MiserableComedians', 'miserable-comedians-1-funny-because-its-sad/'),
-    ('OldeTymeGamer', 'olde-tyme-gamer-playing-injured/'),
-    ('PinJunkies', 'pin-junkies/'),
-    ('PostApocalypticNick', 'post-apocalyptic-nick/'),
-    ('RealTalk', 'real-talk-people-who-cut-in-line/'),
-    ('SoManyNightmares', 'so-many-nightmares-freedom-nightmare/'),
-    ('SportsGuys', 'sports-guys/'),
-    ('TalesOfPizza', 'tales-of-pizza-bad-tipper/'),
-    ('TheGentlemensClub', 'the-gentlemens-club/'),
-    ('TheHorrorOfColony6', 'the-horror-of-colony-6-page-1/'),
-    ('TheKingsofViralVideo', 'the-kings-of-viral-video-premiere/'),
-    ('TheSharonandTonyExperiment', 'the-sharon-and-tony-experiment/'),
-    ('TonyDestructo', 'tony-destructo/'),
-    ('WeirdBikerTales', 'weird-biker-tales-the-last-outlaw/'),
-    ('WillysSpaceDive', 'willys-space-dive/')
-]:
-    add(name, 'http://www.thewebcomicfactory.com',
-        starter=indirectStarter('http://www.thewebcomicfactory.com/comic/' + url,
-                                "//a[contains(concat(' ', text(), ' '), ' Last ')]"))
diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py
index 4a82cdd12..63b872103 100644
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
 import json
 import codecs
 
@@ -42,6 +45,6 @@ def truncate_name(text):
 def format_name(text):
     """Format a comic name."""
     name = unescape(text)
+    name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
-    name = capfirst(name)
     return name
diff --git a/scripts/webcomicfactory.py b/scripts/webcomicfactory.py
new file mode 100755
--- /dev/null
+++ b/scripts/webcomicfactory.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""
+Script to get WebComicFactory comics and save the info in a JSON file for
+further processing.
+"""
+from __future__ import absolute_import, division, print_function
+
+import codecs
+import sys
+import os
+import requests
+from lxml import html
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import getPageContent
+from scriptutil import (save_result, load_result, truncate_name, format_name)
+
+# splitext only swaps the real extension (a ".py" elsewhere is kept).
+json_file = os.path.splitext(__file__)[0] + ".json"
+
+
+def find_first(session, url):
+    """Return the URL of the first strip of the comic shown at url.
+
+    Falls back to url itself when the page cannot be fetched or carries no
+    "first" navigation link.
+    """
+    try:
+        data = html.document_fromstring(getPageContent(url, session))
+        data.make_links_absolute(url)
+    except IOError as msg:
+        print("ERROR:", msg, file=sys.stderr)
+        return url
+    firstlinks = data.cssselect('a.comic-nav-first')
+    if not firstlinks:
+        print("INFO No first link on »%s«, already first page?" % (url))
+        return url
+    return firstlinks[0].attrib['href']
+
+
+def get_results():
+    """Parse start page for supported comics and save them as JSON."""
+    res = {}
+    url = 'http://www.thewebcomicfactory.com/'
+    session = requests.Session()
+    try:
+        data = html.document_fromstring(getPageContent(url, session))
+        data.make_links_absolute(url)
+    except IOError as msg:
+        print("ERROR:", msg, file=sys.stderr)
+        return {}
+
+    for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
+        comicname = comicdiv.cssselect('h2')[0]
+        comiclink = comicdiv.cssselect('a')[0]
+        comicurl = comiclink.attrib['href']
+        name = format_name(comicname.text)
+        if 'comic-color-key' in comicurl:
+            continue
+        comicurl = find_first(session, comicurl)
+        res[name] = comicurl
+
+    save_result(res, json_file)
+
+
+def first_lower(x):
+    """Sort key: the first element of x, lower-cased."""
+    return x[0].lower()
+
+
+def print_results(args):
+    """Append a plugin class for every saved comic to a file.
+
+    args is a (min_comics, filename) pair; min_comics is not used here.
+    """
+    min_comics, filename = args
+    with codecs.open(filename, 'a', 'utf-8') as fp:
+        data = load_result(json_file)
+        for name, url in sorted(data.items(), key=first_lower):
+            fp.write(u"\n\nclass %s(_WebcomicFactory):\n    firstStripUrl = %r\n" % (
+                truncate_name(name), str(url)))
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        print_results(sys.argv[1:])
+    else:
+        get_results()