Move WebcomicFactory into its own module.
Also, add an updater script for it.
This commit is contained in:
parent
bb1f20d867
commit
2c6decb7f5
4 changed files with 228 additions and 38 deletions
144
dosagelib/plugins/webcomicfactory.py
Normal file
144
dosagelib/plugins/webcomicfactory.py
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
from .common import _WordPressScraper
|
||||||
|
|
||||||
|
|
||||||
|
class _WebcomicFactory(_WordPressScraper):
    # XPath matching <a> elements whose class list contains the token
    # "comic-nav-last"; padding with spaces avoids partial-token matches.
    latestSearch = '//a[contains(concat(" ", @class, " "), " comic-nav-last ")]'

    @classmethod
    def starter(cls):
        """Resolve the starting URL from the comic's first strip page.

        Works like helpers.indirectStarter, except that the page to fetch
        (``firstStripUrl``) and the link expression (``latestSearch``) are
        read from the concrete class at call time.
        """
        first_url = cls.firstStripUrl
        page = cls.getPage(first_url)
        return cls.fetchUrl(first_url, page, cls.latestSearch)
|
||||||
|
|
||||||
|
|
||||||
|
# do not edit anything below since these entries are generated from
|
||||||
|
# scripts/update_plugins.sh
|
||||||
|
# DO NOT REMOVE
|
||||||
|
|
||||||
|
|
||||||
|
class AsTheMayoTurns(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/as-the-mayo-turns/'
|
||||||
|
|
||||||
|
|
||||||
|
class ComicBookMafia(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/comic-book-mafia/'
|
||||||
|
|
||||||
|
|
||||||
|
class Dealers(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/dealers-1-1998-was-the-year/'
|
||||||
|
|
||||||
|
|
||||||
|
class DigitalHobo(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/digital-hobo-1-its-a-living-kinda/'
|
||||||
|
|
||||||
|
|
||||||
|
class ECoastVsWCoast(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/east-coast-vs-west-coast-greetings-from-the-coasts/'
|
||||||
|
|
||||||
|
|
||||||
|
class GunCulture(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/gun-culture/'
|
||||||
|
|
||||||
|
|
||||||
|
class IHateMyKids(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/i-hate-my-kids/'
|
||||||
|
|
||||||
|
|
||||||
|
class InARelationship(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/in-a-relationship-3/'
|
||||||
|
|
||||||
|
|
||||||
|
class IntergalacticMedicalDoctor(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/intergalactic-medical-doctor/'
|
||||||
|
|
||||||
|
|
||||||
|
class JSchoolgirlsInLove(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/japanese-schoolgirls-in-love-1/'
|
||||||
|
|
||||||
|
|
||||||
|
class KingdomOfTheDwarves(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/kingdom-of-the-dwarves/'
|
||||||
|
|
||||||
|
|
||||||
|
class LesterCrenshawIsDead(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/lester-crenshaw-is-dead/'
|
||||||
|
|
||||||
|
|
||||||
|
class Millennials(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/millennials/'
|
||||||
|
|
||||||
|
|
||||||
|
class MiserableComedians(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/miserable-comedians-1-funny-because-its-sad/'
|
||||||
|
|
||||||
|
|
||||||
|
class OldeTymeGamer(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/olde-tyme-gamer-playing-injured/'
|
||||||
|
|
||||||
|
|
||||||
|
class PinJunkies(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/pin-junkies/'
|
||||||
|
|
||||||
|
|
||||||
|
class PostApocalypticNick(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/post-apocalyptic-nick/'
|
||||||
|
|
||||||
|
|
||||||
|
class RealTalk(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/real-talk-people-who-cut-in-line/'
|
||||||
|
|
||||||
|
|
||||||
|
class SoManyNightmares(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/so-many-nightmares-freedom-nightmare/'
|
||||||
|
|
||||||
|
|
||||||
|
class SportsGuys(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/sports-guys/'
|
||||||
|
|
||||||
|
|
||||||
|
class TalesOfPizza(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tales-of-pizza-bad-tipper/'
|
||||||
|
|
||||||
|
|
||||||
|
class TAndA(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-webcomic-factory-premiere-t-and-a/'
|
||||||
|
|
||||||
|
|
||||||
|
class TheAntiwarComic(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-antiwar-comic-the-party/'
|
||||||
|
|
||||||
|
|
||||||
|
class TheGentlemensClub(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-gentlemens-club/'
|
||||||
|
|
||||||
|
|
||||||
|
class TheHorrorOfColony6(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-horror-of-colony-6-page-1/'
|
||||||
|
|
||||||
|
|
||||||
|
class TheKingsOfViralVideo(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-kings-of-viral-video-premiere/'
|
||||||
|
|
||||||
|
|
||||||
|
class TheSharonAndTonyExperiment(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/the-sharon-and-tony-experiment/'
|
||||||
|
|
||||||
|
|
||||||
|
class TonyDestructo(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/tony-destructo/'
|
||||||
|
|
||||||
|
|
||||||
|
class WeirdBikerTales(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/weird-biker-tales-the-last-outlaw/'
|
||||||
|
|
||||||
|
|
||||||
|
class WillysSpaceDive(_WebcomicFactory):
|
||||||
|
firstStripUrl = 'http://www.thewebcomicfactory.com/comic/willys-space-dive/'
|
|
@ -37,39 +37,3 @@ for (name, linkNumber) in [
|
||||||
add(name, 'http://www.flowerlarkstudios.com/',
|
add(name, 'http://www.flowerlarkstudios.com/',
|
||||||
starter=indirectStarter('http://www.flowerlarkstudios.com/',
|
starter=indirectStarter('http://www.flowerlarkstudios.com/',
|
||||||
'(//div[@id="sidebar-left"]//a)[' + str(linkNumber) + ']'))
|
'(//div[@id="sidebar-left"]//a)[' + str(linkNumber) + ']'))
|
||||||
|
|
||||||
# all comics on the webcomic factory
|
|
||||||
|
|
||||||
for (name, url) in [
|
|
||||||
('AntiwarComic', 'the-antiwar-comic-the-party/'),
|
|
||||||
('AstheMayoTurns', 'as-the-mayo-turns/'),
|
|
||||||
('ComicBookMafia', 'comic-book-mafia/'),
|
|
||||||
('Dealers', 'dealers-1-1998-was-the-year/'),
|
|
||||||
('DigitalHobo', 'digital-hobo-1-its-a-living-kinda/'),
|
|
||||||
('EastCoastVsWestCoast', 'east-coast-vs-west-coast-greetings-from-the-coasts/'),
|
|
||||||
('GunCulture', 'gun-culture/'),
|
|
||||||
('IHateMyKids', 'i-hate-my-kids/'),
|
|
||||||
('InARelationship', 'in-a-relationship-3/'),
|
|
||||||
('JapaneseSchoolgirlsinLove', 'japanese-schoolgirls-in-love-1/'),
|
|
||||||
('KingdomoftheDwarves', 'kingdom-of-the-dwarves/'),
|
|
||||||
('LesterCrenshawisDead', 'lester-crenshaw-is-dead/'),
|
|
||||||
('Millennials', 'millennials/'),
|
|
||||||
('MiserableComedians', 'miserable-comedians-1-funny-because-its-sad/'),
|
|
||||||
('OldeTymeGamer', 'olde-tyme-gamer-playing-injured/'),
|
|
||||||
('PinJunkies', 'pin-junkies/'),
|
|
||||||
('PostApocalypticNick', 'post-apocalyptic-nick/'),
|
|
||||||
('RealTalk', 'real-talk-people-who-cut-in-line/'),
|
|
||||||
('SoManyNightmares', 'so-many-nightmares-freedom-nightmare/'),
|
|
||||||
('SportsGuys', 'sports-guys/'),
|
|
||||||
('TalesOfPizza', 'tales-of-pizza-bad-tipper/'),
|
|
||||||
('TheGentlemensClub', 'the-gentlemens-club/'),
|
|
||||||
('TheHorrorOfColony6', 'the-horror-of-colony-6-page-1/'),
|
|
||||||
('TheKingsofViralVideo', 'the-kings-of-viral-video-premiere/'),
|
|
||||||
('TheSharonandTonyExperiment', 'the-sharon-and-tony-experiment/'),
|
|
||||||
('TonyDestructo', 'tony-destructo/'),
|
|
||||||
('WeirdBikerTales', 'weird-biker-tales-the-last-outlaw/'),
|
|
||||||
('WillysSpaceDive', 'willys-space-dive/')
|
|
||||||
]:
|
|
||||||
add(name, 'http://www.thewebcomicfactory.com',
|
|
||||||
starter=indirectStarter('http://www.thewebcomicfactory.com/comic/' + url,
|
|
||||||
"//a[contains(concat(' ', text(), ' '), ' Last ')]"))
|
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2016 Tobias Gruetzmacher
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import codecs
|
import codecs
|
||||||
|
|
||||||
|
@ -42,6 +45,6 @@ def truncate_name(text):
|
||||||
def format_name(text):
|
def format_name(text):
|
||||||
"""Format a comic name."""
|
"""Format a comic name."""
|
||||||
name = unescape(text)
|
name = unescape(text)
|
||||||
|
name = "".join(capfirst(x) for x in name.split(" "))
|
||||||
name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
|
name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
|
||||||
name = capfirst(name)
|
|
||||||
return name
|
return name
|
||||||
|
|
79
scripts/webcomicfactory.py
Executable file
79
scripts/webcomicfactory.py
Executable file
|
@ -0,0 +1,79 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
"""
|
||||||
|
Script to get WebComicFactory comics and save the info in a JSON file for
|
||||||
|
further processing.
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||||
|
from dosagelib.util import getPageContent
|
||||||
|
from scriptutil import (save_result, load_result, truncate_name, format_name)
|
||||||
|
|
||||||
|
# JSON cache stored next to this script.  Only the extension is swapped:
# str.replace() would also rewrite any ".py" occurring elsewhere in the path
# (for example in a directory name) and would turn ".pyc" into ".jsonc".
json_file = os.path.splitext(__file__)[0] + ".json"
|
||||||
|
|
||||||
|
|
||||||
|
def find_first(session, url):
    """Return the URL of the first strip for the comic page at *url*.

    The page is fetched through *session* and parsed; the ``href`` of its
    first ``a.comic-nav-first`` link is returned.  *url* itself is returned
    when fetching raises ``IOError`` or when the page has no such link.
    """
    try:
        page = html.document_fromstring(getPageContent(url, session))
        # Make hrefs absolute so the returned link is usable on its own.
        page.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return url
    firstlinks = page.cssselect('a.comic-nav-first')
    if not firstlinks:
        # The guillemets are spelled as escapes: this script has no PEP 263
        # coding line, so raw non-ASCII bytes in the source would be a
        # SyntaxError under Python 2.  The printed text is unchanged.
        print(u"INFO No first link on \u00bb%s\u00ab, already first page?" % url)
        return url
    return firstlinks[0].attrib['href']
|
||||||
|
|
||||||
|
|
||||||
|
def get_results():
    """Collect the comics listed on the start page and save them as JSON.

    Each ``div.ceo_thumbnail_widget`` on the start page supplies a name
    (its first ``h2``) and a link (its first ``a``).  Links containing
    ``comic-color-key`` are ignored; every other link is resolved with
    ``find_first`` and stored under the formatted name.  The mapping is
    written to ``json_file``.  If the start page cannot be fetched, an
    error is printed and an empty dict is returned without saving.
    """
    start_url = 'http://www.thewebcomicfactory.com/'
    session = requests.Session()
    try:
        page = html.document_fromstring(getPageContent(start_url, session))
        page.make_links_absolute(start_url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return {}

    comics = {}
    for widget in page.cssselect('div.ceo_thumbnail_widget'):
        heading = widget.cssselect('h2')[0]
        anchor = widget.cssselect('a')[0]
        target = anchor.attrib['href']
        name = format_name(heading.text)
        if 'comic-color-key' not in target:
            comics[name] = find_first(session, target)

    save_result(comics, json_file)
|
||||||
|
|
||||||
|
|
||||||
|
def first_lower(x):
    """Return the first item of *x*, lower-cased.

    Used as a sort key for ``(name, url)`` pairs so that ordering ignores
    the case of the name.
    """
    head = x[0]
    return head.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def print_results(args):
    """Append one plugin class per saved comic to a file.

    *args* must unpack into exactly two items, ``(min_comics, filename)``.
    The first item is unpacked but not used by this function.  Entries
    loaded from ``json_file`` are sorted case-insensitively by name, and a
    ``_WebcomicFactory`` subclass definition is appended to *filename* for
    each of them.
    """
    _min_comics, filename = args
    # Load before opening the output, so a missing or unreadable JSON file
    # fails without creating or touching the target file.
    data = load_result(json_file)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, url in sorted(data.items(), key=first_lower):
            # The class body is indented by four spaces.  str(url) keeps a
            # u'' prefix out of the %r output under Python 2.
            fp.write(u"\n\nclass %s(_WebcomicFactory):\n    firstStripUrl = %r\n" % (
                truncate_name(name), str(url)))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # With command-line arguments: write the class definitions from the
    # saved JSON.  Without: scrape the site and refresh the JSON.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()
|
Loading…
Reference in a new issue