From 7f7a69818b92c625eb6d42fbfc541e7c295ff58a Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Wed, 4 Nov 2015 23:43:31 +0100
Subject: [PATCH] Remove make_scraper magic from creators module.

Replace the add() helper, which registered scraper classes in globals()
through make_scraper, with explicit _ParserScraper subclasses. _Creators
carries the shared base URL and XPath searches and derives the start and
strip URLs from a per-class `path`; _CreatorsEs sets lang = 'es' and
skips one strip whose images are 404.

scripts/creators.py now captures only the path component after /comics/
and writes class definitions instead of add() calls.

shouldSkipUrl tests membership in a one-element tuple. The trailing
comma is required: without it the parentheses only group a string and
`in` becomes a substring test.
---
 dosagelib/plugins/creators.py | 202 ++++++++++++++++++++++------------
 scripts/creators.py           |  13 +--
 2 files changed, 139 insertions(+), 76 deletions(-)

diff --git a/dosagelib/plugins/creators.py b/dosagelib/plugins/creators.py
index c297f39d6..f609dc6f5 100644
--- a/dosagelib/plugins/creators.py
+++ b/dosagelib/plugins/creators.py
@@ -1,79 +1,143 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015 Tobias Gruetzmacher
 
 from re import compile
-from ..scraper import make_scraper
+from ..scraper import _ParserScraper
 from ..util import tagre
 
-_imageSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]+)'))
+class _Creators(_ParserScraper):
+    url = 'http://www.creators.com/comics/'
+    imageSearch = '//td/a[@class="z"]'
+    prevSearch = '//a[contains(@class,"time_l")]'
+    help = 'Index format: n'
 
-def add(name, path):
-    baseUrl = 'http://www.creators.com'
-    classname = 'Creators_%s' % name
-    globals()[classname] = make_scraper(classname,
-        name = 'Creators/' + name,
-        url = baseUrl + path + '.html',
-        stripUrl = baseUrl + path + '/%s.html',
-        lang = 'es' if name.lower().endswith('spanish') else 'en',
-        imageSearch = _imageSearch,
-        prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) +
-                             tagre("img", "src", r'/img_comics/arrow_l\.gif')),
-        help = 'Index format: n',
-    )
+    @classmethod
+    def getName(cls):
+        return 'Creators/' + cls.__name__
 
-# do not edit anything below since these entries are generated from scripts/update.sh
+    @classmethod
+    def starter(cls):
+        return cls.url + cls.path + '.html'
+
+    def getIndexStripUrl(self, index):
+        return self.url + self.path + '/%s.html' % index
+
+class _CreatorsEs(_Creators):
+    lang = 'es'
+
+    def shouldSkipUrl(self, url, data):
+        """Images are 404..."""
+        return url in (
+            self.url + 'heathcliff-spanish/139736.html',
+        )
+
+# Some comics are not listed on the "all" page (too old?)
+class WinnieThePooh(_Creators):
+    path = u'winnie-the-pooh'
+
+class Recess(_Creators):
+    path = u'recess'
+
+class NaturalSelection(_Creators):
+    path = u'natural-selection'
+
+class FlightDeck(_Creators):
+    path = u'flight-deck'
+
+# do not edit anything below since these entries are generated from scripts/update_plugins.sh
 # DO NOT REMOVE
-# duplicate of gocomics add('Agnes', '/comics/agnes')
-# duplicate of gocomics add('AndyCapp', '/comics/andy-capp')
-add('Archie', '/comics/archie')
-add('ArchieinSpanish', '/comics/archie-spanish')
-# duplicate of gocomics add('AskShagg', '/comics/ask-shagg')
-# duplicate of gocomics add('BC', '/comics/bc')
-add('BCinSpanish', '/comics/bc-spanish')
-# duplicate of gocomics add('BallardStreet', '/comics/ballard-street')
-add('CafeconLeche', '/comics/cafe-con-leche')
-# duplicate of gocomics add('ChuckleBros', '/comics/chuckle-bros')
-# duplicate of gocomics add('DaddysHome', '/comics/daddys-home')
-# duplicate of gocomics add('DiamondLil', '/comics/diamond-lil')
-# duplicate of gocomics add('DogEatDoug', '/comics/dog-eat-doug')
-# duplicate of gocomics add('DogsofCKennel', '/comics/dogs-of-c-kennel')
-add('DonaldDuck', '/comics/donald-duck')
-add('Flare', '/comics/flare')
-add('FlightDeck', '/comics/flight-deck')
-# duplicate of gocomics add('FloandFriends', '/comics/flo-and-friends')
-# duplicate of gocomics add('ForHeavensSake', '/comics/for-heavens-sake')
-# duplicate of gocomics add('FreeRange', '/comics/free-range')
-add('GirlsAndSports', '/comics/girls-and-sports')
-add('GirlsandSportsinSpanish', '/comics/girls-and-sports-spanish')
-# duplicate of gocomics add('Heathcliff', '/comics/heathcliff')
-add('HeathcliffinSpanish', '/comics/heathcliff-spanish')
-# duplicate of gocomics add('HerbandJamaal', '/comics/herb-and-jamaal')
-add('HomeOffice', '/comics/stay-at-home-dad')
-add('HopeAndDeath', '/comics/hope-and-death')
-# duplicate of gocomics add('LibertyMeadows', '/comics/liberty-meadows')
-add('LongStoryShort', '/comics/long-story-short')
-add('MickeyMouse', '/comics/mickey-mouse')
-# duplicate of gocomics add('Momma', '/comics/momma')
-# duplicate of gocomics add('NestHeads', '/comics/nest-heads')
-add('OffCenter', '/comics/off-center')
-# duplicate of gocomics add('OnaClaireDay', '/comics/on-a-claire-day')
-# duplicate of gocomics add('OneBigHappy', '/comics/one-big-happy')
-add('Recess', '/comics/recess')
-# duplicate of gocomics add('Rubes', '/comics/rubes')
-add('Rugrats', '/comics/rugrats')
-add('RugratsinSpanish', '/comics/rugrats-spanish')
-# duplicate of gocomics add('ScaryGary', '/comics/scary-gary')
-# duplicate of gocomics add('SpeedBump', '/comics/speed-bump')
-# duplicate of gocomics add('StrangeBrew', '/comics/strange-brew')
-# duplicate of gocomics add('TheBarn', '/comics/the-barn')
-# duplicate of gocomics add('TheDinetteSet', '/comics/dinette-set')
-# duplicate of gocomics add('TheMeaningofLila', '/comics/meaning-of-lila')
-# duplicate of gocomics add('TheOtherCoast', '/comics/the-other-coast')
-add('TheQuigmans', '/comics/the-quigmans')
-add('TheWizardofIdinSpanish', '/comics/wizard-of-id-spanish')
-# duplicate of gocomics add('ThinLines', '/comics/thin-lines')
-# duplicate of gocomics add('WeePals', '/comics/wee-pals')
-# duplicate of gocomics add('WizardofId', '/comics/wizard-of-id')
-# duplicate of gocomics add('WorkingitOut', '/comics/working-it-out')
-# duplicate of gocomics add('ZackHill', '/comics/zack-hill')
+# Agnes has a duplicate in gocomics
+# AndyCapp has a duplicate in gocomics
+class Archie(_Creators):
+    path = u'archie'
+
+class ArchieinSpanish(_CreatorsEs):
+    path = u'archie-spanish'
+
+# AskShagg has a duplicate in gocomics
+# BC has a duplicate in gocomics
+class BCinSpanish(_CreatorsEs):
+    path = u'bc-spanish'
+
+# BallardStreet has a duplicate in gocomics
+class CafeconLeche(_Creators):
+    path = u'cafe-con-leche'
+
+# ChuckleBros has a duplicate in gocomics
+# DaddysHome has a duplicate in gocomics
+# DiamondLil has a duplicate in gocomics
+# DogEatDoug has a duplicate in gocomics
+# DogsofCKennel has a duplicate in gocomics
+class DonaldDuck(_Creators):
+    path = u'donald-duck'
+
+class Doodles(_Creators):
+    path = u'doodles'
+
+class Flare(_Creators):
+    path = u'flare'
+
+class FlightDeck(_Creators):
+    path = u'flight-deck'
+
+# FloandFriends has a duplicate in gocomics
+# ForHeavensSake has a duplicate in gocomics
+# FreeRange has a duplicate in gocomics
+class GirlsAndSports(_Creators):
+    path = u'girls-and-sports'
+
+class GirlsandSportsinSpanish(_CreatorsEs):
+    path = u'girls-and-sports-spanish'
+
+# Heathcliff has a duplicate in gocomics
+class HeathcliffinSpanish(_CreatorsEs):
+    path = u'heathcliff-spanish'
+
+# HerbandJamaal has a duplicate in gocomics
+class HomeOffice(_Creators):
+    path = u'stay-at-home-dad'
+
+class HopeAndDeath(_Creators):
+    path = u'hope-and-death'
+
+# LibertyMeadows has a duplicate in gocomics
+class LongStoryShort(_Creators):
+    path = u'long-story-short'
+
+class MickeyMouse(_Creators):
+    path = u'mickey-mouse'
+
+# Momma has a duplicate in gocomics
+# NestHeads has a duplicate in gocomics
+class OffCenter(_Creators):
+    path = u'off-center'
+
+# OnaClaireDay has a duplicate in gocomics
+# OneBigHappy has a duplicate in gocomics
+# Rubes has a duplicate in gocomics
+class Rugrats(_Creators):
+    path = u'rugrats'
+
+class RugratsinSpanish(_CreatorsEs):
+    path = u'rugrats-spanish'
+
+# ScaryGary has a duplicate in gocomics
+# SpeedBump has a duplicate in gocomics
+# StrangeBrew has a duplicate in gocomics
+# TheBarn has a duplicate in gocomics
+# TheDinetteSet has a duplicate in gocomics
+# TheMeaningofLila has a duplicate in gocomics
+# TheOtherCoast has a duplicate in gocomics
+class TheQuigmans(_Creators):
+    path = u'the-quigmans'
+
+class TheWizardofIdinSpanish(_CreatorsEs):
+    path = u'wizard-of-id-spanish'
+
+# ThinLines has a duplicate in gocomics
+# WeePals has a duplicate in gocomics
+# WizardofId has a duplicate in gocomics
+# WorkingitOut has a duplicate in gocomics
+# ZackHill has a duplicate in gocomics
diff --git a/scripts/creators.py b/scripts/creators.py
index 88b4759ce..6e7ab816c 100755
--- a/scripts/creators.py
+++ b/scripts/creators.py
@@ -16,7 +16,7 @@ from scriptutil import contains_case_insensitive, capfirst, save_result, load_re
 
 json_file = __file__.replace(".py", ".json")
 
-url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'([^<]+)')
+url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'([^<]+)')
 
 # names of comics to exclude
 exclude_comics = [
@@ -67,16 +67,15 @@ def print_results(args):
     """Print comics."""
     min_comics, filename = args
     with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, url in sorted(load_result(json_file).items()):
+        for name, path in sorted(load_result(json_file).items()):
             if name in exclude_comics:
                 continue
+            lang = 'Es' if name.lower().endswith('spanish') else ''
             if has_gocomics_comic(name):
-                prefix = u'# duplicate of gocomics '
+                fp.write(u'# %s has a duplicate in gocomics\n' % truncate_name(name))
             else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(truncate_name(name)), str(url))
-            )
+                fp.write(u"class %s(_Creators%s):\n    path = %r\n\n" %
+                         (truncate_name(name), lang, path))
 
 
 if __name__ == '__main__':