Remove make_scraper magic from creators module.

This commit is contained in:
Tobias Gruetzmacher 2015-11-04 23:43:31 +01:00
parent 94470d564c
commit 7f7a69818b
2 changed files with 139 additions and 76 deletions

View file

@ -1,79 +1,143 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015 Tobias Gruetzmacher
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import _ParserScraper
from ..util import tagre from ..util import tagre
_imageSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]+)')) class _Creators(_ParserScraper):
url = 'http://www.creators.com/comics/'
imageSearch = '//td/a[@class="z"]'
prevSearch = '//a[contains(@class,"time_l")]'
help = 'Index format: n'
def add(name, path): @classmethod
baseUrl = 'http://www.creators.com' def getName(cls):
classname = 'Creators_%s' % name return 'Creators/' + cls.__name__
globals()[classname] = make_scraper(classname,
name = 'Creators/' + name,
url = baseUrl + path + '.html',
stripUrl = baseUrl + path + '/%s.html',
lang = 'es' if name.lower().endswith('spanish') else 'en',
imageSearch = _imageSearch,
prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) +
tagre("img", "src", r'/img_comics/arrow_l\.gif')),
help = 'Index format: n',
)
# do not edit anything below since these entries are generated from scripts/update.sh @classmethod
def starter(cls):
return cls.url + cls.path + '.html'
def getIndexStripUrl(self, index):
    """Return the URL of the strip page identified by ``index``."""
    # Format the page suffix first, then append it to the comic's base path.
    page = '/%s.html' % index
    return self.url + self.path + page
# Common base for the Spanish-language Creators comics.
class _CreatorsEs(_Creators):
    lang = 'es'

    def shouldSkipUrl(self, url, data):
        """Skip strip pages whose images are known to return 404.

        Returns True only when ``url`` is exactly one of the listed pages;
        ``data`` is not consulted.
        """
        # The trailing comma is required: without it the parentheses hold a
        # plain string and ``in`` becomes a substring test, which would also
        # match e.g. the bare base URL or an empty string.
        return url in (
            self.url + 'heathcliff-spanish/139736.html',
        )
# Some comics are not listed on the "all" page (too old?)
# Hand-maintained entries: each class only sets the comic's URL path segment.
class WinnieThePooh(_Creators):
    path = u'winnie-the-pooh'


class Recess(_Creators):
    path = u'recess'


class NaturalSelection(_Creators):
    path = u'natural-selection'


# NOTE(review): a class named FlightDeck with the same path also appears in
# the generated section of this module; the later definition rebinds the
# name. Confirm whether this manual entry is still needed.
class FlightDeck(_Creators):
    path = u'flight-deck'
# do not edit anything below since these entries are generated from scripts/update_plugins.sh
# DO NOT REMOVE # DO NOT REMOVE
# duplicate of gocomics add('Agnes', '/comics/agnes') # Agnes has a duplicate in gocomics
# duplicate of gocomics add('AndyCapp', '/comics/andy-capp') # AndyCapp has a duplicate in gocomics
add('Archie', '/comics/archie') class Archie(_Creators):
add('ArchieinSpanish', '/comics/archie-spanish') path = u'archie'
# duplicate of gocomics add('AskShagg', '/comics/ask-shagg')
# duplicate of gocomics add('BC', '/comics/bc') class ArchieinSpanish(_CreatorsEs):
add('BCinSpanish', '/comics/bc-spanish') path = u'archie-spanish'
# duplicate of gocomics add('BallardStreet', '/comics/ballard-street')
add('CafeconLeche', '/comics/cafe-con-leche') # AskShagg has a duplicate in gocomics
# duplicate of gocomics add('ChuckleBros', '/comics/chuckle-bros') # BC has a duplicate in gocomics
# duplicate of gocomics add('DaddysHome', '/comics/daddys-home') class BCinSpanish(_CreatorsEs):
# duplicate of gocomics add('DiamondLil', '/comics/diamond-lil') path = u'bc-spanish'
# duplicate of gocomics add('DogEatDoug', '/comics/dog-eat-doug')
# duplicate of gocomics add('DogsofCKennel', '/comics/dogs-of-c-kennel') # BallardStreet has a duplicate in gocomics
add('DonaldDuck', '/comics/donald-duck') class CafeconLeche(_Creators):
add('Flare', '/comics/flare') path = u'cafe-con-leche'
add('FlightDeck', '/comics/flight-deck')
# duplicate of gocomics add('FloandFriends', '/comics/flo-and-friends') # ChuckleBros has a duplicate in gocomics
# duplicate of gocomics add('ForHeavensSake', '/comics/for-heavens-sake') # DaddysHome has a duplicate in gocomics
# duplicate of gocomics add('FreeRange', '/comics/free-range') # DiamondLil has a duplicate in gocomics
add('GirlsAndSports', '/comics/girls-and-sports') # DogEatDoug has a duplicate in gocomics
add('GirlsandSportsinSpanish', '/comics/girls-and-sports-spanish') # DogsofCKennel has a duplicate in gocomics
# duplicate of gocomics add('Heathcliff', '/comics/heathcliff') class DonaldDuck(_Creators):
add('HeathcliffinSpanish', '/comics/heathcliff-spanish') path = u'donald-duck'
# duplicate of gocomics add('HerbandJamaal', '/comics/herb-and-jamaal')
add('HomeOffice', '/comics/stay-at-home-dad') class Doodles(_Creators):
add('HopeAndDeath', '/comics/hope-and-death') path = u'doodles'
# duplicate of gocomics add('LibertyMeadows', '/comics/liberty-meadows')
add('LongStoryShort', '/comics/long-story-short') class Flare(_Creators):
add('MickeyMouse', '/comics/mickey-mouse') path = u'flare'
# duplicate of gocomics add('Momma', '/comics/momma')
# duplicate of gocomics add('NestHeads', '/comics/nest-heads') class FlightDeck(_Creators):
add('OffCenter', '/comics/off-center') path = u'flight-deck'
# duplicate of gocomics add('OnaClaireDay', '/comics/on-a-claire-day')
# duplicate of gocomics add('OneBigHappy', '/comics/one-big-happy') # FloandFriends has a duplicate in gocomics
add('Recess', '/comics/recess') # ForHeavensSake has a duplicate in gocomics
# duplicate of gocomics add('Rubes', '/comics/rubes') # FreeRange has a duplicate in gocomics
add('Rugrats', '/comics/rugrats') class GirlsAndSports(_Creators):
add('RugratsinSpanish', '/comics/rugrats-spanish') path = u'girls-and-sports'
# duplicate of gocomics add('ScaryGary', '/comics/scary-gary')
# duplicate of gocomics add('SpeedBump', '/comics/speed-bump') class GirlsandSportsinSpanish(_CreatorsEs):
# duplicate of gocomics add('StrangeBrew', '/comics/strange-brew') path = u'girls-and-sports-spanish'
# duplicate of gocomics add('TheBarn', '/comics/the-barn')
# duplicate of gocomics add('TheDinetteSet', '/comics/dinette-set') # Heathcliff has a duplicate in gocomics
# duplicate of gocomics add('TheMeaningofLila', '/comics/meaning-of-lila') class HeathcliffinSpanish(_CreatorsEs):
# duplicate of gocomics add('TheOtherCoast', '/comics/the-other-coast') path = u'heathcliff-spanish'
add('TheQuigmans', '/comics/the-quigmans')
add('TheWizardofIdinSpanish', '/comics/wizard-of-id-spanish') # HerbandJamaal has a duplicate in gocomics
# duplicate of gocomics add('ThinLines', '/comics/thin-lines') class HomeOffice(_Creators):
# duplicate of gocomics add('WeePals', '/comics/wee-pals') path = u'stay-at-home-dad'
# duplicate of gocomics add('WizardofId', '/comics/wizard-of-id')
# duplicate of gocomics add('WorkingitOut', '/comics/working-it-out') class HopeAndDeath(_Creators):
# duplicate of gocomics add('ZackHill', '/comics/zack-hill') path = u'hope-and-death'
# LibertyMeadows has a duplicate in gocomics
class LongStoryShort(_Creators):
path = u'long-story-short'
class MickeyMouse(_Creators):
path = u'mickey-mouse'
# Momma has a duplicate in gocomics
# NestHeads has a duplicate in gocomics
class OffCenter(_Creators):
path = u'off-center'
# OnaClaireDay has a duplicate in gocomics
# OneBigHappy has a duplicate in gocomics
# Rubes has a duplicate in gocomics
class Rugrats(_Creators):
path = u'rugrats'
class RugratsinSpanish(_CreatorsEs):
path = u'rugrats-spanish'
# ScaryGary has a duplicate in gocomics
# SpeedBump has a duplicate in gocomics
# StrangeBrew has a duplicate in gocomics
# TheBarn has a duplicate in gocomics
# TheDinetteSet has a duplicate in gocomics
# TheMeaningofLila has a duplicate in gocomics
# TheOtherCoast has a duplicate in gocomics
class TheQuigmans(_Creators):
path = u'the-quigmans'
class TheWizardofIdinSpanish(_CreatorsEs):
path = u'wizard-of-id-spanish'
# ThinLines has a duplicate in gocomics
# WeePals has a duplicate in gocomics
# WizardofId has a duplicate in gocomics
# WorkingitOut has a duplicate in gocomics
# ZackHill has a duplicate in gocomics

View file

@ -16,7 +16,7 @@ from scriptutil import contains_case_insensitive, capfirst, save_result, load_re
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>') url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'<strong>([^<]+)</strong>')
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
@ -67,16 +67,15 @@ def print_results(args):
"""Print comics.""" """Print comics."""
min_comics, filename = args min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp: with codecs.open(filename, 'a', 'utf-8') as fp:
for name, url in sorted(load_result(json_file).items()): for name, path in sorted(load_result(json_file).items()):
if name in exclude_comics: if name in exclude_comics:
continue continue
lang = 'Es' if name.lower().endswith('spanish') else ''
if has_gocomics_comic(name): if has_gocomics_comic(name):
prefix = u'# duplicate of gocomics ' fp.write(u'# %s has a duplicate in gocomics\n' % truncate_name(name))
else: else:
prefix = u'' fp.write(u"class %s(_Creators%s):\n path = %r\n\n" %
fp.write(u"%sadd(%r, %r)\n" % ( (truncate_name(name), lang, path))
prefix, str(truncate_name(name)), str(url))
)
if __name__ == '__main__': if __name__ == '__main__':