Refactor: Introduce generator methods for scrapers

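This allows a single comic module class to generate multiple scrapers
through a new `getmodules()` class method; by default, a class still
yields exactly one scraper named after itself. This change supports the
more dynamic module system described in #42.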
This commit is contained in:
Tobias Gruetzmacher 2016-05-21 01:18:42 +02:00
parent 89cfd9d310
commit 51008a975b
16 changed files with 322 additions and 298 deletions

View file

@ -12,14 +12,13 @@ class _Arcamax(_ParserScraper):
imageSearch = '//img[@id="comic-zoom"]' imageSearch = '//img[@id="comic-zoom"]'
prevSearch = '//a[@class="prev"]' prevSearch = '//a[@class="prev"]'
def __init__(self, name):
super(_Arcamax, self).__init__('Arcamax/' + name)
@property @property
def url(self): def url(self):
return 'http://www.arcamax.com/thefunnies/' + self.path + '/' return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
@property
def name(self):
return 'Arcamax/' + super(_Arcamax, self).name
# do not edit anything below since these entries are generated from # do not edit anything below since these entries are generated from
# scripts/update_plugins.sh # scripts/update_plugins.sh

View file

@ -7,58 +7,55 @@ from __future__ import absolute_import, division, print_function
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import _BasicScraper
from ..util import tagre, getQueryParams from ..util import tagre, getQueryParams
_linkTag = tagre("a", "href", r'([^"]+)') class CloneManga(_BasicScraper):
_prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif")) _linkTag = tagre("a", "href", r'([^"]+)')
_nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif")) prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
_lastSearch = compile(_linkTag + tagre("img", "src", r"last\.gif")) nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
latestSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
help = 'Index format: n'
def __init__(self, name, shortName, imageFolder=None, lastStrip=None):
super(CloneManga, self).__init__('CloneManga/' + name)
def add(name, shortName, imageFolder=None, lastStrip=None):
classname = 'CloneManga_%s' % name
_url = 'http://manga.clone-army.org' _url = 'http://manga.clone-army.org'
baseUrl = '%s/%s.php' % (_url, shortName) self.url = '%s/%s.php' % (_url, shortName)
if imageFolder is None: if imageFolder is None:
imageFolder = shortName imageFolder = shortName
self.stripUrl = self.url + '?page=%s'
self.imageSearch = compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center"))
if lastStrip is None:
self.starter = self._starter
else:
self.url = self.stripUrl % lastStrip
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
return '%03d' % int(getQueryParams(page_url)['page'][0]) return '%03d' % int(getQueryParams(page_url)['page'][0])
def _starter(self): def _starter(self):
# first, try hopping to previous and next comic # first, try hopping to previous and next comic
data = self.getPage(baseUrl) data = self.getPage(self.url)
try: try:
url = self.fetchUrl(baseUrl, data, _prevSearch) url = self.fetchUrl(self.url, data, self.prevSearch)
except ValueError: except ValueError:
# no previous link found, try hopping to last comic # no previous link found, try hopping to last comic
return self.fetchUrl(baseUrl, data, _lastSearch) return self.fetchUrl(self.url, data, self.latestSearch)
else: else:
data = self.getPage(url) data = self.getPage(url)
return self.fetchUrl(url, data, _nextSearch) return self.fetchUrl(url, data, self.nextSearch)
attrs = dict( @classmethod
name='CloneManga/' + name, def getmodules(cls):
stripUrl=baseUrl + '?page=%s', return [
imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center")), cls('AprilAndMay', 'anm', imageFolder='AAM'),
prevSearch=_prevSearch, cls('Kanami', 'kanami'),
help='Index format: n', cls('MomokaCorner', 'momoka'),
namer=namer, cls('NanasEverydayLife', 'nana', lastStrip='78'),
url=baseUrl, cls('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311'),
) cls('Tomoyo42sRoom', 't42r'),
if lastStrip is None: cls('PennyTribute', 'penny'),
attrs['starter'] = _starter ]
else:
attrs['url'] = attrs['stripUrl'] % lastStrip
globals()[classname] = make_scraper(classname, **attrs)
add('AprilAndMay', 'anm', imageFolder='AAM')
add('Kanami', 'kanami')
add('MomokaCorner', 'momoka')
add('NanasEverydayLife', 'nana', lastStrip='78')
add('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311')
add('Tomoyo42sRoom', 't42r')
add('PennyTribute', 'penny')

View file

@ -22,6 +22,9 @@ class _ComicFury(_ParserScraper):
help = 'Index format: n' help = 'Index format: n'
starter = bounceStarter starter = bounceStarter
def __init__(self, name):
super(_ComicFury, self).__init__('ComicFury/' + name[2:])
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
parts = page_url.split('/') parts = page_url.split('/')
path, ext = os.path.splitext(image_url) path, ext = os.path.splitext(image_url)
@ -32,10 +35,6 @@ class _ComicFury(_ParserScraper):
def url(self): def url(self):
return 'http://%s.webcomic.ws/comics/' % self.sub return 'http://%s.webcomic.ws/comics/' % self.sub
@property
def name(self):
return 'ComicFury/' + super(_ComicFury, self).name[2:]
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index):
return self.url + 'comics/%s' % index return self.url + 'comics/%s' % index

View file

@ -6,26 +6,25 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
from re import compile from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
# Comicgenesis has a lot of comics, but most of them are disallowed by
# robots.txt
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') + class ComicGenesis(_BasicScraper):
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:Previous comic' + '|' + '(?:Previous comic' + '|' +
tagre("img", "alt", "Previous comic") + '|' + tagre("img", "alt", "Previous comic") + '|' +
tagre("img", "src", "images/back\.gif") + tagre("img", "src", "images/back\.gif") +
')') ')')
multipleImagesPerStrip = True
help = 'Index format: yyyymmdd'
def prevUrlModifier(self, prev_url):
def add(name, url):
classname = 'ComicGenesis_%s' % name
if '/d/' in url:
stripUrl = url.split('/d/')[0] + '/d/%s.html'
else:
stripUrl = url + 'd/%s.html'
def _prevUrlModifier(self, prev_url):
if prev_url: if prev_url:
return prev_url.replace( return prev_url.replace(
"keenspace.com", "comicgenesis.com").replace( "keenspace.com", "comicgenesis.com").replace(
@ -33,95 +32,100 @@ def add(name, url):
"toonspace.com", "comicgenesis.com").replace( "toonspace.com", "comicgenesis.com").replace(
"comicgen.com", "comicgenesis.com") "comicgen.com", "comicgenesis.com")
globals()[classname] = make_scraper( def __init__(self, name, sub=None, last=None, baseUrl=None):
classname, super(ComicGenesis, self).__init__('ComicGenesis/' + name)
name='ComicGenesis/' + name,
url=url,
stripUrl=stripUrl,
imageSearch=_imageSearch,
prevSearch=_prevSearch,
prevUrlModifier=_prevUrlModifier,
multipleImagesPerStrip=True,
help='Index format: yyyymmdd',
)
# Comicgenesis has a lot of comics, but most of them are disallowed by robots.txt if sub:
# do not edit anything below since these entries are generated from scripts/update.sh baseUrl = 'http://%s.comicgenesis.com/' % sub
# DO NOT REMOVE
add('AAAAA', 'http://aaaaa.comicgenesis.com/') self.stripUrl = baseUrl + 'd/%s.html'
add('AdventuresofKiltman', 'http://kiltman.comicgenesis.com/') if last:
add('AmorModerno', 'http://amormoderno.comicgenesis.com/') self.url = self.stripUrl % last
add('AnythingButRealLife', 'http://anythingbutreallife.comicgenesis.com/') self.endOfLife = True
add('Ardra', 'http://ardra.comicgenesis.com/') else:
add('Artwork', 'http://artwork.comicgenesis.com/') self.url = baseUrl
add('BabeintheWoods', 'http://babeinthewoods.comicgenesis.com/')
add('BackwaterPlanet', 'http://bobthespirit.comicgenesis.com/') @classmethod
add('BendyStrawVampires', 'http://bsvampires.comicgenesis.com/') def getmodules(cls):
add('BlindSight', 'http://blindsight.comicgenesis.com/') return [
add('BreakingtheDoldrum', 'http://breakingthedoldrum.comicgenesis.com/') # do not edit anything below since these entries are generated from
add('Candi', 'http://candicomics.com/') # scripts/update_plugins.sh
add('CorporateLife', 'http://corporatelife.comicgenesis.com/') # DO NOT REMOVE
add('DarkWelkin', 'http://darkwelkin.comicgenesis.com/') cls('AAAAA', 'aaaaa'),
add('DemonEater', 'http://demoneater.comicgenesis.com/') cls('AdventuresofKiltman', 'kiltman'),
add('DoodleDiaries', 'http://doodlediaries.comicgenesis.com/') cls('AmorModerno', 'amormoderno'),
add('DormSweetDorm', 'http://dormsweetdorm.comicgenesis.com/') cls('AnythingButRealLife', 'anythingbutreallife'),
add('DoubleyouTeeEff', 'http://doubleyouteeeff.comicgenesis.com/') cls('Ardra', 'ardra'),
add('DragonsBane', 'http://jasonwhitewaterz.comicgenesis.com/') cls('Artwork', 'artwork'),
add('Dreamaniac', 'http://dreamaniaccomic.comicgenesis.com/') cls('BabeintheWoods', 'babeinthewoods'),
add('ElnifiChronicles', 'http://elnifichronicles.comicgenesis.com/') cls('BackwaterPlanet', 'bobthespirit'),
add('EvesApple', 'http://evesapple.comicgenesis.com/') cls('BendyStrawVampires', 'bsvampires'),
add('FancyThat', 'http://fancythat.comicgenesis.com/') cls('BlindSight', 'blindsight'),
add('FantasyQwest', 'http://creatorauthorman.comicgenesis.com/') cls('BreakingtheDoldrum', 'breakingthedoldrum'),
add('Fantazine', 'http://fantazin.comicgenesis.com/') cls('Candi', baseUrl='http://candicomics.com/'),
add('Flounderville', 'http://flounderville.comicgenesis.com/') cls('CorporateLife', 'corporatelife'),
add('GEM', 'http://keltzy.comicgenesis.com/') cls('DarkWelkin', 'darkwelkin'),
add('Gonefor300days', 'http://g4300d.comicgenesis.com/') cls('DemonEater', 'demoneater'),
add('IBlameDanny', 'http://vileterror.comicgenesis.com/') cls('DoodleDiaries', 'doodlediaries'),
add('ImpendingDoom', 'http://impending.comicgenesis.com/') cls('DormSweetDorm', 'dormsweetdorm'),
add('InANutshell', 'http://nutshellcomics.comicgenesis.com/') cls('DoubleyouTeeEff', 'doubleyouteeeff'),
add('KernyMantisComics', 'http://kernymantis.comicgenesis.com/') cls('DragonsBane', 'jasonwhitewaterz'),
add('KitsuneJewel', 'http://kitsunejewel.comicgenesis.com/') cls('Dreamaniac', 'dreamaniaccomic'),
add('KittyCattyGames', 'http://kittycattygames.comicgenesis.com/') cls('ElnifiChronicles', 'elnifichronicles'),
add('KiwiDayN', 'http://kiwidayn.comicgenesis.com/') cls('EvesApple', 'evesapple'),
add('KungFounded', 'http://kungfounded.comicgenesis.com/') cls('FancyThat', 'fancythat'),
add('LabBratz', 'http://labbratz.comicgenesis.com/') cls('FantasyQwest', 'creatorauthorman'),
add('Laserwing', 'http://laserwing.comicgenesis.com/') cls('Fantazine', 'fantazin'),
add('LumiasKingdom', 'http://lumia.comicgenesis.com/') cls('Flounderville', 'flounderville'),
add('Majestic7', 'http://majestic7.comicgenesis.com/') cls('GEM', 'keltzy'),
add('MaximumWhimsy', 'http://maximumwhimsy.comicgenesis.com/') cls('Gonefor300days', 'g4300d'),
add('MenschunsererZeitGerman', 'http://muz.comicgenesis.com/') cls('IBlameDanny', 'vileterror'),
add('MoonCrest24', 'http://mooncrest.comicgenesis.com/d/20121117.html') cls('ImpendingDoom', 'impending'),
add('Mushian', 'http://tentoumushi.comicgenesis.com/') cls('InANutshell', 'nutshellcomics'),
add('NightwolfCentral', 'http://nightwolfcentral.comicgenesis.com/') cls('KernyMantisComics', 'kernymantis'),
add('NoTimeForLife', 'http://randyraven.comicgenesis.com/') cls('KitsuneJewel', 'kitsunejewel'),
add('NoneMoreComic', 'http://nonemore.comicgenesis.com/') cls('KittyCattyGames', 'kittycattygames'),
add('ODCKS', 'http://odcks.comicgenesis.com/') cls('KiwiDayN', 'kiwidayn'),
add('OfDoom', 'http://ofdoom.comicgenesis.com/') cls('KungFounded', 'kungfounded'),
add('OpportunityofaLifetime', 'http://carpathia.comicgenesis.com/') cls('LabBratz', 'labbratz'),
add('Orbz', 'http://orbz.comicgenesis.com/') cls('Laserwing', 'laserwing'),
add('OwMySanity', 'http://owmysanity.comicgenesis.com/') cls('LumiasKingdom', 'lumia'),
add('PhantomThesis', 'http://phantomthesis.comicgenesis.com/') cls('Majestic7', 'majestic7'),
add('ProfessorSaltinesAstrodynamicDirigible', 'http://drsaltine.comicgenesis.com/') cls('MaximumWhimsy', 'maximumwhimsy'),
add('PsychicDyslexiaInstitute', 'http://pdi.comicgenesis.com/') cls('MenschunsererZeitGerman', 'muz'),
add('PublicidadeEnganosa', 'http://publicidadeenganosa.comicgenesis.com/') cls('MoonCrest24', 'mooncrest', last='20121117'),
add('RandomAxeOfKindness', 'http://randomaxe.comicgenesis.com/') cls('Mushian', 'tentoumushi'),
add('SalemUncommons', 'http://salemuncommons.comicgenesis.com/') cls('NightwolfCentral', 'nightwolfcentral'),
add('SamandElisAdventures', 'http://sameliadv.comicgenesis.com/') cls('NoTimeForLife', 'randyraven'),
add('SarahZero', 'http://plughead.comicgenesis.com/') cls('NoneMoreComic', 'nonemore'),
add('SixByNineCollege', 'http://sixbyninecollege.comicgenesis.com/') cls('ODCKS', 'odcks'),
add('SpoononHighandFireontheMountian', 'http://spoon.comicgenesis.com/') cls('OfDoom', 'ofdoom'),
add('SynapticMisfires', 'http://synapticmisfires.comicgenesis.com/') cls('OpportunityofaLifetime', 'carpathia'),
add('TakingStock', 'http://mapaghimagsik.comicgenesis.com/') cls('Orbz', 'orbz'),
add('TemplarArizona', 'http://templaraz.comicgenesis.com/') cls('OwMySanity', 'owmysanity'),
add('TheAdventuresofKaniraBaxter', 'http://kanirabaxter.comicgenesis.com/') cls('PhantomThesis', 'phantomthesis'),
add('TheAdventuresofVindibuddSuperheroInTraining', 'http://vindibudd.comicgenesis.com/d/20070720.html') cls('ProfessorSaltinesAstrodynamicDirigible', 'drsaltine'),
add('TheEasyBreather', 'http://easybreather.comicgenesis.com/') cls('PsychicDyslexiaInstitute', 'pdi'),
add('TheLounge', 'http://thelounge.comicgenesis.com/') cls('PublicidadeEnganosa', 'publicidadeenganosa'),
add('TheMisadventuresofOkk', 'http://okk.comicgenesis.com/') cls('RandomAxeOfKindness', 'randomaxe'),
add('ThePath', 'http://thepath.comicgenesis.com/') cls('SalemUncommons', 'salemuncommons'),
add('TheTalesofKalduras', 'http://kalduras.comicgenesis.com/') cls('SamandElisAdventures', 'sameliadv'),
add('Unconventional', 'http://unconventional.comicgenesis.com/') cls('SarahZero', 'plughead'),
add('WarMageNC17', 'http://warmage.comicgenesis.com/') cls('SixByNineCollege', 'sixbyninecollege'),
add('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'http://dannormnsanidey.comicgenesis.com/') cls('SpoononHighandFireontheMountian', 'spoon'),
add('WhatYouDontSee', 'http://phantomlady4.comicgenesis.com/') cls('SynapticMisfires', 'synapticmisfires'),
add('Wierdman', 'http://asa.comicgenesis.com/') cls('TakingStock', 'mapaghimagsik'),
cls('TemplarArizona', 'templaraz'),
cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
cls('TheEasyBreather', 'easybreather'),
cls('TheLounge', 'thelounge'),
cls('TheMisadventuresofOkk', 'okk'),
cls('ThePath', 'thepath'),
cls('TheTalesofKalduras', 'kalduras'),
cls('Unconventional', 'unconventional'),
cls('WarMageNC17', 'warmage'),
cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'),
cls('WhatYouDontSee', 'phantomlady4'),
cls('Wierdman', 'asa'),
]

View file

@ -15,9 +15,8 @@ class _Creators(_ParserScraper):
latestSearch = '//div[contains(@class,"caption")]/a' latestSearch = '//div[contains(@class,"caption")]/a'
starter = indirectStarter starter = indirectStarter
@property def __init__(self, name):
def name(self): super(_Creators, self).__init__('Creators/' + name)
return 'Creators/' + super(_Creators, self).name
@property @property
def url(self): def url(self):

View file

@ -18,9 +18,8 @@ class _GoComics(_ParserScraper):
starter = bounceStarter starter = bounceStarter
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
@property def __init__(self, name):
def name(self): super(_GoComics, self).__init__('GoComics/' + name[2:])
return 'GoComics/' + super(_GoComics, self).name[2:]
@property @property
def url(self): def url(self):

View file

@ -1,79 +1,79 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')) class KeenSpot(_BasicScraper):
_stripPattern = r'([^"]*/d/\d{8}\.html)' imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
_prevSearch = ( _stripPattern = r'([^"]*/d/\d{8}\.html)'
prevSearch = (
compile(tagre("link", "href", _stripPattern, before="prev")), compile(tagre("link", "href", _stripPattern, before="prev")),
compile(tagre("a", "href", _stripPattern, after="prev")), compile(tagre("a", "href", _stripPattern, after="prev")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")), compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")), compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
)
def add(name, url):
classname = 'KeenSpot_%s' % name
if '/d/' in url:
stripUrl = url.split('/d/')[0] + '/d/%s.html'
else:
stripUrl = url + 'd/%s.html'
globals()[classname] = make_scraper(classname,
name='KeenSpot/' + name,
url=url,
stripUrl=stripUrl,
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: yyyymmdd',
) )
help = 'Index format: yyyymmdd'
# do not edit anything below since these entries are generated from scripts/update.sh def __init__(self, name, sub):
# DO NOT REMOVE super(KeenSpot, self).__init__('KeenSpot/' + name)
add('27TwentySeven', 'http://twenty-seven.keenspot.com/') self.url = 'http://%s.keenspot.com/' % sub
add('Adventurers', 'http://adventurers.keenspot.com/') self.stripUrl = self.url + 'd/%s.html'
add('AntiheroForHire', 'http://antihero.keenspot.com/')
add('BanzaiGirl', 'http://banzaigirl.keenspot.com/') @classmethod
add('Barker', 'http://barkercomic.keenspot.com/') def getmodules(cls):
add('Buzzboy', 'http://buzzboy.keenspot.com/') return [
add('ChoppingBlock', 'http://choppingblock.keenspot.com/') # do not edit anything below since these entries are generated from
add('ClichFlamb', 'http://clicheflambe.keenspot.com/') # scripts/update_plugins.sh
add('CountYourSheep', 'http://countyoursheep.keenspot.com/') # DO NOT REMOVE
add('EverythingJake', 'http://everythingjake.keenspot.com/') cls('27TwentySeven', 'twenty-seven'),
add('FallOutToyWorks', 'http://fallouttoyworks.keenspot.com/') cls('Adventurers', 'adventurers'),
add('FriarAndBrimstone', 'http://friarandbrimstone.keenspot.com/') cls('AntiheroForHire', 'antihero'),
add('GeneCatlow', 'http://genecatlow.keenspot.com/') cls('BanzaiGirl', 'banzaigirl'),
add('GodMode', 'http://godmode.keenspot.com/') cls('Barker', 'barkercomic'),
add('GreenWake', 'http://greenwake.keenspot.com/') cls('Buzzboy', 'buzzboy'),
add('HeadTrip', 'http://headtrip.keenspot.com/') cls('ChoppingBlock', 'choppingblock'),
add('HoaxHunters', 'http://hoaxhunters.keenspot.com/') cls('ClichFlamb', 'clicheflambe'),
add('InHere', 'http://inhere.keenspot.com/') cls('CountYourSheep', 'countyoursheep'),
add('Katrina', 'http://katrina.keenspot.com/') cls('EverythingJake', 'everythingjake'),
add('Landis', 'http://landis.keenspot.com/') cls('FallOutToyWorks', 'fallouttoyworks'),
add('MakeshiftMiracle', 'http://makeshiftmiracle.keenspot.com/') cls('FriarAndBrimstone', 'friarandbrimstone'),
add('Marksmen', 'http://marksmen.keenspot.com/') cls('GeneCatlow', 'genecatlow'),
add('MarryMe', 'http://marryme.keenspot.com/') cls('GodMode', 'godmode'),
add('MedusasDaughter', 'http://medusasdaughter.keenspot.com/') cls('GreenWake', 'greenwake'),
add('MonsterMassacre', 'http://monstermassacre.keenspot.com/') cls('HeadTrip', 'headtrip'),
add('Newshounds', 'http://newshounds.keenspot.com/') cls('HoaxHunters', 'hoaxhunters'),
add('NoPinkPonies', 'http://nopinkponies.keenspot.com/') cls('InHere', 'inhere'),
add('OutThere', 'http://outthere.keenspot.com/') cls('Katrina', 'katrina'),
add('Porcelain', 'http://porcelain.keenspot.com/') cls('Landis', 'landis'),
add('QUILTBAG', 'http://quiltbag.keenspot.com/') cls('MakeshiftMiracle', 'makeshiftmiracle'),
add('RedSpike', 'http://redspike.keenspot.com/') cls('Marksmen', 'marksmen'),
add('RumbleFall', 'http://rumblefall.keenspot.com/') cls('MarryMe', 'marryme'),
add('SamuraisBlood', 'http://samuraisblood.keenspot.com/') cls('MedusasDaughter', 'medusasdaughter'),
add('Sharky', 'http://sharky.keenspot.com/') cls('MonsterMassacre', 'monstermassacre'),
add('SomethingHappens', 'http://somethinghappens.keenspot.com/') cls('Newshounds', 'newshounds'),
add('SoreThumbs', 'http://sorethumbs.keenspot.com/') cls('NoPinkPonies', 'nopinkponies'),
add('Striptease', 'http://striptease.keenspot.com/') cls('OutThere', 'outthere'),
add('Superosity', 'http://superosity.keenspot.com/') cls('Porcelain', 'porcelain'),
add('TheFirstDaughter', 'http://thefirstdaughter.keenspot.com/') cls('QUILTBAG', 'quiltbag'),
add('TheGodChild', 'http://godchild.keenspot.com/') cls('RedSpike', 'redspike'),
add('TheHuntersofSalamanstra', 'http://salamanstra.keenspot.com/') cls('RumbleFall', 'rumblefall'),
add('TheLounge', 'http://thelounge.keenspot.com/') cls('SamuraisBlood', 'samuraisblood'),
add('WICKEDPOWERED', 'http://wickedpowered.keenspot.com/') cls('Sharky', 'sharky'),
cls('SomethingHappens', 'somethinghappens'),
cls('SoreThumbs', 'sorethumbs'),
cls('Striptease', 'striptease'),
cls('Superosity', 'superosity'),
cls('TheFirstDaughter', 'thefirstdaughter'),
cls('TheGodChild', 'godchild'),
cls('TheHuntersofSalamanstra', 'salamanstra'),
cls('TheLounge', 'thelounge'),
cls('WICKEDPOWERED', 'wickedpowered'),
]

View file

@ -1,19 +1,27 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from dosagelib.helpers import indirectStarter # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
from ..scraper import make_scraper, _ParserScraper # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper
def add(name, url): class KindOfNormal(_ParserScraper):
attrs = dict( imageSearch = '//article[1]//div[@class="box-content"]//img'
name=name, prevSearch = '//a[@class="prev"]'
url='http://kindofnormal.com/' + url,
imageSearch='//article[1]//div[@class="box-content"]//img',
prevSearch='//a[@class="prev"]'
)
globals()[name] = make_scraper(name, _ParserScraper, **attrs)
def __init__(self, name, url):
super(KindOfNormal, self).__init__(name)
self.url = 'http://kindofnormal.com/' + url
add('MeAndDanielle', 'meanddanielle') @classmethod
add('TruthFacts', 'truthfacts') def getmodules(cls):
add('Wumo', 'wumo') return [
add('Wulffmorgenthaler', 'wumo') # name in previous versions cls('MeAndDanielle', 'meanddanielle'),
cls('TruthFacts', 'truthfacts'),
cls('Wumo', 'wumo'),
# name in previous versions
cls('Wulffmorgenthaler', 'wumo'),
]

View file

@ -1,7 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher # Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
@ -10,14 +12,13 @@ class _NuklearPower(_ParserScraper):
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
imageSearch = '//div[@id="comic"]/img' imageSearch = '//div[@id="comic"]/img'
def __init__(self, name):
super(_NuklearPower, self).__init__('NuklearPower/' + name[2:])
@property @property
def url(self): def url(self):
return 'http://www.nuklearpower.com/' + self.path + '/' return 'http://www.nuklearpower.com/' + self.path + '/'
@property
def name(self):
return 'NuklearPower/' + super(_NuklearPower, self).name[2:]
class NP8BitTheater(_NuklearPower): class NP8BitTheater(_NuklearPower):
path = '8-bit-theater' path = '8-bit-theater'

View file

@ -1,33 +1,50 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
from .common import _WordPressScraper
_imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
def add(name):
classname = 'PetiteSymphony_%s' % name.capitalize()
url = 'http://%s.petitesymphony.com/' % name
globals()[classname] = make_scraper(classname,
name='PetiteSymphony/' + name.capitalize(),
url = url,
stripUrl = url + 'comic/%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
multipleImagesPerStrip = True,
help='Index format: named number'
)
add("djandora") class PetiteSymphony(_BasicScraper):
add("generation17") imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
add("knuckleup") prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
add("kickinrad") multipleImagesPerStrip = True
add("orangegrind") help = 'Index format: named number'
add("rascals")
add("sangria") def __init__(self, name):
add("seed") super(PetiteSymphony, self).__init__('PetiteSymphony/' +
name.capitalize())
self.url = 'http://%s.petitesymphony.com/' % name
self.stripUrl = self.url + 'comic/%s'
@classmethod
def getmodules(cls):
return [
cls("knuckleup"),
cls("kickinrad"),
cls("orangegrind"),
cls("rascals"),
cls("sangria"),
cls("seed"),
]
class ComicsBreak(_WordPressScraper):
def __init__(self, name):
super(ComicsBreak, self).__init__('ComicsBreak/' + name)
self.url = 'http://%s.comicsbreak.com/' % name.lower()
@classmethod
def getmodules(cls):
return [
cls("Djandora"),
cls("Generation17"),
]

View file

@ -44,9 +44,8 @@ class _SmackJeeves(_ParserScraper):
broken_html_bugfix = True broken_html_bugfix = True
@property def __init__(self, name):
def name(self): super(_SmackJeeves, self).__init__('SmackJeeves/' + name[2:])
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
@property @property
def url(self): def url(self):

View file

@ -16,9 +16,8 @@ class _Snafu(_ParserScraper):
latestSearch = '//div[@id="feed"]/a' latestSearch = '//div[@id="feed"]/a'
starter = indirectStarter starter = indirectStarter
@property def __init__(self, name):
def name(self): super(_Snafu, self).__init__('SnafuComics/' + name)
return 'SnafuComics/' + super(_Snafu, self).name
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
year, month, name = image_url.rsplit('/', 3)[1:] year, month, name = image_url.rsplit('/', 3)[1:]

View file

@ -13,9 +13,8 @@ class _WebcomicEu(_ParserScraper):
prevSearch = '//a[img[contains(@src, "navi-zurueck")]]' prevSearch = '//a[img[contains(@src, "navi-zurueck")]]'
help = 'Index format: number' help = 'Index format: number'
@property def __init__(self, name):
def name(self): super(_WebcomicEu, self).__init__('WebcomicEu/' + name)
return 'WebcomicEu/' + super(_WebcomicEu, self).name
@property @property
def url(self): def url(self):

View file

@ -16,9 +16,8 @@ class _WLPComics(_ParserScraper):
starter = bounceStarter starter = bounceStarter
help = 'Index format: nnn' help = 'Index format: nnn'
@property def __init__(self, name):
def name(self): super(_WLPComics, self).__init__('WLP/' + name)
return 'WLP/' + super(_WLPComics, self).name
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
return (page_url.rsplit('/', 1)[-1].split('.')[0] + '_' + return (page_url.rsplit('/', 1)[-1].split('.')[0] + '_' +

View file

@ -82,6 +82,10 @@ class Scraper(object):
# HTTP session for configuration & cookies # HTTP session for configuration & cookies
session = requests_session() session = requests_session()
@classmethod
def getmodules(cls):
return [cls(cls.__name__)]
@property @property
def indexes(self): def indexes(self):
return self._indexes return self._indexes
@ -91,8 +95,9 @@ class Scraper(object):
if val: if val:
self._indexes = tuple(sorted(val)) self._indexes = tuple(sorted(val))
def __init__(self): def __init__(self, name):
"""Initialize internal variables.""" """Initialize internal variables."""
self.name = name
self.urls = set() self.urls = set()
self._indexes = tuple() self._indexes = tuple()
self.skippedUrls = set() self.skippedUrls = set()
@ -222,11 +227,6 @@ class Scraper(object):
"""Get comic strip URL from index.""" """Get comic strip URL from index."""
return self.stripUrl % index return self.stripUrl % index
@property
def name(self):
"""Get scraper name."""
return self.__class__.__name__
def starter(self): def starter(self):
"""Get starter URL from where to scrape comic strips.""" """Get starter URL from where to scrape comic strips."""
return self.url return self.url
@ -563,10 +563,12 @@ def get_scrapers():
if _scrapers is None: if _scrapers is None:
out.debug(u"Loading comic modules...") out.debug(u"Loading comic modules...")
modules = loader.get_modules('plugins') modules = loader.get_modules('plugins')
plugins = loader.get_plugins(modules, Scraper) plugins = list(loader.get_plugins(modules, Scraper))
_scrapers = sorted([x() for x in plugins], key=lambda p: p.name) _scrapers = sorted([m for x in plugins for m in x.getmodules()],
key=lambda p: p.name)
check_scrapers() check_scrapers()
out.debug(u"... %d modules loaded." % len(_scrapers)) out.debug(u"... %d modules loaded from %d classes." % (
len(_scrapers), len(plugins)))
return _scrapers return _scrapers

View file

@ -1,16 +1,19 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2016 Tobias Gruetzmacher # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from dosagelib import scraper from dosagelib import scraper
class ATestScraper(scraper._BasicScraper): class ATestScraper(scraper._BasicScraper):
name = 'Test_Test' pass
class TestVote(object): class TestVote(object):
def test_vote(self): def test_vote(self):
answer = ATestScraper().vote() answer = ATestScraper('Test_Test').vote()
assert answer in ('counted', 'no'), 'invalid answer %r' % answer assert answer in ('counted', 'no'), 'invalid answer %r' % answer