Refactor: Introduce generator methods for scrapers

This allows one comic module class to generate multiple scrapers. This
change is to support a more dynamic module system as described in #42.
This commit is contained in:
Tobias Gruetzmacher 2016-05-21 01:18:42 +02:00
parent 89cfd9d310
commit 51008a975b
16 changed files with 322 additions and 298 deletions

View file

@@ -12,14 +12,13 @@ class _Arcamax(_ParserScraper):
imageSearch = '//img[@id="comic-zoom"]'
prevSearch = '//a[@class="prev"]'
# Build the scraper name by prefixing the "Arcamax/" module namespace
# before delegating to the base-class constructor.
def __init__(self, name):
super(_Arcamax, self).__init__('Arcamax/' + name)
@property
def url(self):
# Landing page of the comic; NOTE(review): self.path is presumably
# set by concrete subclasses -- confirm against the rest of the file.
return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
@property
def name(self):
return 'Arcamax/' + super(_Arcamax, self).name
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh

View file

@@ -7,58 +7,55 @@ from __future__ import absolute_import, division, print_function
from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre, getQueryParams
_linkTag = tagre("a", "href", r'([^"]+)')
_prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
_nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
_lastSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
class CloneManga(_BasicScraper):
"""Scraper for comics hosted on manga.clone-army.org."""
# All navigation links are plain <a href> tags wrapping a nav image.
_linkTag = tagre("a", "href", r'([^"]+)')
prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
latestSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
help = 'Index format: n'
def __init__(self, name, shortName, imageFolder=None, lastStrip=None):
super(CloneManga, self).__init__('CloneManga/' + name)
def add(name, shortName, imageFolder=None, lastStrip=None):
classname = 'CloneManga_%s' % name
_url = 'http://manga.clone-army.org'
baseUrl = '%s/%s.php' % (_url, shortName)
if imageFolder is None:
imageFolder = shortName
_url = 'http://manga.clone-army.org'
self.url = '%s/%s.php' % (_url, shortName)
if imageFolder is None:
imageFolder = shortName
self.stripUrl = self.url + '?page=%s'
self.imageSearch = compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center"))
if lastStrip is None:
self.starter = self._starter
else:
self.url = self.stripUrl % lastStrip
def namer(self, image_url, page_url):
return '%03d' % int(getQueryParams(page_url)['page'][0])
def _starter(self):
# first, try hopping to previous and next comic
data = self.getPage(baseUrl)
data = self.getPage(self.url)
try:
url = self.fetchUrl(baseUrl, data, _prevSearch)
url = self.fetchUrl(self.url, data, self.prevSearch)
except ValueError:
# no previous link found, try hopping to last comic
return self.fetchUrl(baseUrl, data, _lastSearch)
return self.fetchUrl(self.url, data, self.latestSearch)
else:
data = self.getPage(url)
return self.fetchUrl(url, data, _nextSearch)
return self.fetchUrl(url, data, self.nextSearch)
attrs = dict(
name='CloneManga/' + name,
stripUrl=baseUrl + '?page=%s',
imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center")),
prevSearch=_prevSearch,
help='Index format: n',
namer=namer,
url=baseUrl,
)
if lastStrip is None:
attrs['starter'] = _starter
else:
attrs['url'] = attrs['stripUrl'] % lastStrip
globals()[classname] = make_scraper(classname, **attrs)
add('AprilAndMay', 'anm', imageFolder='AAM')
add('Kanami', 'kanami')
add('MomokaCorner', 'momoka')
add('NanasEverydayLife', 'nana', lastStrip='78')
add('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311')
add('Tomoyo42sRoom', 't42r')
add('PennyTribute', 'penny')
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
cls('AprilAndMay', 'anm', imageFolder='AAM'),
cls('Kanami', 'kanami'),
cls('MomokaCorner', 'momoka'),
cls('NanasEverydayLife', 'nana', lastStrip='78'),
cls('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311'),
cls('Tomoyo42sRoom', 't42r'),
cls('PennyTribute', 'penny'),
]

View file

@@ -22,6 +22,9 @@ class _ComicFury(_ParserScraper):
help = 'Index format: n'
starter = bounceStarter
# name[2:] strips a two-character class-name prefix before adding
# the 'ComicFury/' module namespace.
def __init__(self, name):
super(_ComicFury, self).__init__('ComicFury/' + name[2:])
def namer(self, image_url, page_url):
parts = page_url.split('/')
path, ext = os.path.splitext(image_url)
@@ -32,10 +35,6 @@ class _ComicFury(_ParserScraper):
def url(self):
return 'http://%s.webcomic.ws/comics/' % self.sub
@property
def name(self):
return 'ComicFury/' + super(_ComicFury, self).name[2:]
def getIndexStripUrl(self, index):
"""Build the URL of a single strip from its index."""
return self.url + 'comics/%s' % index

View file

@@ -6,26 +6,25 @@
from __future__ import absolute_import, division, print_function
from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:Previous comic' + '|' +
tagre("img", "alt", "Previous comic") + '|' +
tagre("img", "src", "images/back\.gif") +
')')
# Comicgenesis has a lot of comics, but most of them are disallowed by
# robots.txt
def add(name, url):
classname = 'ComicGenesis_%s' % name
if '/d/' in url:
stripUrl = url.split('/d/')[0] + '/d/%s.html'
else:
stripUrl = url + 'd/%s.html'
class ComicGenesis(_BasicScraper):
"""Scraper for comics hosted on comicgenesis.com."""
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
# Several historic navigation markups are matched as alternatives.
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:Previous comic' + '|' +
tagre("img", "alt", "Previous comic") + '|' +
tagre("img", "src", "images/back\.gif") +
')')
multipleImagesPerStrip = True
help = 'Index format: yyyymmdd'
def _prevUrlModifier(self, prev_url):
def prevUrlModifier(self, prev_url):
if prev_url:
return prev_url.replace(
"keenspace.com", "comicgenesis.com").replace(
@@ -33,95 +32,100 @@ def add(name, url):
"toonspace.com", "comicgenesis.com").replace(
"comicgen.com", "comicgenesis.com")
globals()[classname] = make_scraper(
classname,
name='ComicGenesis/' + name,
url=url,
stripUrl=stripUrl,
imageSearch=_imageSearch,
prevSearch=_prevSearch,
prevUrlModifier=_prevUrlModifier,
multipleImagesPerStrip=True,
help='Index format: yyyymmdd',
)
def __init__(self, name, sub=None, last=None, baseUrl=None):
super(ComicGenesis, self).__init__('ComicGenesis/' + name)
# Comicgenesis has a lot of comics, but most of them are disallowed by robots.txt
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
add('AAAAA', 'http://aaaaa.comicgenesis.com/')
add('AdventuresofKiltman', 'http://kiltman.comicgenesis.com/')
add('AmorModerno', 'http://amormoderno.comicgenesis.com/')
add('AnythingButRealLife', 'http://anythingbutreallife.comicgenesis.com/')
add('Ardra', 'http://ardra.comicgenesis.com/')
add('Artwork', 'http://artwork.comicgenesis.com/')
add('BabeintheWoods', 'http://babeinthewoods.comicgenesis.com/')
add('BackwaterPlanet', 'http://bobthespirit.comicgenesis.com/')
add('BendyStrawVampires', 'http://bsvampires.comicgenesis.com/')
add('BlindSight', 'http://blindsight.comicgenesis.com/')
add('BreakingtheDoldrum', 'http://breakingthedoldrum.comicgenesis.com/')
add('Candi', 'http://candicomics.com/')
add('CorporateLife', 'http://corporatelife.comicgenesis.com/')
add('DarkWelkin', 'http://darkwelkin.comicgenesis.com/')
add('DemonEater', 'http://demoneater.comicgenesis.com/')
add('DoodleDiaries', 'http://doodlediaries.comicgenesis.com/')
add('DormSweetDorm', 'http://dormsweetdorm.comicgenesis.com/')
add('DoubleyouTeeEff', 'http://doubleyouteeeff.comicgenesis.com/')
add('DragonsBane', 'http://jasonwhitewaterz.comicgenesis.com/')
add('Dreamaniac', 'http://dreamaniaccomic.comicgenesis.com/')
add('ElnifiChronicles', 'http://elnifichronicles.comicgenesis.com/')
add('EvesApple', 'http://evesapple.comicgenesis.com/')
add('FancyThat', 'http://fancythat.comicgenesis.com/')
add('FantasyQwest', 'http://creatorauthorman.comicgenesis.com/')
add('Fantazine', 'http://fantazin.comicgenesis.com/')
add('Flounderville', 'http://flounderville.comicgenesis.com/')
add('GEM', 'http://keltzy.comicgenesis.com/')
add('Gonefor300days', 'http://g4300d.comicgenesis.com/')
add('IBlameDanny', 'http://vileterror.comicgenesis.com/')
add('ImpendingDoom', 'http://impending.comicgenesis.com/')
add('InANutshell', 'http://nutshellcomics.comicgenesis.com/')
add('KernyMantisComics', 'http://kernymantis.comicgenesis.com/')
add('KitsuneJewel', 'http://kitsunejewel.comicgenesis.com/')
add('KittyCattyGames', 'http://kittycattygames.comicgenesis.com/')
add('KiwiDayN', 'http://kiwidayn.comicgenesis.com/')
add('KungFounded', 'http://kungfounded.comicgenesis.com/')
add('LabBratz', 'http://labbratz.comicgenesis.com/')
add('Laserwing', 'http://laserwing.comicgenesis.com/')
add('LumiasKingdom', 'http://lumia.comicgenesis.com/')
add('Majestic7', 'http://majestic7.comicgenesis.com/')
add('MaximumWhimsy', 'http://maximumwhimsy.comicgenesis.com/')
add('MenschunsererZeitGerman', 'http://muz.comicgenesis.com/')
add('MoonCrest24', 'http://mooncrest.comicgenesis.com/d/20121117.html')
add('Mushian', 'http://tentoumushi.comicgenesis.com/')
add('NightwolfCentral', 'http://nightwolfcentral.comicgenesis.com/')
add('NoTimeForLife', 'http://randyraven.comicgenesis.com/')
add('NoneMoreComic', 'http://nonemore.comicgenesis.com/')
add('ODCKS', 'http://odcks.comicgenesis.com/')
add('OfDoom', 'http://ofdoom.comicgenesis.com/')
add('OpportunityofaLifetime', 'http://carpathia.comicgenesis.com/')
add('Orbz', 'http://orbz.comicgenesis.com/')
add('OwMySanity', 'http://owmysanity.comicgenesis.com/')
add('PhantomThesis', 'http://phantomthesis.comicgenesis.com/')
add('ProfessorSaltinesAstrodynamicDirigible', 'http://drsaltine.comicgenesis.com/')
add('PsychicDyslexiaInstitute', 'http://pdi.comicgenesis.com/')
add('PublicidadeEnganosa', 'http://publicidadeenganosa.comicgenesis.com/')
add('RandomAxeOfKindness', 'http://randomaxe.comicgenesis.com/')
add('SalemUncommons', 'http://salemuncommons.comicgenesis.com/')
add('SamandElisAdventures', 'http://sameliadv.comicgenesis.com/')
add('SarahZero', 'http://plughead.comicgenesis.com/')
add('SixByNineCollege', 'http://sixbyninecollege.comicgenesis.com/')
add('SpoononHighandFireontheMountian', 'http://spoon.comicgenesis.com/')
add('SynapticMisfires', 'http://synapticmisfires.comicgenesis.com/')
add('TakingStock', 'http://mapaghimagsik.comicgenesis.com/')
add('TemplarArizona', 'http://templaraz.comicgenesis.com/')
add('TheAdventuresofKaniraBaxter', 'http://kanirabaxter.comicgenesis.com/')
add('TheAdventuresofVindibuddSuperheroInTraining', 'http://vindibudd.comicgenesis.com/d/20070720.html')
add('TheEasyBreather', 'http://easybreather.comicgenesis.com/')
add('TheLounge', 'http://thelounge.comicgenesis.com/')
add('TheMisadventuresofOkk', 'http://okk.comicgenesis.com/')
add('ThePath', 'http://thepath.comicgenesis.com/')
add('TheTalesofKalduras', 'http://kalduras.comicgenesis.com/')
add('Unconventional', 'http://unconventional.comicgenesis.com/')
add('WarMageNC17', 'http://warmage.comicgenesis.com/')
add('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'http://dannormnsanidey.comicgenesis.com/')
add('WhatYouDontSee', 'http://phantomlady4.comicgenesis.com/')
add('Wierdman', 'http://asa.comicgenesis.com/')
if sub:
baseUrl = 'http://%s.comicgenesis.com/' % sub
self.stripUrl = baseUrl + 'd/%s.html'
if last:
self.url = self.stripUrl % last
self.endOfLife = True
else:
self.url = baseUrl
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh
# DO NOT REMOVE
cls('AAAAA', 'aaaaa'),
cls('AdventuresofKiltman', 'kiltman'),
cls('AmorModerno', 'amormoderno'),
cls('AnythingButRealLife', 'anythingbutreallife'),
cls('Ardra', 'ardra'),
cls('Artwork', 'artwork'),
cls('BabeintheWoods', 'babeinthewoods'),
cls('BackwaterPlanet', 'bobthespirit'),
cls('BendyStrawVampires', 'bsvampires'),
cls('BlindSight', 'blindsight'),
cls('BreakingtheDoldrum', 'breakingthedoldrum'),
cls('Candi', baseUrl='http://candicomics.com/'),
cls('CorporateLife', 'corporatelife'),
cls('DarkWelkin', 'darkwelkin'),
cls('DemonEater', 'demoneater'),
cls('DoodleDiaries', 'doodlediaries'),
cls('DormSweetDorm', 'dormsweetdorm'),
cls('DoubleyouTeeEff', 'doubleyouteeeff'),
cls('DragonsBane', 'jasonwhitewaterz'),
cls('Dreamaniac', 'dreamaniaccomic'),
cls('ElnifiChronicles', 'elnifichronicles'),
cls('EvesApple', 'evesapple'),
cls('FancyThat', 'fancythat'),
cls('FantasyQwest', 'creatorauthorman'),
cls('Fantazine', 'fantazin'),
cls('Flounderville', 'flounderville'),
cls('GEM', 'keltzy'),
cls('Gonefor300days', 'g4300d'),
cls('IBlameDanny', 'vileterror'),
cls('ImpendingDoom', 'impending'),
cls('InANutshell', 'nutshellcomics'),
cls('KernyMantisComics', 'kernymantis'),
cls('KitsuneJewel', 'kitsunejewel'),
cls('KittyCattyGames', 'kittycattygames'),
cls('KiwiDayN', 'kiwidayn'),
cls('KungFounded', 'kungfounded'),
cls('LabBratz', 'labbratz'),
cls('Laserwing', 'laserwing'),
cls('LumiasKingdom', 'lumia'),
cls('Majestic7', 'majestic7'),
cls('MaximumWhimsy', 'maximumwhimsy'),
cls('MenschunsererZeitGerman', 'muz'),
cls('MoonCrest24', 'mooncrest', last='20121117'),
cls('Mushian', 'tentoumushi'),
cls('NightwolfCentral', 'nightwolfcentral'),
cls('NoTimeForLife', 'randyraven'),
cls('NoneMoreComic', 'nonemore'),
cls('ODCKS', 'odcks'),
cls('OfDoom', 'ofdoom'),
cls('OpportunityofaLifetime', 'carpathia'),
cls('Orbz', 'orbz'),
cls('OwMySanity', 'owmysanity'),
cls('PhantomThesis', 'phantomthesis'),
cls('ProfessorSaltinesAstrodynamicDirigible', 'drsaltine'),
cls('PsychicDyslexiaInstitute', 'pdi'),
cls('PublicidadeEnganosa', 'publicidadeenganosa'),
cls('RandomAxeOfKindness', 'randomaxe'),
cls('SalemUncommons', 'salemuncommons'),
cls('SamandElisAdventures', 'sameliadv'),
cls('SarahZero', 'plughead'),
cls('SixByNineCollege', 'sixbyninecollege'),
cls('SpoononHighandFireontheMountian', 'spoon'),
cls('SynapticMisfires', 'synapticmisfires'),
cls('TakingStock', 'mapaghimagsik'),
cls('TemplarArizona', 'templaraz'),
cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
cls('TheEasyBreather', 'easybreather'),
cls('TheLounge', 'thelounge'),
cls('TheMisadventuresofOkk', 'okk'),
cls('ThePath', 'thepath'),
cls('TheTalesofKalduras', 'kalduras'),
cls('Unconventional', 'unconventional'),
cls('WarMageNC17', 'warmage'),
cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'),
cls('WhatYouDontSee', 'phantomlady4'),
cls('Wierdman', 'asa'),
]

View file

@@ -15,9 +15,8 @@ class _Creators(_ParserScraper):
latestSearch = '//div[contains(@class,"caption")]/a'
starter = indirectStarter
@property
def name(self):
return 'Creators/' + super(_Creators, self).name
# Prefix the scraper name with the 'Creators/' module namespace.
def __init__(self, name):
super(_Creators, self).__init__('Creators/' + name)
@property
def url(self):

View file

@@ -18,9 +18,8 @@ class _GoComics(_ParserScraper):
starter = bounceStarter
help = 'Index format: yyyy/mm/dd'
@property
def name(self):
return 'GoComics/' + super(_GoComics, self).name[2:]
# name[2:] strips a two-character class-name prefix before adding
# the 'GoComics/' module namespace.
def __init__(self, name):
super(_GoComics, self).__init__('GoComics/' + name[2:])
@property
def url(self):

View file

@@ -1,79 +1,79 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
_stripPattern = r'([^"]*/d/\d{8}\.html)'
_prevSearch = (
compile(tagre("link", "href", _stripPattern, before="prev")),
compile(tagre("a", "href", _stripPattern, after="prev")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
)
def add(name, url):
classname = 'KeenSpot_%s' % name
if '/d/' in url:
stripUrl = url.split('/d/')[0] + '/d/%s.html'
else:
stripUrl = url + 'd/%s.html'
globals()[classname] = make_scraper(classname,
name='KeenSpot/' + name,
url=url,
stripUrl=stripUrl,
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: yyyymmdd',
class KeenSpot(_BasicScraper):
"""Scraper for comics hosted on keenspot.com."""
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
_stripPattern = r'([^"]*/d/\d{8}\.html)'
# Alternative matchers for the different navigation markups in use.
prevSearch = (
compile(tagre("link", "href", _stripPattern, before="prev")),
compile(tagre("a", "href", _stripPattern, after="prev")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
)
help = 'Index format: yyyymmdd'
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
add('27TwentySeven', 'http://twenty-seven.keenspot.com/')
add('Adventurers', 'http://adventurers.keenspot.com/')
add('AntiheroForHire', 'http://antihero.keenspot.com/')
add('BanzaiGirl', 'http://banzaigirl.keenspot.com/')
add('Barker', 'http://barkercomic.keenspot.com/')
add('Buzzboy', 'http://buzzboy.keenspot.com/')
add('ChoppingBlock', 'http://choppingblock.keenspot.com/')
add('ClichFlamb', 'http://clicheflambe.keenspot.com/')
add('CountYourSheep', 'http://countyoursheep.keenspot.com/')
add('EverythingJake', 'http://everythingjake.keenspot.com/')
add('FallOutToyWorks', 'http://fallouttoyworks.keenspot.com/')
add('FriarAndBrimstone', 'http://friarandbrimstone.keenspot.com/')
add('GeneCatlow', 'http://genecatlow.keenspot.com/')
add('GodMode', 'http://godmode.keenspot.com/')
add('GreenWake', 'http://greenwake.keenspot.com/')
add('HeadTrip', 'http://headtrip.keenspot.com/')
add('HoaxHunters', 'http://hoaxhunters.keenspot.com/')
add('InHere', 'http://inhere.keenspot.com/')
add('Katrina', 'http://katrina.keenspot.com/')
add('Landis', 'http://landis.keenspot.com/')
add('MakeshiftMiracle', 'http://makeshiftmiracle.keenspot.com/')
add('Marksmen', 'http://marksmen.keenspot.com/')
add('MarryMe', 'http://marryme.keenspot.com/')
add('MedusasDaughter', 'http://medusasdaughter.keenspot.com/')
add('MonsterMassacre', 'http://monstermassacre.keenspot.com/')
add('Newshounds', 'http://newshounds.keenspot.com/')
add('NoPinkPonies', 'http://nopinkponies.keenspot.com/')
add('OutThere', 'http://outthere.keenspot.com/')
add('Porcelain', 'http://porcelain.keenspot.com/')
add('QUILTBAG', 'http://quiltbag.keenspot.com/')
add('RedSpike', 'http://redspike.keenspot.com/')
add('RumbleFall', 'http://rumblefall.keenspot.com/')
add('SamuraisBlood', 'http://samuraisblood.keenspot.com/')
add('Sharky', 'http://sharky.keenspot.com/')
add('SomethingHappens', 'http://somethinghappens.keenspot.com/')
add('SoreThumbs', 'http://sorethumbs.keenspot.com/')
add('Striptease', 'http://striptease.keenspot.com/')
add('Superosity', 'http://superosity.keenspot.com/')
add('TheFirstDaughter', 'http://thefirstdaughter.keenspot.com/')
add('TheGodChild', 'http://godchild.keenspot.com/')
add('TheHuntersofSalamanstra', 'http://salamanstra.keenspot.com/')
add('TheLounge', 'http://thelounge.keenspot.com/')
add('WICKEDPOWERED', 'http://wickedpowered.keenspot.com/')
# "sub" is interpolated as the keenspot.com subdomain hosting the comic.
def __init__(self, name, sub):
super(KeenSpot, self).__init__('KeenSpot/' + name)
self.url = 'http://%s.keenspot.com/' % sub
self.stripUrl = self.url + 'd/%s.html'
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh
# DO NOT REMOVE
cls('27TwentySeven', 'twenty-seven'),
cls('Adventurers', 'adventurers'),
cls('AntiheroForHire', 'antihero'),
cls('BanzaiGirl', 'banzaigirl'),
cls('Barker', 'barkercomic'),
cls('Buzzboy', 'buzzboy'),
cls('ChoppingBlock', 'choppingblock'),
cls('ClichFlamb', 'clicheflambe'),
cls('CountYourSheep', 'countyoursheep'),
cls('EverythingJake', 'everythingjake'),
cls('FallOutToyWorks', 'fallouttoyworks'),
cls('FriarAndBrimstone', 'friarandbrimstone'),
cls('GeneCatlow', 'genecatlow'),
cls('GodMode', 'godmode'),
cls('GreenWake', 'greenwake'),
cls('HeadTrip', 'headtrip'),
cls('HoaxHunters', 'hoaxhunters'),
cls('InHere', 'inhere'),
cls('Katrina', 'katrina'),
cls('Landis', 'landis'),
cls('MakeshiftMiracle', 'makeshiftmiracle'),
cls('Marksmen', 'marksmen'),
cls('MarryMe', 'marryme'),
cls('MedusasDaughter', 'medusasdaughter'),
cls('MonsterMassacre', 'monstermassacre'),
cls('Newshounds', 'newshounds'),
cls('NoPinkPonies', 'nopinkponies'),
cls('OutThere', 'outthere'),
cls('Porcelain', 'porcelain'),
cls('QUILTBAG', 'quiltbag'),
cls('RedSpike', 'redspike'),
cls('RumbleFall', 'rumblefall'),
cls('SamuraisBlood', 'samuraisblood'),
cls('Sharky', 'sharky'),
cls('SomethingHappens', 'somethinghappens'),
cls('SoreThumbs', 'sorethumbs'),
cls('Striptease', 'striptease'),
cls('Superosity', 'superosity'),
cls('TheFirstDaughter', 'thefirstdaughter'),
cls('TheGodChild', 'godchild'),
cls('TheHuntersofSalamanstra', 'salamanstra'),
cls('TheLounge', 'thelounge'),
cls('WICKEDPOWERED', 'wickedpowered'),
]

View file

@@ -1,19 +1,27 @@
# -*- coding: utf-8 -*-
from dosagelib.helpers import indirectStarter
from ..scraper import make_scraper, _ParserScraper
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper
def add(name, url):
attrs = dict(
name=name,
url='http://kindofnormal.com/' + url,
imageSearch='//article[1]//div[@class="box-content"]//img',
prevSearch='//a[@class="prev"]'
)
globals()[name] = make_scraper(name, _ParserScraper, **attrs)
class KindOfNormal(_ParserScraper):
"""Scraper for comics hosted on kindofnormal.com."""
imageSearch = '//article[1]//div[@class="box-content"]//img'
prevSearch = '//a[@class="prev"]'
# "url" is the site-relative path appended to the kindofnormal.com base.
def __init__(self, name, url):
super(KindOfNormal, self).__init__(name)
self.url = 'http://kindofnormal.com/' + url
add('MeAndDanielle', 'meanddanielle')
add('TruthFacts', 'truthfacts')
add('Wumo', 'wumo')
add('Wulffmorgenthaler', 'wumo') # name in previous versions
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
cls('MeAndDanielle', 'meanddanielle'),
cls('TruthFacts', 'truthfacts'),
cls('Wumo', 'wumo'),
# name in previous versions
cls('Wulffmorgenthaler', 'wumo'),
]

View file

@@ -1,7 +1,9 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper
@@ -10,14 +12,13 @@ class _NuklearPower(_ParserScraper):
prevSearch = '//a[@rel="prev"]'
imageSearch = '//div[@id="comic"]/img'
# name[2:] strips the two-character class-name prefix (e.g. 'NP' in
# NP8BitTheater) before adding the 'NuklearPower/' module namespace.
def __init__(self, name):
super(_NuklearPower, self).__init__('NuklearPower/' + name[2:])
@property
def url(self):
return 'http://www.nuklearpower.com/' + self.path + '/'
@property
def name(self):
return 'NuklearPower/' + super(_NuklearPower, self).name[2:]
class NP8BitTheater(_NuklearPower):
path = '8-bit-theater'

View file

@@ -1,33 +1,50 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile
from ..scraper import make_scraper
from ..scraper import _BasicScraper
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
def add(name):
classname = 'PetiteSymphony_%s' % name.capitalize()
url = 'http://%s.petitesymphony.com/' % name
globals()[classname] = make_scraper(classname,
name='PetiteSymphony/' + name.capitalize(),
url = url,
stripUrl = url + 'comic/%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
multipleImagesPerStrip = True,
help='Index format: named number'
)
from .common import _WordPressScraper
add("djandora")
add("generation17")
add("knuckleup")
add("kickinrad")
add("orangegrind")
add("rascals")
add("sangria")
add("seed")
class PetiteSymphony(_BasicScraper):
"""Scraper for comics hosted on petitesymphony.com subdomains."""
imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
multipleImagesPerStrip = True
help = 'Index format: named number'
# The lower-case comic name doubles as the site subdomain; it is
# capitalized only for the displayed scraper name.
def __init__(self, name):
super(PetiteSymphony, self).__init__('PetiteSymphony/' +
name.capitalize())
self.url = 'http://%s.petitesymphony.com/' % name
self.stripUrl = self.url + 'comic/%s'
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
cls("knuckleup"),
cls("kickinrad"),
cls("orangegrind"),
cls("rascals"),
cls("sangria"),
cls("seed"),
]
class ComicsBreak(_WordPressScraper):
"""Scraper for comics hosted on comicsbreak.com subdomains."""
# The lower-cased comic name is used as the site subdomain.
def __init__(self, name):
super(ComicsBreak, self).__init__('ComicsBreak/' + name)
self.url = 'http://%s.comicsbreak.com/' % name.lower()
@classmethod
def getmodules(cls):
"""Return one configured scraper instance per hosted comic."""
return [
cls("Djandora"),
cls("Generation17"),
]

View file

@@ -44,9 +44,8 @@ class _SmackJeeves(_ParserScraper):
broken_html_bugfix = True
@property
def name(self):
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
# name[2:] strips a two-character class-name prefix before adding
# the 'SmackJeeves/' module namespace.
def __init__(self, name):
super(_SmackJeeves, self).__init__('SmackJeeves/' + name[2:])
@property
def url(self):

View file

@@ -16,9 +16,8 @@ class _Snafu(_ParserScraper):
latestSearch = '//div[@id="feed"]/a'
starter = indirectStarter
@property
def name(self):
return 'SnafuComics/' + super(_Snafu, self).name
# Prefix the scraper name with the 'SnafuComics/' module namespace.
def __init__(self, name):
super(_Snafu, self).__init__('SnafuComics/' + name)
def namer(self, image_url, page_url):
year, month, name = image_url.rsplit('/', 3)[1:]

View file

@@ -13,9 +13,8 @@ class _WebcomicEu(_ParserScraper):
prevSearch = '//a[img[contains(@src, "navi-zurueck")]]'
help = 'Index format: number'
@property
def name(self):
return 'WebcomicEu/' + super(_WebcomicEu, self).name
# Prefix the scraper name with the 'WebcomicEu/' module namespace.
def __init__(self, name):
super(_WebcomicEu, self).__init__('WebcomicEu/' + name)
@property
def url(self):

View file

@@ -16,9 +16,8 @@ class _WLPComics(_ParserScraper):
starter = bounceStarter
help = 'Index format: nnn'
@property
def name(self):
return 'WLP/' + super(_WLPComics, self).name
# Prefix the scraper name with the 'WLP/' module namespace.
def __init__(self, name):
super(_WLPComics, self).__init__('WLP/' + name)
def namer(self, image_url, page_url):
return (page_url.rsplit('/', 1)[-1].split('.')[0] + '_' +

View file

@@ -82,6 +82,10 @@ class Scraper(object):
# HTTP session for configuration & cookies
session = requests_session()
@classmethod
def getmodules(cls):
"""Default factory: a single scraper named after the class.
Generator classes override this to return several configured
instances."""
return [cls(cls.__name__)]
@property
def indexes(self):
"""Tuple of configured strip indexes."""
return self._indexes
@@ -91,8 +95,9 @@ class Scraper(object):
if val:
self._indexes = tuple(sorted(val))
def __init__(self):
def __init__(self, name):
"""Initialize internal variables."""
self.name = name
self.urls = set()
self._indexes = tuple()
self.skippedUrls = set()
@@ -222,11 +227,6 @@ class Scraper(object):
"""Get comic strip URL from index."""
return self.stripUrl % index
@property
def name(self):
"""Get scraper name."""
return self.__class__.__name__
def starter(self):
"""Get starter URL from where to scrape comic strips."""
return self.url
@@ -563,10 +563,12 @@ def get_scrapers():
if _scrapers is None:
out.debug(u"Loading comic modules...")
modules = loader.get_modules('plugins')
plugins = loader.get_plugins(modules, Scraper)
_scrapers = sorted([x() for x in plugins], key=lambda p: p.name)
plugins = list(loader.get_plugins(modules, Scraper))
_scrapers = sorted([m for x in plugins for m in x.getmodules()],
key=lambda p: p.name)
check_scrapers()
out.debug(u"... %d modules loaded." % len(_scrapers))
out.debug(u"... %d modules loaded from %d classes." % (
len(_scrapers), len(plugins)))
return _scrapers

View file

@@ -1,16 +1,19 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from dosagelib import scraper
class ATestScraper(scraper._BasicScraper):
name = 'Test_Test'
pass
class TestVote(object):
def test_vote(self):
answer = ATestScraper().vote()
answer = ATestScraper('Test_Test').vote()
assert answer in ('counted', 'no'), 'invalid answer %r' % answer