Refactor: Introduce generator methods for scrapers
This allows one comic module class to generate multiple scrapers. This change supports the more dynamic module system described in #42.
This commit is contained in:
parent
89cfd9d310
commit
51008a975b
16 changed files with 322 additions and 298 deletions
|
@ -12,14 +12,13 @@ class _Arcamax(_ParserScraper):
|
|||
imageSearch = '//img[@id="comic-zoom"]'
|
||||
prevSearch = '//a[@class="prev"]'
|
||||
|
||||
def __init__(self, name):
|
||||
super(_Arcamax, self).__init__('Arcamax/' + name)
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'Arcamax/' + super(_Arcamax, self).name
|
||||
|
||||
|
||||
# do not edit anything below since these entries are generated from
|
||||
# scripts/update_plugins.sh
|
||||
|
|
|
@ -7,58 +7,55 @@ from __future__ import absolute_import, division, print_function
|
|||
|
||||
from re import compile
|
||||
|
||||
from ..scraper import make_scraper
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre, getQueryParams
|
||||
|
||||
|
||||
_linkTag = tagre("a", "href", r'([^"]+)')
|
||||
_prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
|
||||
_nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
|
||||
_lastSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
|
||||
class CloneManga(_BasicScraper):
|
||||
_linkTag = tagre("a", "href", r'([^"]+)')
|
||||
prevSearch = compile(_linkTag + tagre("img", "src", r"previous\.gif"))
|
||||
nextSearch = compile(_linkTag + tagre("img", "src", r"next\.gif"))
|
||||
latestSearch = compile(_linkTag + tagre("img", "src", r"last\.gif"))
|
||||
help = 'Index format: n'
|
||||
|
||||
def __init__(self, name, shortName, imageFolder=None, lastStrip=None):
|
||||
super(CloneManga, self).__init__('CloneManga/' + name)
|
||||
|
||||
def add(name, shortName, imageFolder=None, lastStrip=None):
|
||||
classname = 'CloneManga_%s' % name
|
||||
_url = 'http://manga.clone-army.org'
|
||||
baseUrl = '%s/%s.php' % (_url, shortName)
|
||||
if imageFolder is None:
|
||||
imageFolder = shortName
|
||||
_url = 'http://manga.clone-army.org'
|
||||
self.url = '%s/%s.php' % (_url, shortName)
|
||||
if imageFolder is None:
|
||||
imageFolder = shortName
|
||||
self.stripUrl = self.url + '?page=%s'
|
||||
self.imageSearch = compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center"))
|
||||
|
||||
if lastStrip is None:
|
||||
self.starter = self._starter
|
||||
else:
|
||||
self.url = self.stripUrl % lastStrip
|
||||
|
||||
def namer(self, image_url, page_url):
|
||||
return '%03d' % int(getQueryParams(page_url)['page'][0])
|
||||
|
||||
def _starter(self):
|
||||
# first, try hopping to previous and next comic
|
||||
data = self.getPage(baseUrl)
|
||||
data = self.getPage(self.url)
|
||||
try:
|
||||
url = self.fetchUrl(baseUrl, data, _prevSearch)
|
||||
url = self.fetchUrl(self.url, data, self.prevSearch)
|
||||
except ValueError:
|
||||
# no previous link found, try hopping to last comic
|
||||
return self.fetchUrl(baseUrl, data, _lastSearch)
|
||||
return self.fetchUrl(self.url, data, self.latestSearch)
|
||||
else:
|
||||
data = self.getPage(url)
|
||||
return self.fetchUrl(url, data, _nextSearch)
|
||||
return self.fetchUrl(url, data, self.nextSearch)
|
||||
|
||||
attrs = dict(
|
||||
name='CloneManga/' + name,
|
||||
stripUrl=baseUrl + '?page=%s',
|
||||
imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (_url, imageFolder), after="center")),
|
||||
prevSearch=_prevSearch,
|
||||
help='Index format: n',
|
||||
namer=namer,
|
||||
url=baseUrl,
|
||||
)
|
||||
if lastStrip is None:
|
||||
attrs['starter'] = _starter
|
||||
else:
|
||||
attrs['url'] = attrs['stripUrl'] % lastStrip
|
||||
globals()[classname] = make_scraper(classname, **attrs)
|
||||
|
||||
|
||||
add('AprilAndMay', 'anm', imageFolder='AAM')
|
||||
add('Kanami', 'kanami')
|
||||
add('MomokaCorner', 'momoka')
|
||||
add('NanasEverydayLife', 'nana', lastStrip='78')
|
||||
add('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311')
|
||||
add('Tomoyo42sRoom', 't42r')
|
||||
add('PennyTribute', 'penny')
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
cls('AprilAndMay', 'anm', imageFolder='AAM'),
|
||||
cls('Kanami', 'kanami'),
|
||||
cls('MomokaCorner', 'momoka'),
|
||||
cls('NanasEverydayLife', 'nana', lastStrip='78'),
|
||||
cls('PaperEleven', 'pxi', imageFolder='papereleven', lastStrip='311'),
|
||||
cls('Tomoyo42sRoom', 't42r'),
|
||||
cls('PennyTribute', 'penny'),
|
||||
]
|
||||
|
|
|
@ -22,6 +22,9 @@ class _ComicFury(_ParserScraper):
|
|||
help = 'Index format: n'
|
||||
starter = bounceStarter
|
||||
|
||||
def __init__(self, name):
|
||||
super(_ComicFury, self).__init__('ComicFury/' + name[2:])
|
||||
|
||||
def namer(self, image_url, page_url):
|
||||
parts = page_url.split('/')
|
||||
path, ext = os.path.splitext(image_url)
|
||||
|
@ -32,10 +35,6 @@ class _ComicFury(_ParserScraper):
|
|||
def url(self):
|
||||
return 'http://%s.webcomic.ws/comics/' % self.sub
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'ComicFury/' + super(_ComicFury, self).name[2:]
|
||||
|
||||
def getIndexStripUrl(self, index):
|
||||
return self.url + 'comics/%s' % index
|
||||
|
||||
|
|
|
@ -6,26 +6,25 @@
|
|||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre
|
||||
|
||||
|
||||
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
||||
'(?:Previous comic' + '|' +
|
||||
tagre("img", "alt", "Previous comic") + '|' +
|
||||
tagre("img", "src", "images/back\.gif") +
|
||||
')')
|
||||
# Comicgenesis has a lot of comics, but most of them are disallowed by
|
||||
# robots.txt
|
||||
|
||||
|
||||
def add(name, url):
|
||||
classname = 'ComicGenesis_%s' % name
|
||||
if '/d/' in url:
|
||||
stripUrl = url.split('/d/')[0] + '/d/%s.html'
|
||||
else:
|
||||
stripUrl = url + 'd/%s.html'
|
||||
class ComicGenesis(_BasicScraper):
|
||||
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
||||
'(?:Previous comic' + '|' +
|
||||
tagre("img", "alt", "Previous comic") + '|' +
|
||||
tagre("img", "src", "images/back\.gif") +
|
||||
')')
|
||||
multipleImagesPerStrip = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
def _prevUrlModifier(self, prev_url):
|
||||
def prevUrlModifier(self, prev_url):
|
||||
if prev_url:
|
||||
return prev_url.replace(
|
||||
"keenspace.com", "comicgenesis.com").replace(
|
||||
|
@ -33,95 +32,100 @@ def add(name, url):
|
|||
"toonspace.com", "comicgenesis.com").replace(
|
||||
"comicgen.com", "comicgenesis.com")
|
||||
|
||||
globals()[classname] = make_scraper(
|
||||
classname,
|
||||
name='ComicGenesis/' + name,
|
||||
url=url,
|
||||
stripUrl=stripUrl,
|
||||
imageSearch=_imageSearch,
|
||||
prevSearch=_prevSearch,
|
||||
prevUrlModifier=_prevUrlModifier,
|
||||
multipleImagesPerStrip=True,
|
||||
help='Index format: yyyymmdd',
|
||||
)
|
||||
def __init__(self, name, sub=None, last=None, baseUrl=None):
|
||||
super(ComicGenesis, self).__init__('ComicGenesis/' + name)
|
||||
|
||||
# Comicgenesis has a lot of comics, but most of them are disallowed by robots.txt
|
||||
# do not edit anything below since these entries are generated from scripts/update.sh
|
||||
# DO NOT REMOVE
|
||||
add('AAAAA', 'http://aaaaa.comicgenesis.com/')
|
||||
add('AdventuresofKiltman', 'http://kiltman.comicgenesis.com/')
|
||||
add('AmorModerno', 'http://amormoderno.comicgenesis.com/')
|
||||
add('AnythingButRealLife', 'http://anythingbutreallife.comicgenesis.com/')
|
||||
add('Ardra', 'http://ardra.comicgenesis.com/')
|
||||
add('Artwork', 'http://artwork.comicgenesis.com/')
|
||||
add('BabeintheWoods', 'http://babeinthewoods.comicgenesis.com/')
|
||||
add('BackwaterPlanet', 'http://bobthespirit.comicgenesis.com/')
|
||||
add('BendyStrawVampires', 'http://bsvampires.comicgenesis.com/')
|
||||
add('BlindSight', 'http://blindsight.comicgenesis.com/')
|
||||
add('BreakingtheDoldrum', 'http://breakingthedoldrum.comicgenesis.com/')
|
||||
add('Candi', 'http://candicomics.com/')
|
||||
add('CorporateLife', 'http://corporatelife.comicgenesis.com/')
|
||||
add('DarkWelkin', 'http://darkwelkin.comicgenesis.com/')
|
||||
add('DemonEater', 'http://demoneater.comicgenesis.com/')
|
||||
add('DoodleDiaries', 'http://doodlediaries.comicgenesis.com/')
|
||||
add('DormSweetDorm', 'http://dormsweetdorm.comicgenesis.com/')
|
||||
add('DoubleyouTeeEff', 'http://doubleyouteeeff.comicgenesis.com/')
|
||||
add('DragonsBane', 'http://jasonwhitewaterz.comicgenesis.com/')
|
||||
add('Dreamaniac', 'http://dreamaniaccomic.comicgenesis.com/')
|
||||
add('ElnifiChronicles', 'http://elnifichronicles.comicgenesis.com/')
|
||||
add('EvesApple', 'http://evesapple.comicgenesis.com/')
|
||||
add('FancyThat', 'http://fancythat.comicgenesis.com/')
|
||||
add('FantasyQwest', 'http://creatorauthorman.comicgenesis.com/')
|
||||
add('Fantazine', 'http://fantazin.comicgenesis.com/')
|
||||
add('Flounderville', 'http://flounderville.comicgenesis.com/')
|
||||
add('GEM', 'http://keltzy.comicgenesis.com/')
|
||||
add('Gonefor300days', 'http://g4300d.comicgenesis.com/')
|
||||
add('IBlameDanny', 'http://vileterror.comicgenesis.com/')
|
||||
add('ImpendingDoom', 'http://impending.comicgenesis.com/')
|
||||
add('InANutshell', 'http://nutshellcomics.comicgenesis.com/')
|
||||
add('KernyMantisComics', 'http://kernymantis.comicgenesis.com/')
|
||||
add('KitsuneJewel', 'http://kitsunejewel.comicgenesis.com/')
|
||||
add('KittyCattyGames', 'http://kittycattygames.comicgenesis.com/')
|
||||
add('KiwiDayN', 'http://kiwidayn.comicgenesis.com/')
|
||||
add('KungFounded', 'http://kungfounded.comicgenesis.com/')
|
||||
add('LabBratz', 'http://labbratz.comicgenesis.com/')
|
||||
add('Laserwing', 'http://laserwing.comicgenesis.com/')
|
||||
add('LumiasKingdom', 'http://lumia.comicgenesis.com/')
|
||||
add('Majestic7', 'http://majestic7.comicgenesis.com/')
|
||||
add('MaximumWhimsy', 'http://maximumwhimsy.comicgenesis.com/')
|
||||
add('MenschunsererZeitGerman', 'http://muz.comicgenesis.com/')
|
||||
add('MoonCrest24', 'http://mooncrest.comicgenesis.com/d/20121117.html')
|
||||
add('Mushian', 'http://tentoumushi.comicgenesis.com/')
|
||||
add('NightwolfCentral', 'http://nightwolfcentral.comicgenesis.com/')
|
||||
add('NoTimeForLife', 'http://randyraven.comicgenesis.com/')
|
||||
add('NoneMoreComic', 'http://nonemore.comicgenesis.com/')
|
||||
add('ODCKS', 'http://odcks.comicgenesis.com/')
|
||||
add('OfDoom', 'http://ofdoom.comicgenesis.com/')
|
||||
add('OpportunityofaLifetime', 'http://carpathia.comicgenesis.com/')
|
||||
add('Orbz', 'http://orbz.comicgenesis.com/')
|
||||
add('OwMySanity', 'http://owmysanity.comicgenesis.com/')
|
||||
add('PhantomThesis', 'http://phantomthesis.comicgenesis.com/')
|
||||
add('ProfessorSaltinesAstrodynamicDirigible', 'http://drsaltine.comicgenesis.com/')
|
||||
add('PsychicDyslexiaInstitute', 'http://pdi.comicgenesis.com/')
|
||||
add('PublicidadeEnganosa', 'http://publicidadeenganosa.comicgenesis.com/')
|
||||
add('RandomAxeOfKindness', 'http://randomaxe.comicgenesis.com/')
|
||||
add('SalemUncommons', 'http://salemuncommons.comicgenesis.com/')
|
||||
add('SamandElisAdventures', 'http://sameliadv.comicgenesis.com/')
|
||||
add('SarahZero', 'http://plughead.comicgenesis.com/')
|
||||
add('SixByNineCollege', 'http://sixbyninecollege.comicgenesis.com/')
|
||||
add('SpoononHighandFireontheMountian', 'http://spoon.comicgenesis.com/')
|
||||
add('SynapticMisfires', 'http://synapticmisfires.comicgenesis.com/')
|
||||
add('TakingStock', 'http://mapaghimagsik.comicgenesis.com/')
|
||||
add('TemplarArizona', 'http://templaraz.comicgenesis.com/')
|
||||
add('TheAdventuresofKaniraBaxter', 'http://kanirabaxter.comicgenesis.com/')
|
||||
add('TheAdventuresofVindibuddSuperheroInTraining', 'http://vindibudd.comicgenesis.com/d/20070720.html')
|
||||
add('TheEasyBreather', 'http://easybreather.comicgenesis.com/')
|
||||
add('TheLounge', 'http://thelounge.comicgenesis.com/')
|
||||
add('TheMisadventuresofOkk', 'http://okk.comicgenesis.com/')
|
||||
add('ThePath', 'http://thepath.comicgenesis.com/')
|
||||
add('TheTalesofKalduras', 'http://kalduras.comicgenesis.com/')
|
||||
add('Unconventional', 'http://unconventional.comicgenesis.com/')
|
||||
add('WarMageNC17', 'http://warmage.comicgenesis.com/')
|
||||
add('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'http://dannormnsanidey.comicgenesis.com/')
|
||||
add('WhatYouDontSee', 'http://phantomlady4.comicgenesis.com/')
|
||||
add('Wierdman', 'http://asa.comicgenesis.com/')
|
||||
if sub:
|
||||
baseUrl = 'http://%s.comicgenesis.com/' % sub
|
||||
|
||||
self.stripUrl = baseUrl + 'd/%s.html'
|
||||
if last:
|
||||
self.url = self.stripUrl % last
|
||||
self.endOfLife = True
|
||||
else:
|
||||
self.url = baseUrl
|
||||
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
# do not edit anything below since these entries are generated from
|
||||
# scripts/update_plugins.sh
|
||||
# DO NOT REMOVE
|
||||
cls('AAAAA', 'aaaaa'),
|
||||
cls('AdventuresofKiltman', 'kiltman'),
|
||||
cls('AmorModerno', 'amormoderno'),
|
||||
cls('AnythingButRealLife', 'anythingbutreallife'),
|
||||
cls('Ardra', 'ardra'),
|
||||
cls('Artwork', 'artwork'),
|
||||
cls('BabeintheWoods', 'babeinthewoods'),
|
||||
cls('BackwaterPlanet', 'bobthespirit'),
|
||||
cls('BendyStrawVampires', 'bsvampires'),
|
||||
cls('BlindSight', 'blindsight'),
|
||||
cls('BreakingtheDoldrum', 'breakingthedoldrum'),
|
||||
cls('Candi', baseUrl='http://candicomics.com/'),
|
||||
cls('CorporateLife', 'corporatelife'),
|
||||
cls('DarkWelkin', 'darkwelkin'),
|
||||
cls('DemonEater', 'demoneater'),
|
||||
cls('DoodleDiaries', 'doodlediaries'),
|
||||
cls('DormSweetDorm', 'dormsweetdorm'),
|
||||
cls('DoubleyouTeeEff', 'doubleyouteeeff'),
|
||||
cls('DragonsBane', 'jasonwhitewaterz'),
|
||||
cls('Dreamaniac', 'dreamaniaccomic'),
|
||||
cls('ElnifiChronicles', 'elnifichronicles'),
|
||||
cls('EvesApple', 'evesapple'),
|
||||
cls('FancyThat', 'fancythat'),
|
||||
cls('FantasyQwest', 'creatorauthorman'),
|
||||
cls('Fantazine', 'fantazin'),
|
||||
cls('Flounderville', 'flounderville'),
|
||||
cls('GEM', 'keltzy'),
|
||||
cls('Gonefor300days', 'g4300d'),
|
||||
cls('IBlameDanny', 'vileterror'),
|
||||
cls('ImpendingDoom', 'impending'),
|
||||
cls('InANutshell', 'nutshellcomics'),
|
||||
cls('KernyMantisComics', 'kernymantis'),
|
||||
cls('KitsuneJewel', 'kitsunejewel'),
|
||||
cls('KittyCattyGames', 'kittycattygames'),
|
||||
cls('KiwiDayN', 'kiwidayn'),
|
||||
cls('KungFounded', 'kungfounded'),
|
||||
cls('LabBratz', 'labbratz'),
|
||||
cls('Laserwing', 'laserwing'),
|
||||
cls('LumiasKingdom', 'lumia'),
|
||||
cls('Majestic7', 'majestic7'),
|
||||
cls('MaximumWhimsy', 'maximumwhimsy'),
|
||||
cls('MenschunsererZeitGerman', 'muz'),
|
||||
cls('MoonCrest24', 'mooncrest', last='20121117'),
|
||||
cls('Mushian', 'tentoumushi'),
|
||||
cls('NightwolfCentral', 'nightwolfcentral'),
|
||||
cls('NoTimeForLife', 'randyraven'),
|
||||
cls('NoneMoreComic', 'nonemore'),
|
||||
cls('ODCKS', 'odcks'),
|
||||
cls('OfDoom', 'ofdoom'),
|
||||
cls('OpportunityofaLifetime', 'carpathia'),
|
||||
cls('Orbz', 'orbz'),
|
||||
cls('OwMySanity', 'owmysanity'),
|
||||
cls('PhantomThesis', 'phantomthesis'),
|
||||
cls('ProfessorSaltinesAstrodynamicDirigible', 'drsaltine'),
|
||||
cls('PsychicDyslexiaInstitute', 'pdi'),
|
||||
cls('PublicidadeEnganosa', 'publicidadeenganosa'),
|
||||
cls('RandomAxeOfKindness', 'randomaxe'),
|
||||
cls('SalemUncommons', 'salemuncommons'),
|
||||
cls('SamandElisAdventures', 'sameliadv'),
|
||||
cls('SarahZero', 'plughead'),
|
||||
cls('SixByNineCollege', 'sixbyninecollege'),
|
||||
cls('SpoononHighandFireontheMountian', 'spoon'),
|
||||
cls('SynapticMisfires', 'synapticmisfires'),
|
||||
cls('TakingStock', 'mapaghimagsik'),
|
||||
cls('TemplarArizona', 'templaraz'),
|
||||
cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
|
||||
cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
|
||||
cls('TheEasyBreather', 'easybreather'),
|
||||
cls('TheLounge', 'thelounge'),
|
||||
cls('TheMisadventuresofOkk', 'okk'),
|
||||
cls('ThePath', 'thepath'),
|
||||
cls('TheTalesofKalduras', 'kalduras'),
|
||||
cls('Unconventional', 'unconventional'),
|
||||
cls('WarMageNC17', 'warmage'),
|
||||
cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'),
|
||||
cls('WhatYouDontSee', 'phantomlady4'),
|
||||
cls('Wierdman', 'asa'),
|
||||
]
|
||||
|
|
|
@ -15,9 +15,8 @@ class _Creators(_ParserScraper):
|
|||
latestSearch = '//div[contains(@class,"caption")]/a'
|
||||
starter = indirectStarter
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'Creators/' + super(_Creators, self).name
|
||||
def __init__(self, name):
|
||||
super(_Creators, self).__init__('Creators/' + name)
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
|
|
|
@ -18,9 +18,8 @@ class _GoComics(_ParserScraper):
|
|||
starter = bounceStarter
|
||||
help = 'Index format: yyyy/mm/dd'
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'GoComics/' + super(_GoComics, self).name[2:]
|
||||
def __init__(self, name):
|
||||
super(_GoComics, self).__init__('GoComics/' + name[2:])
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
|
|
|
@ -1,79 +1,79 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre
|
||||
|
||||
|
||||
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||
_stripPattern = r'([^"]*/d/\d{8}\.html)'
|
||||
_prevSearch = (
|
||||
compile(tagre("link", "href", _stripPattern, before="prev")),
|
||||
compile(tagre("a", "href", _stripPattern, after="prev")),
|
||||
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
|
||||
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
|
||||
)
|
||||
|
||||
def add(name, url):
|
||||
classname = 'KeenSpot_%s' % name
|
||||
if '/d/' in url:
|
||||
stripUrl = url.split('/d/')[0] + '/d/%s.html'
|
||||
else:
|
||||
stripUrl = url + 'd/%s.html'
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name='KeenSpot/' + name,
|
||||
url=url,
|
||||
stripUrl=stripUrl,
|
||||
imageSearch = _imageSearch,
|
||||
prevSearch = _prevSearch,
|
||||
help = 'Index format: yyyymmdd',
|
||||
class KeenSpot(_BasicScraper):
|
||||
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||
_stripPattern = r'([^"]*/d/\d{8}\.html)'
|
||||
prevSearch = (
|
||||
compile(tagre("link", "href", _stripPattern, before="prev")),
|
||||
compile(tagre("a", "href", _stripPattern, after="prev")),
|
||||
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
|
||||
compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
|
||||
)
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
# do not edit anything below since these entries are generated from scripts/update.sh
|
||||
# DO NOT REMOVE
|
||||
add('27TwentySeven', 'http://twenty-seven.keenspot.com/')
|
||||
add('Adventurers', 'http://adventurers.keenspot.com/')
|
||||
add('AntiheroForHire', 'http://antihero.keenspot.com/')
|
||||
add('BanzaiGirl', 'http://banzaigirl.keenspot.com/')
|
||||
add('Barker', 'http://barkercomic.keenspot.com/')
|
||||
add('Buzzboy', 'http://buzzboy.keenspot.com/')
|
||||
add('ChoppingBlock', 'http://choppingblock.keenspot.com/')
|
||||
add('ClichFlamb', 'http://clicheflambe.keenspot.com/')
|
||||
add('CountYourSheep', 'http://countyoursheep.keenspot.com/')
|
||||
add('EverythingJake', 'http://everythingjake.keenspot.com/')
|
||||
add('FallOutToyWorks', 'http://fallouttoyworks.keenspot.com/')
|
||||
add('FriarAndBrimstone', 'http://friarandbrimstone.keenspot.com/')
|
||||
add('GeneCatlow', 'http://genecatlow.keenspot.com/')
|
||||
add('GodMode', 'http://godmode.keenspot.com/')
|
||||
add('GreenWake', 'http://greenwake.keenspot.com/')
|
||||
add('HeadTrip', 'http://headtrip.keenspot.com/')
|
||||
add('HoaxHunters', 'http://hoaxhunters.keenspot.com/')
|
||||
add('InHere', 'http://inhere.keenspot.com/')
|
||||
add('Katrina', 'http://katrina.keenspot.com/')
|
||||
add('Landis', 'http://landis.keenspot.com/')
|
||||
add('MakeshiftMiracle', 'http://makeshiftmiracle.keenspot.com/')
|
||||
add('Marksmen', 'http://marksmen.keenspot.com/')
|
||||
add('MarryMe', 'http://marryme.keenspot.com/')
|
||||
add('MedusasDaughter', 'http://medusasdaughter.keenspot.com/')
|
||||
add('MonsterMassacre', 'http://monstermassacre.keenspot.com/')
|
||||
add('Newshounds', 'http://newshounds.keenspot.com/')
|
||||
add('NoPinkPonies', 'http://nopinkponies.keenspot.com/')
|
||||
add('OutThere', 'http://outthere.keenspot.com/')
|
||||
add('Porcelain', 'http://porcelain.keenspot.com/')
|
||||
add('QUILTBAG', 'http://quiltbag.keenspot.com/')
|
||||
add('RedSpike', 'http://redspike.keenspot.com/')
|
||||
add('RumbleFall', 'http://rumblefall.keenspot.com/')
|
||||
add('SamuraisBlood', 'http://samuraisblood.keenspot.com/')
|
||||
add('Sharky', 'http://sharky.keenspot.com/')
|
||||
add('SomethingHappens', 'http://somethinghappens.keenspot.com/')
|
||||
add('SoreThumbs', 'http://sorethumbs.keenspot.com/')
|
||||
add('Striptease', 'http://striptease.keenspot.com/')
|
||||
add('Superosity', 'http://superosity.keenspot.com/')
|
||||
add('TheFirstDaughter', 'http://thefirstdaughter.keenspot.com/')
|
||||
add('TheGodChild', 'http://godchild.keenspot.com/')
|
||||
add('TheHuntersofSalamanstra', 'http://salamanstra.keenspot.com/')
|
||||
add('TheLounge', 'http://thelounge.keenspot.com/')
|
||||
add('WICKEDPOWERED', 'http://wickedpowered.keenspot.com/')
|
||||
def __init__(self, name, sub):
|
||||
super(KeenSpot, self).__init__('KeenSpot/' + name)
|
||||
self.url = 'http://%s.keenspot.com/' % sub
|
||||
self.stripUrl = self.url + 'd/%s.html'
|
||||
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
# do not edit anything below since these entries are generated from
|
||||
# scripts/update_plugins.sh
|
||||
# DO NOT REMOVE
|
||||
cls('27TwentySeven', 'twenty-seven'),
|
||||
cls('Adventurers', 'adventurers'),
|
||||
cls('AntiheroForHire', 'antihero'),
|
||||
cls('BanzaiGirl', 'banzaigirl'),
|
||||
cls('Barker', 'barkercomic'),
|
||||
cls('Buzzboy', 'buzzboy'),
|
||||
cls('ChoppingBlock', 'choppingblock'),
|
||||
cls('ClichFlamb', 'clicheflambe'),
|
||||
cls('CountYourSheep', 'countyoursheep'),
|
||||
cls('EverythingJake', 'everythingjake'),
|
||||
cls('FallOutToyWorks', 'fallouttoyworks'),
|
||||
cls('FriarAndBrimstone', 'friarandbrimstone'),
|
||||
cls('GeneCatlow', 'genecatlow'),
|
||||
cls('GodMode', 'godmode'),
|
||||
cls('GreenWake', 'greenwake'),
|
||||
cls('HeadTrip', 'headtrip'),
|
||||
cls('HoaxHunters', 'hoaxhunters'),
|
||||
cls('InHere', 'inhere'),
|
||||
cls('Katrina', 'katrina'),
|
||||
cls('Landis', 'landis'),
|
||||
cls('MakeshiftMiracle', 'makeshiftmiracle'),
|
||||
cls('Marksmen', 'marksmen'),
|
||||
cls('MarryMe', 'marryme'),
|
||||
cls('MedusasDaughter', 'medusasdaughter'),
|
||||
cls('MonsterMassacre', 'monstermassacre'),
|
||||
cls('Newshounds', 'newshounds'),
|
||||
cls('NoPinkPonies', 'nopinkponies'),
|
||||
cls('OutThere', 'outthere'),
|
||||
cls('Porcelain', 'porcelain'),
|
||||
cls('QUILTBAG', 'quiltbag'),
|
||||
cls('RedSpike', 'redspike'),
|
||||
cls('RumbleFall', 'rumblefall'),
|
||||
cls('SamuraisBlood', 'samuraisblood'),
|
||||
cls('Sharky', 'sharky'),
|
||||
cls('SomethingHappens', 'somethinghappens'),
|
||||
cls('SoreThumbs', 'sorethumbs'),
|
||||
cls('Striptease', 'striptease'),
|
||||
cls('Superosity', 'superosity'),
|
||||
cls('TheFirstDaughter', 'thefirstdaughter'),
|
||||
cls('TheGodChild', 'godchild'),
|
||||
cls('TheHuntersofSalamanstra', 'salamanstra'),
|
||||
cls('TheLounge', 'thelounge'),
|
||||
cls('WICKEDPOWERED', 'wickedpowered'),
|
||||
]
|
||||
|
|
|
@ -1,19 +1,27 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from dosagelib.helpers import indirectStarter
|
||||
from ..scraper import make_scraper, _ParserScraper
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from ..scraper import _ParserScraper
|
||||
|
||||
|
||||
def add(name, url):
|
||||
attrs = dict(
|
||||
name=name,
|
||||
url='http://kindofnormal.com/' + url,
|
||||
imageSearch='//article[1]//div[@class="box-content"]//img',
|
||||
prevSearch='//a[@class="prev"]'
|
||||
)
|
||||
globals()[name] = make_scraper(name, _ParserScraper, **attrs)
|
||||
class KindOfNormal(_ParserScraper):
|
||||
imageSearch = '//article[1]//div[@class="box-content"]//img'
|
||||
prevSearch = '//a[@class="prev"]'
|
||||
|
||||
def __init__(self, name, url):
|
||||
super(KindOfNormal, self).__init__(name)
|
||||
self.url = 'http://kindofnormal.com/' + url
|
||||
|
||||
add('MeAndDanielle', 'meanddanielle')
|
||||
add('TruthFacts', 'truthfacts')
|
||||
add('Wumo', 'wumo')
|
||||
add('Wulffmorgenthaler', 'wumo') # name in previous versions
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
cls('MeAndDanielle', 'meanddanielle'),
|
||||
cls('TruthFacts', 'truthfacts'),
|
||||
cls('Wumo', 'wumo'),
|
||||
# name in previous versions
|
||||
cls('Wulffmorgenthaler', 'wumo'),
|
||||
]
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2016 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from ..scraper import _ParserScraper
|
||||
|
||||
|
@ -10,14 +12,13 @@ class _NuklearPower(_ParserScraper):
|
|||
prevSearch = '//a[@rel="prev"]'
|
||||
imageSearch = '//div[@id="comic"]/img'
|
||||
|
||||
def __init__(self, name):
|
||||
super(_NuklearPower, self).__init__('NuklearPower/' + name[2:])
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return 'http://www.nuklearpower.com/' + self.path + '/'
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'NuklearPower/' + super(_NuklearPower, self).name[2:]
|
||||
|
||||
|
||||
class NP8BitTheater(_NuklearPower):
|
||||
path = '8-bit-theater'
|
||||
|
|
|
@ -1,33 +1,50 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre
|
||||
|
||||
_imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
|
||||
_prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
|
||||
|
||||
def add(name):
|
||||
classname = 'PetiteSymphony_%s' % name.capitalize()
|
||||
url = 'http://%s.petitesymphony.com/' % name
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name='PetiteSymphony/' + name.capitalize(),
|
||||
url = url,
|
||||
stripUrl = url + 'comic/%s',
|
||||
imageSearch = _imageSearch,
|
||||
prevSearch = _prevSearch,
|
||||
multipleImagesPerStrip = True,
|
||||
help='Index format: named number'
|
||||
)
|
||||
from .common import _WordPressScraper
|
||||
|
||||
|
||||
add("djandora")
|
||||
add("generation17")
|
||||
add("knuckleup")
|
||||
add("kickinrad")
|
||||
add("orangegrind")
|
||||
add("rascals")
|
||||
add("sangria")
|
||||
add("seed")
|
||||
class PetiteSymphony(_BasicScraper):
|
||||
imageSearch = compile(tagre("img", "src", r'(http://[a-z0-9]+\.petitesymphony\.com/files/comics/[^"]+)'))
|
||||
prevSearch = compile(tagre("a", "href", r'(http://[a-z0-9]+\.petitesymphony\.com/comic/[^"]+)', after="navi-prev"))
|
||||
multipleImagesPerStrip = True
|
||||
help = 'Index format: named number'
|
||||
|
||||
def __init__(self, name):
|
||||
super(PetiteSymphony, self).__init__('PetiteSymphony/' +
|
||||
name.capitalize())
|
||||
self.url = 'http://%s.petitesymphony.com/' % name
|
||||
self.stripUrl = self.url + 'comic/%s'
|
||||
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
cls("knuckleup"),
|
||||
cls("kickinrad"),
|
||||
cls("orangegrind"),
|
||||
cls("rascals"),
|
||||
cls("sangria"),
|
||||
cls("seed"),
|
||||
]
|
||||
|
||||
|
||||
class ComicsBreak(_WordPressScraper):
|
||||
|
||||
def __init__(self, name):
|
||||
super(ComicsBreak, self).__init__('ComicsBreak/' + name)
|
||||
self.url = 'http://%s.comicsbreak.com/' % name.lower()
|
||||
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [
|
||||
cls("Djandora"),
|
||||
cls("Generation17"),
|
||||
]
|
||||
|
|
|
@ -44,9 +44,8 @@ class _SmackJeeves(_ParserScraper):
|
|||
|
||||
broken_html_bugfix = True
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'SmackJeeves/' + super(_SmackJeeves, self).name[2:]
|
||||
def __init__(self, name):
|
||||
super(_SmackJeeves, self).__init__('SmackJeeves/' + name[2:])
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
|
|
|
@ -16,9 +16,8 @@ class _Snafu(_ParserScraper):
|
|||
latestSearch = '//div[@id="feed"]/a'
|
||||
starter = indirectStarter
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'SnafuComics/' + super(_Snafu, self).name
|
||||
def __init__(self, name):
|
||||
super(_Snafu, self).__init__('SnafuComics/' + name)
|
||||
|
||||
def namer(self, image_url, page_url):
|
||||
year, month, name = image_url.rsplit('/', 3)[1:]
|
||||
|
|
|
@ -13,9 +13,8 @@ class _WebcomicEu(_ParserScraper):
|
|||
prevSearch = '//a[img[contains(@src, "navi-zurueck")]]'
|
||||
help = 'Index format: number'
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'WebcomicEu/' + super(_WebcomicEu, self).name
|
||||
def __init__(self, name):
|
||||
super(_WebcomicEu, self).__init__('WebcomicEu/' + name)
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
|
|
|
@ -16,9 +16,8 @@ class _WLPComics(_ParserScraper):
|
|||
starter = bounceStarter
|
||||
help = 'Index format: nnn'
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return 'WLP/' + super(_WLPComics, self).name
|
||||
def __init__(self, name):
|
||||
super(_WLPComics, self).__init__('WLP/' + name)
|
||||
|
||||
def namer(self, image_url, page_url):
|
||||
return (page_url.rsplit('/', 1)[-1].split('.')[0] + '_' +
|
||||
|
|
|
@ -82,6 +82,10 @@ class Scraper(object):
|
|||
# HTTP session for configuration & cookies
|
||||
session = requests_session()
|
||||
|
||||
@classmethod
|
||||
def getmodules(cls):
|
||||
return [cls(cls.__name__)]
|
||||
|
||||
@property
|
||||
def indexes(self):
|
||||
return self._indexes
|
||||
|
@ -91,8 +95,9 @@ class Scraper(object):
|
|||
if val:
|
||||
self._indexes = tuple(sorted(val))
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, name):
|
||||
"""Initialize internal variables."""
|
||||
self.name = name
|
||||
self.urls = set()
|
||||
self._indexes = tuple()
|
||||
self.skippedUrls = set()
|
||||
|
@ -222,11 +227,6 @@ class Scraper(object):
|
|||
"""Get comic strip URL from index."""
|
||||
return self.stripUrl % index
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
"""Get scraper name."""
|
||||
return self.__class__.__name__
|
||||
|
||||
def starter(self):
|
||||
"""Get starter URL from where to scrape comic strips."""
|
||||
return self.url
|
||||
|
@ -563,10 +563,12 @@ def get_scrapers():
|
|||
if _scrapers is None:
|
||||
out.debug(u"Loading comic modules...")
|
||||
modules = loader.get_modules('plugins')
|
||||
plugins = loader.get_plugins(modules, Scraper)
|
||||
_scrapers = sorted([x() for x in plugins], key=lambda p: p.name)
|
||||
plugins = list(loader.get_plugins(modules, Scraper))
|
||||
_scrapers = sorted([m for x in plugins for m in x.getmodules()],
|
||||
key=lambda p: p.name)
|
||||
check_scrapers()
|
||||
out.debug(u"... %d modules loaded." % len(_scrapers))
|
||||
out.debug(u"... %d modules loaded from %d classes." % (
|
||||
len(_scrapers), len(plugins)))
|
||||
return _scrapers
|
||||
|
||||
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2016 Tobias Gruetzmacher
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from dosagelib import scraper
|
||||
|
||||
|
||||
class ATestScraper(scraper._BasicScraper):
|
||||
name = 'Test_Test'
|
||||
pass
|
||||
|
||||
|
||||
class TestVote(object):
|
||||
|
||||
def test_vote(self):
|
||||
answer = ATestScraper().vote()
|
||||
answer = ATestScraper('Test_Test').vote()
|
||||
assert answer in ('counted', 'no'), 'invalid answer %r' % answer
|
||||
|
|
Loading…
Reference in a new issue