New module for ComicSherpa (removed from GoComics)

This commit is contained in:
Tobias Gruetzmacher 2017-01-11 01:34:52 +01:00
parent 8a89246d88
commit 061efaac6e
2 changed files with 271 additions and 0 deletions

View file

@ -0,0 +1,228 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper
class ComicSherpa(_ParserScraper):
    """Scraper for one comic feature hosted on comicssherpa.com.

    Each instance is named ``ComicSherpa/<name>`` and points at the feature
    page selected by the site's ``uc_comic`` query parameter.
    """

    url = 'http://www.comicssherpa.com/site/'
    # Matches any <img> whose src contains "/comics/".
    imageSearch = '//img[contains(@src, "/comics/")]'
    # Matches the anchor whose text is exactly "previous day".
    prevSearch = '//a[text()="previous day"]'
    help = 'Index format: yyyymmdd'

    def __init__(self, name, path):
        """Set up the scraper for a single comic.

        name -- comic name, appended to the ``ComicSherpa/`` prefix.
        path -- value of the ``uc_comic`` query parameter for this comic.
        """
        scraper_name = 'ComicSherpa/' + name
        super(ComicSherpa, self).__init__(scraper_name)
        feature_base = 'http://www.comicssherpa.com/site/feature?uc_comic='
        # Shadows the class-level url with the comic-specific feature page.
        self.url = feature_base + path

    def getIndexStripUrl(self, index):
        """Return the feature URL with ``index`` as the ``uc_full_date`` value.

        The ``help`` attribute documents the index format as yyyymmdd.
        """
        base = self.url
        date_query = '&uc_full_date=%s' % index
        return base + date_query

    @classmethod
    def getmodules(cls):
        """Return a tuple with one instance of this class per known comic.

        The entries between the AUTOUPDATE markers are generated, so they
        must not be edited by hand.
        """
        return (
            # do not edit anything below since these entries are generated from
            # scripts/comicsherpa.py
            # START AUTOUPDATE
            cls('060', 'csadl'),
            cls('AaronGuile', 'csdsf'),
            cls('ABCStreet', 'csmbx'),
            cls('ABitSketch', 'csxmy'),
            cls('ABomb', 'csvur'),
            cls('ACMEINKD', 'csmwt'),
            cls('AcornPark', 'csdfe'),
            cls('Adulting', 'cskky'),
            cls('AJAndMagnus', 'csrxy'),
            cls('AlisonWard', 'cspgh'),
            cls('AllInGoodTime', 'csjhr'),
            cls('AmandaTheGreat', 'cssyr'),
            cls('AndNow', 'csnxr'),
            cls('Anecdote', 'cspmf'),
            cls('AnimalMitchell', 'csdnm'),
            cls('AnneAndPythagoras', 'csokq'),
            cls('AppleCreekComics', 'cstgq'),
            cls('ATasteOfTimes', 'csprn'),
            cls('BatchRejection', 'csgny'),
            cls('Bazoobee', 'csfos'),
            cls('BeMisery', 'csiiq'),
            cls('BeneathTheFerns', 'csgzn'),
            cls('BigJim', 'csiao'),
            cls('Bluebonnets', 'cston'),
            cls('BlueSkiesToons', 'csfoy'),
            cls('BobsYourUncle', 'csmxz'),
            cls('BoltsAndNuts', 'csnab'),
            cls('Bork', 'csczn'),
            cls('BottAuto', 'csmwz'),
            cls('BUNS', 'csbft'),
            cls('Bushscrubs', 'csmzx'),
            cls('CAFFEINATED', 'csbmv'),
            cls('CandacenCompany', 'csvpd'),
            cls('CarteBlanche', 'csnwk'),
            cls('CharmysArmy', 'cswrl'),
            cls('CleoAndCompany', 'cscwy'),
            cls('Complex', 'csusy'),
            cls('CourageousManAdventures', 'csgkn'),
            cls('DadsDay', 'cswly'),
            cls('DBCartoons', 'csnvt'),
            cls('DevinCraneComicStripGhostwriter', 'csadf'),
            cls('DoghouseInYourSoul', 'cstwx'),
            cls('DontPickTheFlowers', 'cswfs'),
            cls('Dragin', 'cswgz'),
            cls('DrWhiskers', 'cswvl'),
            cls('DumbQuestionBadAnswer', 'cskro'),
            cls('DungeonHordes', 'csnlo'),
            cls('DustSpecks', 'csqgq'),
            cls('DutchnPals', 'cskqc'),
            cls('Dysconnected', 'csxbc'),
            cls('Econogirl', 'csxoj'),
            cls('EightballEyeball', 'csnfh'),
            cls('Elmo', 'csvff'),
            cls('Endangered', 'cshii'),
            cls('Experiment42', 'csbjr'),
            cls('FamousAndNotSoFamousQuotes', 'csdgz'),
            cls('FarOut', 'csaem'),
            cls('FatherOfTheBrood', 'csuul'),
            cls('FloydAndTony', 'cszgj'),
            cls('FoolsParadise', 'csvnw'),
            cls('FrankAndSteinway', 'cseui'),
            cls('FriedCritter', 'cshtp'),
            cls('GarciaCartoonCo', 'csyuw'),
            cls('GIRTH', 'csbjw'),
            cls('GrandmaSnoops', 'csscq'),
            cls('GrannyAnny', 'cskpg'),
            cls('Gravy', 'csgvd'),
            cls('GreenPieces', 'csnwy'),
            cls('GunstonStreet', 'csgru'),
            cls('HallEditorialCartoons', 'csgzx'),
            cls('HaloAndHorns', 'csgub'),
            cls('HaphazardHumor', 'cspsa'),
            cls('Headcheese', 'cspku'),
            cls('Hogwashed', 'csbnf'),
            cls('HomeLife', 'csrbv'),
            cls('Hubbel', 'cszrr'),
            cls('HugoComics', 'csdwl'),
            cls('HurrieTheMisManager', 'cssri'),
            cls('HuskyTales', 'cslnp'),
            cls('InkwellForest', 'csmuk'),
            cls('IronyOr', 'csddz'),
            cls('ItsJustJim', 'cszos'),
            cls('JolleyStuffBrowser', 'csjpq'),
            cls('KALEECHIKORNERS', 'cshdw'),
            cls('KartoonsByKline', 'csoei'),
            cls('LaffToons', 'cssvj'),
            cls('LiliAndDerek', 'csvsy'),
            cls('LilleysSillies', 'cstka'),
            cls('LimboRoad', 'csfpp'),
            cls('Loose', 'csmyn'),
            cls('LumAndAbner', 'cscji'),
            cls('MadDogGhettoCop', 'cskwp'),
            cls('MarysNature', 'csogt'),
            cls('Millennialville', 'csxrl'),
            cls('Milton50', 'csmof'),
            cls('Mindframe', 'csqnp'),
            cls('Minihahas', 'csoat'),
            cls('MiscSoup', 'csguq'),
            cls('MisterAndMe', 'csvhr'),
            cls('MockAll', 'csrds'),
            cls('Moments', 'csnso'),
            cls('Mongrels', 'csbjo'),
            cls('MortsIsland', 'csfyq'),
            cls('MySonIsADog', 'csfec'),
            cls('NavyBean', 'csfiq'),
            cls('NoAmbiguity', 'csryw'),
            cls('NoBusinessIKnow', 'csmfg'),
            cls('NoOrdinaryLife', 'csicr'),
            cls('Npchumorcom', 'csbuv'),
            cls('OneFunnyGoldenRetriever', 'csnrf'),
            cls('ONIONAndPEA', 'cstsr'),
            cls('OscarAndAnnie', 'csczw'),
            cls('OverQuirked', 'cspes'),
            cls('PaddedCell', 'csxqk'),
            cls('Painterly', 'csuya'),
            cls('PalAndBuddy', 'csjut'),
            cls('PawsForThoughtComics', 'csced'),
            cls('Peeples', 'csnkd'),
            cls('PeopleOfEarth', 'csjqa'),
            cls('PicpakDog', 'cstmm'),
            cls('PirateMike', 'csxcb'),
            cls('PoliceLimit', 'cspcc'),
            cls('PoliticularJokesAndRuffus', 'csmvz'),
            cls('Prideland', 'csaoa'),
            cls('PrimusTheBadPhilosopher', 'csofd'),
            cls('ProfessorHerbertAndGEO', 'cscje'),
            cls('QueenBlackbeard', 'csecq'),
            cls('QuickDraw', 'csydp'),
            cls('RandysRationale', 'cshsw'),
            cls('Ringers', 'csxhx'),
            cls('RonWarren', 'csuwd'),
            cls('SandSharkBeach', 'cssqk'),
            cls('SharpCurveComics', 'csyek'),
            cls('SherpaAid', 'csvku'),
            cls('SignGarden', 'csbxu'),
            cls('SignsOfAFrustratedGolfer', 'csxdy'),
            cls('Skull', 'csdxo'),
            cls('Skylarking', 'csyac'),
            cls('SleepytownBeagles', 'cssbk'),
            cls('SmallNerdyCreatures', 'cshqb'),
            cls('Smith', 'csmdx'),
            cls('Snootle', 'cseic'),
            cls('SoccerDude', 'csnnb'),
            cls('SoccerEarth', 'csdma'),
            cls('SOD', 'cszdh'),
            cls('SomethingAboutCeleste', 'csgtv'),
            cls('SookyRottweiler', 'csegu'),
            cls('Spaceport51', 'csbyh'),
            cls('SportsByVoort', 'cskin'),
            cls('StaleCrackers', 'csngu'),
            cls('StankoAndTibor', 'csurl'),
            cls('Strangeville', 'cskps'),
            cls('SubSub', 'csvcv'),
            cls('SuburbanFairyTales', 'cscek'),
            cls('SUITSANDGUARDERS', 'cssag'),
            cls('SuperSiblings', 'csdxj'),
            cls('TheBeauforts', 'csfxu'),
            cls('TheBellies', 'csubt'),
            cls('TheBoobiehatch', 'csoev'),
            cls('TheCardinal', 'csfjg'),
            cls('TheDinkledorfs', 'cszhp'),
            cls('TheEntrepiranha', 'cslml'),
            cls('TheFabulousBushPigs', 'cscqi'),
            cls('TheGrayZone', 'csmue'),
            cls('TheGreenMonkeys', 'cscue'),
            cls('TheMagicForest', 'csjts'),
            cls('TheMothManAndLarvaeBoy', 'csycu'),
            cls('TheMountainMen', 'cskqw'),
            cls('TheNeighborhood', 'csrtu'),
            cls('TheNevilleYouKnow', 'csnov'),
            cls('TheNonsenseNewz', 'csghu'),
            cls('TheOldManAndHisDog', 'csudu'),
            cls('TheQuinnAndFinnShow', 'csynn'),
            cls('TheRocks', 'cswky'),
            cls('TheUnemployed', 'csanx'),
            cls('TheWagesOfSindy', 'cszff'),
            cls('Thingsesque', 'cstsq'),
            cls('TodaysTrump', 'csbrj'),
            cls('TopicToons', 'csgly'),
            cls('ToughTown', 'csnjp'),
            cls('ToxicValues', 'csyig'),
            cls('TruthBeKnown', 'csfwi'),
            cls('TuesdaysWithCory', 'csiea'),
            cls('Underdone', 'csayl'),
            cls('UnMannerlyWays', 'csjmh'),
            cls('ViewFromTheCouch', 'csipm'),
            cls('VoicesInTheDark', 'csyou'),
            cls('WarpedAndDemented', 'csbgw'),
            cls('Waskataskahiskewaskewan', 'cssfg'),
            cls('WayOutComics', 'cstrs'),
            cls('WeaselInk', 'csfsn'),
            cls('WhiskeyFalls', 'csitw'),
            cls('Windsock', 'csywy'),
            cls('WrobbertCartoons', 'csupg'),
            cls('YinYangster', 'csteo'),
            cls('ZombieHeights', 'cswjq'),
            cls('Zootopia', 'csquz'),
            # END AUTOUPDATE
        )

43
scripts/comicsherpa.py Executable file
View file

@ -0,0 +1,43 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
"""
Script to get a list of ComicSherpa comics and save the info in a JSON file for
further processing.
"""
from __future__ import absolute_import, division, print_function
from scriptutil import ComicListUpdater
class ComicSherpaUpdater(ComicListUpdater):
    """Collect the comics listed on the comicssherpa.com home page."""

    # Comic names to leave out, grouped by the reason for skipping them.
    # NOTE(review): presumably consulted by ComicListUpdater - confirm there.
    excluded_comics = (
        # missing images
        'Pi',
        'Rufus',

        # too short
        'BillyAndCo',
        'BuffaloChips',
        'Crawdiddy',
        'NewFeature',
    )

    def collect_results(self):
        """Register every feature link found on the site's home page.

        The comic identifier is the second '='-separated piece of the
        link's href; the comic name is the link's text.
        """
        listing = self.get_url(
            'http://www.comicssherpa.com/site/home.html', expand=False)
        anchors = listing.xpath('//a[contains(@href, "site/feature")]')
        for anchor in anchors:
            href_pieces = anchor.attrib['href'].split('=')
            comic_id = href_pieces[1]
            self.add_comic(anchor.text, comic_id)

    def get_entry(self, name, url):
        """Return the ``cls(...)`` source line for one comic."""
        template = u"cls('%s', '%s'),"
        return template % (name, url)
# Only run the updater when this file is executed as a script.
if __name__ == '__main__':
    updater = ComicSherpaUpdater(__file__)
    updater.run()