diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 0e7813e19..b78485fc5 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -15,10 +15,12 @@ class ComicsKingdom(_ParserScraper): namer = joinPathPartsNamer((-2, -1), ()) help = 'Index format: yyyy-mm-dd' - def __init__(self, name, path): + def __init__(self, name, path, lang=None): super().__init__('ComicsKingdom/' + name) self.url = 'https://comicskingdom.com/' + path self.stripUrl = self.url + '/%s' + if lang: + self.lang = lang # slightly iffy hack taken from certifi # We need or own certificate bundle since ComicsKingdom screws up their @@ -32,43 +34,59 @@ class ComicsKingdom(_ParserScraper): def getmodules(cls): # noqa: Allowed to be long return ( # Some comics are not listed on the "all" page (too old?) + cls('Retail', 'retail'), # do not edit anything below since these entries are generated from # scripts/comicskingdom.py # START AUTOUPDATE - cls('AmazingSpiderMan', 'amazing-spider-man'), + cls('AmazingSpiderman', 'amazing-spider-man'), + cls('AmazingSpidermanSpanish', 'hombre-arana', lang='es'), cls('Apartment3G', 'apartment-3-g_1'), cls('ArcticCircle', 'arctic-circle'), + cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'), cls('BarneyGoogleAndSnuffySmith', 'barney-google-and-snuffy-smith'), + cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'), cls('BeetleBailey', 'beetle-bailey-1'), + cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'), cls('BetweenFriends', 'between-friends'), cls('BigBenBolt', 'big-ben-bolt'), cls('BigBenBoltSundays', 'big-ben-bolt-sundays'), cls('Bizarro', 'bizarro'), cls('Blondie', 'blondie'), + cls('BlondieSpanish', 'pepita', lang='es'), cls('BonersArk', 'boners-ark'), cls('BonersArkSundays', 'boners-ark-sundays'), cls('BrianDuffy', 'brian-duffy'), cls('BrickBradford', 'brick-bradford'), cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'), cls('BringingUpFather', 'bringing-up-father'), + cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'), cls('BuzSawyer', 'buz-sawyer'), cls('CarpeDiem', 'carpe-diem'), cls('Crankshaft', 'crankshaft'), cls('Crock', 'crock'), + cls('CrockSpanish', 'crock-spanish', lang='es'), cls('Curtis', 'curtis'), cls('DaddyDaze', 'daddy-daze'), - # DarrinBell has a duplicate in GoComics/DarrinBell + cls('DarrinBell', 'darrin-bell'), cls('DavidMHitch', 'david-m-hitch'), cls('DennisTheMenace', 'dennis-the-menace'), + cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'), cls('Dustin', 'dustin'), cls('EdGamble', 'ed-gamble'), + # EdgeCity has a duplicate in GoComics/EdgeCity cls('FamilyCircus', 'family-circus'), + cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'), + cls('FlashForward', 'flash-forward'), cls('FlashGordon', 'flash-gordon'), cls('FlashGordonSundays', 'flash-gordon-sundays'), cls('FunkyWinkerbean', 'funky-winkerbean'), - cls('FunkyWinkerbeanSundays', 'funky-winkerbean-sundays'), + cls('FunkyWinkerbeanSunday', 'funky-winkerbean-sundays'), + cls('FunkyWinkerbeanVintage', 'funky-winkerbean-1'), + cls('FunnyOnlineAnimals', 'Funny-Online-Animals'), + cls('GearheadGertie', 'Gearhead-Gertie'), cls('HagarTheHorrible', 'hagar-the-horrible'), + cls('HagarTheHorribleSpanish', 'olafo', lang='es'), cls('HeartOfJulietJones', 'heart-of-juliet-jones'), cls('HeartOfJulietJonesSundays', 'heart-of-juliet-jones-sundays'), cls('HiAndLois', 'hi-and-lois'), @@ -79,57 +97,83 @@ class ComicsKingdom(_ParserScraper): cls('JudgeParker', 'judge-parker'), cls('JungleJimSundays', 'jungle-jim-sundays'), cls('KatzenjammerKids', 'katzenjammer-kids'), + cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'), cls('KatzenjammerKidsSundays', 'katzenjammer-kids-sundays'), cls('KevinAndKell', 'kevin-and-kell'), cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'), cls('KirkWalters', 'kirk-walters'), cls('KrazyKat', 'krazy-kat'), + cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'), cls('LeeJudge', 'lee-judge'), + cls('LegalizationNation', 'legalization-nation'), + cls('LegendOfBill', 'Legend-of-Bill'), cls('LittleIodineSundays', 'little-iodine-sundays'), + cls('LittleKing', 'the-little-king'), cls('Lockhorns', 'lockhorns'), cls('Macanudo', 'Macanudo'), + cls('MacanudoSpanish', 'macanudo-spanish', lang='es'), cls('MallardFillmore', 'mallard-fillmore'), cls('MandrakeTheMagician', 'mandrake-the-magician-1'), + cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'), cls('MandrakeTheMagicianSundays', 'mandrake-the-magician-sundays'), cls('MarkTrail', 'mark-trail'), + cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'), + cls('MarkTrailVintage', 'Mark-Trail-Vintage'), cls('Marvin', 'marvin'), + cls('MarvinSpanish', 'marvin-spanish', lang='es'), cls('MaryWorth', 'mary-worth'), + cls('MaryWorthSpanish', 'maria-de-oro', lang='es'), cls('MikePeters', 'mike-peters'), cls('MikeShelton', 'mike-shelton'), cls('MikeSmith', 'mike-smith'), cls('MooseAndMolly', 'moose-and-molly'), + cls('MooseAndMollySpanish', 'quintin', lang='es'), cls('MotherGooseAndGrimm', 'mother-goose-grimm'), + cls('MrAbernathySpanish', 'don-abundio', lang='es'), cls('Mutts', 'mutts'), + cls('MuttsSpanish', 'motas', lang='es'), cls('OfficeHours', 'office-hours'), cls('OnTheFastrack', 'on-the-fastrack'), cls('PajamaDiaries', 'pajama-diaries'), cls('PardonMyPlanet', 'pardon-my-planet'), cls('Phantom', 'phantom'), + cls('PhantomSpanish', 'el-fantasma', lang='es'), cls('PhantomSundays', 'phantom-sundays'), cls('Popeye', 'popeye'), cls('PopeyesCartoonClub', 'popeyes-cartoon-club'), + cls('PopeyeSpanish', 'popeye-spanish', lang='es'), cls('PrinceValiant', 'prince-valiant'), + cls('PrinceValiantSundays', 'prince-valiant-sundays'), + cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'), cls('ProsAndCons', 'pros-cons'), cls('Quincy', 'quincy'), cls('RadioPatrol', 'radio-patrol'), - cls('Retail', 'retail'), + cls('RaeTheDoe', 'rae-the-doe'), cls('RexMorganMD', 'rex-morgan-m-d'), + cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'), cls('RhymesWithOrange', 'rhymes-with-orange'), cls('RipKirby', 'rip-kirby'), cls('SafeHavens', 'safe-havens'), + cls('Sales', 'sales'), cls('SallyForth', 'sally-forth'), cls('SamAndSilo', 'sam-and-silo'), + cls('SamAndSiloSpanish', 'soso-y-siso', lang='es'), cls('SecretAgentX9', 'secret-agent-x-9'), # Shoe has a duplicate in GoComics/Shoe cls('SixChix', 'six-chix'), cls('SlylockFoxAndComicsForKids', 'slylock-fox-and-comics-for-kids'), + cls('SlylockFoxAndComicsForKidsSpanish', 'solo-para-ninos', lang='es'), cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'), - cls('TheLittleKing', 'the-little-king'), + cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'), cls('ThimbleTheater', 'thimble-theater'), cls('Tiger', 'tiger'), - cls('TigerSundays', 'tiger-sundays'), + cls('TigerSpanish', 'tigrillo', lang='es'), + cls('TigerVintage', 'tiger-1'), + cls('TigerVintageSundays', 'tiger-sundays'), + cls('TinasGroove', 'tina-s-groove'), cls('ToddTheDinosaur', 'todd-the-dinosaur'), cls('ZippyThePinhead', 'zippy-the-pinhead'), cls('Zits', 'zits'), + cls('ZitsSpanish', 'jeremias', lang='es'), # END AUTOUPDATE ) diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 7469c856a..4c973e271 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -259,7 +259,6 @@ class Removed(Scraper): cls('GoComics/Cortoons'), cls('GoComics/CowSheepandaGnomeNamedHelga'), cls('GoComics/DabneyandDad'), - cls('GoComics/DarrinBell'), cls('GoComics/DevinCraneComicStripGhostwriter'), cls('GoComics/DialHforHBomb'), cls('GoComics/DitzAbledPrincess'), @@ -1655,10 +1654,14 @@ class Renamed(Scraper): # Renamed in 3.0 cls('AHClub', 'RickGriffinStudios/AHClub'), cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'), + cls('ComicsKingdom/FunkyWinkerbeanSundays', 'ComicsKingdom/FunkyWinkerbeanSunday'), cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'), + cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'), + cls('ComicsKingdom/TigerSundays', 'ComicsKingdom/TigerVintageSundays'), cls('CrapIDrewOnMyLunchBreak', 'WebToons/CrapIDrewOnMyLunchBreak'), cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'), cls('GoComics/Cathy', 'GoComics/CathyClassics'), + cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'), cls('GoComics/Owlturd', 'GoComics/ShenComix'), cls('GoComics/PeanutsEnEspanol', 'GoComics/SnoopyEnEspanol'), cls('GoComics/RipleysBelieveItOrNotSpanish', 'GoComics/RipleysAunqueUstedNoLoCrea'), diff --git a/scripts/comicskingdom.py b/scripts/comicskingdom.py index ffec36f6b..b792bd675 100755 --- a/scripts/comicskingdom.py +++ b/scripts/comicskingdom.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2021 Tobias Gruetzmacher +# Copyright (C) 2019-2022 Tobias Gruetzmacher # Copyright (C) 2019 Thomas W. Littauer """ Script to get a list of comicskingdom.com comics and save the info in a JSON @@ -8,33 +8,54 @@ file for further processing. """ from scriptutil import ComicListUpdater +from dosagelib.xml import NS class ComicsKingdomUpdater(ComicListUpdater): - dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s", - "KeenSpot/%s", "ComicGenesis/%s") - - # names of comics to exclude - excluded_comics = ( + dup_templates = ( + "Creators/%s", + "GoComics/%s", + "KeenSpot/%s", + "ComicGenesis/%s", ) - def handle_url(self, url): - """Parse one listing page.""" - data = self.get_url(url) + def handle_startpage(self, page): + """Parse list of comics from the bottom of the start page.""" + for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS): + link = li.xpath('./a')[0] + url = link.attrib['href'] + name = link.text.removeprefix('The ') - for comicdiv in data.cssselect('ul.comic-link-group li'): - comiclink = comicdiv.cssselect('a')[0] - comicurl = comiclink.attrib['href'] - name = comicdiv.cssselect('a')[0].text + self.add_comic(name, (url, None)) - self.add_comic(name, comicurl.rsplit('/', 1)[1]) + def handle_listing(self, page, lang: str = None, add: str = ''): + + hasnew = True + while hasnew: + hasnew = False + for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS): + nametag = comicdiv.xpath('./a/comic-name') + if len(nametag) == 0: + continue + name = nametag[0].text.removeprefix('The ') + add + url = comicdiv.xpath('./a')[0].attrib['href'] + + if self.add_comic(name, (url, lang)): + hasnew = True + + nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]') + page = self.get_url(nextlink[0].attrib['href']) def collect_results(self): """Parse all search result pages.""" - self.handle_url('https://www.comicskingdom.com/') + page = self.get_url('https://www.comicskingdom.com/') + self.handle_startpage(page) + self.handle_listing(page) + self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish') - def get_entry(self, name, path): - return u"cls('%s', '%s')," % (name, path) + def get_entry(self, name: str, data: tuple[str, str]): + opt = f", lang='{data[1]}'" if data[1] else '' + return f"cls('{name}', '{data[0].split('/')[3]}'{opt})," if __name__ == '__main__': diff --git a/scripts/gocomics.py b/scripts/gocomics.py index d9b600755..653c605ec 100755 --- a/scripts/gocomics.py +++ b/scripts/gocomics.py @@ -12,6 +12,10 @@ from scriptutil import ComicListUpdater class GoComicsUpdater(ComicListUpdater): + dup_templates = ( + "ComicsKingdom/%s", + ) + # names of comics to exclude excluded_comics = ( # too short diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py index 449e275fc..d29b4d5e9 100644 --- a/scripts/scriptutil.py +++ b/scripts/scriptutil.py @@ -69,11 +69,13 @@ class ComicListUpdater(object): json.dump(self.res, f, sort_keys=True, indent=2, separators=(',', ': ')) - def add_comic(self, name: str, data: tuple[str, ...], count=None): + def add_comic(self, name: str, data, count=None): """Add a collected comic with a specific number of comics.""" name = format_name(name) if not self.should_skip(name): self.res[name] = {'count': count, 'data': data} + return True + return False def collect_results(self): raise NotImplementedError @@ -130,10 +132,10 @@ class ComicListUpdater(object): """Check if comic name already exists.""" names = [(tmpl % name).lower() for tmpl in self.dup_templates] if names: - for scraperobj in scrapers.get(): - lname = scraperobj.name.lower() + for scraper in scrapers.all(): + lname = scraper.name.lower() if lname in names: - return scraperobj.name + return scraper.name return None def get_entry(self, name, data):