From 32b0dfef35077a961273bd79e77262926ab31bd0 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 17 Mar 2024 19:18:35 +0100 Subject: [PATCH] Adapt to new ComicsKingdom layout (fixes #307) --- dosagelib/plugins/comicskingdom.py | 82 ++++++++++++++++-------------- dosagelib/plugins/old.py | 28 +++++++++- scripts/comicskingdom.py | 38 +++----------- 3 files changed, 78 insertions(+), 70 deletions(-) diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 0a792dfd5..08413fb4e 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -1,16 +1,18 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Thomas W. Littauer -from ..helpers import bounceStarter, joinPathPartsNamer +from urllib.parse import parse_qs, urlsplit + +from ..helpers import indirectStarter from ..scraper import ParserScraper class ComicsKingdom(ParserScraper): - imageSearch = '//img[@id="theComicImage"]' - prevSearch = '//a[./img[contains(@alt, "Previous")]]' - nextSearch = '//a[./img[contains(@alt, "Next")]]' - starter = bounceStarter - namer = joinPathPartsNamer(pageparts=(-2, -1)) + partDiv = '//div[d:class("comic-reader-item")]' + imageSearch = partDiv + '[1]//a[contains(@href, "/custom-framed-print/")]' + prevSearch = partDiv + '[2]/@data-link' + latestSearch = '//a[re:test(@href, "/[0-9-]+$")]' + starter = indirectStarter help = 'Index format: yyyy-mm-dd' def __init__(self, name, path, lang=None): @@ -20,17 +22,20 @@ class ComicsKingdom(ParserScraper): if lang: self.lang = lang + def imageUrlModifier(self, url, data): + """Extract high-quality image URL from link""" + return parse_qs(urlsplit(url).query)['img'][0] + + def link_modifier(self, fromurl, tourl): + return tourl.replace('//wp.', '//', 1) + @classmethod def getmodules(cls): # noqa: CFQ001 return ( - # Some comics are not listed on the "all" page (too old?) - cls('Retail', 'retail'), - # do not edit anything below since these entries are generated from # scripts/comicskingdom.py # START AUTOUPDATE - cls('AmazingSpiderman', 'amazing-spider-man'), - cls('AmazingSpidermanSpanish', 'hombre-arana', lang='es'), + cls('Alice', 'alice'), cls('Apartment3G', 'apartment-3-g_1'), cls('ArcticCircle', 'arctic-circle'), cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'), @@ -38,22 +43,25 @@ class ComicsKingdom(ParserScraper): cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'), cls('BeetleBailey', 'beetle-bailey-1'), cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'), + cls('BeetleMoses', 'beetle-moses'), cls('BetweenFriends', 'between-friends'), + cls('BewareOfToddler', 'beware-of-toddler'), cls('BigBenBolt', 'big-ben-bolt'), - cls('BigBenBoltSundays', 'big-ben-bolt-sundays'), cls('Bizarro', 'bizarro'), cls('Blondie', 'blondie'), cls('BlondieSpanish', 'pepita', lang='es'), + cls('BobMankoffPresentsShowMeTheFunny', 'show-me-the-funny'), + cls('BobMankoffPresentsShowMeTheFunnyAnimalEdition', 'show-me-the-funny-pets'), cls('BonersArk', 'boners-ark'), - cls('BonersArkSundays', 'boners-ark-sundays'), - cls('BrianDuffy', 'brian-duffy'), + cls('BreakOfDay', 'break-of-day'), cls('BrickBradford', 'brick-bradford'), cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'), cls('BringingUpFather', 'bringing-up-father'), cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'), cls('BuzSawyer', 'buz-sawyer'), + cls('Candorville', 'candorville'), cls('CarpeDiem', 'carpe-diem'), - cls('Crankshaft', 'crankshaft'), + cls('Comiclicious', 'comiclicious'), cls('Crock', 'crock'), cls('CrockSpanish', 'crock-spanish', lang='es'), cls('Curtis', 'curtis'), @@ -62,6 +70,7 @@ class ComicsKingdom(ParserScraper): cls('DavidMHitch', 'david-m-hitch'), cls('DennisTheMenace', 'dennis-the-menace'), cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'), + cls('Dumplings', 'dumplings'), cls('Dustin', 'dustin'), cls('EdGamble', 'ed-gamble'), # EdgeCity has a duplicate in GoComics/EdgeCity @@ -69,18 +78,15 @@ class ComicsKingdom(ParserScraper): cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'), cls('FlashForward', 'flash-forward'), cls('FlashGordon', 'flash-gordon'), - cls('FlashGordonSundays', 'flash-gordon-sundays'), - cls('FunkyWinkerbean', 'funky-winkerbean'), - cls('FunkyWinkerbeanSunday', 'funky-winkerbean-sundays'), - cls('FunkyWinkerbeanVintage', 'funky-winkerbean-1'), - cls('FunnyOnlineAnimals', 'Funny-Online-Animals'), - cls('GearheadGertie', 'Gearhead-Gertie'), + cls('FunnyOnlineAnimals', 'funny-online-animals'), + cls('GearheadGertie', 'gearhead-gertie'), + cls('GodsHands', 'gods-hands'), cls('HagarTheHorrible', 'hagar-the-horrible'), cls('HagarTheHorribleSpanish', 'olafo', lang='es'), cls('HeartOfJulietJones', 'heart-of-juliet-jones'), - cls('HeartOfJulietJonesSundays', 'heart-of-juliet-jones-sundays'), cls('HiAndLois', 'hi-and-lois'), - cls('IntelligentLife', 'Intelligent'), + cls('InsanityStreak', 'insanity-streak'), + cls('IntelligentLife', 'intelligent'), cls('JimmyMargulies', 'jimmy-margulies'), cls('JohnBranch', 'john-branch'), cls('JohnnyHazard', 'johnny-hazard'), @@ -88,7 +94,6 @@ class ComicsKingdom(ParserScraper): cls('JungleJimSundays', 'jungle-jim-sundays'), cls('KatzenjammerKids', 'katzenjammer-kids'), cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'), - cls('KatzenjammerKidsSundays', 'katzenjammer-kids-sundays'), cls('KevinAndKell', 'kevin-and-kell'), cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'), cls('KirkWalters', 'kirk-walters'), @@ -96,44 +101,42 @@ class ComicsKingdom(ParserScraper): cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'), cls('LeeJudge', 'lee-judge'), cls('LegalizationNation', 'legalization-nation'), - cls('LegendOfBill', 'Legend-of-Bill'), + cls('LegendOfBill', 'legend-of-bill'), cls('LittleIodineSundays', 'little-iodine-sundays'), cls('LittleKing', 'the-little-king'), - cls('Lockhorns', 'lockhorns'), - cls('Macanudo', 'Macanudo'), + cls('Macanudo', 'macanudo'), cls('MacanudoSpanish', 'macanudo-spanish', lang='es'), cls('MallardFillmore', 'mallard-fillmore'), - cls('MandrakeTheMagician', 'mandrake-the-magician-1'), + cls('MandrakeTheMagician', 'mandrake-the-magician'), cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'), - cls('MandrakeTheMagicianSundays', 'mandrake-the-magician-sundays'), + cls('MaraLlaveKeeperOfTime', 'mara-llave-keeper-of-time'), cls('MarkTrail', 'mark-trail'), cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'), - cls('MarkTrailVintage', 'Mark-Trail-Vintage'), cls('Marvin', 'marvin'), cls('MarvinSpanish', 'marvin-spanish', lang='es'), cls('MaryWorth', 'mary-worth'), cls('MaryWorthSpanish', 'maria-de-oro', lang='es'), - cls('MikePeters', 'mike-peters'), + cls('Mazetoons', 'mazetoons'), cls('MikeShelton', 'mike-shelton'), cls('MikeSmith', 'mike-smith'), cls('MooseAndMolly', 'moose-and-molly'), cls('MooseAndMollySpanish', 'quintin', lang='es'), - cls('MotherGooseAndGrimm', 'mother-goose-grimm'), cls('MrAbernathySpanish', 'don-abundio', lang='es'), cls('Mutts', 'mutts'), cls('MuttsSpanish', 'motas', lang='es'), + cls('NeverBeenDeader', 'never-been-deader'), cls('OfficeHours', 'office-hours'), + cls('OliveAndPopeye', 'olive-popeye'), cls('OnTheFastrack', 'on-the-fastrack'), cls('PajamaDiaries', 'pajama-diaries'), cls('PardonMyPlanet', 'pardon-my-planet'), cls('Phantom', 'phantom'), cls('PhantomSpanish', 'el-fantasma', lang='es'), - cls('PhantomSundays', 'phantom-sundays'), + cls('PlanetSyndicate', 'the_planet_syndicate'), cls('Popeye', 'popeye'), cls('PopeyesCartoonClub', 'popeyes-cartoon-club'), cls('PopeyeSpanish', 'popeye-spanish', lang='es'), cls('PrinceValiant', 'prince-valiant'), - cls('PrinceValiantSundays', 'prince-valiant-sundays'), cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'), cls('ProsAndCons', 'pros-cons'), cls('Quincy', 'quincy'), @@ -143,7 +146,9 @@ class ComicsKingdom(ParserScraper): cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'), cls('RhymesWithOrange', 'rhymes-with-orange'), cls('RipKirby', 'rip-kirby'), + # Rosebuds has a duplicate in GoComics/Rosebuds cls('SafeHavens', 'safe-havens'), + cls('SagaOfBrannBjornson', 'the-saga-of-brann-bjornson'), cls('Sales', 'sales'), cls('SallyForth', 'sally-forth'), cls('SamAndSilo', 'sam-and-silo'), @@ -151,17 +156,18 @@ class ComicsKingdom(ParserScraper): cls('SecretAgentX9', 'secret-agent-x-9'), # Shoe has a duplicate in GoComics/Shoe cls('SixChix', 'six-chix'), - cls('SlylockFoxAndComicsForKids', 'slylock-fox-and-comics-for-kids'), - cls('SlylockFoxAndComicsForKidsSpanish', 'solo-para-ninos', lang='es'), + cls('SlylockFox', 'slylock-fox-and-comics-for-kids'), + cls('SlylockFoxSpanish', 'solo-para-ninos', lang='es'), + cls('SuburbanFairyTales', 'suburban-fairy-tales'), cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'), cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'), cls('ThimbleTheater', 'thimble-theater'), cls('Tiger', 'tiger'), cls('TigerSpanish', 'tigrillo', lang='es'), - cls('TigerVintage', 'tiger-1'), - cls('TigerVintageSundays', 'tiger-sundays'), cls('TinasGroove', 'tina-s-groove'), cls('ToddTheDinosaur', 'todd-the-dinosaur'), + cls('WillyBlack', 'willy-black'), + cls('WillyBlacksSpanish', 'willy-black-spanish', lang='es'), cls('ZippyThePinhead', 'zippy-the-pinhead'), cls('Zits', 'zits'), cls('ZitsSpanish', 'jeremias', lang='es'), diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 018873864..abd282522 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1588,6 +1588,30 @@ class Removed(Scraper): cls('Angels2200', 'del'), cls('BlackRose', 'brk'), cls('CatenaManor/CatenaCafe'), + cls('ComicsKingdom/AmazingSpiderman'), + cls('ComicsKingdom/AmazingSpidermanSpanish'), + cls('ComicsKingdom/BigBenBoltSundays'), + cls('ComicsKingdom/BonersArkSundays'), + cls('ComicsKingdom/BrianDuffy'), + cls('ComicsKingdom/Crankshaft'), + cls('ComicsKingdom/FlashGordonSundays'), + cls('ComicsKingdom/FunkyWinkerbean'), + cls('ComicsKingdom/FunkyWinkerbeanSunday'), + cls('ComicsKingdom/FunkyWinkerbeanSundays'), + cls('ComicsKingdom/FunkyWinkerbeanVintage'), + cls('ComicsKingdom/HeartOfJulietJonesSundays'), + cls('ComicsKingdom/KatzenjammerKidsSundays'), + cls('ComicsKingdom/Lockhorns'), + cls('ComicsKingdom/MandrakeTheMagicianSundays'), + cls('ComicsKingdom/MarkTrailVintage'), + cls('ComicsKingdom/MikePeters'), + cls('ComicsKingdom/MotherGooseAndGrimm'), + cls('ComicsKingdom/PhantomSundays'), + cls('ComicsKingdom/PrinceValiantSundays'), + cls('ComicsKingdom/Retail'), + cls('ComicsKingdom/TigerSundays'), + cls('ComicsKingdom/TigerVintage'), + cls('ComicsKingdom/TigerVintageSundays'), cls('Everblue', 'block'), cls('FalseStart'), cls('Ginpu'), @@ -1698,10 +1722,8 @@ class Renamed(Scraper): # Renamed in 3.0 cls('AHClub', 'RickGriffinStudios/AHClub'), cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'), - cls('ComicsKingdom/FunkyWinkerbeanSundays', 'ComicsKingdom/FunkyWinkerbeanSunday'), cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'), cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'), - cls('ComicsKingdom/TigerSundays', 'ComicsKingdom/TigerVintageSundays'), cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'), cls('GoComics/Cathy', 'GoComics/CathyClassics'), cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'), @@ -1724,6 +1746,8 @@ class Renamed(Scraper): cls('TracesOfThePast/NSFW', 'RickGriffinStudios/TracesOfThePastNSFW'), # Renamed in 3.1 + cls('ComicsKingdom/SlylockFoxAndComicsForKids', 'ComicsKingdom/SlylockFox'), + cls('ComicsKingdom/SlylockFoxAndComicsForKidsSpanish', 'ComicsKingdom/SlylockFoxSpanish'), cls('Exiern', 'ComicFury/Exiern'), cls('MaxOveracts', 'OccasionalComicsDisorder'), cls('SafelyEndangered', 'WebToons/SafelyEndangered'), diff --git a/scripts/comicskingdom.py b/scripts/comicskingdom.py index b792bd675..c5ee04c85 100755 --- a/scripts/comicskingdom.py +++ b/scripts/comicskingdom.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019 Thomas W. Littauer +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer """ Script to get a list of comicskingdom.com comics and save the info in a JSON file for further processing. @@ -19,39 +19,17 @@ class ComicsKingdomUpdater(ComicListUpdater): "ComicGenesis/%s", ) - def handle_startpage(self, page): - """Parse list of comics from the bottom of the start page.""" - for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS): - link = li.xpath('./a')[0] + def handle_listing(self, page): + for link in page.xpath('//ul[d:class("index")]//a', namespaces=NS): + name = link.text_content().removeprefix('The ') url = link.attrib['href'] - name = link.text.removeprefix('The ') + lang = 'es' if ' (Spanish)' in name else None - self.add_comic(name, (url, None)) - - def handle_listing(self, page, lang: str = None, add: str = ''): - - hasnew = True - while hasnew: - hasnew = False - for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS): - nametag = comicdiv.xpath('./a/comic-name') - if len(nametag) == 0: - continue - name = nametag[0].text.removeprefix('The ') + add - url = comicdiv.xpath('./a')[0].attrib['href'] - - if self.add_comic(name, (url, lang)): - hasnew = True - - nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]') - page = self.get_url(nextlink[0].attrib['href']) + self.add_comic(name, (url, lang)) def collect_results(self): """Parse all search result pages.""" - page = self.get_url('https://www.comicskingdom.com/') - self.handle_startpage(page) - self.handle_listing(page) - self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish') + self.handle_listing(self.get_url('https://comicskingdom.com/features')) def get_entry(self, name: str, data: tuple[str, str]): opt = f", lang='{data[1]}'" if data[1] else ''