From 321d7d0a5a1ed87e52b0e3174707a438f354a70a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 07:19:23 +0000 Subject: [PATCH 01/41] Bump codecov/codecov-action from 3 to 4 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3 to 4. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/codecov/codecov-action/compare/v3...v4) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 91fc9ef4a..33990ed9e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,6 +42,6 @@ jobs: ${{ github.workspace }}/.tox/reports/*/coverage.xml:coverage.py prefix: ${{ github.workspace }}/.tox/py39/lib/python3.9/site-packages - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 with: directory: '.tox/reports' From 7517b2fef832ad688094dc09fc8d659460321df7 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 13 Feb 2024 19:28:26 +0100 Subject: [PATCH 02/41] Fix README badges --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d87bfaf4f..4604ffc6a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # Dosage -[![Tests](https://github.com/webcomics/dosage/actions/workflows/test.yml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/test.yml) +[![CI](https://github.com/webcomics/dosage/actions/workflows/ci.yaml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/ci.yaml) [![Code 
Climate](https://codeclimate.com/github/webcomics/dosage/badges/gpa.svg)](https://codeclimate.com/github/webcomics/dosage) [![codecov](https://codecov.io/gh/webcomics/dosage/branch/master/graph/badge.svg)](https://codecov.io/gh/webcomics/dosage) -![Maintenance](https://img.shields.io/maintenance/yes/2023.svg) +![Maintenance](https://img.shields.io/maintenance/yes/2024.svg) ![License](https://img.shields.io/github/license/webcomics/dosage) Dosage is designed to keep a local copy of specific webcomics and other From 17f7c53e535379be269c62208abb450eaec905d3 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 13 Feb 2024 19:30:07 +0100 Subject: [PATCH 03/41] Update copyright year --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 7233a8518..bd9871497 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs Copyright (C) 2012-2014 Bastian Kleineidam -Copyright (C) 2015-2022 Tobias Gruetzmacher +Copyright (C) 2015-2024 Tobias Gruetzmacher Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the From b3da06b2708590b031b77a54023f95eeaefb7507 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 13 Feb 2024 23:37:08 +0100 Subject: [PATCH 04/41] Fix some modules --- dosagelib/plugins/g.py | 63 +++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py index 0d5c1a5ce..605f96e9c 100644 --- a/dosagelib/plugins/g.py +++ b/dosagelib/plugins/g.py @@ -3,11 +3,11 @@ # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Daniel Ring -from re import compile, escape +from re import compile -from ..scraper import _BasicScraper, _ParserScraper +from ..scraper import _BasicScraper, _ParserScraper, ParserScraper 
from ..helpers import indirectStarter -from ..util import tagre +from ..util import tagre, getQueryParams from .common import ComicControlScraper, WordPressScraper, WordPressNavi @@ -27,13 +27,9 @@ class Garanos(WordPressScraper): endOfLife = True -class GastroPhobia(_ParserScraper): - url = 'http://www.gastrophobia.com/' - stripUrl = url + 'index.php?date=%s' - firstStripUrl = stripUrl % '2008-07-30' - imageSearch = '//div[@id="comic"]//img' - prevSearch = '//div[@id="prev"]/a' - help = 'Index format: yyyy-mm-dd' +class GastroPhobia(ComicControlScraper): + url = 'https://gastrophobia.com/' + firstStripUrl = url + 'comix/the-mane-event' class Geeks(_ParserScraper): @@ -51,7 +47,7 @@ class GeeksNextDoor(_ParserScraper): url = 'http://www.geeksnextcomic.com/' stripUrl = url + '%s.html' firstStripUrl = stripUrl % '2007-03-27' # '2010-10-04' - imageSearch = '//p/img' + imageSearch = ('//p/img', '//p/span/img') prevSearch = ( '//a[img[contains(@src, "/nav_prev")]]', '//a[contains(text(), "< prev")]', # start page is different @@ -59,19 +55,19 @@ class GeeksNextDoor(_ParserScraper): help = 'Index format: yyyy-mm-dd' -class GirlGenius(_BasicScraper): - baseUrl = 'http://www.girlgeniusonline.com/' - rurl = escape(baseUrl) - url = baseUrl + 'comic.php' +class GirlGenius(ParserScraper): + url = 'https://www.girlgeniusonline.com/comic.php' stripUrl = url + '?date=%s' firstStripUrl = stripUrl % '20021104' - imageSearch = compile( - tagre("img", "src", r"(%sggmain/strips/[^']*)" % rurl, quote="'")) - prevSearch = compile(tagre("a", "id", "topprev", quote="\"", - before=r"(%s[^\"']+)" % rurl)) + imageSearch = '//img[@alt="Comic"]' + prevSearch = '//a[@id="topprev"]' multipleImagesPerStrip = True help = 'Index format: yyyymmdd' + def shouldSkipUrl(self, url, data): + """Skip pages without images.""" + return not data.xpath('//div[@id="comicbody"]//img[contains(@src, "comic")]') + class GirlsWithSlingshots(ComicControlScraper): url = 'https://girlswithslingshots.com/' @@ -99,20 
+95,18 @@ class GoGetARoomie(ComicControlScraper): url = 'http://www.gogetaroomie.com' -class GoneWithTheBlastwave(_BasicScraper): - url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1' - starter = indirectStarter - stripUrl = url[:-1] + '%s' +class GoneWithTheBlastwave(ParserScraper): + stripUrl = 'http://www.blastwave-comic.com/index.php?p=comic&nro=%s' firstStripUrl = stripUrl % '1' - imageSearch = compile(r'' + - r'' + - r' Date: Wed, 14 Feb 2024 23:39:08 +0100 Subject: [PATCH 05/41] Update GoComics modules --- dosagelib/plugins/gocomics.py | 45 ++++++++++++++++++----------------- dosagelib/plugins/old.py | 20 ++++++++++++++++ scripts/gocomics.py | 8 ++++--- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py index 0e76ec548..1faee4bdd 100644 --- a/dosagelib/plugins/gocomics.py +++ b/dosagelib/plugins/gocomics.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher from ..scraper import ParserScraper from ..helpers import indirectStarter @@ -44,7 +44,6 @@ class GoComics(ParserScraper): # START AUTOUPDATE cls('1AndDone', '1-and-done'), cls('9ChickweedLane', '9chickweedlane'), - cls('9ChickweedLaneClassics', '9-chickweed-lane-classics'), cls('9To5', '9to5'), cls('Aaggghhh', 'Aaggghhh', 'es'), cls('AdamAtHome', 'adamathome'), @@ -62,6 +61,7 @@ class GoComics(ParserScraper): cls('Annie', 'annie'), cls('AProblemLikeJamal', 'a-problem-like-jamal'), cls('ArloAndJanis', 'arloandjanis'), + cls('ArtByMoga', 'artbymoga'), cls('AskShagg', 'askshagg'), cls('AtTavicat', 'tavicat'), cls('AuntyAcid', 'aunty-acid'), @@ -69,7 +69,6 @@ class GoComics(ParserScraper): 
cls('BackInTheDay', 'backintheday'), cls('BackToBC', 'back-to-bc'), cls('Bacon', 'bacon'), - cls('Badlands', 'badlands'), cls('BadMachinery', 'bad-machinery'), cls('Baldo', 'baldo'), cls('BaldoEnEspanol', 'baldoespanol', 'es'), @@ -90,8 +89,8 @@ class GoComics(ParserScraper): cls('Betty', 'betty'), cls('BFGFSyndrome', 'bfgf-syndrome'), cls('BigNate', 'bignate'), - cls('BigNateFirstClass', 'big-nate-first-class'), cls('BigTop', 'bigtop'), + cls('BillBramhall', 'bill-bramhall'), cls('BirdAndMoon', 'bird-and-moon'), cls('Birdbrains', 'birdbrains'), cls('BleekerTheRechargeableDog', 'bleeker'), @@ -99,14 +98,14 @@ class GoComics(ParserScraper): cls('BloomCounty', 'bloomcounty'), cls('BloomCounty2019', 'bloom-county'), cls('BobGorrell', 'bobgorrell'), + cls('BobTheAngryFlower', 'bob-the-angry-flower'), cls('BobTheSquirrel', 'bobthesquirrel'), cls('BoNanas', 'bonanas'), cls('Boomerangs', 'boomerangs'), - cls('Bottomliners', 'bottomliners'), + cls('BottomLiners', 'bottomliners'), cls('BoundAndGagged', 'boundandgagged'), cls('Bozo', 'bozo'), cls('BreakingCatNews', 'breaking-cat-news'), - cls('BreakOfDay', 'break-of-day'), cls('Brevity', 'brevity'), cls('BrewsterRockit', 'brewsterrockit'), cls('BrianMcFadden', 'brian-mcfadden'), @@ -116,7 +115,6 @@ class GoComics(ParserScraper): cls('Buni', 'buni'), cls('CalvinAndHobbes', 'calvinandhobbes'), cls('CalvinAndHobbesEnEspanol', 'calvinandhobbesespanol', 'es'), - cls('Candorville', 'candorville'), cls('CatanaComics', 'little-moments-of-love'), cls('CathyClassics', 'cathy'), cls('CathyCommiserations', 'cathy-commiserations'), @@ -139,17 +137,18 @@ class GoComics(ParserScraper): cls('CowAndBoyClassics', 'cowandboy'), cls('CowTown', 'cowtown'), cls('Crabgrass', 'crabgrass'), + # Crankshaft has a duplicate in ComicsKingdom/Crankshaft cls('Crumb', 'crumb'), cls('CulDeSac', 'culdesac'), + cls('Curses', 'curses'), cls('DaddysHome', 'daddyshome'), cls('DanaSummers', 'danasummers'), cls('DarkSideOfTheHorse', 'darksideofthehorse'), + 
cls('DayByDave', 'day-by-dave'), cls('DeepDarkFears', 'deep-dark-fears'), cls('DeFlocked', 'deflocked'), cls('DiamondLil', 'diamondlil'), cls('DickTracy', 'dicktracy'), - cls('DilbertClassics', 'dilbert-classics'), - cls('DilbertEnEspanol', 'dilbert-en-espanol', 'es'), cls('DinosaurComics', 'dinosaur-comics'), cls('DogEatDoug', 'dogeatdoug'), cls('DogsOfCKennel', 'dogsofckennel'), @@ -160,15 +159,14 @@ class GoComics(ParserScraper): cls('Doonesbury', 'doonesbury'), cls('Drabble', 'drabble'), cls('DrewSheneman', 'drewsheneman'), - cls('DumbwichCastle', 'dumbwich-castle'), cls('EdgeCity', 'edge-city'), cls('Eek', 'eek'), cls('ElCafDePoncho', 'el-cafe-de-poncho', 'es'), cls('EmmyLou', 'emmy-lou'), cls('Endtown', 'endtown'), + cls('EricAllie', 'eric-allie'), cls('EverydayPeopleCartoons', 'everyday-people-cartoons'), cls('Eyebeam', 'eyebeam'), - cls('EyebeamClassic', 'eyebeam-classic'), cls('FalseKnees', 'false-knees'), cls('FamilyTree', 'familytree'), cls('Farcus', 'farcus'), @@ -191,8 +189,8 @@ class GoComics(ParserScraper): cls('FreeRange', 'freerange'), cls('FreshlySqueezed', 'freshlysqueezed'), cls('FrogApplause', 'frogapplause'), + cls('FurBabies', 'furbabies'), cls('Garfield', 'garfield'), - cls('GarfieldClassics', 'garfield-classics'), cls('GarfieldEnEspanol', 'garfieldespanol', 'es'), cls('GaryMarkstein', 'garymarkstein'), cls('GaryVarvel', 'garyvarvel'), @@ -222,6 +220,7 @@ class GoComics(ParserScraper): cls('HerbAndJamaal', 'herbandjamaal'), cls('Herman', 'herman'), cls('HomeAndAway', 'homeandaway'), + cls('HomeFree', 'homefree'), cls('HotComicsForCoolPeople', 'hot-comics-for-cool-people'), cls('HutchOwen', 'hutch-owen'), cls('ImagineThis', 'imaginethis'), @@ -238,10 +237,12 @@ class GoComics(ParserScraper): cls('JeffDanziger', 'jeffdanziger'), cls('JeffStahler', 'jeffstahler'), cls('JenSorensen', 'jen-sorensen'), + cls('JerryKingComics', 'jerry-king-comics'), cls('JimBentonCartoons', 'jim-benton-cartoons'), cls('JimMorin', 'jimmorin'), cls('JoeHeller', 
'joe-heller'), cls('JoelPett', 'joelpett'), + cls('JoeyWeatherford', 'joey-weatherford'), cls('JohnDeering', 'johndeering'), cls('JumpStart', 'jumpstart'), cls('JunkDrawer', 'junk-drawer'), @@ -287,7 +288,6 @@ class GoComics(ParserScraper): cls('Lunarbaboon', 'lunarbaboon'), cls('M2Bulls', 'm2bulls'), cls('Maintaining', 'maintaining'), - cls('MakingIt', 'making-it'), cls('MannequinOnTheMoon', 'mannequin-on-the-moon'), cls('MariasDay', 'marias-day'), cls('Marmaduke', 'marmaduke'), @@ -299,6 +299,7 @@ class GoComics(ParserScraper): cls('MessycowComics', 'messy-cow'), cls('MexikidStories', 'mexikid-stories'), cls('MichaelRamirez', 'michaelramirez'), + cls('MikeBeckom', 'mike-beckom'), cls('MikeDuJour', 'mike-du-jour'), cls('MikeLester', 'mike-lester'), cls('MikeLuckovich', 'mikeluckovich'), @@ -307,9 +308,9 @@ class GoComics(ParserScraper): cls('Momma', 'momma'), cls('Monty', 'monty'), cls('MontyDiaros', 'monty-diaros', 'es'), + # MotherGooseAndGrimm has a duplicate in ComicsKingdom/MotherGooseAndGrimm cls('MotleyClassics', 'motley-classics'), cls('MrLowe', 'mr-lowe'), - cls('MtPleasant', 'mtpleasant'), cls('MuttAndJeff', 'muttandjeff'), cls('MyDadIsDracula', 'my-dad-is-dracula'), cls('MythTickle', 'mythtickle'), @@ -341,10 +342,10 @@ class GoComics(ParserScraper): cls('OverTheHedge', 'overthehedge'), cls('OzyAndMillie', 'ozy-and-millie'), cls('PatOliphant', 'patoliphant'), - cls('PCAndPixel', 'pcandpixel'), cls('Peanuts', 'peanuts'), cls('PeanutsBegins', 'peanuts-begins'), cls('PearlsBeforeSwine', 'pearlsbeforeswine'), + cls('PedroXMolina', 'pedroxmolina'), cls('Periquita', 'periquita', 'es'), cls('PerlasParaLosCerdos', 'perlas-para-los-cerdos', 'es'), cls('PerryBibleFellowship', 'perry-bible-fellowship'), @@ -383,7 +384,6 @@ class GoComics(ParserScraper): cls('RoseIsRose', 'roseisrose'), cls('Rubes', 'rubes'), cls('RudyPark', 'rudypark'), - cls('SaltNPepper', 'salt-n-pepper'), cls('SarahsScribbles', 'sarahs-scribbles'), cls('SaturdayMorningBreakfastCereal', 
'saturday-morning-breakfast-cereal'), cls('SavageChickens', 'savage-chickens'), @@ -394,13 +394,11 @@ class GoComics(ParserScraper): cls('ShermansLagoon', 'shermanslagoon'), cls('ShirleyAndSonClassics', 'shirley-and-son-classics'), cls('Shoe', 'shoe'), - cls('SigneWilkinson', 'signewilkinson'), cls('SketchsharkComics', 'sketchshark-comics'), cls('SkinHorse', 'skinhorse'), cls('Skippy', 'skippy'), cls('SmallPotatoes', 'small-potatoes'), cls('SnoopyEnEspanol', 'peanuts-espanol', 'es'), - cls('Snowflakes', 'snowflakes'), cls('SnowSez', 'snow-sez'), cls('SpeedBump', 'speedbump'), cls('SpiritOfTheStaircase', 'spirit-of-the-staircase'), @@ -410,9 +408,7 @@ class GoComics(ParserScraper): cls('SteveKelley', 'stevekelley'), cls('StickyComics', 'sticky-comics'), cls('StoneSoup', 'stonesoup'), - cls('StoneSoupClassics', 'stone-soup-classics'), cls('StrangeBrew', 'strangebrew'), - cls('StuartCarlson', 'stuartcarlson'), cls('StudioJantze', 'studio-jantze'), cls('SunnyStreet', 'sunny-street'), cls('SunshineState', 'sunshine-state'), @@ -425,6 +421,7 @@ class GoComics(ParserScraper): cls('TarzanEnEspanol', 'tarzan-en-espanol', 'es'), cls('TedRall', 'ted-rall'), cls('TenCats', 'ten-cats'), + cls('Tex', 'tex'), cls('TextsFromMittens', 'texts-from-mittens'), cls('Thatababy', 'thatababy'), cls('ThatIsPriceless', 'that-is-priceless'), @@ -451,6 +448,7 @@ class GoComics(ParserScraper): cls('TheHumbleStumble', 'humble-stumble'), cls('TheKChronicles', 'thekchronicles'), cls('TheKnightLife', 'theknightlife'), + cls('TheLockhorns', 'lockhorns'), cls('TheMartianConfederacy', 'the-martian-confederacy'), cls('TheMeaningOfLila', 'meaningoflila'), cls('TheMiddleAge', 'the-middle-age'), @@ -473,6 +471,7 @@ class GoComics(ParserScraper): cls('TruthFacts', 'truth-facts'), cls('Tutelandia', 'tutelandia', 'es'), cls('TwoPartyOpera', 'two-party-opera'), + cls('UFO', 'ufo'), cls('UnderpantsAndOverbites', 'underpants-and-overbites'), cls('UnderstandingChaos', 'understanding-chaos'), 
cls('UnstrangePhenomena', 'unstrange-phenomena'), @@ -487,6 +486,7 @@ class GoComics(ParserScraper): cls('ViiviAndWagner', 'viivi-and-wagner'), cls('WallaceTheBrave', 'wallace-the-brave'), cls('WaltHandelsman', 'walthandelsman'), + cls('Wannabe', 'wannabe'), cls('Warped', 'warped'), cls('WatchYourHead', 'watchyourhead'), cls('Wawawiwa', 'wawawiwa'), @@ -505,6 +505,7 @@ class GoComics(ParserScraper): cls('WuMo', 'wumo'), cls('WumoEnEspanol', 'wumoespanol', 'es'), cls('Yaffle', 'yaffle'), + cls('YeahItsChill', 'yeah-its-chill'), cls('YesImHotInThis', 'yesimhotinthis'), cls('ZackHill', 'zackhill'), cls('ZenPencils', 'zen-pencils'), diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 11ee39045..40c8b8c4a 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1589,6 +1589,26 @@ class Removed(Scraper): cls('WebToons/CrystalVirus'), cls('WebToons/OVERPOWERED'), cls('WintersLight'), + + # Removed in 3.1 + cls('GoComics/9ChickweedLaneClassics'), + cls('GoComics/Badlands'), + cls('GoComics/BigNateFirstClass'), + cls('GoComics/BreakOfDay'), + cls('GoComics/Candorville'), + cls('GoComics/DilbertClassics'), + cls('GoComics/DilbertEnEspanol'), + cls('GoComics/DumbwichCastle'), + cls('GoComics/EyebeamClassic'), + cls('GoComics/GarfieldClassics'), + cls('GoComics/MakingIt'), + cls('GoComics/MtPleasant'), + cls('GoComics/PCAndPixel'), + cls('GoComics/SaltNPepper'), + cls('GoComics/SigneWilkinson'), + cls('GoComics/Snowflakes'), + cls('GoComics/StoneSoupClassics'), + cls('GoComics/StuartCarlson'), ) diff --git a/scripts/gocomics.py b/scripts/gocomics.py index 653c605ec..6637682a7 100755 --- a/scripts/gocomics.py +++ b/scripts/gocomics.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# 
SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher """ Script to get a list of gocomics and save the info in a JSON file for further processing. @@ -20,6 +20,8 @@ class GoComicsUpdater(ComicListUpdater): excluded_comics = ( # too short 'LukeyMcGarrysTLDR', + # Has its own module + 'Widdershins', ) def handle_gocomics(self, url, outercss='a.gc-blended-link', lang=None): From 6f6b4d66033222d681aee19b2d25278904dca06e Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 14 Feb 2024 23:44:36 +0100 Subject: [PATCH 06/41] Sort removed comics under the correct version --- dosagelib/plugins/old.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 40c8b8c4a..035da92b1 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -604,7 +604,6 @@ class Removed(Scraper): cls('WotNow'), # Removed in 3.0 - cls('CatenaManor/CatenaCafe'), cls('ComicFury/AdventuresOftheGreatCaptainMaggieandCrew'), cls('ComicFury/AWAKENING'), cls('ComicFury/Beebleville'), @@ -833,8 +832,6 @@ class Removed(Scraper): cls('ComicsKingdom/Redeye'), cls('ComicsKingdom/RedeyeSundays'), cls('CrapIDrewOnMyLunchBreak'), - cls('FalseStart'), - cls('Ginpu'), cls('GoComics/060'), cls('GoComics/2CowsAndAChicken'), cls('GoComics/ABitSketch'), @@ -995,11 +992,9 @@ class Removed(Scraper): cls('GoComics/Wrobbertcartoons'), cls('GoComics/Zootopia'), cls('JustAnotherEscape'), - cls('KemonoCafe/PrincessBunny'), cls('Laiyu', 'brk'), cls('MangaDex/DrStone', 'legal'), cls('MangaDex/HeavensDesignTeam', 'legal'), - cls('MangaDex/ImTheMaxLevelNewbie', 'legal'), cls('MangaDex/SPYxFAMILY', 'legal'), cls('Ryugou'), cls('SeelPeel'), @@ -1573,24 +1568,22 @@ class Removed(Scraper): cls('SnafuComics/Tin'), cls('SnafuComics/Titan'), cls('StudioKhimera/Eorah', 'mov'), - cls('StudioKhimera/Mousechevious'), cls('StuffNoOneToldMe'), cls('TaleOfTenThousand'), - 
cls('TalesAndTactics'), cls('TheCyantianChronicles/CookieCaper'), cls('TheCyantianChronicles/Pawprints'), - cls('VampireHunterBoyfriends'), cls('VGCats/Adventure'), cls('VGCats/Super'), cls('VictimsOfTheSystem'), cls('WebDesignerCOTW'), cls('WebToons/Adamsville'), cls('WebToons/CrapIDrewOnMyLunchBreak'), - cls('WebToons/CrystalVirus'), - cls('WebToons/OVERPOWERED'), cls('WintersLight'), # Removed in 3.1 + cls('CatenaManor/CatenaCafe'), + cls('FalseStart'), + cls('Ginpu'), cls('GoComics/9ChickweedLaneClassics'), cls('GoComics/Badlands'), cls('GoComics/BigNateFirstClass'), @@ -1609,6 +1602,13 @@ class Removed(Scraper): cls('GoComics/Snowflakes'), cls('GoComics/StoneSoupClassics'), cls('GoComics/StuartCarlson'), + cls('KemonoCafe/PrincessBunny'), + cls('MangaDex/ImTheMaxLevelNewbie', 'legal'), + cls('StudioKhimera/Mousechevious'), + cls('TalesAndTactics'), + cls('VampireHunterBoyfriends'), + cls('WebToons/CrystalVirus'), + cls('WebToons/OVERPOWERED'), ) From aa50afdbf7724177b81e9e39a18faba345b03f0c Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 14 Feb 2024 23:51:30 +0100 Subject: [PATCH 07/41] Remove Lackadaisy (we are blocked) --- dosagelib/plugins/l.py | 19 +------------------ dosagelib/plugins/old.py | 1 + 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index 28f432187..e04f168f0 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -5,24 +5,7 @@ # SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper, _ParserScraper from ..helpers import bounceStarter, indirectStarter -from .common import ComicControlScraper, WordPressScraper, WordPressNaviIn - - -class Lackadaisy(ParserScraper): - url = 'https://www.lackadaisy.com/comic.php' - stripUrl = url + '?comicid=%s' - firstStripUrl = stripUrl % '1' - imageSearch = '//div[@id="exhibit"]/img[contains(@src, "comic/")]' - prevSearch = '//div[@class="prev"]/a' - nextSearch = '//div[@class="next"]/a' - help = 
'Index format: n' - starter = bounceStarter - - def namer(self, imageUrl, pageUrl): - # Use comic id for filename - num = pageUrl.rsplit('=', 1)[-1] - ext = imageUrl.rsplit('.', 1)[-1] - return 'lackadaisy_%s.%s' % (num, ext) +from .common import ComicControlScraper, WordPressScraper class Lancer(WordPressScraper): diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 035da92b1..1c37f90c5 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1603,6 +1603,7 @@ class Removed(Scraper): cls('GoComics/StoneSoupClassics'), cls('GoComics/StuartCarlson'), cls('KemonoCafe/PrincessBunny'), + cls('Lackadaisy', 'block'), cls('MangaDex/ImTheMaxLevelNewbie', 'legal'), cls('StudioKhimera/Mousechevious'), cls('TalesAndTactics'), From ea2bad55000803e0eb59595aa87af19230467bd6 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 15 Feb 2024 00:50:33 +0100 Subject: [PATCH 08/41] Fix some more comic modules --- dosagelib/plugins/m.py | 90 +++++++++++----------------------------- dosagelib/plugins/o.py | 6 +++ dosagelib/plugins/old.py | 3 ++ 3 files changed, 33 insertions(+), 66 deletions(-) diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py index c60c184a5..d69531e36 100644 --- a/dosagelib/plugins/m.py +++ b/dosagelib/plugins/m.py @@ -4,7 +4,7 @@ # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Daniel Ring import json -from re import compile, escape, IGNORECASE +from re import compile, IGNORECASE from ..helpers import indirectStarter from ..scraper import ParserScraper, _BasicScraper, _ParserScraper @@ -13,13 +13,10 @@ from ..xml import NS from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic -class MacHall(_BasicScraper): - url = 'http://www.machall.com/' - stripUrl = url + 'view.php?date=%s' - firstStripUrl = stripUrl % '2000-11-07' - imageSearch = compile(r']+?src=\'drop_shadow/previous.gif\'>') - help = 'Index format: yyyy-mm-dd' +class MacHall(ComicControlScraper): + 
url = 'https://www.machall.com/' + stripUrl = url + 'comic/%s' + firstStripUrl = stripUrl % 'moving-in' class MadamAndEve(_BasicScraper): @@ -58,12 +55,12 @@ class MareInternum(WordPressScraper): firstStripUrl = stripUrl % 'intro-page-1' -class Marilith(_BasicScraper): - url = 'http://www.marilith.com/' +class Marilith(ParserScraper): + url = 'https://web.archive.org/web/20170619193143/http://www.marilith.com/' stripUrl = url + 'archive.php?date=%s' firstStripUrl = stripUrl % '20041215' - imageSearch = compile(r'') - help = 'Index Format: n' - - -class MyCartoons(_BasicScraper): - url = 'http://mycartoons.de/' - rurl = escape(url) - stripUrl = url + 'page/%s' - imageSearch = ( - compile(tagre("img", "src", r'(%swp-content/cartoons/(?:[^"]+/)?\d+-\d+-\d+[^"]+)' % rurl)), - compile(tagre("img", "src", r'(%scartoons/[^"]+/\d+-\d+-\d+[^"]+)' % rurl)), - ) - prevSearch = compile(tagre("a", "href", r'(%spage/[^"]+)' % rurl) + - "«") - help = 'Index format: number' - lang = 'de' +class Moonsticks(ParserScraper): + url = "https://moonsticks.org/" + imageSearch = "//div[d:class('entry-content')]//img" + prevSearch = ('//a[@rel="prev"]', "//a[text()='\u00AB Prev']") class MyLifeWithFel(ParserScraper): diff --git a/dosagelib/plugins/o.py b/dosagelib/plugins/o.py index 5706d2ba2..2f85ee765 100644 --- a/dosagelib/plugins/o.py +++ b/dosagelib/plugins/o.py @@ -11,6 +11,12 @@ from ..util import tagre from .common import WordPressScraper, WordPressNavi +class OccasionalComicsDisorder(WordPressScraper): + url = 'https://occasionalcomics.com/' + stripUrl = url + 'comic/%s/' + firstStripUrl = stripUrl % 'latest-comic-2' + + class OctopusPie(_ParserScraper): url = 'http://www.octopuspie.com/' rurl = escape(url) diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 1c37f90c5..4ca85ba18 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1605,6 +1605,8 @@ class Removed(Scraper): cls('KemonoCafe/PrincessBunny'), cls('Lackadaisy', 'block'), 
cls('MangaDex/ImTheMaxLevelNewbie', 'legal'), + cls('MrLovenstein', 'jsh'), + cls('MyCartoons'), cls('StudioKhimera/Mousechevious'), cls('TalesAndTactics'), cls('VampireHunterBoyfriends'), @@ -1716,5 +1718,6 @@ class Renamed(Scraper): # Renamed in 3.1 cls('Exiern', 'ComicFury/Exiern'), + cls('MaxOveracts', 'OccasionalComicsDisorder'), cls('SafelyEndangered', 'WebToons/SafelyEndangered'), ) From 3c203dae72439bc1530a36c470ea6f2b3114edae Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 15 Feb 2024 23:21:24 +0100 Subject: [PATCH 09/41] Fix PvPOnline (fixes #299) Use a "working" latest page. (Additionally, mark the comic as "end of life", since it hasn't been updated in 2 years...) --- dosagelib/plugins/p.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 41aad4a30..89f6f5f67 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper, ParserScraper @@ -333,11 +333,12 @@ class PS238(_ParserScraper): class PvPOnline(ParserScraper): baseUrl = 'https://www.toonhoundstudios.com/' - url = baseUrl + 'pvp/' - stripUrl = baseUrl + 'comic/%s/' + stripUrl = baseUrl + 'comic/%s/?sid=372' + url = stripUrl % 'pvp-2022-09-16' firstStripUrl = stripUrl % '19980504' imageSearch = '//div[@id="spliced-comic"]//img/@data-src-img' prevSearch = '//a[d:class("prev")]' + endOfLife = True - def namer(self, imageUrl, pageUrl): - return 'pvp' + 
imageUrl.rsplit('/', 1)[-1] + def namer(self, image_url, page_url): + return 'pvp' + image_url.rsplit('/', 1)[-1] From 89b38d450ff8dbb0f8dee5b5a1fd054fddf74f6f Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 15 Feb 2024 23:55:46 +0100 Subject: [PATCH 10/41] Fix PHDComics (fixes #238) --- dosagelib/plugins/p.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 89f6f5f67..87a628f94 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -150,14 +150,16 @@ class PeterAndWhitney(_ParserScraper): prevSearch = '//a[./img[contains(@src, "nav_previous")]]' -class PHDComics(_ParserScraper): +class PHDComics(ParserScraper): BROKEN_COMMENT_END = compile(r'--!>') baseUrl = 'http://phdcomics.com/' url = baseUrl + 'comics.php' stripUrl = baseUrl + 'comics/archive.php?comicid=%s' firstStripUrl = stripUrl % '1' - imageSearch = '//img[@id="comic2"]' + imageSearch = ('//img[@id="comic2"]', + r'//img[d:class("img-responsive") and re:test(@name, "comic\d+")]') + multipleImagesPerStrip = True prevSearch = '//a[img[contains(@src, "prev_button")]]' nextSearch = '//a[img[contains(@src, "next_button")]]' help = 'Index format: n (unpadded)' @@ -173,7 +175,7 @@ class PHDComics(_ParserScraper): # video self.stripUrl % '1880', self.stripUrl % '1669', - ) + ) or data.xpath('//img[@id="comic" and contains(@src, "phd083123s")]') class Picklewhistle(ComicControlScraper): From b495c51bcbaedc0a2abf27d0f45050ad452286b9 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Fri, 16 Feb 2024 00:20:20 +0100 Subject: [PATCH 11/41] Fix another bunch of comics --- dosagelib/plugins/p.py | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 87a628f94..cc5319aa4 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -34,16 +34,11 @@ class ParadigmShift(_BasicScraper): help = 'Index 
format: custom' -class ParallelUniversum(_BasicScraper): - url = 'http://www.paralleluniversum.net/' - rurl = escape(url) +class ParallelUniversum(WordPressScraper): + url = 'https://www.paralleluniversum.net/' stripUrl = url + '%s/' firstStripUrl = stripUrl % '001-der-comic-ist-tot' - imageSearch = compile(tagre("img", "src", - r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl)) - prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl) + - tagre("span", "class", "prev")) - help = 'Index format: number-stripname' + prevSearch = '//a[@rel="prev"]' lang = 'de' @@ -95,14 +90,12 @@ class PebbleVersion(_ParserScraper): help = 'Index format: n (unpadded)' -class PennyAndAggie(_BasicScraper): - url = 'http://pennyandaggie.com/' - rurl = escape(url) - stripUrl = url + 'index.php?p=%s' - imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)')) - prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") + - tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote="")) - help = 'Index format: n (unpadded)' +class PennyAndAggie(ComicControlScraper): + url = 'https://pixietrixcomix.com/penny-and-aggie' + stripUrl = url + '/%s' + firstStripUrl = stripUrl % '2004-09-06' + endOfLife = True + help = 'Index format: yyyy-mm-dd' class PennyArcade(_ParserScraper): @@ -117,19 +110,17 @@ class PennyArcade(_ParserScraper): help = 'Index format: yyyy/mm/dd' -class PeppermintSaga(WordPressNavi): +class PeppermintSaga(WordPressScraper): url = 'http://www.pepsaga.com/' - stripUrl = url + '?p=%s' - firstStripUrl = stripUrl % '3' - help = 'Index format: number' + stripUrl = url + 'comics/%s/' + firstStripUrl = stripUrl % 'the-sword-of-truth-vol1' adult = True -class PeppermintSagaBGR(WordPressNavi): +class PeppermintSagaBGR(WordPressScraper): url = 'http://bgr.pepsaga.com/' - stripUrl = url + '?p=%s' - firstStripUrl = stripUrl % '4' - help = 'Index format: number' + stripUrl = url + '?comic=%s' + firstStripUrl = stripUrl % '04172011' adult 
= True From f76061e1386894c00b07e198dc84c91b45053234 Mon Sep 17 00:00:00 2001 From: garbled1 Date: Fri, 16 Feb 2024 09:11:49 -0700 Subject: [PATCH 12/41] Fix a potential infinite loop condition, that causes the script to consume all ram on the box and get oom-killed. --- scripts/order-symlinks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/order-symlinks.py b/scripts/order-symlinks.py index c38511676..a0079c806 100755 --- a/scripts/order-symlinks.py +++ b/scripts/order-symlinks.py @@ -61,7 +61,10 @@ def create_symlinks(d): else: order.extend(data["pages"][work]["images"].values()) if "prev" in data["pages"][work]: - work = data["pages"][work]["prev"] + if data["pages"][work]["prev"] == work: + work = None + else: + work = data["pages"][work]["prev"] else: work = None order.reverse() From ee22169cc5a081612ccd092f7cdc50d0c5868e4f Mon Sep 17 00:00:00 2001 From: Tim Rightnour <6556271+garbled1@users.noreply.github.com> Date: Fri, 16 Feb 2024 17:01:48 -0700 Subject: [PATCH 13/41] Add 7 new comics to the WebToons module (#301) - 99ReinforcedStick - CydoniaShattering - DungeonCleaningLife - FinalRaidBoss - KnightUnderMyHeart - MySClassHunter - MythicItemObtained --- dosagelib/plugins/webtoons.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 81b2a3035..393f9d809 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -52,6 +52,7 @@ class WebToons(ParserScraper): cls('1111Animals', 'comedy/1111-animals', 437), cls('2015SpaceSeries', 'sf/2015-space-series', 391), cls('3SecondStrip', 'comedy/3-second-strip', 380), + cls('99ReinforcedStick', 'comedy/99-reinforced-wooden-stick', 4286), cls('ABittersweetLife', 'slice-of-life/a-bittersweet-life', 294), cls('AboutDeath', 'drama/about-death', 82), cls('ABudgiesLife', 'slice-of-life/its-a-budgies-life', 985), @@ -128,6 +129,7 @@ class WebToons(ParserScraper): cls('CursedPrincessClub', 
'comedy/cursed-princess-club', 1537), cls('Cyberbunk', 'sf/cyberbunk', 466), cls('Cyberforce', 'super-hero/cyberforce', 531), + cls('CydoniaShattering', 'fantasy/cydonia-shattering', 2881), cls('CykoKO', 'super-hero/cyko-ko', 560), cls('Darbi', 'action/darbi', 1098), cls('Darchon', 'challenge/darchon', 532053), @@ -153,6 +155,7 @@ class WebToons(ParserScraper): cls('DrawnToYou', 'challenge/drawn-to-you', 172022), cls('DrFrost', 'drama/dr-frost', 371), cls('DuelIdentity', 'challenge/duel-identity', 532064), + cls('DungeonCleaningLife', 'action/the-dungeon-cleaning-life-of-a-once-genius-hunter', 4677), cls('DungeonMinis', 'challenge/dungeonminis', 64132), cls('Dustinteractive', 'comedy/dustinteractive', 907), cls('DutyAfterSchool', 'sf/duty-after-school', 370), @@ -170,6 +173,7 @@ class WebToons(ParserScraper): cls('FAMILYMAN', 'drama/family-man', 85), cls('FantasySketchTheGame', 'sf/fantasy-sketch', 1020), cls('Faust', 'supernatural/faust', 522), + cls('FinalRaidBoss', 'fantasy/the-final-raid-boss', 3921), cls('FINALITY', 'mystery/finality', 1457), cls('Firebrand', 'supernatural/firebrand', 877), cls('FirstDefense', 'challenge/first-defense', 532072), @@ -235,6 +239,7 @@ class WebToons(ParserScraper): cls('KindOfLove', 'slice-of-life/kind-of-love', 1850), cls('KissItGoodbye', 'challenge/kiss-it-goodbye', 443703), cls('KnightRun', 'sf/knight-run', 67), + cls('KnightUnderMyHeart', 'action/knight-under-my-heart', 4215), cls('Kubera', 'fantasy/kubera', 83), cls('LalinsCurse', 'supernatural/lalins-curse', 1601), cls('Lars', 'slice-of-life/lars', 358), @@ -292,6 +297,8 @@ class WebToons(ParserScraper): cls('MyGiantNerdBoyfriend', 'slice-of-life/my-giant-nerd-boyfriend', 958), cls('MyKittyAndOldDog', 'slice-of-life/my-kitty-and-old-dog', 184), cls('MyNameIsBenny', 'slice-of-life/my-name-is-benny', 1279), + cls('MySClassHunter', 'action/my-s-class-hunters', 3963), + cls('MythicItemObtained', 'fantasy/mythic-item-obtained', 4582), cls('MyWallflowerKiss', 
'challenge/my-wallflower-kiss', 151869), cls('NanoList', 'sf/nano-list', 700), cls('NationalDogDay2016', 'slice-of-life/national-dog-day', 747), From 7b9ca867fba50aed83f568f672bb283fd16539ae Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 18 Feb 2024 16:53:17 +0100 Subject: [PATCH 14/41] Add some more type annotations --- dosagelib/comic.py | 23 ++++++++++-------- dosagelib/scraper.py | 58 ++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/dosagelib/comic.py b/dosagelib/comic.py index 20374c126..222549e14 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +from __future__ import annotations + import os import glob import codecs import contextlib from datetime import datetime +from typing import Iterator from .output import out from .util import unquote, getFilename, urlopen, strsize @@ -14,27 +17,27 @@ from .events import getHandler # Maximum content size for images -MaxImageBytes = 1024 * 1024 * 20 # 20 MB +MAX_IMAGE_BYTES = 1024 * 1024 * 20 # 20 MB # RFC 1123 format, as preferred by RFC 2616 RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT" -class ComicStrip(object): +class ComicStrip: """A list of comic image URLs.""" - def __init__(self, scraper, strip_url, image_urls, text=None): + def __init__(self, scraper, strip_url: str, image_urls: str, text=None) -> None: """Store the image URL list.""" self.scraper = scraper self.strip_url = strip_url self.image_urls = image_urls self.text = text - def getImages(self): + def getImages(self) -> Iterator[ComicImage]: """Get a list of image downloaders.""" for image_url 
in self.image_urls: yield self.getDownloader(image_url) - def getDownloader(self, url): + def getDownloader(self, url: str) -> ComicImage: """Get an image downloader.""" filename = self.scraper.namer(url, self.strip_url) if filename is None: @@ -43,7 +46,7 @@ class ComicStrip(object): text=self.text) -class ComicImage(object): +class ComicImage: """A comic image downloader.""" ChunkBytes = 1024 * 100 # 100KB @@ -64,7 +67,7 @@ class ComicImage(object): headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR) self.urlobj = urlopen(self.url, self.scraper.session, referrer=self.referrer, - max_content_bytes=MaxImageBytes, stream=True, + max_content_bytes=MAX_IMAGE_BYTES, stream=True, headers=headers) if self.urlobj.status_code == 304: # Not modified return diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 5a411b9b4..e9928c391 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -119,45 +119,45 @@ class Scraper: if val: self._indexes = tuple(sorted(val)) - def __init__(self, name): + def __init__(self, name: str) -> None: """Initialize internal variables.""" self.name = name - self.urls = set() + self.urls: set[str] = set() self._indexes = () - self.skippedUrls = set() + self.skippedUrls: set[str] = set() self.hitFirstStripUrl = False - def __hash__(self): + def __hash__(self) -> int: """Get hash value from name and index list.""" return hash((self.name, self.indexes)) - def shouldSkipUrl(self, url, data): + def shouldSkipUrl(self, url: str, data) -> bool: """Determine if search for images in given URL should be skipped.""" return False - def getComicStrip(self, url, data): + def getComicStrip(self, url, data) -> ComicStrip: """Get comic strip downloader for given URL and data.""" - imageUrls = self.extract_image_urls(url, data) + urls = self.extract_image_urls(url, data) # map modifier function on image URLs - imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls] + urls = [self.imageUrlModifier(x, data) for x in urls] # 
remove duplicate URLs - imageUrls = uniq(imageUrls) - if len(imageUrls) > 1 and not self.multipleImagesPerStrip: + urls = uniq(urls) + if len(urls) > 1 and not self.multipleImagesPerStrip: out.warn( u"Found %d images instead of 1 at %s with expressions %s" % - (len(imageUrls), url, prettyMatcherList(self.imageSearch))) - image = imageUrls[0] - out.warn(u"Choosing image %s" % image) - imageUrls = (image,) - elif not imageUrls: - out.warn(u"Found no images at %s with expressions %s" % (url, + (len(urls), url, prettyMatcherList(self.imageSearch))) + image = urls[0] + out.warn("Choosing image %s" % image) + urls = (image,) + elif not urls: + out.warn("Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch))) if self.textSearch: text = self.fetchText(url, data, self.textSearch, optional=self.textOptional) else: text = None - return ComicStrip(self, url, imageUrls, text=text) + return ComicStrip(self, url, urls, text=text) def getStrips(self, maxstrips=None): """Get comic strips.""" @@ -217,7 +217,7 @@ class Scraper: break url = prevUrl - def isfirststrip(self, url): + def isfirststrip(self, url: str) -> bool: """Check if the specified URL is the first strip of a comic. 
This is specially for comics taken from archive.org, since the base URL of archive.org changes whenever pages are taken from a different @@ -228,7 +228,7 @@ class Scraper: currenturl = ARCHIVE_ORG_URL.sub('', url) return firsturl == currenturl - def getPrevUrl(self, url, data): + def getPrevUrl(self, url: str, data) -> str | None: """Find previous URL.""" prevUrl = None if self.prevSearch: @@ -243,40 +243,40 @@ class Scraper: getHandler().comicPageLink(self, url, prevUrl) return prevUrl - def getIndexStripUrl(self, index): + def getIndexStripUrl(self, index: str) -> str: """Get comic strip URL from index.""" return self.stripUrl % index - def starter(self): + def starter(self) -> str: """Get starter URL from where to scrape comic strips.""" return self.url - def namer(self, image_url, page_url): + def namer(self, image_url: str, page_url: str) -> str | None: """Return filename for given image and page URL.""" return - def link_modifier(self, fromurl, tourl): + def link_modifier(self, fromurl: str, tourl: str) -> str: """Optional modification of parsed link (previous/back/latest) URLs. Useful if there are domain redirects. The default implementation does not modify the URL. """ return tourl - def imageUrlModifier(self, image_url, data): + def imageUrlModifier(self, image_url: str, data) -> str: """Optional modification of parsed image URLs. Useful if the URL needs to be fixed before usage. The default implementation does not modify the URL. The given data is the URL page data. 
""" return image_url - def vote(self): + def vote(self) -> None: """Cast a public vote for this comic.""" uid = get_system_uid() data = {"name": self.name.replace('/', '_'), "uid": uid} response = self.session.post(configuration.VoteUrl, data=data) response.raise_for_status() - def get_download_dir(self, basepath): + def get_download_dir(self, basepath: str) -> str: """Try to find the corect download directory, ignoring case differences.""" path = basepath @@ -294,16 +294,16 @@ class Scraper: path = os.path.join(path, part) return path - def getCompleteFile(self, basepath): + def getCompleteFile(self, basepath: str) -> str: """Get filename indicating all comics are downloaded.""" dirname = self.get_download_dir(basepath) return os.path.join(dirname, "complete.txt") - def isComplete(self, basepath): + def isComplete(self, basepath: str) -> bool: """Check if all comics are downloaded.""" return os.path.isfile(self.getCompleteFile(basepath)) - def setComplete(self, basepath): + def setComplete(self, basepath: str) -> None: """Set complete flag for this comic, ie. 
all comics are downloaded.""" if self.endOfLife: filename = self.getCompleteFile(basepath) From 15423eab21eed8345a9c625b921499c8df995c8c Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 18 Feb 2024 17:26:54 +0100 Subject: [PATCH 15/41] Drop support for Python 3.7 --- .github/workflows/ci.yaml | 2 +- Jenkinsfile | 1 - README.md | 2 +- dosagelib/__init__.py | 13 +++++-------- dosagelib/plugins/t.py | 5 +---- pyproject.toml | 7 ++----- tox.ini | 3 +-- 7 files changed, 11 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 33990ed9e..15a247d60 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 diff --git a/Jenkinsfile b/Jenkinsfile index db7d2c10c..880f9fa62 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,6 @@ def pys = [ [name: 'Python 3.10', docker: '3.10-bookworm', tox:'py310', main: false], [name: 'Python 3.9', docker: '3.9-bookworm', tox:'py39', main: false], [name: 'Python 3.8', docker: '3.8-bookworm', tox:'py38', main: false], - [name: 'Python 3.7', docker: '3.7-bookworm', tox:'py37', main: false], ] properties([ diff --git a/README.md b/README.md index 4604ffc6a..fb46a453a 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ are old enough to view them. ### Dependencies Since dosage is written in [Python](http://www.python.org/), a Python -installation is required: Dosage needs at least Python 3.7. Dosage requires +installation is required: Dosage needs at least Python 3.8. Dosage requires some Python modules from PyPI, so installation with `pip` is recommended. 
### Using the Windows binary diff --git a/dosagelib/__init__.py b/dosagelib/__init__.py index 4f80013df..39ba36033 100644 --- a/dosagelib/__init__.py +++ b/dosagelib/__init__.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher """ Automated comic downloader. Dosage traverses comic websites in order to download each strip of the comic. The intended use is for @@ -14,14 +14,11 @@ The primary interface is the 'dosage' commandline script. Comic modules for each comic are located in L{dosagelib.plugins}. """ -try: - from importlib.metadata import version, PackageNotFoundError -except ImportError: - from importlib_metadata import version, PackageNotFoundError +from importlib.metadata import version, PackageNotFoundError from .output import out -AppName = u'dosage' +AppName = 'dosage' try: __version__ = version(AppName) # PEP 396 except PackageNotFoundError: diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index ebe864694..ee6801a21 100644 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -4,10 +4,7 @@ # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape, MULTILINE -try: - from functools import cached_property -except ImportError: - from cached_property import cached_property +from functools import cached_property from ..scraper import _BasicScraper, _ParserScraper, ParserScraper from ..helpers import indirectStarter, joinPathPartsNamer diff --git a/pyproject.toml b/pyproject.toml index c5217a4c0..10c294f4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: 
Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -27,15 +26,13 @@ classifiers = [ "Topic :: Multimedia :: Graphics", ] keywords = ["comic", "webcomic", "downloader", "archiver", "crawler"] -requires-python = ">=3.7" +requires-python = ">=3.8" dependencies = [ "colorama", "imagesize", "lxml>=4.0.0", "platformdirs", "requests>=2.0", - "cached_property;python_version<'3.8'", - "importlib_metadata;python_version<'3.8'", "importlib_resources>=5.0.0;python_version<'3.9'", ] dynamic = ["version"] @@ -101,7 +98,7 @@ ignore = [ ] noqa-require-code = true no-accept-encodings = true -min-version = "3.7" +min-version = "3.8" extend-exclude = [ '.venv', 'build', diff --git a/tox.ini b/tox.ini index 27eed7f37..02b4ffe3f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,9 @@ [tox] -envlist = py37, py38, py39, py310, py311, py312, flake8 +envlist = py38, py39, py310, py311, py312, flake8 isolated_build = True [gh-actions] python = - 3.7: py37 3.8: py38 3.9: py39 3.10: py310 From 3722fbe7e4aafce7486e99ad3b000b8b1c87a1d2 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 18 Feb 2024 18:02:02 +0100 Subject: [PATCH 16/41] Update joinPathPartsNamer: Remove defaults --- dosagelib/helpers.py | 42 ++++++++++++++++++------------ dosagelib/plugins/c.py | 4 +-- dosagelib/plugins/comicskingdom.py | 7 +---- dosagelib/plugins/f.py | 4 +-- dosagelib/plugins/t.py | 2 +- dosagelib/plugins/z.py | 4 +-- tests/test_helpers.py | 12 +++++---- 7 files changed, 41 insertions(+), 34 deletions(-) diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index d53e04cfb..b3e4f00cc 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -1,39 +1,49 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 
2015-2020 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring +from __future__ import annotations + +from typing import Protocol + from .util import getQueryParams +from .scraper import Scraper -def queryNamer(param, use_page_url=False): +class Namer(Protocol): + """A protocol for generic callbacks to name web comic images.""" + def __call__(_, self: Scraper, image_url: str, page_url: str) -> str | None: + ... + + +def queryNamer(param, use_page_url=False) -> Namer: """Get name from URL query part.""" - def _namer(self, image_url, page_url): + def _namer(self, image_url: str, page_url: str) -> str | None: """Get URL query part.""" url = page_url if use_page_url else image_url return getQueryParams(url)[param][0] return _namer -def regexNamer(regex, use_page_url=False): +def regexNamer(regex, use_page_url=False) -> Namer: """Get name from regular expression.""" - def _namer(self, image_url, page_url): + def _namer(self, image_url: str, page_url: str) -> str | None: """Get first regular expression group.""" url = page_url if use_page_url else image_url mo = regex.search(url) - if mo: - return mo.group(1) + return mo.group(1) if mo else None return _namer -def joinPathPartsNamer(pageurlparts, imageurlparts=(-1,), joinchar='_'): +def joinPathPartsNamer(pageparts=(), imageparts=(), joinchar='_') -> Namer: """Get name by mashing path parts together with underscores.""" - def _namer(self, imageurl, pageurl): + def _namer(self: Scraper, image_url: str, page_url: str) -> str | None: # Split and drop host name - pageurlsplit = pageurl.split('/')[3:] - imageurlsplit = imageurl.split('/')[3:] - joinparts = ([pageurlsplit[i] for i in pageurlparts] + - [imageurlsplit[i] for i in imageurlparts]) + pagesplit = page_url.split('/')[3:] + imagesplit = 
image_url.split('/')[3:] + joinparts = ([pagesplit[i] for i in pageparts] + + [imagesplit[i] for i in imageparts]) return joinchar.join(joinparts) return _namer diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py index c596ede60..27f7278d2 100644 --- a/dosagelib/plugins/c.py +++ b/dosagelib/plugins/c.py @@ -404,7 +404,7 @@ class CrossTimeCafe(_ParserScraper): class CSectionComics(WordPressScraper): url = 'https://www.csectioncomics.com/' firstStripUrl = url + 'comics/one-day-in-country' - namer = joinPathPartsNamer((), (-3, -2, -1)) + namer = joinPathPartsNamer(imageparts=(-3, -2, -1)) multipleImagesPerStrip = True @@ -466,7 +466,7 @@ class CyanideAndHappiness(ParserScraper): prevSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="180deg"]]' nextSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="0deg"]]' starter = bounceStarter - namer = joinPathPartsNamer((), range(-4, 0)) + namer = joinPathPartsNamer(imageparts=range(-4, 0)) class CynWolf(_ParserScraper): diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 818a37fa7..0a792dfd5 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -1,11 +1,6 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Thomas W. 
Littauer -try: - from importlib_resources import as_file, files -except ImportError: - from importlib.resources import as_file, files - from ..helpers import bounceStarter, joinPathPartsNamer from ..scraper import ParserScraper @@ -15,7 +10,7 @@ class ComicsKingdom(ParserScraper): prevSearch = '//a[./img[contains(@alt, "Previous")]]' nextSearch = '//a[./img[contains(@alt, "Next")]]' starter = bounceStarter - namer = joinPathPartsNamer((-2, -1), ()) + namer = joinPathPartsNamer(pageparts=(-2, -1)) help = 'Index format: yyyy-mm-dd' def __init__(self, name, path, lang=None): diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index d3f45ac98..01c43da33 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -171,7 +171,7 @@ class Fragile(_ParserScraper): endOfLife = True -class FredoAndPidjin(_ParserScraper): +class FredoAndPidjin(ParserScraper): url = 'https://www.pidjin.net/' stripUrl = url + '%s/' firstStripUrl = stripUrl % '2006/02/19/goofy-monday' @@ -180,7 +180,7 @@ class FredoAndPidjin(_ParserScraper): prevSearch = '//span[d:class("prev")]/a' latestSearch = '//section[d:class("latest")]//a' starter = indirectStarter - namer = joinPathPartsNamer((0, 1, 2)) + namer = joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,)) class Freefall(_ParserScraper): diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index ee6801a21..1919e274a 100644 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -272,7 +272,7 @@ class ToonHole(ParserScraper): prevSearch = '//a[@rel="prev"]' latestSearch = '//a[@rel="bookmark"]' starter = indirectStarter - namer = joinPathPartsNamer((), (-3, -2, -1)) + namer = joinPathPartsNamer(imageparts=(-3, -2, -1)) class TrippingOverYou(_BasicScraper): diff --git a/dosagelib/plugins/z.py b/dosagelib/plugins/z.py index f7556110a..f5ef8e954 100644 --- a/dosagelib/plugins/z.py +++ b/dosagelib/plugins/z.py @@ -23,7 +23,7 @@ class Zapiro(ParserScraper): imageSearch = '//div[@id="cartoon"]/img' prevSearch = 
'//a[d:class("left")]' nextSearch = '//a[d:class("right")]' - namer = joinPathPartsNamer((-1,), ()) + namer = joinPathPartsNamer(pageparts=(-1,)) class ZenPencils(WordPressNavi): @@ -60,7 +60,7 @@ class Zwarwald(BasicScraper): tagre("img", "src", r'http://zwarwald\.de/images/prev\.jpg', quote="'")) - namer = joinPathPartsNamer((), (-3, -2, -1)) + namer = joinPathPartsNamer(imageparts=(-3, -2, -1)) help = 'Index format: number' def shouldSkipUrl(self, url, data): diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 598a74fc4..8c13c89ca 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher from dosagelib.helpers import joinPathPartsNamer, queryNamer -class TestNamer(object): +class TestNamer: """ Tests for comic namer. """ @@ -16,6 +16,8 @@ class TestNamer(object): def test_joinPathPartsNamer(self): imgurl = 'https://HOST/wp-content/uploads/2019/02/tennis5wp-1.png' pageurl = 'https://HOST/2019/03/11/12450/' - assert joinPathPartsNamer((0, 1, 2))(self, imgurl, pageurl) == '2019_03_11_tennis5wp-1.png' - assert joinPathPartsNamer((0, 1, 2), (-1,), '-')(self, imgurl, pageurl) == '2019-03-11-tennis5wp-1.png' - assert joinPathPartsNamer((0, -2), ())(self, imgurl, pageurl) == '2019_12450' + assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))(self, + imgurl, pageurl) == '2019_03_11_tennis5wp-1.png' + assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,), joinchar='-')(self, + imgurl, pageurl) == '2019-03-11-tennis5wp-1.png' + assert joinPathPartsNamer(pageparts=(0, -2))(self, imgurl, pageurl) == '2019_12450' From da60636b8a3020c250159e74178fb0a0dad84b59 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 18 Feb 2024 23:43:02 +0100 Subject: [PATCH 17/41] Fix some Shivae Studios modules --- dosagelib/plugins/old.py | 4 +++- dosagelib/plugins/shivaestudios.py | 26 
+++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 4ca85ba18..7cda0dc1a 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1581,6 +1581,8 @@ class Removed(Scraper): cls('WintersLight'), # Removed in 3.1 + cls('AbbysAgency', 'brk'), + cls('BlackRose', 'brk'), cls('CatenaManor/CatenaCafe'), cls('FalseStart'), cls('Ginpu'), @@ -1607,6 +1609,7 @@ class Removed(Scraper): cls('MangaDex/ImTheMaxLevelNewbie', 'legal'), cls('MrLovenstein', 'jsh'), cls('MyCartoons'), + cls('Shivae/BlackRose', 'brk'), cls('StudioKhimera/Mousechevious'), cls('TalesAndTactics'), cls('VampireHunterBoyfriends'), @@ -1704,7 +1707,6 @@ class Renamed(Scraper): cls('GoComics/Widdershins', 'Widdershins'), cls('Guardia', 'ComicFury/Guardia'), cls('RadioactivePanda', 'Tapas/RadioactivePanda'), - cls('Shivae/BlackRose', 'BlackRose'), cls('SmackJeeves/BlackTapestries', 'ComicFury/BlackTapestries'), cls('SmackJeeves/ByTheBook', 'ByTheBook'), cls('SmackJeeves/FurryExperience', 'ComicFury/FurryExperience'), diff --git a/dosagelib/plugins/shivaestudios.py b/dosagelib/plugins/shivaestudios.py index ace417cbd..2f508cabe 100644 --- a/dosagelib/plugins/shivaestudios.py +++ b/dosagelib/plugins/shivaestudios.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2021 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from .common import WordPressSpliced @@ -12,22 +12,20 @@ class _CommonMulti(WordPressSpliced): self.endOfLife = eol -class AbbysAgency(WordPressSpliced): - url = 'https://abbysagency.us/' - stripUrl = url + 'blog/comic/%s/' - firstStripUrl = stripUrl % 'a' - - class AlienDice(WordPressSpliced): url = 'https://aliendice.com/' stripUrl = url + 'comic/%s/' firstStripUrl = stripUrl % '05162001' + def shouldSkipUrl(self, url, data): + """Skip pages without images.""" 
+ return not data.xpath(self.imageSearch) + def getPrevUrl(self, url, data): # Fix broken navigation if url == self.stripUrl % 'day-29-part-2-page-3-4': return self.stripUrl % 'day-29-part-2-page-3-2' - return super(AlienDice, self).getPrevUrl(url, data) + return super().getPrevUrl(url, data) def namer(self, imageUrl, pageUrl): # Fix inconsistent filename @@ -47,12 +45,6 @@ class AlienDiceLegacy(WordPressSpliced): return super().isfirststrip(url.rsplit('?', 1)[0]) -class BlackRose(WordPressSpliced): - url = 'https://www.blackrose.monster/' - stripUrl = url + 'comic/%s/' - firstStripUrl = stripUrl % '2004-11-01' - - class TheCyantianChronicles(_CommonMulti): baseUrl = 'https://cyantian.net/' @@ -81,9 +73,9 @@ class TheCyantianChronicles(_CommonMulti): class Shivae(WordPressSpliced): - url = 'https://shivae.com/' + url = 'https://shivae.net/' stripUrl = url + 'comic/%s/' - firstStripUrl = stripUrl % '09202001' + firstStripUrl = stripUrl % '2002-02-27' class ShivaeComics(_CommonMulti): From 05b9be4cd9da4730bbf7c87b07d49c552c1b260e Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Mon, 19 Feb 2024 00:53:36 +0100 Subject: [PATCH 18/41] Fix some more modules --- dosagelib/plugins/a.py | 81 ++++++++++++---------------------------- dosagelib/plugins/old.py | 4 ++ 2 files changed, 27 insertions(+), 58 deletions(-) diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index 32a5b42ce..2a4ef5b70 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -1,18 +1,18 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring -from re import compile, escape, MULTILINE +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel 
Ring +from re import compile, MULTILINE from ..util import tagre -from ..scraper import BasicScraper, ParserScraper, _BasicScraper, _ParserScraper -from ..helpers import regexNamer, bounceStarter, indirectStarter +from ..scraper import ParserScraper, _BasicScraper, _ParserScraper +from ..helpers import joinPathPartsNamer, bounceStarter, indirectStarter from .common import WordPressScraper, WordPressNavi, WordPressWebcomic -class AbstruseGoose(_ParserScraper): - url = 'https://abstrusegoose.com/' +class AbstruseGoose(ParserScraper): + url = 'https://web.archive.org/web/20230930172141/https://abstrusegoose.com/' starter = bounceStarter stripUrl = url + '%s' firstStripUrl = stripUrl % '1' @@ -41,24 +41,16 @@ class AbsurdNotions(_BasicScraper): help = 'Index format: n (unpadded)' -class AcademyVale(_BasicScraper): - url = 'http://www.imagerie.com/vale/' - stripUrl = url + 'avarch.cgi?%s' - firstStripUrl = stripUrl % '001' - imageSearch = compile(tagre('img', 'src', r'(avale\d{4}-\d{2}\.gif)')) - prevSearch = compile(tagre('a', 'href', r'(avarch[^">]+)', quote="") + - tagre('img', 'src', r'AVNavBack\.gif')) - help = 'Index format: nnn' - - -class Achewood(_ParserScraper): - url = 'https://www.achewood.com/' - stripUrl = url + 'index.php?date=%s' - firstStripUrl = stripUrl % '10012001' - imageSearch = '//p[@id="comic_body"]//img' - prevSearch = '//span[d:class("left")]/a[d:class("dateNav")]' - help = 'Index format: mmddyyyy' - namer = regexNamer(compile(r'date=(\d+)')) +class Achewood(ParserScraper): + baseUrl = 'https://achewood.com/' + stripUrl = baseUrl + '%s/title.html' + url = stripUrl % '2016/12/25' + firstStripUrl = stripUrl % '2001/10/01' + imageSearch = '//img[d:class("comicImage")]' + prevSearch = '//a[d:class("comic_prev")]' + namer = joinPathPartsNamer(pageparts=range(0, 2)) + help = 'Index format: yyyy/mm/dd' + endOfLife = True class AdventuresOfFifne(_ParserScraper): @@ -117,12 +109,8 @@ class AhoiPolloi(_ParserScraper): help = 'Index format: yyyymmdd' 
-class AhoyEarth(WordPressNavi): - url = 'http://www.ahoyearth.com/' - - class AirForceBlues(WordPressScraper): - url = 'http://farvatoons.com/' + url = 'https://web.archive.org/web/20210102113825/http://farvatoons.com/' firstStripUrl = url + 'comic/in-texas-there-are-texans/' @@ -207,14 +195,11 @@ class AltermetaOld(_ParserScraper): help = 'Index format: n (unpadded)' -class AmazingSuperPowers(_BasicScraper): - url = 'http://www.amazingsuperpowers.com/' - rurl = escape(url) +class AmazingSuperPowers(WordPressNavi): + url = 'https://www.amazingsuperpowers.com/' stripUrl = url + '%s/' firstStripUrl = stripUrl % '2007/09/heredity' - imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)) - prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev")) - help = 'Index format: yyyy/mm/name' + imageSearch = '//div[d:class("comicpane")]/img' def shouldSkipUrl(self, url, data): """Skip pages without images.""" @@ -243,18 +228,6 @@ class Amya(WordPressScraper): url = 'http://www.amyachronicles.com/' -class Anaria(_ParserScraper): - url = 'https://www.leahbriere.com/anaria-the-witchs-dream/' - firstStripUrl = url - imageSearch = '//div[contains(@class, "gallery")]//a' - multipleImagesPerStrip = True - endOfLife = True - - def namer(self, imageUrl, pageUrl): - filename = imageUrl.rsplit('/', 1)[-1] - return filename.replace('00.jpg', 'new00.jpg').replace('new', '1') - - class Angband(_ParserScraper): url = 'http://angband.calamarain.net/' stripUrl = url + '%s' @@ -272,14 +245,6 @@ class Angband(_ParserScraper): return self.pages[self.pages.index(url) - 1] -class Angels2200(_BasicScraper): - url = 'http://www.janahoffmann.com/angels/' - stripUrl = url + '%s' - imageSearch = compile(tagre("img", "src", r"(http://www\.janahoffmann\.com/angels/comics/[^']+)", quote="'")) - prevSearch = compile(tagre("a", "href", r'([^"]+)') + "« Previous") - help = 'Index format: yyyy/mm/dd/part--comic-' - - class Annyseed(_ParserScraper): baseUrl = 
('https://web.archive.org/web/20190511031451/' 'http://www.mirrorwoodcomics.com/') diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 7cda0dc1a..72f423f94 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1582,6 +1582,10 @@ class Removed(Scraper): # Removed in 3.1 cls('AbbysAgency', 'brk'), + cls('AcademyVale'), + cls('AhoyEarth', 'block'), + cls('Anaria', 'del'), + cls('Angels2200', 'del'), cls('BlackRose', 'brk'), cls('CatenaManor/CatenaCafe'), cls('FalseStart'), From 48eb4ef2041c0ca7026e638c492b4a5d2963dcfe Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Mon, 19 Feb 2024 01:01:45 +0100 Subject: [PATCH 19/41] Remove Everblue, they are blocking us (fixes #303) --- dosagelib/plugins/e.py | 9 --------- dosagelib/plugins/old.py | 1 + 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index d423528dd..5329256d6 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -167,15 +167,6 @@ class Erstwhile(WordPressNavi): endOfLife = True -class Everblue(ComicControlScraper): - url = 'http://www.everblue-comic.com/comic/' - stripUrl = url + '%s' - firstStripUrl = stripUrl % '1' - - def namer(self, imageUrl, pageUrl): - return imageUrl.rsplit('/', 1)[-1].split('-', 1)[1] - - class EverybodyLovesEricRaymond(_ParserScraper): url = 'http://geekz.co.uk/lovesraymond/' firstStripUrl = url + 'archive/slashdotted' diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 72f423f94..018873864 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1588,6 +1588,7 @@ class Removed(Scraper): cls('Angels2200', 'del'), cls('BlackRose', 'brk'), cls('CatenaManor/CatenaCafe'), + cls('Everblue', 'block'), cls('FalseStart'), cls('Ginpu'), cls('GoComics/9ChickweedLaneClassics'), From 32b0dfef35077a961273bd79e77262926ab31bd0 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 17 Mar 2024 19:18:35 +0100 Subject: [PATCH 20/41] Adapt to new 
ComicsKingdom layout (fixes #307) --- dosagelib/plugins/comicskingdom.py | 82 ++++++++++++++++-------------- dosagelib/plugins/old.py | 28 +++++++++- scripts/comicskingdom.py | 38 +++----------- 3 files changed, 78 insertions(+), 70 deletions(-) diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 0a792dfd5..08413fb4e 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -1,16 +1,18 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Thomas W. Littauer -from ..helpers import bounceStarter, joinPathPartsNamer +from urllib.parse import parse_qs, urlsplit + +from ..helpers import indirectStarter from ..scraper import ParserScraper class ComicsKingdom(ParserScraper): - imageSearch = '//img[@id="theComicImage"]' - prevSearch = '//a[./img[contains(@alt, "Previous")]]' - nextSearch = '//a[./img[contains(@alt, "Next")]]' - starter = bounceStarter - namer = joinPathPartsNamer(pageparts=(-2, -1)) + partDiv = '//div[d:class("comic-reader-item")]' + imageSearch = partDiv + '[1]//a[contains(@href, "/custom-framed-print/")]' + prevSearch = partDiv + '[2]/@data-link' + latestSearch = '//a[re:test(@href, "/[0-9-]+$")]' + starter = indirectStarter help = 'Index format: yyyy-mm-dd' def __init__(self, name, path, lang=None): @@ -20,17 +22,20 @@ class ComicsKingdom(ParserScraper): if lang: self.lang = lang + def imageUrlModifier(self, url, data): + """Extract high-quality image URL from link""" + return parse_qs(urlsplit(url).query)['img'][0] + + def link_modifier(self, fromurl, tourl): + return tourl.replace('//wp.', '//', 1) + @classmethod def getmodules(cls): # noqa: CFQ001 return ( - # Some comics are not listed on the "all" page (too old?) 
- cls('Retail', 'retail'), - # do not edit anything below since these entries are generated from # scripts/comicskingdom.py # START AUTOUPDATE - cls('AmazingSpiderman', 'amazing-spider-man'), - cls('AmazingSpidermanSpanish', 'hombre-arana', lang='es'), + cls('Alice', 'alice'), cls('Apartment3G', 'apartment-3-g_1'), cls('ArcticCircle', 'arctic-circle'), cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'), @@ -38,22 +43,25 @@ class ComicsKingdom(ParserScraper): cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'), cls('BeetleBailey', 'beetle-bailey-1'), cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'), + cls('BeetleMoses', 'beetle-moses'), cls('BetweenFriends', 'between-friends'), + cls('BewareOfToddler', 'beware-of-toddler'), cls('BigBenBolt', 'big-ben-bolt'), - cls('BigBenBoltSundays', 'big-ben-bolt-sundays'), cls('Bizarro', 'bizarro'), cls('Blondie', 'blondie'), cls('BlondieSpanish', 'pepita', lang='es'), + cls('BobMankoffPresentsShowMeTheFunny', 'show-me-the-funny'), + cls('BobMankoffPresentsShowMeTheFunnyAnimalEdition', 'show-me-the-funny-pets'), cls('BonersArk', 'boners-ark'), - cls('BonersArkSundays', 'boners-ark-sundays'), - cls('BrianDuffy', 'brian-duffy'), + cls('BreakOfDay', 'break-of-day'), cls('BrickBradford', 'brick-bradford'), cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'), cls('BringingUpFather', 'bringing-up-father'), cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'), cls('BuzSawyer', 'buz-sawyer'), + cls('Candorville', 'candorville'), cls('CarpeDiem', 'carpe-diem'), - cls('Crankshaft', 'crankshaft'), + cls('Comiclicious', 'comiclicious'), cls('Crock', 'crock'), cls('CrockSpanish', 'crock-spanish', lang='es'), cls('Curtis', 'curtis'), @@ -62,6 +70,7 @@ class ComicsKingdom(ParserScraper): cls('DavidMHitch', 'david-m-hitch'), cls('DennisTheMenace', 'dennis-the-menace'), cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'), + cls('Dumplings', 'dumplings'), cls('Dustin', 'dustin'), 
cls('EdGamble', 'ed-gamble'), # EdgeCity has a duplicate in GoComics/EdgeCity @@ -69,18 +78,15 @@ class ComicsKingdom(ParserScraper): cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'), cls('FlashForward', 'flash-forward'), cls('FlashGordon', 'flash-gordon'), - cls('FlashGordonSundays', 'flash-gordon-sundays'), - cls('FunkyWinkerbean', 'funky-winkerbean'), - cls('FunkyWinkerbeanSunday', 'funky-winkerbean-sundays'), - cls('FunkyWinkerbeanVintage', 'funky-winkerbean-1'), - cls('FunnyOnlineAnimals', 'Funny-Online-Animals'), - cls('GearheadGertie', 'Gearhead-Gertie'), + cls('FunnyOnlineAnimals', 'funny-online-animals'), + cls('GearheadGertie', 'gearhead-gertie'), + cls('GodsHands', 'gods-hands'), cls('HagarTheHorrible', 'hagar-the-horrible'), cls('HagarTheHorribleSpanish', 'olafo', lang='es'), cls('HeartOfJulietJones', 'heart-of-juliet-jones'), - cls('HeartOfJulietJonesSundays', 'heart-of-juliet-jones-sundays'), cls('HiAndLois', 'hi-and-lois'), - cls('IntelligentLife', 'Intelligent'), + cls('InsanityStreak', 'insanity-streak'), + cls('IntelligentLife', 'intelligent'), cls('JimmyMargulies', 'jimmy-margulies'), cls('JohnBranch', 'john-branch'), cls('JohnnyHazard', 'johnny-hazard'), @@ -88,7 +94,6 @@ class ComicsKingdom(ParserScraper): cls('JungleJimSundays', 'jungle-jim-sundays'), cls('KatzenjammerKids', 'katzenjammer-kids'), cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'), - cls('KatzenjammerKidsSundays', 'katzenjammer-kids-sundays'), cls('KevinAndKell', 'kevin-and-kell'), cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'), cls('KirkWalters', 'kirk-walters'), @@ -96,44 +101,42 @@ class ComicsKingdom(ParserScraper): cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'), cls('LeeJudge', 'lee-judge'), cls('LegalizationNation', 'legalization-nation'), - cls('LegendOfBill', 'Legend-of-Bill'), + cls('LegendOfBill', 'legend-of-bill'), cls('LittleIodineSundays', 'little-iodine-sundays'), cls('LittleKing', 'the-little-king'), - cls('Lockhorns', 
'lockhorns'), - cls('Macanudo', 'Macanudo'), + cls('Macanudo', 'macanudo'), cls('MacanudoSpanish', 'macanudo-spanish', lang='es'), cls('MallardFillmore', 'mallard-fillmore'), - cls('MandrakeTheMagician', 'mandrake-the-magician-1'), + cls('MandrakeTheMagician', 'mandrake-the-magician'), cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'), - cls('MandrakeTheMagicianSundays', 'mandrake-the-magician-sundays'), + cls('MaraLlaveKeeperOfTime', 'mara-llave-keeper-of-time'), cls('MarkTrail', 'mark-trail'), cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'), - cls('MarkTrailVintage', 'Mark-Trail-Vintage'), cls('Marvin', 'marvin'), cls('MarvinSpanish', 'marvin-spanish', lang='es'), cls('MaryWorth', 'mary-worth'), cls('MaryWorthSpanish', 'maria-de-oro', lang='es'), - cls('MikePeters', 'mike-peters'), + cls('Mazetoons', 'mazetoons'), cls('MikeShelton', 'mike-shelton'), cls('MikeSmith', 'mike-smith'), cls('MooseAndMolly', 'moose-and-molly'), cls('MooseAndMollySpanish', 'quintin', lang='es'), - cls('MotherGooseAndGrimm', 'mother-goose-grimm'), cls('MrAbernathySpanish', 'don-abundio', lang='es'), cls('Mutts', 'mutts'), cls('MuttsSpanish', 'motas', lang='es'), + cls('NeverBeenDeader', 'never-been-deader'), cls('OfficeHours', 'office-hours'), + cls('OliveAndPopeye', 'olive-popeye'), cls('OnTheFastrack', 'on-the-fastrack'), cls('PajamaDiaries', 'pajama-diaries'), cls('PardonMyPlanet', 'pardon-my-planet'), cls('Phantom', 'phantom'), cls('PhantomSpanish', 'el-fantasma', lang='es'), - cls('PhantomSundays', 'phantom-sundays'), + cls('PlanetSyndicate', 'the_planet_syndicate'), cls('Popeye', 'popeye'), cls('PopeyesCartoonClub', 'popeyes-cartoon-club'), cls('PopeyeSpanish', 'popeye-spanish', lang='es'), cls('PrinceValiant', 'prince-valiant'), - cls('PrinceValiantSundays', 'prince-valiant-sundays'), cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'), cls('ProsAndCons', 'pros-cons'), cls('Quincy', 'quincy'), @@ -143,7 +146,9 @@ class 
ComicsKingdom(ParserScraper): cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'), cls('RhymesWithOrange', 'rhymes-with-orange'), cls('RipKirby', 'rip-kirby'), + # Rosebuds has a duplicate in GoComics/Rosebuds cls('SafeHavens', 'safe-havens'), + cls('SagaOfBrannBjornson', 'the-saga-of-brann-bjornson'), cls('Sales', 'sales'), cls('SallyForth', 'sally-forth'), cls('SamAndSilo', 'sam-and-silo'), @@ -151,17 +156,18 @@ class ComicsKingdom(ParserScraper): cls('SecretAgentX9', 'secret-agent-x-9'), # Shoe has a duplicate in GoComics/Shoe cls('SixChix', 'six-chix'), - cls('SlylockFoxAndComicsForKids', 'slylock-fox-and-comics-for-kids'), - cls('SlylockFoxAndComicsForKidsSpanish', 'solo-para-ninos', lang='es'), + cls('SlylockFox', 'slylock-fox-and-comics-for-kids'), + cls('SlylockFoxSpanish', 'solo-para-ninos', lang='es'), + cls('SuburbanFairyTales', 'suburban-fairy-tales'), cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'), cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'), cls('ThimbleTheater', 'thimble-theater'), cls('Tiger', 'tiger'), cls('TigerSpanish', 'tigrillo', lang='es'), - cls('TigerVintage', 'tiger-1'), - cls('TigerVintageSundays', 'tiger-sundays'), cls('TinasGroove', 'tina-s-groove'), cls('ToddTheDinosaur', 'todd-the-dinosaur'), + cls('WillyBlack', 'willy-black'), + cls('WillyBlacksSpanish', 'willy-black-spanish', lang='es'), cls('ZippyThePinhead', 'zippy-the-pinhead'), cls('Zits', 'zits'), cls('ZitsSpanish', 'jeremias', lang='es'), diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 018873864..abd282522 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -1588,6 +1588,30 @@ class Removed(Scraper): cls('Angels2200', 'del'), cls('BlackRose', 'brk'), cls('CatenaManor/CatenaCafe'), + cls('ComicsKingdom/AmazingSpiderman'), + cls('ComicsKingdom/AmazingSpidermanSpanish'), + cls('ComicsKingdom/BigBenBoltSundays'), + cls('ComicsKingdom/BonersArkSundays'), + cls('ComicsKingdom/BrianDuffy'), 
+ cls('ComicsKingdom/Crankshaft'), + cls('ComicsKingdom/FlashGordonSundays'), + cls('ComicsKingdom/FunkyWinkerbean'), + cls('ComicsKingdom/FunkyWinkerbeanSunday'), + cls('ComicsKingdom/FunkyWinkerbeanSundays'), + cls('ComicsKingdom/FunkyWinkerbeanVintage'), + cls('ComicsKingdom/HeartOfJulietJonesSundays'), + cls('ComicsKingdom/KatzenjammerKidsSundays'), + cls('ComicsKingdom/Lockhorns'), + cls('ComicsKingdom/MandrakeTheMagicianSundays'), + cls('ComicsKingdom/MarkTrailVintage'), + cls('ComicsKingdom/MikePeters'), + cls('ComicsKingdom/MotherGooseAndGrimm'), + cls('ComicsKingdom/PhantomSundays'), + cls('ComicsKingdom/PrinceValiantSundays'), + cls('ComicsKingdom/Retail'), + cls('ComicsKingdom/TigerSundays'), + cls('ComicsKingdom/TigerVintage'), + cls('ComicsKingdom/TigerVintageSundays'), cls('Everblue', 'block'), cls('FalseStart'), cls('Ginpu'), @@ -1698,10 +1722,8 @@ class Renamed(Scraper): # Renamed in 3.0 cls('AHClub', 'RickGriffinStudios/AHClub'), cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'), - cls('ComicsKingdom/FunkyWinkerbeanSundays', 'ComicsKingdom/FunkyWinkerbeanSunday'), cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'), cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'), - cls('ComicsKingdom/TigerSundays', 'ComicsKingdom/TigerVintageSundays'), cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'), cls('GoComics/Cathy', 'GoComics/CathyClassics'), cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'), @@ -1724,6 +1746,8 @@ class Renamed(Scraper): cls('TracesOfThePast/NSFW', 'RickGriffinStudios/TracesOfThePastNSFW'), # Renamed in 3.1 + cls('ComicsKingdom/SlylockFoxAndComicsForKids', 'ComicsKingdom/SlylockFox'), + cls('ComicsKingdom/SlylockFoxAndComicsForKidsSpanish', 'ComicsKingdom/SlylockFoxSpanish'), cls('Exiern', 'ComicFury/Exiern'), cls('MaxOveracts', 'OccasionalComicsDisorder'), cls('SafelyEndangered', 'WebToons/SafelyEndangered'), diff --git a/scripts/comicskingdom.py b/scripts/comicskingdom.py index 
b792bd675..c5ee04c85 100755 --- a/scripts/comicskingdom.py +++ b/scripts/comicskingdom.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019 Thomas W. Littauer +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer """ Script to get a list of comicskingdom.com comics and save the info in a JSON file for further processing. @@ -19,39 +19,17 @@ class ComicsKingdomUpdater(ComicListUpdater): "ComicGenesis/%s", ) - def handle_startpage(self, page): - """Parse list of comics from the bottom of the start page.""" - for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS): - link = li.xpath('./a')[0] + def handle_listing(self, page): + for link in page.xpath('//ul[d:class("index")]//a', namespaces=NS): + name = link.text_content().removeprefix('The ') url = link.attrib['href'] - name = link.text.removeprefix('The ') + lang = 'es' if ' (Spanish)' in name else None - self.add_comic(name, (url, None)) - - def handle_listing(self, page, lang: str = None, add: str = ''): - - hasnew = True - while hasnew: - hasnew = False - for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS): - nametag = comicdiv.xpath('./a/comic-name') - if len(nametag) == 0: - continue - name = nametag[0].text.removeprefix('The ') + add - url = comicdiv.xpath('./a')[0].attrib['href'] - - if self.add_comic(name, (url, lang)): - hasnew = True - - nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]') - page = self.get_url(nextlink[0].attrib['href']) + self.add_comic(name, (url, lang)) def collect_results(self): """Parse all search result pages.""" - page = self.get_url('https://www.comicskingdom.com/') - self.handle_startpage(page) - self.handle_listing(page) - self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish') + self.handle_listing(self.get_url('https://comicskingdom.com/features')) def 
get_entry(self, name: str, data: tuple[str, str]): opt = f", lang='{data[1]}'" if data[1] else '' From cfe5738151f92c7df2e23dc72f312905388f4988 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 17 Mar 2024 19:21:41 +0100 Subject: [PATCH 21/41] Parametrize renamed comic module test --- tests/test_comicnames.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/test_comicnames.py b/tests/test_comicnames.py index 8d3b11e5e..a9d69c4a5 100644 --- a/tests/test_comicnames.py +++ b/tests/test_comicnames.py @@ -3,12 +3,15 @@ # Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2015-2022 Tobias Gruetzmacher import re +from operator import attrgetter + +import pytest from dosagelib.scraper import scrapers from dosagelib.plugins import old -class TestComicNames(object): +class TestComicNames: def test_names(self): for scraperobj in scrapers.all(): @@ -20,11 +23,11 @@ class TestComicNames(object): comicname = name assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname - def test_renamed(self): - for scraperobj in scrapers.all(include_removed=True): - if not isinstance(scraperobj, old.Renamed): - continue - assert len(scraperobj.getDisabledReasons()) > 0 - # Renamed scraper should only point to an non-disabled scraper - newscraper = scrapers.find(scraperobj.newname) - assert len(newscraper.getDisabledReasons()) == 0 + @pytest.mark.parametrize(('scraperobj'), + [obj for obj in scrapers.all(include_removed=True) + if isinstance(obj, old.Renamed)], ids=attrgetter('name')) + def test_renamed(self, scraperobj): + assert len(scraperobj.getDisabledReasons()) > 0 + # Renamed scraper should only point to an non-disabled scraper + newscraper = scrapers.find(scraperobj.newname) + assert len(newscraper.getDisabledReasons()) == 0 From 2e912bcd2c9e994de23e83bfbc8b151b290dbd56 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 17 Mar 2024 19:33:38 +0100 Subject: [PATCH 22/41] Remove useless page skip in GirlGenius 
(fixes #306) --- dosagelib/plugins/g.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py index 605f96e9c..7a59ed1f9 100644 --- a/dosagelib/plugins/g.py +++ b/dosagelib/plugins/g.py @@ -64,10 +64,6 @@ class GirlGenius(ParserScraper): multipleImagesPerStrip = True help = 'Index format: yyyymmdd' - def shouldSkipUrl(self, url, data): - """Skip pages without images.""" - return not data.xpath('//div[@id="comicbody"]//img[contains(@src, "comic")]') - class GirlsWithSlingshots(ComicControlScraper): url = 'https://girlswithslingshots.com/' From 23125c74d4be98d4aa756f557e736198c242f84e Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 17 Mar 2024 21:44:46 +0100 Subject: [PATCH 23/41] Unify XPath NS config over modules --- dosagelib/plugins/a.py | 12 ++++++------ dosagelib/plugins/c.py | 10 +++++----- dosagelib/plugins/comicfury.py | 14 +++++++------- dosagelib/plugins/d.py | 15 +++++---------- dosagelib/plugins/derideal.py | 6 +++--- dosagelib/plugins/e.py | 4 ++-- dosagelib/plugins/f.py | 4 ++-- dosagelib/plugins/gocomics.py | 2 +- dosagelib/plugins/kemonocafe.py | 6 +++--- dosagelib/plugins/l.py | 2 +- dosagelib/plugins/m.py | 3 +-- dosagelib/plugins/p.py | 2 +- dosagelib/plugins/r.py | 14 +++++++------- dosagelib/plugins/s.py | 8 ++++---- dosagelib/plugins/shivaestudios.py | 2 +- dosagelib/plugins/tapas.py | 5 ++--- dosagelib/plugins/u.py | 2 +- dosagelib/plugins/v.py | 14 +++++++------- dosagelib/plugins/w.py | 10 +++++----- dosagelib/plugins/webtoons.py | 4 ++-- dosagelib/plugins/wrongside.py | 14 +++++++------- dosagelib/scraper.py | 14 ++++++++------ 22 files changed, 81 insertions(+), 86 deletions(-) diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index 2a4ef5b70..51492ce4d 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -228,7 +228,7 @@ class Amya(WordPressScraper): url = 'http://www.amyachronicles.com/' -class Angband(_ParserScraper): +class Angband(ParserScraper): 
url = 'http://angband.calamarain.net/' stripUrl = url + '%s' imageSearch = '//img' @@ -237,7 +237,7 @@ class Angband(_ParserScraper): def starter(self): page = self.getPage(self.url) - self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href') + self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href') self.firstStripUrl = self.pages[0] return self.pages[-1] @@ -267,7 +267,7 @@ class Annyseed(_ParserScraper): return tourl -class AntiheroForHire(_ParserScraper): +class AntiheroForHire(ParserScraper): stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s' firstStripUrl = stripUrl % '2016/6/8/entrance-vigil' url = firstStripUrl @@ -278,7 +278,7 @@ class AntiheroForHire(_ParserScraper): def starter(self): # Build list of chapters for navigation page = self.getPage(self.url) - self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href') + self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href') return self.chapters[0] def getPrevUrl(self, url, data): @@ -314,7 +314,7 @@ class ArtificialIncident(WordPressWebcomic): firstStripUrl = stripUrl % 'issue-one-life-changing' -class AstronomyPOTD(_ParserScraper): +class AstronomyPOTD(ParserScraper): baseUrl = 'http://apod.nasa.gov/apod/' url = baseUrl + 'astropix.html' starter = bounceStarter @@ -328,7 +328,7 @@ class AstronomyPOTD(_ParserScraper): def shouldSkipUrl(self, url, data): """Skip pages without images.""" - return data.xpath('//iframe') # videos + return self.match(data, '//iframe') # videos def namer(self, image_url, page_url): return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:], diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py index 27f7278d2..78b17399a 100644 --- a/dosagelib/plugins/c.py +++ b/dosagelib/plugins/c.py @@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper): help = 'Index format: yyyy/mm/dd/nnn-stripname' -class CarryOn(_ParserScraper): +class 
CarryOn(ParserScraper): url = 'http://www.hirezfox.com/km/co/' stripUrl = url + 'd/%s.html' firstStripUrl = stripUrl % '20040701' - imageSearch = '//div[@class="strip"]/img' + imageSearch = '//div[d:class("strip")]/img' prevSearch = '//a[text()="Previous Day"]' multipleImagesPerStrip = True @@ -122,13 +122,13 @@ class CatAndGirl(_ParserScraper): prevSearch = '//a[d:class("pager--prev")]' -class CatenaManor(_ParserScraper): +class CatenaManor(ParserScraper): baseUrl = ('https://web.archive.org/web/20141027141116/' 'http://catenamanor.com/') url = baseUrl + 'archives' stripUrl = baseUrl + '%s/' firstStripUrl = stripUrl % '2003/07' - imageSearch = '//img[@class="comicthumbnail"]' + imageSearch = '//img[d:class("comicthumbnail")]' multipleImagesPerStrip = True endOfLife = True strips: List[str] = [] @@ -136,7 +136,7 @@ class CatenaManor(_ParserScraper): def starter(self): # Retrieve archive links and select valid range archivePage = self.getPage(self.url) - archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a') + archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a') valid = False for link in archiveStrips: if self.stripUrl % '2012/01' in link.get('href'): diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py index 0a7a9c108..f5962db33 100644 --- a/dosagelib/plugins/comicfury.py +++ b/dosagelib/plugins/comicfury.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring import os from ..scraper import ParserScraper @@ -79,7 +79,7 @@ class ComicFury(ParserScraper): num = parts[-1] if self.multipleImagesPerStrip: page 
= self.getPage(pageUrl) - images = page.xpath('//img[@class="comicsegmentimage"]/@src') + images = self.match(page, '//img[d:class("comicsegmentimage")]/@src') if len(images) > 1: imageIndex = images.index(imageUrl) + 1 return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext) @@ -88,8 +88,8 @@ class ComicFury(ParserScraper): def shouldSkipUrl(self, url, data): """Skip pages without images.""" # Videos on Underverse - return (data.xpath('//div[@id="comicimagewrap"]//video') and - not data.xpath('//div[@id="comicimagewrap"]//img')) + return (self.match(data, '//div[@id="comicimagewrap"]//video') and + not self.match(data, '//div[@id="comicimagewrap"]//img')) @classmethod def getmodules(cls): # noqa: CFQ001 diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index f7a2e1933..3bfa0cf7c 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper, ParserScraper @@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper): help = 'Index format: n' -class DresdenCodak(_ParserScraper): +class DresdenCodak(ParserScraper): url = 'http://dresdencodak.com/' - startUrl = url + 'cat/comic/' firstStripUrl = url + '2007/02/08/pom/' imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]' prevSearch = '//a[img[contains(@src, "prev")]]' latestSearch = '//a[d:class("tc-grid-bg-link")]' starter = indirectStarter - # Blog and comic are mixed... 
- def shouldSkipUrl(self, url, data): - return not data.xpath(self.imageSearch) - class DrFun(_ParserScraper): baseUrl = ('https://web.archive.org/web/20180726145737/' diff --git a/dosagelib/plugins/derideal.py b/dosagelib/plugins/derideal.py index 7b8d2e298..ca75a2e73 100644 --- a/dosagelib/plugins/derideal.py +++ b/dosagelib/plugins/derideal.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper from ..helpers import indirectStarter @@ -27,7 +27,7 @@ class Derideal(ParserScraper): def starter(self): indexPage = self.getPage(self.url) - self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href') + self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href') self.currentChapter = len(self.chapters) return indirectStarter(self) diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index 5329256d6..857776ec0 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -113,7 +113,7 @@ class Erfworld(ParserScraper): def shouldSkipUrl(self, url, data): """Skip pages without images.""" - return not data.xpath(self.imageSearch) + return not self.match(data, self.imageSearch) def namer(self, imageUrl, pageUrl): # Fix inconsistent filenames @@ -232,7 +232,7 @@ class ExtraFabulousComics(WordPressScraper): return '_'.join((pagepart, imagename)) def shouldSkipUrl(self, url, data): - return data.xpath('//div[@id="comic"]//iframe') + return self.match(data, '//div[@id="comic"]//iframe') class ExtraLife(_BasicScraper): diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index 01c43da33..360b6ba39 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -140,7 +140,7 @@ class FoxDad(ParserScraper): def namer(self, imageUrl, pageUrl): page = self.getPage(pageUrl) - post = 
page.xpath('//li[@class="timestamp"]/a/@href')[0] + post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0] post = post.replace('https://foxdad.com/post/', '') if '-consider-support' in post: post = post.split('-consider-support')[0] @@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper): def namer(self, imageUrl, pageUrl): page = self.getPage(pageUrl) - strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '') + strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '') return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1] diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py index 1faee4bdd..140c112b9 100644 --- a/dosagelib/plugins/gocomics.py +++ b/dosagelib/plugins/gocomics.py @@ -31,7 +31,7 @@ class GoComics(ParserScraper): def shouldSkipUrl(self, url, data): """Skip pages without images.""" - return data.xpath('//img[contains(@src, "content-error-missing")]') + return self.match(data, '//img[contains(@src, "content-error-missing")]') @classmethod def getmodules(cls): # noqa: CFQ001 diff --git a/dosagelib/plugins/kemonocafe.py b/dosagelib/plugins/kemonocafe.py index 788ab1eaf..22692d85e 100644 --- a/dosagelib/plugins/kemonocafe.py +++ b/dosagelib/plugins/kemonocafe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper @@ -44,7 +44,7 @@ class KemonoCafe(ParserScraper): # Fix unordered filenames if 'addictivescience' in pageUrl: page = self.getPage(pageUrl) - num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')) + num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')) filename = '%04d_%s' % (num, filename) elif 'CaughtInOrbit' in filename: filename = filename.replace('CaughtInOrbit', 
'CIO') diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index e04f168f0..f2a536c19 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -38,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulato def getPrevUrl(self, url, data): # Fix broken navigation links - if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00': + if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00': return self.stripUrl % 'summer21' return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data) diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py index d69531e36..b54370f1b 100644 --- a/dosagelib/plugins/m.py +++ b/dosagelib/plugins/m.py @@ -9,7 +9,6 @@ from re import compile, IGNORECASE from ..helpers import indirectStarter from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..util import tagre -from ..xml import NS from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic @@ -153,7 +152,7 @@ class MonkeyUser(ParserScraper): def shouldSkipUrl(self, url, data): # videos - return data.xpath('//div[d:class("video-container")]', namespaces=NS) + return self.match(data, '//div[d:class("video-container")]') class MonsieurLeChien(ParserScraper): diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index cc5319aa4..0a2cf0037 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -166,7 +166,7 @@ class PHDComics(ParserScraper): # video self.stripUrl % '1880', self.stripUrl % '1669', - ) or data.xpath('//img[@id="comic" and contains(@src, "phd083123s")]') + ) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]') class Picklewhistle(ComicControlScraper): diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py index 5a10455cc..b20714d3a 100644 --- a/dosagelib/plugins/r.py +++ b/dosagelib/plugins/r.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# 
Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2021 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile from urllib.parse import urljoin @@ -121,7 +121,7 @@ class Requiem(WordPressScraper): firstStripUrl = stripUrl % '2004-06-07-3' -class Replay(_ParserScraper): +class Replay(ParserScraper): url = 'http://replaycomic.com/' stripUrl = url + 'comic/%s/' firstStripUrl = stripUrl % 'red-desert' @@ -132,11 +132,11 @@ class Replay(_ParserScraper): def starter(self): # Retrieve archive page to identify chapters archivePage = self.getPage(self.url + 'archive') - archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]') + archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]') self.chapter = len(archive) - 1 self.startOfChapter = [] for archiveChapter in archive: - self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href')) + self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href')) return bounceStarter(self) def namer(self, imageUrl, pageUrl): diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index fb115b943..d14cbb546 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper): def extract_image_urls(self, url, data): # Website requires JS, so build the list of image URLs manually imageurls = [] - current = int(data.xpath('//input[@name="pagnum"]')[0].get('value')) + current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value')) for page in reversed(range(1, current + 1)): imageurls.append(self.url + 'comics/' + str(page) + '.jpg') return imageurls @@ -636,16 +636,16 @@ class 
StrongFemaleProtagonist(_ParserScraper): ) -class StupidFox(_ParserScraper): +class StupidFox(ParserScraper): url = 'http://stupidfox.net/' stripUrl = url + '%s' firstStripUrl = stripUrl % 'hello' - imageSearch = '//div[@class="comicmid"]//img' + imageSearch = '//div[d:class("comicmid")]//img' prevSearch = '//a[@accesskey="p"]' def namer(self, imageUrl, pageUrl): page = self.getPage(pageUrl) - title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-') + title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-') return title + '.' + imageUrl.rsplit('.', 1)[-1] diff --git a/dosagelib/plugins/shivaestudios.py b/dosagelib/plugins/shivaestudios.py index 2f508cabe..6bedc28a7 100644 --- a/dosagelib/plugins/shivaestudios.py +++ b/dosagelib/plugins/shivaestudios.py @@ -19,7 +19,7 @@ class AlienDice(WordPressSpliced): def shouldSkipUrl(self, url, data): """Skip pages without images.""" - return not data.xpath(self.imageSearch) + return not self.match(data, self.imageSearch) def getPrevUrl(self, url, data): # Fix broken navigation diff --git a/dosagelib/plugins/tapas.py b/dosagelib/plugins/tapas.py index f3c6088fb..68b1ee9ac 100644 --- a/dosagelib/plugins/tapas.py +++ b/dosagelib/plugins/tapas.py @@ -3,7 +3,6 @@ # SPDX-FileCopyrightText: © 2019 Daniel Ring from ..output import out from ..scraper import ParserScraper -from ..xml import NS class Tapas(ParserScraper): @@ -21,7 +20,7 @@ class Tapas(ParserScraper): def starter(self): # Retrieve comic metadata from info page info = self.getPage(self.url) - series = info.xpath('//@data-series-id')[0] + series = self.match(info, '//@data-series-id')[0] # Retrieve comic metadata from API data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST') data.raise_for_status() @@ -43,7 +42,7 @@ class Tapas(ParserScraper): return self._cached_image_urls def shouldSkipUrl(self, url, data): - if data.xpath('//button[d:class("js-have-to-sign")]', 
namespaces=NS): + if self.match(data, '//button[d:class("js-have-to-sign")]'): out.warn(f'Nothing to download on "{url}", because a login is required.') return True return False diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index 8254a1dbd..e9e2300a0 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -107,7 +107,7 @@ class Unsounded(ParserScraper): return urls def extract_css_bg(self, page) -> str | None: - comicdivs = page.xpath('//div[@id="comic"]') + comicdivs = self.match(page, '//div[@id="comic"]') if comicdivs: style = comicdivs[0].attrib.get('style') if style: diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py index 33e26b317..04b6a2a02 100644 --- a/dosagelib/plugins/v.py +++ b/dosagelib/plugins/v.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2020 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper, _ParserScraper from ..helpers import bounceStarter, indirectStarter @@ -44,15 +44,15 @@ class Vibe(ParserScraper): help = 'Index format: VIBEnnn (padded)' -class VickiFox(_ParserScraper): +class VickiFox(ParserScraper): url = 'http://www.vickifox.com/comic/strip' stripUrl = url + '?id=%s' firstStripUrl = stripUrl % '001' imageSearch = '//img[contains(@src, "comic/")]' prevSearch = '//button[@id="btnPrev"]/@value' - def getPrevUrl(self, url, data): - return self.stripUrl % self.getPage(url).xpath(self.prevSearch)[0] + def link_modifier(self, fromurl, tourl): + return self.stripUrl % tourl class ViiviJaWagner(_ParserScraper): diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py index 0af93415b..11543ce0d 100644 --- 
a/dosagelib/plugins/w.py +++ b/dosagelib/plugins/w.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape, IGNORECASE from ..scraper import ParserScraper, _BasicScraper, _ParserScraper @@ -17,7 +17,7 @@ class WapsiSquare(WordPressNaviIn): def shouldSkipUrl(self, url, data): """Skip pages without images.""" - return data.xpath('//iframe') # videos + return self.match(data, '//iframe') # videos class WastedTalent(_ParserScraper): diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 393f9d809..46fa5b9e3 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -24,9 +24,9 @@ class WebToons(ParserScraper): self.session.cookies.set(cookie, 'false', domain='webtoons.com') # Find current episode number listPage = self.getPage(self.listUrl) - currentEpisode = listPage.xpath('//div[@class="detail_lst"]/ul/li')[0].attrib['data-episode-no'] + currentEpisode = self.match(listPage, '//div[d:class("detail_lst")]/ul/li')[0].attrib['data-episode-no'] # Check for completed tag - self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != []) + self.endOfLife = not self.match(listPage, '//div[@id="_asideDetail"]//span[d:class("txt_ico_completed2")]') return self.stripUrl % currentEpisode def extract_image_urls(self, url, data): diff --git a/dosagelib/plugins/wrongside.py b/dosagelib/plugins/wrongside.py index 78bc4a080..ce75d38bf 100644 --- a/dosagelib/plugins/wrongside.py +++ b/dosagelib/plugins/wrongside.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: 
MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2022 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper from ..helpers import indirectStarter @@ -15,21 +15,21 @@ class Wrongside(ParserScraper): def starter(self): archivePage = self.getPage(self.url) - chapterUrls = archivePage.xpath('//ul[@class="albThumbs"]//a/@href') + chapterUrls = self.match(archivePage, '//ul[d:class("albThumbs")]//a/@href') self.archive = [] for chapterUrl in chapterUrls: chapterPage = self.getPage(chapterUrl) - self.archive.append(chapterPage.xpath('(//ul[@id="thumbnails"]//a/@href)[last()]')[0]) + self.archive.append(self.match(chapterPage, '(//ul[@id="thumbnails"]//a/@href)[last()]')[0]) return self.archive[0] def getPrevUrl(self, url, data): - if data.xpath(self.prevSearch) == [] and len(self.archive) > 0: + if self.match(data, self.prevSearch) == [] and len(self.archive) > 0: return self.archive.pop() return super(Wrongside, self).getPrevUrl(url, data) def namer(self, imageUrl, pageUrl): page = self.getPage(pageUrl) - title = page.xpath('//div[@class="browsePath"]/h2/text()')[0] + title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0] return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1] @@ -71,5 +71,5 @@ class WrongsideSideStories(ParserScraper): def namer(self, imageUrl, pageUrl): page = self.getPage(pageUrl) - title = page.xpath('//div[@class="browsePath"]/h2/text()')[0] + title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0] return title.replace('"', '') + '.' 
+ imageUrl.rsplit('.', 1)[-1] diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index e9928c391..b0f436744 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -521,15 +521,10 @@ class ParserScraper(Scraper): return text.strip() def _matchPattern(self, data, patterns): - if self.css: - searchFun = data.cssselect - else: - def searchFun(s): - return data.xpath(s, namespaces=NS) patterns = makeSequence(patterns) for search in patterns: matched = False - for match in searchFun(search): + for match in self.match(data, search): matched = True yield match, search @@ -537,6 +532,13 @@ class ParserScraper(Scraper): # do not search other links if one pattern matched break + def match(self, data, pattern): + """Match a pattern (XPath/CSS) against a page.""" + if self.css: + return data.cssselect(pattern) + else: + return data.xpath(pattern, namespaces=NS) + def getDisabledReasons(self): res = {} if self.css and cssselect is None: From 38f4dd0ed178cacd77bdd906f2dd81965db32677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1l=20Tam=C3=A1s?= Date: Mon, 1 Apr 2024 23:12:09 +0200 Subject: [PATCH 24/41] Added Alloy Comics to the WebToons modules (#310) Added Alloy Comics --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 46fa5b9e3..30f13c919 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -65,6 +65,7 @@ class WebToons(ParserScraper): cls('AGoodDayToBeADog', 'romance/a-good-day-tobe-a-dog', 1390), cls('Aisopos', 'drama/aisopos', 76), cls('AliceElise', 'fantasy/alice-elise', 1481), + cls('AlloyComics', 'canvas/alloy-comics', 747447), cls('AllThatWeHopeToBe', 'slice-of-life/all-that-we-hope-to-be', 470), cls('AllThatYouAre', 'drama/all-that-you-are', 403), cls('AlwaysHuman', 'romance/always-human', 557), From f63b899bb49bb72a30c83df6b899271c0b2e3755 Mon Sep 17 00:00:00 2001 From: mindcombatant 
<9550058+mindcombatant@users.noreply.github.com> Date: Mon, 1 Apr 2024 22:02:53 +0000 Subject: [PATCH 25/41] fix VGCats (#311) --- dosagelib/plugins/v.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py index 04b6a2a02..5c931f2af 100644 --- a/dosagelib/plugins/v.py +++ b/dosagelib/plugins/v.py @@ -27,7 +27,7 @@ class VGCats(_ParserScraper): url = 'https://www.vgcats.com/comics/' stripUrl = url + '?strip_id=%s' firstStripUrl = stripUrl % '0' - imageSearch = '//td/img[contains(@src, "images/")]' + imageSearch = '//td/font/img[contains(@src, "images/")]' prevSearch = '//a[img[contains(@src, "back.")]]' help = 'Index format: n (unpadded)' From 0bfcd323856ae85f9e99b3111b756d44cb27e880 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:58:08 +0200 Subject: [PATCH 26/41] Bump peaceiris/actions-gh-pages from 3 to 4 (#312) Bumps [peaceiris/actions-gh-pages](https://github.com/peaceiris/actions-gh-pages) from 3 to 4. - [Release notes](https://github.com/peaceiris/actions-gh-pages/releases) - [Changelog](https://github.com/peaceiris/actions-gh-pages/blob/main/CHANGELOG.md) - [Commits](https://github.com/peaceiris/actions-gh-pages/compare/v3...v4) --- updated-dependencies: - dependency-name: peaceiris/actions-gh-pages dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/pages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 14e3a7ce3..6128d2b2c 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -30,7 +30,7 @@ jobs: ssite build --output public - name: Deploy - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@v4 with: cname: dosage.rocks github_token: ${{ secrets.GITHUB_TOKEN }} From dce299903bfe2d384d0b6caf610cd0a3583ed635 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 12 May 2024 13:44:53 +0200 Subject: [PATCH 27/41] Restrict ComicsKingdom module to current comic This prevents the scraper from randomly switching to a different comic (fixes #314) --- dosagelib/plugins/comicskingdom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 08413fb4e..82a1aea40 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -11,7 +11,6 @@ class ComicsKingdom(ParserScraper): partDiv = '//div[d:class("comic-reader-item")]' imageSearch = partDiv + '[1]//a[contains(@href, "/custom-framed-print/")]' prevSearch = partDiv + '[2]/@data-link' - latestSearch = '//a[re:test(@href, "/[0-9-]+$")]' starter = indirectStarter help = 'Index format: yyyy-mm-dd' @@ -19,6 +18,7 @@ class ComicsKingdom(ParserScraper): super().__init__('ComicsKingdom/' + name) self.url = 'https://comicskingdom.com/' + path self.stripUrl = self.url + '/%s' + self.latestSearch = f'//a[re:test(@href, "/{path}/[0-9-]+$")]' if lang: self.lang = lang From 7d7166af6e4e97d91abb86d748425cbc92fe3607 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 May 2024 00:16:18 +0200 Subject: [PATCH 28/41] Switch pages deployment from branch to actions --- .github/workflows/pages.yml 
| 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 6128d2b2c..4a9c29eb4 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -5,12 +5,19 @@ on: push: branches: - master + workflow_dispatch: permissions: - contents: write + contents: read + pages: write + id-token: write + +concurrency: + group: "pages" + cancel-in-progress: false jobs: - deploy: + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -28,10 +35,24 @@ jobs: pip install wheel pip install git+https://github.com/spanezz/staticsite.git@v2.3 ssite build --output public + cd public + rm -rf Jenkinsfile dosagelib scripts tests - - name: Deploy - uses: peaceiris/actions-gh-pages@v4 + - name: Setup Pages + id: pages + uses: actions/configure-pages@v5 + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 with: - cname: dosage.rocks - github_token: ${{ secrets.GITHUB_TOKEN }} - exclude_assets: 'Jenkinsfile,dosagelib,scripts,setup.*,tests,*.ini' + path: public + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 From f4f45945d04b9b74b0a5e7850001e77f57da8cd5 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 May 2024 22:04:46 +0200 Subject: [PATCH 29/41] Add "How to be a Mind Reaver" to Webtoons module (fixes #321) --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 30f13c919..1cd594998 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -214,6 +214,7 @@ class WebToons(ParserScraper): cls('Hooky', 'fantasy/hooky', 425), cls('HoovesOfDeath', 'fantasy/hooves-of-death', 1535), cls('HouseOfStars', 'fantasy/house-of-stars', 1620), + 
cls('HowToBeAMindReaver', 'canvas/how-to-be-a-mind-reaver', 301213), cls('HowToBecomeADragon', 'fantasy/how-to-become-a-dragon', 1973), cls('HowToLove', 'slice-of-life/how-to-love', 472), cls('IDontWantThisKindOfHero', 'super-hero/i-dont-want-this-kind-of-hero', 98), From f87526738c679ac50e9f99701ebb380b78504732 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 May 2024 22:09:54 +0200 Subject: [PATCH 30/41] Add "Mage and Mimic" to Webtoons module (fixes #320) --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 1cd594998..ec61087e5 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -268,6 +268,7 @@ class WebToons(ParserScraper): cls('LUMINE', 'fantasy/lumine', 1022), cls('Lunarbaboon', 'slice-of-life/lunarbaboon', 523), cls('MageAndDemonQueen', 'comedy/mage-and-demon-queen', 1438), + cls('MageAndMimic', 'comedy/mage-and-mimic', 5973), cls('Magical12thGraders', 'super-hero/magical-12th-graders', 90), cls('Magician', 'fantasy/magician', 70), cls('MagicSodaPop', 'fantasy/magic-soda-pop', 1947), From 50a656bb6f26a1b9b0dc8da4efc030658f5b594f Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 May 2024 22:22:48 +0200 Subject: [PATCH 31/41] Fix Sheldon (half-fixes #318) --- dosagelib/plugins/s.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index d14cbb546..4ff001ac3 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -196,7 +196,7 @@ class Sharksplode(WordPressScraper): class Sheldon(ParserScraper): url = 'https://www.sheldoncomics.com/' firstStripUrl = url + 'comic/well-who-is-this/' - imageSearch = '//div[@id="comic"]//img' + imageSearch = '//div[@id="comic"]//img/@data-src-img' prevSearch = '//a[img[d:class("left")]]' From 6e138a022841a23dfb0a013389d0983fb1aa6daf Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 
May 2024 22:47:33 +0200 Subject: [PATCH 32/41] Fix Drive (fixes #318) --- dosagelib/plugins/d.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index 3bfa0cf7c..4b632cac9 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -350,14 +350,12 @@ class DrFun(_ParserScraper): help = 'Index format: nnnnn' -class Drive(_BasicScraper): +class Drive(ParserScraper): url = 'http://www.drivecomic.com/' - rurl = escape(url) - stripUrl = url + 'archive/%s.html' - firstStripUrl = stripUrl % '090815' - imageSearch = compile(tagre("img", "src", r'(http://cdn\.drivecomic\.com/strips/main/[^"]+)')) - prevSearch = compile(tagre("a", "href", r'(%sarchive/\d+\.html)' % rurl) + "Previous") - help = 'Index format: yymmdd' + firstStripUrl = url + 'comic/act-1-pg-001/' + imageSearch = ('//div[@id="unspliced-comic"]//img/@data-src-img', + '//div[@id="unspliced-comic"]//picture//img') + prevSearch = '//a[d:class("previous-comic")]' class DrMcNinja(_ParserScraper): From 76d5180b4951258d7836788b31da2b69adf4fd67 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 14 May 2024 22:57:50 +0200 Subject: [PATCH 33/41] Fix EvilInc (fixes #317) --- dosagelib/plugins/e.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index 857776ec0..1de61de7c 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -181,9 +181,10 @@ class EvilDiva(WordPressScraper): endOfLife = True -class EvilInc(_ParserScraper): +class EvilInc(ParserScraper): url = 'https://www.evil-inc.com/' - imageSearch = '//div[@id="unspliced-comic"]/img/@data-src' + imageSearch = ('//div[@id="unspliced-comic"]/img', + '//div[@id="unspliced-comic"]/picture//img') prevSearch = '//a[./i[d:class("fa-chevron-left")]]' firstStripUrl = url + 'comic/monday-3/' From 284efdc212bc1526a8e523d9bb4933b808a9464d Mon Sep 17 00:00:00 2001 From: Dr Yann Golanski Date: Sun, 26 
May 2024 11:03:57 +0100 Subject: [PATCH 34/41] feat(webtoons): added dungeon & doodles, Tales From The Table (#322) [Dungeons & Doodles: Tales From the Tables](https://www.webtoons.com/en/canvas/dungeons-doodles-tales-from-the-tables/list?title_no=682646). --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index ec61087e5..8d4d0f990 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -157,6 +157,7 @@ class WebToons(ParserScraper): cls('DrFrost', 'drama/dr-frost', 371), cls('DuelIdentity', 'challenge/duel-identity', 532064), cls('DungeonCleaningLife', 'action/the-dungeon-cleaning-life-of-a-once-genius-hunter', 4677), + cls('DungeonsAndDoodlesTalesFromTheTables', 'canvas/dungeons-doodles-tales-from-the-tables', 682646), cls('DungeonMinis', 'challenge/dungeonminis', 64132), cls('Dustinteractive', 'comedy/dustinteractive', 907), cls('DutyAfterSchool', 'sf/duty-after-school', 370), From 5391b8518febf8cf37ad3c7d23206a6a922569df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 26 May 2024 12:05:46 +0200 Subject: [PATCH 35/41] Bump paambaati/codeclimate-action from 5.0.0 to 6.0.0 (#313) Bumps [paambaati/codeclimate-action](https://github.com/paambaati/codeclimate-action) from 5.0.0 to 6.0.0. - [Release notes](https://github.com/paambaati/codeclimate-action/releases) - [Changelog](https://github.com/paambaati/codeclimate-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/paambaati/codeclimate-action/compare/v5.0.0...v6.0.0) --- updated-dependencies: - dependency-name: paambaati/codeclimate-action dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 15a247d60..3cf88c3f0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,7 +32,7 @@ jobs: if: ${{ matrix.python-version != env.DEFAULT_PYTHON }} - name: Test with tox (and upload coverage) - uses: paambaati/codeclimate-action@v5.0.0 + uses: paambaati/codeclimate-action@v6.0.0 if: ${{ matrix.python-version == env.DEFAULT_PYTHON }} env: CC_TEST_REPORTER_ID: 2a411f596959fc32f5d73f3ba7cef8cc4d5733299d742dbfc97fd6c190b9010c From 2b7ca3f30ccf2b3b4d1c9c41cc0b3363ec2fdf15 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sun, 26 May 2024 15:26:54 +0200 Subject: [PATCH 36/41] ComicsKingdom: Use more reliable source for full-size image (fixes #323) --- dosagelib/plugins/comicskingdom.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 82a1aea40..372cf8933 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -1,15 +1,13 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Thomas W. 
Littauer -from urllib.parse import parse_qs, urlsplit - from ..helpers import indirectStarter from ..scraper import ParserScraper class ComicsKingdom(ParserScraper): partDiv = '//div[d:class("comic-reader-item")]' - imageSearch = partDiv + '[1]//a[contains(@href, "/custom-framed-print/")]' + imageSearch = '//meta[@property="og:image"]/@content' prevSearch = partDiv + '[2]/@data-link' starter = indirectStarter help = 'Index format: yyyy-mm-dd' @@ -22,10 +20,6 @@ class ComicsKingdom(ParserScraper): if lang: self.lang = lang - def imageUrlModifier(self, url, data): - """Extract high-quality image URL from link""" - return parse_qs(urlsplit(url).query)['img'][0] - def link_modifier(self, fromurl, tourl): return tourl.replace('//wp.', '//', 1) From 8afe7d1cdcf889db3fa058a11ff76defc9937352 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 13 Jun 2024 23:10:41 +0200 Subject: [PATCH 37/41] Modernize importlib.metadata usage in PyInstaller spec --- scripts/dosage.spec | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scripts/dosage.spec b/scripts/dosage.spec index eb9883c25..68d00026a 100644 --- a/scripts/dosage.spec +++ b/scripts/dosage.spec @@ -1,28 +1,30 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2017-2020 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2017 Tobias Gruetzmacher + +import re +from importlib import metadata # Idea from # https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Setuptools-Entry-Point, # but with importlib -def Entrypoint(group, name, **kwargs): - import re - try: - from importlib.metadata import entry_points - except ImportError: - from importlib_metadata import entry_points - +def entrypoint(group, name, **kwargs): # get the entry point - eps = entry_points()[group] - ep = next(ep for ep in eps if ep.name == name) - module, attr = re.split(r'\s*:\s*', ep.value, 1) + eps = metadata.entry_points() + if 'select' in dir(eps): + # modern + ep = eps.select(group=group)[name] + 
else: + # legacy (pre-3.10) + ep = next(ep for ep in eps[group] if ep.name == name) + module, attr = re.split(r'\s*:\s*', ep.value, maxsplit=1) # script name must not be a valid module name to avoid name clashes on import script_path = os.path.join(workpath, name + '-script.py') print("creating script for entry point", group, name) - with open(script_path, 'w') as fh: + with open(script_path, mode='w', encoding='utf-8') as fh: print("import sys", file=fh) print("import", module, file=fh) - print("sys.exit(%s.%s())" % (module, attr), file=fh) + print(f"sys.exit({module}.{attr}())", file=fh) return Analysis( [script_path] + kwargs.get('scripts', []), @@ -30,7 +32,7 @@ def Entrypoint(group, name, **kwargs): ) -a = Entrypoint('console_scripts', 'dosage') +a = entrypoint('console_scripts', 'dosage') a.binaries = [x for x in a.binaries if not x[1].lower().startswith(r'c:\windows')] From 6024b2a01b5746b0eabb227d7a4c14ccb9e8ee13 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 13 Jun 2024 23:18:36 +0200 Subject: [PATCH 38/41] Build modern Windows EXE with Python 3.12 --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 880f9fa62..1d60b1969 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -74,7 +74,7 @@ pys.each { py -> parallel(tasks) parallel modern: { stage('Modern Windows binary') { - windowsBuild('3.11', 'dosage.exe') + windowsBuild('3.12', 'dosage.exe') } }, legacy: { From b32e67fce868f83ae6e5a39fc91766d2c8aa99ed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 06:03:41 +0000 Subject: [PATCH 39/41] Bump paambaati/codeclimate-action from 6.0.0 to 8.0.0 (#326) --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3cf88c3f0..837f3e0df 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,7 +32,7 @@ jobs: 
if: ${{ matrix.python-version != env.DEFAULT_PYTHON }} - name: Test with tox (and upload coverage) - uses: paambaati/codeclimate-action@v6.0.0 + uses: paambaati/codeclimate-action@v8.0.0 if: ${{ matrix.python-version == env.DEFAULT_PYTHON }} env: CC_TEST_REPORTER_ID: 2a411f596959fc32f5d73f3ba7cef8cc4d5733299d742dbfc97fd6c190b9010c From 8984e9a2b59550ec6eb3ac10a9ed771bec269ee1 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 20 Jun 2024 21:48:21 +0200 Subject: [PATCH 40/41] Add "Vampire Family" to Webtoons module --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index 8d4d0f990..a917b285e 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -450,6 +450,7 @@ class WebToons(ParserScraper): cls('UpAndOut', 'slice-of-life/up-and-out', 488), cls('UrbanAnimal', 'super-hero/urban-animal', 1483), cls('Uriah', 'horror/uriah', 1607), + cls('VampireFamily', 'comedy/vampire-family', 6402), cls('VarsityNoir', 'mystery/varsity-noir', 1613), cls('VersionDayAndNight', 'drama/version-day-and-night', 1796), cls('WafflesAndPancakes', 'slice-of-life/waffles-and-pancakes', 1310), From df1e72e90a681446db35b363e1110835516d1ed2 Mon Sep 17 00:00:00 2001 From: Dr Yann Golanski Date: Sun, 30 Jun 2024 17:28:08 +0100 Subject: [PATCH 41/41] feat(webtoon): added hench comics (#328) https://www.webtoons.com/en/canvas/hench/list?title_no=857225 --- dosagelib/plugins/webtoons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index a917b285e..39051ffce 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -210,6 +210,7 @@ class WebToons(ParserScraper): cls('HeliosFemina', 'fantasy/helios-femina', 638), cls('HelloWorld', 'slice-of-life/hello-world', 827), cls('Hellper', 'fantasy/hellper', 185), + cls('Hench', 'canvas/hench/', 857225), cls('HeroineChic', 
'super-hero/heroine-chic', 561), cls('HIVE', 'thriller/hive', 65), cls('Hooky', 'fantasy/hooky', 425),