From 4f932803a34f67bd7a9d5ea0fa381d2f6741bf64 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sat, 10 Jun 2023 15:05:57 +0200 Subject: [PATCH] Extend scraper API with an extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong; they are now all fixed. --- dosagelib/plugins/e.py | 23 ++++++++--------- dosagelib/plugins/f.py | 12 ++++----- dosagelib/plugins/l.py | 16 ++++++------ dosagelib/plugins/m.py | 14 +++++------ dosagelib/plugins/mangadex.py | 24 +++++++++--------- dosagelib/plugins/o.py | 47 ++++++++++++----------------------- dosagelib/plugins/s.py | 24 +++++++++--------- dosagelib/plugins/tapas.py | 14 +++++------ dosagelib/plugins/u.py | 27 ++++++++++---------- dosagelib/plugins/webtoons.py | 12 ++++----- dosagelib/scraper.py | 14 ++++++++--- 11 files changed, 107 insertions(+), 120 deletions(-) diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index 44341f711..d423528dd 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring import os from re import compile, IGNORECASE from ..helpers import bounceStarter, indirectStarter -from ..scraper import _BasicScraper, _ParserScraper +from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..util import tagre from .common import ComicControlScraper, WordPressScraper, WordPressNavi @@ -99,7 +99,7 @@ class
EmergencyExit(_BasicScraper): help = 'Index format: n' -class Erfworld(_ParserScraper): +class Erfworld(ParserScraper): stripUrl = 'https://archives.erfworld.com/%s' url = stripUrl % 'getLatestPage.php' firstStripUrl = stripUrl % 'Kickstarter+Stories/1' @@ -111,12 +111,9 @@ class Erfworld(_ParserScraper): textOptional = True starter = bounceStarter - def fetchUrls(self, url, data, urlSearch): - # Return the main logo for text-only pages - try: - return super().fetchUrls(url, data, urlSearch) - except ValueError: - return super().fetchUrls(url, data, '//li[@class="erf-logo"]//img') + def shouldSkipUrl(self, url, data): + """Skip pages without images.""" + return not data.xpath(self.imageSearch) def namer(self, imageUrl, pageUrl): # Fix inconsistent filenames @@ -138,7 +135,7 @@ class Erfworld(_ParserScraper): return self.stripUrl % 'Book+0/81' elif url == self.stripUrl % 'Book+0/1': return self.stripUrl % 'Kickstarter+Stories/54' - return super(Erfworld, self).getPrevUrl(url, data) + return super().getPrevUrl(url, data) class ErmaFelnaEDF(_ParserScraper): diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index 2bef57265..2a8d04572 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2021 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape from ..util import tagre @@ -240,7 +240,7 @@ class FunInJammies(WordPressScraper): help = 'Index format: n (unpadded)' -class FurPiled(_ParserScraper): +class FurPiled(ParserScraper): stripUrl = ('https://web.archive.org/web/20160404074145/' 
'http://www.liondogworks.com/images/fp-%03d.jpg') url = stripUrl % 427 @@ -254,7 +254,7 @@ class FurPiled(_ParserScraper): nextStrip = nextStrip - 1 return self.stripUrl % nextStrip - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # URLs are direct links to images return [url] diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py index f28bccc93..d75126782 100644 --- a/dosagelib/plugins/l.py +++ b/dosagelib/plugins/l.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile from ..scraper import ParserScraper, _BasicScraper, _ParserScraper @@ -86,7 +86,7 @@ class LifeAintNoPonyFarm(WordPressScraper): endOfLife = True -class LifeAsRendered(_ParserScraper): +class LifeAsRendered(ParserScraper): # Reverse navigation doesn't work properly, so search forward instead stripUrl = 'https://kittyredden.com/LAR/%s/' url = stripUrl % '0100' @@ -121,11 +121,11 @@ class LifeAsRendered(_ParserScraper): filename = imageUrl.rsplit('/', 1)[-1] return filename.replace('ReN', 'N').replace('N01P', 'A02S') - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # Fix missing image link - if 'LAR/0403' in url and urlSearch == self.imageSearch: + if 'LAR/0403' in url: return [self.stripUrl.rstrip('/') % 'A04/A04P03.png'] - return super(LifeAsRendered, self).fetchUrls(url, data, urlSearch) + return super().extract_image_urls(url, data) def getPrevUrl(self, url, data): # Fix broken navigation links diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py index baf4d7f6e..8dac3469b 
100644 --- a/dosagelib/plugins/m.py +++ b/dosagelib/plugins/m.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring import json from re import compile, escape, IGNORECASE from ..helpers import indirectStarter -from ..scraper import _BasicScraper, _ParserScraper +from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..util import tagre from ..xml import NS from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic @@ -233,7 +233,7 @@ class MyCartoons(_BasicScraper): lang = 'de' -class MyLifeWithFel(_ParserScraper): +class MyLifeWithFel(ParserScraper): baseUrl = 'https://www.mylifewithfel.com/' stripUrl = baseUrl + 'api/posts/%s' firstStripUrl = stripUrl % '1' @@ -249,7 +249,7 @@ class MyLifeWithFel(_ParserScraper): def getPrevUrl(self, url, data): return self.stripUrl % json.loads(data.text_content())['previous']['id'] - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): return [self.baseUrl + json.loads(data.text_content())['post']['image']] def namer(self, imageUrl, pageUrl): diff --git a/dosagelib/plugins/mangadex.py b/dosagelib/plugins/mangadex.py index 1bd6e9ea9..91c64a25d 100644 --- a/dosagelib/plugins/mangadex.py +++ b/dosagelib/plugins/mangadex.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring import json from ..scraper import ParserScraper @@ -71,24 +71,24 @@ class MangaDex(ParserScraper): 
return None return self.stripUrl % self.chapters[self.chapters.index(chapter[0]) - 1]['id'] - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # Retrieve chapter metadata from API - chapterData = json.loads(data.text_content()) - self.chapter = chapterData['data'] - cdnData = self.session.get(self.cdnUrl % self.chapter['id']) - cdnData.raise_for_status() - cdnBlock = cdnData.json() + chapters = json.loads(data.text_content()) + self.chapter = chapters['data'] + cdnresponse = self.session.get(self.cdnUrl % self.chapter['id']) + cdnresponse.raise_for_status() + cdnblock = cdnresponse.json() # Save link order for position-based filenames - imageUrl = self.imageUrl % cdnBlock['chapter']['hash'] - self.imageUrls = [imageUrl % page for page in cdnBlock['chapter']['data']] - return self.imageUrls + urltemplate = self.imageUrl % cdnblock['chapter']['hash'] + self._cached_image_urls = [urltemplate % page for page in cdnblock['chapter']['data']] + return self._cached_image_urls def namer(self, imageUrl, pageUrl): # Construct filename from episode number and page index in array chapter = self.chapter['attributes']['chapter'] chapterNum = chapter if chapter is not None else 0 - pageNum = self.imageUrls.index(imageUrl) + pageNum = self._cached_image_urls.index(imageUrl) pageExt = imageUrl.rsplit('.')[-1] return '%s-%02d.%s' % (chapterNum, pageNum, pageExt) diff --git a/dosagelib/plugins/o.py b/dosagelib/plugins/o.py index 954d5d69b..5706d2ba2 100644 --- a/dosagelib/plugins/o.py +++ b/dosagelib/plugins/o.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2020 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# 
SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape from ..helpers import bounceStarter, indirectStarter -from ..scraper import _BasicScraper, _ParserScraper +from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..util import tagre from .common import WordPressScraper, WordPressNavi @@ -21,7 +21,7 @@ class OctopusPie(_ParserScraper): help = 'Index format: yyyy-mm-dd/nnn-strip-name' -class OffWhite(_ParserScraper): +class OffWhite(ParserScraper): baseUrl = 'https://web.archive.org/web/20200627222318/http://off-white.eu/' stripUrl = baseUrl + 'comic/%s/' firstStripUrl = stripUrl % 'prologue-page-1-2' @@ -32,18 +32,6 @@ class OffWhite(_ParserScraper): starter = indirectStarter endOfLife = True - def fetchUrls(self, url, data, urlSearch): - # Fix missing page - if url == self.stripUrl % 'page-37': - return [self.baseUrl + 'ow_v2/wp-content/uploads/2011/01/new-037.jpg'] - return super(OffWhite, self).fetchUrls(url, data, urlSearch) - - def getPrevUrl(self, url, data): - # Fix missing page - if url == self.stripUrl % 'page-37': - return self.stripUrl % 'page-36' - return super(OffWhite, self).getPrevUrl(url, data) - class Oglaf(_ParserScraper): url = 'http://oglaf.com/' @@ -55,19 +43,16 @@ class Oglaf(_ParserScraper): multipleImagesPerStrip = True adult = True - def fetchUrls(self, url, data, search): - urls = [] - urls.extend(super(Oglaf, self).fetchUrls(url, data, search)) - if search == self.imageSearch: - try: - nexturls = self.fetchUrls(url, data, self.nextSearch) - except ValueError: - pass - else: - while nexturls and nexturls[0].startswith(url): - data = self.getPage(nexturls[0]) - urls.extend(super(Oglaf, self).fetchUrls(nexturls, data, search)) - nexturls = self.fetchUrls(url, data, self.nextSearch) + def extract_image_urls(self, url, data): + urls = super().extract_image_urls(url, data) + try: + nexturl = self.fetchUrls(url, data, self.nextSearch)[0] + while nexturl.startswith(url): + data = self.getPage(nexturl) + 
urls.extend(super().extract_image_urls(url, data)) + nexturl = self.fetchUrls(url, data, self.nextSearch)[0] + except ValueError: + pass return urls diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index 131873281..b1516b16e 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from re import compile, escape, IGNORECASE, sub from os.path import splitext @@ -436,20 +436,20 @@ class Sorcery101(WordPressWebcomic): help = 'Index format: stripname' -class SpaceFurries(_ParserScraper): - url = 'http://www.spacefurrs.org/' +class SpaceFurries(ParserScraper): + url = 'https://www.spacefurrs.org/' firstStripUrl = url multipleImagesPerStrip = True adult = True endOfLife = True - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # Website requires JS, so build the list of image URLs manually - imageUrls = [] - currentPage = int(data.xpath('//input[@name="pagnum"]')[0].get('value')) - for page in reversed(range(1, currentPage + 1)): - imageUrls.append(self.url + 'comics/' + str(page) + '.jpg') - return imageUrls + imageurls = [] + current = int(data.xpath('//input[@name="pagnum"]')[0].get('value')) + for page in reversed(range(1, current + 1)): + imageurls.append(self.url + 'comics/' + str(page) + '.jpg') + return imageurls class SpaceJunkArlia(_ParserScraper): diff --git a/dosagelib/plugins/tapas.py b/dosagelib/plugins/tapas.py index a1db5bdb0..f3c6088fb 100644 --- a/dosagelib/plugins/tapas.py +++ b/dosagelib/plugins/tapas.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2022 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..output import out from ..scraper import ParserScraper from ..xml import NS @@ -37,10 +37,10 @@ class Tapas(ParserScraper): self.firstStripUrl = self.stripUrl % apiData['prev_ep_id'] return self.stripUrl % apiData['prev_ep_id'] - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # Save link order for position-based filenames - self.imageUrls = super().fetchUrls(url, data, urlSearch) - return self.imageUrls + self._cached_image_urls = super().extract_image_urls(url, data) + return self._cached_image_urls def shouldSkipUrl(self, url, data): if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS): @@ -51,9 +51,9 @@ class Tapas(ParserScraper): def namer(self, imageUrl, pageUrl): # Construct filename from episode number and image position on page episodeNum = pageUrl.rsplit('/', 1)[-1] - imageNum = self.imageUrls.index(imageUrl) + imageNum = self._cached_image_urls.index(imageUrl) imageExt = pageUrl.rsplit('.', 1)[-1] - if len(self.imageUrls) > 1: + if len(self._cached_image_urls) > 1: filename = "%s-%d.%s" % (episodeNum, imageNum, imageExt) else: filename = "%s.%s" % (episodeNum, imageExt) diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index 12eeb0bbd..99e31d682 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2020 Tobias Gruetzmacher -# Copyright (C) 2019-2020 Daniel Ring +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring 
import json from re import compile from urllib.parse import urljoin @@ -29,7 +29,7 @@ class UberQuest(ParserScraper): def getPrevUrl(self, url, data): return self.stripUrl % json.loads(data.text_content())[0]['prev_id'] - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): return [json.loads(data.text_content())[0]['attachment']] def namer(self, imageUrl, pageUrl): @@ -80,7 +80,7 @@ class UnicornJelly(BasicScraper): class Unsounded(ParserScraper): - url = 'http://www.casualvillain.com/Unsounded/' + url = 'https://www.casualvillain.com/Unsounded/' startUrl = url + 'comic+index/' stripUrl = url + 'comic/ch%s/ch%s_%s.html' firstStripUrl = stripUrl % ('01', '01', '01') @@ -91,18 +91,17 @@ class Unsounded(ParserScraper): starter = indirectStarter help = 'Index format: chapter-page' - def fetchUrls(self, url, data, urlSearch): - imageUrls = super(Unsounded, self).fetchUrls(url, data, urlSearch) + def extract_image_urls(self, url, data): + imageUrls = super().extract_image_urls(url, data) # Include background for multi-image pages imageRegex = compile(r'background-image: url\((pageart/.*)\)') for match in imageRegex.finditer(str(etree.tostring(data))): - print(match) - searchUrls.append(normaliseURL(urljoin(data[1], match.group(1)))) + imageUrls.append(normaliseURL(urljoin(data[1], match.group(1)))) return imageUrls - def namer(self, imageUrl, pageUrl): - filename = imageUrl.rsplit('/', 1)[-1] - pagename = pageUrl.rsplit('/', 1)[-1] + def namer(self, image_url, page_url): + filename = image_url.rsplit('/', 1)[-1] + pagename = page_url.rsplit('/', 1)[-1] if pagename.split('.', 1)[0] != filename.split('.', 1)[0]: filename = pagename.split('_', 1)[0] + '_' + filename return filename @@ -111,7 +110,7 @@ class Unsounded(ParserScraper): # Fix missing navigation links between chapters if 'ch13/you_let_me_fall' in url: return self.stripUrl % ('13', '13', '85') - return super(Unsounded, self).getPrevUrl(url, data) + return super().getPrevUrl(url, 
data) def getIndexStripUrl(self, index): chapter, num = index.split('-') diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py index ecd3f47c2..d78c6229b 100644 --- a/dosagelib/plugins/webtoons.py +++ b/dosagelib/plugins/webtoons.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2019-2022 Tobias Gruetzmacher -# Copyright (C) 2019-2022 Daniel Ring +# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2019 Daniel Ring from ..scraper import ParserScraper @@ -29,18 +29,18 @@ class WebToons(ParserScraper): self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != []) return self.stripUrl % currentEpisode - def fetchUrls(self, url, data, urlSearch): + def extract_image_urls(self, url, data): # Save link order for position-based filenames - self.imageUrls = super().fetchUrls(url, data, urlSearch) + self._cached_image_urls = super().extract_image_urls(url, data) # Update firstStripUrl with the correct episode title if url.rsplit('=', 1)[-1] == '1': self.firstStripUrl = url - return self.imageUrls + return self._cached_image_urls def namer(self, imageUrl, pageUrl): # Construct filename from episode number and image position on page episodeNum = pageUrl.rsplit('=', 1)[-1] - imageNum = self.imageUrls.index(imageUrl) + imageNum = self._cached_image_urls.index(imageUrl) imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0] return "%s-%03d.%s" % (episodeNum, imageNum, imageExt) diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 7078f842b..5a411b9b4 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2022 Tobias Gruetzmacher +# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs +# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam +# SPDX-FileCopyrightText: 
© 2015 Tobias Gruetzmacher from __future__ import annotations import html @@ -137,7 +137,7 @@ class Scraper: def getComicStrip(self, url, data): """Get comic strip downloader for given URL and data.""" - imageUrls = self.fetchUrls(url, data, self.imageSearch) + imageUrls = self.extract_image_urls(url, data) # map modifier function on image URLs imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls] # remove duplicate URLs @@ -325,6 +325,12 @@ class Scraper: """ return get_page(url, self.session, allow_errors=self.allow_errors) + def extract_image_urls(self, url, data): + """ + Extract image URLs from page data using the class's imageSearch attribute. + """ + return self.fetchUrls(url, data, self.imageSearch) + def fetchUrls(self, url, data, urlsearch): raise ValueError("No implementation for fetchUrls!")