Extend scraper API with an extract_image_urls method

This is just a light wrapper around fetchUrls, but it frees comic modules
from second-guessing the purpose for which fetchUrls was called when they
override that API. (And yes, some comic modules already got this wrong;
they are all fixed now.)
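For illustration, a minimal sketch of the kind of override this enables (the SomeComic class and the hard-coded URL are invented for the example):

class SomeComic(ParserScraper):  # hypothetical comic module
    # Before: fetchUrls served image *and* navigation searches alike,
    # so an override had to guard on the search expression:
    #
    #     def fetchUrls(self, url, data, urlSearch):
    #         if urlSearch == self.imageSearch:
    #             return ['https://example.com/comics/fixed.png']
    #         return super().fetchUrls(url, data, urlSearch)
    #
    # After: an image-only hook, no guard needed:
    def extract_image_urls(self, url, data):
        return ['https://example.com/comics/fixed.png']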
Tobias Gruetzmacher 2023-06-10 15:05:57 +02:00
parent 355ef44b7e
commit 4f932803a3
11 changed files with 107 additions and 120 deletions


@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import os
 from re import compile, IGNORECASE
 
 from ..helpers import bounceStarter, indirectStarter
-from ..scraper import _BasicScraper, _ParserScraper
+from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
 from ..util import tagre
 from .common import ComicControlScraper, WordPressScraper, WordPressNavi
@@ -99,7 +99,7 @@ class EmergencyExit(_BasicScraper):
     help = 'Index format: n'
 
 
-class Erfworld(_ParserScraper):
+class Erfworld(ParserScraper):
     stripUrl = 'https://archives.erfworld.com/%s'
     url = stripUrl % 'getLatestPage.php'
     firstStripUrl = stripUrl % 'Kickstarter+Stories/1'
@@ -111,12 +111,9 @@ class Erfworld(_ParserScraper):
     textOptional = True
     starter = bounceStarter
 
-    def fetchUrls(self, url, data, urlSearch):
-        # Return the main logo for text-only pages
-        try:
-            return super().fetchUrls(url, data, urlSearch)
-        except ValueError:
-            return super().fetchUrls(url, data, '//li[@class="erf-logo"]//img')
+    def shouldSkipUrl(self, url, data):
+        """Skip pages without images."""
+        return not data.xpath(self.imageSearch)
 
     def namer(self, imageUrl, pageUrl):
         # Fix inconsistent filenames
@@ -138,7 +135,7 @@ class Erfworld(_ParserScraper):
             return self.stripUrl % 'Book+0/81'
         elif url == self.stripUrl % 'Book+0/1':
             return self.stripUrl % 'Kickstarter+Stories/54'
-        return super(Erfworld, self).getPrevUrl(url, data)
+        return super().getPrevUrl(url, data)
 
 
 class ErmaFelnaEDF(_ParserScraper):


@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2021 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape
 
 from ..util import tagre
@@ -240,7 +240,7 @@ class FunInJammies(WordPressScraper):
     help = 'Index format: n (unpadded)'
 
 
-class FurPiled(_ParserScraper):
+class FurPiled(ParserScraper):
     stripUrl = ('https://web.archive.org/web/20160404074145/'
                 'http://www.liondogworks.com/images/fp-%03d.jpg')
     url = stripUrl % 427
@@ -254,7 +254,7 @@ class FurPiled(_ParserScraper):
             nextStrip = nextStrip - 1
         return self.stripUrl % nextStrip
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # URLs are direct links to images
         return [url]


@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile
 
 from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
@@ -86,7 +86,7 @@ class LifeAintNoPonyFarm(WordPressScraper):
     endOfLife = True
 
 
-class LifeAsRendered(_ParserScraper):
+class LifeAsRendered(ParserScraper):
     # Reverse navigation doesn't work properly, so search forward instead
     stripUrl = 'https://kittyredden.com/LAR/%s/'
     url = stripUrl % '0100'
@@ -121,11 +121,11 @@ class LifeAsRendered(_ParserScraper):
         filename = imageUrl.rsplit('/', 1)[-1]
         return filename.replace('ReN', 'N').replace('N01P', 'A02S')
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # Fix missing image link
-        if 'LAR/0403' in url and urlSearch == self.imageSearch:
+        if 'LAR/0403' in url:
             return [self.stripUrl.rstrip('/') % 'A04/A04P03.png']
-        return super(LifeAsRendered, self).fetchUrls(url, data, urlSearch)
+        return super().extract_image_urls(url, data)
 
     def getPrevUrl(self, url, data):
         # Fix broken navigation links


@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import json
 from re import compile, escape, IGNORECASE
 
 from ..helpers import indirectStarter
-from ..scraper import _BasicScraper, _ParserScraper
+from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
 from ..util import tagre
 from ..xml import NS
 from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
@@ -233,7 +233,7 @@ class MyCartoons(_BasicScraper):
     lang = 'de'
 
 
-class MyLifeWithFel(_ParserScraper):
+class MyLifeWithFel(ParserScraper):
     baseUrl = 'https://www.mylifewithfel.com/'
     stripUrl = baseUrl + 'api/posts/%s'
     firstStripUrl = stripUrl % '1'
@@ -249,7 +249,7 @@ class MyLifeWithFel(_ParserScraper):
     def getPrevUrl(self, url, data):
         return self.stripUrl % json.loads(data.text_content())['previous']['id']
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         return [self.baseUrl + json.loads(data.text_content())['post']['image']]
 
     def namer(self, imageUrl, pageUrl):


@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import json
 
 from ..scraper import ParserScraper
@@ -71,24 +71,24 @@ class MangaDex(ParserScraper):
             return None
         return self.stripUrl % self.chapters[self.chapters.index(chapter[0]) - 1]['id']
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # Retrieve chapter metadata from API
-        chapterData = json.loads(data.text_content())
-        self.chapter = chapterData['data']
+        chapters = json.loads(data.text_content())
+        self.chapter = chapters['data']
 
-        cdnData = self.session.get(self.cdnUrl % self.chapter['id'])
-        cdnData.raise_for_status()
-        cdnBlock = cdnData.json()
+        cdnresponse = self.session.get(self.cdnUrl % self.chapter['id'])
+        cdnresponse.raise_for_status()
+        cdnblock = cdnresponse.json()
 
         # Save link order for position-based filenames
-        imageUrl = self.imageUrl % cdnBlock['chapter']['hash']
-        self.imageUrls = [imageUrl % page for page in cdnBlock['chapter']['data']]
-        return self.imageUrls
+        urltemplate = self.imageUrl % cdnblock['chapter']['hash']
+        self._cached_image_urls = [urltemplate % page for page in cdnblock['chapter']['data']]
+        return self._cached_image_urls
 
     def namer(self, imageUrl, pageUrl):
         # Construct filename from episode number and page index in array
         chapter = self.chapter['attributes']['chapter']
         chapterNum = chapter if chapter is not None else 0
-        pageNum = self.imageUrls.index(imageUrl)
+        pageNum = self._cached_image_urls.index(imageUrl)
         pageExt = imageUrl.rsplit('.')[-1]
         return '%s-%02d.%s' % (chapterNum, pageNum, pageExt)
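Aside: the _cached_image_urls rename above encodes a pattern shared with Tapas and WebToons below. namer() receives one image URL at a time, so the extraction order has to be cached to derive a positional index. A minimal sketch of the idea, detached from the MangaDex API (class name and placeholder list are invented):

class PositionNamedScraper:
    def extract_image_urls(self, url, data):
        # Remember the extraction order so namer() can recover each page's index.
        self._cached_image_urls = ['p1.png', 'p2.png']  # placeholder list
        return self._cached_image_urls

    def namer(self, image_url, page_url):
        page_num = self._cached_image_urls.index(image_url)
        return '%02d.%s' % (page_num, image_url.rsplit('.')[-1])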


@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape
 
 from ..helpers import bounceStarter, indirectStarter
-from ..scraper import _BasicScraper, _ParserScraper
+from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
 from ..util import tagre
 from .common import WordPressScraper, WordPressNavi
@@ -21,7 +21,7 @@ class OctopusPie(_ParserScraper):
     help = 'Index format: yyyy-mm-dd/nnn-strip-name'
 
 
-class OffWhite(_ParserScraper):
+class OffWhite(ParserScraper):
     baseUrl = 'https://web.archive.org/web/20200627222318/http://off-white.eu/'
     stripUrl = baseUrl + 'comic/%s/'
     firstStripUrl = stripUrl % 'prologue-page-1-2'
@@ -32,18 +32,6 @@ class OffWhite(_ParserScraper):
     starter = indirectStarter
     endOfLife = True
 
-    def fetchUrls(self, url, data, urlSearch):
-        # Fix missing page
-        if url == self.stripUrl % 'page-37':
-            return [self.baseUrl + 'ow_v2/wp-content/uploads/2011/01/new-037.jpg']
-        return super(OffWhite, self).fetchUrls(url, data, urlSearch)
-
-    def getPrevUrl(self, url, data):
-        # Fix missing page
-        if url == self.stripUrl % 'page-37':
-            return self.stripUrl % 'page-36'
-        return super(OffWhite, self).getPrevUrl(url, data)
-
 
 class Oglaf(_ParserScraper):
     url = 'http://oglaf.com/'
@@ -55,19 +43,16 @@ class Oglaf(_ParserScraper):
     multipleImagesPerStrip = True
     adult = True
 
-    def fetchUrls(self, url, data, search):
-        urls = []
-        urls.extend(super(Oglaf, self).fetchUrls(url, data, search))
-        if search == self.imageSearch:
-            try:
-                nexturls = self.fetchUrls(url, data, self.nextSearch)
-            except ValueError:
-                pass
-            else:
-                while nexturls and nexturls[0].startswith(url):
-                    data = self.getPage(nexturls[0])
-                    urls.extend(super(Oglaf, self).fetchUrls(nexturls, data, search))
-                    nexturls = self.fetchUrls(url, data, self.nextSearch)
+    def extract_image_urls(self, url, data):
+        urls = super().extract_image_urls(url, data)
+        try:
+            nexturl = self.fetchUrls(url, data, self.nextSearch)[0]
+            while nexturl.startswith(url):
+                data = self.getPage(nexturl)
+                urls.extend(super().extract_image_urls(url, data))
+                nexturl = self.fetchUrls(url, data, self.nextSearch)[0]
+        except ValueError:
+            pass
         return urls


@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape, IGNORECASE, sub
 from os.path import splitext
@@ -436,20 +436,20 @@ class Sorcery101(WordPressWebcomic):
     help = 'Index format: stripname'
 
 
-class SpaceFurries(_ParserScraper):
-    url = 'http://www.spacefurrs.org/'
+class SpaceFurries(ParserScraper):
+    url = 'https://www.spacefurrs.org/'
     firstStripUrl = url
     multipleImagesPerStrip = True
     adult = True
     endOfLife = True
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # Website requires JS, so build the list of image URLs manually
-        imageUrls = []
-        currentPage = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
-        for page in reversed(range(1, currentPage + 1)):
-            imageUrls.append(self.url + 'comics/' + str(page) + '.jpg')
-        return imageUrls
+        imageurls = []
+        current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
+        for page in reversed(range(1, current + 1)):
+            imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
+        return imageurls
 
 
 class SpaceJunkArlia(_ParserScraper):


@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2022 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..output import out
 from ..scraper import ParserScraper
 from ..xml import NS
@@ -37,10 +37,10 @@ class Tapas(ParserScraper):
             self.firstStripUrl = self.stripUrl % apiData['prev_ep_id']
         return self.stripUrl % apiData['prev_ep_id']
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # Save link order for position-based filenames
-        self.imageUrls = super().fetchUrls(url, data, urlSearch)
-        return self.imageUrls
+        self._cached_image_urls = super().extract_image_urls(url, data)
+        return self._cached_image_urls
 
     def shouldSkipUrl(self, url, data):
         if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
@@ -51,9 +51,9 @@ class Tapas(ParserScraper):
     def namer(self, imageUrl, pageUrl):
         # Construct filename from episode number and image position on page
         episodeNum = pageUrl.rsplit('/', 1)[-1]
-        imageNum = self.imageUrls.index(imageUrl)
+        imageNum = self._cached_image_urls.index(imageUrl)
         imageExt = pageUrl.rsplit('.', 1)[-1]
-        if len(self.imageUrls) > 1:
+        if len(self._cached_image_urls) > 1:
             filename = "%s-%d.%s" % (episodeNum, imageNum, imageExt)
         else:
             filename = "%s.%s" % (episodeNum, imageExt)


@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import json
 from re import compile
 from urllib.parse import urljoin
@@ -29,7 +29,7 @@ class UberQuest(ParserScraper):
     def getPrevUrl(self, url, data):
         return self.stripUrl % json.loads(data.text_content())[0]['prev_id']
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         return [json.loads(data.text_content())[0]['attachment']]
 
     def namer(self, imageUrl, pageUrl):
@@ -80,7 +80,7 @@ class UnicornJelly(BasicScraper):
 
 
 class Unsounded(ParserScraper):
-    url = 'http://www.casualvillain.com/Unsounded/'
+    url = 'https://www.casualvillain.com/Unsounded/'
     startUrl = url + 'comic+index/'
     stripUrl = url + 'comic/ch%s/ch%s_%s.html'
     firstStripUrl = stripUrl % ('01', '01', '01')
@@ -91,18 +91,17 @@ class Unsounded(ParserScraper):
     starter = indirectStarter
     help = 'Index format: chapter-page'
 
-    def fetchUrls(self, url, data, urlSearch):
-        imageUrls = super(Unsounded, self).fetchUrls(url, data, urlSearch)
+    def extract_image_urls(self, url, data):
+        imageUrls = super().extract_image_urls(url, data)
         # Include background for multi-image pages
         imageRegex = compile(r'background-image: url\((pageart/.*)\)')
         for match in imageRegex.finditer(str(etree.tostring(data))):
-            print(match)
-            searchUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
+            imageUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
         return imageUrls
 
-    def namer(self, imageUrl, pageUrl):
-        filename = imageUrl.rsplit('/', 1)[-1]
-        pagename = pageUrl.rsplit('/', 1)[-1]
+    def namer(self, image_url, page_url):
+        filename = image_url.rsplit('/', 1)[-1]
+        pagename = page_url.rsplit('/', 1)[-1]
         if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:
             filename = pagename.split('_', 1)[0] + '_' + filename
         return filename
@@ -111,7 +110,7 @@ class Unsounded(ParserScraper):
         # Fix missing navigation links between chapters
         if 'ch13/you_let_me_fall' in url:
             return self.stripUrl % ('13', '13', '85')
-        return super(Unsounded, self).getPrevUrl(url, data)
+        return super().getPrevUrl(url, data)
 
     def getIndexStripUrl(self, index):
         chapter, num = index.split('-')


@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2022 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 
 from ..scraper import ParserScraper
@@ -29,18 +29,18 @@ class WebToons(ParserScraper):
         self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
         return self.stripUrl % currentEpisode
 
-    def fetchUrls(self, url, data, urlSearch):
+    def extract_image_urls(self, url, data):
         # Save link order for position-based filenames
-        self.imageUrls = super().fetchUrls(url, data, urlSearch)
+        self._cached_image_urls = super().extract_image_urls(url, data)
         # Update firstStripUrl with the correct episode title
         if url.rsplit('=', 1)[-1] == '1':
             self.firstStripUrl = url
-        return self.imageUrls
+        return self._cached_image_urls
 
     def namer(self, imageUrl, pageUrl):
         # Construct filename from episode number and image position on page
         episodeNum = pageUrl.rsplit('=', 1)[-1]
-        imageNum = self.imageUrls.index(imageUrl)
+        imageNum = self._cached_image_urls.index(imageUrl)
         imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0]
         return "%s-%03d.%s" % (episodeNum, imageNum, imageExt)


@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
 from __future__ import annotations
 
 import html
@@ -137,7 +137,7 @@ class Scraper:
 
     def getComicStrip(self, url, data):
         """Get comic strip downloader for given URL and data."""
-        imageUrls = self.fetchUrls(url, data, self.imageSearch)
+        imageUrls = self.extract_image_urls(url, data)
         # map modifier function on image URLs
         imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
         # remove duplicate URLs
@@ -325,6 +325,12 @@ class Scraper:
         """
         return get_page(url, self.session, allow_errors=self.allow_errors)
 
+    def extract_image_urls(self, url, data):
+        """
+        Extract image URLs from page data using the class's imageSearch attribute.
+        """
+        return self.fetchUrls(url, data, self.imageSearch)
+
     def fetchUrls(self, url, data, urlsearch):
         raise ValueError("No implementation for fetchUrls!")
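Net effect for modules, as a condensed sketch: the base-class default keeps the old behaviour by delegating to fetchUrls with imageSearch, and an override now customises image extraction without touching navigation lookups. SillyComic and its fallback are invented for illustration:

class SillyComic(ParserScraper):  # hypothetical comic module
    imageSearch = '//img[@id="comic"]/@src'

    def extract_image_urls(self, url, data):
        # Only image extraction is customised here; prevSearch/nextSearch
        # lookups still go through fetchUrls, unaffected by this override.
        try:
            return super().extract_image_urls(url, data)
        except ValueError:
            # Assumed fallback: treat the page URL as a direct image link.
            return [url]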