Extend scraper API with an extract_image_urls method
This is just a light wrapper around fetchUrls, but it frees comic modules from second-guessing the purpose for which fetchUrls was called when they override that API. And yes, some comic modules already got this wrong; they are now all fixed.
This commit is contained in:
parent
355ef44b7e
commit
4f932803a3
11 changed files with 107 additions and 120 deletions
|
@ -1,13 +1,13 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
import os
|
import os
|
||||||
from re import compile, IGNORECASE
|
from re import compile, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import bounceStarter, indirectStarter
|
from ..helpers import bounceStarter, indirectStarter
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
from .common import ComicControlScraper, WordPressScraper, WordPressNavi
|
from .common import ComicControlScraper, WordPressScraper, WordPressNavi
|
||||||
|
|
||||||
|
@ -99,7 +99,7 @@ class EmergencyExit(_BasicScraper):
|
||||||
help = 'Index format: n'
|
help = 'Index format: n'
|
||||||
|
|
||||||
|
|
||||||
class Erfworld(_ParserScraper):
|
class Erfworld(ParserScraper):
|
||||||
stripUrl = 'https://archives.erfworld.com/%s'
|
stripUrl = 'https://archives.erfworld.com/%s'
|
||||||
url = stripUrl % 'getLatestPage.php'
|
url = stripUrl % 'getLatestPage.php'
|
||||||
firstStripUrl = stripUrl % 'Kickstarter+Stories/1'
|
firstStripUrl = stripUrl % 'Kickstarter+Stories/1'
|
||||||
|
@ -111,12 +111,9 @@ class Erfworld(_ParserScraper):
|
||||||
textOptional = True
|
textOptional = True
|
||||||
starter = bounceStarter
|
starter = bounceStarter
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def shouldSkipUrl(self, url, data):
|
||||||
# Return the main logo for text-only pages
|
"""Skip pages without images."""
|
||||||
try:
|
return not data.xpath(self.imageSearch)
|
||||||
return super().fetchUrls(url, data, urlSearch)
|
|
||||||
except ValueError:
|
|
||||||
return super().fetchUrls(url, data, '//li[@class="erf-logo"]//img')
|
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
# Fix inconsistent filenames
|
# Fix inconsistent filenames
|
||||||
|
@ -138,7 +135,7 @@ class Erfworld(_ParserScraper):
|
||||||
return self.stripUrl % 'Book+0/81'
|
return self.stripUrl % 'Book+0/81'
|
||||||
elif url == self.stripUrl % 'Book+0/1':
|
elif url == self.stripUrl % 'Book+0/1':
|
||||||
return self.stripUrl % 'Kickstarter+Stories/54'
|
return self.stripUrl % 'Kickstarter+Stories/54'
|
||||||
return super(Erfworld, self).getPrevUrl(url, data)
|
return super().getPrevUrl(url, data)
|
||||||
|
|
||||||
|
|
||||||
class ErmaFelnaEDF(_ParserScraper):
|
class ErmaFelnaEDF(_ParserScraper):
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2021 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from re import compile, escape
|
from re import compile, escape
|
||||||
|
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
@ -240,7 +240,7 @@ class FunInJammies(WordPressScraper):
|
||||||
help = 'Index format: n (unpadded)'
|
help = 'Index format: n (unpadded)'
|
||||||
|
|
||||||
|
|
||||||
class FurPiled(_ParserScraper):
|
class FurPiled(ParserScraper):
|
||||||
stripUrl = ('https://web.archive.org/web/20160404074145/'
|
stripUrl = ('https://web.archive.org/web/20160404074145/'
|
||||||
'http://www.liondogworks.com/images/fp-%03d.jpg')
|
'http://www.liondogworks.com/images/fp-%03d.jpg')
|
||||||
url = stripUrl % 427
|
url = stripUrl % 427
|
||||||
|
@ -254,7 +254,7 @@ class FurPiled(_ParserScraper):
|
||||||
nextStrip = nextStrip - 1
|
nextStrip = nextStrip - 1
|
||||||
return self.stripUrl % nextStrip
|
return self.stripUrl % nextStrip
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# URLs are direct links to images
|
# URLs are direct links to images
|
||||||
return [url]
|
return [url]
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from re import compile
|
from re import compile
|
||||||
|
|
||||||
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
|
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
|
||||||
|
@ -86,7 +86,7 @@ class LifeAintNoPonyFarm(WordPressScraper):
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class LifeAsRendered(_ParserScraper):
|
class LifeAsRendered(ParserScraper):
|
||||||
# Reverse navigation doesn't work properly, so search forward instead
|
# Reverse navigation doesn't work properly, so search forward instead
|
||||||
stripUrl = 'https://kittyredden.com/LAR/%s/'
|
stripUrl = 'https://kittyredden.com/LAR/%s/'
|
||||||
url = stripUrl % '0100'
|
url = stripUrl % '0100'
|
||||||
|
@ -121,11 +121,11 @@ class LifeAsRendered(_ParserScraper):
|
||||||
filename = imageUrl.rsplit('/', 1)[-1]
|
filename = imageUrl.rsplit('/', 1)[-1]
|
||||||
return filename.replace('ReN', 'N').replace('N01P', 'A02S')
|
return filename.replace('ReN', 'N').replace('N01P', 'A02S')
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# Fix missing image link
|
# Fix missing image link
|
||||||
if 'LAR/0403' in url and urlSearch == self.imageSearch:
|
if 'LAR/0403' in url:
|
||||||
return [self.stripUrl.rstrip('/') % 'A04/A04P03.png']
|
return [self.stripUrl.rstrip('/') % 'A04/A04P03.png']
|
||||||
return super(LifeAsRendered, self).fetchUrls(url, data, urlSearch)
|
return super().extract_image_urls(url, data)
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
# Fix broken navigation links
|
# Fix broken navigation links
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
import json
|
import json
|
||||||
from re import compile, escape, IGNORECASE
|
from re import compile, escape, IGNORECASE
|
||||||
|
|
||||||
from ..helpers import indirectStarter
|
from ..helpers import indirectStarter
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
from ..xml import NS
|
from ..xml import NS
|
||||||
from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
|
from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
|
||||||
|
@ -233,7 +233,7 @@ class MyCartoons(_BasicScraper):
|
||||||
lang = 'de'
|
lang = 'de'
|
||||||
|
|
||||||
|
|
||||||
class MyLifeWithFel(_ParserScraper):
|
class MyLifeWithFel(ParserScraper):
|
||||||
baseUrl = 'https://www.mylifewithfel.com/'
|
baseUrl = 'https://www.mylifewithfel.com/'
|
||||||
stripUrl = baseUrl + 'api/posts/%s'
|
stripUrl = baseUrl + 'api/posts/%s'
|
||||||
firstStripUrl = stripUrl % '1'
|
firstStripUrl = stripUrl % '1'
|
||||||
|
@ -249,7 +249,7 @@ class MyLifeWithFel(_ParserScraper):
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
return self.stripUrl % json.loads(data.text_content())['previous']['id']
|
return self.stripUrl % json.loads(data.text_content())['previous']['id']
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
return [self.baseUrl + json.loads(data.text_content())['post']['image']]
|
return [self.baseUrl + json.loads(data.text_content())['post']['image']]
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2019-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from ..scraper import ParserScraper
|
from ..scraper import ParserScraper
|
||||||
|
@ -71,24 +71,24 @@ class MangaDex(ParserScraper):
|
||||||
return None
|
return None
|
||||||
return self.stripUrl % self.chapters[self.chapters.index(chapter[0]) - 1]['id']
|
return self.stripUrl % self.chapters[self.chapters.index(chapter[0]) - 1]['id']
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# Retrieve chapter metadata from API
|
# Retrieve chapter metadata from API
|
||||||
chapterData = json.loads(data.text_content())
|
chapters = json.loads(data.text_content())
|
||||||
self.chapter = chapterData['data']
|
self.chapter = chapters['data']
|
||||||
cdnData = self.session.get(self.cdnUrl % self.chapter['id'])
|
cdnresponse = self.session.get(self.cdnUrl % self.chapter['id'])
|
||||||
cdnData.raise_for_status()
|
cdnresponse.raise_for_status()
|
||||||
cdnBlock = cdnData.json()
|
cdnblock = cdnresponse.json()
|
||||||
|
|
||||||
# Save link order for position-based filenames
|
# Save link order for position-based filenames
|
||||||
imageUrl = self.imageUrl % cdnBlock['chapter']['hash']
|
urltemplate = self.imageUrl % cdnblock['chapter']['hash']
|
||||||
self.imageUrls = [imageUrl % page for page in cdnBlock['chapter']['data']]
|
self._cached_image_urls = [urltemplate % page for page in cdnblock['chapter']['data']]
|
||||||
return self.imageUrls
|
return self._cached_image_urls
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
# Construct filename from episode number and page index in array
|
# Construct filename from episode number and page index in array
|
||||||
chapter = self.chapter['attributes']['chapter']
|
chapter = self.chapter['attributes']['chapter']
|
||||||
chapterNum = chapter if chapter is not None else 0
|
chapterNum = chapter if chapter is not None else 0
|
||||||
pageNum = self.imageUrls.index(imageUrl)
|
pageNum = self._cached_image_urls.index(imageUrl)
|
||||||
pageExt = imageUrl.rsplit('.')[-1]
|
pageExt = imageUrl.rsplit('.')[-1]
|
||||||
return '%s-%02d.%s' % (chapterNum, pageNum, pageExt)
|
return '%s-%02d.%s' % (chapterNum, pageNum, pageExt)
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from re import compile, escape
|
from re import compile, escape
|
||||||
|
|
||||||
from ..helpers import bounceStarter, indirectStarter
|
from ..helpers import bounceStarter, indirectStarter
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
from .common import WordPressScraper, WordPressNavi
|
from .common import WordPressScraper, WordPressNavi
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ class OctopusPie(_ParserScraper):
|
||||||
help = 'Index format: yyyy-mm-dd/nnn-strip-name'
|
help = 'Index format: yyyy-mm-dd/nnn-strip-name'
|
||||||
|
|
||||||
|
|
||||||
class OffWhite(_ParserScraper):
|
class OffWhite(ParserScraper):
|
||||||
baseUrl = 'https://web.archive.org/web/20200627222318/http://off-white.eu/'
|
baseUrl = 'https://web.archive.org/web/20200627222318/http://off-white.eu/'
|
||||||
stripUrl = baseUrl + 'comic/%s/'
|
stripUrl = baseUrl + 'comic/%s/'
|
||||||
firstStripUrl = stripUrl % 'prologue-page-1-2'
|
firstStripUrl = stripUrl % 'prologue-page-1-2'
|
||||||
|
@ -32,18 +32,6 @@ class OffWhite(_ParserScraper):
|
||||||
starter = indirectStarter
|
starter = indirectStarter
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
|
||||||
# Fix missing page
|
|
||||||
if url == self.stripUrl % 'page-37':
|
|
||||||
return [self.baseUrl + 'ow_v2/wp-content/uploads/2011/01/new-037.jpg']
|
|
||||||
return super(OffWhite, self).fetchUrls(url, data, urlSearch)
|
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
|
||||||
# Fix missing page
|
|
||||||
if url == self.stripUrl % 'page-37':
|
|
||||||
return self.stripUrl % 'page-36'
|
|
||||||
return super(OffWhite, self).getPrevUrl(url, data)
|
|
||||||
|
|
||||||
|
|
||||||
class Oglaf(_ParserScraper):
|
class Oglaf(_ParserScraper):
|
||||||
url = 'http://oglaf.com/'
|
url = 'http://oglaf.com/'
|
||||||
|
@ -55,19 +43,16 @@ class Oglaf(_ParserScraper):
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
adult = True
|
adult = True
|
||||||
|
|
||||||
def fetchUrls(self, url, data, search):
|
def extract_image_urls(self, url, data):
|
||||||
urls = []
|
urls = super().extract_image_urls(url, data)
|
||||||
urls.extend(super(Oglaf, self).fetchUrls(url, data, search))
|
try:
|
||||||
if search == self.imageSearch:
|
nexturl = self.fetchUrls(url, data, self.nextSearch)[0]
|
||||||
try:
|
while nexturl.startswith(url):
|
||||||
nexturls = self.fetchUrls(url, data, self.nextSearch)
|
data = self.getPage(nexturl)
|
||||||
except ValueError:
|
urls.extend(super().extract_image_urls(url, data))
|
||||||
pass
|
nexturl = self.fetchUrls(url, data, self.nextSearch)[0]
|
||||||
else:
|
except ValueError:
|
||||||
while nexturls and nexturls[0].startswith(url):
|
pass
|
||||||
data = self.getPage(nexturls[0])
|
|
||||||
urls.extend(super(Oglaf, self).fetchUrls(nexturls, data, search))
|
|
||||||
nexturls = self.fetchUrls(url, data, self.nextSearch)
|
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from re import compile, escape, IGNORECASE, sub
|
from re import compile, escape, IGNORECASE, sub
|
||||||
from os.path import splitext
|
from os.path import splitext
|
||||||
|
|
||||||
|
@ -436,20 +436,20 @@ class Sorcery101(WordPressWebcomic):
|
||||||
help = 'Index format: stripname'
|
help = 'Index format: stripname'
|
||||||
|
|
||||||
|
|
||||||
class SpaceFurries(_ParserScraper):
|
class SpaceFurries(ParserScraper):
|
||||||
url = 'http://www.spacefurrs.org/'
|
url = 'https://www.spacefurrs.org/'
|
||||||
firstStripUrl = url
|
firstStripUrl = url
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
adult = True
|
adult = True
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# Website requires JS, so build the list of image URLs manually
|
# Website requires JS, so build the list of image URLs manually
|
||||||
imageUrls = []
|
imageurls = []
|
||||||
currentPage = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
|
current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
|
||||||
for page in reversed(range(1, currentPage + 1)):
|
for page in reversed(range(1, current + 1)):
|
||||||
imageUrls.append(self.url + 'comics/' + str(page) + '.jpg')
|
imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
|
||||||
return imageUrls
|
return imageurls
|
||||||
|
|
||||||
|
|
||||||
class SpaceJunkArlia(_ParserScraper):
|
class SpaceJunkArlia(_ParserScraper):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2019-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2022 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from ..output import out
|
from ..output import out
|
||||||
from ..scraper import ParserScraper
|
from ..scraper import ParserScraper
|
||||||
from ..xml import NS
|
from ..xml import NS
|
||||||
|
@ -37,10 +37,10 @@ class Tapas(ParserScraper):
|
||||||
self.firstStripUrl = self.stripUrl % apiData['prev_ep_id']
|
self.firstStripUrl = self.stripUrl % apiData['prev_ep_id']
|
||||||
return self.stripUrl % apiData['prev_ep_id']
|
return self.stripUrl % apiData['prev_ep_id']
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# Save link order for position-based filenames
|
# Save link order for position-based filenames
|
||||||
self.imageUrls = super().fetchUrls(url, data, urlSearch)
|
self._cached_image_urls = super().extract_image_urls(url, data)
|
||||||
return self.imageUrls
|
return self._cached_image_urls
|
||||||
|
|
||||||
def shouldSkipUrl(self, url, data):
|
def shouldSkipUrl(self, url, data):
|
||||||
if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
|
if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
|
||||||
|
@ -51,9 +51,9 @@ class Tapas(ParserScraper):
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
# Construct filename from episode number and image position on page
|
# Construct filename from episode number and image position on page
|
||||||
episodeNum = pageUrl.rsplit('/', 1)[-1]
|
episodeNum = pageUrl.rsplit('/', 1)[-1]
|
||||||
imageNum = self.imageUrls.index(imageUrl)
|
imageNum = self._cached_image_urls.index(imageUrl)
|
||||||
imageExt = pageUrl.rsplit('.', 1)[-1]
|
imageExt = pageUrl.rsplit('.', 1)[-1]
|
||||||
if len(self.imageUrls) > 1:
|
if len(self._cached_image_urls) > 1:
|
||||||
filename = "%s-%d.%s" % (episodeNum, imageNum, imageExt)
|
filename = "%s-%d.%s" % (episodeNum, imageNum, imageExt)
|
||||||
else:
|
else:
|
||||||
filename = "%s.%s" % (episodeNum, imageExt)
|
filename = "%s.%s" % (episodeNum, imageExt)
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2020 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
import json
|
import json
|
||||||
from re import compile
|
from re import compile
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
@ -29,7 +29,7 @@ class UberQuest(ParserScraper):
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
return self.stripUrl % json.loads(data.text_content())[0]['prev_id']
|
return self.stripUrl % json.loads(data.text_content())[0]['prev_id']
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
return [json.loads(data.text_content())[0]['attachment']]
|
return [json.loads(data.text_content())[0]['attachment']]
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
|
@ -80,7 +80,7 @@ class UnicornJelly(BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class Unsounded(ParserScraper):
|
class Unsounded(ParserScraper):
|
||||||
url = 'http://www.casualvillain.com/Unsounded/'
|
url = 'https://www.casualvillain.com/Unsounded/'
|
||||||
startUrl = url + 'comic+index/'
|
startUrl = url + 'comic+index/'
|
||||||
stripUrl = url + 'comic/ch%s/ch%s_%s.html'
|
stripUrl = url + 'comic/ch%s/ch%s_%s.html'
|
||||||
firstStripUrl = stripUrl % ('01', '01', '01')
|
firstStripUrl = stripUrl % ('01', '01', '01')
|
||||||
|
@ -91,18 +91,17 @@ class Unsounded(ParserScraper):
|
||||||
starter = indirectStarter
|
starter = indirectStarter
|
||||||
help = 'Index format: chapter-page'
|
help = 'Index format: chapter-page'
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
imageUrls = super(Unsounded, self).fetchUrls(url, data, urlSearch)
|
imageUrls = super().extract_image_urls(url, data)
|
||||||
# Include background for multi-image pages
|
# Include background for multi-image pages
|
||||||
imageRegex = compile(r'background-image: url\((pageart/.*)\)')
|
imageRegex = compile(r'background-image: url\((pageart/.*)\)')
|
||||||
for match in imageRegex.finditer(str(etree.tostring(data))):
|
for match in imageRegex.finditer(str(etree.tostring(data))):
|
||||||
print(match)
|
imageUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
|
||||||
searchUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
|
|
||||||
return imageUrls
|
return imageUrls
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, image_url, page_url):
|
||||||
filename = imageUrl.rsplit('/', 1)[-1]
|
filename = image_url.rsplit('/', 1)[-1]
|
||||||
pagename = pageUrl.rsplit('/', 1)[-1]
|
pagename = page_url.rsplit('/', 1)[-1]
|
||||||
if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:
|
if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:
|
||||||
filename = pagename.split('_', 1)[0] + '_' + filename
|
filename = pagename.split('_', 1)[0] + '_' + filename
|
||||||
return filename
|
return filename
|
||||||
|
@ -111,7 +110,7 @@ class Unsounded(ParserScraper):
|
||||||
# Fix missing navigation links between chapters
|
# Fix missing navigation links between chapters
|
||||||
if 'ch13/you_let_me_fall' in url:
|
if 'ch13/you_let_me_fall' in url:
|
||||||
return self.stripUrl % ('13', '13', '85')
|
return self.stripUrl % ('13', '13', '85')
|
||||||
return super(Unsounded, self).getPrevUrl(url, data)
|
return super().getPrevUrl(url, data)
|
||||||
|
|
||||||
def getIndexStripUrl(self, index):
|
def getIndexStripUrl(self, index):
|
||||||
chapter, num = index.split('-')
|
chapter, num = index.split('-')
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2019-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2019-2022 Daniel Ring
|
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||||
from ..scraper import ParserScraper
|
from ..scraper import ParserScraper
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,18 +29,18 @@ class WebToons(ParserScraper):
|
||||||
self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
|
self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
|
||||||
return self.stripUrl % currentEpisode
|
return self.stripUrl % currentEpisode
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def extract_image_urls(self, url, data):
|
||||||
# Save link order for position-based filenames
|
# Save link order for position-based filenames
|
||||||
self.imageUrls = super().fetchUrls(url, data, urlSearch)
|
self._cached_image_urls = super().extract_image_urls(url, data)
|
||||||
# Update firstStripUrl with the correct episode title
|
# Update firstStripUrl with the correct episode title
|
||||||
if url.rsplit('=', 1)[-1] == '1':
|
if url.rsplit('=', 1)[-1] == '1':
|
||||||
self.firstStripUrl = url
|
self.firstStripUrl = url
|
||||||
return self.imageUrls
|
return self._cached_image_urls
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
# Construct filename from episode number and image position on page
|
# Construct filename from episode number and image position on page
|
||||||
episodeNum = pageUrl.rsplit('=', 1)[-1]
|
episodeNum = pageUrl.rsplit('=', 1)[-1]
|
||||||
imageNum = self.imageUrls.index(imageUrl)
|
imageNum = self._cached_image_urls.index(imageUrl)
|
||||||
imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0]
|
imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0]
|
||||||
return "%s-%03d.%s" % (episodeNum, imageNum, imageExt)
|
return "%s-%03d.%s" % (episodeNum, imageNum, imageExt)
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2022 Tobias Gruetzmacher
|
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import html
|
import html
|
||||||
|
@ -137,7 +137,7 @@ class Scraper:
|
||||||
|
|
||||||
def getComicStrip(self, url, data):
|
def getComicStrip(self, url, data):
|
||||||
"""Get comic strip downloader for given URL and data."""
|
"""Get comic strip downloader for given URL and data."""
|
||||||
imageUrls = self.fetchUrls(url, data, self.imageSearch)
|
imageUrls = self.extract_image_urls(url, data)
|
||||||
# map modifier function on image URLs
|
# map modifier function on image URLs
|
||||||
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
|
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
|
||||||
# remove duplicate URLs
|
# remove duplicate URLs
|
||||||
|
@ -325,6 +325,12 @@ class Scraper:
|
||||||
"""
|
"""
|
||||||
return get_page(url, self.session, allow_errors=self.allow_errors)
|
return get_page(url, self.session, allow_errors=self.allow_errors)
|
||||||
|
|
||||||
|
def extract_image_urls(self, url, data):
|
||||||
|
"""
|
||||||
|
Extract image URLs from page data using the classes imageSearch attribute.
|
||||||
|
"""
|
||||||
|
return self.fetchUrls(url, data, self.imageSearch)
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlsearch):
|
def fetchUrls(self, url, data, urlsearch):
|
||||||
raise ValueError("No implementation for fetchUrls!")
|
raise ValueError("No implementation for fetchUrls!")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue