Try to fix some more comics using the Internet Archive

Tobias Gruetzmacher 2021-01-31 23:40:21 +01:00
parent 5b3bfdd09e
commit 87f4049347
10 changed files with 27 additions and 50 deletions
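
All of the plugin changes below follow the same pattern: when a comic's site has gone offline, the scraper's url is pointed at a fixed Wayback Machine snapshot and the module is flagged with endOfLife = True, so dosage treats the comic as finished rather than expecting new strips; scrapers that could not be fixed this way (JustAnotherEscape, Ryugou) are moved to the Removed list instead. A minimal sketch of the pattern, assuming the dosage plugin API as used in the diffs below (the class name, domain and snapshot timestamp are invented for illustration):

    # Sketch only: the archive-URL pattern applied throughout this commit.
    # "ExampleComic", its domain and the snapshot timestamp are made up.
    from ..scraper import _ParserScraper


    class ExampleComic(_ParserScraper):
        # Scrape a fixed Wayback Machine capture instead of the dead site;
        # dosage walks backwards from here via prevSearch as usual.
        url = 'https://web.archive.org/web/20200101000000/http://example-comic.invalid/'
        stripUrl = url + '?id=%s'
        firstStripUrl = stripUrl % '1'
        imageSearch = '//img[@id="comicimg"]'  # XPath, as _ParserScraper expects
        prevSearch = '//a[text()="Prev"]'
        help = 'Index format: number'
        endOfLife = True  # the site is gone; no new strips are expected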

View file

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
+# Copyright (C) 2015-2021 Tobias Gruetzmacher
 # Copyright (C) 2019-2020 Daniel Ring
 from re import compile, escape
@@ -121,14 +121,14 @@ class ForestHill(_WordPressScraper):
     url = 'https://www.foresthillcomic.org/'


-class ForLackOfABetterComic(_BasicScraper):
-    url = 'http://forlackofabettercomic.com/'
-    rurl = r'http://(?:www\.)?forlackofabettercomic\.com/'
+class ForLackOfABetterComic(_ParserScraper):
+    url = 'https://web.archive.org/web/20200224010115/http://forlackofabettercomic.com/'
     stripUrl = url + '?id=%s'
     firstStripUrl = stripUrl % '1'
-    imageSearch = compile(tagre("img", "src", r'(%simg/comic/\d+[^"]+)' % rurl, after="comicimg"))
-    prevSearch = compile(tagre("a", "href", r'(%s\?id\=\d+)' % rurl) + r'Prev')
+    imageSearch = '//img[@id="comicimg"]'
+    prevSearch = '//a[text()="Prev"]'
     help = 'Index format: number'
+    endOfLife = True


 class FoxDad(_ParserScraper):

View file

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
+# Copyright (C) 2015-2021 Tobias Gruetzmacher
 from re import compile, escape

 from ..scraper import _BasicScraper
@@ -44,14 +44,3 @@ class JohnnyWander(_ComicControlScraper):
     imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src',
                    _ComicControlScraper.imageSearch)
     url = 'http://www.johnnywander.com/'
-
-
-class JustAnotherEscape(_BasicScraper):
-    url = 'http://www.justanotherescape.com/'
-    rurl = escape(url)
-    stripUrl = url + 'index.cgi?date=%s'
-    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
-    prevSearch = compile(tagre("a", "href",
-                               r'(%s/index\.cgi\?date=\d+)' % rurl) +
-                         tagre("img", "alt", "Previous Comic"))
-    help = 'Index format: yyyymmdd'

View file

@@ -22,7 +22,8 @@ class OctopusPie(_ParserScraper):


 class OffWhite(_ParserScraper):
-    stripUrl = 'http://off-white.eu/comic/%s/'
+    baseUrl = 'https://web.archive.org/web/20200627222318/http://off-white.eu/'
+    stripUrl = baseUrl + 'comic/%s/'
     firstStripUrl = stripUrl % 'prologue-page-1-2'
     url = firstStripUrl
     imageSearch = '//img[@class="comic-page"]'
@@ -34,7 +34,7 @@ class OffWhite(_ParserScraper):
     def fetchUrls(self, url, data, urlSearch):
         # Fix missing page
         if url == self.stripUrl % 'page-37':
-            return ['http://off-white.eu/ow_v2/wp-content/uploads/2011/01/new-037.jpg']
+            return [self.baseUrl + 'ow_v2/wp-content/uploads/2011/01/new-037.jpg']
         return super(OffWhite, self).fetchUrls(url, data, urlSearch)

     def getPrevUrl(self, url, data):

View file

@@ -673,10 +673,12 @@ class Removed(Scraper):
             cls('GoComics/UncleArtsFunland'),
             cls('GoComics/USAcres'),
             cls('GoComics/WorldOfWonder'),
+            cls('JustAnotherEscape'),
             cls('Laiyu', 'brk'),
             cls('MangaDex/DrStone', 'legal'),
             cls('MangaDex/HeavensDesignTeam', 'legal'),
             cls('MangaDex/SPYxFAMILY', 'legal'),
+            cls('Ryugou'),
             cls('SmackJeeves/20TimesKirby'),
             cls('SmackJeeves/2Kingdoms'),
             cls('SmackJeeves/355Days'),

View file

@@ -308,13 +308,14 @@ class ProphecyOfTheCircle(_WPNavi):


 class Prototype(_ParserScraper):
-    stripUrl = 'http://planetprototype.com/%s/'
+    stripUrl = 'https://web.archive.org/web/20201030035444/http://planetprototype.com/%s/'
     firstStripUrl = stripUrl % '2018/03/30/vol-1-ch-1-front-cover'
     url = firstStripUrl
     imageSearch = '//img[contains(@class, "wp-post-image")]'
     prevSearch = '//a[.//text()="Previous"]'
     latestSearch = '//a[.//text()="Latest"]'
     starter = indirectStarter
+    endOfLife = True


 class PS238(_ParserScraper):

View file

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
+# Copyright (C) 2015-2021 Tobias Gruetzmacher
 # Copyright (C) 2019-2020 Daniel Ring
 from re import compile
 from urllib.parse import urljoin
@@ -153,22 +153,3 @@ class Ruthe(_BasicScraper):
     imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(/cartoon/\d+/datum/asc/)'))
     help = 'Index format: number'
-
-
-class Ryugou(_WPWebcomic):
-    url = 'http://ryugou.swashbuckledcomics.com/'
-    stripUrl = url + 'comic/%s/'
-    firstStripUrl = 'ryugou-chapter-1-cover'
-    starter = bounceStarter
-    adult = True
-
-    def namer(self, imageUrl, pageUrl):
-        title = pageUrl.rstrip('/').rsplit('/', 1)[-1]
-        ext = imageUrl.rsplit('.', 1)[-1]
-        return title + '.' + ext
-
-    def fetchUrls(self, url, data, urlSearch):
-        imageUrls = super(Ryugou, self).fetchUrls(url, data, urlSearch)
-        if url == self.stripUrl % '1-3':
-            imageUrls = [imageUrls[1]]
-        return imageUrls

View file

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
+# Copyright (C) 2015-2021 Tobias Gruetzmacher
 # Copyright (C) 2019-2020 Daniel Ring
 from re import compile, escape, IGNORECASE, sub
 from os.path import splitext
@@ -345,11 +345,13 @@ class SnowFlame(_WordPressScraper):


 class SodiumEyes(_WordPressScraper):
-    url = 'http://sodiumeyes.com/'
+    url = 'https://web.archive.org/web/20200220041406/http://sodiumeyes.com/'
+    starter = indirectStarter
+    endOfLife = True


 class SoloLeveling(_ParserScraper):
-    url = 'https://w1.sololeveling.net/'
+    url = 'https://w3.sololeveling.net/'
     stripUrl = url + 'manga/solo-leveling-chapter-%s/'
     firstStripUrl = stripUrl % '1'
     imageSearch = '//div[@class="img_container"]//img'

View file

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
+# Copyright (C) 2015-2021 Tobias Gruetzmacher
 from re import compile, escape

 from ..scraper import _BasicScraper, _ParserScraper
@@ -27,12 +27,13 @@ class Zapiro(_ParserScraper):


 class ZenPencils(_WPNavi):
-    url = 'https://zenpencils.com/'
+    url = 'https://web.archive.org/web/20200723091741/https://zenpencils.com/'
     multipleImagesPerStrip = True
     firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/'
     starter = bounceStarter
     prevSearch = '//a[d:class("navi-prev")]'
     nextSearch = '//a[d:class("navi-next")]'
+    endOfLife = True


 class ZombieHunters(_BasicScraper):

View file

@@ -155,9 +155,9 @@ class TestDosage(object):

     @responses.activate
     def test_json_page_key_bounce_and_multi_image(self, tmpdir):
-        httpmocks.page('https://zenpencils.com/', 'zp-home')
-        httpmocks.page('https://zenpencils.com/comic/missing/', 'zp-223')
-        httpmocks.page('https://zenpencils.com/comic/lifejacket/', 'zp-222')
+        httpmocks.page(re.compile(r'.*com/$'), 'zp-home')
+        httpmocks.page(re.compile(r'.*com/comic/missing/$'), 'zp-223')
+        httpmocks.page(re.compile(r'.*com/comic/lifejacket/$'), 'zp-222')
         httpmocks.jpeg(re.compile(r'https://cdn-.*\.jpg'))

         cmd_ok("-v", "-b", str(tmpdir), "-o", "json", "ZenPencils")

View file

@@ -44,7 +44,7 @@ class TestModules(object):

     @responses.activate
     def test_sololeveling_geoblock(self, tmpdir):
-        responses.add(responses.GET, 'https://w1.sololeveling.net/',
+        responses.add(responses.GET, 'https://w3.sololeveling.net/',
                       '<span>1020</span>', status=403)

         with pytest.raises(GeoblockedException):