Fix some old modules using the Internet Archive

This commit is contained in:
Tobias Gruetzmacher 2020-01-09 17:38:13 +01:00
parent 275370a835
commit 752525c3e9
16 changed files with 173 additions and 137 deletions

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2018 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -213,14 +213,12 @@ class AlienShores(_WordPressScraper):
firstStripUrl = url + 'AScomic/updated-cover/'
class AllTheGrowingThings(_BasicScraper):
url = 'http://growingthings.typodmary.com/'
rurl = escape(url)
class AllTheGrowingThings(_WordPressScraper):
url = ('https://web.archive.org/web/20160611212229/'
'http://growingthings.typodmary.com/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2009/04/21/all-the-growing-things'
imageSearch = compile(tagre("img", "src", r'(%sfiles/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
help = 'Index format: yyyy/mm/dd/strip-name'
firstStripUrl = stripUrl % 'all-the-growing-things'
endOfLife = True
class AlphaLuna(_ParserScraper):
@ -329,11 +327,14 @@ class Angels2200(_BasicScraper):
class Annyseed(_ParserScraper):
baseUrl = 'http://www.mirrorwoodcomics.com/'
url = baseUrl + 'AnnyseedLatest.htm'
baseUrl = ('https://web.archive.org/web/20190511031451/'
'http://www.mirrorwoodcomics.com/')
stripUrl = baseUrl + 'Annyseed%s.htm'
url = stripUrl % 'Latest'
firstStripUrl = stripUrl % '000'
imageSearch = '//div/img[contains(@src, "Annyseed")]'
prevSearch = '//a[img[@name="Previousbtn"]]'
endOfLife = True
help = 'Index format: nnn'
FIX_RE = compile(r'Annyseed/Finished%20For%20Print/')

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -156,12 +156,13 @@ class BiggerThanCheeses(_BasicScraper):
class BillyTheDunce(_ParserScraper):
url = 'http://www.duncepress.com/'
firstStripUrl = url + '2009/06/an-introduction-of-sorts'
stripUrl = ('https://web.archive.org/web/20180404142544/'
'http://www.duncepress.com/%s/')
url = stripUrl % '2012/02/losing-more'
firstStripUrl = stripUrl % '2009/06/an-introduction-of-sorts'
imageSearch = '//div[@class="entry"]/p[1]/a'
prevSearch = '//a[@rel="prev"]'
latestSearch = '//h2[@class="post-title"]/a'
starter = indirectStarter
endOfLife = True
class BittersweetCandyBowl(_ParserScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -125,8 +125,8 @@ class CatenaCafe(_WordPressScraper):
class CatenaManor(_ParserScraper):
# Retrieve comic from the Internet Archive
baseUrl = 'https://web.archive.org/web/20141027141116/http://catenamanor.com/'
baseUrl = ('https://web.archive.org/web/20141027141116/'
'http://catenamanor.com/')
url = baseUrl + 'archives'
stripUrl = baseUrl + '%s/'
firstStripUrl = stripUrl % '2003/07'

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -139,7 +139,8 @@ class DemolitionSquad(_ParserScraper):
class DerTodUndDasMaedchen(_ParserScraper):
url = 'http://www.cartoontomb.de/deutsch/tod2.php'
url = ('https://web.archive.org/web/20180106180134/'
'http://www.cartoontomb.de/deutsch/tod2.php')
stripUrl = url + '?bild=%s.jpg'
firstStripUrl = stripUrl % '00_01_01'
imageSearch = '//img[contains(@src, "images/tod/teil2")]'
@ -305,16 +306,17 @@ class DresdenCodak(_ParserScraper):
return not data.xpath(self.imageSearch)
class DrFun(_BasicScraper):
baseUrl = 'http://www.ibiblio.org/Dave/'
url = baseUrl + 'ar00502.htm'
class DrFun(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20180726145737/'
'http://www.ibiblio.org/Dave/')
stripUrl = baseUrl + 'ar%s.htm'
url = stripUrl % '00502'
firstStripUrl = stripUrl % '00001'
imageSearch = compile(tagre("a", "href", r'(Dr-Fun/df\d+/df[^"]+)'))
imageSearch = '//a[contains(@href, "Dr-Fun/df")]'
multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'([^"]+)') + 'Previous Week,')
help = 'Index format: nnnnn'
prevSearch = '//a[contains(text(), "Previous Week")]'
endOfLife = True
help = 'Index format: nnnnn'
class Drive(_BasicScraper):

View file

@ -181,10 +181,11 @@ class EverybodyLovesEricRaymond(_ParserScraper):
prevSearch = '//a[@rel="prev"]'
# Seems to be GeoBlocked from Germany?
class EvilDiva(_WordPressScraper):
url = 'http://www.evildivacomics.com/'
url = ('https://web.archive.org/web/20190221223751/'
'https://www.evildivacomics.com/')
firstStripUrl = url + 'comic/evil-diva-issue-1-cover/'
endOfLife = True
class EvilInc(_WordPressScraper):

View file

@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape, IGNORECASE
from re import compile, escape
from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper
@ -27,14 +27,15 @@ class Faneurysm(_WPNaviIn):
endOfLife = True
class FantasyRealms(_BasicScraper):
url = 'http://www.fantasyrealmsonline.com/'
stripUrl = url + 'manga/%s.php'
imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE)
prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE)
latestSearch = compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE)
class FantasyRealms(_ParserScraper):
stripUrl = ('https://web.archive.org/web/20161204192651/'
'http://fantasyrealmsonline.com/manga/%s.php')
url = stripUrl % '091'
firstStripUrl = stripUrl % '001'
imageSearch = '//img[contains(@src, "/manga/0")]'
prevSearch = '//a[img[contains(@src, "nav-back")]]'
endOfLife = True
help = 'Index format: nnn'
starter = indirectStarter
class FarToTheNorth(_ComicControlScraper):
@ -57,16 +58,15 @@ class FireflyCross(_WordPressScraper):
firstStripUrl = url + '?comic=05062002'
class FirstWorldProblems(_BasicScraper):
url = 'http://bradcolbow.com/archive/C5/'
class FirstWorldProblems(_ParserScraper):
url = ('https://web.archive.org/web/20150710053456/'
'http://bradcolbow.com/archive/C5/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P10'
imageSearch = compile(tagre("img", "src",
r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
prevSearch = compile(tagre("a", "href",
r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
prevSearch = '//a[{}]'.format(xpath_class('prev'))
multipleImagesPerStrip = True
help = 'Index format: a letter and a number'
endOfLife = True
class FlakyPastry(_BasicScraper):
@ -79,12 +79,14 @@ class FlakyPastry(_BasicScraper):
help = 'Index format: nnnn'
class Flemcomics(_BasicScraper):
url = 'http://www.flemcomics.com/'
class Flemcomics(_ParserScraper):
url = ('https://web.archive.org/web/20180414110349/'
'http://www.flemcomics.com/')
stripUrl = url + 'd/%s.html'
imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/d/\d+\.html)') +
tagre("img", "src", r'/images/previous_day\.jpg'))
firstStripUrl = stripUrl % '19980101'
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd'
@ -160,10 +162,12 @@ class FoxTails(_ParserScraper):
class Fragile(_ParserScraper):
url = 'http://www.fragilestory.com/'
url = ('https://web.archive.org/web/20190308203109/'
'http://www.fragilestory.com/')
imageSearch = '//div[@id="comic_strip"]/a[@class="nobg"]/img'
prevSearch = '//div[@id="nav_comic_a"]/a[2]'
firstStripUrl = url + 'strips/chapter_01'
endOfLife = True
class FredoAndPidjin(_ParserScraper):
@ -214,19 +218,22 @@ class FullFrontalNerdity(_BasicScraper):
help = 'Index format: number'
class FunInJammies(_BasicScraper):
url = 'http://www.funinjammies.com/'
class FunInJammies(_WordPressScraper):
url = ('https://web.archive.org/web/20170205105241/'
'http://www.funinjammies.com/')
stripUrl = url + 'comic.php?issue=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(r'(/comics/.+?)"')
prevSearch = compile(r'(/comic.php.+?)" id.+?prev')
prevSearch = '//a[text()="< Prev"]'
endOfLife = True
help = 'Index format: n (unpadded)'
class FurPiled(_ParserScraper):
stripUrl = 'https://web.archive.org/web/20160404074145/http://www.liondogworks.com/images/fp-%03d.jpg'
stripUrl = ('https://web.archive.org/web/20160404074145/'
'http://www.liondogworks.com/images/fp-%03d.jpg')
url = stripUrl % 427
firstStripUrl = stripUrl % 1
endOfLife = True
def getPrevUrl(self, url, data):
# Skip missing pages

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -21,20 +21,12 @@ class Galaxion(_WPNavi):
help = 'Index format: n-comic/book-n/chapter-n/title-nnn'
class Garanos(_BasicScraper):
baseUrl = 'http://garanos.alexheberling.com/'
rurl = escape(baseUrl)
url = baseUrl + 'pages/page-1/'
starter = indirectStarter
stripUrl = baseUrl + 'pages/page-%s'
imageSearch = compile(
tagre("img", "src",
r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="prev"))
latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last"))
help = 'Index format: n (unpadded)'
class Garanos(_WordPressScraper):
stripUrl = ('https://web.archive.org/web/20180314181433/'
'http://garanos.alexheberling.com/pages/%s/')
url = stripUrl % 'page-487'
firstStripUrl = stripUrl % 'vol01'
endOfLife = True
class GastroPhobia(_ParserScraper):
@ -46,13 +38,14 @@ class GastroPhobia(_ParserScraper):
help = 'Index format: yyyy-mm-dd'
class Geeks(_BasicScraper):
url = 'http://sevenfloorsdown.com/geeks/'
class Geeks(_ParserScraper):
url = ('https://web.archive.org/web/20190527194921/'
'http://sevenfloorsdown.com/geeks/')
stripUrl = url + 'archives/%s'
firstStripUrl = stripUrl % '10'
imageSearch = compile(
r'<img src=\'(http://sevenfloorsdown.com/geeks/comics/.+?)\'')
prevSearch = compile(r'<a href="(.+?)">&laquo; Previous')
imageSearch = '//div[@id="comic"]/img'
prevSearch = '//a[contains(text(), "Previous")]'
endOfLife = True
help = 'Index format: nnn'
@ -116,15 +109,12 @@ class GlassHalfEmpty(_BasicScraper):
help = 'Index format: nnn'
class GleefulNihilism(_BasicScraper):
url = 'http://gleefulnihilism.com/'
rurl = escape(url)
class GleefulNihilism(_WordPressScraper):
url = ('https://web.archive.org/web/20170911203122/'
'http://gleefulnihilism.com/')
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'amoeba'
imageSearch = compile(
tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(
tagre("a", "href", r'(%scomic/[^"]+)' % rurl) + '&lsaquo;')
endOfLife = True
help = 'Index format: stripname'

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -68,9 +68,11 @@ class LetsSpeakEnglish(_ComicControlScraper):
class LifeAintNoPonyFarm(_WordPressScraper):
url = 'http://sarahburrini.com/en/'
url = ('https://web.archive.org/web/20181221154155/'
'http://sarahburrini.com/en/')
firstStripUrl = url + 'comic/my-first-webcomic/'
multipleImagesPerStrip = True
endOfLife = True
class LilithsWord(_ComicControlScraper):

View file

@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter
from ..helpers import indirectStarter, xpath_class
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -88,12 +88,13 @@ class Newshounds(_ParserScraper):
return super().getPrevUrl(url, data)
class NewWorld(_BasicScraper):
url = 'http://www.tfsnewworld.com/'
class NewWorld(_WordPressScraper):
url = ('https://web.archive.org/web/20190718012133/'
'http://www.tfsnewworld.com/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2007/08/30/63'
imageSearch = compile(r'<img src="(http://www.tfsnewworld.com/comics/.+?)"')
prevSearch = compile(r'<div class="nav-previous"><a href="([^"]+)" rel="prev">')
prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyy/mm/dd/stripn'
@ -109,7 +110,9 @@ class NichtLustig(_BasicScraper):
class Nicky510(_WPNavi):
url = 'http://www.nickyitis.com/'
url = ('https://web.archive.org/web/20160510215718/'
'http://www.nickyitis.com/')
endOfLife = True
class NicoleAndDerek(_ParserScraper):
@ -140,13 +143,13 @@ class Nightshift(_ParserScraper):
return chapter + '_' + page
class Nimona(_BasicScraper):
url = 'http://gingerhaze.com/nimona/'
class Nimona(_ParserScraper):
url = ('https://web.archive.org/web/20141008095502/'
'http://gingerhaze.com/nimona/')
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % "page-1"
imageSearch = compile(tagre("img", "src", r'(http://gingerhaze\.com/sites/default/files/nimona-pages/.+?)'))
prevSearch = compile(r'<a href="(/nimona/comic/[^"]+)"><img src="http://gingerhaze\.com/sites/default/files/comicdrop/comicdrop_prev_label_file\.png"')
help = 'Index format: stripname'
imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page'))
prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
endOfLife = True

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -125,7 +125,9 @@ class OnTheFastrack(_BasicScraper):
class OopsComicAdventure(_WordPressScraper):
url = 'http://oopscomicadventure.com/'
url = ('https://web.archive.org/web/20190102215141/'
'http://oopscomicadventure.com/')
endOfLife = True
class Optipess(_WPNavi):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -44,8 +44,10 @@ class ParallelUniversum(_BasicScraper):
class PartiallyClips(_WordPressScraper):
url = 'http://partiallyclips.com/'
url = ('https://web.archive.org/web/20180509161332/'
'http://partiallyclips.com/')
firstStripUrl = url + 'comic/screaming-woman/'
endOfLife = True
class PastelDefender(_BasicScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -119,34 +119,36 @@ class SchlockMercenary(_ParserScraper):
help = 'Index format: yyyy-mm-dd'
class SchoolBites(_BasicScraper):
url = 'http://schoolbites.net/'
class SchoolBites(_ParserScraper):
url = ('https://web.archive.org/web/20170215065523/'
'http://schoolbites.net/')
stripUrl = url + 'd/%s.html'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.schoolbites\.net/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://schoolbites\.net/d/\d+\.html)', after="prev"))
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd'
class Schuelert(_BasicScraper):
url = 'http://www.schuelert.de/'
rurl = escape(url)
class Schuelert(_ParserScraper):
url = ('https://web.archive.org/web/20190103022830/'
'http://www.schuelert.de/')
stripUrl = url + 'index.php?paged=%s'
firstStripUrl = stripUrl % '5'
imageSearch = compile(tagre("img", "src", r"(%swp-content/[^']+)" % rurl, quote="'"))
prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?paged=\d+)' % rurl) + "&laquo;")
firstStripUrl = stripUrl % '3'
imageSearch = '//img[contains(@src, "wp-content")]'
prevSearch = '//span[{}]/a'.format(xpath_class('prevlink'))
multipleImagesPerStrip = True
help = 'Index format: none'
endOfLife = True
lang = 'de'
class Science(_BasicScraper):
url = 'http://sci-ence.org/'
rurl = escape(url)
stripUrl = url + '%s/'
class Science(_ParserScraper):
stripUrl = ('https://web.archive.org/web/20180616152753/'
'http://sci-ence.org/%s/')
url = stripUrl % 'new-york-comic-con-2013'
firstStripUrl = stripUrl % 'periodic-table-element-ass'
prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl, after="prev"))
imageSearch = compile(tagre("img", "src", r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
help = 'Index format: stripname'
prevSearch = '//a[{}]'.format(xpath_class('navi-prev'))
imageSearch = '//div[@class="comicpane"]//img'
endOfLife = True
class SeelPeel(_WPNaviIn):
@ -321,10 +323,12 @@ class SMBC(_ComicControlScraper):
class SnowFlame(_WordPressScraper):
url = 'http://www.snowflamecomic.com/'
url = ('https://web.archive.org/web/20160905071051/'
'http://www.snowflamecomic.com/')
stripUrl = url + '?comic=snowflame-%s-%s'
firstStripUrl = stripUrl % ('01', '01')
starter = bounceStarter
endOfLife = True
help = 'Index format: chapter-page'
def getIndexStripUrl(self, index):
@ -493,12 +497,14 @@ class StandStillStaySilent(_ParserScraper):
class StarCrossdDestiny(_ParserScraper):
baseUrl = 'http://starcrossd.net/'
baseUrl = ('https://web.archive.org/web/20190918132321/'
'http://starcrossd.net/')
url = baseUrl + 'comic.html'
stripUrl = baseUrl + 'archives/%s.html'
firstStripUrl = stripUrl % '00000001'
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[text()="prev"]'
endOfLife = True
help = 'Index format: nnnnnnnn'
def namer(self, image_url, page_url):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -48,10 +48,14 @@ class Tamberlane(_ParserScraper):
class TheBrads(_ParserScraper):
url = 'http://bradcolbow.com/archive/'
imageSearch = '//div[%s]//img' % xpath_class('entry')
prevSearch = '//a[%s]' % xpath_class('prev')
url = ('https://web.archive.org/web/20171211154809/'
'http://bradcolbow.com/archive/C4/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P125'
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
prevSearch = '//a[{}]'.format(xpath_class('prev'))
multipleImagesPerStrip = True
endOfLife = True
class TheClassMenagerie(_ParserScraper):
@ -107,15 +111,14 @@ class TheJunkHyenasDiner(_WordPressScraper):
firstStripUrl = stripUrl % 'intro'
class TheLandscaper(_BasicScraper):
stripUrl = 'http://landscaper.visual-assault.net/comic/%s'
class TheLandscaper(_ParserScraper):
stripUrl = ('https://web.archive.org/web/20171129163510/'
'http://landscaper.visual-assault.net/comic/%s')
url = stripUrl % 'latest'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src",
r'(/comics/comic/comic_page/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/comic/[^"]+)') +
'&lsaquo; Previous')
help = 'Index format: name'
imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic'))
prevSearch = '//a[contains(text(), "Previous")]'
endOfLife = True
class TheMelvinChronicles(_WordPressScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -14,8 +14,10 @@ from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
class Underling(_WPNavi):
url = 'http://underlingcomic.com/'
url = ('https://web.archive.org/web/20190806120425/'
'http://underlingcomic.com/')
firstStripUrl = url + 'page-one/'
endOfLife = True
class Undertow(_BasicScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -30,6 +30,9 @@ from .output import out
from .events import getHandler
# Matches the Internet Archive prefix of a URL, i.e. the scheme, the
# "web.archive.org/web/" host and path, and the following path segment
# (everything up to and including the next "/"). The pattern is not anchored,
# so it matches wherever that prefix occurs in a string. isfirststrip() uses it
# to strip the prefix before comparing URLs.
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
class Scraper(object):
'''Base class for all comic scraper, but without a specific scrape
implementation.'''
@ -183,7 +186,7 @@ class Scraper(object):
except ValueError as msg:
# image not found
out.exception(msg)
if self.firstStripUrl == url:
if self.isfirststrip(url):
out.debug(u"Stop at first URL %s" % url)
self.hitFirstStripUrl = True
break
@ -199,6 +202,17 @@ class Scraper(object):
break
url = prevUrl
def isfirststrip(self, url):
    """Tell whether ``url`` points at the first strip of this comic.

    The archive.org prefix is removed from both ``self.firstStripUrl`` and
    ``url`` before they are compared. This matters for comics fetched from
    archive.org, because the prefix differs whenever a page is served from
    a different snapshot.

    Returns False when no first strip URL is configured.
    """
    first = self.firstStripUrl
    if not first:
        return False
    # Compare only what is left after dropping the archive.org prefix.
    return ARCHIVE_ORG_URL.sub('', first) == ARCHIVE_ORG_URL.sub('', url)
def getPrevUrl(self, url, data):
"""Find previous URL."""
prevUrl = None