Fix some old modules using the Internet Archive

This commit is contained in:
Tobias Gruetzmacher 2020-01-09 17:38:13 +01:00
parent 275370a835
commit 752525c3e9
16 changed files with 173 additions and 137 deletions

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2018 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -213,14 +213,12 @@ class AlienShores(_WordPressScraper):
firstStripUrl = url + 'AScomic/updated-cover/' firstStripUrl = url + 'AScomic/updated-cover/'
class AllTheGrowingThings(_BasicScraper): class AllTheGrowingThings(_WordPressScraper):
url = 'http://growingthings.typodmary.com/' url = ('https://web.archive.org/web/20160611212229/'
rurl = escape(url) 'http://growingthings.typodmary.com/')
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2009/04/21/all-the-growing-things' firstStripUrl = stripUrl % 'all-the-growing-things'
imageSearch = compile(tagre("img", "src", r'(%sfiles/[^"]+)' % rurl)) endOfLife = True
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
help = 'Index format: yyyy/mm/dd/strip-name'
class AlphaLuna(_ParserScraper): class AlphaLuna(_ParserScraper):
@ -329,11 +327,14 @@ class Angels2200(_BasicScraper):
class Annyseed(_ParserScraper): class Annyseed(_ParserScraper):
baseUrl = 'http://www.mirrorwoodcomics.com/' baseUrl = ('https://web.archive.org/web/20190511031451/'
url = baseUrl + 'AnnyseedLatest.htm' 'http://www.mirrorwoodcomics.com/')
stripUrl = baseUrl + 'Annyseed%s.htm' stripUrl = baseUrl + 'Annyseed%s.htm'
url = stripUrl % 'Latest'
firstStripUrl = stripUrl % '000'
imageSearch = '//div/img[contains(@src, "Annyseed")]' imageSearch = '//div/img[contains(@src, "Annyseed")]'
prevSearch = '//a[img[@name="Previousbtn"]]' prevSearch = '//a[img[@name="Previousbtn"]]'
endOfLife = True
help = 'Index format: nnn' help = 'Index format: nnn'
FIX_RE = compile(r'Annyseed/Finished%20For%20Print/') FIX_RE = compile(r'Annyseed/Finished%20For%20Print/')

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -156,12 +156,13 @@ class BiggerThanCheeses(_BasicScraper):
class BillyTheDunce(_ParserScraper): class BillyTheDunce(_ParserScraper):
url = 'http://www.duncepress.com/' stripUrl = ('https://web.archive.org/web/20180404142544/'
firstStripUrl = url + '2009/06/an-introduction-of-sorts' 'http://www.duncepress.com/%s/')
url = stripUrl % '2012/02/losing-more'
firstStripUrl = stripUrl % '2009/06/an-introduction-of-sorts'
imageSearch = '//div[@class="entry"]/p[1]/a' imageSearch = '//div[@class="entry"]/p[1]/a'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
latestSearch = '//h2[@class="post-title"]/a' endOfLife = True
starter = indirectStarter
class BittersweetCandyBowl(_ParserScraper): class BittersweetCandyBowl(_ParserScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -125,8 +125,8 @@ class CatenaCafe(_WordPressScraper):
class CatenaManor(_ParserScraper): class CatenaManor(_ParserScraper):
# Retrieve comic from the Internet Archive baseUrl = ('https://web.archive.org/web/20141027141116/'
baseUrl = 'https://web.archive.org/web/20141027141116/http://catenamanor.com/' 'http://catenamanor.com/')
url = baseUrl + 'archives' url = baseUrl + 'archives'
stripUrl = baseUrl + '%s/' stripUrl = baseUrl + '%s/'
firstStripUrl = stripUrl % '2003/07' firstStripUrl = stripUrl % '2003/07'

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -139,7 +139,8 @@ class DemolitionSquad(_ParserScraper):
class DerTodUndDasMaedchen(_ParserScraper): class DerTodUndDasMaedchen(_ParserScraper):
url = 'http://www.cartoontomb.de/deutsch/tod2.php' url = ('https://web.archive.org/web/20180106180134/'
'http://www.cartoontomb.de/deutsch/tod2.php')
stripUrl = url + '?bild=%s.jpg' stripUrl = url + '?bild=%s.jpg'
firstStripUrl = stripUrl % '00_01_01' firstStripUrl = stripUrl % '00_01_01'
imageSearch = '//img[contains(@src, "images/tod/teil2")]' imageSearch = '//img[contains(@src, "images/tod/teil2")]'
@ -305,16 +306,17 @@ class DresdenCodak(_ParserScraper):
return not data.xpath(self.imageSearch) return not data.xpath(self.imageSearch)
class DrFun(_BasicScraper): class DrFun(_ParserScraper):
baseUrl = 'http://www.ibiblio.org/Dave/' baseUrl = ('https://web.archive.org/web/20180726145737/'
url = baseUrl + 'ar00502.htm' 'http://www.ibiblio.org/Dave/')
stripUrl = baseUrl + 'ar%s.htm' stripUrl = baseUrl + 'ar%s.htm'
url = stripUrl % '00502'
firstStripUrl = stripUrl % '00001' firstStripUrl = stripUrl % '00001'
imageSearch = compile(tagre("a", "href", r'(Dr-Fun/df\d+/df[^"]+)')) imageSearch = '//a[contains(@href, "Dr-Fun/df")]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = compile(tagre("a", "href", r'([^"]+)') + 'Previous Week,') prevSearch = '//a[contains(text(), "Previous Week")]'
help = 'Index format: nnnnn'
endOfLife = True endOfLife = True
help = 'Index format: nnnnn'
class Drive(_BasicScraper): class Drive(_BasicScraper):

View file

@ -181,10 +181,11 @@ class EverybodyLovesEricRaymond(_ParserScraper):
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
# Seems to be GeoBlocked from Germany?
class EvilDiva(_WordPressScraper): class EvilDiva(_WordPressScraper):
url = 'http://www.evildivacomics.com/' url = ('https://web.archive.org/web/20190221223751/'
'https://www.evildivacomics.com/')
firstStripUrl = url + 'comic/evil-diva-issue-1-cover/' firstStripUrl = url + 'comic/evil-diva-issue-1-cover/'
endOfLife = True
class EvilInc(_WordPressScraper): class EvilInc(_WordPressScraper):

View file

@ -1,10 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
from re import compile, escape, IGNORECASE from re import compile, escape
from ..util import tagre from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
@ -27,14 +27,15 @@ class Faneurysm(_WPNaviIn):
endOfLife = True endOfLife = True
class FantasyRealms(_BasicScraper): class FantasyRealms(_ParserScraper):
url = 'http://www.fantasyrealmsonline.com/' stripUrl = ('https://web.archive.org/web/20161204192651/'
stripUrl = url + 'manga/%s.php' 'http://fantasyrealmsonline.com/manga/%s.php')
imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE) url = stripUrl % '091'
prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE) firstStripUrl = stripUrl % '001'
latestSearch = compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE) imageSearch = '//img[contains(@src, "/manga/0")]'
prevSearch = '//a[img[contains(@src, "nav-back")]]'
endOfLife = True
help = 'Index format: nnn' help = 'Index format: nnn'
starter = indirectStarter
class FarToTheNorth(_ComicControlScraper): class FarToTheNorth(_ComicControlScraper):
@ -57,16 +58,15 @@ class FireflyCross(_WordPressScraper):
firstStripUrl = url + '?comic=05062002' firstStripUrl = url + '?comic=05062002'
class FirstWorldProblems(_BasicScraper): class FirstWorldProblems(_ParserScraper):
url = 'http://bradcolbow.com/archive/C5/' url = ('https://web.archive.org/web/20150710053456/'
'http://bradcolbow.com/archive/C5/')
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P10' firstStripUrl = stripUrl % 'P10'
imageSearch = compile(tagre("img", "src", imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)')) prevSearch = '//a[{}]'.format(xpath_class('prev'))
prevSearch = compile(tagre("a", "href",
r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
multipleImagesPerStrip = True multipleImagesPerStrip = True
help = 'Index format: a letter and a number' endOfLife = True
class FlakyPastry(_BasicScraper): class FlakyPastry(_BasicScraper):
@ -79,12 +79,14 @@ class FlakyPastry(_BasicScraper):
help = 'Index format: nnnn' help = 'Index format: nnnn'
class Flemcomics(_BasicScraper): class Flemcomics(_ParserScraper):
url = 'http://www.flemcomics.com/' url = ('https://web.archive.org/web/20180414110349/'
'http://www.flemcomics.com/')
stripUrl = url + 'd/%s.html' stripUrl = url + 'd/%s.html'
imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)')) firstStripUrl = stripUrl % '19980101'
prevSearch = compile(tagre("a", "href", r'(/d/\d+\.html)') + imageSearch = '//img[{}]'.format(xpath_class('ksc'))
tagre("img", "src", r'/images/previous_day\.jpg')) prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -160,10 +162,12 @@ class FoxTails(_ParserScraper):
class Fragile(_ParserScraper): class Fragile(_ParserScraper):
url = 'http://www.fragilestory.com/' url = ('https://web.archive.org/web/20190308203109/'
'http://www.fragilestory.com/')
imageSearch = '//div[@id="comic_strip"]/a[@class="nobg"]/img' imageSearch = '//div[@id="comic_strip"]/a[@class="nobg"]/img'
prevSearch = '//div[@id="nav_comic_a"]/a[2]' prevSearch = '//div[@id="nav_comic_a"]/a[2]'
firstStripUrl = url + 'strips/chapter_01' firstStripUrl = url + 'strips/chapter_01'
endOfLife = True
class FredoAndPidjin(_ParserScraper): class FredoAndPidjin(_ParserScraper):
@ -214,19 +218,22 @@ class FullFrontalNerdity(_BasicScraper):
help = 'Index format: number' help = 'Index format: number'
class FunInJammies(_BasicScraper): class FunInJammies(_WordPressScraper):
url = 'http://www.funinjammies.com/' url = ('https://web.archive.org/web/20170205105241/'
'http://www.funinjammies.com/')
stripUrl = url + 'comic.php?issue=%s' stripUrl = url + 'comic.php?issue=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(r'(/comics/.+?)"') prevSearch = '//a[text()="< Prev"]'
prevSearch = compile(r'(/comic.php.+?)" id.+?prev') endOfLife = True
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
class FurPiled(_ParserScraper): class FurPiled(_ParserScraper):
stripUrl = 'https://web.archive.org/web/20160404074145/http://www.liondogworks.com/images/fp-%03d.jpg' stripUrl = ('https://web.archive.org/web/20160404074145/'
'http://www.liondogworks.com/images/fp-%03d.jpg')
url = stripUrl % 427 url = stripUrl % 427
firstStripUrl = stripUrl % 1 firstStripUrl = stripUrl % 1
endOfLife = True
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
# Skip missing pages # Skip missing pages

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -21,20 +21,12 @@ class Galaxion(_WPNavi):
help = 'Index format: n-comic/book-n/chapter-n/title-nnn' help = 'Index format: n-comic/book-n/chapter-n/title-nnn'
class Garanos(_BasicScraper): class Garanos(_WordPressScraper):
baseUrl = 'http://garanos.alexheberling.com/' stripUrl = ('https://web.archive.org/web/20180314181433/'
rurl = escape(baseUrl) 'http://garanos.alexheberling.com/pages/%s/')
url = baseUrl + 'pages/page-1/' url = stripUrl % 'page-487'
starter = indirectStarter firstStripUrl = stripUrl % 'vol01'
stripUrl = baseUrl + 'pages/page-%s' endOfLife = True
imageSearch = compile(
tagre("img", "src",
r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="prev"))
latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
after="nav-last"))
help = 'Index format: n (unpadded)'
class GastroPhobia(_ParserScraper): class GastroPhobia(_ParserScraper):
@ -46,13 +38,14 @@ class GastroPhobia(_ParserScraper):
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
class Geeks(_BasicScraper): class Geeks(_ParserScraper):
url = 'http://sevenfloorsdown.com/geeks/' url = ('https://web.archive.org/web/20190527194921/'
'http://sevenfloorsdown.com/geeks/')
stripUrl = url + 'archives/%s' stripUrl = url + 'archives/%s'
firstStripUrl = stripUrl % '10' firstStripUrl = stripUrl % '10'
imageSearch = compile( imageSearch = '//div[@id="comic"]/img'
r'<img src=\'(http://sevenfloorsdown.com/geeks/comics/.+?)\'') prevSearch = '//a[contains(text(), "Previous")]'
prevSearch = compile(r'<a href="(.+?)">&laquo; Previous') endOfLife = True
help = 'Index format: nnn' help = 'Index format: nnn'
@ -116,15 +109,12 @@ class GlassHalfEmpty(_BasicScraper):
help = 'Index format: nnn' help = 'Index format: nnn'
class GleefulNihilism(_BasicScraper): class GleefulNihilism(_WordPressScraper):
url = 'http://gleefulnihilism.com/' url = ('https://web.archive.org/web/20170911203122/'
rurl = escape(url) 'http://gleefulnihilism.com/')
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'amoeba' firstStripUrl = stripUrl % 'amoeba'
imageSearch = compile( endOfLife = True
tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(
tagre("a", "href", r'(%scomic/[^"]+)' % rurl) + '&lsaquo;')
help = 'Index format: stripname' help = 'Index format: stripname'

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -68,9 +68,11 @@ class LetsSpeakEnglish(_ComicControlScraper):
class LifeAintNoPonyFarm(_WordPressScraper): class LifeAintNoPonyFarm(_WordPressScraper):
url = 'http://sarahburrini.com/en/' url = ('https://web.archive.org/web/20181221154155/'
'http://sarahburrini.com/en/')
firstStripUrl = url + 'comic/my-first-webcomic/' firstStripUrl = url + 'comic/my-first-webcomic/'
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True
class LilithsWord(_ComicControlScraper): class LilithsWord(_ComicControlScraper):

View file

@ -1,14 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter, xpath_class
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -88,12 +88,13 @@ class Newshounds(_ParserScraper):
return super().getPrevUrl(url, data) return super().getPrevUrl(url, data)
class NewWorld(_BasicScraper): class NewWorld(_WordPressScraper):
url = 'http://www.tfsnewworld.com/' url = ('https://web.archive.org/web/20190718012133/'
'http://www.tfsnewworld.com/')
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2007/08/30/63' firstStripUrl = stripUrl % '2007/08/30/63'
imageSearch = compile(r'<img src="(http://www.tfsnewworld.com/comics/.+?)"') prevSearch = '//a[@rel="prev"]'
prevSearch = compile(r'<div class="nav-previous"><a href="([^"]+)" rel="prev">') endOfLife = True
help = 'Index format: yyyy/mm/dd/stripn' help = 'Index format: yyyy/mm/dd/stripn'
@ -109,7 +110,9 @@ class NichtLustig(_BasicScraper):
class Nicky510(_WPNavi): class Nicky510(_WPNavi):
url = 'http://www.nickyitis.com/' url = ('https://web.archive.org/web/20160510215718/'
'http://www.nickyitis.com/')
endOfLife = True
class NicoleAndDerek(_ParserScraper): class NicoleAndDerek(_ParserScraper):
@ -140,13 +143,13 @@ class Nightshift(_ParserScraper):
return chapter + '_' + page return chapter + '_' + page
class Nimona(_BasicScraper): class Nimona(_ParserScraper):
url = 'http://gingerhaze.com/nimona/' url = ('https://web.archive.org/web/20141008095502/'
'http://gingerhaze.com/nimona/')
stripUrl = url + 'comic/%s' stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % "page-1" firstStripUrl = stripUrl % "page-1"
imageSearch = compile(tagre("img", "src", r'(http://gingerhaze\.com/sites/default/files/nimona-pages/.+?)')) imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page'))
prevSearch = compile(r'<a href="(/nimona/comic/[^"]+)"><img src="http://gingerhaze\.com/sites/default/files/comicdrop/comicdrop_prev_label_file\.png"') prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
help = 'Index format: stripname'
endOfLife = True endOfLife = True

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -125,7 +125,9 @@ class OnTheFastrack(_BasicScraper):
class OopsComicAdventure(_WordPressScraper): class OopsComicAdventure(_WordPressScraper):
url = 'http://oopscomicadventure.com/' url = ('https://web.archive.org/web/20190102215141/'
'http://oopscomicadventure.com/')
endOfLife = True
class Optipess(_WPNavi): class Optipess(_WPNavi):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -44,8 +44,10 @@ class ParallelUniversum(_BasicScraper):
class PartiallyClips(_WordPressScraper): class PartiallyClips(_WordPressScraper):
url = 'http://partiallyclips.com/' url = ('https://web.archive.org/web/20180509161332/'
'http://partiallyclips.com/')
firstStripUrl = url + 'comic/screaming-woman/' firstStripUrl = url + 'comic/screaming-woman/'
endOfLife = True
class PastelDefender(_BasicScraper): class PastelDefender(_BasicScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -119,34 +119,36 @@ class SchlockMercenary(_ParserScraper):
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
class SchoolBites(_BasicScraper): class SchoolBites(_ParserScraper):
url = 'http://schoolbites.net/' url = ('https://web.archive.org/web/20170215065523/'
'http://schoolbites.net/')
stripUrl = url + 'd/%s.html' stripUrl = url + 'd/%s.html'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.schoolbites\.net/comics/[^"]+)')) imageSearch = '//img[{}]'.format(xpath_class('ksc'))
prevSearch = compile(tagre("a", "href", r'(http://schoolbites\.net/d/\d+\.html)', after="prev")) prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
class Schuelert(_BasicScraper): class Schuelert(_ParserScraper):
url = 'http://www.schuelert.de/' url = ('https://web.archive.org/web/20190103022830/'
rurl = escape(url) 'http://www.schuelert.de/')
stripUrl = url + 'index.php?paged=%s' stripUrl = url + 'index.php?paged=%s'
firstStripUrl = stripUrl % '5' firstStripUrl = stripUrl % '3'
imageSearch = compile(tagre("img", "src", r"(%swp-content/[^']+)" % rurl, quote="'")) imageSearch = '//img[contains(@src, "wp-content")]'
prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?paged=\d+)' % rurl) + "&laquo;") prevSearch = '//span[{}]/a'.format(xpath_class('prevlink'))
multipleImagesPerStrip = True multipleImagesPerStrip = True
help = 'Index format: none' endOfLife = True
lang = 'de' lang = 'de'
class Science(_BasicScraper): class Science(_ParserScraper):
url = 'http://sci-ence.org/' stripUrl = ('https://web.archive.org/web/20180616152753/'
rurl = escape(url) 'http://sci-ence.org/%s/')
stripUrl = url + '%s/' url = stripUrl % 'new-york-comic-con-2013'
firstStripUrl = stripUrl % 'periodic-table-element-ass' firstStripUrl = stripUrl % 'periodic-table-element-ass'
prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl, after="prev")) prevSearch = '//a[{}]'.format(xpath_class('navi-prev'))
imageSearch = compile(tagre("img", "src", r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl)) imageSearch = '//div[@class="comicpane"]//img'
help = 'Index format: stripname' endOfLife = True
class SeelPeel(_WPNaviIn): class SeelPeel(_WPNaviIn):
@ -321,10 +323,12 @@ class SMBC(_ComicControlScraper):
class SnowFlame(_WordPressScraper): class SnowFlame(_WordPressScraper):
url = 'http://www.snowflamecomic.com/' url = ('https://web.archive.org/web/20160905071051/'
'http://www.snowflamecomic.com/')
stripUrl = url + '?comic=snowflame-%s-%s' stripUrl = url + '?comic=snowflame-%s-%s'
firstStripUrl = stripUrl % ('01', '01') firstStripUrl = stripUrl % ('01', '01')
starter = bounceStarter starter = bounceStarter
endOfLife = True
help = 'Index format: chapter-page' help = 'Index format: chapter-page'
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index):
@ -493,12 +497,14 @@ class StandStillStaySilent(_ParserScraper):
class StarCrossdDestiny(_ParserScraper): class StarCrossdDestiny(_ParserScraper):
baseUrl = 'http://starcrossd.net/' baseUrl = ('https://web.archive.org/web/20190918132321/'
'http://starcrossd.net/')
url = baseUrl + 'comic.html' url = baseUrl + 'comic.html'
stripUrl = baseUrl + 'archives/%s.html' stripUrl = baseUrl + 'archives/%s.html'
firstStripUrl = stripUrl % '00000001' firstStripUrl = stripUrl % '00000001'
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[text()="prev"]' prevSearch = '//a[text()="prev"]'
endOfLife = True
help = 'Index format: nnnnnnnn' help = 'Index format: nnnnnnnn'
def namer(self, image_url, page_url): def namer(self, image_url, page_url):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -48,10 +48,14 @@ class Tamberlane(_ParserScraper):
class TheBrads(_ParserScraper): class TheBrads(_ParserScraper):
url = 'http://bradcolbow.com/archive/' url = ('https://web.archive.org/web/20171211154809/'
imageSearch = '//div[%s]//img' % xpath_class('entry') 'http://bradcolbow.com/archive/C4/')
prevSearch = '//a[%s]' % xpath_class('prev') stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P125'
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
prevSearch = '//a[{}]'.format(xpath_class('prev'))
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True
class TheClassMenagerie(_ParserScraper): class TheClassMenagerie(_ParserScraper):
@ -107,15 +111,14 @@ class TheJunkHyenasDiner(_WordPressScraper):
firstStripUrl = stripUrl % 'intro' firstStripUrl = stripUrl % 'intro'
class TheLandscaper(_BasicScraper): class TheLandscaper(_ParserScraper):
stripUrl = 'http://landscaper.visual-assault.net/comic/%s' stripUrl = ('https://web.archive.org/web/20171129163510/'
'http://landscaper.visual-assault.net/comic/%s')
url = stripUrl % 'latest' url = stripUrl % 'latest'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic'))
r'(/comics/comic/comic_page/[^"]+)')) prevSearch = '//a[contains(text(), "Previous")]'
prevSearch = compile(tagre("a", "href", r'(/comic/[^"]+)') + endOfLife = True
'&lsaquo; Previous')
help = 'Index format: name'
class TheMelvinChronicles(_WordPressScraper): class TheMelvinChronicles(_WordPressScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -14,8 +14,10 @@ from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
class Underling(_WPNavi): class Underling(_WPNavi):
url = 'http://underlingcomic.com/' url = ('https://web.archive.org/web/20190806120425/'
'http://underlingcomic.com/')
firstStripUrl = url + 'page-one/' firstStripUrl = url + 'page-one/'
endOfLife = True
class Undertow(_BasicScraper): class Undertow(_BasicScraper):

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
@ -30,6 +30,9 @@ from .output import out
from .events import getHandler from .events import getHandler
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
class Scraper(object): class Scraper(object):
'''Base class for all comic scraper, but without a specific scrape '''Base class for all comic scraper, but without a specific scrape
implementation.''' implementation.'''
@ -183,7 +186,7 @@ class Scraper(object):
except ValueError as msg: except ValueError as msg:
# image not found # image not found
out.exception(msg) out.exception(msg)
if self.firstStripUrl == url: if self.isfirststrip(url):
out.debug(u"Stop at first URL %s" % url) out.debug(u"Stop at first URL %s" % url)
self.hitFirstStripUrl = True self.hitFirstStripUrl = True
break break
@ -199,6 +202,17 @@ class Scraper(object):
break break
url = prevUrl url = prevUrl
def isfirststrip(self, url):
    """Tell whether ``url`` points at the first strip of this comic.

    A plain string comparison is not enough for comics served from
    archive.org: the snapshot prefix of the URL changes whenever a page
    is taken from a different snapshot. Both URLs are therefore compared
    with any prefix matched by ARCHIVE_ORG_URL removed.

    Returns False when the scraper defines no ``firstStripUrl``.
    """
    first = self.firstStripUrl
    if not first:
        # No known first strip, so nothing can match it.
        return False
    return ARCHIVE_ORG_URL.sub('', first) == ARCHIVE_ORG_URL.sub('', url)
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
"""Find previous URL.""" """Find previous URL."""
prevUrl = None prevUrl = None