Fix some old modules using the Internet Archive
This commit is contained in:
parent
275370a835
commit
752525c3e9
16 changed files with 173 additions and 137 deletions
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2018 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -213,14 +213,12 @@ class AlienShores(_WordPressScraper):
|
||||||
firstStripUrl = url + 'AScomic/updated-cover/'
|
firstStripUrl = url + 'AScomic/updated-cover/'
|
||||||
|
|
||||||
|
|
||||||
class AllTheGrowingThings(_BasicScraper):
|
class AllTheGrowingThings(_WordPressScraper):
|
||||||
url = 'http://growingthings.typodmary.com/'
|
url = ('https://web.archive.org/web/20160611212229/'
|
||||||
rurl = escape(url)
|
'http://growingthings.typodmary.com/')
|
||||||
stripUrl = url + '%s/'
|
stripUrl = url + '%s/'
|
||||||
firstStripUrl = stripUrl % '2009/04/21/all-the-growing-things'
|
firstStripUrl = stripUrl % 'all-the-growing-things'
|
||||||
imageSearch = compile(tagre("img", "src", r'(%sfiles/[^"]+)' % rurl))
|
endOfLife = True
|
||||||
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
|
|
||||||
help = 'Index format: yyyy/mm/dd/strip-name'
|
|
||||||
|
|
||||||
|
|
||||||
class AlphaLuna(_ParserScraper):
|
class AlphaLuna(_ParserScraper):
|
||||||
|
@ -329,11 +327,14 @@ class Angels2200(_BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class Annyseed(_ParserScraper):
|
class Annyseed(_ParserScraper):
|
||||||
baseUrl = 'http://www.mirrorwoodcomics.com/'
|
baseUrl = ('https://web.archive.org/web/20190511031451/'
|
||||||
url = baseUrl + 'AnnyseedLatest.htm'
|
'http://www.mirrorwoodcomics.com/')
|
||||||
stripUrl = baseUrl + 'Annyseed%s.htm'
|
stripUrl = baseUrl + 'Annyseed%s.htm'
|
||||||
|
url = stripUrl % 'Latest'
|
||||||
|
firstStripUrl = stripUrl % '000'
|
||||||
imageSearch = '//div/img[contains(@src, "Annyseed")]'
|
imageSearch = '//div/img[contains(@src, "Annyseed")]'
|
||||||
prevSearch = '//a[img[@name="Previousbtn"]]'
|
prevSearch = '//a[img[@name="Previousbtn"]]'
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
FIX_RE = compile(r'Annyseed/Finished%20For%20Print/')
|
FIX_RE = compile(r'Annyseed/Finished%20For%20Print/')
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -156,12 +156,13 @@ class BiggerThanCheeses(_BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class BillyTheDunce(_ParserScraper):
|
class BillyTheDunce(_ParserScraper):
|
||||||
url = 'http://www.duncepress.com/'
|
stripUrl = ('https://web.archive.org/web/20180404142544/'
|
||||||
firstStripUrl = url + '2009/06/an-introduction-of-sorts'
|
'http://www.duncepress.com/%s/')
|
||||||
|
url = stripUrl % '2012/02/losing-more'
|
||||||
|
firstStripUrl = stripUrl % '2009/06/an-introduction-of-sorts'
|
||||||
imageSearch = '//div[@class="entry"]/p[1]/a'
|
imageSearch = '//div[@class="entry"]/p[1]/a'
|
||||||
prevSearch = '//a[@rel="prev"]'
|
prevSearch = '//a[@rel="prev"]'
|
||||||
latestSearch = '//h2[@class="post-title"]/a'
|
endOfLife = True
|
||||||
starter = indirectStarter
|
|
||||||
|
|
||||||
|
|
||||||
class BittersweetCandyBowl(_ParserScraper):
|
class BittersweetCandyBowl(_ParserScraper):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -125,8 +125,8 @@ class CatenaCafe(_WordPressScraper):
|
||||||
|
|
||||||
|
|
||||||
class CatenaManor(_ParserScraper):
|
class CatenaManor(_ParserScraper):
|
||||||
# Retrieve comic from the Internet Archive
|
baseUrl = ('https://web.archive.org/web/20141027141116/'
|
||||||
baseUrl = 'https://web.archive.org/web/20141027141116/http://catenamanor.com/'
|
'http://catenamanor.com/')
|
||||||
url = baseUrl + 'archives'
|
url = baseUrl + 'archives'
|
||||||
stripUrl = baseUrl + '%s/'
|
stripUrl = baseUrl + '%s/'
|
||||||
firstStripUrl = stripUrl % '2003/07'
|
firstStripUrl = stripUrl % '2003/07'
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -139,7 +139,8 @@ class DemolitionSquad(_ParserScraper):
|
||||||
|
|
||||||
|
|
||||||
class DerTodUndDasMaedchen(_ParserScraper):
|
class DerTodUndDasMaedchen(_ParserScraper):
|
||||||
url = 'http://www.cartoontomb.de/deutsch/tod2.php'
|
url = ('https://web.archive.org/web/20180106180134/'
|
||||||
|
'http://www.cartoontomb.de/deutsch/tod2.php')
|
||||||
stripUrl = url + '?bild=%s.jpg'
|
stripUrl = url + '?bild=%s.jpg'
|
||||||
firstStripUrl = stripUrl % '00_01_01'
|
firstStripUrl = stripUrl % '00_01_01'
|
||||||
imageSearch = '//img[contains(@src, "images/tod/teil2")]'
|
imageSearch = '//img[contains(@src, "images/tod/teil2")]'
|
||||||
|
@ -305,16 +306,17 @@ class DresdenCodak(_ParserScraper):
|
||||||
return not data.xpath(self.imageSearch)
|
return not data.xpath(self.imageSearch)
|
||||||
|
|
||||||
|
|
||||||
class DrFun(_BasicScraper):
|
class DrFun(_ParserScraper):
|
||||||
baseUrl = 'http://www.ibiblio.org/Dave/'
|
baseUrl = ('https://web.archive.org/web/20180726145737/'
|
||||||
url = baseUrl + 'ar00502.htm'
|
'http://www.ibiblio.org/Dave/')
|
||||||
stripUrl = baseUrl + 'ar%s.htm'
|
stripUrl = baseUrl + 'ar%s.htm'
|
||||||
|
url = stripUrl % '00502'
|
||||||
firstStripUrl = stripUrl % '00001'
|
firstStripUrl = stripUrl % '00001'
|
||||||
imageSearch = compile(tagre("a", "href", r'(Dr-Fun/df\d+/df[^"]+)'))
|
imageSearch = '//a[contains(@href, "Dr-Fun/df")]'
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
prevSearch = compile(tagre("a", "href", r'([^"]+)') + 'Previous Week,')
|
prevSearch = '//a[contains(text(), "Previous Week")]'
|
||||||
help = 'Index format: nnnnn'
|
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
help = 'Index format: nnnnn'
|
||||||
|
|
||||||
|
|
||||||
class Drive(_BasicScraper):
|
class Drive(_BasicScraper):
|
||||||
|
|
|
@ -181,10 +181,11 @@ class EverybodyLovesEricRaymond(_ParserScraper):
|
||||||
prevSearch = '//a[@rel="prev"]'
|
prevSearch = '//a[@rel="prev"]'
|
||||||
|
|
||||||
|
|
||||||
# Seems to be GeoBlocked from Germany?
|
|
||||||
class EvilDiva(_WordPressScraper):
|
class EvilDiva(_WordPressScraper):
|
||||||
url = 'http://www.evildivacomics.com/'
|
url = ('https://web.archive.org/web/20190221223751/'
|
||||||
|
'https://www.evildivacomics.com/')
|
||||||
firstStripUrl = url + 'comic/evil-diva-issue-1-cover/'
|
firstStripUrl = url + 'comic/evil-diva-issue-1-cover/'
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class EvilInc(_WordPressScraper):
|
class EvilInc(_WordPressScraper):
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
from re import compile, escape, IGNORECASE
|
from re import compile, escape
|
||||||
|
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import _BasicScraper, _ParserScraper
|
||||||
|
@ -27,14 +27,15 @@ class Faneurysm(_WPNaviIn):
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class FantasyRealms(_BasicScraper):
|
class FantasyRealms(_ParserScraper):
|
||||||
url = 'http://www.fantasyrealmsonline.com/'
|
stripUrl = ('https://web.archive.org/web/20161204192651/'
|
||||||
stripUrl = url + 'manga/%s.php'
|
'http://fantasyrealmsonline.com/manga/%s.php')
|
||||||
imageSearch = compile(r'<img src="(\d{1,4}.\w{3,4})" width="540"', IGNORECASE)
|
url = stripUrl % '091'
|
||||||
prevSearch = compile(r'<a href="(.+?)"><img src="../images/nav-back.gif"', IGNORECASE)
|
firstStripUrl = stripUrl % '001'
|
||||||
latestSearch = compile(r'<a href="(manga/.+?)"><img src="preview.jpg"', IGNORECASE)
|
imageSearch = '//img[contains(@src, "/manga/0")]'
|
||||||
|
prevSearch = '//a[img[contains(@src, "nav-back")]]'
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
starter = indirectStarter
|
|
||||||
|
|
||||||
|
|
||||||
class FarToTheNorth(_ComicControlScraper):
|
class FarToTheNorth(_ComicControlScraper):
|
||||||
|
@ -57,16 +58,15 @@ class FireflyCross(_WordPressScraper):
|
||||||
firstStripUrl = url + '?comic=05062002'
|
firstStripUrl = url + '?comic=05062002'
|
||||||
|
|
||||||
|
|
||||||
class FirstWorldProblems(_BasicScraper):
|
class FirstWorldProblems(_ParserScraper):
|
||||||
url = 'http://bradcolbow.com/archive/C5/'
|
url = ('https://web.archive.org/web/20150710053456/'
|
||||||
|
'http://bradcolbow.com/archive/C5/')
|
||||||
stripUrl = url + '%s/'
|
stripUrl = url + '%s/'
|
||||||
firstStripUrl = stripUrl % 'P10'
|
firstStripUrl = stripUrl % 'P10'
|
||||||
imageSearch = compile(tagre("img", "src",
|
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
|
||||||
r'(http://(?:fwpcomics\.s3\.amazonaws\.com|s3\.amazonaws\.com/fwpcomics)/s1-[^"]+)'))
|
prevSearch = '//a[{}]'.format(xpath_class('prev'))
|
||||||
prevSearch = compile(tagre("a", "href",
|
|
||||||
r'(http://bradcolbow\.com/archive/C5/[^"]+)', before="prev"))
|
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
help = 'Index format: a letter and a number'
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class FlakyPastry(_BasicScraper):
|
class FlakyPastry(_BasicScraper):
|
||||||
|
@ -79,12 +79,14 @@ class FlakyPastry(_BasicScraper):
|
||||||
help = 'Index format: nnnn'
|
help = 'Index format: nnnn'
|
||||||
|
|
||||||
|
|
||||||
class Flemcomics(_BasicScraper):
|
class Flemcomics(_ParserScraper):
|
||||||
url = 'http://www.flemcomics.com/'
|
url = ('https://web.archive.org/web/20180414110349/'
|
||||||
|
'http://www.flemcomics.com/')
|
||||||
stripUrl = url + 'd/%s.html'
|
stripUrl = url + 'd/%s.html'
|
||||||
imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))
|
firstStripUrl = stripUrl % '19980101'
|
||||||
prevSearch = compile(tagre("a", "href", r'(/d/\d+\.html)') +
|
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
|
||||||
tagre("img", "src", r'/images/previous_day\.jpg'))
|
prevSearch = '//a[@rel="prev"]'
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: yyyymmdd'
|
help = 'Index format: yyyymmdd'
|
||||||
|
|
||||||
|
|
||||||
|
@ -160,10 +162,12 @@ class FoxTails(_ParserScraper):
|
||||||
|
|
||||||
|
|
||||||
class Fragile(_ParserScraper):
|
class Fragile(_ParserScraper):
|
||||||
url = 'http://www.fragilestory.com/'
|
url = ('https://web.archive.org/web/20190308203109/'
|
||||||
|
'http://www.fragilestory.com/')
|
||||||
imageSearch = '//div[@id="comic_strip"]/a[@class="nobg"]/img'
|
imageSearch = '//div[@id="comic_strip"]/a[@class="nobg"]/img'
|
||||||
prevSearch = '//div[@id="nav_comic_a"]/a[2]'
|
prevSearch = '//div[@id="nav_comic_a"]/a[2]'
|
||||||
firstStripUrl = url + 'strips/chapter_01'
|
firstStripUrl = url + 'strips/chapter_01'
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class FredoAndPidjin(_ParserScraper):
|
class FredoAndPidjin(_ParserScraper):
|
||||||
|
@ -214,19 +218,22 @@ class FullFrontalNerdity(_BasicScraper):
|
||||||
help = 'Index format: number'
|
help = 'Index format: number'
|
||||||
|
|
||||||
|
|
||||||
class FunInJammies(_BasicScraper):
|
class FunInJammies(_WordPressScraper):
|
||||||
url = 'http://www.funinjammies.com/'
|
url = ('https://web.archive.org/web/20170205105241/'
|
||||||
|
'http://www.funinjammies.com/')
|
||||||
stripUrl = url + 'comic.php?issue=%s'
|
stripUrl = url + 'comic.php?issue=%s'
|
||||||
firstStripUrl = stripUrl % '1'
|
firstStripUrl = stripUrl % '1'
|
||||||
imageSearch = compile(r'(/comics/.+?)"')
|
prevSearch = '//a[text()="< Prev"]'
|
||||||
prevSearch = compile(r'(/comic.php.+?)" id.+?prev')
|
endOfLife = True
|
||||||
help = 'Index format: n (unpadded)'
|
help = 'Index format: n (unpadded)'
|
||||||
|
|
||||||
|
|
||||||
class FurPiled(_ParserScraper):
|
class FurPiled(_ParserScraper):
|
||||||
stripUrl = 'https://web.archive.org/web/20160404074145/http://www.liondogworks.com/images/fp-%03d.jpg'
|
stripUrl = ('https://web.archive.org/web/20160404074145/'
|
||||||
|
'http://www.liondogworks.com/images/fp-%03d.jpg')
|
||||||
url = stripUrl % 427
|
url = stripUrl % 427
|
||||||
firstStripUrl = stripUrl % 1
|
firstStripUrl = stripUrl % 1
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
# Skip missing pages
|
# Skip missing pages
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -21,20 +21,12 @@ class Galaxion(_WPNavi):
|
||||||
help = 'Index format: n-comic/book-n/chapter-n/title-nnn'
|
help = 'Index format: n-comic/book-n/chapter-n/title-nnn'
|
||||||
|
|
||||||
|
|
||||||
class Garanos(_BasicScraper):
|
class Garanos(_WordPressScraper):
|
||||||
baseUrl = 'http://garanos.alexheberling.com/'
|
stripUrl = ('https://web.archive.org/web/20180314181433/'
|
||||||
rurl = escape(baseUrl)
|
'http://garanos.alexheberling.com/pages/%s/')
|
||||||
url = baseUrl + 'pages/page-1/'
|
url = stripUrl % 'page-487'
|
||||||
starter = indirectStarter
|
firstStripUrl = stripUrl % 'vol01'
|
||||||
stripUrl = baseUrl + 'pages/page-%s'
|
endOfLife = True
|
||||||
imageSearch = compile(
|
|
||||||
tagre("img", "src",
|
|
||||||
r'(%swp-content/uploads/sites/\d+/\d+/\d+/[^"]+)' % rurl))
|
|
||||||
prevSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
|
|
||||||
after="prev"))
|
|
||||||
latestSearch = compile(tagre("a", "href", r'(%spages/[^"]+)' % rurl,
|
|
||||||
after="nav-last"))
|
|
||||||
help = 'Index format: n (unpadded)'
|
|
||||||
|
|
||||||
|
|
||||||
class GastroPhobia(_ParserScraper):
|
class GastroPhobia(_ParserScraper):
|
||||||
|
@ -46,13 +38,14 @@ class GastroPhobia(_ParserScraper):
|
||||||
help = 'Index format: yyyy-mm-dd'
|
help = 'Index format: yyyy-mm-dd'
|
||||||
|
|
||||||
|
|
||||||
class Geeks(_BasicScraper):
|
class Geeks(_ParserScraper):
|
||||||
url = 'http://sevenfloorsdown.com/geeks/'
|
url = ('https://web.archive.org/web/20190527194921/'
|
||||||
|
'http://sevenfloorsdown.com/geeks/')
|
||||||
stripUrl = url + 'archives/%s'
|
stripUrl = url + 'archives/%s'
|
||||||
firstStripUrl = stripUrl % '10'
|
firstStripUrl = stripUrl % '10'
|
||||||
imageSearch = compile(
|
imageSearch = '//div[@id="comic"]/img'
|
||||||
r'<img src=\'(http://sevenfloorsdown.com/geeks/comics/.+?)\'')
|
prevSearch = '//a[contains(text(), "Previous")]'
|
||||||
prevSearch = compile(r'<a href="(.+?)">« Previous')
|
endOfLife = True
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
|
||||||
|
@ -116,15 +109,12 @@ class GlassHalfEmpty(_BasicScraper):
|
||||||
help = 'Index format: nnn'
|
help = 'Index format: nnn'
|
||||||
|
|
||||||
|
|
||||||
class GleefulNihilism(_BasicScraper):
|
class GleefulNihilism(_WordPressScraper):
|
||||||
url = 'http://gleefulnihilism.com/'
|
url = ('https://web.archive.org/web/20170911203122/'
|
||||||
rurl = escape(url)
|
'http://gleefulnihilism.com/')
|
||||||
stripUrl = url + 'comic/%s/'
|
stripUrl = url + 'comic/%s/'
|
||||||
firstStripUrl = stripUrl % 'amoeba'
|
firstStripUrl = stripUrl % 'amoeba'
|
||||||
imageSearch = compile(
|
endOfLife = True
|
||||||
tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
|
|
||||||
prevSearch = compile(
|
|
||||||
tagre("a", "href", r'(%scomic/[^"]+)' % rurl) + '‹')
|
|
||||||
help = 'Index format: stripname'
|
help = 'Index format: stripname'
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -68,9 +68,11 @@ class LetsSpeakEnglish(_ComicControlScraper):
|
||||||
|
|
||||||
|
|
||||||
class LifeAintNoPonyFarm(_WordPressScraper):
|
class LifeAintNoPonyFarm(_WordPressScraper):
|
||||||
url = 'http://sarahburrini.com/en/'
|
url = ('https://web.archive.org/web/20181221154155/'
|
||||||
|
'http://sarahburrini.com/en/')
|
||||||
firstStripUrl = url + 'comic/my-first-webcomic/'
|
firstStripUrl = url + 'comic/my-first-webcomic/'
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class LilithsWord(_ComicControlScraper):
|
class LilithsWord(_ComicControlScraper):
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
from re import compile, escape
|
from re import compile, escape
|
||||||
|
|
||||||
from ..scraper import _BasicScraper, _ParserScraper
|
from ..scraper import _BasicScraper, _ParserScraper
|
||||||
from ..helpers import indirectStarter
|
from ..helpers import indirectStarter, xpath_class
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
||||||
|
|
||||||
|
@ -88,12 +88,13 @@ class Newshounds(_ParserScraper):
|
||||||
return super().getPrevUrl(url, data)
|
return super().getPrevUrl(url, data)
|
||||||
|
|
||||||
|
|
||||||
class NewWorld(_BasicScraper):
|
class NewWorld(_WordPressScraper):
|
||||||
url = 'http://www.tfsnewworld.com/'
|
url = ('https://web.archive.org/web/20190718012133/'
|
||||||
|
'http://www.tfsnewworld.com/')
|
||||||
stripUrl = url + '%s/'
|
stripUrl = url + '%s/'
|
||||||
firstStripUrl = stripUrl % '2007/08/30/63'
|
firstStripUrl = stripUrl % '2007/08/30/63'
|
||||||
imageSearch = compile(r'<img src="(http://www.tfsnewworld.com/comics/.+?)"')
|
prevSearch = '//a[@rel="prev"]'
|
||||||
prevSearch = compile(r'<div class="nav-previous"><a href="([^"]+)" rel="prev">')
|
endOfLife = True
|
||||||
help = 'Index format: yyyy/mm/dd/stripn'
|
help = 'Index format: yyyy/mm/dd/stripn'
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,7 +110,9 @@ class NichtLustig(_BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class Nicky510(_WPNavi):
|
class Nicky510(_WPNavi):
|
||||||
url = 'http://www.nickyitis.com/'
|
url = ('https://web.archive.org/web/20160510215718/'
|
||||||
|
'http://www.nickyitis.com/')
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class NicoleAndDerek(_ParserScraper):
|
class NicoleAndDerek(_ParserScraper):
|
||||||
|
@ -140,13 +143,13 @@ class Nightshift(_ParserScraper):
|
||||||
return chapter + '_' + page
|
return chapter + '_' + page
|
||||||
|
|
||||||
|
|
||||||
class Nimona(_BasicScraper):
|
class Nimona(_ParserScraper):
|
||||||
url = 'http://gingerhaze.com/nimona/'
|
url = ('https://web.archive.org/web/20141008095502/'
|
||||||
|
'http://gingerhaze.com/nimona/')
|
||||||
stripUrl = url + 'comic/%s'
|
stripUrl = url + 'comic/%s'
|
||||||
firstStripUrl = stripUrl % "page-1"
|
firstStripUrl = stripUrl % "page-1"
|
||||||
imageSearch = compile(tagre("img", "src", r'(http://gingerhaze\.com/sites/default/files/nimona-pages/.+?)'))
|
imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page'))
|
||||||
prevSearch = compile(r'<a href="(/nimona/comic/[^"]+)"><img src="http://gingerhaze\.com/sites/default/files/comicdrop/comicdrop_prev_label_file\.png"')
|
prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
|
||||||
help = 'Index format: stripname'
|
|
||||||
endOfLife = True
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -125,7 +125,9 @@ class OnTheFastrack(_BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class OopsComicAdventure(_WordPressScraper):
|
class OopsComicAdventure(_WordPressScraper):
|
||||||
url = 'http://oopscomicadventure.com/'
|
url = ('https://web.archive.org/web/20190102215141/'
|
||||||
|
'http://oopscomicadventure.com/')
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class Optipess(_WPNavi):
|
class Optipess(_WPNavi):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -44,8 +44,10 @@ class ParallelUniversum(_BasicScraper):
|
||||||
|
|
||||||
|
|
||||||
class PartiallyClips(_WordPressScraper):
|
class PartiallyClips(_WordPressScraper):
|
||||||
url = 'http://partiallyclips.com/'
|
url = ('https://web.archive.org/web/20180509161332/'
|
||||||
|
'http://partiallyclips.com/')
|
||||||
firstStripUrl = url + 'comic/screaming-woman/'
|
firstStripUrl = url + 'comic/screaming-woman/'
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class PastelDefender(_BasicScraper):
|
class PastelDefender(_BasicScraper):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -119,34 +119,36 @@ class SchlockMercenary(_ParserScraper):
|
||||||
help = 'Index format: yyyy-mm-dd'
|
help = 'Index format: yyyy-mm-dd'
|
||||||
|
|
||||||
|
|
||||||
class SchoolBites(_BasicScraper):
|
class SchoolBites(_ParserScraper):
|
||||||
url = 'http://schoolbites.net/'
|
url = ('https://web.archive.org/web/20170215065523/'
|
||||||
|
'http://schoolbites.net/')
|
||||||
stripUrl = url + 'd/%s.html'
|
stripUrl = url + 'd/%s.html'
|
||||||
imageSearch = compile(tagre("img", "src", r'(http://cdn\.schoolbites\.net/comics/[^"]+)'))
|
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
|
||||||
prevSearch = compile(tagre("a", "href", r'(http://schoolbites\.net/d/\d+\.html)', after="prev"))
|
prevSearch = '//a[@rel="prev"]'
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: yyyymmdd'
|
help = 'Index format: yyyymmdd'
|
||||||
|
|
||||||
|
|
||||||
class Schuelert(_BasicScraper):
|
class Schuelert(_ParserScraper):
|
||||||
url = 'http://www.schuelert.de/'
|
url = ('https://web.archive.org/web/20190103022830/'
|
||||||
rurl = escape(url)
|
'http://www.schuelert.de/')
|
||||||
stripUrl = url + 'index.php?paged=%s'
|
stripUrl = url + 'index.php?paged=%s'
|
||||||
firstStripUrl = stripUrl % '5'
|
firstStripUrl = stripUrl % '3'
|
||||||
imageSearch = compile(tagre("img", "src", r"(%swp-content/[^']+)" % rurl, quote="'"))
|
imageSearch = '//img[contains(@src, "wp-content")]'
|
||||||
prevSearch = compile(tagre("a", "href", r'(%sindex\.php\?paged=\d+)' % rurl) + "«")
|
prevSearch = '//span[{}]/a'.format(xpath_class('prevlink'))
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
help = 'Index format: none'
|
endOfLife = True
|
||||||
lang = 'de'
|
lang = 'de'
|
||||||
|
|
||||||
|
|
||||||
class Science(_BasicScraper):
|
class Science(_ParserScraper):
|
||||||
url = 'http://sci-ence.org/'
|
stripUrl = ('https://web.archive.org/web/20180616152753/'
|
||||||
rurl = escape(url)
|
'http://sci-ence.org/%s/')
|
||||||
stripUrl = url + '%s/'
|
url = stripUrl % 'new-york-comic-con-2013'
|
||||||
firstStripUrl = stripUrl % 'periodic-table-element-ass'
|
firstStripUrl = stripUrl % 'periodic-table-element-ass'
|
||||||
prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl, after="prev"))
|
prevSearch = '//a[{}]'.format(xpath_class('navi-prev'))
|
||||||
imageSearch = compile(tagre("img", "src", r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
|
imageSearch = '//div[@class="comicpane"]//img'
|
||||||
help = 'Index format: stripname'
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class SeelPeel(_WPNaviIn):
|
class SeelPeel(_WPNaviIn):
|
||||||
|
@ -321,10 +323,12 @@ class SMBC(_ComicControlScraper):
|
||||||
|
|
||||||
|
|
||||||
class SnowFlame(_WordPressScraper):
|
class SnowFlame(_WordPressScraper):
|
||||||
url = 'http://www.snowflamecomic.com/'
|
url = ('https://web.archive.org/web/20160905071051/'
|
||||||
|
'http://www.snowflamecomic.com/')
|
||||||
stripUrl = url + '?comic=snowflame-%s-%s'
|
stripUrl = url + '?comic=snowflame-%s-%s'
|
||||||
firstStripUrl = stripUrl % ('01', '01')
|
firstStripUrl = stripUrl % ('01', '01')
|
||||||
starter = bounceStarter
|
starter = bounceStarter
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: chapter-page'
|
help = 'Index format: chapter-page'
|
||||||
|
|
||||||
def getIndexStripUrl(self, index):
|
def getIndexStripUrl(self, index):
|
||||||
|
@ -493,12 +497,14 @@ class StandStillStaySilent(_ParserScraper):
|
||||||
|
|
||||||
|
|
||||||
class StarCrossdDestiny(_ParserScraper):
|
class StarCrossdDestiny(_ParserScraper):
|
||||||
baseUrl = 'http://starcrossd.net/'
|
baseUrl = ('https://web.archive.org/web/20190918132321/'
|
||||||
|
'http://starcrossd.net/')
|
||||||
url = baseUrl + 'comic.html'
|
url = baseUrl + 'comic.html'
|
||||||
stripUrl = baseUrl + 'archives/%s.html'
|
stripUrl = baseUrl + 'archives/%s.html'
|
||||||
firstStripUrl = stripUrl % '00000001'
|
firstStripUrl = stripUrl % '00000001'
|
||||||
imageSearch = '//div[@id="comic"]//img'
|
imageSearch = '//div[@id="comic"]//img'
|
||||||
prevSearch = '//a[text()="prev"]'
|
prevSearch = '//a[text()="prev"]'
|
||||||
|
endOfLife = True
|
||||||
help = 'Index format: nnnnnnnn'
|
help = 'Index format: nnnnnnnn'
|
||||||
|
|
||||||
def namer(self, image_url, page_url):
|
def namer(self, image_url, page_url):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -48,10 +48,14 @@ class Tamberlane(_ParserScraper):
|
||||||
|
|
||||||
|
|
||||||
class TheBrads(_ParserScraper):
|
class TheBrads(_ParserScraper):
|
||||||
url = 'http://bradcolbow.com/archive/'
|
url = ('https://web.archive.org/web/20171211154809/'
|
||||||
imageSearch = '//div[%s]//img' % xpath_class('entry')
|
'http://bradcolbow.com/archive/C4/')
|
||||||
prevSearch = '//a[%s]' % xpath_class('prev')
|
stripUrl = url + '%s/'
|
||||||
|
firstStripUrl = stripUrl % 'P125'
|
||||||
|
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
|
||||||
|
prevSearch = '//a[{}]'.format(xpath_class('prev'))
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class TheClassMenagerie(_ParserScraper):
|
class TheClassMenagerie(_ParserScraper):
|
||||||
|
@ -107,15 +111,14 @@ class TheJunkHyenasDiner(_WordPressScraper):
|
||||||
firstStripUrl = stripUrl % 'intro'
|
firstStripUrl = stripUrl % 'intro'
|
||||||
|
|
||||||
|
|
||||||
class TheLandscaper(_BasicScraper):
|
class TheLandscaper(_ParserScraper):
|
||||||
stripUrl = 'http://landscaper.visual-assault.net/comic/%s'
|
stripUrl = ('https://web.archive.org/web/20171129163510/'
|
||||||
|
'http://landscaper.visual-assault.net/comic/%s')
|
||||||
url = stripUrl % 'latest'
|
url = stripUrl % 'latest'
|
||||||
firstStripUrl = stripUrl % '1'
|
firstStripUrl = stripUrl % '1'
|
||||||
imageSearch = compile(tagre("img", "src",
|
imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic'))
|
||||||
r'(/comics/comic/comic_page/[^"]+)'))
|
prevSearch = '//a[contains(text(), "Previous")]'
|
||||||
prevSearch = compile(tagre("a", "href", r'(/comic/[^"]+)') +
|
endOfLife = True
|
||||||
'‹ Previous')
|
|
||||||
help = 'Index format: name'
|
|
||||||
|
|
||||||
|
|
||||||
class TheMelvinChronicles(_WordPressScraper):
|
class TheMelvinChronicles(_WordPressScraper):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -14,8 +14,10 @@ from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
||||||
|
|
||||||
|
|
||||||
class Underling(_WPNavi):
|
class Underling(_WPNavi):
|
||||||
url = 'http://underlingcomic.com/'
|
url = ('https://web.archive.org/web/20190806120425/'
|
||||||
|
'http://underlingcomic.com/')
|
||||||
firstStripUrl = url + 'page-one/'
|
firstStripUrl = url + 'page-one/'
|
||||||
|
endOfLife = True
|
||||||
|
|
||||||
|
|
||||||
class Undertow(_BasicScraper):
|
class Undertow(_BasicScraper):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
@ -30,6 +30,9 @@ from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
|
||||||
|
|
||||||
|
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||||
|
|
||||||
|
|
||||||
class Scraper(object):
|
class Scraper(object):
|
||||||
'''Base class for all comic scraper, but without a specific scrape
|
'''Base class for all comic scraper, but without a specific scrape
|
||||||
implementation.'''
|
implementation.'''
|
||||||
|
@ -183,7 +186,7 @@ class Scraper(object):
|
||||||
except ValueError as msg:
|
except ValueError as msg:
|
||||||
# image not found
|
# image not found
|
||||||
out.exception(msg)
|
out.exception(msg)
|
||||||
if self.firstStripUrl == url:
|
if self.isfirststrip(url):
|
||||||
out.debug(u"Stop at first URL %s" % url)
|
out.debug(u"Stop at first URL %s" % url)
|
||||||
self.hitFirstStripUrl = True
|
self.hitFirstStripUrl = True
|
||||||
break
|
break
|
||||||
|
@ -199,6 +202,17 @@ class Scraper(object):
|
||||||
break
|
break
|
||||||
url = prevUrl
|
url = prevUrl
|
||||||
|
|
||||||
|
def isfirststrip(self, url):
|
||||||
|
"""Check if the specified URL is the first strip of a comic. This is
|
||||||
|
specially for comics taken from archive.org, since the base URL of
|
||||||
|
archive.org changes whenever pages are taken from a different
|
||||||
|
snapshot."""
|
||||||
|
if not self.firstStripUrl:
|
||||||
|
return False
|
||||||
|
firsturl = ARCHIVE_ORG_URL.sub('', self.firstStripUrl)
|
||||||
|
currenturl = ARCHIVE_ORG_URL.sub('', url)
|
||||||
|
return firsturl == currenturl
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
def getPrevUrl(self, url, data):
|
||||||
"""Find previous URL."""
|
"""Find previous URL."""
|
||||||
prevUrl = None
|
prevUrl = None
|
||||||
|
|
Loading…
Reference in a new issue