Tobias Gruetzmacher 3f9feec041 Allow modules to ignore some HTTP error codes.
This is necessary since it seems some webservers out there are
misconfigured to deliver actual content with an HTTP error code...
2016-11-01 18:25:02 +01:00

302 lines
10 KiB

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape, MULTILINE
from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import regexNamer, bounceStarter, indirectStarter
from .common import _WordPressScraper, xpath_class, WP_LATEST_SEARCH
class AbstruseGoose(_BasicScraper):
    # The base URL is empty in this copy of the file; rurl is its
    # regex-escaped form and is interpolated into the link patterns below.
    url = ''
    rurl = escape(url)
    starter = bounceStarter
    # A strip page address is the base URL with the index appended.
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1'
    # NOTE(review): this call is never closed in this copy of the file --
    # the pattern argument and the closing parentheses appear to be missing,
    # so the class does not parse as shown. Restore the line from upstream.
    imageSearch = compile(tagre('img', 'src',
    # Navigation links: an anchor whose href is the base URL plus digits,
    # followed by the literal link text.
    prevSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) +
    r'&laquo; Previous')
    nextSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) +
    r'Next &raquo;')
    help = 'Index format: n (unpadded)'
    # Captures the value of an img tag's title attribute.
    textSearch = compile(tagre("img", "title", r'([^"]+)'))
    def namer(self, image_url, page_url):
        """Return 'c<index>-<name>' for a strip image.

        index is the last path segment of page_url (trailing slashes
        removed) converted to int and zero-padded to three digits; name is
        the last path segment of image_url up to its first dot.
        """
        index = int(page_url.rstrip('/').split('/')[-1])
        name = image_url.split('/')[-1].split('.')[0]
        return 'c%03d-%s' % (index, name)
class AbsurdNotions(_BasicScraper):
    # Strip pages are named page<n>.html below the base URL; the start page
    # is fixed to page129.html.
    baseUrl = ''
    url = baseUrl + 'page129.html'
    stripUrl = baseUrl + 'page%s.html'
    firstStripUrl = stripUrl % '1'
    imageSearch = compile(tagre('img', 'src', r'(an[^"]+)'))
    multipleImagesPerStrip = True
    # Anchor pattern followed by the pattern for the nprev.gif image tag.
    # The file name is a raw string: '\.' is a regex escape, not a Python
    # string escape, and a non-raw literal triggers an invalid-escape
    # warning on current interpreters. The runtime string is unchanged.
    prevSearch = compile(tagre('a', 'href', r'([^"]+)') +
                         tagre('img', 'src', r'nprev\.gif'))
    help = 'Index format: n (unpadded)'
class AcademyVale(_BasicScraper):
    # Strips are addressed through the avarch.cgi archive script with a
    # three-digit index as query string.
    url = ''
    stripUrl = url + 'avarch.cgi?%s'
    firstStripUrl = stripUrl % '001'
    imageSearch = compile(tagre('img', 'src', r'(avale\d{4}-\d{2}\.gif)'))
    # tagre is called with quote="" for the href. The image file name is a
    # raw string: '\.' is a regex escape, not a Python string escape, and a
    # non-raw literal triggers an invalid-escape warning on current
    # interpreters. The runtime string is unchanged.
    prevSearch = compile(tagre('a', 'href', r'(avarch[^">]+)', quote="") +
                         tagre('img', 'src', r'AVNavBack\.gif'))
    help = 'Index format: nnn'
class Achewood(_BasicScraper):
    # Strip pages are addressed as index.php?date=<mmddyyyy>.
    url = ''
    stripUrl = url + 'index.php?date=%s'
    firstStripUrl = stripUrl % '00000000'
    imageSearch = compile(tagre("img", "src", r'(/comic\.php\?date=\d+)'))
    # NOTE(review): this call ends on a trailing comma and is never closed
    # in this copy of the file -- at least one further argument and the
    # closing parentheses appear to be missing. Restore from upstream.
    prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)',
    help = 'Index format: mmddyyyy'
    # File names are built by regexNamer from the pattern capturing the
    # digits after 'date='.
    namer = regexNamer(compile(r'date=(\d+)'))
class AfterStrife(_WordPressScraper):
    # Flagged as end-of-life.
    endOfLife = True
    help = 'Index format: nnn'

    # Pages are addressed by a ?p=<n> query; both the start page and the
    # first strip are instances of stripUrl.
    baseUrl = ''
    stripUrl = baseUrl + '?p=%s'
    firstStripUrl = stripUrl % '1'
    url = stripUrl % '262'

    # Previous link: an anchor matching the 'navi-prev' class test.
    prevSearch = '//a[%s]' % xpath_class('navi-prev')
class AGirlAndHerFed(_BasicScraper):
    help = 'Index format: nnn'

    # Strip pages are named 1.<n>.html below the base URL.
    url = ''
    stripUrl = url + '1.%s.html'
    firstStripUrl = stripUrl % '1'

    # Strip images are .jpg files below img/strip/.
    imageSearch = compile(tagre("img", "src", r'(img/strip/[^"]+\.jpg)'))
    # Hand-written pattern (no tagre): captures the href of an anchor that
    # is followed by text ending in "Back".
    prevSearch = compile(r'<a href="([^"]+)">[^>]+Back')
class AhoiPolloi(_ParserScraper):
    lang = 'de'
    multipleImagesPerStrip = True
    help = 'Index format: yyyymmdd'

    # Strip pages are addressed by a ?day=<yyyymmdd> query.
    url = ''
    stripUrl = url + '?day=%s'
    firstStripUrl = stripUrl % '20060306'

    # XPath expressions: images served from the ahoipolloi static path, and
    # anchors whose href contains a day query.
    imageSearch = '//img[contains(@src, "/static/antville/ahoipolloi/")]'
    prevSearch = '//a[contains(@href, "/?day=")]'
class AhoyEarth(_WordPressScraper):
    # Only the start URL and the previous-link XPath are set on this class.
    prevSearch = '//a[%s]' % xpath_class('navi-prev')
    url = ''
class AirForceBlues(_WordPressScraper):
    url = ''
    # The first strip sits at a fixed path below the base URL.
    firstStripUrl = url + 'comic/in-texas-there-are-texans/'
class ALessonIsLearned(_BasicScraper):
    help = 'Index format: nnn'

    # Strip pages are addressed as index.php?comic=<n>.
    url = ''
    stripUrl = url + 'index.php?comic=%s'
    firstStripUrl = stripUrl % '1'

    imageSearch = compile(tagre("img", "src", r"(cmx/lesson\d+\.[a-z]+)"))
    # tagre is given quote="'" for the href (presumably the site
    # single-quotes its attributes -- confirm against the page source).
    prevSearch = compile(
        tagre("a", "href", r"(index\.php\?comic=\d+)", quote="'") +
        r"[^>]+previous")
class Alice(_WordPressScraper):
    url = ''
    # indirectStarter is paired with latestSearch, which selects the anchor
    # whose text is exactly "Latest Alice!".
    starter = indirectStarter
    latestSearch = '//a[text()="Latest Alice!"]'
class AlienLovesPredator(_BasicScraper):
    help = 'Index format: yyyy/mm/dd/name'

    # Strip pages are directories named <yyyy/mm/dd/name>/ below the base.
    url = ''
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2004/10/12/unavoidable-delay'

    # The strip image is picked out by the attribute text passed as `after`.
    imageSearch = compile(
        tagre("img", "src", r'([^"]+)',
              after='border="1" alt="" width="750"'))
    prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
class AlienShores(_WordPressScraper):
    url = ''
    # The first strip sits at a fixed path below the base URL.
    firstStripUrl = url + 'AScomic/updated-cover/'
class AllTheGrowingThings(_BasicScraper):
    help = 'Index format: yyyy/mm/dd/strip-name'

    # rurl is the regex-escaped base URL used inside the patterns below.
    url = ''
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2009/04/21/all-the-growing-things'

    # Images live below <base>files/; the previous link is any link under
    # the base URL with "prev" passed as tagre's `after` argument.
    imageSearch = compile(
        tagre("img", "src", r'(%sfiles/[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
class AlphaLuna(_BasicScraper):
    help = 'Index format: issue/page (e.g. 4/05)'

    # Strip pages are directories named issue-<issue/page>/.
    url = ''
    stripUrl = url + 'issue-%s/'
    firstStripUrl = stripUrl % '1/cover'

    # The strip image is an img under a PAGINAS/ path that follows an anchor
    # pointing at an issue page or at support/upcoming.
    imageSearch = compile(
        tagre("a", "href", r'[^"]*/(?:issue-|support/upcoming)[^"]+') +
        tagre("img", "src", r'([^"]*/PAGINAS/[^"]+)'))
    # Previous link: anchor pattern followed by an img whose alt is "Prev".
    prevSearch = compile(
        tagre("a", "href", r'([^"]+)') +
        tagre("img", "alt", "Prev"))
class AlphaLunaSpanish(AlphaLuna):
    # Spanish variant: inherits the search patterns from AlphaLuna and
    # overrides the name, language and URL attributes.
    name = 'AlphaLuna/Spanish'
    lang = 'es'

    url = ''
    stripUrl = url + 'issue-%s/'
    firstStripUrl = stripUrl % '1/portada'
class Altermeta(_BasicScraper):
    help = 'Index format: n (unpadded)'

    # Strip pages are addressed as archive.php?comic=<n>; rurl is the
    # regex-escaped base URL.
    url = ''
    rurl = escape(url)
    stripUrl = url + 'archive.php?comic=%s'
    firstStripUrl = stripUrl % '0'

    # Both patterns are written out by hand rather than built with tagre.
    imageSearch = compile(r'<img src="(comics/[^"]+)" />')
    # Previous link: an anchor directly wrapping the template's back.png.
    prevSearch = compile(
        r'<a href="([^"]+)"><img src="%stemplate/default/images/sasha/back\.png'
        % rurl)
class AltermetaOld(Altermeta):
    # Same scraper pointed at the oldarchive/ subtree of Altermeta's base
    # URL; imageSearch is inherited, prevSearch is replaced.
    url = Altermeta.url + 'oldarchive/index.php'
    stripUrl = Altermeta.url + 'oldarchive/archive.php?comic=%s'
    firstStripUrl = stripUrl % '0'

    # Previous link: an anchor whose text starts with "Back".
    prevSearch = compile(r'<a href="([^"]+)">Back')
class AmazingSuperPowers(_BasicScraper):
    # Strip pages are directories named <yyyy/mm/name>/ below the base URL;
    # rurl is the regex-escaped base URL used inside the patterns.
    url = ''
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2007/09/heredity'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
    help = 'Index format: yyyy/mm/name'

    def shouldSkipUrl(self, url, data):
        """Skip pages without images."""
        # The tuple was left unclosed in this copy of the file, which is a
        # syntax error; the closing parenthesis is restored here.
        return url in (
            # video
            self.stripUrl % '2013/05/orbital-deathray-kickstarter',
        )
class Amya(_WordPressScraper):
    # No overrides beyond the start URL.
    url = ''
class Angband(_BasicScraper):
    # Strip pages are addressed as view.php?date=<yyyy-mm-dd>.
    url = ''
    stripUrl = url + 'view.php?date=%s'
    firstStripUrl = stripUrl % '2005-12-30'
    imageSearch = compile(tagre("img", "src", r'(comics/Scroll[^"]+)'))
    # NOTE(review): this expression ends on a dangling '+' and the call is
    # never closed in this copy of the file -- the right-hand operand and
    # the closing parenthesis appear to be missing. Restore from upstream.
    prevSearch = compile(tagre("a", "href", r'(view\.php\?date\=[^"]+)') +
    help = 'Index format: yyyy-mm-dd'
class Angels2200(_BasicScraper):
    help = 'Index format: yyyy/mm/dd/part-<n>-comic-<n>'

    # A strip page address is the base URL with the index appended.
    url = ''
    stripUrl = url + '%s'

    # The image src is matched with single quotes (quote="'") against an
    # absolute comics URL.
    imageSearch = compile(tagre(
        "img", "src",
        r"(http://www\.janahoffmann\.com/angels/comics/[^']+)",
        quote="'"))
    # Previous link: anchor pattern followed by the literal link text.
    prevSearch = compile(
        tagre("a", "href", r'([^"]+)') + "&laquo; Previous")
class Annyseed(_ParserScraper):
    help = 'Index format: nnn'

    # Strip pages are named Annyseed<nnn>.htm; the start page is
    # AnnyseedLatest.htm.
    baseUrl = ''
    stripUrl = baseUrl + 'Annyseed%s.htm'
    url = baseUrl + 'AnnyseedLatest.htm'

    # XPath expressions: an img directly inside a div whose src contains
    # "Annyseed", and the anchor wrapping the image named "Previousbtn".
    imageSearch = '//div/img[contains(@src, "Annyseed")]'
    prevSearch = '//a[img[@name="Previousbtn"]]'
class AoiHouse(_ParserScraper):
    url = ''
    # XPath expressions: the image inside the second anchor of the div with
    # id "comic", and the anchor with id "cndprev".
    prevSearch = '//a[@id="cndprev"]'
    imageSearch = '//div[@id="comic"]/a[2]/img'
class AppleGeeks(_BasicScraper):
    help = 'Index format: n (unpadded)'
    # HTTP 404 is listed as an allowed error code for this scraper.
    allow_errors = (404,)

    # Strip pages are addressed as comics/viewcomic.php?issue=<n>.
    url = ''
    stripUrl = url + 'comics/viewcomic.php?issue=%s'
    firstStripUrl = stripUrl % '1'

    # The image src may or may not carry the /comics/ prefix.
    imageSearch = compile(
        tagre("img", "src", r'((?:/comics/)?issue\d+\.jpg)'))
    # Previous link: the anchor following the "Previous Comic" caption div;
    # the pattern is compiled with the MULTILINE flag.
    prevSearch = compile(
        r'<div class="caption">Previous Comic</div>\s*<p><a href="([^"]+)">',
        MULTILINE)
class ARedTailsDream(_BasicScraper):
    help = 'Index format: nn'

    # Strip pages are named comic/page<nn>.php; the start page is
    # comic/recent.php.
    baseUrl = ''
    url = baseUrl + 'comic/recent.php'
    stripUrl = baseUrl + 'comic/page%s.php'
    firstStripUrl = stripUrl % '00'

    # Image src values start with "chapter" and contain "/eng".
    imageSearch = compile(
        tagre('img', 'src', r'(chapter.+?/eng[^"]*)'))
    # Previous link: anchor to a page<n>.php followed by an img whose src
    # contains "aprev".
    prevSearch = compile(
        tagre('a', 'href', r'(page\d+\.php)') +
        tagre("img", "src", r'.*?aprev.*?'))
class Ashes(_WordPressScraper):
    # The first strip URL is the base URL itself.
    url = ''
    firstStripUrl = url

    # indirectStarter is paired with the shared WP_LATEST_SEARCH expression.
    starter = indirectStarter
    latestSearch = WP_LATEST_SEARCH
class ASofterWorld(_ParserScraper):
    help = 'Index format: n (unpadded)'

    # Strip pages are addressed as index.php?id=<n>.
    url = ''
    stripUrl = url + 'index.php?id=%s'
    firstStripUrl = stripUrl % '1'

    # XPath expressions: any img below the div with id "comicimg", and the
    # anchor directly inside the div with id "previous".
    imageSearch = '//div[@id="comicimg"]//img'
    prevSearch = '//div[@id="previous"]/a'
class AstronomyPOTD(_ParserScraper):
    # Strip pages are named ap<yymmdd>.html; the start page is
    # astropix.html.
    baseUrl = ''
    url = baseUrl + 'astropix.html'
    starter = bounceStarter
    stripUrl = baseUrl + 'ap%s.html'
    firstStripUrl = stripUrl % '061012'
    imageSearch = '//a/img'
    multipleImagesPerStrip = True
    # Navigation anchors are identified by their literal "<" / ">" text.
    prevSearch = '//a[text()="<"]'
    nextSearch = '//a[text()=">"]'
    help = 'Index format: yymmdd'

    def shouldSkipUrl(self, url, data):
        """Skip pages without images."""
        return data.xpath('//iframe')  # videos

    def namer(self, image_url, page_url):
        """Return '<page>-<image>' as the file name for a strip image.

        <page> is the last path segment of page_url up to its first dot,
        with the first two characters dropped; <image> is the last path
        segment of image_url up to its first dot.
        """
        # NOTE(review): the second tuple element was cut off in this copy of
        # the file, leaving an unclosed tuple. It is reconstructed here from
        # the two-placeholder format string and the otherwise unused
        # image_url -- confirm against upstream.
        return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],
                          image_url.split('/')[-1].split('.')[0])
class AxeCop(_WordPressScraper):
url = ''