Tobias Gruetzmacher 3f9feec041 Allow modules to ignore some HTTP error codes.
This is necessary since it seems some webservers out there are
misconfigured to deliver actual content with an HTTP error code...
2016-11-01 18:25:02 +01:00

499 lines
18 KiB

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile, escape, IGNORECASE, sub
from os.path import splitext
import datetime
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter
from ..util import tagre
from .common import (_ComicControlScraper, _WordPressScraper, WP_LATEST_SEARCH,
class SabrinaOnline(_BasicScraper):
    # Pages are addressed as <url><index>.html; the first one is 1996-01.
    url = ''
    stripUrl = url + '%s.html'
    firstStripUrl = stripUrl % '1996-01'
    help = 'Index format: yyyy-qq'
    adult = True
    multipleImagesPerStrip = True

    # Images are taken from anchors pointing into strips/.
    imageSearch = compile(tagre("a", "href", r'(strips/[^"]*)'))
    # "Previous" is an archive-page anchor pattern concatenated with the
    # pattern for the b_back.gif button image.
    prevSearch = compile(
        tagre("a", "href", r"(\d\d\d\d-\d\d.html)") +
        tagre("img", "src", "b_back.gif"))

    def starter(self):
        """Return the last archive page linked from archive.html."""
        index_url = self.url + 'archive.html'
        page_links = self.fetchUrls(
            index_url,
            self.getPage(index_url),
            compile(tagre("a", "href", r"(\d\d\d\d-\d\d.html)")),
        )
        return page_links[-1]
class SafelyEndangered(_WordPressScraper):
    # Settings on top of _WordPressScraper: start URL, first strip and the
    # XPath for the "previous" link (anchor carrying the navi-prev class).
    url = ''
    prevSearch = '//a[%s]' % xpath_class('navi-prev')
    firstStripUrl = url + 'comic/ignored/'
class SailorsunOrg(_WordPressScraper):
    # Only the start URL is set here; everything else comes from
    # _WordPressScraper.
    url = ''
class SamAndFuzzy(_ParserScraper):
    # Strip pages are the base URL followed by an unpadded number.
    url = ''
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1'
    help = 'Index format: n (unpadded)'

    # XPath selectors for the comic image and the "previous page" link.
    prevSearch = '//li[@class="prev-page"]/a'
    imageSearch = '//img[@class="comic-image"]'
class SandraOnTheRocks(_BasicScraper):
    # Strip pages live under strips-sotr/<name>.
    url = ''
    stripUrl = url + 'strips-sotr/%s'
    firstStripUrl = stripUrl % 'start_by_running'
    help = 'Index format: name'

    # Image sources contain /comics/; the "previous" anchor is qualified
    # by the before="cn[id]prev" argument passed to tagre.
    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
    prevSearch = compile(
        tagre("a", "href", r'([^"]*/strips-sotr/[^"]+)',
              before="cn[id]prev"))
class ScandinaviaAndTheWorld(_ParserScraper):
    # Strip pages are the base URL followed by the strip name.
    url = ''
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % 'sweden-denmark-and-norway'
    help = 'Index format: stripname'

    # Start page lookup is delegated to indirectStarter; presumably it
    # follows latestSearch -- confirm in helpers.
    starter = indirectStarter
    latestSearch = '//a[text()="View latest comic"]'

    # XPath selectors for image, "previous" link and accompanying text.
    imageSearch = '//img[@itemprop="image"]'
    prevSearch = '//a[@accesskey="p"]'
    textSearch = '//span[@itemprop="articleBody"]'
class ScaryGoRound(_ParserScraper):
    # Strips are selected with a ?date=yyyymmdd query parameter.
    url = ''
    stripUrl = url + '?date=%s'
    firstStripUrl = stripUrl % '20020604'
    help = 'Index format: yyyymmdd'
    endOfLife = True

    # Image: any <img> whose src contains /strips/; previous: an anchor
    # whose text contains "Previous".
    imageSearch = '//img[contains(@src, "/strips/")]'
    prevSearch = '//a[contains(text(), "Previous")]'
# Scraper definition; strips are addressed as <url>yyyy/mm/dd/stripname/.
class ScenesFromAMultiverse(_BasicScraper):
url = ''
# Regex-escaped copy of url for interpolation into the patterns below.
rurl = escape(url)
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2010/06/14/parenthood'
# NOTE(review): the imageSearch tuple is incomplete in this copy of the
# file -- both tagre("img", "src", ...) calls lack their pattern argument
# and no parenthesis is closed. Restore the missing lines from the upstream
# source; they cannot be derived from what is visible here.
imageSearch = (
compile(tagre("div", "id", "comic") + r"\s*" +
tagre("img", "src",
compile(tagre("div", "id", "comic") + r"\s*" +
tagre("a", "href", r'[^"]*') +
tagre("img", "src",
# "Previous" anchor: href under <url>comic/..., qualified by after="prev".
prevSearch = compile(tagre("a", "href", r'(%scomic/\d+\d+/\d+/\d+/[^"]+)' % rurl, after="prev"))
help = 'Index format: yyyy/mm/dd/stripname'
class SchlockMercenary(_ParserScraper):
    # Strip pages are the base URL followed by a yyyy-mm-dd date.
    url = ''
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '2000-06-12'
    help = 'Index format: yyyy-mm-dd'

    # A page can carry several images inside the strip-image-wrapper div.
    multipleImagesPerStrip = True
    imageSearch = '//div[@class="strip-image-wrapper"]/img'
    prevSearch = '//a[@class="previous-strip"]'
class SchoolBites(_BasicScraper):
    # Strip pages are d/<yyyymmdd>.html below the base URL.
    url = ''
    stripUrl = url + 'd/%s.html'
    help = 'Index format: yyyymmdd'

    # Images are served from the cdn host; the "previous" anchor is
    # qualified by after="prev".
    imageSearch = compile(
        tagre("img", "src", r'(http://cdn\.schoolbites\.net/comics/[^"]+)'))
    prevSearch = compile(
        tagre("a", "href", r'(http://schoolbites\.net/d/\d+\.html)',
              after="prev"))
class Schuelert(_BasicScraper):
    # Paged archive (index.php?paged=N); the first page used is 5.
    url = ''
    rurl = escape(url)  # regex-escaped url for the patterns below
    stripUrl = url + 'index.php?paged=%s'
    firstStripUrl = stripUrl % '5'
    lang = 'de'
    help = 'Index format: none'
    multipleImagesPerStrip = True

    # Image attributes on this site are single-quoted (quote="'").
    imageSearch = compile(
        tagre("img", "src", r"(%swp-content/[^']+)" % rurl, quote="'"))
    # "Previous" is the paged anchor pattern followed by a « character.
    prevSearch = compile(
        tagre("a", "href", r'(%sindex\.php\?paged=\d+)' % rurl) + "«")
class Science(_BasicScraper):
    # Strip pages are <url><stripname>/.
    url = ''
    rurl = escape(url)  # regex-escaped url for the patterns below
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % 'periodic-table-element-ass'
    help = 'Index format: stripname'

    # Image files sit under comics/ and start with a d-d-d date prefix.
    imageSearch = compile(
        tagre("img", "src", r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%s[^"]+/)' % rurl, after="prev"))
class SequentialArt(_BasicScraper):
    # Strips are selected with a ?s=<index> query parameter.
    url = ''
    stripUrl = url + '?s=%s'
    firstStripUrl = stripUrl % '1'
    imageSearch = compile(tagre("img", "src", r'([^"]+)', before="strip"))
    # Both pattern fragments are raw strings: "\." in a plain literal is an
    # invalid escape sequence (deprecated, warns on newer Pythons). The
    # resulting regex text is unchanged.
    prevSearch = compile(tagre("a", "href", r'(/sequentialart\.php\?s=\d+)') +
                         tagre("img", "src", r"Nav_BackOne\.gif"))
    help = 'Index format: name'
class SexyLosers(_ParserScraper):
    # Strip pages are comic/<nnn>/ below the base URL.
    url = ''
    stripUrl = url + 'comic/%s/'
    firstStripUrl = stripUrl % '003'
    help = 'Index format: nnn'
    adult = True

    # Start page lookup is delegated to indirectStarter; presumably it
    # follows latestSearch -- confirm in helpers.
    starter = indirectStarter
    latestSearch = '//a[@rel="bookmark"]'
    imageSearch = '//div[@class="entry-content"]//img'
    prevSearch = '//a[@rel="prev"]'

    def namer(self, image_url, page_url):
        """Name the image <page index>-<image file name>."""
        # Second-to-last path component of the page URL (it ends in '/').
        page_index = page_url.rsplit('/', 2)[1]
        image_name = image_url.rsplit('/', 1)[1]
        return '-'.join((page_index, image_name))
class Sharksplode(_WordPressScraper):
    url = ''
    # HTTP 403 is listed as an allowed error code for this module.
    allow_errors = (403,)
    # Strip text is the alt attribute of the comic image.
    textSearch = '//div[@id="comic"]//img/@alt'
# Scraper definition; strips are addressed as <url>archive/<yymmdd>.html.
class Sheldon(_BasicScraper):
url = ''
# Regex-escaped copy of url for interpolation into prevSearch.
rurl = escape(url)
stripUrl = url + 'archive/%s.html'
firstStripUrl = stripUrl % '011130'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.sheldoncomics\.com/strips/[^"]+)'))
# NOTE(review): this statement is truncated in this copy of the file -- the
# remaining tagre arguments and both closing parentheses are missing.
# Restore from the upstream source.
prevSearch = compile(tagre("a", "href", r'(%sarchive/\d+\.html)' % rurl,
help = 'Index format: yymmdd'
class ShermansLagoon(_BasicScraper):
    # Strip pages are comics/<monthname-day-year>/ below the base URL.
    url = ''
    stripUrl = url + 'comics/%s'
    firstStripUrl = stripUrl % '/december-29-2003/'
    imageSearch = compile(tagre("img", "src", r'(https://safr\.kingfeatures\.com/idn/cnfeed/zone/js/content\.php\?file=.+?)'))
    prevSearch = compile(r'id="previouscomic" class="button white"><a href="(%scomics/[a-z0-9-]+/)"' % url)
    help = 'Index format: monthname-day-year'

    def namer(self, image_url, page_url):
        """Return the image name as year-monthname-day.

        The date slug is the second-to-last path component of page_url.
        When that component is empty, today's date is used instead.
        """
        name = page_url.rsplit('/', 3)[2]
        if name == "":
            # No date slug in the URL: fall back to today's date in the
            # same monthname-day-year form (%B is the locale's month name).
            name = datetime.date.today().strftime("%B-%d-%Y").lower()
        # name is monthname-day-year
        month, day, year = name.split('-')
        return "%s-%s-%s" % (year, month, day)
class Shivae(_WordPressScraper):
    # Only the start URL and the first strip are set here.
    url = ''
    firstStripUrl = (
        url + 'gnip/ck-chapter-01/caidenkoel-title-01/')
class Shortpacked(_ParserScraper):
    # Strips are selected with a ?id=<nnn> query parameter.
    url = ''
    stripUrl = url + '?id=%s'
    help = 'Index format: nnn'

    # css = True: the two selectors below are written in CSS syntax
    # (presumably switching the parser away from XPath -- confirm).
    css = True
    prevSearch = 'a.prev'
    imageSearch = 'img#comic'
class ShotgunShuffle(_WordPressScraper):
    # Only the start URL and the first strip are set here.
    url = ''
    firstStripUrl = '%scomic/pilot/' % url
# Scraper definition; strips are addressed as <url>view.php?date=yyyy-mm-dd.
class SinFest(_BasicScraper):
url = ''
stripUrl = url + 'view.php?date=%s'
# NOTE(review): this statement is truncated in this copy of the file -- the
# remaining tagre arguments and both closing parentheses are missing.
# Restore from the upstream source.
imageSearch = compile(tagre("img", "src", r'(btphp/comics/.+)',
# "Previous" is a view.php anchor, optional whitespace, then the prev.gif
# button image.
prevSearch = compile(tagre("a", "href", r'(view\.php\?date=.+)') + '\\s*' +
tagre("img", "src", r'\.\./images/prev\.gif'))
help = 'Index format: yyyy-mm-dd'
class Sithrah(_ParserScraper):
    url = ''
    # "Previous" link: anchor carrying the previous-webcomic-link class.
    prevSearch = '//a[%s]' % xpath_class('previous-webcomic-link')
    # Comic image: <img> directly inside the webcomic-image div.
    imageSearch = '//div[@class="webcomic-image"]/img'
# Scraper definition; strips are addressed as <url>archive/<index>/.
class SkinDeep(_BasicScraper):
url = ''
stripUrl = url + 'archive/%s/'
# Image is the <img> that directly follows a span whose class starts with
# "webcomic-object".
imageSearch = compile(r'<span class="webcomic-object[^>]*><img src="([^"]*)"')
# NOTE(review): this statement is truncated in this copy of the file -- the
# remaining tagre arguments and both closing parentheses are missing.
# Restore from the upstream source.
prevSearch = compile(tagre("a", "href", r'([^"]+)',
help = 'Index format: custom'
class SleeplessDomain(_ComicControlScraper):
    # Only the start URL is set here; everything else comes from
    # _ComicControlScraper.
    url = ''
class SlightlyDamned(_ComicControlScraper):
    # Only the start URL and the first strip are set here.
    url = ''
    firstStripUrl = (
        url + 'comic/part-one-to-hell-and-back')
class SluggyFreelance(_BasicScraper):
    # Strip pages are comics/archives/daily/<yymmdd> below the base URL.
    url = ''
    stripUrl = url + 'comics/archives/daily/%s'
    help = 'Index format: yymmdd'
    multipleImagesPerStrip = True

    # Plain regexes (no tagre): images under /images/comics/, and the
    # anchor that wraps the ui-icon-seek-prev span.
    imageSearch = compile(r'<img src="(/images/comics/.+?)"')
    prevSearch = compile(
        r'<a href="(.+?)"[^>]+?><span class="ui-icon ui-icon-seek-prev">')
class SMBC(_ComicControlScraper):
    url = ''
    firstStripUrl = url + 'comic/2002-09-05'

    # Two image sources per page: the comic itself and the image in the
    # "aftercomic" div.
    multipleImagesPerStrip = True
    imageSearch = [
        '//img[@id="cc-comic"]',
        '//div[@id="aftercomic"]/img',
    ]
    # Strip text is the title attribute of the comic image.
    textSearch = '//img[@id="cc-comic"]/@title'

    def namer(self, image_url, page_url):
        """Keep only what follows the last '-' in the image URL."""
        _head, _sep, tail = image_url.rpartition('-')
        return tail
class SnowFlame(_WordPressScraper):
    # Strips are addressed as ?comic=snowflame-<chapter>-<page>.
    url = ''
    stripUrl = url + '?comic=snowflame-%s-%s'
    firstStripUrl = stripUrl % ('01', '01')
    starter = bounceStarter
    help = 'Index format: chapter-page'

    def getIndexStripUrl(self, index):
        """Expand a 'chapter-page' index into the two stripUrl fields."""
        return self.stripUrl % tuple(index.split('-'))

    def namer(self, image_url, page_url):
        """Return '<chapter>-<page>-<image file name>'.

        Chapter and page come from the snowflame-<chapter>-<page> part of
        page_url (the format produced by stripUrl); the file name is the
        last path component of image_url.
        """
        prefix, filename = image_url.rsplit('/', 1)
        ro = compile(r'snowflame-([^-]+)-([^-]+)')
        # The three assignments below had lost their right-hand sides in
        # this copy of the file; restored from the two-group pattern above.
        mo = ro.search(page_url)
        chapter = mo.group(1)
        page = mo.group(2)
        return "%s-%s-%s" % (chapter, page, filename)
class SodiumEyes(_WordPressScraper):
    # Only the start URL is set here; everything else comes from
    # _WordPressScraper.
    url = ''
# Scraper definition; strips are addressed as <url>sp<mmddyyyy>.shtml.
class SomethingPositive(_BasicScraper):
url = ''
stripUrl = url + 'sp%s.shtml'
# NOTE(review): the imageSearch tuple is never closed in this copy of the
# file (no ")" after the second entry).
imageSearch = (
compile(tagre("img", "src", r'(sp\d+\.png)')),
compile(tagre("img", "src", r'(twither\.gif)')),
# NOTE(review): prevSearch is truncated as well -- the expression ends on a
# dangling "+" and the "(?:" group is never closed. Restore both statements
# from the upstream source.
prevSearch = compile(tagre("a", "href", r'(sp\d+\.shtml)') + "(?:" +
tagre("img", "src", r'images/previous\.gif') +
help = 'Index format: mmddyyyy'
class Sorcery101(_ParserScraper):
    # url is not the site root here: it is the first strip's page, built
    # from baseUrl via stripUrl.
    baseUrl = ''
    stripUrl = baseUrl + '%s/'
    url = stripUrl % 'sorcery101-ch-01'
    firstStripUrl = url
    help = 'Index format: stripname'

    # HTTP 500 is listed as an allowed error code for this module.
    allow_errors = (500,)

    # Start page lookup is delegated to indirectStarter; presumably it
    # follows latestSearch -- confirm in helpers.
    starter = indirectStarter
    latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
    imageSearch = '//div[@class="webcomic-image"]/img'
    prevSearch = '//a[@rel="prev"]'
class SpaceJunkArlia(_ParserScraper):
    # Strips are selected with a ?strip_id=<number> query parameter,
    # starting at 0.
    url = ''
    stripUrl = url + '?strip_id=%s'
    firstStripUrl = stripUrl % '0'
    help = 'Index format: number'

    # "Previous" is the anchor whose text is a literal "<".
    prevSearch = '//a[text()="<"]'
    imageSearch = '//div[%s]/img' % xpath_class('content')
class SpaceTrawler(_ParserScraper):
    url = ''
    firstStripUrl = url + '2010/01/01/spacetrawler-4/'

    # Image: <img> carrying the size-full class; previous: rel="prev".
    prevSearch = '//a[@rel="prev"]'
    imageSearch = '//img[%s]' % xpath_class('size-full')
# Scraper definition; strips are addressed as <url>index.php/comics/view/<n>.
class Spamusement(_BasicScraper):
url = ''
# Regex-escaped copy of url for interpolation into the patterns below.
rurl = escape(url)
stripUrl = url + 'index.php/comics/view/%s'
imageSearch = compile(r'<img src="(%sgfx/\d+\..+?)"' % rurl, IGNORECASE)
# NOTE(review): this statement is truncated in this copy of the file -- it
# ends on a trailing comma with the call still open. Restore the remaining
# argument(s) and the closing parenthesis from the upstream source.
prevSearch = compile(r'<a href="(%sindex.php/comics/view/.+?)">' % rurl,
# The same pattern is reused to find the latest strip.
latestSearch = prevSearch
help = 'Index format: n (unpadded)'
starter = indirectStarter
class SpareParts(_BasicScraper):
    # The start URL is pinned to a fixed date (20080328); individual strips
    # use comics/index.php?date=<yyyymmdd>.
    baseUrl = ''
    url = baseUrl + 'comics/?date=20080328'
    stripUrl = baseUrl + 'comics/index.php?date=%s'
    firstStripUrl = stripUrl % '20031022'
    help = 'Index format: yyyymmdd'

    imageSearch = compile(
        tagre("img", "src",
              r'(http://www\.sparepartscomics\.com/comics/[^"]+)'))
    # Single-quoted href followed by the literal link text.
    prevSearch = compile(
        tagre("a", "href", r'(index\.php\?date=\d+)', quote="'") +
        "Previous Comic")
class Spinnerette(_ComicControlScraper):
    # Only the start URL is set here; everything else comes from
    # _ComicControlScraper.
    url = ''
class SPQRBlues(_WordPressScraper):
    # Only the start URL is set here; everything else comes from
    # _WordPressScraper.
    url = ''
class StandStillStaySilent(_ParserScraper):
    # Strips are selected with a ?page=<number> query parameter.
    url = ''
    stripUrl = url + '?page=%s'
    firstStripUrl = stripUrl % '1'
    help = 'Index Format: number'

    # "Previous" is the anchor that contains the navprev div.
    prevSearch = '//a[div[@id="navprev"]]'
    imageSearch = '//img[@class="comicnormal"]'
class StarCrossdDestiny(_ParserScraper):
    # Start page is comic.html; individual strips are
    # archives/<nnnnnnnn>.html.
    baseUrl = ''
    url = baseUrl + 'comic.html'
    stripUrl = baseUrl + 'archives/%s.html'
    firstStripUrl = stripUrl % '00000001'
    help = 'Index format: nnnnnnnn'

    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[text()="prev"]'

    def namer(self, image_url, page_url):
        """Name the image <directory>-<file name without extension>.

        The image URL is normalised first so that the directory part is
        consistent across the site's different storage layouts.
        """
        if 'ch1' not in image_url:
            # At first all images were stored in a strips/ directory but
            # that was changed with the introduction of book2
            image_url = sub('(?:strips)|(?:images)', 'book1', image_url)
        elif 'strips' in image_url:
            image_url = image_url.replace('strips/', '')
        directory, filename = image_url.split('/')[-2:]
        return '%s-%s' % (directory, splitext(filename)[0])
class StationV3(_ParserScraper):
    # Strip pages are d2/<yyyymmdd>.html below the base URL.
    url = ''
    stripUrl = url + 'd2/%s.html'
    firstStripUrl = stripUrl % '20150628'
    help = 'Index format: yyyymmdd'

    # "Previous" is the anchor wrapping an image whose src contains
    # /previous2.
    prevSearch = '//a[img[contains(@src,"/previous2")]]'
    imageSearch = '//img[contains(@src,"/comics2/")]'
# Scraper definition; strips are addressed as <url>strips-sdb/<name>.
class StickyDillyBuns(_BasicScraper):
url = ''
stripUrl = url + 'strips-sdb/%s'
firstStripUrl = stripUrl % 'awesome_leading_man'
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
# NOTE(review): this statement is truncated in this copy of the file -- the
# remaining tagre arguments and both closing parentheses are missing.
# Restore from the upstream source.
prevSearch = compile(tagre("a", "href", r'([^"]*/strips-sdb/[^"]+)',
help = 'Index format: name'
class StreetFighter(_ComicControlScraper):
    # Only the start URL is set here; everything else comes from
    # _ComicControlScraper.
    url = ''
class StringTheory(_WordPressScraper):
    # Settings on top of _WordPressScraper: start URL, first strip and the
    # XPath for the "previous" link (anchor carrying the navi-prev class).
    url = ''
    prevSearch = '//a[%s]' % xpath_class('navi-prev')
    firstStripUrl = url + 'comics/chapterone/chapterone/'
class StrongFemaleProtagonist(_ParserScraper):
    # Strip pages are <url><issue-?/page-??>/.
    url = ''
    stripUrl = url + '%s/'
    # Selectors below are written in CSS syntax.
    css = True
    imageSearch = 'article p img'
    # NOTE(review): the selector is empty in this copy of the file --
    # confirm the intended "previous" link selector against upstream.
    prevSearch = ''
    help = 'Index format: issue-?/page-??'

    def shouldSkipUrl(self, url, data):
        """Skip hiatus & non-comic pages.

        Returns True when url is one of the known pages listed below;
        data is not inspected.
        """
        return url in (
            self.stripUrl % 'guest-art/tuesday',
            self.stripUrl % 'guest-art/friday',
            self.stripUrl % 'guest-art/wednesday',
            self.stripUrl % 'issue-5/newspaper',
            self.stripUrl % 'issue-5/hiatus-1',
            self.stripUrl % 'issue-5/hiatus-2',
            self.stripUrl % 'issue-1/no-page',
        )
# Scraper definition; strips are addressed as <url>yyyy/mm/stripname.html.
class StuffNoOneToldMe(_BasicScraper):
url = ''
stripUrl = url + '%s.html'
firstStripUrl = stripUrl % '2010/05/01'
# Shared href pattern for the "older" link; used by prevSearch and
# latestSearch below. Attributes are single-quoted on this site.
olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)"
starter = indirectStarter
# Images are hosted on three different services, one pattern each.
# NOTE(review): the tuple is never closed in this copy of the file (no ")"
# after the third entry).
imageSearch = (
compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') +
r"(?:</a>|<br />)"),
compile(tagre("img", "src", r'(http://\d+\.bp\.blogspot\.com/[^"]+)') +
r"(?:(?:&nbsp;)?</a>|<span |<br />)"),
compile(tagre("img", "src", r'(https://lh\d+\.googleusercontent\.com/[^"]+)') + r"</a>"),
# NOTE(review): prevSearch is truncated -- the remaining tagre arguments and
# both closing parentheses are missing. Restore from the upstream source.
prevSearch = compile(tagre("a", "href", olderHref, quote="'",
latestSearch = compile(tagre("a", "href", olderHref, quote="'"))
multipleImagesPerStrip = True
help = 'Index format: yyyy/mm/stripname'
def namer(self, image_url, page_url):
"""Use page URL to construct meaningful image name."""
# Last three path components of the page URL: year, month, stripname.html
parts, year, month, stripname = page_url.rsplit('/', 3)
# Drop the .html extension.
stripname = stripname.rsplit('.', 1)[0]
parts, imagename = image_url.rsplit('/', 1)
return '%s-%s-%s-%s' % (year, month, stripname, imagename)
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
# NOTE(review): the tuple below is never closed in this copy of the file
# (no ")" after the last entry).
return url in (
self.stripUrl % '2016/05/so-you-would-like-to-share-my-comics', # no comic
self.stripUrl % '2012/08/self-rant', # no comic
self.stripUrl % '2012/06/if-you-wonder-where-ive-been', # video
self.stripUrl % '2011/10/i-didnt-make-this-nor-have-anything-to', # video
self.stripUrl % '2010/12/first-snotm-fans-in-sao-paulo', # no comic
self.stripUrl % '2010/11/ear-infection', # no comic
class SupernormalStep(_ComicControlScraper):
    # Only the start URL is set here; everything else comes from
    # _ComicControlScraper.
    url = ''