Fix some comic modules.

This commit is contained in:
Tobias Gruetzmacher 2016-05-05 20:55:14 +02:00
parent bb2ac39639
commit 77ed0218e0
5 changed files with 101 additions and 146 deletions

View file

@ -7,7 +7,7 @@ from __future__ import absolute_import, division, print_function
from re import compile, escape
from ..scraper import _BasicScraper
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from .common import (_ComicControlScraper, _WordPressScraper, WP_LATEST_SEARCH,
@ -18,13 +18,14 @@ class Namesake(_ComicControlScraper):
url = 'http://namesakecomic.com/'
# NamirDeiter scraper. This is a diff excerpt whose +/- markers and
# indentation were lost, so two variants of the class appear back to back.
# Variant listed first (_BasicScraper): compiled regexes built with tagre()
# around the escaped site URL; the prev pattern requires the literal text
# "Previous" right after the matched <a> tag.
class NamirDeiter(_BasicScraper):
url = 'http://www.namirdeiter.com/'
rurl = escape(url)
stripUrl = url + 'comics/index.php?date=%s'
firstStripUrl = stripUrl % '19991128'
imageSearch = compile(tagre("img", "src", r"'?(%scomics/\d+\.jpg)'?" % rurl, quote=""))
prevSearch = compile(tagre("a", "href", r'(%scomics/index\.php\?date=\d+)' % rurl, quote="'") + "Previous")
# Variant listed second (_ParserScraper): XPath expressions instead of
# regexes. `url` is pinned to the strip dated 20150410 and firstStripUrl is
# the bare comics/ base URL.
class NamirDeiter(_ParserScraper):
baseUrl = 'http://www.namirdeiter.com/comics/'
stripUrl = baseUrl + 'index.php?date=%s'
url = stripUrl % '20150410'
firstStripUrl = baseUrl
imageSearch = '//a/img'
prevSearch = '//a[text()="Previous"]'
# The two lines below are listed once and appear to apply to both variants.
endOfLife = True
help = 'Index format: yyyymmdd'
@ -89,8 +90,8 @@ class Nicky510(_WordPressScraper):
# Nimona scraper (regex-based). Diff excerpt with markers lost: two
# stripUrl/firstStripUrl pairs are listed.
class Nimona(_BasicScraper):
url = 'http://gingerhaze.com/nimona/'
# Pair 1: the index carries the "comic/" prefix and the URL ends in "/".
stripUrl = url + '%s/'
firstStripUrl = stripUrl % "comic/page-1"
# Pair 2: "comic/" is part of the template, the index is only the page
# name, and there is no trailing slash.
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % "page-1"
imageSearch = compile(tagre("img", "src", r'(http://gingerhaze\.com/sites/default/files/nimona-pages/.+?)'))
# The previous link is recognised by the "prev" label image that
# immediately follows the <a> tag.
prevSearch = compile(r'<a href="(/nimona/comic/[^"]+)"><img src="http://gingerhaze\.com/sites/default/files/comicdrop/comicdrop_prev_label_file\.png"')
help = 'Index format: stripname'
@ -115,31 +116,21 @@ class NoMoreSavePoints(_WordPressScraper):
starter = indirectStarter
# NoNeedForBushido scraper. Diff excerpt with +/- markers lost: both the
# _BasicScraper and the _ParserScraper class headers are listed, followed by
# the attributes of both variants interleaved.
class NoNeedForBushido(_BasicScraper):
class NoNeedForBushido(_ParserScraper):
url = 'http://nn4b.com/'
rurl = escape(url)
# Regex variant: strips are addressed through the ?webcomic1= query
# parameter; image, previous and latest links are tagre()-built patterns.
stripUrl = url + '?webcomic1=%s'
imageSearch = compile(
tagre("a", "rel", "next") +
tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl,
after="attachment-full"))
prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="previous-webcomic"))
latestSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
after="last-webcomic"))
# XPath variant: strips live under comic/<index>; the image is any <img>
# inside div#comic-image and the previous link is the <a rel="prev">.
stripUrl = url + 'comic/%s'
imageSearch = '//div[@id="comic-image"]//img'
prevSearch = '//a[@rel="prev"]'
help = 'Index format: nnn'
# NOTE(review): no latestSearch is listed for the XPath variant while
# indirectStarter is still set -- confirm against the resulting file.
starter = indirectStarter
# NotInventedHere scraper. Diff excerpt with +/- markers lost: both class
# headers and both attribute sets are listed.
class NotInventedHere(_BasicScraper):
class NotInventedHere(_ParserScraper):
url = 'http://notinventedhe.re/'
rurl = escape(url)
# Regex variant: the index includes the "on/" prefix and the URL ends in "/".
# NOTE(review): '\s*Previous' below is a non-raw string literal containing a
# regex escape.
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'on/2009-9-21'
imageSearch = compile(tagre("img", "src", r'(http://thiswas.notinventedhe.re/on/\d+-\d+-\d+)'))
prevSearch = compile(tagre("a", "href", r'(/on/\d+-\d+-\d+)') +
'\s*Previous')
help = 'Index format: yyyy-mm-dd'
# XPath variant: "on/" is part of the template, so the index is only the
# date; the help text reflects the unpadded date form used by firstStripUrl.
stripUrl = url + 'on/%s'
firstStripUrl = stripUrl % '2009-9-21'
imageSearch = '//div[@id="comic-content"]//img'
prevSearch = '//a[@id="nav-previous"]'
help = 'Index format: yyyy-m-d'
class Nukees(_BasicScraper):

View file

@ -56,8 +56,8 @@ class OkCancel(_BasicScraper):
# OmakeTheater scraper. Diff excerpt with +/- markers lost: two
# url/stripUrl pairs are listed, and the hunk may end before the class does.
class OmakeTheater(_ParserScraper):
# Pair 1: plural "comics/" path, strip URL ends in "/".
url = 'http://omaketheater.com/comics/'
stripUrl = url + '%s/'
# Pair 2: singular "comic/" path, no trailing slash.
url = 'http://omaketheater.com/comic/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
# imageSearch is written in CSS-selector syntax; css = True presumably
# tells the parser to treat searches as CSS rather than XPath -- confirm in
# the scraper base class.
css = True
imageSearch = ".comicImage img"

View file

@ -76,28 +76,16 @@ class PennyAndAggie(_BasicScraper):
help = 'Index format: n (unpadded)'
# PennyArcade scraper. Diff excerpt with +/- markers and indentation lost:
# both class headers, both url values and both sets of search attributes are
# listed, so which lines survive cannot be told from this excerpt alone.
class PennyArcade(_BasicScraper):
url = 'http://penny-arcade.com/comic/'
class PennyArcade(_ParserScraper):
url = 'http://www.penny-arcade.com/comic/'
rurl = escape(url)
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1998/11/18'
# Regex variant: image served from art.penny-arcade.com; prev/next links are
# <a href> tags under the comic URL qualified by btnPrev / btnNext.
imageSearch = compile(tagre("img", "src", r'(http://art\.penny-arcade\.com/photos/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl,
before="btnPrev"))
nextSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl,
before="btnNext"))
# XPath variant: image inside div#comicFrame; prev/next links are selected
# with the xpath_class() helper for the same btnPrev / btnNext names.
imageSearch = '//div[@id="comicFrame"]//img'
prevSearch = '//a[%s]' % xpath_class('btnPrev')
nextSearch = '//a[%s]' % xpath_class('btnNext')
starter = bounceStarter
help = 'Index format: yyyy/mm/dd/'
# prevUrlModifier: splits the URL at its last three "/" separators. When the
# final piece is not an integer, the URL is rebuilt from the first three
# pieces, i.e. the trailing strip-name segment is dropped.
# NOTE(review): the four-way unpack raises ValueError for a URL with fewer
# than three "/" characters, and it sits outside the try block.
def prevUrlModifier(self, prev_url):
if prev_url:
dummy, yyyy, mm, dd = prev_url.rsplit('/', 3)
try:
int(dd)
except ValueError:
# URL has form yyyy/mm/dd/stripname
prev_url = "%s/%s/%s" % (dummy, yyyy, mm)
return prev_url
# Second help string listed in the diff (no trailing slash in the format).
help = 'Index format: yyyy/mm/dd'
def namer(self, image_url, page_url):
p = page_url.split('/')
@ -116,17 +104,15 @@ class PeppermintSaga(_BasicScraper):
adult = True
# PHDComics scraper. Diff excerpt with +/- markers lost: both class headers
# and both sets of search attributes are listed.
class PHDComics(_BasicScraper):
class PHDComics(_ParserScraper):
baseUrl = 'http://phdcomics.com/'
url = baseUrl + 'comics.php'
# Individual strips are addressed by numeric comicid on the archive page.
stripUrl = baseUrl + 'comics/archive.php?comicid=%s'
firstStripUrl = stripUrl % '1'
# Regex variant: unquoted attributes (quote=""); the prev link is an
# archive.php?comicid= href followed by the prev_button image.
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
prevSearch = compile(
tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)',
quote="") +
tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
help = 'Index format: number'
# XPath variant: image is img#comic; prev/next are the <a> elements that
# wrap an image whose src contains prev_button / next_button.
imageSearch = '//img[@id="comic"]'
prevSearch = '//a[img[contains(@src, "prev_button")]]'
nextSearch = '//a[img[contains(@src, "next_button")]]'
help = 'Index format: n (unpadded)'
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
@ -150,15 +136,9 @@ class PicPakDog(_BasicScraper):
help = 'Index format: stripname'
# PiledHigherAndDeeper scraper. Diff excerpt with +/- markers lost: a
# standalone _BasicScraper definition and a definition deriving from
# PHDComics are both listed, with their attributes interleaved.
class PiledHigherAndDeeper(_BasicScraper):
url = 'http://www.phdcomics.com/comics.php'
# Keep, because naming is different to PHDComics...
# (the namer at the end of this block names files from the "comicid" query
# parameter of the page URL)
class PiledHigherAndDeeper(PHDComics):
starter = bounceStarter
# Standalone variant: own strip URL and regex searches for image, prev and
# next links.
stripUrl = url + '?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote=""))
prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif')
nextSearch = compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')
help = 'Index format: n (unpadded)'
namer = queryNamer('comicid', use_page_url=True)
@ -232,11 +212,9 @@ class Precocious(_BasicScraper):
# PS238 scraper (XPath-based). Diff excerpt with +/- markers lost: two
# stripUrl assignments are listed.
class PS238(_ParserScraper):
url = 'http://ps238.nodwick.com/'
# This form yields a double slash ("...com//comic/...") because url already
# ends in "/".
stripUrl = url + '/comic/%s/'
starter = bounceStarter
# This form joins url and path with a single slash.
stripUrl = url + 'comic/%s/'
imageSearch = '//div[@id="comic"]//img'
# Both navigation searches compare the complete class attribute string.
prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]'
nextSearch = '//a[@class="comic-nav-base comic-nav-next"]'
help = 'Index format: yyyy-mm-dd'

View file

@ -5,11 +5,16 @@
from __future__ import absolute_import, division, print_function
from re import compile, escape
from re import compile
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter
from ..helpers import indirectStarter
from ..util import tagre
from .common import _WordPressScraper, xpath_class
class RadioactivePanda(_BasicScraper):
@ -20,23 +25,23 @@ class RadioactivePanda(_BasicScraper):
help = 'Index format: n (no padding)'
# RalfTheDestroyer scraper. Diff excerpt with +/- markers lost: a
# _ParserScraper header and a _WordPressScraper header are both listed.
class RalfTheDestroyer(_ParserScraper):
class RalfTheDestroyer(_WordPressScraper):
url = 'http://ralfthedestroyer.com/'
stripUrl = url + '%s/'
# The selectors below use CSS syntax and go together with css = True;
# presumably they belong to the _ParserScraper variant only -- confirm
# against the resulting file.
css = True
imageSearch = '#comic-1 > a:first-child img'
prevSearch = 'td.comic_navi_left > a:nth-of-type(2)'
help = 'Index format: stripname'
# RealLife scraper. Diff excerpt with +/- markers lost: a _BasicScraper
# header and a _WordPressScraper header are both listed, with attributes of
# both variants interleaved.
class RealLife(_BasicScraper):
class RealLife(_WordPressScraper):
url = 'http://reallifecomics.com/'
rurl = escape(url)
stripUrl = url + 'comic.php?comic=%s'
# First firstStripUrl listed: numeric index '991115'.
firstStripUrl = stripUrl % '991115'
imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'((?:%s)?comic\.php\?comic=[^"]+)' % rurl, after="nav-previous"))
# This help string has a stray closing parenthesis; the later help line
# carries the same text without it.
help = 'Index format: monthname-dd-yyyy)'
# Second firstStripUrl listed: slug-style index 'title-1'.
firstStripUrl = stripUrl % 'title-1'
help = 'Index format: monthname-dd-yyyy'
def getPrevUrl(self, url, data):
    """Return the absolute URL of the previous strip, or None.

    The target is read from the ``onclick`` attribute of the first
    element with class ``comic-nav-previous``: it is the text between
    the first pair of single quotes in that attribute.

    :param url: URL of the page being processed; used as the base when
        resolving the extracted target.
    :param data: parsed page exposing ``find_class()`` (presumably an
        lxml HTML element -- confirm against the scraper base class).
    :return: the resolved URL, or None when there is no such element,
        it has no ``onclick`` attribute, or the attribute contains no
        single-quoted target.
    """
    # "Parse" JavaScript
    prevtag = data.find_class('comic-nav-previous')
    if not prevtag:
        return None
    onclick = prevtag[0].get('onclick')
    if not onclick:
        # Element present but without a click handler: treat it like a
        # missing link instead of failing on None.split().
        return None
    pieces = onclick.split("'")
    if len(pieces) < 2:
        # Handler holds no quoted target, so there is nothing to follow.
        return None
    return urljoin(url, pieces[1])
class RealmOfAtland(_BasicScraper):
@ -48,26 +53,14 @@ class RealmOfAtland(_BasicScraper):
help = 'Index format: nnn'
# RedMeat scraper. Diff excerpt with +/- markers lost: a _BasicScraper
# definition is listed first, then a shorter _ParserScraper definition.
# Variant listed first: dated strip pages under redmeat/<date>/index.html
# with regex searches for the image and prev/next links.
class RedMeat(_BasicScraper):
baseUrl = 'http://www.redmeat.com/redmeat/'
url = baseUrl + 'current/index.html'
starter = bounceStarter
stripUrl = baseUrl + '%s/index.html'
firstStripUrl = stripUrl % '1996-06-10'
imageSearch = compile(tagre("img", "src", r'(http://www\.redmeat\.com/imager/b/redmeat/[^"]*\.png)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="prev"))
nextSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next"))
help = 'Index format: yyyy-mm-dd'
# Variant listed second: one start URL plus XPath searches; no stripUrl,
# firstStripUrl, nextSearch or help is listed for it in this excerpt.
class RedMeat(_ParserScraper):
url = 'http://www.redmeat.com/max-cannon/FreshMeat'
imageSearch = '//div[@class="comicStrip"]//img'
prevSearch = '//a[@class="prev"]'
# RedsPlanet scraper (regex-based), as listed in this diff excerpt.
# NOTE(review): +/- markers and indentation were lost, so it cannot be told
# from here whether this class is kept or removed, nor which class the
# trailing namer() belongs to in the resulting file.
class RedsPlanet(_BasicScraper):
url = 'http://www.redsplanet.com/comic/'
rurl = escape(url)
# The strip index has two path components (chapter/stripname).
stripUrl = url + 'rp/%s/'
firstStripUrl = stripUrl % 'pro/prologue-01'
imageSearch = compile(tagre("img", "src", r'(%scomics/\d+-\d+-\d+_[^"/]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%srp/[^"/]+/[^"/]+/)' % rurl))
help = 'Index format: chapter/stripname'
# namer: joins the last two path segments of the image URL with "_";
# page_url is not used.
def namer(self, image_url, page_url):
parts = image_url.rsplit('/', 2)
return '_'.join(parts[1:3])
class RedString(_BasicScraper):
@ -79,30 +72,30 @@ class RedString(_BasicScraper):
help = 'Index format: nnn'
# RomanticallyApocalyptic scraper. Diff excerpt with +/- markers lost: both
# class headers and both attribute sets are listed.
class RomanticallyApocalyptic(_BasicScraper):
class RomanticallyApocalyptic(_ParserScraper):
url = 'http://romanticallyapocalyptic.com/'
rurl = escape(url)
# Regex variant: strip URLs end in "/", first strip index is '1'.
# NOTE(review): "\s*" below is a non-raw string literal containing a regex
# escape.
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre("img", "src", r'(%sart/\d+[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl) + "\s*" +
tagre('span', 'class', 'spritePrevious'))
# XPath variant: no trailing slash, first strip index is '0'; the previous
# link is the <a accesskey="p"> and latestSearch selects the <a> wrapping a
# span with class glyphicon-fast-forward.
stripUrl = url + '%s'
firstStripUrl = stripUrl % '0'
imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel')
prevSearch = '//a[@accesskey="p"]'
latestSearch = '//a[span[%s]]' % xpath_class('glyphicon-fast-forward')
starter = indirectStarter
help = 'Index format: n'
adult = True
# Roza scraper. Diff excerpt with +/- markers lost: both class headers and
# both pairs of search attributes are listed.
class Roza(_BasicScraper):
class Roza(_ParserScraper):
url = 'http://www.junglestudio.com/roza/index.php'
# Strips are selected with a ?date= query on the index page.
stripUrl = url + '?date=%s'
firstStripUrl = stripUrl % '2007-05-01'
# Regex pair: image src under pages/; the prev link is an index.php?date=
# href followed later by navtable_01.gif.
# NOTE(review): the "." in "index.php" is not escaped in the prev pattern.
imageSearch = compile(r'<img src="(pages/.+?)"')
prevSearch = compile(r'<a href="(index.php\?date=.+?)">[^>].+?navtable_01.gif')
# XPath pair: same criteria expressed with contains(@src, ...).
imageSearch = '//img[contains(@src, "pages/")]'
prevSearch = '//a[img[contains(@src, "navtable_01.gif")]]'
help = 'Index format: yyyy-mm-dd'
# Ruthe scraper (regex-based, German-language comic per lang = 'de').
# Diff excerpt with +/- markers lost: two stripUrl assignments are listed,
# and the hunk ends before the class does.
class Ruthe(_BasicScraper):
url = 'http://ruthe.de/'
# The two forms differ in the case of the final path segment ("ASC" vs
# "asc") and in the trailing slash.
stripUrl = url + 'cartoon/%s/datum/ASC'
stripUrl = url + 'cartoon/%s/datum/asc/'
firstStripUrl = stripUrl % '1'
lang = 'de'
imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)'))

View file

@ -46,13 +46,13 @@ class SailorsunOrg(_WordPressScraper):
url = 'http://sailorsun.org/'
# SamAndFuzzy scraper. Diff excerpt with +/- markers lost: both class
# headers and both attribute sets are listed.
class SamAndFuzzy(_BasicScraper):
class SamAndFuzzy(_ParserScraper):
url = 'http://www.samandfuzzy.com/'
# The first stripUrl hard-codes the host without "www."; the second derives
# it from url, so both attributes share one host name.
stripUrl = 'http://samandfuzzy.com/%s'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
# Regex pair: image path under /comics/ followed by an alt attribute; the
# prev link is the <a> wrapping imgint/nav_prev.gif.
imageSearch = compile(r'(/comics/.+?)" alt')
prevSearch = compile(r'"><a href="(.+?)"><img src="imgint/nav_prev.gif"')
help = 'Index format: nnnn'
# XPath pair: img.comic-image and the link inside li.prev-page.
imageSearch = '//img[@class="comic-image"]'
prevSearch = '//li[@class="prev-page"]/a'
help = 'Index format: n (unpadded)'
class SandraOnTheRocks(_BasicScraper):
@ -78,7 +78,7 @@ class ScandinaviaAndTheWorld(_ParserScraper):
# ScaryGoRound scraper (XPath-based). Diff excerpt with +/- markers lost:
# two stripUrl assignments are listed; the class may continue past the hunk.
class ScaryGoRound(_ParserScraper):
url = 'http://www.scarygoround.com/sgr/ar.php'
# url already ends in "ar.php", so this form repeats it
# (".../ar.phpar.php?date=...").
stripUrl = url + 'ar.php?date=%s'
# This form only appends the query string to url.
stripUrl = url + '?date=%s'
firstStripUrl = stripUrl % '20020604'
imageSearch = '//img[contains(@src, "/strips/")]'
prevSearch = '//a[contains(text(), "Previous")]'
@ -104,14 +104,13 @@ class ScenesFromAMultiverse(_BasicScraper):
help = 'Index format: yyyy/mm/dd/stripname'
# SchlockMercenary scraper. Diff excerpt with +/- markers lost: both class
# headers and both pairs of search attributes are listed.
class SchlockMercenary(_BasicScraper):
class SchlockMercenary(_ParserScraper):
url = 'http://www.schlockmercenary.com/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '2000-06-12'
# Two imageSearch forms: a regex for images on static.schlockmercenary.com
# and an XPath for direct <img> children of div.strip-image-wrapper.
imageSearch = compile(tagre("img", "src", r'(http://static\.schlockmercenary\.com/comics/[^"]+)'))
imageSearch = '//div[@class="strip-image-wrapper"]/img'
# Presumably lets one page contribute several images -- confirm in the
# scraper base class.
multipleImagesPerStrip = True
# Two prevSearch forms: a single-quoted /yyyy-mm-dd href qualified by
# nav-previous, and an XPath for a.previous-strip.
prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+)', quote="'",
after="nav-previous"))
prevSearch = '//a[@class="previous-strip"]'
help = 'Index format: yyyy-mm-dd'
@ -267,8 +266,10 @@ class SinFest(_BasicScraper):
help = 'Index format: yyyy-mm-dd'
# Sithrah scraper. Diff excerpt with +/- markers lost: a _WordPressScraper
# header and a _ParserScraper header are both listed. The explicit XPath
# searches presumably accompany the _ParserScraper variant.
class Sithrah(_WordPressScraper):
class Sithrah(_ParserScraper):
url = 'http://sithrah.com/'
# Image: direct <img> child of div.webcomic-image. Previous link: <a>
# selected with xpath_class() for the previous-webcomic-link class.
imageSearch = '//div[@class="webcomic-image"]/img'
prevSearch = '//a[%s]' % xpath_class('previous-webcomic-link')
class SkinDeep(_BasicScraper):
@ -284,8 +285,9 @@ class SleeplessDomain(_ComicControlScraper):
url = 'http://www.sleeplessdomain.com/'
# SlightlyDamned scraper. Diff excerpt with +/- markers lost: a
# _WordPressScraper header and a _ComicControlScraper header are both
# listed; no search attributes are defined here, so they presumably come
# from the base class.
class SlightlyDamned(_WordPressScraper):
class SlightlyDamned(_ComicControlScraper):
url = 'http://www.sdamned.com/'
# First strip is addressed by its slug under comic/.
firstStripUrl = url + 'comic/part-one-to-hell-and-back'
class SluggyFreelance(_BasicScraper):
@ -299,8 +301,7 @@ class SluggyFreelance(_BasicScraper):
# SMBC scraper (XPath-based). Diff excerpt with +/- markers lost: two
# stripUrl assignments are listed; the class may continue past the hunk.
class SMBC(_ParserScraper):
url = 'http://www.smbc-comics.com/'
rurl = escape(url)
# The two forms differ only in naming index.php explicitly before the
# ?id= query.
stripUrl = url + '?id=%s'
stripUrl = url + 'index.php?id=%s'
firstStripUrl = stripUrl % '1'
# imageSearch is a list of two XPath expressions: the main img#comic and
# the images directly inside div#aftercomic.
multipleImagesPerStrip = True
imageSearch = ['//img[@id="comic"]', '//div[@id="aftercomic"]/img']
@ -363,14 +364,15 @@ class SomethingPositive(_BasicScraper):
help = 'Index format: mmddyyyy'
# Sorcery101 scraper. Diff excerpt with +/- markers lost: a _BasicScraper
# definition is listed first, then a _ParserScraper definition.
# Variant listed first: url is the sorcery-101/ section; regex searches are
# built around the escaped site root.
class Sorcery101(_BasicScraper):
baseUrl = 'http://www.sorcery101.net/'
url = baseUrl + 'sorcery-101/'
rurl = escape(baseUrl)
stripUrl = url + '%s/'
imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%ssorcery-101/[^"]+)' % rurl,
after="previous-"))
# Variant listed second: baseUrl already includes sorcery-101/; url and
# firstStripUrl both point at the 'sorcery101-ch-01' strip, and
# latestSearch selects the link with class last-webcomic-link.
class Sorcery101(_ParserScraper):
baseUrl = 'http://www.sorcery101.net/sorcery-101/'
stripUrl = baseUrl + '%s/'
url = stripUrl % 'sorcery101-ch-01'
firstStripUrl = url
imageSearch = '//div[@class="webcomic-image"]/img'
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
starter = indirectStarter
help = 'Index format: stripname'
@ -423,7 +425,7 @@ class StandStillStaySilent(_ParserScraper):
# StarCrossdDestiny scraper. Diff excerpt with +/- markers lost: two baseUrl
# assignments are listed (with and without the "www." host prefix), and the
# hunk ends before the class does.
class StarCrossdDestiny(_ParserScraper):
baseUrl = 'http://www.starcrossd.net/'
baseUrl = 'http://starcrossd.net/'
url = baseUrl + 'comic.html'
# Archive pages are addressed by an eight-digit, zero-padded number.
stripUrl = baseUrl + 'archives/%s.html'
firstStripUrl = stripUrl % '00000001'
@ -445,10 +447,10 @@ class StarCrossdDestiny(_ParserScraper):
# StationV3 scraper (XPath-based). Diff excerpt with +/- markers lost: two
# stripUrl assignments are listed, using the "d/" and "d2/" directories.
class StationV3(_ParserScraper):
url = 'http://www.stationv3.com/'
stripUrl = url + 'd/%s.html'
stripUrl = url + 'd2/%s.html'
firstStripUrl = stripUrl % '20150628'
# Image: any <img> whose src contains /comics2/. Previous link: the <a>
# wrapping an image whose src contains /previous2.
imageSearch = '//img[contains(@src,"/comics2/")]'
prevSearch = '//a[img[contains(@src,"/previous2")]]'
help = 'Index format: yyyymmdd'
@ -462,15 +464,6 @@ class StickyDillyBuns(_BasicScraper):
help = 'Index format: name'
# StrawberryDeathCake scraper (regex-based), as listed in this diff excerpt.
# NOTE(review): the hunk header shows the new side nine lines shorter than
# the old side, so this class is presumably being removed -- the +/- markers
# are not preserved, confirm against the resulting file.
# Image: src under wp-content/webcomic/ on the site. Previous link: an href
# under archive/ qualified by "previous".
class StrawberryDeathCake(_BasicScraper):
url = 'http://strawberrydeathcake.com/'
rurl = escape(url)
imageSearch = compile(tagre("img", "src",
r'(%swp-content/webcomic/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%sarchive/[^"]+)' % rurl,
after="previous"))
class StreetFighter(_ComicControlScraper):
url = 'http://www.streetfightercomics.com'