2016-03-31 21:13:54 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
2016-10-28 22:21:41 +00:00
|
|
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
2014-01-05 15:50:57 +00:00
|
|
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
2019-11-03 23:16:25 +00:00
|
|
|
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
2012-11-21 20:57:26 +00:00
|
|
|
|
2016-03-31 21:13:54 +00:00
|
|
|
from __future__ import absolute_import, division, print_function
|
2016-04-10 21:04:34 +00:00
|
|
|
|
2013-04-10 16:19:11 +00:00
|
|
|
from re import compile, escape, IGNORECASE
|
2012-06-20 19:58:13 +00:00
|
|
|
|
2016-05-16 21:16:29 +00:00
|
|
|
from ..scraper import _BasicScraper, _ParserScraper
|
2012-11-26 06:13:32 +00:00
|
|
|
from ..util import tagre
|
2019-06-21 07:54:45 +00:00
|
|
|
from ..helpers import bounceStarter, indirectStarter, xpath_class
|
2019-06-30 06:52:05 +00:00
|
|
|
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
2012-06-20 19:58:13 +00:00
|
|
|
|
|
|
|
|
2016-05-16 21:16:29 +00:00
|
|
|
class WapsiSquare(_WordPressScraper):
    """Scraper definition for Wapsi Square (wapsisquare.com).

    Only the site address and the first strip are declared here; all
    other attributes are inherited from ``_WordPressScraper``.
    """

    url = 'http://wapsisquare.com/'
    # Address of the first strip, built from the site root.
    firstStripUrl = '%scomic/09092001/' % url
|
2012-06-20 19:58:13 +00:00
|
|
|
|
|
|
|
|
2013-02-06 21:08:36 +00:00
|
|
|
class WastedTalent(_BasicScraper):
    """Scraper definition for Wasted Talent (www.wastedtalent.ca).

    Strips are addressed by name below ``comic/``; images and the
    previous-strip link are located with compiled regular expressions.
    """

    url = 'http://www.wastedtalent.ca/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % 'anime-crack'

    # Full-size comic image inside the site's "comic_full" image cache.
    imageSearch = compile(
        tagre(
            "img",
            "src",
            r'(http://www\.wastedtalent\.ca/sites/default/files/'
            r'imagecache/comic_full/comics/\d+/[^"]+)',
        )
    )

    # Site-relative link that is followed by the "comic_prev" marker.
    prevSearch = compile(
        tagre("a", "href", r'(/comic/[^"]+)', after="comic_prev")
    )

    help = 'Index format: stripname'
|
|
|
|
|
|
|
|
|
2017-05-21 22:30:31 +00:00
|
|
|
class WebDesignerCOTW(_ParserScraper):
    """Scraper definition for Webdesigner Depot's "Comics of the Week".

    The category page (``url``) lists posts; ``latestSearch`` and
    ``prevSearch`` are XPath expressions used to move between them, and
    every image inside the article content is collected.
    """

    baseUrl = 'https://www.webdesignerdepot.com/'
    url = baseUrl + 'category/comics/'
    # NOTE(review): presumably resolves the newest post through
    # latestSearch before scraping starts -- confirm in helpers.
    starter = indirectStarter
    firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/'
    imageSearch = '//article[%s]//img' % xpath_class('article-content')
    multipleImagesPerStrip = True
    prevSearch = '//a[span[%s]]' % xpath_class('icon-right-small')
    latestSearch = '//a[%s]' % xpath_class('anim-link')

    def shouldSkipUrl(self, url, data):
        """Skip non-comic URLs."""
        return 'comics-of-the-week' not in url

    def namer(self, image_url, page_url):
        """Return the image file name prefixed with the week number.

        The week number is taken from the ``week-<n>`` part of the page
        URL.  If the page URL carries no such number, the bare image
        file name is returned instead of failing.
        """
        # [-1] also copes with an image URL that contains no slash.
        imagename = image_url.rsplit('/', 1)[-1]
        match = compile(r'week-(\d+)').search(page_url)
        if match is None:
            return imagename
        return "%s-%s" % (match.group(1), imagename)
|
|
|
|
|
|
|
|
|
2013-03-06 19:21:10 +00:00
|
|
|
class WeCanSleepTomorrow(_BasicScraper):
    """Scraper definition for We Can Sleep Tomorrow.

    Both search patterns are anchored to the regex-escaped site URL.
    """

    url = 'http://wecansleeptomorrow.com/'
    # Regex-safe form of the site URL, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = url + '%s/'

    # Image whose source lives below <site>/comics/.
    imageSearch = compile(
        tagre("img", "src", r'(%scomics/[^"]+)' % rurl)
    )
    # Absolute on-site link that is followed by a "prev" marker.
    prevSearch = compile(
        tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev")
    )

    help = 'Index format: yyyy/mm/dd/stripname'
|
|
|
|
|
|
|
|
|
2019-07-13 09:02:04 +00:00
|
|
|
class Weregeek(_ParserScraper):
    """Scraper definition for Weregeek (www.weregeek.com).

    Strips are addressed by date; the image and the previous-strip link
    are selected with XPath expressions.
    """

    url = 'http://www.weregeek.com/'
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2006/11/27'

    # Image that is a direct child of the element with id "comic".
    imageSearch = '//div[@id="comic"]/img'
    # Anchor wrapping the image whose alt text is "Previous".
    prevSearch = '//a[./img[@alt="Previous"]]'

    help = 'Index format: yyyy/mm/dd'
|
|
|
|
|
|
|
|
|
2019-07-13 07:49:31 +00:00
|
|
|
class WereIWolf(_ParserScraper):
    """Scraper definition for Were I Wolf (wolfwares.ca).

    A page is addressed by a chapter name plus a start position.  The
    site has no navigation links between chapters, so ``getPrevUrl``
    bridges from the first page of a chapter to the ``latest`` page of
    the chapter before it.
    """

    stripUrl = 'https://wolfwares.ca/comics/Were I wolf/strip2.php?name=%s&start=%s'

    # Chapter names in reading order; the leading digit is the chapter
    # number used by namer().
    chapters = ('1 Sirens',
                '2 Black and White',
                '3 Black and White - Princess and Knight',
                '4 Black and White - part 3')

    url = stripUrl % (chapters[-1], 'latest')
    firstStripUrl = stripUrl % (chapters[0], '0')
    imageSearch = '//img[contains(@src, "ROW")]'
    prevSearch = '//a[./img[contains(@src, "previous")]]'
    multipleImagesPerStrip = True
    endOfLife = True

    def namer(self, imageUrl, pageUrl):
        """Prepend the chapter number to the image file name.

        Returns ``None`` when no known chapter name occurs in
        ``pageUrl``.
        """
        for chapter in self.chapters:
            if chapter in pageUrl:
                chapterNum = chapter[0]
                return chapterNum + '_' + imageUrl.rsplit('/', 1)[-1]
        return None

    def getPrevUrl(self, url, data):
        """Return the previous page, bridging the gaps between chapters."""
        # Fix missing navigation links between chapters: the first page
        # of a chapter is preceded by the latest page of the one before.
        for older, newer in zip(self.chapters, self.chapters[1:]):
            if url == self.stripUrl % (newer, '0'):
                return self.stripUrl % (older, 'latest')
        return super(WereIWolf, self).getPrevUrl(url, data)

    def getIndexStripUrl(self, index):
        """Build a strip URL from an index of the form ``<chapter>-<start>``.

        Chapter names may themselves contain dashes, so only the last
        dash separates the chapter from the start position.
        """
        parts = index.rsplit('-', 1)
        return self.stripUrl % (parts[0], parts[1])
|
|
|
|
|
|
|
|
|
2016-05-16 21:16:29 +00:00
|
|
|
class WhiteNoise(_WordPressScraper):
    """Scraper definition for White Noise (whitenoisecomic.com).

    Sets its own previous-strip XPath on top of what is inherited from
    ``_WordPressScraper``.
    """

    url = 'http://whitenoisecomic.com/'
    stripUrl = url + 'comic/%s/'
    firstStripUrl = stripUrl % 'book-one'

    # Anchor carrying the "previous-webcomic-link" class.
    prevSearch = '//a[%s]' % xpath_class('previous-webcomic-link')
|
2013-04-10 16:36:33 +00:00
|
|
|
|
|
|
|
|
2019-06-21 07:54:45 +00:00
|
|
|
class WhiteNoiseLee(_ComicControlScraper):
    """Scraper definition for White Noise (www.white-noise-comic.com).

    Images are named after the page they appear on rather than after the
    image file itself.
    """

    url = 'http://www.white-noise-comic.com/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % '1-0'
    starter = bounceStarter

    def namer(self, imageUrl, pageUrl):
        """Return ``<last page path segment>.<image extension>``."""
        page_name = pageUrl.rsplit('/', 1)[-1]
        extension = imageUrl.rsplit('.', 1)[-1]
        return '%s.%s' % (page_name, extension)
|
|
|
|
|
|
|
|
|
2016-04-10 21:04:34 +00:00
|
|
|
class Whomp(_ComicControlScraper):
    """Scraper definition for Whomp! (www.whompcomic.com).

    Besides the inherited image handling, the ``title`` attribute of the
    comic image is selected as text.
    """

    url = 'http://www.whompcomic.com/'
    firstStripUrl = '%scomic/06152010' % url

    # Title attribute of the image with id "cc-comic".
    textSearch = '//img[@id="cc-comic"]/@title'
|
2014-06-24 18:48:49 +00:00
|
|
|
|
|
|
|
|
2012-06-20 19:58:13 +00:00
|
|
|
class WhyTheLongFace(_BasicScraper):
    """Scraper definition for Why The Long Face (www.absurdnotions.org).

    Archive pages are addressed by year and month; ``url`` points at one
    fixed page (``wtlf200709.html``).
    """

    baseUrl = 'http://www.absurdnotions.org/'
    # Regex-safe form of the base URL, embedded in imageSearch.
    rurl = escape(baseUrl)
    url = baseUrl + 'wtlf200709.html'
    stripUrl = baseUrl + 'wtlf%s.html'
    firstStripUrl = stripUrl % '200306'

    # Either an absolute "<base>wtlf..." source or a relative "lf<digits>"
    # file name; matched case-insensitively.
    imageSearch = compile(
        r'<img src="(%swtlf.+?|lf\d+.\w{1,4})"' % rurl,
        IGNORECASE,
    )
    multipleImagesPerStrip = True

    # Link that immediately wraps the "nprev.gif" button (upper-case markup).
    prevSearch = compile(r'HREF="(.+?)"><IMG SRC="nprev.gif" ')

    help = 'Index format: yyyymm'
|
|
|
|
|
|
|
|
|
2016-05-16 21:16:29 +00:00
|
|
|
class Wigu(_ParserScraper):
    """Scraper definition for Wigu Adventures (www.wigucomics.com).

    Strips are addressed by a numeric ``comic`` query parameter; the
    start page uses the value ``-1``.  Flagged as end-of-life.
    """

    stripUrl = 'http://www.wigucomics.com/adventures/index.php?comic=%s'
    url = stripUrl % '-1'
    firstStripUrl = stripUrl % '1'

    # Image below the "comic" element whose source contains "/comics/".
    imageSearch = '//div[@id="comic"]//img[contains(@src, "/comics/")]'
    # Anchor whose alt attribute reads "go back".
    prevSearch = '//a[@alt="go back"]'

    endOfLife = True
    help = 'Index format: n'
|
2012-06-20 19:58:13 +00:00
|
|
|
|
|
|
|
|
2019-07-02 07:23:52 +00:00
|
|
|
class WildeLife(_ComicControlScraper):
    """Scraper definition for Wilde Life (www.wildelifecomic.com).

    Only the URLs are declared here; all other attributes are inherited
    from ``_ComicControlScraper``.
    """

    url = 'http://www.wildelifecomic.com/'
    stripUrl = url + 'comic/%s'
    # Strip number 1 is the first page.
    firstStripUrl = stripUrl % '1'
|
|
|
|
|
|
|
|
|
2013-02-06 21:08:36 +00:00
|
|
|
class Wonderella(_BasicScraper):
    """Scraper definition for The Non-Adventures of Wonderella.

    Hosted at nonadventures.com; both search patterns are anchored to the
    regex-escaped site URL.
    """

    url = 'http://nonadventures.com/'
    # Regex-safe form of the site URL, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2006/09/09/the-torment-of-a-thousand-yesterdays'

    # A tag with id "comic" (either quote style), optional whitespace,
    # then the image below <site>/comics/.
    imageSearch = compile(
        r"\s*".join((
            tagre("div", "id", r"comic", quote=r'["\']'),
            tagre("img", "src", r'(%scomics/[^"]+)' % rurl),
        ))
    )
    # Dated on-site link (three numeric path parts) followed by "prev".
    prevSearch = compile(
        tagre("a", "href", r'(%s\d+/\d+/\d+/[^"]+)' % rurl, after="prev")
    )

    help = 'Index format: yyyy/mm/dd/name'
|
|
|
|
|
|
|
|
|
2013-04-10 16:36:33 +00:00
|
|
|
class Wondermark(_BasicScraper):
    """Scraper definition for Wondermark (wondermark.com).

    Strips are addressed by a three-digit number.
    """

    url = 'http://wondermark.com/'
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '001'

    # Image served from the site's /c/ directory.
    imageSearch = compile(r'<img src="(http://wondermark.com/c/.+?)"')
    # Anchor marked rel="prev".
    prevSearch = compile(r'<a href="(.+?)" rel="prev">')

    help = 'Index format: nnn'
|
|
|
|
|
|
|
|
|
2013-03-06 19:00:30 +00:00
|
|
|
class WorldOfMrToast(_BasicScraper):
    """Scraper definition for World of Mr. Toast (theimaginaryworld.com).

    The site has no prev/next navigation, so the archive pages are listed
    explicitly in ``prevurls`` and walked in that order.  Flagged as
    end-of-life.
    """

    baseUrl = 'http://www.theimaginaryworld.com/'
    url = baseUrl + 'mrTcomicA.html'
    imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
    # list the archive links since there is no prev/next navigation
    prevurls = (
        url,
        baseUrl + 'mrTcomicW02.html',
        baseUrl + 'mrTcomicW01.html',
        baseUrl + 'mrGcomic03.html',
        baseUrl + 'mrGcomic02.html',
        baseUrl + 'mrGcomic01.html',
        baseUrl + 'mrTcomicT05.html',
        baseUrl + 'mrTcomicT04.html',
        baseUrl + 'mrTcomicT03.html',
        baseUrl + 'mrTcomicT02.html',
        baseUrl + 'mrTcomicT01.html',
        baseUrl + 'mrTcomicIW3.html',
        baseUrl + 'mrTcomicIW2.html',
        baseUrl + 'mrTcomicIW1.html',
    )
    firstStripUrl = prevurls[-1]
    multipleImagesPerStrip = True
    endOfLife = True

    def getPrevUrl(self, url, data):
        """Return the entry after ``url`` in ``prevurls``.

        Returns ``None`` when ``url`` is the last entry or is not listed
        at all.
        """
        try:
            return self.prevurls[self.prevurls.index(url) + 1]
        except (ValueError, IndexError):
            # ValueError: url is not a known archive page.
            # IndexError: url is the last (oldest) archive page.
            return None
|
|
|
|
|
|
|
|
|
2016-04-10 21:04:34 +00:00
|
|
|
class WorldOfWarcraftEh(_WordPressScraper):
    """Scraper definition for World of Warcraft, Eh? (woweh.com).

    Only the site address is declared; all other attributes are
    inherited from ``_WordPressScraper``.
    """

    url = 'http://woweh.com/'
|
|
|
|
|
|
|
|
|
2013-03-21 17:33:16 +00:00
|
|
|
class WormWorldSaga(_BasicScraper):
    """Scraper definition for The Wormworld Saga (www.wormworldsaga.com).

    Every chapter is a single page holding several images.  Chapters are
    numbered with two digits and carry an upper-case language code in
    their path; ``latestChapter`` names the newest chapter.
    """

    url = 'http://www.wormworldsaga.com/'
    stripUrl = url + 'chapters/%s/index.php'
    firstStripUrl = stripUrl % 'chapter01/EN'
    # Two image layouts are matched: "images/CH.." and "panels/CH..".
    imageSearch = (
        compile(tagre("img", "src", r'(images/CH\d+_\d+\.[^"]+)')),
        compile(tagre("img", "src", r'(panels/CH\d+_[^"]+)')),
    )
    latestChapter = 5
    multipleImagesPerStrip = True

    def starter(self):
        """Return the URL of the latest chapter in this scraper's language."""
        # NOTE(review): self.lang is not defined in this class;
        # presumably supplied by the base class or a subclass -- confirm.
        return '%schapters/chapter%02d/%s/index.php' % (
            self.url, self.latestChapter, self.lang.upper())

    def getPrevUrl(self, url, data):
        """Find previous URL.

        The chapter number embedded in ``url`` is decremented by one,
        keeping its zero padding.  Returns ``None`` for the first chapter
        or when ``url`` contains no chapter number.
        """
        # "chapters/" in the path does not match: a digit must follow.
        match = compile(r'chapter(\d+)').search(url)
        if match is None:
            return None
        digits = match.group(1)
        number = int(digits)
        if number <= 1:
            return None
        previous = 'chapter' + str(number - 1).zfill(len(digits))
        return url.replace(match.group(0), previous)
|
|
|
|
|
|
|
|
|
2016-03-31 21:13:54 +00:00
|
|
|
class WormWorldSagaFrench(WormWorldSaga):
    """French variant of the WormWorldSaga scraper; only ``lang`` differs."""

    lang = 'fr'
|
|
|
|
|
|
|
|
|
2013-03-21 17:33:16 +00:00
|
|
|
class WormWorldSagaGerman(WormWorldSaga):
    """German variant of the WormWorldSaga scraper; only ``lang`` differs."""

    lang = 'de'
|
|
|
|
|
2016-03-31 21:13:54 +00:00
|
|
|
|
2013-03-21 17:33:16 +00:00
|
|
|
class WormWorldSagaSpanish(WormWorldSaga):
    """Spanish variant of the WormWorldSaga scraper; only ``lang`` differs."""

    lang = 'es'
|
2019-06-30 06:52:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Wrongside(_WPNavi):
    """Scraper definition for Wrongside (www.ayzewi.com).

    Strips are addressed through a ``comic`` query parameter; all other
    attributes are inherited from ``_WPNavi``.
    """

    url = 'http://www.ayzewi.com/comic/'
    stripUrl = url + '?comic=%s'
    firstStripUrl = stripUrl % 'intro-2'
|