dosage/dosagelib/plugins/w.py

239 lines
8.3 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
2014-01-05 15:50:57 +00:00
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
2012-11-21 20:57:26 +00:00
from __future__ import absolute_import, division, print_function
from re import compile, escape, IGNORECASE
2012-06-20 19:58:13 +00:00
2012-10-11 10:03:12 +00:00
from ..scraper import _BasicScraper
2012-11-26 06:13:32 +00:00
from ..util import tagre
2013-04-03 18:30:51 +00:00
from ..helpers import indirectStarter
from .common import _ComicControlScraper, _WordPressScraper
2012-06-20 19:58:13 +00:00
2013-03-06 19:21:10 +00:00
class WapsiSquare(_BasicScraper):
    """Scraper definition for the comic hosted at wapsisquare.com."""

    url = 'http://wapsisquare.com/'
    # Regex-escaped copy of the site root, embedded in the pattern below.
    rurl = escape(url)
    stripUrl = url + 'comic/%s/'
    firstStripUrl = stripUrl % '09092001'
    # Captures the src of an <img> located under <site root>comics/.
    imageSearch = compile(
        r'<img src="(%scomics/.+?)"' % rurl
    )
    # Captures the href of the anchor whose link text is "Previous".
    prevSearch = compile(
        r'<a href="(.+?)"[^>]+?>Previous</a>'
    )
    help = 'Index format: stripname'
2012-06-20 19:58:13 +00:00
2013-02-06 21:08:36 +00:00
class WastedTalent(_BasicScraper):
    """Scraper definition for the comic hosted at www.wastedtalent.ca."""

    url = 'http://www.wastedtalent.ca/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % 'anime-crack'
    # Captures absolute image URLs below the imagecache/comic_full/comics/
    # directory; the literal is split only to keep the line short.
    imageSearch = compile(tagre(
        "img", "src",
        r'(http://www\.wastedtalent\.ca/sites/default/files/imagecache/'
        r'comic_full/comics/\d+/[^"]+)'))
    # Captures a site-relative /comic/... href; "comic_prev" is handed to
    # tagre() as its `after` argument.
    prevSearch = compile(tagre(
        "a", "href", r'(/comic/[^"]+)', after="comic_prev"))
    help = 'Index format: stripname'
2013-03-06 19:21:10 +00:00
class WayfarersMoon(_BasicScraper):
    """Scraper definition for the comic hosted at www.wayfarersmoon.com."""

    url = 'http://www.wayfarersmoon.com/'
    stripUrl = url + 'index.php?page=%s'
    firstStripUrl = stripUrl % '0'
    # Captures a site-relative image src that starts with /admin.
    imageSearch = compile(
        r'<img src="(/admin.+?)"'
    )
    # Captures the href of an anchor that is followed by btn_back.gif.
    prevSearch = compile(
        r'<a href="(.+?)".+?btn_back.gif'
    )
    help = 'Index format: nn'
2013-04-03 18:30:51 +00:00
class WebDesignerCOTW(_BasicScraper):
    """Scraper definition for the "comics of the week" posts hosted at
    www.webdesignerdepot.com."""

    url = 'http://www.webdesignerdepot.com/'
    # Regex-escaped copy of the site root, embedded in the patterns below.
    rurl = escape(url)
    starter = indirectStarter
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2009/11/comics-of-the-week-1'
    # Several upload path layouts exist on the netdna host; each pattern
    # below captures one of them.
    imageSearch = (
        compile(tagre("img", "src", r'(http://netdna\.webdesignerdepot\.com/uploads/\d+/\d+/\d+s?\.[^"]+)')),
        compile(tagre("img", "src", r'(http://netdna\.webdesignerdepot\.com/uploads/\d+/\d+/Christmas\d+\.[^"]+)')),
        compile(tagre("img", "src", r'(http://netdna\.webdesignerdepot\.com/uploads/comics\d+[a-z0-9]*/\d+a?\.[^"]+)')),
        compile(tagre("img", "src", r'(http://netdna\.webdesignerdepot\.com/uploads/comics/\d+\.[^"]+)')),
    )
    multipleImagesPerStrip = True
    # Captures a single-quoted <link href> below the site root; 'prev' is
    # handed to tagre() as its `before` argument.
    prevSearch = compile(tagre("link", "href", r"(%s\d+/\d+/[^']+)" % rurl,
                               before='prev', quote="'"))
    # Captures an <a href> of the form <site root>NNNN/NN/<name>/.
    latestSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+/)' % rurl))
    help = 'Index format: yyyy/mm/stripname'

    def shouldSkipUrl(self, url, data):
        """Skip non-comic URLs."""
        return 'comics-of-the-week' not in url

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Build an image name of the form "<week>-<file name>".

        The week number is taken from the first "week-<digits>" occurrence
        in pageUrl and the file name is the last path segment of imageUrl.
        If pageUrl carries no week number, the bare file name is returned
        instead of failing on a missing regex match.
        """
        # [-1] (rather than [1]) also copes with an imageUrl without '/'.
        imagename = imageUrl.rsplit('/', 1)[-1]
        match = compile(r'week-(\d+)').search(pageUrl)
        if match is None:
            return imagename
        return "%s-%s" % (match.group(1), imagename)
2013-03-06 19:21:10 +00:00
class WeCanSleepTomorrow(_BasicScraper):
    """Scraper definition for the comic hosted at wecansleeptomorrow.com."""

    url = 'http://wecansleeptomorrow.com/'
    # Regex-escaped copy of the site root, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = url + '%s/'
    # Captures an absolute image src below <site root>comics/.
    imageSearch = compile(tagre(
        "img", "src", r'(%scomics/[^"]+)' % rurl))
    # Captures an absolute href below the site root; "prev" is handed to
    # tagre() as its `after` argument.
    prevSearch = compile(tagre(
        "a", "href", r'(%s[^"]+)' % rurl, after="prev"))
    help = 'Index format: yyyy/mm/dd/stripname'
class Weregeek(_BasicScraper):
    """Scraper definition for the comic hosted at www.weregeek.com."""

    url = 'http://www.weregeek.com/'
    # Regex-escaped copy of the site root, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = url + '%s/'
    # stripUrl already appends the trailing slash, so the index must not
    # carry one; otherwise the result ends in '//' and cannot equal any
    # URL captured by prevSearch (which ends in a single '/').
    firstStripUrl = stripUrl % '2006/11/27'
    # Captures an absolute image src below <site root>comics/ whose file
    # name starts with three dash-separated numbers.
    imageSearch = compile(tagre("img", "src",
                                r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
    # Captures an absolute or site-relative NNNN/NN/NN/ href whose anchor
    # is followed (after optional whitespace) by the previous_day.gif
    # image. Raw literals keep the backslash without relying on Python
    # passing an unrecognised escape sequence through.
    prevSearch = compile(tagre("a", "href", r'((%s)?/?\d+/\d+/\d+/)' % rurl) +
                         r'\s*' + tagre('img', 'src', r'[^"]*previous_day.gif'))
    help = 'Index format: yyyy/mm/dd'
2012-06-20 19:58:13 +00:00
class WhiteNinja(_BasicScraper):
    """Scraper definition for the comic hosted at
    www.whiteninjacomics.com."""

    baseUrl = 'http://www.whiteninjacomics.com/'
    url = baseUrl + 'comics.shtml'
    stripUrl = baseUrl + 'comics/%s.shtml'
    # Captures an unquoted /images/comics/*.gif src; the negative lookahead
    # rejects file names that begin with "t-".
    imageSearch = compile(
        r'<img src=(/images/comics/(?!t-).+?\.gif) border=0'
    )
    # Captures a /comics/...shtml path that is followed by "previous".
    prevSearch = compile(
        r'(/comics/.+?shtml).+?previous'
    )
    help = 'Index format: s (comic name)'
2013-04-10 16:36:33 +00:00
class WhiteNoise(_BasicScraper):
    """Scraper definition for the comic hosted at www.wncomic.com."""

    baseUrl = 'http://www.wncomic.com/'
    url = baseUrl + 'archive.php'
    stripUrl = baseUrl + 'archive_comments.php?strip_id=%s'
    firstStripUrl = stripUrl % '1'
    # Captures a relative istrip_files/strips/... path up to the closing
    # double quote.
    imageSearch = compile(
        r'(istrip_files/strips/.+?)"'
    )
    # Captures the href of the anchor that wraps images/top_back.jpg.
    prevSearch = compile(
        r'</a><a href="(.+?)"><img src="images/top_back.jpg" '
    )
    help = 'Index format: n'
class Whomp(_ComicControlScraper):
    """Scraper definition for the comic hosted at www.whompcomic.com."""

    url = 'http://www.whompcomic.com/'
    firstStripUrl = url + 'comic/06152010'
    # XPath expression selecting the title attribute of the <img> element
    # whose id is "cc-comic".
    textSearch = '//img[@id="cc-comic"]/@title'
2014-06-24 18:48:49 +00:00
2012-06-20 19:58:13 +00:00
class WhyTheLongFace(_BasicScraper):
    """Scraper definition for the "wtlf" pages hosted at
    www.absurdnotions.org."""

    baseUrl = 'http://www.absurdnotions.org/'
    # Regex-escaped copy of the site root, embedded in the pattern below.
    rurl = escape(baseUrl)
    url = baseUrl + 'wtlf200709.html'
    stripUrl = baseUrl + 'wtlf%s.html'
    firstStripUrl = stripUrl % '200306'
    # Case-insensitively captures either an absolute <site root>wtlf...
    # src or a relative lf<digits>.<ext> src.
    imageSearch = compile(
        r'<img src="(%swtlf.+?|lf\d+.\w{1,4})"' % rurl,
        IGNORECASE,
    )
    multipleImagesPerStrip = True
    # Captures the (upper-case) HREF of the anchor wrapping nprev.gif.
    prevSearch = compile(
        r'HREF="(.+?)"><IMG SRC="nprev.gif" '
    )
    help = 'Index format: yyyymm'
class Wigu(_BasicScraper):
    """Scraper definition for the comic hosted at wigucomics.com."""

    url = 'http://wigucomics.com/'
    stripUrl = url + 'oc/index.php?comic=%s'
    firstStripUrl = stripUrl % '1'
    # Captures a site-relative image src below /oc/comics/.
    imageSearch = compile(tagre(
        "img", "src", r'(/oc/comics/[^"]+)'))
    # Captures a site-relative /oc/index.php?comic=<n> href; "go back" is
    # handed to tagre() as its `after` argument.
    prevSearch = compile(tagre(
        "a", "href", r'(/oc/index\.php\?comic=\d+)', after="go back"))
    help = 'Index format: n'
2012-06-20 19:58:13 +00:00
2013-02-06 21:08:36 +00:00
class Wonderella(_BasicScraper):
    """Scraper definition for the comic hosted at nonadventures.com."""

    url = 'http://nonadventures.com/'
    # Regex-escaped copy of the site root, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2006/09/09/the-torment-of-a-thousand-yesterdays'
    # Three concatenated pieces: a <div id=comic> tag quoted with either
    # quote character, optional whitespace, then an <img> whose src below
    # <site root>comics/ is captured.
    imageSearch = compile(
        tagre("div", "id", r"comic", quote=r'["\']') +
        r"\s*" +
        tagre("img", "src", r'(%scomics/[^"]+)' % rurl)
    )
    # Captures an absolute <site root>NNNN/NN/NN/<name> href; "prev" is
    # handed to tagre() as its `after` argument.
    prevSearch = compile(tagre(
        "a", "href", r'(%s\d+/\d+/\d+/[^"]+)' % rurl, after="prev"))
    help = 'Index format: yyyy/mm/dd/name'
2013-04-10 16:36:33 +00:00
class Wondermark(_BasicScraper):
    """Scraper definition for the comic hosted at wondermark.com."""

    url = 'http://wondermark.com/'
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '001'
    # Captures an absolute image src below http://wondermark.com/c/.
    imageSearch = compile(
        r'<img src="(http://wondermark.com/c/.+?)"'
    )
    # Captures the href of an anchor that carries rel="prev".
    prevSearch = compile(
        r'<a href="(.+?)" rel="prev">'
    )
    help = 'Index format: nnn'
class WorldOfMrToast(_BasicScraper):
    """Scraper definition for the comic pages hosted at
    www.theimaginaryworld.com.

    Navigation is driven by the hard-coded `prevurls` tuple rather than by
    a regex search of the page.
    """

    baseUrl = 'http://www.theimaginaryworld.com/'
    url = baseUrl + 'mrTcomicA.html'
    stripUrl = baseUrl + '%s.html'
    # Captures a relative image src that starts with "comic".
    imageSearch = compile(tagre("img", "src", r'(comic[^"]+)'))
    # list the archive links since there is no prev/next navigation
    prevurls = (
        url,
        baseUrl + 'mrTcomicW02.html',
        baseUrl + 'mrTcomicW01.html',
        baseUrl + 'mrGcomic03.html',
        baseUrl + 'mrGcomic02.html',
        baseUrl + 'mrGcomic01.html',
        baseUrl + 'mrTcomicT05.html',
        baseUrl + 'mrTcomicT04.html',
        baseUrl + 'mrTcomicT03.html',
        baseUrl + 'mrTcomicT02.html',
        baseUrl + 'mrTcomicT01.html',
        baseUrl + 'mrTcomicIW3.html',
        baseUrl + 'mrTcomicIW2.html',
        baseUrl + 'mrTcomicIW1.html',
    )
    firstStripUrl = prevurls[-1]
    multipleImagesPerStrip = True
    help = 'Index format: none'

    def getPrevUrl(self, url, data, baseUrl=None):
        """Return the entry that follows `url` in `prevurls`.

        `data` and `baseUrl` are accepted but not used; `baseUrl` is
        optional so the method can be called with either two or three
        arguments.

        Returns None when `url` is the last entry of `prevurls` or is not
        listed in it at all.
        """
        try:
            idx = self.prevurls.index(url)
        except ValueError:
            # Unknown page: there is nothing to navigate to.
            return None
        try:
            return self.prevurls[idx + 1]
        except IndexError:
            return None
class WorldOfWarcraftEh(_WordPressScraper):
    """Scraper definition for the comic hosted at woweh.com."""

    # Only the site root is set here; nothing else is overridden.
    url = 'http://woweh.com/'
2013-03-21 17:33:16 +00:00
class WormWorldSaga(_BasicScraper):
    """Scraper definition for the chapters hosted at
    www.wormworldsaga.com.

    Each chapter is a single page containing many images; navigation walks
    backwards by decrementing the chapter number in the URL.
    """

    url = 'http://www.wormworldsaga.com/'
    stripUrl = url + 'chapters/%s/index.php'
    firstStripUrl = stripUrl % 'chapter01/EN'
    # Two relative image locations are captured: images/CH<n>_<n>.<ext>
    # and panels/CH<n>_...
    imageSearch = (
        compile(tagre("img", "src", r'(images/CH\d+_\d+\.[^"]+)')),
        compile(tagre("img", "src", r'(panels/CH\d+_[^"]+)')),
    )
    # Number of the newest chapter; starter() begins there.
    latestChapter = 5
    multipleImagesPerStrip = True

    def starter(self):
        """Return the URL of the latest chapter in this scraper's language.

        NOTE(review): `self.lang` is not defined in this class; it is
        presumably supplied by the base class or by a subclass -- confirm.
        """
        return '%schapters/chapter%02d/%s/index.php' % (
            self.url, self.latestChapter, self.lang.upper())

    def getPrevUrl(self, url, data):
        """Find previous URL.

        The "chapterNN" component of `url` is replaced by the preceding
        chapter number, whatever `latestChapter` is set to. Returns None
        for chapter 01 or when `url` contains no chapter number. `data`
        is not used.
        """
        # "chapters/" in the path is not matched because digits must
        # follow "chapter" directly.
        match = compile(r'chapter(\d+)').search(url)
        if match is None:
            return None
        chapter = int(match.group(1))
        if chapter <= 1:
            return None
        return url.replace(match.group(0), 'chapter%02d' % (chapter - 1))
class WormWorldSagaFrench(WormWorldSaga):
    """WormWorldSaga variant with the language code set to 'fr'."""

    # Upper-cased by WormWorldSaga.starter() when it builds the start URL.
    lang = 'fr'
2013-03-21 17:33:16 +00:00
class WormWorldSagaGerman(WormWorldSaga):
    """WormWorldSaga variant with the language code set to 'de'."""

    # Upper-cased by WormWorldSaga.starter() when it builds the start URL.
    lang = 'de'
2013-03-21 17:33:16 +00:00
class WormWorldSagaSpanish(WormWorldSaga):
    """WormWorldSaga variant with the language code set to 'es'."""

    # Upper-cased by WormWorldSaga.starter() when it builds the start URL.
    lang = 'es'