From 975d2376bf050e4b9a20a88d3af2dc33a1a7e550 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sat, 7 May 2016 01:50:10 +0200 Subject: [PATCH] Another round of comic module fixes. --- dosagelib/plugins/c.py | 14 +------ dosagelib/plugins/common.py | 18 +++++++++ dosagelib/plugins/f.py | 17 ++------- dosagelib/plugins/j.py | 8 +--- dosagelib/plugins/t.py | 73 ++++++++++++------------------------- 5 files changed, 49 insertions(+), 81 deletions(-) diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py index bb50fb853..79029d2e3 100755 --- a/dosagelib/plugins/c.py +++ b/dosagelib/plugins/c.py @@ -10,7 +10,7 @@ from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper from ..helpers import bounceStarter, indirectStarter from ..util import tagre -from .common import _WordPressScraper, xpath_class +from .common import _TumblrScraper, _WordPressScraper, xpath_class class Caggage(_BasicScraper): @@ -207,24 +207,14 @@ class CigarroAndCerveja(_ParserScraper): prevSearch = '//a[contains(text()," Prev")]', -class Collar6(_ParserScraper): +class Collar6(_TumblrScraper): url = 'http://collar6.tumblr.com/' firstStripUrl = url + 'post/138117470810/the-very-first-strip-from-when-i-thought-it-was' imageSearch = '//figure[@class="photo-hires-item"]//img' prevSearch = '//a[@class="previous-button"]' latestSearch = '//li[@class="timestamp"]/a' - starter = indirectStarter adult = True - def namer(self, image_url, page_url): - # tumblr URLs: http://host/post/num/name - # 0 1 2 3 4 5 - parts = page_url.split('/') - if len(parts) > 5: - return '%s_%s' % (parts[4], parts[5]) - else: - return parts[4] - class Comedity(_BasicScraper): url = 'http://www.comedity.com/' diff --git a/dosagelib/plugins/common.py b/dosagelib/plugins/common.py index a2491c5d4..ae693a340 100644 --- a/dosagelib/plugins/common.py +++ b/dosagelib/plugins/common.py @@ -6,6 +6,7 @@ from __future__ import absolute_import, division, print_function from ..scraper import _ParserScraper +from ..helpers import indirectStarter # Common base classes for comics with the same structure (same hosting # software, for example) go here. Since those are shared by many modules, @@ -35,3 +36,20 @@ class _WPNaviIn(_WordPressScraper): class _ComicControlScraper(_ParserScraper): imageSearch = '//img[@id="cc-comic"]' prevSearch = '//a[@rel="prev"]' + + +class _TumblrScraper(_ParserScraper): + starter = indirectStarter + + def namer(self, image_url, page_url): + # tumblr URLs: http://host/post/num/name + # 0 1 2 3 4 5 + parts = page_url.split('/') + if len(parts) > 5: + return '%s_%s' % (parts[4], parts[5]) + else: + return parts[4] + + def shouldSkipUrl(self, url, data): + """Reblogged stuff is iframed""" + return data.xpath('//div[@id="post"]//iframe') diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index e995a8df0..16d507d71 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -46,16 +46,6 @@ class FauxPas(_BasicScraper): help = 'Index format: nnn' -class FilibusterCartoons(_BasicScraper): - url = 'http://www.filibustercartoons.com/' - rurl = escape(url) - stripUrl = url + 'index.php/%s' - firstStripUrl = stripUrl % '2001/06/28/poor-jean' - imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)) - prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev")) - help = 'Index format: yyyy/mm/dd/name' - - class FireflyCross(_WordPressScraper): url = 'http://www.fireflycross.pensandtales.com/' firstStripUrl = url + '?comic=05062002' @@ -101,14 +91,13 @@ class Flipside(_BasicScraper): help = 'Index format: nnnn' -class FonFlatter(_BasicScraper): +class FonFlatter(_ParserScraper): url = 'http://www.fonflatter.de/' - rurl = escape(url) stripUrl = url + '%s/' firstStripUrl = stripUrl % '2005/09/20/01-begegnung-mit-batman' lang = 'de' - imageSearch = compile(r'src="(%s\d+/fred_\d+-\d+-\d+[^"]+)' % rurl) - prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev")) + imageSearch = r'//img[re:test(@src, "/fred_\d+")]' + prevSearch = '//a[@rel="prev"]' help = 'Index format: yyyy/mm/dd/number-stripname' def shouldSkipUrl(self, url, data): diff --git a/dosagelib/plugins/j.py b/dosagelib/plugins/j.py index 51e3a0bd3..8a7c594ee 100644 --- a/dosagelib/plugins/j.py +++ b/dosagelib/plugins/j.py @@ -10,6 +10,7 @@ from re import compile, escape from ..scraper import _BasicScraper from ..util import tagre from ..helpers import indirectStarter +from .common import _ComicControlScraper class JackCannon(_BasicScraper): @@ -51,13 +52,8 @@ class JoeAndMonkey(_BasicScraper): help = 'Index format: nnn' -class JohnnyWander(_BasicScraper): +class JohnnyWander(_ComicControlScraper): url = 'http://www.johnnywander.com/' - stripUrl = url + 'comics/%s' - firstStripUrl = stripUrl % '423' - imageSearch = compile(tagre("img", "src", r'(http://www\.johnnywander\.com/files/comics/[^"]+)')) - prevSearch = compile(tagre("a", "href", r'(/comics/\d+)') + r'prev') - help = 'Index format: nnn' class JustAnotherEscape(_BasicScraper): diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index 1504903a4..197844013 100644 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -4,11 +4,13 @@ # Copyright (C) 2015-2016 Tobias Gruetzmacher from __future__ import absolute_import, division, print_function + from re import compile, escape, IGNORECASE + from ..scraper import _BasicScraper, _ParserScraper from ..helpers import indirectStarter from ..util import tagre -from .common import _WordPressScraper +from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper class TheBrads(_BasicScraper): @@ -50,9 +52,8 @@ class TheGentlemansArmchair(_WordPressScraper): class TheLandscaper(_BasicScraper): - url = 'http://landscaper.visual-assault.net/comic/latest' - rurl = escape(url) - stripUrl = url + 'comic/%s' + stripUrl = 'http://landscaper.visual-assault.net/comic/%s' + url = stripUrl % 'latest' firstStripUrl = stripUrl % '1' imageSearch = compile(tagre("img", "src", r'(/comics/comic/comic_page/[^"]+)')) @@ -65,14 +66,11 @@ class TheMelvinChronicles(_WordPressScraper): url = 'http://melvin.jeaniebottle.com/' -class TheNoob(_BasicScraper): - url = 'http://www.thenoobcomic.com/index.php' - stripUrl = url + '?pos=%s' +class TheNoob(_WordPressScraper): + url = 'http://thenoobcomic.com/' + stripUrl = url + 'comic/%s/' firstStripUrl = stripUrl % '1' - imageSearch = compile(tagre("img", "src", r'(/headquarters/comics/[^"]+)')) - prevSearch = compile(tagre("a", "href", r'(\?pos=\d+)', - before="comic_nav_previous_button")) - help = 'Index format: nnnn' + help = 'Index format: n (unpadded)' class TheOrderOfTheStick(_BasicScraper): @@ -100,33 +98,21 @@ class TheParkingLotIsFull(_BasicScraper): help = 'Index format: nnn' -class TheThinHLine(_BasicScraper): +class TheThinHLine(_TumblrScraper): url = 'http://thinhline.tumblr.com/' - rurl = escape(url) - stripUrl = url + 'post/%s' - firstStripUrl = stripUrl % '3517345105' - imageSearch = compile(tagre('img', 'data-src', - r'([^"]+media.tumblr.com/[^"]+)', - before='content-image')) - prevSearch = compile(tagre("a", "href", r'([^"]+)') + '>') - latestSearch = compile(tagre("a", "href", r'([^"]+)', - after='class="timestamp"')) - starter = indirectStarter + firstStripUrl = url + 'post/4177372348/thl-1-a-cats-got-his-tongue-click-on-the' + imageSearch = '//img[@id="content-image"]/@data-src' + prevSearch = '//div[@id="pagination"]/a[text()=">"]' + latestSearch = '//a[@class="timestamp"]' adult = True - indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl)) + indirectImageSearch = '//div[@id="post"]//a[not(@rel) and img]' def getComicStrip(self, url, data): """The comic strip image is in a separate page.""" - pageUrl = self.fetchUrl(url, data, self.indirectImageSearch) - pageData = self.getPage(pageUrl) - return super(TheThinHLine, self).getComicStrip(pageUrl, pageData) - - def namer(self, image_url, page_url): - """Use page URL sequence which is apparently increasing.""" - num = page_url.split('/')[-1] - ext = image_url.rsplit('.', 1)[1] - return "thethinhline-%s.%s" % (num, ext) + subPage = self.fetchUrl(url, data, self.indirectImageSearch) + pageData = self.getPage(subPage) + return super(TheThinHLine, self).getComicStrip(subPage, pageData) class TheWhiteboard(_BasicScraper): @@ -137,13 +123,9 @@ class TheWhiteboard(_BasicScraper): help = 'Index format: twb or wb + n wg. twb1000' -class TheWotch(_BasicScraper): +class TheWotch(_WordPressScraper): url = 'http://www.thewotch.com/' - stripUrl = url + '?date=%s' - firstStripUrl = stripUrl % '2002-11-21' - imageSearch = compile(r"