Rename ComicStrip.parentUrl to stripUrl and tighten comic scraper patterns.

Review changes relative to the submitted patch:
* tests/test_comics.py: build the strip URL pattern from separately escaped
  literal parts. re.escape() escapes "%" on Python versions before 3.7, so
  replacing "%s" after escaping left a stray backslash and yielded "\.+".
* plugins/a.py (Alice), plugins/c.py (CoolCatStudio): escape the literal
  dots in the host names of the new patterns.

Notes:
* This copy was reconstructed from a flattened listing; leading whitespace
  and blank context lines are inferred and must be checked against the tree.
* The BookOfBiff hunk declares 8 old lines but only 7 survive in the listing
  (the removed imageSearch/prevSearch lines look merged), and a few string
  literals appear truncated; they are reproduced as found.

diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index f1924e2d5..563fb705e 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -17,10 +17,10 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""
 
-    def __init__(self, name, parentUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer):
         """Store the image URL list."""
         self.name = name
-        self.parentUrl = parentUrl
+        self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
 
@@ -31,10 +31,10 @@ class ComicStrip(object):
 
     def getDownloader(self, url):
         """Get an image downloader."""
-        filename = self.namer(url, self.parentUrl)
+        filename = self.namer(url, self.stripUrl)
         if filename is None:
             filename = url.rsplit('/', 1)[1]
-        return ComicImage(self.name, url, self.parentUrl, filename)
+        return ComicImage(self.name, url, self.stripUrl, filename)
 
 
 class ComicImage(object):
diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py
index 021155632..64f16d8c1 100644
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -19,7 +19,9 @@ def regexNamer(regex):
     """Get name from regular expression."""
     @staticmethod
     def _namer(imageUrl, pageUrl):
-        return regex.search(imageUrl).group(1)
+        mo = regex.search(imageUrl)
+        if mo:
+            return mo.group(1)
     return _namer
 
 
diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index a771feecd..5591683aa 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -73,7 +73,7 @@ class Alice(_BasicScraper):
     latestUrl = 'http://alice.alicecomics.com/'
     stripUrl = 'http://alice.alicecomics.com/wp-content/webcomic/alicecomics/%s.jpg'
     imageSearch = compile(tagre("img", "src", r'(http://alice\.alicecomics\.com/wp-content/webcomic/alicecomics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://alice.alicecomics.com/archive/[^!]+)', after="previous"))
+    prevSearch = compile(tagre("a", "href", r'(http://alice\.alicecomics\.com/archive/[^"]+)', after="previous"))
     help = 'Index format: yyyy-mm-dd'
 
 
diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py
index 2c12093d7..2fd6c87c3 100644
--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@@ -10,7 +10,7 @@ class BadlyDrawnKitties(_BasicScraper):
     latestUrl = 'http://www.badlydrawnkitties.com/'
     stripUrl = 'http://www.badlydrawnkitties.com/new/%s.html'
     imageSearch = compile(r'')
-    prevSearch = compile(r'"(/new/.+?)".+?previous.gif')
+    prevSearch = compile(tagre("a", "href", r'(/[^"]+)') + tagre("img", "src", r'/images/previous\.gif'))
     help = 'Index format: n (unpadded)'
 
 
@@ -123,7 +123,7 @@ class ButternutSquash(_BasicScraper):
     latestUrl = 'http://www.butternutsquash.net/'
     stripUrl = 'http://www.butternutsquash.net/v3/%s'
     imageSearch = compile(tagre("img", "src", r'(http://www\.butternutsquash\.net/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^!]+)', after="prev"))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^"]+)', after="prev"))
     help = 'Index format: yyyy/mm/dd/strip-name-author-name'
 
 
@@ -245,8 +245,8 @@ class BloodBound(_BasicScraper):
 class BookOfBiff(_BasicScraper):
     latestUrl = 'http://www.thebookofbiff.com/'
     stripUrl = 'http://www.thebookofbiff.com/%s'
-    imageSearch = compile(r'◄ Previous')
+    imageSearch = compile(tagre("img", "src", r'([^"]+/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'([^"]+)', after="Previous"))
     help = 'Index format: yyyy/mm/dd/stripnum-strip-name'
 
 
diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py
index a8c1d8ba5..77a4b97b4 100644
--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@@ -52,7 +52,7 @@ class Catena(_BasicScraper):
     latestUrl = 'http://catenamanor.com/'
     stripUrl = 'http://catenamanor.com/%s.gif'
     imageSearch = compile(tagre("img", "src", r'(http://catenamanor\.com/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'[^"]+', after='rel="prev"'))
+    prevSearch = compile(tagre("a", "href", r'([^"]+)', after='rel="prev"'))
     help = 'Index format: yyyy-mm-dd-'
 
 
@@ -127,15 +127,6 @@ class Comedity(_BasicScraper):
     help = 'Index format: n (no padding)'
 
 
-class Comet7(_BasicScraper):
-    latestUrl = 'http://www.comet7.com/'
-    imageUrl = 'http://www.comet7.com/archive_page.php?id=%s'
-    imageSearch = compile(r'"(.*?/strips/.*?)"')
-    prevSearch = compile(r'"(.*?)".*?previous_stripf')
-    help = 'Index format: n (unpadded)'
-
-
-
 class Commissioned(_BasicScraper):
     latestUrl = 'http://www.commissionedcomic.com/'
     stripUrl = 'http://www.commissionedcomic.com/index.php?strip=%s'
@@ -148,8 +139,8 @@ class Commissioned(_BasicScraper):
 class CoolCatStudio(_BasicScraper):
     latestUrl = 'http://www.coolcatstudio.com/'
     stripUrl = 'http://www.coolcatstudio.com/index.php?p=%s'
-    imageSearch = compile(r'(/comics/.+?)"')
-    prevSearch = compile(r"href='(.+?)'>PREV")
+    imageSearch = compile(tagre("img", "src", r'(http://www\.coolcatstudio\.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.coolcatstudio\.com/strips-cat/[^"]+)', before="cniprevt"))
     help = 'Index format: n'
 
 
@@ -214,7 +205,7 @@ def cloneManga(name, shortName, lastStrip=None):
         name='CloneManga/' + name,
         starter=starter,
         stripUrl=stripUrl,
-        imageSearch=compile(tagre("img", "src", r'((?:%s)?/%s/[^"]+)' % (url, shortName), after="center")),
+        imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (url, shortName), after="center")),
         prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")),
         help='Index format: n',
         namer=namer)
diff --git a/tests/test_comics.py b/tests/test_comics.py
index 35b735081..af7d2432b 100644
--- a/tests/test_comics.py
+++ b/tests/test_comics.py
@@ -3,6 +3,7 @@
 # Copyright (C) 2012 Bastian Kleineidam
 import tempfile
 import shutil
+import re
 from itertools import islice
 from unittest import TestCase
 from dosagelib import scraper
@@ -18,7 +19,7 @@ class _ComicTester(TestCase):
 
     def test_comic(self):
         # Test a scraper. It must be able to traverse backward for
-        # at least 5 pages from the start, and find strip images
+        # at least 5 strips from the start, and find strip images
         # on at least 4 pages.
         scraperobj = self.scraperclass()
         num = empty = 0
@@ -27,7 +28,15 @@ class _ComicTester(TestCase):
             for image in strip.getImages():
                 images += 1
                 self.save(image)
-            if not images:
+            if images:
+                # Test that the stripUrl pattern matches the retrieved strip URL.
+                # Escape the literal parts separately: re.escape() may also
+                # escape the "%" of the "%s" placeholder.
+                parts = self.scraperclass.stripUrl.split("%s")
+                urlmatch = r".+".join(re.escape(part) for part in parts)
+                mo = re.compile(urlmatch).match(strip.stripUrl)
+                self.check(mo is not None, 'strip URL %r does not match %r' % (strip.stripUrl, self.scraperclass.stripUrl))
+            else:
                 empty += 1
             num += 1
         self.check(num >= 4, 'traversal failed after %d strips.' % num)