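Fix some comics.

Rename ComicStrip.parentUrl to stripUrl, let regexNamer return None when the
regex does not match, repair the search patterns of several scrapers (Alice,
BadlyDrawnKitties, ButternutSquash, BookOfBiff, Catena, CoolCatStudio,
CloneManga), remove Comet7, and test that retrieved strip URLs match stripUrl.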

2012-11-13 19:12:28 +01:00 · 2012-11-13 19:12:28 +01:00 · eba2f0089d
commit eba2f0089d
parent 5006ed7f40
6 changed files with 24 additions and 25 deletions
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -17,10 +17,10 @@ class FetchComicError(IOError):
 class ComicStrip(object):
    """A list of comic image URLs."""

-    def __init__(self, name, parentUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer):
        """Store the image URL list."""
        self.name = name
-        self.parentUrl = parentUrl
+        self.stripUrl = stripUrl
        self.imageUrls = imageUrls
        self.namer = namer

@@ -31,10 +31,10 @@ class ComicStrip(object):

    def getDownloader(self, url):
        """Get an image downloader."""
-        filename = self.namer(url, self.parentUrl)
+        filename = self.namer(url, self.stripUrl)
        if filename is None:
            filename = url.rsplit('/', 1)[1]
-        return ComicImage(self.name, url, self.parentUrl, filename)
+        return ComicImage(self.name, url, self.stripUrl, filename)


 class ComicImage(object):
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -19,7 +19,9 @@ def regexNamer(regex):
    """Get name from regular expression."""
    @staticmethod
    def _namer(imageUrl, pageUrl):
-        return regex.search(imageUrl).group(1)
+        mo = regex.search(imageUrl)
+        if mo:
+            return mo.group(1)
    return _namer


--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -73,7 +73,7 @@ class Alice(_BasicScraper):
    latestUrl = 'http://alice.alicecomics.com/'
    stripUrl = 'http://alice.alicecomics.com/wp-content/webcomic/alicecomics/%s.jpg'
    imageSearch = compile(tagre("img", "src", r'(http://alice\.alicecomics\.com/wp-content/webcomic/alicecomics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://alice.alicecomics.com/archive/[^!]+)', after="previous"))
+    prevSearch = compile(tagre("a", "href", r'(http://alice.alicecomics.com/archive/[^"]+)', after="previous"))
    help = 'Index format: yyyy-mm-dd'


--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@@ -10,7 +10,7 @@ class BadlyDrawnKitties(_BasicScraper):
    latestUrl = 'http://www.badlydrawnkitties.com/'
    stripUrl = 'http://www.badlydrawnkitties.com/new/%s.html'
    imageSearch = compile(r'<img src="(/new/.+?)">')
-    prevSearch = compile(r'"(/new/.+?)".+?previous.gif')
+    prevSearch = compile(tagre("a", "href", r'(/[^"]+)') + tagre("img", "src", r'/images/previous\.gif'))
    help = 'Index format: n (unpadded)'


@@ -123,7 +123,7 @@ class ButternutSquash(_BasicScraper):
    latestUrl = 'http://www.butternutsquash.net/'
    stripUrl = 'http://www.butternutsquash.net/v3/%s'
    imageSearch = compile(tagre("img", "src", r'(http://www\.butternutsquash\.net/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^!]+)', after="prev"))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^"]+)', after="prev"))
    help = 'Index format: yyyy/mm/dd/strip-name-author-name'


@@ -245,8 +245,8 @@ class BloodBound(_BasicScraper):
 class BookOfBiff(_BasicScraper):
    latestUrl = 'http://www.thebookofbiff.com/'
    stripUrl = 'http://www.thebookofbiff.com/%s'
-    imageSearch = compile(r'<img src="(http://www.thebookofbiff.com/comics/.+?)"')
-    prevSearch = compile(r'<a href="(http://www.thebookofbiff.com/.+?)">&#9668; Previous</a>')
+    imageSearch = compile(tagre("img", "src", r'([^"]+/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'([^"]+)', after="Previous"))
    help = 'Index format: yyyy/mm/dd/stripnum-strip-name'


--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@@ -52,7 +52,7 @@ class Catena(_BasicScraper):
    latestUrl = 'http://catenamanor.com/'
    stripUrl = 'http://catenamanor.com/%s.gif'
    imageSearch = compile(tagre("img", "src", r'(http://catenamanor\.com/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'[^"]+', after='rel="prev"'))
+    prevSearch = compile(tagre("a", "href", r'([^"]+)', after='rel="prev"'))
    help = 'Index format: yyyy-mm-dd-<name>'


@@ -127,15 +127,6 @@ class Comedity(_BasicScraper):
    help = 'Index format: n (no padding)'


-class Comet7(_BasicScraper):
-    latestUrl = 'http://www.comet7.com/'
-    imageUrl = 'http://www.comet7.com/archive_page.php?id=%s'
-    imageSearch = compile(r'"(.*?/strips/.*?)"')
-    prevSearch = compile(r'"(.*?)".*?previous_stripf')
-    help = 'Index format: n (unpadded)'
-
-
-
 class Commissioned(_BasicScraper):
    latestUrl = 'http://www.commissionedcomic.com/'
    stripUrl = 'http://www.commissionedcomic.com/index.php?strip=%s'
@@ -148,8 +139,8 @@ class Commissioned(_BasicScraper):
 class CoolCatStudio(_BasicScraper):
    latestUrl = 'http://www.coolcatstudio.com/'
    stripUrl = 'http://www.coolcatstudio.com/index.php?p=%s'
-    imageSearch = compile(r'(/comics/.+?)"')
-    prevSearch = compile(r"href='(.+?)'>PREV")
+    imageSearch = compile(tagre("img", "src", r'(http://www.coolcatstudio.com/comics/[^"]+)'))
+    prevSearch = compile(tagre("a", "href", r'(http://www\.coolcatstudio\.com/strips-cat/[^"]+)', before="cniprevt"))
    help = 'Index format: n'


@@ -214,7 +205,7 @@ def cloneManga(name, shortName, lastStrip=None):
            name='CloneManga/' + name,
            starter=starter,
            stripUrl=stripUrl,
-            imageSearch=compile(tagre("img", "src", r'((?:%s)?/%s/[^"]+)' % (url, shortName), after="center")),
+            imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (url, shortName), after="center")),
            prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")),
            help='Index format: n',
            namer=namer)
--- a/tests/test_comics.py
+++ b/tests/test_comics.py
@@ -3,6 +3,7 @@
 # Copyright (C) 2012 Bastian Kleineidam
 import tempfile
 import shutil
+import re
 from itertools import islice
 from unittest import TestCase
 from dosagelib import scraper
@@ -18,7 +19,7 @@ class _ComicTester(TestCase):

    def test_comic(self):
        # Test a scraper. It must be able to traverse backward for
-        # at least 5 pages from the start, and find strip images
+        # at least 5 strips from the start, and find strip images
        # on at least 4 pages.
        scraperobj = self.scraperclass()
        num = empty = 0
@@ -27,7 +28,12 @@ class _ComicTester(TestCase):
            for image in strip.getImages():
                images += 1
                self.save(image)
-            if not images:
+            if images:
+                # test that the stripUrl regex matches the retrieved strip URL
+                urlmatch = re.escape(self.scraperclass.stripUrl).replace("%s", r".+")
+                mo = re.compile(urlmatch).match(strip.stripUrl)
+                self.check(mo is not None, 'strip URL %r does not match %r' % (strip.stripUrl, self.scraperclass.stripUrl))
+            else:
                empty += 1
            num += 1
        self.check(num >= 4, 'traversal failed after %d strips.' % num)