Fix some comics.

This commit is contained in:
Bastian Kleineidam 2012-11-13 19:12:28 +01:00
parent 5006ed7f40
commit eba2f0089d
6 changed files with 24 additions and 25 deletions

View file

@ -17,10 +17,10 @@ class FetchComicError(IOError):
class ComicStrip(object): class ComicStrip(object):
"""A list of comic image URLs.""" """A list of comic image URLs."""
def __init__(self, name, parentUrl, imageUrls, namer): def __init__(self, name, stripUrl, imageUrls, namer):
"""Store the image URL list.""" """Store the image URL list."""
self.name = name self.name = name
self.parentUrl = parentUrl self.stripUrl = stripUrl
self.imageUrls = imageUrls self.imageUrls = imageUrls
self.namer = namer self.namer = namer
@ -31,10 +31,10 @@ class ComicStrip(object):
def getDownloader(self, url): def getDownloader(self, url):
"""Get an image downloader.""" """Get an image downloader."""
filename = self.namer(url, self.parentUrl) filename = self.namer(url, self.stripUrl)
if filename is None: if filename is None:
filename = url.rsplit('/', 1)[1] filename = url.rsplit('/', 1)[1]
return ComicImage(self.name, url, self.parentUrl, filename) return ComicImage(self.name, url, self.stripUrl, filename)
class ComicImage(object): class ComicImage(object):

View file

@ -19,7 +19,9 @@ def regexNamer(regex):
"""Get name from regular expression.""" """Get name from regular expression."""
@staticmethod @staticmethod
def _namer(imageUrl, pageUrl): def _namer(imageUrl, pageUrl):
return regex.search(imageUrl).group(1) mo = regex.search(imageUrl)
if mo:
return mo.group(1)
return _namer return _namer

View file

@ -73,7 +73,7 @@ class Alice(_BasicScraper):
latestUrl = 'http://alice.alicecomics.com/' latestUrl = 'http://alice.alicecomics.com/'
stripUrl = 'http://alice.alicecomics.com/wp-content/webcomic/alicecomics/%s.jpg' stripUrl = 'http://alice.alicecomics.com/wp-content/webcomic/alicecomics/%s.jpg'
imageSearch = compile(tagre("img", "src", r'(http://alice\.alicecomics\.com/wp-content/webcomic/alicecomics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://alice\.alicecomics\.com/wp-content/webcomic/alicecomics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://alice.alicecomics.com/archive/[^!]+)', after="previous")) prevSearch = compile(tagre("a", "href", r'(http://alice.alicecomics.com/archive/[^"]+)', after="previous"))
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'

View file

@ -10,7 +10,7 @@ class BadlyDrawnKitties(_BasicScraper):
latestUrl = 'http://www.badlydrawnkitties.com/' latestUrl = 'http://www.badlydrawnkitties.com/'
stripUrl = 'http://www.badlydrawnkitties.com/new/%s.html' stripUrl = 'http://www.badlydrawnkitties.com/new/%s.html'
imageSearch = compile(r'<img src="(/new/.+?)">') imageSearch = compile(r'<img src="(/new/.+?)">')
prevSearch = compile(r'"(/new/.+?)".+?previous.gif') prevSearch = compile(tagre("a", "href", r'(/[^"]+)') + tagre("img", "src", r'/images/previous\.gif'))
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -123,7 +123,7 @@ class ButternutSquash(_BasicScraper):
latestUrl = 'http://www.butternutsquash.net/' latestUrl = 'http://www.butternutsquash.net/'
stripUrl = 'http://www.butternutsquash.net/v3/%s' stripUrl = 'http://www.butternutsquash.net/v3/%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.butternutsquash\.net/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.butternutsquash\.net/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^!]+)', after="prev")) prevSearch = compile(tagre("a", "href", r'(http://www\.butternutsquash\.net/[^"]+)', after="prev"))
help = 'Index format: yyyy/mm/dd/strip-name-author-name' help = 'Index format: yyyy/mm/dd/strip-name-author-name'
@ -245,8 +245,8 @@ class BloodBound(_BasicScraper):
class BookOfBiff(_BasicScraper): class BookOfBiff(_BasicScraper):
latestUrl = 'http://www.thebookofbiff.com/' latestUrl = 'http://www.thebookofbiff.com/'
stripUrl = 'http://www.thebookofbiff.com/%s' stripUrl = 'http://www.thebookofbiff.com/%s'
imageSearch = compile(r'<img src="(http://www.thebookofbiff.com/comics/.+?)"') imageSearch = compile(tagre("img", "src", r'([^"]+/comics/[^"]+)'))
prevSearch = compile(r'<a href="(http://www.thebookofbiff.com/.+?)">&#9668; Previous</a>') prevSearch = compile(tagre("a", "href", r'([^"]+)', after="Previous"))
help = 'Index format: yyyy/mm/dd/stripnum-strip-name' help = 'Index format: yyyy/mm/dd/stripnum-strip-name'

View file

@ -52,7 +52,7 @@ class Catena(_BasicScraper):
latestUrl = 'http://catenamanor.com/' latestUrl = 'http://catenamanor.com/'
stripUrl = 'http://catenamanor.com/%s.gif' stripUrl = 'http://catenamanor.com/%s.gif'
imageSearch = compile(tagre("img", "src", r'(http://catenamanor\.com/comics/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://catenamanor\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'[^"]+', after='rel="prev"')) prevSearch = compile(tagre("a", "href", r'([^"]+)', after='rel="prev"'))
help = 'Index format: yyyy-mm-dd-<name>' help = 'Index format: yyyy-mm-dd-<name>'
@ -127,15 +127,6 @@ class Comedity(_BasicScraper):
help = 'Index format: n (no padding)' help = 'Index format: n (no padding)'
class Comet7(_BasicScraper):
latestUrl = 'http://www.comet7.com/'
imageUrl = 'http://www.comet7.com/archive_page.php?id=%s'
imageSearch = compile(r'"(.*?/strips/.*?)"')
prevSearch = compile(r'"(.*?)".*?previous_stripf')
help = 'Index format: n (unpadded)'
class Commissioned(_BasicScraper): class Commissioned(_BasicScraper):
latestUrl = 'http://www.commissionedcomic.com/' latestUrl = 'http://www.commissionedcomic.com/'
stripUrl = 'http://www.commissionedcomic.com/index.php?strip=%s' stripUrl = 'http://www.commissionedcomic.com/index.php?strip=%s'
@ -148,8 +139,8 @@ class Commissioned(_BasicScraper):
class CoolCatStudio(_BasicScraper): class CoolCatStudio(_BasicScraper):
latestUrl = 'http://www.coolcatstudio.com/' latestUrl = 'http://www.coolcatstudio.com/'
stripUrl = 'http://www.coolcatstudio.com/index.php?p=%s' stripUrl = 'http://www.coolcatstudio.com/index.php?p=%s'
imageSearch = compile(r'(/comics/.+?)"') imageSearch = compile(tagre("img", "src", r'(http://www.coolcatstudio.com/comics/[^"]+)'))
prevSearch = compile(r"href='(.+?)'>PREV") prevSearch = compile(tagre("a", "href", r'(http://www\.coolcatstudio\.com/strips-cat/[^"]+)', before="cniprevt"))
help = 'Index format: n' help = 'Index format: n'
@ -214,7 +205,7 @@ def cloneManga(name, shortName, lastStrip=None):
name='CloneManga/' + name, name='CloneManga/' + name,
starter=starter, starter=starter,
stripUrl=stripUrl, stripUrl=stripUrl,
imageSearch=compile(tagre("img", "src", r'((?:%s)?/%s/[^"]+)' % (url, shortName), after="center")), imageSearch=compile(tagre("img", "src", r'((?:%s/)?%s/[^"]+)' % (url, shortName), after="center")),
prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")), prevSearch=compile(tagre("a", "href", r'([^"]+)')+tagre("img", "src", r"previous\.gif")),
help='Index format: n', help='Index format: n',
namer=namer) namer=namer)

View file

@ -3,6 +3,7 @@
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
import tempfile import tempfile
import shutil import shutil
import re
from itertools import islice from itertools import islice
from unittest import TestCase from unittest import TestCase
from dosagelib import scraper from dosagelib import scraper
@ -18,7 +19,7 @@ class _ComicTester(TestCase):
def test_comic(self): def test_comic(self):
# Test a scraper. It must be able to traverse backward for # Test a scraper. It must be able to traverse backward for
# at least 5 pages from the start, and find strip images # at least 5 strips from the start, and find strip images
# on at least 4 pages. # on at least 4 pages.
scraperobj = self.scraperclass() scraperobj = self.scraperclass()
num = empty = 0 num = empty = 0
@ -27,7 +28,12 @@ class _ComicTester(TestCase):
for image in strip.getImages(): for image in strip.getImages():
images += 1 images += 1
self.save(image) self.save(image)
if not images: if images:
# test that the stripUrl regex matches the retrieved strip URL
urlmatch = re.escape(self.scraperclass.stripUrl).replace("%s", r".+")
mo = re.compile(urlmatch).match(strip.stripUrl)
self.check(mo is not None, 'strip URL %r does not match %r' % (strip.stripUrl, self.scraperclass.stripUrl))
else:
empty += 1 empty += 1
num += 1 num += 1
self.check(num >= 4, 'traversal failed after %d strips.' % num) self.check(num >= 4, 'traversal failed after %d strips.' % num)