Remember skipped URLs.

Bastian Kleineidam 2013-02-20 20:51:39 +01:00
parent 6d04ef7abd
commit ae0e9feea1
2 changed files with 11 additions and 5 deletions


@@ -58,6 +58,7 @@ class _BasicScraper(object):
             self.indexes = tuple(indexes)
         else:
             self.indexes = tuple()
+        self.skippedUrls = set()
 
     def __cmp__(self, other):
         """Compare scraper by name and index list."""
@@ -84,17 +85,22 @@ class _BasicScraper(object):
             for index in self.indexes:
                 url = self.stripUrl % index
                 if url in self.noImageUrls:
-                    out.info('Skipping no-image URL %s' % url)
+                    self.skipUrl(url)
                 else:
                     yield self.getStrip(url)
         else:
             url = self.getLatestUrl()
             if url in self.noImageUrls:
-                out.info('Skipping no-image URL %s' % url)
+                self.skipUrl(url)
             else:
                 yield self.getStrip(self.getLatestUrl())
 
+    def skipUrl(self, url):
+        """Document that an URL had no images."""
+        out.info('Skipping URL %s without image' % url)
+        self.skippedUrls.add(url)
+
     def getStrip(self, url):
         """Get comic strip for given URL."""
         data, baseUrl = getPageContent(url, self.session)
@@ -135,7 +141,7 @@ class _BasicScraper(object):
         while url:
             data, baseUrl = getPageContent(url, self.session)
             if url in self.noImageUrls:
-                out.info('Skipping no-image URL %s' % url)
+                self.skipUrl(url)
             else:
                 imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
                 yield self.getComicStrip(url, imageUrls)
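The pattern this commit introduces: instead of only logging URLs without images, the scraper records each one in self.skippedUrls so later code can account for them. A minimal, self-contained sketch of that pattern follows; the class name, URL template, and print-based logging are illustrative stand-ins, not dosage's real implementation.

    # Sketch of the skip-tracking pattern (hypothetical names throughout).
    class SketchScraper(object):
        stripUrl = 'http://example.com/comic/%s'  # hypothetical URL template

        def __init__(self, indexes=None, noImageUrls=()):
            self.indexes = tuple(indexes) if indexes else tuple()
            self.noImageUrls = set(noImageUrls)
            self.skippedUrls = set()  # new in this commit: remember skipped URLs

        def skipUrl(self, url):
            """Record that a URL had no images, then log it."""
            print('Skipping URL %s without image' % url)
            self.skippedUrls.add(url)

        def getStrips(self):
            for index in self.indexes:
                url = self.stripUrl % index
                if url in self.noImageUrls:
                    self.skipUrl(url)  # remembered, not just logged
                else:
                    yield url  # stands in for self.getStrip(url)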


@@ -53,8 +53,8 @@ class _ComicTester(TestCase):
         # ie. it detects duplicate filenames.
         saved_images = self.get_saved_images()
         num_images = len(saved_images)
-        # subtract the number of URLs with no image from the expected image number
-        num_images_expected = max_strips - len(scraperobj.noImageUrls)
+        # subtract the number of skipped URLs with no image from the expected image number
+        num_images_expected = max_strips - len(scraperobj.skippedUrls)
         attrs = (num_images, saved_images, num_images_expected, self.tmpdir)
         if self.scraperclass.multipleImagesPerStrip:
             self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)
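The test now subtracts len(scraperobj.skippedUrls), the URLs actually skipped during the run, rather than len(scraperobj.noImageUrls), the full static list, so the expected count matches only the strips the test actually traversed. Reusing the SketchScraper class from the sketch above, the adjusted arithmetic can be exercised like this; the index values and max_strips limit are made up for illustration.

    # Expected images = requested strips minus URLs skipped at runtime.
    scraper = SketchScraper(indexes=['1', '2', '3'],
                            noImageUrls={'http://example.com/comic/2'})
    saved = list(scraper.getStrips())

    max_strips = 3  # hypothetical test limit
    num_images_expected = max_strips - len(scraper.skippedUrls)
    assert len(saved) == num_images_expected  # 3 strips - 1 skipped == 2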