From ae0e9feea1c517f4270c2778ea6bd9dc1907492b Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 20 Feb 2013 20:51:39 +0100 Subject: [PATCH] Remember skipped URLs. --- dosagelib/scraper.py | 12 +++++++++--- tests/test_comics.py | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 3f7da1806..0e124ef77 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -58,6 +58,7 @@ class _BasicScraper(object): self.indexes = tuple(indexes) else: self.indexes = tuple() + self.skippedUrls = set() def __cmp__(self, other): """Compare scraper by name and index list.""" @@ -84,17 +85,22 @@ class _BasicScraper(object): for index in self.indexes: url = self.stripUrl % index if url in self.noImageUrls: - out.info('Skipping no-image URL %s' % url) + self.skipUrl(url) else: yield self.getStrip(url) else: url = self.getLatestUrl() if url in self.noImageUrls: - out.info('Skipping no-image URL %s' % url) + self.skipUrl(url) else: yield self.getStrip(self.getLatestUrl()) + def skipUrl(self, url): + """Document that an URL had no images.""" + out.info('Skipping URL %s without image' % url) + self.skippedUrls.add(url) + def getStrip(self, url): """Get comic strip for given URL.""" data, baseUrl = getPageContent(url, self.session) @@ -135,7 +141,7 @@ class _BasicScraper(object): while url: data, baseUrl = getPageContent(url, self.session) if url in self.noImageUrls: - out.info('Skipping no-image URL %s' % url) + self.skipUrl(url) else: imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch)) yield self.getComicStrip(url, imageUrls) diff --git a/tests/test_comics.py b/tests/test_comics.py index 3a3cd62c9..bef6a9e3a 100644 --- a/tests/test_comics.py +++ b/tests/test_comics.py @@ -53,8 +53,8 @@ class _ComicTester(TestCase): # ie. it detects duplicate filenames. saved_images = self.get_saved_images() num_images = len(saved_images) - # subtract the number of URLs with no image from the expected image number - num_images_expected = max_strips - len(scraperobj.noImageUrls) + # subtract the number of skipped URLs with no image from the expected image number + num_images_expected = max_strips - len(scraperobj.skippedUrls) attrs = (num_images, saved_images, num_images_expected, self.tmpdir) if self.scraperclass.multipleImagesPerStrip: self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)