diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index b971a9b4b..3f7da1806 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -33,6 +33,9 @@ class _BasicScraper(object): # if more than one image per URL is expected multipleImagesPerStrip = False + # set of URLs that have no image (eg. only a video link) + noImageUrls = set() + # set to False if previous URLs do not match the strip URL (ie. because of redirects) prevUrlMatchesStripUrl = True @@ -80,9 +83,17 @@ class _BasicScraper(object): if self.indexes: for index in self.indexes: url = self.stripUrl % index - yield self.getStrip(url) + if url in self.noImageUrls: + out.info('Skipping no-image URL %s' % url) + else: + yield self.getStrip(url) + else: - yield self.getStrip(self.getLatestUrl()) + url = self.getLatestUrl() + if url in self.noImageUrls: + out.info('Skipping no-image URL %s' % url) + else: + yield self.getStrip(url) def getStrip(self, url): """Get comic strip for given URL.""" @@ -123,8 +134,11 @@ class _BasicScraper(object): seen_urls = set() while url: data, baseUrl = getPageContent(url, self.session) - imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch)) - yield self.getComicStrip(url, imageUrls) + if url in self.noImageUrls: + out.info('Skipping no-image URL %s' % url) + else: + imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch)) + yield self.getComicStrip(url, imageUrls) if self.firstStripUrl == url: out.debug("Stop at first URL %s" % url) break diff --git a/tests/test_comics.py b/tests/test_comics.py index cb6d12742..d73bc5ba7 100644 --- a/tests/test_comics.py +++ b/tests/test_comics.py @@ -53,11 +53,13 @@ class _ComicTester(TestCase): # ie. it detects duplicate filenames. 
saved_images = self.get_saved_images() num_images = len(saved_images) - attrs = (num_images, saved_images, max_strips, self.tmpdir) + # subtract the number of URLs with no image from the expected image number + num_images_expected = max_strips - len(scraperobj.noImageUrls) + attrs = (num_images, saved_images, num_images_expected, self.tmpdir) if self.scraperclass.multipleImagesPerStrip: - self.check(num_images >= max_strips, 'saved %d %s instead of at least %d images in %s' % attrs) + self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs) else: - self.check(num_images == max_strips, 'saved %d %s instead of %d images in %s' % attrs) + self.check(num_images == num_images_expected, 'saved %d %s instead of %d images in %s' % attrs) def check_stripurl(self, strip): if not self.scraperclass.stripUrl: