Remember skipped URLs.
This commit is contained in:
parent
6d04ef7abd
commit
ae0e9feea1
2 changed files with 11 additions and 5 deletions
|
@ -58,6 +58,7 @@ class _BasicScraper(object):
|
||||||
self.indexes = tuple(indexes)
|
self.indexes = tuple(indexes)
|
||||||
else:
|
else:
|
||||||
self.indexes = tuple()
|
self.indexes = tuple()
|
||||||
|
self.skippedUrls = set()
|
||||||
|
|
||||||
def __cmp__(self, other):
|
def __cmp__(self, other):
|
||||||
"""Compare scraper by name and index list."""
|
"""Compare scraper by name and index list."""
|
||||||
|
@ -84,17 +85,22 @@ class _BasicScraper(object):
|
||||||
for index in self.indexes:
|
for index in self.indexes:
|
||||||
url = self.stripUrl % index
|
url = self.stripUrl % index
|
||||||
if url in self.noImageUrls:
|
if url in self.noImageUrls:
|
||||||
out.info('Skipping no-image URL %s' % url)
|
self.skipUrl(url)
|
||||||
else:
|
else:
|
||||||
yield self.getStrip(url)
|
yield self.getStrip(url)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
url = self.getLatestUrl()
|
url = self.getLatestUrl()
|
||||||
if url in self.noImageUrls:
|
if url in self.noImageUrls:
|
||||||
out.info('Skipping no-image URL %s' % url)
|
self.skipUrl(url)
|
||||||
else:
|
else:
|
||||||
yield self.getStrip(self.getLatestUrl())
|
yield self.getStrip(self.getLatestUrl())
|
||||||
|
|
||||||
|
def skipUrl(self, url):
|
||||||
|
"""Document that a URL had no images."""
|
||||||
|
out.info('Skipping URL %s without image' % url)
|
||||||
|
self.skippedUrls.add(url)
|
||||||
|
|
||||||
def getStrip(self, url):
|
def getStrip(self, url):
|
||||||
"""Get comic strip for given URL."""
|
"""Get comic strip for given URL."""
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
data, baseUrl = getPageContent(url, self.session)
|
||||||
|
@ -135,7 +141,7 @@ class _BasicScraper(object):
|
||||||
while url:
|
while url:
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
data, baseUrl = getPageContent(url, self.session)
|
||||||
if url in self.noImageUrls:
|
if url in self.noImageUrls:
|
||||||
out.info('Skipping no-image URL %s' % url)
|
self.skipUrl(url)
|
||||||
else:
|
else:
|
||||||
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
||||||
yield self.getComicStrip(url, imageUrls)
|
yield self.getComicStrip(url, imageUrls)
|
||||||
|
|
|
@ -53,8 +53,8 @@ class _ComicTester(TestCase):
|
||||||
# ie. it detects duplicate filenames.
|
# ie. it detects duplicate filenames.
|
||||||
saved_images = self.get_saved_images()
|
saved_images = self.get_saved_images()
|
||||||
num_images = len(saved_images)
|
num_images = len(saved_images)
|
||||||
# subtract the number of URLs with no image from the expected image number
|
# subtract the number of skipped (no-image) URLs from the expected number of images
|
||||||
num_images_expected = max_strips - len(scraperobj.noImageUrls)
|
num_images_expected = max_strips - len(scraperobj.skippedUrls)
|
||||||
attrs = (num_images, saved_images, num_images_expected, self.tmpdir)
|
attrs = (num_images, saved_images, num_images_expected, self.tmpdir)
|
||||||
if self.scraperclass.multipleImagesPerStrip:
|
if self.scraperclass.multipleImagesPerStrip:
|
||||||
self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)
|
self.check(num_images >= num_images_expected, 'saved %d %s instead of at least %d images in %s' % attrs)
|
||||||
|
|
Loading…
Reference in a new issue