Fix warning for scrapers with multiple image patterns.

This commit is contained in:
Bastian Kleineidam 2013-04-03 20:32:19 +02:00
parent f53a516219
commit 2c0ca04882
2 changed files with 15 additions and 8 deletions

View file

@@ -4,7 +4,7 @@
import requests
import time
from . import loader
from .util import fetchUrl, fetchUrls, getPageContent
from .util import fetchUrl, fetchUrls, getPageContent, makeList
from .comic import ComicStrip
from .output import out
from .events import getHandler
@@ -104,9 +104,11 @@ class _BasicScraper(object):
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
imageUrls = set(map(self.imageUrlModifier, imageUrls))
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
patterns = [x.pattern for x in makeList(self.imageSearch)]
out.warn("found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
elif not imageUrls:
out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
patterns = [x.pattern for x in makeList(self.imageSearch)]
out.warn("found no images at %s with patterns %s" % (url, patterns))
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
def getStrips(self, maxstrips=None):

View file

@@ -136,20 +136,25 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
def makeList(item):
    """Return item unchanged if it is already a list or tuple.

    Otherwise wrap item in a new single-element list, so callers can
    iterate uniformly whether they were given one value or several
    (e.g. a single compiled regex vs. a list of them).
    """
    if isinstance(item, (list, tuple)):
        return item
    return [item]
def fetchUrls(url, data, baseUrl, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page."""
searchUrls = []
if isinstance(urlSearch, (types.ListType, types.TupleType)):
searches = urlSearch
else:
searches = [urlSearch]
searches = makeList(urlSearch)
for search in searches:
for match in search.finditer(data):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break