Fix warning for scrapers with multiple image patterns.

Author: Bastian Kleineidam
Date:   2013-04-03 20:32:19 +02:00
Commit: 2c0ca04882
Parent: f53a516219

2 changed files with 15 additions and 8 deletions
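
Background for the change: a scraper may define imageSearch either as a single compiled regular expression or as a list of them. The old warning code read self.imageSearch.pattern directly, which only works in the single-pattern case. A minimal sketch of the failure mode, using purely illustrative pattern values (not taken from any real scraper module):

    import re

    # Illustrative only: a scraper attribute holding several image patterns.
    imageSearch = [
        re.compile(r'src="([^"]+\.png)"'),
        re.compile(r'src="([^"]+\.gif)"'),
    ]

    # The old warning assumed a single compiled pattern:
    try:
        print(imageSearch.pattern)
    except AttributeError as exc:
        # a plain list has no .pattern attribute
        print("old warning code fails: %s" % exc)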


@@ -4,7 +4,7 @@
 import requests
 import time
 from . import loader
-from .util import fetchUrl, fetchUrls, getPageContent
+from .util import fetchUrl, fetchUrls, getPageContent, makeList
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -104,9 +104,11 @@ class _BasicScraper(object):
         imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
         imageUrls = set(map(self.imageUrlModifier, imageUrls))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
-            out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
+            patterns = [x.pattern for x in makeList(self.imageSearch)]
+            out.warn("found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
         elif not imageUrls:
-            out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
+            patterns = [x.pattern for x in makeList(self.imageSearch)]
+            out.warn("found no images at %s with patterns %s" % (url, patterns))
         return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
 
     def getStrips(self, maxstrips=None):
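
With this change both warning branches build the list of pattern strings through the new makeList() helper (added in the util module below), so the message works for a single pattern and for pattern lists alike. A rough sketch of the resulting warning text, with makeList copied from the diff and all other values made up for illustration:

    import re

    def makeList(item):
        """Return item if it is already a list or tuple, else wrap it in a list."""
        if isinstance(item, (list, tuple)):
            return item
        return [item]

    imageSearch = [re.compile(r'src="([^"]+\.png)"'), re.compile(r'src="([^"]+\.gif)"')]
    imageUrls = set(["http://example.com/a.png", "http://example.com/b.png"])  # made up
    url = "http://example.com/comic/1"  # made up

    patterns = [x.pattern for x in makeList(imageSearch)]
    print("found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))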


@@ -136,20 +136,25 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
+def makeList(item):
+    """If item is already a list or tuple, return it.
+    Else return a list with item as single element."""
+    if isinstance(item, (list, tuple)):
+        return item
+    return [item]
+
+
 def fetchUrls(url, data, baseUrl, urlSearch):
     """Search all entries for given URL pattern(s) in a HTML page."""
     searchUrls = []
-    if isinstance(urlSearch, (types.ListType, types.TupleType)):
-        searches = urlSearch
-    else:
-        searches = [urlSearch]
+    searches = makeList(urlSearch)
     for search in searches:
         for match in search.finditer(data):
             searchUrl = match.group(1)
             if not searchUrl:
                 raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
             out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
-            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
+            searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
         if searchUrls:
             # do not search other links if one pattern matched
             break
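
The same helper keeps fetchUrls() accepting either a single compiled pattern or a list of patterns, replacing the previous types.ListType/types.TupleType check. A small self-contained sketch of that normalization (the page snippet and pattern are invented for the example; the real function additionally resolves and normalizes the matched URLs):

    import re

    def makeList(item):
        if isinstance(item, (list, tuple)):
            return item
        return [item]

    data = '<img src="strip1.png"> <img src="strip2.png">'  # invented page snippet
    single = re.compile(r'src="([^"]+)"')

    # Both call styles now go through the same code path:
    for urlSearch in (single, [single]):
        found = [m.group(1) for search in makeList(urlSearch) for m in search.finditer(data)]
        print(found)  # ['strip1.png', 'strip2.png'] both times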