Fix warning for scrapers with multiple image patterns.
This commit is contained in:
parent
f53a516219
commit
2c0ca04882
2 changed files with 15 additions and 8 deletions
|
@ -4,7 +4,7 @@
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
from . import loader
|
from . import loader
|
||||||
from .util import fetchUrl, fetchUrls, getPageContent
|
from .util import fetchUrl, fetchUrls, getPageContent, makeList
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
@ -104,9 +104,11 @@ class _BasicScraper(object):
|
||||||
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
||||||
imageUrls = set(map(self.imageUrlModifier, imageUrls))
|
imageUrls = set(map(self.imageUrlModifier, imageUrls))
|
||||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||||
out.warn("found %d images instead of 1 at %s with %s" % (len(imageUrls), url, self.imageSearch.pattern))
|
patterns = [x.pattern for x in makeList(self.imageSearch)]
|
||||||
|
out.warn("found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
|
||||||
elif not imageUrls:
|
elif not imageUrls:
|
||||||
out.warn("found no images at %s with %s" % (url, self.imageSearch.pattern))
|
patterns = [x.pattern for x in makeList(self.imageSearch)]
|
||||||
|
out.warn("found no images at %s with patterns %s" % (url, patterns))
|
||||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
|
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
|
||||||
|
|
||||||
def getStrips(self, maxstrips=None):
|
def getStrips(self, maxstrips=None):
|
||||||
|
|
|
@ -136,20 +136,25 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
||||||
return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
|
return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def makeList(item):
|
||||||
|
"""If item is already a list or tuple, return it.
|
||||||
|
Else return a list with item as single element."""
|
||||||
|
if isinstance(item, (list, tuple)):
|
||||||
|
return item
|
||||||
|
return [item]
|
||||||
|
|
||||||
|
|
||||||
def fetchUrls(url, data, baseUrl, urlSearch):
|
def fetchUrls(url, data, baseUrl, urlSearch):
|
||||||
"""Search all entries for given URL pattern(s) in a HTML page."""
|
"""Search all entries for given URL pattern(s) in a HTML page."""
|
||||||
searchUrls = []
|
searchUrls = []
|
||||||
if isinstance(urlSearch, (types.ListType, types.TupleType)):
|
searches = makeList(urlSearch)
|
||||||
searches = urlSearch
|
|
||||||
else:
|
|
||||||
searches = [urlSearch]
|
|
||||||
for search in searches:
|
for search in searches:
|
||||||
for match in search.finditer(data):
|
for match in search.finditer(data):
|
||||||
searchUrl = match.group(1)
|
searchUrl = match.group(1)
|
||||||
if not searchUrl:
|
if not searchUrl:
|
||||||
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
||||||
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
||||||
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
|
searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
|
||||||
if searchUrls:
|
if searchUrls:
|
||||||
# do not search other links if one pattern matched
|
# do not search other links if one pattern matched
|
||||||
break
|
break
|
||||||
|
|
Loading…
Reference in a new issue