Bugfix: Don't assume RE patterns in base class.

This commit is contained in:
Tobias Gruetzmacher 2014-10-13 22:29:47 +02:00
parent e92a3fb3a1
commit 17bc454132
2 changed files with 15 additions and 5 deletions

View file

@ -19,7 +19,7 @@ except ImportError:
from . import loader, configuration, util
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL)
getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
from .comic import ComicStrip
from .output import out
from .events import getHandler
@ -131,14 +131,12 @@ class Scraper(object):
# remove duplicate URLs
imageUrls = set(imageUrls)
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, patterns))
out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
image = sorted(imageUrls)[0]
out.warn(u"Choosing image %s" % image)
imageUrls = (image,)
elif not imageUrls:
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
out.warn(u"Found no images at %s with expressions %s" % (url, patterns))
out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch)))
if self.textSearch:
text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
else:

View file

@ -217,6 +217,18 @@ def makeSequence(item):
return (item,)
def prettyMatcherList(things):
    """Format one matcher or a sequence of matchers as a readable string.

    ``things`` is passed through makeSequence() before being iterated.
    Every entry that exposes a ``pattern`` attribute (such as a compiled
    regular expression) is shown as that pattern text; any other entry
    (expected to be a plain string) is shown unchanged.

    Returns the entries wrapped in single quotes, separated by commas and
    enclosed in parentheses, e.g. ``('foo', 'bar')``.
    """
    # Reduce each matcher to its textual form before joining.
    texts = [
        matcher.pattern if hasattr(matcher, 'pattern') else matcher
        for matcher in makeSequence(things)
    ]
    joined = "', '".join(texts)
    return "('%s')" % joined
# Module-level HTMLParser instance, created once when this module is imported.
# NOTE(review): presumably shared by unescape() below -- confirm against its body.
_htmlparser = HTMLParser()
def unescape(text):
"""Replace HTML entities and character references."""