Allow a list of regular expressions for image and previous link search.

This commit is contained in:
Bastian Kleineidam 2013-03-12 20:48:26 +01:00
parent 6de26aeeaa
commit 43f20270d0
2 changed files with 17 additions and 8 deletions

View file

@ -37,9 +37,11 @@ class _BasicScraper(object):
lang = 'en'
# compiled regular expression that will locate the URL for the previous strip in a page
# this can also be a list or tuple of compiled regular expressions
prevSearch = None
# compiled regular expression that will locate the strip image URLs in a page
# this can also be a list or tuple of compiled regular expressions
imageSearch = None
# usually the index format help

View file

@ -12,6 +12,7 @@ import cgi
import re
import traceback
import time
import types
from htmlentitydefs import name2codepoint
from .decorators import memoized
@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
def fetchUrls(url, data, baseUrl, urlSearch):
    """Search all entries for given URL pattern(s) in a HTML page.

    @param url: URL of the searched page; only used in debug and error
        messages
    @param data: the page content that the patterns are applied to
    @param baseUrl: base URL each matched URL is joined with
    @param urlSearch: a compiled regular expression, or a list or tuple of
        compiled regular expressions; group 1 of every match is taken as
        the found URL
    @return: list of found URLs, each joined with baseUrl and passed
        through normaliseURL; ordered by pattern, then by match position
    @raises ValueError: if a pattern matches an empty URL, or if none of
        the patterns matches anything in the page
    """
    # Accept a single pattern as well as a list or tuple of patterns, so
    # the rest of the function only has to deal with a sequence.
    if isinstance(urlSearch, (list, tuple)):
        searches = urlSearch
    else:
        searches = [urlSearch]
    searchUrls = []
    for search in searches:
        for match in search.finditer(data):
            searchUrl = match.group(1)
            if not searchUrl:
                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
            out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
    if not searchUrls:
        # Report every pattern that was tried, not just the first one.
        patterns = [x.pattern for x in searches]
        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
    return searchUrls