Allow a list of regular expressions for image and previous link search.

Bastian Kleineidam 2013-03-12 20:48:26 +01:00
parent 6de26aeeaa
commit 43f20270d0
2 changed files with 17 additions and 8 deletions


@@ -37,9 +37,11 @@ class _BasicScraper(object):
     lang = 'en'
     # compiled regular expression that will locate the URL for the previous strip in a page
+    # this can also be a list or tuple of compiled regular expressions
     prevSearch = None
     # compiled regular expression that will locate the strip image URLs strip in a page
+    # this can also be a list or tuple of compiled regular expressions
     imageSearch = None
     # usually the index format help
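
With this change a scraper module can set prevSearch or imageSearch to either a single compiled regular expression or a list/tuple of them. A minimal sketch of the new form, not part of this commit; the class name, URL and regexes are made up for illustration, and _BasicScraper is the class patched above:

    import re

    class SomeComic(_BasicScraper):
        url = 'http://www.example.com/'
        # every pattern in the tuple is tried against the page; matches from any of them count
        imageSearch = (
            re.compile(r'<img src="(/comics/[^"]+)"'),
            re.compile(r'<img src="(/strips/[^"]+)"'),
        )
        # a single compiled pattern still works as before
        prevSearch = re.compile(r'<a href="([^"]+)">Previous</a>')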


@@ -12,6 +12,7 @@ import cgi
 import re
 import traceback
 import time
+import types
 from htmlentitydefs import name2codepoint
 from .decorators import memoized
@@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
 def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern in a HTML page."""
+    """Search all entries for given URL pattern(s) in a HTML page."""
     searchUrls = []
-    for match in urlSearch.finditer(data):
-        searchUrl = match.group(1)
-        if not searchUrl:
-            raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
-        out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
-        searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
+    if isinstance(urlSearch, (types.ListType, types.TupleType)):
+        searches = urlSearch
+    else:
+        searches = [urlSearch]
+    for search in searches:
+        for match in search.finditer(data):
+            searchUrl = match.group(1)
+            if not searchUrl:
+                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
+            out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
+            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
     if not searchUrls:
-        raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url))
+        patterns = [x.pattern for x in searches]
+        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
     return searchUrls
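
A hedged usage sketch of the reworked helper; url, data and baseUrl stand for the page URL, its HTML and the base URL obtained elsewhere in dosagelib, and the patterns are placeholders:

    import re

    patterns = [re.compile(r'<img src="([^"]+/comics/[^"]+)"'),
                re.compile(r'<img src="([^"]+/strips/[^"]+)"')]
    # each pattern is tried in turn; all matched URLs are normalised and collected,
    # and passing a single compiled pattern instead of a list behaves exactly as before
    imageUrls = fetchUrls(url, data, baseUrl, patterns)

Since types.ListType and types.TupleType are just aliases for the built-in list and tuple in Python 2, isinstance(urlSearch, (list, tuple)) would be an equivalent check without the extra import.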