From 43f20270d0a6e163d321b9bd8c9efc72e46a79da Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Tue, 12 Mar 2013 20:48:26 +0100 Subject: [PATCH] Allow a list of regular expressions for image and previous link search. --- dosagelib/scraper.py | 2 ++ dosagelib/util.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index a8c44d7b6..0bd0b5075 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -37,9 +37,11 @@ class _BasicScraper(object): lang = 'en' # compiled regular expression that will locate the URL for the previous strip in a page + # this can also be a list or tuple of compiled regular expressions prevSearch = None # compiled regular expression that will locate the strip image URLs strip in a page + # this can also be a list or tuple of compiled regular expressions imageSearch = None # usually the index format help diff --git a/dosagelib/util.py b/dosagelib/util.py index 80209c223..62f7a8a6d 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -12,6 +12,7 @@ import cgi import re import traceback import time +import types from htmlentitydefs import name2codepoint from .decorators import memoized @@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): def fetchUrls(url, data, baseUrl, urlSearch): - """Search all entries for given URL pattern in a HTML page.""" + """Search all entries for given URL pattern(s) in a HTML page.""" searchUrls = [] - for match in urlSearch.finditer(data): - searchUrl = match.group(1) - if not searchUrl: - raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url)) - out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern)) - searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl))) + if isinstance(urlSearch, (types.ListType, types.TupleType)): + searches = urlSearch + else: + searches = [urlSearch] + for search in searches: + for match in search.finditer(data): + searchUrl = match.group(1) + if not searchUrl: + raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url)) + out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern)) + searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl))) if not searchUrls: - raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url)) + patterns = [x.pattern for x in searches] + raise ValueError("Patterns %s not found at URL %s." % (patterns, url)) return searchUrls