Allow a list of regular expressions for image and previous link search.

Bastian Kleineidam 2013-03-12 20:48:26 +01:00
parent 6de26aeeaa
commit 43f20270d0
2 changed files with 17 additions and 8 deletions


@@ -37,9 +37,11 @@ class _BasicScraper(object):
     lang = 'en'
     # compiled regular expression that will locate the URL for the previous strip in a page
+    # this can also be a list or tuple of compiled regular expressions
     prevSearch = None
     # compiled regular expression that will locate the strip image URLs strip in a page
+    # this can also be a list or tuple of compiled regular expressions
     imageSearch = None
     # usually the index format help
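
With this change a scraper module can set prevSearch or imageSearch to either a single compiled regular expression or a list/tuple of them. A minimal sketch of the new form, not part of this commit; the class name, URL and regexes are made up for illustration, and _BasicScraper is the class patched above:

    import re

    class SomeComic(_BasicScraper):
        url = 'http://www.example.com/'
        # every pattern in the tuple is tried against the page; matches from any of them count
        imageSearch = (
            re.compile(r'<img src="(/comics/[^"]+)"'),
            re.compile(r'<img src="(/strips/[^"]+)"'),
        )
        # a single compiled pattern still works as before
        prevSearch = re.compile(r'<a href="([^"]+)">Previous</a>')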


@@ -12,6 +12,7 @@ import cgi
 import re
 import traceback
 import time
+import types
 from htmlentitydefs import name2codepoint
 from .decorators import memoized
@@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
 def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern in a HTML page."""
+    """Search all entries for given URL pattern(s) in a HTML page."""
     searchUrls = []
-    for match in urlSearch.finditer(data):
-        searchUrl = match.group(1)
-        if not searchUrl:
-            raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
-        out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
-        searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
+    if isinstance(urlSearch, (types.ListType, types.TupleType)):
+        searches = urlSearch
+    else:
+        searches = [urlSearch]
+    for search in searches:
+        for match in search.finditer(data):
+            searchUrl = match.group(1)
+            if not searchUrl:
+                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
+            out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
+            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
     if not searchUrls:
-        raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url))
+        patterns = [x.pattern for x in searches]
+        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
     return searchUrls
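
A hedged usage sketch of the reworked helper; url, data and baseUrl stand for the page URL, its HTML and the base URL obtained elsewhere in dosagelib, and the patterns are placeholders:

    import re

    patterns = [re.compile(r'<img src="([^"]+/comics/[^"]+)"'),
                re.compile(r'<img src="([^"]+/strips/[^"]+)"')]
    # each pattern is tried in turn; all matched URLs are normalised and collected,
    # and passing a single compiled pattern instead of a list behaves exactly as before
    imageUrls = fetchUrls(url, data, baseUrl, patterns)

Since types.ListType and types.TupleType are just aliases for the built-in list and tuple in Python 2, isinstance(urlSearch, (list, tuple)) would be an equivalent check without the extra import.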