Allow a list of regular expressions for image and previous link search.
This commit is contained in:
parent
6de26aeeaa
commit
43f20270d0
2 changed files with 17 additions and 8 deletions
|
@@ -37,9 +37,11 @@ class _BasicScraper(object):
|
|||
lang = 'en'
|
||||
|
||||
# compiled regular expression that will locate the URL for the previous strip in a page
|
||||
# this can also be a list or tuple of compiled regular expressions
|
||||
prevSearch = None
|
||||
|
||||
# compiled regular expression that will locate the strip image URLs in a page
|
||||
# this can also be a list or tuple of compiled regular expressions
|
||||
imageSearch = None
|
||||
|
||||
# usually the index format help
|
||||
|
|
|
@@ -12,6 +12,7 @@ import cgi
|
|||
import re
|
||||
import traceback
|
||||
import time
|
||||
import types
|
||||
from htmlentitydefs import name2codepoint
|
||||
|
||||
from .decorators import memoized
|
||||
|
@@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
|||
|
||||
|
||||
def fetchUrls(url, data, baseUrl, urlSearch):
|
||||
"""Search all entries for given URL pattern in a HTML page."""
|
||||
"""Search all entries for given URL pattern(s) in an HTML page."""
|
||||
searchUrls = []
|
||||
for match in urlSearch.finditer(data):
|
||||
if isinstance(urlSearch, (types.ListType, types.TupleType)):
|
||||
searches = urlSearch
|
||||
else:
|
||||
searches = [urlSearch]
|
||||
for search in searches:
|
||||
for match in search.finditer(data):
|
||||
searchUrl = match.group(1)
|
||||
if not searchUrl:
|
||||
raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
|
||||
out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
|
||||
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
||||
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
||||
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
|
||||
if not searchUrls:
|
||||
raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url))
|
||||
patterns = [x.pattern for x in searches]
|
||||
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
|
||||
return searchUrls
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue