Allow a list of regular expressions for image and previous link search.
This commit is contained in:
parent
6de26aeeaa
commit
43f20270d0
2 changed files with 17 additions and 8 deletions
|
@ -37,9 +37,11 @@ class _BasicScraper(object):
|
||||||
lang = 'en'
|
lang = 'en'
|
||||||
|
|
||||||
# compiled regular expression that will locate the URL for the previous strip in a page
|
# compiled regular expression that will locate the URL for the previous strip in a page
|
||||||
|
# this can also be a list or tuple of compiled regular expressions
|
||||||
prevSearch = None
|
prevSearch = None
|
||||||
|
|
||||||
# compiled regular expression that will locate the strip image URLs strip in a page
|
# compiled regular expression that will locate the strip image URLs strip in a page
|
||||||
|
# this can also be a list or tuple of compiled regular expressions
|
||||||
imageSearch = None
|
imageSearch = None
|
||||||
|
|
||||||
# usually the index format help
|
# usually the index format help
|
||||||
|
|
|
@ -12,6 +12,7 @@ import cgi
|
||||||
import re
|
import re
|
||||||
import traceback
|
import traceback
|
||||||
import time
|
import time
|
||||||
|
import types
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
|
|
||||||
from .decorators import memoized
|
from .decorators import memoized
|
||||||
|
@ -133,16 +134,22 @@ def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
||||||
|
|
||||||
|
|
||||||
def fetchUrls(url, data, baseUrl, urlSearch):
    """Search all entries for given URL pattern(s) in a HTML page.

    @param url: URL the page was fetched from (used only in error messages)
    @param data: the HTML page content to search
    @param baseUrl: base URL against which relative matches are resolved
    @param urlSearch: a compiled regular expression, or a list/tuple of
        compiled regular expressions; group 1 of each pattern must capture
        the wanted URL
    @return: list of normalised absolute URLs, one per match, in pattern
        order then match order
    @raise ValueError: if a pattern matches but captures an empty URL, or
        if no pattern matches anything at all
    """
    # Accept either a single compiled pattern or a sequence of them.
    # isinstance against the builtins is the idiomatic (and forward-
    # compatible) spelling; the deprecated types.ListType/types.TupleType
    # aliases are not needed.
    if isinstance(urlSearch, (list, tuple)):
        searches = urlSearch
    else:
        searches = [urlSearch]
    searchUrls = []
    for search in searches:
        for match in search.finditer(data):
            searchUrl = match.group(1)
            if not searchUrl:
                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
            out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
            # resolve relative to the page and normalise before collecting
            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
    if not searchUrls:
        # report every pattern that was tried, not just the first
        patterns = [x.pattern for x in searches]
        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
    return searchUrls
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue