Prevent empty URL matching.

This commit is contained in:
Bastian Kleineidam 2012-10-11 18:16:29 +02:00
parent cb9760f483
commit c0ad053647

View file

@ -76,11 +76,13 @@ def getPageContent(url):
return data, baseUrl return data, baseUrl
def fetchUrl(url, urlSearch):
    """Return the first URL on the page at `url` that `urlSearch` matches.

    The page is loaded with getPageContent(), which yields the page data
    and its base URL. The first capture group of the match is joined
    against that base URL before being returned.

    @param url: address of the page to scan
    @param urlSearch: compiled regular expression whose group 1 captures
        the wanted URL
    @return: the joined URL, or None when the pattern does not match
    @raise ValueError: when the pattern matches but group 1 is empty
    """
    content, baseUrl = getPageContent(url)
    found = urlSearch.search(content)
    if not found:
        return None
    matchedUrl = found.group(1)
    if not matchedUrl:
        # An empty capture would make urljoin() hand back the base URL
        # itself, so report the faulty pattern instead of continuing.
        raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
    # NOTE(review): the second argument to out.write looks like a
    # verbosity level - confirm against the out object's definition.
    out.write('matched URL %r' % matchedUrl, 2)
    return urlparse.urljoin(baseUrl, matchedUrl)
@ -92,6 +94,8 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrls = set() imageUrls = set()
for match in imageSearch.finditer(data): for match in imageSearch.finditer(data):
imageUrl = match.group(1) imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.write('matched image URL %r' % imageUrl, 2) out.write('matched image URL %r' % imageUrl, 2)
imageUrls.add(urlparse.urljoin(baseUrl, imageUrl)) imageUrls.add(urlparse.urljoin(baseUrl, imageUrl))
if not imageUrls: if not imageUrls:
@ -101,6 +105,8 @@ def fetchUrls(url, imageSearch, prevSearch=None):
match = prevSearch.search(data) match = prevSearch.search(data)
if match: if match:
prevUrl = match.group(1) prevUrl = match.group(1)
if not prevUrl:
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
out.write('matched previous URL %r' % prevUrl, 2) out.write('matched previous URL %r' % prevUrl, 2)
prevUrl = urlparse.urljoin(baseUrl, prevUrl) prevUrl = urlparse.urljoin(baseUrl, prevUrl)
else: else: