Correct path quoting.

This commit is contained in:
Bastian Kleineidam 2013-02-12 17:55:33 +01:00
parent adbff1bca1
commit 10f6a1caa1

View file

@@ -135,7 +135,7 @@ def fetchUrls(url, data, baseUrl, urlSearch):
out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern)) out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl))) searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
if not searchUrls: if not searchUrls:
raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data)) raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url))
return searchUrls return searchUrls
@@ -168,6 +168,8 @@ def unescape(text):
return re.sub(r"&#?\w+;", _fixup, text) return re.sub(r"&#?\w+;", _fixup, text)
_nopathquote_chars = "-;/=,~*+()@!"
def normaliseURL(url): def normaliseURL(url):
"""Removes any leading empty segments to avoid breaking urllib2; also replaces """Removes any leading empty segments to avoid breaking urllib2; also replaces
HTML entities and character references. HTML entities and character references.
@@ -181,7 +183,7 @@ def normaliseURL(url):
segments = pu[2].split('/') segments = pu[2].split('/')
while segments and segments[0] in ('', '..'): while segments and segments[0] in ('', '..'):
del segments[0] del segments[0]
pu[2] = quote(unquote('/' + '/'.join(segments))) pu[2] = quote(unquote('/' + '/'.join(segments)), safechars=_nopathquote_chars)
# remove leading '&' from query # remove leading '&' from query
if pu[4].startswith('&'): if pu[4].startswith('&'):
pu[4] = pu[4][1:] pu[4] = pu[4][1:]