Fix URL norming. See issue #2.

2013-01-23 21:16:22 +01:00 · 2013-01-23 21:16:22 +01:00 · f1356a9ff8
commit f1356a9ff8
parent 9ad4477d1f
2 changed files with 19 additions and 7 deletions
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -14,6 +14,8 @@ Changes:
 Fixes:
 - comics: Fixed LeastICouldDo image URL.
  Closes: GH bug #1
+- comics: Fix URL norming.
+  Closes: GH bug #2
 - documentation: Fix wrong option name: it's -a instead of -c.
  Closes: GH bug #3
 - comics: Fix UnboundLocalError when using indexed retrieval.
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@ -33,10 +33,20 @@ MaxRetries = 3
 # Default connection timeout
 ConnectionTimeoutSecs = 60

+# The character set to encode non-ASCII characters in a URL. See also
+# http://tools.ietf.org/html/rfc2396#section-2.1
+# Note that the encoding is not really specified, but most browsers
+# encode in UTF-8 when no encoding is specified by the HTTP headers,
+# else they use the page encoding for followed links. See also
+# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
+UrlEncoding = "utf-8"
+
+
 if hasattr(requests, 'adapters'):
    # requests >= 1.0
    requests.adapters.DEFAULT_RETRIES = MaxRetries

+
 def tagre(tag, attribute, value, quote='"', before="", after=""):
    """Return a regular expression matching the given HTML tag, attribute
    and value. It matches the tag and attribute names case insensitive,
@ -77,7 +87,7 @@ def case_insensitive_re(name):
    insensitive.
    @param name: the name to make case insensitive
    @ptype name: string
-    @return: the case insenstive regex
+    @return: the case insensitive regex
    @rtype: string
    """
    return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
@ -168,9 +178,6 @@ def unescape(text):
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
-        if isinstance(text, unicode):
-            text = text.encode('utf-8')
-            text = urllib2.quote(text, safe=';/?:@&=+$,')
        return text
    return re.sub(r"&#?\w+;", _fixup, text)

@ -179,6 +186,8 @@ def normaliseURL(url):
    """Removes any leading empty segments to avoid breaking urllib2; also replaces
    HTML entities and character references.
    """
+    if isinstance(url, unicode):
+        url = url.encode(UrlEncoding, 'ignore')
    # XXX: brutal hack
    url = unescape(url)

@ -186,7 +195,7 @@ def normaliseURL(url):
    segments = pu[2].split('/')
    while segments and segments[0] in ('', '..'):
        del segments[0]
-    pu[2] = '/' + '/'.join(segments).replace(' ', '%20')
+    pu[2] = quote(unquote('/' + '/'.join(segments)))
    # remove leading '&' from query
    if pu[4].startswith('&'):
        pu[4] = pu[4][1:]
@ -389,9 +398,10 @@ def unquote(text):
    return text


-def quote(text):
+def quote(text, safechars='/'):
    """Percent-encode given text."""
-    return urllib.quote(text)
+    return urllib.quote(text, safechars)
+

 def strsize (b):
    """Return human representation of bytes b. A negative number of bytes