diff --git a/dosagelib/util.py b/dosagelib/util.py
index 05dc1c67d..725d63166 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -22,9 +22,14 @@ import re
 import traceback
 import time
 try:
-    from html.entities import name2codepoint
+    from html import unescape as _html_unescape
 except ImportError:
-    from htmlentitydefs import name2codepoint
+    # Python < 3.4 has no html.unescape; use the parser method instead.
+    try:
+        from HTMLParser import HTMLParser
+    except ImportError:
+        from html.parser import HTMLParser
+    _html_unescape = HTMLParser().unescape
 from .decorators import memoized
 from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
@@ -180,28 +185,9 @@ def fetchUrl(url, data, baseUrl, urlSearch):
     return fetchUrls(url, data, baseUrl, urlSearch)[0]
 
 
 def unescape(text):
     """Replace HTML entities and character references."""
-    def _fixup(m):
-        """Replace HTML entities."""
-        text = m.group(0)
-        if text[:2] == "&#":
-            # character reference
-            try:
-                if text[:3] == "&#x":
-                    text = unichr(int(text[3:-1], 16))
-                else:
-                    text = unichr(int(text[2:-1]))
-            except ValueError:
-                pass
-        else:
-            # named entity
-            try:
-                text = unichr(name2codepoint[text[1:-1]])
-            except KeyError:
-                pass
-        return text
-    return re.sub(r"&#?\w+;", _fixup, text)
+    return _html_unescape(text)
 
 
 _nopathquote_chars = "-;/=,~*+()@!"
diff --git a/tests/test_util.py b/tests/test_util.py
index 9b9b15739..7bb3f7992 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -13,9 +13,9 @@ class URLTest(TestCase):
     """
     def test_unescape(self):
         # Test HTML replacement.
-        self.assertEqual(unescape('foo&amp;bar'), 'foo&bar')
-        self.assertEqual(unescape('foo&#160;bar'), u'foo\xa0bar')
-        self.assertEqual(unescape('&quot;foo&quot;'), '"foo"')
+        self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
+        self.assertEqual(unescape(u'foo&#160;bar'), u'foo\xa0bar')
+        self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
 
     def test_normalisation(self):
         # Test URL normalisation.