Use HTMLParser.unescape instead of rolling our own function.

This commit is contained in:
Bastian Kleineidam 2013-04-05 18:53:19 +02:00
parent 9ec48d57d9
commit adb31d84af
2 changed files with 7 additions and 25 deletions

View file

@ -22,9 +22,9 @@ import re
import traceback
import time
try:
from html.entities import name2codepoint
from HTMLParser import HTMLParser
except ImportError:
from htmlentitydefs import name2codepoint
from html.parser import HTMLParser
from .decorators import memoized
from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl
@ -180,28 +180,10 @@ def fetchUrl(url, data, baseUrl, urlSearch):
return fetchUrls(url, data, baseUrl, urlSearch)[0]
_htmlparser = HTMLParser()


def unescape(text):
    """Replace HTML entities and character references in text.

    Named entities (``&amp;``), decimal references (``&#160;``) and
    hexadecimal references (``&#xa0;``) are replaced by the characters
    they stand for. Unknown entities are left untouched.

    @param text: the string that may contain HTML entities
    @return: the string with all recognized entities replaced
    """
    try:
        # html.unescape is the supported API since Python 3.4;
        # HTMLParser.unescape was deprecated there and removed in 3.9.
        from html import unescape as _unescape
    except ImportError:
        # Python 2 has no html.unescape; use the parser method instead.
        _unescape = _htmlparser.unescape
    return _unescape(text)
# NOTE(review): judging by the name, presumably the characters that are left
# unquoted when quoting URL paths -- confirm against the code that uses it.
_nopathquote_chars = "-;/=,~*+()@!"

View file

@ -13,9 +13,9 @@ class URLTest(TestCase):
"""
def test_unescape(self):
    # Test HTML replacement.
    # The inputs must contain the *encoded* entities; comparing an already
    # decoded string with itself would pass even if unescape() did nothing.
    # Unicode literals are used so input and expected value have one type.
    self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
    self.assertEqual(unescape(u'foo&#160;bar'), u'foo\xa0bar')
    self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
def test_normalisation(self):
# Test URL normalisation.