Use HTMLParser.unescape instead of rolling our own function.

This commit is contained in:
Bastian Kleineidam 2013-04-05 18:53:19 +02:00
parent 9ec48d57d9
commit adb31d84af
2 changed files with 7 additions and 25 deletions

View file

@ -22,9 +22,9 @@ import re
import traceback import traceback
import time import time
try: try:
from html.entities import name2codepoint from HTMLParser import HTMLParser
except ImportError: except ImportError:
from htmlentitydefs import name2codepoint from html.parser import HTMLParser
from .decorators import memoized from .decorators import memoized
from .output import out from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl from .configuration import UserAgent, AppName, App, SupportUrl
@ -180,28 +180,10 @@ def fetchUrl(url, data, baseUrl, urlSearch):
return fetchUrls(url, data, baseUrl, urlSearch)[0] return fetchUrls(url, data, baseUrl, urlSearch)[0]
_htmlparser = HTMLParser()
def unescape(text): def unescape(text):
"""Replace HTML entities and character references.""" """Replace HTML entities and character references."""
def _fixup(m): return _htmlparser.unescape(text)
"""Replace HTML entities."""
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
text = unichr(int(text[3:-1], 16))
else:
text = unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text
return re.sub(r"&#?\w+;", _fixup, text)
_nopathquote_chars = "-;/=,~*+()@!" _nopathquote_chars = "-;/=,~*+()@!"

View file

@ -13,9 +13,9 @@ class URLTest(TestCase):
""" """
def test_unescape(self): def test_unescape(self):
# Test HTML replacement. # Test HTML replacement.
self.assertEqual(unescape('foo&amp;bar'), 'foo&bar') self.assertEqual(unescape(u'foo&amp;bar'), u'foo&bar')
self.assertEqual(unescape('foo&#160;bar'), u'foo\xa0bar') self.assertEqual(unescape(u'foo&#160;bar'), u'foo\xa0bar')
self.assertEqual(unescape('&quot;foo&quot;'), '"foo"') self.assertEqual(unescape(u'&quot;foo&quot;'), u'"foo"')
def test_normalisation(self): def test_normalisation(self):
# Test URL normalisation. # Test URL normalisation.