Remove (useless) wrapper around html.unescape

Author: Tobias Gruetzmacher
Date: 2020-04-13 01:53:45 +02:00
parent ccd3c57977
commit 62c3540c28
4 changed files with 16 additions and 27 deletions
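
Context: the wrapper removed below delegated straight to the standard library (html.unescape, available since Python 3.4), so every call site can use the stdlib module directly. A minimal sketch of the equivalence, using the wrapper's own body:

    import html

    # Before: the project-local passthrough deleted in this commit.
    def unescape(text):
        """Replace HTML entities and character references."""
        return html.unescape(text)

    # After: call sites use the stdlib directly; the results are identical.
    assert unescape('&lt;b&gt;') == html.unescape('&lt;b&gt;') == '<b>'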

dosagelib/scraper.py

@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin

-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs

 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None

 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+                   prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                 text = match.group(1)
                 out.debug(u'matched text %r with pattern %s' %
                           (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
             if optional:
                 return None
             else:
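
For illustration, the fetchText pattern this hunk touches: text captured by a regular expression from raw HTML still carries entities, so it is unescaped and stripped before use. A self-contained sketch with a made-up page snippet and pattern:

    import html
    import re

    data = '<div class="alt">Fish &amp; Chips&nbsp;</div>'  # hypothetical HTML
    match = re.search(r'class="alt">([^<]*)<', data)
    text = match.group(1)                # 'Fish &amp; Chips&nbsp;'
    print(html.unescape(text).strip())   # 'Fish & Chips' (strip removes the NBSP)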
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree

     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '&lt;') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)

-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree

     def fetchUrls(self, url, data, urlSearch):
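
One subtlety of the rename: with the stdlib owning the bare name html, lxml access moves to qualified names. Plain "import lxml" does not load the submodules by itself; in scraper.py they are pulled in as a side effect of "from lxml.html.defs import ...". A standalone sketch should import them explicitly:

    import html
    import lxml.etree
    import lxml.html

    # The same version guard _parse_page uses against old libxml2 releases.
    needs_bugfix = lxml.etree.LIBXML_VERSION < (2, 9, 3)

    tree = lxml.html.document_fromstring('<p>foo &amp; bar</p>')
    print(tree.text_content())           # 'foo & bar' - lxml decodes entities
    print(html.unescape('foo&amp;bar'))  # 'foo&bar'  - stdlib, for raw strings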

dosagelib/util.py

@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)


-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -188,7 +180,7 @@ def normaliseURL(url):
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
     pu = list(urlparse(url))

     segments = pu[2].split('/')
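
The surviving call in normaliseURL (the "brutal hack" above) unescapes entity-encoded hrefs before URL parsing. A sketch with a hypothetical scraped link:

    import html
    from urllib.parse import urlparse

    raw = 'http://example.com/view.php?id=1&amp;page=2'  # hypothetical href
    url = html.unescape(raw)
    print(urlparse(url).query)  # 'id=1&page=2' instead of 'id=1&amp;page=2'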

scripts/scriptutil.py

@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher

 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time

-from lxml import html
+import lxml

 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
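
The split into pagetext and data is purely for readability; the behaviour is unchanged. Roughly, assuming a requests-style session like the one get_page wraps (the URL is made up):

    import lxml.html
    import requests  # assumption: get_page uses a requests-style session

    url = 'https://example.com/comiclist'  # hypothetical list page
    pagetext = requests.get(url).text
    data = lxml.html.document_fromstring(pagetext)
    data.make_links_absolute(url)  # what the expand flag triggers above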
@@ -185,7 +187,7 @@ def asciify(name):

 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
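
For reference, a rough standalone approximation of format_name after this change; capfirst and asciify are project helpers, so stand-ins are used here:

    import html

    def format_name_sketch(text):
        # Unescape entities, CamelCase-join the words, substitute symbols.
        name = html.unescape(text)
        name = ''.join(w[:1].upper() + w[1:] for w in name.split(' '))
        return name.replace('&', 'And').replace('@', 'At')

    print(format_name_sketch('fish &amp; chips'))  # 'FishAndChips'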

tests/test_util.py

@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid


 class TestURL(object):
@@ -12,12 +12,6 @@ class TestURL(object):
     Tests for URL utility functions.
     """

-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==
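
The deleted test only exercised stdlib behaviour; its assertions hold verbatim against html.unescape:

    import html

    assert html.unescape(u'foo&amp;bar') == u'foo&bar'
    assert html.unescape(u'foo&#160;bar') == u'foo\xa0bar'
    assert html.unescape(u'&quot;foo&quot;') == u'"foo"'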