Remove (useless) wrapper around html.unescape

Tobias Gruetzmacher 2020-04-13 01:53:45 +02:00
parent ccd3c57977
commit 62c3540c28
4 changed files with 16 additions and 27 deletions
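The deleted helper in dosagelib/util.py simply delegated to the standard library, which has shipped html.unescape since Python 3.4. A minimal standalone sketch of what every call site now does directly:

import html

# html.unescape resolves named, decimal and hexadecimal character
# references in one pass, which is all the removed wrapper did.
print(html.unescape('Fish &amp; Chips &#8364;3'))  # Fish & Chips €3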

dosagelib/scraper.py

@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin
 
-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs
 
 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None
 
 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+                   prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                 text = match.group(1)
                 out.debug(u'matched text %r with pattern %s' %
                           (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
             if optional:
                 return None
             else:
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '&lt;') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
 
-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree
 
     def fetchUrls(self, url, data, urlSearch):
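One subtlety in the scraper.py change: switching from 'from lxml import html' to 'import lxml' only works because the 'from lxml.html.defs import ...' line already forces the lxml.html (and, transitively, lxml.etree) submodules to load. Importing the bare package does not pull them in. A quick illustration of the pitfall, assuming a fresh interpreter:

import lxml

# In a fresh interpreter the next line would raise AttributeError,
# because 'import lxml' does not load the 'html' submodule:
#     lxml.html.document_fromstring('<p>hi</p>')

import lxml.html  # the explicit submodule import is the safe form

tree = lxml.html.document_fromstring('<p>hi</p>')
print(tree.text_content())  # hi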

dosagelib/util.py

@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)
 
 
-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
 _nopathquote_chars = "-;/=,~*+()@!"
 
 
 def normaliseURL(url):
     """Normalising
      - strips and leading or trailing whitespace,
@@ -188,7 +180,7 @@ def normaliseURL(url):
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
 
     pu = list(urlparse(url))
     segments = pu[2].split('/')
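The unescape step in normaliseURL (the "brutal hack" comment above) handles URLs copied out of HTML attributes, where & arrives entity-encoded as &amp;. A standalone sketch of the effect, not the repository function itself:

import html
from urllib.parse import urlparse

url = html.unescape('http://example.com/?a=1&amp;b=2'.strip())
print(url)                  # http://example.com/?a=1&b=2
print(urlparse(url).query)  # a=1&b=2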

scripts/scriptutil.py

@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time
 
-from lxml import html
+import lxml
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
@@ -185,7 +187,7 @@ def asciify(name):
 
 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
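format_name chains html.unescape with the repository's capfirst and asciify helpers. A rough standalone sketch of that pipeline; the two helpers are inlined here as assumptions and the real ones may differ:

import html
import re

def format_name(text):
    # Unescape entities, CamelCase the words, then reduce to ASCII.
    name = html.unescape(text)
    name = ''.join(x[:1].upper() + x[1:] for x in name.split(' '))  # capfirst stand-in
    name = name.replace('&', 'And').replace('@', 'At').replace('ñ', 'n')
    return re.sub(r'[^0-9a-zA-Z_]', '', name)  # asciify stand-in (assumed)

print(format_name('foo &amp; bar'))  # FooAndBar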

tests/test_util.py

@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
 
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid
 
 
 class TestURL(object):
@@ -12,12 +12,6 @@ class TestURL(object):
     Tests for URL utility functions.
     """
 
-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==
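The deleted test_unescape only exercised standard-library behaviour, so it added no coverage of dosage's own code; its assertions hold verbatim against the stdlib function:

import html

assert html.unescape(u'foo&amp;bar') == u'foo&bar'
assert html.unescape(u'foo&#160;bar') == u'foo\xa0bar'  # non-breaking space
assert html.unescape(u'&quot;foo&quot;') == u'"foo"'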