diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 024ee5193..e89daf6ac 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin
 
-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs
 
 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None
 
 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+                   prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
             text = match.group(1)
             out.debug(u'matched text %r with pattern %s' %
                       (text, textSearch.pattern))
-            return unescape(text).strip()
+            return html.unescape(text).strip()
         if optional:
             return None
         else:
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '<') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@@ -470,7 +471,7 @@
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
 
-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree
 
     def fetchUrls(self, url, data, urlSearch):
diff --git a/dosagelib/util.py b/dosagelib/util.py
index d1ef3fd9b..2c8afbdd0 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)
 
 
-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -188,7 +180,7 @@
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
 
     pu = list(urlparse(url))
     segments = pu[2].split('/')
diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py
index d261f8837..034e5dcb3 100644
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time
 
-from lxml import html
+import lxml
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
 
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
 
 
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
@@ -185,7 +187,7 @@ def asciify(name):
 
 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
diff --git a/tests/test_util.py b/tests/test_util.py
index 78f4a3a86..a533dde93 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid
 
 
 class TestURL(object):
@@ -12,12 +12,6 @@
     """
     Tests for URL utility functions.
    """
-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&baz') ==