Remove (useless) wrapper around html.unescape

2020-04-13 01:53:45 +02:00 · 2020-04-13 01:53:45 +02:00 · 62c3540c28
commit 62c3540c28
parent ccd3c57977
4 changed files with 16 additions and 27 deletions
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin

-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs

 try:
@ -20,8 +21,8 @@ except ImportError:
    pycountry = None

 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-    normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+        prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                text = match.group(1)
                out.debug(u'matched text %r with pattern %s' %
                          (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
            if optional:
                return None
            else:
@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
        return tree

    def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
            def fix_not_open_tags(match):
                fix = (len(match.group(1)) * '&lt;') + match.group(2)
                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                return fix
            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)

-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
        return tree

    def fetchUrls(self, url, data, urlSearch):
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@ -172,14 +172,6 @@ def prettyMatcherList(things):
    return "('%s')" % "', '".join(norm)


-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
    """Normalising
    - strips any leading or trailing whitespace,
@ -188,7 +180,7 @@ def normaliseURL(url):
    """
    url = unicode_safe(url).strip()
    # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)

    pu = list(urlparse(url))
    segments = pu[2].split('/')
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time

-from lxml import html
+import lxml

 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa

-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http


@ -37,7 +38,8 @@ class ComicListUpdater(object):
        """Get an HTML page and parse it with LXML."""
        print("Parsing", url, file=sys.stderr)
        try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
            if expand:
                data.make_links_absolute(url)
            if self.sleep > 0:
@ -185,7 +187,7 @@ def asciify(name):

 def format_name(text):
    """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
    name = "".join(capfirst(x) for x in name.split(" "))
    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
    return name
--- a/tests/test_util.py
+++ b/tests/test_util.py
@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid


 class TestURL(object):
@ -12,12 +12,6 @@ class TestURL(object):
    Tests for URL utility functions.
    """

-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
    def test_normalisation(self):
        # Test URL normalisation.
        assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==