From 62c3540c2877e04b007ed8bbe40d9403b062afbf Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher <tobias-git@23.gs>
Date: Mon, 13 Apr 2020 01:53:45 +0200
Subject: [PATCH] Remove (useless) wrapper around html.unescape

---
 dosagelib/scraper.py  | 13 +++++++------
 dosagelib/util.py     | 10 +---------
 scripts/scriptutil.py | 10 ++++++----
 tests/test_util.py    | 10 ++--------
 4 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 024ee5193..e89daf6ac 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin
 
-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs
 
 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None
 
 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-    normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+        prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                 text = match.group(1)
                 out.debug(u'matched text %r with pattern %s' %
                           (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
             if optional:
                 return None
             else:
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '&lt;') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
 
-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree
 
     def fetchUrls(self, url, data, urlSearch):
diff --git a/dosagelib/util.py b/dosagelib/util.py
index d1ef3fd9b..2c8afbdd0 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)
 
 
-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -188,7 +180,7 @@ def normaliseURL(url):
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
 
     pu = list(urlparse(url))
     segments = pu[2].split('/')
diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py
index d261f8837..034e5dcb3 100644
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time
 
-from lxml import html
+import lxml.html  # plain "import lxml" does not load the html submodule
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
 
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
 
 
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
@@ -185,7 +187,7 @@ def asciify(name):
 
 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
diff --git a/tests/test_util.py b/tests/test_util.py
index 78f4a3a86..a533dde93 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid
 
 
 class TestURL(object):
@@ -12,12 +12,6 @@ class TestURL(object):
     Tests for URL utility functions.
     """
 
-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&#160;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==