Remove workaround for libxml2 older than 2.9.3 (2015)
This workaround was written in 2016, while that version was still found on many systems. Additionally, this workaround needed to be enabled by the developer, who might not even be aware that they needed to enable it for a specific module. We still emit a warning to the user when running with such an old libxml2 version.
This commit is contained in:
parent
c065a820ed
commit
e98a1601ca
5 changed files with 10 additions and 18 deletions
|
@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper):
|
|||
firstStripUrl = stripUrl % '20090921'
|
||||
imageSearch = '//img[@class="comicimg"]'
|
||||
prevSearch = '//a[contains(text(), "Previous")]'
|
||||
broken_html_bugfix = True
|
||||
endOfLife = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
import html
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import lxml
|
||||
|
@ -32,6 +33,10 @@ from .xml import NS
|
|||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||
|
||||
|
||||
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
||||
warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might missbehave')
|
||||
|
||||
|
||||
class GeoblockedException(IOError):
|
||||
def __init__(self):
|
||||
super().__init__('It seems your current location is geo-blocked.')
|
||||
|
@ -438,8 +443,6 @@ class _ParserScraper(Scraper):
|
|||
of the HTML element and returns that.
|
||||
"""
|
||||
|
||||
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
||||
|
||||
# Taken directly from LXML
|
||||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
@ -448,11 +451,6 @@ class _ParserScraper(Scraper):
|
|||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
||||
# Activate a workaround for unescaped < characters on libxml version older
|
||||
# then 2.9.3. This is disabled by default since most sites are not THAT
|
||||
# broken ;)
|
||||
broken_html_bugfix = False
|
||||
|
||||
def getPage(self, url):
|
||||
page = super(_ParserScraper, self).getPage(url)
|
||||
if page.encoding:
|
||||
|
@ -469,14 +467,6 @@ class _ParserScraper(Scraper):
|
|||
return tree
|
||||
|
||||
def _parse_page(self, data):
|
||||
if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
||||
def fix_not_open_tags(match):
|
||||
fix = (len(match.group(1)) * '<') + match.group(2)
|
||||
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
||||
match.group(0), fix), level=2)
|
||||
return fix
|
||||
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
||||
|
||||
tree = lxml.html.document_fromstring(data)
|
||||
return tree
|
||||
|
||||
|
|
|
@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote,
|
|||
urlparse, urlunparse, urlsplit)
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import lxml
|
||||
|
||||
from .output import out
|
||||
from .configuration import UserAgent, App, SupportUrl
|
||||
from . import AppName
|
||||
|
@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr):
|
|||
print(App, file=out)
|
||||
print("Python %(version)s on %(platform)s" %
|
||||
{"version": sys.version, "platform": sys.platform}, file=out)
|
||||
print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out)
|
||||
stime = strtime(time.time())
|
||||
print("Local time:", stime, file=out)
|
||||
print("sys.argv", sys.argv, file=out)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
colorama
|
||||
imagesize
|
||||
lxml
|
||||
lxml>=4.0.0
|
||||
requests>=2.0
|
||||
cached_property; python_version<'3.8'
|
||||
importlib_metadata; python_version<'3.8'
|
||||
|
|
|
@ -34,7 +34,7 @@ packages = find:
|
|||
install_requires =
|
||||
colorama
|
||||
imagesize
|
||||
lxml
|
||||
lxml>=4.0.0
|
||||
requests>=2.0
|
||||
cached_property;python_version<'3.8'
|
||||
importlib_metadata;python_version<'3.8'
|
||||
|
|
Loading…
Reference in a new issue