From e98a1601ca962570197fec4285ff2b39b29b92df Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 29 Sep 2020 21:16:48 +0200 Subject: [PATCH] Remove workaround for libxml2 older than 2.9.3 (2015) This workaround was written in 2016 while that version was still found on many systems. Additionally, this workaround needs to be enabled by the developer, who might not even be aware that they need to enable it for a specific module. We still throw a warning to the user if running with such an old libxml version. --- dosagelib/plugins/b.py | 1 - dosagelib/scraper.py | 20 +++++--------------- dosagelib/util.py | 3 +++ requirements.txt | 2 +- setup.cfg | 2 +- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index ecac62756..7d53b6ca5 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper): firstStripUrl = stripUrl % '20090921' imageSearch = '//img[@class="comicimg"]' prevSearch = '//a[contains(text(), "Previous")]' - broken_html_bugfix = True endOfLife = True help = 'Index format: yyyymmdd' diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index b0ab299f6..c54aaac34 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -5,6 +5,7 @@ import html import os import re +import warnings from urllib.parse import urljoin import lxml @@ -32,6 +33,10 @@ from .xml import NS ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') +if lxml.etree.LIBXML_VERSION < (2, 9, 3): + warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might missbehave') + + class GeoblockedException(IOError): def __init__(self): super().__init__('It seems your current location is geo-blocked.') @@ -438,8 +443,6 @@ class _ParserScraper(Scraper): of the HTML element and returns that. 
""" - BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])') - # Taken directly from LXML XML_DECL = re.compile( r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) @@ -448,11 +451,6 @@ class _ParserScraper(Scraper): # another Python module, XPath is the default for now. css = False - # Activate a workaround for unescaped < characters on libxml version older - # then 2.9.3. This is disabled by default since most sites are not THAT - # broken ;) - broken_html_bugfix = False - def getPage(self, url): page = super(_ParserScraper, self).getPage(url) if page.encoding: @@ -469,14 +467,6 @@ class _ParserScraper(Scraper): return tree def _parse_page(self, data): - if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3): - def fix_not_open_tags(match): - fix = (len(match.group(1)) * '<') + match.group(2) - out.warn("Found possibly broken HTML '%s', fixing as '%s'" % ( - match.group(0), fix), level=2) - return fix - data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data) - tree = lxml.html.document_fromstring(data) return tree diff --git a/dosagelib/util.py b/dosagelib/util.py index 20250e035..11a5e095b 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit) from urllib.robotparser import RobotFileParser +import lxml + from .output import out from .configuration import UserAgent, App, SupportUrl from . 
import AppName @@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr): print(App, file=out) print("Python %(version)s on %(platform)s" % {"version": sys.version, "platform": sys.platform}, file=out) + print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out) stime = strtime(time.time()) print("Local time:", stime, file=out) print("sys.argv", sys.argv, file=out) diff --git a/requirements.txt b/requirements.txt index 1797b97d2..a377a8800 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ colorama imagesize -lxml +lxml>=4.0.0 requests>=2.0 cached_property; python_version<'3.8' importlib_metadata; python_version<'3.8' diff --git a/setup.cfg b/setup.cfg index 61c79a3ae..ee25d9181 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ packages = find: install_requires = colorama imagesize - lxml + lxml>=4.0.0 requests>=2.0 cached_property;python_version<'3.8' importlib_metadata;python_version<'3.8'