diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index ecac62756..7d53b6ca5 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper): firstStripUrl = stripUrl % '20090921' imageSearch = '//img[@class="comicimg"]' prevSearch = '//a[contains(text(), "Previous")]' - broken_html_bugfix = True endOfLife = True help = 'Index format: yyyymmdd' diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index b0ab299f6..c54aaac34 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -5,6 +5,7 @@ import html import os import re +import warnings from urllib.parse import urljoin import lxml @@ -32,6 +33,10 @@ from .xml import NS ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') +if lxml.etree.LIBXML_VERSION < (2, 9, 3): + warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might misbehave') + + class GeoblockedException(IOError): def __init__(self): super().__init__('It seems your current location is geo-blocked.') @@ -438,8 +443,6 @@ class _ParserScraper(Scraper): of the HTML element and returns that. """ - BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])') - # Taken directly from LXML XML_DECL = re.compile( r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) @@ -448,11 +451,6 @@ class _ParserScraper(Scraper): # another Python module, XPath is the default for now. css = False - # Activate a workaround for unescaped < characters on libxml version older - # then 2.9.3. 
This is disabled by default since most sites are not THAT - # broken ;) - broken_html_bugfix = False - def getPage(self, url): page = super(_ParserScraper, self).getPage(url) if page.encoding: @@ -469,14 +467,6 @@ class _ParserScraper(Scraper): return tree def _parse_page(self, data): - if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3): - def fix_not_open_tags(match): - fix = (len(match.group(1)) * '<') + match.group(2) - out.warn("Found possibly broken HTML '%s', fixing as '%s'" % ( - match.group(0), fix), level=2) - return fix - data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data) - tree = lxml.html.document_fromstring(data) return tree diff --git a/dosagelib/util.py b/dosagelib/util.py index 20250e035..11a5e095b 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit) from urllib.robotparser import RobotFileParser +import lxml + from .output import out from .configuration import UserAgent, App, SupportUrl from . 
import AppName @@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr): print(App, file=out) print("Python %(version)s on %(platform)s" % {"version": sys.version, "platform": sys.platform}, file=out) + print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out) stime = strtime(time.time()) print("Local time:", stime, file=out) print("sys.argv", sys.argv, file=out) diff --git a/requirements.txt b/requirements.txt index 1797b97d2..a377a8800 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ colorama imagesize -lxml +lxml>=4.0.0 requests>=2.0 cached_property; python_version<'3.8' importlib_metadata; python_version<'3.8' diff --git a/setup.cfg b/setup.cfg index 61c79a3ae..ee25d9181 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ packages = find: install_requires = colorama imagesize - lxml + lxml>=4.0.0 requests>=2.0 cached_property;python_version<'3.8' importlib_metadata;python_version<'3.8'