Remove workaround for libxml2 older than 2.9.3 (2015)

This workaround was written in 2016, when that version was still found
on many systems. Additionally, the workaround had to be enabled by the
developer, who might not even be aware that it was needed for a specific
module. We still warn the user when running with such an old libxml2
version.
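
For reference, the safeguard that remains is nothing more than a module-level version check against lxml's libxml2 version tuple. A minimal sketch of that pattern (the warning text here is illustrative; the real check and message are in the scraper.py hunk below):

import warnings

import lxml.etree

# lxml exposes the bundled libxml2 version as a tuple of ints, e.g. (2, 9, 10),
# so a plain tuple comparison is enough to detect pre-2.9.3 builds.
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
    warnings.warn('libxml2 is older than 2.9.3; some dosage modules might misbehave')
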
Tobias Gruetzmacher 2020-09-29 21:16:48 +02:00
parent c065a820ed
commit e98a1601ca
5 changed files with 10 additions and 18 deletions

@@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper):
     firstStripUrl = stripUrl % '20090921'
     imageSearch = '//img[@class="comicimg"]'
     prevSearch = '//a[contains(text(), "Previous")]'
-    broken_html_bugfix = True
     endOfLife = True
     help = 'Index format: yyyymmdd'

@@ -5,6 +5,7 @@
 import html
 import os
 import re
+import warnings
 from urllib.parse import urljoin
 import lxml
@@ -32,6 +33,10 @@ from .xml import NS
 ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
+if lxml.etree.LIBXML_VERSION < (2, 9, 3):
+    warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might missbehave')
 class GeoblockedException(IOError):
     def __init__(self):
         super().__init__('It seems your current location is geo-blocked.')
@@ -438,8 +443,6 @@ class _ParserScraper(Scraper):
     of the HTML element and returns that.
     """
-    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
     # Taken directly from LXML
     XML_DECL = re.compile(
         r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@@ -448,11 +451,6 @@
     # another Python module, XPath is the default for now.
     css = False
-    # Activate a workaround for unescaped < characters on libxml version older
-    # then 2.9.3. This is disabled by default since most sites are not THAT
-    # broken ;)
-    broken_html_bugfix = False
     def getPage(self, url):
         page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
@@ -469,14 +467,6 @@
         return tree
     def _parse_page(self, data):
-        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
-            def fix_not_open_tags(match):
-                fix = (len(match.group(1)) * '&lt;') + match.group(2)
-                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
-                    match.group(0), fix), level=2)
-                return fix
-            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
         tree = lxml.html.document_fromstring(data)
         return tree
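
For anyone who later needs the removed behaviour: the opt-in bugfix only entity-escaped '<' characters that cannot start a real tag, so that pre-2.9.3 libxml2 would not trip over them. A self-contained sketch of that transformation, reusing the regex and helper name from the deleted code (the Scraper plumbing and out.warn() logging are left out):

import re

# A '<' followed by a space, '=' or a digit cannot open a tag; old libxml2
# versions mis-parsed documents containing such sequences.
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

def fix_not_open_tags(match):
    # Escape every '<' in the run, keep the character after it unchanged.
    return (len(match.group(1)) * '&lt;') + match.group(2)

data = '<p>score was 3<4, sadly</p>'
print(BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data))
# -> <p>score was 3&lt;4, sadly</p>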

@@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote,
     urlparse, urlunparse, urlsplit)
 from urllib.robotparser import RobotFileParser
+import lxml
 from .output import out
 from .configuration import UserAgent, App, SupportUrl
 from . import AppName
@@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr):
     print(App, file=out)
     print("Python %(version)s on %(platform)s" %
           {"version": sys.version, "platform": sys.platform}, file=out)
+    print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
     print("sys.argv", sys.argv, file=out)

@@ -1,6 +1,6 @@
 colorama
 imagesize
-lxml
+lxml>=4.0.0
 requests>=2.0
 cached_property; python_version<'3.8'
 importlib_metadata; python_version<'3.8'

@@ -34,7 +34,7 @@ packages = find:
 install_requires =
     colorama
     imagesize
-    lxml
+    lxml>=4.0.0
     requests>=2.0
     cached_property;python_version<'3.8'
     importlib_metadata;python_version<'3.8'