Remove workaround for libxml2 older than 2.9.3 (2015)

This workaround was written in 2016 while that version was still found
on many systems. Additionally, this workaround needs to be enabled by the
developer, who might not even be aware that they need to enable it for a
specific module. We still throw a warning to the user if running with
such an old libxml2 version.
This commit is contained in:
Tobias Gruetzmacher 2020-09-29 21:16:48 +02:00
parent c065a820ed
commit e98a1601ca
5 changed files with 10 additions and 18 deletions

View file

@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper):
firstStripUrl = stripUrl % '20090921' firstStripUrl = stripUrl % '20090921'
imageSearch = '//img[@class="comicimg"]' imageSearch = '//img[@class="comicimg"]'
prevSearch = '//a[contains(text(), "Previous")]' prevSearch = '//a[contains(text(), "Previous")]'
broken_html_bugfix = True
endOfLife = True endOfLife = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@ -5,6 +5,7 @@
import html import html
import os import os
import re import re
import warnings
from urllib.parse import urljoin from urllib.parse import urljoin
import lxml import lxml
@ -32,6 +33,10 @@ from .xml import NS
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might missbehave')
class GeoblockedException(IOError): class GeoblockedException(IOError):
def __init__(self): def __init__(self):
super().__init__('It seems your current location is geo-blocked.') super().__init__('It seems your current location is geo-blocked.')
@ -438,8 +443,6 @@ class _ParserScraper(Scraper):
of the HTML element and returns that. of the HTML element and returns that.
""" """
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
# Taken directly from LXML # Taken directly from LXML
XML_DECL = re.compile( XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@ -448,11 +451,6 @@ class _ParserScraper(Scraper):
# another Python module, XPath is the default for now. # another Python module, XPath is the default for now.
css = False css = False
# Activate a workaround for unescaped < characters on libxml version older
# than 2.9.3. This is disabled by default since most sites are not THAT
# broken ;)
broken_html_bugfix = False
def getPage(self, url): def getPage(self, url):
page = super(_ParserScraper, self).getPage(url) page = super(_ParserScraper, self).getPage(url)
if page.encoding: if page.encoding:
@ -469,14 +467,6 @@ class _ParserScraper(Scraper):
return tree return tree
def _parse_page(self, data): def _parse_page(self, data):
if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
def fix_not_open_tags(match):
fix = (len(match.group(1)) * '&lt;') + match.group(2)
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
match.group(0), fix), level=2)
return fix
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
tree = lxml.html.document_fromstring(data) tree = lxml.html.document_fromstring(data)
return tree return tree

View file

@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote,
urlparse, urlunparse, urlsplit) urlparse, urlunparse, urlsplit)
from urllib.robotparser import RobotFileParser from urllib.robotparser import RobotFileParser
import lxml
from .output import out from .output import out
from .configuration import UserAgent, App, SupportUrl from .configuration import UserAgent, App, SupportUrl
from . import AppName from . import AppName
@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr):
print(App, file=out) print(App, file=out)
print("Python %(version)s on %(platform)s" % print("Python %(version)s on %(platform)s" %
{"version": sys.version, "platform": sys.platform}, file=out) {"version": sys.version, "platform": sys.platform}, file=out)
print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out)
stime = strtime(time.time()) stime = strtime(time.time())
print("Local time:", stime, file=out) print("Local time:", stime, file=out)
print("sys.argv", sys.argv, file=out) print("sys.argv", sys.argv, file=out)

View file

@ -1,6 +1,6 @@
colorama colorama
imagesize imagesize
lxml lxml>=4.0.0
requests>=2.0 requests>=2.0
cached_property; python_version<'3.8' cached_property; python_version<'3.8'
importlib_metadata; python_version<'3.8' importlib_metadata; python_version<'3.8'

View file

@ -34,7 +34,7 @@ packages = find:
install_requires = install_requires =
colorama colorama
imagesize imagesize
lxml lxml>=4.0.0
requests>=2.0 requests>=2.0
cached_property;python_version<'3.8' cached_property;python_version<'3.8'
importlib_metadata;python_version<'3.8' importlib_metadata;python_version<'3.8'