Remove workaround for libxml2 older than 2.9.3 (2015)
This workaround was written in 2016, while that version was still found on many systems. Additionally, this workaround needed to be enabled by the developer, who might not even be aware that they needed to enable it for a specific module. We still emit a warning to the user when running with such an old libxml2 version.
This commit is contained in:
parent
c065a820ed
commit
e98a1601ca
5 changed files with 10 additions and 18 deletions
|
@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper):
|
|||
firstStripUrl = stripUrl % '20090921'
|
||||
imageSearch = '//img[@class="comicimg"]'
|
||||
prevSearch = '//a[contains(text(), "Previous")]'
|
||||
broken_html_bugfix = True
|
||||
endOfLife = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
import html
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import lxml
|
||||
|
@ -32,6 +33,10 @@ from .xml import NS
|
|||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||
|
||||
|
||||
if lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
||||
warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might missbehave')
|
||||
|
||||
|
||||
class GeoblockedException(IOError):
|
||||
def __init__(self):
|
||||
super().__init__('It seems your current location is geo-blocked.')
|
||||
|
@ -438,8 +443,6 @@ class _ParserScraper(Scraper):
|
|||
of the HTML element and returns that.
|
||||
"""
|
||||
|
||||
BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
|
||||
|
||||
# Taken directly from LXML
|
||||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
@ -448,11 +451,6 @@ class _ParserScraper(Scraper):
|
|||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
||||
# Activate a workaround for unescaped < characters on libxml version older
|
||||
# then 2.9.3. This is disabled by default since most sites are not THAT
|
||||
# broken ;)
|
||||
broken_html_bugfix = False
|
||||
|
||||
def getPage(self, url):
|
||||
page = super(_ParserScraper, self).getPage(url)
|
||||
if page.encoding:
|
||||
|
@ -469,14 +467,6 @@ class _ParserScraper(Scraper):
|
|||
return tree
|
||||
|
||||
def _parse_page(self, data):
|
||||
if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
|
||||
def fix_not_open_tags(match):
|
||||
fix = (len(match.group(1)) * '<') + match.group(2)
|
||||
out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
|
||||
match.group(0), fix), level=2)
|
||||
return fix
|
||||
data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
|
||||
|
||||
tree = lxml.html.document_fromstring(data)
|
||||
return tree
|
||||
|
||||
|
|
|
@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote,
|
|||
urlparse, urlunparse, urlsplit)
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import lxml
|
||||
|
||||
from .output import out
|
||||
from .configuration import UserAgent, App, SupportUrl
|
||||
from . import AppName
|
||||
|
@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr):
|
|||
print(App, file=out)
|
||||
print("Python %(version)s on %(platform)s" %
|
||||
{"version": sys.version, "platform": sys.platform}, file=out)
|
||||
print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out)
|
||||
stime = strtime(time.time())
|
||||
print("Local time:", stime, file=out)
|
||||
print("sys.argv", sys.argv, file=out)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
colorama
|
||||
imagesize
|
||||
lxml
|
||||
lxml>=4.0.0
|
||||
requests>=2.0
|
||||
cached_property; python_version<'3.8'
|
||||
importlib_metadata; python_version<'3.8'
|
||||
|
|
|
@ -34,7 +34,7 @@ packages = find:
|
|||
install_requires =
|
||||
colorama
|
||||
imagesize
|
||||
lxml
|
||||
lxml>=4.0.0
|
||||
requests>=2.0
|
||||
cached_property;python_version<'3.8'
|
||||
importlib_metadata;python_version<'3.8'
|
||||
|
|
Loading…
Reference in a new issue