Remove workaround for libxml2 older than 2.9.3 (2015)
This workaround was written in 2016, when that version was still found on many systems. Additionally, the workaround had to be enabled by the developer, who might not even be aware that a specific module needs it. We still warn the user when running with such an old libxml2 version.
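
For reference, lxml exposes the libxml2 version it was compiled against as a tuple of integers, which is what the new check compares against (2, 9, 3). A minimal sketch of that check, mirroring the code added in the diff below:

    import warnings

    import lxml.etree

    # LIBXML_VERSION is a tuple such as (2, 9, 10), so a plain tuple
    # comparison is enough to detect anything older than 2.9.3.
    if lxml.etree.LIBXML_VERSION < (2, 9, 3):
        warnings.warn('Your libxml2 is very old (< 2.9.3), '
                      'some dosage modules might misbehave')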
parent c065a820ed
commit e98a1601ca

5 changed files with 10 additions and 18 deletions
@@ -33,7 +33,6 @@ class BadMachinery(_ParserScraper):
     firstStripUrl = stripUrl % '20090921'
     imageSearch = '//img[@class="comicimg"]'
     prevSearch = '//a[contains(text(), "Previous")]'
-    broken_html_bugfix = True
     endOfLife = True
     help = 'Index format: yyyymmdd'
 
@@ -5,6 +5,7 @@
 import html
 import os
 import re
+import warnings
 from urllib.parse import urljoin
 
 import lxml
@@ -32,6 +33,10 @@ from .xml import NS
 ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
 
 
+if lxml.etree.LIBXML_VERSION < (2, 9, 3):
+    warnings.warn('Your libxml2 is very old (< 2.9.3), some dosage modules might misbehave')
+
+
 class GeoblockedException(IOError):
     def __init__(self):
         super().__init__('It seems your current location is geo-blocked.')
@@ -438,8 +443,6 @@ class _ParserScraper(Scraper):
     of the HTML element and returns that.
     """
 
-    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')
-
     # Taken directly from LXML
     XML_DECL = re.compile(
         r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
@@ -448,11 +451,6 @@ class _ParserScraper(Scraper):
     # another Python module, XPath is the default for now.
     css = False
 
-    # Activate a workaround for unescaped < characters on libxml version older
-    # then 2.9.3. This is disabled by default since most sites are not THAT
-    # broken ;)
-    broken_html_bugfix = False
-
     def getPage(self, url):
         page = super(_ParserScraper, self).getPage(url)
         if page.encoding:
@@ -469,14 +467,6 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
-            def fix_not_open_tags(match):
-                fix = (len(match.group(1)) * '&lt;') + match.group(2)
-                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
-                    match.group(0), fix), level=2)
-                return fix
-            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
-
         tree = lxml.html.document_fromstring(data)
         return tree
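
For context, here is a self-contained sketch of what the deleted workaround did: it pre-escaped stray '<' characters (ones followed by a space, digit, or '=') so that libxml2 before 2.9.3 would not trip over them. The names mirror the deleted code; the sample input string is made up for illustration:

    import re

    # Same pattern as the deleted BROKEN_NOT_OPEN_TAGS class attribute.
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    def fix_not_open_tags(match):
        # Escape each stray '<' and keep the character that followed it.
        return (len(match.group(1)) * '&lt;') + match.group(2)

    broken = 'scores <5 are hidden << see below'
    print(BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, broken))
    # scores &lt;5 are hidden &lt;&lt; see below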
@@ -15,6 +15,8 @@ from urllib.parse import (parse_qs, quote as url_quote, unquote as url_unquote,
                           urlparse, urlunparse, urlsplit)
 from urllib.robotparser import RobotFileParser
 
+import lxml
+
 from .output import out
 from .configuration import UserAgent, App, SupportUrl
 from . import AppName
@@ -347,6 +349,7 @@ def print_app_info(out=sys.stderr):
     print(App, file=out)
     print("Python %(version)s on %(platform)s" %
         {"version": sys.version, "platform": sys.platform}, file=out)
+    print("libxml2 version: %i.%i.%i" % lxml.etree.LIBXML_VERSION, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
     print("sys.argv", sys.argv, file=out)
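
Worth noting: LIBXML_VERSION is a three-element integer tuple, so it feeds the %-format string in the added line directly (the version numbers here are just an example):

    >>> "libxml2 version: %i.%i.%i" % (2, 9, 10)
    'libxml2 version: 2.9.10'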
@@ -1,6 +1,6 @@
 colorama
 imagesize
-lxml
+lxml>=4.0.0
 requests>=2.0
 cached_property; python_version<'3.8'
 importlib_metadata; python_version<'3.8'
@@ -34,7 +34,7 @@ packages = find:
 install_requires =
     colorama
     imagesize
-    lxml
+    lxml>=4.0.0
     requests>=2.0
     cached_property;python_version<'3.8'
     importlib_metadata;python_version<'3.8'