Remove (useless) wrapper around html.unescape

parent ccd3c57977
commit 62c3540c28

4 changed files with 16 additions and 27 deletions
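
The removed helper was a pure pass-through to the standard library, so every call site can use html.unescape directly. A minimal sketch of the equivalence (the wrapper body is taken verbatim from the diff below):

    import html

    # The wrapper this commit removes simply delegated to the stdlib:
    def unescape(text):
        """Replace HTML entities and character references."""
        return html.unescape(text)

    # Call sites now invoke the stdlib directly; the behaviour is identical:
    assert unescape('foo &amp; bar') == html.unescape('foo &amp; bar') == 'foo & bar'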

dosagelib/scraper.py

@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin
 
-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs
 
 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None
 
 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+                   prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                 text = match.group(1)
                 out.debug(u'matched text %r with pattern %s' %
                           (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
             if optional:
                 return None
             else:
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '&lt;') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
@@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
 
-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree
 
     def fetchUrls(self, url, data, urlSearch):
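
The import reshuffle above is what makes the stdlib module usable: from lxml import html bound the name html to lxml's HTML package, which would collide with the import html now needed at the top of the file. The bare import lxml suffices here because the remaining from lxml.html.defs import loads the lxml.html and lxml.etree submodules as a side effect. A small sketch of the disambiguated names, written with an explicit submodule import so it runs standalone:

    import html       # stdlib: entity decoding
    import lxml.html  # lxml's HTML parser (importing it also loads lxml.etree)

    text = html.unescape('foo &amp; bar')              # 'foo & bar'
    tree = lxml.html.document_fromstring('<p>hi</p>')  # parsed HTML document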

dosagelib/util.py

@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)
 
 
-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
     """Normalising
     - strips any leading or trailing whitespace,
@@ -188,7 +180,7 @@ def normaliseURL(url):
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
 
     pu = list(urlparse(url))
     segments = pu[2].split('/')
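
For reference, the normalisation steps listed in the docstring above can be reproduced with stdlib pieces alone. The function below is a hypothetical stand-in for dosagelib's normaliseURL, not its actual implementation (the real one goes through unicode_safe and further path-segment handling, as the hunk shows):

    import html
    import re
    from urllib.parse import urlparse, urlunparse

    def normalise_url_sketch(url):
        # Strip whitespace, decode HTML entities, then collapse
        # duplicate slashes in the path component.
        url = html.unescape(url.strip())
        parts = list(urlparse(url))
        parts[2] = re.sub('/{2,}', '/', parts[2])
        return urlunparse(parts)

    assert (normalise_url_sketch(' http://example.com//bar/baz&amp;baz ') ==
            'http://example.com/bar/baz&baz')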

scripts/scriptutil.py

@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time
 
-from lxml import html
+import lxml
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
 
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
 
 
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
@@ -185,7 +187,7 @@ def asciify(name):
 
 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
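
The get_url hunk above shows the fetch-and-parse pattern in its new spelling. A self-contained sketch of the same pattern; requests is used here as an assumed stand-in for dosagelib's get_page/session plumbing:

    import lxml.html
    import requests  # assumption: any HTTP client exposing response.text works

    def parse_page(url):
        # Fetch the page, parse it, then rewrite relative links against
        # the page URL (what get_url does when expand is true).
        data = lxml.html.document_fromstring(requests.get(url).text)
        data.make_links_absolute(url)
        return data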

tests/test_util.py

@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid
 
 
 class TestURL(object):
@@ -12,12 +12,6 @@ class TestURL(object):
     Tests for URL utility functions.
     """
 
-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==
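
The deleted test_unescape assertions hold verbatim against the stdlib function, so nothing project-specific was being tested:

    import html

    assert html.unescape(u'foo&amp;bar') == u'foo&bar'
    assert html.unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
    assert html.unescape(u'&quot;foo&quot;') == u'"foo"'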