Remove (useless) wrapper around html.unescape
parent ccd3c57977 · commit 62c3540c28
4 changed files with 16 additions and 27 deletions
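The wrapper being removed is a one-line pass-through in dosagelib.util (second diff below), so every call site can switch to the standard library directly. A minimal sketch of the before/after (the wrapper body is taken from the diff; the sample call is illustrative):

    import html

    # Before: dosagelib.util re-exported the stdlib function under its own name.
    def unescape(text):
        """Replace HTML entities and character references."""
        return html.unescape(text)

    # After: callers drop the indirection and use the stdlib directly.
    print(html.unescape('Calvin &amp; Hobbes'))  # Calvin & Hobbes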
@@ -2,11 +2,12 @@
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
+import html
 import os
 import re
 from urllib.parse import urljoin
 
-from lxml import html, etree
+import lxml
 from lxml.html.defs import link_attrs as html_link_attrs
 
 try:
@@ -20,8 +21,8 @@ except ImportError:
     pycountry = None
 
 from . import configuration, http, languages, loader
-from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
-                   normaliseURL, prettyMatcherList, uniq)
+from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
+                   prettyMatcherList, uniq)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -400,7 +401,7 @@ class _BasicScraper(Scraper):
                 text = match.group(1)
                 out.debug(u'matched text %r with pattern %s' %
                           (text, textSearch.pattern))
-                return unescape(text).strip()
+                return html.unescape(text).strip()
             if optional:
                 return None
             else:
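The changed line sits in _BasicScraper's regex-based text extraction; a self-contained sketch of the match-then-unescape pattern (the regex and page snippet here are made up for illustration):

    import html
    import re

    textSearch = re.compile(r'<div class="alt">([^<]+)</div>')  # hypothetical
    data = '<div class="alt">It&#39;s a comic!</div>'
    match = textSearch.search(data)
    if match:
        print(html.unescape(match.group(1)).strip())  # It's a comic!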
@@ -462,7 +463,7 @@ class _ParserScraper(Scraper):
         return tree
 
     def _parse_page(self, data):
-        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
+        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
             def fix_not_open_tags(match):
                 fix = (len(match.group(1)) * '&lt;') + match.group(2)
                 out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
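For context: the surrounding bugfix entity-escapes runs of '<' that cannot start a real tag, so libxml2 older than 2.9.3 does not mangle the document. A runnable sketch; the regex is an assumption, since the actual BROKEN_NOT_OPEN_TAGS attribute is not part of this diff:

    import re

    # Assumed shape of Scraper.BROKEN_NOT_OPEN_TAGS: one or more '<'
    # followed by a character that cannot begin a tag name.
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    def fix_not_open_tags(match):
        # Same replacement as in the diff: escape each '<' as '&lt;'.
        return (len(match.group(1)) * '&lt;') + match.group(2)

    print(BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, 'if a << 3 <p>ok</p>'))
    # if a &lt;&lt; 3 <p>ok</p>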
@@ -470,7 +471,7 @@ class _ParserScraper(Scraper):
                 return fix
             data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)
 
-        tree = html.document_fromstring(data)
+        tree = lxml.html.document_fromstring(data)
         return tree
 
     def fetchUrls(self, url, data, urlSearch):
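The import churn in this file is a knock-on effect: once the stdlib html module is imported, "from lxml import html" would shadow it, so the module switches to a bare "import lxml" with fully qualified lxml.html / lxml.etree access. A standalone sketch of the disambiguated names, not part of the commit (it imports the submodules explicitly, whereas the module above gets them as a side effect of "from lxml.html.defs import ..."):

    import html         # stdlib module, needed for html.unescape
    import lxml.html    # lxml's HTML parser, under its qualified name
    import lxml.etree   # for the libxml2 version check

    text = html.unescape('foo&amp;bar')                # 'foo&bar'
    tree = lxml.html.document_fromstring('<p>%s</p>' % text)
    print(tree.text_content())                         # foo&bar
    print(lxml.etree.LIBXML_VERSION >= (2, 9, 3))      # True on any recent libxml2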
@@ -172,14 +172,6 @@ def prettyMatcherList(things):
     return "('%s')" % "', '".join(norm)
 
 
-def unescape(text):
-    """Replace HTML entities and character references."""
-    return html.unescape(text)
-
-
-_nopathquote_chars = "-;/=,~*+()@!"
-
-
 def normaliseURL(url):
     """Normalising
     - strips and leading or trailing whitespace,
@@ -188,7 +180,7 @@ def normaliseURL(url):
     """
     url = unicode_safe(url).strip()
     # XXX: brutal hack
-    url = unescape(url)
+    url = html.unescape(url)
 
     pu = list(urlparse(url))
     segments = pu[2].split('/')
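The "# XXX: brutal hack" unescapes before URL parsing because URLs lifted from HTML attributes often still carry entity-encoded ampersands. A stdlib-only illustration with a hypothetical URL:

    import html
    from urllib.parse import urlparse

    raw = ' http://example.com/view?a=1&amp;b=2 '  # as found in an href attribute
    url = html.unescape(raw.strip())               # mirrors normaliseURL's first steps
    print(url)                  # http://example.com/view?a=1&b=2
    print(urlparse(url).query)  # a=1&b=2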
@@ -3,17 +3,18 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2020 Tobias Gruetzmacher
 import codecs
+import html
 import json
 import os
 import re
 import sys
 import time
 
-from lxml import html
+import lxml
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
 
-from dosagelib.util import unescape, get_page
+from dosagelib.util import get_page
 from dosagelib import scraper, http
 
 
@@ -37,7 +38,8 @@ class ComicListUpdater(object):
         """Get an HTML page and parse it with LXML."""
         print("Parsing", url, file=sys.stderr)
         try:
-            data = html.document_fromstring(get_page(url, self.session).text)
+            pagetext = get_page(url, self.session).text
+            data = lxml.html.document_fromstring(pagetext)
             if expand:
                 data.make_links_absolute(url)
             if self.sleep > 0:
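A standalone sketch of the parse-and-expand step above, minus the dosagelib session plumbing (the page text is made up; get_page normally fetches it over HTTP):

    import lxml.html

    pagetext = '<html><body><a href="/comics/a">A</a></body></html>'
    data = lxml.html.document_fromstring(pagetext)
    data.make_links_absolute('https://example.com/list')
    print(data.xpath('//a/@href'))  # ['https://example.com/comics/a']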
@@ -185,7 +187,7 @@ def asciify(name):
 
 def format_name(text):
     """Format a comic name."""
-    name = unescape(text)
+    name = html.unescape(text)
     name = "".join(capfirst(x) for x in name.split(" "))
     name = asciify(name.replace(u'&', u'And').replace(u'@', u'At').replace('ñ', 'n'))
     return name
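A compressed, runnable sketch of the format_name pipeline shown above (capfirst and asciify live elsewhere in this script; capfirst is stubbed here and asciify is skipped so the example stays self-contained):

    import html

    def capfirst(word):
        # Stand-in for the script's helper: upper-case the first letter only.
        return word[:1].upper() + word[1:]

    def format_name(text):
        name = html.unescape(text)
        name = "".join(capfirst(x) for x in name.split(" "))
        return name.replace(u'&', u'And').replace(u'@', u'At')

    print(format_name('foo &amp; bar'))  # FooAndBar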
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2018 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 import re
-from dosagelib.util import normaliseURL, unescape, tagre, get_system_uid
+from dosagelib.util import normaliseURL, tagre, get_system_uid
 
 
 class TestURL(object):
@@ -12,12 +12,6 @@ class TestURL(object):
     Tests for URL utility functions.
     """
 
-    def test_unescape(self):
-        # Test HTML replacement.
-        assert unescape(u'foo&amp;bar') == u'foo&bar'
-        assert unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
-        assert unescape(u'&quot;foo&quot;') == u'"foo"'
-
     def test_normalisation(self):
         # Test URL normalisation.
         assert (normaliseURL('http://example.com//bar/baz&amp;baz') ==
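The deleted test_unescape cases only exercised stdlib behaviour, which is presumably why dropping them loses no coverage; they hold verbatim against html.unescape:

    import html

    assert html.unescape(u'foo&amp;bar') == u'foo&bar'
    assert html.unescape(u'foo&nbsp;bar') == u'foo\xa0bar'
    assert html.unescape(u'&quot;foo&quot;') == u'"foo"'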