Add an xpath extension to match CSS classes
This commit is contained in:
parent
bd44fdbb79
commit
64123eab64
4 changed files with 49 additions and 7 deletions
|
@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
from .xml import NS
|
||||||
|
|
||||||
|
|
||||||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||||
|
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
|
||||||
XML_DECL = re.compile(
|
XML_DECL = re.compile(
|
||||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||||
|
|
||||||
NS = {
|
|
||||||
"re": "http://exslt.org/regular-expressions"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Switch between CSS and XPath selectors for this class. Since CSS needs
|
# Switch between CSS and XPath selectors for this class. Since CSS needs
|
||||||
# another Python module, XPath is the default for now.
|
# another Python module, XPath is the default for now.
|
||||||
css = False
|
css = False
|
||||||
|
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
|
||||||
searchFun = data.cssselect
|
searchFun = data.cssselect
|
||||||
else:
|
else:
|
||||||
def searchFun(s):
|
def searchFun(s):
|
||||||
return data.xpath(s, namespaces=self.NS)
|
return data.xpath(s, namespaces=NS)
|
||||||
patterns = makeSequence(patterns)
|
patterns = makeSequence(patterns)
|
||||||
for search in patterns:
|
for search in patterns:
|
||||||
matched = False
|
matched = False
|
||||||
|
|
20
dosagelib/xml.py
Normal file
20
dosagelib/xml.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
|
# Namespace prefixes made available to every XPath expression that is
# evaluated with ``namespaces=NS``.
NS = {
    # Project-specific extension functions (e.g. ``d:class``).
    'd': 'https://dosage.rocks/xpath',
    # EXSLT regular-expression functions (e.g. ``re:test``).
    're': 'http://exslt.org/regular-expressions',
}
|
||||||
|
|
||||||
|
|
||||||
|
def find_by_class(context, cls):
    """XPath extension: report whether the context node has CSS class *cls*.

    :param context: evaluation context handed in by the XPath engine; only
        ``context.context_node.attrib`` is read.
    :param cls: the single class name to look for.
    :return: ``True`` if *cls* is one of the whitespace-separated tokens of
        the node's ``class`` attribute, ``False`` otherwise (including when
        the node has no ``class`` attribute at all).
    """
    classes = context.context_node.attrib.get('class')
    if classes is None:
        return False
    # Class tokens may be separated by any whitespace (tabs, newlines, runs
    # of spaces), so split on whitespace in general instead of on a single
    # space character; this also avoids empty tokens.
    return cls in classes.split()
|
||||||
|
|
||||||
|
|
||||||
|
# Function namespace for the URI stored under NS['d']; the assignment that
# follows registers find_by_class in it under the name 'class'.
dosagens = etree.FunctionNamespace(NS['d'])
|
||||||
|
dosagens['class'] = find_by_class
|
|
@ -14,7 +14,7 @@ def _file(name):
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def _content(name):
|
def content(name):
|
||||||
with gzip.open(_file(name + '.html.gz'), 'r') as f:
|
with gzip.open(_file(name + '.html.gz'), 'r') as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ def _img(name):
|
||||||
|
|
||||||
|
|
||||||
def page(url, pagename):
|
def page(url, pagename):
|
||||||
add(GET, url, _content(pagename))
|
add(GET, url, content(pagename))
|
||||||
|
|
||||||
|
|
||||||
def png(url, name='empty'):
|
def png(url, name='empty'):
|
||||||
|
|
25
tests/test_xml.py
Normal file
25
tests/test_xml.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
from dosagelib.xml import NS
|
||||||
|
|
||||||
|
import httpmocks
|
||||||
|
|
||||||
|
|
||||||
|
tree = html.document_fromstring(httpmocks.content('zp-222'))
|
||||||
|
|
||||||
|
|
||||||
|
class TestXML:
    """Exercise the XPath extension namespaces declared in ``NS``."""

    def xpath(self, path):
        """Evaluate *path* on the module-level fixture tree using ``NS``."""
        matches = tree.xpath(path, namespaces=NS)
        return matches

    def test_class_ext(self):
        # (query, expected number of matching nodes), checked in order.
        expectations = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for query, count in expectations:
            assert len(self.xpath(query)) == count

    def test_re_ext(self):
        query = r'//img[re:test(@src, "posters.*jpg")]'
        assert len(self.xpath(query)) == 1
|
Loading…
Reference in a new issue