Add an xpath extension to match CSS classes

This commit is contained in:
Tobias Gruetzmacher 2020-07-31 20:14:04 +02:00
parent bd44fdbb79
commit 64123eab64
4 changed files with 49 additions and 7 deletions

View file

@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
from .comic import ComicStrip
from .output import out
from .events import getHandler
from .xml import NS
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
NS = {
"re": "http://exslt.org/regular-expressions"
}
# Switch between CSS and XPath selectors for this class. Since CSS needs
# another Python module, XPath is the default for now.
css = False
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
searchFun = data.cssselect
else:
def searchFun(s):
return data.xpath(s, namespaces=self.NS)
return data.xpath(s, namespaces=NS)
patterns = makeSequence(patterns)
for search in patterns:
matched = False

20
dosagelib/xml.py Normal file
View file

@ -0,0 +1,20 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import etree
# Prefix -> namespace URI mapping to pass as ``namespaces=`` when evaluating
# XPath expressions:
#   d  - this project's own XPath extension functions
#   re - EXSLT regular expression functions
NS = dict(
    d='https://dosage.rocks/xpath',
    re='http://exslt.org/regular-expressions',
)
def find_by_class(context, cls):
    """Tell whether the XPath context node carries the CSS class ``cls``.

    Meant to be called as an XPath extension function, e.g.
    ``//li[d:class("menu-item")]``.

    :param context: evaluation context handed in by the XPath engine; its
        ``context_node.attrib`` mapping is inspected.
    :param cls: the single class name to look for.
    :returns: ``True`` if ``cls`` is one of the whitespace-separated tokens
        of the node's ``class`` attribute, ``False`` otherwise (including
        when the node has no ``class`` attribute at all).
    """
    class_attr = context.context_node.attrib.get('class', '')
    # Class tokens may be separated by any whitespace (tabs, newlines, runs
    # of spaces), not just a single space. split() without an argument
    # handles all of these and never yields empty tokens.
    return cls in class_attr.split()
# Register the extension function with lxml under the 'd' namespace URI at
# import time. XPath expressions evaluated with ``namespaces=NS`` can then
# call it as ``d:class("name")``.
dosagens = etree.FunctionNamespace(NS['d'])
dosagens['class'] = find_by_class

View file

@ -14,7 +14,7 @@ def _file(name):
@lru_cache()
def _content(name):
def content(name):
with gzip.open(_file(name + '.html.gz'), 'r') as f:
return f.read()
@ -26,7 +26,7 @@ def _img(name):
def page(url, pagename):
add(GET, url, _content(pagename))
add(GET, url, content(pagename))
def png(url, name='empty'):

25
tests/test_xml.py Normal file
View file

@ -0,0 +1,25 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import html
from dosagelib.xml import NS
import httpmocks
# Parse the 'zp-222' mock page once at import time; all tests below query
# this shared document.
tree = html.document_fromstring(httpmocks.content('zp-222'))
class TestXML:
    """Checks for the XPath extension namespaces exposed through ``NS``."""

    def xpath(self, path):
        """Evaluate ``path`` against the shared fixture document."""
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        """``d:class()`` matches whole class tokens, not substrings."""
        expected_counts = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for query, count in expected_counts:
            assert len(self.xpath(query)) == count

    def test_re_ext(self):
        """The EXSLT ``re:test()`` function is usable via the ``re`` prefix."""
        matches = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(matches) == 1