Add an xpath extension to match CSS classes
This commit is contained in:
parent
bd44fdbb79
commit
64123eab64
4 changed files with 49 additions and 7 deletions
|
@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
|
|||
from .comic import ComicStrip
|
||||
from .output import out
|
||||
from .events import getHandler
|
||||
from .xml import NS
|
||||
|
||||
|
||||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||
|
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
|
|||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
||||
NS = {
|
||||
"re": "http://exslt.org/regular-expressions"
|
||||
}
|
||||
|
||||
# Switch between CSS and XPath selectors for this class. Since CSS needs
|
||||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
|
|||
searchFun = data.cssselect
|
||||
else:
|
||||
def searchFun(s):
|
||||
return data.xpath(s, namespaces=self.NS)
|
||||
return data.xpath(s, namespaces=NS)
|
||||
patterns = makeSequence(patterns)
|
||||
for search in patterns:
|
||||
matched = False
|
||||
|
|
20
dosagelib/xml.py
Normal file
20
dosagelib/xml.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# Prefix -> namespace URI map handed to lxml XPath calls via ``namespaces=``.
NS = {
    # Namespace under which this module registers its own XPath extension
    # functions (see the FunctionNamespace registration below), e.g. d:class().
    'd': 'https://dosage.rocks/xpath',
    # EXSLT regular-expressions namespace, used as re:test(...) in queries.
    're': 'http://exslt.org/regular-expressions'
}
|
||||
|
||||
|
||||
def find_by_class(context, cls):
    """XPath extension function: does the context node carry CSS class *cls*?

    Registered in this module as ``class`` in the dosage XPath namespace, so
    it can be used in a predicate such as ``//li[d:class("menu-item")]``.

    :param context: evaluation context supplied by lxml; only its
        ``context_node.attrib`` mapping is read.
    :param cls: the class name to look for (a single token).
    :returns: ``True`` if *cls* is one of the whitespace-separated tokens of
        the node's ``class`` attribute, ``False`` otherwise (including when
        the node has no ``class`` attribute at all).
    """
    classes = context.context_node.attrib.get('class')
    if classes is None:
        return False
    # Class lists may be separated by any whitespace (tabs, newlines, runs of
    # spaces), not just a single ' '. A bare split() handles all of those and
    # never yields empty tokens, so an empty *cls* cannot match by accident.
    return cls in classes.split()
|
||||
|
||||
|
||||
# Register the extension functions with lxml at import time: look up the
# function namespace for the 'd' URI and expose find_by_class under the
# local name ``class``, so XPath expressions can call it as d:class(...).
dosagens = etree.FunctionNamespace(NS['d'])
dosagens['class'] = find_by_class
|
|
@ -14,7 +14,7 @@ def _file(name):
|
|||
|
||||
|
||||
@lru_cache()
|
||||
def _content(name):
|
||||
def content(name):
|
||||
with gzip.open(_file(name + '.html.gz'), 'r') as f:
|
||||
return f.read()
|
||||
|
||||
|
@ -26,7 +26,7 @@ def _img(name):
|
|||
|
||||
|
||||
def page(url, pagename):
|
||||
add(GET, url, _content(pagename))
|
||||
add(GET, url, content(pagename))
|
||||
|
||||
|
||||
def png(url, name='empty'):
|
||||
|
|
25
tests/test_xml.py
Normal file
25
tests/test_xml.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||
|
||||
from lxml import html
|
||||
|
||||
from dosagelib.xml import NS
|
||||
|
||||
import httpmocks
|
||||
|
||||
|
||||
# Parse the 'zp-222' fixture page once at import time; every test in this
# module runs its XPath queries against this shared document tree.
tree = html.document_fromstring(httpmocks.content('zp-222'))
|
||||
|
||||
|
||||
class TestXML:
    """Checks the XPath extension namespaces against the parsed fixture page."""

    def xpath(self, path):
        """Evaluate *path* on the shared tree using the dosage namespace map."""
        matches = tree.xpath(path, namespaces=NS)
        return matches

    def test_class_ext(self):
        """The d:class() extension selects the expected number of elements."""
        expected_counts = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for query, count in expected_counts:
            assert len(self.xpath(query)) == count

    def test_re_ext(self):
        """The EXSLT re:test() extension is usable through the same map."""
        posters = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(posters) == 1
|
Loading…
Reference in a new issue