From 64123eab647a124e84cc116de0c7d0285350ce56 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Fri, 31 Jul 2020 20:14:04 +0200
Subject: [PATCH] Add an xpath extension to match CSS classes

---
 dosagelib/scraper.py |  7 ++-----
 dosagelib/xml.py     | 26 ++++++++++++++++++++++++++
 tests/httpmocks.py   |  4 ++--
 tests/test_xml.py    | 25 +++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 7 deletions(-)
 create mode 100644 dosagelib/xml.py
 create mode 100644 tests/test_xml.py

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index f1ab5376f..33b37790c 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
+from .xml import NS
 
 
 ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
     XML_DECL = re.compile(
         r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
 
-    NS = {
-        "re": "http://exslt.org/regular-expressions"
-    }
-
     # Switch between CSS and XPath selectors for this class. Since CSS needs
     # another Python module, XPath is the default for now.
     css = False
@@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
             searchFun = data.cssselect
         else:
             def searchFun(s):
-                return data.xpath(s, namespaces=self.NS)
+                return data.xpath(s, namespaces=NS)
         patterns = makeSequence(patterns)
         for search in patterns:
             matched = False
diff --git a/dosagelib/xml.py b/dosagelib/xml.py
new file mode 100644
index 000000000..bdac1f73e
--- /dev/null
+++ b/dosagelib/xml.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2020 Tobias Gruetzmacher
+from lxml import etree
+
+
+NS = {
+    'd': 'https://dosage.rocks/xpath',
+    're': 'http://exslt.org/regular-expressions'
+}
+
+
+def find_by_class(context, cls):
+    """Tell whether the XPath context node carries the CSS class ``cls``.
+
+    The ``class`` attribute is split on any run of whitespace, so names
+    separated by tabs, newlines or repeated spaces are matched as well.
+    Nodes without a ``class`` attribute never match.
+    """
+    attributes = context.context_node.attrib
+    if 'class' in attributes:
+        return cls in attributes['class'].split()
+    return False
+
+
+dosagens = etree.FunctionNamespace(NS['d'])
+dosagens['class'] = find_by_class
diff --git a/tests/httpmocks.py b/tests/httpmocks.py
index 3e833b78e..4a50d38b6 100644
--- a/tests/httpmocks.py
+++ b/tests/httpmocks.py
@@ -14,7 +14,7 @@ def _file(name):
 
 
 @lru_cache()
-def _content(name):
+def content(name):
     with gzip.open(_file(name + '.html.gz'), 'r') as f:
         return f.read()
 
@@ -26,7 +26,7 @@ def _img(name):
 
 
 def page(url, pagename):
-    add(GET, url, _content(pagename))
+    add(GET, url, content(pagename))
 
 
 def png(url, name='empty'):
diff --git a/tests/test_xml.py b/tests/test_xml.py
new file mode 100644
index 000000000..0172aa6f1
--- /dev/null
+++ b/tests/test_xml.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2020 Tobias Gruetzmacher
+
+from lxml import html
+
+from dosagelib.xml import NS
+
+import httpmocks
+
+
+tree = html.document_fromstring(httpmocks.content('zp-222'))
+
+
+class TestXML:
+    def xpath(self, path):
+        return tree.xpath(path, namespaces=NS)
+
+    def test_class_ext(self):
+        assert len(self.xpath('//li[d:class("menu-item-3773")]')) == 1
+        assert len(self.xpath('//ul[d:class("menu")]')) == 1
+        assert len(self.xpath('//li[d:class("menu-item-object-custom")]')) == 2
+        assert len(self.xpath('//li[d:class("menu-item")]')) == 25
+
+    def test_re_ext(self):
+        assert len(self.xpath(r'//img[re:test(@src, "posters.*jpg")]')) == 1