Add an xpath extension to match CSS classes

2020-07-31 20:14:04 +02:00 · 2020-07-31 20:14:04 +02:00 · 64123eab64
commit 64123eab64
parent bd44fdbb79
4 changed files with 49 additions and 7 deletions
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
+from .xml import NS


 ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)

-    NS = {
-        "re": "http://exslt.org/regular-expressions"
-    }
-
    # Switch between CSS and XPath selectors for this class. Since CSS needs
    # another Python module, XPath is the default for now.
    css = False
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
            searchFun = data.cssselect
        else:
            def searchFun(s):
-                return data.xpath(s, namespaces=self.NS)
+                return data.xpath(s, namespaces=NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
--- a/dosagelib/xml.py
+++ b/dosagelib/xml.py
@ -0,0 +1,20 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2020 Tobias Gruetzmacher
+from lxml import etree
+
+
+# Namespace prefix -> URI map passed as ``namespaces=`` to lxml's xpath().
+NS = {
+    # dosage's own XPath extension functions (registered further down)
+    'd': 'https://dosage.rocks/xpath',
+    # EXSLT regular expressions, e.g. re:test()
+    're': 'http://exslt.org/regular-expressions'
+}
+
+
+def find_by_class(context, cls):
+    """XPath extension function: test the context node for a CSS class.
+
+    Registered below as ``class`` in the ``d`` namespace, so XPath
+    expressions call it as ``d:class("name")``.
+
+    :param context: evaluation context handed in by lxml; only its
+        ``context_node.attrib`` mapping is read.
+    :param cls: the class name to look for.
+    :returns: True if ``cls`` is one of the whitespace-separated tokens of
+        the node's ``class`` attribute, False otherwise (including when the
+        attribute is missing).
+    """
+    # split() without an argument splits on any run of whitespace, so
+    # classes separated by tabs, newlines or several spaces still match and
+    # no empty tokens are produced (split(' ') got both cases wrong).
+    return cls in context.context_node.attrib.get('class', '').split()
+
+
+# Runs at import time: register find_by_class with lxml under the 'd'
+# namespace URI so XPath expressions can call it as d:class("name").
+dosagens = etree.FunctionNamespace(NS['d'])
+dosagens['class'] = find_by_class
--- a/tests/httpmocks.py
+++ b/tests/httpmocks.py
@ -14,7 +14,7 @@ def _file(name):


@lru_cache()
-def _content(name):
+def content(name):
    with gzip.open(_file(name + '.html.gz'), 'r') as f:
        return f.read()

@ -26,7 +26,7 @@ def _img(name):


 def page(url, pagename):
-    add(GET, url, _content(pagename))
+    add(GET, url, content(pagename))


 def png(url, name='empty'):
--- a/tests/test_xml.py
+++ b/tests/test_xml.py
@ -0,0 +1,25 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2020 Tobias Gruetzmacher
+
+from lxml import html
+
+from dosagelib.xml import NS
+
+import httpmocks
+
+
+# Parse the 'zp-222' fixture page once at import time; the tests below all
+# query this shared tree.
+tree = html.document_fromstring(httpmocks.content('zp-222'))
+
+
+class TestXML:
+    """Checks the XPath extension namespaces from NS against a saved page."""
+
+    def xpath(self, path):
+        """Evaluate ``path`` on the module-level ``tree`` with NS bound."""
+        nodes = tree.xpath(path, namespaces=NS)
+        return nodes
+
+    def test_class_ext(self):
+        """``d:class`` selects the expected number of elements per class."""
+        expectations = (
+            ('//li[d:class("menu-item-3773")]', 1),
+            ('//ul[d:class("menu")]', 1),
+            ('//li[d:class("menu-item-object-custom")]', 2),
+            ('//li[d:class("menu-item")]', 25),
+        )
+        for path, count in expectations:
+            assert len(self.xpath(path)) == count
+
+    def test_re_ext(self):
+        """``re:test`` from the EXSLT regex namespace is usable in XPath."""
+        posters = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
+        assert len(posters) == 1