Add an xpath extension to match CSS classes

2020-07-31 20:14:04 +02:00 · 2020-07-31 20:14:04 +02:00 · 64123eab64
commit 64123eab64
parent bd44fdbb79
4 changed files with 49 additions and 7 deletions
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
 from .xml import NS
 ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
    NS = {
        "re": "http://exslt.org/regular-expressions"
    }
    # Switch between CSS and XPath selectors for this class. Since CSS needs
    # another Python module, XPath is the default for now.
    css = False
@@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
            searchFun = data.cssselect
        else:
            def searchFun(s):
-                return data.xpath(s, namespaces=self.NS)
+                return data.xpath(s, namespaces=NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
--- a/dosagelib/xml.py
+++ b/dosagelib/xml.py
@@ -0,0 +1,20 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2020 Tobias Gruetzmacher
 from lxml import etree
 # Prefix -> namespace URI map handed to XPath queries as ``namespaces=``.
 NS = {
    # Project-specific extension functions, used as d:<name>(...).
    "d": "https://dosage.rocks/xpath",
    # EXSLT regular expression functions, used as re:<name>(...).
    "re": "http://exslt.org/regular-expressions",
 }
 def find_by_class(context, cls):
    """Tell whether the XPath context node carries the CSS class *cls*.

    Written as an XPath extension function: *context* is the evaluation
    context, whose ``context_node`` is the element being tested, and *cls*
    is the class name given in the XPath expression.

    Returns True when *cls* equals one of the whitespace-separated tokens
    of the node's ``class`` attribute, False otherwise (including when the
    node has no ``class`` attribute at all).
    """
    class_attr = context.context_node.attrib.get('class')
    if class_attr is None:
        return False
    # split() without an argument treats any run of whitespace (spaces, tabs,
    # newlines) as one separator, so class lists wrapped over several lines
    # or padded with extra spaces are tokenised correctly.
    return cls in class_attr.split()
 # Register find_by_class under the name "class" in the 'd' namespace, so
 # XPath queries evaluated with namespaces=NS can use d:class("name").
 dosagens = etree.FunctionNamespace(NS['d'])
 dosagens['class'] = find_by_class
--- a/tests/httpmocks.py
+++ b/tests/httpmocks.py
@@ -14,7 +14,7 @@ def _file(name):
@lru_cache()
-def _content(name):
+def content(name):
    with gzip.open(_file(name + '.html.gz'), 'r') as f:
        return f.read()
@@ -26,7 +26,7 @@ def _img(name):
 def page(url, pagename):
-    add(GET, url, _content(pagename))
+    add(GET, url, content(pagename))
 def png(url, name='empty'):
--- a/tests/test_xml.py
+++ b/tests/test_xml.py
@@ -0,0 +1,25 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2020 Tobias Gruetzmacher
 from lxml import html
 from dosagelib.xml import NS
 import httpmocks
 # Parse the 'zp-222' fixture page once at import time; the tests in this
 # module all query this shared tree.
 tree = html.document_fromstring(httpmocks.content('zp-222'))
 class TestXML:
    """Exercise the XPath extensions reachable through the NS prefixes."""

    def xpath(self, path):
        """Evaluate *path* on the shared fixture tree with NS bound."""
        matches = tree.xpath(path, namespaces=NS)
        return matches

    def test_class_ext(self):
        """d:class() selects the expected number of elements per class."""
        expected_counts = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for query, count in expected_counts:
            assert len(self.xpath(query)) == count

    def test_re_ext(self):
        """The re:test() function is usable through the 're' prefix."""
        posters = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(posters) == 1