Add support for CSS selectors to HTML parser.

Each comic module author can decide whether she wants to use CSS or XPath
selectors, but not a mix of both within one module. Using CSS requires the
cssselect Python module; a comic module that uses CSS is disabled if
cssselect is unavailable.
This commit is contained in:
Tobias Gruetzmacher 2014-10-13 22:43:06 +02:00
parent 17bc454132
commit 1d52d6a152

View file

@ -17,6 +17,11 @@ try:
except ImportError: except ImportError:
html = None html = None
try:
import cssselect
except ImportError:
cssselect = None
from . import loader, configuration, util from . import loader, configuration, util
from .util import (getPageContent, makeSequence, get_system_uid, urlopen, from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL, prettyMatcherList) getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
@ -404,6 +409,10 @@ class _ParserScraper(Scraper):
of the HTML element and returns that. of the HTML element and returns that.
""" """
# Switch between CSS and XPath selectors for this class. Since CSS needs
# another Python module, XPath is the default for now.
css = False
@classmethod @classmethod
def getPage(cls, url): def getPage(cls, url):
tree = html.document_fromstring(getPageContent(url, cls.session)) tree = html.document_fromstring(getPageContent(url, cls.session))
@ -414,9 +423,13 @@ class _ParserScraper(Scraper):
def fetchUrls(cls, url, data, urlSearch): def fetchUrls(cls, url, data, urlSearch):
"""Search all entries for given XPath in a HTML page.""" """Search all entries for given XPath in a HTML page."""
searchUrls = [] searchUrls = []
if cls.css:
searchFun = data.cssselect
else:
searchFun = data.xpath
searches = makeSequence(urlSearch) searches = makeSequence(urlSearch)
for search in searches: for search in searches:
for match in data.xpath(search): for match in searchFun(search):
try: try:
for attrib in html_link_attrs: for attrib in html_link_attrs:
if attrib in match.attrib: if attrib in match.attrib:
@ -453,6 +466,8 @@ class _ParserScraper(Scraper):
@classmethod @classmethod
def getDisabledReasons(cls): def getDisabledReasons(cls):
res = {} res = {}
if cls.css and cssselect is None:
res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed."
if html is None: if html is None:
res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed." res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed."
return res return res