Add support for CSS selectors to the HTML parser.

Each comic module author can decide whether to use CSS or XPath selectors, but not a mix of both. Using CSS requires the cssselect Python module; a comic module that asks for CSS is disabled when cssselect is unavailable.
commit 1d52d6a152
parent 17bc454132
1 changed file with 16 additions and 1 deletion
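
As an illustration of the behaviour this commit enables, a comic module could opt into CSS selectors roughly like this. This is only a sketch: the class name, URL, import path and the imageSearch/prevSearch attribute names are assumptions, only the css class attribute comes from this commit.

    # Hypothetical comic module; everything except the "css" flag is assumed.
    from dosagelib.scraper import _ParserScraper   # assumed import path


    class SomeComic(_ParserScraper):
        url = 'http://somecomic.example.com/'
        css = True                 # use CSS selectors instead of XPath
        imageSearch = 'img#comic'  # CSS selector for the comic image
        prevSearch = 'a.prev'      # CSS selector for the "previous page" link
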
|
@@ -17,6 +17,11 @@ try:
 except ImportError:
     html = None
 
+try:
+    import cssselect
+except ImportError:
+    cssselect = None
+
 from . import loader, configuration, util
 from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
     getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
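
The hunk above uses the same optional-dependency pattern already applied to lxml: import once at module load time and fall back to None, so later code can test the name instead of catching ImportError at every call site. A minimal standalone sketch of that pattern (the helper function is only for illustration, it is not part of the commit):

    try:
        import cssselect
    except ImportError:
        cssselect = None


    def css_selectors_supported():
        # True when the optional cssselect package could be imported.
        return cssselect is not None
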
@@ -404,6 +409,10 @@ class _ParserScraper(Scraper):
     of the HTML element and returns that.
     """
 
+    # Switch between CSS and XPath selectors for this class. Since CSS needs
+    # another Python module, XPath is the default for now.
+    css = False
+
     @classmethod
     def getPage(cls, url):
         tree = html.document_fromstring(getPageContent(url, cls.session))
@@ -414,9 +423,13 @@ class _ParserScraper(Scraper):
     def fetchUrls(cls, url, data, urlSearch):
         """Search all entries for given XPath in a HTML page."""
         searchUrls = []
+        if cls.css:
+            searchFun = data.cssselect
+        else:
+            searchFun = data.xpath
         searches = makeSequence(urlSearch)
         for search in searches:
-            for match in data.xpath(search):
+            for match in searchFun(search):
                 try:
                     for attrib in html_link_attrs:
                         if attrib in match.attrib:
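
To see what the searchFun switch changes, here is the same element located once with an XPath expression and once with a CSS selector on an lxml tree. This snippet is illustrative only and is not part of the commit; element.cssselect() requires the cssselect package to be installed.

    from lxml import html

    doc = html.document_fromstring(
        '<html><body><a class="prev" href="/page/41">Back</a></body></html>')

    by_xpath = doc.xpath('//a[@class="prev"]')   # what css = False modules use
    by_css = doc.cssselect('a.prev')             # what css = True enables
    assert by_xpath[0].get('href') == by_css[0].get('href') == '/page/41'
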
@@ -453,6 +466,8 @@ class _ParserScraper(Scraper):
     @classmethod
     def getDisabledReasons(cls):
         res = {}
+        if cls.css and cssselect is None:
+            res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed."
         if html is None:
             res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed."
         return res
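
With this check in place, a module that sets css = True on a system without cssselect reports itself as disabled instead of failing at runtime. A rough sketch of how the reasons dictionary might be consumed; the reporting code below is an assumption, only getDisabledReasons() comes from the diff:

    reasons = SomeComic.getDisabledReasons()   # SomeComic as in the sketch above
    if reasons:
        # e.g. {'css': 'This module needs the cssselect ... which is not installed.'}
        print('%s is disabled: %s' % (SomeComic.__name__, '; '.join(reasons.values())))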