Sort scraper modules (mostly for test stability).
This commit is contained in:
parent
36cc4e32f4
commit
78e13962f9
1 changed file with 54 additions and 28 deletions
|
@ -1,6 +1,8 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
|
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
|
@ -27,21 +29,24 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pycountry = None
|
pycountry = None
|
||||||
|
|
||||||
from . import loader, configuration, util, languages
|
from . import loader, configuration, languages
|
||||||
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
|
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
|
||||||
getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
|
getDirname, unescape, tagre, normaliseURL,
|
||||||
|
prettyMatcherList)
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
|
||||||
|
|
||||||
class Scraper(object):
|
class Scraper(object):
|
||||||
'''Base class for all comic scrapers, but without a specific scrape implementation.'''
|
'''Base class for all comic scrapers, but without a specific scrape
|
||||||
|
implementation.'''
|
||||||
|
|
||||||
# The URL for the comic strip
|
# The URL for the comic strip
|
||||||
url = None
|
url = None
|
||||||
|
|
||||||
# A string that is interpolated with the strip index to yield the URL for a particular strip.
|
# A string that is interpolated with the strip index to yield the URL for a
|
||||||
|
# particular strip.
|
||||||
stripUrl = None
|
stripUrl = None
|
||||||
|
|
||||||
# Stop search for previous URLs at this URL
|
# Stop search for previous URLs at this URL
|
||||||
|
@ -50,7 +55,8 @@ class Scraper(object):
|
||||||
# if more than one image per URL is expected
|
# if more than one image per URL is expected
|
||||||
multipleImagesPerStrip = False
|
multipleImagesPerStrip = False
|
||||||
|
|
||||||
# set to False if previous URLs do not match the strip URL (ie. because of redirects)
|
# set to False if previous URLs do not match the strip URL (ie. because of
|
||||||
|
# redirects)
|
||||||
prevUrlMatchesStripUrl = True
|
prevUrlMatchesStripUrl = True
|
||||||
|
|
||||||
# set to True if this comic contains adult content
|
# set to True if this comic contains adult content
|
||||||
|
@ -122,17 +128,22 @@ class Scraper(object):
|
||||||
# remove duplicate URLs
|
# remove duplicate URLs
|
||||||
imageUrls = set(imageUrls)
|
imageUrls = set(imageUrls)
|
||||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||||
out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
|
out.warn(
|
||||||
|
u"Found %d images instead of 1 at %s with expressions %s" %
|
||||||
|
(len(imageUrls), url, prettyMatcherList(self.imageSearch)))
|
||||||
image = sorted(imageUrls)[0]
|
image = sorted(imageUrls)[0]
|
||||||
out.warn(u"Choosing image %s" % image)
|
out.warn(u"Choosing image %s" % image)
|
||||||
imageUrls = (image,)
|
imageUrls = (image,)
|
||||||
elif not imageUrls:
|
elif not imageUrls:
|
||||||
out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch)))
|
out.warn(u"Found no images at %s with expressions %s" % (url,
|
||||||
|
prettyMatcherList(self.imageSearch)))
|
||||||
if self.textSearch:
|
if self.textSearch:
|
||||||
text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
|
text = self.fetchText(url, data, self.textSearch,
|
||||||
|
optional=self.textOptional)
|
||||||
else:
|
else:
|
||||||
text = None
|
text = None
|
||||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
|
return ComicStrip(self.getName(), url, imageUrls, self.namer,
|
||||||
|
self.session, text=text)
|
||||||
|
|
||||||
def getStrips(self, maxstrips=None):
|
def getStrips(self, maxstrips=None):
|
||||||
"""Get comic strips."""
|
"""Get comic strips."""
|
||||||
|
@ -284,11 +295,11 @@ class Scraper(object):
|
||||||
of fetchUrls and fetchText.
|
of fetchUrls and fetchText.
|
||||||
|
|
||||||
Implementation notes: While this base class does not restrict how the
|
Implementation notes: While this base class does not restrict how the
|
||||||
returned data is structured, subclasses (specific scrapers) should specify
|
returned data is structured, subclasses (specific scrapers) should
|
||||||
how this data works, since the structure is passed into different methods
|
specify how this data works, since the structure is passed into
|
||||||
which can be defined by comic modules and these methods should be able to
|
different methods which can be defined by comic modules and these
|
||||||
use the data if they so desire... (Affected methods: shouldSkipUrl,
|
methods should be able to use the data if they so desire... (Affected
|
||||||
imageUrlModifier)
|
methods: shouldSkipUrl, imageUrlModifier)
|
||||||
"""
|
"""
|
||||||
raise ValueError("No implementation for getPage!")
|
raise ValueError("No implementation for getPage!")
|
||||||
|
|
||||||
|
@ -334,6 +345,7 @@ class Scraper(object):
|
||||||
pass
|
pass
|
||||||
return lang
|
return lang
|
||||||
|
|
||||||
|
|
||||||
class _BasicScraper(Scraper):
|
class _BasicScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
Scraper base class that matches regular expressions against HTML pages.
|
Scraper base class that matches regular expressions against HTML pages.
|
||||||
|
@ -369,15 +381,18 @@ class _BasicScraper(Scraper):
|
||||||
for match in search.finditer(data[0]):
|
for match in search.finditer(data[0]):
|
||||||
searchUrl = match.group(1)
|
searchUrl = match.group(1)
|
||||||
if not searchUrl:
|
if not searchUrl:
|
||||||
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
raise ValueError("Pattern %s matched empty URL at %s." %
|
||||||
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
(search.pattern, url))
|
||||||
|
out.debug(u'matched URL %r with pattern %s' %
|
||||||
|
(searchUrl, search.pattern))
|
||||||
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
|
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
|
||||||
if searchUrls:
|
if searchUrls:
|
||||||
# do not search other links if one pattern matched
|
# do not search other links if one pattern matched
|
||||||
break
|
break
|
||||||
if not searchUrls:
|
if not searchUrls:
|
||||||
patterns = [x.pattern for x in searches]
|
patterns = [x.pattern for x in searches]
|
||||||
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
|
raise ValueError("Patterns %s not found at URL %s." %
|
||||||
|
(patterns, url))
|
||||||
return searchUrls
|
return searchUrls
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -387,12 +402,14 @@ class _BasicScraper(Scraper):
|
||||||
match = textSearch.search(data[0])
|
match = textSearch.search(data[0])
|
||||||
if match:
|
if match:
|
||||||
text = match.group(1)
|
text = match.group(1)
|
||||||
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
|
out.debug(u'matched text %r with pattern %s' %
|
||||||
|
(text, textSearch.pattern))
|
||||||
return unescape(text).strip()
|
return unescape(text).strip()
|
||||||
if optional:
|
if optional:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
|
raise ValueError("Pattern %s not found at URL %s." %
|
||||||
|
(textSearch.pattern, url))
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -466,7 +483,9 @@ class _ParserScraper(Scraper):
|
||||||
if optional:
|
if optional:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
|
raise ValueError(
|
||||||
|
"XPath %s did not match anything at URL %s." %
|
||||||
|
(textSearch, url))
|
||||||
out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
|
out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
|
||||||
return unescape(text).strip()
|
return unescape(text).strip()
|
||||||
else:
|
else:
|
||||||
|
@ -476,11 +495,15 @@ class _ParserScraper(Scraper):
|
||||||
def getDisabledReasons(cls):
|
def getDisabledReasons(cls):
|
||||||
res = {}
|
res = {}
|
||||||
if cls.css and cssselect is None:
|
if cls.css and cssselect is None:
|
||||||
res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed."
|
res['css'] = (u"This module needs the cssselect " +
|
||||||
|
u"(python-cssselect) python module which is " +
|
||||||
|
u"not installed.")
|
||||||
if html is None:
|
if html is None:
|
||||||
res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed."
|
res['lxml'] = (u"This module needs the lxml (python-lxml) " +
|
||||||
|
u"python module which is not installed.")
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def find_scraperclasses(comic, multiple_allowed=False):
|
def find_scraperclasses(comic, multiple_allowed=False):
|
||||||
"""Get a list of comic scraper classes. Can return more than one entry if
|
"""Get a list of comic scraper classes. Can return more than one entry if
|
||||||
multiple_allowed is True, else it raises a ValueError if multiple
|
multiple_allowed is True, else it raises a ValueError if multiple
|
||||||
|
@ -508,6 +531,8 @@ def find_scraperclasses(comic, multiple_allowed=False):
|
||||||
|
|
||||||
|
|
||||||
_scraperclasses = None
|
_scraperclasses = None
|
||||||
|
|
||||||
|
|
||||||
def get_scraperclasses():
|
def get_scraperclasses():
|
||||||
"""Find all comic scraper classes in the plugins directory.
|
"""Find all comic scraper classes in the plugins directory.
|
||||||
The result is cached.
|
The result is cached.
|
||||||
|
@ -519,7 +544,7 @@ def get_scraperclasses():
|
||||||
out.debug(u"Loading comic modules...")
|
out.debug(u"Loading comic modules...")
|
||||||
modules = loader.get_modules('plugins')
|
modules = loader.get_modules('plugins')
|
||||||
plugins = loader.get_plugins(modules, Scraper)
|
plugins = loader.get_plugins(modules, Scraper)
|
||||||
_scraperclasses = list(plugins)
|
_scraperclasses = sorted(plugins, key=lambda p: p.getName())
|
||||||
check_scrapers()
|
check_scrapers()
|
||||||
out.debug(u"... %d modules loaded." % len(_scraperclasses))
|
out.debug(u"... %d modules loaded." % len(_scraperclasses))
|
||||||
return _scraperclasses
|
return _scraperclasses
|
||||||
|
@ -533,7 +558,8 @@ def check_scrapers():
|
||||||
if name in d:
|
if name in d:
|
||||||
name1 = scraperclass.getName()
|
name1 = scraperclass.getName()
|
||||||
name2 = d[name].getName()
|
name2 = d[name].getName()
|
||||||
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
|
raise ValueError('duplicate scrapers %s and %s found' %
|
||||||
|
(name1, name2))
|
||||||
d[name] = scraperclass
|
d[name] = scraperclass
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue