Sort scraper modules (mostly for test stability).

This commit is contained in:
Tobias Gruetzmacher 2016-03-13 20:24:21 +01:00
parent 36cc4e32f4
commit 78e13962f9

View file

@ -1,6 +1,8 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher
import requests
import time
import random
@ -27,21 +29,24 @@ try:
except ImportError:
pycountry = None
from . import loader, configuration, util, languages
from . import loader, configuration, languages
from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
getDirname, unescape, tagre, normaliseURL,
prettyMatcherList)
from .comic import ComicStrip
from .output import out
from .events import getHandler
class Scraper(object):
'''Base class for all comic scrapers, but without a specific scrape implementation.'''
'''Base class for all comic scrapers, but without a specific scrape
implementation.'''
# The URL for the comic strip
url = None
# A string that is interpolated with the strip index to yield the URL for a particular strip.
# A string that is interpolated with the strip index to yield the URL for a
# particular strip.
stripUrl = None
# Stop search for previous URLs at this URL
@ -50,7 +55,8 @@ class Scraper(object):
# if more than one image per URL is expected
multipleImagesPerStrip = False
# set to False if previous URLs do not match the strip URL (i.e. because of redirects)
# set to False if previous URLs do not match the strip URL (i.e. because of
# redirects)
prevUrlMatchesStripUrl = True
# set to True if this comic contains adult content
@ -122,17 +128,22 @@ class Scraper(object):
# remove duplicate URLs
imageUrls = set(imageUrls)
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
out.warn(
u"Found %d images instead of 1 at %s with expressions %s" %
(len(imageUrls), url, prettyMatcherList(self.imageSearch)))
image = sorted(imageUrls)[0]
out.warn(u"Choosing image %s" % image)
imageUrls = (image,)
elif not imageUrls:
out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch)))
out.warn(u"Found no images at %s with expressions %s" % (url,
prettyMatcherList(self.imageSearch)))
if self.textSearch:
text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
text = self.fetchText(url, data, self.textSearch,
optional=self.textOptional)
else:
text = None
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
return ComicStrip(self.getName(), url, imageUrls, self.namer,
self.session, text=text)
def getStrips(self, maxstrips=None):
"""Get comic strips."""
@ -284,11 +295,11 @@ class Scraper(object):
of fetchUrls and fetchText.
Implementation notes: While this base class does not restrict how the
returned data is structured, subclasses (specific scrapers) should specify
how this data works, since the structure is passed into different methods
which can be defined by comic modules and these methods should be able to
use the data if they so desire... (Affected methods: shouldSkipUrl,
imageUrlModifier)
returned data is structured, subclasses (specific scrapers) should
specify how this data works, since the structure is passed into
different methods which can be defined by comic modules and these
methods should be able to use the data if they so desire... (Affected
methods: shouldSkipUrl, imageUrlModifier)
"""
raise ValueError("No implementation for getPage!")
@ -326,14 +337,15 @@ class Scraper(object):
lang = languages.Languages[cls.lang]
else:
try:
lang = pycountry.languages.get(alpha2 = cls.lang).name
lang = pycountry.languages.get(alpha2=cls.lang).name
except KeyError:
try:
lang = pycountry.languages.get(iso639_1_code = cls.lang).name
lang = pycountry.languages.get(iso639_1_code=cls.lang).name
except KeyError:
pass
return lang
class _BasicScraper(Scraper):
"""
Scraper base class that matches regular expressions against HTML pages.
@ -369,15 +381,18 @@ class _BasicScraper(Scraper):
for match in search.finditer(data[0]):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
raise ValueError("Pattern %s matched empty URL at %s." %
(search.pattern, url))
out.debug(u'matched URL %r with pattern %s' %
(searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
raise ValueError("Patterns %s not found at URL %s." %
(patterns, url))
return searchUrls
@classmethod
@ -387,12 +402,14 @@ class _BasicScraper(Scraper):
match = textSearch.search(data[0])
if match:
text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
out.debug(u'matched text %r with pattern %s' %
(text, textSearch.pattern))
return unescape(text).strip()
if optional:
return None
else:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
raise ValueError("Pattern %s not found at URL %s." %
(textSearch.pattern, url))
else:
return None
@ -466,7 +483,9 @@ class _ParserScraper(Scraper):
if optional:
return None
else:
raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
raise ValueError(
"XPath %s did not match anything at URL %s." %
(textSearch, url))
out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
return unescape(text).strip()
else:
@ -476,11 +495,15 @@ class _ParserScraper(Scraper):
def getDisabledReasons(cls):
res = {}
if cls.css and cssselect is None:
res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed."
res['css'] = (u"This module needs the cssselect " +
u"(python-cssselect) python module which is " +
u"not installed.")
if html is None:
res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed."
res['lxml'] = (u"This module needs the lxml (python-lxml) " +
u"python module which is not installed.")
return res
def find_scraperclasses(comic, multiple_allowed=False):
"""Get a list of comic scraper classes. Can return more than one entry if
multiple_allowed is True, else it raises a ValueError if multiple
@ -508,6 +531,8 @@ def find_scraperclasses(comic, multiple_allowed=False):
_scraperclasses = None
def get_scraperclasses():
"""Find all comic scraper classes in the plugins directory.
The result is cached.
@ -519,7 +544,7 @@ def get_scraperclasses():
out.debug(u"Loading comic modules...")
modules = loader.get_modules('plugins')
plugins = loader.get_plugins(modules, Scraper)
_scraperclasses = list(plugins)
_scraperclasses = sorted(plugins, key=lambda p: p.getName())
check_scrapers()
out.debug(u"... %d modules loaded." % len(_scraperclasses))
return _scraperclasses
@ -533,10 +558,11 @@ def check_scrapers():
if name in d:
name1 = scraperclass.getName()
name2 = d[name].getName()
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
raise ValueError('duplicate scrapers %s and %s found' %
(name1, name2))
d[name] = scraperclass
def make_scraper(classname, scraperType = _BasicScraper, **attributes):
def make_scraper(classname, scraperType=_BasicScraper, **attributes):
"""Make a new scraper class with given name and attributes."""
return type(classname, (scraperType,), attributes)