Sort scraper modules (mostly for test stability).

This commit is contained in:
Tobias Gruetzmacher 2016-03-13 20:24:21 +01:00
parent 36cc4e32f4
commit 78e13962f9

View file

@ -1,6 +1,8 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher
import requests import requests
import time import time
import random import random
@ -27,21 +29,24 @@ try:
except ImportError: except ImportError:
pycountry = None pycountry = None
from . import loader, configuration, util, languages from . import loader, configuration, languages
from .util import (getPageContent, makeSequence, get_system_uid, urlopen, from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL, prettyMatcherList) getDirname, unescape, tagre, normaliseURL,
prettyMatcherList)
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
from .events import getHandler from .events import getHandler
class Scraper(object): class Scraper(object):
'''Base class for all comic scrapers, but without a specific scrape implementation.''' '''Base class for all comic scrapers, but without a specific scrape
implementation.'''
# The URL for the comic strip # The URL for the comic strip
url = None url = None
# A string that is interpolated with the strip index to yield the URL for a particular strip. # A string that is interpolated with the strip index to yield the URL for a
# particular strip.
stripUrl = None stripUrl = None
# Stop search for previous URLs at this URL # Stop search for previous URLs at this URL
@ -50,7 +55,8 @@ class Scraper(object):
# if more than one image per URL is expected # if more than one image per URL is expected
multipleImagesPerStrip = False multipleImagesPerStrip = False
# set to False if previous URLs do not match the strip URL (i.e. because of redirects) # set to False if previous URLs do not match the strip URL (i.e. because of
# redirects)
prevUrlMatchesStripUrl = True prevUrlMatchesStripUrl = True
# set to True if this comic contains adult content # set to True if this comic contains adult content
@ -122,17 +128,22 @@ class Scraper(object):
# remove duplicate URLs # remove duplicate URLs
imageUrls = set(imageUrls) imageUrls = set(imageUrls)
if len(imageUrls) > 1 and not self.multipleImagesPerStrip: if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch))) out.warn(
u"Found %d images instead of 1 at %s with expressions %s" %
(len(imageUrls), url, prettyMatcherList(self.imageSearch)))
image = sorted(imageUrls)[0] image = sorted(imageUrls)[0]
out.warn(u"Choosing image %s" % image) out.warn(u"Choosing image %s" % image)
imageUrls = (image,) imageUrls = (image,)
elif not imageUrls: elif not imageUrls:
out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch))) out.warn(u"Found no images at %s with expressions %s" % (url,
prettyMatcherList(self.imageSearch)))
if self.textSearch: if self.textSearch:
text = self.fetchText(url, data, self.textSearch, optional=self.textOptional) text = self.fetchText(url, data, self.textSearch,
optional=self.textOptional)
else: else:
text = None text = None
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text) return ComicStrip(self.getName(), url, imageUrls, self.namer,
self.session, text=text)
def getStrips(self, maxstrips=None): def getStrips(self, maxstrips=None):
"""Get comic strips.""" """Get comic strips."""
@ -284,11 +295,11 @@ class Scraper(object):
of fetchUrls and fetchText. of fetchUrls and fetchText.
Implementation notes: While this base class does not restrict how the Implementation notes: While this base class does not restrict how the
returned data is structured, subclasses (specific scrapers) should specify returned data is structured, subclasses (specific scrapers) should
how this data works, since the structure is passed into different methods specify how this data works, since the structure is passed into
which can be defined by comic modules and these methods should be able to different methods which can be defined by comic modules and these
use the data if they so desire... (Affected methods: shouldSkipUrl, methods should be able to use the data if they so desire... (Affected
imageUrlModifier) methods: shouldSkipUrl, imageUrlModifier)
""" """
raise ValueError("No implementation for getPage!") raise ValueError("No implementation for getPage!")
@ -334,6 +345,7 @@ class Scraper(object):
pass pass
return lang return lang
class _BasicScraper(Scraper): class _BasicScraper(Scraper):
""" """
Scraper base class that matches regular expressions against HTML pages. Scraper base class that matches regular expressions against HTML pages.
@ -369,15 +381,18 @@ class _BasicScraper(Scraper):
for match in search.finditer(data[0]): for match in search.finditer(data[0]):
searchUrl = match.group(1) searchUrl = match.group(1)
if not searchUrl: if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url)) raise ValueError("Pattern %s matched empty URL at %s." %
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern)) (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' %
(searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl))) searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
if searchUrls: if searchUrls:
# do not search other links if one pattern matched # do not search other links if one pattern matched
break break
if not searchUrls: if not searchUrls:
patterns = [x.pattern for x in searches] patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url)) raise ValueError("Patterns %s not found at URL %s." %
(patterns, url))
return searchUrls return searchUrls
@classmethod @classmethod
@ -387,12 +402,14 @@ class _BasicScraper(Scraper):
match = textSearch.search(data[0]) match = textSearch.search(data[0])
if match: if match:
text = match.group(1) text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern)) out.debug(u'matched text %r with pattern %s' %
(text, textSearch.pattern))
return unescape(text).strip() return unescape(text).strip()
if optional: if optional:
return None return None
else: else:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url)) raise ValueError("Pattern %s not found at URL %s." %
(textSearch.pattern, url))
else: else:
return None return None
@ -466,7 +483,9 @@ class _ParserScraper(Scraper):
if optional: if optional:
return None return None
else: else:
raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url)) raise ValueError(
"XPath %s did not match anything at URL %s." %
(textSearch, url))
out.debug(u'Matched text %r with XPath %s' % (text, textSearch)) out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
return unescape(text).strip() return unescape(text).strip()
else: else:
@ -476,11 +495,15 @@ class _ParserScraper(Scraper):
def getDisabledReasons(cls): def getDisabledReasons(cls):
res = {} res = {}
if cls.css and cssselect is None: if cls.css and cssselect is None:
res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed." res['css'] = (u"This module needs the cssselect " +
u"(python-cssselect) python module which is " +
u"not installed.")
if html is None: if html is None:
res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed." res['lxml'] = (u"This module needs the lxml (python-lxml) " +
u"python module which is not installed.")
return res return res
def find_scraperclasses(comic, multiple_allowed=False): def find_scraperclasses(comic, multiple_allowed=False):
"""Get a list comic scraper classes. Can return more than one entries if """Get a list comic scraper classes. Can return more than one entries if
multiple_allowed is True, else it raises a ValueError if multiple multiple_allowed is True, else it raises a ValueError if multiple
@ -508,6 +531,8 @@ def find_scraperclasses(comic, multiple_allowed=False):
_scraperclasses = None _scraperclasses = None
def get_scraperclasses(): def get_scraperclasses():
"""Find all comic scraper classes in the plugins directory. """Find all comic scraper classes in the plugins directory.
The result is cached. The result is cached.
@ -519,7 +544,7 @@ def get_scraperclasses():
out.debug(u"Loading comic modules...") out.debug(u"Loading comic modules...")
modules = loader.get_modules('plugins') modules = loader.get_modules('plugins')
plugins = loader.get_plugins(modules, Scraper) plugins = loader.get_plugins(modules, Scraper)
_scraperclasses = list(plugins) _scraperclasses = sorted(plugins, key=lambda p: p.getName())
check_scrapers() check_scrapers()
out.debug(u"... %d modules loaded." % len(_scraperclasses)) out.debug(u"... %d modules loaded." % len(_scraperclasses))
return _scraperclasses return _scraperclasses
@ -533,7 +558,8 @@ def check_scrapers():
if name in d: if name in d:
name1 = scraperclass.getName() name1 = scraperclass.getName()
name2 = d[name].getName() name2 = d[name].getName()
raise ValueError('duplicate scrapers %s and %s found' % (name1, name2)) raise ValueError('duplicate scrapers %s and %s found' %
(name1, name2))
d[name] = scraperclass d[name] = scraperclass