dosage/dosagelib/scraper.py

263 lines
9 KiB
Python
Raw Normal View History

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
2013-01-28 17:52:26 +00:00
# Copyright (C) 2012-2013 Bastian Kleineidam
2012-12-12 16:41:29 +00:00
import requests
2012-10-11 10:03:12 +00:00
from . import loader
2013-02-11 18:43:46 +00:00
from .util import fetchUrl, fetchUrls, getPageContent
2012-10-11 10:03:12 +00:00
from .comic import ComicStrip
2012-10-11 17:53:37 +00:00
from .output import out
2012-06-20 19:58:13 +00:00
2012-10-11 10:03:12 +00:00
class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    Concrete scrapers subclass this class and override the class
    attributes documented below.

    @type url: C{string}
    @cvar url: The URL for the comic strip.
    @type stripUrl: C{string}
    @cvar stripUrl: A string that is interpolated with the strip index
        to yield the URL for a particular strip.
    @type firstStripUrl: C{string} optional
    @cvar firstStripUrl: Stop searching for previous URLs at this URL.
        If not set and no previous URL is found a warning is printed.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
        when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''

    # stop at this URL
    firstStripUrl = None

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set of URLs that have no image (eg. only a video link)
    noImageUrls = set()

    # set to False if previous URLs do not match the strip URL
    # (ie. because of redirects)
    prevUrlMatchesStripUrl = True

    # set to True if this comic contains adult content
    adult = False

    # a description of the comic contents
    description = ''

    # usually the index format help
    help = ''

    # HTTP session storing cookies; as a class attribute it is shared by
    # every scraper that does not override it
    session = requests.session()

    def __init__(self, indexes=None):
        """Initialize internal variables.

        @param indexes: optional iterable of strip indexes; stored as a
            tuple so that it can take part in hashing and comparison
        """
        self.urls = set()
        if indexes:
            self.indexes = tuple(indexes)
        else:
            self.indexes = tuple()

    def __cmp__(self, other):
        """Compare scraper by name and index list.

        Relies on the C{cmp} builtin. Anything that is not a scraper
        compares as smaller than a scraper.
        """
        if not isinstance(other, _BasicScraper):
            return 1
        # first, order by name
        d = cmp(self.get_name(), other.get_name())
        if d != 0:
            return d
        # then by indexes
        return cmp(self.indexes, other.indexes)

    def __hash__(self):
        """Get hash value from name and index list."""
        return hash((self.get_name(), self.indexes))

    def getCurrentStrips(self):
        """Get current comic strip.

        This is a generator. With indexes set it yields one strip per
        index, otherwise it yields the strip found at the latest URL.
        URLs listed in noImageUrls are skipped with an info message.
        """
        msg = 'Retrieving the current strip'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        out.info(msg+"...")
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                if url in self.noImageUrls:
                    out.info('Skipping no-image URL %s' % url)
                else:
                    yield self.getStrip(url)
        else:
            url = self.getLatestUrl()
            if url in self.noImageUrls:
                out.info('Skipping no-image URL %s' % url)
            else:
                # Reuse the URL that was just checked instead of asking
                # getLatestUrl() a second time: an overriding
                # implementation could be expensive or return a
                # different URL than the one tested above.
                yield self.getStrip(url)

    def getStrip(self, url):
        """Get comic strip for given URL.

        Fetches the page, collects the image URLs matched by imageSearch
        and warns if more than one image is found although
        multipleImagesPerStrip is not set.

        @param url: the strip page URL
        @return: the result of getComicStrip() for the found image URLs
        """
        data, baseUrl = getPageContent(url, self.session)
        imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
        return self.getComicStrip(url, imageUrls)

    def getComicStrip(self, url, imageUrls):
        """Get comic strip downloader for given URL and images.

        @param url: the strip page URL
        @param imageUrls: set of image URLs found on that page
        @return: a ComicStrip built with this scraper's name, namer and
            HTTP session
        """
        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)

    def getAllStrips(self, maxstrips=None):
        """Get all comic strips.

        This is a generator. With indexes set, the search for previous
        strips starts at each index URL in turn and maxstrips is passed
        unchanged to every one of those searches; otherwise the search
        starts at the latest URL.

        @param maxstrips: optional limit handed to getStripsFor()
        """
        if maxstrips:
            msg = 'Retrieving %d strips' % maxstrips
        else:
            msg = 'Retrieving all strips'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        if self.adult:
            msg += " (including adult content)"
        out.info(msg)
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                for strip in self.getStripsFor(url, maxstrips):
                    yield strip
        else:
            url = self.getLatestUrl()
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for an URL. If maxstrips is a positive number, stop after
        retrieving the given number of strips.

        This is a generator that starts at the given URL and follows the
        links matched by prevSearch. It stops when firstStripUrl is
        reached, when no previous URL is found, when a previous URL has
        already been visited, or when the maxstrips budget is used up.

        @param url: the strip page URL to start from
        @param maxstrips: None for no limit, else the number of pages to
            visit; pages skipped because of noImageUrls count as well
        """
        seen_urls = set()
        while url:
            # The page is fetched even for no-image URLs since its
            # content is still needed to find the previous URL.
            data, baseUrl = getPageContent(url, self.session)
            if url in self.noImageUrls:
                out.info('Skipping no-image URL %s' % url)
            else:
                imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
                yield self.getComicStrip(url, imageUrls)
            if self.firstStripUrl == url:
                out.debug("Stop at first URL %s" % url)
                break
            prevUrl = None
            if self.prevSearch:
                try:
                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
                except ValueError as msg:
                    # assume there is no previous URL, but print a warning
                    out.warn("%s Assuming no previous comic strips exist." % msg)
                else:
                    prevUrl = self.prevUrlModifier(prevUrl)
                    out.debug("Matched previous URL %s" % prevUrl)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn("Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break

    def setStrip(self, index):
        """Set current comic strip URL.

        @param index: strip index that is interpolated into stripUrl
        """
        self.currentUrl = self.stripUrl % index

    @classmethod
    def get_name(cls):
        """Get scraper name.

        @return: the C{name} attribute if there is one, else the class name
        """
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips."""
        return cls.url

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        The default implementation always returns None.
        NOTE(review): presumably None lets the strip downloader choose a
        name itself - confirm in ComicStrip.
        """
        return None

    @classmethod
    def prevUrlModifier(cls, prevUrl):
        """Optional modification of parsed previous URLs. Useful if
        there are domain redirects. The default implementation does
        not modify the URL.
        """
        return prevUrl

    def getFilename(self, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        Delegates to namer().
        """
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips.

        Delegates to starter().
        """
        return self.starter()
2013-02-13 21:18:05 +00:00
def find_scraperclasses(comic, multiple_allowed=False):
    """Get a list of comic scraper classes matching the given name.

    The match is a case insensitive substring search. An exact name
    match wins immediately unless multiple_allowed is True.

    @param comic: the (partial) comic name to look for
    @param multiple_allowed: if True, return every matching class
        instead of raising an error when more than one class matches
    @return: list of matching scraper classes, never empty
    @raise ValueError: if comic is empty, if nothing matches, or if
        several classes match and multiple_allowed is False
    """
    if not comic:
        raise ValueError("empty comic name")
    wanted = comic.lower()
    matches = []
    for klass in get_scraperclasses():
        lowered = klass.get_name().lower()
        if lowered == wanted and not multiple_allowed:
            # perfect match
            return [klass]
        if wanted in lowered:
            # also covers the perfect match when multiples are allowed
            matches.append(klass)
    if not matches:
        raise ValueError('comic %r not found' % comic)
    if len(matches) > 1 and not multiple_allowed:
        names = ", ".join([klass.get_name() for klass in matches])
        raise ValueError('multiple comics found: %s' % names)
    return matches
2012-06-20 19:58:13 +00:00
# cache for get_scraperclasses(); None means "not loaded yet"
_scraperclasses = None


def get_scraperclasses():
    """Find all comic scraper classes in the plugins directory.
    The result is cached.

    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    @raise ValueError: if check_scrapers() finds duplicate scraper names
    """
    global _scraperclasses
    if _scraperclasses is None:
        out.debug("Loading comic modules...")
        modules = loader.get_modules()
        plugins = loader.get_plugins(modules, _BasicScraper)
        # check_scrapers() reads the module-level list, so it has to be
        # assigned before the check runs.
        _scraperclasses = list(plugins)
        try:
            check_scrapers()
        except Exception:
            # Do not keep a list that failed validation: otherwise the
            # next call would silently return the unchecked classes.
            _scraperclasses = None
            raise
        out.debug("... %d modules loaded." % len(_scraperclasses))
    return _scraperclasses
2012-06-20 19:58:13 +00:00
def check_scrapers():
    """Check for duplicate scraper class names.

    Names are compared case insensitively over the module-level
    _scraperclasses list.

    @raise ValueError: if two scraper classes share a name
    """
    seen = {}
    for klass in _scraperclasses:
        key = klass.get_name().lower()
        if key in seen:
            # report the current class first, then the earlier one
            raise ValueError('duplicate scrapers %s and %s found' % (klass.get_name(), seen[key].get_name()))
        seen[key] = klass
2012-11-26 06:14:02 +00:00
def make_scraper(classname, **attributes):
    """Make a new scraper class with given name and attributes.

    @param classname: name of the new class
    @param attributes: class attributes of the new class
    @return: a new subclass of _BasicScraper
    """
    bases = (_BasicScraper,)
    scraperclass = type(classname, bases, attributes)
    return scraperclass