# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
import requests
from . import loader
from .util import fetchUrl, fetchUrls, getPageContent
from .comic import ComicStrip
from .output import out


class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    @type url: C{string}
    @cvar url: The URL for the comic strip.
    @type stripUrl: C{string}
    @cvar stripUrl: A string that is interpolated with the strip index
        to yield the URL for a particular strip.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
        when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to False if previous URLs do not match the strip URL (ie. because of redirects)
    prevUrlMatchesStripUrl = True

    # set to True if this comic contains adult content
    adult = False

    # a description of the comic contents
    description = ''

    # usually the index format help
    help = ''

    # HTTP session storing cookies; class-level so all scrapers share it.
    # Note: requests.Session() is the documented spelling; requests.session()
    # is a deprecated alias.
    session = requests.Session()

    def __init__(self, indexes=None):
        """Initialize internal variables.

        @param indexes: optional iterable of strip indexes to restrict
            scraping to; None means "start from the latest strip".
        """
        # URLs that have already been downloaded in this run
        self.urls = set()
        self.indexes = indexes

    def getCurrentStrips(self):
        """Get current comic strip.

        Generator yielding one ComicStrip per requested index, or a single
        strip for the latest URL when no indexes are set.
        """
        msg = 'Retrieving the current strip'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        out.info(msg+"...")
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                yield self.getStrip(url)
        else:
            yield self.getStrip(self.getLatestUrl())

    def getStrip(self, url):
        """Get comic strip for given URL."""
        data, baseUrl = getPageContent(url, session=self.session)
        imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
        # More than one image is only expected when the scraper opted in via
        # multipleImagesPerStrip; otherwise warn, but still keep all images.
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
        return self.getComicStrip(url, imageUrls)

    def getComicStrip(self, url, imageUrls):
        """Get comic strip downloader for given URL and images."""
        return ComicStrip(self.get_name(), url, imageUrls, self.namer)

    def getAllStrips(self, maxstrips=None):
        """Get all comic strips.

        @param maxstrips: if a positive number, stop after retrieving that
            many strips per start URL; None means "no limit".
        """
        if maxstrips:
            msg = 'Retrieving %d strips' % maxstrips
        else:
            msg = 'Retrieving all strips'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        if self.adult:
            msg += " (including adult content)"
        out.info(msg)
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                for strip in self.getStripsFor(url, maxstrips):
                    yield strip
        else:
            url = self.getLatestUrl()
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for an URL. If maxstrips is a positive number, stop after
        retrieving the given number of strips."""
        seen_urls = set()
        while url:
            data, baseUrl = getPageContent(url, session=self.session)
            imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
            yield self.getComicStrip(url, imageUrls)
            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
            # give scraper subclasses a chance to rewrite the URL
            # (e.g. to undo domain redirects)
            prevUrl = self.prevUrlModifier(prevUrl)
            out.debug("Matched previous URL %s" % prevUrl)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn("Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break

    def setStrip(self, index):
        """Set current comic strip URL."""
        self.currentUrl = self.stripUrl % index

    @classmethod
    def get_name(cls):
        """Get scraper name.

        Uses the optional C{name} class attribute, falling back to the
        class name itself.
        """
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips."""
        return cls.url

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        The default implementation returns None, which lets the downloader
        pick a name.
        """
        return None

    @classmethod
    def prevUrlModifier(cls, prevUrl):
        """Optional modification of parsed previous URLs. Useful if
        there are domain redirects. The default implementation does
        not modify the URL.
        """
        return prevUrl

    def getFilename(self, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips."""
        return self.starter()
def get_scraper(comic):
    """Returns a comic module object."""
    if not comic:
        raise ValueError("empty comic name")
    needle = comic.lower()
    partial_matches = []
    for cls in get_scrapers():
        haystack = cls.get_name().lower()
        if haystack == needle:
            # perfect match
            return cls
        if needle in haystack:
            partial_matches.append(cls)
    # no exact match: a unique substring match wins, anything else is an error
    if len(partial_matches) == 1:
        return partial_matches[0]
    if partial_matches:
        comics = ", ".join(c.get_name() for c in partial_matches)
        raise ValueError('multiple comics found: %s' % comics)
    raise ValueError('comic %r not found' % comic)
# module-level cache for get_scrapers()
_scrapers = None


def get_scrapers():
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    """
    global _scrapers
    if _scrapers is not None:
        return _scrapers
    out.debug("Loading comic modules...")
    found = loader.get_plugins(loader.get_modules(), _BasicScraper)
    # sorted() is stable, matching the original list.sort() ordering
    _scrapers = sorted(found, key=lambda cls: cls.get_name())
    check_scrapers()
    out.debug("... %d modules loaded." % len(_scrapers))
    return _scrapers
def check_scrapers():
    """Check for duplicate scraper class names."""
    # map lowercased scraper name -> scraper class seen first
    seen = {}
    for cls in _scrapers:
        key = cls.get_name().lower()
        other = seen.get(key)
        if other is not None:
            raise ValueError('duplicate scrapers %s and %s found' % (cls.get_name(), other.get_name()))
        seen[key] = cls
def make_scraper(classname, **attributes):
    """Make a new scraper class with given name and attributes."""
    # build the class dynamically via the three-argument form of type()
    bases = (_BasicScraper,)
    return type(classname, bases, attributes)