dosage/dosagelib/scraper.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import os
from . import loader
from .util import fetchUrls
from .comic import ComicStrip
from .output import out

disabled = []

def init_disabled():
    """Read the list of disabled comics from ~/.dosage/disabled."""
    filename = os.path.expanduser('~/.dosage/disabled')
    if os.path.isfile(filename):
        with open(filename) as f:
            for line in f:
                if line and not line.startswith('#'):
                    disabled.append(line.rstrip())

init_disabled()
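
# A hypothetical example of the ~/.dosage/disabled file: one entry
# (presumably a scraper name) per line, with '#' lines treated as
# comments and trailing whitespace stripped.
#
#   # comics I never want to download
#   SomeComic
#   AnotherComic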

class DisabledComicError(ValueError):
    pass


class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    @type latestUrl: C{string}
    @cvar latestUrl: The URL for the latest comic strip.
    @type imageUrl: C{string}
    @cvar imageUrl: A string that is interpolated with the strip index
        to yield the URL for a particular strip.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
        when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''
    help = 'Sorry, no help for this comic yet.'

    def __init__(self, indexes=None):
        """Initialize internal variables."""
        self.urls = set()
        self.indexes = indexes

    def getCurrentStrips(self):
        """Get the current comic strip, or the strips for the given indexes."""
        msg = 'Retrieving the current strip'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        out.write(msg + "...")
        if self.indexes:
            for index in self.indexes:
                url = self.imageUrl % index
                yield self.getStrip(url)
        else:
            yield self.getStrip(self.getLatestUrl())

    def getStrip(self, url):
        """Get comic strip for given URL."""
        imageUrls = fetchUrls(url, self.imageSearch)
        return self.getComicStrip(url, imageUrls)

    def getComicStrip(self, url, imageUrls):
        """Get comic strip downloader for given URL and images."""
        return ComicStrip(self.get_name(), url, imageUrls, self.namer)

    def getAllStrips(self):
        """Get all comic strips."""
        msg = 'Retrieving all strips'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        out.write(msg + "...")
        if self.indexes:
            for index in self.indexes:
                url = self.imageUrl % index
                for strip in self.getAllStripsFor(url):
                    yield strip
        else:
            url = self.getLatestUrl()
            for strip in self.getAllStripsFor(url):
                yield strip

    def getAllStripsFor(self, url):
        """Get all comic strips for a URL, following previous-strip links."""
        seen_urls = set()
        while url:
            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
            seen_urls.add(url)
            yield self.getComicStrip(url, imageUrls)
            # avoid recursive URL loops
            url = prevUrl if prevUrl not in seen_urls else None

    def setStrip(self, index):
        """Set current comic strip URL."""
        self.currentUrl = self.imageUrl % index

    def getHelp(self):
        """Return help text for this scraper."""
        return self.help

    @classmethod
    def get_name(cls):
        """Get scraper name."""
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips."""
        return cls.latestUrl

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return None

    def getFilename(self, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips."""
        return self.starter()
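
# A minimal subclass sketch (hypothetical comic, URLs and regexes; real
# scrapers are defined as plugins under dosagelib/plugins/) showing the
# class attributes described in the _BasicScraper docstring:
#
#   class ExampleComic(_BasicScraper):
#       latestUrl = 'http://www.example.com/comics/'
#       imageUrl = 'http://www.example.com/comics/%s'
#       imageSearch = re.compile(r'<img src="(/strips/[^"]+\.png)"')
#       prevSearch = re.compile(r'<a href="([^"]+)">Previous</a>')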

def get_scraper(comic):
    """Return the comic scraper class matching the given comic name."""
    candidates = []
    cname = comic.lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname:
            # perfect match
            return scraperclass
        if cname in lname:
            candidates.append(scraperclass)
    if len(candidates) == 1:
        return candidates[0]
    elif candidates:
        comics = ", ".join(x.get_name() for x in candidates)
        raise ValueError('Multiple comics %s found.' % comics)
    else:
        raise ValueError('Comic %r not found.' % comic)
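
# Usage sketch (the comic name 'somecomic' is hypothetical): an exact,
# case-insensitive name match is returned directly; otherwise a unique
# substring match wins, and ambiguous or unknown names raise ValueError.
#
#   scraperclass = get_scraper('somecomic')
#   for strip in scraperclass().getAllStrips():
#       ...  # each strip is a ComicStrip ready for downloading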

_scrapers = None

def get_scrapers():
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    """
    global _scrapers
    if _scrapers is None:
        folder = os.path.join(os.path.dirname(__file__), 'plugins')
        importprefix = 'dosagelib.plugins.'
        modules = loader.get_modules(folder, importprefix)
        plugins = loader.get_plugins(modules, _BasicScraper)
        _scrapers = list(plugins)
        _scrapers.sort(key=lambda s: s.get_name())
        check_scrapers()
    return _scrapers

def check_scrapers():
    """Check for duplicate scraper class names."""
    d = {}
    for scraperclass in _scrapers:
        name = scraperclass.get_name().lower()
        if name in d:
            name1 = scraperclass.get_name()
            name2 = d[name].get_name()
            raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))
        d[name] = scraperclass