dosage/dosagelib/scraper.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import os
from . import loader
from .util import fetchUrls
from .comic import ComicStrip

disabled = []
def init_disabled():
    """Load the names of disabled comics from ~/.dosage/disabled.

    Every non-blank line that is not a comment (first non-blank character
    is '#') is appended to the module-level C{disabled} list with the
    surrounding whitespace removed. Nothing happens when the file does
    not exist.
    """
    filename = os.path.expanduser('~/.dosage/disabled')
    if os.path.isfile(filename):
        with open(filename) as f:
            for line in f:
                # Lines read from a file keep their newline, so they are
                # never empty; strip first to detect blank lines reliably.
                name = line.strip()
                if name and not name.startswith('#'):
                    disabled.append(name)
init_disabled()

class DisabledComicError(ValueError):
    """ValueError subclass used to signal a disabled comic."""


class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    @type latestUrl: C{string}
    @cvar latestUrl: The URL for the latest comic strip.
    @type imageUrl: C{string}
    @cvar imageUrl: A string that is interpolated with the strip index
        to yield the URL for a particular strip.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
        when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''
    help = 'Sorry, no help for this comic yet.'

    def __init__(self, indices=None):
        """Initialize internal variables."""
        self.urls = set()
        self.indices = indices

    def getCurrentStrip(self):
        """Get current comic strip."""
        return self.getStrip(self.getLatestUrl())

    def getStrip(self, url):
        """Get comic strip for given URL."""
        imageUrls = fetchUrls(url, self.imageSearch)
        return self.getComicStrip(url, imageUrls)

    def getComicStrip(self, url, imageUrls):
        """Get comic strip downloader for given URL and images."""
        return ComicStrip(self.get_name(), url, imageUrls, self.namer)

    def getAllStrips(self):
        """Get all comic strips."""
        seen_urls = set()
        url = self.getLatestUrl()
        while url:
            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
            seen_urls.add(url)
            yield self.getComicStrip(url, imageUrls)
            # avoid recursive URL loops
            url = prevUrl if prevUrl not in seen_urls else None

    def setStrip(self, index):
        """Set current comic strip URL."""
        self.currentUrl = self.imageUrl % index

    def getHelp(self):
        """Return help text for this scraper."""
        return self.help

    @classmethod
    def get_name(cls):
        """Get scraper name."""
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips."""
        return cls.latestUrl

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return None

    def getFilename(self, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips."""
        return self.starter()


def get_scraper(comic):
    """Return the scraper class matching the given comic name.

    Names are compared case-insensitively. An exact match is returned
    immediately; otherwise the given name must be a substring of exactly
    one scraper name.

    @raise ValueError: if no scraper, or more than one scraper, matches
    """
    wanted = comic.lower()
    partial = []
    for klass in get_scrapers():
        current = klass.get_name().lower()
        if current == wanted:
            # exact name: no need to look any further
            return klass
        if wanted in current:
            partial.append(klass)
    if not partial:
        raise ValueError('Comic %r not found.' % comic)
    if len(partial) > 1:
        names = ", ".join(x.get_name() for x in partial)
        raise ValueError('Multiple comics %s found.' % names)
    return partial[0]


_scrapers = None
def get_scrapers():
    """Find all comic scraper classes in the plugins directory.

    The sorted list is built on the first call and cached in the
    module-level C{_scrapers} variable. The cache is only kept when the
    list could be sorted and passed check_scrapers(); after a failure the
    next call rebuilds and re-validates the list instead of silently
    returning an unchecked one.

    @return: list of _BasicScraper classes, sorted by scraper name
    @rtype: list of _BasicScraper
    @raise ValueError: if check_scrapers() finds duplicate scraper names
    """
    global _scrapers
    if _scrapers is None:
        folder = os.path.join(os.path.dirname(__file__), 'plugins')
        importprefix = 'dosagelib.plugins.'
        modules = loader.get_modules(folder, importprefix)
        plugins = loader.get_plugins(modules, _BasicScraper)
        # Sort into a local first so a failing get_name() cannot leave an
        # unsorted list behind in the cache.
        found = sorted(plugins, key=lambda s: s.get_name())
        # check_scrapers() reads the module-level list, so it has to be
        # published before validation and withdrawn again on failure.
        _scrapers = found
        try:
            check_scrapers()
        except Exception:
            _scrapers = None
            raise
    return _scrapers


def check_scrapers():
    """Ensure no two cached scraper classes share a name.

    Names are compared case-insensitively over the module-level
    C{_scrapers} list.

    @raise ValueError: on the first pair of scrapers with the same name
    """
    seen = {}
    for klass in _scrapers:
        key = klass.get_name().lower()
        if key not in seen:
            seen[key] = klass
            continue
        current_name = klass.get_name()
        earlier_name = seen[key].get_name()
        raise ValueError('Duplicate scrapers %s and %s found' % (current_name, earlier_name))
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
			`# Copyright (C) 2012 Bastian Kleineidam`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`import os`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`from . import loader`
			`from .util import fetchUrls`
			`from .comic import ComicStrip`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
			`disabled = []`
			`def init_disabled():`
			`filename = os.path.expanduser('~/.dosage/disabled')`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`if os.path.isfile(filename):`
			`with open(filename) as f:`
			`for line in f:`
			`if line and not line.startswith('#'):`
			`disabled.append(line.rstrip())`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`init_disabled()`

			`class DisabledComicError(ValueError):`
			`pass`


A lot of refactoring. 2012-10-11 10:03:12 +00:00			`class _BasicScraper(object):`
			`'''Base class with scrape functions for comics.`

			`@type latestUrl: C{string}`
			`@cvar latestUrl: The URL for the latest comic strip.`
			`@type imageUrl: C{string}`
			`@cvar imageUrl: A string that is interpolated with the strip index`
			`to yield the URL for a particular strip.`
			`@type imageSearch: C{regex}`
			`@cvar imageSearch: A compiled regex that will locate the strip image URL`
			`when applied to the strip page.`
			`@type prevSearch: C{regex}`
			`@cvar prevSearch: A compiled regex that will locate the URL for the`
			`previous strip when applied to a strip page.`
			`'''`
			`help = 'Sorry, no help for this comic yet.'`

			`def __init__(self, indices=None):`
			`"""Initialize internal variables."""`
			`self.urls = set()`
			`self.indices = indices`

			`def getCurrentStrip(self):`
			`"""Get current comic strip."""`
			`return self.getStrip(self.getLatestUrl())`

			`def getStrip(self, url):`
			`"""Get comic strip for given URL."""`
			`imageUrls = fetchUrls(url, self.imageSearch)`
			`return self.getComicStrip(url, imageUrls)`

			`def getComicStrip(self, url, imageUrls):`
			`"""Get comic strip downloader for given URL and images."""`
			`return ComicStrip(self.get_name(), url, imageUrls, self.namer)`

			`def getAllStrips(self):`
			`"""Get all comic strips."""`
			`seen_urls = set()`
			`url = self.getLatestUrl()`
			`while url:`
			`imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)`
			`seen_urls.add(url)`
			`yield self.getComicStrip(url, imageUrls)`
			`# avoid recursive URL loops`
			`url = prevUrl if prevUrl not in seen_urls else None`

			`def setStrip(self, index):`
			`"""Set current comic strip URL."""`
			`self.currentUrl = self.imageUrl % index`

			`def getHelp(self):`
			`"""Return help text for this scraper."""`
			`return self.help`

			`@classmethod`
			`def get_name(cls):`
			`"""Get scraper name."""`
			`if hasattr(cls, 'name'):`
			`return cls.name`
			`return cls.__name__`

			`@classmethod`
			`def starter(cls):`
			`"""Get starter URL from where to scrape comic strips."""`
			`return cls.latestUrl`

			`@classmethod`
			`def namer(cls, imageUrl, pageUrl):`
			`"""Return filename for given image and page URL."""`
			`return None`

			`def getFilename(self, imageUrl, pageUrl):`
			`"""Return filename for given image and page URL."""`
			`return self.namer(imageUrl, pageUrl)`

			`def getLatestUrl(self):`
			`"""Get starter URL from where to scrape comic strips."""`
			`return self.starter()`


			`def get_scraper(comic):`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`"""Returns a comic module object."""`
			`candidates = []`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`cname = comic.lower()`
			`for scraperclass in get_scrapers():`
			`lname = scraperclass.get_name().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if lname == cname:`
			`# perfect match`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`return scraperclass`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if cname in lname:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`candidates.append(scraperclass)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if len(candidates) == 1:`
			`return candidates[0]`
			`elif candidates:`
			`comics = ", ".join(x.get_name() for x in candidates)`
			`raise ValueError('Multiple comics %s found.' % comics)`
			`else:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`raise ValueError('Comic %r not found.' % comic)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

			`_scrapers = None`
			`def get_scrapers():`
			`"""Find all comic scraper classes in the plugins directory.`
			`The result is cached.`
			`@return: list of _BasicScraper classes`
			`@rtype: list of _BasicScraper`
			`"""`
			`global _scrapers`
			`if _scrapers is None:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`folder = os.path.join(os.path.dirname(__file__), 'plugins')`
			`importprefix = 'dosagelib.plugins.'`
			`modules = loader.get_modules(folder, importprefix)`
			`plugins = loader.get_plugins(modules, _BasicScraper)`
			`_scrapers = list(plugins)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`_scrapers.sort(key=lambda s: s.get_name())`
			`check_scrapers()`
			`return _scrapers`


			`def check_scrapers():`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Check for duplicate scraper class names."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`d = {}`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`for scraperclass in _scrapers:`
			`name = scraperclass.get_name().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if name in d:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`name1 = scraperclass.get_name()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`name2 = d[name].get_name()`
			`raise ValueError('Duplicate scrapers %s and %s found' % (name1, name2))`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`d[name] = scraperclass`