dosage/dosagelib/scraper.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
import requests
from . import loader
from .util import fetchUrls
from .comic import ComicStrip
from .output import out


class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    The class variables documented below are read by the methods of this
    class but are not given a default here; concrete scrapers supply them.

    @type latestUrl: C{string}
    @cvar latestUrl: The URL for the latest comic strip.
    @type stripUrl: C{string}
    @cvar stripUrl: A string that is interpolated with the strip index
        to yield the URL for a particular strip.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
        when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
        previous strip when applied to a strip page.
    '''

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to False if previous URLs do not match the strip URL (ie. because of redirects)
    prevUrlMatchesStripUrl = True

    # set to True if this comic contains adult content
    adult = False

    # a description of the comic contents
    description = ''

    # usually the index format help
    help = ''

    # HTTP session storing cookies; it is a class attribute, so every
    # scraper that does not override it uses the same session object
    session = requests.session()

    def __init__(self, indexes=None):
        """Initialize internal variables.

        @param indexes: optional iterable of strip indexes; each one is
            interpolated into C{stripUrl} to get a start URL. If empty or
            None, retrieval starts at the latest strip.
        """
        self.urls = set()
        self.indexes = indexes

    def getCurrentStrips(self):
        """Get current comic strip.

        This is a generator: it yields one strip per configured index, or
        a single strip for the latest URL if no indexes are configured.
        """
        msg = 'Retrieving the current strip'
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        out.info(msg+"...")
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                yield self.getStrip(url)
        else:
            yield self.getStrip(self.getLatestUrl())

    def getStrip(self, url):
        """Get comic strip for given URL.

        @param url: the strip page URL to search for images
        @return: the object built by getComicStrip() for the found images
        """
        # with only the image regex given, the first element of the
        # fetchUrls() result holds the image URLs
        imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
        return self.getComicStrip(url, imageUrls)

    def getComicStrip(self, url, imageUrls):
        """Get comic strip downloader for given URL and images.

        @param url: the strip page URL
        @param imageUrls: the image URLs found on that page
        @return: a ComicStrip built from the scraper name, the URLs and
            the namer() function of this scraper
        """
        return ComicStrip(self.get_name(), url, imageUrls, self.namer)

    def getAllStrips(self, maxstrips=None):
        """Get all comic strips.

        This is a generator. Starting at each configured index (or at the
        latest strip if there are no indexes) it follows the previous-strip
        links and yields every strip found.

        @param maxstrips: if a positive number, retrieve at most that many
            strips per start URL; None, zero or a negative number mean
            no limit
        """
        # only a positive count is a limit, so that the logged message
        # agrees with what getStripsFor() actually does
        if maxstrips is not None and maxstrips > 0:
            msg = 'Retrieving %d strips' % maxstrips
        else:
            msg = 'Retrieving all strips'
            maxstrips = None
        if self.indexes:
            msg += " for indexes %s" % self.indexes
        if self.adult:
            msg += " (including adult content)"
        out.info(msg)
        if self.indexes:
            for index in self.indexes:
                url = self.stripUrl % index
                # the limit applies to each index separately
                for strip in self.getStripsFor(url, maxstrips):
                    yield strip
        else:
            url = self.getLatestUrl()
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for an URL. If maxstrips is a positive number, stop after
        retrieving the given number of strips.

        This is a generator. It walks backwards through the comic by
        following the previous-strip URL of each page until there is none,
        an already visited URL shows up, or the limit is reached.

        @param url: the strip page URL to start from
        @param maxstrips: maximum number of strips to yield; None, zero
            or a negative number mean no limit
        """
        # A non-positive count is not a limit. Without this normalization
        # a count of zero would stop after the first strip, contradicting
        # the documented behaviour.
        if maxstrips is not None and maxstrips <= 0:
            maxstrips = None
        seen_urls = set()
        while url:
            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
              self.prevSearch, session=self.session)
            prevUrl = self.prevUrlModifier(prevUrl)
            out.debug("Matched previous URL %s" % prevUrl)
            seen_urls.add(url)
            yield self.getComicStrip(url, imageUrls)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn("Already seen previous URL %r" % prevUrl)
                break
            # an empty previous URL ends the while loop
            url = prevUrl
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break

    def setStrip(self, index):
        """Set current comic strip URL.

        Stores the strip URL for the given index in C{self.currentUrl};
        no other method of this class reads that attribute.

        @param index: the strip index to interpolate into C{stripUrl}
        """
        self.currentUrl = self.stripUrl % index

    @classmethod
    def get_name(cls):
        """Get scraper name.

        @return: the C{name} class attribute if there is one, else the
            class name
        """
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips.

        @return: the C{latestUrl} class attribute
        """
        return cls.latestUrl

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        The default implementation always returns None.
        """
        return None

    @classmethod
    def prevUrlModifier(cls, prevUrl):
        """Optional modification of parsed previous URLs. Useful if
        there are domain redirects. The default implementation does
        not modify the URL.
        """
        return prevUrl

    def getFilename(self, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        Delegates to namer().
        """
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips.

        Delegates to starter().
        """
        return self.starter()


def get_scraper(comic):
    """Look up the scraper class for a comic name.

    Names are compared case-insensitively. A scraper whose name equals
    the given name is returned right away; otherwise the given name is
    treated as a substring and has to match exactly one scraper name.

    @param comic: full or partial comic name
    @return: the matching scraper class
    @raise ValueError: if the name is empty, matches no scraper, or is
        a substring of more than one scraper name
    """
    if not comic:
        raise ValueError("empty comic name")
    wanted = comic.lower()
    partial = []
    for klass in get_scrapers():
        known = klass.get_name().lower()
        if known == wanted:
            # exact match wins over any partial matches
            return klass
        if wanted in known:
            partial.append(klass)
    if not partial:
        raise ValueError('comic %r not found' % comic)
    if len(partial) > 1:
        names = ", ".join(klass.get_name() for klass in partial)
        raise ValueError('multiple comics found: %s' % names)
    return partial[0]


_scrapers = None
def get_scrapers():
    """Return the list of all comic scraper classes found by the loader.

    The list is built on the first call, sorted by scraper name, checked
    for duplicate names and then cached in the module-level C{_scrapers}
    variable; later calls return the cached list.

    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    """
    global _scrapers
    if _scrapers is not None:
        return _scrapers
    out.debug("Loading comic modules...")
    found = loader.get_plugins(loader.get_modules(), _BasicScraper)
    # check_scrapers() reads the module global, so assign it before the check
    _scrapers = list(found)
    _scrapers.sort(key=lambda klass: klass.get_name())
    check_scrapers()
    out.debug("... %d modules loaded." % len(_scrapers))
    return _scrapers


def check_scrapers():
    """Ensure that no two loaded scrapers share a name, ignoring case.

    Reads the module-level C{_scrapers} list.

    @raise ValueError: for the first scraper whose lowercased name equals
        that of a scraper seen earlier in the list
    """
    by_name = {}
    for klass in _scrapers:
        key = klass.get_name().lower()
        if key not in by_name:
            by_name[key] = klass
            continue
        current = klass.get_name()
        earlier = by_name[key].get_name()
        raise ValueError('duplicate scrapers %s and %s found' % (current, earlier))


def make_scraper(classname, **attributes):
    """Create a new scraper class at runtime.

    @param classname: name of the new class
    @param attributes: keyword arguments that become the class attributes
    @return: a new class derived from _BasicScraper
    """
    bases = (_BasicScraper,)
    return type(classname, bases, attributes)
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2013-01-28 17:52:26 +00:00			`# Copyright (C) 2012-2013 Bastian Kleineidam`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`import requests`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`from . import loader`
			`from .util import fetchUrls`
			`from .comic import ComicStrip`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`from .output import out`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

A lot of refactoring. 2012-10-11 10:03:12 +00:00			`class _BasicScraper(object):`
			`'''Base class with scrape functions for comics.`

			`@type latestUrl: C{string}`
			`@cvar latestUrl: The URL for the latest comic strip.`
Rename imageUrl to stripUrl. 2012-11-13 18:10:19 +00:00			`@type stripUrl: C{string}`
			`@cvar stripUrl: A string that is interpolated with the strip index`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`to yield the URL for a particular strip.`
			`@type imageSearch: C{regex}`
			`@cvar imageSearch: A compiled regex that will locate the strip image URL`
			`when applied to the strip page.`
			`@type prevSearch: C{regex}`
			`@cvar prevSearch: A compiled regex that will locate the URL for the`
			`previous strip when applied to a strip page.`
			`'''`
Fix more comics. 2012-12-05 20:52:52 +00:00
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`# if more than one image per URL is expected`
			`multipleImagesPerStrip = False`
Fix more comics. 2012-12-05 20:52:52 +00:00
			`# set to False if previous URLs do not match the strip URL (ie. because of redirects)`
			`prevUrlMatchesStripUrl = True`

Add cookie feature. 2012-12-08 20:29:57 +00:00			`# set to True if this comic contains adult content`
			`adult = False`

Various fixes and additions. 2012-12-12 16:41:29 +00:00			`# a description of the comic contents`
			`description = ''`

Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`# usually the index format help`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`help = ''`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`# HTTP session storing cookies`
			`session = requests.session()`
Fix more comics. 2012-12-05 20:52:52 +00:00
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`def __init__(self, indexes=None):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Initialize internal variables."""`
			`self.urls = set()`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`self.indexes = indexes`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`def getCurrentStrips(self):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Get current comic strip."""`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`msg = 'Retrieving the current strip'`
			`if self.indexes:`
			`msg += " for indexes %s" % self.indexes`
Fix more comics. 2012-12-07 23:45:18 +00:00			`out.info(msg+"...")`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`if self.indexes:`
			`for index in self.indexes:`
Rename imageUrl to stripUrl. 2012-11-13 18:10:19 +00:00			`url = self.stripUrl % index`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`yield self.getStrip(url)`
			`else:`
			`yield self.getStrip(self.getLatestUrl())`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`def getStrip(self, url):`
			`"""Get comic strip for given URL."""`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`if len(imageUrls) > 1 and not self.multipleImagesPerStrip:`
Fix more comics. 2012-12-07 23:45:18 +00:00			`out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`return self.getComicStrip(url, imageUrls)`

			`def getComicStrip(self, url, imageUrls):`
			`"""Get comic strip downloader for given URL and images."""`
			`return ComicStrip(self.get_name(), url, imageUrls, self.namer)`

Fix more comics. 2012-12-07 23:45:18 +00:00			`def getAllStrips(self, maxstrips=None):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Get all comic strips."""`
Fix more comics. 2012-12-07 23:45:18 +00:00			`if maxstrips:`
			`msg = 'Retrieving %d strips' % maxstrips`
			`else:`
			`msg = 'Retrieving all strips'`
Improve comic strip message. 2013-01-29 17:51:35 +00:00			`if self.indexes:`
			`msg += " for indexes %s" % self.indexes`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`if self.adult:`
Improve comic strip message. 2013-01-29 17:51:35 +00:00			`msg += " (including adult content)"`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`out.info(msg)`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`if self.indexes:`
			`for index in self.indexes:`
Rename imageUrl to stripUrl. 2012-11-13 18:10:19 +00:00			`url = self.stripUrl % index`
Retrieve more than one strip in index mode. 2013-01-23 19:21:52 +00:00			`for strip in self.getStripsFor(url, maxstrips):`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`yield strip`
			`else:`
			`url = self.getLatestUrl()`
Fix more comics. 2012-12-07 23:45:18 +00:00			`for strip in self.getStripsFor(url, maxstrips):`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`yield strip`

Fix more comics. 2012-12-07 23:45:18 +00:00			`def getStripsFor(self, url, maxstrips):`
			`"""Get comic strips for an URL. If maxstrips is a positive number, stop after`
			`retrieving the given number of strips."""`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`seen_urls = set()`
			`while url:`
Add cookie feature. 2012-12-08 20:29:57 +00:00			`imageUrls, prevUrl = fetchUrls(url, self.imageSearch,`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`self.prevSearch, session=self.session)`
Fix some comics. 2012-12-02 17:35:06 +00:00			`prevUrl = self.prevUrlModifier(prevUrl)`
Fix more comics. 2012-12-07 23:45:18 +00:00			`out.debug("Matched previous URL %s" % prevUrl)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`seen_urls.add(url)`
			`yield self.getComicStrip(url, imageUrls)`
Add cookie feature. 2012-12-08 20:29:57 +00:00			`if prevUrl in seen_urls:`
			`# avoid recursive URL loops`
			`out.warn("Already seen previous URL %r" % prevUrl)`
			`break`
			`url = prevUrl`
Fix more comics. 2012-12-07 23:45:18 +00:00			`if maxstrips is not None:`
			`maxstrips -= 1`
			`if maxstrips <= 0:`
			`break`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`def setStrip(self, index):`
			`"""Set current comic strip URL."""`
Rename imageUrl to stripUrl. 2012-11-13 18:10:19 +00:00			`self.currentUrl = self.stripUrl % index`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`@classmethod`
			`def get_name(cls):`
			`"""Get scraper name."""`
			`if hasattr(cls, 'name'):`
			`return cls.name`
			`return cls.__name__`

			`@classmethod`
			`def starter(cls):`
			`"""Get starter URL from where to scrape comic strips."""`
			`return cls.latestUrl`

			`@classmethod`
			`def namer(cls, imageUrl, pageUrl):`
			`"""Return filename for given image and page URL."""`
			`return None`

Fix some comics. 2012-12-02 17:35:06 +00:00			`@classmethod`
			`def prevUrlModifier(cls, prevUrl):`
			`"""Optional modification of parsed previous URLs. Useful if`
			`there are domain redirects. The default implementation does`
			`not modify the URL.`
			`"""`
			`return prevUrl`

A lot of refactoring. 2012-10-11 10:03:12 +00:00			`def getFilename(self, imageUrl, pageUrl):`
			`"""Return filename for given image and page URL."""`
			`return self.namer(imageUrl, pageUrl)`

			`def getLatestUrl(self):`
			`"""Get starter URL from where to scrape comic strips."""`
			`return self.starter()`


			`def get_scraper(comic):`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`"""Returns a comic module object."""`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`if not comic:`
			`raise ValueError("empty comic name")`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`candidates = []`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`cname = comic.lower()`
			`for scraperclass in get_scrapers():`
			`lname = scraperclass.get_name().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if lname == cname:`
			`# perfect match`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`return scraperclass`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if cname in lname:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`candidates.append(scraperclass)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if len(candidates) == 1:`
			`return candidates[0]`
			`elif candidates:`
			`comics = ", ".join(x.get_name() for x in candidates)`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('multiple comics found: %s' % comics)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`else:`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('comic %r not found' % comic)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

			`_scrapers = None`
			`def get_scrapers():`
			`"""Find all comic scraper classes in the plugins directory.`
			`The result is cached.`
			`@return: list of _BasicScraper classes`
			`@rtype: list of _BasicScraper`
			`"""`
			`global _scrapers`
			`if _scrapers is None:`
Fix more comics. 2012-12-07 23:45:18 +00:00			`out.debug("Loading comic modules...")`
Require python 2.7, use importlib. 2012-11-19 20:20:50 +00:00			`modules = loader.get_modules()`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`plugins = loader.get_plugins(modules, _BasicScraper)`
			`_scrapers = list(plugins)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`_scrapers.sort(key=lambda s: s.get_name())`
			`check_scrapers()`
Fix more comics. 2012-12-07 23:45:18 +00:00			`out.debug("... %d modules loaded." % len(_scrapers))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return _scrapers`


			`def check_scrapers():`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Check for duplicate scraper class names."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`d = {}`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`for scraperclass in _scrapers:`
			`name = scraperclass.get_name().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if name in d:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`name1 = scraperclass.get_name()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`name2 = d[name].get_name()`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`d[name] = scraperclass`
Dynamic type generation helpers. 2012-11-26 06:14:02 +00:00

			`def make_scraper(classname, **attributes):`
			`"""Make a new scraper class with given name and attributes."""`
			`return type(classname, (_BasicScraper,), attributes)`