dosage/dosagelib/helpers.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import re

from .util import fetchUrl, getQueryParams
from .scraper import _BasicScraper

def queryNamer(paramName, usePageUrl=False):
    """Build a namer that takes the name from a URL query parameter.

    @param paramName: key looked up in the result of getQueryParams().
    @param usePageUrl: index selecting the URL to inspect; False (the
        default) selects the image URL, True selects the page URL.
    @return: a staticmethod-wrapped function taking (imageUrl, pageUrl)
        and returning the first value stored under paramName.
    """
    @staticmethod
    def _namer(imageUrl, pageUrl):
        # usePageUrl is used directly as an index into the pair:
        # False/0 picks the image URL, True/1 picks the page URL.
        candidates = (imageUrl, pageUrl)
        params = getQueryParams(candidates[usePageUrl])
        values = params[paramName]
        return values[0]
    return _namer


def regexNamer(regex):
    """Build a namer that extracts the name with a regular expression.

    @param regex: compiled pattern; its first group is used as the name.
    @return: a staticmethod-wrapped function taking (imageUrl, pageUrl).
        It searches the image URL only and returns group 1 of the match,
        or None when the pattern does not match.
    """
    def _namer(imageUrl, pageUrl):
        # pageUrl is accepted for the namer signature but is not used.
        match = regex.search(imageUrl)
        if not match:
            return None
        return match.group(1)
    return staticmethod(_namer)


def bounceStarter(latestUrl, nextSearch):
    """Get start URL by "bouncing" back and forth one time.

    The returned starter first looks up the 'previous' link on the latest
    page (using the class attribute prevSearch), then looks up the 'next'
    link on that previous page (using nextSearch).

    @param latestUrl: URL of the page the bounce starts from.
    @param nextSearch: compiled regex for finding the 'next' URL.
    @return: a classmethod taking no arguments that returns the URL found
        by the second lookup.
    @raise ValueError: (from the starter) when either lookup finds nothing.
    """
    @classmethod
    def _starter(cls):
        prevUrl = fetchUrl(latestUrl, cls.prevSearch)
        if not prevUrl:
            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, latestUrl))
        url = fetchUrl(prevUrl, nextSearch)
        if not url:
            # The second lookup ran against the previous page, so report
            # that URL rather than latestUrl.
            raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, prevUrl))
        return url
    return _starter


def indirectStarter(baseUrl, latestSearch):
    """Get start URL by indirection.

    @param baseUrl: URL of the page that is searched.
    @param latestSearch: compiled regex for finding the 'latest' URL.
    @return: a staticmethod taking no arguments that returns the URL
        found on the base page.
    @raise ValueError: (from the starter) when the lookup finds nothing.
    """
    def _starter():
        found = fetchUrl(baseUrl, latestSearch)
        if found:
            return found
        raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
    return staticmethod(_starter)


class IndirectLatestMixin(object):
    '''
    Mixin for comics whose latest strip is reached through a link on some
    base page. Comics whose base page links to the start of the latest
    chapter (or a similar scheme) instead of the very last strip are
    supported as well: when a 'nextSearch' regex is present, 'next' links
    are followed until none can be found.

    @type baseUrl: C{string}
    @cvar baseUrl: the URL where the link to the latest comic is found.
    @type latestSearch: C{regex}
    @cvar latestSearch: a compiled regex for finding the 'latest' URL.
    @type nextSearch: C{regex}
    @cvar nextSearch: a compiled regex for finding the 'next' URL.
    '''

    # Cached result of the lookup; a false value means "not resolved yet".
    __latestUrl = None

    def getLatestUrl(self):
        """Return the latest comic URL, resolving and caching it on first use."""
        if self.__latestUrl:
            return self.__latestUrl
        self.__latestUrl = fetchUrl(self.baseUrl, self.latestSearch)
        if hasattr(self, "nextSearch"):
            # Walk forward one page at a time; the cache is updated on
            # every step and the walk stops when no 'next' link is found.
            while True:
                following = fetchUrl(self.__latestUrl, self.nextSearch)
                if not following:
                    break
                self.__latestUrl = following
        return self.__latestUrl

    latestUrl = property(getLatestUrl)


class _PHPScraper(_BasicScraper):
    """
    Scraper for comics using phpComic/CUSP.

    This provides an easy way to define scrapers for webcomics using phpComic.
    The class reads C{basePath} and C{latestUrl}, neither of which is defined
    in this class; presumably concrete subclasses supply them.
    """

    @property
    def imageUrl(self):
        """Strip page URL template: basePath plus the daily.php date query."""
        return self.basePath + 'daily.php?date=%s'

    @property
    def imageSearch(self):
        """Compiled regex capturing the comic image URL below basePath."""
        # basePath is literal URL text, so escape it before embedding it
        # in the pattern; otherwise '.' would match any character and
        # characters such as '?' or '+' would change the pattern's meaning.
        return re.compile(r'<img alt=[^>]+ src="(%scomics/\d{6}\..+?)">' % (re.escape(self.basePath),))

    help = 'Index format: yymmdd'

    @classmethod
    def starter(cls):
        """Get starter URL."""
        return cls.basePath + cls.latestUrl
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
			`# Copyright (C) 2012 Bastian Kleineidam`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`import re`

A lot of refactoring. 2012-10-11 10:03:12 +00:00			`from .util import fetchUrl, getQueryParams`
			`from .scraper import _BasicScraper`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
			`def queryNamer(paramName, usePageUrl=False):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get name from URL query part."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`@staticmethod`
			`def _namer(imageUrl, pageUrl):`
			`url = (imageUrl, pageUrl)[usePageUrl]`
			`return getQueryParams(url)[paramName][0]`
			`return _namer`


			`def regexNamer(regex):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get name from regular expression."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`@staticmethod`
			`def _namer(imageUrl, pageUrl):`
Fix some comics. 2012-11-13 18:12:28 +00:00			`mo = regex.search(imageUrl)`
			`if mo:`
			`return mo.group(1)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return _namer`


			`def bounceStarter(latestUrl, nextSearch):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get start URL by "bouncing" back and forth one time."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`@classmethod`
			`def _starter(cls):`
			`url = fetchUrl(latestUrl, cls.prevSearch)`
Fix some comics. 2012-11-21 20:57:26 +00:00			`if not url:`
			`raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, latestUrl))`
			`url = fetchUrl(url, nextSearch)`
			`if not url:`
			`raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, latestUrl))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return url`
			`return _starter`


			`def indirectStarter(baseUrl, latestSearch):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get start URL by indirection."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`@staticmethod`
			`def _starter():`
Fix some comics. 2012-11-21 20:57:26 +00:00			`url = fetchUrl(baseUrl, latestSearch)`
			`if not url:`
			`raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))`
			`return url`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return _starter`


			`class IndirectLatestMixin(object):`
			`'''`
			`Mixin for comics that link to the latest comic from a base page of`
			`some kind. This also supports comics which don't link to the last comic`
			`from the base page, but the beginning of the latest chapter or similar`
			`schemes. It simulates going forward until it can't find a 'next' link as`
			`specified by the 'nextSearch' regex.`

			`@type baseUrl: C{string}`
			`@cvar baseUrl: the URL where the link to the latest comic is found.`
			`@type latestSearch C{regex}`
			`@cvar latestSearch: a compiled regex for finding the 'latest' URL.`
			`@type nextSearch C{regex}`
			`@cvar nextSearch: a compiled regex for finding the 'next' URL.`
			`'''`

			`__latestUrl = None`

			`def getLatestUrl(self):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get latest comic URL."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if not self.__latestUrl:`
			`self.__latestUrl = fetchUrl(self.baseUrl, self.latestSearch)`
			`if hasattr(self, "nextSearch"):`
			`nextUrl = fetchUrl(self.__latestUrl, self.nextSearch)`
			`while nextUrl:`
			`self.__latestUrl = nextUrl`
			`nextUrl = fetchUrl(self.__latestUrl, self.nextSearch)`
			`return self.__latestUrl`

			`latestUrl = property(getLatestUrl)`


			`class _PHPScraper(_BasicScraper):`
			`"""`
Document some functions. 2012-09-26 14:47:39 +00:00			`Scraper for comics using phpComic/CUSP.`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
			`This provides an easy way to define scrapers for webcomics using phpComic.`
			`"""`
			`imageUrl = property(lambda self: self.basePath + 'daily.php?date=%s')`
			`imageSearch = property(lambda self: re.compile(r'<img alt=[^>]+ src="(%scomics/\d{6}\..+?)">' % (self.basePath,)))`

			`help = 'Index format: yymmdd'`

			`@classmethod`
			`def starter(cls):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Get starter URL."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return cls.basePath + cls.latestUrl`