dosage/dosagelib/scraper.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
import requests
import time
import os
from . import loader, configuration
from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence,
  get_system_uid, urlopen, getDirname)
from .comic import ComicStrip
from .output import out
from .events import getHandler


class Genre:
    """Display labels for the genres a comic strip can be tagged with.

    Each attribute holds the unicode label of one genre; scrapers refer
    to them by attribute (e.g. Genre.humor) instead of repeating the text.
    """
    adventure = u'Adventure'
    crazy = u'Crazy'
    drama = u'Drama'
    fantasy = u'Fantasy'
    gaming = u'Gaming'
    humor = u'Humor'
    reallife = u'Real life'
    scifi = u'Sci-fi'
    other = u'Other'


class _BasicScraper(object):
    '''Base class with scrape functions for comics.

    A scraper is configured by overriding the class attributes below and,
    where needed, the hook methods (shouldSkipUrl, namer, prevUrlModifier,
    imageUrlModifier, getLatestUrl).
    '''

    # The URL for the comic strip
    url = None

    # A string that is interpolated with the strip index to yield the URL for a particular strip.
    stripUrl = None

    # Stop search for previous URLs at this URL
    firstStripUrl = None

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to False if previous URLs do not match the strip URL (ie. because of redirects)
    prevUrlMatchesStripUrl = True

    # set to True if this comic contains adult content
    adult = False

    # set to True if this comic will not get updated anymore
    endOfLife = False

    # a description of the comic contents
    description = u''

    # language of the comic (two-letter ISO 639-1 code)
    lang = 'en'

    # list of genres for this comic strip
    genres = (Genre.other,)

    # compiled regular expression that will locate the URL for the previous strip in a page
    # this can also be a list or tuple of compiled regular expressions
    prevSearch = None

    # compiled regular expression that will locate the strip image URLs in a page
    # this can also be a list or tuple of compiled regular expressions
    imageSearch = None

    # usually the index format help
    help = ''

    # wait time (passed to time.sleep) between downloading comic strips
    waitSeconds = 0

    # HTTP session storing cookies
    # NOTE: created once when the class body runs, so the same session is
    # used by every scraper class and instance that does not override it.
    session = requests.session()

    def __init__(self, indexes=None):
        """Initialize internal variables.

        @param indexes: optional iterable of strip indexes to start from;
          stored as a sorted tuple so it can take part in hashing
        """
        self.urls = set()
        if indexes:
            self.indexes = tuple(sorted(indexes))
        else:
            self.indexes = tuple()
        self.skippedUrls = set()
        self.hitFirstStripUrl = False

    def _getCompareKey(self):
        """Get the (name, indexes) tuple used for hashing and comparison."""
        return (self.getName(), self.indexes)

    def __cmp__(self, other):
        """Compare scraper by name and index list.

        Only consulted by Python 2; kept for callers relying on it.
        Non-scraper objects always sort before scrapers.
        """
        if not isinstance(other, _BasicScraper):
            return 1
        # tuple comparison orders by name first, then by indexes
        mine = self._getCompareKey()
        theirs = other._getCompareKey()
        # equivalent of the Python 2 builtin cmp(), which Python 3 lacks
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons: Python 3 ignores __cmp__, so without these two
    # scrapers with the same hash would never compare equal there.
    # Returning NotImplemented for foreign types lets Python 2 fall back
    # to __cmp__ and Python 3 fall back to its defaults.

    def __eq__(self, other):
        """Scrapers are equal if name and index list are equal."""
        if not isinstance(other, _BasicScraper):
            return NotImplemented
        return self._getCompareKey() == other._getCompareKey()

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it automatically)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __lt__(self, other):
        """Order scrapers by name, then by index list."""
        if not isinstance(other, _BasicScraper):
            return NotImplemented
        return self._getCompareKey() < other._getCompareKey()

    def __le__(self, other):
        """Order scrapers by name, then by index list."""
        if not isinstance(other, _BasicScraper):
            return NotImplemented
        return self._getCompareKey() <= other._getCompareKey()

    def __gt__(self, other):
        """Order scrapers by name, then by index list."""
        if not isinstance(other, _BasicScraper):
            return NotImplemented
        return self._getCompareKey() > other._getCompareKey()

    def __ge__(self, other):
        """Order scrapers by name, then by index list."""
        if not isinstance(other, _BasicScraper):
            return NotImplemented
        return self._getCompareKey() >= other._getCompareKey()

    def __hash__(self):
        """Get hash value from name and index list."""
        return hash(self._getCompareKey())

    def shouldSkipUrl(self, url):
        """Determine if search for images in given URL should be skipped.

        Hook for subclasses; the default implementation never skips.
        """
        return False

    def getComicStrip(self, url, data, baseUrl):
        """Get comic strip downloader for given URL and data.

        Searches the page data with imageSearch, passes every found URL
        through imageUrlModifier() and removes duplicates.

        @param url: URL of the strip page
        @param data: page content as returned by getPageContent()
        @param baseUrl: base URL as returned by getPageContent()
        @return: ComicStrip for the found image URLs (possibly none)
        """
        imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
        imageUrls = set(map(self.imageUrlModifier, imageUrls))
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
            out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
            # sets are unordered; sort to make the choice reproducible
            image = sorted(imageUrls)[0]
            out.warn(u"choosing image %s" % image)
            imageUrls = (image,)
        elif not imageUrls:
            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
            out.warn(u"found no images at %s with patterns %s" % (url, patterns))
        return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)

    def getStrips(self, maxstrips=None):
        """Get comic strips.

        Starts at the strip URL of every index given to the constructor,
        or at the latest URL if there are no indexes, and delegates to
        getStripsFor() for each start URL.

        @param maxstrips: number of strips to retrieve per start URL,
          or None for no limit
        @return: generator of ComicStrip objects
        """
        if maxstrips:
            word = u"strip" if maxstrips == 1 else u"strips"
            msg = u'Retrieving %d %s' % (maxstrips, word)
        else:
            msg = u'Retrieving all strips'
        if self.indexes:
            if len(self.indexes) == 1:
                msg += u" for index %s" % self.indexes[0]
            else:
                msg += u" for indexes %s" % self.indexes
            urls = [self.getIndexStripUrl(index) for index in self.indexes]
        else:
            urls = [self.getLatestUrl()]
        if self.adult:
            msg += u" (including adult content)"
        out.info(msg)
        for url in urls:
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for an URL. If maxstrips is a positive number, stop after
        retrieving the given number of strips.

        Follows the previous-strip links starting at url until one of:
        firstStripUrl is reached (hitFirstStripUrl is then set), maxstrips
        pages were visited, no previous URL is found, or the previous URL
        was already visited. Skipped URLs are recorded in skippedUrls and
        still count towards maxstrips.

        @return: generator of ComicStrip objects
        """
        self.hitFirstStripUrl = False
        seen_urls = set()
        while url:
            out.info(u'Get strip URL %s' % url, level=1)
            # the page is fetched even for skipped URLs since it is needed
            # below to find the previous link
            data, baseUrl = getPageContent(url, self.session)
            if self.shouldSkipUrl(url):
                out.info(u'Skipping URL %s' % url)
                self.skippedUrls.add(url)
            else:
                # keep the yield outside of the try block so that only
                # errors from the image search itself are handled here
                try:
                    strip = self.getComicStrip(url, data, baseUrl)
                except ValueError as msg:
                    # image not found
                    out.exception(msg)
                else:
                    yield strip
            if self.firstStripUrl == url:
                out.debug(u"Stop at first URL %s" % url)
                self.hitFirstStripUrl = True
                break
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break
            prevUrl = self.getPrevUrl(url, data, baseUrl)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn(u"Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl
            # only wait if there is another page to download
            if url and self.waitSeconds:
                time.sleep(self.waitSeconds)

    def getPrevUrl(self, url, data, baseUrl):
        """Find previous URL.

        On a match the URL is passed through prevUrlModifier() and the
        comicPageLink event is sent to the event handler.

        @return: the previous URL, or None if prevSearch is not set or
          did not match
        """
        prevUrl = None
        if self.prevSearch:
            try:
                prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
            except ValueError as msg:
                # assume there is no previous URL, but print a warning
                out.warn(u"%s Assuming no previous comic strips exist." % msg)
            else:
                prevUrl = self.prevUrlModifier(prevUrl)
                out.debug(u"Matched previous URL %s" % prevUrl)
                getHandler().comicPageLink(self.getName(), url, prevUrl)
        return prevUrl

    def getIndexStripUrl(self, index):
        """Get comic strip URL from index by interpolating it into stripUrl."""
        return self.stripUrl % index

    @classmethod
    def getName(cls):
        """Get scraper name: the 'name' attribute if the class has one,
        else the class name."""
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips.

        The default implementation returns the url class attribute.
        """
        return cls.url

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL.

        The default implementation returns None.
        NOTE(review): presumably ComicStrip then derives a name on its
        own - confirm in the comic module.
        """
        return None

    @classmethod
    def prevUrlModifier(cls, prevUrl):
        """Optional modification of parsed previous URLs. Useful if
        there are domain redirects. The default implementation does
        not modify the URL.
        """
        return prevUrl

    @classmethod
    def imageUrlModifier(cls, imageUrl):
        """Optional modification of parsed image URLs. Useful if the URL
        needs to be fixed before usage. The default implementation does
        not modify the URL.
        """
        return imageUrl

    def getLatestUrl(self):
        """Get the URL of the latest strip, where scraping starts when no
        indexes are given. The default implementation returns starter().
        """
        return self.starter()

    @classmethod
    def vote(cls):
        """Cast a public vote for this comic.

        Sends the scraper name (with '/' replaced by '_') and the system
        uid to the 'count/' page below configuration.VoteUrl.

        @return: text of the response page
        """
        url = configuration.VoteUrl + 'count/'
        uid = get_system_uid()
        data = {"name": cls.getName().replace('/', '_'), "uid": uid}
        page = urlopen(url, cls.session, data=data)
        return page.text

    def getCompleteFile(self, basepath):
        """Get filename indicating all comics are downloaded.

        @return: path of "complete.txt" in this comic's directory
          below basepath
        """
        dirname = getDirname(self.getName())
        return os.path.join(basepath, dirname, "complete.txt")

    def isComplete(self, basepath):
        """Check if all comics are downloaded, ie. the complete file exists."""
        return os.path.isfile(self.getCompleteFile(basepath))

    def setComplete(self, basepath):
        """Set complete flag for this comic, ie. all comics are downloaded.

        Does nothing unless endOfLife is set. An existing complete file is
        left untouched. The comic directory is not created here, so it
        must already exist.
        """
        if self.endOfLife:
            filename = self.getCompleteFile(basepath)
            if not os.path.exists(filename):
                with open(filename, 'w') as f:
                    f.write('All comics should be downloaded here.')


def find_scraperclasses(comic, multiple_allowed=False):
    """Look up comic scraper classes by name.

    Names are compared case insensitively. A class whose name equals the
    given name is an exact match, a class whose name contains it is a
    partial match.

    @param comic: full or partial comic name
    @param multiple_allowed: if True, return all exact and partial
      matches; if False, return the first exact match right away and
      otherwise accept only a single partial match
    @return: list of matching scraper classes
    @raise ValueError: if the name is empty, if nothing matches, or if
      several classes match while multiple_allowed is False
    """
    if not comic:
        raise ValueError("empty comic name")
    wanted = comic.lower()
    matches = []
    for klass in get_scraperclasses():
        lowered = klass.getName().lower()
        exact = (lowered == wanted)
        if exact and not multiple_allowed:
            # perfect match wins over any partial matches
            return [klass]
        if exact or wanted in lowered:
            matches.append(klass)
    if not matches:
        raise ValueError('comic %r not found' % comic)
    if not multiple_allowed and len(matches) > 1:
        names = ", ".join(klass.getName() for klass in matches)
        raise ValueError('multiple comics found: %s' % names)
    return matches


_scraperclasses = None
def get_scraperclasses():
    """Find all comic scraper classes in the plugins directory.
    The result is cached in the module-level _scraperclasses list. The
    cache is dropped again if the duplicate name check fails, so a later
    call repeats the search and the check instead of returning an
    unchecked list.
    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    @raise ValueError: if check_scrapers() finds duplicate scraper names
    """
    global _scraperclasses
    if _scraperclasses is None:
        out.debug(u"Loading comic modules...")
        modules = loader.get_modules()
        plugins = loader.get_plugins(modules, _BasicScraper)
        # check_scrapers() reads the module-level list, so it has to be
        # assigned before the check runs
        _scraperclasses = list(plugins)
        try:
            check_scrapers()
        except Exception:
            # never keep a list cached that did not pass the check
            _scraperclasses = None
            raise
        out.debug(u"... %d modules loaded." % len(_scraperclasses))
    return _scraperclasses


def check_scrapers():
    """Verify that no two loaded scraper classes share a name.

    Walks the module-level _scraperclasses list and compares the
    scraper names case insensitively.
    @raise ValueError: for the first class whose name was already seen
    """
    byname = {}
    for klass in _scraperclasses:
        key = klass.getName().lower()
        if key not in byname:
            byname[key] = klass
            continue
        # report the clashing class first, then the one registered earlier
        current = klass.getName()
        earlier = byname[key].getName()
        raise ValueError('duplicate scrapers %s and %s found' % (current, earlier))


def make_scraper(classname, **attributes):
    """Create a new scraper class at runtime.

    @param classname: name of the class to create
    @param attributes: keyword arguments that become the class
      attributes of the new class
    @return: new class derived from _BasicScraper
    """
    bases = (_BasicScraper,)
    namespace = attributes
    return type(classname, bases, namespace)
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2013-01-28 17:52:26 +00:00			`# Copyright (C) 2012-2013 Bastian Kleineidam`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`import requests`
Add option to wait before downloading. 2013-03-08 05:46:50 +00:00			`import time`
Detect completed end-of-life comics. 2013-04-25 20:40:06 +00:00			`import os`
Voting part 2 2013-04-08 19:20:01 +00:00			`from . import loader, configuration`
			`from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence,`
Detect completed end-of-life comics. 2013-04-25 20:40:06 +00:00			`get_system_uid, urlopen, getDirname)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`from .comic import ComicStrip`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`from .output import out`
Add event comicPageLink for every previous link. This event allows a listener to build connections between pages. 2013-03-10 15:23:04 +00:00			`from .events import getHandler`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Add genre tags. 2013-03-26 16:33:27 +00:00			`class Genre:`
			`"""Genre of a comic strip."""`
			`adventure = u"Adventure"`
			`crazy = u"Crazy"`
			`drama = u"Drama"`
			`fantasy = u"Fantasy"`
			`gaming = u"Gaming"`
			`humor = u"Humor"`
			`reallife = u"Real life"`
			`scifi = u"Sci-fi"`
			`other = u"Other"`


A lot of refactoring. 2012-10-11 10:03:12 +00:00			`class _BasicScraper(object):`
Code cleanup. 2013-03-07 17:22:39 +00:00			`'''Base class with scrape functions for comics.'''`

			`# The URL for the comic strip`
			`url = None`

			`# A string that is interpolated with the strip index to yield the URL for a particular strip.`
			`stripUrl = None`

			`# Stop search for previous URLs at this URL`
Add firstStripUrl to scrapers. 2013-02-13 18:59:59 +00:00			`firstStripUrl = None`

Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`# if more than one image per URL is expected`
			`multipleImagesPerStrip = False`
Fix more comics. 2012-12-05 20:52:52 +00:00
			`# set to False if previous URLs do not match the strip URL (ie. because of redirects)`
			`prevUrlMatchesStripUrl = True`

Add cookie feature. 2012-12-08 20:29:57 +00:00			`# set to True if this comic contains adult content`
			`adult = False`

Detect completed end-of-life comics. 2013-04-25 20:40:06 +00:00			`# set to True if this comic will not get updated anymore`
			`endOfLife = False`

Various fixes and additions. 2012-12-12 16:41:29 +00:00			`# a description of the comic contents`
Unicode descriptions. 2013-04-29 05:35:56 +00:00			`description = u''`
Various fixes and additions. 2012-12-12 16:41:29 +00:00
Fix some comics and add language tag. 2013-03-08 21:33:05 +00:00			`# langauge of the comic (two-letter ISO 639-1 code)`
			`lang = 'en'`

Add genre tags. 2013-03-26 16:33:27 +00:00			`# list of genres for this comic strip`
Fix genre list 2013-03-26 18:58:22 +00:00			`genres = (Genre.other,)`
Add genre tags. 2013-03-26 16:33:27 +00:00
Code cleanup. 2013-03-07 17:22:39 +00:00			`# compiled regular expression that will locate the URL for the previous strip in a page`
Allow a list of regular expressions for image and previous link search. 2013-03-12 19:48:26 +00:00			`# this can also be a list or tuple of compiled regular expressions`
Code cleanup. 2013-03-07 17:22:39 +00:00			`prevSearch = None`

			`# compiled regular expression that will locate the strip image URLs strip in a page`
Allow a list of regular expressions for image and previous link search. 2013-03-12 19:48:26 +00:00			`# this can also be a list or tuple of compiled regular expressions`
Code cleanup. 2013-03-07 17:22:39 +00:00			`imageSearch = None`

Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`# usually the index format help`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`help = ''`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Fix some comics and add language tag. 2013-03-08 21:33:05 +00:00			`# wait time between downloading comic strips`
Add option to wait before downloading. 2013-03-08 05:46:50 +00:00			`waitSeconds = 0`

Various fixes and additions. 2012-12-12 16:41:29 +00:00			`# HTTP session storing cookies`
			`session = requests.session()`
Fix more comics. 2012-12-05 20:52:52 +00:00
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`def __init__(self, indexes=None):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Initialize internal variables."""`
			`self.urls = set()`
Make _BasicScraper hashable. 2013-02-13 19:00:16 +00:00			`if indexes:`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`self.indexes = tuple(sorted(indexes))`
Make _BasicScraper hashable. 2013-02-13 19:00:16 +00:00			`else:`
			`self.indexes = tuple()`
Remember skipped URLs. 2013-02-20 19:51:39 +00:00			`self.skippedUrls = set()`
Fix tests which hit the first URL. 2013-02-21 18:48:21 +00:00			`self.hitFirstStripUrl = False`
Make _BasicScraper hashable. 2013-02-13 19:00:16 +00:00
			`def __cmp__(self, other):`
Code cleanup. 2013-02-18 19:02:16 +00:00			`"""Compare scraper by name and index list."""`
Make _BasicScraper hashable. 2013-02-13 19:00:16 +00:00			`if not isinstance(other, _BasicScraper):`
			`return 1`
			`# first, order by name`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`d = cmp(self.getName(), other.getName())`
Make _BasicScraper hashable. 2013-02-13 19:00:16 +00:00			`if d != 0:`
			`return d`
			`# then by indexes`
			`return cmp(self.indexes, other.indexes)`

			`def __hash__(self):`
Code cleanup. 2013-02-18 19:02:16 +00:00			`"""Get hash value from name and index list."""`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`return hash((self.getName(), self.indexes))`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`def shouldSkipUrl(self, url):`
			`"""Determine if search for images in given URL should be skipped."""`
			`return False`
Add imageUrlModifier() for scrapers. 2013-03-04 18:10:27 +00:00
			`def getComicStrip(self, url, data, baseUrl):`
			`"""Get comic strip downloader for given URL and data."""`
			`imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)`
			`imageUrls = set(map(self.imageUrlModifier, imageUrls))`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`if len(imageUrls) > 1 and not self.multipleImagesPerStrip:`
Use tuples rather than lists. 2013-04-05 16:55:19 +00:00			`patterns = [x.pattern for x in makeSequence(self.imageSearch)]`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))`
Fixed some comics. 2013-04-11 16:27:43 +00:00			`image = sorted(imageUrls)[0]`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"choosing image %s" % image)`
Fixed some comics. 2013-04-11 16:27:43 +00:00			`imageUrls = (image,)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`elif not imageUrls:`
Use tuples rather than lists. 2013-04-05 16:55:19 +00:00			`patterns = [x.pattern for x in makeSequence(self.imageSearch)]`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"found no images at %s with patterns %s" % (url, patterns))`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`def getStrips(self, maxstrips=None):`
			`"""Get comic strips."""`
Fix more comics. 2012-12-07 23:45:18 +00:00			`if maxstrips:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`word = u"strip" if maxstrips == 1 else "strips"`
			`msg = u'Retrieving %d %s' % (maxstrips, word)`
Fix more comics. 2012-12-07 23:45:18 +00:00			`else:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`msg = u'Retrieving all strips'`
Improve comic strip message. 2013-01-29 17:51:35 +00:00			`if self.indexes:`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`if len(self.indexes) == 1:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`msg += u" for index %s" % self.indexes[0]`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`else:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`msg += u" for indexes %s" % self.indexes`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`urls = [self.getIndexStripUrl(index) for index in self.indexes]`
			`else:`
			`urls = [self.getLatestUrl()]`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`if self.adult:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`msg += u" (including adult content)"`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`out.info(msg)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`for url in urls:`
Fix more comics. 2012-12-07 23:45:18 +00:00			`for strip in self.getStripsFor(url, maxstrips):`
Fix indexed retrieval. 2012-10-11 17:53:37 +00:00			`yield strip`

Fix more comics. 2012-12-07 23:45:18 +00:00			`def getStripsFor(self, url, maxstrips):`
			`"""Get comic strips for an URL. If maxstrips is a positive number, stop after`
			`retrieving the given number of strips."""`
Fix tests which hit the first URL. 2013-02-21 18:48:21 +00:00			`self.hitFirstStripUrl = False`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`seen_urls = set()`
			`while url:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.info(u'Get strip URL %s' % url, level=1)`
Always use connection pooling. 2013-02-12 16:55:13 +00:00			`data, baseUrl = getPageContent(url, self.session)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`if self.shouldSkipUrl(url):`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.info(u'Skipping URL %s' % url)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`self.skippedUrls.add(url)`
Allow selected strips without images. 2013-02-18 19:03:27 +00:00			`else:`
Dont stop on image regex errors. 2013-03-15 06:03:54 +00:00			`try:`
			`yield self.getComicStrip(url, data, baseUrl)`
			`except ValueError as msg:`
			`# image not found`
Print stacktrace on image errors. 2013-03-25 18:48:47 +00:00			`out.exception(msg)`
Add firstStripUrl to scrapers. 2013-02-13 18:59:59 +00:00			`if self.firstStripUrl == url:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.debug(u"Stop at first URL %s" % url)`
Fix tests which hit the first URL. 2013-02-21 18:48:21 +00:00			`self.hitFirstStripUrl = True`
Add firstStripUrl to scrapers. 2013-02-13 18:59:59 +00:00			`break`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`if maxstrips is not None:`
			`maxstrips -= 1`
			`if maxstrips <= 0:`
			`break`
			`prevUrl = self.getPrevUrl(url, data, baseUrl)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`seen_urls.add(url)`
Add cookie feature. 2012-12-08 20:29:57 +00:00			`if prevUrl in seen_urls:`
			`# avoid recursive URL loops`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"Already seen previous URL %r" % prevUrl)`
Add cookie feature. 2012-12-08 20:29:57 +00:00			`break`
			`url = prevUrl`
Fix some comics and add language tag. 2013-03-08 21:33:05 +00:00			`if url and self.waitSeconds:`
Add option to wait before downloading. 2013-03-08 05:46:50 +00:00			`time.sleep(self.waitSeconds)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`def getPrevUrl(self, url, data, baseUrl):`
			`"""Find previous URL."""`
			`prevUrl = None`
			`if self.prevSearch:`
			`try:`
			`prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)`
			`except ValueError as msg:`
			`# assume there is no previous URL, but print a warning`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"%s Assuming no previous comic strips exist." % msg)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`else:`
			`prevUrl = self.prevUrlModifier(prevUrl)`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.debug(u"Matched previous URL %s" % prevUrl)`
Add event comicPageLink for every previous link. This event allows a listener to build connections between pages. 2013-03-10 15:23:04 +00:00			`getHandler().comicPageLink(self.getName(), url, prevUrl)`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`return prevUrl`

			`def getIndexStripUrl(self, index):`
			`"""Get comic strip URL from index."""`
			`return self.stripUrl % index`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`@classmethod`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`def getName(cls):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Get scraper name."""`
			`if hasattr(cls, 'name'):`
			`return cls.name`
			`return cls.__name__`

			`@classmethod`
			`def starter(cls):`
			`"""Get starter URL from where to scrape comic strips."""`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`return cls.url`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`@classmethod`
			`def namer(cls, imageUrl, pageUrl):`
			`"""Return filename for given image and page URL."""`
			`return None`

Fix some comics. 2012-12-02 17:35:06 +00:00			`@classmethod`
			`def prevUrlModifier(cls, prevUrl):`
			`"""Optional modification of parsed previous URLs. Useful if`
			`there are domain redirects. The default implementation does`
			`not modify the URL.`
			`"""`
			`return prevUrl`

Add imageUrlModifier() for scrapers. 2013-03-04 18:10:27 +00:00			`@classmethod`
			`def imageUrlModifier(cls, imageUrl):`
			`"""Optional modification of parsed image URLs. Useful if the URL`
			`needs to be fixed before usage. The default implementation does`
			`not modify the URL.`
			`"""`
			`return imageUrl`

A lot of refactoring. 2012-10-11 10:03:12 +00:00			`def getLatestUrl(self):`
			`"""Get starter URL from where to scrape comic strips."""`
			`return self.starter()`

Implemented voting 2013-04-09 17:33:50 +00:00			`@classmethod`
			`def vote(cls):`
First part of voting stuff. 2013-04-08 18:19:10 +00:00			`"""Cast a public vote for this comic."""`
Implemented voting 2013-04-09 17:33:50 +00:00			`url = configuration.VoteUrl + 'count/'`
Voting part 2 2013-04-08 19:20:01 +00:00			`uid = get_system_uid()`
Implemented voting 2013-04-09 17:33:50 +00:00			`data = {"name": cls.getName().replace('/', '_'), "uid": uid}`
Fixed some comics. 2013-04-11 16:27:43 +00:00			`page = urlopen(url, cls.session, data=data)`
Implemented voting 2013-04-09 17:33:50 +00:00			`return page.text`
First part of voting stuff. 2013-04-08 18:19:10 +00:00
Detect completed end-of-life comics. 2013-04-25 20:40:06 +00:00			`def getCompleteFile(self, basepath):`
			`"""Get filename indicating all comics are downloaded."""`
			`dirname = getDirname(self.getName())`
			`return os.path.join(basepath, dirname, "complete.txt")`

			`def isComplete(self, basepath):`
			`"""Check if all comics are downloaded."""`
			`return os.path.isfile(self.getCompleteFile(basepath))`

			`def setComplete(self, basepath):`
			`"""Set complete flag for this comic, ie. all comics are downloaded."""`
			`if self.endOfLife:`
			`filename = self.getCompleteFile(basepath)`
			`if not os.path.exists(filename):`
			`with open(filename, 'w') as f:`
			`f.write('All comics should be downloaded here.')`

A lot of refactoring. 2012-10-11 10:03:12 +00:00
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`def find_scraperclasses(comic, multiple_allowed=False):`
			`"""Get a list comic scraper classes. Can return more than one entries if`
			`multiple_allowed is True, else it raises a ValueError if multiple`
			`modules match. The match is a case insensitive substring search."""`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`if not comic:`
			`raise ValueError("empty comic name")`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`candidates = []`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`cname = comic.lower()`
Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`for scraperclass in get_scraperclasses():`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`lname = scraperclass.getName().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if lname == cname:`
			`# perfect match`
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`if not multiple_allowed:`
Fix scraperclass function. Closes issue #7. 2013-02-18 18:59:16 +00:00			`return [scraperclass]`
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`else:`
			`candidates.append(scraperclass)`
			`elif cname in lname:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`candidates.append(scraperclass)`
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`if len(candidates) > 1 and not multiple_allowed:`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`comics = ", ".join(x.getName() for x in candidates)`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('multiple comics found: %s' % comics)`
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`elif not candidates:`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('comic %r not found' % comic)`
Allow multiple comic name matches. 2013-02-13 21:18:05 +00:00			`return candidates`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`_scraperclasses = None`
			`def get_scraperclasses():`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`"""Find all comic scraper classes in the plugins directory.`
			`The result is cached.`
			`@return: list of _BasicScraper classes`
			`@rtype: list of _BasicScraper`
			`"""`
Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`global _scraperclasses`
			`if _scraperclasses is None:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.debug(u"Loading comic modules...")`
Require python 2.7, use importlib. 2012-11-19 20:20:50 +00:00			`modules = loader.get_modules()`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`plugins = loader.get_plugins(modules, _BasicScraper)`
Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`_scraperclasses = list(plugins)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`check_scrapers()`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.debug(u"... %d modules loaded." % len(_scraperclasses))`
Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`return _scraperclasses`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

			`def check_scrapers():`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Check for duplicate scraper class names."""`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`d = {}`
Rename get_scrapers to get_scraperclasses 2013-02-13 18:59:13 +00:00			`for scraperclass in _scraperclasses:`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`name = scraperclass.getName().lower()`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if name in d:`
Added some comic strips and cleanup the scraper code. 2013-03-06 19:00:30 +00:00			`name1 = scraperclass.getName()`
			`name2 = d[name].getName()`
Various fixes and additions. 2012-12-12 16:41:29 +00:00			`raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`d[name] = scraperclass`
Dynamic type generation helpers. 2012-11-26 06:14:02 +00:00

			`def make_scraper(classname, **attributes):`
			`"""Make a new scraper class with given name and attributes."""`
			`return type(classname, (_BasicScraper,), attributes)`