dosage/dosagelib/comic.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
import os

from .output import out
from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile
from .events import getHandler

class ComicStrip(object):
    """A comic strip page and the image URLs that belong to it."""

    def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
        """Remember the strip data needed to create image downloaders.

        @param name: comic name; also passed to getDirname() later on
        @param stripUrl: URL of the strip page; handed to the namer and to
            each ComicImage as its referrer argument
        @param imageUrls: iterable of image URLs
        @param namer: callable taking (imageUrl, stripUrl) and returning a
            filename or None
        @param session: session object passed through to each ComicImage
        @param text: optional strip text passed through to each ComicImage
        """
        self.name, self.stripUrl = name, stripUrl
        self.imageUrls = imageUrls
        self.namer, self.session = namer, session
        self.text = text

    def getImages(self):
        """Yield one image downloader per stored image URL.

        Every URL is run through normaliseURL() before the downloader
        is created.
        """
        for rawUrl in self.imageUrls:
            cleanUrl = normaliseURL(rawUrl)
            yield self.getDownloader(cleanUrl)

    def getDownloader(self, url):
        """Create the ComicImage downloader for a single image URL."""
        namerResult = self.namer(url, self.stripUrl)
        # Without a namer-supplied name use the part after the last slash.
        imageName = url.rsplit('/', 1)[1] if namerResult is None else namerResult
        targetDir = getDirname(self.name)
        return ComicImage(
            self.name,
            url,
            self.stripUrl,
            targetDir,
            imageName,
            self.session,
            text=self.text,
        )


class ComicImage(object):
    """A comic image downloader.

    connect() fetches the image response and derives the file extension and
    content length from its headers; save() writes the image, and the strip
    text if there is any, below a base directory.
    """

    ChunkBytes = 1024 * 100 # 100KB

    # Content types that are accepted although their main type is not 'image'.
    _AllowedNonImageTypes = ('application/octet-stream', 'application/x-shockwave-flash')

    def __init__(self, name, url, referrer, dirname, filename, session, text=None):
        """Set URL and filename.

        @param name: comic name
        @param url: image URL to download
        @param referrer: referrer URL handed to getImageObject()
        @param dirname: directory name, joined to the base path in save()
        @param filename: file name; passed through getFilename() and then
            split into base name and extension
        @param session: session object handed to getImageObject()
        @param text: optional strip text, saved next to the image
        """
        self.name = name
        self.referrer = referrer
        self.url = url
        self.dirname = dirname
        filename = getFilename(filename)
        self.filename, self.ext = os.path.splitext(filename)
        self.session = session
        self.text = text

    @staticmethod
    def _parseContentType(value):
        """Split a Content-Type header value into its parts.

        Parameters after the first ';' are dropped, surrounding whitespace
        is stripped and the type is lowercased, since MIME type names are
        case-insensitive.

        @return: tuple (content_type, maintype, subtype); subtype is None
            when the value has no '/' or nothing follows it
        """
        content_type = value.split(';', 1)[0].strip().lower()
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype, subtype = content_type, None
        return content_type, maintype, subtype or None

    @staticmethod
    def _parseContentLength(value):
        """Convert a Content-Length header value to an int.

        @return: the parsed length, or 0 if the value is not a number
        """
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def connect(self):
        """Connect to host and get meta information.

        Sets self.urlobj and self.contentLength, and replaces self.ext when
        the response declares an image type with a subtype.

        @raise IOError: if the content type is neither an image type nor
            one of the explicitly allowed non-image types
        """
        self.urlobj = getImageObject(self.url, self.referrer, self.session)
        header = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream'))
        content_type, maintype, subtype = self._parseContentType(header)
        if maintype != 'image' and content_type not in self._AllowedNonImageTypes:
            raise IOError('content type %r is not an image at %s' % (content_type, self.url))
        # Always use mime type for file extension if it is sane.
        # A bare 'image' type has no subtype; keep the URL-derived extension then.
        if maintype == 'image' and subtype:
            self.ext = '.' + subtype.replace('jpeg', 'jpg')
        self.contentLength = self._parseContentLength(self.urlobj.headers.get('content-length', 0))
        out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))

    def save(self, basepath):
        """Save comic URL to filename on disk.

        @param basepath: base directory; the image goes into the
            self.dirname subdirectory, which is created if missing
        @return: tuple (filename, saved); saved is False when an existing
            file was kept instead of being written again
        """
        out.info(u"Get image URL %s" % self.url, level=1)
        # connect() may replace self.ext, so build the file name afterwards.
        self.connect()
        filename = "%s%s" % (self.filename, self.ext)
        comicDir = os.path.join(basepath, self.dirname)
        if not os.path.isdir(comicDir):
            os.makedirs(comicDir)
        fn = os.path.join(comicDir, filename)
        # compare with >= since content length could be the compressed size
        if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
            out.info(u'Skipping existing file "%s".' % fn)
            return fn, False
        content = self.urlobj.content
        if not content:
            # Retry exactly once; the second result is written as it is.
            out.warn(u"Empty content from %s, try again..." % self.url)
            self.connect()
            content = self.urlobj.content
        out.debug(u'Writing comic to file %s...' % fn)
        writeFile(fn, content)
        if self.text:
            # The text file uses the image base name with a .txt extension.
            fntext = os.path.join(comicDir, "%s.txt" % self.filename)
            out.debug(u'Writing comic text to file %s...' % fntext)
            writeFile(fntext, self.text, encoding='utf-8')
        getHandler().comicDownloaded(self, fn, text=self.text)
        return fn, True
Updated copyright for all source files. 2012-06-20 20:41:04 +00:00			`# -- coding: iso-8859-1 --`
			`# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2013-02-13 05:28:35 +00:00			`# Copyright (C) 2012-2013 Bastian Kleineidam`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`import os`

			`from .output import out`
Apply same file checks files as for image files. 2013-12-05 17:29:15 +00:00			`from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile`
Fix event handling. 2012-10-12 20:07:50 +00:00			`from .events import getHandler`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`class ComicStrip(object):`
			`"""A list of comic image URLs."""`
Document some functions. 2012-09-26 14:47:39 +00:00
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Store the image URL list."""`
			`self.name = name`
Fix some comics. 2012-11-13 18:12:28 +00:00			`self.stripUrl = stripUrl`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`self.imageUrls = imageUrls`
			`self.namer = namer`
Always use connection pooling. 2013-02-12 16:55:13 +00:00			`self.session = session`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`self.text = text`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
			`def getImages(self):`
			`"""Get a list of image downloaders."""`
			`for imageUrl in self.imageUrls:`
			`yield self.getDownloader(normaliseURL(imageUrl))`

			`def getDownloader(self, url):`
Add more documentation. 2012-10-11 16:02:29 +00:00			`"""Get an image downloader."""`
Fix some comics. 2012-11-13 18:12:28 +00:00			`filename = self.namer(url, self.stripUrl)`
Add more documentation. 2012-10-11 16:02:29 +00:00			`if filename is None:`
			`filename = url.rsplit('/', 1)[1]`
Fix more comics. 2012-12-07 23:45:18 +00:00			`dirname = getDirname(self.name)`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00

			`class ComicImage(object):`
Add more documentation. 2012-10-11 16:02:29 +00:00			`"""A comic image downloader."""`

Always use connection pooling. 2013-02-12 16:55:13 +00:00			`ChunkBytes = 1024 * 100 # 100KB`

Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`def __init__(self, name, url, referrer, dirname, filename, session, text=None):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Set URL and filename."""`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`self.name = name`
Document some functions. 2012-09-26 14:47:39 +00:00			`self.referrer = referrer`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`self.url = url`
Fix more comics. 2012-12-07 23:45:18 +00:00			`self.dirname = dirname`
			`filename = getFilename(filename)`
Document some functions. 2012-09-26 14:47:39 +00:00			`self.filename, self.ext = os.path.splitext(filename)`
Always use connection pooling. 2013-02-12 16:55:13 +00:00			`self.session = session`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`self.text = text`
Document some functions. 2012-09-26 14:47:39 +00:00
			`def connect(self):`
			`"""Connect to host and get meta information."""`
Simplify exception handling. 2013-03-07 23:06:50 +00:00			`self.urlobj = getImageObject(self.url, self.referrer, self.session)`
Fix URLs with no content type header. 2013-03-07 22:08:37 +00:00			`content_type = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream'))`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`content_type = content_type.split(';', 1)[0]`
			`if '/' in content_type:`
			`maintype, subtype = content_type.split('/', 1)`
			`else:`
			`maintype = content_type`
			`subtype = None`
			`if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):`
Retry empty downloads and don't set a manual modification time. 2013-03-07 17:20:38 +00:00			`raise IOError('content type %r is not an image at %s' % (content_type, self.url))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`# Always use mime type for file extension if it is sane.`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`if maintype == 'image':`
			`self.ext = '.' + subtype.replace('jpeg', 'jpg')`
			`self.contentLength = int(self.urlobj.headers.get('content-length', 0))`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Remove progress stuff. 2012-10-11 16:08:18 +00:00			`def save(self, basepath):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Save comic URL to filename on disk."""`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.info(u"Get image URL %s" % self.url, level=1)`
Document some functions. 2012-09-26 14:47:39 +00:00			`self.connect()`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`filename = "%s%s" % (self.filename, self.ext)`
Fix more comics. 2012-12-07 23:45:18 +00:00			`comicDir = os.path.join(basepath, self.dirname)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`if not os.path.isdir(comicDir):`
			`os.makedirs(comicDir)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`fn = os.path.join(comicDir, filename)`
Retry empty downloads and don't set a manual modification time. 2013-03-07 17:20:38 +00:00			`# compare with >= since content length could be the compressed size`
			`if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.info(u'Skipping existing file "%s".' % fn)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return fn, False`
Retry empty downloads and don't set a manual modification time. 2013-03-07 17:20:38 +00:00			`content = self.urlobj.content`
			`if not content:`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.warn(u"Empty content from %s, try again..." % self.url)`
Retry empty downloads and don't set a manual modification time. 2013-03-07 17:20:38 +00:00			`self.connect()`
			`content = self.urlobj.content`
Apply same file checks files as for image files. 2013-12-05 17:29:15 +00:00			`out.debug(u'Writing comic to file %s...' % fn)`
			`writeFile(fn, content)`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`if self.text:`
			`fntext = os.path.join(comicDir, "%s.txt" % self.filename)`
			`out.debug(u'Writing comic text to file %s...' % fntext)`
Apply same file checks files as for image files. 2013-12-05 17:29:15 +00:00			`writeFile(fntext, self.text, encoding='utf-8')`
			`getHandler().comicDownloaded(self, fn, text=self.text)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return fn, True`