dosage/dosagelib/comic.py

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from __future__ import annotations

import codecs
import contextlib
import glob
import os
from datetime import datetime, timezone
from typing import Iterable, Iterator

from .events import getHandler
from .output import out
from .util import unquote, getFilename, urlopen, strsize


# Maximum content size for images; handed to urlopen() as max_content_bytes
# when ComicImage.connect() requests an image.
MAX_IMAGE_BYTES = 1024 * 1024 * 20  # 20 MB
# RFC 1123 format, as preferred by RFC 2616; used to format the
# If-Modified-Since request header. The zone is the literal text "GMT", so
# the datetime being formatted is expected to already be in UTC.
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"


class ComicStrip:
    """A comic strip page together with the image URLs found on it."""

    def __init__(self, scraper, strip_url: str, image_urls: Iterable[str],
                 text=None) -> None:
        """Store the image URL list.

        :param scraper: object providing ``namer(image_url, strip_url)``; it
            is also handed on to every ``ComicImage`` that gets created.
        :param strip_url: URL of the strip page; passed to the namer and used
            as the referrer of the image downloaders.
        :param image_urls: the image URLs belonging to this strip. They are
            iterated once per ``getImages()`` call.
        :param text: optional text handed on to every image downloader.
        """
        self.scraper = scraper
        self.strip_url = strip_url
        self.image_urls = image_urls
        self.text = text

    def getImages(self) -> Iterator[ComicImage]:
        """Lazily yield one image downloader per stored image URL."""
        for image_url in self.image_urls:
            yield self.getDownloader(image_url)

    def getDownloader(self, url: str) -> ComicImage:
        """Get an image downloader for ``url``.

        The file name is taken from the scraper's namer; when the namer
        returns ``None`` the last path segment of ``url`` is used instead.
        """
        filename = self.scraper.namer(url, self.strip_url)
        if filename is None:
            filename = self._default_filename(url)
        return ComicImage(self.scraper, url, self.strip_url, filename,
                          text=self.text)

    @staticmethod
    def _default_filename(url: str) -> str:
        """Return the text after the last ``/`` of ``url``.

        A URL without any ``/`` is returned unchanged instead of raising
        ``IndexError``.
        """
        return url.rsplit('/', 1)[-1]


class ComicImage:
    """A comic image downloader.

    ``save()`` is the entry point: it connects to the image URL, decides
    whether a download is needed, writes the image (and the optional text)
    to disk and notifies the event handler.
    """

    # Chunk size used when streaming the response body to disk.
    ChunkBytes = 1024 * 100  # 100KB

    def __init__(self, scraper, url, referrer, filename, text=None):
        """Set URL and filename.

        :param scraper: object providing ``session`` and
            ``get_download_dir(basepath)``.
        :param url: URL of the image.
        :param referrer: referrer passed along with the image request.
        :param filename: target file name; it is passed through
            ``getFilename()`` and then split into base name and extension.
        :param text: optional text that is saved next to the image.
        """
        self.scraper = scraper
        self.referrer = referrer
        self.url = url
        filename = getFilename(filename)
        self.filename, self.ext = os.path.splitext(filename)
        self.text = text

    def connect(self, lastchange=None):
        """Connect to host and get meta information.

        Sets ``self.urlobj`` and, unless the server answers 304, also
        ``self.contentLength`` and possibly ``self.ext``.

        :param lastchange: optional datetime (in UTC) sent as the
            ``If-Modified-Since`` header.
        :raises IOError: if the reported content type is neither an image
            nor one of the tolerated generic types.
        """
        headers = {}
        if lastchange:
            headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
        self.urlobj = urlopen(self.url, self.scraper.session,
                              referrer=self.referrer,
                              max_content_bytes=MAX_IMAGE_BYTES, stream=True,
                              headers=headers)
        if self.urlobj.status_code == 304:  # Not modified
            return
        content_type = unquote(self.urlobj.headers.get(
            'content-type', 'application/octet-stream'))
        # Drop parameters such as "; charset=..."; media types are
        # case-insensitive and may carry surrounding whitespace.
        content_type = content_type.split(';', 1)[0].strip().lower()
        maintype, _, subtype = content_type.partition('/')
        if maintype != 'image' and content_type not in (
                'application/octet-stream', 'application/x-shockwave-flash'):
            raise IOError('content type %r is not an image at %s' % (
                content_type, self.url))
        # Always use mime type for file extension if it is sane. Without a
        # subtype the extension derived from the file name is kept.
        if maintype == 'image' and subtype:
            self.ext = '.' + subtype.replace('jpeg', 'jpg')
        self.contentLength = int(self.urlobj.headers.get('content-length', 0))
        out.debug('... filename = %r, ext = %r, contentLength = %d' % (
            self.filename, self.ext, self.contentLength))

    def save(self, basepath):
        """Save comic URL to filename on disk.

        :param basepath: base directory handed to the scraper to determine
            the download directory.
        :returns: tuple ``(filename, downloaded)``; ``downloaded`` is False
            when an existing file was kept.
        """
        fnbase = self._fnbase(basepath)
        exist = self._existing_files(fnbase)
        out.info("Get image URL %s" % self.url, level=1)
        if len(exist) == 1:
            # Exactly one candidate on disk: let the server decide via
            # If-Modified-Since whether it is still current.
            lastchange = os.path.getmtime(exist[0])
            self.connect(datetime.fromtimestamp(lastchange, timezone.utc))
            if self.urlobj.status_code == 304:  # Not modified
                self._exist_err(exist[0])
                return exist[0], False
        else:
            self.connect()
        fn = fnbase + self.ext
        # compare with >= since content length could be the compressed size
        if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
            self._exist_err(fn)
            return fn, False
        out.debug('Writing comic to file %s...' % fn)
        with self.fileout(fn) as f:
            for chunk in self.urlobj.iter_content(self.ChunkBytes):
                f.write(chunk)
        if self.text:
            fntext = fnbase + ".txt"
            out.debug('Writing comic text to file %s...' % fntext)
            with self.fileout(fntext, encoding='utf-8') as f:
                f.write(self.text)
        getHandler().comicDownloaded(self, fn)
        return fn, True

    @contextlib.contextmanager
    def fileout(self, filename, encoding=None):
        """Open ``filename`` for writing and yield the file object.

        If ``encoding`` is given the file is opened with ``codecs.open()``,
        otherwise in binary mode. When the ``with`` body (or closing the
        file) fails, the partially written file is removed and the error is
        re-raised; on success the saved size is logged.
        """
        def getfp(filename, encoding):
            """Get open file object."""
            if encoding:
                return codecs.open(filename, 'w', encoding)
            return open(filename, 'wb')

        try:
            with getfp(filename, encoding) as fp:
                yield fp
                size = fp.tell()
        except BaseException:
            # Also covers KeyboardInterrupt: a truncated file left behind
            # would get a fresh mtime and could later be accepted as
            # complete when the server answers 304.
            if os.path.isfile(filename):
                os.remove(filename)
            raise
        else:
            out.info("Saved %s (%s)." % (filename, strsize(size)))

    def _exist_err(self, fn):
        """Log that the already existing file ``fn`` is skipped."""
        out.info('Skipping existing file "%s".' % fn)

    @staticmethod
    def _existing_files(fnbase):
        """Return files named ``fnbase`` plus any extension, except ``.txt``.

        ``fnbase`` is escaped so that characters like ``[`` or ``*`` in the
        directory or file name are matched literally.
        """
        pattern = glob.escape(fnbase) + ".*"
        return [x for x in glob.glob(pattern) if not x.endswith(".txt")]

    def _fnbase(self, basepath):
        '''Determine the target base name of this comic file and make sure the
        directory exists.'''
        comicdir = self.scraper.get_download_dir(basepath)
        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation.
        os.makedirs(comicdir, exist_ok=True)
        return os.path.join(comicdir, self.filename)
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Add some more type annotations 2024-02-18 15:53:17 +00:00			`# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs`
			`# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam`
			`# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher`
			`from __future__ import annotations`

Initial commit to Github. 2012-06-20 19:58:13 +00:00			`import os`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`import glob`
			`import codecs`
			`import contextlib`
			`from datetime import datetime`
Add some more type annotations 2024-02-18 15:53:17 +00:00			`from typing import Iterator`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
			`from .output import out`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`from .util import unquote, getFilename, urlopen, strsize`
Fix event handling. 2012-10-12 20:07:50 +00:00			`from .events import getHandler`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00
			`# Maximum content size for images`
Add some more type annotations 2024-02-18 15:53:17 +00:00			`MAX_IMAGE_BYTES = 1024 * 1024 * 20 # 20 MB`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`# RFC 1123 format, as preferred by RFC 2616`
			`RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"`


Add some more type annotations 2024-02-18 15:53:17 +00:00			`class ComicStrip:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""A list of comic image URLs."""`
Document some functions. 2012-09-26 14:47:39 +00:00
Add some more type annotations 2024-02-18 15:53:17 +00:00			`def __init__(self, scraper, strip_url: str, image_urls: str, text=None) -> None:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Store the image URL list."""`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`self.scraper = scraper`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`self.strip_url = strip_url`
			`self.image_urls = image_urls`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`self.text = text`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Add some more type annotations 2024-02-18 15:53:17 +00:00			`def getImages(self) -> Iterator[ComicImage]:`
A lot of refactoring. 2012-10-11 10:03:12 +00:00			`"""Get a list of image downloaders."""`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`for image_url in self.image_urls:`
			`yield self.getDownloader(image_url)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00
Add some more type annotations 2024-02-18 15:53:17 +00:00			`def getDownloader(self, url: str) -> ComicImage:`
Add more documentation. 2012-10-11 16:02:29 +00:00			`"""Get an image downloader."""`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`filename = self.scraper.namer(url, self.strip_url)`
Add more documentation. 2012-10-11 16:02:29 +00:00			`if filename is None:`
			`filename = url.rsplit('/', 1)[1]`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`return ComicImage(self.scraper, url, self.strip_url, filename,`
			`text=self.text)`
A lot of refactoring. 2012-10-11 10:03:12 +00:00

Add some more type annotations 2024-02-18 15:53:17 +00:00			`class ComicImage:`
Add more documentation. 2012-10-11 16:02:29 +00:00			`"""A comic image downloader."""`

Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`ChunkBytes = 1024 * 100 # 100KB`
Always use connection pooling. 2013-02-12 16:55:13 +00:00
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`def __init__(self, scraper, url, referrer, filename, text=None):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Set URL and filename."""`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`self.scraper = scraper`
Document some functions. 2012-09-26 14:47:39 +00:00			`self.referrer = referrer`
Stripping should be done in normaliseUrl. 2014-06-08 08:12:33 +00:00			`self.url = url`
Fix more comics. 2012-12-07 23:45:18 +00:00			`filename = getFilename(filename)`
Document some functions. 2012-09-26 14:47:39 +00:00			`self.filename, self.ext = os.path.splitext(filename)`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`self.text = text`
Document some functions. 2012-09-26 14:47:39 +00:00
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`def connect(self, lastchange=None):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Connect to host and get meta information."""`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`headers = {}`
			`if lastchange:`
			`headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`self.urlobj = urlopen(self.url, self.scraper.session,`
			`referrer=self.referrer,`
Add some more type annotations 2024-02-18 15:53:17 +00:00			`max_content_bytes=MAX_IMAGE_BYTES, stream=True,`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`headers=headers)`
			`if self.urlobj.status_code == 304: # Not modified`
			`return`
			`content_type = unquote(self.urlobj.headers.get(`
			`'content-type', 'application/octet-stream'))`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`content_type = content_type.split(';', 1)[0]`
			`if '/' in content_type:`
			`maintype, subtype = content_type.split('/', 1)`
			`else:`
			`maintype = content_type`
			`subtype = None`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`if maintype != 'image' and content_type not in (`
			`'application/octet-stream', 'application/x-shockwave-flash'):`
			`raise IOError('content type %r is not an image at %s' % (`
			`content_type, self.url))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`# Always use mime type for file extension if it is sane.`
Fix comics, improve tests, use python-requests. 2012-11-26 17:44:31 +00:00			`if maintype == 'image':`
			`self.ext = '.' + subtype.replace('jpeg', 'jpg')`
			`self.contentLength = int(self.urlobj.headers.get('content-length', 0))`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (`
			`self.filename, self.ext, self.contentLength))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Remove progress stuff. 2012-10-11 16:08:18 +00:00			`def save(self, basepath):`
Document some functions. 2012-09-26 14:47:39 +00:00			`"""Save comic URL to filename on disk."""`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`fnbase = self._fnbase(basepath)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`exist = [x for x in glob.glob(fnbase + ".*") if not x.endswith(".txt")]`
More unicode output fixes. 2013-04-30 04:40:20 +00:00			`out.info(u"Get image URL %s" % self.url, level=1)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`if len(exist) == 1:`
			`lastchange = os.path.getmtime(exist[0])`
			`self.connect(datetime.utcfromtimestamp(lastchange))`
			`if self.urlobj.status_code == 304: # Not modified`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`self._exist_err(exist[0])`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`return exist[0], False`
			`else:`
			`self.connect()`
			`fn = fnbase + self.ext`
Retry empty downloads and don't set a manual modification time. 2013-03-07 17:20:38 +00:00			`# compare with >= since content length could be the compressed size`
			`if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`self._exist_err(fn)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return fn, False`
Apply same file checks files as for image files. 2013-12-05 17:29:15 +00:00			`out.debug(u'Writing comic to file %s...' % fn)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`with self.fileout(fn) as f:`
			`for chunk in self.urlobj.iter_content(self.ChunkBytes):`
			`f.write(chunk)`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`if self.text:`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`fntext = fnbase + ".txt"`
Add text search in comic strips. 2013-11-29 19:26:49 +00:00			`out.debug(u'Writing comic text to file %s...' % fntext)`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`with self.fileout(fntext, encoding='utf-8') as f:`
			`f.write(self.text)`
Ignore case for comic download directories. Since we already match comics case-insensitive on the command line, this was a logical step, even if this means changing quite a bit of code that all tries to resolve the "comic directory" in a slightly different way... 2016-06-05 21:55:54 +00:00			`getHandler().comicDownloaded(self, fn)`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`return fn, True`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00
			`@contextlib.contextmanager`
			`def fileout(self, filename, encoding=None):`
			`"""Write content to given filename. Checks for zero-sized files.`
			`If encoding is given writes to a codec.open() file."""`
			`def getfp(filename, encoding):`
			`"""Get open file object."""`
			`if encoding:`
			`return codecs.open(filename, 'w', encoding)`
			`return open(filename, 'wb')`

			`try:`
			`with getfp(filename, encoding) as fp:`
			`yield fp`
			`size = fp.tell()`
			`except Exception:`
			`if os.path.isfile(filename):`
			`os.remove(filename)`
			`raise`
			`else:`
			`out.info(u"Saved %s (%s)." % (filename, strsize(size)))`

Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00			`def _exist_err(self, fn):`
Send "If-Modified-Since" header for images. 2016-04-18 22:32:25 +00:00			`out.info(u'Skipping existing file "%s".' % fn)`
Speed up comic module tests. This fakes an If-Modified-Since header, so most web servers don't need to send comic images at all. This should also reduce the amount of data that needs to be fetched for comic module tests. 2016-07-31 22:44:34 +00:00
			`def _fnbase(self, basepath):`
			`'''Determine the target base name of this comic file and make sure the`
			`directory exists.'''`
			`comicdir = self.scraper.get_download_dir(basepath)`
			`if not os.path.isdir(comicdir):`
			`os.makedirs(comicdir)`
			`return os.path.join(comicdir, self.filename)`