dosage/dosagelib/comic.py

154 lines
5.7 KiB
Python
Raw Permalink Normal View History

# SPDX-License-Identifier: MIT
2024-02-18 15:53:17 +00:00
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from __future__ import annotations
2012-06-20 19:58:13 +00:00
import os
import glob
import codecs
import contextlib
from datetime import datetime
2024-02-18 15:53:17 +00:00
from typing import Iterator
2012-06-20 19:58:13 +00:00
from .output import out
from .util import unquote, getFilename, urlopen, strsize
2012-10-12 20:07:50 +00:00
from .events import getHandler
2012-06-20 19:58:13 +00:00
# Maximum content size for images
2024-02-18 15:53:17 +00:00
MAX_IMAGE_BYTES = 1024 * 1024 * 20 # 20 MB; passed to urlopen() as max_content_bytes
# RFC 1123 date format, as preferred by RFC 2616; used to build the
# If-Modified-Since request header
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
2024-02-18 15:53:17 +00:00
class ComicStrip:
    """A comic strip page together with the URLs of its images."""

    def __init__(self, scraper, strip_url: str, image_urls: list[str], text=None) -> None:
        """Store the scraper, the strip page URL, the image URLs and the text.

        ``scraper`` must provide a ``namer(image_url, strip_url)`` callable;
        it is used by getDownloader() to pick the target file name.
        ``image_urls`` is iterated once per getImages() call, ``text`` is
        handed unchanged to every ComicImage that gets created.
        """
        self.scraper = scraper
        self.strip_url = strip_url
        self.image_urls = image_urls
        self.text = text

    def getImages(self) -> Iterator[ComicImage]:
        """Yield an image downloader for every stored image URL."""
        for image_url in self.image_urls:
            yield self.getDownloader(image_url)

    def getDownloader(self, url: str) -> ComicImage:
        """Get an image downloader for the given image URL.

        The file name is taken from the scraper's namer; when the namer
        returns None the last path component of the URL is used instead.
        """
        filename = self.scraper.namer(url, self.strip_url)
        if filename is None:
            filename = self._fallback_filename(url)
        return ComicImage(self.scraper, url, self.strip_url, filename,
                          text=self.text)

    @staticmethod
    def _fallback_filename(url: str) -> str:
        """Return the part of url after the last slash.

        A URL without any slash is returned unchanged instead of raising
        IndexError.
        """
        return url.rsplit('/', 1)[-1]
2012-10-11 10:03:12 +00:00
2024-02-18 15:53:17 +00:00
class ComicImage:
    """A comic image downloader."""

    # Chunk size used when streaming the response body to disk.
    ChunkBytes = 1024 * 100  # 100KB

    def __init__(self, scraper, url, referrer, filename, text=None):
        """Set URL and filename.

        The file name is passed through getFilename() and then split into
        base name (self.filename) and extension (self.ext).
        """
        self.scraper = scraper
        self.referrer = referrer
        self.url = url
        filename = getFilename(filename)
        self.filename, self.ext = os.path.splitext(filename)
        self.text = text

    def connect(self, lastchange=None):
        """Connect to host and get meta information.

        If lastchange (a datetime) is given, it is sent as If-Modified-Since
        header. On a 304 response only self.urlobj is set. Otherwise
        self.ext is replaced by an extension derived from an image content
        type and self.contentLength is set from the Content-Length header
        (0 when the header is missing).

        Raises IOError if the content type is neither an image nor one of
        the explicitly tolerated application types.
        """
        headers = {}
        if lastchange:
            headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
        self.urlobj = urlopen(self.url, self.scraper.session,
                              referrer=self.referrer,
                              max_content_bytes=MAX_IMAGE_BYTES, stream=True,
                              headers=headers)
        if self.urlobj.status_code == 304:  # Not modified
            return
        content_type, maintype, subtype = self._split_content_type(
            unquote(self.urlobj.headers.get(
                'content-type', 'application/octet-stream')))
        if maintype != 'image' and content_type not in (
                'application/octet-stream', 'application/x-shockwave-flash'):
            raise IOError('content type %r is not an image at %s' % (
                content_type, self.url))
        # Always use mime type for file extension if it is sane. A missing
        # or empty subtype keeps the extension derived from the file name.
        if maintype == 'image' and subtype:
            self.ext = '.' + subtype.replace('jpeg', 'jpg')
        self.contentLength = int(self.urlobj.headers.get('content-length', 0))
        out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (
            self.filename, self.ext, self.contentLength))

    @staticmethod
    def _split_content_type(value):
        """Split a Content-Type header value into its parts.

        Returns a tuple (content_type, maintype, subtype) where content_type
        is the value without parameters and surrounding whitespace. subtype
        is None when the value contains no slash.
        """
        content_type = value.split(';', 1)[0].strip()
        maintype, slash, subtype = content_type.partition('/')
        return content_type, maintype, (subtype if slash else None)

    def save(self, basepath):
        """Save comic URL to filename on disk.

        Returns a tuple (filename, downloaded); downloaded is False when an
        existing file was kept instead of writing a new one.
        """
        # Local import: only needed to build a timezone-aware timestamp.
        from datetime import timezone

        fnbase = self._fnbase(basepath)
        # Escape the base name so that glob metacharacters in directory or
        # file names ('[', '*', '?') are matched literally.
        exist = [x for x in glob.glob(glob.escape(fnbase) + ".*")
                 if not x.endswith(".txt")]
        out.info(u"Get image URL %s" % self.url, level=1)
        if len(exist) == 1:
            # Exactly one candidate: ask the server whether it changed since
            # the file was last modified.
            lastchange = os.path.getmtime(exist[0])
            self.connect(datetime.fromtimestamp(lastchange, tz=timezone.utc))
            if self.urlobj.status_code == 304:  # Not modified
                self._exist_err(exist[0])
                return exist[0], False
        else:
            self.connect()
        fn = fnbase + self.ext
        # compare with >= since content length could be the compressed size
        if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
            self._exist_err(fn)
            return fn, False
        out.debug(u'Writing comic to file %s...' % fn)
        with self.fileout(fn) as f:
            for chunk in self.urlobj.iter_content(self.ChunkBytes):
                f.write(chunk)
        if self.text:
            fntext = fnbase + ".txt"
            out.debug(u'Writing comic text to file %s...' % fntext)
            with self.fileout(fntext, encoding='utf-8') as f:
                f.write(self.text)
        getHandler().comicDownloaded(self, fn)
        return fn, True

    @contextlib.contextmanager
    def fileout(self, filename, encoding=None):
        """Context manager yielding a file object open for writing.

        Without encoding the file is opened in binary mode; with encoding it
        is opened as text file using that encoding and without newline
        translation. If the managed block (or opening the file) raises, the
        file is removed again so no partial download is left behind, and the
        exception is re-raised. On success the written size is logged.
        """
        try:
            if encoding:
                fp = open(filename, 'w', encoding=encoding, newline='')
            else:
                fp = open(filename, 'wb')
            with fp:
                yield fp
                size = fp.tell()
        except BaseException:
            # BaseException on purpose: an interrupted download
            # (KeyboardInterrupt) must not leave a truncated file that later
            # runs would treat as already downloaded.
            if os.path.isfile(filename):
                os.remove(filename)
            raise
        else:
            out.info(u"Saved %s (%s)." % (filename, strsize(size)))

    def _exist_err(self, fn):
        """Log that the existing file fn is kept and the download skipped."""
        out.info(u'Skipping existing file "%s".' % fn)

    def _fnbase(self, basepath):
        """Determine the target base name of this comic file and make sure
        the directory exists."""
        comicdir = self.scraper.get_download_dir(basepath)
        # exist_ok avoids failing when another worker creates the directory
        # between a check and the creation.
        os.makedirs(comicdir, exist_ok=True)
        return os.path.join(comicdir, self.filename)