# Commit 27d28b8eef: the default encoding for source files is UTF-8 since
# Python 3, so all encoding headers were dropped and replaced with SPDX
# headers. (File: 382 lines, 12 KiB, Python)
# SPDX-License-Identifier: MIT
|
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
|
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
|
import os
|
|
import time
|
|
from urllib.parse import quote as url_quote
|
|
import codecs
|
|
import json
|
|
|
|
import imagesize
|
|
|
|
from . import rss, util, configuration
|
|
from .output import out
|
|
|
|
# Maximum width or height to display an image in exported pages.
|
|
# Note that only the displayed size is adjusted, not the image itself.
|
|
MaxImageSize = (800, 800)
|
|
|
|
|
|
class EventHandler:
    """Base class for writing events to files. The currently defined events are
    start(), comicDownloaded() and end()."""

    def __init__(self, basepath, baseurl, allowdownscale):
        """Initialize base path and url.

        basepath: directory under which the handler writes its output
        baseurl: base URL used when generating links; if falsy, a
            file:// URL derived from basepath is used instead
        allowdownscale: whether displayed image sizes may be reduced
        """
        self.basepath = basepath
        self.baseurl = baseurl or self.getBaseUrl()
        self.allowdownscale = allowdownscale

    def getBaseUrl(self):
        '''Return a file: URL that probably points to the basedir.

        This is used as a halfway sane default when the base URL is not
        provided; not perfect, but should work in most cases.'''
        components = util.splitpath(os.path.abspath(self.basepath))
        url = '/'.join(url_quote(component, '') for component in components)
        return 'file:///' + url + '/'

    def getUrlFromFilename(self, filename):
        """Construct URL from filename, relative to the configured base."""
        components = util.splitpath(util.getRelativePath(self.basepath, filename))
        url = '/'.join(url_quote(component, '') for component in components)
        return self.baseurl + url

    def start(self):
        """Emit a start event. Should be overridden in subclass."""

    def comicDownloaded(self, comic, filename):
        """Emit a comic downloaded event. Should be overridden in subclass.
        Parameters are:

        comic: The ComicImage class calling this event
        filename: The target filename
        """

    def comicPageLink(self, scraper, url, prevUrl):
        """Emit an event to inform the handler about links between comic pages.
        Should be overridden in subclass. Parameters are:

        scraper: The Scraper class calling this event
        url: The current page url
        prevUrl: The previous page url
        """

    def end(self):
        """Emit an end event. Should be overridden in subclass."""
|
|
|
|
|
|
class RSSEventHandler(EventHandler):
    """Output in RSS format."""

    name = 'rss'

    def getFilename(self):
        """Return RSS filename."""
        return os.path.abspath(os.path.join(self.basepath, 'dailydose.rss'))

    def start(self):
        """Log start event."""
        now = time.time()
        today = time.localtime(now)
        yesterday = time.localtime(now - 86400)

        link = configuration.Url

        self.rssfn = self.getFilename()

        if os.path.exists(self.rssfn):
            # Continue an existing feed, keeping only recent entries.
            self.newfile = False
            self.rss = rss.parseFeed(self.rssfn, yesterday)
        else:
            # No feed yet; start a fresh one for today.
            self.newfile = True
            self.rss = rss.Feed('Daily Dosage', link,
                'Comics for %s' % time.strftime('%Y/%m/%d', today))

    def comicDownloaded(self, comic, filename):
        """Write RSS entry for downloaded comic."""
        imageUrl = self.getUrlFromFilename(filename)
        size = getDimensionForImage(filename, MaxImageSize) if self.allowdownscale else None
        title = '%s - %s' % (comic.scraper.name, os.path.basename(filename))
        pageUrl = comic.referrer
        # Assemble the HTML body of the feed item.
        parts = ['<img src="%s"' % imageUrl]
        if size:
            parts.append(' width="%d" height="%d"' % size)
        parts.append('/>')
        if comic.text:
            parts.append('<br/>%s' % comic.text)
        parts.append('<br/><a href="%s">View Comic Online</a>' % pageUrl)
        description = ''.join(parts)
        stamp = util.rfc822date(time.time())

        if self.newfile:
            # First item of a brand-new feed.
            self.newfile = False
            self.rss.addItem(title, imageUrl, description, stamp)
        else:
            # Prepend to an existing feed so newest entries come first.
            self.rss.addItem(title, imageUrl, description, stamp, append=False)

    def end(self):
        """Write RSS data to file."""
        self.rss.write(self.rssfn)
|
|
|
|
|
|
def getDimensionForImage(filename, maxsize):
    """Return scaled image size in (width, height) format.

    The scaling preserves the aspect ratio. Returns None if the image
    size cannot be determined."""
    maxwidth, maxheight = maxsize
    try:
        origsize = imagesize.get(filename)
    except Exception as e:
        out.warn("Could not get image size of {}: {}".format(os.path.basename(filename), e))
        return None

    width, height = origsize
    # Shrink to fit the width limit first, preserving the aspect ratio.
    if width > maxwidth:
        height = max(round(height * maxwidth / width), 1)
        width = round(maxwidth)
    # The result may still be too tall; shrink once more for the height limit.
    if height > maxheight:
        width = max(round(width * maxheight / height), 1)
        height = round(maxheight)

    if width < origsize[0] or height < origsize[1]:
        out.info("Downscaled display size from %s to %s" % (origsize, (width, height)))
    return (width, height)
|
|
|
|
|
|
class HtmlEventHandler(EventHandler):
    """Output in HTML format."""

    name = 'html'
    # Character encoding used for the generated HTML files.
    encoding = 'utf-8'

    def fnFromDate(self, date):
        """Get filename from date."""
        fn = time.strftime('comics-%Y%m%d', date)
        fn = os.path.join(self.basepath, 'html', fn + ".html")
        return os.path.abspath(fn)

    def addNavLinks(self):
        """Write previous/next day navigation links."""
        if self.yesterdayUrl:
            self.html.write('<a href="%s">Previous Day</a> | ' % self.yesterdayUrl)
        self.html.write('<a href="%s">Next Day</a>\n' % self.tomorrowUrl)

    def start(self):
        """Start HTML output."""
        today = time.time()
        yesterday = today - 86400
        tomorrow = today + 86400
        today = time.localtime(today)
        yesterday = time.localtime(yesterday)
        tomorrow = time.localtime(tomorrow)

        fn = self.fnFromDate(today)
        if os.path.exists(fn):
            out.warn('HTML output file %r already exists' % fn)
            out.warn('the page link of previous run will skip this file')
            out.warn('try to generate HTML output only once per day')
            fn = util.getNonexistingFile(fn)

        # Create the output directory if needed; exist_ok avoids the race
        # between a separate isdir() check and makedirs().
        os.makedirs(os.path.dirname(fn), exist_ok=True)

        try:
            fn_yesterday = self.fnFromDate(yesterday)
            fn_yesterday = util.getExistingFile(fn_yesterday)
            self.yesterdayUrl = self.getUrlFromFilename(fn_yesterday)
        except ValueError:
            # No output for yesterday exists; addNavLinks skips the link.
            self.yesterdayUrl = None
        self.tomorrowUrl = self.getUrlFromFilename(self.fnFromDate(tomorrow))

        # codecs.open is deprecated; the builtin open handles encodings.
        self.html = open(fn, 'w', encoding=self.encoding)
        self.html.write('''<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=%s"/>
<meta name="generator" content="%s"/>
<title>Comics for %s</title>
</head>
<body>
''' % (self.encoding, configuration.App, time.strftime('%Y/%m/%d', today)))
        self.addNavLinks()
        self.html.write('<ul>\n')
        # last comic name (eg. CalvinAndHobbes)
        self.lastComic = None
        # last comic strip URL (eg. http://example.com/page42)
        self.lastUrl = None

    def comicDownloaded(self, comic, filename, text=None):
        """Write HTML entry for downloaded comic."""
        if self.lastComic != comic.scraper.name:
            self.newComic(comic)
        size = None
        if self.allowdownscale:
            size = getDimensionForImage(filename, MaxImageSize)
        imageUrl = self.getUrlFromFilename(filename)
        pageUrl = comic.referrer
        if pageUrl != self.lastUrl:
            self.html.write('<li><a href="%s">%s</a>\n' % (pageUrl, pageUrl))
        self.html.write('<br/><img src="%s"' % imageUrl)
        if size:
            self.html.write(' width="%d" height="%d"' % size)
        self.html.write('/>\n')
        if text:
            self.html.write('<br/>%s\n' % text)
        self.lastComic = comic.scraper.name
        self.lastUrl = pageUrl

    def newComic(self, comic):
        """Start new comic list in HTML."""
        if self.lastUrl is not None:
            self.html.write('</li>\n')
        if self.lastComic is not None:
            self.html.write('</ul>\n')
        self.html.write('<li>%s</li>\n' % comic.scraper.name)
        self.html.write('<ul>\n')

    def end(self):
        """End HTML output."""
        if self.lastUrl is not None:
            self.html.write('</li>\n')
        if self.lastComic is not None:
            self.html.write('</ul>\n')
        self.html.write('</ul>\n')
        self.addNavLinks()
        # Close the document; the original left <body>/<html> unclosed,
        # producing invalid HTML.
        self.html.write('</body>\n</html>\n')
        self.html.close()
|
|
|
|
|
|
class JSONEventHandler(EventHandler):
    """Output metadata for comics in JSON format."""

    name = 'json'
    # Character encoding used for the JSON files.
    encoding = 'utf-8'

    def start(self):
        """Start with empty data."""
        # Maps scraper -> metadata dict (see getComicData for the layout).
        self.data = {}

    def jsonFn(self, scraper):
        """Get filename for the JSON file for a comic."""
        fn = os.path.join(scraper.get_download_dir(self.basepath), 'dosage.json')
        return os.path.abspath(fn)

    def getComicData(self, scraper):
        """Return dictionary with comic info, loading it from disk on first use."""
        if scraper not in self.data:
            fn = self.jsonFn(scraper)
            if os.path.exists(fn):
                # codecs.open is deprecated; builtin open handles encodings.
                with open(fn, encoding=self.encoding) as f:
                    self.data[scraper] = json.load(f)
            else:
                self.data[scraper] = {'pages': {}}
        return self.data[scraper]

    def getPageInfo(self, scraper, url):
        """Return dictionary with comic page info."""
        comicData = self.getComicData(scraper)
        if url not in comicData['pages']:
            comicData['pages'][url] = {'images': {}}
        return comicData['pages'][url]

    def comicDownloaded(self, comic, filename):
        """Add URL-to-filename mapping into JSON."""
        pageInfo = self.getPageInfo(comic.scraper, comic.referrer)

        # If there's already an image for this page start keeping track of their order
        if len(pageInfo['images']) == 1:
            pageInfo['imagesOrder'] = list(pageInfo['images'])
        if 'imagesOrder' in pageInfo:
            pageInfo['imagesOrder'].append(comic.url)

        pageInfo['images'][comic.url] = os.path.basename(filename)

    def comicPageLink(self, scraper, url, prevUrl):
        """Write previous link into JSON."""
        pageInfo = self.getPageInfo(scraper, url)
        pageInfo['prev'] = prevUrl

    def end(self):
        """Write all JSON data to files."""
        for scraper in self.data:
            with open(self.jsonFn(scraper), 'w', encoding=self.encoding) as f:
                json.dump(self.data[scraper], f, indent=2, separators=(',', ': '), sort_keys=True)
|
|
|
|
|
|
# Registry of available event handler classes, keyed by their `name` attribute.
_handler_classes = {}
|
|
|
|
|
|
def addHandlerClass(clazz):
    """Register an event handler class under its `name` attribute."""
    if issubclass(clazz, EventHandler):
        _handler_classes[clazz.name] = clazz
    else:
        raise ValueError("%s must be subclassed from %s" % (clazz, EventHandler))
|
|
|
|
|
|
# Register the built-in output handlers.
addHandlerClass(HtmlEventHandler)
addHandlerClass(RSSEventHandler)
addHandlerClass(JSONEventHandler)
|
|
|
|
|
|
def getHandlerNames():
    """Return the names of all registered handlers, sorted alphabetically."""
    return sorted(_handler_classes)
|
|
|
|
|
|
# FIXME: Hidden singleton :(
# Currently installed handler instances; MultiHandler broadcasts to these.
_handlers = []
|
|
|
|
|
|
def addHandler(name, basepath=None, baseurl=None, allowDownscale=False):
    """Add an event handler with given name."""
    # Default to the current directory only for an explicit None, so that
    # other falsy values are passed through unchanged.
    if basepath is None:
        basepath = '.'
    handler_class = _handler_classes[name]
    _handlers.append(handler_class(basepath, baseurl, allowDownscale))
|
|
|
|
|
|
def clear_handlers():
    """Remove all installed event handlers."""
    _handlers.clear()
|
|
|
|
|
|
class MultiHandler(object):
    """Encapsulate a list of handlers, broadcasting each event to all of them."""

    def start(self):
        """Emit start events for handlers."""
        for h in _handlers:
            h.start()

    def comicDownloaded(self, comic, filename):
        """Emit comic downloaded events for handlers."""
        for h in _handlers:
            h.comicDownloaded(comic, filename)

    def comicPageLink(self, scraper, url, prevUrl):
        """Emit an event to inform the handler about links between comic pages.
        Should be overridden in subclass."""
        for h in _handlers:
            h.comicPageLink(scraper, url, prevUrl)

    def end(self):
        """Emit end events for handlers."""
        for h in _handlers:
            h.end()
|
|
|
|
|
|
# Module-level singleton that dispatches events to all installed handlers.
multihandler = MultiHandler()
|
|
|
|
|
|
def getHandler():
    """Return the multiplexing event handler singleton."""
    return multihandler
|