Send "If-Modified-Since" header for images.

This commit is contained in:
Tobias Gruetzmacher 2016-04-19 00:32:25 +02:00
parent 13a3409854
commit 4204f5f1e4
3 changed files with 109 additions and 90 deletions

View file

@@ -1,44 +1,61 @@
# -*- coding: iso-8859-1 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import os
import glob
import codecs
import contextlib
from datetime import datetime
from .output import out
from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile
from .util import unquote, getDirname, getFilename, urlopen, strsize
from .events import getHandler
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# RFC 1123 format, as preferred by RFC 2616
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object):
"""A list of comic image URLs."""
# NOTE(review): diff interleave — the first `def` line is the removed
# camelCase signature, the second its snake_case replacement.
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
def __init__(self, name, strip_url, image_urls, namer, session, text=None):
    """Store the image URL list.

    name: comic module name (later used to build the target directory).
    strip_url: URL of the page the images were found on.
    image_urls: iterable of image URLs belonging to this strip.
    namer: callback mapping (image URL, strip URL) to a file name.
    session: HTTP session used for the downloads.
    text: optional text content saved alongside the images.
    """
    self.name = name
    self.stripUrl = stripUrl      # (removed) old camelCase attribute
    self.imageUrls = imageUrls    # (removed)
    self.strip_url = strip_url    # (added) snake_case replacement
    self.image_urls = image_urls  # (added)
    self.namer = namer
    self.session = session
    self.text = text
def getImages(self):
    """Yield an image downloader for every image URL of this strip."""
    # (removed) old loop normalised every URL before downloading:
    for imageUrl in self.imageUrls:
        yield self.getDownloader(normaliseURL(imageUrl))
    # (added) URLs are now used verbatim — presumably normalised earlier
    # in the scraper; TODO confirm against the callers.
    for image_url in self.image_urls:
        yield self.getDownloader(image_url)
def getDownloader(self, url):
    """Get an image downloader (a ComicImage) for the given image URL."""
    filename = self.namer(url, self.stripUrl)   # (removed)
    filename = self.namer(url, self.strip_url)  # (added)
    if filename is None:
        # Fall back to the last component of the URL path as file name.
        filename = url.rsplit('/', 1)[1]
    dirname = getDirname(self.name)
    # (removed) one-line constructor call:
    return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
    # (added) same call, wrapped for line length, with snake_case attribute:
    return ComicImage(self.name, url, self.strip_url, dirname, filename,
                      self.session, text=self.text)
class ComicImage(object):
"""A comic image downloader."""
ChunkBytes = 1024 * 100 # 100KB
ChunkBytes = 1024 * 100 # 100KB
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
def __init__(self, name, url, referrer, dirname, filename, session,
text=None):
"""Set URL and filename."""
self.name = name
self.referrer = referrer
@@ -49,47 +66,88 @@ class ComicImage(object):
self.session = session
self.text = text
# NOTE(review): diff interleave — removed and added lines appear together;
# see the (removed)/(added) markers.
def connect(self):
def connect(self, lastchange=None):
    """Connect to host and get meta information.

    lastchange: optional datetime (presumably UTC — TODO confirm) sent as
    an If-Modified-Since header.  A 304 reply leaves self.urlobj set but
    skips all metadata parsing and returns early.
    """
    # (removed) old request path went through util.getImageObject:
    self.urlobj = getImageObject(self.url, self.referrer, self.session)
    content_type = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream'))
    # (added) build a conditional request and stream the download:
    headers = {}
    if lastchange:
        headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
    self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
                          max_content_bytes=MaxImageBytes, stream=True,
                          headers=headers)
    if self.urlobj.status_code == 304:  # Not modified
        return
    content_type = unquote(self.urlobj.headers.get(
        'content-type', 'application/octet-stream'))
    # Only the media type matters; drop parameters such as charset.
    content_type = content_type.split(';', 1)[0]
    if '/' in content_type:
        maintype, subtype = content_type.split('/', 1)
    else:
        maintype = content_type
        subtype = None
    # (removed) pre-wrap version of the content-type sanity check:
    if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
        raise IOError('content type %r is not an image at %s' % (content_type, self.url))
    # (added) same check, re-wrapped for line length:
    if maintype != 'image' and content_type not in (
            'application/octet-stream', 'application/x-shockwave-flash'):
        raise IOError('content type %r is not an image at %s' % (
            content_type, self.url))
    # Always use mime type for file extension if it is sane.
    if maintype == 'image':
        self.ext = '.' + subtype.replace('jpeg', 'jpg')
    # Missing header yields 0; save() compares sizes with >= because of this.
    self.contentLength = int(self.urlobj.headers.get('content-length', 0))
    out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))
    out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (
        self.filename, self.ext, self.contentLength))
# NOTE(review): diff interleave — removed and added lines appear together;
# see the (removed)/(added) markers.
def save(self, basepath):
    """Save comic URL to filename on disk.

    Returns (filename, saved) where saved is False when the image was
    already present (304 reply or an on-disk file of sufficient size).
    """
    # (added) locate a previously downloaded file for this strip:
    comicdir = os.path.join(basepath, self.dirname)
    if not os.path.isdir(comicdir):
        os.makedirs(comicdir)
    fnbase = os.path.join(comicdir, self.filename)
    exist = [x for x in glob.glob(fnbase + ".*") if not x.endswith(".txt")]
    out.info(u"Get image URL %s" % self.url, level=1)
    # (removed) old unconditional connect and path construction:
    self.connect()
    filename = "%s%s" % (self.filename, self.ext)
    comicDir = os.path.join(basepath, self.dirname)
    if not os.path.isdir(comicDir):
        os.makedirs(comicDir)
    fn = os.path.join(comicDir, filename)
    # (added) send a conditional request when exactly one prior file exists:
    if len(exist) == 1:
        lastchange = os.path.getmtime(exist[0])
        self.connect(datetime.utcfromtimestamp(lastchange))
        if self.urlobj.status_code == 304:  # Not modified
            self.exist_err(exist[0])
            return exist[0], False
    else:
        self.connect()
    fn = fnbase + self.ext
    # compare with >= since content length could be the compressed size
    if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
        out.info(u'Skipping existing file "%s".' % fn)  # (removed)
        self.exist_err(fn)  # (added)
        return fn, False
    # (removed) old eager download with a single retry on empty content:
    content = self.urlobj.content
    if not content:
        out.warn(u"Empty content from %s, try again..." % self.url)
        self.connect()
        content = self.urlobj.content
    out.debug(u'Writing comic to file %s...' % fn)
    writeFile(fn, content)  # (removed)
    # (added) streamed, chunked write via the fileout context manager:
    with self.fileout(fn) as f:
        for chunk in self.urlobj.iter_content(self.ChunkBytes):
            f.write(chunk)
    if self.text:
        fntext = os.path.join(comicDir, "%s.txt" % self.filename)  # (removed)
        fntext = fnbase + ".txt"  # (added)
        out.debug(u'Writing comic text to file %s...' % fntext)
        writeFile(fntext, self.text, encoding='utf-8')  # (removed)
        with self.fileout(fntext, encoding='utf-8') as f:  # (added)
            f.write(self.text)
    getHandler().comicDownloaded(self, fn, text=self.text)
    return fn, True
@contextlib.contextmanager
def fileout(self, filename, encoding=None):
    """Context manager yielding an open file for *filename*.

    Checks for zero-sized files: if nothing was written, or any
    exception escapes the managed block, the partial file is removed
    and the error re-raised.  If *encoding* is given, a codecs.open()
    text file is yielded; otherwise a binary file.
    """
    def getfp(filename, encoding):
        """Get open file object."""
        if encoding:
            return codecs.open(filename, 'w', encoding)
        return open(filename, 'wb')
    try:
        with getfp(filename, encoding) as fp:
            yield fp
            size = fp.tell()
        # Enforce the zero-size check promised by the docstring (the
        # removed util.writeFile helper used to perform it); without
        # this an empty download would be reported as "Saved".
        if size == 0:
            raise OSError("empty file %s" % filename)
    except Exception:
        if os.path.isfile(filename):
            os.remove(filename)
        raise
    else:
        out.info(u"Saved %s (%s)." % (filename, strsize(size)))
def exist_err(self, fn):
    """Log that file *fn* already exists and is being skipped."""
    out.info(u'Skipping existing file "%s".' % fn)

View file

@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import division, print_function
from __future__ import absolute_import, division, print_function
try:
from urllib.parse import quote as url_quote, unquote as url_unquote
except ImportError:
@@ -23,7 +23,6 @@ import sys
import os
import cgi
import re
import codecs
import traceback
import time
import subprocess
@@ -38,9 +37,6 @@ from .configuration import UserAgent, AppName, App, SupportUrl
# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# Default number of retries
MaxRetries = 3
@@ -194,12 +190,6 @@ def get_page(url, session, max_content_bytes=MaxContentBytes):
return page
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
    """Get response object for given image URL."""
    # Thin wrapper around urlopen: stream the body so large images are
    # not pulled into memory at once, and pass the referrer through.
    options = dict(referrer=referrer, stream=True,
                   max_content_bytes=max_content_bytes)
    return urlopen(url, session, **options)
def makeSequence(item):
"""If item is already a list or tuple, return it.
Else return a tuple with item as single element."""
@@ -289,26 +279,23 @@ def get_robotstxt_parser(url, session=None):
def urlopen(url, session, referrer=None, max_content_bytes=None,
timeout=ConnectionTimeoutSecs, raise_for_status=True,
stream=False, data=None, useragent=UserAgent):
raise_for_status=True, useragent=UserAgent, **kwargs):
"""Open an URL and return the response object."""
out.debug(u'Open URL %s' % url)
headers = {'User-Agent': useragent}
if 'headers' not in kwargs:
kwargs['headers'] = {}
kwargs['headers']['User-Agent'] = useragent
if referrer:
headers['Referer'] = referrer
out.debug(u'Sending headers %s' % headers, level=3)
kwargs['headers']['Referer'] = referrer
out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
out.debug(u'Sending cookies %s' % session.cookies)
kwargs = {
"headers": headers,
"timeout": timeout,
"stream": stream,
}
if data is None:
if 'timeout' not in kwargs:
kwargs['timeout'] = ConnectionTimeoutSecs
if 'data' not in kwargs:
method = 'GET'
else:
kwargs['data'] = data
method = 'POST'
out.debug(u'Sending POST data %s' % data, level=3)
out.debug(u'Sending POST data %s' % kwargs['data'], level=3)
try:
req = session.request(method, url, **kwargs)
out.debug(u'Response cookies: %s' % req.cookies)
@@ -547,31 +534,3 @@ def strlimit(s, length=72):
if length == 0:
return ""
return "%s..." % s[:length]
def writeFile(filename, content, encoding=None):
    """Write *content* to *filename*, refusing to leave empty files.

    If *encoding* is given, the file is opened via codecs.open() and
    written as text; otherwise it is written as raw bytes.  On any
    failure — or a zero-sized result — the partial file is deleted and
    the exception re-raised.
    """
    if not content:
        raise OSError("empty content for file %s" % filename)

    def _open():
        """Return the appropriate writable file object."""
        return codecs.open(filename, 'w', encoding) if encoding else open(filename, 'wb')

    try:
        with _open() as f:
            f.write(content)
            f.flush()
            # Force the data to disk before checking the on-disk size.
            os.fsync(f.fileno())
        size = os.path.getsize(filename)
        if size == 0:
            raise OSError("empty file %s" % filename)
    except Exception:
        # Never leave a partial file behind.
        if os.path.isfile(filename):
            os.remove(filename)
        raise
    else:
        out.info(u"Saved %s (%s)." % (filename, strsize(size)))

View file

@@ -3,6 +3,8 @@
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import re
import os
import multiprocessing
@@ -70,7 +72,7 @@ def _test_comic(outdir, scraperobj):
msg = 'Traversed %d strips instead of %d.' % (num_strips,
num_strips_expected)
if strip:
msg += " Check the prevSearch pattern at %s" % strip.stripUrl
msg += " Check the prevSearch pattern at %s" % strip.strip_url
assert num_strips == num_strips_expected, msg
if strip:
_check_scraperesult(outdir, num_strips_expected, strip, scraperobj)
@@ -83,10 +85,10 @@ def _check_strip(outdir, strip, multipleImagesPerStrip):
for image in strip.getImages():
images.append(image.url)
image.save(outdir)
assert images, 'failed to find images at %s' % strip.stripUrl
assert images, 'failed to find images at %s' % strip.strip_url
if not multipleImagesPerStrip:
assert len(images) == 1, 'found more than 1 image at %s: %s' % (
strip.stripUrl, images)
strip.strip_url, images)
def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
@@ -113,7 +115,7 @@ def _check_stripurl(strip, scraperobj):
urlmatch = urlmatch.replace(r"\%s", r".+")
urlmatch = "^%s$" % urlmatch
ro = re.compile(urlmatch)
mo = ro.search(strip.stripUrl)
mo = ro.search(strip.strip_url)
err = 'strip URL %r does not match stripUrl pattern %s' % (
strip.stripUrl, urlmatch)
strip.strip_url, urlmatch)
assert mo is not None, err