Send "If-Modified-Since" header for images.

This commit is contained in:
Tobias Gruetzmacher 2016-04-19 00:32:25 +02:00
parent 13a3409854
commit 4204f5f1e4
3 changed files with 109 additions and 90 deletions

View file

@ -1,36 +1,52 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import os import os
import glob
import codecs
import contextlib
from datetime import datetime
from .output import out from .output import out
from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile from .util import unquote, getDirname, getFilename, urlopen, strsize
from .events import getHandler from .events import getHandler
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# RFC 1123 format, as preferred by RFC 2616
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object): class ComicStrip(object):
"""A list of comic image URLs.""" """A list of comic image URLs."""
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None): def __init__(self, name, strip_url, image_urls, namer, session, text=None):
"""Store the image URL list.""" """Store the image URL list."""
self.name = name self.name = name
self.stripUrl = stripUrl self.strip_url = strip_url
self.imageUrls = imageUrls self.image_urls = image_urls
self.namer = namer self.namer = namer
self.session = session self.session = session
self.text = text self.text = text
def getImages(self): def getImages(self):
"""Get a list of image downloaders.""" """Get a list of image downloaders."""
for imageUrl in self.imageUrls: for image_url in self.image_urls:
yield self.getDownloader(normaliseURL(imageUrl)) yield self.getDownloader(image_url)
def getDownloader(self, url): def getDownloader(self, url):
"""Get an image downloader.""" """Get an image downloader."""
filename = self.namer(url, self.stripUrl) filename = self.namer(url, self.strip_url)
if filename is None: if filename is None:
filename = url.rsplit('/', 1)[1] filename = url.rsplit('/', 1)[1]
dirname = getDirname(self.name) dirname = getDirname(self.name)
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text) return ComicImage(self.name, url, self.strip_url, dirname, filename,
self.session, text=self.text)
class ComicImage(object): class ComicImage(object):
@ -38,7 +54,8 @@ class ComicImage(object):
ChunkBytes = 1024 * 100 # 100KB ChunkBytes = 1024 * 100 # 100KB
def __init__(self, name, url, referrer, dirname, filename, session, text=None): def __init__(self, name, url, referrer, dirname, filename, session,
text=None):
"""Set URL and filename.""" """Set URL and filename."""
self.name = name self.name = name
self.referrer = referrer self.referrer = referrer
@ -49,47 +66,88 @@ class ComicImage(object):
self.session = session self.session = session
self.text = text self.text = text
def connect(self): def connect(self, lastchange=None):
"""Connect to host and get meta information.""" """Connect to host and get meta information."""
self.urlobj = getImageObject(self.url, self.referrer, self.session) headers = {}
content_type = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream')) if lastchange:
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
max_content_bytes=MaxImageBytes, stream=True,
headers=headers)
if self.urlobj.status_code == 304: # Not modified
return
content_type = unquote(self.urlobj.headers.get(
'content-type', 'application/octet-stream'))
content_type = content_type.split(';', 1)[0] content_type = content_type.split(';', 1)[0]
if '/' in content_type: if '/' in content_type:
maintype, subtype = content_type.split('/', 1) maintype, subtype = content_type.split('/', 1)
else: else:
maintype = content_type maintype = content_type
subtype = None subtype = None
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'): if maintype != 'image' and content_type not in (
raise IOError('content type %r is not an image at %s' % (content_type, self.url)) 'application/octet-stream', 'application/x-shockwave-flash'):
raise IOError('content type %r is not an image at %s' % (
content_type, self.url))
# Always use mime type for file extension if it is sane. # Always use mime type for file extension if it is sane.
if maintype == 'image': if maintype == 'image':
self.ext = '.' + subtype.replace('jpeg', 'jpg') self.ext = '.' + subtype.replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.headers.get('content-length', 0)) self.contentLength = int(self.urlobj.headers.get('content-length', 0))
out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength)) out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (
self.filename, self.ext, self.contentLength))
def save(self, basepath): def save(self, basepath):
"""Save comic URL to filename on disk.""" """Save comic URL to filename on disk."""
comicdir = os.path.join(basepath, self.dirname)
if not os.path.isdir(comicdir):
os.makedirs(comicdir)
fnbase = os.path.join(comicdir, self.filename)
exist = [x for x in glob.glob(fnbase + ".*") if not x.endswith(".txt")]
out.info(u"Get image URL %s" % self.url, level=1) out.info(u"Get image URL %s" % self.url, level=1)
if len(exist) == 1:
lastchange = os.path.getmtime(exist[0])
self.connect(datetime.utcfromtimestamp(lastchange))
if self.urlobj.status_code == 304: # Not modified
self.exist_err(exist[0])
return exist[0], False
else:
self.connect() self.connect()
filename = "%s%s" % (self.filename, self.ext) fn = fnbase + self.ext
comicDir = os.path.join(basepath, self.dirname)
if not os.path.isdir(comicDir):
os.makedirs(comicDir)
fn = os.path.join(comicDir, filename)
# compare with >= since content length could be the compressed size # compare with >= since content length could be the compressed size
if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength: if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
out.info(u'Skipping existing file "%s".' % fn) self.exist_err(fn)
return fn, False return fn, False
content = self.urlobj.content
if not content:
out.warn(u"Empty content from %s, try again..." % self.url)
self.connect()
content = self.urlobj.content
out.debug(u'Writing comic to file %s...' % fn) out.debug(u'Writing comic to file %s...' % fn)
writeFile(fn, content) with self.fileout(fn) as f:
for chunk in self.urlobj.iter_content(self.ChunkBytes):
f.write(chunk)
if self.text: if self.text:
fntext = os.path.join(comicDir, "%s.txt" % self.filename) fntext = fnbase + ".txt"
out.debug(u'Writing comic text to file %s...' % fntext) out.debug(u'Writing comic text to file %s...' % fntext)
writeFile(fntext, self.text, encoding='utf-8') with self.fileout(fntext, encoding='utf-8') as f:
f.write(self.text)
getHandler().comicDownloaded(self, fn, text=self.text) getHandler().comicDownloaded(self, fn, text=self.text)
return fn, True return fn, True
@contextlib.contextmanager
def fileout(self, filename, encoding=None):
    """Context manager yielding a file object opened for writing filename.

    If encoding is given, the file is opened in text mode through
    codecs.open(); otherwise it is opened as a plain binary file.

    When the managed block finishes, the number of bytes written is read
    from the file position. A zero-sized result is treated as a failure and
    raises OSError, so an empty download is never left on disk. If any
    Exception is raised (by the managed block or by the size check), the
    file is removed and the exception is re-raised. On success the saved
    size is reported through out.info().
    """
    def getfp(filename, encoding):
        """Get open file object."""
        if encoding:
            return codecs.open(filename, 'w', encoding)
        return open(filename, 'wb')

    try:
        with getfp(filename, encoding) as fp:
            yield fp
            # Position after the caller's last write == size of the file.
            size = fp.tell()
        # Checked after the file is closed so the cleanup below can
        # delete it on every platform.
        if size == 0:
            raise OSError("empty file %s" % filename)
    except Exception:
        # Never keep a partial or empty file around.
        if os.path.isfile(filename):
            os.remove(filename)
        raise
    else:
        out.info(u"Saved %s (%s)." % (filename, strsize(size)))
def exist_err(self, fn):
    """Log at info level that fn is already present and is being skipped."""
    message = u'Skipping existing file "%s".' % fn
    out.info(message)

View file

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2014-2016 Tobias Gruetzmacher # Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import division, print_function from __future__ import absolute_import, division, print_function
try: try:
from urllib.parse import quote as url_quote, unquote as url_unquote from urllib.parse import quote as url_quote, unquote as url_unquote
except ImportError: except ImportError:
@ -23,7 +23,6 @@ import sys
import os import os
import cgi import cgi
import re import re
import codecs
import traceback import traceback
import time import time
import subprocess import subprocess
@ -38,9 +37,6 @@ from .configuration import UserAgent, AppName, App, SupportUrl
# Maximum content size for HTML pages # Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 3 # 3 MB MaxContentBytes = 1024 * 1024 * 3 # 3 MB
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
# Default number of retries # Default number of retries
MaxRetries = 3 MaxRetries = 3
@ -194,12 +190,6 @@ def get_page(url, session, max_content_bytes=MaxContentBytes):
return page return page
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
    """Return the response object for an image URL.

    Delegates to urlopen(), passing the referrer and the content size limit
    through and requesting a streamed response.
    """
    options = {
        'referrer': referrer,
        'max_content_bytes': max_content_bytes,
        'stream': True,
    }
    return urlopen(url, session, **options)
def makeSequence(item): def makeSequence(item):
"""If item is already a list or tuple, return it. """If item is already a list or tuple, return it.
Else return a tuple with item as single element.""" Else return a tuple with item as single element."""
@ -289,26 +279,23 @@ def get_robotstxt_parser(url, session=None):
def urlopen(url, session, referrer=None, max_content_bytes=None, def urlopen(url, session, referrer=None, max_content_bytes=None,
timeout=ConnectionTimeoutSecs, raise_for_status=True, raise_for_status=True, useragent=UserAgent, **kwargs):
stream=False, data=None, useragent=UserAgent):
"""Open an URL and return the response object.""" """Open an URL and return the response object."""
out.debug(u'Open URL %s' % url) out.debug(u'Open URL %s' % url)
headers = {'User-Agent': useragent} if 'headers' not in kwargs:
kwargs['headers'] = {}
kwargs['headers']['User-Agent'] = useragent
if referrer: if referrer:
headers['Referer'] = referrer kwargs['headers']['Referer'] = referrer
out.debug(u'Sending headers %s' % headers, level=3) out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
out.debug(u'Sending cookies %s' % session.cookies) out.debug(u'Sending cookies %s' % session.cookies)
kwargs = { if 'timeout' not in kwargs:
"headers": headers, kwargs['timeout'] = ConnectionTimeoutSecs
"timeout": timeout, if 'data' not in kwargs:
"stream": stream,
}
if data is None:
method = 'GET' method = 'GET'
else: else:
kwargs['data'] = data
method = 'POST' method = 'POST'
out.debug(u'Sending POST data %s' % data, level=3) out.debug(u'Sending POST data %s' % kwargs['data'], level=3)
try: try:
req = session.request(method, url, **kwargs) req = session.request(method, url, **kwargs)
out.debug(u'Response cookies: %s' % req.cookies) out.debug(u'Response cookies: %s' % req.cookies)
@ -547,31 +534,3 @@ def strlimit(s, length=72):
if length == 0: if length == 0:
return "" return ""
return "%s..." % s[:length] return "%s..." % s[:length]
def writeFile(filename, content, encoding=None):
    """Store content in filename and refuse to leave an empty file behind.

    With an encoding the file is opened in text mode via codecs.open();
    without one it is opened as a plain binary file. The data is flushed
    and fsync'ed before the resulting size is checked.

    Raises OSError if content is empty or the written file has size zero.
    On any Exception while writing, the file is removed and the exception
    is re-raised. On success the saved size is reported through out.info().
    """
    if not content:
        raise OSError("empty content for file %s" % filename)

    try:
        if encoding:
            handle = codecs.open(filename, 'w', encoding)
        else:
            handle = open(filename, 'wb')
        with handle as fp:
            fp.write(content)
            fp.flush()
            os.fsync(fp.fileno())
            written = os.path.getsize(filename)
            if written == 0:
                raise OSError("empty file %s" % filename)
    except Exception:
        # Do not keep a partial or empty file around.
        if os.path.isfile(filename):
            os.remove(filename)
        raise
    # Only reached on success, since the handler above always re-raises.
    out.info(u"Saved %s (%s)." % (filename, strsize(written)))

View file

@ -3,6 +3,8 @@
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher # Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import re import re
import os import os
import multiprocessing import multiprocessing
@ -70,7 +72,7 @@ def _test_comic(outdir, scraperobj):
msg = 'Traversed %d strips instead of %d.' % (num_strips, msg = 'Traversed %d strips instead of %d.' % (num_strips,
num_strips_expected) num_strips_expected)
if strip: if strip:
msg += " Check the prevSearch pattern at %s" % strip.stripUrl msg += " Check the prevSearch pattern at %s" % strip.strip_url
assert num_strips == num_strips_expected, msg assert num_strips == num_strips_expected, msg
if strip: if strip:
_check_scraperesult(outdir, num_strips_expected, strip, scraperobj) _check_scraperesult(outdir, num_strips_expected, strip, scraperobj)
@ -83,10 +85,10 @@ def _check_strip(outdir, strip, multipleImagesPerStrip):
for image in strip.getImages(): for image in strip.getImages():
images.append(image.url) images.append(image.url)
image.save(outdir) image.save(outdir)
assert images, 'failed to find images at %s' % strip.stripUrl assert images, 'failed to find images at %s' % strip.strip_url
if not multipleImagesPerStrip: if not multipleImagesPerStrip:
assert len(images) == 1, 'found more than 1 image at %s: %s' % ( assert len(images) == 1, 'found more than 1 image at %s: %s' % (
strip.stripUrl, images) strip.strip_url, images)
def _check_scraperesult(outdir, num_images_expected, strip, scraperobj): def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
@ -113,7 +115,7 @@ def _check_stripurl(strip, scraperobj):
urlmatch = urlmatch.replace(r"\%s", r".+") urlmatch = urlmatch.replace(r"\%s", r".+")
urlmatch = "^%s$" % urlmatch urlmatch = "^%s$" % urlmatch
ro = re.compile(urlmatch) ro = re.compile(urlmatch)
mo = ro.search(strip.stripUrl) mo = ro.search(strip.strip_url)
err = 'strip URL %r does not match stripUrl pattern %s' % ( err = 'strip URL %r does not match stripUrl pattern %s' % (
strip.stripUrl, urlmatch) strip.strip_url, urlmatch)
assert mo is not None, err assert mo is not None, err