Send "If-Modified-Since" header for images.
This commit is contained in:
parent
13a3409854
commit
4204f5f1e4
3 changed files with 109 additions and 90 deletions
|
@ -1,44 +1,61 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import glob
|
||||
import codecs
|
||||
import contextlib
|
||||
from datetime import datetime
|
||||
|
||||
from .output import out
|
||||
from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile
|
||||
from .util import unquote, getDirname, getFilename, urlopen, strsize
|
||||
from .events import getHandler
|
||||
|
||||
|
||||
# Maximum content size for images
|
||||
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||
# RFC 1123 format, as preferred by RFC 2616
|
||||
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
|
||||
|
||||
|
||||
class ComicStrip(object):
|
||||
"""A list of comic image URLs."""
|
||||
|
||||
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
|
||||
def __init__(self, name, strip_url, image_urls, namer, session, text=None):
|
||||
"""Store the image URL list."""
|
||||
self.name = name
|
||||
self.stripUrl = stripUrl
|
||||
self.imageUrls = imageUrls
|
||||
self.strip_url = strip_url
|
||||
self.image_urls = image_urls
|
||||
self.namer = namer
|
||||
self.session = session
|
||||
self.text = text
|
||||
|
||||
def getImages(self):
|
||||
"""Get a list of image downloaders."""
|
||||
for imageUrl in self.imageUrls:
|
||||
yield self.getDownloader(normaliseURL(imageUrl))
|
||||
for image_url in self.image_urls:
|
||||
yield self.getDownloader(image_url)
|
||||
|
||||
def getDownloader(self, url):
|
||||
"""Get an image downloader."""
|
||||
filename = self.namer(url, self.stripUrl)
|
||||
filename = self.namer(url, self.strip_url)
|
||||
if filename is None:
|
||||
filename = url.rsplit('/', 1)[1]
|
||||
dirname = getDirname(self.name)
|
||||
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
|
||||
return ComicImage(self.name, url, self.strip_url, dirname, filename,
|
||||
self.session, text=self.text)
|
||||
|
||||
|
||||
class ComicImage(object):
|
||||
"""A comic image downloader."""
|
||||
|
||||
ChunkBytes = 1024 * 100 # 100KB
|
||||
ChunkBytes = 1024 * 100 # 100KB
|
||||
|
||||
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
|
||||
def __init__(self, name, url, referrer, dirname, filename, session,
|
||||
text=None):
|
||||
"""Set URL and filename."""
|
||||
self.name = name
|
||||
self.referrer = referrer
|
||||
|
@ -49,47 +66,88 @@ class ComicImage(object):
|
|||
self.session = session
|
||||
self.text = text
|
||||
|
||||
def connect(self):
|
||||
def connect(self, lastchange=None):
|
||||
"""Connect to host and get meta information."""
|
||||
self.urlobj = getImageObject(self.url, self.referrer, self.session)
|
||||
content_type = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream'))
|
||||
headers = {}
|
||||
if lastchange:
|
||||
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
|
||||
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
|
||||
max_content_bytes=MaxImageBytes, stream=True,
|
||||
headers=headers)
|
||||
if self.urlobj.status_code == 304: # Not modified
|
||||
return
|
||||
content_type = unquote(self.urlobj.headers.get(
|
||||
'content-type', 'application/octet-stream'))
|
||||
content_type = content_type.split(';', 1)[0]
|
||||
if '/' in content_type:
|
||||
maintype, subtype = content_type.split('/', 1)
|
||||
else:
|
||||
maintype = content_type
|
||||
subtype = None
|
||||
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
|
||||
raise IOError('content type %r is not an image at %s' % (content_type, self.url))
|
||||
if maintype != 'image' and content_type not in (
|
||||
'application/octet-stream', 'application/x-shockwave-flash'):
|
||||
raise IOError('content type %r is not an image at %s' % (
|
||||
content_type, self.url))
|
||||
# Always use mime type for file extension if it is sane.
|
||||
if maintype == 'image':
|
||||
self.ext = '.' + subtype.replace('jpeg', 'jpg')
|
||||
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
|
||||
out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))
|
||||
out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (
|
||||
self.filename, self.ext, self.contentLength))
|
||||
|
||||
def save(self, basepath):
|
||||
"""Save comic URL to filename on disk."""
|
||||
comicdir = os.path.join(basepath, self.dirname)
|
||||
if not os.path.isdir(comicdir):
|
||||
os.makedirs(comicdir)
|
||||
fnbase = os.path.join(comicdir, self.filename)
|
||||
exist = [x for x in glob.glob(fnbase + ".*") if not x.endswith(".txt")]
|
||||
out.info(u"Get image URL %s" % self.url, level=1)
|
||||
self.connect()
|
||||
filename = "%s%s" % (self.filename, self.ext)
|
||||
comicDir = os.path.join(basepath, self.dirname)
|
||||
if not os.path.isdir(comicDir):
|
||||
os.makedirs(comicDir)
|
||||
fn = os.path.join(comicDir, filename)
|
||||
if len(exist) == 1:
|
||||
lastchange = os.path.getmtime(exist[0])
|
||||
self.connect(datetime.utcfromtimestamp(lastchange))
|
||||
if self.urlobj.status_code == 304: # Not modified
|
||||
self.exist_err(exist[0])
|
||||
return exist[0], False
|
||||
else:
|
||||
self.connect()
|
||||
fn = fnbase + self.ext
|
||||
# compare with >= since content length could be the compressed size
|
||||
if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
|
||||
out.info(u'Skipping existing file "%s".' % fn)
|
||||
self.exist_err(fn)
|
||||
return fn, False
|
||||
content = self.urlobj.content
|
||||
if not content:
|
||||
out.warn(u"Empty content from %s, try again..." % self.url)
|
||||
self.connect()
|
||||
content = self.urlobj.content
|
||||
out.debug(u'Writing comic to file %s...' % fn)
|
||||
writeFile(fn, content)
|
||||
with self.fileout(fn) as f:
|
||||
for chunk in self.urlobj.iter_content(self.ChunkBytes):
|
||||
f.write(chunk)
|
||||
if self.text:
|
||||
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
|
||||
fntext = fnbase + ".txt"
|
||||
out.debug(u'Writing comic text to file %s...' % fntext)
|
||||
writeFile(fntext, self.text, encoding='utf-8')
|
||||
with self.fileout(fntext, encoding='utf-8') as f:
|
||||
f.write(self.text)
|
||||
getHandler().comicDownloaded(self, fn, text=self.text)
|
||||
return fn, True
|
||||
|
||||
@contextlib.contextmanager
|
||||
def fileout(self, filename, encoding=None):
|
||||
"""Write content to given filename. Checks for zero-sized files.
|
||||
If encoding is given writes to a codec.open() file."""
|
||||
def getfp(filename, encoding):
|
||||
"""Get open file object."""
|
||||
if encoding:
|
||||
return codecs.open(filename, 'w', encoding)
|
||||
return open(filename, 'wb')
|
||||
|
||||
try:
|
||||
with getfp(filename, encoding) as fp:
|
||||
yield fp
|
||||
size = fp.tell()
|
||||
except Exception:
|
||||
if os.path.isfile(filename):
|
||||
os.remove(filename)
|
||||
raise
|
||||
else:
|
||||
out.info(u"Saved %s (%s)." % (filename, strsize(size)))
|
||||
|
||||
def exist_err(self, fn):
|
||||
out.info(u'Skipping existing file "%s".' % fn)
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import division, print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
try:
|
||||
from urllib.parse import quote as url_quote, unquote as url_unquote
|
||||
except ImportError:
|
||||
|
@ -23,7 +23,6 @@ import sys
|
|||
import os
|
||||
import cgi
|
||||
import re
|
||||
import codecs
|
||||
import traceback
|
||||
import time
|
||||
import subprocess
|
||||
|
@ -38,9 +37,6 @@ from .configuration import UserAgent, AppName, App, SupportUrl
|
|||
# Maximum content size for HTML pages
|
||||
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
|
||||
|
||||
# Maximum content size for images
|
||||
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||
|
||||
# Default number of retries
|
||||
MaxRetries = 3
|
||||
|
||||
|
@ -194,12 +190,6 @@ def get_page(url, session, max_content_bytes=MaxContentBytes):
|
|||
return page
|
||||
|
||||
|
||||
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
||||
"""Get response object for given image URL."""
|
||||
return urlopen(url, session, referrer=referrer,
|
||||
max_content_bytes=max_content_bytes, stream=True)
|
||||
|
||||
|
||||
def makeSequence(item):
|
||||
"""If item is already a list or tuple, return it.
|
||||
Else return a tuple with item as single element."""
|
||||
|
@ -289,26 +279,23 @@ def get_robotstxt_parser(url, session=None):
|
|||
|
||||
|
||||
def urlopen(url, session, referrer=None, max_content_bytes=None,
|
||||
timeout=ConnectionTimeoutSecs, raise_for_status=True,
|
||||
stream=False, data=None, useragent=UserAgent):
|
||||
raise_for_status=True, useragent=UserAgent, **kwargs):
|
||||
"""Open an URL and return the response object."""
|
||||
out.debug(u'Open URL %s' % url)
|
||||
headers = {'User-Agent': useragent}
|
||||
if 'headers' not in kwargs:
|
||||
kwargs['headers'] = {}
|
||||
kwargs['headers']['User-Agent'] = useragent
|
||||
if referrer:
|
||||
headers['Referer'] = referrer
|
||||
out.debug(u'Sending headers %s' % headers, level=3)
|
||||
kwargs['headers']['Referer'] = referrer
|
||||
out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
|
||||
out.debug(u'Sending cookies %s' % session.cookies)
|
||||
kwargs = {
|
||||
"headers": headers,
|
||||
"timeout": timeout,
|
||||
"stream": stream,
|
||||
}
|
||||
if data is None:
|
||||
if 'timeout' not in kwargs:
|
||||
kwargs['timeout'] = ConnectionTimeoutSecs
|
||||
if 'data' not in kwargs:
|
||||
method = 'GET'
|
||||
else:
|
||||
kwargs['data'] = data
|
||||
method = 'POST'
|
||||
out.debug(u'Sending POST data %s' % data, level=3)
|
||||
out.debug(u'Sending POST data %s' % kwargs['data'], level=3)
|
||||
try:
|
||||
req = session.request(method, url, **kwargs)
|
||||
out.debug(u'Response cookies: %s' % req.cookies)
|
||||
|
@ -547,31 +534,3 @@ def strlimit(s, length=72):
|
|||
if length == 0:
|
||||
return ""
|
||||
return "%s..." % s[:length]
|
||||
|
||||
|
||||
def writeFile(filename, content, encoding=None):
|
||||
"""Write content to given filename. Checks for zero-sized files.
|
||||
If encoding is given writes to a codec.open() file."""
|
||||
if not content:
|
||||
raise OSError("empty content for file %s" % filename)
|
||||
|
||||
def getfp(filename, encoding):
|
||||
"""Get open file object."""
|
||||
if encoding:
|
||||
return codecs.open(filename, 'w', encoding)
|
||||
return open(filename, 'wb')
|
||||
|
||||
try:
|
||||
with getfp(filename, encoding) as fp:
|
||||
fp.write(content)
|
||||
fp.flush()
|
||||
os.fsync(fp.fileno())
|
||||
size = os.path.getsize(filename)
|
||||
if size == 0:
|
||||
raise OSError("empty file %s" % filename)
|
||||
except Exception:
|
||||
if os.path.isfile(filename):
|
||||
os.remove(filename)
|
||||
raise
|
||||
else:
|
||||
out.info(u"Saved %s (%s)." % (filename, strsize(size)))
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import re
|
||||
import os
|
||||
import multiprocessing
|
||||
|
@ -70,7 +72,7 @@ def _test_comic(outdir, scraperobj):
|
|||
msg = 'Traversed %d strips instead of %d.' % (num_strips,
|
||||
num_strips_expected)
|
||||
if strip:
|
||||
msg += " Check the prevSearch pattern at %s" % strip.stripUrl
|
||||
msg += " Check the prevSearch pattern at %s" % strip.strip_url
|
||||
assert num_strips == num_strips_expected, msg
|
||||
if strip:
|
||||
_check_scraperesult(outdir, num_strips_expected, strip, scraperobj)
|
||||
|
@ -83,10 +85,10 @@ def _check_strip(outdir, strip, multipleImagesPerStrip):
|
|||
for image in strip.getImages():
|
||||
images.append(image.url)
|
||||
image.save(outdir)
|
||||
assert images, 'failed to find images at %s' % strip.stripUrl
|
||||
assert images, 'failed to find images at %s' % strip.strip_url
|
||||
if not multipleImagesPerStrip:
|
||||
assert len(images) == 1, 'found more than 1 image at %s: %s' % (
|
||||
strip.stripUrl, images)
|
||||
strip.strip_url, images)
|
||||
|
||||
|
||||
def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
|
||||
|
@ -113,7 +115,7 @@ def _check_stripurl(strip, scraperobj):
|
|||
urlmatch = urlmatch.replace(r"\%s", r".+")
|
||||
urlmatch = "^%s$" % urlmatch
|
||||
ro = re.compile(urlmatch)
|
||||
mo = ro.search(strip.stripUrl)
|
||||
mo = ro.search(strip.strip_url)
|
||||
err = 'strip URL %r does not match stripUrl pattern %s' % (
|
||||
strip.stripUrl, urlmatch)
|
||||
strip.strip_url, urlmatch)
|
||||
assert mo is not None, err
|
||||
|
|
Loading…
Reference in a new issue