Send "If-Modified-Since" header for images.
This commit is contained in:
parent
13a3409854
commit
4204f5f1e4
3 changed files with 109 additions and 90 deletions
|
@ -1,36 +1,52 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import glob
|
||||||
|
import codecs
|
||||||
|
import contextlib
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from .util import getImageObject, normaliseURL, unquote, getDirname, getFilename, writeFile
|
from .util import unquote, getDirname, getFilename, urlopen, strsize
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
|
||||||
|
|
||||||
|
# Maximum content size for images
|
||||||
|
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
|
# RFC 1123 format, as preferred by RFC 2616
|
||||||
|
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
|
||||||
|
|
||||||
|
|
||||||
class ComicStrip(object):
|
class ComicStrip(object):
|
||||||
"""A list of comic image URLs."""
|
"""A list of comic image URLs."""
|
||||||
|
|
||||||
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
|
def __init__(self, name, strip_url, image_urls, namer, session, text=None):
|
||||||
"""Store the image URL list."""
|
"""Store the image URL list."""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.stripUrl = stripUrl
|
self.strip_url = strip_url
|
||||||
self.imageUrls = imageUrls
|
self.image_urls = image_urls
|
||||||
self.namer = namer
|
self.namer = namer
|
||||||
self.session = session
|
self.session = session
|
||||||
self.text = text
|
self.text = text
|
||||||
|
|
||||||
def getImages(self):
|
def getImages(self):
|
||||||
"""Get a list of image downloaders."""
|
"""Get a list of image downloaders."""
|
||||||
for imageUrl in self.imageUrls:
|
for image_url in self.image_urls:
|
||||||
yield self.getDownloader(normaliseURL(imageUrl))
|
yield self.getDownloader(image_url)
|
||||||
|
|
||||||
def getDownloader(self, url):
|
def getDownloader(self, url):
|
||||||
"""Get an image downloader."""
|
"""Get an image downloader."""
|
||||||
filename = self.namer(url, self.stripUrl)
|
filename = self.namer(url, self.strip_url)
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = url.rsplit('/', 1)[1]
|
filename = url.rsplit('/', 1)[1]
|
||||||
dirname = getDirname(self.name)
|
dirname = getDirname(self.name)
|
||||||
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
|
return ComicImage(self.name, url, self.strip_url, dirname, filename,
|
||||||
|
self.session, text=self.text)
|
||||||
|
|
||||||
|
|
||||||
class ComicImage(object):
|
class ComicImage(object):
|
||||||
|
@ -38,7 +54,8 @@ class ComicImage(object):
|
||||||
|
|
||||||
ChunkBytes = 1024 * 100 # 100KB
|
ChunkBytes = 1024 * 100 # 100KB
|
||||||
|
|
||||||
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
|
def __init__(self, name, url, referrer, dirname, filename, session,
|
||||||
|
text=None):
|
||||||
"""Set URL and filename."""
|
"""Set URL and filename."""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.referrer = referrer
|
self.referrer = referrer
|
||||||
|
@ -49,47 +66,88 @@ class ComicImage(object):
|
||||||
self.session = session
|
self.session = session
|
||||||
self.text = text
|
self.text = text
|
||||||
|
|
||||||
def connect(self):
|
def connect(self, lastchange=None):
|
||||||
"""Connect to host and get meta information."""
|
"""Connect to host and get meta information."""
|
||||||
self.urlobj = getImageObject(self.url, self.referrer, self.session)
|
headers = {}
|
||||||
content_type = unquote(self.urlobj.headers.get('content-type', 'application/octet-stream'))
|
if lastchange:
|
||||||
|
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
|
||||||
|
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
|
||||||
|
max_content_bytes=MaxImageBytes, stream=True,
|
||||||
|
headers=headers)
|
||||||
|
if self.urlobj.status_code == 304: # Not modified
|
||||||
|
return
|
||||||
|
content_type = unquote(self.urlobj.headers.get(
|
||||||
|
'content-type', 'application/octet-stream'))
|
||||||
content_type = content_type.split(';', 1)[0]
|
content_type = content_type.split(';', 1)[0]
|
||||||
if '/' in content_type:
|
if '/' in content_type:
|
||||||
maintype, subtype = content_type.split('/', 1)
|
maintype, subtype = content_type.split('/', 1)
|
||||||
else:
|
else:
|
||||||
maintype = content_type
|
maintype = content_type
|
||||||
subtype = None
|
subtype = None
|
||||||
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
|
if maintype != 'image' and content_type not in (
|
||||||
raise IOError('content type %r is not an image at %s' % (content_type, self.url))
|
'application/octet-stream', 'application/x-shockwave-flash'):
|
||||||
|
raise IOError('content type %r is not an image at %s' % (
|
||||||
|
content_type, self.url))
|
||||||
# Always use mime type for file extension if it is sane.
|
# Always use mime type for file extension if it is sane.
|
||||||
if maintype == 'image':
|
if maintype == 'image':
|
||||||
self.ext = '.' + subtype.replace('jpeg', 'jpg')
|
self.ext = '.' + subtype.replace('jpeg', 'jpg')
|
||||||
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
|
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
|
||||||
out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength))
|
out.debug(u'... filename = %r, ext = %r, contentLength = %d' % (
|
||||||
|
self.filename, self.ext, self.contentLength))
|
||||||
|
|
||||||
def save(self, basepath):
|
def save(self, basepath):
|
||||||
"""Save comic URL to filename on disk."""
|
"""Save comic URL to filename on disk."""
|
||||||
|
comicdir = os.path.join(basepath, self.dirname)
|
||||||
|
if not os.path.isdir(comicdir):
|
||||||
|
os.makedirs(comicdir)
|
||||||
|
fnbase = os.path.join(comicdir, self.filename)
|
||||||
|
exist = [x for x in glob.glob(fnbase + ".*") if not x.endswith(".txt")]
|
||||||
out.info(u"Get image URL %s" % self.url, level=1)
|
out.info(u"Get image URL %s" % self.url, level=1)
|
||||||
|
if len(exist) == 1:
|
||||||
|
lastchange = os.path.getmtime(exist[0])
|
||||||
|
self.connect(datetime.utcfromtimestamp(lastchange))
|
||||||
|
if self.urlobj.status_code == 304: # Not modified
|
||||||
|
self.exist_err(exist[0])
|
||||||
|
return exist[0], False
|
||||||
|
else:
|
||||||
self.connect()
|
self.connect()
|
||||||
filename = "%s%s" % (self.filename, self.ext)
|
fn = fnbase + self.ext
|
||||||
comicDir = os.path.join(basepath, self.dirname)
|
|
||||||
if not os.path.isdir(comicDir):
|
|
||||||
os.makedirs(comicDir)
|
|
||||||
fn = os.path.join(comicDir, filename)
|
|
||||||
# compare with >= since content length could be the compressed size
|
# compare with >= since content length could be the compressed size
|
||||||
if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
|
if os.path.isfile(fn) and os.path.getsize(fn) >= self.contentLength:
|
||||||
out.info(u'Skipping existing file "%s".' % fn)
|
self.exist_err(fn)
|
||||||
return fn, False
|
return fn, False
|
||||||
content = self.urlobj.content
|
|
||||||
if not content:
|
|
||||||
out.warn(u"Empty content from %s, try again..." % self.url)
|
|
||||||
self.connect()
|
|
||||||
content = self.urlobj.content
|
|
||||||
out.debug(u'Writing comic to file %s...' % fn)
|
out.debug(u'Writing comic to file %s...' % fn)
|
||||||
writeFile(fn, content)
|
with self.fileout(fn) as f:
|
||||||
|
for chunk in self.urlobj.iter_content(self.ChunkBytes):
|
||||||
|
f.write(chunk)
|
||||||
if self.text:
|
if self.text:
|
||||||
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
|
fntext = fnbase + ".txt"
|
||||||
out.debug(u'Writing comic text to file %s...' % fntext)
|
out.debug(u'Writing comic text to file %s...' % fntext)
|
||||||
writeFile(fntext, self.text, encoding='utf-8')
|
with self.fileout(fntext, encoding='utf-8') as f:
|
||||||
|
f.write(self.text)
|
||||||
getHandler().comicDownloaded(self, fn, text=self.text)
|
getHandler().comicDownloaded(self, fn, text=self.text)
|
||||||
return fn, True
|
return fn, True
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def fileout(self, filename, encoding=None):
|
||||||
|
"""Yield a file object for writing to given filename. Removes the file on error.
|
||||||
|
If encoding is given writes to a codecs.open() file."""
|
||||||
|
def getfp(filename, encoding):
|
||||||
|
"""Get open file object."""
|
||||||
|
if encoding:
|
||||||
|
return codecs.open(filename, 'w', encoding)
|
||||||
|
return open(filename, 'wb')
|
||||||
|
|
||||||
|
try:
|
||||||
|
with getfp(filename, encoding) as fp:
|
||||||
|
yield fp
|
||||||
|
size = fp.tell()
|
||||||
|
except Exception:
|
||||||
|
if os.path.isfile(filename):
|
||||||
|
os.remove(filename)
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
out.info(u"Saved %s (%s)." % (filename, strsize(size)))
|
||||||
|
|
||||||
|
def exist_err(self, fn):
|
||||||
|
out.info(u'Skipping existing file "%s".' % fn)
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2014-2016 Tobias Gruetzmacher
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
try:
|
try:
|
||||||
from urllib.parse import quote as url_quote, unquote as url_unquote
|
from urllib.parse import quote as url_quote, unquote as url_unquote
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -23,7 +23,6 @@ import sys
|
||||||
import os
|
import os
|
||||||
import cgi
|
import cgi
|
||||||
import re
|
import re
|
||||||
import codecs
|
|
||||||
import traceback
|
import traceback
|
||||||
import time
|
import time
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -38,9 +37,6 @@ from .configuration import UserAgent, AppName, App, SupportUrl
|
||||||
# Maximum content size for HTML pages
|
# Maximum content size for HTML pages
|
||||||
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
|
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
|
||||||
|
|
||||||
# Maximum content size for images
|
|
||||||
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
|
||||||
|
|
||||||
# Default number of retries
|
# Default number of retries
|
||||||
MaxRetries = 3
|
MaxRetries = 3
|
||||||
|
|
||||||
|
@ -194,12 +190,6 @@ def get_page(url, session, max_content_bytes=MaxContentBytes):
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
|
|
||||||
"""Get response object for given image URL."""
|
|
||||||
return urlopen(url, session, referrer=referrer,
|
|
||||||
max_content_bytes=max_content_bytes, stream=True)
|
|
||||||
|
|
||||||
|
|
||||||
def makeSequence(item):
|
def makeSequence(item):
|
||||||
"""If item is already a list or tuple, return it.
|
"""If item is already a list or tuple, return it.
|
||||||
Else return a tuple with item as single element."""
|
Else return a tuple with item as single element."""
|
||||||
|
@ -289,26 +279,23 @@ def get_robotstxt_parser(url, session=None):
|
||||||
|
|
||||||
|
|
||||||
def urlopen(url, session, referrer=None, max_content_bytes=None,
|
def urlopen(url, session, referrer=None, max_content_bytes=None,
|
||||||
timeout=ConnectionTimeoutSecs, raise_for_status=True,
|
raise_for_status=True, useragent=UserAgent, **kwargs):
|
||||||
stream=False, data=None, useragent=UserAgent):
|
|
||||||
"""Open an URL and return the response object."""
|
"""Open an URL and return the response object."""
|
||||||
out.debug(u'Open URL %s' % url)
|
out.debug(u'Open URL %s' % url)
|
||||||
headers = {'User-Agent': useragent}
|
if 'headers' not in kwargs:
|
||||||
|
kwargs['headers'] = {}
|
||||||
|
kwargs['headers']['User-Agent'] = useragent
|
||||||
if referrer:
|
if referrer:
|
||||||
headers['Referer'] = referrer
|
kwargs['headers']['Referer'] = referrer
|
||||||
out.debug(u'Sending headers %s' % headers, level=3)
|
out.debug(u'Sending headers %s' % kwargs['headers'], level=3)
|
||||||
out.debug(u'Sending cookies %s' % session.cookies)
|
out.debug(u'Sending cookies %s' % session.cookies)
|
||||||
kwargs = {
|
if 'timeout' not in kwargs:
|
||||||
"headers": headers,
|
kwargs['timeout'] = ConnectionTimeoutSecs
|
||||||
"timeout": timeout,
|
if 'data' not in kwargs:
|
||||||
"stream": stream,
|
|
||||||
}
|
|
||||||
if data is None:
|
|
||||||
method = 'GET'
|
method = 'GET'
|
||||||
else:
|
else:
|
||||||
kwargs['data'] = data
|
|
||||||
method = 'POST'
|
method = 'POST'
|
||||||
out.debug(u'Sending POST data %s' % data, level=3)
|
out.debug(u'Sending POST data %s' % kwargs['data'], level=3)
|
||||||
try:
|
try:
|
||||||
req = session.request(method, url, **kwargs)
|
req = session.request(method, url, **kwargs)
|
||||||
out.debug(u'Response cookies: %s' % req.cookies)
|
out.debug(u'Response cookies: %s' % req.cookies)
|
||||||
|
@ -547,31 +534,3 @@ def strlimit(s, length=72):
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return ""
|
return ""
|
||||||
return "%s..." % s[:length]
|
return "%s..." % s[:length]
|
||||||
|
|
||||||
|
|
||||||
def writeFile(filename, content, encoding=None):
|
|
||||||
"""Write content to given filename. Checks for zero-sized files.
|
|
||||||
If encoding is given writes to a codecs.open() file."""
|
|
||||||
if not content:
|
|
||||||
raise OSError("empty content for file %s" % filename)
|
|
||||||
|
|
||||||
def getfp(filename, encoding):
|
|
||||||
"""Get open file object."""
|
|
||||||
if encoding:
|
|
||||||
return codecs.open(filename, 'w', encoding)
|
|
||||||
return open(filename, 'wb')
|
|
||||||
|
|
||||||
try:
|
|
||||||
with getfp(filename, encoding) as fp:
|
|
||||||
fp.write(content)
|
|
||||||
fp.flush()
|
|
||||||
os.fsync(fp.fileno())
|
|
||||||
size = os.path.getsize(filename)
|
|
||||||
if size == 0:
|
|
||||||
raise OSError("empty file %s" % filename)
|
|
||||||
except Exception:
|
|
||||||
if os.path.isfile(filename):
|
|
||||||
os.remove(filename)
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
out.info(u"Saved %s (%s)." % (filename, strsize(size)))
|
|
||||||
|
|
|
@ -3,6 +3,8 @@
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
@ -70,7 +72,7 @@ def _test_comic(outdir, scraperobj):
|
||||||
msg = 'Traversed %d strips instead of %d.' % (num_strips,
|
msg = 'Traversed %d strips instead of %d.' % (num_strips,
|
||||||
num_strips_expected)
|
num_strips_expected)
|
||||||
if strip:
|
if strip:
|
||||||
msg += " Check the prevSearch pattern at %s" % strip.stripUrl
|
msg += " Check the prevSearch pattern at %s" % strip.strip_url
|
||||||
assert num_strips == num_strips_expected, msg
|
assert num_strips == num_strips_expected, msg
|
||||||
if strip:
|
if strip:
|
||||||
_check_scraperesult(outdir, num_strips_expected, strip, scraperobj)
|
_check_scraperesult(outdir, num_strips_expected, strip, scraperobj)
|
||||||
|
@ -83,10 +85,10 @@ def _check_strip(outdir, strip, multipleImagesPerStrip):
|
||||||
for image in strip.getImages():
|
for image in strip.getImages():
|
||||||
images.append(image.url)
|
images.append(image.url)
|
||||||
image.save(outdir)
|
image.save(outdir)
|
||||||
assert images, 'failed to find images at %s' % strip.stripUrl
|
assert images, 'failed to find images at %s' % strip.strip_url
|
||||||
if not multipleImagesPerStrip:
|
if not multipleImagesPerStrip:
|
||||||
assert len(images) == 1, 'found more than 1 image at %s: %s' % (
|
assert len(images) == 1, 'found more than 1 image at %s: %s' % (
|
||||||
strip.stripUrl, images)
|
strip.strip_url, images)
|
||||||
|
|
||||||
|
|
||||||
def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
|
def _check_scraperesult(outdir, num_images_expected, strip, scraperobj):
|
||||||
|
@ -113,7 +115,7 @@ def _check_stripurl(strip, scraperobj):
|
||||||
urlmatch = urlmatch.replace(r"\%s", r".+")
|
urlmatch = urlmatch.replace(r"\%s", r".+")
|
||||||
urlmatch = "^%s$" % urlmatch
|
urlmatch = "^%s$" % urlmatch
|
||||||
ro = re.compile(urlmatch)
|
ro = re.compile(urlmatch)
|
||||||
mo = ro.search(strip.stripUrl)
|
mo = ro.search(strip.strip_url)
|
||||||
err = 'strip URL %r does not match stripUrl pattern %s' % (
|
err = 'strip URL %r does not match stripUrl pattern %s' % (
|
||||||
strip.stripUrl, urlmatch)
|
strip.strip_url, urlmatch)
|
||||||
assert mo is not None, err
|
assert mo is not None, err
|
||||||
|
|
Loading…
Reference in a new issue