dosage/dosagelib/util.py


# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import division, print_function

import urllib
import urlparse
import robotparser
import requests
import sys
import os
import cgi
import re
import traceback
import time
import types

from htmlentitydefs import name2codepoint

from .decorators import memoized
from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl
from .languages import Iso2Language

# Maximum content size for HTML pages
MaxContentBytes = 1024 * 1024 * 2  # 2 MB
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20  # 20 MB
# Default number of retries
MaxRetries = 3
# Time to pause between retries
RetryPauseSeconds = 5
# Default connection timeout
ConnectionTimeoutSecs = 60

# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
# encode in UTF-8 when no encoding is specified by the HTTP headers,
# else they use the page encoding for followed links. See also
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
UrlEncoding = "utf-8"

if hasattr(requests, 'adapters'):
    # requests >= 1.0
    requests.adapters.DEFAULT_RETRIES = MaxRetries


def tagre(tag, attribute, value, quote='"', before="", after=""):
    """Return a regular expression matching the given HTML tag, attribute
    and value. It matches the tag and attribute names case insensitive,
    and skips arbitrary whitespace and leading HTML attributes. The "<>" at
    the start and end of the HTML tag is also matched.
    @param tag: the tag name
    @ptype tag: string
    @param attribute: the attribute name
    @ptype attribute: string
    @param value: the attribute value
    @ptype value: string
    @param quote: the attribute quote (default ")
    @ptype quote: string
    @param before: match before the attribute name but after the tag name
    @ptype before: string
    @param after: match after attribute value but before end
    @ptype after: string
    @return: the generated regular expression suitable for re.compile()
    @rtype: string
    """
    if before:
        prefix = r"[^>]*%s[^>]*\s+" % before
    else:
        prefix = r"(?:[^>]*\s+)?"
    attrs = dict(
        tag=case_insensitive_re(tag),
        attribute=case_insensitive_re(attribute),
        value=value,
        quote=quote,
        prefix=prefix,
        after=after,
    )
    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
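
# Usage sketch (illustration only, not part of the original module): compile
# a tagre() pattern and extract an image URL from sample markup. The HTML
# snippet and variable names below are invented.
#
#   imageSearch = re.compile(tagre("img", "src", r'([^"]+)'))
#   match = imageSearch.search('<img class="comic" src="/comics/a.png">')
#   if match:
#       print(match.group(1))  # prints: /comics/a.png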


def case_insensitive_re(name):
    """Reformat the given name to a case insensitive regular expression string
    without using re.IGNORECASE. This way selective strings can be made case
    insensitive.
    @param name: the name to make case insensitive
    @ptype name: string
    @return: the case insensitive regex
    @rtype: string
    """
    return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


baseSearch = re.compile(tagre("base", "href", '([^"]*)'))


def isValidPageContent(data):
    """Check that the page content is not empty and is not an error message
    delivered with a 200 OK status."""
    # The python requests library sometimes returns empty data.
    # Some webservers respond with a 200 OK status but deliver an error
    # message as the body.
    return data and not data.startswith("Internal Server Error")


def getPageContent(url, session, max_content_bytes=MaxContentBytes):
    """Get text content of given URL."""
    check_robotstxt(url, session)
    # read page data; on a transient IOError, retry once before giving up
    try:
        page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
    except IOError:
        page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
    data = page.text
    tries = MaxRetries
    while not isValidPageContent(data) and tries > 0:
        time.sleep(RetryPauseSeconds)
        page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
        data = page.text
        tries -= 1
    if not isValidPageContent(data):
        raise ValueError("Got invalid page content from %s: %r" % (url, data))
    # determine base URL
    baseUrl = None
    match = baseSearch.search(data)
    if match:
        baseUrl = match.group(1)
    else:
        baseUrl = url
    return data, baseUrl
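
# Usage sketch (illustration only; performs network I/O, so it is left as a
# comment). All fetch helpers here expect a requests session object:
#
#   session = requests.Session()
#   data, baseUrl = getPageContent("http://www.example.com/", session)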


def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
    """Get response object for given image URL."""
    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)


def fetchUrls(url, data, baseUrl, urlSearch):
    """Search all entries for given URL pattern(s) in an HTML page."""
    searchUrls = []
    if isinstance(urlSearch, (types.ListType, types.TupleType)):
        searches = urlSearch
    else:
        searches = [urlSearch]
    for search in searches:
        for match in search.finditer(data):
            searchUrl = match.group(1)
            if not searchUrl:
                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
            out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
            searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
        if searchUrls:
            # do not search other links if one pattern matched
            break
    if not searchUrls:
        patterns = [x.pattern for x in searches]
        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
    return searchUrls


def fetchUrl(url, data, baseUrl, urlSearch):
    """Search first entry for given URL pattern in an HTML page."""
    return fetchUrls(url, data, baseUrl, urlSearch)[0]
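
# Usage sketch (illustration only): given page data and its base URL as
# returned by getPageContent(), collect every matching image URL. The
# pattern below is invented.
#
#   imageSearch = re.compile(tagre("img", "src", r'([^"]+)'))
#   imageUrls = fetchUrls(url, data, baseUrl, imageSearch)
#   firstUrl = fetchUrl(url, data, baseUrl, imageSearch)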


def unescape(text):
    """Replace HTML entities and character references."""
    def _fixup(m):
        """Replace HTML entities."""
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    text = unichr(int(text[3:-1], 16))
                else:
                    text = unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text
    return re.sub(r"&#?\w+;", _fixup, text)
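
# For example (illustrative only): unescape("Tom &amp; Jerry") returns
# "Tom & Jerry", and unescape("&#65;") returns u"A"; unknown entities are
# left untouched.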


_nopathquote_chars = "-;/=,~*+()@!"


def normaliseURL(url):
    """Removes any leading empty segments to avoid breaking urllib2; also
    replaces HTML entities and character references.
    """
    # XXX does not work for python3
    if isinstance(url, unicode):
        url = url.encode(UrlEncoding, 'ignore')
    # XXX: brutal hack
    url = unescape(url)
    pu = list(urlparse.urlparse(url))
    segments = pu[2].split('/')
    while segments and segments[0] in ('', '..'):
        del segments[0]
    pu[2] = '/' + '/'.join(segments)
    # remove leading '&' from query
    if pu[4].startswith('&'):
        pu[4] = pu[4][1:]
    # remove anchor
    pu[5] = ""
    return urlparse.urlunparse(pu)
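
# For example (illustrative only):
#   normaliseURL("http://example.com//comics/today.html#nav")
# returns "http://example.com/comics/today.html"; the leading empty path
# segment and the anchor are removed.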


def get_roboturl(url):
    """Get robots.txt URL from given URL."""
    pu = urlparse.urlparse(url)
    return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))


def check_robotstxt(url, session):
    """Check if robots.txt allows our user agent for the given URL.
    @raises: IOError if URL is not allowed
    """
    roboturl = get_roboturl(url)
    rp = get_robotstxt_parser(roboturl, session=session)
    if not rp.can_fetch(UserAgent, url):
        raise IOError("%s is disallowed by robots.txt" % url)


@memoized
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = robotparser.RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code in (401, 403):
            rp.disallow_all = True
        elif req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.content.splitlines())
    return rp
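
# Usage sketch (illustration only; performs network I/O, so it is left as a
# comment). Given a requests session, check_robotstxt() raises IOError when
# the configured UserAgent is disallowed:
#
#   session = requests.Session()
#   check_robotstxt("http://www.example.com/comics/", session)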


def urlopen(url, session, referrer=None, max_content_bytes=None,
            timeout=ConnectionTimeoutSecs, raise_for_status=True,
            stream=True):
    """Open a URL and return the response object."""
    out.debug('Open URL %s' % url)
    headers = {'User-Agent': UserAgent}
    if referrer:
        headers['Referer'] = referrer
    out.debug('Sending headers %s' % headers, level=3)
    kwargs = {
        "headers": headers,
        "timeout": timeout,
    }
    if hasattr(requests, 'adapters'):
        # requests >= 1.0
        kwargs["stream"] = stream
    else:
        # requests < 1.0
        kwargs["prefetch"] = not stream
        kwargs["config"] = {"max_retries": MaxRetries}
    try:
        req = session.get(url, **kwargs)
        check_content_size(url, req.headers, max_content_bytes)
        if raise_for_status:
            req.raise_for_status()
        return req
    except requests.exceptions.RequestException as err:
        msg = 'URL retrieval of %s failed: %s' % (url, err)
        raise IOError(msg)


def check_content_size(url, headers, max_content_bytes):
    """Check that the content length in the URL response headers does not
    exceed the given maximum bytes.
    """
    if not max_content_bytes:
        return
    if 'content-length' in headers:
        size = int(headers['content-length'])
        if size > max_content_bytes:
            msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
            raise IOError(msg)


def splitpath(path):
    """Split a path in its components."""
    c = []
    head, tail = os.path.split(path)
    while tail:
        c.insert(0, tail)
        head, tail = os.path.split(head)
    return c


def getRelativePath(basepath, path):
    """Get a path that is relative to the given base path."""
    basepath = splitpath(os.path.abspath(basepath))
    path = splitpath(os.path.abspath(path))
    afterCommon = False
    for c in basepath:
        if afterCommon or path[0] != c:
            path.insert(0, os.path.pardir)
            afterCommon = True
        else:
            del path[0]
    return os.path.join(*path)
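
# For example (illustrative, assuming POSIX path separators):
#   splitpath("a/b/c")                      -> ['a', 'b', 'c']
#   getRelativePath("/tmp/a/b", "/tmp/a/x") -> '../x'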


def getQueryParams(url):
    """Get URL query parameters."""
    query = urlparse.urlsplit(url)[3]
    out.debug('Extracting query parameters from %r (%r)...' % (url, query))
    return cgi.parse_qs(query)
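
# For example (illustrative only):
#   getQueryParams("http://www.example.com/view.php?id=42&page=3")
# returns a dict like {'id': ['42'], 'page': ['3']}; cgi.parse_qs maps each
# key to a list of values.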


def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
    """Print internal error message (output defaults to stderr)."""
    print(os.linesep, file=out)
    print("""********** Oops, I did it again. *************

You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include at least the information below:

Not disclosing some of the information below due to privacy reasons is ok.

I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out)
    if etype is None:
        etype = sys.exc_info()[0]
    if evalue is None:
        evalue = sys.exc_info()[1]
    print(etype, evalue, file=out)
    if tb is None:
        tb = sys.exc_info()[2]
    traceback.print_exception(etype, evalue, tb, None, out)
    print_app_info(out=out)
    print_proxy_info(out=out)
    print_locale_info(out=out)
    print(os.linesep,
          "******** %s internal error, over and out ********" % AppName, file=out)


def print_env_info(key, out=sys.stderr):
    """If given environment key is defined, print it out."""
    value = os.getenv(key)
    if value is not None:
        print(key, "=", repr(value), file=out)


def print_proxy_info(out=sys.stderr):
    """Print proxy info."""
    print_env_info("http_proxy", out=out)


def print_locale_info(out=sys.stderr):
    """Print locale info."""
    for key in ("LANGUAGE", "LC_ALL", "LC_CTYPE", "LANG"):
        print_env_info(key, out=out)


def print_app_info(out=sys.stderr):
    """Print system and application info (output defaults to stderr)."""
    print("System info:", file=out)
    print(App, file=out)
    print("Python %(version)s on %(platform)s" %
          {"version": sys.version, "platform": sys.platform}, file=out)
    stime = strtime(time.time())
    print("Local time:", stime, file=out)
    print("sys.argv", sys.argv, file=out)


def strtime(t):
    """Return ISO 8601 formatted time."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
        strtimezone()


def strtimezone():
    """Return the timezone offset; strftime's %z would give this on some
    platforms, but it is not supported on all of them.
    """
    if time.daylight:
        zone = time.altzone
    else:
        zone = time.timezone
    return "%+04d" % (-zone//3600)
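
# For example (illustrative; the result depends on the local timezone): on a
# UTC system strtimezone() returns '+000', since "%+04d" pads the hour offset
# to four characters including the sign, and strtime(0) would then return
# '1970-01-01 00:00:00+000'.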


def rfc822date(indate):
    """Format date in rfc822 format."""
    return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))


def asciify(name):
    """Remove all characters that are not ASCII letters, digits or the
    underscore."""
    return re.sub("[^0-9a-zA-Z_]", "", name)


def unquote(text):
    """Replace all percent-encoded entities in text, decoding repeatedly
    until no encoded entities remain."""
    while '%' in text:
        newtext = urllib.unquote(text)
        if newtext == text:
            break
        text = newtext
    return text
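
# For example (illustrative only): unquote("a%2520b") decodes repeatedly,
# first to "a%20b" and then to "a b", where a single urllib.unquote() call
# would stop at "a%20b".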


def quote(text, safechars='/'):
    """Percent-encode given text."""
    return urllib.quote(text, safechars)


def strsize(b):
    """Return human-readable representation of bytes b. A negative number
    of bytes raises a ValueError."""
    if b < 0:
        raise ValueError("Invalid negative byte number")
    if b < 1024:
        return "%dB" % b
    if b < 1024 * 10:
        return "%dKB" % (b // 1024)
    if b < 1024 * 1024:
        return "%.2fKB" % (float(b) / 1024)
    if b < 1024 * 1024 * 10:
        return "%.2fMB" % (float(b) / (1024 * 1024))
    if b < 1024 * 1024 * 1024:
        return "%.1fMB" % (float(b) / (1024 * 1024))
    if b < 1024 * 1024 * 1024 * 10:
        return "%.2fGB" % (float(b) / (1024 * 1024 * 1024))
    return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))


def getDirname(name):
    """Replace slashes in name with the platform's path separator."""
    return name.replace('/', os.sep)


def getFilename(name):
    """Get a filename from given name without dangerous or incompatible characters."""
    # first replace all illegal chars
    name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
    # then remove double dots and underscores
    while ".." in name:
        name = name.replace('..', '.')
    while "__" in name:
        name = name.replace('__', '_')
    # remove a leading dot or minus
    if name.startswith((".", "-")):
        name = name[1:]
    return name
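
# For example (illustrative only): getFilename("foo/bar:baz.png") returns
# 'foo_bar_baz.png'; both the slash and the colon are replaced by
# underscores.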


def strlimit(s, length=72):
    """If the length of the string exceeds the given limit, it will be cut
    off and three dots will be appended.
    @param s: the string to limit
    @type s: string
    @param length: maximum length
    @type length: non-negative integer
    @return: limited string, at most length+3 characters long
    """
    assert length >= 0, "length limit must be a non-negative integer"
    if not s or len(s) <= length:
        return s
    if length == 0:
        return ""
    return "%s..." % s[:length]


def getLangName(code):
    """Get name of language specified by ISO 639-1 code."""
    return Iso2Language[code]