dosage/dosagelib/util.py

536 lines
16 KiB
Python
Raw Normal View History

2016-03-13 20:27:31 +00:00
# -*- coding: utf-8 -*-
2016-10-28 22:21:41 +00:00
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
2014-01-05 15:50:57 +00:00
# Copyright (C) 2012-2014 Bastian Kleineidam
2018-05-22 22:54:40 +00:00
# Copyright (C) 2015-2018 Tobias Gruetzmacher
2016-03-13 20:27:31 +00:00
from __future__ import absolute_import, division, print_function
from six.moves.urllib.parse import (
quote as url_quote, unquote as url_unquote, urlparse, urlunparse, urlsplit)
from six.moves.urllib_robotparser import RobotFileParser
import requests
2016-03-13 20:27:31 +00:00
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
2012-06-20 19:58:13 +00:00
import sys
import os
import cgi
import re
import traceback
import time
2013-04-08 19:20:01 +00:00
import subprocess
2018-05-22 22:54:40 +00:00
try:
import html
except ImportError:
# Python 2.7
from HTMLParser import HTMLParser
html = HTMLParser()
from six.moves import range
import six
try:
from functools import lru_cache
except ImportError:
from backports.functools_lru_cache import lru_cache
2012-06-20 19:58:13 +00:00
from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl
2012-12-05 20:52:52 +00:00
# Maximum content size for HTML pages
2016-03-13 20:27:31 +00:00
MaxContentBytes = 1024 * 1024 * 3 # 3 MB
2012-12-05 20:52:52 +00:00
2012-12-19 19:43:18 +00:00
# Default number of retries
MaxRetries = 3
2016-03-13 20:27:31 +00:00
# Factor for retry backoff (see urllib3.util.retry, this default means
# 2s, 4s, 8s)
RetryBackoffFactor = 2
2013-02-13 16:54:10 +00:00
2012-12-07 23:45:18 +00:00
# Default connection timeout
ConnectionTimeoutSecs = 60
2012-06-20 19:58:13 +00:00
2013-01-23 20:16:22 +00:00
# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
# encode in UTF-8 when no encoding is specified by the HTTP headers,
# else they use the page encoding for followed links. See also
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
UrlEncoding = "utf-8"
2016-03-13 20:27:31 +00:00
def requests_session():
    """Create a requests session that retries failed requests.

    An HTTPAdapter using MaxRetries attempts and RetryBackoffFactor as
    backoff factor is mounted for both the http:// and https:// prefixes.
    @return: the configured session
    @rtype: requests.Session
    """
    session = requests.Session()
    retry_policy = Retry(MaxRetries, backoff_factor=RetryBackoffFactor)
    for prefix in ('http://', 'https://'):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session
2013-04-08 19:20:01 +00:00
def get_system_uid():
    """Get a (probably) unique ID to identify a system.
    Used to differentiate votes.

    Windows and OSX use their platform specific lookup; every other
    platform, and any failure of the platform lookup, falls back to the
    MAC based ID.
    @return: the system ID
    """
    if os.name == 'nt':
        platform_lookup = get_nt_system_uid
    elif sys.platform == 'darwin':
        platform_lookup = get_osx_system_uid
    else:
        return get_mac_uid()
    try:
        return platform_lookup()
    except Exception:
        # best effort: the platform specific lookup is optional
        return get_mac_uid()
def get_nt_system_uid():
    r"""Get the MachineGuid from
    HKEY_LOCAL_MACHINE\Software\Microsoft\Cryptography\MachineGuid

    The docstring is a raw string since the registry path contains
    backslashes that would otherwise be read as (invalid) escape sequences.
    @return: the value stored under the MachineGuid name
    """
    try:
        # Python 2 module name
        import _winreg as winreg
    except ImportError:
        import winreg
    lm = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
    try:
        key = winreg.OpenKey(lm, r"Software\Microsoft\Cryptography")
        try:
            # QueryValueEx returns a tuple whose first item is the value
            return winreg.QueryValueEx(key, "MachineGuid")[0]
        finally:
            key.Close()
    finally:
        lm.Close()
def get_osx_system_uid():
    """Get the OSX system ID.
    $ system_profiler |grep "r (system)"
          Serial Number (system): C24E1322XYZ

    @return: the text after the first colon of the first output line
      containing "r (system)", with surrounding whitespace stripped
    @rtype: string
    @raises: ValueError if no output line contains "r (system)"
    """
    # The tool is called "system_profiler"; the previously used name
    # "system_profile" does not exist, so the lookup could never succeed.
    res = backtick(["system_profiler"]).splitlines()
    for line in res:
        if "r (system)" in line:
            return line.split(':', 1)[1].strip()
    raise ValueError("Could not find system number in %r" % res)
def get_mac_uid():
    """Get the hardware (MAC) address of the system as reported by
    uuid.getnode(), formatted as a decimal string."""
    from uuid import getnode
    node = getnode()
    return "%d" % node
2013-04-08 19:20:01 +00:00
2016-03-13 20:27:31 +00:00
def backtick(cmd, encoding='utf-8'):
    """Run a command and return its decoded standard output.

    @param cmd: the command, passed unchanged to subprocess.Popen
    @param encoding: the encoding used to decode the captured output
    @ptype encoding: string
    @return: the decoded standard output of the command
    """
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    # communicate() waits for the process to end; stderr is not captured
    stdout_data, _ = process.communicate()
    return stdout_data.decode(encoding)
2013-04-13 06:00:11 +00:00
2013-04-05 16:57:44 +00:00
def unicode_safe(text, encoding=UrlEncoding, errors='ignore'):
    """Return text as Unicode, decoding it only when it is not Unicode yet.

    @param text: the text to convert
    @param encoding: the encoding used for decoding
    @param errors: the error handling scheme passed to decode()
    @return: the Unicode text
    """
    if not isinstance(text, six.text_type):
        text = text.decode(encoding, errors)
    return text
2012-10-12 19:11:44 +00:00
def tagre(tag, attribute, value, quote='"', before="", after=""):
    """Build a regular expression matching an HTML tag that carries the
    given attribute and value. Tag and attribute names are matched case
    insensitive, arbitrary whitespace and leading HTML attributes are
    skipped, and the "<>" at the start and end of the tag is matched too.

    @param tag: the tag name
    @ptype tag: string
    @param attribute: the attribute name
    @ptype attribute: string
    @param value: the attribute value (inserted as a regular expression)
    @ptype value: string
    @param quote: the attribute quote (default ")
    @ptype quote: string
    @param before: match after the tag name but before the attribute
    @ptype before: string
    @param after: match after attribute value but before end
    @ptype after: string
    @return: the generated regular expression suitable for re.compile()
    @rtype: string
    """
    tag_re = case_insensitive_re(tag)
    attribute_re = case_insensitive_re(attribute)
    if before:
        # something specific has to appear in front of the attribute
        leading = r"[^>]*%s[^>]*\s+" % before
    else:
        # optionally skip any attributes in front of the wanted one
        leading = r"(?:[^>]*\s+)?"
    template = r'<\s*%s\s+%s%s\s*=\s*%s%s%s[^>]*%s[^>]*>'
    return template % (
        tag_re, leading, attribute_re, quote, value, quote, after)
2012-06-20 19:58:13 +00:00
2012-10-11 10:03:12 +00:00
def case_insensitive_re(name):
    """Turn the given name into a regular expression string that matches it
    case insensitive without needing re.IGNORECASE, so that only selected
    parts of a larger expression ignore case.

    Every character becomes a character class holding its lower and upper
    case form.
    @param name: the name to make case insensitive
    @ptype name: string
    @return: the case insensitive regex
    @rtype: string
    """
    classes = []
    for char in name:
        classes.append("[%s%s]" % (char.lower(), char.upper()))
    return "".join(classes)
2012-06-20 19:58:13 +00:00
def get_page(url, session, **kwargs):
    """Fetch the given URL after checking robots.txt.

    The content size is limited to MaxContentBytes; additional keyword
    arguments are handed to urlopen().
    @return: the response object returned by urlopen()
    """
    check_robotstxt(url, session)
    response = urlopen(
        url, session, max_content_bytes=MaxContentBytes, **kwargs)
    out.debug(u"Got page content %r" % response.content, level=3)
    return response
2012-06-20 19:58:13 +00:00
2013-04-05 16:55:19 +00:00
def makeSequence(item):
    """Return lists and tuples unchanged; wrap anything else into a tuple
    with one element."""
    return item if isinstance(item, (list, tuple)) else (item,)
def prettyMatcherList(things):
    """Format one matcher or a list of matchers as a readable string.
    Objects with a "pattern" attribute (such as compiled regular
    expressions) are shown by their pattern, anything else is used as is."""
    shown = [
        matcher.pattern if hasattr(matcher, 'pattern') else matcher
        for matcher in makeSequence(things)
    ]
    return "('%s')" % "', '".join(shown)
2013-11-29 19:26:49 +00:00
def unescape(text):
    """Return text with HTML entities and character references replaced
    by the characters they stand for."""
    plain = html.unescape(text)
    return plain
2012-11-21 20:57:26 +00:00
2012-06-20 19:58:13 +00:00
2013-02-12 16:55:33 +00:00
_nopathquote_chars = "-;/=,~*+()@!"
2016-03-13 20:27:31 +00:00
2012-06-20 19:58:13 +00:00
def normaliseURL(url):
    """Normalise a URL. Normalising
    - strips any leading or trailing whitespace,
    - replaces HTML entities and character references,
    - removes leading empty and '..' path segments,
    - removes a leading '&' from the query,
    - removes the anchor (fragment).
    @return: the normalised URL
    """
    # XXX: brutal hack (entities are replaced in the complete URL)
    url = unescape(unicode_safe(url).strip())

    parts = list(urlparse(url))
    segments = parts[2].split('/')
    # count the leading segments that have to go
    skip = 0
    while skip < len(segments) and segments[skip] in ('', '..'):
        skip += 1
    parts[2] = '/' + '/'.join(segments[skip:])
    # remove leading '&' from query
    query = parts[4]
    if query.startswith('&'):
        parts[4] = query[1:]
    # remove anchor
    parts[5] = ""
    return urlunparse(parts)
2012-06-20 19:58:13 +00:00
2012-11-21 20:57:26 +00:00
2012-12-12 16:41:29 +00:00
def get_roboturl(url):
    """Build the robots.txt URL for the scheme and host of the given URL."""
    scheme, netloc = urlparse(url)[:2]
    return urlunparse((scheme, netloc, "/robots.txt", "", "", ""))
2012-12-12 16:41:29 +00:00
2013-02-12 16:55:13 +00:00
def check_robotstxt(url, session):
    """Verify that robots.txt lets our user agent fetch the given URL.
    @raises: IOError if URL is not allowed
    """
    roboturl = get_roboturl(url)
    parser = get_robotstxt_parser(roboturl, session=session)
    if parser.can_fetch(UserAgent, str(url)):
        return
    raise IOError("%s is disallowed by %s" % (url, roboturl))
2012-12-12 16:41:29 +00:00
@lru_cache()
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL.

    Results are cached per (url, session) argument pair by lru_cache.
    """
    parser = RobotFileParser()
    try:
        # every status below 600 is accepted here and evaluated below
        response = urlopen(url, session, max_content_bytes=MaxContentBytes,
                           allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        parser.allow_all = True
        return parser
    status = response.status_code
    if status >= 400:
        parser.allow_all = True
    elif status == 200:
        parser.parse(response.text.splitlines())
    return parser
2013-02-12 16:55:13 +00:00
def urlopen(url, session, referrer=None, max_content_bytes=None,
            allow_errors=(), useragent=UserAgent, **kwargs):
    """Open an URL and return the response object.

    The request is a POST if a "data" keyword argument is given, else a GET.
    @param url: the URL to open
    @param session: the session used for the request (its request() method
      and cookies attribute are used)
    @param referrer: value for the Referer header, if any
    @param max_content_bytes: passed to check_content_size() together with
      the response headers
    @param allow_errors: status codes for which no error is raised
    @param useragent: value for the User-Agent header
    @param kwargs: passed on to session.request(); "headers" is extended
      and "timeout" defaults to ConnectionTimeoutSecs
    @return: the response object
    @raises: IOError if the request fails, the status code is an error not
      listed in allow_errors, or the content size check fails
    """
    out.debug(u'Open URL %s' % url)
    # Work on a copy: the caller's headers mapping must not pick up our
    # User-Agent and Referer entries as a side effect.
    given_headers = kwargs.get('headers')
    headers = given_headers.copy() if given_headers else {}
    headers['User-Agent'] = useragent
    if referrer:
        headers['Referer'] = referrer
    kwargs['headers'] = headers
    out.debug(u'Sending headers %s' % headers, level=3)
    out.debug(u'Sending cookies %s' % session.cookies)
    kwargs.setdefault('timeout', ConnectionTimeoutSecs)
    if 'data' in kwargs:
        method = 'POST'
        out.debug(u'Sending POST data %s' % kwargs['data'], level=3)
    else:
        method = 'GET'
    try:
        req = session.request(method, url, **kwargs)
        out.debug(u'Response cookies: %s' % req.cookies)
        check_content_size(url, req.headers, max_content_bytes)
        if req.status_code not in allow_errors:
            req.raise_for_status()
        return req
    except requests.exceptions.RequestException as err:
        # callers only have to deal with IOError
        msg = 'URL retrieval of %s failed: %s' % (url, err)
        raise IOError(msg)
2012-06-20 19:58:13 +00:00
2012-12-12 16:41:29 +00:00
2012-12-05 20:52:52 +00:00
def check_content_size(url, headers, max_content_bytes):
    """Ensure the content length announced in the response headers is not
    larger than the given maximum.

    Nothing is checked if no maximum is given or the headers have no
    'content-length' entry.
    @raises: IOError if the announced size exceeds max_content_bytes
    """
    if not max_content_bytes or 'content-length' not in headers:
        return
    size = int(headers['content-length'])
    if size <= max_content_bytes:
        return
    raise IOError(
        'URL content of %s with %d bytes exceeds %d bytes.' %
        (url, size, max_content_bytes))
2012-12-05 20:52:52 +00:00
2012-06-20 20:33:26 +00:00
2012-06-20 19:58:13 +00:00
def splitpath(path):
    """Split a path in its components.

    Separators (including a leading root and trailing ones) are not part
    of the result.
    @param path: the path to split
    @ptype path: string
    @return: the path components in order
    @rtype: list of strings
    """
    components = []
    head, tail = os.path.split(path)
    if not tail and head != path:
        # A trailing separator yields an empty tail; step over it instead
        # of reporting no components at all.
        head, tail = os.path.split(head)
    while tail:
        components.append(tail)
        head, tail = os.path.split(head)
    # components were collected from the end of the path
    components.reverse()
    return components
2012-11-21 20:57:26 +00:00
2012-06-20 19:58:13 +00:00
def getRelativePath(basepath, path):
    """Get a path that is relative to the given base path.

    Both arguments are made absolute first. Unlike the previous hand-rolled
    component walk this also works when path is the base path itself
    (result is os.path.curdir) or one of its parent directories.
    @param basepath: the directory the result is relative to
    @ptype basepath: string
    @param path: the path to express relative to basepath
    @ptype path: string
    @return: the relative path
    @rtype: string
    """
    return os.path.relpath(os.path.abspath(path), os.path.abspath(basepath))
2012-11-21 20:57:26 +00:00
2012-06-20 19:58:13 +00:00
def getQueryParams(url):
    """Get URL query parameters.

    @param url: the URL to examine
    @return: the parsed query, mapping each parameter name to the list of
      its values
    @rtype: dict
    """
    # cgi.parse_qs was only a deprecated alias and is gone since
    # Python 3.8; the urllib.parse function is the real implementation.
    from six.moves.urllib.parse import parse_qs
    query = urlsplit(url)[3]
    out.debug(u'Extracting query parameters from %r (%r)...' % (url, query))
    return parse_qs(query)
def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
    """Print internal error message (output defaults to stderr).

    Exception type, value and traceback default to the exception that is
    currently being handled. Application, proxy and locale info is printed
    after the traceback.
    """
    print(os.linesep, file=out)
    print("""********** Oops, I did it again. *************

You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include at least the information below:

Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % {"app": AppName, "url": SupportUrl}, file=out)
    current = sys.exc_info()
    if etype is None:
        etype = current[0]
    if evalue is None:
        evalue = current[1]
    print(etype, evalue, file=out)
    if tb is None:
        tb = current[2]
    traceback.print_exception(etype, evalue, tb, None, out)
    for print_info in (print_app_info, print_proxy_info, print_locale_info):
        print_info(out=out)
    print(os.linesep,
          "******** %s internal error, over and out ********" % AppName,
          file=out)
2012-06-20 19:58:13 +00:00
def print_env_info(key, out=sys.stderr):
    """Print the environment variable with the given name as
    "key = value"; print nothing if it is not defined."""
    value = os.getenv(key)
    if value is None:
        return
    print(key, "=", repr(value), file=out)
2012-06-20 19:58:13 +00:00
def print_proxy_info(out=sys.stderr):
    """Print the http_proxy environment variable if it is defined."""
    print_env_info(key="http_proxy", out=out)
def print_locale_info(out=sys.stderr):
    """Print the locale related environment variables that are defined."""
    locale_keys = ("LANGUAGE", "LC_ALL", "LC_CTYPE", "LANG")
    for name in locale_keys:
        print_env_info(name, out=out)
def print_app_info(out=sys.stderr):
    """Print application name, Python version and platform, the local time
    and the command line arguments (output defaults to stderr)."""
    print("System info:", file=out)
    print(App, file=out)
    python_info = {"version": sys.version, "platform": sys.platform}
    print("Python %(version)s on %(platform)s" % python_info, file=out)
    print("Local time:", strtime(time.time()), file=out)
    print("sys.argv", sys.argv, file=out)
2012-06-20 19:58:13 +00:00
def strtime(t):
    """Format the given timestamp as local time "YYYY-MM-DD HH:MM:SS",
    followed by the timezone string returned by strtimezone()."""
    local = time.localtime(t)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", local)
    return stamp + strtimezone()
2012-06-20 19:58:13 +00:00
def strtimezone():
    """Return the current local UTC offset formatted like %z (for example
    "+0100" or "-0330"); %z itself is not supported on all platforms.

    @return: sign, two digit hours and two digit minutes
    @rtype: string
    """
    # time.daylight only tells that a DST zone is defined at all; whether
    # DST is in effect right now is reported by tm_isdst.
    if time.localtime().tm_isdst > 0:
        offset = -time.altzone
    else:
        offset = -time.timezone
    sign = "-" if offset < 0 else "+"
    # split the absolute value so negative and half-hour offsets come
    # out right
    hours, minutes = divmod(abs(offset) // 60, 60)
    return "%s%02d%02d" % (sign, hours, minutes)
2012-11-26 06:14:02 +00:00
2012-12-12 16:41:29 +00:00
def rfc822date(indate):
    """Format date in rfc822 format.

    @param indate: seconds since the epoch
    @return: the date in GMT, e.g. "Thu, 01 Jan 1970 00:00:00 GMT"
    @rtype: string
    """
    # strftime() would use the day and month names of the current locale,
    # but the format requires the English abbreviations.
    from email.utils import formatdate
    return formatdate(indate, usegmt=True)
def unquote(text):
    """Replace all percent-encoded entities in text. Decoding is repeated
    until the text has no '%' left or does not change anymore."""
    previous = None
    while '%' in text and previous != text:
        previous, text = text, url_unquote(text)
    return text
2012-12-02 17:35:06 +00:00
2013-01-23 20:16:22 +00:00
def quote(text, safechars='/'):
    """Percent-encode given text, leaving the given safe characters as
    they are."""
    encoded = url_quote(text, safechars)
    return encoded
2013-01-23 20:16:22 +00:00
2012-12-07 23:45:18 +00:00
2016-03-13 20:27:31 +00:00
def strsize(b):
    """Return human representation of bytes b. A negative number of bytes
    raises a value error."""
    if b < 0:
        raise ValueError("Invalid negative byte number")
    kilo = 1024
    mega = kilo * 1024
    giga = mega * 1024
    if b < kilo:
        return "%dB" % b
    if b < 10 * kilo:
        return "%dKB" % (b // kilo)
    # (exclusive upper bound, format, unit size)
    scales = (
        (mega, "%.2fKB", kilo),
        (10 * mega, "%.2fMB", mega),
        (giga, "%.1fMB", mega),
        (10 * giga, "%.2fGB", giga),
    )
    for limit, template, unit in scales:
        if b < limit:
            return template % (float(b) / unit)
    return "%.1fGB" % (float(b) / giga)
2012-12-02 17:35:06 +00:00
2012-12-12 16:41:29 +00:00
2012-12-07 23:45:18 +00:00
def getFilename(name):
    """Turn the given name into a filename that has no dangerous or
    incompatible characters."""
    # everything but digits, ASCII letters, '_', '-' and '.' is illegal
    cleaned = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
    # collapse runs of dots and runs of underscores
    cleaned = re.sub(r"\.\.+", ".", cleaned)
    cleaned = re.sub(r"__+", "_", cleaned)
    # drop one leading dot or minus
    if cleaned[:1] in (".", "-"):
        cleaned = cleaned[1:]
    return cleaned
2012-12-12 16:41:29 +00:00
def getExistingFile(name, max_suffix=1000):
    """Find an existing file by trying the name itself and then the name
    with the suffixes -1, -2, ... inserted in front of the extension.
    @return: filename if file is found
    @raise: ValueError if maximum suffix number is reached while searching
    """
    stem, ext = os.path.splitext(name)
    if os.path.exists(name):
        return name
    counter = 0
    while True:
        counter += 1
        candidate = stem + ("-%d" % counter) + ext
        # the limit is checked before the candidate itself is looked at
        if counter + 1 >= max_suffix:
            raise ValueError("No file %r found" % name)
        if os.path.exists(candidate):
            return candidate
def getNonexistingFile(name):
    """Find a filename that does not exist yet by trying the name itself
    and then the name with the suffixes -1, -2, ... inserted in front of
    the extension.
    @return: filename
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = stem + ("-%d" % counter) + ext
    return candidate
2016-03-13 20:27:31 +00:00
def strlimit(s, length=72):
    """Cut off a string that is longer than the given limit and append
    three dots to it. Empty or short enough strings are returned as is.
    @param s: the string to limit
    @type s: string
    @param length: maximum length
    @type length: non-negative integer
    @return: limited string, at most length+3 characters long
    """
    assert length >= 0, "length limit must be a non-negative integer"
    if s and len(s) > length:
        # with a zero limit nothing is left, not even the dots
        return ("%s..." % s[:length]) if length else ""
    return s
def uniq(input):
    """Return a new list with the items of input in their original order
    and every duplicate after the first occurrence left out."""
    seen = []
    for element in input:
        if element in seen:
            continue
        seen.append(element)
    return seen