dosage/dosagelib/util.py

441 lines
14 KiB
Python
Raw Normal View History

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from __future__ import division, print_function
2012-06-20 19:58:13 +00:00
import urllib, urllib2, urlparse
2012-12-12 16:41:29 +00:00
import robotparser
import requests
2012-06-20 19:58:13 +00:00
import sys
import os
import cgi
import re
import traceback
import time
from htmlentitydefs import name2codepoint
2012-12-12 16:41:29 +00:00
from .decorators import memoized
2012-06-20 19:58:13 +00:00
from .output import out
from .configuration import UserAgent, AppName, App, SupportUrl
2012-12-12 16:41:29 +00:00
from .fileutil import has_module
2012-06-20 20:33:26 +00:00
# Result of has_module("curses"); presumably truthy when the curses module
# can be imported -- confirm against fileutil.has_module.
has_curses = has_module("curses")
2012-06-20 19:58:13 +00:00
2012-12-05 20:52:52 +00:00
# Maximum content size for HTML pages; the default max_content_bytes of
# getPageContent() and the limit used when fetching robots.txt.
MaxContentBytes = 1024 * 1024 * 2 # 2 MB
# Maximum content size for images; the default max_content_bytes of
# getImageObject().
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
2012-12-07 23:45:18 +00:00
# Default connection timeout in seconds; used as the default `timeout`
# argument of urlopen().
ConnectionTimeoutSecs = 60
2012-06-20 19:58:13 +00:00
2012-10-12 19:11:44 +00:00
def tagre(tag, attribute, value, quote='"', before="", after=""):
    """Build the source of a regular expression that matches one HTML tag
    carrying the given attribute and value. Tag and attribute names are
    matched case insensitively, arbitrary whitespace and leading HTML
    attributes are skipped, and the "<" and ">" delimiting the tag are part
    of the match.
    @param tag: the tag name
    @ptype tag: string
    @param attribute: the attribute name
    @ptype attribute: string
    @param value: the attribute value; inserted verbatim into the pattern,
      so it may itself be a regular expression (e.g. a capturing group)
    @ptype value: string
    @param quote: the attribute quote (default ")
    @ptype quote: string
    @param before: pattern that has to occur inside the tag ahead of the
      attribute (default: no requirement)
    @ptype before: string
    @param after: match after attribute value but before end
    @ptype after: string
    @return: the generated regular expression suitable for re.compile()
    @rtype: string
    """
    # Text between the tag name and the attribute: either a stretch that
    # must contain `before`, or an optional run of other attributes.
    if before:
        leading = r"[^>]*%s[^>]*\s+" % before
    else:
        leading = r"(?:[^>]*\s+)?"
    template = r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>'
    return template % {
        "tag": case_insensitive_re(tag),
        "attribute": case_insensitive_re(attribute),
        "value": value,
        "quote": quote,
        "prefix": leading,
        "after": after,
    }
2012-06-20 19:58:13 +00:00
2012-10-11 10:03:12 +00:00
def case_insensitive_re(name):
    """Turn a name into a regular expression string that matches it in any
    letter case, without relying on re.IGNORECASE, so only selected parts of
    a larger pattern become case insensitive.
    Every character is expanded to a two-element character class holding its
    lower and upper case form.
    @param name: the name to make case insensitive
    @ptype name: string
    @return: the case insensitive regex
    @rtype: string
    """
    classes = ["[%s%s]" % (char.lower(), char.upper()) for char in name]
    return "".join(classes)
2012-06-20 19:58:13 +00:00
2012-10-11 10:03:12 +00:00
# Precompiled pattern for a <base href="..."> tag; group 1 captures the
# href value. Used by getPageContent() to determine the page's base URL.
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
2012-06-20 19:58:13 +00:00
2012-12-12 16:41:29 +00:00
def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
    """Fetch the text content of the given URL together with its base URL.
    The robots.txt rules of the host are checked before the page is fetched.
    @param url: the page URL
    @ptype url: string
    @param max_content_bytes: size limit handed to urlopen()
    @ptype max_content_bytes: int
    @param session: optional session object handed to urlopen()
    @return: tuple (page text, base URL); the base URL is the href of the
      page's base tag when one is found, else the given URL
    @rtype: tuple
    @raises: IOError if robots.txt disallows the URL or retrieval fails
    """
    check_robotstxt(url)
    # read page data
    response = urlopen(url, max_content_bytes=max_content_bytes,
                       session=session)
    data = response.text
    # determine base URL
    base_match = baseSearch.search(data)
    baseUrl = base_match.group(1) if base_match else url
    return data, baseUrl
2012-06-20 19:58:13 +00:00
2012-12-05 20:52:52 +00:00
def getImageObject(url, referrer, max_content_bytes=MaxImageBytes, session=None):
    """Get response object for given image URL.
    @param url: the image URL
    @ptype url: string
    @param referrer: URL sent as Referer header
    @ptype referrer: string
    @param max_content_bytes: size limit handed to urlopen()
    @ptype max_content_bytes: int
    @param session: optional session object handed to urlopen(), matching
      the session parameter of the other fetch helpers (default: None)
    @return: the response object returned by urlopen()
    @raises: IOError if retrieval fails or the content is too large
    """
    return urlopen(url, referrer=referrer,
                   max_content_bytes=max_content_bytes, session=session)
2012-12-12 16:41:29 +00:00
def fetchUrl(url, urlSearch, session=None):
    """Search a HTML page for the given URL pattern.
    @param url: URL of the page to search
    @ptype url: string
    @param urlSearch: compiled pattern whose first group captures the URL
    @param session: optional session object used for retrieval
    @return: the normalised absolute URL of the first match, or None if the
      pattern does not match
    @raises: ValueError if the pattern matched an empty URL
    """
    data, baseUrl = getPageContent(url, session=session)
    found = urlSearch.search(data)
    if not found:
        return None
    searchUrl = found.group(1)
    if not searchUrl:
        raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
    out.debug('matched URL %r' % searchUrl)
    # resolve against the page's base URL before normalising
    return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
2012-12-12 16:41:29 +00:00
def fetchUrls(url, imageSearch, prevSearch=None, session=None):
    """Search a HTML page for image URLs and, optionally, the previous URL.
    @param url: URL of the page to search
    @ptype url: string
    @param imageSearch: compiled pattern whose first group captures an
      image URL; every match on the page is collected
    @param prevSearch: compiled pattern whose first group captures the
      previous page URL, or None to skip that search
    @param session: optional session object used for retrieval
    @return: tuple (set of normalised image URLs, previous URL or None)
    @rtype: tuple
    @raises: ValueError if a pattern matched an empty URL
    """
    data, baseUrl = getPageContent(url, session=session)
    # match images
    imageUrls = set()
    for found in imageSearch.finditer(data):
        imageUrl = found.group(1)
        if not imageUrl:
            raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
        out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
        imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
    if not imageUrls:
        out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
    if prevSearch is None:
        return imageUrls, None
    # match previous URL
    found = prevSearch.search(data)
    if not found:
        out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
        return imageUrls, None
    prevUrl = found.group(1)
    if not prevUrl:
        raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
    return imageUrls, normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
2012-06-20 19:58:13 +00:00
def unescape(text):
    """Replace HTML entities and character references.
    Every "&name;", "&#NNN;" and "&#xHHH;" sequence that can be resolved is
    replaced by its character, UTF-8 encoded and percent-quoted (keeping the
    characters ";/?:@&=+$," unquoted). Sequences that cannot be resolved are
    left in the text.
    @param text: the text to unescape
    @ptype text: string
    @return: text with entities replaced
    @rtype: string
    """
    def _fixup(m):
        """Replace one matched HTML entity or character reference."""
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    text = unichr(int(text[3:-1], 16))
                else:
                    text = unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number or outside the unicode range;
                # OverflowError: number too large to be passed to unichr().
                # Either way keep the reference untouched.
                pass
        else:
            # named entity
            try:
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        if isinstance(text, unicode):
            text = text.encode('utf-8')
            text = urllib2.quote(text, safe=';/?:@&=+$,')
        return text
    return re.sub(r"&#?\w+;", _fixup, text)
2012-06-20 19:58:13 +00:00
def normaliseURL(url):
    """Normalise a URL: replace HTML entities and character references,
    drop leading empty and ".." path segments (to avoid breaking urllib2),
    encode spaces in the path as %20, strip one leading "&" from the query
    and remove the fragment.
    @param url: the URL to normalise
    @ptype url: string
    @return: the normalised URL; its path always starts with "/"
    @rtype: string
    """
    # XXX: brutal hack
    parts = list(urlparse.urlparse(unescape(url)))
    segments = parts[2].split('/')
    # count the leading segments that are empty or point to the parent
    skip = 0
    while skip < len(segments) and segments[skip] in ('', '..'):
        skip += 1
    parts[2] = '/' + '/'.join(segments[skip:]).replace(' ', '%20')
    # remove leading '&' from query
    query = parts[4]
    if query.startswith('&'):
        parts[4] = query[1:]
    # remove anchor
    parts[5] = ""
    return urlparse.urlunparse(parts)
2012-11-21 20:57:26 +00:00
2012-12-12 16:41:29 +00:00
def get_roboturl(url):
    """Build the robots.txt URL for the scheme and host of the given URL.
    @param url: any URL on the host
    @ptype url: string
    @return: URL of /robots.txt on the same scheme and network location
    @rtype: string
    """
    parsed = urlparse.urlparse(url)
    scheme, netloc = parsed[0], parsed[1]
    # params, query and fragment are deliberately left empty
    return urlparse.urlunparse((scheme, netloc, "/robots.txt", "", "", ""))
def check_robotstxt(url):
    """Verify that the host's robots.txt lets our user agent fetch the URL.
    @param url: the URL about to be fetched
    @ptype url: string
    @raises: IOError if URL is not allowed
    """
    parser = get_robotstxt_parser(get_roboturl(url))
    if parser.can_fetch(UserAgent, url):
        return
    raise IOError("%s is disallowed by robots.txt" % url)
@memoized
def get_robotstxt_parser(url):
    """Get a RobotFileParser for the given robots.txt URL.
    The HTTP status of the robots.txt request decides the parser state:
    200 parses the returned rules, 401 and 403 disallow everything, any
    other status of 400 or above allows everything. For all remaining
    status codes the parser is returned without rules.
    @param url: the robots.txt URL
    @ptype url: string
    @return: the configured parser
    @rtype: robotparser.RobotFileParser
    """
    rp = robotparser.RobotFileParser()
    # a failing status must not raise here, it is interpreted below
    response = urlopen(url, max_content_bytes=MaxContentBytes,
                       raise_for_status=False)
    status = response.status_code
    if status == 200:
        rp.parse(response.content.splitlines())
    elif status in (401, 403):
        rp.disallow_all = True
    elif status >= 400:
        rp.allow_all = True
    return rp
2012-12-07 23:45:18 +00:00
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
            timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
    """Open an URL and return the response object.
    @param url: the URL to open
    @ptype url: string
    @param referrer: optional URL sent as Referer header
    @param retries: retry count handed to the request as "max_retries"
    @ptype retries: non-negative int
    @param retry_wait_seconds: only validated to be positive here; it is
      not used otherwise in this function
    @param max_content_bytes: if set, the maximum allowed Content-Length
    @param timeout: request timeout
    @param session: object whose get() performs the request; the requests
      module itself is used when None
    @param raise_for_status: if true, HTTP error statuses raise
    @return: the response object
    @raises: IOError if the request fails or the content is too large
    """
    out.debug('Open URL %s' % url)
    assert retries >= 0, 'invalid retry value %r' % retries
    assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
    headers = {'User-Agent': UserAgent}
    if referrer:
        headers['Referer'] = referrer
    client = requests if session is None else session
    try:
        # NOTE(review): config= and prefetch= look like keyword arguments
        # of an older requests API -- confirm against the pinned version.
        response = client.get(url, headers=headers,
                              config={"max_retries": retries},
                              prefetch=False, timeout=timeout)
        check_content_size(url, response.headers, max_content_bytes)
        if raise_for_status:
            response.raise_for_status()
    except requests.exceptions.RequestException as err:
        raise IOError('URL retrieval of %s failed: %s' % (url, err))
    return response
2012-06-20 19:58:13 +00:00
2012-12-12 16:41:29 +00:00
2012-12-05 20:52:52 +00:00
def check_content_size(url, headers, max_content_bytes):
    """Check that the content length announced in the URL response headers
    does not exceed the given maximum bytes.
    Nothing is checked when no limit is given, when the headers carry no
    content-length entry, or when that entry is not an integer.
    @param url: the URL, used in the error message only
    @ptype url: string
    @param headers: response headers, looked up by the key 'content-length'
    @ptype headers: mapping
    @param max_content_bytes: the size limit; a false value disables it
    @ptype max_content_bytes: int or None
    @raises: IOError if the announced size exceeds the limit
    """
    if not max_content_bytes:
        return
    if 'content-length' not in headers:
        return
    try:
        size = int(headers['content-length'])
    except (TypeError, ValueError):
        # A malformed header tells nothing about the size; treat it like a
        # missing one instead of failing with an unrelated error.
        return
    if size > max_content_bytes:
        msg = 'URL content of %s with %d bytes exceeds %d bytes.' % (url, size, max_content_bytes)
        raise IOError(msg)
2012-06-20 20:33:26 +00:00
2012-06-20 19:58:13 +00:00
def splitpath(path):
    """Split a path in its components.
    A trailing path separator is ignored, so "a/b/" yields the same
    components as "a/b". The root itself is not part of the result.
    @param path: the path to split
    @ptype path: string
    @return: the path components, outermost directory first
    @rtype: list of string
    """
    head, tail = os.path.split(path)
    if not tail:
        # os.path.split() returns an empty tail for a path ending in a
        # separator; split once more to reach the last real component
        head, tail = os.path.split(head)
    components = []
    while tail:
        components.append(tail)
        head, tail = os.path.split(head)
    # components were collected innermost first
    components.reverse()
    return components
2012-11-21 20:57:26 +00:00
2012-06-20 19:58:13 +00:00
def getRelativePath(basepath, path):
    """Get a path that is relative to the given base path.
    Both arguments are made absolute first. Unlike a manual component
    comparison this also works when path is an ancestor of basepath or
    equal to it (the latter yields os.curdir).
    @param basepath: the directory the result is relative to
    @ptype basepath: string
    @param path: the path to express relative to basepath
    @ptype path: string
    @return: the relative path
    @rtype: string
    """
    return os.path.relpath(os.path.abspath(path), os.path.abspath(basepath))
2012-11-21 20:57:26 +00:00
2012-06-20 19:58:13 +00:00
def getQueryParams(url):
    """Extract the query parameters of a URL.
    @param url: the URL to inspect
    @ptype url: string
    @return: the result of cgi.parse_qs() on the URL's query string
    """
    query = urlparse.urlsplit(url).query
    out.debug('Extracting query parameters from %r (%r)...' % (url, query))
    params = cgi.parse_qs(query)
    return params
def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
    """Print internal error message (output defaults to stderr).
    The report consists of a banner asking for a bug report, the exception
    with its traceback, and application, proxy and locale information.
    Note that the `out` parameter shadows the module-level `out` object
    inside this function.
    @param out: file-like object to print to
    @param etype: exception type (default: from sys.exc_info())
    @param evalue: exception value (default: from sys.exc_info())
    @param tb: traceback object (default: from sys.exc_info())
    """
    print(os.linesep, file=out)
    print("""********** Oops, I did it again. *************

You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include at least the information below:

Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out)
    # fill in whatever the caller did not supply from the exception that
    # is currently being handled
    cur_type, cur_value, cur_tb = sys.exc_info()
    if etype is None:
        etype = cur_type
    if evalue is None:
        evalue = cur_value
    print(etype, evalue, file=out)
    if tb is None:
        tb = cur_tb
    traceback.print_exception(etype, evalue, tb, None, out)
    print_app_info(out=out)
    print_proxy_info(out=out)
    print_locale_info(out=out)
    print(os.linesep,
          "******** %s internal error, over and out ********" % AppName, file=out)
2012-06-20 19:58:13 +00:00
def print_env_info(key, out=sys.stderr):
    """Print "key = value" for an environment variable, if it is defined.
    @param key: name of the environment variable
    @ptype key: string
    @param out: file-like object to print to (default: stderr)
    """
    value = os.environ.get(key)
    if value is None:
        return
    print(key, "=", repr(value), file=out)
2012-06-20 19:58:13 +00:00
def print_proxy_info(out=sys.stderr):
    """Print the http_proxy environment setting, if defined.
    @param out: file-like object to print to (default: stderr)
    """
    proxy_key = "http_proxy"
    print_env_info(proxy_key, out=out)
def print_locale_info(out=sys.stderr):
    """Print the locale related environment settings that are defined.
    @param out: file-like object to print to (default: stderr)
    """
    locale_keys = ("LANGUAGE", "LC_ALL", "LC_CTYPE", "LANG")
    for name in locale_keys:
        print_env_info(name, out=out)
def print_app_info(out=sys.stderr):
    """Print system and application info (output defaults to stderr):
    the application identifier, Python version and platform, the local
    time and the command line arguments.
    @param out: file-like object to print to
    """
    print("System info:", file=out)
    print(App, file=out)
    python_info = "Python %(version)s on %(platform)s" % dict(
        version=sys.version, platform=sys.platform)
    print(python_info, file=out)
    print("Local time:", strtime(time.time()), file=out)
    print("sys.argv", sys.argv, file=out)
2012-06-20 19:58:13 +00:00
def strtime(t):
    """Format a timestamp as local "YYYY-MM-DD HH:MM:SS" followed by the
    timezone offset returned by strtimezone().
    @param t: seconds since the epoch
    @return: the formatted time
    @rtype: string
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
    return stamp + strtimezone()
def strtimezone():
    """Return timezone info, %z on some platforms, but not supported on all.
    The result is the signed offset of the current local time from UTC in
    whole hours, zero padded to four characters including the sign.
    @return: the timezone offset, e.g. "+001"
    @rtype: string
    """
    # time.daylight only tells that a DST zone is defined for the locale,
    # not that DST is in effect; tm_isdst of the current local time does.
    if time.daylight and time.localtime().tm_isdst > 0:
        zone = time.altzone
    else:
        zone = time.timezone
    return "%+04d" % (-zone // 3600)
2012-11-26 06:14:02 +00:00
2012-12-12 16:41:29 +00:00
def rfc822date(indate):
    """Format date in rfc822 format.
    The day and month names are always the English abbreviations, whatever
    locale is active.
    @param indate: seconds since the epoch
    @return: the date, e.g. "Thu, 01 Jan 1970 00:00:00 GMT"
    @rtype: string
    """
    # local import: keeps this function self-contained; time.strftime()
    # would emit localized %a/%b names
    from email.utils import formatdate
    return formatdate(indate, usegmt=True)
2012-11-26 06:14:02 +00:00
def asciify(name):
    """Remove every character that is not an ASCII letter, a digit or an
    underscore from the string.
    @param name: the string to clean
    @ptype name: string
    @return: the string with all other characters removed
    @rtype: string
    """
    unwanted = re.compile("[^0-9a-zA-Z_]")
    return unwanted.sub("", name)
def unquote(text):
    """Replace all percent-encoded entities in text, decoding repeatedly
    until the text has no "%" left or decoding no longer changes it.
    @param text: the text to decode
    @ptype text: string
    @return: the decoded text
    @rtype: string
    """
    previous = None
    # stop as soon as one decoding pass leaves the text unchanged
    while '%' in text and text != previous:
        previous, text = text, urllib.unquote(text)
    return text
2012-12-02 17:35:06 +00:00
2012-12-07 23:45:18 +00:00
def quote(text):
    """Percent-encode the given text with the defaults of urllib.quote().
    @param text: the text to encode
    @ptype text: string
    @return: the encoded text
    @rtype: string
    """
    encoded = urllib.quote(text)
    return encoded
2012-12-02 17:35:06 +00:00
def strsize(b):
    """Return a human readable representation of a number of bytes, using
    the units B, KB, MB and GB with 1024 as factor.
    @param b: the number of bytes
    @ptype b: non-negative number
    @return: the formatted size
    @rtype: string
    @raises: ValueError if b is negative
    """
    if b < 0:
        raise ValueError("Invalid negative byte number")
    kb = 1024
    mb = kb * 1024
    gb = mb * 1024
    if b < kb:
        return "%dB" % b
    if b < kb * 10:
        # small kilobyte values are shown as whole numbers
        return "%dKB" % (b // kb)
    # (exclusive upper bound, format, divisor), checked in ascending order
    ranges = (
        (mb, "%.2fKB", kb),
        (mb * 10, "%.2fMB", mb),
        (gb, "%.1fMB", mb),
        (gb * 10, "%.2fGB", gb),
    )
    for limit, template, divisor in ranges:
        if b < limit:
            return template % (float(b) / divisor)
    return "%.1fGB" % (float(b) / gb)
2012-12-12 16:41:29 +00:00
2012-12-07 23:45:18 +00:00
def getDirname(name):
    """Convert the slashes in name to the path separator of the platform.
    @param name: a name using "/" as separator
    @ptype name: string
    @return: the name using os.sep as separator
    @rtype: string
    """
    return os.sep.join(name.split('/'))
def getFilename(name):
    """Derive a filename from the given name that is free of dangerous or
    incompatible characters.
    @param name: the name to convert
    @ptype name: string
    @return: the name with every character other than ASCII letters,
      digits, "_", "-" and "." replaced by "_", runs of dots and of
      underscores collapsed to one, and one leading "." or "-" removed
    @rtype: string
    """
    # first replace all illegal chars
    cleaned = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
    # then collapse repeated dots and underscores
    cleaned = re.sub(r"\.{2,}", ".", cleaned)
    cleaned = re.sub(r"_{2,}", "_", cleaned)
    # remove a leading dot or minus
    if cleaned[:1] in (".", "-"):
        cleaned = cleaned[1:]
    return cleaned
2012-12-12 16:41:29 +00:00
def strlimit(s, length=72):
    """Shorten a string that is longer than the given limit by cutting it
    off at the limit and appending three dots. With a limit of zero a too
    long string becomes the empty string.
    @param s: the string to limit
    @type s: string
    @param length: maximum length
    @type length: non-negative integer
    @return: limited string, at most length+3 characters long
    """
    assert length >= 0, "length limit must be a non-negative integer"
    if s and len(s) > length:
        return ("%s..." % s[:length]) if length else ""
    # empty values and strings within the limit are returned unchanged
    return s