Catch robots.txt errors.

This commit is contained in:
Bastian Kleineidam 2013-02-21 19:48:04 +01:00
parent b453c442c2
commit d0c3492cc7

View file

@ -183,6 +183,7 @@ def normaliseURL(url):
"""Removes any leading empty segments to avoid breaking urllib2; also replaces """Removes any leading empty segments to avoid breaking urllib2; also replaces
HTML entities and character references. HTML entities and character references.
""" """
# XXX does not work for python3
if isinstance(url, unicode): if isinstance(url, unicode):
url = url.encode(UrlEncoding, 'ignore') url = url.encode(UrlEncoding, 'ignore')
# XXX: brutal hack # XXX: brutal hack
@ -221,7 +222,12 @@ def check_robotstxt(url, session):
def get_robotstxt_parser(url, session=None): def get_robotstxt_parser(url, session=None):
"""Get a RobotFileParser for the given robots.txt URL.""" """Get a RobotFileParser for the given robots.txt URL."""
rp = robotparser.RobotFileParser() rp = robotparser.RobotFileParser()
try:
req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False) req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
except Exception:
# connect or timeout errors are treated as an absent robots.txt
rp.allow_all = True
else:
if req.status_code in (401, 403): if req.status_code in (401, 403):
rp.disallow_all = True rp.disallow_all = True
elif req.status_code >= 400: elif req.status_code >= 400: