From 5affd8af68b8a09d815815897ad8495732da9d57 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Wed, 15 Jul 2015 19:11:55 +0200
Subject: [PATCH] More relaxed robots.txt handling.

This is in line with how Perl's LWP::RobotUA and Google handle server
errors when fetching robots.txt: Just assume access is allowed.

See https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
---
 dosagelib/util.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/dosagelib/util.py b/dosagelib/util.py
index f6f93bd10..0f54ca92f 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -282,12 +282,10 @@ def get_robotstxt_parser(url, session=None):
     try:
         req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     except Exception:
-        # connect or timeout errors are treated as an absent robotst.txt
+        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
     else:
-        if req.status_code in (401, 403):
-            rp.disallow_all = True
-        elif req.status_code >= 400:
+        if req.status_code >= 400:
             rp.allow_all = True
         elif req.status_code == 200:
             rp.parse(req.text.splitlines())
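
For context, a minimal sketch (not part of the patch) of the effect of the relaxed policy, assuming Python's standard RobotFileParser (urllib.robotparser in Python 3); dosage's own imports may differ. The allow_all flag set in the patched branches makes every can_fetch() check succeed, so a fetch error or any 4xx/5xx answer no longer blocks crawling. The user agent string and URL below are illustrative only.

    from urllib.robotparser import RobotFileParser

    # Simulate the patched error branches: robots.txt could not be fetched,
    # or the server answered with a 4xx/5xx status, so access is assumed allowed.
    rp = RobotFileParser()
    rp.allow_all = True

    # With allow_all set, can_fetch() returns True for any agent and URL.
    print(rp.can_fetch("dosage", "http://example.com/comic/"))  # True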