Only cache the robots.txt URL when memoizing.

Bastian Kleineidam 2013-02-13 17:52:07 +01:00
parent a8f0a4f2c8
commit f16e860f1e
2 changed files with 5 additions and 5 deletions

@@ -11,18 +11,18 @@ class memoized (object):
         self.func = func
         self.cache = {}
 
-    def __call__(self, *args):
+    def __call__(self, *args, **kwargs):
         """Lookup and return cached result if found. Else call stored
         function with given arguments."""
         try:
             return self.cache[args]
         except KeyError:
-            self.cache[args] = value = self.func(*args)
+            self.cache[args] = value = self.func(*args, **kwargs)
             return value
         except TypeError:
             # uncachable -- for instance, passing a list as an argument.
             # Better to not cache than to blow up entirely.
-            return self.func(*args)
+            return self.func(*args, **kwargs)
 
     def __repr__(self):
         """Return the function's docstring."""
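
The effect of the decorator change, as a minimal standalone sketch (fetch_parser
and the example URL are hypothetical stand-ins, not identifiers from this
repository): keyword arguments are forwarded to the wrapped function but are
deliberately left out of the cache key, so calls that differ only in keyword
arguments share one cached result.

class memoized(object):
    """Cache a function's return value, keyed on positional arguments only."""

    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args, **kwargs):
        try:
            return self.cache[args]
        except KeyError:
            # First call with these positional args: compute and cache.
            self.cache[args] = value = self.func(*args, **kwargs)
            return value
        except TypeError:
            # Uncachable args (e.g. a list): call through without caching.
            return self.func(*args, **kwargs)

calls = []

@memoized
def fetch_parser(url, session=None):
    # Stand-in for a parser factory such as get_robotstxt_parser.
    calls.append(url)
    return "parser for " + url

fetch_parser("http://example.com/robots.txt", session=object())
fetch_parser("http://example.com/robots.txt", session=object())
assert len(calls) == 1  # second call is a cache hit despite a different session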
@@ -203,13 +203,13 @@ def check_robotstxt(url, session):
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl, session)
+    rp = get_robotstxt_parser(roboturl, session=session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)
 
 
 @memoized
-def get_robotstxt_parser(url, session):
+def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
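
For contrast, a sketch of the old call pattern, reusing the memoized class and
the hypothetical fetch_parser naming from the sketch above: a session passed
positionally becomes part of the cache key, so every new session object forces
a fresh robots.txt fetch. Passing session=session as a keyword, as
check_robotstxt now does, keeps the cache keyed on the URL alone.

calls_old = []

@memoized
def fetch_parser_old(url, session):
    calls_old.append(url)
    return "parser for " + url

fetch_parser_old("http://example.com/robots.txt", object())
fetch_parser_old("http://example.com/robots.txt", object())
assert len(calls_old) == 2  # distinct sessions defeat the cache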