Only cache robots.txt URL on memoize.

commit f16e860f1e (parent a8f0a4f2c8)
Author: Bastian Kleineidam
Date: 2013-02-13 17:52:07 +01:00

2 changed files with 5 additions and 5 deletions


@@ -11,18 +11,18 @@ class memoized (object):
         self.func = func
         self.cache = {}
-    def __call__(self, *args):
+    def __call__(self, *args, **kwargs):
         """Lookup and return cached result if found. Else call stored
         function with given arguments."""
         try:
             return self.cache[args]
         except KeyError:
-            self.cache[args] = value = self.func(*args)
+            self.cache[args] = value = self.func(*args, **kwargs)
             return value
         except TypeError:
             # uncachable -- for instance, passing a list as an argument.
             # Better to not cache than to blow up entirely.
-            return self.func(*args)
+            return self.func(*args, **kwargs)
     def __repr__(self):
         """Return the function's docstring."""


@@ -203,13 +203,13 @@ def check_robotstxt(url, session):
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl, session)
+    rp = get_robotstxt_parser(roboturl, session=session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)

 @memoized
-def get_robotstxt_parser(url, session):
+def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
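
This is the caller-side half of the change: passed positionally, the session was part of the memoization key, so each distinct session re-fetched and re-parsed the same robots.txt; passed as a keyword, the parser is cached on the URL alone. A sketch of the resulting behavior, assuming session is a requests.Session (the diff doesn't show the session type):

import requests

s1 = requests.Session()
s2 = requests.Session()
# The cache key is (roboturl,); the session kwarg is forwarded to
# urlopen() but never hashed, so the second call is a cache hit.
rp1 = get_robotstxt_parser("http://example.com/robots.txt", session=s1)  # network fetch
rp2 = get_robotstxt_parser("http://example.com/robots.txt", session=s2)  # cached RobotFileParser
assert rp1 is rp2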