Only cache robots.txt URL on memoize.
parent a8f0a4f2c8
commit f16e860f1e
2 changed files with 5 additions and 5 deletions
@@ -11,18 +11,18 @@ class memoized (object):
         self.func = func
         self.cache = {}
 
-    def __call__(self, *args):
+    def __call__(self, *args, **kwargs):
         """Lookup and return cached result if found. Else call stored
         function with given arguments."""
         try:
             return self.cache[args]
         except KeyError:
-            self.cache[args] = value = self.func(*args)
+            self.cache[args] = value = self.func(*args, **kwargs)
             return value
         except TypeError:
             # uncachable -- for instance, passing a list as an argument.
             # Better to not cache than to blow up entirely.
-            return self.func(*args)
+            return self.func(*args, **kwargs)
 
     def __repr__(self):
         """Return the function's docstring."""
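For reference, a minimal sketch of the decorator after this change, mirroring the hunk above rather than quoting the project's file verbatim: positional arguments form the cache key, while keyword arguments are forwarded to the wrapped function but deliberately left out of the key.

class memoized(object):
    """Cache a function's result keyed on its positional arguments only."""

    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args, **kwargs):
        try:
            # only *args form the key; **kwargs are excluded from caching
            return self.cache[args]
        except KeyError:
            self.cache[args] = value = self.func(*args, **kwargs)
            return value
        except TypeError:
            # uncachable arguments (e.g. a list) -- call through without caching
            return self.func(*args, **kwargs)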
@@ -203,13 +203,13 @@ def check_robotstxt(url, session):
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl, session)
+    rp = get_robotstxt_parser(roboturl, session=session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)
 
 
 @memoized
-def get_robotstxt_parser(url, session):
+def get_robotstxt_parser(url, session=None):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
     req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
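A hedged usage sketch of the call-site effect, reusing the memoized sketch above; the get_robotstxt_parser stub below is a counting stand-in, not the project's implementation. Because session is now passed by keyword, the cache key is just (url,), so a robots.txt parser is built at most once per URL no matter which session each caller supplies.

fetch_count = 0

@memoized  # the decorator sketched above
def get_robotstxt_parser(url, session=None):
    # counting stub: pretend to download and parse robots.txt
    global fetch_count
    fetch_count += 1
    return "parser for %s" % url

get_robotstxt_parser("http://example.com/robots.txt", session="session A")
get_robotstxt_parser("http://example.com/robots.txt", session="session B")
assert fetch_count == 1  # second call hit the cache, keyed by URL alone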