Only cache robots.txt URL on memoize.
parent a8f0a4f2c8
commit f16e860f1e
2 changed files with 5 additions and 5 deletions
@@ -11,18 +11,18 @@ class memoized (object):
         self.func = func
         self.cache = {}
 
-    def __call__(self, *args):
+    def __call__(self, *args, **kwargs):
         """Lookup and return cached result if found. Else call stored
         function with given arguments."""
         try:
             return self.cache[args]
         except KeyError:
-            self.cache[args] = value = self.func(*args)
+            self.cache[args] = value = self.func(*args, **kwargs)
             return value
         except TypeError:
             # uncachable -- for instance, passing a list as an argument.
             # Better to not cache than to blow up entirely.
-            return self.func(*args)
+            return self.func(*args, **kwargs)
 
     def __repr__(self):
         """Return the function's docstring."""
@@ -203,13 +203,13 @@ def check_robotstxt(url, session):
    @raises: IOError if URL is not allowed
    """
    roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl, session)
+    rp = get_robotstxt_parser(roboturl, session=session)
    if not rp.can_fetch(UserAgent, url):
        raise IOError("%s is disallowed by robots.txt" % url)
 
 
 @memoized
-def get_robotstxt_parser(url, session):
+def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = robotparser.RobotFileParser()
    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
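
Note on the effect of this change: the memoized cache is keyed on positional arguments only (the lookup is self.cache[args]), so moving session into a keyword argument leaves the robots.txt URL as the sole cache key. The following is a minimal sketch of that behaviour, using a trimmed copy of the memoized class from the diff and a hypothetical fetch_parser standing in for get_robotstxt_parser; it is illustrative only, not the project's actual code.

    class memoized(object):
        """Cache a function's return value per set of positional arguments."""

        def __init__(self, func):
            self.func = func
            self.cache = {}

        def __call__(self, *args, **kwargs):
            # The cache key is the tuple of positional args only; keyword
            # args are forwarded to the function but never part of the key.
            try:
                return self.cache[args]
            except KeyError:
                self.cache[args] = value = self.func(*args, **kwargs)
                return value
            except TypeError:
                # Uncachable arguments (e.g. a list): call through uncached.
                return self.func(*args, **kwargs)


    @memoized
    def fetch_parser(url, session=None):
        # Hypothetical stand-in for get_robotstxt_parser.
        print("downloading %s" % url)
        return "parser for %s" % url


    fetch_parser("https://example.com/robots.txt", session="session A")  # downloads
    fetch_parser("https://example.com/robots.txt", session="session B")  # cache hit: keyed on URL alone

Because the second call differs only in the session keyword, it hits the cache instead of re-fetching, which is the behaviour the commit title describes.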