Revert robots.txt handling.

This brings us back to only honouring robots.txt on page downloads, not on image downloads. Rationale: Dosage is not a "robot" in the classical sense. It's not designed to spider huge numbers of websites in search of content to index; it's only intended to help users keep a personal archive of the comics they are interested in. We try very hard to never download any image twice. This fixes #24. (Precedent for this rationale: Google Feedfetcher: https://support.google.com/webmasters/answer/178852?hl=en#robots)
2015-07-17 20:46:56 +02:00 · 2015-07-17 20:46:56 +02:00 · 68d4dd463a
commit 68d4dd463a
parent d88b97573d
2 changed files with 0 additions and 5 deletions
--- a/dosagelib/plugins/gocomics.py
+++ b/dosagelib/plugins/gocomics.py
@ -17,10 +17,6 @@ _prevSearch = compile(tagre("a", "href", r'(/[^"]+/\d+/\d+/\d+)', after="prev"))
 _nextSearch = compile(tagre("a", "href", r'(/[^"]+/\d+/\d+/\d+)', after="next"))
 def add(name, shortname):
    # Unfortunately, the whole http://assets.amuniversal.com/ is blocked by
    # robots.txt, so we disable GoComics for now...
    return
    baseUrl = 'http://www.gocomics.com'
    url = baseUrl + shortname
    classname = 'GoComics_%s' % name
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@ -205,7 +205,6 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
    """Get response object for given image URL."""
    check_robotstxt(url, session)
    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes, stream=True)