Allow setting a crawl delay in update scripts.

This commit is contained in:
Tobias Gruetzmacher 2017-02-12 22:36:06 +01:00
parent 83187b0554
commit 0d6f50217c
2 changed files with 9 additions and 1 deletion

View file

@@ -8,6 +8,7 @@ from __future__ import absolute_import, division, print_function
import os
import re
import sys
import time
import json
import codecs
@@ -39,6 +40,7 @@ class ComicListUpdater(object):
def __init__(self, name):
    """Prepare updater state for the script *name*.

    Sets up the target JSON file name, a shared HTTP session and the
    crawl delay (in seconds) applied after each page fetch.
    """
    # One persistent session so connections are reused across fetches.
    self.session = requests.Session()
    # Seconds to pause between page fetches; 0 disables throttling.
    # Subclasses may raise this to be polite to slow servers.
    self.sleep = 0
    # Data file is named after the script: "foo.py" -> "foo.json".
    self.json = name.replace(".py", ".json")
def get_url(self, url, expand=True):
"""Get an HTML page and parse it with LXML."""
@@ -47,6 +49,8 @@ class ComicListUpdater(object):
data = html.document_fromstring(get_page(url, self.session).text)
if expand:
data.make_links_absolute(url)
if self.sleep > 0:
time.sleep(self.sleep)
return data
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)

View file

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
# Copyright (C) 2015-2017 Tobias Gruetzmacher
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
@@ -115,6 +115,10 @@ class SmackJeevesUpdater(ComicListUpdater):
"Razor",
)
def __init__(self, name):
    """Initialize the updater and enable request throttling.

    smackjeeves.com is fetched with a 2-second pause between pages
    (see ComicListUpdater.sleep) to avoid hammering the server.
    """
    super(SmackJeevesUpdater, self).__init__(name)
    # Override the base-class default of 0: wait 2 seconds per fetch.
    self.sleep = 2
def handle_url(self, url):
"""Parse one search result page."""
data = self.get_url(url)