Allow setting a crawl delay in update scripts.

This commit is contained in:
Tobias Gruetzmacher 2017-02-12 22:36:06 +01:00
parent 83187b0554
commit 0d6f50217c
2 changed files with 9 additions and 1 deletions

View file

@@ -8,6 +8,7 @@ from __future__ import absolute_import, division, print_function
import os import os
import re import re
import sys import sys
import time
import json import json
import codecs import codecs
@@ -39,6 +40,7 @@ class ComicListUpdater(object):
def __init__(self, name): def __init__(self, name):
self.json = name.replace(".py", ".json") self.json = name.replace(".py", ".json")
self.session = requests.Session() self.session = requests.Session()
self.sleep = 0
def get_url(self, url, expand=True): def get_url(self, url, expand=True):
"""Get an HTML page and parse it with LXML.""" """Get an HTML page and parse it with LXML."""
@@ -47,6 +49,8 @@ class ComicListUpdater(object):
data = html.document_fromstring(get_page(url, self.session).text) data = html.document_fromstring(get_page(url, self.session).text)
if expand: if expand:
data.make_links_absolute(url) data.make_links_absolute(url)
if self.sleep > 0:
time.sleep(self.sleep)
return data return data
except IOError as msg: except IOError as msg:
print("ERROR:", msg, file=sys.stderr) print("ERROR:", msg, file=sys.stderr)

View file

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher # Copyright (C) 2015-2017 Tobias Gruetzmacher
""" """
Script to get a list of smackjeeves.com comics and save the info in a JSON file Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing. for further processing.
@@ -115,6 +115,10 @@ class SmackJeevesUpdater(ComicListUpdater):
"Razor", "Razor",
) )
def __init__(self, name):
super(SmackJeevesUpdater, self).__init__(name)
self.sleep = 2
def handle_url(self, url): def handle_url(self, url):
"""Parse one search result page.""" """Parse one search result page."""
data = self.get_url(url) data = self.get_url(url)