Allow setting a crawl delay in update scripts.
This commit is contained in:
parent
83187b0554
commit
0d6f50217c
2 changed files with 9 additions and 1 deletion
|
@@ -8,6 +8,7 @@ from __future__ import absolute_import, division, print_function
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import json
|
import json
|
||||||
import codecs
|
import codecs
|
||||||
|
|
||||||
|
@@ -39,6 +40,7 @@ class ComicListUpdater(object):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.json = name.replace(".py", ".json")
|
self.json = name.replace(".py", ".json")
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
|
self.sleep = 0
|
||||||
|
|
||||||
def get_url(self, url, expand=True):
|
def get_url(self, url, expand=True):
|
||||||
"""Get an HTML page and parse it with LXML."""
|
"""Get an HTML page and parse it with LXML."""
|
||||||
|
@@ -47,6 +49,8 @@ class ComicListUpdater(object):
|
||||||
data = html.document_fromstring(get_page(url, self.session).text)
|
data = html.document_fromstring(get_page(url, self.session).text)
|
||||||
if expand:
|
if expand:
|
||||||
data.make_links_absolute(url)
|
data.make_links_absolute(url)
|
||||||
|
if self.sleep > 0:
|
||||||
|
time.sleep(self.sleep)
|
||||||
return data
|
return data
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
print("ERROR:", msg, file=sys.stderr)
|
||||||
|
|
|
@@ -2,7 +2,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
# Copyright (C) 2015-2017 Tobias Gruetzmacher
|
||||||
"""
|
"""
|
||||||
Script to get a list of smackjeeves.com comics and save the info in a JSON file
|
Script to get a list of smackjeeves.com comics and save the info in a JSON file
|
||||||
for further processing.
|
for further processing.
|
||||||
|
@@ -115,6 +115,10 @@ class SmackJeevesUpdater(ComicListUpdater):
|
||||||
"Razor",
|
"Razor",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __init__(self, name):
|
||||||
|
super(SmackJeevesUpdater, self).__init__(name)
|
||||||
|
self.sleep = 2
|
||||||
|
|
||||||
def handle_url(self, url):
|
def handle_url(self, url):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
data = self.get_url(url)
|
data = self.get_url(url)
|
||||||
|
|
Loading…
Reference in a new issue