From ba9ece047d8af522d6576a0e0f7ec8809bd87b53 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Tue, 12 Feb 2013 21:53:57 +0100
Subject: [PATCH] Fix scripts

---
 scripts/arcamax.py     |  8 +++++---
 scripts/creators.py    |  8 +++++---
 scripts/drunkduck.py   |  8 +++++---
 scripts/gocomics.py    | 12 +++++++-----
 scripts/keenspot.py    |  8 +++++---
 scripts/smackjeeves.py | 11 +++++++----
 6 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/scripts/arcamax.py b/scripts/arcamax.py
index a80c6391e..91d54e578 100755
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)
diff --git a/scripts/creators.py b/scripts/creators.py
index 878628f73..c234f4309 100755
--- a/scripts/creators.py
+++ b/scripts/creators.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)
diff --git a/scripts/drunkduck.py b/scripts/drunkduck.py
index 0034592d8..040858888 100755
--- a/scripts/drunkduck.py
+++ b/scripts/drunkduck.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)
diff --git a/scripts/gocomics.py b/scripts/gocomics.py
index 59ec04401..73bde7e9f 100755
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 272b64af4..7ca4a3c8a 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)
diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index 2604fe30d..6d0a9de42 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"(.+?)", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)