From ba9ece047d8af522d6576a0e0f7ec8809bd87b53 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Tue, 12 Feb 2013 21:53:57 +0100
Subject: [PATCH] Fix scripts

---
 scripts/arcamax.py     |  8 +++++---
 scripts/creators.py    |  8 +++++---
 scripts/drunkduck.py   |  8 +++++---
 scripts/gocomics.py    | 12 +++++++-----
 scripts/keenspot.py    |  8 +++++---
 scripts/smackjeeves.py | 11 +++++++----
 6 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/scripts/arcamax.py b/scripts/arcamax.py
index a80c6391e..91d54e578 100755
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)
diff --git a/scripts/creators.py b/scripts/creators.py
index 878628f73..c234f4309 100755
--- a/scripts/creators.py
+++ b/scripts/creators.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)
diff --git a/scripts/drunkduck.py b/scripts/drunkduck.py
index 0034592d8..040858888 100755
--- a/scripts/drunkduck.py
+++ b/scripts/drunkduck.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)
diff --git a/scripts/gocomics.py b/scripts/gocomics.py
index 59ec04401..73bde7e9f 100755
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 272b64af4..7ca4a3c8a 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)
diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index 2604fe30d..6d0a9de42 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"(.+?)", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)