Fix scripts

Bastian Kleineidam 2013-02-12 21:53:57 +01:00
parent 49ddcecb72
commit ba9ece047d
6 changed files with 34 additions and 21 deletions
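All six scripts get the same treatment: get_results() now creates a single requests.Session and threads it through handle_url() into getPageContent(), so every fetch in a run shares one HTTP session instead of opening a fresh connection per request. Below is a minimal, self-contained sketch of that pattern, assuming only the requests library; session.get() stands in for dosagelib's getPageContent(url, session), and the result handling is a placeholder.

from __future__ import print_function
import sys
import requests

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        # session.get() stands in here for getPageContent(url, session);
        # either way the fetch goes through the shared session.
        data = session.get(url).text
    except requests.RequestException as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    res[url] = len(data)  # placeholder for the real comic-name parsing

def get_results():
    """Parse all search result pages."""
    res = {}
    # One Session per run: requests pools connections per host, so
    # repeated fetches reuse a kept-alive connection and share cookies.
    session = requests.Session()
    handle_url('http://www.arcamax.com/comics', session, res)
    return res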

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)
 
 

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<stron
 exclude_comics = [
 ]
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -45,7 +46,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)
 
 

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)
 
 

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)
 
 

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)
 
 

View file

@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)
 
 
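The shared session matters most in paginated crawls like this last one, where all 286 result pages, plus one extra page per comic, live on the same host. A rough illustration of the difference, reusing the search URL from the script above; the 2013-era endpoint may no longer respond, so treat it purely as a sketch:

import requests

base = ("http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics"
        "&search_mode=webcomics&comic_title=&special=all&last_update=3"
        "&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d")

# Without a session: each call opens and tears down its own connection.
for i in range(3):
    requests.get(base % (i * 12))

# With a session: one pooled keep-alive connection serves every request
# to the host, and any cookies the site sets persist across pages.
session = requests.Session()
for i in range(3):
    session.get(base % (i * 12))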