Fix scripts

Bastian Kleineidam 2013-02-12 21:53:57 +01:00
parent 49ddcecb72
commit ba9ece047d
6 changed files with 34 additions and 21 deletions
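All six scraper scripts get the same fix: handle_url() now takes a requests session, get_results() creates a single requests.Session() up front, and every getPageContent() call receives it, so all page fetches in one run share that session. A minimal sketch of the pattern, with session.get() standing in for dosagelib's getPageContent() helper (the fetch line is an illustration, not the dosage code):

    import sys
    import requests

    def handle_url(url, session, res):
        """Parse one search result page."""
        print("Parsing", url, file=sys.stderr)
        try:
            # dosage wraps this fetch in getPageContent(url, session)
            data = session.get(url).text
        except requests.RequestException as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        # ... fill res from the page data ...

    def get_results():
        """Parse all search result pages."""
        res = {}
        # one session shared by every fetch: keep-alive connections and
        # cookies are reused across all result pages
        session = requests.Session()
        handle_url('http://www.arcamax.com/comics', session, res)
        return res

Reusing one Session avoids opening a fresh HTTP connection for each of the hundreds of search result pages some of these scripts download.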

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<stron
 exclude_comics = [
 ]
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -45,7 +46,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)

View file

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)

View file

@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)