Fix scripts
parent 49ddcecb72
commit ba9ece047d
6 changed files with 34 additions and 21 deletions
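Each of the six helper scripts gets the same treatment: requests is imported, get_results() creates a single requests.Session(), and that session is threaded through handle_url() into getPageContent(url, session), presumably so all search-result pages of one site reuse the same HTTP connection and cookies. A minimal sketch of the resulting pattern follows; only the signatures and the getPageContent(url, session) call are taken from the diffs below, the parsing step is elided and the URL is just one example from the changed scripts.

# Sketch of the pattern applied in every script below; the parsing step
# is elided and the body is illustrative, not the actual script code.
from __future__ import print_function
import sys
import requests
from dosagelib.util import getPageContent

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        # getPageContent now takes the shared session as second argument
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    # ... extract comic names from data into res ...

def get_results():
    """Parse all search result pages."""
    res = {}
    session = requests.Session()  # one session reused for every request
    handle_url('http://www.arcamax.com/comics', session, res)
    return res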
@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)


@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<stron
 exclude_comics = [
 ]
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -45,7 +46,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)


@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
    for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)


@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)


@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)


@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)