Fix scripts
parent 49ddcecb72
commit ba9ece047d
6 changed files with 34 additions and 21 deletions
File 1 of 6:

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -21,11 +22,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -49,7 +50,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.arcamax.com/comics', res)
+    session = requests.Session()
+    handle_url('http://www.arcamax.com/comics', session, res)
     save_result(res, json_file)
 
 
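All six scripts get the same treatment: get_results() now creates a single requests.Session() and passes it through handle_url() into getPageContent(), so every page in a run is fetched through one shared session. A minimal sketch of the pattern, with a hypothetical stand-in for getPageContent() (the real function is imported from dosagelib.util; only the session-threading shown here is taken from the diff):

from __future__ import print_function
import sys
import requests


def getPageContent(url, session):
    """Hypothetical stand-in for dosagelib.util.getPageContent: fetch the page
    through the shared session and return (content, base URL)."""
    response = session.get(url)
    response.raise_for_status()  # requests exceptions derive from IOError
    return response.text, response.url


def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    # ... match comic names in `data` and store them in the `res` dict ...


def get_results():
    """Parse all search result pages."""
    res = {}
    session = requests.Session()  # one session, reused for every request
    handle_url('http://www.arcamax.com/comics', session, res)
    return res


if __name__ == '__main__':
    print(get_results())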
File 2 of 6:

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -19,11 +20,11 @@ url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<stron
 exclude_comics = [
 ]
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -45,7 +46,8 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
+    session = requests.Session()
+    handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
     save_result(res, json_file)
 
 
File 3 of 6:

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -154,10 +155,10 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, url_matcher, num_matcher, res):
+def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -191,9 +192,10 @@ def get_results():
     # a search for an empty string returned 825 result pages
     result_pages = 825
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
+    session = requests.Session()
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
-        handle_url(base % i, href, num, res)
+        handle_url(base % i, session, href, num, res)
     save_result(res, json_file)
 
 
File 4 of 6:

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
@@ -41,11 +42,11 @@ exclude_comics = [
 ]
 
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -67,9 +68,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
-    handle_url('http://www.gocomics.com/features', res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
+    session = requests.Session()
+    handle_url('http://www.gocomics.com/features', session, res)
+    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
+    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
     save_result(res, json_file)
 
 
File 5 of 6:

@@ -7,6 +7,7 @@ from __future__ import print_function
 import re
 import sys
 import os
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
@@ -361,11 +362,11 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -394,9 +395,10 @@ def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
+    session = requests.Session()
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
-        handle_url(base % c, res)
+        handle_url(base % c, session, res)
     save_result(res, json_file)
 
 
File 6 of 6:

@@ -8,6 +8,7 @@ import re
 import sys
 import os
 import urlparse
+import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
@@ -53,6 +54,7 @@ exclude_comics = [
     "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
+    "SpaceSchool", # does not follow standard layout
     "TheAfterSubtract", # does not follow standard layout
     "THEVOIDWEBCOMIC", # does not follow standard layout
     "ThreadCrashers", # has no previous comic link
@@ -212,11 +214,11 @@ num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
-def handle_url(url, res):
+def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data, baseUrl = getPageContent(url)
+        data, baseUrl = getPageContent(url, session)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -242,7 +244,7 @@ def handle_url(url, res):
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2, baseUrl2 = getPageContent(page_url)
+            data2, baseUrl2 = getPageContent(page_url, session)
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -272,6 +274,7 @@ def handle_url(url, res):
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
+    session = requests.Session()
     # store info in a dictionary {name -> url, number of comics, description, adult flag}
     res = {}
     # a search for an empty string returned 286 result pages
@@ -279,7 +282,7 @@ def get_results():
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), res)
+        handle_url(base % (i*12), session, res)
     save_result(res, json_file)
 
 
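Two of these scripts fetch several hundred search result pages in a row (825 pages in one, 286 in another, per the comments above), which is presumably why a shared session is worth threading through: requests.Session() pools and reuses the underlying connections instead of opening a fresh one per page, and keeps any cookies the server sets. A small illustration of the difference (the URL is just an example):

import requests

url = 'http://www.arcamax.com/comics'

# Plain module-level calls: each request builds its own throwaway session,
# so no connection or cookie state is shared between the three fetches.
for _ in range(3):
    requests.get(url)

# A shared Session: connections are kept alive and reused between requests,
# and cookies set by the server persist across the loop.
with requests.Session() as session:
    for _ in range(3):
        session.get(url)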