Code cleanup.

commit afa93b498b
parent 389ae838b8

7 changed files with 39 additions and 79 deletions
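
The cleanup is the same across the six scraper scripts: each previously carried its own copy of save_result() and an inline json.load() in print_results(). Both move into scriptutil.py as shared save_result(res, json_file) and load_result(json_file) helpers, the now-unneeded import json lines are dropped from the scrapers, and the duplicate-name message is downgraded from WARN to INFO.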
scripts/creators.py

@@ -7,10 +7,9 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -37,30 +36,22 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = url


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
     handle_url('http://www.creators.com/comics/cat-seeall.html', res)
-    save_result(res)
+    save_result(res, json_file)


 def print_results(args):
     """Print comics."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, url in sorted(comics.items()):
+    for name, url in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         print("add(%r, %r)" % (str(name), str(url)))

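Every scraper script now follows the same shape as creators.py above: get_results() fills a dict and hands it to the shared save_result(), and print_results() reads it back through load_result(). A minimal sketch of that flow; the stubbed entry stands in for the real handle_url() scraping, and exclude_comics is left empty:

from __future__ import print_function
import sys

sys.path.append("..")  # so scriptutil is importable, as in the real scripts
from scriptutil import save_result, load_result

json_file = __file__.replace(".py", ".json")
exclude_comics = []  # per-script list of names to skip


def get_results():
    """Collect comics and persist them to the JSON file."""
    res = {}
    # the real script calls handle_url() here; stub one entry instead
    res["ExampleComic"] = "http://www.creators.com/comics/example.html"
    save_result(res, json_file)


def print_results(args):
    """Print an add() line for every collected comic."""
    for name, url in sorted(load_result(json_file).items()):
        if name in exclude_comics:
            continue
        print("add(%r, %r)" % (str(name), str(url)))


if __name__ == "__main__":
    get_results()
    print_results([])
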
scripts/drunkduck.py

@@ -7,10 +7,9 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -167,7 +166,7 @@ def handle_url(url, url_matcher, num_matcher, res):
         name = capfirst(asciify(path))
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         if name in exclude_comics:
             continue

@@ -181,12 +180,6 @@ def handle_url(url, url_matcher, num_matcher, res):
         res[name] = (path, num)


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="

@@ -200,14 +193,12 @@ def get_results():
     for i in range(1, result_pages + 1):
         print(i, file=sys.stderr, end=" ")
         handle_url(base % i, href, num, res)
-    save_result(res)
+    save_result(res, json_file)


 def print_results(min_strips):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         path, num = entry

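One subtlety now that persistence goes through JSON: drunkduck stores tuple values (res[name] = (path, num)), but JSON has no tuple type, so save_result() serializes them as arrays and load_result() hands back lists. The later path, num = entry unpacking works on either, as this quick check with hypothetical values shows:

import json

# tuples are serialized as JSON arrays and come back as lists
data = {"SomeComic": ("somecomic", 42)}  # hypothetical (path, num) entry
loaded = json.loads(json.dumps(data, sort_keys=True))
path, num = loaded["SomeComic"]  # sequence unpacking works for lists too
assert (path, num) == ("somecomic", 42)
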
scripts/gocomics.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -59,17 +58,11 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = shortname


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}

@@ -77,7 +70,7 @@ def get_results():
     handle_url('http://www.gocomics.com/features', res)
     handle_url('http://www.gocomics.com/explore/editorial_list', res)
     handle_url('http://www.gocomics.com/explore/sherpa_list', res)
-    save_result(res)
+    save_result(res, json_file)


 def has_creators_comic(name):

@@ -91,9 +84,7 @@ def has_creators_comic(name):

 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, shortname in sorted(comics.items()):
+    for name, shortname in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         if has_creators_comic(name):

scripts/comicgenesis.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -379,7 +378,7 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         # find out how many images this comic has
         end = match.end()

@@ -391,12 +390,6 @@ def handle_url(url, res):
         res[name] = (url_overrides.get(name, url), num)


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}

@@ -404,7 +397,7 @@ def get_results():
     base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
     for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
         handle_url(base % c, res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -420,9 +413,7 @@ def has_comic(name):
 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
     min_comics = int(args[0])
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         url, num = entry

scripts/scriptutil.py

@@ -1,8 +1,9 @@
 # Copyright (C) 2012 Bastian Kleineidam
 import re
+import json

 def contains_case_insensitive(adict, akey):
     """Check if key is in adict. The search is case insensitive."""
     for key in adict:
         if key.lower() == akey.lower():
             return True

@@ -11,6 +12,7 @@ def contains_case_insensitive(adict, akey):

 _tagre = re.compile(r"<.+?>")
 def remove_html_tags(text):
     """Remove all HTML tags from text."""
     return _tagre.sub("", text)
+

@@ -23,6 +25,18 @@ def capfirst(text):

 _ws = re.compile(r"\s+")
 def compact_whitespace(text):
     """Compact all subsequent whitespace to a single space."""
     if not text:
         return text
     return _ws.sub(" ", text)
+
+
+def save_result(res, json_file):
+    """Save result to file."""
+    with open(json_file, 'wb') as f:
+        json.dump(res, f, sort_keys=True)
+
+
+def load_result(json_file):
+    with open(json_file, "rb") as f:
+        return json.load(f)

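The two helpers added above are the whole shared persistence layer. Repeated standalone here with a round trip, so the behavior can be checked without the rest of the repo; the binary 'wb'/'rb' modes assume Python 2, which is what these scripts target (note the urlparse import and __future__ print_function elsewhere in the diff):

import json


def save_result(res, json_file):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def load_result(json_file):
    with open(json_file, "rb") as f:
        return json.load(f)


# round trip, mirroring what get_results()/print_results() now do
res = {"FooComic": "http://example.com/foo"}  # hypothetical entry
save_result(res, "demo.json")
assert load_result("demo.json") == res
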
scripts/smackjeeves.py

@@ -8,11 +8,10 @@ import re
 import sys
 import os
 import urlparse
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace
+from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -231,7 +230,7 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         # find out how many images this comic has
         end = match.end()

@@ -270,12 +269,6 @@ def handle_url(url, res):
 ]


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"

@@ -287,7 +280,7 @@ def get_results():
     for i in range(0, result_pages):
         print(i+1, file=sys.stderr, end=" ")
         handle_url(base % (i*12), res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -302,9 +295,7 @@ def has_comic(name):
 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
     min_comics = int(args[0])
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, entry in sorted(comics.items()):
+    for name, entry in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         url, num, desc, adult, bounce = entry

scripts/universal.py

@@ -7,11 +7,10 @@ from __future__ import print_function
 import re
 import sys
 import os
-import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, capfirst
+from scriptutil import contains_case_insensitive, capfirst, save_result, load_result

 json_file = __file__.replace(".py", ".json")

@@ -55,23 +54,17 @@ def handle_url(url, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("WARN: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", name, file=sys.stderr)
             continue
         res[name] = shortname


-def save_result(res):
-    """Save result to file."""
-    with open(json_file, 'wb') as f:
-        json.dump(res, f, sort_keys=True)
-
-
 def get_results():
     """Parse all search result pages."""
     # store info in a dictionary {name -> shortname}
     res = {}
     handle_url('http://www.universaluclick.com/comics/list', res)
-    save_result(res)
+    save_result(res, json_file)


 def has_comic(name):

@@ -86,9 +79,7 @@ def has_comic(name):

 def print_results(args):
     """Print all comics that have at least the given number of minimum comic strips."""
-    with open(json_file, "rb") as f:
-        comics = json.load(f)
-    for name, shortname in sorted(comics.items()):
+    for name, shortname in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         if has_comic(name):

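A closing design note: because the new helpers open their files in binary mode, they are Python 2 specific; under Python 3, json.dump() writes str and wants a text-mode handle. A hypothetical Python 3 port of the same helpers would differ only in the open() modes:

import json


def save_result(res, json_file):
    """Save result to file (text mode, as Python 3's json expects)."""
    with open(json_file, 'w') as f:
        json.dump(res, f, sort_keys=True)


def load_result(json_file):
    with open(json_file) as f:
        return json.load(f)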