Code cleanup.

This commit is contained in:
Bastian Kleineidam 2012-12-19 20:42:53 +01:00
parent 389ae838b8
commit afa93b498b
7 changed files with 39 additions and 79 deletions

View file

@ -7,10 +7,9 @@ from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from scriptutil import contains_case_insensitive, capfirst
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -37,30 +36,22 @@ def handle_url(url, res):
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = url
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
handle_url('http://www.creators.com/comics/cat-seeall.html', res)
save_result(res)
save_result(res, json_file)
def print_results(args):
"""Print comics."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, url in sorted(comics.items()):
for name, url in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
print("add(%r, %r)" % (str(name), str(url)))

View file

@ -7,10 +7,9 @@ from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
from scriptutil import contains_case_insensitive, capfirst
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -167,7 +166,7 @@ def handle_url(url, url_matcher, num_matcher, res):
name = capfirst(asciify(path))
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
if name in exclude_comics:
continue
@ -181,12 +180,6 @@ def handle_url(url, url_matcher, num_matcher, res):
res[name] = (path, num)
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
@ -200,14 +193,12 @@ def get_results():
for i in range(1, result_pages + 1):
print(i, file=sys.stderr, end=" ")
handle_url(base % i, href, num, res)
save_result(res)
save_result(res, json_file)
def print_results(min_strips):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, entry in sorted(comics.items()):
for name, entry in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
path, num = entry

View file

@ -7,11 +7,10 @@ from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -59,17 +58,11 @@ def handle_url(url, res):
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = shortname
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
@ -77,7 +70,7 @@ def get_results():
handle_url('http://www.gocomics.com/features', res)
handle_url('http://www.gocomics.com/explore/editorial_list', res)
handle_url('http://www.gocomics.com/explore/sherpa_list', res)
save_result(res)
save_result(res, json_file)
def has_creators_comic(name):
@ -91,9 +84,7 @@ def has_creators_comic(name):
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, shortname in sorted(comics.items()):
for name, shortname in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
if has_creators_comic(name):

View file

@ -7,11 +7,10 @@ from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -379,7 +378,7 @@ def handle_url(url, res):
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
# find out how many images this comic has
end = match.end()
@ -391,12 +390,6 @@ def handle_url(url, res):
res[name] = (url_overrides.get(name, url), num)
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
@ -404,7 +397,7 @@ def get_results():
base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
handle_url(base % c, res)
save_result(res)
save_result(res, json_file)
def has_comic(name):
@ -420,9 +413,7 @@ def has_comic(name):
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
min_comics = int(args[0])
with open(json_file, "rb") as f:
comics = json.load(f)
for name, entry in sorted(comics.items()):
for name, entry in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
url, num = entry

View file

@ -1,8 +1,9 @@
# Copyright (C) 2012 Bastian Kleineidam
import re
import json
def contains_case_insensitive(adict, akey):
"""Check if key is in adict. The search is case insensitive."""
for key in adict:
if key.lower() == akey.lower():
return True
@ -11,6 +12,7 @@ def contains_case_insensitive(adict, akey):
_tagre = re.compile(r"<.+?>")
def remove_html_tags(text):
"""Remove all HTML tags from text."""
return _tagre.sub("", text)
@ -23,6 +25,18 @@ def capfirst(text):
_ws = re.compile(r"\s+")
def compact_whitespace(text):
"""Compact all subsequent whitespace to a single space."""
if not text:
return text
return _ws.sub(" ", text)
def save_result(res, json_file):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def load_result(json_file):
with open(json_file, "rb") as f:
return json.load(f)

View file

@ -8,11 +8,10 @@ import re
import sys
import os
import urlparse
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace
from scriptutil import contains_case_insensitive, remove_html_tags, capfirst, compact_whitespace, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -231,7 +230,7 @@ def handle_url(url, res):
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
# find out how many images this comic has
end = match.end()
@ -270,12 +269,6 @@ def handle_url(url, res):
]
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
@ -287,7 +280,7 @@ def get_results():
for i in range(0, result_pages):
print(i+1, file=sys.stderr, end=" ")
handle_url(base % (i*12), res)
save_result(res)
save_result(res, json_file)
def has_comic(name):
@ -302,9 +295,7 @@ def has_comic(name):
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
min_comics = int(args[0])
with open(json_file, "rb") as f:
comics = json.load(f)
for name, entry in sorted(comics.items()):
for name, entry in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
url, num, desc, adult, bounce = entry

View file

@ -7,11 +7,10 @@ from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result
json_file = __file__.replace(".py", ".json")
@ -55,23 +54,17 @@ def handle_url(url, res):
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
print("INFO: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = shortname
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
handle_url('http://www.universaluclick.com/comics/list', res)
save_result(res)
save_result(res, json_file)
def has_comic(name):
@ -86,9 +79,7 @@ def has_comic(name):
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, shortname in sorted(comics.items()):
for name, shortname in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
if has_comic(name):