Ensure a unicode description for keenspot comics.

This commit is contained in:
Bastian Kleineidam 2013-04-29 20:24:54 +02:00
parent a9117b3bc9
commit d9cd8f3fd6
2 changed files with 7 additions and 7 deletions

View file

@@ -22,7 +22,7 @@ url_matcher = re.compile(
r"(?:<b>)?([^<]+)(?:</b>)?</a>"
)
descurl_matcher = re.compile(r"(desc/[^']+\.html)")
desc_matcher = re.compile(r'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
desc_matcher = re.compile(ur'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
# names of comics to exclude
exclude_comics = [
@@ -89,10 +89,10 @@ def get_description(url, session):
data, baseUrl = getPageContent(url, session)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return ""
return u""
mo = desc_matcher.search(data)
if not mo:
print(data)
print("ERROR:", repr(data))
return format_description(mo.group(1))

View file

@@ -14,7 +14,7 @@ def contains_case_insensitive(adict, akey):
_tagre = re.compile(r"<.+?>")
def remove_html_tags(text):
"""Remove all HTML tags from text."""
return _tagre.sub("", text)
return _tagre.sub(u"", text)
def capfirst(text):
@@ -24,12 +24,12 @@ def capfirst(text):
return text[0].upper() + text[1:]
_ws = re.compile(r"\s+")
_ws = re.compile(ur"\s+")
def compact_whitespace(text):
"""Compact all subsequent whitespace to a single space."""
if not text:
return text
return _ws.sub(" ", text)
return _ws.sub(u" ", text)
def save_result(res, json_file):
@@ -52,7 +52,7 @@ def truncate_name(text):
def format_name(text):
"""Format a comic name."""
name = unescape(text)
name = asciify(name.replace('&', 'And').replace('@', 'At'))
name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
name = capfirst(name)
return name