From d9cd8f3fd6dcbb7c9eef2fa19d3788759de98b93 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Mon, 29 Apr 2013 20:24:54 +0200 Subject: [PATCH] Ensure a unicode description for keenspot comics. --- scripts/keenspot.py | 6 +++--- scripts/scriptutil.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/keenspot.py b/scripts/keenspot.py index 14ed84a56..c8376b9fc 100755 --- a/scripts/keenspot.py +++ b/scripts/keenspot.py @@ -22,7 +22,7 @@ url_matcher = re.compile( r"(?:)?([^<]+)(?:)?" ) descurl_matcher = re.compile(r"(desc/[^']+\.html)") -desc_matcher = re.compile(r'
(.+)(?:)?', re.DOTALL) +desc_matcher = re.compile(ur'
(.+)(?:)?', re.DOTALL) # names of comics to exclude exclude_comics = [ @@ -89,10 +89,10 @@ def get_description(url, session): data, baseUrl = getPageContent(url, session) except IOError as msg: print("ERROR:", msg, file=sys.stderr) - return "" + return u"" mo = desc_matcher.search(data) if not mo: - print(data) + print("ERROR:", repr(data)) return format_description(mo.group(1)) diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py index 9dcfde477..00b92732d 100644 --- a/scripts/scriptutil.py +++ b/scripts/scriptutil.py @@ -14,7 +14,7 @@ def contains_case_insensitive(adict, akey): _tagre = re.compile(r"<.+?>") def remove_html_tags(text): """Remove all HTML tags from text.""" - return _tagre.sub("", text) + return _tagre.sub(u"", text) def capfirst(text): @@ -24,12 +24,12 @@ def capfirst(text): return text[0].upper() + text[1:] -_ws = re.compile(r"\s+") +_ws = re.compile(ur"\s+") def compact_whitespace(text): """Compact all subsequent whitespace to a single space.""" if not text: return text - return _ws.sub(" ", text) + return _ws.sub(u" ", text) def save_result(res, json_file): @@ -52,7 +52,7 @@ def truncate_name(text): def format_name(text): """Format a comic name.""" name = unescape(text) - name = asciify(name.replace('&', 'And').replace('@', 'At')) + name = asciify(name.replace(u'&', u'And').replace(u'@', u'At')) name = capfirst(name) return name