Ensure a unicode description for keenspot comics.

2013-04-29 20:24:54 +02:00 · 2013-04-29 20:24:54 +02:00 · d9cd8f3fd6
commit d9cd8f3fd6
parent a9117b3bc9
2 changed files with 7 additions and 7 deletions
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -22,7 +22,7 @@ url_matcher = re.compile(
  r"(?:<b>)?([^<]+)(?:</b>)?</a>"
 )
 descurl_matcher = re.compile(r"(desc/[^']+\.html)")
-desc_matcher = re.compile(r'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
+desc_matcher = re.compile(ur'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
 # names of comics to exclude
 exclude_comics = [
@@ -89,10 +89,10 @@ def get_description(url, session):
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
-        return ""
+        return u""
    mo = desc_matcher.search(data)
    if not mo:
-        print(data)
+        print("ERROR:", repr(data))
    return format_description(mo.group(1))
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -14,7 +14,7 @@ def contains_case_insensitive(adict, akey):
 _tagre = re.compile(r"<.+?>")
 def remove_html_tags(text):
    """Remove all HTML tags from text."""
-    return _tagre.sub("", text)
+    return _tagre.sub(u"", text)
 def capfirst(text):
@@ -24,12 +24,12 @@ def capfirst(text):
    return text[0].upper() + text[1:]
-_ws = re.compile(r"\s+")
+_ws = re.compile(ur"\s+")
 def compact_whitespace(text):
    """Compact all subsequent whitespace to a single space."""
    if not text:
        return text
-    return _ws.sub(" ", text)
+    return _ws.sub(u" ", text)
 def save_result(res, json_file):
@@ -52,7 +52,7 @@ def truncate_name(text):
 def format_name(text):
    """Format a comic name."""
    name = unescape(text)
-    name = asciify(name.replace('&', 'And').replace('@', 'At'))
+    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
    name = capfirst(name)
    return name