Ensure a Unicode description for Keenspot comics.
This commit is contained in:
parent
a9117b3bc9
commit
d9cd8f3fd6
2 changed files with 7 additions and 7 deletions
|
@@ -22,7 +22,7 @@ url_matcher = re.compile(
|
||||||
r"(?:<b>)?([^<]+)(?:</b>)?</a>"
|
r"(?:<b>)?([^<]+)(?:</b>)?</a>"
|
||||||
)
|
)
|
||||||
descurl_matcher = re.compile(r"(desc/[^']+\.html)")
|
descurl_matcher = re.compile(r"(desc/[^']+\.html)")
|
||||||
desc_matcher = re.compile(r'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
|
desc_matcher = re.compile(ur'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
exclude_comics = [
|
exclude_comics = [
|
||||||
|
@@ -89,10 +89,10 @@ def get_description(url, session):
|
||||||
data, baseUrl = getPageContent(url, session)
|
data, baseUrl = getPageContent(url, session)
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
print("ERROR:", msg, file=sys.stderr)
|
||||||
return ""
|
return u""
|
||||||
mo = desc_matcher.search(data)
|
mo = desc_matcher.search(data)
|
||||||
if not mo:
|
if not mo:
|
||||||
print(data)
|
print("ERROR:", repr(data))
|
||||||
return format_description(mo.group(1))
|
return format_description(mo.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -14,7 +14,7 @@ def contains_case_insensitive(adict, akey):
|
||||||
_tagre = re.compile(r"<.+?>")
|
_tagre = re.compile(r"<.+?>")
|
||||||
def remove_html_tags(text):
|
def remove_html_tags(text):
|
||||||
"""Remove all HTML tags from text."""
|
"""Remove all HTML tags from text."""
|
||||||
return _tagre.sub("", text)
|
return _tagre.sub(u"", text)
|
||||||
|
|
||||||
|
|
||||||
def capfirst(text):
|
def capfirst(text):
|
||||||
|
@@ -24,12 +24,12 @@ def capfirst(text):
|
||||||
return text[0].upper() + text[1:]
|
return text[0].upper() + text[1:]
|
||||||
|
|
||||||
|
|
||||||
_ws = re.compile(r"\s+")
|
_ws = re.compile(ur"\s+")
|
||||||
def compact_whitespace(text):
|
def compact_whitespace(text):
|
||||||
"""Compact all subsequent whitespace to a single space."""
|
"""Compact all subsequent whitespace to a single space."""
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
return _ws.sub(" ", text)
|
return _ws.sub(u" ", text)
|
||||||
|
|
||||||
|
|
||||||
def save_result(res, json_file):
|
def save_result(res, json_file):
|
||||||
|
@@ -52,7 +52,7 @@ def truncate_name(text):
|
||||||
def format_name(text):
|
def format_name(text):
|
||||||
"""Format a comic name."""
|
"""Format a comic name."""
|
||||||
name = unescape(text)
|
name = unescape(text)
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
|
||||||
name = capfirst(name)
|
name = capfirst(name)
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue