# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

import os
import re
import sys
import json
import codecs

# os.replace() (an overwriting rename) only exists on Python 3.3+; fall
# back to plain os.rename() on older interpreters.
try:
    from os import replace as rename
except ImportError:
    from os import rename

import requests
from lxml import html

# Make dosagelib importable when running this script from a source checkout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa

from dosagelib.util import unescape, get_page
from dosagelib import scraper


def first_lower(x):
    """Sort key: the lower-cased first element of a (name, entry) tuple."""
    return x[0].lower()


class ComicListUpdater(object):
    dup_templates = ()
    excluded_comics = ()

    START = "# START AUTOUPDATE"
    END = "# END AUTOUPDATE"

    def __init__(self, name):
        self.json = name.replace(".py", ".json")
        self.session = requests.Session()

    def get_url(self, url, expand=True):
        """Get an HTML page and parse it with LXML."""
        print("Parsing", url, file=sys.stderr)
        try:
            data = html.document_fromstring(get_page(url, self.session).text)
            if expand:
                data.make_links_absolute(url)
            return data
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            raise

    def should_skip(self, name):
        if contains_case_insensitive(self.res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            return True
        return False

    def get_results(self):
        """Collect comics and save the dictionary in a JSON file."""
        self.res = {}
        self.collect_results()

        if not self.res:
            print("ERROR:", "did not match any comics", file=sys.stderr)
            return

        with codecs.open(self.json, 'wb', 'utf-8') as f:
            json.dump(self.res, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    def add_comic(self, name, data, count=None):
        """Add a collected comic with an optional number of strips."""
        name = format_name(name)
        if not self.should_skip(name):
            self.res[name] = {'count': count, 'data': data}

    def collect_results(self):
        """Gather comics into self.res; implemented by subclasses."""
        raise NotImplementedError

    def print_results(self, args):
        """Write out all comics that have at least the given minimum number
        of strips."""
        min_comics, filename = args
        min_comics = int(min_comics)
        oldf = codecs.open(filename, 'r', 'utf-8')
        newf = codecs.open(filename + '.new', 'w', 'utf-8')
        with oldf, newf:
            indent = self.copy_until_start(oldf, newf)
            with codecs.open(self.json, 'rb', 'utf-8') as f:
                data = json.load(f)
            for name, entry in sorted(data.items(), key=first_lower):
                self.write_entry(newf, name, entry, min_comics, indent)
            self.copy_after_end(oldf, newf)
        rename(filename + '.new', filename)

    def copy_until_start(self, src, dest):
        """Copy src to dest up to and including the START marker and return
        the marker's indentation."""
        for line in src:
            dest.write(line)
            if line.strip().startswith(self.START):
                return line.find(self.START)
        raise RuntimeError("can't find start marker!")

    def copy_after_end(self, src, dest):
        """Skip src lines until the END marker, then copy it and the rest
        of src to dest."""
        skip = True
        for line in src:
            if line.strip().startswith(self.END):
                skip = False
            if not skip:
                dest.write(line)
        if skip:
            raise RuntimeError("can't find end marker!")
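
    # The target file passed to print_results() must contain both markers;
    # everything between them is regenerated on each run. Illustrative
    # shape (the entry format itself comes from get_entry()):
    #
    #     # START AUTOUPDATE
    #     ...generated entries...
    #     # END AUTOUPDATE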

    def write_entry(self, fp, name, entry, min_comics, indent):
        if name in self.excluded_comics:
            return
        count = entry['count']
        if count and count < min_comics:
            return
        dup = self.find_dups(name)
        fp.write(" " * indent)
        if dup is not None:
            fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
        else:
            fp.write(self.get_entry(
                truncate_name(name),
                entry['data']).replace("\n", "\n" + (" " * indent)) + "\n")

    def find_dups(self, name):
        """Check if the comic name already exists in another scraper."""
        names = [(tmpl % name).lower() for tmpl in self.dup_templates]
        if names:
            for scraperobj in scraper.get_scrapers():
                lname = scraperobj.name.lower()
                if lname in names:
                    return scraperobj.name
        return None

    def get_entry(self, name, data):
        """Return an entry for the module generator."""
        raise NotImplementedError

    def run(self):
        if len(sys.argv) > 1:
            self.print_results(sys.argv[1:])
        else:
            self.get_results()
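

# Illustrative sketch, not part of the original module: a minimal
# ComicListUpdater subclass showing how collect_results() and get_entry()
# plug together. The host name, URL and XPath below are made-up
# placeholders, not a real comic host.
class _ExampleHostUpdater(ComicListUpdater):
    def collect_results(self):
        # Fetch an (assumed) overview page and register every comic link.
        data = self.get_url('https://comics.example.com/all')
        for link in data.xpath('//a[@class="comic"]'):
            self.add_comic(link.text, link.attrib['href'])

    def get_entry(self, name, data):
        # Emit one generated source line per comic for the target module.
        return u"cls('%s', '%s')," % (name, data)

# Typical driver usage would be _ExampleHostUpdater(__file__).run(): with no
# command line arguments, run() scrapes into "<script>.json", while
# "<min_comics> <target.py>" splices the generated entries between the
# AUTOUPDATE markers in target.py.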


def contains_case_insensitive(adict, akey):
    """Check if the key is in the dict. The search is case insensitive."""
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def capfirst(text):
    """Uppercase the first character of text."""
    if not text:
        return text
    return text[0].upper() + text[1:]


def save_result(res, json_file):
    """Save the result dictionary to a JSON file."""
    with codecs.open(json_file, 'wb', 'utf-8') as f:
        json.dump(res, f, sort_keys=True, indent=2, separators=(',', ': '))


def load_result(json_file):
    """Load the contents of a JSON file."""
    with codecs.open(json_file, 'rb', 'utf-8') as f:
        return json.load(f)
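

def _example_round_trip():
    """Sketch, not part of the original module: save_result() and
    load_result() are symmetric. 'example.json' is a made-up file name."""
    save_result({'FooComic': {'count': 12, 'data': 'url'}}, 'example.json')
    assert load_result('example.json')['FooComic']['count'] == 12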


def truncate_name(text):
    """Ensure the comic name does not exceed 50 characters."""
    return text[:50]


def asciify(name):
    """Remove all characters except ASCII letters, digits and underscores."""
    return re.sub("[^0-9a-zA-Z_]", "", name)


def format_name(text):
    """Format a comic name into a CamelCase ASCII identifier."""
    name = unescape(text)
    name = "".join(capfirst(x) for x in name.split(" "))
    name = asciify(name.replace(u'&', u'And').replace(u'@', u'At'))
    return name
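

def _example_format_name():
    """Sketch, not part of the original module, showing the format_name()
    pipeline: HTML-unescape, CamelCase-join the words, spell out '&'/'@',
    then drop the remaining non-identifier characters."""
    assert format_name("the quick & dirty comic!") == "TheQuickAndDirtyComic"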