dosage/scripts/scriptutil.py

202 lines
5.9 KiB
Python
Raw Permalink Normal View History

# SPDX-License-Identifier: MIT
2016-10-28 22:21:41 +00:00
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
2014-01-05 15:50:57 +00:00
# Copyright (C) 2012-2014 Bastian Kleineidam
2022-06-05 18:23:56 +00:00
# Copyright (C) 2015-2022 Tobias Gruetzmacher
import codecs
import html
import json
import os
2016-04-12 22:52:16 +00:00
import re
import sys
import time
2016-05-22 20:55:06 +00:00
import lxml
from dosagelib.scraper import scrapers
from dosagelib.util import get_page
from dosagelib import http
def first_lower(x):
    """Return the lower-cased first element of x.

    Used as a sort key for ``(name, entry)`` pairs so that entries are
    ordered by name without regard to case.
    """
    head = x[0]
    return head.lower()
class ComicListUpdater(object):
    """Base class for scripts that regenerate an auto-updated comic list.

    Subclasses implement collect_results() (which gathers comics, normally
    through add_comic()) and get_entry() (which renders a single comic as
    source text). run() dispatches between collecting results into a JSON
    file and rewriting the marked region of a target file from that JSON.
    """

    # Templates ("...%s...") expanded with a comic name by find_dups() to
    # look for already existing scrapers.
    dup_templates: tuple[str, ...] = ()
    # Comic names that write_entry() never emits.
    excluded_comics: tuple[str, ...] = ()

    # Markers delimiting the region of the target file that is regenerated.
    START = "# START AUTOUPDATE"
    END = "# END AUTOUPDATE"

    def __init__(self, name: str):
        """Derive the JSON file name from name by replacing ".py" with
        ".json" and set up the HTTP session."""
        self.json = name.replace(".py", ".json")
        self.session = http.default_session
        # Seconds to wait after each fetched page; 0 disables the delay.
        self.sleep = 0

    def get_url(self, url: str, expand=True):
        """Get an HTML page and parse it with LXML.

        If expand is true, links in the parsed document are made absolute
        relative to url. An IOError is reported on stderr and re-raised.
        """
        # "import lxml" alone does not load the "lxml.html" submodule, so
        # import it explicitly instead of relying on another module having
        # done so already.
        import lxml.html

        print("Parsing", url, file=sys.stderr)
        try:
            pagetext = get_page(url, self.session).text
            data = lxml.html.document_fromstring(pagetext)
            if expand:
                data.make_links_absolute(url)
            if self.sleep > 0:
                time.sleep(self.sleep)
            return data
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            raise

    def should_skip(self, name: str):
        """Return True if a comic whose name differs from name at most in
        case has already been collected."""
        if contains_case_insensitive(self.res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            return True
        return False

    def get_results(self):
        """Collect comics and save dictionary in JSON file."""
        self.res = {}
        self.collect_results()

        if not self.res:
            print("ERROR:", "did not match any comics", file=sys.stderr)
            return

        with codecs.open(self.json, 'wb', 'utf-8') as f:
            json.dump(self.res, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    def add_comic(self, name: str, data, count=None):
        """Add a collected comic with a specific number of comics.

        The name is normalized with format_name() first. Returns True if
        the comic was stored, False if it was skipped as a duplicate.
        """
        name = format_name(name)
        if not self.should_skip(name):
            self.res[name] = {'count': count, 'data': data}
            return True
        return False

    def collect_results(self):
        """Collect all comics; must be implemented by subclasses."""
        raise NotImplementedError

    def print_results(self, args):
        """Print all comics that have at least the given number of minimum
        comic strips.

        args is a two-item sequence (min_comics, filename). The region
        between the START and END markers of filename is regenerated from
        the JSON file; output goes to filename + '.new', which replaces
        filename once it has been written completely.
        """
        min_comics, filename = args
        min_comics = int(min_comics)

        # Open both files in a single with statement so that the first one
        # is closed again if opening the second one fails.
        with codecs.open(filename, 'r', 'utf-8') as oldf, \
                codecs.open(filename + '.new', 'w', 'utf-8') as newf:
            indent = self.copy_until_start(oldf, newf)
            with codecs.open(self.json, 'rb', 'utf-8') as f:
                data = json.load(f)
            for name, entry in sorted(data.items(), key=first_lower):
                self.write_entry(newf, name, entry, min_comics, indent)
            self.copy_after_end(oldf, newf)
        # Replace only after both files are closed.
        os.replace(filename + '.new', filename)

    def copy_until_start(self, src, dest):
        """Copy lines from src to dest up to and including the START
        marker line.

        Returns the column of the marker in that line, which is used as
        indentation for generated entries. Raises RuntimeError if src has
        no START marker.
        """
        for line in src:
            dest.write(line)
            if line.strip().startswith(self.START):
                return line.find(self.START)
        raise RuntimeError("can't find start marker!")

    def copy_after_end(self, src, dest):
        """Skip lines of src until the END marker line, then copy that
        line and everything after it to dest.

        Raises RuntimeError if src has no END marker.
        """
        skip = True
        for line in src:
            if line.strip().startswith(self.END):
                skip = False
            if not skip:
                dest.write(line)
        if skip:
            raise RuntimeError("can't find end marker!")

    def write_entry(self, fp, name, entry, min_comics, indent):
        """Write the entry for one comic to fp, indented by indent spaces.

        Nothing is written for excluded comics or for comics whose known
        count is below min_comics. A comic that duplicates an existing
        scraper is written as a comment instead of a real entry.
        """
        if name in self.excluded_comics:
            return
        count = entry['count']
        # A missing (None) or zero count never filters a comic out.
        if count and count < min_comics:
            return
        dup = self.find_dups(name)
        padding = " " * indent
        fp.write(padding)
        if dup is not None:
            fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
        else:
            text = self.get_entry(truncate_name(name), entry['data'])
            # Keep continuation lines of multi-line entries aligned.
            fp.write(text.replace("\n", "\n" + padding) + "\n")

    def find_dups(self, name):
        """Check if comic name already exists.

        Returns the name of the first scraper matching one of the expanded
        dup_templates (compared case-insensitively), or None.
        """
        names = {(tmpl % name).lower() for tmpl in self.dup_templates}
        if names:
            for scraper in scrapers.all():
                if scraper.name.lower() in names:
                    return scraper.name
        return None

    def get_entry(self, name, data):
        """Return an entry for the module generator."""
        raise NotImplementedError

    def run(self):
        """Script entry point: with command line arguments regenerate the
        target file via print_results(), otherwise collect results."""
        if len(sys.argv) > 1:
            self.print_results(sys.argv[1:])
        else:
            self.get_results()
2012-11-29 05:46:58 +00:00
def contains_case_insensitive(adict, akey):
    """Tell whether adict has a key equal to akey when case is ignored."""
    return any(candidate.lower() == akey.lower() for candidate in adict)
2012-12-12 16:41:29 +00:00
def capfirst(text):
    """Return text with its first character upper-cased.

    Falsy input (such as an empty string) is returned unchanged.
    """
    return text[0].upper() + text[1:] if text else text
2012-12-19 19:42:53 +00:00
def save_result(res, json_file):
    """Serialize res as sorted, indented JSON into the UTF-8 file json_file."""
    serialized = json.dumps(res, sort_keys=True, indent=2,
                            separators=(',', ': '))
    with codecs.open(json_file, 'wb', 'utf-8') as out:
        out.write(serialized)
2012-12-19 19:42:53 +00:00
def load_result(json_file):
    """Read the UTF-8 file json_file and return its decoded JSON content."""
    with codecs.open(json_file, 'rb', 'utf-8') as source:
        raw = source.read()
    return json.loads(raw)
2013-01-09 21:20:03 +00:00
def truncate_name(text):
    """Cut the comic name down to at most 50 characters."""
    max_length = 50
    return text[:max_length]
2013-02-13 19:02:47 +00:00
2016-04-12 22:52:16 +00:00
def asciify(name):
    """Strip every character that is not an ASCII letter, a digit or an
    underscore from name."""
    disallowed = re.compile("[^0-9a-zA-Z_]")
    return disallowed.sub("", name)
# Translation table for str.translate(): characters that get a readable
# replacement in comic names, keyed by code point.
TRANS = {
    ord('&'): 'And',
    ord('@'): 'At',
    ord('ñ'): 'n',
    ord('á'): 'a',
}
2013-02-13 19:02:47 +00:00
def format_name(text):
    """Turn a raw comic title into a normalized comic name.

    HTML entities are unescaped, every space-separated word gets its first
    character upper-cased and the words are joined without separator; the
    TRANS replacements are applied and the result is passed through
    asciify().
    """
    unescaped = html.unescape(text)
    words = unescaped.split(" ")
    joined = "".join(map(capfirst, words))
    return asciify(joined.translate(TRANS))