Add comic excludes in scripts.

This commit is contained in:
Bastian Kleineidam 2012-11-29 06:46:58 +01:00
parent d89c225292
commit bcae1b018c
11 changed files with 58 additions and 31 deletions

View file

@ -1272,7 +1272,6 @@ add('Twisted_Mirrors')
add('TwoMoons') add('TwoMoons')
add('Two_Rooks') add('Two_Rooks')
add('Two_Weeks_Notice') add('Two_Weeks_Notice')
add('Twonks_and_Plonkers')
add('Typical_Strange') add('Typical_Strange')
add('UNA_Frontiers_Commentary') add('UNA_Frontiers_Commentary')
add('USB') add('USB')

View file

@ -10,19 +10,17 @@ import os
import json import json
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre from dosagelib.util import getPageContent, asciify, unescape, tagre
from scriptutil import contains_case_insensitive
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
# <a href="/comics/agnes.html"><strong>Agnes</strong></a> # <a href="/comics/agnes.html"><strong>Agnes</strong></a>
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>') url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
def contains_case_insensitive(adict, akey): # names of comics to exclude
for key in adict: exclude_comics = [
if key.lower() == akey.lower(): ]
return True
return False
def handle_url(url, res): def handle_url(url, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
@ -35,6 +33,8 @@ def handle_url(url, res):
url = match.group(1) url = match.group(1)
name = unescape(match.group(2)) name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At')) name = asciify(name.replace('&', 'And').replace('@', 'At'))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr) print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -61,6 +61,8 @@ def print_results(args):
with open(json_file, "rb") as f: with open(json_file, "rb") as f:
comics = json.load(f) comics = json.load(f)
for name, url in sorted(comics.items()): for name, url in sorted(comics.items()):
if name in exclude_comics:
continue
print("add(%r, %r)" % (str(name), str(url))) print("add(%r, %r)" % (str(name), str(url)))

View file

@ -10,14 +10,14 @@ import os
import json import json
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent from dosagelib.util import tagre, getPageContent
from scriptutil import contains_case_insensitive
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
def contains_case_insensitive(adict, akey): # names of comics to exclude
for key in adict: exclude_comics = [
if key.lower() == akey.lower(): "Twonks_and_Plonkers", # broken images, no real content
return True ]
return False
def handle_url(url, url_matcher, num_matcher, res): def handle_url(url, url_matcher, num_matcher, res):
@ -34,6 +34,8 @@ def handle_url(url, url_matcher, num_matcher, res):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr) print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue continue
if name in exclude_comics:
continue
# find out how many images this comic has # find out how many images this comic has
end = match.end(1) end = match.end(1)
mo = num_matcher.search(data[end:]) mo = num_matcher.search(data[end:])
@ -71,6 +73,8 @@ def print_results(min_strips):
with open(json_file, "rb") as f: with open(json_file, "rb") as f:
comics = json.load(f) comics = json.load(f)
for name, num in sorted(comics.items()): for name, num in sorted(comics.items()):
if name in exclude_comics:
continue
if num >= min_strips: if num >= min_strips:
print("add('%s')" % name) print("add('%s')" % name)

View file

@ -11,19 +11,18 @@ import json
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape from dosagelib.util import tagre, getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
#<a href="/shortname" class="alpha_list updated">name</a> #<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>") url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
def contains_case_insensitive(adict, akey): # names of comics to exclude
for key in adict: exclude_comics = [
if key.lower() == akey.lower(): ]
return True
return False
def handle_url(url, res): def handle_url(url, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
@ -36,6 +35,8 @@ def handle_url(url, res):
shortname = match.group(1) shortname = match.group(1)
name = unescape(match.group(2)) name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At')) name = asciify(name.replace('&', 'And').replace('@', 'At'))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr) print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -73,6 +74,8 @@ def print_results(args):
with open(json_file, "rb") as f: with open(json_file, "rb") as f:
comics = json.load(f) comics = json.load(f)
for name, shortname in sorted(comics.items()): for name, shortname in sorted(comics.items()):
if name in exclude_comics:
continue
if has_creators_comic(name): if has_creators_comic(name):
prefix = '#' prefix = '#'
else: else:

View file

@ -11,6 +11,7 @@ import json
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scrapers from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
@ -18,13 +19,11 @@ json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>') url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)') num_matcher = re.compile(r'Number of Days: (\d+)')
def contains_case_insensitive(adict, akey): # names of comics to exclude
for key in adict: exclude_comics = [
if key.lower() == akey.lower(): ]
return True
return False
def handle_url(url, res): def handle_url(url, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
@ -37,6 +36,8 @@ def handle_url(url, res):
url = match.group(1) + '/' url = match.group(1) + '/'
name = unescape(match.group(2)) name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At')) name = asciify(name.replace('&', 'And').replace('@', 'At'))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr) print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -83,6 +84,8 @@ def print_results(args):
with open(json_file, "rb") as f: with open(json_file, "rb") as f:
comics = json.load(f) comics = json.load(f)
for name, entry in sorted(comics.items()): for name, entry in sorted(comics.items()):
if name in exclude_comics:
continue
url, num = entry url, num = entry
if num < min_comics: if num < min_comics:
continue continue

View file

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
from __future__ import print_function from __future__ import print_function
import sys import sys
import os import os

View file

@ -1,4 +1,5 @@
#!/bin/sh -e #!/bin/sh -e
# Copyright (C) 2012 Bastian Kleineidam
set -u set -u
# generates a convenience test script from failed tests # generates a convenience test script from failed tests

View file

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""Remove all lines after a given marker line. """Remove all lines after a given marker line.
""" """
from __future__ import print_function from __future__ import print_function

9
scripts/scriptutil.py Normal file
View file

@ -0,0 +1,9 @@
# Copyright (C) 2012 Bastian Kleineidam
def contains_case_insensitive(adict, akey):
    """Tell whether adict holds a key equal to akey when case is ignored.

    Each key yielded by iterating adict is compared to akey using the
    lower() form of both strings. Returns True on the first match and
    False when no key matches (including when adict is empty).
    """
    return any(candidate.lower() == akey.lower() for candidate in adict)

View file

@ -11,19 +11,18 @@ import json
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a> #<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>') url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
def contains_case_insensitive(adict, akey): # names of comics to exclude
for key in adict: exclude_comics = [
if key.lower() == akey.lower(): ]
return True
return False
def handle_url(url, res): def handle_url(url, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
@ -36,6 +35,8 @@ def handle_url(url, res):
shortname = match.group(1) shortname = match.group(1)
name = unescape(match.group(2)) name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At')) name = asciify(name.replace('&', 'And').replace('@', 'At'))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr) print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -65,13 +66,15 @@ def has_comic(name):
if lname == cname or lname == gname: if lname == cname or lname == gname:
return True return True
return False return False
def print_results(args): def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips.""" """Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f: with open(json_file, "rb") as f:
comics = json.load(f) comics = json.load(f)
for name, shortname in sorted(comics.items()): for name, shortname in sorted(comics.items()):
if name in exclude_comics:
continue
if has_comic(name): if has_comic(name):
prefix = '#' prefix = '#'
else: else:

View file

@ -1,4 +1,5 @@
#!/bin/sh -e #!/bin/sh -e
# Copyright (C) 2012 Bastian Kleineidam
set -u set -u
mincomics=100 mincomics=100