Add comic excludes in scripts.
This commit is contained in:
parent
d89c225292
commit
bcae1b018c
11 changed files with 58 additions and 31 deletions
|
@ -1272,7 +1272,6 @@ add('Twisted_Mirrors')
|
||||||
add('TwoMoons')
|
add('TwoMoons')
|
||||||
add('Two_Rooks')
|
add('Two_Rooks')
|
||||||
add('Two_Weeks_Notice')
|
add('Two_Weeks_Notice')
|
||||||
add('Twonks_and_Plonkers')
|
|
||||||
add('Typical_Strange')
|
add('Typical_Strange')
|
||||||
add('UNA_Frontiers_Commentary')
|
add('UNA_Frontiers_Commentary')
|
||||||
add('USB')
|
add('USB')
|
||||||
|
|
|
@ -10,19 +10,17 @@ import os
|
||||||
import json
|
import json
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
||||||
|
from scriptutil import contains_case_insensitive
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
# <a href="/comics/agnes.html"><strong>Agnes</strong></a>
|
# <a href="/comics/agnes.html"><strong>Agnes</strong></a>
|
||||||
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
|
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
|
||||||
|
|
||||||
def contains_case_insensitive(adict, akey):
|
# names of comics to exclude
|
||||||
for key in adict:
|
exclude_comics = [
|
||||||
if key.lower() == akey.lower():
|
]
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, res):
|
def handle_url(url, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
|
@ -35,6 +33,8 @@ def handle_url(url, res):
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
name = unescape(match.group(2))
|
name = unescape(match.group(2))
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||||
|
@ -61,6 +61,8 @@ def print_results(args):
|
||||||
with open(json_file, "rb") as f:
|
with open(json_file, "rb") as f:
|
||||||
comics = json.load(f)
|
comics = json.load(f)
|
||||||
for name, url in sorted(comics.items()):
|
for name, url in sorted(comics.items()):
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
print("add(%r, %r)" % (str(name), str(url)))
|
print("add(%r, %r)" % (str(name), str(url)))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,14 +10,14 @@ import os
|
||||||
import json
|
import json
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
from dosagelib.util import tagre, getPageContent
|
from dosagelib.util import tagre, getPageContent
|
||||||
|
from scriptutil import contains_case_insensitive
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
def contains_case_insensitive(adict, akey):
|
# names of comics to exclude
|
||||||
for key in adict:
|
exclude_comics = [
|
||||||
if key.lower() == akey.lower():
|
"Twonks_and_Plonkers", # broken images, no real content
|
||||||
return True
|
]
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, url_matcher, num_matcher, res):
|
def handle_url(url, url_matcher, num_matcher, res):
|
||||||
|
@ -34,6 +34,8 @@ def handle_url(url, url_matcher, num_matcher, res):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
# find out how many images this comic has
|
# find out how many images this comic has
|
||||||
end = match.end(1)
|
end = match.end(1)
|
||||||
mo = num_matcher.search(data[end:])
|
mo = num_matcher.search(data[end:])
|
||||||
|
@ -71,6 +73,8 @@ def print_results(min_strips):
|
||||||
with open(json_file, "rb") as f:
|
with open(json_file, "rb") as f:
|
||||||
comics = json.load(f)
|
comics = json.load(f)
|
||||||
for name, num in sorted(comics.items()):
|
for name, num in sorted(comics.items()):
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if num >= min_strips:
|
if num >= min_strips:
|
||||||
print("add('%s')" % name)
|
print("add('%s')" % name)
|
||||||
|
|
||||||
|
|
|
@ -11,19 +11,18 @@ import json
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
from dosagelib.util import tagre, getPageContent, asciify, unescape
|
from dosagelib.util import tagre, getPageContent, asciify, unescape
|
||||||
from dosagelib.scraper import get_scrapers
|
from dosagelib.scraper import get_scrapers
|
||||||
|
from scriptutil import contains_case_insensitive
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
#<a href="/shortname" class="alpha_list updated">name</a>
|
#<a href="/shortname" class="alpha_list updated">name</a>
|
||||||
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
|
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
|
||||||
|
|
||||||
def contains_case_insensitive(adict, akey):
|
# names of comics to exclude
|
||||||
for key in adict:
|
exclude_comics = [
|
||||||
if key.lower() == akey.lower():
|
]
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, res):
|
def handle_url(url, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
|
@ -36,6 +35,8 @@ def handle_url(url, res):
|
||||||
shortname = match.group(1)
|
shortname = match.group(1)
|
||||||
name = unescape(match.group(2))
|
name = unescape(match.group(2))
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||||
|
@ -73,6 +74,8 @@ def print_results(args):
|
||||||
with open(json_file, "rb") as f:
|
with open(json_file, "rb") as f:
|
||||||
comics = json.load(f)
|
comics = json.load(f)
|
||||||
for name, shortname in sorted(comics.items()):
|
for name, shortname in sorted(comics.items()):
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if has_creators_comic(name):
|
if has_creators_comic(name):
|
||||||
prefix = '#'
|
prefix = '#'
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -11,6 +11,7 @@ import json
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
||||||
from dosagelib.scraper import get_scrapers
|
from dosagelib.scraper import get_scrapers
|
||||||
|
from scriptutil import contains_case_insensitive
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
|
@ -18,13 +19,11 @@ json_file = __file__.replace(".py", ".json")
|
||||||
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
|
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
|
||||||
num_matcher = re.compile(r'Number of Days: (\d+)')
|
num_matcher = re.compile(r'Number of Days: (\d+)')
|
||||||
|
|
||||||
def contains_case_insensitive(adict, akey):
|
# names of comics to exclude
|
||||||
for key in adict:
|
exclude_comics = [
|
||||||
if key.lower() == akey.lower():
|
]
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, res):
|
def handle_url(url, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
|
@ -37,6 +36,8 @@ def handle_url(url, res):
|
||||||
url = match.group(1) + '/'
|
url = match.group(1) + '/'
|
||||||
name = unescape(match.group(2))
|
name = unescape(match.group(2))
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||||
|
@ -83,6 +84,8 @@ def print_results(args):
|
||||||
with open(json_file, "rb") as f:
|
with open(json_file, "rb") as f:
|
||||||
comics = json.load(f)
|
comics = json.load(f)
|
||||||
for name, entry in sorted(comics.items()):
|
for name, entry in sorted(comics.items()):
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
url, num = entry
|
url, num = entry
|
||||||
if num < min_comics:
|
if num < min_comics:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/bin/sh -e
|
#!/bin/sh -e
|
||||||
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
set -u
|
set -u
|
||||||
# generates a convenience test script from failed tests
|
# generates a convenience test script from failed tests
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
"""Remove all lines after a given marker line.
|
"""Remove all lines after a given marker line.
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
9
scripts/scriptutil.py
Normal file
9
scripts/scriptutil.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
|
|
||||||
|
def contains_case_insensitive(adict, akey):
    """Check whether a key matches ``akey`` when letter case is ignored.

    @param adict: iterable of string keys (typically a dict) to search
    @param akey: the string key to look for
    @return: True if any key of ``adict`` equals ``akey`` after both are
        lowercased, else False
    """
    # Lowercase the wanted key once instead of once per compared key.
    wanted = akey.lower()
    return any(key.lower() == wanted for key in adict)
|
||||||
|
|
||||||
|
|
|
@ -11,19 +11,18 @@ import json
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
from dosagelib.util import getPageContent, asciify, unescape
|
from dosagelib.util import getPageContent, asciify, unescape
|
||||||
from dosagelib.scraper import get_scrapers
|
from dosagelib.scraper import get_scrapers
|
||||||
|
from scriptutil import contains_case_insensitive
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
|
#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
|
||||||
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
|
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
|
||||||
|
|
||||||
def contains_case_insensitive(adict, akey):
|
# names of comics to exclude
|
||||||
for key in adict:
|
exclude_comics = [
|
||||||
if key.lower() == akey.lower():
|
]
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, res):
|
def handle_url(url, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
|
@ -36,6 +35,8 @@ def handle_url(url, res):
|
||||||
shortname = match.group(1)
|
shortname = match.group(1)
|
||||||
name = unescape(match.group(2))
|
name = unescape(match.group(2))
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||||
|
@ -65,13 +66,15 @@ def has_comic(name):
|
||||||
if lname == cname or lname == gname:
|
if lname == cname or lname == gname:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
|
def print_results(args):
|
||||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
"""Print all comics that have at least the given number of minimum comic strips."""
|
||||||
with open(json_file, "rb") as f:
|
with open(json_file, "rb") as f:
|
||||||
comics = json.load(f)
|
comics = json.load(f)
|
||||||
for name, shortname in sorted(comics.items()):
|
for name, shortname in sorted(comics.items()):
|
||||||
|
if name in exclude_comics:
|
||||||
|
continue
|
||||||
if has_comic(name):
|
if has_comic(name):
|
||||||
prefix = '#'
|
prefix = '#'
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/bin/sh -e
|
#!/bin/sh -e
|
||||||
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
set -u
|
set -u
|
||||||
|
|
||||||
mincomics=100
|
mincomics=100
|
||||||
|
|
Loading…
Reference in a new issue