Add comic excludes in scripts.
This commit is contained in:
parent
d89c225292
commit
bcae1b018c
11 changed files with 58 additions and 31 deletions
|
@ -1272,7 +1272,6 @@ add('Twisted_Mirrors')
|
|||
add('TwoMoons')
|
||||
add('Two_Rooks')
|
||||
add('Two_Weeks_Notice')
|
||||
add('Twonks_and_Plonkers')
|
||||
add('Typical_Strange')
|
||||
add('UNA_Frontiers_Commentary')
|
||||
add('USB')
|
||||
|
|
|
@ -10,19 +10,17 @@ import os
|
|||
import json
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
||||
from scriptutil import contains_case_insensitive
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
# <a href="/comics/agnes.html"><strong>Agnes</strong></a>
|
||||
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
for key in adict:
|
||||
if key.lower() == akey.lower():
|
||||
return True
|
||||
return False
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
]
|
||||
|
||||
|
||||
def handle_url(url, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
|
@ -35,6 +33,8 @@ def handle_url(url, res):
|
|||
url = match.group(1)
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||
|
@ -61,6 +61,8 @@ def print_results(args):
|
|||
with open(json_file, "rb") as f:
|
||||
comics = json.load(f)
|
||||
for name, url in sorted(comics.items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
print("add(%r, %r)" % (str(name), str(url)))
|
||||
|
||||
|
||||
|
|
|
@ -10,14 +10,14 @@ import os
|
|||
import json
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import tagre, getPageContent
|
||||
from scriptutil import contains_case_insensitive
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
for key in adict:
|
||||
if key.lower() == akey.lower():
|
||||
return True
|
||||
return False
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"Twonks_and_Plonkers", # broken images, no real content
|
||||
]
|
||||
|
||||
|
||||
def handle_url(url, url_matcher, num_matcher, res):
|
||||
|
@ -34,6 +34,8 @@ def handle_url(url, url_matcher, num_matcher, res):
|
|||
# we cannot handle two comics that only differ in case
|
||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||
continue
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
# find out how many images this comic has
|
||||
end = match.end(1)
|
||||
mo = num_matcher.search(data[end:])
|
||||
|
@ -71,6 +73,8 @@ def print_results(min_strips):
|
|||
with open(json_file, "rb") as f:
|
||||
comics = json.load(f)
|
||||
for name, num in sorted(comics.items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if num >= min_strips:
|
||||
print("add('%s')" % name)
|
||||
|
||||
|
|
|
@ -11,19 +11,18 @@ import json
|
|||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import tagre, getPageContent, asciify, unescape
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from scriptutil import contains_case_insensitive
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
#<a href="/shortname" class="alpha_list updated">name</a>
|
||||
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
for key in adict:
|
||||
if key.lower() == akey.lower():
|
||||
return True
|
||||
return False
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
]
|
||||
|
||||
|
||||
|
||||
def handle_url(url, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
|
@ -36,6 +35,8 @@ def handle_url(url, res):
|
|||
shortname = match.group(1)
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||
|
@ -73,6 +74,8 @@ def print_results(args):
|
|||
with open(json_file, "rb") as f:
|
||||
comics = json.load(f)
|
||||
for name, shortname in sorted(comics.items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if has_creators_comic(name):
|
||||
prefix = '#'
|
||||
else:
|
||||
|
|
|
@ -11,6 +11,7 @@ import json
|
|||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from scriptutil import contains_case_insensitive
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
@ -18,13 +19,11 @@ json_file = __file__.replace(".py", ".json")
|
|||
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
|
||||
num_matcher = re.compile(r'Number of Days: (\d+)')
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
for key in adict:
|
||||
if key.lower() == akey.lower():
|
||||
return True
|
||||
return False
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
]
|
||||
|
||||
|
||||
|
||||
def handle_url(url, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
|
@ -37,6 +36,8 @@ def handle_url(url, res):
|
|||
url = match.group(1) + '/'
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||
|
@ -83,6 +84,8 @@ def print_results(args):
|
|||
with open(json_file, "rb") as f:
|
||||
comics = json.load(f)
|
||||
for name, entry in sorted(comics.items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
url, num = entry
|
||||
if num < min_comics:
|
||||
continue
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/bin/sh -e
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
set -u
|
||||
# generates a convenience test script from failed tests
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
"""Remove all lines after a given marker line.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
|
9
scripts/scriptutil.py
Normal file
9
scripts/scriptutil.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
|
||||
def contains_case_insensitive(adict, akey):
    """Check whether adict has a key equal to akey when case is ignored.

    Shared by the comic list update scripts to detect comics whose names
    differ only in upper/lower case.

    @param adict: mapping (or any iterable) of string keys to search
    @param akey: the string to look for
    @return: True if some key matches akey case-insensitively, else False
    """
    # Lowercase the search key once instead of on every loop iteration.
    wanted = akey.lower()
    return any(key.lower() == wanted for key in adict)
|
||||
|
||||
|
|
@ -11,19 +11,18 @@ import json
|
|||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from scriptutil import contains_case_insensitive
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
|
||||
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
for key in adict:
|
||||
if key.lower() == akey.lower():
|
||||
return True
|
||||
return False
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
]
|
||||
|
||||
|
||||
|
||||
def handle_url(url, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
|
@ -36,6 +35,8 @@ def handle_url(url, res):
|
|||
shortname = match.group(1)
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("WARN: skipping possible duplicate", name, file=sys.stderr)
|
||||
|
@ -65,13 +66,15 @@ def has_comic(name):
|
|||
if lname == cname or lname == gname:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def print_results(args):
|
||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
||||
with open(json_file, "rb") as f:
|
||||
comics = json.load(f)
|
||||
for name, shortname in sorted(comics.items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if has_comic(name):
|
||||
prefix = '#'
|
||||
else:
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/bin/sh -e
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
set -u
|
||||
|
||||
mincomics=100
|
||||
|
|
Loading…
Reference in a new issue