Add comic excludes in scripts.

2012-11-29 06:46:58 +01:00 · 2012-11-29 06:46:58 +01:00 · bcae1b018c
commit bcae1b018c
parent d89c225292
11 changed files with 58 additions and 31 deletions
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@ -1272,7 +1272,6 @@ add('Twisted_Mirrors')
 add('TwoMoons')
 add('Two_Rooks')
 add('Two_Weeks_Notice')
 add('Twonks_and_Plonkers')
 add('Typical_Strange')
 add('UNA_Frontiers_Commentary')
 add('USB')
--- a/scripts/creators.py
+++ b/scripts/creators.py
@ -10,19 +10,17 @@ import os
 import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from scriptutil import contains_case_insensitive
 json_file = __file__.replace(".py", ".json")
 # <a href="/comics/agnes.html"><strong>Agnes</strong></a>
 url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
-def contains_case_insensitive(adict, akey):
+# names of comics to exclude
-    for key in adict:
+exclude_comics = [
-        if key.lower() == akey.lower():
+]
            return True
    return False
 def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
@ -35,6 +33,8 @@ def handle_url(url, res):
        url = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -61,6 +61,8 @@ def print_results(args):
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, url in sorted(comics.items()):
        if name in exclude_comics:
            continue
        print("add(%r, %r)" % (str(name), str(url)))
--- a/scripts/drunkduck.py
+++ b/scripts/drunkduck.py
@ -10,14 +10,14 @@ import os
 import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent
 from scriptutil import contains_case_insensitive
 json_file = __file__.replace(".py", ".json")
-def contains_case_insensitive(adict, akey):
+# names of comics to exclude
-    for key in adict:
+exclude_comics = [
-        if key.lower() == akey.lower():
+    "Twonks_and_Plonkers", # broken images, no real content
-            return True
+]
    return False
 def handle_url(url, url_matcher, num_matcher, res):
@ -34,6 +34,8 @@ def handle_url(url, url_matcher, num_matcher, res):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        if name in exclude_comics:
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
@ -71,6 +73,8 @@ def print_results(min_strips):
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, num in sorted(comics.items()):
        if name in exclude_comics:
            continue
        if num >= min_strips:
            print("add('%s')" % name)
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@ -11,19 +11,18 @@ import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import tagre, getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive
 json_file = __file__.replace(".py", ".json")
 #<a href="/shortname" class="alpha_list updated">name</a>
 url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
-def contains_case_insensitive(adict, akey):
+# names of comics to exclude
-    for key in adict:
+exclude_comics = [
-        if key.lower() == akey.lower():
+]
-            return True
+
    return False
 def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
@ -36,6 +35,8 @@ def handle_url(url, res):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -73,6 +74,8 @@ def print_results(args):
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, shortname in sorted(comics.items()):
        if name in exclude_comics:
            continue
        if has_creators_comic(name):
            prefix = '#'
        else:
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@ -11,6 +11,7 @@ import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape, tagre
 from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive
 json_file = __file__.replace(".py", ".json")
@ -18,13 +19,11 @@ json_file = __file__.replace(".py", ".json")
 url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
 num_matcher = re.compile(r'Number of Days: (\d+)')
-def contains_case_insensitive(adict, akey):
+# names of comics to exclude
-    for key in adict:
+exclude_comics = [
-        if key.lower() == akey.lower():
+]
-            return True
+
    return False
 def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
@ -37,6 +36,8 @@ def handle_url(url, res):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -83,6 +84,8 @@ def print_results(args):
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, entry in sorted(comics.items()):
        if name in exclude_comics:
            continue
        url, num = entry
        if num < min_comics:
            continue
--- a/scripts/mktestpage.py
+++ b/scripts/mktestpage.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
 # Copyright (C) 2012 Bastian Kleineidam
 from __future__ import print_function
 import sys
 import os
--- a/scripts/mktestscript.sh
+++ b/scripts/mktestscript.sh
@ -1,4 +1,5 @@
 #!/bin/sh -e
 # Copyright (C) 2012 Bastian Kleineidam
 set -u
 # generates a convenience test script from failed tests
--- a/scripts/removeafter.py
+++ b/scripts/removeafter.py
@ -1,4 +1,5 @@
 #!/usr/bin/env python
 # Copyright (C) 2012 Bastian Kleineidam
 """Remove all lines after a given marker line.
 """
 from __future__ import print_function
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@ -0,0 +1,9 @@
 # Copyright (C) 2012 Bastian Kleineidam
 def contains_case_insensitive(adict, akey):
     """Check whether adict has a key equal to akey, ignoring case.
     Iterates over adict and compares the lower-cased form of each key
     with the lower-cased form of akey.
     @param adict: iterable of strings (e.g. a dict keyed by comic name)
     @param akey: the string to look for
     @return: True if a key matches case-insensitively, else False
     """
     # Lower-case the lookup key once instead of on every comparison.
     wanted = akey.lower()
     return any(key.lower() == wanted for key in adict)
--- a/scripts/universal.py
+++ b/scripts/universal.py
@ -11,19 +11,18 @@ import json
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive
 json_file = __file__.replace(".py", ".json")
 #<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
 url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
-def contains_case_insensitive(adict, akey):
+# names of comics to exclude
-    for key in adict:
+exclude_comics = [
-        if key.lower() == akey.lower():
+]
-            return True
+
    return False
 def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
@ -36,6 +35,8 @@ def handle_url(url, res):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
@ -65,13 +66,15 @@ def has_comic(name):
        if lname == cname or lname == gname:
            return True
    return False
- 
+
 def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, shortname in sorted(comics.items()):
        if name in exclude_comics:
            continue
        if has_comic(name):
            prefix = '#'
        else:
--- a/scripts/update_plugins.py
+++ b/scripts/update_plugins.py
@ -1,4 +1,5 @@
 #!/bin/sh -e
 # Copyright (C) 2012 Bastian Kleineidam
 set -u
 mincomics=100