Remove universal strips since they are almost all duplicated and the rest is useless.

2013-02-12 20:56:02 +01:00 · 2013-02-12 20:56:02 +01:00 · 9ec4a44953
commit 9ec4a44953
parent 10f6a1caa1
5 changed files with 8 additions and 239 deletions
--- a/dosagelib/plugins/universal.py
+++ b/dosagelib/plugins/universal.py
@ -1,137 +0,0 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
 """
 The Universal comics only have some samples, but those samples are always the newest ones.
 """
 import datetime
 from re import compile, escape
 from ..scraper import make_scraper
 from ..util import tagre, getPageContent
 def parse_strdate(strdate):
     """Parse a publish date such as "Sunday, November 11, 2012".
     The English weekday and month names are looked up in fixed tables
     instead of relying on strptime()'s %A and %B directives, so the result
     does not depend on the active locale.
     As with strptime(), the weekday only has to be a valid name; it is not
     checked against the calendar date.
     @param strdate: date in the form "<weekday>, <month> <day>, <year>"
     @return: datetime.datetime at midnight of the parsed day
     @raise ValueError: if strdate does not have the expected form
     """
     weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday",
                 "saturday", "sunday")
     months = ("january", "february", "march", "april", "may", "june", "july",
               "august", "september", "october", "november", "december")
     try:
         # "Sunday, November 11, 2012" -> "Sunday", "November 11", "2012"
         weekday, monthday, year = [part.strip() for part in strdate.split(",")]
         monthname, day = monthday.split()
         if weekday.lower() not in weekdays:
             raise ValueError("unknown weekday %r" % weekday)
         # tuple.index() raises ValueError for an unknown month name
         month = months.index(monthname.lower()) + 1
         return datetime.datetime(int(year), month, int(day))
     except ValueError as err:
         # report every format problem uniformly, like strptime() does
         raise ValueError("invalid date %r: %s" % (strdate, err))
 # Search pattern for strip images: an <img> src URL on assets.amuniversal.com
 # that is followed by whitespace and a "<h4>published" heading.
 # NOTE(review): tagre() presumably builds the regex for the tag attribute - confirm in ..util
 _imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published')
 def add(name, shortname):
     """Create the scraper class for one Universal comic and store it in
     this module's globals as Universal_<name>.
     @param name: comic name, used for the class name and 'Universal/<name>'
     @param shortname: URL path of the comic appended to the site address
     """
     classname = 'Universal_%s' % name
     url = 'http://www.universaluclick.com%s' % shortname
     @classmethod
     def namer(cls, imageUrl, pageUrl):
         """Name an image after its publish date (YYYYMMDD).
         The date is read from the page content, which looks like:
          <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
          <h4>published: Sunday, November 11, 2012</h4>
         Returns None when no publish date follows the image.
         """
         content = getPageContent(pageUrl, cls.session)[0]
         pattern = tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)'
         found = compile(pattern).search(content)
         if not found:
             return None
         return parse_strdate(found.group(1)).strftime("%Y%m%d")
     # class attributes handed to make_scraper()
     attributes = dict(
         name='Universal/' + name,
         url=url,
         stripUrl=url + '%s/',
         imageSearch=_imageSearch,
         multipleImagesPerStrip=True,
         prevSearch=None,
         help='Index format: none',
         namer=namer,
     )
     globals()[classname] = make_scraper(classname, **attributes)
 # do not edit anything below since these entries are generated from scripts/update.sh
 # DO NOT REMOVE
 #add('9ChickweedLane', '/comics/strip/9chickweedlane')
 #add('AdamAtHome', '/comics/strip/adamathome')
 #add('AlleyOop', '/comics/strip/alley-oop')
 #add('ArloandJanis', '/comics/strip/arloandjanis')
 #add('BadReporter', '/comics/badreporter')
 #add('Baldo', '/comics/strip/baldo')
 #add('Betty', '/comics/strip/betty')
 #add('BigNate', '/comics/strip/bignate')
 #add('Biographic', '/comics/strip/biographic')
 add('Brevitystrip', '/comics/strip/brevity')
 #add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
 #add('Cathy', '/comics/strip/cathy')
 #add('Cleats', '/comics/strip/cleats')
 #add('ClosetoHome', '/comics/panel/closetohome')
 #add('Cornered', '/comics/panel/cornered')
 #add('CowandBoyClassics', '/comics/strip/cowandboy')
 #add('CuldeSac', '/comics/strip/culdesac')
 #add('Doonesbury', '/comics/strip/doonesbury')
 #add('Drabble', '/comics/strip/drabble')
 #add('FMinus', '/comics/strip/fminus')
 #add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
 #add('FoxTrot', '/comics/strip/foxtrot')
 #add('FrankAndErnest', '/comics/strip/frankandernest')
 #add('Frazz', '/comics/strip/frazz')
 #add('FredBasset', '/comics/strip/fredbasset')
 #add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
 #add('Garfield', '/comics/strip/garfield')
 #add('GetFuzzy', '/comics/strip/getfuzzy')
 #add('GingerMeggs', '/comics/strip/gingermeggs')
 #add('Graffiti', '/comics/panel/graffiti')
 #add('GrandAvenue', '/comics/strip/grand-avenue')
 #add('HealthCapsules', '/comics/panel/healthcapsules')
 #add('HeartoftheCity', '/comics/strip/heartofthecity')
 #add('Herman', '/comics/panel/herman')
 #add('InkPen', '/comics/strip/inkpen')
 #add('IntheBleachers', '/comics/panel/inthebleachers')
 #add('IntheSticks', '/comics/strip/inthesticks')
 #add('JumpStart', '/comics/strip/jumpstart')
 #add('KidCity', '/comics/strip/kidcity')
 #add('KidSpot', '/comics/panel/kidspot')
 #add('KitNCarlyle', '/comics/panel/kitncarlyle')
 #add('LaCucaracha', '/comics/strip/lacucaracha')
 #add('Lio', '/comics/strip/lio')
 #add('Lola', '/comics/strip/lola')
 #add('Luann', '/comics/strip/luann')
 add('MagicEye', '/comics/strip/magiceye')
 #add('MagicinaMinute', '/comics/strip/magicinaminute')
 #add('Marmaduke', '/comics/panel/marmaduke')
 add('MerlinsWorldofMarvels', '/comics/strip/merlinsworldofmarvels')
 #add('ModeratelyConfused', '/comics/panel/moderately-confused')
 #add('Monty', '/comics/strip/monty')
 #add('MuttAndJeff', '/comics/strip/muttandjeff')
 #add('Nancy', '/comics/strip/nancy')
 #add('NonSequitur', '/comics/strip/nonsequitur')
 add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
 #add('OfftheMark', '/comics/panel/offthemark')
 #add('Overboard', '/comics/strip/overboard')
 #add('OvertheHedge', '/comics/strip/overthehedge')
 #add('Peanuts', '/comics/strip/peanuts')
 #add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
 #add('PoochCafe', '/comics/strip/poochcafe')
 add('Portuguese', '/comics/category/portuguese')
 #add('PricklyCity', '/comics/strip/pricklycity')
 #add('RealLifeAdventures', '/comics/panel/reallifeadventures')
 #add('RealityCheck', '/comics/panel/realitycheck')
 #add('RedandRover', '/comics/strip/redandrover')
 #add('RipHaywire', '/comics/strip/riphaywire')
 #add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
 #add('RoseisRose', '/comics/strip/roseisrose')
 #add('RudyPark', '/comics/strip/rudypark')
 #add('Shortcuts', '/comics/strip/shortcuts')
 #add('SouptoNutz', '/comics/strip/soup-to-nutz')
 #add('StoneSoup', '/comics/strip/stonesoup')
 #add('TankMcNamara', '/comics/strip/tankmcnamara')
 #add('Tarzan', '/comics/strip/tarzan')
 #add('Thatababy', '/comics/strip/thatababy')
 #add('TheArgyleSweater', '/comics/panel/theargylesweater')
 #add('TheBornLoser', '/comics/strip/the-born-loser')
 #add('TheBuckets', '/comics/strip/thebuckets')
 #add('TheDinetteSet', '/comics/panel/dinetteset')
 #add('TheDuplex', '/comics/strip/duplex')
 #add('TheElderberries', '/comics/strip/theelderberries')
 #add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
 #add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
 #add('TheGrizzwells', '/comics/strip/thegrizzwells')
 #add('TheKnightLife', '/comics/strip/theknightlife')
 #add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
 #add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
 #add('Ziggy', '/comics/panel/ziggy')
--- a/scripts/generate_json.sh
+++ b/scripts/generate_json.sh
@ -1,8 +1,9 @@
-#!/bin/sh -e
+#!/bin/sh
 set -e
 set -u
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
  echo "Executing ${script}.py"
  "${d}/${script}.py"
 done
--- a/scripts/mktestscript.sh
+++ b/scripts/mktestscript.sh
@ -1,5 +1,6 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012 Bastian Kleineidam
 set -e
 set -u
 # generates a convenience test script from failed tests
--- a/scripts/universal.py
+++ b/scripts/universal.py
@ -1,97 +0,0 @@
 #!/usr/bin/env python
 # Copyright (C) 2012-2013 Bastian Kleineidam
 """
 Script to get universal comics and save the info in a JSON file for further processing.
 """
 from __future__ import print_function
 import re
 import sys
 import os
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 from dosagelib.util import getPageContent, asciify, unescape
 from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
 # path of the JSON result file: this script's own path with ".py" replaced
 # by ".json"; written by get_results() and read by print_results()
 json_file = __file__.replace(".py", ".json")
 # matches one list entry of the comic list page; group 1 is the link path
 # starting with /comics/, group 2 is the link text, for example:
 #<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
 url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
 # names of comics to exclude; compared against the normalized name that
 # handle_url() builds, and checked again in print_results()
 exclude_comics = [
    "BusinessAndFinance", # not a comic
    "ComicPanel", # not a comic
    "ComicsAZ", # not a comic
    "ComicStrip", # not a comic
    "Espaol", # not a comic
    "Family", # not a comic
    "ForKids", # not a comic
    "JamesBond", # not a comic
    "Men", # not a comic
    "NEA", # not a comic
    "PeanutsPortuguese", # not found
    "Pets", # not a comic
    "SundayOnly", # not a comic
    "WebExclusive", # not a comic
    "Women", # not a comic
 ]
 def handle_url(url, res):
     """Collect the comics listed on one page into res.
     @param url: address of the page to fetch and parse
     @param res: dictionary {name -> shortname} that is updated in place
     Fetch errors (IOError) are reported on stderr and leave res unchanged.
     """
     print("Parsing", url, file=sys.stderr)
     try:
         data, baseUrl = getPageContent(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for path, title in url_matcher.findall(data):
         # normalize the link text into a capitalized ASCII identifier
         readable = unescape(title).replace('&', 'And').replace('@', 'At')
         comic = capfirst(asciify(readable))
         if comic in exclude_comics:
             continue
         if contains_case_insensitive(res, comic):
             # we cannot handle two comics that only differ in case
             print("INFO: skipping possible duplicate", comic, file=sys.stderr)
         else:
             res[comic] = path
 def get_results():
     """Fetch the comic list page and save the found comics to the JSON file."""
     list_url = 'http://www.universaluclick.com/comics/list'
     # filled by handle_url() with entries {name -> shortname}
     comics = {}
     handle_url(list_url, comics)
     save_result(comics, json_file)
 def has_comic(name):
     """Tell whether a Creators/ or GoComics/ scraper with the given comic
     name is already known; names are compared case-insensitively.
     """
     wanted = (
         ("Creators/%s" % name).lower(),
         ("GoComics/%s" % name).lower(),
     )
     return any(
         scraperclass.get_name().lower() in wanted
         for scraperclass in get_scrapers()
     )
 def print_results(args):
     """Print one add(name, shortname) line for each comic in the JSON file.
     Entries are printed sorted by name and excluded comics are skipped.
     Lines for comics that has_comic() already knows are prefixed with '#'.
     @param args: command line arguments; not used by this function
     """
     for name, shortname in sorted(load_result(json_file).items()):
         if name in exclude_comics:
             continue
         # comment out comics already covered by another plugin
         prefix = '#' if has_comic(name) else ''
         print(prefix + "add(%r, %r)" % (str(truncate_name(name)), str(shortname)))
 if __name__ == '__main__':
     # any command line argument selects printing of the saved results;
     # without arguments the comic list is fetched and saved
     cli_args = sys.argv[1:]
     if cli_args:
         print_results(cli_args)
     else:
         get_results()
--- a/scripts/update_plugins.sh
+++ b/scripts/update_plugins.sh
@ -1,11 +1,12 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012-2013 Bastian Kleineidam
 set -e
 set -u
 mincomics=100
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
  target="${d}/../dosagelib/plugins/${script}.py"
  echo "Upating $target"
  "${d}/removeafter.py" "$target" "# DO NOT REMOVE"