Remove the universal strips: almost all of them duplicate existing Creators or GoComics modules, and the rest are useless.

Bastian Kleineidam 2013-02-12 20:56:02 +01:00
parent 10f6a1caa1
commit 9ec4a44953
5 changed files with 8 additions and 239 deletions

@@ -1,137 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import tagre, getPageContent
def parse_strdate(strdate):
    """Parse date string. XXX this is locale dependent but it should not be."""
    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")

_imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published')

def add(name, shortname):
    url = 'http://www.universaluclick.com%s' % shortname
    classname = 'Universal_%s' % name

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Parse publish date from page content which looks like:
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>
        """
        data = getPageContent(pageUrl, cls.session)[0]
        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
        mo = ro.search(data)
        if mo:
            strdate = mo.group(1)
            return parse_strdate(strdate).strftime("%Y%m%d")

    globals()[classname] = make_scraper(classname,
        name='Universal/' + name,
        url = url,
        stripUrl = url + '%s/',
        imageSearch = _imageSearch,
        multipleImagesPerStrip = True,
        prevSearch = None,
        help = 'Index format: none',
        namer = namer,
    )
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/comics/strip/9chickweedlane')
#add('AdamAtHome', '/comics/strip/adamathome')
#add('AlleyOop', '/comics/strip/alley-oop')
#add('ArloandJanis', '/comics/strip/arloandjanis')
#add('BadReporter', '/comics/badreporter')
#add('Baldo', '/comics/strip/baldo')
#add('Betty', '/comics/strip/betty')
#add('BigNate', '/comics/strip/bignate')
#add('Biographic', '/comics/strip/biographic')
add('Brevitystrip', '/comics/strip/brevity')
#add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
#add('Cathy', '/comics/strip/cathy')
#add('Cleats', '/comics/strip/cleats')
#add('ClosetoHome', '/comics/panel/closetohome')
#add('Cornered', '/comics/panel/cornered')
#add('CowandBoyClassics', '/comics/strip/cowandboy')
#add('CuldeSac', '/comics/strip/culdesac')
#add('Doonesbury', '/comics/strip/doonesbury')
#add('Drabble', '/comics/strip/drabble')
#add('FMinus', '/comics/strip/fminus')
#add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
#add('FoxTrot', '/comics/strip/foxtrot')
#add('FrankAndErnest', '/comics/strip/frankandernest')
#add('Frazz', '/comics/strip/frazz')
#add('FredBasset', '/comics/strip/fredbasset')
#add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
#add('Garfield', '/comics/strip/garfield')
#add('GetFuzzy', '/comics/strip/getfuzzy')
#add('GingerMeggs', '/comics/strip/gingermeggs')
#add('Graffiti', '/comics/panel/graffiti')
#add('GrandAvenue', '/comics/strip/grand-avenue')
#add('HealthCapsules', '/comics/panel/healthcapsules')
#add('HeartoftheCity', '/comics/strip/heartofthecity')
#add('Herman', '/comics/panel/herman')
#add('InkPen', '/comics/strip/inkpen')
#add('IntheBleachers', '/comics/panel/inthebleachers')
#add('IntheSticks', '/comics/strip/inthesticks')
#add('JumpStart', '/comics/strip/jumpstart')
#add('KidCity', '/comics/strip/kidcity')
#add('KidSpot', '/comics/panel/kidspot')
#add('KitNCarlyle', '/comics/panel/kitncarlyle')
#add('LaCucaracha', '/comics/strip/lacucaracha')
#add('Lio', '/comics/strip/lio')
#add('Lola', '/comics/strip/lola')
#add('Luann', '/comics/strip/luann')
add('MagicEye', '/comics/strip/magiceye')
#add('MagicinaMinute', '/comics/strip/magicinaminute')
#add('Marmaduke', '/comics/panel/marmaduke')
add('MerlinsWorldofMarvels', '/comics/strip/merlinsworldofmarvels')
#add('ModeratelyConfused', '/comics/panel/moderately-confused')
#add('Monty', '/comics/strip/monty')
#add('MuttAndJeff', '/comics/strip/muttandjeff')
#add('Nancy', '/comics/strip/nancy')
#add('NonSequitur', '/comics/strip/nonsequitur')
add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
#add('OfftheMark', '/comics/panel/offthemark')
#add('Overboard', '/comics/strip/overboard')
#add('OvertheHedge', '/comics/strip/overthehedge')
#add('Peanuts', '/comics/strip/peanuts')
#add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
#add('PoochCafe', '/comics/strip/poochcafe')
add('Portuguese', '/comics/category/portuguese')
#add('PricklyCity', '/comics/strip/pricklycity')
#add('RealLifeAdventures', '/comics/panel/reallifeadventures')
#add('RealityCheck', '/comics/panel/realitycheck')
#add('RedandRover', '/comics/strip/redandrover')
#add('RipHaywire', '/comics/strip/riphaywire')
#add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
#add('RoseisRose', '/comics/strip/roseisrose')
#add('RudyPark', '/comics/strip/rudypark')
#add('Shortcuts', '/comics/strip/shortcuts')
#add('SouptoNutz', '/comics/strip/soup-to-nutz')
#add('StoneSoup', '/comics/strip/stonesoup')
#add('TankMcNamara', '/comics/strip/tankmcnamara')
#add('Tarzan', '/comics/strip/tarzan')
#add('Thatababy', '/comics/strip/thatababy')
#add('TheArgyleSweater', '/comics/panel/theargylesweater')
#add('TheBornLoser', '/comics/strip/the-born-loser')
#add('TheBuckets', '/comics/strip/thebuckets')
#add('TheDinetteSet', '/comics/panel/dinetteset')
#add('TheDuplex', '/comics/strip/duplex')
#add('TheElderberries', '/comics/strip/theelderberries')
#add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
#add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
#add('TheGrizzwells', '/comics/strip/thegrizzwells')
#add('TheKnightLife', '/comics/strip/theknightlife')
#add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
#add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
#add('Ziggy', '/comics/panel/ziggy')
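
For reference, the date logic of the removed namer can be reproduced in isolation. This is a minimal editorial sketch, not part of the codebase: the sample markup comes from the docstring above, and a plain inline regex stands in for the tagre() helper.

# Sketch of the removed namer's date extraction (simplified regex in place of tagre()).
import datetime
import re

html = ('<img alt="Marmaduke" src="http://assets.amuniversal.com/'
        '07e7f270fa08012ff506001dd8b71c47" />\n'
        '<h4>published: Sunday, November 11, 2012</h4>')

mo = re.search(r'<img[^>]*>\s+<h4>published: ([^<]+)', html)
if mo:
    # %A and %B only match English day and month names, which is why the
    # XXX comment calls parse_strdate() locale dependent.
    date = datetime.datetime.strptime(mo.group(1), "%A, %B %d, %Y")
    print(date.strftime("%Y%m%d"))  # 20121111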

@@ -1,8 +1,9 @@
-#!/bin/sh -e
+#!/bin/sh
+set -e
 set -u
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
   echo "Executing ${script}.py"
   "${d}/${script}.py"
 done

@@ -1,5 +1,6 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012 Bastian Kleineidam
+set -e
 set -u
 # generates a convenience test script from failed tests

@@ -1,97 +0,0 @@
#!/usr/bin/env python
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Script to get universal comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')

# names of comics to exclude
exclude_comics = [
    "BusinessAndFinance", # not a comic
    "ComicPanel", # not a comic
    "ComicsAZ", # not a comic
    "ComicStrip", # not a comic
    "Espaol", # not a comic
    "Family", # not a comic
    "ForKids", # not a comic
    "JamesBond", # not a comic
    "Men", # not a comic
    "NEA", # not a comic
    "PeanutsPortuguese", # not found
    "Pets", # not a comic
    "SundayOnly", # not a comic
    "WebExclusive", # not a comic
    "Women", # not a comic
]

def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname

def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    handle_url('http://www.universaluclick.com/comics/list', res)
    save_result(res, json_file)

def has_comic(name):
    """Check if a scraper for this comic name already exists."""
    cname = ("Creators/%s" % name).lower()
    gname = ("GoComics/%s" % name).lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname or lname == gname:
            return True
    return False

def print_results(args):
    """Print an add() call for each found comic, commenting out entries that duplicate an existing Creators or GoComics scraper."""
    for name, shortname in sorted(load_result(json_file).items()):
        if name in exclude_comics:
            continue
        if has_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r)" % (prefix, str(truncate_name(name)), str(shortname)))

if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
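
The name normalization in handle_url() can be traced with the sample link from the comment near the top of the file. A minimal sketch: plain standard-library calls substitute for dosagelib's asciify() and capfirst() helpers, whose exact behavior is assumed here; unescape() is skipped because the sample contains no HTML entities.

# Sketch of handle_url()'s name normalization on one sample line.
import re

line = '<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>'
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')

match = url_matcher.search(line)
shortname = match.group(1)                    # '/comics/strip/9chickweedlane'
name = match.group(2).replace('&', 'And').replace('@', 'At')
# asciify() presumably drops anything outside [0-9a-zA-Z_], which is
# how 'Español' ends up as 'Espaol' in the exclude list above.
name = re.sub(r'[^0-9a-zA-Z_]', '', name)
name = name[0].upper() + name[1:]             # capfirst()
print(name, shortname)                        # 9ChickweedLane /comics/strip/9chickweedlane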

@@ -1,11 +1,12 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012-2013 Bastian Kleineidam
+set -e
 set -u
 mincomics=100
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
   target="${d}/../dosagelib/plugins/${script}.py"
   echo "Updating $target"
   "${d}/removeafter.py" "$target" "# DO NOT REMOVE"