Remove universal strips, since almost all of them are duplicates and the rest are useless.

Bastian Kleineidam 2013-02-12 20:56:02 +01:00
parent 10f6a1caa1
commit 9ec4a44953
5 changed files with 8 additions and 239 deletions

@@ -1,137 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape

from ..scraper import make_scraper
from ..util import tagre, getPageContent


def parse_strdate(strdate):
    """Parse date string. XXX this is locale dependent but it should not be."""
    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")


_imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published')


def add(name, shortname):
    url = 'http://www.universaluclick.com%s' % shortname
    classname = 'Universal_%s' % name

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Parse publish date from page content which looks like:
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>
        """
        data = getPageContent(pageUrl, cls.session)[0]
        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
        mo = ro.search(data)
        if mo:
            strdate = mo.group(1)
            return parse_strdate(strdate).strftime("%Y%m%d")

    globals()[classname] = make_scraper(classname,
        name='Universal/' + name,
        url=url,
        stripUrl=url + '%s/',
        imageSearch=_imageSearch,
        multipleImagesPerStrip=True,
        prevSearch=None,
        help='Index format: none',
        namer=namer,
    )
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/comics/strip/9chickweedlane')
#add('AdamAtHome', '/comics/strip/adamathome')
#add('AlleyOop', '/comics/strip/alley-oop')
#add('ArloandJanis', '/comics/strip/arloandjanis')
#add('BadReporter', '/comics/badreporter')
#add('Baldo', '/comics/strip/baldo')
#add('Betty', '/comics/strip/betty')
#add('BigNate', '/comics/strip/bignate')
#add('Biographic', '/comics/strip/biographic')
add('Brevitystrip', '/comics/strip/brevity')
#add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
#add('Cathy', '/comics/strip/cathy')
#add('Cleats', '/comics/strip/cleats')
#add('ClosetoHome', '/comics/panel/closetohome')
#add('Cornered', '/comics/panel/cornered')
#add('CowandBoyClassics', '/comics/strip/cowandboy')
#add('CuldeSac', '/comics/strip/culdesac')
#add('Doonesbury', '/comics/strip/doonesbury')
#add('Drabble', '/comics/strip/drabble')
#add('FMinus', '/comics/strip/fminus')
#add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
#add('FoxTrot', '/comics/strip/foxtrot')
#add('FrankAndErnest', '/comics/strip/frankandernest')
#add('Frazz', '/comics/strip/frazz')
#add('FredBasset', '/comics/strip/fredbasset')
#add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
#add('Garfield', '/comics/strip/garfield')
#add('GetFuzzy', '/comics/strip/getfuzzy')
#add('GingerMeggs', '/comics/strip/gingermeggs')
#add('Graffiti', '/comics/panel/graffiti')
#add('GrandAvenue', '/comics/strip/grand-avenue')
#add('HealthCapsules', '/comics/panel/healthcapsules')
#add('HeartoftheCity', '/comics/strip/heartofthecity')
#add('Herman', '/comics/panel/herman')
#add('InkPen', '/comics/strip/inkpen')
#add('IntheBleachers', '/comics/panel/inthebleachers')
#add('IntheSticks', '/comics/strip/inthesticks')
#add('JumpStart', '/comics/strip/jumpstart')
#add('KidCity', '/comics/strip/kidcity')
#add('KidSpot', '/comics/panel/kidspot')
#add('KitNCarlyle', '/comics/panel/kitncarlyle')
#add('LaCucaracha', '/comics/strip/lacucaracha')
#add('Lio', '/comics/strip/lio')
#add('Lola', '/comics/strip/lola')
#add('Luann', '/comics/strip/luann')
add('MagicEye', '/comics/strip/magiceye')
#add('MagicinaMinute', '/comics/strip/magicinaminute')
#add('Marmaduke', '/comics/panel/marmaduke')
add('MerlinsWorldofMarvels', '/comics/strip/merlinsworldofmarvels')
#add('ModeratelyConfused', '/comics/panel/moderately-confused')
#add('Monty', '/comics/strip/monty')
#add('MuttAndJeff', '/comics/strip/muttandjeff')
#add('Nancy', '/comics/strip/nancy')
#add('NonSequitur', '/comics/strip/nonsequitur')
add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
#add('OfftheMark', '/comics/panel/offthemark')
#add('Overboard', '/comics/strip/overboard')
#add('OvertheHedge', '/comics/strip/overthehedge')
#add('Peanuts', '/comics/strip/peanuts')
#add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
#add('PoochCafe', '/comics/strip/poochcafe')
add('Portuguese', '/comics/category/portuguese')
#add('PricklyCity', '/comics/strip/pricklycity')
#add('RealLifeAdventures', '/comics/panel/reallifeadventures')
#add('RealityCheck', '/comics/panel/realitycheck')
#add('RedandRover', '/comics/strip/redandrover')
#add('RipHaywire', '/comics/strip/riphaywire')
#add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
#add('RoseisRose', '/comics/strip/roseisrose')
#add('RudyPark', '/comics/strip/rudypark')
#add('Shortcuts', '/comics/strip/shortcuts')
#add('SouptoNutz', '/comics/strip/soup-to-nutz')
#add('StoneSoup', '/comics/strip/stonesoup')
#add('TankMcNamara', '/comics/strip/tankmcnamara')
#add('Tarzan', '/comics/strip/tarzan')
#add('Thatababy', '/comics/strip/thatababy')
#add('TheArgyleSweater', '/comics/panel/theargylesweater')
#add('TheBornLoser', '/comics/strip/the-born-loser')
#add('TheBuckets', '/comics/strip/thebuckets')
#add('TheDinetteSet', '/comics/panel/dinetteset')
#add('TheDuplex', '/comics/strip/duplex')
#add('TheElderberries', '/comics/strip/theelderberries')
#add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
#add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
#add('TheGrizzwells', '/comics/strip/thegrizzwells')
#add('TheKnightLife', '/comics/strip/theknightlife')
#add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
#add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
#add('Ziggy', '/comics/panel/ziggy')
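
Note: the XXX in parse_strdate above is warranted: strptime with %A and %B matches weekday and month names in the current locale, so the parse breaks under a non-English locale. A locale-independent variant could map the English names explicitly. A minimal sketch, not part of this commit:

import datetime

_MONTHS = {name: num for num, name in enumerate(
    ("January", "February", "March", "April", "May", "June", "July",
     "August", "September", "October", "November", "December"), 1)}

def parse_strdate(strdate):
    """Parse 'Sunday, November 11, 2012' without depending on the locale."""
    _weekday, rest = strdate.split(", ", 1)
    monthday, year = rest.rsplit(", ", 1)
    month, day = monthday.split(" ", 1)
    return datetime.datetime(int(year), _MONTHS[month], int(day))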

@@ -1,8 +1,9 @@
-#!/bin/sh -e
+#!/bin/sh
+set -e
 set -u
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
 echo "Executing ${script}.py"
 "${d}/${script}.py"
 done
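
Note: moving -e off the #! line (here and in the two scripts below) is the portable form. Flags on the shebang line are lost when a script is run as "sh script.sh" rather than executed directly, while set -e in the body takes effect under any invocation.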

@@ -1,5 +1,6 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012 Bastian Kleineidam
+set -e
 set -u
 # generates a convenience test script from failed tests

@@ -1,97 +0,0 @@
#!/usr/bin/env python
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Script to get universal comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name

json_file = __file__.replace(".py", ".json")

#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')

# names of comics to exclude
exclude_comics = [
    "BusinessAndFinance", # not a comic
    "ComicPanel", # not a comic
    "ComicsAZ", # not a comic
    "ComicStrip", # not a comic
    "Espaol", # not a comic
    "Family", # not a comic
    "ForKids", # not a comic
    "JamesBond", # not a comic
    "Men", # not a comic
    "NEA", # not a comic
    "PeanutsPortuguese", # not found
    "Pets", # not a comic
    "SundayOnly", # not a comic
    "WebExclusive", # not a comic
    "Women", # not a comic
]


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname


def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    handle_url('http://www.universaluclick.com/comics/list', res)
    save_result(res, json_file)


def has_comic(name):
    """Check if comic name already exists."""
    cname = ("Creators/%s" % name).lower()
    gname = ("GoComics/%s" % name).lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname or lname == gname:
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    for name, shortname in sorted(load_result(json_file).items()):
        if name in exclude_comics:
            continue
        if has_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r)" % (prefix, str(truncate_name(name)), str(shortname)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
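
Note: the entries in the removed plugin above were mostly commented out because has_comic found the same strip under Creators/ or GoComics/, which is exactly the duplication this commit eliminates. A minimal illustration of that check, with hypothetical names standing in for the real get_scrapers() results:

# Stand-ins for the lowercased names of the real scraper classes (hypothetical).
existing_names = ["creators/heathcliff", "gocomics/garfield"]

def has_comic(name):
    """True if a Creators/ or GoComics/ scraper with this name exists."""
    cname = ("Creators/%s" % name).lower()
    gname = ("GoComics/%s" % name).lower()
    return any(lname in (cname, gname) for lname in existing_names)

for name, shortname in [("Garfield", "/comics/strip/garfield"),
                        ("MagicEye", "/comics/strip/magiceye")]:
    prefix = '#' if has_comic(name) else ''
    print("%sadd(%r, %r)" % (prefix, name, shortname))
# prints:
# #add('Garfield', '/comics/strip/garfield')
# add('MagicEye', '/comics/strip/magiceye')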

@@ -1,11 +1,12 @@
-#!/bin/sh -e
+#!/bin/sh
 # Copyright (C) 2012-2013 Bastian Kleineidam
+set -e
 set -u
 mincomics=100
 d=$(dirname $0)
-for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
+for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
 target="${d}/../dosagelib/plugins/${script}.py"
 echo "Updating $target"
 "${d}/removeafter.py" "$target" "# DO NOT REMOVE"