Remove universal strips since they are almost all duplicated and the rest is useless.
This commit is contained in:
parent
10f6a1caa1
commit
9ec4a44953
5 changed files with 8 additions and 239 deletions
|
@ -1,137 +0,0 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
|
||||||
"""
|
|
||||||
The Universal comics only have some samples, but those samples are always the newest ones.
|
|
||||||
"""
|
|
||||||
import datetime
|
|
||||||
from re import compile, escape
|
|
||||||
from ..scraper import make_scraper
|
|
||||||
from ..util import tagre, getPageContent
|
|
||||||
|
|
||||||
|
|
||||||
# English names are spelled out here so that parsing does not depend on the
# process locale (strptime's %A and %B match the current locale's names).
_ENGLISH_WEEKDAYS = (
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    "sunday",
)
_ENGLISH_MONTHS = (
    "january", "february", "march", "april", "may", "june", "july",
    "august", "september", "october", "november", "december",
)


def parse_strdate(strdate):
    """Parse an English date string like "Sunday, November 11, 2012".

    The weekday and month names are always matched against their English
    spelling (case-insensitively), independent of the current locale.
    As with strptime, the weekday must be a valid name but is not checked
    against the calendar date.

    @param strdate: date in the form "<weekday>, <month> <day>, <year>"
    @return: datetime.datetime for the given day (time part is midnight)
    @raise ValueError: if the string does not have the expected form or
      names a day that does not exist
    """
    error = ValueError("time data %r does not match format %r" %
                       (strdate, "%A, %B %d, %Y"))
    parts = strdate.split(",")
    if len(parts) != 3:
        raise error
    weekday = parts[0].strip().lower()
    monthday = parts[1].split()
    year = parts[2].strip()
    if weekday not in _ENGLISH_WEEKDAYS or len(monthday) != 2:
        raise error
    monthname = monthday[0].lower()
    day = monthday[1]
    if monthname not in _ENGLISH_MONTHS:
        raise error
    if not (day.isdigit() and year.isdigit()):
        raise error
    month = _ENGLISH_MONTHS.index(monthname) + 1
    # datetime() itself raises ValueError for out-of-range day numbers.
    return datetime.datetime(int(year), month, int(day))
|
|
||||||
|
|
||||||
# Finds an <img> whose src points at assets.amuniversal.com and which is
# followed (after whitespace) by a "<h4>published" heading.
# NOTE(review): the parenthesised src pattern presumably captures the image
# URL; the exact group layout depends on tagre() - confirm there.
_imageSearch = compile(
    tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)')
    + r'\s+<h4>published'
)
|
|
||||||
|
|
||||||
def add(name, shortname):
    """Create a scraper class for one Universal comic and register it.

    The class is built with make_scraper() and stored in this module's
    globals under the name "Universal_<name>".

    @param name: comic name, used in the class name and the scraper name
    @param shortname: URL path of the comic below www.universaluclick.com
    """
    baseUrl = 'http://www.universaluclick.com%s' % shortname
    classname = 'Universal_%s' % name

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Build an image name from the publish date found on the page.

        The relevant page content looks like:
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>

        Returns the date formatted as YYYYMMDD, or None when no publish
        date follows the given image URL.
        """
        content = getPageContent(pageUrl, cls.session)[0]
        # The pattern is built per call since it embeds the concrete image URL.
        pattern = tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)'
        match = compile(pattern).search(content)
        if not match:
            return None
        return parse_strdate(match.group(1)).strftime("%Y%m%d")

    scraper = make_scraper(
        classname,
        name='Universal/' + name,
        url=baseUrl,
        stripUrl=baseUrl + '%s/',
        imageSearch=_imageSearch,
        multipleImagesPerStrip=True,
        prevSearch=None,
        help='Index format: none',
        namer=namer,
    )
    globals()[classname] = scraper
|
|
||||||
|
|
||||||
# do not edit anything below since these entries are generated from scripts/update.sh
|
|
||||||
# DO NOT REMOVE
|
|
||||||
#add('9ChickweedLane', '/comics/strip/9chickweedlane')
|
|
||||||
#add('AdamAtHome', '/comics/strip/adamathome')
|
|
||||||
#add('AlleyOop', '/comics/strip/alley-oop')
|
|
||||||
#add('ArloandJanis', '/comics/strip/arloandjanis')
|
|
||||||
#add('BadReporter', '/comics/badreporter')
|
|
||||||
#add('Baldo', '/comics/strip/baldo')
|
|
||||||
#add('Betty', '/comics/strip/betty')
|
|
||||||
#add('BigNate', '/comics/strip/bignate')
|
|
||||||
#add('Biographic', '/comics/strip/biographic')
|
|
||||||
add('Brevitystrip', '/comics/strip/brevity')
|
|
||||||
#add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
|
|
||||||
#add('Cathy', '/comics/strip/cathy')
|
|
||||||
#add('Cleats', '/comics/strip/cleats')
|
|
||||||
#add('ClosetoHome', '/comics/panel/closetohome')
|
|
||||||
#add('Cornered', '/comics/panel/cornered')
|
|
||||||
#add('CowandBoyClassics', '/comics/strip/cowandboy')
|
|
||||||
#add('CuldeSac', '/comics/strip/culdesac')
|
|
||||||
#add('Doonesbury', '/comics/strip/doonesbury')
|
|
||||||
#add('Drabble', '/comics/strip/drabble')
|
|
||||||
#add('FMinus', '/comics/strip/fminus')
|
|
||||||
#add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
|
|
||||||
#add('FoxTrot', '/comics/strip/foxtrot')
|
|
||||||
#add('FrankAndErnest', '/comics/strip/frankandernest')
|
|
||||||
#add('Frazz', '/comics/strip/frazz')
|
|
||||||
#add('FredBasset', '/comics/strip/fredbasset')
|
|
||||||
#add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
|
|
||||||
#add('Garfield', '/comics/strip/garfield')
|
|
||||||
#add('GetFuzzy', '/comics/strip/getfuzzy')
|
|
||||||
#add('GingerMeggs', '/comics/strip/gingermeggs')
|
|
||||||
#add('Graffiti', '/comics/panel/graffiti')
|
|
||||||
#add('GrandAvenue', '/comics/strip/grand-avenue')
|
|
||||||
#add('HealthCapsules', '/comics/panel/healthcapsules')
|
|
||||||
#add('HeartoftheCity', '/comics/strip/heartofthecity')
|
|
||||||
#add('Herman', '/comics/panel/herman')
|
|
||||||
#add('InkPen', '/comics/strip/inkpen')
|
|
||||||
#add('IntheBleachers', '/comics/panel/inthebleachers')
|
|
||||||
#add('IntheSticks', '/comics/strip/inthesticks')
|
|
||||||
#add('JumpStart', '/comics/strip/jumpstart')
|
|
||||||
#add('KidCity', '/comics/strip/kidcity')
|
|
||||||
#add('KidSpot', '/comics/panel/kidspot')
|
|
||||||
#add('KitNCarlyle', '/comics/panel/kitncarlyle')
|
|
||||||
#add('LaCucaracha', '/comics/strip/lacucaracha')
|
|
||||||
#add('Lio', '/comics/strip/lio')
|
|
||||||
#add('Lola', '/comics/strip/lola')
|
|
||||||
#add('Luann', '/comics/strip/luann')
|
|
||||||
add('MagicEye', '/comics/strip/magiceye')
|
|
||||||
#add('MagicinaMinute', '/comics/strip/magicinaminute')
|
|
||||||
#add('Marmaduke', '/comics/panel/marmaduke')
|
|
||||||
add('MerlinsWorldofMarvels', '/comics/strip/merlinsworldofmarvels')
|
|
||||||
#add('ModeratelyConfused', '/comics/panel/moderately-confused')
|
|
||||||
#add('Monty', '/comics/strip/monty')
|
|
||||||
#add('MuttAndJeff', '/comics/strip/muttandjeff')
|
|
||||||
#add('Nancy', '/comics/strip/nancy')
|
|
||||||
#add('NonSequitur', '/comics/strip/nonsequitur')
|
|
||||||
add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
|
|
||||||
#add('OfftheMark', '/comics/panel/offthemark')
|
|
||||||
#add('Overboard', '/comics/strip/overboard')
|
|
||||||
#add('OvertheHedge', '/comics/strip/overthehedge')
|
|
||||||
#add('Peanuts', '/comics/strip/peanuts')
|
|
||||||
#add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
|
|
||||||
#add('PoochCafe', '/comics/strip/poochcafe')
|
|
||||||
add('Portuguese', '/comics/category/portuguese')
|
|
||||||
#add('PricklyCity', '/comics/strip/pricklycity')
|
|
||||||
#add('RealLifeAdventures', '/comics/panel/reallifeadventures')
|
|
||||||
#add('RealityCheck', '/comics/panel/realitycheck')
|
|
||||||
#add('RedandRover', '/comics/strip/redandrover')
|
|
||||||
#add('RipHaywire', '/comics/strip/riphaywire')
|
|
||||||
#add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
|
|
||||||
#add('RoseisRose', '/comics/strip/roseisrose')
|
|
||||||
#add('RudyPark', '/comics/strip/rudypark')
|
|
||||||
#add('Shortcuts', '/comics/strip/shortcuts')
|
|
||||||
#add('SouptoNutz', '/comics/strip/soup-to-nutz')
|
|
||||||
#add('StoneSoup', '/comics/strip/stonesoup')
|
|
||||||
#add('TankMcNamara', '/comics/strip/tankmcnamara')
|
|
||||||
#add('Tarzan', '/comics/strip/tarzan')
|
|
||||||
#add('Thatababy', '/comics/strip/thatababy')
|
|
||||||
#add('TheArgyleSweater', '/comics/panel/theargylesweater')
|
|
||||||
#add('TheBornLoser', '/comics/strip/the-born-loser')
|
|
||||||
#add('TheBuckets', '/comics/strip/thebuckets')
|
|
||||||
#add('TheDinetteSet', '/comics/panel/dinetteset')
|
|
||||||
#add('TheDuplex', '/comics/strip/duplex')
|
|
||||||
#add('TheElderberries', '/comics/strip/theelderberries')
|
|
||||||
#add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
|
|
||||||
#add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
|
|
||||||
#add('TheGrizzwells', '/comics/strip/thegrizzwells')
|
|
||||||
#add('TheKnightLife', '/comics/strip/theknightlife')
|
|
||||||
#add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
|
|
||||||
#add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
|
|
||||||
#add('Ziggy', '/comics/panel/ziggy')
|
|
|
@ -1,8 +1,9 @@
|
||||||
#!/bin/sh -e
|
#!/bin/sh
|
||||||
|
set -e
|
||||||
set -u
|
set -u
|
||||||
|
|
||||||
d=$(dirname $0)
|
d=$(dirname $0)
|
||||||
for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
|
for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
|
||||||
echo "Executing ${script}.py"
|
echo "Executing ${script}.py"
|
||||||
"${d}/${script}.py"
|
"${d}/${script}.py"
|
||||||
done
|
done
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#!/bin/sh -e
|
#!/bin/sh
|
||||||
# Copyright (C) 2012 Bastian Kleineidam
|
# Copyright (C) 2012 Bastian Kleineidam
|
||||||
|
set -e
|
||||||
set -u
|
set -u
|
||||||
# generates a convenience test script from failed tests
|
# generates a convenience test script from failed tests
|
||||||
|
|
||||||
|
|
|
@ -1,97 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
|
||||||
"""
|
|
||||||
Script to get universal comics and save the info in a JSON file for further processing.
|
|
||||||
"""
|
|
||||||
from __future__ import print_function
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
|
||||||
from dosagelib.util import getPageContent, asciify, unescape
|
|
||||||
from dosagelib.scraper import get_scrapers
|
|
||||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
|
||||||
|
|
||||||
# Result file path: this script's own path with ".py" replaced by ".json".
json_file = __file__.replace(".py", ".json")

# Captures the link path and the link text of list entries such as:
# <li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(
    r'<li><a href="(/comics/[^"]+)">'
    r'([^<]+)</a>'
)

# Names (after normalisation in handle_url) that must not be registered.
exclude_comics = [
    "BusinessAndFinance",  # not a comic
    "ComicPanel",  # not a comic
    "ComicsAZ",  # not a comic
    "ComicStrip",  # not a comic
    "Espaol",  # not a comic
    "Family",  # not a comic
    "ForKids",  # not a comic
    "JamesBond",  # not a comic
    "Men",  # not a comic
    "NEA",  # not a comic
    "PeanutsPortuguese",  # not found
    "Pets",  # not a comic
    "SundayOnly",  # not a comic
    "WebExclusive",  # not a comic
    "Women",  # not a comic
]
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, res):
    """Read one comic listing page and add its entries to res.

    Every link matched by url_matcher is normalised to a comic name and
    stored as res[name] = path, unless the name is excluded or already
    present in res when compared case-insensitively. A page that cannot be
    fetched is reported on stderr and leaves res unchanged.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data, _baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname, title = match.groups()
        # Spell out '&' and '@' before reducing the title to ASCII.
        readable = unescape(title).replace('&', 'And').replace('@', 'At')
        name = capfirst(asciify(readable))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", name, file=sys.stderr)
        else:
            res[name] = shortname
|
|
||||||
|
|
||||||
|
|
||||||
def get_results():
    """Collect the comic list from universaluclick.com and save it as JSON."""
    # maps comic name -> URL path (shortname)
    comics = {}
    handle_url('http://www.universaluclick.com/comics/list', comics)
    save_result(comics, json_file)
|
|
||||||
|
|
||||||
|
|
||||||
def has_comic(name):
    """Tell whether a Creators/ or GoComics/ scraper with this name exists.

    Names are compared case-insensitively against every scraper class
    returned by get_scrapers().
    """
    candidates = (
        ("Creators/%s" % name).lower(),
        ("GoComics/%s" % name).lower(),
    )
    return any(
        scraperclass.get_name().lower() in candidates
        for scraperclass in get_scrapers()
    )
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
    """Print one add(...) line per stored comic, sorted by name.

    Excluded names are skipped. Lines for comics that has_comic() reports
    as already existing are prefixed with '#'. The args parameter is
    accepted but not used.
    """
    stored = load_result(json_file)
    for name, shortname in sorted(stored.items()):
        if name in exclude_comics:
            continue
        prefix = '#' if has_comic(name) else ''
        print("%sadd(%r, %r)" % (prefix, str(truncate_name(name)), str(shortname)))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # With any command line argument the stored results are printed;
    # without arguments the comic list is fetched and saved.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()
|
|
|
@ -1,11 +1,12 @@
|
||||||
#!/bin/sh -e
|
#!/bin/sh
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||||
|
set -e
|
||||||
set -u
|
set -u
|
||||||
|
|
||||||
mincomics=100
|
mincomics=100
|
||||||
d=$(dirname $0)
|
d=$(dirname $0)
|
||||||
|
|
||||||
for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
|
for script in creators gocomics drunkduck keenspot smackjeeves arcamax; do
|
||||||
target="${d}/../dosagelib/plugins/${script}.py"
|
target="${d}/../dosagelib/plugins/${script}.py"
|
||||||
echo "Upating $target"
|
echo "Upating $target"
|
||||||
"${d}/removeafter.py" "$target" "# DO NOT REMOVE"
|
"${d}/removeafter.py" "$target" "# DO NOT REMOVE"
|
||||||
|
|
Loading…
Reference in a new issue