Fix some comics and add language tag.

Bastian Kleineidam 2013-03-08 22:33:05 +01:00
parent b368f125bc
commit 88e28f3923
12 changed files with 273 additions and 29 deletions

dosage

@@ -18,7 +18,7 @@ from collections import OrderedDict
 from dosagelib import events, scraper
 from dosagelib.output import out
-from dosagelib.util import internal_error, getDirname, strlimit
+from dosagelib.util import internal_error, getDirname, strlimit, getLangName
 from dosagelib.ansicolor import get_columns
 from dosagelib.configuration import App, Freeware, Copyright, SupportUrl
@@ -118,7 +118,7 @@ def saveComicStrip(strip, basepath):
             filename, saved = image.save(basepath)
             if saved:
                 allskipped = False
-        except IOError as msg:
+        except Exception as msg:
             out.error('Could not save image at %s to %s: %s' % (image.referrer, image.filename, msg))
             errors += 1
     return errors, allskipped
@@ -126,21 +126,19 @@ def saveComicStrip(strip, basepath):

 def displayHelp(comics):
     """Print help for comic strips."""
-    try:
-        for scraperobj in getScrapers(comics):
-            displayComicHelp(scraperobj)
-    except Exception as msg:
-        out.error(msg)
-        return 1
+    for scraperobj in getScrapers(comics):
+        displayComicHelp(scraperobj)
     return 0


 def displayComicHelp(scraperobj):
     """Print description and help for a comic."""
-    out.context = scraperobj.getName()
+    out.context = getScraperName(scraperobj)
     try:
         if scraperobj.description:
             out.info("Description: " + scraperobj.description)
+        if scraperobj.lang:
+            out.info("Language: " + getLangName(scraperobj.lang))
         if scraperobj.help:
             for line in scraperobj.help.splitlines():
                 out.info(line)
@@ -157,9 +155,6 @@ def getComics(options):
     try:
         for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
             errors += getStrips(scraperobj, options)
-    except Exception as msg:
-        out.error(msg)
-        errors += 1
     finally:
         out.context = ''
         events.getHandler().end()
@@ -199,7 +194,7 @@ def run(options):
     if options.list:
         return doList()
     if options.singlelist:
-        return doList(columnList=False)
+        return doList(columnList=False, verbose=options.verbose)
     # after this a list of comic strips is needed
     if not options.comic:
         out.warn('No comics specified, bailing out!')
@@ -209,26 +204,26 @@ def run(options):
     return getComics(options)


-def doList(columnList=True):
+def doList(columnList=True, verbose=False):
     """List available comics."""
     out.info('Available comic scrapers:')
     out.info('Comics marked with [A] require age confirmation with the --adult option.')
     scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
-    try:
-        if columnList:
-            num = doColumnList(scrapers)
-        else:
-            num = doSingleList(scrapers)
-        out.info('%d supported comics.' % num)
-    except IOError:
-        pass
+    if columnList:
+        num = doColumnList(scrapers)
+    else:
+        num = doSingleList(scrapers, verbose=verbose)
+    out.info('%d supported comics.' % num)
     return 0


-def doSingleList(scrapers):
+def doSingleList(scrapers, verbose=False):
     """Get list of scraper names, one per line."""
     for num, scraperobj in enumerate(scrapers):
-        print(getScraperName(scraperobj))
+        if verbose:
+            displayComicHelp(scraperobj)
+        else:
+            print(getScraperName(scraperobj))
     return num

dosagelib/languages.py (new file)

@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
# ISO 639-1 language codes from pycountry
Iso2Language = {
u'aa': u'Afar',
u'ab': u'Abkhazian',
u'af': u'Afrikaans',
u'ak': u'Akan',
u'sq': u'Albanian',
u'am': u'Amharic',
u'ar': u'Arabic',
u'an': u'Aragonese',
u'hy': u'Armenian',
u'as': u'Assamese',
u'av': u'Avaric',
u'ae': u'Avestan',
u'ay': u'Aymara',
u'az': u'Azerbaijani',
u'ba': u'Bashkir',
u'bm': u'Bambara',
u'eu': u'Basque',
u'be': u'Belarusian',
u'bn': u'Bengali',
u'bh': u'Bihari languages',
u'bi': u'Bislama',
u'bs': u'Bosnian',
u'br': u'Breton',
u'bg': u'Bulgarian',
u'my': u'Burmese',
u'ca': u'Catalan; Valencian',
u'ch': u'Chamorro',
u'ce': u'Chechen',
u'zh': u'Chinese',
u'cu': u'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic',
u'cv': u'Chuvash',
u'kw': u'Cornish',
u'co': u'Corsican',
u'cr': u'Cree',
u'cs': u'Czech',
u'da': u'Danish',
u'dv': u'Divehi; Dhivehi; Maldivian',
u'nl': u'Dutch; Flemish',
u'dz': u'Dzongkha',
u'en': u'English',
u'eo': u'Esperanto',
u'et': u'Estonian',
u'ee': u'Ewe',
u'fo': u'Faroese',
u'fj': u'Fijian',
u'fi': u'Finnish',
u'fr': u'French',
u'fy': u'Western Frisian',
u'ff': u'Fulah',
u'ka': u'Georgian',
u'de': u'German',
u'gd': u'Gaelic; Scottish Gaelic',
u'ga': u'Irish',
u'gl': u'Galician',
u'gv': u'Manx',
u'el': u'Greek, Modern (1453-)',
u'gn': u'Guarani',
u'gu': u'Gujarati',
u'ht': u'Haitian; Haitian Creole',
u'ha': u'Hausa',
u'he': u'Hebrew',
u'hz': u'Herero',
u'hi': u'Hindi',
u'ho': u'Hiri Motu',
u'hr': u'Croatian',
u'hu': u'Hungarian',
u'ig': u'Igbo',
u'is': u'Icelandic',
u'io': u'Ido',
u'ii': u'Sichuan Yi; Nuosu',
u'iu': u'Inuktitut',
u'ie': u'Interlingue; Occidental',
u'ia': u'Interlingua (International Auxiliary Language Association)',
u'id': u'Indonesian',
u'ik': u'Inupiaq',
u'it': u'Italian',
u'jv': u'Javanese',
u'ja': u'Japanese',
u'kl': u'Kalaallisut; Greenlandic',
u'kn': u'Kannada',
u'ks': u'Kashmiri',
u'kr': u'Kanuri',
u'kk': u'Kazakh',
u'km': u'Central Khmer',
u'ki': u'Kikuyu; Gikuyu',
u'rw': u'Kinyarwanda',
u'ky': u'Kirghiz; Kyrgyz',
u'kv': u'Komi',
u'kg': u'Kongo',
u'ko': u'Korean',
u'kj': u'Kuanyama; Kwanyama',
u'ku': u'Kurdish',
u'lo': u'Lao',
u'la': u'Latin',
u'lv': u'Latvian',
u'li': u'Limburgan; Limburger; Limburgish',
u'ln': u'Lingala',
u'lt': u'Lithuanian',
u'lb': u'Luxembourgish; Letzeburgesch',
u'lu': u'Luba-Katanga',
u'lg': u'Ganda',
u'mk': u'Macedonian',
u'mh': u'Marshallese',
u'ml': u'Malayalam',
u'mi': u'Maori',
u'mr': u'Marathi',
u'ms': u'Malay',
u'mg': u'Malagasy',
u'mt': u'Maltese',
u'mo': u'Moldavian; Moldovan',
u'mn': u'Mongolian',
u'na': u'Nauru',
u'nv': u'Navajo; Navaho',
u'nr': u'Ndebele, South; South Ndebele',
u'nd': u'Ndebele, North; North Ndebele',
u'ng': u'Ndonga',
u'ne': u'Nepali',
u'nn': u'Norwegian Nynorsk; Nynorsk, Norwegian',
u'nb': u'Bokm\xe5l, Norwegian; Norwegian Bokm\xe5l',
u'no': u'Norwegian',
u'ny': u'Chichewa; Chewa; Nyanja',
u'oc': u'Occitan (post 1500)',
u'oj': u'Ojibwa',
u'or': u'Oriya',
u'om': u'Oromo',
u'os': u'Ossetian; Ossetic',
u'pa': u'Panjabi; Punjabi',
u'fa': u'Persian',
u'pi': u'Pali',
u'pl': u'Polish',
u'pt': u'Portuguese',
u'ps': u'Pushto; Pashto',
u'qu': u'Quechua',
u'rm': u'Romansh',
u'ro': u'Romanian',
u'rn': u'Rundi',
u'ru': u'Russian',
u'sg': u'Sango',
u'sa': u'Sanskrit',
u'si': u'Sinhala; Sinhalese',
u'sk': u'Slovak',
u'sl': u'Slovenian',
u'se': u'Northern Sami',
u'sm': u'Samoan',
u'sn': u'Shona',
u'sd': u'Sindhi',
u'so': u'Somali',
u'st': u'Sotho, Southern',
u'es': u'Spanish; Castilian',
u'sc': u'Sardinian',
u'sr': u'Serbian',
u'ss': u'Swati',
u'su': u'Sundanese',
u'sw': u'Swahili',
u'sv': u'Swedish',
u'ty': u'Tahitian',
u'ta': u'Tamil',
u'tt': u'Tatar',
u'te': u'Telugu',
u'tg': u'Tajik',
u'tl': u'Tagalog',
u'th': u'Thai',
u'bo': u'Tibetan',
u'ti': u'Tigrinya',
u'to': u'Tonga (Tonga Islands)',
u'tn': u'Tswana',
u'ts': u'Tsonga',
u'tk': u'Turkmen',
u'tr': u'Turkish',
u'tw': u'Twi',
u'ug': u'Uighur; Uyghur',
u'uk': u'Ukrainian',
u'ur': u'Urdu',
u'uz': u'Uzbek',
u've': u'Venda',
u'vi': u'Vietnamese',
u'vo': u'Volap\xfck',
u'cy': u'Welsh',
u'wa': u'Walloon',
u'wo': u'Wolof',
u'xh': u'Xhosa',
u'yi': u'Yiddish',
u'yo': u'Yoruba',
u'za': u'Zhuang; Chuang',
u'zu': u'Zulu',
}


@@ -54,6 +54,7 @@ class AhoiPolloi(_BasicScraper):
     stripUrl = url + '?day=%s'
     firstStripUrl = stripUrl % '20060305'
     multipleImagesPerStrip = True
+    lang = 'de'
     imageSearch = compile(tagre('img', 'src', r'(/static/antville/ahoipolloi/images/[^"]+)'))
     prevSearch = compile(tagre('a', 'href', r'(http://ahoipolloi\.blogger\.de/\?day=\d+)'))
     help = 'Index format: yyyymmdd'
@@ -98,6 +99,7 @@ class AlphaLuna(_BasicScraper):
 class AlphaLunaSpanish(AlphaLuna):
     name = 'AlphaLuna/Spanish'
+    lang = 'es'
     url = 'http://alphaluna.net/spanish/'
     stripUrl = url + 'issue-%s/'


@@ -172,7 +172,7 @@ class BratHalla(_BasicScraper):
 class BrentalFloss(_BasicScraper):
-    url = 'http://www.brentalflossthecomic.com/'
+    url = 'http://brentalflossthecomic.com/'
     stripUrl = url + '?id=%s'
     imageSearch = compile(tagre("img", "src", r'([^"]*/img/comic/[^"]*)'))
     prevSearch = compile(tagre("a", "href", r'([^"]*)') + "Prev")


@@ -15,6 +15,7 @@ def add(name, path):
         name = 'Creators/' + name,
         url = baseurl + path + '.html',
         stripUrl = baseurl + path + '/%s.html',
+        lang = 'es' if name.lower().endswith('spanish') else 'en',
         imageSearch = _imageSearch,
         prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) +
                              tagre("img", "src", r'/img_comics/arrow_l\.gif')),


@@ -43,6 +43,7 @@ def add(name, path):
         stripUrl = _url + '%s/',
         imageSearch = _imageSearch,
         prevSearch = _prevSearch,
+        lang = 'es' if name.lower().endswith('spanish') else 'en',
         help = 'Index format: n (unpadded)',
         namer = _namer,
     )


@@ -90,6 +90,7 @@ class NekoTheKitty(_BasicScraper):
 class NichtLustig(_BasicScraper):
     url = 'http://www.nichtlustig.de/main.html'
     stripUrl = 'http://static.nichtlustig.de/toondb/%s.html'
+    lang = 'de'
     imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)')
     prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)'))
     help = 'Index format: yymmdd'


@@ -57,6 +57,7 @@ def add(name, url, description, adult, bounce):
         prevSearch = _prevSearch,
         prevUrlMatchesStripUrl = not adult,
         description = description,
+        lang = 'es' if name.lower().endswith('spanish') else 'en',
         help = 'Index format: nnnn (some increasing number)',
         namer = namer,
     )


@@ -50,11 +50,27 @@ class ZombieHunters(_BasicScraper):
 class Zwarwald(_BasicScraper):
     url = "http://www.zwarwald.de/"
     stripUrl = url + 'index.php/page/%s/'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.zwarwald\.de/images/\d+/\d+/[^"]+)'))
+    # anything before page 495 seems to be flash
+    firstStripUrl = stripUrl % '495'
+    lang = 'de'
+    imageSearch = compile(tagre("img", "src", r'(http://(?:www\.zwarwald\.de|wp1163540.wp190.webpack.hosteurope.de/wordpress)/images/\d+/\d+/[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(http://www\.zwarwald\.de/index\.php/page/\d+/)') +
                          tagre("img", "src", r'http://zwarwald\.de/images/prev\.jpg', quote="'"))
     help = 'Index format: number'
     waitSeconds = 1

     def shouldSkipUrl(self, url):
-        return url in (self.stripUrl % "112",)
+        """Some pages have flash content."""
+        return url in (
+            self.stripUrl % "112",
+            self.stripUrl % "222",
+            self.stripUrl % "223",
+            self.stripUrl % "246",
+            self.stripUrl % "368",
+            self.stripUrl % '495',
+        )
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        prefix, year, month, name = imageUrl.rsplit('/', 3)
+        return "%s_%s_%s" % (year, month, name)


@@ -33,6 +33,9 @@ class _BasicScraper(object):
     # a description of the comic contents
     description = ''

+    # language of the comic (two-letter ISO 639-1 code)
+    lang = 'en'
+
     # compiled regular expression that will locate the URL for the previous strip in a page
     prevSearch = None
@@ -42,7 +45,7 @@ class _BasicScraper(object):
     # usually the index format help
     help = ''

-    # wait time before downloading any pages or images
+    # wait time between downloading comic strips
     waitSeconds = 0

     # HTTP session storing cookies
@@ -137,7 +140,7 @@ class _BasicScraper(object):
                 out.warn("Already seen previous URL %r" % prevUrl)
                 break
             url = prevUrl
-            if self.waitSeconds:
+            if url and self.waitSeconds:
                 time.sleep(self.waitSeconds)

     def getPrevUrl(self, url, data, baseUrl):


@@ -17,6 +17,7 @@ from htmlentitydefs import name2codepoint
 from .decorators import memoized
 from .output import out
 from .configuration import UserAgent, AppName, App, SupportUrl
+from .languages import Iso2Language

 # Maximum content size for HTML pages
 MaxContentBytes = 1024 * 1024 * 2  # 2 MB
@@ -462,3 +463,8 @@ def strlimit (s, length=72):
     if length == 0:
         return ""
     return "%s..." % s[:length]
+
+
+def getLangName(code):
+    """Get name of language specified by ISO 639-1 code."""
+    return Iso2Language[code]
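
A minimal usage sketch (not part of the commit) showing how the new helper ties into the lang attribute; both values come straight from the Iso2Language table added above:

# Illustrative only: getLangName() is a plain dictionary lookup, so valid
# ISO 639-1 codes map to readable names and unknown codes raise KeyError.
from dosagelib.util import getLangName

print(getLangName('de'))   # -> German (used by AhoiPolloi, NichtLustig, Zwarwald)
print(getLangName('es'))   # -> Spanish; Castilian (used by the */Spanish variants)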

scripts/mklanguages.py (new, executable file)

@@ -0,0 +1,29 @@
#!/usr/bin/python
# update languages.py from pycountry
import os
import codecs
import pycountry

basepath = os.path.dirname(os.path.dirname(__file__))


def main():
    """Update language information in dosagelib/languages.py."""
    fn = os.path.join(basepath, 'dosagelib', 'languages.py')
    encoding = 'utf-8'
    with codecs.open(fn, 'w', encoding) as f:
        f.write('# -*- coding: %s -*-%s' % (encoding, os.linesep))
        f.write('# ISO 639-1 language codes from pycountry%s' % os.linesep)
        write_languages(f)


def write_languages(f):
    """Write language information."""
    f.write("Iso2Language = {%s" % os.linesep)
    for language in pycountry.languages:
        if hasattr(language, 'alpha2'):
            f.write(" %r: %r,%s" % (language.alpha2, language.name, os.linesep))
    f.write("}%s" % os.linesep)

if __name__ == '__main__':
    main()
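
A brief regeneration sketch, assuming pycountry is installed and the working directory is a checkout of the repository; the exact number of entries depends on the installed pycountry version:

# Illustrative only: rebuild dosagelib/languages.py and spot-check the result.
import subprocess
subprocess.check_call(['python', 'scripts/mklanguages.py'])

from dosagelib.languages import Iso2Language
print(len(Iso2Language))    # one entry per pycountry language with a two-letter code
print(Iso2Language[u'de'])  # -> German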