From 88e28f3923687ccff4791de611a7476a63105af5 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Fri, 8 Mar 2013 22:33:05 +0100 Subject: [PATCH] Fix some comics and add language tag. --- dosage | 43 ++++--- dosagelib/languages.py | 189 +++++++++++++++++++++++++++++++ dosagelib/plugins/a.py | 2 + dosagelib/plugins/b.py | 2 +- dosagelib/plugins/creators.py | 1 + dosagelib/plugins/drunkduck.py | 1 + dosagelib/plugins/n.py | 1 + dosagelib/plugins/smackjeeves.py | 1 + dosagelib/plugins/z.py | 20 +++- dosagelib/scraper.py | 7 +- dosagelib/util.py | 6 + scripts/mklanguages.py | 29 +++++ 12 files changed, 273 insertions(+), 29 deletions(-) create mode 100644 dosagelib/languages.py create mode 100755 scripts/mklanguages.py diff --git a/dosage b/dosage index a221f34c2..5b5fe4bcb 100755 --- a/dosage +++ b/dosage @@ -18,7 +18,7 @@ from collections import OrderedDict from dosagelib import events, scraper from dosagelib.output import out -from dosagelib.util import internal_error, getDirname, strlimit +from dosagelib.util import internal_error, getDirname, strlimit, getLangName from dosagelib.ansicolor import get_columns from dosagelib.configuration import App, Freeware, Copyright, SupportUrl @@ -118,7 +118,7 @@ def saveComicStrip(strip, basepath): filename, saved = image.save(basepath) if saved: allskipped = False - except IOError as msg: + except Exception as msg: out.error('Could not save image at %s to %s: %s' % (image.referrer, image.filename, msg)) errors += 1 return errors, allskipped @@ -126,21 +126,19 @@ def saveComicStrip(strip, basepath): def displayHelp(comics): """Print help for comic strips.""" - try: - for scraperobj in getScrapers(comics): - displayComicHelp(scraperobj) - except Exception as msg: - out.error(msg) - return 1 + for scraperobj in getScrapers(comics): + displayComicHelp(scraperobj) return 0 def displayComicHelp(scraperobj): """Print description and help for a comic.""" - out.context = scraperobj.getName() + out.context = 
getScraperName(scraperobj) try: if scraperobj.description: out.info("Description: " + scraperobj.description) + if scraperobj.lang: + out.info("Language: " + getLangName(scraperobj.lang)) if scraperobj.help: for line in scraperobj.help.splitlines(): out.info(line) @@ -157,9 +155,6 @@ def getComics(options): try: for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch): errors += getStrips(scraperobj, options) - except Exception as msg: - out.error(msg) - errors += 1 finally: out.context = '' events.getHandler().end() @@ -199,7 +194,7 @@ def run(options): if options.list: return doList() if options.singlelist: - return doList(columnList=False) + return doList(columnList=False, verbose=options.verbose) # after this a list of comic strips is needed if not options.comic: out.warn('No comics specified, bailing out!') @@ -209,26 +204,26 @@ def run(options): return getComics(options) -def doList(columnList=True): +def doList(columnList=True, verbose=False): """List available comics.""" out.info('Available comic scrapers:') out.info('Comics marked with [A] require age confirmation with the --adult option.') scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName()) - try: - if columnList: - num = doColumnList(scrapers) - else: - num = doSingleList(scrapers) - out.info('%d supported comics.' % num) - except IOError: - pass + if columnList: + num = doColumnList(scrapers) + else: + num = doSingleList(scrapers, verbose=verbose) + out.info('%d supported comics.' 
% num) return 0 -def doSingleList(scrapers): +def doSingleList(scrapers, verbose=False): """Get list of scraper names, one per line.""" for num, scraperobj in enumerate(scrapers): - print(getScraperName(scraperobj)) + if verbose: + displayComicHelp(scraperobj) + else: + print(getScraperName(scraperobj)) return num diff --git a/dosagelib/languages.py b/dosagelib/languages.py new file mode 100644 index 000000000..81173588a --- /dev/null +++ b/dosagelib/languages.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- +# ISO 693-1 language codes from pycountry +Iso2Language = { + u'aa': u'Afar', + u'ab': u'Abkhazian', + u'af': u'Afrikaans', + u'ak': u'Akan', + u'sq': u'Albanian', + u'am': u'Amharic', + u'ar': u'Arabic', + u'an': u'Aragonese', + u'hy': u'Armenian', + u'as': u'Assamese', + u'av': u'Avaric', + u'ae': u'Avestan', + u'ay': u'Aymara', + u'az': u'Azerbaijani', + u'ba': u'Bashkir', + u'bm': u'Bambara', + u'eu': u'Basque', + u'be': u'Belarusian', + u'bn': u'Bengali', + u'bh': u'Bihari languages', + u'bi': u'Bislama', + u'bs': u'Bosnian', + u'br': u'Breton', + u'bg': u'Bulgarian', + u'my': u'Burmese', + u'ca': u'Catalan; Valencian', + u'ch': u'Chamorro', + u'ce': u'Chechen', + u'zh': u'Chinese', + u'cu': u'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic', + u'cv': u'Chuvash', + u'kw': u'Cornish', + u'co': u'Corsican', + u'cr': u'Cree', + u'cs': u'Czech', + u'da': u'Danish', + u'dv': u'Divehi; Dhivehi; Maldivian', + u'nl': u'Dutch; Flemish', + u'dz': u'Dzongkha', + u'en': u'English', + u'eo': u'Esperanto', + u'et': u'Estonian', + u'ee': u'Ewe', + u'fo': u'Faroese', + u'fj': u'Fijian', + u'fi': u'Finnish', + u'fr': u'French', + u'fy': u'Western Frisian', + u'ff': u'Fulah', + u'ka': u'Georgian', + u'de': u'German', + u'gd': u'Gaelic; Scottish Gaelic', + u'ga': u'Irish', + u'gl': u'Galician', + u'gv': u'Manx', + u'el': u'Greek, Modern (1453-)', + u'gn': u'Guarani', + u'gu': u'Gujarati', + u'ht': u'Haitian; Haitian Creole', + u'ha': u'Hausa', 
+ u'he': u'Hebrew', + u'hz': u'Herero', + u'hi': u'Hindi', + u'ho': u'Hiri Motu', + u'hr': u'Croatian', + u'hu': u'Hungarian', + u'ig': u'Igbo', + u'is': u'Icelandic', + u'io': u'Ido', + u'ii': u'Sichuan Yi; Nuosu', + u'iu': u'Inuktitut', + u'ie': u'Interlingue; Occidental', + u'ia': u'Interlingua (International Auxiliary Language Association)', + u'id': u'Indonesian', + u'ik': u'Inupiaq', + u'it': u'Italian', + u'jv': u'Javanese', + u'ja': u'Japanese', + u'kl': u'Kalaallisut; Greenlandic', + u'kn': u'Kannada', + u'ks': u'Kashmiri', + u'kr': u'Kanuri', + u'kk': u'Kazakh', + u'km': u'Central Khmer', + u'ki': u'Kikuyu; Gikuyu', + u'rw': u'Kinyarwanda', + u'ky': u'Kirghiz; Kyrgyz', + u'kv': u'Komi', + u'kg': u'Kongo', + u'ko': u'Korean', + u'kj': u'Kuanyama; Kwanyama', + u'ku': u'Kurdish', + u'lo': u'Lao', + u'la': u'Latin', + u'lv': u'Latvian', + u'li': u'Limburgan; Limburger; Limburgish', + u'ln': u'Lingala', + u'lt': u'Lithuanian', + u'lb': u'Luxembourgish; Letzeburgesch', + u'lu': u'Luba-Katanga', + u'lg': u'Ganda', + u'mk': u'Macedonian', + u'mh': u'Marshallese', + u'ml': u'Malayalam', + u'mi': u'Maori', + u'mr': u'Marathi', + u'ms': u'Malay', + u'mg': u'Malagasy', + u'mt': u'Maltese', + u'mo': u'Moldavian; Moldovan', + u'mn': u'Mongolian', + u'na': u'Nauru', + u'nv': u'Navajo; Navaho', + u'nr': u'Ndebele, South; South Ndebele', + u'nd': u'Ndebele, North; North Ndebele', + u'ng': u'Ndonga', + u'ne': u'Nepali', + u'nn': u'Norwegian Nynorsk; Nynorsk, Norwegian', + u'nb': u'Bokm\xe5l, Norwegian; Norwegian Bokm\xe5l', + u'no': u'Norwegian', + u'ny': u'Chichewa; Chewa; Nyanja', + u'oc': u'Occitan (post 1500)', + u'oj': u'Ojibwa', + u'or': u'Oriya', + u'om': u'Oromo', + u'os': u'Ossetian; Ossetic', + u'pa': u'Panjabi; Punjabi', + u'fa': u'Persian', + u'pi': u'Pali', + u'pl': u'Polish', + u'pt': u'Portuguese', + u'ps': u'Pushto; Pashto', + u'qu': u'Quechua', + u'rm': u'Romansh', + u'ro': u'Romanian', + u'rn': u'Rundi', + u'ru': u'Russian', + u'sg': u'Sango', + u'sa': 
u'Sanskrit', + u'si': u'Sinhala; Sinhalese', + u'sk': u'Slovak', + u'sl': u'Slovenian', + u'se': u'Northern Sami', + u'sm': u'Samoan', + u'sn': u'Shona', + u'sd': u'Sindhi', + u'so': u'Somali', + u'st': u'Sotho, Southern', + u'es': u'Spanish; Castilian', + u'sc': u'Sardinian', + u'sr': u'Serbian', + u'ss': u'Swati', + u'su': u'Sundanese', + u'sw': u'Swahili', + u'sv': u'Swedish', + u'ty': u'Tahitian', + u'ta': u'Tamil', + u'tt': u'Tatar', + u'te': u'Telugu', + u'tg': u'Tajik', + u'tl': u'Tagalog', + u'th': u'Thai', + u'bo': u'Tibetan', + u'ti': u'Tigrinya', + u'to': u'Tonga (Tonga Islands)', + u'tn': u'Tswana', + u'ts': u'Tsonga', + u'tk': u'Turkmen', + u'tr': u'Turkish', + u'tw': u'Twi', + u'ug': u'Uighur; Uyghur', + u'uk': u'Ukrainian', + u'ur': u'Urdu', + u'uz': u'Uzbek', + u've': u'Venda', + u'vi': u'Vietnamese', + u'vo': u'Volap\xfck', + u'cy': u'Welsh', + u'wa': u'Walloon', + u'wo': u'Wolof', + u'xh': u'Xhosa', + u'yi': u'Yiddish', + u'yo': u'Yoruba', + u'za': u'Zhuang; Chuang', + u'zu': u'Zulu', +} diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index 137bc9a76..2fcdc8ea3 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -54,6 +54,7 @@ class AhoiPolloi(_BasicScraper): stripUrl = url + '?day=%s' firstStripUrl = stripUrl % '20060305' multipleImagesPerStrip = True + lang = 'de' imageSearch = compile(tagre('img', 'src', r'(/static/antville/ahoipolloi/images/[^"]+)')) prevSearch = compile(tagre('a', 'href', r'(http://ahoipolloi\.blogger\.de/\?day=\d+)')) help = 'Index format: yyyymmdd' @@ -98,6 +99,7 @@ class AlphaLuna(_BasicScraper): class AlphaLunaSpanish(AlphaLuna): name = 'AlphaLuna/Spanish' + lang = 'es' url = 'http://alphaluna.net/spanish/' stripUrl = url + 'issue-%s/' diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 4362ebe54..0ee1d58bb 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -172,7 +172,7 @@ class BratHalla(_BasicScraper): class BrentalFloss(_BasicScraper): - url = 
'http://www.brentalflossthecomic.com/' + url = 'http://brentalflossthecomic.com/' stripUrl = url + '?id=%s' imageSearch = compile(tagre("img", "src", r'([^"]*/img/comic/[^"]*)')) prevSearch = compile(tagre("a", "href", r'([^"]*)') + "Prev") diff --git a/dosagelib/plugins/creators.py b/dosagelib/plugins/creators.py index a081f943a..057b14bd7 100644 --- a/dosagelib/plugins/creators.py +++ b/dosagelib/plugins/creators.py @@ -15,6 +15,7 @@ def add(name, path): name = 'Creators/' + name, url = baseurl + path + '.html', stripUrl = baseurl + path + '/%s.html', + lang = 'es' if name.lower().endswith('spanish') else 'en', imageSearch = _imageSearch, prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) + tagre("img", "src", r'/img_comics/arrow_l\.gif')), diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py index b4774ad0d..ff6d2741a 100644 --- a/dosagelib/plugins/drunkduck.py +++ b/dosagelib/plugins/drunkduck.py @@ -43,6 +43,7 @@ def add(name, path): stripUrl = _url + '%s/', imageSearch = _imageSearch, prevSearch = _prevSearch, + lang = 'es' if name.lower().endswith('spanish') else 'en', help = 'Index format: n (unpadded)', namer = _namer, ) diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py index d0f7163df..835429a43 100644 --- a/dosagelib/plugins/n.py +++ b/dosagelib/plugins/n.py @@ -90,6 +90,7 @@ class NekoTheKitty(_BasicScraper): class NichtLustig(_BasicScraper): url = 'http://www.nichtlustig.de/main.html' stripUrl = 'http://static.nichtlustig.de/toondb/%s.html' + lang = 'de' imageSearch = compile('background-image:url\((http://static\.nichtlustig\.de/comics/full/\d+\.jpg)') prevSearch = compile(tagre("a", "href", r'(http://static\.nichtlustig\.de/toondb/\d+\.html)')) help = 'Index format: yymmdd' diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py index 92f70aa1f..ba1283b70 100644 --- a/dosagelib/plugins/smackjeeves.py +++ b/dosagelib/plugins/smackjeeves.py @@ -57,6 +57,7 @@ def add(name, url, 
description, adult, bounce): prevSearch = _prevSearch, prevUrlMatchesStripUrl = not adult, description = description, + lang = 'es' if name.lower().endswith('spanish') else 'en', help = 'Index format: nnnn (some increasing number)', namer = namer, ) diff --git a/dosagelib/plugins/z.py b/dosagelib/plugins/z.py index e9bfe4234..a15d297ba 100644 --- a/dosagelib/plugins/z.py +++ b/dosagelib/plugins/z.py @@ -50,11 +50,27 @@ class ZombieHunters(_BasicScraper): class Zwarwald(_BasicScraper): url = "http://www.zwarwald.de/" stripUrl = url + 'index.php/page/%s/' - imageSearch = compile(tagre("img", "src", r'(http://www\.zwarwald\.de/images/\d+/\d+/[^"]+)')) + # anything before page 495 seems to be flash + firstStripUrl = stripUrl % '495' + lang = 'de' + imageSearch = compile(tagre("img", "src", r'(http://(?:www\.zwarwald\.de|wp1163540.wp190.webpack.hosteurope.de/wordpress)/images/\d+/\d+/[^"]+)')) prevSearch = compile(tagre("a", "href", r'(http://www\.zwarwald\.de/index\.php/page/\d+/)') + tagre("img", "src", r'http://zwarwald\.de/images/prev\.jpg', quote="'")) help = 'Index format: number' waitSeconds = 1 def shouldSkipUrl(self, url): - return url in (self.stripUrl % "112",) + """Some pages have flash content.""" + return url in ( + self.stripUrl % "112", + self.stripUrl % "222", + self.stripUrl % "223", + self.stripUrl % "246", + self.stripUrl % "368", + self.stripUrl % '495', + ) + + @classmethod + def namer(cls, imageUrl, pageUrl): + prefix, year, month, name = imageUrl.rsplit('/', 3) + return "%s_%s_%s" % (year, month, name) diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 2aabbdb67..a8c44d7b6 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -33,6 +33,9 @@ class _BasicScraper(object): # a description of the comic contents description = '' + # language of the comic (two-letter ISO 639-1 code) + lang = 'en' + # compiled regular expression that will locate the URL for the previous strip in a page prevSearch = None @@ -42,7 +45,7 @@ class 
_BasicScraper(object): # usually the index format help help = '' - # wait time before downloading any pages or images + # wait time between downloading comic strips waitSeconds = 0 # HTTP session storing cookies @@ -137,7 +140,7 @@ class _BasicScraper(object): out.warn("Already seen previous URL %r" % prevUrl) break url = prevUrl - if self.waitSeconds: + if url and self.waitSeconds: time.sleep(self.waitSeconds) def getPrevUrl(self, url, data, baseUrl): diff --git a/dosagelib/util.py b/dosagelib/util.py index ab42886a7..80209c223 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -17,6 +17,7 @@ from htmlentitydefs import name2codepoint from .decorators import memoized from .output import out from .configuration import UserAgent, AppName, App, SupportUrl +from .languages import Iso2Language # Maximum content size for HTML pages MaxContentBytes = 1024 * 1024 * 2 # 2 MB @@ -462,3 +463,8 @@ def strlimit (s, length=72): if length == 0: return "" return "%s..." % s[:length] + + +def getLangName(code): + """Get name of language specified by ISO 639-1 code.""" + return Iso2Language[code] diff --git a/scripts/mklanguages.py b/scripts/mklanguages.py new file mode 100755 index 000000000..1fd676c18 --- /dev/null +++ b/scripts/mklanguages.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# update languages.py from pycountry +import os +import codecs +import pycountry + +basepath = os.path.dirname(os.path.dirname(__file__)) + +def main(): + """Update language information in dosagelib/languages.py.""" + fn =os.path.join(basepath, 'dosagelib', 'languages.py') + encoding = 'utf-8' + with codecs.open(fn, 'w', encoding) as f: + f.write('# -*- coding: %s -*-%s' % (encoding, os.linesep)) + f.write('# ISO 693-1 language codes from pycountry%s' % os.linesep) + write_languages(f) + + +def write_languages(f): + """Write language information.""" + f.write("Iso2Language = {%s" % os.linesep) + for language in pycountry.languages: + if hasattr(language, 'alpha2'): + f.write(" %r: %r,%s" % 
(language.alpha2, language.name, os.linesep)) + f.write("}%s" % os.linesep) + + +if __name__ == '__main__': + main()