From 835f484673ca96796fc84db51a36d5be87a27b9a Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Mon, 16 Dec 2019 23:47:14 +0100 Subject: [PATCH] Simplify ComicsKingdom extraction a bit --- dosagelib/plugins/comicskingdom.py | 54 +++++++++--------------------- scripts/comicskingdom.py | 17 ++++------ 2 files changed, 21 insertions(+), 50 deletions(-) diff --git a/dosagelib/plugins/comicskingdom.py b/dosagelib/plugins/comicskingdom.py index 0f85d8da2..09cd12791 100644 --- a/dosagelib/plugins/comicskingdom.py +++ b/dosagelib/plugins/comicskingdom.py @@ -1,62 +1,38 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher +# Copyright (C) 2019 Tobias Gruetzmacher # Copyright (C) 2019 Thomas W. Littauer from __future__ import absolute_import, division, print_function from ..scraper import _BasicScraper -from ..helpers import indirectStarter +from ..helpers import bounceStarter, joinPathPartsNamer import re class ComicsKingdom(_BasicScraper): - # changed mid-June 2019 - # imageSearch = re.compile(r' image-url="(https://safr\.kingfeatures\.com/api/img\.php\?e=...&s=.&file=[^"]+)"') - imageSearch = re.compile(r'property="og:image" content="(https://safr\.kingfeatures\.com/api/img\.php\?e=...&s=.&file=[^"]+)"') - prevSearch = re.compile(r' :is-left-arrow="true" .*date-slug="(\d\d\d\d-\d\d-\d\d)"') + imageSearch = re.compile(r'property="og:image" content="(https://[^"]*img\.php\?[^"]+)"') + prevSearch = re.compile(r':is-left-arrow="true"[^>]*date-slug="(\d\d\d\d-\d\d-\d\d)"') + nextSearch = re.compile(r':is-left-arrow="false"[^>]*date-slug="(\d\d\d\d-\d\d-\d\d)"') + starter = bounceStarter + namer = joinPathPartsNamer((-2, -1), ()) help = 'Index format: yyyy-mm-dd' - - def __init__(self, name, path, lang=None): + def __init__(self, name, path): super(ComicsKingdom, self).__init__('ComicsKingdom/' + name) - self.url = 'https://comicskingdom.com/' + path - if lang: - self.lang = lang - - def namer(self, image_url, page_url): - - if page_url != self.url: - - date = page_url.rsplit('/', 3)[3] - name = page_url.rsplit('/', 3)[2] - - else: - - import datetime - date = datetime.date.today().strftime("%Y-%m-%d") - name = page_url.rsplit('/', 2)[2] - - return "%s_%s.png" % (name.title(), date) - - def link_modifier(self, url, tourl): - - urllen = len(self.url) - if tourl[:urllen] != self.url: - - datestr = tourl[-11:] # /YYYY-MM-DD - tourl = self.url + datestr - - return tourl + self.url = 'https://www.comicskingdom.com/' + path + self.stripUrl = self.url + '/%s' + def link_modifier(self, url, tourl): + if self.url not in tourl: + tourl = self.url + '/' + tourl.rsplit("/", 1)[1] + return tourl @classmethod def getmodules(cls): return ( # Some comics are not listed on the "all" page (too old?) - + # do not edit anything below since these entries are generated from # scripts/comicskingdom.py # START AUTOUPDATE diff --git a/scripts/comicskingdom.py b/scripts/comicskingdom.py index 755cfe2b5..3e1cd8390 100755 --- a/scripts/comicskingdom.py +++ b/scripts/comicskingdom.py @@ -1,25 +1,22 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher +# Copyright (C) 2019 Tobias Gruetzmacher # Copyright (C) 2019 Thomas W. Littauer """ -Script to get a list of comicskingdom.com comics and save the info in a JSON file -for further processing. +Script to get a list of comicskingdom.com comics and save the info in a JSON +file for further processing. """ from __future__ import absolute_import, division, print_function from scriptutil import ComicListUpdater + class ComicsKingdomUpdater(ComicListUpdater): dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s", "KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s") - # names of comics to exclude excluded_comics = ( - # no images - 'Doodles', ) def handle_url(self, url): @@ -30,17 +27,15 @@ class ComicsKingdomUpdater(ComicListUpdater): comiclink = comicdiv.cssselect('a')[0] comicurl = comiclink.attrib['href'] name = comicdiv.cssselect('a')[0].text - + self.add_comic(name, comicurl.rsplit('/', 1)[1]) def collect_results(self): """Parse all search result pages.""" self.handle_url('https://www.comicskingdom.com/') - def get_entry(self, name, path): - langopt = ", 'es'" if name.lower().endswith('spanish') else '' - return u"cls('%s', '%s'%s)," % (name, path, langopt) + return u"cls('%s', '%s')," % (name, path) if __name__ == '__main__':