Simplify ComicsKingdom extraction a bit
This commit is contained in:
parent
4d369376c0
commit
835f484673
2 changed files with 21 additions and 50 deletions
|
@ -1,62 +1,38 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2019 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
|
||||||
# Copyright (C) 2019 Thomas W. Littauer
|
# Copyright (C) 2019 Thomas W. Littauer
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
from ..scraper import _BasicScraper
|
from ..scraper import _BasicScraper
|
||||||
from ..helpers import indirectStarter
|
from ..helpers import bounceStarter, joinPathPartsNamer
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class ComicsKingdom(_BasicScraper):
|
class ComicsKingdom(_BasicScraper):
|
||||||
# changed mid-June 2019
|
imageSearch = re.compile(r'property="og:image" content="(https://[^"]*img\.php\?[^"]+)"')
|
||||||
# imageSearch = re.compile(r' image-url="(https://safr\.kingfeatures\.com/api/img\.php\?e=...&s=.&file=[^"]+)"')
|
prevSearch = re.compile(r':is-left-arrow="true"[^>]*date-slug="(\d\d\d\d-\d\d-\d\d)"')
|
||||||
imageSearch = re.compile(r'property="og:image" content="(https://safr\.kingfeatures\.com/api/img\.php\?e=...&s=.&file=[^"]+)"')
|
nextSearch = re.compile(r':is-left-arrow="false"[^>]*date-slug="(\d\d\d\d-\d\d-\d\d)"')
|
||||||
prevSearch = re.compile(r' :is-left-arrow="true" .*date-slug="(\d\d\d\d-\d\d-\d\d)"')
|
starter = bounceStarter
|
||||||
|
namer = joinPathPartsNamer((-2, -1), ())
|
||||||
help = 'Index format: yyyy-mm-dd'
|
help = 'Index format: yyyy-mm-dd'
|
||||||
|
|
||||||
|
def __init__(self, name, path):
|
||||||
def __init__(self, name, path, lang=None):
|
|
||||||
super(ComicsKingdom, self).__init__('ComicsKingdom/' + name)
|
super(ComicsKingdom, self).__init__('ComicsKingdom/' + name)
|
||||||
self.url = 'https://comicskingdom.com/' + path
|
self.url = 'https://www.comicskingdom.com/' + path
|
||||||
if lang:
|
self.stripUrl = self.url + '/%s'
|
||||||
self.lang = lang
|
|
||||||
|
|
||||||
def namer(self, image_url, page_url):
|
|
||||||
|
|
||||||
if page_url != self.url:
|
|
||||||
|
|
||||||
date = page_url.rsplit('/', 3)[3]
|
|
||||||
name = page_url.rsplit('/', 3)[2]
|
|
||||||
|
|
||||||
else:
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
date = datetime.date.today().strftime("%Y-%m-%d")
|
|
||||||
name = page_url.rsplit('/', 2)[2]
|
|
||||||
|
|
||||||
return "%s_%s.png" % (name.title(), date)
|
|
||||||
|
|
||||||
def link_modifier(self, url, tourl):
|
|
||||||
|
|
||||||
urllen = len(self.url)
|
|
||||||
if tourl[:urllen] != self.url:
|
|
||||||
|
|
||||||
datestr = tourl[-11:] # /YYYY-MM-DD
|
|
||||||
tourl = self.url + datestr
|
|
||||||
|
|
||||||
return tourl
|
|
||||||
|
|
||||||
|
def link_modifier(self, url, tourl):
|
||||||
|
if self.url not in tourl:
|
||||||
|
tourl = self.url + '/' + tourl.rsplit("/", 1)[1]
|
||||||
|
return tourl
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def getmodules(cls):
|
def getmodules(cls):
|
||||||
return (
|
return (
|
||||||
# Some comics are not listed on the "all" page (too old?)
|
# Some comics are not listed on the "all" page (too old?)
|
||||||
|
|
||||||
# do not edit anything below since these entries are generated from
|
# do not edit anything below since these entries are generated from
|
||||||
# scripts/comicskingdom.py
|
# scripts/comicskingdom.py
|
||||||
# START AUTOUPDATE
|
# START AUTOUPDATE
|
||||||
|
|
|
@ -1,25 +1,22 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2019 Tobias Gruetzmacher
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
|
||||||
# Copyright (C) 2019 Thomas W. Littauer
|
# Copyright (C) 2019 Thomas W. Littauer
|
||||||
"""
|
"""
|
||||||
Script to get a list of comicskingdom.com comics and save the info in a JSON file
|
Script to get a list of comicskingdom.com comics and save the info in a JSON
|
||||||
for further processing.
|
file for further processing.
|
||||||
"""
|
"""
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
from scriptutil import ComicListUpdater
|
from scriptutil import ComicListUpdater
|
||||||
|
|
||||||
|
|
||||||
class ComicsKingdomUpdater(ComicListUpdater):
|
class ComicsKingdomUpdater(ComicListUpdater):
|
||||||
dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
|
dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
|
||||||
"KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
|
"KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
|
||||||
|
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
excluded_comics = (
|
excluded_comics = (
|
||||||
# no images
|
|
||||||
'Doodles',
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_url(self, url):
|
def handle_url(self, url):
|
||||||
|
@ -30,17 +27,15 @@ class ComicsKingdomUpdater(ComicListUpdater):
|
||||||
comiclink = comicdiv.cssselect('a')[0]
|
comiclink = comicdiv.cssselect('a')[0]
|
||||||
comicurl = comiclink.attrib['href']
|
comicurl = comiclink.attrib['href']
|
||||||
name = comicdiv.cssselect('a')[0].text
|
name = comicdiv.cssselect('a')[0].text
|
||||||
|
|
||||||
self.add_comic(name, comicurl.rsplit('/', 1)[1])
|
self.add_comic(name, comicurl.rsplit('/', 1)[1])
|
||||||
|
|
||||||
def collect_results(self):
|
def collect_results(self):
|
||||||
"""Parse all search result pages."""
|
"""Parse all search result pages."""
|
||||||
self.handle_url('https://www.comicskingdom.com/')
|
self.handle_url('https://www.comicskingdom.com/')
|
||||||
|
|
||||||
|
|
||||||
def get_entry(self, name, path):
|
def get_entry(self, name, path):
|
||||||
langopt = ", 'es'" if name.lower().endswith('spanish') else ''
|
return u"cls('%s', '%s')," % (name, path)
|
||||||
return u"cls('%s', '%s'%s)," % (name, path, langopt)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue