class Tapastic(_ParserScraper):
    """Scraper for comic series hosted on tapas.io.

    The series page embeds a ``var _data`` script that names the current
    episode and lists every episode of the series. That list, not links on
    the episode pages, drives navigation from the current episode backwards.
    """

    baseUrl = 'https://tapas.io/'
    imageSearch = '//article[@class="ep-contents"]//img'
    # Both patterns are applied to the text of the "var _data" script
    episodeIdSearch = re.compile(r'episodeId : (\d+),')
    episodeListSearch = re.compile(r'episodeList : (.*),')
    multipleImagesPerStrip = True

    def __init__(self, name, url):
        """Create a scraper for a single series.

        name: comic name; the scraper is registered as 'Tapastic/<name>'.
        url: series slug, appended to '<baseUrl>series/'.
        """
        super().__init__('Tapastic/' + name)
        self.url = self.baseUrl + 'series/' + url
        self.stripUrl = self.baseUrl + 'episode/%s'

    @staticmethod
    def _episodeId(url):
        """Return the numeric episode id that ends an episode URL."""
        return int(url.rsplit('/', 1)[-1])

    def starter(self):
        """Load the series page, store its episode list and return the
        URL of the current episode.

        Raises IndexError when the series page has no "var _data" script or
        the script lacks the expected episodeId/episodeList fields.
        """
        # Retrieve series data object
        seriesPage = self.getPage(self.url)
        dataScript = seriesPage.xpath('//script[contains(text(), "var _data")]')[0].text
        # Extract episode list
        currentEpisode = self.episodeIdSearch.findall(dataScript)[0]
        self.episodeList = json.loads(self.episodeListSearch.findall(dataScript)[0])
        return self.stripUrl % currentEpisode

    def fetchUrls(self, url, data, urlSearch):
        """Return the image URLs of an episode page, remembering their order."""
        # Save link order for position-based filenames
        self.imageUrls = super().fetchUrls(url, data, urlSearch)
        # The first entry of the episode list is where backwards traversal
        # stops, so its URL is the first strip of the series
        if self._episodeId(url) == self.episodeList[0]['id']:
            self.firstStripUrl = url
        return self.imageUrls

    def getPrevUrl(self, url, data):
        """Return the URL of the episode listed before the one at url, or
        None when url is the first listed episode.

        Raises IndexError when the episode is not in the stored list.
        """
        episodeId = self._episodeId(url)
        index = [i for i, ep in enumerate(self.episodeList) if ep['id'] == episodeId][0]
        if index == 0:
            return None
        return self.stripUrl % str(self.episodeList[index - 1]['id'])

    def namer(self, imageUrl, pageUrl):
        """Build '<episode>[-<position>].<ext>' for one image of an episode.

        The position is only included when the page holds several images.
        """
        # Construct filename from episode number and image position on page
        episodeNum = pageUrl.rsplit('/', 1)[-1]
        imageNum = self.imageUrls.index(imageUrl)
        # The extension has to come from the image URL (minus any query
        # string): episode page URLs end in a bare numeric id
        imageExt = imageUrl.split('?', 1)[0].rsplit('.', 1)[-1]
        if len(self.imageUrls) > 1:
            return "%s-%d.%s" % (episodeNum, imageNum, imageExt)
        return "%s.%s" % (episodeNum, imageExt)

    @classmethod
    def getmodules(cls):
        """Return one scraper instance per known series."""
        return (
            # Manually-added comics
            cls('NoFuture', 'NoFuture'),
            cls('OrensForge', 'OrensForge'),
            cls('RavenWolf', 'RavenWolf'),
            cls('TheCatTheVineAndTheVictory', 'The-Cat-The-Vine-and-The-Victory'),
            cls('TheGodsPack', 'The-Gods-Pack'),

            # START AUTOUPDATE
            # END AUTOUPDATE
        )
class TapasticUpdater(ComicListUpdater):
    """Collects comics from the tapas.io comic listing and renders them as
    entries for the Tapastic plugin."""

    def collect_results(self):
        """Register every comic linked from the first 10 listing pages."""
        # Retrieve the first 10 top comics list pages
        url = 'https://tapas.io/comics?browse=ALL&sort_type=LIKE&pageNumber='
        count = 10

        data = [self.get_url(url + str(i), robot=False) for i in range(0, count)]
        for page in data:
            for comiclink in page.xpath('//a[@class="preferred title"]'):
                comicurl = comiclink.attrib['href']
                name = comiclink.text
                self.add_comic(name, comicurl)

    def get_entry(self, name, url):
        """Return the plugin source line for one collected comic.

        name: comic title as shown in the listing.
        url: link target of the listing entry.

        The entry has the two-argument form cls('<name>', '<slug>'), which is
        what the Tapastic plugin constructor accepts.
        """
        shortName = name.replace(' ', '').replace('\'', '')
        # NOTE(review): assumes listing links look like '/series/<slug>'
        # (optionally followed by further path segments) - confirm against
        # the live markup
        segments = [part for part in urlsplit(url).path.split('/') if part]
        if 'series' in segments[:-1]:
            slug = segments[segments.index('series') + 1]
        elif segments:
            slug = segments[-1]
        else:
            slug = url
        return u"cls('%s', '%s')," % (shortName, slug)


if __name__ == '__main__':
    TapasticUpdater(__file__).run()