From 6bbdcfb341a5116d3a3c7a148908b70da77b23f9 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Tue, 5 Apr 2016 23:58:43 +0200 Subject: [PATCH] BloomingFaeries: Don't download every page twice. (Also, simplify namer, switch to _ParserScraper) --- dosagelib/plugins/b.py | 27 +++++++++------------------ dosagelib/plugins/common.py | 4 ++-- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 996c264a9..6a753805f 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -7,10 +7,11 @@ from __future__ import absolute_import, division, print_function from re import compile, escape -from ..util import tagre, getPageContent +from ..util import tagre from ..scraper import _BasicScraper, _ParserScraper from ..helpers import indirectStarter -from .common import _ComicControlScraper, _ComicPressScraper, _WordPressScraper +from .common import (_ComicControlScraper, _ComicPressScraper, + _WordPressScraper, WP_PREV_SEARCH) class BackwaterPlanet(_BasicScraper): @@ -179,26 +180,16 @@ class BloodBound(_WordPressScraper): firstStripUrl = 'http://bloodboundcomic.com/comic/06112006/' -class BloomingFaeries(_BasicScraper): +class BloomingFaeries(_ParserScraper): adult = True url = 'http://www.bloomingfaeries.com/' - stripUrl = url + 'comic/public/%s/' - firstStripUrl = stripUrl % "pit-stop" - imageSearch = compile(tagre("img", "src", r'(http://www.bloomingfaeries.com/wp-content/uploads[^"]+)', after='title')) - prevSearch = compile(tagre("a", "href", r'([^"]+)', - after='comic-nav-base comic-nav-previous')) - help = 'Index format: stripname' + firstStripUrl = url + 'comic/public/pit-stop/' + imageSearch = '//div[@id="comic"]//img' + prevSearch = WP_PREV_SEARCH @classmethod - def namer(cls, imageUrl, pageUrl): - bf = imageUrl.split('/') - name = bf[-1] - re = compile(tagre("div", "class", r'comic-id-([^"]+)')) - content = getPageContent(pageUrl, cls.session) - match = re.search(content) - if not match: - return None - return "BF%s_%s" % (match.group(1), name) + def namer(cls, image_url, page_url): + return "_".join(image_url.rsplit('/', 3)[1:]) class BMovieComic(_BasicScraper): diff --git a/dosagelib/plugins/common.py b/dosagelib/plugins/common.py index 734543706..a041afae4 100644 --- a/dosagelib/plugins/common.py +++ b/dosagelib/plugins/common.py @@ -12,13 +12,13 @@ from ..scraper import _ParserScraper # please don't use lists of expression, as that makes it hard to track which # expression is for which comics. - WP_LATEST_SEARCH = '//a[contains(concat(" ", @class, " "), " comic-nav-last ")]' +WP_PREV_SEARCH = '//a[contains(concat(" ", @class, " "), " comic-nav-previous ")]' class _WordPressScraper(_ParserScraper): imageSearch = '//div[@id="comic"]//img' - prevSearch = "//a[contains(concat(' ', @class, ' '), ' comic-nav-previous ')]" + prevSearch = WP_PREV_SEARCH class _ComicPressScraper(_WordPressScraper):