dosage/dosagelib/plugins/common.py

# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

from ..scraper import _ParserScraper
from ..helpers import indirectStarter

# Common base classes for comics with the same structure (same hosting
# software, for example) go here. Since those are shared by many modules,
# please don't use lists of expression, as that makes it hard to track which
# expression is for which comics.


def xpath_class(name):
    """Returns an XPath expressions which finds a tag which has a specified
    class."""
    return 'contains(concat(" ", @class, " "), " %s ")' % name


WP_LATEST_SEARCH = '//a[%s]' % xpath_class('comic-nav-last')
WP_PREV_SEARCH = '//a[%s]' % xpath_class('comic-nav-previous')


class _WordPressScraper(_ParserScraper):
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = WP_PREV_SEARCH


class _WPNaviIn(_WordPressScraper):
    prevSearch = '//a[%s]' % xpath_class('navi-prev-in')


class _ComicControlScraper(_ParserScraper):
    imageSearch = '//img[@id="cc-comic"]'
    prevSearch = '//a[@rel="prev"]'


class _TumblrScraper(_ParserScraper):
    starter = indirectStarter

    def namer(self, image_url, page_url):
        # tumblr URLs: http://host/post/num/name
        #              0    1 2    3    4   5
        parts = page_url.split('/')
        if len(parts) > 5:
            return '%s_%s' % (parts[4], parts[5])
        else:
            return parts[4]

    def shouldSkipUrl(self, url, data):
        """Reblogged stuff is iframed"""
        return data.xpath('//div[@id="post"]//iframe')