diff --git a/dosage b/dosage
index 5fc7f447d..cd1f0cb4f 100755
--- a/dosage
+++ b/dosage
@@ -136,7 +136,7 @@ def displayHelp(options):
     """Print help for comic strips."""
     errors = 0
     try:
-        for scraperobj in director.getScrapers(options.comic, options.basepath):
+        for scraperobj in director.getScrapers(options.comic, options.basepath, listing=True):
             errors += displayComicHelp(scraperobj)
     except ValueError as msg:
         out.exception(msg)
@@ -239,12 +239,17 @@ def doList(columnList=True, verbose=False):
     out.info(u'Available comic scrapers:')
     out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(director.getAllScrapers(), key=lambda s: s.getName())
+    scrapers = sorted(director.getAllScrapers(listing=True), key=lambda s: s.getName())
     if columnList:
-        num = doColumnList(scrapers)
+        num, disabled = doColumnList(scrapers)
     else:
-        num = doSingleList(scrapers, verbose=verbose)
+        num, disabled = doSingleList(scrapers, verbose=verbose)
     out.info(u'%d supported comics.' % num)
+    if disabled:
+        out.info('')
+        out.info(u'Some comics are disabled; they are tagged with [%s:REASON], where REASON is one of:' % TAG_DISABLED)
+        for k in disabled:
+            out.info(u' %-10s %s' % (k, disabled[k]))
     if page:
         pydoc.pager(fd.getvalue())
     return 0
@@ -254,38 +259,46 @@ def doList(columnList=True, verbose=False):


 def doSingleList(scrapers, verbose=False):
     """Get list of scraper names, one per line."""
+    disabled = {}
     for num, scraperobj in enumerate(scrapers):
         if verbose:
             displayComicHelp(scraperobj)
         else:
-            out.info(getScraperName(scraperobj))
-    return num
+            out.info(getScraperName(scraperobj, reasons=disabled))
+    return num + 1, disabled


 def doColumnList(scrapers):
     """Get list of scraper names with multiple names per line."""
+    disabled = {}
     screenWidth = get_columns(sys.stdout)
     # limit name length so at least two columns are there
     limit = (screenWidth // 2) - 8
-    names = [getScraperName(scraperobj, limit=limit) for scraperobj in scrapers]
+    names = [getScraperName(scraperobj, limit=limit, reasons=disabled) for scraperobj in scrapers]
     num = len(names)
     maxlen = max(len(name) for name in names)
     namesPerLine = max(screenWidth // (maxlen + 1), 1)
     while names:
         out.info(u''.join(name.ljust(maxlen) for name in names[:namesPerLine]))
         del names[:namesPerLine]
-    return num
+    return num, disabled


 TAG_ADULT = "adult"
 TAG_LANG = "lang"
+TAG_DISABLED = "dis"


-def getScraperName(scraperobj, limit=None):
+def getScraperName(scraperobj, limit=None, reasons=None):
     """Get comic scraper name."""
     tags = []
     if scraperobj.adult:
         tags.append(TAG_ADULT)
     if scraperobj.lang != "en":
         tags.append("%s:%s" % (TAG_LANG, scraperobj.lang))
+    disabled = scraperobj.getDisabledReasons()
+    if disabled:
+        reasons.update(disabled)
+        for reason in disabled:
+            tags.append("%s:%s" % (TAG_DISABLED, reason))
     if tags:
         suffix = " [" + ", ".join(tags) + "]"
     else:
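The listing functions above share one pattern: they thread a single dict through getScraperName(), which both tags each display name and records every distinct disabled reason. An illustrative sketch of that flow (not part of the patch; `scrapers` stands in for the sorted scraper list):

    # How doSingleList() feeds the legend printed by doList():
    disabled = {}
    for scraperobj in scrapers:
        # getScraperName() appends "[dis:REASON]" to the display name and,
        # as a side effect, copies {reason: explanation} into `disabled`.
        out.info(getScraperName(scraperobj, reasons=disabled))
    # doList() later prints one legend line per reason key, so an
    # explanation is shown once even when many modules share it.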
comics.""" if '@' in comics: # only scrapers whose directory already exists @@ -203,17 +203,13 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False): for scraperclass in scraper.get_scraperclasses(): dirname = getDirname(scraperclass.getName()) if os.path.isdir(os.path.join(basepath, dirname)): - if not adult and scraperclass.adult: - warn_adult(scraperclass) - continue - yield scraperclass() + if shouldRunScraper(scraperclass, adult, listing): + yield scraperclass() elif '@@' in comics: # all scrapers for scraperclass in scraper.get_scraperclasses(): - if not adult and scraperclass.adult: - warn_adult(scraperclass) - continue - yield scraperclass() + if shouldRunScraper(scraperclass, adult, listing): + yield scraperclass() else: # get only selected comic scrapers # store them in a set to eliminate duplicates @@ -233,15 +229,30 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False): indexes = None scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed) for scraperclass in scraperclasses: - if not adult and scraperclass.adult: - warn_adult(scraperclass) - continue - scraperobj = scraperclass(indexes=indexes) - if scraperobj not in scrapers: - scrapers.add(scraperobj) - yield scraperobj + if shouldRunScraper(scraperclass, adult, listing): + scraperobj = scraperclass(indexes=indexes) + if scraperobj not in scrapers: + scrapers.add(scraperobj) + yield scraperobj + + +def shouldRunScraper(scraperclass, adult=True, listing=False): + if listing: + return True + if not adult and scraperclass.adult: + warn_adult(scraperclass) + return False + reasons = scraperclass.getDisabledReasons() + if reasons: + warn_disabled(scraperclass, reasons) + return False + return True def warn_adult(scraperclass): """Print warning about adult content.""" out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName()) + +def warn_disabled(scraperclass, reasons): + """Print warning about disabled comic modules.""" + out.warn(u"Skipping comic %s: %s" % (scraperclass.getName(), ' '.join(reasons.values()))) diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index 53118be46..7b7a62940 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -1,7 +1,7 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -from .util import fetchUrl, getPageContent, getQueryParams +from .util import getQueryParams def queryNamer(paramName, usePageUrl=False): """Get name from URL query part.""" @@ -30,10 +30,10 @@ def bounceStarter(url, nextSearch): @classmethod def _starter(cls): """Get bounced start URL.""" - data, baseUrl = getPageContent(url, cls.session) - url1 = fetchUrl(url, data, baseUrl, cls.prevSearch) - data, baseUrl = getPageContent(url1, cls.session) - return fetchUrl(url1, data, baseUrl, nextSearch) + data = cls.getPage(url) + url1 = cls.fetchUrl(url, data, cls.prevSearch) + data = cls.getPage(url1) + return cls.fetchUrl(url1, data, nextSearch) return _starter @@ -42,6 +42,6 @@ def indirectStarter(url, latestSearch): @classmethod def _starter(cls): """Get indirect start URL.""" - data, baseUrl = getPageContent(url, cls.session) - return fetchUrl(url, data, baseUrl, latestSearch) + data = cls.getPage(url) + return cls.fetchUrl(url, data, latestSearch) return _starter diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 98810c0ea..c701fec4b 100644 --- a/dosagelib/plugins/b.py +++ 
diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py
index 53118be46..7b7a62940 100644
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -1,7 +1,7 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-from .util import fetchUrl, getPageContent, getQueryParams
+from .util import getQueryParams

 def queryNamer(paramName, usePageUrl=False):
     """Get name from URL query part."""
@@ -30,10 +30,10 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, cls.session)
-        url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, cls.session)
-        return fetchUrl(url1, data, baseUrl, nextSearch)
+        data = cls.getPage(url)
+        url1 = cls.fetchUrl(url, data, cls.prevSearch)
+        data = cls.getPage(url1)
+        return cls.fetchUrl(url1, data, nextSearch)
     return _starter


@@ -42,6 +42,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, cls.session)
-        return fetchUrl(url, data, baseUrl, latestSearch)
+        data = cls.getPage(url)
+        return cls.fetchUrl(url, data, latestSearch)
     return _starter
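Both helpers now go through cls.getPage() and cls.fetchUrl(), so the same starter works for regex-based and parser-based scrapers alike; each class supplies its own opaque `data` value (a (text, baseUrl) tuple for _BasicScraper, an lxml tree for _ParserScraper, as defined later in this patch). A sketch of typical usage, with a placeholder URL and placeholder XPath expressions:

    class ExampleComic(_ParserScraper):
        url = 'http://example.com/'            # placeholder
        imageSearch = '//img[@id="comic"]'     # placeholder XPath
        prevSearch = '//a[@rel="prev"]'
        # jump to the newest strip via the page's "latest" link:
        starter = indirectStarter(url, '//a[@rel="latest"]')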
diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py
index 98810c0ea..c701fec4b 100644
--- a/dosagelib/plugins/b.py
+++ b/dosagelib/plugins/b.py
@@ -5,7 +5,7 @@
 from re import compile, escape

 from ..util import tagre
-from ..scraper import _BasicScraper
+from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter


@@ -148,6 +148,28 @@ class BizarreUprising(_BasicScraper):
     help = 'Index format: n/name'


+class BladeKitten(_ParserScraper):
+    description = u"Blade Kitten, aka Kit Ballard, is the hottest and best bounty hunter in the Korunda System and isn't afraid to let people know it!"
+    url = 'http://www.bladekitten.com/'
+    stripUrl = url + 'comics/blade-kitten/%s/page:%s'
+    firstStripUrl = stripUrl % ('1', '1')
+    imageSearch = '//img[@class="comic_page_image"]'
+    prevSearch = '//span[@class="comic_nav_prev"]//a'
+    textSearch = '//div[@class="comic_comment_inner"]//p'
+    textOptional = True
+    help = 'Index format: chapter-page'
+    starter = indirectStarter(url, '//h4//a[contains(@href, "/comics/")]')
+
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(index.split('-'))
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        filename = imageUrl.rsplit('/', 1)[1]
+        _, chapter, page = pageUrl.rsplit('/', 2)
+        page = page.split(':')[1]
+        return "bladekitten-%02i-%02i-%s" % (int(chapter), int(page), filename)
+
 class BlankIt(_BasicScraper):
     description = u'An absurd, insane, and delightful webcomic from Aric McKeown and Lem Pew.'
     url = 'http://blankitcomics.com/'
diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py
index 6480ba53d..bfeac0da8 100755
--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@@ -420,7 +420,7 @@ class CyanideAndHappiness(_BasicScraper):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return "/comics/play-button.png" in data
+        return "/comics/play-button.png" in data[0]

     @classmethod
     def namer(cls, imageUrl, pageUrl):
diff --git a/dosagelib/plugins/clonemanga.py b/dosagelib/plugins/clonemanga.py
index 93b4f4107..a26310aec 100644
--- a/dosagelib/plugins/clonemanga.py
+++ b/dosagelib/plugins/clonemanga.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, getQueryParams, fetchUrl, getPageContent
+from ..util import tagre, getQueryParams

 _linkTag = tagre("a", "href", r'([^"]+)')
@@ -25,15 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, cls.session)
+        data = cls.getPage(baseUrl)
         try:
-            url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
+            url = cls.fetchUrl(baseUrl, data, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
-            return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
+            return cls.fetchUrl(baseUrl, data, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, cls.session)
-            return fetchUrl(url, data, _baseUrl, _nextSearch)
+            data = cls.getPage(url)
+            return cls.fetchUrl(url, data, _nextSearch)

     attrs = dict(
         name='CloneManga/' + name,
diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py
index 8a48888f6..a0c385335 100644
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@@ -4,7 +4,7 @@

 from re import compile
 from ..scraper import make_scraper, Genre
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre

 # note: adding the compile() functions inside add() is a major performance hog
 _imageSearch = compile(tagre("img", "src", r'(https://s3\.amazonaws\.com/media\.drunkduck\.com/[^"]+)', before="page-image"))
@@ -27,15 +27,15 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, cls.session)
+        data = cls.getPage(_url)
         try:
-            url = fetchUrl(_url, data, baseUrl, _prevSearch)
+            url = cls.fetchUrl(_url, data, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
-            return fetchUrl(_url, data, baseUrl, _lastSearch)
+            return cls.fetchUrl(_url, data, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, cls.session)
-            return fetchUrl(url, data, baseUrl, _nextSearch)
+            data = cls.getPage(url)
+            return cls.fetchUrl(url, data, _nextSearch)

     attrs = dict(
         name = 'DrunkDuck/' + name,
diff --git a/dosagelib/plugins/h.py b/dosagelib/plugins/h.py
index dd52c8362..daeaea9d8 100644
--- a/dosagelib/plugins/h.py
+++ b/dosagelib/plugins/h.py
@@ -3,7 +3,7 @@

 from re import compile, escape
 from ..scraper import _BasicScraper
-from ..util import tagre, getPageContent, fetchUrls
+from ..util import tagre
 from ..helpers import bounceStarter


@@ -21,9 +21,9 @@ class HagarTheHorrible(_BasicScraper):
     def starter(cls):
         """Return last gallery link."""
         url = 'http://www.hagardunor.net/comics.php'
-        content = getPageContent(url, cls.session)[0]
+        data = cls.getPage(url)
         pattern = compile(tagre("a", "href", cls.prevUrl))
-        for starturl in fetchUrls(url, content, url, pattern):
+        for starturl in cls.fetchUrls(url, data, pattern):
             pass
         return starturl
diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py
index e039bde24..0951b5236 100755
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@@ -5,7 +5,7 @@
 from re import compile, escape

 from ..scraper import _BasicScraper
 from ..helpers import bounceStarter, queryNamer, indirectStarter
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre


 class PandyLand(_BasicScraper):
@@ -104,10 +104,10 @@ class PennyArcade(_BasicScraper):
     @classmethod
     def starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(cls.url, cls.session)
-        url1 = fetchUrl(cls.url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, cls.session)
-        url2 = fetchUrl(url1, data, baseUrl, cls.nextSearch)
+        data = cls.getPage(cls.url)
+        url1 = cls.fetchUrl(cls.url, data, cls.prevSearch)
+        data = cls.getPage(url1)
+        url2 = cls.fetchUrl(url1, data, cls.nextSearch)
         return cls.prevUrlModifier(url2)

     @classmethod
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index 977f46852..e5f610d8e 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -4,7 +4,7 @@

 from re import compile, escape, IGNORECASE, sub
 from os.path import splitext
-from ..scraper import _BasicScraper
+from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter, bounceStarter
 from ..util import tagre, getPageContent

@@ -544,6 +544,25 @@ class StrawberryDeathCake(_BasicScraper):
     help = 'Index format: stripname'


+class StrongFemaleProtagonist(_ParserScraper):
+    url = 'http://strongfemaleprotagonist.com/'
+    stripUrl = url + '%s/'
+    css = True
+    imageSearch = 'article p:first-child img'
+    prevSearch = 'div.nav-previous > a'
+    help = 'Index format: issue-?/page-??'
+
+    def shouldSkipUrl(self, url, data):
+        """Skip hiatus & non-comic pages."""
+        return url in (
+            self.stripUrl % 'guest-art/tuesday',
+            self.stripUrl % 'guest-art/friday',
+            self.stripUrl % 'guest-art/wednesday',
+            self.stripUrl % 'issue-5/newspaper',
+            self.stripUrl % 'issue-5/hiatus-1',
+            self.stripUrl % 'issue-5/hiatus-2',
+        )
+
 class SuburbanTribe(_BasicScraper):
     url = 'http://www.pixelwhip.com/'
     rurl = escape(url)
diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py
index 766e3ee2b..111f56e1d 100644
--- a/dosagelib/plugins/smackjeeves.py
+++ b/dosagelib/plugins/smackjeeves.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
+from ..util import tagre, quote, case_insensitive_re

 # SmackJeeves is a crawlers nightmare - users are allowed to edit HTML directly.
 # That's why there are so much different search patterns.
@@ -45,11 +45,11 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, cls.session)
-        url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
+        data = cls.getPage(url1)
+        url2 = cls.fetchUrl(url1, data, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, cls.session)
-            url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
+            data = cls.getPage(url2)
+            url3 = cls.fetchUrl(url2, data, _nextSearch)
             return modifier(url3)
         return modifier(url2)
diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py
index e43ab48ff..97403153d 100755
--- a/dosagelib/plugins/t.py
+++ b/dosagelib/plugins/t.py
@@ -5,7 +5,7 @@
 from re import compile, escape, IGNORECASE

 from ..scraper import _BasicScraper
 from ..helpers import indirectStarter
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre


 class TheBrads(_BasicScraper):
@@ -223,11 +223,11 @@ class TheThinHLine(_BasicScraper):

     indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl))

-    def getComicStrip(self, url, data, baseUrl):
+    def getComicStrip(self, url, data):
         """The comic strip image is in a separate page."""
-        pageUrl = fetchUrl(url, data, baseUrl, self.indirectImageSearch)
-        pageData, pageBaseUrl = getPageContent(pageUrl, self.session)
-        return super(TheThinHLine, self).getComicStrip(pageUrl, pageData, pageBaseUrl)
+        pageUrl = self.fetchUrl(url, data, self.indirectImageSearch)
+        pageData = self.getPage(pageUrl)
+        return super(TheThinHLine, self).getComicStrip(pageUrl, pageData)

     @classmethod
     def namer(cls, imageUrl, pageUrl):
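StrongFemaleProtagonist above is the first module to set css = True, which makes the new _ParserScraper (defined in the next file) evaluate its searches with lxml's cssselect() instead of xpath(). The cssselect package does that by translating CSS into XPath; roughly like this (output abbreviated and approximate):

    from cssselect import GenericTranslator
    print(GenericTranslator().css_to_xpath('div.nav-previous > a'))
    # descendant-or-self::div[contains(concat(' ', normalize-space(@class),
    #   ' '), ' nav-previous ')]/a

This translation step is also why getDisabledReasons() below reports a missing cssselect package only for css = True modules, while a missing lxml disables every _ParserScraper.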
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 3a85cbb69..b46424118 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -5,9 +5,26 @@ import requests
 import time
 import random
 import os
-from . import loader, configuration
-from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
-    makeSequence, get_system_uid, urlopen, getDirname, unescape)
+import re
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+try:
+    from lxml import html
+    from lxml.html.defs import link_attrs as html_link_attrs
+except ImportError:
+    html = None
+
+try:
+    import cssselect
+except ImportError:
+    cssselect = None
+
+from . import loader, configuration, util
+from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
+    getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -26,8 +43,8 @@ class Genre:
     other = u"Other"


-class _BasicScraper(object):
-    '''Base class with scrape functions for comics.'''
+class Scraper(object):
+    '''Base class for all comic scrapers, but without a specific scrape implementation.'''

     # The URL for the comic strip
     url = None
@@ -59,15 +76,15 @@ class _BasicScraper(object):
     # list of genres for this comic strip
     genres = (Genre.other,)

-    # compiled regular expression that will locate the URL for the previous strip in a page
-    # this can also be a list or tuple of compiled regular expressions
+    # an expression that will locate the URL for the previous strip in a page
+    # this can also be a list or tuple
    prevSearch = None

-    # compiled regular expression that will locate the strip image URLs strip in a page
-    # this can also be a list or tuple of compiled regular expressions
+    # an expression that will locate the strip image URLs in a page
+    # this can also be a list or tuple
     imageSearch = None

-    # compiled regular expression to store a text together with the image
+    # an expression to store text together with the image
     # sometimes comic strips have additional text info for each comic
     textSearch = None
@@ -94,7 +111,7 @@ class _BasicScraper(object):

     def __cmp__(self, other):
         """Compare scraper by name and index list."""
-        if not isinstance(other, _BasicScraper):
+        if not isinstance(other, Scraper):
             return 1
         # first, order by name
         d = cmp(self.getName(), other.getName())
@@ -111,26 +128,22 @@ class _BasicScraper(object):
         """Determine if search for images in given URL should be skipped."""
         return False

-    def getComicStrip(self, url, data, baseUrl):
+    def getComicStrip(self, url, data):
         """Get comic strip downloader for given URL and data."""
-        imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
+        imageUrls = self.fetchUrls(url, data, self.imageSearch)
         # map modifier function on image URLs
         imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
         # remove duplicate URLs
         imageUrls = set(imageUrls)
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
-            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
-            out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
+            out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
             image = sorted(imageUrls)[0]
-            out.warn(u"choosing image %s" % image)
+            out.warn(u"Choosing image %s" % image)
             imageUrls = (image,)
         elif not imageUrls:
-            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
-            out.warn(u"found no images at %s with patterns %s" % (url, patterns))
+            out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch)))
         if self.textSearch:
-            text = fetchText(url, data, self.textSearch, optional=self.textOptional)
-            if text:
-                text = unescape(text).strip()
+            text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
         else:
             text = None
         return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
@@ -167,13 +180,13 @@ class _BasicScraper(object):
         seen_urls = set()
         while url:
             out.info(u'Get strip URL %s' % url, level=1)
-            data, baseUrl = getPageContent(url, self.session)
+            data = self.getPage(url)
             if self.shouldSkipUrl(url, data):
                 out.info(u'Skipping URL %s' % url)
                 self.skippedUrls.add(url)
             else:
                 try:
-                    yield self.getComicStrip(url, data, baseUrl)
+                    yield self.getComicStrip(url, data)
                 except ValueError as msg:
                     # image not found
                     out.exception(msg)
@@ -185,7 +198,7 @@ class _BasicScraper(object):
             maxstrips -= 1
             if maxstrips <= 0:
                 break
-            prevUrl = self.getPrevUrl(url, data, baseUrl)
+            prevUrl = self.getPrevUrl(url, data)
             seen_urls.add(url)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
@@ -196,18 +209,18 @@ class _BasicScraper(object):
             # wait up to 2 seconds for next URL
             time.sleep(1.0 + random.random())

-    def getPrevUrl(self, url, data, baseUrl):
+    def getPrevUrl(self, url, data):
         """Find previous URL."""
         prevUrl = None
         if self.prevSearch:
             try:
-                prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                prevUrl = self.fetchUrl(url, data, self.prevSearch)
             except ValueError as msg:
                 # assume there is no previous URL, but print a warning
                 out.warn(u"%s Assuming no previous comic strips exist." % msg)
             else:
                 prevUrl = self.prevUrlModifier(prevUrl)
-                out.debug(u"Matched previous URL %s" % prevUrl)
+                out.debug(u"Found previous URL %s" % prevUrl)
             getHandler().comicPageLink(self.getName(), url, prevUrl)
         return prevUrl

@@ -278,6 +291,186 @@
         with open(filename, 'w') as f:
             f.write('All comics should be downloaded here.')
+
+    @classmethod
+    def getPage(cls, url):
+        """
+        Fetch a page and return the opaque representation for the data parameter
+        of fetchUrls and fetchText.
+
+        Implementation notes: While this base class does not restrict how the
+        returned data is structured, subclasses (specific scrapers) should specify
+        how this data works, since the structure is passed into different methods
+        which can be defined by comic modules and these methods should be able to
+        use the data if they so desire. (Affected methods: shouldSkipUrl,
+        imageUrlModifier)
+        """
+        raise ValueError("No implementation for getPage!")
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        raise ValueError("No implementation for fetchUrls!")
+
+    @classmethod
+    def fetchUrl(cls, url, data, urlSearch):
+        return cls.fetchUrls(url, data, urlSearch)[0]
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        raise ValueError("No implementation for fetchText!")
+
+    @classmethod
+    def getDisabledReasons(cls):
+        """
+        Get a dict of reasons why this comic module is disabled. The key is a
+        short (unique) identifier, the value is a string explaining why the
+        module is deactivated. If the module is not disabled, just return an
+        empty dict.
+        """
+        return {}
+
+
+ """ + + BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)')) + + @classmethod + def getPage(cls, url): + content = getPageContent(url, cls.session) + # determine base URL + baseUrl = None + match = cls.BASE_SEARCH.search(content) + if match: + baseUrl = match.group(1) + else: + baseUrl = url + return (content, baseUrl) + + @classmethod + def fetchUrls(cls, url, data, urlSearch): + """Search all entries for given URL pattern(s) in a HTML page.""" + searchUrls = [] + searches = makeSequence(urlSearch) + for search in searches: + for match in search.finditer(data[0]): + searchUrl = match.group(1) + if not searchUrl: + raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url)) + out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern)) + searchUrls.append(normaliseURL(urljoin(data[1], searchUrl))) + if searchUrls: + # do not search other links if one pattern matched + break + if not searchUrls: + patterns = [x.pattern for x in searches] + raise ValueError("Patterns %s not found at URL %s." % (patterns, url)) + return searchUrls + + @classmethod + def fetchText(cls, url, data, textSearch, optional): + """Search text entry for given text pattern in a HTML page.""" + if textSearch: + match = textSearch.search(data[0]) + if match: + text = match.group(1) + out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern)) + return unescape(text).strip() + if optional: + return None + else: + raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url)) + else: + return None + + +class _ParserScraper(Scraper): + """ + Scraper base class that uses a HTML parser and XPath expressions. + + All links are resolved before XPath searches are applied, so all URLs are + absolute! + + Subclasses of this class should use XPath expressions as values for + prevSearch, imageSearch and textSearch. When the XPath directly selects an + attribute, it is used as the output. + + All those searches try to do something intelligent when they match a + complete HTML Element: prevSearch and imageSearch try to find a "link + attribute" and use that as URL. textSearch strips all tags from the content + of the HTML element and returns that. + """ + + # Switch between CSS and XPath selectors for this class. Since CSS needs + # another Python module, XPath is the default for now. + css = False + + @classmethod + def getPage(cls, url): + tree = html.document_fromstring(getPageContent(url, cls.session)) + tree.make_links_absolute(url) + return tree + + @classmethod + def fetchUrls(cls, url, data, urlSearch): + """Search all entries for given XPath in a HTML page.""" + searchUrls = [] + if cls.css: + searchFun = data.cssselect + else: + searchFun = data.xpath + searches = makeSequence(urlSearch) + for search in searches: + for match in searchFun(search): + try: + for attrib in html_link_attrs: + if attrib in match.attrib: + searchUrls.append(match.get(attrib)) + except AttributeError: + searchUrls.append(str(match)) + if searchUrls: + # do not search other links if one pattern matched + break + if not searchUrls: + raise ValueError("XPath %s not found at URL %s." 
+class _ParserScraper(Scraper):
+    """
+    Scraper base class that uses an HTML parser and XPath expressions.
+
+    All links are resolved before XPath searches are applied, so all URLs are
+    absolute!
+
+    Subclasses of this class should use XPath expressions as values for
+    prevSearch, imageSearch and textSearch. When the XPath directly selects an
+    attribute, it is used as the output.
+
+    All those searches try to do something intelligent when they match a
+    complete HTML element: prevSearch and imageSearch try to find a "link
+    attribute" and use that as URL. textSearch strips all tags from the content
+    of the HTML element and returns that.
+    """
+
+    # Switch between CSS and XPath selectors for this class. Since CSS needs
+    # another Python module, XPath is the default for now.
+    css = False
+
+    @classmethod
+    def getPage(cls, url):
+        tree = html.document_fromstring(getPageContent(url, cls.session))
+        tree.make_links_absolute(url)
+        return tree
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        """Search all entries for given XPath in an HTML page."""
+        searchUrls = []
+        if cls.css:
+            searchFun = data.cssselect
+        else:
+            searchFun = data.xpath
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in searchFun(search):
+                try:
+                    for attrib in html_link_attrs:
+                        if attrib in match.attrib:
+                            searchUrls.append(match.get(attrib))
+                except AttributeError:
+                    searchUrls.append(str(match))
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            raise ValueError("XPath %s not found at URL %s." % (searches, url))
+        return searchUrls
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        """Search text entry for given text XPath in an HTML page."""
+        if textSearch:
+            text = ''
+            for match in data.xpath(textSearch):
+                try:
+                    text += ' ' + match.text_content()
+                except AttributeError:
+                    text += ' ' + unicode(match)
+            if text.strip() == '':
+                if optional:
+                    return None
+                else:
+                    raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
+            out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
+            return unescape(text).strip()
+        else:
+            return None
+
+    @classmethod
+    def getDisabledReasons(cls):
+        res = {}
+        if cls.css and cssselect is None:
+            res['css'] = u"This module needs the cssselect (python-cssselect) Python module which is not installed."
+        if html is None:
+            res['lxml'] = u"This module needs the lxml (python-lxml) Python module which is not installed."
+        return res


 def find_scraperclasses(comic, multiple_allowed=False):
     """Get a list comic scraper classes. Can return more than one entries if
@@ -309,14 +502,14 @@
 _scraperclasses = None


 def get_scraperclasses():
     """Find all comic scraper classes in the plugins directory.
     The result is cached.
-    @return: list of _BasicScraper classes
-    @rtype: list of _BasicScraper
+    @return: list of Scraper classes
+    @rtype: list of Scraper
     """
     global _scraperclasses
     if _scraperclasses is None:
         out.debug(u"Loading comic modules...")
         modules = loader.get_modules('plugins')
-        plugins = loader.get_plugins(modules, _BasicScraper)
+        plugins = loader.get_plugins(modules, Scraper)
         _scraperclasses = list(plugins)
         check_scrapers()
         out.debug(u"... %d modules loaded." % len(_scraperclasses))
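Taken together, a minimal parser-based module needs little more than two expressions. Everything in this sketch is hypothetical (invented name, URL and selectors), shown only to illustrate the contract of the new base class:

    from dosagelib.scraper import _ParserScraper

    class ExampleStrips(_ParserScraper):
        url = 'http://example.com/comics/'       # invented URL
        imageSearch = '//img[@class="strip"]'    # selects the strip <img> element
        prevSearch = '//a[@rel="prev"]'          # selects the previous-page <a> element
        # css = True would switch both searches to CSS selectors instead.

getPage() hands fetchUrls() a tree whose links are already made absolute, and fetchUrls() pulls a link attribute (href, src, ...) out of each matched element, so the searches can simply select whole elements.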
diff --git a/dosagelib/util.py b/dosagelib/util.py
index c0f0df3f3..b16cc07d1 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -7,9 +7,9 @@ try:
 except ImportError:
     from urllib import quote as url_quote, unquote as url_unquote
 try:
-    from urllib.parse import urlparse, urlunparse, urljoin, urlsplit
+    from urllib.parse import urlparse, urlunparse, urlsplit
 except ImportError:
-    from urlparse import urlparse, urlunparse, urljoin, urlsplit
+    from urlparse import urlparse, urlunparse, urlsplit
 try:
     from urllib import robotparser
 except ImportError:
@@ -176,8 +176,6 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)


-baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-
 def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
@@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     if not isValidPageContent(data):
         raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
-    # determine base URL
-    baseUrl = None
-    match = baseSearch.search(data)
-    if match:
-        baseUrl = match.group(1)
-    else:
-        baseUrl = url
-    return data, baseUrl
+    return data


 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
@@ -226,40 +217,16 @@ def makeSequence(item):
     return (item,)


-def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern(s) in a HTML page."""
-    searchUrls = []
-    searches = makeSequence(urlSearch)
-    for search in searches:
-        for match in search.finditer(data):
-            searchUrl = match.group(1)
-            if not searchUrl:
-                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
-            out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
-            searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
-        if searchUrls:
-            # do not search other links if one pattern matched
-            break
-    if not searchUrls:
-        patterns = [x.pattern for x in searches]
-        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
-    return searchUrls
-
-
-def fetchUrl(url, data, baseUrl, urlSearch):
-    """Search first URL entry for given URL pattern in a HTML page."""
-    return fetchUrls(url, data, baseUrl, urlSearch)[0]
-
-
-def fetchText(url, data, textSearch, optional=False):
-    """Search text entry for given text pattern in a HTML page."""#
-    match = textSearch.search(data)
-    if match:
-        text = match.group(1)
-        out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
-        return text
-    if not optional:
-        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
+def prettyMatcherList(things):
+    """Try to construct a nicely-formatted string for a list of matcher
+    objects. Those may be compiled regular expressions or plain strings."""
+    norm = []
+    for x in makeSequence(things):
+        if hasattr(x, 'pattern'):
+            norm.append(x.pattern)
+        else:
+            norm.append(x)
+    return "('%s')" % "', '".join(norm)


 _htmlparser = HTMLParser()
diff --git a/requirements.txt b/requirements.txt
index a37178243..d43beb4bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ requests

 # optional:
 argcomplete
+lxml
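For reference, the warnings in getComicStrip() format their search expressions with the new prettyMatcherList() helper, which accepts a single matcher or a sequence and copes with both compiled patterns and plain selector strings. A small usage sketch based on the function body above:

    import re
    from dosagelib.util import prettyMatcherList

    print(prettyMatcherList(re.compile(r'<a href="([^"]+)">')))
    # -> ('<a href="([^"]+)">')
    print(prettyMatcherList(['//a[@rel="prev"]', 'div.nav a']))
    # -> ('//a[@rel="prev"]', 'div.nav a')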