Merge branch 'htmlparser' - I think it's ready.
This closes pull request #70.
commit 5934f03453
15 changed files with 369 additions and 143 deletions
dosage (31 changed lines)
@@ -136,7 +136,7 @@ def displayHelp(options):
     """Print help for comic strips."""
     errors = 0
     try:
-        for scraperobj in director.getScrapers(options.comic, options.basepath):
+        for scraperobj in director.getScrapers(options.comic, options.basepath, listing=True):
             errors += displayComicHelp(scraperobj)
     except ValueError as msg:
         out.exception(msg)
@@ -239,12 +239,17 @@ def doList(columnList=True, verbose=False):
     out.info(u'Available comic scrapers:')
     out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(director.getAllScrapers(), key=lambda s: s.getName())
+    scrapers = sorted(director.getAllScrapers(listing=True), key=lambda s: s.getName())
     if columnList:
-        num = doColumnList(scrapers)
+        num, disabled = doColumnList(scrapers)
     else:
-        num = doSingleList(scrapers, verbose=verbose)
+        num, disabled = doSingleList(scrapers, verbose=verbose)
     out.info(u'%d supported comics.' % num)
+    if disabled:
+        out.info('')
+        out.info(u'Some comics are disabled, they are tagged with [%s:REASON], where REASON is one of:' % TAG_DISABLED)
+        for k in disabled:
+            out.info(u' %-10s %s' % (k, disabled[k]))
     if page:
         pydoc.pager(fd.getvalue())
     return 0
@@ -254,38 +259,46 @@ def doList(columnList=True, verbose=False):
 
 def doSingleList(scrapers, verbose=False):
     """Get list of scraper names, one per line."""
+    disabled = {}
     for num, scraperobj in enumerate(scrapers):
         if verbose:
             displayComicHelp(scraperobj)
         else:
-            out.info(getScraperName(scraperobj))
-    return num
+            out.info(getScraperName(scraperobj, reasons=disabled))
+    return num, disabled
 
 
 def doColumnList(scrapers):
     """Get list of scraper names with multiple names per line."""
+    disabled = {}
     screenWidth = get_columns(sys.stdout)
     # limit name length so at least two columns are there
     limit = (screenWidth // 2) - 8
-    names = [getScraperName(scraperobj, limit=limit) for scraperobj in scrapers]
+    names = [getScraperName(scraperobj, limit=limit, reasons=disabled) for scraperobj in scrapers]
     num = len(names)
     maxlen = max(len(name) for name in names)
     namesPerLine = max(screenWidth // (maxlen + 1), 1)
     while names:
         out.info(u''.join(name.ljust(maxlen) for name in names[:namesPerLine]))
         del names[:namesPerLine]
-    return num
+    return num, disabled
 
 TAG_ADULT = "adult"
 TAG_LANG = "lang"
+TAG_DISABLED = "dis"
 
-def getScraperName(scraperobj, limit=None):
+def getScraperName(scraperobj, limit=None, reasons=None):
     """Get comic scraper name."""
     tags = []
     if scraperobj.adult:
         tags.append(TAG_ADULT)
     if scraperobj.lang != "en":
         tags.append("%s:%s" % (TAG_LANG, scraperobj.lang))
+    disabled = scraperobj.getDisabledReasons()
+    if disabled:
+        reasons.update(disabled)
+        for reason in disabled:
+            tags.append("%s:%s" % (TAG_DISABLED, reason))
     if tags:
         suffix = " [" + ", ".join(tags) + "]"
     else:
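A toy sketch (not part of the commit) of what the new listing footer in doList amounts to; the reason dict and comic count below are made up, real reasons come from Scraper.getDisabledReasons(), and dosage prints through out.info rather than print:

# Illustrative only: stand-in values.
disabled = {
    'lxml': "This module needs the lxml (python-lxml) python module which is not installed.",
}
num = 3000  # made-up number of supported comics

print(u'%d supported comics.' % num)
if disabled:
    print('')
    print(u'Some comics are disabled, they are tagged with [dis:REASON], where REASON is one of:')
    for k in disabled:
        print(u' %-10s %s' % (k, disabled[k]))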
@@ -189,12 +189,12 @@ def finish():
     out.warn("Waiting for download threads to finish.")
 
 
-def getAllScrapers():
+def getAllScrapers(listing=False):
     """Get all scrapers."""
-    return getScrapers(['@@'])
+    return getScrapers(['@@'], listing=listing)
 
 
-def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
+def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
     """Get scraper objects for the given comics."""
     if '@' in comics:
         # only scrapers whose directory already exists
@@ -203,17 +203,13 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
         for scraperclass in scraper.get_scraperclasses():
             dirname = getDirname(scraperclass.getName())
             if os.path.isdir(os.path.join(basepath, dirname)):
-                if not adult and scraperclass.adult:
-                    warn_adult(scraperclass)
-                    continue
-                yield scraperclass()
+                if shouldRunScraper(scraperclass, adult, listing):
+                    yield scraperclass()
     elif '@@' in comics:
         # all scrapers
         for scraperclass in scraper.get_scraperclasses():
-            if not adult and scraperclass.adult:
-                warn_adult(scraperclass)
-                continue
-            yield scraperclass()
+            if shouldRunScraper(scraperclass, adult, listing):
+                yield scraperclass()
     else:
         # get only selected comic scrapers
         # store them in a set to eliminate duplicates
@@ -233,15 +229,30 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
                 indexes = None
             scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
             for scraperclass in scraperclasses:
-                if not adult and scraperclass.adult:
-                    warn_adult(scraperclass)
-                    continue
-                scraperobj = scraperclass(indexes=indexes)
-                if scraperobj not in scrapers:
-                    scrapers.add(scraperobj)
-                    yield scraperobj
+                if shouldRunScraper(scraperclass, adult, listing):
+                    scraperobj = scraperclass(indexes=indexes)
+                    if scraperobj not in scrapers:
+                        scrapers.add(scraperobj)
+                        yield scraperobj
+
+
+def shouldRunScraper(scraperclass, adult=True, listing=False):
+    if listing:
+        return True
+    if not adult and scraperclass.adult:
+        warn_adult(scraperclass)
+        return False
+    reasons = scraperclass.getDisabledReasons()
+    if reasons:
+        warn_disabled(scraperclass, reasons)
+        return False
+    return True
 
 
 def warn_adult(scraperclass):
     """Print warning about adult content."""
     out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
+
+def warn_disabled(scraperclass, reasons):
+    """Print warning about disabled comic modules."""
+    out.warn(u"Skipping comic %s: %s" % (scraperclass.getName(), ' '.join(reasons.values())))
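Hedged usage sketch, assuming the dosagelib from this commit is importable: with listing=True every module is yielded so the list command can tag disabled ones, while a normal run filters adult and disabled modules through shouldRunScraper() and prints the warnings above.

from dosagelib import director

# all modules, including disabled ones (what `dosage -l` works from)
listed = list(director.getAllScrapers(listing=True))
# only modules that may actually run right now (adult modules filtered out here)
runnable = list(director.getScrapers(['@@'], adult=False))
print(len(listed) >= len(runnable))  # True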
@@ -1,7 +1,7 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-from .util import fetchUrl, getPageContent, getQueryParams
+from .util import getQueryParams
 
 def queryNamer(paramName, usePageUrl=False):
     """Get name from URL query part."""
@@ -30,10 +30,10 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, cls.session)
-        url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, cls.session)
-        return fetchUrl(url1, data, baseUrl, nextSearch)
+        data = cls.getPage(url)
+        url1 = cls.fetchUrl(url, data, cls.prevSearch)
+        data = cls.getPage(url1)
+        return cls.fetchUrl(url1, data, nextSearch)
     return _starter
 
 
@@ -42,6 +42,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, cls.session)
-        return fetchUrl(url, data, baseUrl, latestSearch)
+        data = cls.getPage(url)
+        return cls.fetchUrl(url, data, latestSearch)
     return _starter
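Sketch of how a comic module keeps using these helpers after the change; the class and expressions below are invented for illustration, the pattern matches BladeKitten further down. Since the generated _starter() now goes through cls.getPage()/cls.fetchUrl(), the same helper works for regex-based and parser-based scrapers alike.

from dosagelib.scraper import _ParserScraper
from dosagelib.helpers import indirectStarter

class ExampleComic(_ParserScraper):  # hypothetical module, not in dosage
    url = 'http://example.com/'
    imageSearch = '//img[@id="strip"]'
    prevSearch = '//a[@rel="prev"]'
    # _starter() will call cls.getPage(url) and cls.fetchUrl(url, data, latestSearch)
    starter = indirectStarter(url, '//a[@id="latest"]')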
@@ -5,7 +5,7 @@
 from re import compile, escape
 
 from ..util import tagre
-from ..scraper import _BasicScraper
+from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 
 
@@ -148,6 +148,28 @@ class BizarreUprising(_BasicScraper):
     help = 'Index format: n/name'
 
 
+class BladeKitten(_ParserScraper):
+    description = u"Blade Kitten aka Kit Ballard, is the hottest and best bounty hunter in the Korunda System and isn't afraid to let people know it!"
+    url = 'http://www.bladekitten.com/'
+    stripUrl = url + 'comics/blade-kitten/%s/page:%s'
+    firstStripUrl = stripUrl % ('1','1')
+    imageSearch = '//img[@class="comic_page_image"]'
+    prevSearch = '//span[@class="comic_nav_prev"]//a'
+    textSearch = '//div[@class="comic_comment_inner"]//p'
+    textOptional = True
+    help = 'Index format: chapter-page'
+    starter = indirectStarter(url, '//h4//a[contains(@href, "/comics/")]')
+
+    def getIndexStripUrl(self, index):
+        return self.stripUrl % tuple(index.split('-'))
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        filename = imageUrl.rsplit('/', 1)[1]
+        _, chapter, page = pageUrl.rsplit('/', 2)
+        page = page.split(':')[1]
+        return "bladekitten-%02i-%02i-%s" % (int(chapter), int(page), filename)
+
 class BlankIt(_BasicScraper):
     description = u'An absurd, insane, and delightful webcomic from Aric McKeown and Lem Pew.'
     url = 'http://blankitcomics.com/'
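A quick standalone check of the namer logic above, with a made-up image file name; the page URL follows the stripUrl pattern 'comics/blade-kitten/%s/page:%s':

imageUrl = "http://www.bladekitten.com/media/comics/blade_kitten_05.png"  # invented
pageUrl = "http://www.bladekitten.com/comics/blade-kitten/1/page:5"

filename = imageUrl.rsplit('/', 1)[1]
_, chapter, page = pageUrl.rsplit('/', 2)
page = page.split(':')[1]
print("bladekitten-%02i-%02i-%s" % (int(chapter), int(page), filename))
# bladekitten-01-05-blade_kitten_05.png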
@@ -420,7 +420,7 @@ class CyanideAndHappiness(_BasicScraper):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return "/comics/play-button.png" in data
+        return "/comics/play-button.png" in data[0]
 
     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, getQueryParams, fetchUrl, getPageContent
+from ..util import tagre, getQueryParams
 
 
 _linkTag = tagre("a", "href", r'([^"]+)')
@@ -25,15 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, cls.session)
+        data = cls.getPage(baseUrl)
         try:
-            url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
+            url = cls.fetchUrl(baseUrl, data, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
-            return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
+            return cls.fetchUrl(baseUrl, data, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, cls.session)
-            return fetchUrl(url, data, _baseUrl, _nextSearch)
+            data = cls.getPage(url)
+            return cls.fetchUrl(url, data, _nextSearch)
 
     attrs = dict(
         name='CloneManga/' + name,
@@ -4,7 +4,7 @@
 
 from re import compile
 from ..scraper import make_scraper, Genre
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre
 
 # note: adding the compile() functions inside add() is a major performance hog
 _imageSearch = compile(tagre("img", "src", r'(https://s3\.amazonaws\.com/media\.drunkduck\.com/[^"]+)', before="page-image"))
@@ -27,15 +27,15 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, cls.session)
+        data = cls.getPage(_url)
         try:
-            url = fetchUrl(_url, data, baseUrl, _prevSearch)
+            url = cls.fetchUrl(_url, data, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
-            return fetchUrl(_url, data, baseUrl, _lastSearch)
+            return cls.fetchUrl(_url, data, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, cls.session)
-            return fetchUrl(url, data, baseUrl, _nextSearch)
+            data = cls.getPage(url)
+            return cls.fetchUrl(url, data, _nextSearch)
 
     attrs = dict(
         name = 'DrunkDuck/' + name,
@@ -3,7 +3,7 @@
 
 from re import compile, escape
 from ..scraper import _BasicScraper
-from ..util import tagre, getPageContent, fetchUrls
+from ..util import tagre
 from ..helpers import bounceStarter
 
 
@@ -21,9 +21,9 @@ class HagarTheHorrible(_BasicScraper):
     def starter(cls):
         """Return last gallery link."""
         url = 'http://www.hagardunor.net/comics.php'
-        content = getPageContent(url, cls.session)[0]
+        data = cls.getPage(url)
         pattern = compile(tagre("a", "href", cls.prevUrl))
-        for starturl in fetchUrls(url, content, url, pattern):
+        for starturl in cls.fetchUrls(url, data, pattern):
             pass
         return starturl
 
@@ -5,7 +5,7 @@
 from re import compile, escape
 from ..scraper import _BasicScraper
 from ..helpers import bounceStarter, queryNamer, indirectStarter
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre
 
 
 class PandyLand(_BasicScraper):
@@ -104,10 +104,10 @@ class PennyArcade(_BasicScraper):
     @classmethod
     def starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(cls.url, cls.session)
-        url1 = fetchUrl(cls.url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, cls.session)
-        url2 = fetchUrl(url1, data, baseUrl, cls.nextSearch)
+        data = cls.getPage(cls.url)
+        url1 = cls.fetchUrl(cls.url, data, cls.prevSearch)
+        data = cls.getPage(url1)
+        url2 = cls.fetchUrl(url1, data, cls.nextSearch)
         return cls.prevUrlModifier(url2)
 
     @classmethod
@@ -4,7 +4,7 @@
 
 from re import compile, escape, IGNORECASE, sub
 from os.path import splitext
-from ..scraper import _BasicScraper
+from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter, bounceStarter
 from ..util import tagre, getPageContent
 
@@ -544,6 +544,25 @@ class StrawberryDeathCake(_BasicScraper):
     help = 'Index format: stripname'
 
 
+class StrongFemaleProtagonist(_ParserScraper):
+    url = 'http://strongfemaleprotagonist.com/'
+    stripUrl = url + '%s/'
+    css = True
+    imageSearch = 'article p:first-child img'
+    prevSearch = 'div.nav-previous > a'
+    help = 'Index format: issue-?/page-??'
+
+    def shouldSkipUrl(self, url, data):
+        """Skip hiatus & non-comic pages."""
+        return url in (
+            self.stripUrl % 'guest-art/tuesday',
+            self.stripUrl % 'guest-art/friday',
+            self.stripUrl % 'guest-art/wednesday',
+            self.stripUrl % 'issue-5/newspaper',
+            self.stripUrl % 'issue-5/hiatus-1',
+            self.stripUrl % 'issue-5/hiatus-2',
+        )
+
 class SuburbanTribe(_BasicScraper):
     url = 'http://www.pixelwhip.com/'
     rurl = escape(url)
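Side note on css = True above: _ParserScraper then feeds the search strings to the document's cssselect() method instead of xpath(), which needs the cssselect package (see the getDisabledReasons() changes further down). A rough idea of what those CSS selectors translate to:

from cssselect import GenericTranslator  # optional dependency

t = GenericTranslator()
print(t.css_to_xpath('article p:first-child img'))
print(t.css_to_xpath('div.nav-previous > a'))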
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2014 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
+from ..util import tagre, quote, case_insensitive_re
 
 # SmackJeeves is a crawlers nightmare - users are allowed to edit HTML directly.
 # That's why there are so much different search patterns.
@@ -45,11 +45,11 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, cls.session)
-        url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
+        data = cls.getPage(url1)
+        url2 = cls.fetchUrl(url1, data, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, cls.session)
-            url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
+            data = cls.getPage(url2)
+            url3 = cls.fetchUrl(url2, data, _nextSearch)
             return modifier(url3)
         return modifier(url2)
 
@@ -5,7 +5,7 @@
 from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper
 from ..helpers import indirectStarter
-from ..util import tagre, fetchUrl, getPageContent
+from ..util import tagre
 
 
 class TheBrads(_BasicScraper):
@@ -223,11 +223,11 @@ class TheThinHLine(_BasicScraper):
 
     indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl))
 
-    def getComicStrip(self, url, data, baseUrl):
+    def getComicStrip(self, url, data):
         """The comic strip image is in a separate page."""
-        pageUrl = fetchUrl(url, data, baseUrl, self.indirectImageSearch)
-        pageData, pageBaseUrl = getPageContent(pageUrl, self.session)
-        return super(TheThinHLine, self).getComicStrip(pageUrl, pageData, pageBaseUrl)
+        pageUrl = self.fetchUrl(url, data, self.indirectImageSearch)
+        pageData = self.getPage(pageUrl)
+        return super(TheThinHLine, self).getComicStrip(pageUrl, pageData)
 
     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -5,9 +5,26 @@ import requests
 import time
 import random
 import os
-from . import loader, configuration
-from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
-    makeSequence, get_system_uid, urlopen, getDirname, unescape)
+import re
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
+
+try:
+    from lxml import html
+    from lxml.html.defs import link_attrs as html_link_attrs
+except ImportError:
+    html = None
+
+try:
+    import cssselect
+except ImportError:
+    cssselect = None
+
+from . import loader, configuration, util
+from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
+    getDirname, unescape, tagre, normaliseURL, prettyMatcherList)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -26,8 +43,8 @@ class Genre:
     other = u"Other"
 
 
-class _BasicScraper(object):
-    '''Base class with scrape functions for comics.'''
+class Scraper(object):
+    '''Base class for all comic scraper, but without a specific scrape implementation.'''
 
     # The URL for the comic strip
     url = None
@@ -59,15 +76,15 @@ class _BasicScraper(object):
     # list of genres for this comic strip
     genres = (Genre.other,)
 
-    # compiled regular expression that will locate the URL for the previous strip in a page
-    # this can also be a list or tuple of compiled regular expressions
+    # an expression that will locate the URL for the previous strip in a page
+    # this can also be a list or tuple
     prevSearch = None
 
-    # compiled regular expression that will locate the strip image URLs strip in a page
-    # this can also be a list or tuple of compiled regular expressions
+    # an expression that will locate the strip image URLs strip in a page
+    # this can also be a list or tuple
     imageSearch = None
 
-    # compiled regular expression to store a text together with the image
+    # an expression to store a text together with the image
     # sometimes comic strips have additional text info for each comic
     textSearch = None
 
@@ -94,7 +111,7 @@ class _BasicScraper(object):
 
     def __cmp__(self, other):
         """Compare scraper by name and index list."""
-        if not isinstance(other, _BasicScraper):
+        if not isinstance(other, Scraper):
             return 1
         # first, order by name
         d = cmp(self.getName(), other.getName())
@@ -111,26 +128,22 @@ class _BasicScraper(object):
         """Determine if search for images in given URL should be skipped."""
         return False
 
-    def getComicStrip(self, url, data, baseUrl):
+    def getComicStrip(self, url, data):
         """Get comic strip downloader for given URL and data."""
-        imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
+        imageUrls = self.fetchUrls(url, data, self.imageSearch)
         # map modifier function on image URLs
         imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
         # remove duplicate URLs
         imageUrls = set(imageUrls)
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
-            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
-            out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
+            out.warn(u"Found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
             image = sorted(imageUrls)[0]
-            out.warn(u"choosing image %s" % image)
+            out.warn(u"Choosing image %s" % image)
             imageUrls = (image,)
         elif not imageUrls:
-            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
-            out.warn(u"found no images at %s with patterns %s" % (url, patterns))
+            out.warn(u"Found no images at %s with expressions %s" % (url, prettyMatcherList(self.imageSearch)))
         if self.textSearch:
-            text = fetchText(url, data, self.textSearch, optional=self.textOptional)
-            if text:
-                text = unescape(text).strip()
+            text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
         else:
             text = None
         return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
@@ -167,13 +180,13 @@ class _BasicScraper(object):
         seen_urls = set()
         while url:
             out.info(u'Get strip URL %s' % url, level=1)
-            data, baseUrl = getPageContent(url, self.session)
+            data = self.getPage(url)
             if self.shouldSkipUrl(url, data):
                 out.info(u'Skipping URL %s' % url)
                 self.skippedUrls.add(url)
             else:
                 try:
-                    yield self.getComicStrip(url, data, baseUrl)
+                    yield self.getComicStrip(url, data)
                 except ValueError as msg:
                     # image not found
                     out.exception(msg)
@@ -185,7 +198,7 @@ class _BasicScraper(object):
                 maxstrips -= 1
                 if maxstrips <= 0:
                     break
-            prevUrl = self.getPrevUrl(url, data, baseUrl)
+            prevUrl = self.getPrevUrl(url, data)
             seen_urls.add(url)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
@@ -196,18 +209,18 @@ class _BasicScraper(object):
             # wait up to 2 seconds for next URL
             time.sleep(1.0 + random.random())
 
-    def getPrevUrl(self, url, data, baseUrl):
+    def getPrevUrl(self, url, data):
         """Find previous URL."""
         prevUrl = None
         if self.prevSearch:
             try:
-                prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                prevUrl = self.fetchUrl(url, data, self.prevSearch)
             except ValueError as msg:
                 # assume there is no previous URL, but print a warning
                 out.warn(u"%s Assuming no previous comic strips exist." % msg)
             else:
                 prevUrl = self.prevUrlModifier(prevUrl)
-                out.debug(u"Matched previous URL %s" % prevUrl)
+                out.debug(u"Found previous URL %s" % prevUrl)
                 getHandler().comicPageLink(self.getName(), url, prevUrl)
         return prevUrl
 
@@ -278,6 +291,186 @@ class _BasicScraper(object):
         with open(filename, 'w') as f:
             f.write('All comics should be downloaded here.')
 
+    @classmethod
+    def getPage(cls, url):
+        """
+        Fetch a page and return the opaque repesentation for the data parameter
+        of fetchUrls and fetchText.
+
+        Implementation notes: While this base class does not restrict how the
+        returned data is structured, subclasses (specific scrapers) should specify
+        how this data works, since the stracture is passed into different methods
+        which can be defined by comic modules and these methods should be able to
+        use the data if they so desire... (Affected methods: shouldSkipUrl,
+        imageUrlModifier)
+        """
+        raise ValueError("No implementation for getPage!")
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        raise ValueError("No implementation for fetchUrls!")
+
+    @classmethod
+    def fetchUrl(cls, url, data, urlSearch):
+        return cls.fetchUrls(url, data, urlSearch)[0]
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        raise ValueError("No implementation for fetchText!")
+
+    @classmethod
+    def getDisabledReasons(cls):
+        """
+        Get a dict of reasons why this comic module is disabled. The key is a
+        short (unique) identifier, the value is a string explaining why the
+        module is deactivated. If the module is not disabled, just return an
+        empty dict.
+        """
+        return {}
+
+
+class _BasicScraper(Scraper):
+    """
+    Scraper base class that matches regular expressions against HTML pages.
+
+    Subclasses of this scraper should use compiled regular expressions as
+    values for prevSearch, imageSearch and textSearch.
+
+    Implementation note: The return value of getPage is a tuple: the first
+    element is the raw HTML page text, the second element is the base URL (if
+    any).
+    """
+
+    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
+
+    @classmethod
+    def getPage(cls, url):
+        content = getPageContent(url, cls.session)
+        # determine base URL
+        baseUrl = None
+        match = cls.BASE_SEARCH.search(content)
+        if match:
+            baseUrl = match.group(1)
+        else:
+            baseUrl = url
+        return (content, baseUrl)
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        """Search all entries for given URL pattern(s) in a HTML page."""
+        searchUrls = []
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in search.finditer(data[0]):
+                searchUrl = match.group(1)
+                if not searchUrl:
+                    raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
+                out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
+                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            patterns = [x.pattern for x in searches]
+            raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
+        return searchUrls
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        """Search text entry for given text pattern in a HTML page."""
+        if textSearch:
+            match = textSearch.search(data[0])
+            if match:
+                text = match.group(1)
+                out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
+                return unescape(text).strip()
+            if optional:
+                return None
+            else:
+                raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
+        else:
+            return None
+
+
+class _ParserScraper(Scraper):
+    """
+    Scraper base class that uses a HTML parser and XPath expressions.
+
+    All links are resolved before XPath searches are applied, so all URLs are
+    absolute!
+
+    Subclasses of this class should use XPath expressions as values for
+    prevSearch, imageSearch and textSearch. When the XPath directly selects an
+    attribute, it is used as the output.
+
+    All those searches try to do something intelligent when they match a
+    complete HTML Element: prevSearch and imageSearch try to find a "link
+    attribute" and use that as URL. textSearch strips all tags from the content
+    of the HTML element and returns that.
+    """
+
+    # Switch between CSS and XPath selectors for this class. Since CSS needs
+    # another Python module, XPath is the default for now.
+    css = False
+
+    @classmethod
+    def getPage(cls, url):
+        tree = html.document_fromstring(getPageContent(url, cls.session))
+        tree.make_links_absolute(url)
+        return tree
+
+    @classmethod
+    def fetchUrls(cls, url, data, urlSearch):
+        """Search all entries for given XPath in a HTML page."""
+        searchUrls = []
+        if cls.css:
+            searchFun = data.cssselect
+        else:
+            searchFun = data.xpath
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in searchFun(search):
+                try:
+                    for attrib in html_link_attrs:
+                        if attrib in match.attrib:
+                            searchUrls.append(match.get(attrib))
+                except AttributeError:
+                    searchUrls.append(str(match))
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            raise ValueError("XPath %s not found at URL %s." % (searches, url))
+        return searchUrls
+
+    @classmethod
+    def fetchText(cls, url, data, textSearch, optional):
+        """Search text entry for given text XPath in a HTML page."""
+        if textSearch:
+            text = ''
+            for match in data.xpath(textSearch):
+                try:
+                    text += ' ' + match.text_content()
+                except AttributeError:
+                    text += ' ' + unicode(match)
+            if text.strip() == '':
+                if optional:
+                    return None
+                else:
+                    raise ValueError("XPath %s did not match anything at URL %s." % (textSearch, url))
+            out.debug(u'Matched text %r with XPath %s' % (text, textSearch))
+            return unescape(text).strip()
+        else:
+            return None
+
+    @classmethod
+    def getDisabledReasons(cls):
+        res = {}
+        if cls.css and cssselect is None:
+            res['css'] = u"This module needs the cssselect (python-cssselect) python module which is not installed."
+        if html is None:
+            res['lxml'] = u"This module needs the lxml (python-lxml) python module which is not installed."
+        return res
 
 def find_scraperclasses(comic, multiple_allowed=False):
     """Get a list comic scraper classes. Can return more than one entries if
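A standalone sketch of what _ParserScraper.getPage()/fetchUrls() do, run against an inline HTML snippet instead of a downloaded page; the snippet and URLs are invented, and only lxml is required:

from lxml import html
from lxml.html.defs import link_attrs as html_link_attrs

doc = html.document_fromstring("""
<html><body>
  <span class="comic_nav_prev"><a href="/comics/blade-kitten/1/page:4">prev</a></span>
  <img class="comic_page_image" src="/media/page5.png"/>
</body></html>
""")
# like _ParserScraper.getPage(): make every link absolute before searching
doc.make_links_absolute("http://www.bladekitten.com/comics/blade-kitten/1/page:5")

urls = []
for match in doc.xpath('//img[@class="comic_page_image"]'):
    # an element matched: pick a link-like attribute, as fetchUrls() does
    for attrib in html_link_attrs:
        if attrib in match.attrib:
            urls.append(match.get(attrib))
print(urls)  # ['http://www.bladekitten.com/media/page5.png']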
@@ -309,14 +502,14 @@ _scraperclasses = None
 def get_scraperclasses():
     """Find all comic scraper classes in the plugins directory.
     The result is cached.
-    @return: list of _BasicScraper classes
-    @rtype: list of _BasicScraper
+    @return: list of Scraper classes
+    @rtype: list of Scraper
     """
     global _scraperclasses
     if _scraperclasses is None:
         out.debug(u"Loading comic modules...")
         modules = loader.get_modules('plugins')
-        plugins = loader.get_plugins(modules, _BasicScraper)
+        plugins = loader.get_plugins(modules, Scraper)
         _scraperclasses = list(plugins)
         check_scrapers()
         out.debug(u"... %d modules loaded." % len(_scraperclasses))
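Quick check of the reworked class hierarchy this hunk relies on (the plugin loader now collects subclasses of Scraper, so both flavours are found), assuming this commit's dosagelib is importable:

from dosagelib.scraper import Scraper, _BasicScraper, _ParserScraper

print(issubclass(_BasicScraper, Scraper))   # True
print(issubclass(_ParserScraper, Scraper))  # True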
@@ -7,9 +7,9 @@ try:
 except ImportError:
     from urllib import quote as url_quote, unquote as url_unquote
 try:
-    from urllib.parse import urlparse, urlunparse, urljoin, urlsplit
+    from urllib.parse import urlparse, urlunparse, urlsplit
 except ImportError:
-    from urlparse import urlparse, urlunparse, urljoin, urlsplit
+    from urlparse import urlparse, urlunparse, urlsplit
 try:
     from urllib import robotparser
 except ImportError:
@@ -176,8 +176,6 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-
 def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
@@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     if not isValidPageContent(data):
         raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
-    # determine base URL
-    baseUrl = None
-    match = baseSearch.search(data)
-    if match:
-        baseUrl = match.group(1)
-    else:
-        baseUrl = url
-    return data, baseUrl
+    return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
@@ -226,40 +217,16 @@ def makeSequence(item):
     return (item,)
 
 
-def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern(s) in a HTML page."""
-    searchUrls = []
-    searches = makeSequence(urlSearch)
-    for search in searches:
-        for match in search.finditer(data):
-            searchUrl = match.group(1)
-            if not searchUrl:
-                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
-            out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
-            searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
-        if searchUrls:
-            # do not search other links if one pattern matched
-            break
-    if not searchUrls:
-        patterns = [x.pattern for x in searches]
-        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
-    return searchUrls
-
-
-def fetchUrl(url, data, baseUrl, urlSearch):
-    """Search first URL entry for given URL pattern in a HTML page."""
-    return fetchUrls(url, data, baseUrl, urlSearch)[0]
-
-
-def fetchText(url, data, textSearch, optional=False):
-    """Search text entry for given text pattern in a HTML page."""#
-    match = textSearch.search(data)
-    if match:
-        text = match.group(1)
-        out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
-        return text
-    if not optional:
-        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
+def prettyMatcherList(things):
+    """Try to construct a nicely-formatted string for a list of matcher
+    objects. Those may be compiled regular expressions or strings..."""
+    norm = []
+    for x in makeSequence(things):
+        if hasattr(x, 'pattern'):
+            norm.append(x.pattern)
+        else:
+            norm.append(x)
+    return "('%s')" % "', '".join(norm)
 
 
 _htmlparser = HTMLParser()
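Small usage example for the new prettyMatcherList() helper (the regular expressions are made up); it produces the strings used by the "with expressions …" warnings in scraper.py:

import re
from dosagelib.util import prettyMatcherList

print(prettyMatcherList(re.compile(r'<a href="([^"]+)"')))
# ('<a href="([^"]+)"')
print(prettyMatcherList([re.compile(r'(foo)'), '//a[@rel="prev"]']))
# ('(foo)', '//a[@rel="prev"]')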
@@ -2,3 +2,4 @@
 requests
 # optional:
 argcomplete
+lxml