Simplify the fetchUrl code.
parent ae81b88acd
commit 67836942d8
6 changed files with 55 additions and 87 deletions
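
In short, fetchUrl() and fetchUrls() no longer fetch pages themselves: callers now call getPageContent() once and pass the page text and base URL into any number of searches, and a search that finds nothing raises ValueError instead of returning None. Below is a minimal sketch of the new call sequence; the fetch is stubbed and the pattern, URLs and HTML are invented for illustration.

    import re

    def getPageContent(url, session=None):
        # Stub standing in for the real HTTP fetch; returns (data, baseUrl).
        return '<a href="/comic/41" rel="prev">previous</a>', url

    def fetchUrl(url, data, baseUrl, urlSearch):
        # Simplified stand-in for the new util.fetchUrl: search
        # already-fetched data, raise on no match.
        match = urlSearch.search(data)
        if not match:
            raise ValueError("pattern %s not found at %s" % (urlSearch.pattern, url))
        return match.group(1)

    prevSearch = re.compile(r'href="([^"]+)" rel="prev"')
    url = 'http://example.com/comic/42'
    data, baseUrl = getPageContent(url)              # fetch the page once
    print(fetchUrl(url, data, baseUrl, prevSearch))  # -> /comic/41
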
[file 1 of 6]

@@ -1,7 +1,7 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
-from .util import fetchUrl, getQueryParams
+from .util import fetchUrl, getPageContent, getQueryParams
 
 def queryNamer(paramName, usePageUrl=False):
     """Get name from URL query part."""
@@ -29,24 +29,18 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        url1 = fetchUrl(url, cls.prevSearch, session=cls.session)
-        if not url1:
-            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, url))
-        url2 = fetchUrl(url1, nextSearch, session=cls.session)
-        if not url2:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, url1))
-        return url2
+        data, baseUrl = getPageContent(url, session=cls.session)
+        url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
+        data, baseUrl = getPageContent(url1, session=cls.session)
+        return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
 
 
-def indirectStarter(baseUrl, latestSearch):
+def indirectStarter(url, latestSearch):
     """Get start URL by indirection."""
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        url = fetchUrl(baseUrl, latestSearch, session=cls.session)
-        if not url:
-            raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
-        return url
+        data, baseUrl = getPageContent(url, session=cls.session)
+        return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter
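
After the rewrite, indirectStarter() is a single fetch plus a single search; renaming its parameter from baseUrl to url frees the name baseUrl for the value returned by getPageContent(). A runnable miniature of the new function, with the fetch stubbed and the scraper class, pattern and page data invented:

    import re

    def getPageContent(url, session=None):
        # Stub for the real fetch in util.py: returns (page text, base URL).
        return '<a href="/latest" id="latest">newest</a>', url

    def fetchUrl(url, data, baseUrl, urlSearch):
        # Simplified stand-in for the new util.fetchUrl.
        match = urlSearch.search(data)
        if not match:
            raise ValueError('pattern %s not found at %s' % (urlSearch.pattern, url))
        return match.group(1)

    def indirectStarter(url, latestSearch):
        """Get start URL by indirection."""
        @classmethod
        def _starter(cls):
            data, baseUrl = getPageContent(url, session=cls.session)
            return fetchUrl(url, data, baseUrl, latestSearch)
        return _starter

    class Scraper(object):
        session = None
        starter = indirectStarter('http://example.com/',
                                  re.compile(r'href="([^"]+)" id="latest"'))

    print(Scraper.starter())  # -> /latest
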
[file 2 of 6]

@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, getQueryParams, fetchUrl
+from ..util import tagre, getQueryParams, fetchUrl, getPageContent
 
 
 _linkTag = tagre("a", "href", r'([^"]+)')
@@ -25,17 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        url = fetchUrl(baseUrl, _prevSearch)
-        if not url:
+        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        try:
+            url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
+        except ValueError:
             # no previous link found, try hopping to last comic
-            url = fetchUrl(baseUrl, _lastSearch)
-            if not url:
-                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
-            return url
-        url = fetchUrl(url, _nextSearch)
-        if not url:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
-        return url
+            return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
+        else:
+            data, _baseUrl = getPageContent(url, session=cls.session)
+            return fetchUrl(url, data, _baseUrl, _nextSearch)
 
     attrs = dict(
         name='CloneManga/' + name,
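
The rewritten starter leans on try/except/else: the else branch runs only when the previous-link search succeeded, while a ValueError from fetchUrl triggers the fallback to the last-comic search. The next file below gets the identical treatment. A standalone sketch of that control flow, with toy stand-ins for the searches:

    def find_prev(page):
        # Stand-in for fetchUrl(..., _prevSearch): raises when nothing matches.
        if 'prev' not in page:
            raise ValueError('no previous link')
        return 'prev-url'

    def starter(page):
        try:
            url = find_prev(page)
        except ValueError:
            # No previous link: fall back to the last-comic search.
            return 'last-url'
        else:
            # A previous link exists: hop forward from it instead.
            return 'next-after-' + url

    print(starter('page with prev link'))  # -> next-after-prev-url
    print(starter('first strip'))          # -> last-url
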
[file 3 of 6]

@@ -4,7 +4,7 @@
 
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, fetchUrl
+from ..util import tagre, fetchUrl, getPageContent
 
 # note: adding the compile() functions inside add() is a major performance hog
 _imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image"))
@@ -26,17 +26,15 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        url = fetchUrl(_url, _prevSearch)
-        if not url:
+        data, baseUrl = getPageContent(_url, session=cls.session)
+        try:
+            url = fetchUrl(_url, data, baseUrl, _prevSearch)
+        except ValueError:
             # no previous link found, try hopping to last comic
-            url = fetchUrl(_url, _lastSearch)
-            if not url:
-                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, _url))
-            return url
-        url = fetchUrl(url, _nextSearch)
-        if not url:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
-        return url
+            return fetchUrl(_url, data, baseUrl, _lastSearch)
+        else:
+            data, baseUrl = getPageContent(url, session=cls.session)
+            return fetchUrl(url, data, baseUrl, _nextSearch)
 
     globals()[classname] = make_scraper(classname,
         name = 'DrunkDuck/' + name,
[file 4 of 6]

@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, quote, fetchUrl, case_insensitive_re
+from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
 
 _imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"'))
 _linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?')
@@ -30,15 +30,14 @@ def add(name, url, description, adult, bounce):
     @classmethod
     def _starter(cls):
         """Get start URL."""
-        url1 = fetchUrl(modifier(url), cls.prevSearch, session=cls.session)
-        if not url1:
-            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, modifier(url)))
+        url1 = modifier(url)
+        data, baseUrl = getPageContent(url1, session=cls.session)
+        url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            url2 = fetchUrl(modifier(url1), _nextSearch, session=cls.session)
-            if not url2:
-                raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, modifier(url1)))
-            return modifier(url2)
-        return modifier(url1)
+            data, baseUrl = getPageContent(url2, session=cls.session)
+            url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
+            return modifier(url3)
+        return modifier(url2)
 
     @classmethod
     def namer(cls, imageUrl, pageUrl):
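
When bounce is set, this starter hops from the start page to the previous strip and then forward again via the next link, which lands on a canonical URL for the newest strip; the rewrite names each hop url1/url2/url3. A toy model of that flow, with a dict standing in for fetched pages and modifier() omitted:

    # Toy link graph: url -> (prev, next); None means no such link.
    pages = {
        'start':    ('strip-41', None),
        'strip-41': ('strip-40', 'strip-42'),
    }

    def hop(url, direction):
        # Stand-in for getPageContent() + fetchUrl(): raises when no link matches.
        target = pages[url][0 if direction == 'prev' else 1]
        if not target:
            raise ValueError('no %s link at %s' % (direction, url))
        return target

    def starter(bounce):
        url1 = 'start'
        url2 = hop(url1, 'prev')       # back once from the start page
        if bounce:
            url3 = hop(url2, 'next')   # forward once: the canonical latest URL
            return url3
        return url2

    print(starter(True))   # -> strip-42
    print(starter(False))  # -> strip-41
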
[file 5 of 6]

@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 import requests
 from . import loader
-from .util import fetchUrls
+from .util import fetchUrl, fetchUrls, getPageContent
 from .comic import ComicStrip
 from .output import out
 
@@ -62,7 +62,8 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]
+        data, baseUrl = getPageContent(url, session=self.session)
+        imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -97,12 +98,13 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
-                                           self.prevSearch, session=self.session)
+            data, baseUrl = getPageContent(url, session=self.session)
+            imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
+            yield self.getComicStrip(url, imageUrls)
+            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
-            yield self.getComicStrip(url, imageUrls)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
                 out.warn("Already seen previous URL %r" % prevUrl)
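
The traversal loop now yields each comic strip before resolving the previous link, so the current strip reaches the consumer even if the previous-link search then raises, and the seen_urls set still breaks out of recursive link loops. A self-contained sketch of that generator pattern over a toy page graph (names and data invented):

    pages = {'c3': 'c2', 'c2': 'c1', 'c1': 'c3'}  # c1 links back to c3

    def get_strips(url):
        seen_urls = set()
        while url:
            yield 'strip at ' + url       # yield first, like the new loop
            prev_url = pages.get(url)     # stand-in for fetchUrl(prevSearch)
            seen_urls.add(url)
            if prev_url in seen_urls:     # avoid recursive URL loops
                break
            url = prev_url

    print(list(get_strips('c3')))  # visits c3, c2, c1 exactly once
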
[file 6 of 6]

@@ -99,8 +99,7 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     """Get text content of given URL."""
     check_robotstxt(url)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes,
-                   session=session)
+    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -117,45 +116,23 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
-def fetchUrl(url, urlSearch, session=None):
-    """Search for given URL pattern in a HTML page."""
-    data, baseUrl = getPageContent(url, session=session)
-    match = urlSearch.search(data)
-    if match:
+def fetchUrls(url, data, baseUrl, urlSearch):
+    """Search all entries for given URL pattern in a HTML page."""
+    searchUrls = []
+    for match in urlSearch.finditer(data):
         searchUrl = match.group(1)
         if not searchUrl:
-            raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
-        out.debug('matched URL %r' % searchUrl)
-        return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
-    return None
+            raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
+        out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
+        searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
+    if not searchUrls:
+        raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data))
+    return searchUrls
 
 
-def fetchUrls(url, imageSearch, prevSearch=None, session=None):
-    """Search for given image and previous URL pattern in a HTML page."""
-    data, baseUrl = getPageContent(url, session=session)
-    # match images
-    imageUrls = set()
-    for match in imageSearch.finditer(data):
-        imageUrl = match.group(1)
-        if not imageUrl:
-            raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
-        out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
-        imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
-    if not imageUrls:
-        out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
-    if prevSearch is not None:
-        # match previous URL
-        match = prevSearch.search(data)
-        if match:
-            prevUrl = match.group(1)
-            if not prevUrl:
-                raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
-            prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
-        else:
-            out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
-            prevUrl = None
-        return imageUrls, prevUrl
-    return imageUrls, None
+def fetchUrl(url, data, baseUrl, urlSearch):
+    """Search first entry for given URL pattern in a HTML page."""
+    return fetchUrls(url, data, baseUrl, urlSearch)[0]
 
 
 def unescape(text):
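
Because data and baseUrl are now parameters, the reworked fetchUrls()/fetchUrl() pair is pure string searching and can be exercised without any network access. A standalone check of the behaviour above, with the logic inlined from the diff (minus out.debug and URL normalisation) and invented HTML:

    import re

    def fetchUrls(url, data, baseUrl, urlSearch):
        # Same logic as the new util.fetchUrls, minus debug output
        # and URL normalisation.
        searchUrls = [match.group(1) for match in urlSearch.finditer(data)]
        if not searchUrls:
            raise ValueError("Pattern %s not found at URL %s with data %r."
                             % (urlSearch.pattern, url, data))
        return searchUrls

    def fetchUrl(url, data, baseUrl, urlSearch):
        return fetchUrls(url, data, baseUrl, urlSearch)[0]

    search = re.compile(r'<a href="([^"]+)">')
    data = '<a href="/c/1">one</a> <a href="/c/2">two</a>'
    print(fetchUrls('page-url', data, '/', search))  # -> ['/c/1', '/c/2']
    print(fetchUrl('page-url', data, '/', search))   # -> /c/1
    try:
        fetchUrl('page-url', 'no links here', '/', search)
    except ValueError as msg:
        print('raised: %s' % msg)
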