Add cookie feature.
commit 4def4b81bd
parent 9130f90ef7
2 changed files with 26 additions and 12 deletions
@@ -29,6 +29,12 @@ class _BasicScraper(object):
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
 
+    # cookies to send for requests
+    cookies = None
+
+    # set to True if this comic contains adult content
+    adult = False
+
     # usually the index format help
     help = 'Sorry, no help for this comic yet.'
 
@@ -53,7 +59,7 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        imageUrls = fetchUrls(url, self.imageSearch, cookies=self.cookies)[0]
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -86,13 +92,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
+                                           self.prevSearch, cookies=self.cookies)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             yield self.getComicStrip(url, imageUrls)
-            # avoid recursive URL loops
-            url = prevUrl if prevUrl not in seen_urls else None
+            if prevUrl in seen_urls:
+                # avoid recursive URL loops
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
+            url = prevUrl
             if maxstrips is not None:
                 maxstrips -= 1
                 if maxstrips <= 0:
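
For context (not part of this commit), a minimal sketch of how a scraper subclass could use the new class attributes; the comic name, import path, regular expressions, cookie values and help text are made up for illustration.

import re
from dosagelib.scraper import _BasicScraper  # import path assumed, not shown in this diff

# Hypothetical subclass; all names, patterns and values are placeholders.
class ExampleComic(_BasicScraper):
    imageSearch = re.compile(r'<img src="(/comics/[^"]+)"')
    prevSearch = re.compile(r'<a href="([^"]+)">Previous</a>')
    # added in this commit: sent with every page request made for this comic
    cookies = {'age_verified': '1'}
    # also added in this commit: flag comics with adult content
    adult = True
    help = 'Index format: yyyy-mm-dd'

getStrip() and the strip iterator above then forward self.cookies to fetchUrls(), so the cookie is sent on every page fetch for this comic.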
@@ -78,9 +78,10 @@ def case_insensitive_re(name):
 
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
-def getPageContent(url, max_content_bytes=MaxContentBytes):
+def getPageContent(url, max_content_bytes=MaxContentBytes, cookies=None):
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes)
+    page = urlopen(url, max_content_bytes=max_content_bytes,
+                   cookies=cookies)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -97,8 +98,8 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
-def fetchUrl(url, urlSearch):
-    data, baseUrl = getPageContent(url)
+def fetchUrl(url, urlSearch, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     match = urlSearch.search(data)
     if match:
         searchUrl = match.group(1)
@@ -109,8 +110,8 @@ def fetchUrl(url, urlSearch):
     return None
 
 
-def fetchUrls(url, imageSearch, prevSearch=None):
-    data, baseUrl = getPageContent(url)
+def fetchUrls(url, imageSearch, prevSearch=None, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     # match images
     imageUrls = set()
     for match in imageSearch.finditer(data):
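
Illustrative only (not part of the diff): with the widened signatures above, a caller can hand a cookie dict straight to the helpers; the URL, pattern and cookie value below are placeholders.

import re

imageSearch = re.compile(r'<img src="([^"]+)"')
# fetchUrls passes the dict to getPageContent, which passes it on to urlopen.
imageUrls = fetchUrls('http://www.example.com/comic/42', imageSearch,
                      cookies={'session': 'abc123'})[0]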
@@ -186,7 +187,7 @@ def normaliseURL(url):
 
 
 def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs):
+            timeout=ConnectionTimeoutSecs, cookies=None):
     out.debug('Open URL %s' % url)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
@@ -194,8 +195,11 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_byt
     config = {"max_retries": retries}
     if referrer:
         headers['Referer'] = referrer
+    if not cookies:
+        cookies = {}
     try:
-        req = requests.get(url, headers=headers, config=config, prefetch=False, timeout=timeout)
+        req = requests.get(url, headers=headers, config=config,
+                           prefetch=False, timeout=timeout, cookies=cookies)
         check_content_size(url, req.headers, max_content_bytes)
         req.raise_for_status()
         return req
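
The cookie dict finally reaches requests.get() above. The config= and prefetch= keywords in this code belong to the older requests 0.x API, but cookies= works the same way in current releases; a standalone sketch with a placeholder URL and values:

import requests

resp = requests.get('http://www.example.com/comic/42',
                    cookies={'age_verified': '1'},  # sent as a Cookie header
                    timeout=60)
resp.raise_for_status()
print(len(resp.text))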