Add cookie feature.

parent 9130f90ef7
commit 4def4b81bd

2 changed files with 26 additions and 12 deletions
@@ -29,6 +29,12 @@ class _BasicScraper(object):
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
 
+    # cookies to send for requests
+    cookies = None
+
     # set to True if this comic contains adult content
     adult = False
 
     # usually the index format help
     help = 'Sorry, no help for this comic yet.'
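Not part of this commit: a minimal sketch of how a comic module could use the new class attribute. The class name, URLs, search patterns, and import path below are hypothetical; only the cookies attribute itself comes from the diff above.

import re
from ..scraper import _BasicScraper   # assumed module layout, for illustration only

class ExampleComic(_BasicScraper):
    # hypothetical comic definition
    url = 'http://www.example.com/comics/'
    stripUrl = url + '%s.html'
    imageSearch = re.compile(r'<img src="(/strips/[^"]+)"')
    prevSearch = re.compile(r'<a href="([^"]+)">Previous</a>')
    # the new attribute: a plain dict that gets passed through to requests.get()
    cookies = {'age_verified': '1'}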
@@ -53,7 +59,7 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        imageUrls = fetchUrls(url, self.imageSearch, cookies=self.cookies)[0]
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -86,13 +92,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
+                                           self.prevSearch, cookies=self.cookies)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             yield self.getComicStrip(url, imageUrls)
+            if prevUrl in seen_urls:
                 # avoid recursive URL loops
-            url = prevUrl if prevUrl not in seen_urls else None
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
+            url = prevUrl
             if maxstrips is not None:
                 maxstrips -= 1
                 if maxstrips <= 0:
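The hunk above also tightens the previous-URL loop guard: instead of silently setting url to None on a repeat, the generator now warns and stops. A self-contained sketch of that pattern, with hypothetical names, not code from the repository:

def walk_prev(start, prev_of):
    """Yield pages by following prev links, stopping on the first repeat."""
    seen, url = set(), start
    while url:
        seen.add(url)
        yield url
        prev = prev_of(url)
        if prev in seen:
            # avoid recursive URL loops
            print("Already seen previous URL %r" % prev)
            break
        url = prev

# e.g. a prev-link chain that loops back: c -> b -> a -> c
links = {'c': 'b', 'b': 'a', 'a': 'c'}
print(list(walk_prev('c', links.get)))   # ['c', 'b', 'a']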
@@ -78,9 +78,10 @@ def case_insensitive_re(name):
 
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
-def getPageContent(url, max_content_bytes=MaxContentBytes):
+def getPageContent(url, max_content_bytes=MaxContentBytes, cookies=None):
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes)
+    page = urlopen(url, max_content_bytes=max_content_bytes,
+                   cookies=cookies)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -97,8 +98,8 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
-def fetchUrl(url, urlSearch):
-    data, baseUrl = getPageContent(url)
+def fetchUrl(url, urlSearch, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     match = urlSearch.search(data)
     if match:
         searchUrl = match.group(1)
@@ -109,8 +110,8 @@ def fetchUrl(url, urlSearch):
     return None
 
 
-def fetchUrls(url, imageSearch, prevSearch=None):
-    data, baseUrl = getPageContent(url)
+def fetchUrls(url, imageSearch, prevSearch=None, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     # match images
     imageUrls = set()
     for match in imageSearch.finditer(data):
@@ -186,7 +187,7 @@ def normaliseURL(url):
 
 
 def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs):
+            timeout=ConnectionTimeoutSecs, cookies=None):
     out.debug('Open URL %s' % url)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
@@ -194,8 +195,11 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_byt
     config = {"max_retries": retries}
     if referrer:
         headers['Referer'] = referrer
+    if not cookies:
+        cookies = {}
     try:
-        req = requests.get(url, headers=headers, config=config, prefetch=False, timeout=timeout)
+        req = requests.get(url, headers=headers, config=config,
+                           prefetch=False, timeout=timeout, cookies=cookies)
         check_content_size(url, req.headers, max_content_bytes)
         req.raise_for_status()
         return req
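As a sanity check of the underlying behaviour, not part of the commit: requests.get() accepts a plain dict for its cookies keyword and sends it as a Cookie header. The snippet below uses the current requests API; the config= and prefetch= arguments seen in the diff belong to the older requests release in use here and are omitted. httpbin.org is used only as a convenient echo service.

import requests

resp = requests.get('http://httpbin.org/cookies',
                    cookies={'age_verified': '1'}, timeout=60)
resp.raise_for_status()
print(resp.json())   # expected: {'cookies': {'age_verified': '1'}}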