Add cookie feature.

Bastian Kleineidam 2012-12-08 21:29:57 +01:00
parent 9130f90ef7
commit 4def4b81bd
2 changed files with 26 additions and 12 deletions


@@ -29,6 +29,12 @@ class _BasicScraper(object):
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
+    # cookies to send for requests
+    cookies = None
+    # set to True if this comic contains adult content
+    adult = False
     # usually the index format help
     help = 'Sorry, no help for this comic yet.'
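
A comic module that needs the new attributes simply overrides them on its scraper class. A minimal hypothetical sketch (the AdultComic class, URL, regular expressions, and cookie name/value are placeholder assumptions, not part of this commit):

    from re import compile

    class AdultComic(_BasicScraper):
        # hypothetical module built on the _BasicScraper class shown above
        latestUrl = 'http://www.example.com/comic/'
        imageSearch = compile(r'<img src="(http://www\.example\.com/strips/[^"]+)"')
        prevSearch = compile(r'<a href="(http://www\.example\.com/comic/\d+)">Previous</a>')
        # sent with every page request made for this comic
        cookies = {'age_verified': '1'}
        adult = True
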
@@ -53,7 +59,7 @@ class _BasicScraper(object):
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        imageUrls = fetchUrls(url, self.imageSearch, cookies=self.cookies)[0]
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -86,13 +92,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
+                                           self.prevSearch, cookies=self.cookies)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             yield self.getComicStrip(url, imageUrls)
-            # avoid recursive URL loops
-            url = prevUrl if prevUrl not in seen_urls else None
+            if prevUrl in seen_urls:
+                # avoid recursive URL loops
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
+            url = prevUrl
             if maxstrips is not None:
                 maxstrips -= 1
                 if maxstrips <= 0:
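
The reworked traversal logic, shown in isolation: instead of silently setting url to None, the loop now warns and breaks when the previous-URL chain revisits a page. A simplified standalone sketch (walk_back and get_prev are illustrative names, not part of the codebase):

    def walk_back(start_url, get_prev, maxstrips=None):
        """Yield URLs following the previous-URL chain, guarding against loops."""
        seen_urls = set()
        url = start_url
        while url:
            seen_urls.add(url)
            yield url
            prev_url = get_prev(url)
            if prev_url in seen_urls:
                # avoid recursive URL loops, as in the diff above
                break
            url = prev_url
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break
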


@@ -78,9 +78,10 @@ def case_insensitive_re(name):
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-def getPageContent(url, max_content_bytes=MaxContentBytes):
+def getPageContent(url, max_content_bytes=MaxContentBytes, cookies=None):
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes)
+    page = urlopen(url, max_content_bytes=max_content_bytes,
+                   cookies=cookies)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -97,8 +98,8 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
-def fetchUrl(url, urlSearch):
-    data, baseUrl = getPageContent(url)
+def fetchUrl(url, urlSearch, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     match = urlSearch.search(data)
     if match:
         searchUrl = match.group(1)
@@ -109,8 +110,8 @@ def fetchUrl(url, urlSearch):
     return None
-def fetchUrls(url, imageSearch, prevSearch=None):
-    data, baseUrl = getPageContent(url)
+def fetchUrls(url, imageSearch, prevSearch=None, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     # match images
     imageUrls = set()
     for match in imageSearch.finditer(data):
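
The new keyword also threads through for direct callers of these helpers; a hypothetical standalone call mirroring getStrip above (URL, pattern, and cookie values are placeholders):

    from re import compile

    imageSearch = compile(r'<img src="([^"]+)"')
    # fetch all image URLs on the page, sending a cookie with the request
    imageUrls = fetchUrls('http://www.example.com/comic/42', imageSearch,
                          cookies={'session': 'abc123'})[0]
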
@@ -186,7 +187,7 @@ def normaliseURL(url):
 def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs):
+            timeout=ConnectionTimeoutSecs, cookies=None):
     out.debug('Open URL %s' % url)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
@@ -194,8 +195,11 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_byt
     config = {"max_retries": retries}
     if referrer:
         headers['Referer'] = referrer
+    if not cookies:
+        cookies = {}
     try:
-        req = requests.get(url, headers=headers, config=config, prefetch=False, timeout=timeout)
+        req = requests.get(url, headers=headers, config=config,
+                           prefetch=False, timeout=timeout, cookies=cookies)
         check_content_size(url, req.headers, max_content_bytes)
         req.raise_for_status()
         return req
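
At the bottom of the chain, requests accepts a plain dict for cookies and sends it as the Cookie request header. A minimal standalone sketch of an equivalent call (URL, header, and cookie values are placeholders; the config= and prefetch= keywords from the diff are omitted because they belong to the requests 0.x API this code targets):

    import requests

    # the dict passed as cookies= is sent as the Cookie header, e.g. "Cookie: age_verified=1"
    req = requests.get('http://www.example.com/comic/',
                       headers={'User-Agent': 'dosage'},
                       cookies={'age_verified': '1'},
                       timeout=60)
    req.raise_for_status()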