From 4def4b81bdb69d142a57ca0e9b07291c1426d252 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Sat, 8 Dec 2012 21:29:57 +0100
Subject: [PATCH] Add cookie feature.

---
 dosagelib/scraper.py | 18 ++++++++++++++----
 dosagelib/util.py    | 20 ++++++++++++--------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 04cae62f2..88bd7f1e9 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -29,6 +29,12 @@ class _BasicScraper(object):
     # set to False if previous URLs do not match the strip URL (ie. because of redirects)
     prevUrlMatchesStripUrl = True
 
+    # cookies to send for requests
+    cookies = None
+
+    # set to True if this comic contains adult content
+    adult = False
+
     # usually the index format help
     help = 'Sorry, no help for this comic yet.'
 
@@ -53,7 +59,7 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        imageUrls = fetchUrls(url, self.imageSearch, cookies=self.cookies)[0]
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -86,13 +92,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch, self.prevSearch)
+            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
+                                           self.prevSearch, cookies=self.cookies)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             yield self.getComicStrip(url, imageUrls)
-            # avoid recursive URL loops
-            url = prevUrl if prevUrl not in seen_urls else None
+            if prevUrl in seen_urls:
+                # avoid recursive URL loops
+                out.warn("Already seen previous URL %r" % prevUrl)
+                break
+            url = prevUrl
             if maxstrips is not None:
                 maxstrips -= 1
                 if maxstrips <= 0:
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 14633e746..7a5fcf2eb 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -78,9 +78,10 @@ def case_insensitive_re(name):
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
 
-def getPageContent(url, max_content_bytes=MaxContentBytes):
+def getPageContent(url, max_content_bytes=MaxContentBytes, cookies=None):
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes)
+    page = urlopen(url, max_content_bytes=max_content_bytes,
+                   cookies=cookies)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -97,8 +98,8 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
-def fetchUrl(url, urlSearch):
-    data, baseUrl = getPageContent(url)
+def fetchUrl(url, urlSearch, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     match = urlSearch.search(data)
     if match:
         searchUrl = match.group(1)
@@ -109,8 +110,8 @@
     return None
 
 
-def fetchUrls(url, imageSearch, prevSearch=None):
-    data, baseUrl = getPageContent(url)
+def fetchUrls(url, imageSearch, prevSearch=None, cookies=None):
+    data, baseUrl = getPageContent(url, cookies=cookies)
     # match images
     imageUrls = set()
     for match in imageSearch.finditer(data):
@@ -186,7 +187,7 @@ def normaliseURL(url):
 
 
 def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs):
+            timeout=ConnectionTimeoutSecs, cookies=None):
     out.debug('Open URL %s' % url)
    assert retries >= 0, 'invalid retry value %r' % retries
    assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
@@ -194,8 +195,11 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_byt
     config = {"max_retries": retries}
     if referrer:
         headers['Referer'] = referrer
+    if not cookies:
+        cookies = {}
     try:
-        req = requests.get(url, headers=headers, config=config, prefetch=False, timeout=timeout)
+        req = requests.get(url, headers=headers, config=config,
+                           prefetch=False, timeout=timeout, cookies=cookies)
         check_content_size(url, req.headers, max_content_bytes)
         req.raise_for_status()
         return req
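
Usage sketch (illustration, not part of the patch): a comic module can now attach cookies to every page request made while scraping it by setting the new cookies class attribute, which getStrip() and the strip-traversal loop pass down through fetchUrls() into urlopen(). The module below is hypothetical; its name, URLs, search patterns and cookie value are invented for illustration, and only the cookies and adult attributes come from this patch.

    # hypothetical plugin module; SomeAdultComic, its URLs and the cookie
    # value are made up -- only `cookies` and `adult` are real attributes
    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre

    class SomeAdultComic(_BasicScraper):
        latestUrl = 'http://www.example.com/comic/'
        stripUrl = latestUrl + '%s'
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
        # sent with every page request for this comic, e.g. to satisfy
        # an age-verification gate
        cookies = {'age_verified': 'yes'}
        adult = True
        help = 'Index format: yyyy/mm/dd'

Modules that leave cookies at its default of None are unaffected, since urlopen() falls back to an empty dict before calling requests.get().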