Move all regular expression operations into the new class.
- Move fetchUrls, fetchUrl and fetchText.
- Move base URL handling.
parent fde1fdced6
commit 0e03eca8f0
2 changed files with 48 additions and 52 deletions
@@ -5,8 +5,14 @@ import requests
 import time
 import random
 import os
+import re
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
 from . import loader, configuration, util
-from .util import (makeSequence, get_system_uid, urlopen, getDirname)
+from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
+                   getDirname, unescape, tagre, normaliseURL)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -315,20 +321,55 @@ class _BasicScraper(Scraper):
     any).
     """
 
+    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
+
     @classmethod
     def getPage(cls, url):
-        content, baseUrl = util.getPageContent(url, cls.session)
+        content = getPageContent(url, cls.session)
+        # determine base URL
+        baseUrl = None
+        match = cls.BASE_SEARCH.search(content)
+        if match:
+            baseUrl = match.group(1)
+        else:
+            baseUrl = url
         return (content, baseUrl)
 
     @classmethod
     def fetchUrls(cls, url, data, urlSearch):
         """Search all entries for given URL pattern(s) in a HTML page."""
-        return util.fetchUrls(url, data[0], data[1], urlSearch)
+        searchUrls = []
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in search.finditer(data[0]):
+                searchUrl = match.group(1)
+                if not searchUrl:
+                    raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
+                out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
+                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            patterns = [x.pattern for x in searches]
+            raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
+        return searchUrls
 
     @classmethod
     def fetchText(cls, url, data, textSearch, optional):
         """Search text entry for given text pattern in a HTML page."""
-        return util.fetchText(url, data[0], textSearch, optional)
+        if textSearch:
+            match = textSearch.search(data[0])
+            if match:
+                text = match.group(1)
+                out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
+                return unescape(text).strip()
+            if optional:
+                return None
+            else:
+                raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
+        else:
+            return None
 
 
 def find_scraperclasses(comic, multiple_allowed=False):
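For reference, a minimal standalone sketch of the fetchUrls() behaviour that now lives on the class: the first pattern that yields matches wins, every match is resolved against the page's base URL, and an empty capture group is an error. The helper name, sample HTML and pattern below are illustrative assumptions, not part of the changed code:

import re
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

def fetch_urls(url, content, base_url, url_search):
    # url_search may be a single compiled pattern or a sequence of them
    searches = url_search if isinstance(url_search, (list, tuple)) else (url_search,)
    found = []
    for search in searches:
        for match in search.finditer(content):
            link = match.group(1)
            if not link:
                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
            found.append(urljoin(base_url, link))
        if found:
            break  # do not try the remaining patterns once one has matched
    if not found:
        patterns = [s.pattern for s in searches]
        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
    return found

html = '<a href="strips/42.html">next</a> <a href="strips/41.html">previous</a>'
pattern = re.compile(r'<a href="([^"]+)">')
print(fetch_urls("http://example.com/", html, "http://example.com/", pattern))
# ['http://example.com/strips/42.html', 'http://example.com/strips/41.html']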
@@ -7,9 +7,9 @@ try:
 except ImportError:
     from urllib import quote as url_quote, unquote as url_unquote
 try:
-    from urllib.parse import urlparse, urlunparse, urljoin, urlsplit
+    from urllib.parse import urlparse, urlunparse, urlsplit
 except ImportError:
-    from urlparse import urlparse, urlunparse, urljoin, urlsplit
+    from urlparse import urlparse, urlunparse, urlsplit
 try:
     from urllib import robotparser
 except ImportError:
@@ -176,8 +176,6 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-
 def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
@@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     if not isValidPageContent(data):
         raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
-    # determine base URL
-    baseUrl = None
-    match = baseSearch.search(data)
-    if match:
-        baseUrl = match.group(1)
-    else:
-        baseUrl = url
-    return data, baseUrl
+    return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
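With this hunk getPageContent() returns only the page data; the old (data, baseUrl) tuple is gone, and base-URL detection becomes the caller's job (see the BASE_SEARCH attribute added to the scraper class above). A short sketch of the new division of labour, using a stubbed download in place of the real session-based fetch; the stub name, the simplified regex standing in for the tagre()-built pattern, and the sample HTML are assumptions for illustration only:

import re

# simplified stand-in for the BASE_SEARCH pattern that tagre() builds in the class
BASE_SEARCH = re.compile(r'<base\s+href="([^"]*)"')

def stub_get_page_content(url):
    # stand-in for getPageContent(url, session); now returns page data only
    return '<html><head><base href="http://media.example.net/"></head></html>'

url = "http://example.net/comic/today"
content = stub_get_page_content(url)
match = BASE_SEARCH.search(content)
baseUrl = match.group(1) if match else url  # fall back to the page URL itself
print(baseUrl)  # http://media.example.net/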
@@ -226,42 +217,6 @@ def makeSequence(item):
     return (item,)
 
 
-def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern(s) in a HTML page."""
-    searchUrls = []
-    searches = makeSequence(urlSearch)
-    for search in searches:
-        for match in search.finditer(data):
-            searchUrl = match.group(1)
-            if not searchUrl:
-                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
-            out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
-            searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
-        if searchUrls:
-            # do not search other links if one pattern matched
-            break
-    if not searchUrls:
-        patterns = [x.pattern for x in searches]
-        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
-    return searchUrls
-
-
-def fetchUrl(url, data, baseUrl, urlSearch):
-    """Search first URL entry for given URL pattern in a HTML page."""
-    return fetchUrls(url, data, baseUrl, urlSearch)[0]
-
-
-def fetchText(url, data, textSearch, optional=False):
-    """Search text entry for given text pattern in a HTML page."""
-    match = textSearch.search(data)
-    if match:
-        text = match.group(1)
-        out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
-        return text
-    if not optional:
-        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
-
-
 _htmlparser = HTMLParser()
 def unescape(text):
     """Replace HTML entities and character references."""
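Note that the class-based fetchText() in the first file also unescapes HTML entities and strips whitespace before returning the capture, whereas the module-level helper removed here returned the raw match. A small standalone sketch of the new behaviour; the pattern, sample HTML and the use of Python 3's html.unescape in place of the module's own unescape() helper are assumptions for illustration:

import re
from html import unescape  # stand-in for the module's own unescape() helper

page = '<div class="text"> Fish &amp; Chips </div>'
text_search = re.compile(r'<div class="text">([^<]*)</div>')

def fetch_text(url, content, text_search, optional=False):
    # mirror of the new classmethod: unescape and strip the capture, or None/raise
    if text_search:
        match = text_search.search(content)
        if match:
            return unescape(match.group(1)).strip()
        if optional:
            return None
        raise ValueError("Pattern %s not found at URL %s." % (text_search.pattern, url))
    return None

print(fetch_text("http://example.com/", page, text_search))  # Fish & Chips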