diff --git a/dosagelib/util.py b/dosagelib/util.py
index f29fba554..3a30e70a0 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -444,11 +444,6 @@ def rfc822date(indate):
     return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))
 
 
-def asciify(name):
-    """Remove non-ascii characters from string."""
-    return re.sub("[^0-9a-zA-Z_]", "", name)
-
-
 def unquote(text):
     """Replace all percent-encoded entities in text."""
     while '%' in text:
diff --git a/scripts/arcamax.py b/scripts/arcamax.py
index e60935c60..17142f9a5 100755
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -1,18 +1,25 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get arcamax comics and save the info in a JSON file for further
+processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import re
 import sys
 import os
+
 import requests
 
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 
 json_file = __file__.replace(".py", ".json")
@@ -20,7 +27,7 @@
 url_matcher = re.compile(r'<li><a href="(/thefuncomics/[^"]+)">([^<]+)</a>')
 # names of comics to exclude
 exclude_comics = [
-    "HagartheHorrible", # better source available
+    "HagartheHorrible",  # better source available
 ]
 
 
@@ -28,15 +35,13 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         shortname = match.group(1)
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
@@ -86,7 +91,7 @@ def print_results(args):
         else:
             prefix = u''
         fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
-            str(shortname)))
+                                       str(shortname)))
 
 
 if __name__ == '__main__':
diff --git a/scripts/comicfury.py b/scripts/comicfury.py
index 66762871e..390a1ade9 100755
--- a/scripts/comicfury.py
+++ b/scripts/comicfury.py
@@ -1,19 +1,23 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get ComicFury comics and save the info in a JSON file for
 further processing.
 """
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, division, print_function
+
 import codecs
 import sys
 import os
+
 import requests
 from lxml import html
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -120,7 +124,7 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
diff --git a/scripts/comicgenesis.py b/scripts/comicgenesis.py
index dc880923b..d3fc969ce 100755
--- a/scripts/comicgenesis.py
+++ b/scripts/comicgenesis.py
@@ -1,24 +1,36 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of ComicGenesis comics and save the info in a JSON file
 for further processing.
""" -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import codecs import re import sys import os + import requests -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa +from dosagelib.util import get_page, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses -from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name +from scriptutil import (contains_case_insensitive, save_result, load_result, + truncate_name, format_name) json_file = __file__.replace(".py", ".json") -#
    Adventures of the College Pros -url_matcher = re.compile(r'
    ' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)') +#
    Adventures of the College +# Pros +url_matcher = re.compile(r'
    ' + + tagre("a", "href", r'(http://[^"]+)') + + r'([^<]+)') num_matcher = re.compile(r'Number of Days: (\d+)') # names of comics to exclude @@ -368,19 +380,18 @@ url_overrides = { "Zortic": "http://zortic.comicgenesis.com/d/20030922.html", } + def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: - data = getPageContent(url, session) + data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): url = match.group(1) + '/' - name = unescape(match.group(2)) - name = asciify(name.replace('&', 'And').replace('@', 'At')) - name = capfirst(name) + name = format_name(match.group(2)) if name in exclude_comics: continue if contains_case_insensitive(res, name): @@ -391,13 +402,13 @@ def handle_url(url, session, res): end = match.end() mo = num_matcher.search(data[end:]) if not mo: - print("ERROR:", repr(data[end:end+300]), file=sys.stderr) + print("ERROR:", repr(data[end:end + 300]), file=sys.stderr) continue num = int(mo.group(1)) url = url_overrides.get(name, url) try: if "/d/" not in url: - check_robotstxt(url+"d/", session) + check_robotstxt(url + "d/", session) else: check_robotstxt(url, session) except IOError: diff --git a/scripts/create-cbz.py b/scripts/create-cbz.py index 0700f0488..fe3b53366 100755 --- a/scripts/create-cbz.py +++ b/scripts/create-cbz.py @@ -1,15 +1,20 @@ #!/usr/bin/env python -# Copyright (C) 2013-2014 Bastian Kleineidam +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ Creates a CBZ file in the comic directory. Uses an ordered symlink directory (see order-symlinks.py) if it exists, else the plain files are used. """ -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import sys import os import zipfile -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa from dosagelib.configuration import App @@ -21,6 +26,7 @@ ImageExts = ( ".png", ) + def is_image(filename): """Determine if given filename is an image.""" # note: isfile() also accepts symlinks diff --git a/scripts/drunkduck.py b/scripts/drunkduck.py index 147520d1f..b45b6f875 100755 --- a/scripts/drunkduck.py +++ b/scripts/drunkduck.py @@ -1,17 +1,25 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ -Script to get a list of drunkduck comics and save the info in a JSON file for further processing. +Script to get a list of drunkduck comics and save the info in a JSON file for +further processing. 
""" -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import codecs import re import sys import os + import requests -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify -from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa +from dosagelib.util import tagre, get_page, unquote, unescape +from scriptutil import (contains_case_insensitive, capfirst, save_result, + load_result, truncate_name, asciify) json_file = __file__.replace(".py", ".json") @@ -169,7 +177,7 @@ exclude_comics = [ def handle_url(url, session, url_matcher, num_matcher, res): """Parse one search result page.""" try: - data = getPageContent(url, session) + data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return @@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res): end = match.end(1) mo = num_matcher.search(data[end:]) if not mo: - print("ERROR:", repr(data[end:end+300]), file=sys.stderr) + print("ERROR:", repr(data[end:end + 300]), file=sys.stderr) continue num = int(mo.group(1)) res[name] = (path, num) diff --git a/scripts/generate_json.sh b/scripts/generate_json.sh index 5819e881c..c01833b26 100755 --- a/scripts/generate_json.sh +++ b/scripts/generate_json.sh @@ -6,7 +6,7 @@ d=$(dirname $0) if [ $# -ge 1 ]; then list="$*" else - list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury" + list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory" fi for script in $list; do echo "Executing ${script}.py" diff --git a/scripts/gocomics.py b/scripts/gocomics.py index ded106c77..5098e85d9 100755 --- a/scripts/gocomics.py +++ b/scripts/gocomics.py @@ -1,15 +1,18 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (C) 2013-2014 Bastian Kleineidam -# Copyright (C) 2016 Tobias Gruetzmacher +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ -Script to get a list of gocomics and save the info in a JSON file for further processing. +Script to get a list of gocomics and save the info in a JSON file for further +processing. 
""" from __future__ import absolute_import, division, print_function import codecs import sys import os + import requests from lxml import html diff --git a/scripts/hook-dosagelib.py b/scripts/hook-dosagelib.py index 4969e2ea7..8870a276b 100644 --- a/scripts/hook-dosagelib.py +++ b/scripts/hook-dosagelib.py @@ -1,2 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function from PyInstaller.utils.hooks import collect_submodules hiddenimports = collect_submodules('dosagelib.plugins') diff --git a/scripts/keenspot.py b/scripts/keenspot.py index 23e2f1cb4..ada3f92ca 100755 --- a/scripts/keenspot.py +++ b/scripts/keenspot.py @@ -1,28 +1,38 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ Script to get a list of KeenSpot comics and save the info in a JSON file for further processing. """ -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import codecs import re import sys import os + import requests -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa +from dosagelib.util import get_page, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses -from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name +from scriptutil import (contains_case_insensitive, save_result, load_result, + truncate_name, format_name) + json_file = __file__.replace(".py", ".json") + url_matcher = re.compile( tagre("td", "onmouseover", r'([^"]+)') + tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') + r"(?:)?([^<]+)(?:)?" 
) + # names of comics to exclude exclude_comics = [ "BrawlintheFamily", # non-standard navigation @@ -47,23 +57,23 @@ exclude_comics = [ "YouDamnKid", # non-standard navigation ] + # links to last valid strips url_overrides = { } + def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: - data = getPageContent(url, session) + data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): comicurl = match.group(2) - name = unescape(match.group(3)) - name = asciify(name.replace('&', 'And').replace('@', 'At')) - name = capfirst(name) + name = format_name(match.group(3)) if name in exclude_comics: continue if contains_case_insensitive(res, name): @@ -72,7 +82,7 @@ def handle_url(url, session, res): continue try: if "/d/" not in comicurl: - check_robotstxt(comicurl+"d/", session) + check_robotstxt(comicurl + "d/", session) else: check_robotstxt(comicurl, session) except IOError: diff --git a/scripts/mklanguages.py b/scripts/mklanguages.py index 6e243060c..d59f83115 100755 --- a/scripts/mklanguages.py +++ b/scripts/mklanguages.py @@ -1,5 +1,11 @@ #!/usr/bin/python -# update languages.py from pycountry +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher +'''update languages.py from pycountry''' +from __future__ import absolute_import, division, print_function + import os import sys import codecs @@ -7,7 +13,8 @@ import codecs basepath = os.path.dirname(os.path.dirname(__file__)) sys.path.append(basepath) -from dosagelib.scraper import get_scraperclasses +from dosagelib.scraper import get_scraperclasses # noqa + def main(): """Update language information in dosagelib/languages.py.""" @@ -29,6 +36,7 @@ def get_used_languages(): lang[l] = scraperclass.language() return lang + def write_languages(f, l): """Write language information.""" f.write("Languages = {%s" % os.linesep) diff --git a/scripts/order-symlinks.py b/scripts/order-symlinks.py index 413bfce8b..250c1cbdb 100755 --- a/scripts/order-symlinks.py +++ b/scripts/order-symlinks.py @@ -1,25 +1,32 @@ #!/usr/bin/env python -# Copyright (C) 2013 Tobias Gruetzmacher +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ This script takes the JSON file created by 'dosage -o json' and uses the metadata to build a symlink farm in the deduced order of the comic. It created those in a subdirectory called 'inorder'. """ -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import sys import os import codecs import json + def jsonFn(d): """Get JSON filename.""" return os.path.join(d, 'dosage.json') + def loadJson(d): """Return JSON data.""" with codecs.open(jsonFn(d), 'r', 'utf-8') as f: return json.load(f) + def prepare_output(d): """Clean pre-existing links in output directory.""" outDir = os.path.join(d, 'inorder') @@ -31,6 +38,7 @@ def prepare_output(d): os.remove(f) return outDir + def create_symlinks(d): """Create new symbolic links in output directory.""" data = loadJson(d) @@ -68,4 +76,3 @@ if __name__ == '__main__': print("No JSON file found in '%s'." 
% (d)) else: print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0]))) - diff --git a/scripts/removeafter.py b/scripts/removeafter.py index 190124696..010ef3cef 100755 --- a/scripts/removeafter.py +++ b/scripts/removeafter.py @@ -1,11 +1,15 @@ #!/usr/bin/env python -# Copyright (C) 2012-2013 Bastian Kleineidam -"""Remove all lines after a given marker line. -""" -from __future__ import print_function +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher +"""Remove all lines after a given marker line.""" +from __future__ import absolute_import, division, print_function + import fileinput import sys + def main(args): """Remove lines after marker.""" filename = args[0] @@ -15,5 +19,6 @@ def main(args): if line.startswith(marker): break + if __name__ == '__main__': main(sys.argv[1:]) diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py index 63b872103..ca72d9ce9 100644 --- a/scripts/scriptutil.py +++ b/scripts/scriptutil.py @@ -1,13 +1,15 @@ # -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2015-2016 Tobias Gruetzmacher from __future__ import absolute_import, division, print_function +import re import json import codecs -from dosagelib.util import unescape, asciify +from dosagelib.util import unescape def contains_case_insensitive(adict, akey): @@ -42,6 +44,11 @@ def truncate_name(text): return text[:50] +def asciify(name): + """Remove non-ascii characters from string.""" + return re.sub("[^0-9a-zA-Z_]", "", name) + + def format_name(text): """Format a comic name.""" name = unescape(text) diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py index ff43ba155..9761aeba9 100755 --- a/scripts/smackjeeves.py +++ b/scripts/smackjeeves.py @@ -1,22 +1,34 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ -Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing. +Script to get a list of smackjeeves.com comics and save the info in a JSON file +for further processing. 
""" -from __future__ import print_function +from __future__ import absolute_import, division, print_function + import codecs import re import sys import os -import urlparse +try: + from urllib.parse import urljoin +except ImportError: + from urlparse import urljoin + import requests -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from dosagelib.util import getPageContent, tagre + +sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa +from dosagelib.util import get_page, tagre from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name + json_file = __file__.replace(".py", ".json") + # names of comics to exclude exclude_comics = [ "4plyKamalsHead", # does not follow standard layout @@ -98,6 +110,7 @@ exclude_comics = [ "WinterMelody", # missing images ] + # the latest URL of some comics repeats the previous URL # flag this so the bounceStart uses the correct URL repeat_comics = [ @@ -236,28 +249,32 @@ repeat_comics = [ "Zodiac", ] + # links to last valid strips url_overrides = { } + # HTML content matcher -page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") + - tagre("img", "title", r'([^"]+)')) +page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', + after="site_banner") + + tagre("img", "title", r'([^"]+)')) url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic") num_matcher = re.compile(r'50%">\s+(\d+)\s+') adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png')) + def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: - data = getPageContent(url, session) + data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in page_matcher.finditer(data): page_url = match.group(1) - page_url = urlparse.urljoin(url, page_url) + page_url = urljoin(url, page_url) name = format_name(match.group(2)) if name in exclude_comics: continue @@ -269,13 +286,14 @@ def handle_url(url, session, res): end = match.end() mo = num_matcher.search(data[end:]) if not mo: - print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr) + print("ERROR matching number:", repr(data[end:end + 300]), + file=sys.stderr) continue num = int(mo.group(1)) # search for url in extra page print("Getting", page_url) try: - data2 = getPageContent(page_url, session) + data2 = get_page(page_url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return @@ -302,8 +320,8 @@ def get_results(): result_pages = 286 print("Parsing", result_pages, "search result pages...", file=sys.stderr) for i in range(0, result_pages): - print(i+1, file=sys.stderr, end=" ") - handle_url(base % (i*12), session, res) + print(i + 1, file=sys.stderr, end=" ") + handle_url(base % (i * 12), session, res) save_result(res, json_file) diff --git a/scripts/update_plugins.sh b/scripts/update_plugins.sh index 338a895f0..4e133a59b 100755 --- a/scripts/update_plugins.sh +++ b/scripts/update_plugins.sh @@ -9,7 +9,7 @@ d=$(dirname $0) if [ $# -ge 1 ]; then list="$*" else - list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury" + list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory" fi for script in $list; do target="${d}/../dosagelib/plugins/${script}.py" diff --git 
a/scripts/webcomicfactory.py b/scripts/webcomicfactory.py index 4c0069618..66310996d 100755 --- a/scripts/webcomicfactory.py +++ b/scripts/webcomicfactory.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2015-2016 Tobias Gruetzmacher """ Script to get WebComicFactory comics and save the info in a JSON file for @@ -12,16 +15,17 @@ import os import requests from lxml import html -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa -from dosagelib.util import getPageContent +sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa +from dosagelib.util import get_page from scriptutil import (save_result, load_result, truncate_name, format_name) json_file = __file__.replace(".py", ".json") def find_first(session, url): + print("Parsing", url, file=sys.stderr) try: - data = html.document_fromstring(getPageContent(url, session)) + data = html.document_fromstring(get_page(url, session).text) data.make_links_absolute(url) except IOError as msg: print("ERROR:", msg, file=sys.stderr) @@ -38,8 +42,9 @@ def get_results(): res = {} url = 'http://www.thewebcomicfactory.com/' session = requests.Session() + print("Parsing", url, file=sys.stderr) try: - data = html.document_fromstring(getPageContent(url, session)) + data = html.document_fromstring(get_page(url, session).text) data.make_links_absolute(url) except IOError as msg: print("ERROR:", msg, file=sys.stderr)