Clean up update helper scripts.

Tobias Gruetzmacher 2016-04-13 00:52:16 +02:00
parent 42e43fa4e6
commit 9028724a74
17 changed files with 183 additions and 85 deletions

View file

@@ -444,11 +444,6 @@ def rfc822date(indate):
     return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))
-def asciify(name):
-    """Remove non-ascii characters from string."""
-    return re.sub("[^0-9a-zA-Z_]", "", name)
 def unquote(text):
     """Replace all percent-encoded entities in text."""
     while '%' in text:

View file

@@ -1,18 +1,25 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get arcamax comics and save the info in a JSON file for further
+processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import codecs
 import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 json_file = __file__.replace(".py", ".json")
@@ -28,15 +35,13 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         shortname = match.group(1)
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
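For context: every call site in this commit switches from getPageContent(url, session), which returned the decoded page body, to get_page(url, session).text, which suggests the new helper returns a requests-style response object. A minimal sketch of the updated pattern, assuming that behaviour; fetch_text and the example URL are illustrative, not part of the commit:

import sys

import requests

from dosagelib.util import get_page  # helper introduced by this commit


def fetch_text(url, session):
    """Hypothetical wrapper mirroring the updated call sites."""
    try:
        # get_page() appears to return a requests-style response,
        # so .text replaces the old getPageContent() return value.
        return get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return None


if __name__ == "__main__":
    print(fetch_text("https://www.arcamax.com/comics", requests.Session()))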

View file

@@ -1,19 +1,23 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get ComicFury comics and save the info in a JSON file for further
 processing.
 """
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, division, print_function
 import codecs
 import sys
 import os
 import requests
 from lxml import html
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -120,7 +124,7 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)

View file

@@ -1,24 +1,36 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of ComicGenesis comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import codecs
 import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
 json_file = __file__.replace(".py", ".json")
-# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
-url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
+# <div class="comictitle"><strong><a target="_blank"
+# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
+# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
+# Pros</a>
+url_matcher = re.compile(r'<div class="comictitle"><strong>' +
+                         tagre("a", "href", r'(http://[^"]+)') +
+                         r'([^<]+)</a>')
 num_matcher = re.compile(r'Number of Days: (\d+)')
 # names of comics to exclude
@@ -368,19 +380,18 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         url = match.group(1) + '/'
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
@@ -391,13 +402,13 @@ def handle_url(url, session, res):
         end = match.end()
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
             continue
         num = int(mo.group(1))
         url = url_overrides.get(name, url)
         try:
             if "/d/" not in url:
-                check_robotstxt(url+"d/", session)
+                check_robotstxt(url + "d/", session)
             else:
                 check_robotstxt(url, session)
         except IOError:

View file

@@ -1,15 +1,20 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Creates a CBZ file in the comic directory.
 Uses an ordered symlink directory (see order-symlinks.py) if it exists,
 else the plain files are used.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import sys
 import os
 import zipfile
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.configuration import App
@@ -21,6 +26,7 @@ ImageExts = (
     ".png",
 )
 def is_image(filename):
     """Determine if given filename is an image."""
     # note: isfile() also accepts symlinks

View file

@@ -1,17 +1,25 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
+Script to get a list of drunkduck comics and save the info in a JSON file for
+further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import codecs
 import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import tagre, get_page, unquote, unescape
+from scriptutil import (contains_case_insensitive, capfirst, save_result,
+                        load_result, truncate_name, asciify)
 json_file = __file__.replace(".py", ".json")
@@ -169,7 +177,7 @@ exclude_comics = [
 def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res):
         end = match.end(1)
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
             continue
         num = int(mo.group(1))
         res[name] = (path, num)

View file

@@ -6,7 +6,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
     list="$*"
 else
-    list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+    list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
     echo "Executing ${script}.py"

View file

@@ -1,15 +1,18 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of gocomics and save the info in a JSON file for further processing.
+Script to get a list of gocomics and save the info in a JSON file for further
+processing.
 """
 from __future__ import absolute_import, division, print_function
 import codecs
 import sys
 import os
 import requests
 from lxml import html

View file

@@ -1,2 +1,8 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+from __future__ import absolute_import, division, print_function
 from PyInstaller.utils.hooks import collect_submodules
 hiddenimports = collect_submodules('dosagelib.plugins')

View file

@@ -1,28 +1,38 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of KeenSpot comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import codecs
 import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
 json_file = __file__.replace(".py", ".json")
 url_matcher = re.compile(
     tagre("td", "onmouseover", r'([^"]+)') +
     tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
     r"(?:<b>)?([^<]+)(?:</b>)?</a>"
 )
 # names of comics to exclude
 exclude_comics = [
     "BrawlintheFamily",  # non-standard navigation
@@ -47,23 +57,23 @@ exclude_comics = [
     "YouDamnKid",  # non-standard navigation
 ]
 # links to last valid strips
 url_overrides = {
 }
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         comicurl = match.group(2)
-        name = unescape(match.group(3))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(3))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
@@ -72,7 +82,7 @@ def handle_url(url, session, res):
             continue
         try:
             if "/d/" not in comicurl:
-                check_robotstxt(comicurl+"d/", session)
+                check_robotstxt(comicurl + "d/", session)
             else:
                 check_robotstxt(comicurl, session)
         except IOError:

View file

@@ -1,5 +1,11 @@
 #!/usr/bin/python
-# update languages.py from pycountry
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+'''update languages.py from pycountry'''
+from __future__ import absolute_import, division, print_function
 import os
 import sys
 import codecs
@@ -7,7 +13,8 @@ import codecs
 basepath = os.path.dirname(os.path.dirname(__file__))
 sys.path.append(basepath)
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scraperclasses  # noqa
 def main():
     """Update language information in dosagelib/languages.py."""
@@ -29,6 +36,7 @@ def get_used_languages():
         lang[l] = scraperclass.language()
     return lang
 def write_languages(f, l):
     """Write language information."""
     f.write("Languages = {%s" % os.linesep)

View file

@@ -1,25 +1,32 @@
 #!/usr/bin/env python
-# Copyright (C) 2013 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 This script takes the JSON file created by 'dosage -o json' and uses the
 metadata to build a symlink farm in the deduced order of the comic. It created
 those in a subdirectory called 'inorder'.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import sys
 import os
 import codecs
 import json
 def jsonFn(d):
     """Get JSON filename."""
     return os.path.join(d, 'dosage.json')
 def loadJson(d):
     """Return JSON data."""
     with codecs.open(jsonFn(d), 'r', 'utf-8') as f:
         return json.load(f)
 def prepare_output(d):
     """Clean pre-existing links in output directory."""
     outDir = os.path.join(d, 'inorder')
@@ -31,6 +38,7 @@ def prepare_output(d):
             os.remove(f)
     return outDir
 def create_symlinks(d):
     """Create new symbolic links in output directory."""
     data = loadJson(d)
@@ -68,4 +76,3 @@ if __name__ == '__main__':
                 print("No JSON file found in '%s'." % (d))
     else:
         print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))

View file

@@ -1,11 +1,15 @@
 #!/usr/bin/env python
-# Copyright (C) 2012-2013 Bastian Kleineidam
-"""Remove all lines after a given marker line.
-"""
-from __future__ import print_function
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""Remove all lines after a given marker line."""
+from __future__ import absolute_import, division, print_function
 import fileinput
 import sys
 def main(args):
     """Remove lines after marker."""
     filename = args[0]
@@ -15,5 +19,6 @@ def main(args):
         if line.startswith(marker):
             break
 if __name__ == '__main__':
     main(sys.argv[1:])

View file

@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 from __future__ import absolute_import, division, print_function
+import re
 import json
 import codecs
-from dosagelib.util import unescape, asciify
+from dosagelib.util import unescape
 def contains_case_insensitive(adict, akey):
@@ -42,6 +44,11 @@ def truncate_name(text):
     return text[:50]
+def asciify(name):
+    """Remove non-ascii characters from string."""
+    return re.sub("[^0-9a-zA-Z_]", "", name)
 def format_name(text):
     """Format a comic name."""
     name = unescape(text)
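The hunk only shows the first line of format_name, but the call sites removed elsewhere in this commit (unescape, the '&'/'@' replacements, asciify, capfirst) suggest the helper folds that whole chain into one place. A rough sketch of what format_name presumably looks like; the capfirst helper shown here is an assumption, reconstructed only from how the old scripts used it:

import re

from dosagelib.util import unescape


def asciify(name):
    """Remove non-ascii characters from string."""
    return re.sub("[^0-9a-zA-Z_]", "", name)


def capfirst(text):
    """Assumed helper: upper-case only the first character."""
    return text[:1].upper() + text[1:]


def format_name(text):
    """Format a comic name (reconstructed from the removed call sites)."""
    name = unescape(text)
    name = asciify(name.replace('&', 'And').replace('@', 'At'))
    name = capfirst(name)
    return name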

View file

@@ -1,22 +1,34 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
+Script to get a list of smackjeeves.com comics and save the info in a JSON file
+for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 import codecs
 import re
 import sys
 import os
-import urlparse
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, tagre
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page, tagre
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 json_file = __file__.replace(".py", ".json")
 # names of comics to exclude
 exclude_comics = [
     "4plyKamalsHead",  # does not follow standard layout
@@ -98,6 +110,7 @@ exclude_comics = [
     "WinterMelody",  # missing images
 ]
 # the latest URL of some comics repeats the previous URL
 # flag this so the bounceStart uses the correct URL
 repeat_comics = [
@@ -236,28 +249,32 @@ repeat_comics = [
     "Zodiac",
 ]
 # links to last valid strips
 url_overrides = {
 }
 # HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
+page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
+                                after="site_banner") +
                           tagre("img", "title", r'([^"]+)'))
 url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
 num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in page_matcher.finditer(data):
         page_url = match.group(1)
-        page_url = urlparse.urljoin(url, page_url)
+        page_url = urljoin(url, page_url)
         name = format_name(match.group(2))
         if name in exclude_comics:
             continue
@@ -269,13 +286,14 @@ def handle_url(url, session, res):
         end = match.end()
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR matching number:", repr(data[end:end + 300]),
+                  file=sys.stderr)
             continue
         num = int(mo.group(1))
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2 = getPageContent(page_url, session)
+            data2 = get_page(page_url, session).text
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -302,8 +320,8 @@ def get_results():
     result_pages = 286
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
-        print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), session, res)
+        print(i + 1, file=sys.stderr, end=" ")
+        handle_url(base % (i * 12), session, res)
     save_result(res, json_file)

View file

@@ -9,7 +9,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
     list="$*"
 else
-    list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+    list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
     target="${d}/../dosagelib/plugins/${script}.py"

View file

@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get WebComicFactory comics and save the info in a JSON file for
@@ -12,16 +15,17 @@ import os
 import requests
 from lxml import html
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page
 from scriptutil import (save_result, load_result, truncate_name, format_name)
 json_file = __file__.replace(".py", ".json")
 def find_first(session, url):
+    print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
@@ -38,8 +42,9 @@ def get_results():
     res = {}
     url = 'http://www.thewebcomicfactory.com/'
     session = requests.Session()
+    print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)