Clean up update helper scripts.

This commit is contained in:
Tobias Gruetzmacher 2016-04-13 00:52:16 +02:00
parent 42e43fa4e6
commit 9028724a74
17 changed files with 183 additions and 85 deletions

View file

@ -444,11 +444,6 @@ def rfc822date(indate):
return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))
def asciify(name):
    """Strip every character that is not an ASCII letter, digit or underscore.

    Note that this removes more than just non-ASCII characters: ASCII
    whitespace and punctuation are dropped as well, so the result only
    contains characters from ``[0-9a-zA-Z_]``.
    """
    return re.sub("[^0-9a-zA-Z_]", "", name)
def unquote(text):
"""Replace all percent-encoded entities in text."""
while '%' in text:

View file

@ -1,18 +1,25 @@
#!/usr/bin/env python
# Copyright (C) 2013-2014 Bastian Kleineidam
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get arcamax comics and save the info in a JSON file for further processing.
Script to get arcamax comics and save the info in a JSON file for further
processing.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
json_file = __file__.replace(".py", ".json")
@ -20,7 +27,7 @@ url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
# names of comics to exclude
exclude_comics = [
"HagartheHorrible", # better source available
"HagartheHorrible", # better source available
]
@ -28,15 +35,13 @@ def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = getPageContent(url, session)
data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
shortname = match.group(1)
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
name = capfirst(name)
name = format_name(match.group(2))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
@ -86,7 +91,7 @@ def print_results(args):
else:
prefix = u''
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
str(shortname)))
str(shortname)))
if __name__ == '__main__':

View file

@ -1,19 +1,23 @@
#!/usr/bin/env python
# Copyright (C) 2013-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
"""
from __future__ import print_function, absolute_import
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import getPageContent
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
@ -120,7 +124,7 @@ def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(getPageContent(url, session))
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)

View file

@ -1,24 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of ComicGenesis comics and save the info in a
JSON file for further processing.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
# <div class="comictitle"><strong><a target="_blank"
# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
# Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' +
tagre("a", "href", r'(http://[^"]+)') +
r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')
# names of comics to exclude
@ -368,19 +380,18 @@ url_overrides = {
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
}
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = getPageContent(url, session)
data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
url = match.group(1) + '/'
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
name = capfirst(name)
name = format_name(match.group(2))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
@ -391,13 +402,13 @@ def handle_url(url, session, res):
end = match.end()
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
continue
num = int(mo.group(1))
url = url_overrides.get(name, url)
try:
if "/d/" not in url:
check_robotstxt(url+"d/", session)
check_robotstxt(url + "d/", session)
else:
check_robotstxt(url, session)
except IOError:

View file

@ -1,15 +1,20 @@
#!/usr/bin/env python
# Copyright (C) 2013-2014 Bastian Kleineidam
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Creates a CBZ file in the comic directory.
Uses an ordered symlink directory (see order-symlinks.py) if it exists,
else the plain files are used.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import sys
import os
import zipfile
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.configuration import App
@ -21,6 +26,7 @@ ImageExts = (
".png",
)
def is_image(filename):
"""Determine if given filename is an image."""
# note: isfile() also accepts symlinks

View file

@ -1,17 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
Script to get a list of drunkduck comics and save the info in a JSON file for
further processing.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import tagre, get_page, unquote, unescape
from scriptutil import (contains_case_insensitive, capfirst, save_result,
load_result, truncate_name, asciify)
json_file = __file__.replace(".py", ".json")
@ -169,7 +177,7 @@ exclude_comics = [
def handle_url(url, session, url_matcher, num_matcher, res):
"""Parse one search result page."""
try:
data = getPageContent(url, session)
data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res):
end = match.end(1)
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
continue
num = int(mo.group(1))
res[name] = (path, num)

View file

@ -6,7 +6,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
fi
for script in $list; do
echo "Executing ${script}.py"

View file

@ -1,15 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of gocomics and save the info in a JSON file for further processing.
Script to get a list of gocomics and save the info in a JSON file for further
processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html

View file

@ -1,2 +1,8 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from PyInstaller.utils.hooks import collect_submodules
hiddenimports = collect_submodules('dosagelib.plugins')

View file

@ -1,28 +1,38 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of KeenSpot comics and save the info in a
JSON file for further processing.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(
tagre("td", "onmouseover", r'([^"]+)') +
tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
r"(?:<b>)?([^<]+)(?:</b>)?</a>"
)
# names of comics to exclude
exclude_comics = [
"BrawlintheFamily", # non-standard navigation
@ -47,23 +57,23 @@ exclude_comics = [
"YouDamnKid", # non-standard navigation
]
# links to last valid strips
url_overrides = {
}
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = getPageContent(url, session)
data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
comicurl = match.group(2)
name = unescape(match.group(3))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
name = capfirst(name)
name = format_name(match.group(3))
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
@ -72,7 +82,7 @@ def handle_url(url, session, res):
continue
try:
if "/d/" not in comicurl:
check_robotstxt(comicurl+"d/", session)
check_robotstxt(comicurl + "d/", session)
else:
check_robotstxt(comicurl, session)
except IOError:

View file

@ -1,5 +1,11 @@
#!/usr/bin/python
# update languages.py from pycountry
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
'''update languages.py from pycountry'''
from __future__ import absolute_import, division, print_function
import os
import sys
import codecs
@ -7,7 +13,8 @@ import codecs
basepath = os.path.dirname(os.path.dirname(__file__))
sys.path.append(basepath)
from dosagelib.scraper import get_scraperclasses
from dosagelib.scraper import get_scraperclasses # noqa
def main():
"""Update language information in dosagelib/languages.py."""
@ -29,6 +36,7 @@ def get_used_languages():
lang[l] = scraperclass.language()
return lang
def write_languages(f, l):
"""Write language information."""
f.write("Languages = {%s" % os.linesep)

View file

@ -1,25 +1,32 @@
#!/usr/bin/env python
# Copyright (C) 2013 Tobias Gruetzmacher
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
This script takes the JSON file created by 'dosage -o json' and uses the
metadata to build a symlink farm in the deduced order of the comic. It creates
those in a subdirectory called 'inorder'.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import sys
import os
import codecs
import json
def jsonFn(d):
    """Return the path of the 'dosage.json' file inside directory d."""
    filename = 'dosage.json'
    return os.path.join(d, filename)
def loadJson(d):
    """Read the UTF-8 encoded JSON file located via jsonFn(d) and return the parsed data."""
    path = jsonFn(d)
    with codecs.open(path, 'r', 'utf-8') as handle:
        return json.load(handle)
def prepare_output(d):
"""Clean pre-existing links in output directory."""
outDir = os.path.join(d, 'inorder')
@ -31,6 +38,7 @@ def prepare_output(d):
os.remove(f)
return outDir
def create_symlinks(d):
"""Create new symbolic links in output directory."""
data = loadJson(d)
@ -68,4 +76,3 @@ if __name__ == '__main__':
print("No JSON file found in '%s'." % (d))
else:
print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))

View file

@ -1,11 +1,15 @@
#!/usr/bin/env python
# Copyright (C) 2012-2013 Bastian Kleineidam
"""Remove all lines after a given marker line.
"""
from __future__ import print_function
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""Remove all lines after a given marker line."""
from __future__ import absolute_import, division, print_function
import fileinput
import sys
def main(args):
"""Remove lines after marker."""
filename = args[0]
@ -15,5 +19,6 @@ def main(args):
if line.startswith(marker):
break
if __name__ == '__main__':
main(sys.argv[1:])

View file

@ -1,13 +1,15 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
import re
import json
import codecs
from dosagelib.util import unescape, asciify
from dosagelib.util import unescape
def contains_case_insensitive(adict, akey):
@ -42,6 +44,11 @@ def truncate_name(text):
return text[:50]
def asciify(name):
    """Strip every character that is not an ASCII letter, digit or underscore.

    Note that this removes more than just non-ASCII characters: ASCII
    whitespace and punctuation are dropped as well, so the result only
    contains characters from ``[0-9a-zA-Z_]``.
    """
    return re.sub("[^0-9a-zA-Z_]", "", name)
def format_name(text):
"""Format a comic name."""
name = unescape(text)

View file

@ -1,22 +1,34 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
"""
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import urlparse
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, tagre
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
"4plyKamalsHead", # does not follow standard layout
@ -98,6 +110,7 @@ exclude_comics = [
"WinterMelody", # missing images
]
# the latest URL of some comics repeats the previous URL
# flag this so the bounceStart uses the correct URL
repeat_comics = [
@ -236,28 +249,32 @@ repeat_comics = [
"Zodiac",
]
# links to last valid strips
url_overrides = {
}
# HTML content matcher
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
tagre("img", "title", r'([^"]+)'))
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
after="site_banner") +
tagre("img", "title", r'([^"]+)'))
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = getPageContent(url, session)
data = get_page(url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in page_matcher.finditer(data):
page_url = match.group(1)
page_url = urlparse.urljoin(url, page_url)
page_url = urljoin(url, page_url)
name = format_name(match.group(2))
if name in exclude_comics:
continue
@ -269,13 +286,14 @@ def handle_url(url, session, res):
end = match.end()
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
print("ERROR matching number:", repr(data[end:end + 300]),
file=sys.stderr)
continue
num = int(mo.group(1))
# search for url in extra page
print("Getting", page_url)
try:
data2 = getPageContent(page_url, session)
data2 = get_page(page_url, session).text
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
@ -302,8 +320,8 @@ def get_results():
result_pages = 286
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
for i in range(0, result_pages):
print(i+1, file=sys.stderr, end=" ")
handle_url(base % (i*12), session, res)
print(i + 1, file=sys.stderr, end=" ")
handle_url(base % (i * 12), session, res)
save_result(res, json_file)

View file

@ -9,7 +9,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
fi
for script in $list; do
target="${d}/../dosagelib/plugins/${script}.py"

View file

@ -1,4 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get WebComicFactory comics and save the info in a JSON file for
@ -12,16 +15,17 @@ import os
import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import getPageContent
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
from dosagelib.util import get_page
from scriptutil import (save_result, load_result, truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
def find_first(session, url):
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(getPageContent(url, session))
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
@ -38,8 +42,9 @@ def get_results():
res = {}
url = 'http://www.thewebcomicfactory.com/'
session = requests.Session()
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(getPageContent(url, session))
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)