Clean up update helper scripts.
parent 42e43fa4e6
commit 9028724a74

17 changed files with 183 additions and 85 deletions
@@ -444,11 +444,6 @@ def rfc822date(indate):
     return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))
 
 
-def asciify(name):
-    """Remove non-ascii characters from string."""
-    return re.sub("[^0-9a-zA-Z_]", "", name)
-
-
 def unquote(text):
     """Replace all percent-encoded entities in text."""
     while '%' in text:
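Note: `asciify` is not dropped from the codebase; a later hunk in this commit re-adds it verbatim to scriptutil, demoting it from library API to script-local helper. Its behavior, for reference (self-contained copy of the function shown above):

    import re

    def asciify(name):
        """Remove non-ascii characters from string."""
        return re.sub("[^0-9a-zA-Z_]", "", name)

    # Everything outside 0-9, a-z, A-Z and "_" is stripped, spaces included:
    assert asciify("Foo & Bar!") == "FooBar"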
@@ -1,18 +1,25 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get arcamax comics and save the info in a JSON file for further
+processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import re
 import sys
 import os
 
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 
 json_file = __file__.replace(".py", ".json")
 
@@ -28,15 +35,13 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         shortname = match.group(1)
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
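Note: the three-step name cleanup (unescape, the And/At substitution plus `asciify`, then `capfirst`) collapses into a single `format_name` call here and in the scripts below. Only the first line of `format_name` is visible in the scriptutil hunk of this commit (`name = unescape(text)`), so the following is a sketch of what it plausibly does, inferred from the inline code it replaces, not its confirmed body:

    from dosagelib.util import unescape
    from scriptutil import asciify, capfirst

    def format_name(text):
        """Format a comic name (sketch inferred from the replaced code)."""
        name = unescape(text)
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        return capfirst(name)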
@@ -1,19 +1,23 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get ComicFury comics and save the info in a JSON file for further
 processing.
 """
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import sys
 import os
 
 import requests
 from lxml import html
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+from dosagelib.util import get_page
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -120,7 +124,7 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
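Note: the mechanical change repeated across these scripts is `getPageContent(url, session)` → `get_page(url, session).text`: `get_page` evidently returns a response object in the style of requests rather than the decoded body, so callers take `.text` themselves (and hand it to lxml where needed). A minimal stand-in, assuming a thin wrapper over `requests.Session.get`; the real dosagelib helper may add headers, retries, and robots.txt handling:

    import requests

    def get_page(url, session):
        """Fetch url and return the response object (stand-in sketch)."""
        response = session.get(url)
        response.raise_for_status()  # surface HTTP errors as exceptions
        return response

    session = requests.Session()
    text = get_page("http://example.com/", session).text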
@@ -1,24 +1,36 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of ComicGenesis comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import re
 import sys
 import os
 
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
 
 json_file = __file__.replace(".py", ".json")
 
-# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
-url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
+# <div class="comictitle"><strong><a target="_blank"
+# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
+# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
+# Pros</a>
+url_matcher = re.compile(r'<div class="comictitle"><strong>' +
                         tagre("a", "href", r'(http://[^"]+)') +
                         r'([^<]+)</a>')
 num_matcher = re.compile(r'Number of Days: (\d+)')
 
 # names of comics to exclude
@@ -368,19 +380,18 @@ url_overrides = {
     "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }
 
 
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         url = match.group(1) + '/'
-        name = unescape(match.group(2))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(2))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
@@ -391,13 +402,13 @@ def handle_url(url, session, res):
         end = match.end()
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
             continue
         num = int(mo.group(1))
         url = url_overrides.get(name, url)
         try:
             if "/d/" not in url:
-                check_robotstxt(url+"d/", session)
+                check_robotstxt(url + "d/", session)
             else:
                 check_robotstxt(url, session)
         except IOError:
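Note: every caller wraps `check_robotstxt` in `try: ... except IOError:` and skips the comic on failure, so the helper signals a robots.txt disallow (or fetch problem) by raising IOError. A stand-in built on the standard library illustrates the contract; the real dosagelib implementation is not shown in this diff, and this sketch ignores the `session` argument:

    try:
        from urllib.parse import urljoin  # Python 3
        from urllib import robotparser
    except ImportError:
        from urlparse import urljoin      # Python 2
        import robotparser

    def check_robotstxt(url, session):
        """Raise IOError if robots.txt disallows fetching url (stand-in)."""
        rp = robotparser.RobotFileParser(urljoin(url, '/robots.txt'))
        rp.read()
        if not rp.can_fetch('*', url):
            raise IOError('robots.txt disallows %s' % url)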
@@ -1,15 +1,20 @@
 #!/usr/bin/env python
-# Copyright (C) 2013-2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Creates a CBZ file in the comic directory.
 Uses an ordered symlink directory (see order-symlinks.py) if it exists,
 else the plain files are used.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import sys
 import os
 import zipfile
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.configuration import App
 
 
@@ -21,6 +26,7 @@ ImageExts = (
     ".png",
 )
 
+
 def is_image(filename):
     """Determine if given filename is an image."""
     # note: isfile() also accepts symlinks
@@ -1,17 +1,25 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
+Script to get a list of drunkduck comics and save the info in a JSON file for
+further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import re
 import sys
 import os
 
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import tagre, get_page, unquote, unescape
+from scriptutil import (contains_case_insensitive, capfirst, save_result,
+                        load_result, truncate_name, asciify)
 
 json_file = __file__.replace(".py", ".json")
 
@@ -169,7 +177,7 @@ exclude_comics = [
 def handle_url(url, session, url_matcher, num_matcher, res):
     """Parse one search result page."""
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
@@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res):
         end = match.end(1)
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
             continue
         num = int(mo.group(1))
         res[name] = (path, num)
@@ -6,7 +6,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
     list="$*"
 else
-    list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+    list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
     echo "Executing ${script}.py"
@@ -1,15 +1,18 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2016 Tobias Gruetzmacher
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of gocomics and save the info in a JSON file for further processing.
+Script to get a list of gocomics and save the info in a JSON file for further
+processing.
 """
 from __future__ import absolute_import, division, print_function
 
 import codecs
 import sys
 import os
 
 import requests
 from lxml import html
 
@@ -1,2 +1,8 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
 from PyInstaller.utils.hooks import collect_submodules
 hiddenimports = collect_submodules('dosagelib.plugins')
@@ -1,28 +1,38 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get a list of KeenSpot comics and save the info in a
 JSON file for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import re
 import sys
 import os
 
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import get_page, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
 
 
 json_file = __file__.replace(".py", ".json")
 
 
 url_matcher = re.compile(
     tagre("td", "onmouseover", r'([^"]+)') +
     tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
     r"(?:<b>)?([^<]+)(?:</b>)?</a>"
 )
 
 
 # names of comics to exclude
 exclude_comics = [
     "BrawlintheFamily",  # non-standard navigation
@@ -47,23 +57,23 @@ exclude_comics = [
     "YouDamnKid",  # non-standard navigation
 ]
 
 
 # links to last valid strips
 url_overrides = {
 }
 
 
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in url_matcher.finditer(data):
         comicurl = match.group(2)
-        name = unescape(match.group(3))
-        name = asciify(name.replace('&', 'And').replace('@', 'At'))
-        name = capfirst(name)
+        name = format_name(match.group(3))
         if name in exclude_comics:
             continue
         if contains_case_insensitive(res, name):
@@ -72,7 +82,7 @@ def handle_url(url, session, res):
             continue
         try:
             if "/d/" not in comicurl:
-                check_robotstxt(comicurl+"d/", session)
+                check_robotstxt(comicurl + "d/", session)
             else:
                 check_robotstxt(comicurl, session)
         except IOError:
@@ -1,5 +1,11 @@
 #!/usr/bin/python
-# update languages.py from pycountry
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+'''update languages.py from pycountry'''
+from __future__ import absolute_import, division, print_function
+
 import os
 import sys
 import codecs
@@ -7,7 +13,8 @@ import codecs
 basepath = os.path.dirname(os.path.dirname(__file__))
 sys.path.append(basepath)
 
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scraperclasses  # noqa
 
+
 def main():
     """Update language information in dosagelib/languages.py."""
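Note on the `# noqa` markers added throughout this commit: these scripts must extend `sys.path` before they can import `dosagelib` from the source tree, which makes the subsequent imports violate the flake8 check for module-level imports at the top of the file (E402, presumably the code being silenced). In miniature (illustrative snippet, not part of the diff):

    import os
    import sys

    sys.path.append(os.path.dirname(os.path.dirname(__file__)))  # noqa

    from dosagelib.scraper import get_scraperclasses  # noqa: E402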
@@ -29,6 +36,7 @@ def get_used_languages():
         lang[l] = scraperclass.language()
     return lang
 
+
 def write_languages(f, l):
     """Write language information."""
     f.write("Languages = {%s" % os.linesep)
@@ -1,25 +1,32 @@
 #!/usr/bin/env python
-# Copyright (C) 2013 Tobias Gruetzmacher
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 This script takes the JSON file created by 'dosage -o json' and uses the
 metadata to build a symlink farm in the deduced order of the comic. It created
 those in a subdirectory called 'inorder'.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import sys
 import os
 import codecs
 import json
 
 
 def jsonFn(d):
     """Get JSON filename."""
     return os.path.join(d, 'dosage.json')
 
 
 def loadJson(d):
     """Return JSON data."""
     with codecs.open(jsonFn(d), 'r', 'utf-8') as f:
         return json.load(f)
 
 
 def prepare_output(d):
     """Clean pre-existing links in output directory."""
     outDir = os.path.join(d, 'inorder')
@@ -31,6 +38,7 @@ def prepare_output(d):
         os.remove(f)
     return outDir
 
+
 def create_symlinks(d):
     """Create new symbolic links in output directory."""
     data = loadJson(d)
@@ -68,4 +76,3 @@ if __name__ == '__main__':
         print("No JSON file found in '%s'." % (d))
     else:
         print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))
-
@@ -1,11 +1,15 @@
 #!/usr/bin/env python
-# Copyright (C) 2012-2013 Bastian Kleineidam
-"""Remove all lines after a given marker line.
-"""
-from __future__ import print_function
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+"""Remove all lines after a given marker line."""
+from __future__ import absolute_import, division, print_function
 
 import fileinput
 import sys
 
 
 def main(args):
     """Remove lines after marker."""
     filename = args[0]
@@ -15,5 +19,6 @@ def main(args):
         if line.startswith(marker):
             break
 
+
 if __name__ == '__main__':
     main(sys.argv[1:])
@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 
 from __future__ import absolute_import, division, print_function
 
+import re
 import json
 import codecs
 
-from dosagelib.util import unescape, asciify
+from dosagelib.util import unescape
 
 
 def contains_case_insensitive(adict, akey):
@@ -42,6 +44,11 @@ def truncate_name(text):
     return text[:50]
 
 
+def asciify(name):
+    """Remove non-ascii characters from string."""
+    return re.sub("[^0-9a-zA-Z_]", "", name)
+
+
 def format_name(text):
     """Format a comic name."""
     name = unescape(text)
@@ -1,22 +1,34 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
-Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
+Script to get a list of smackjeeves.com comics and save the info in a JSON file
+for further processing.
 """
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import codecs
 import re
 import sys
 import os
-import urlparse
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
 
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, tagre
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page, tagre
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 
 
 json_file = __file__.replace(".py", ".json")
 
 
 # names of comics to exclude
 exclude_comics = [
     "4plyKamalsHead",  # does not follow standard layout
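Note: the `import urlparse` → try/except change makes the script importable under both Python 2 and 3; the module moved to `urllib.parse` in Python 3, and only `urljoin` is needed. The call site further down resolves the relative profile link against the search page, roughly like this (illustrative URLs):

    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin      # Python 2

    # A relative href from a result page resolves against that page's URL:
    print(urljoin("http://www.smackjeeves.com/search.php?page=2",
                  "comicprofile.php?id=42"))
    # -> http://www.smackjeeves.com/comicprofile.php?id=42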
@@ -98,6 +110,7 @@ exclude_comics = [
     "WinterMelody",  # missing images
 ]
 
+
 # the latest URL of some comics repeats the previous URL
 # flag this so the bounceStart uses the correct URL
 repeat_comics = [
@@ -236,28 +249,32 @@ repeat_comics = [
     "Zodiac",
 ]
 
 
 # links to last valid strips
 url_overrides = {
 }
 
 
 # HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
+page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
+                                after="site_banner") +
                           tagre("img", "title", r'([^"]+)'))
 url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
 num_matcher = re.compile(r'50%">\s+(\d+)\s+')
 adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
 
 
 def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = get_page(url, session).text
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
     for match in page_matcher.finditer(data):
         page_url = match.group(1)
-        page_url = urlparse.urljoin(url, page_url)
+        page_url = urljoin(url, page_url)
         name = format_name(match.group(2))
         if name in exclude_comics:
             continue
@@ -269,13 +286,14 @@ def handle_url(url, session, res):
         end = match.end()
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
+            print("ERROR matching number:", repr(data[end:end + 300]),
+                  file=sys.stderr)
             continue
         num = int(mo.group(1))
         # search for url in extra page
         print("Getting", page_url)
         try:
-            data2 = getPageContent(page_url, session)
+            data2 = get_page(page_url, session).text
         except IOError as msg:
             print("ERROR:", msg, file=sys.stderr)
             return
@@ -302,8 +320,8 @@ def get_results():
     result_pages = 286
     print("Parsing", result_pages, "search result pages...", file=sys.stderr)
     for i in range(0, result_pages):
-        print(i+1, file=sys.stderr, end=" ")
-        handle_url(base % (i*12), session, res)
+        print(i + 1, file=sys.stderr, end=" ")
+        handle_url(base % (i * 12), session, res)
     save_result(res, json_file)
 
 
@@ -9,7 +9,7 @@ d=$(dirname $0)
 if [ $# -ge 1 ]; then
     list="$*"
 else
-    list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
+    list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
 fi
 for script in $list; do
     target="${d}/../dosagelib/plugins/${script}.py"
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 """
 Script to get WebComicFactory comics and save the info in a JSON file for
@@ -12,16 +15,17 @@ import os
 import requests
 from lxml import html
 
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import getPageContent
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
+from dosagelib.util import get_page
 from scriptutil import (save_result, load_result, truncate_name, format_name)
 
 json_file = __file__.replace(".py", ".json")
 
 
 def find_first(session, url):
+    print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
@@ -38,8 +42,9 @@ def get_results():
     res = {}
     url = 'http://www.thewebcomicfactory.com/'
     session = requests.Session()
+    print("Parsing", url, file=sys.stderr)
     try:
-        data = html.document_fromstring(getPageContent(url, session))
+        data = html.document_fromstring(get_page(url, session).text)
         data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)