Clean up update helper scripts.
This commit is contained in:
parent
42e43fa4e6
commit
9028724a74
17 changed files with 183 additions and 85 deletions
|
@ -444,11 +444,6 @@ def rfc822date(indate):
|
|||
return time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(indate))
|
||||
|
||||
|
||||
def asciify(name):
|
||||
"""Remove non-ascii characters from string."""
|
||||
return re.sub("[^0-9a-zA-Z_]", "", name)
|
||||
|
||||
|
||||
def unquote(text):
|
||||
"""Replace all percent-encoded entities in text."""
|
||||
while '%' in text:
|
||||
|
|
|
@ -1,18 +1,25 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get arcamax comics and save the info in a JSON file for further processing.
|
||||
Script to get arcamax comics and save the info in a JSON file for further
|
||||
processing.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import get_page
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
@ -20,7 +27,7 @@ url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
|
|||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"HagartheHorrible", # better source available
|
||||
"HagartheHorrible", # better source available
|
||||
]
|
||||
|
||||
|
||||
|
@ -28,15 +35,13 @@ def handle_url(url, session, res):
|
|||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = getPageContent(url, session)
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in url_matcher.finditer(data):
|
||||
shortname = match.group(1)
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
name = capfirst(name)
|
||||
name = format_name(match.group(2))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
|
@ -86,7 +91,7 @@ def print_results(args):
|
|||
else:
|
||||
prefix = u''
|
||||
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
|
||||
str(shortname)))
|
||||
str(shortname)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,19 +1,23 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2016 Tobias Gruetzmacher
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get ComicFury comics and save the info in a JSON file for further
|
||||
processing.
|
||||
"""
|
||||
from __future__ import print_function, absolute_import
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
from lxml import html
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import getPageContent
|
||||
from dosagelib.util import get_page
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||
truncate_name, format_name)
|
||||
|
@ -120,7 +124,7 @@ def handle_url(url, session, res):
|
|||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = html.document_fromstring(getPageContent(url, session))
|
||||
data = html.document_fromstring(get_page(url, session).text)
|
||||
data.make_links_absolute(url)
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
|
|
|
@ -1,24 +1,36 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get a list of ComicGenesis comics and save the info in a
|
||||
JSON file for further processing.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import get_page, tagre, check_robotstxt
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
||||
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||
truncate_name, format_name)
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
|
||||
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
|
||||
# <div class="comictitle"><strong><a target="_blank"
|
||||
# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
|
||||
# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
|
||||
# Pros</a>
|
||||
url_matcher = re.compile(r'<div class="comictitle"><strong>' +
|
||||
tagre("a", "href", r'(http://[^"]+)') +
|
||||
r'([^<]+)</a>')
|
||||
num_matcher = re.compile(r'Number of Days: (\d+)')
|
||||
|
||||
# names of comics to exclude
|
||||
|
@ -368,19 +380,18 @@ url_overrides = {
|
|||
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
|
||||
}
|
||||
|
||||
|
||||
def handle_url(url, session, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = getPageContent(url, session)
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in url_matcher.finditer(data):
|
||||
url = match.group(1) + '/'
|
||||
name = unescape(match.group(2))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
name = capfirst(name)
|
||||
name = format_name(match.group(2))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
|
@ -391,13 +402,13 @@ def handle_url(url, session, res):
|
|||
end = match.end()
|
||||
mo = num_matcher.search(data[end:])
|
||||
if not mo:
|
||||
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
|
||||
print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
|
||||
continue
|
||||
num = int(mo.group(1))
|
||||
url = url_overrides.get(name, url)
|
||||
try:
|
||||
if "/d/" not in url:
|
||||
check_robotstxt(url+"d/", session)
|
||||
check_robotstxt(url + "d/", session)
|
||||
else:
|
||||
check_robotstxt(url, session)
|
||||
except IOError:
|
||||
|
|
|
@ -1,15 +1,20 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Creates a CBZ file in the comic directory.
|
||||
Uses an ordered symlink directory (see order-symlinks.py) if it exists,
|
||||
else the plain files are used.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import zipfile
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.configuration import App
|
||||
|
||||
|
||||
|
@ -21,6 +26,7 @@ ImageExts = (
|
|||
".png",
|
||||
)
|
||||
|
||||
|
||||
def is_image(filename):
|
||||
"""Determine if given filename is an image."""
|
||||
# note: isfile() also accepts symlinks
|
||||
|
|
|
@ -1,17 +1,25 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get a list of drunkduck comics and save the info in a JSON file for further processing.
|
||||
Script to get a list of drunkduck comics and save the info in a JSON file for
|
||||
further processing.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import tagre, getPageContent, unquote, unescape, asciify
|
||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import tagre, get_page, unquote, unescape
|
||||
from scriptutil import (contains_case_insensitive, capfirst, save_result,
|
||||
load_result, truncate_name, asciify)
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
@ -169,7 +177,7 @@ exclude_comics = [
|
|||
def handle_url(url, session, url_matcher, num_matcher, res):
|
||||
"""Parse one search result page."""
|
||||
try:
|
||||
data = getPageContent(url, session)
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
|
@ -187,7 +195,7 @@ def handle_url(url, session, url_matcher, num_matcher, res):
|
|||
end = match.end(1)
|
||||
mo = num_matcher.search(data[end:])
|
||||
if not mo:
|
||||
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
|
||||
print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
|
||||
continue
|
||||
num = int(mo.group(1))
|
||||
res[name] = (path, num)
|
||||
|
|
|
@ -6,7 +6,7 @@ d=$(dirname $0)
|
|||
if [ $# -ge 1 ]; then
|
||||
list="$*"
|
||||
else
|
||||
list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
|
||||
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
|
||||
fi
|
||||
for script in $list; do
|
||||
echo "Executing ${script}.py"
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2016 Tobias Gruetzmacher
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get a list of gocomics and save the info in a JSON file for further processing.
|
||||
Script to get a list of gocomics and save the info in a JSON file for further
|
||||
processing.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
from lxml import html
|
||||
|
||||
|
|
|
@ -1,2 +1,8 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
from PyInstaller.utils.hooks import collect_submodules
|
||||
hiddenimports = collect_submodules('dosagelib.plugins')
|
||||
|
|
|
@ -1,28 +1,38 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get a list of KeenSpot comics and save the info in a
|
||||
JSON file for further processing.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import get_page, tagre, check_robotstxt
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
||||
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||
truncate_name, format_name)
|
||||
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
||||
url_matcher = re.compile(
|
||||
tagre("td", "onmouseover", r'([^"]+)') +
|
||||
tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
|
||||
r"(?:<b>)?([^<]+)(?:</b>)?</a>"
|
||||
)
|
||||
|
||||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"BrawlintheFamily", # non-standard navigation
|
||||
|
@ -47,23 +57,23 @@ exclude_comics = [
|
|||
"YouDamnKid", # non-standard navigation
|
||||
]
|
||||
|
||||
|
||||
# links to last valid strips
|
||||
url_overrides = {
|
||||
}
|
||||
|
||||
|
||||
def handle_url(url, session, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = getPageContent(url, session)
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in url_matcher.finditer(data):
|
||||
comicurl = match.group(2)
|
||||
name = unescape(match.group(3))
|
||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
||||
name = capfirst(name)
|
||||
name = format_name(match.group(3))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
|
@ -72,7 +82,7 @@ def handle_url(url, session, res):
|
|||
continue
|
||||
try:
|
||||
if "/d/" not in comicurl:
|
||||
check_robotstxt(comicurl+"d/", session)
|
||||
check_robotstxt(comicurl + "d/", session)
|
||||
else:
|
||||
check_robotstxt(comicurl, session)
|
||||
except IOError:
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
#!/usr/bin/python
|
||||
# update languages.py from pycountry
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
'''Update languages.py from pycountry.'''
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
|
@ -7,7 +13,8 @@ import codecs
|
|||
basepath = os.path.dirname(os.path.dirname(__file__))
|
||||
sys.path.append(basepath)
|
||||
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from dosagelib.scraper import get_scraperclasses # noqa
|
||||
|
||||
|
||||
def main():
|
||||
"""Update language information in dosagelib/languages.py."""
|
||||
|
@ -29,6 +36,7 @@ def get_used_languages():
|
|||
lang[l] = scraperclass.language()
|
||||
return lang
|
||||
|
||||
|
||||
def write_languages(f, l):
|
||||
"""Write language information."""
|
||||
f.write("Languages = {%s" % os.linesep)
|
||||
|
|
|
@ -1,25 +1,32 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2013 Tobias Gruetzmacher
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
This script takes the JSON file created by 'dosage -o json' and uses the
|
||||
metadata to build a symlink farm in the deduced order of the comic. It creates
|
||||
those in a subdirectory called 'inorder'.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import codecs
|
||||
import json
|
||||
|
||||
|
||||
def jsonFn(d):
|
||||
"""Get JSON filename."""
|
||||
return os.path.join(d, 'dosage.json')
|
||||
|
||||
|
||||
def loadJson(d):
|
||||
"""Return JSON data."""
|
||||
with codecs.open(jsonFn(d), 'r', 'utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def prepare_output(d):
|
||||
"""Clean pre-existing links in output directory."""
|
||||
outDir = os.path.join(d, 'inorder')
|
||||
|
@ -31,6 +38,7 @@ def prepare_output(d):
|
|||
os.remove(f)
|
||||
return outDir
|
||||
|
||||
|
||||
def create_symlinks(d):
|
||||
"""Create new symbolic links in output directory."""
|
||||
data = loadJson(d)
|
||||
|
@ -68,4 +76,3 @@ if __name__ == '__main__':
|
|||
print("No JSON file found in '%s'." % (d))
|
||||
else:
|
||||
print("Usage: %s comic-dirs" % (os.path.basename(sys.argv[0])))
|
||||
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
"""Remove all lines after a given marker line.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""Remove all lines after a given marker line."""
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import fileinput
|
||||
import sys
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Remove lines after marker."""
|
||||
filename = args[0]
|
||||
|
@ -15,5 +19,6 @@ def main(args):
|
|||
if line.startswith(marker):
|
||||
break
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:])
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import re
|
||||
import json
|
||||
import codecs
|
||||
|
||||
from dosagelib.util import unescape, asciify
|
||||
from dosagelib.util import unescape
|
||||
|
||||
|
||||
def contains_case_insensitive(adict, akey):
|
||||
|
@ -42,6 +44,11 @@ def truncate_name(text):
|
|||
return text[:50]
|
||||
|
||||
|
||||
def asciify(name):
|
||||
"""Remove all characters except ASCII letters, digits and underscores."""
|
||||
return re.sub("[^0-9a-zA-Z_]", "", name)
|
||||
|
||||
|
||||
def format_name(text):
|
||||
"""Format a comic name."""
|
||||
name = unescape(text)
|
||||
|
|
|
@ -1,22 +1,34 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
|
||||
Script to get a list of smackjeeves.com comics and save the info in a JSON file
|
||||
for further processing.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import urlparse
|
||||
try:
|
||||
from urllib.parse import urljoin
|
||||
except ImportError:
|
||||
from urlparse import urljoin
|
||||
|
||||
import requests
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
from dosagelib.util import getPageContent, tagre
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
|
||||
from dosagelib.util import get_page, tagre
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
||||
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"4plyKamalsHead", # does not follow standard layout
|
||||
|
@ -98,6 +110,7 @@ exclude_comics = [
|
|||
"WinterMelody", # missing images
|
||||
]
|
||||
|
||||
|
||||
# the latest URL of some comics repeats the previous URL
|
||||
# flag this so the bounceStart uses the correct URL
|
||||
repeat_comics = [
|
||||
|
@ -236,28 +249,32 @@ repeat_comics = [
|
|||
"Zodiac",
|
||||
]
|
||||
|
||||
|
||||
# links to last valid strips
|
||||
url_overrides = {
|
||||
}
|
||||
|
||||
|
||||
# HTML content matcher
|
||||
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
|
||||
tagre("img", "title", r'([^"]+)'))
|
||||
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
|
||||
after="site_banner") +
|
||||
tagre("img", "title", r'([^"]+)'))
|
||||
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
|
||||
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
|
||||
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
|
||||
|
||||
|
||||
def handle_url(url, session, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = getPageContent(url, session)
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in page_matcher.finditer(data):
|
||||
page_url = match.group(1)
|
||||
page_url = urlparse.urljoin(url, page_url)
|
||||
page_url = urljoin(url, page_url)
|
||||
name = format_name(match.group(2))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
|
@ -269,13 +286,14 @@ def handle_url(url, session, res):
|
|||
end = match.end()
|
||||
mo = num_matcher.search(data[end:])
|
||||
if not mo:
|
||||
print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
|
||||
print("ERROR matching number:", repr(data[end:end + 300]),
|
||||
file=sys.stderr)
|
||||
continue
|
||||
num = int(mo.group(1))
|
||||
# search for url in extra page
|
||||
print("Getting", page_url)
|
||||
try:
|
||||
data2 = getPageContent(page_url, session)
|
||||
data2 = get_page(page_url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
|
@ -302,8 +320,8 @@ def get_results():
|
|||
result_pages = 286
|
||||
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
|
||||
for i in range(0, result_pages):
|
||||
print(i+1, file=sys.stderr, end=" ")
|
||||
handle_url(base % (i*12), session, res)
|
||||
print(i + 1, file=sys.stderr, end=" ")
|
||||
handle_url(base % (i * 12), session, res)
|
||||
save_result(res, json_file)
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ d=$(dirname $0)
|
|||
if [ $# -ge 1 ]; then
|
||||
list="$*"
|
||||
else
|
||||
list="creators gocomics comicgenesis keenspot smackjeeves arcamax comicfury"
|
||||
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory"
|
||||
fi
|
||||
for script in $list; do
|
||||
target="${d}/../dosagelib/plugins/${script}.py"
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
"""
|
||||
Script to get WebComicFactory comics and save the info in a JSON file for
|
||||
|
@ -12,16 +15,17 @@ import os
|
|||
import requests
|
||||
from lxml import html
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import getPageContent
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
|
||||
from dosagelib.util import get_page
|
||||
from scriptutil import (save_result, load_result, truncate_name, format_name)
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
|
||||
def find_first(session, url):
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = html.document_fromstring(getPageContent(url, session))
|
||||
data = html.document_fromstring(get_page(url, session).text)
|
||||
data.make_links_absolute(url)
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
|
@ -38,8 +42,9 @@ def get_results():
|
|||
res = {}
|
||||
url = 'http://www.thewebcomicfactory.com/'
|
||||
session = requests.Session()
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = html.document_fromstring(getPageContent(url, session))
|
||||
data = html.document_fromstring(get_page(url, session).text)
|
||||
data.make_links_absolute(url)
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
|
|
Loading…
Reference in a new issue