Refactor update helpers: Remove duplicate code.

Tobias Gruetzmacher 2016-04-14 22:22:37 +02:00
parent 497653c448
commit dab5aef094
9 changed files with 344 additions and 497 deletions

View file

@@ -9,97 +9,35 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
+from scriptutil import ComicListUpdater
 
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
 
+class ArcamaxUpdater(ComicListUpdater):
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
 
-# names of comics to exclude
-exclude_comics = [
-    "HagartheHorrible",  # better source available
-]
+    # names of comics to exclude
+    excluded_comics = (
+        "HagartheHorrible",  # better source available
+    )
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
 
-    for comiclink in data.cssselect('a.comic-icon'):
-        path = comiclink.attrib['href']
-        name = format_name(comiclink.attrib['title'])
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = path.rsplit('/', 2)[1]
-    if not res:
-        print("ERROR:", "did not match any comics", file=sys.stderr)
+        for comiclink in data.cssselect('a.comic-icon'):
+            path = comiclink.attrib['href']
+            name = comiclink.attrib['title']
+            self.add_comic(name, path.rsplit('/', 2)[1])
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.arcamax.com/comics', session, res)
-    save_result(res, json_file)
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('http://www.arcamax.com/comics')
 
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names or lname == name.lower():
-            return scraperobj.name
-    return None
-
-def first_lower(x):
-    return x[0].lower()
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, path in sorted(data.items(), key=first_lower):
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"\n\nclass %s(_Arcamax):\n    path = %r\n" % (
-                    truncate_name(name), path))
+    def get_classdef(self, name, entry):
+        return u"class %s(_Arcamax):\n    path = %r" % (name, entry)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ArcamaxUpdater(__file__).run()
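
The rewritten script keeps only what is Arcamax-specific: the CSS selector, the path extraction, and the class template. As a rough illustration, the base class's print_results() would append one entry like the following per collected comic (the name and path here are invented):

    # Hypothetical output rendered by get_classdef(); real names come
    # from the scraped link titles.
    class SomeComic(_Arcamax):
        path = 'somecomic'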

View file

@@ -9,26 +9,20 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
 import sys
-import os
-
-import requests
-from lxml import html
+from scriptutil import ComicListUpdater
 
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-# Absolute minimum number of pages a comic may have (restrict search space)
-MIN_COMICS = 90
-
-json_file = __file__.replace(".py", ".json")
 
+class ComicFuryUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
+
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "SmackJeeves/%s", "Arcamax/%s")
 
-# names of comics to exclude
-exclude_comics = [
+    # names of comics to exclude
+    excluded_comics = (
     # unsuitable navigation
     "AlfdisAndGunnora",
     "AnAmericanNerdInAnimatedTokyo",
@@ -117,105 +111,46 @@ exclude_comics = [
     "Schizmatic",
     "WakeTheSleepers",
     "WeightOfEternity",
-]
+    )
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-
-    num = 999
-    for comicdiv in data.cssselect('div.searchresult'):
-        comiclink = comicdiv.cssselect('h3 a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
-        info = comicdiv.cssselect('span.comicinfo')
-        # find out how many images this comic has
-        num = int(info[1].text.strip())
-        # find activity
-        active = info[6].text.strip().lower() == "active"
-        lang = info[7].text.strip().lower()
-        res[name] = [comicurl, num, active, lang]
-    return num
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        count = 999
+        for comicdiv in data.cssselect('div.searchresult'):
+            comiclink = comicdiv.cssselect('h3 a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comiclink.text
+            info = comicdiv.cssselect('span.comicinfo')
+            # find out how many images this comic has
+            count = int(info[1].text.strip())
+            # find activity
+            active = info[6].text.strip().lower() == "active"
+            lang = info[7].text.strip().lower()
+            self.add_comic(name, (comicurl, active, lang), count)
+        return count
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    # Sort by page count, so we can abort when we get under some threshold.
-    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
-               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
-               '&all_st=1&all_la=1&page=%d')
-    last_count = 999
-    page = 1
-    print("Parsing search result pages...", file=sys.stderr)
-    while last_count >= MIN_COMICS:
-        last_count = handle_url(baseUrl % page, session, res)
-        page += 1
-        print(last_count, file=sys.stderr, end=" ")
-    save_result(res, json_file)
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by page count, so we can abort when we get under some threshold.
+        baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
+                   'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
+                   'all_ge=1&all_st=1&all_la=1&page=%d')
+        last_count = 999
+        page = 1
+        print("Parsing search result pages...", file=sys.stderr)
+        while last_count >= self.MIN_COMICS:
+            last_count = self.handle_url(baseUrl % page)
+            page += 1
+            print(last_count, file=sys.stderr, end=" ")
 
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-        ("Arcamax/%s" % name).lower(),
-    ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname in names:
-            return scraperclass.getName().lower()
-    return None
-
-def first_lower(x):
-    return x[0].lower()
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum
-    comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, entry in sorted(data.items(), key=first_lower):
-            url, num, active, lang = entry
-            if name in exclude_comics:
-                fp.write(u"# %s is excluded\n" % name)
-                continue
-            if num < min_comics:
-                continue
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (
-                    truncate_name(name), str(url)))
+    def get_classdef(self, name, entry):
+        url, active, lang = entry
+        return u"class CF%s(_ComicFury):\n    url = %r" % (name, url)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ComicFuryUpdater(__file__).run()
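
The page count now travels through add_comic(name, data, count) instead of a hand-rolled result dict, so the generic print_results() can apply the minimum-strip filter. One collected entry inside the base class is shaped like this (comic name, URL, and count invented for illustration):

    # ComicListUpdater.add_comic() stores each hit as {'count': ..., 'data': ...}:
    self.res["SomeComic"] = {
        'count': 250,
        'data': ('http://somecomic.thecomicseries.com/', True, 'english'),
    }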

View file

@@ -18,7 +18,7 @@ import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -435,8 +435,8 @@ def has_comic(name):
         ("Creators/%s" % name).lower(),
         ("GoComics/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False

View file

@@ -8,88 +8,37 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
+from scriptutil import ComicListUpdater
 
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
 
+class CreatorsUpdater(ComicListUpdater):
+    dup_templates = ('GoComics/%s',)
 
-# names of comics to exclude
-exclude_comics = [
-    'Doodles',  # no images
-]
+    excluded_comics = (
+        # no images
+        'Doodles',
+    )
 
-def handle_url(url, session, res):
-    """Parse one listing page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-
-    for comicdiv in data.cssselect('ul.all-test li'):
-        comiclink = comicdiv.cssselect('a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comicdiv.cssselect('p strong')[0].text)
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
-        res[name] = comicurl.rsplit('/', 1)[1]
+    def handle_url(self, url):
+        """Parse one listing page."""
+        data = self.get_url(url)
+
+        for comicdiv in data.cssselect('ul.all-test li'):
+            comiclink = comicdiv.cssselect('a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comicdiv.cssselect('p strong')[0].text
+            self.add_comic(name, comicurl.rsplit('/', 1)[1])
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    sess = requests.Session()
-    handle_url('https://www.creators.com/categories/comics/all', sess, res)
-    handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
-    save_result(res, json_file)
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('https://www.creators.com/categories/comics/all')
+        self.handle_url('https://www.creators.com/categories/cartoons/all')
 
-def has_gocomics_comic(name):
-    """Test if comic name already exists."""
-    cname = "Gocomics/%s" % name
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname == cname.lower():
-            return True
-    return False
-
-def print_results(args):
-    """Print comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, path in sorted(load_result(json_file).items()):
-            lang = 'Es' if name.lower().endswith('spanish') else ''
-            if has_gocomics_comic(name):
-                fp.write(u'# %s has a duplicate in gocomics\n' %
-                         truncate_name(name))
-            else:
-                fp.write(u"class %s(_Creators%s):\n    path = %r\n\n\n" %
-                         (truncate_name(name), lang, path))
+    def get_classdef(self, name, data):
+        lang = 'Es' if name.lower().endswith('spanish') else ''
+        return u"class %s(_Creators%s):\n    path = %r" % (name, lang, data)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    CreatorsUpdater(__file__).run()
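
The Spanish-edition dispatch moved from print_results() into get_classdef(). A quick sketch of both branches, with invented comic names and paths:

    updater = CreatorsUpdater(__file__)
    print(updater.get_classdef('SomeComic', 'somecomic'))
    # class SomeComic(_Creators):
    #     path = 'somecomic'
    print(updater.get_classdef('SomeComicSpanish', 'somecomic-spanish'))
    # class SomeComicSpanish(_CreatorsEs):
    #     path = 'somecomic-spanish'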

View file

@@ -9,21 +9,12 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
+from scriptutil import ComicListUpdater
 
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from scriptutil import contains_case_insensitive, format_name, save_result, load_result, truncate_name
-
-json_file = __file__.replace(".py", ".json")
 
-# names of comics to exclude
-exclude_comics = [
+class GoComicsUpdater(ComicListUpdater):
+    # names of comics to exclude
+    excluded_comics = [
     # "coming soon"
     "Angryprogrammer",
     "Guinness",
@@ -37,58 +28,26 @@ exclude_comics = [
     "SaturdayMorningBreakfastCereal",
 ]
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-
-    for comiclink in data.cssselect('a.alpha_list'):
-        link = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = link
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        for comiclink in data.cssselect('a.alpha_list'):
+            link = comiclink.attrib['href']
+            name = comiclink.text
+            self.add_comic(name, link)
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> uri}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.gocomics.com/features', session, res)
-    handle_url('http://www.gocomics.com/explore/espanol', session, res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
-    save_result(res, json_file)
+    def collect_results(self):
+        """Parse all listing pages."""
+        self.handle_url('http://www.gocomics.com/features')
+        self.handle_url('http://www.gocomics.com/explore/espanol')
+        self.handle_url('http://www.gocomics.com/explore/editorial_list')
+        self.handle_url('http://www.gocomics.com/explore/sherpa_list')
 
-def first_lower(x):
-    return x[0].lower()
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, uri in sorted(data.items(), key=first_lower):
-            if name in exclude_comics:
-                print("Excluded " + name)
-                continue
-            fp.write(u"\n\nclass GC%s(_GoComics%s):\n    path = %r\n" % (
-                truncate_name(name), 'Es' if 'espanol/' in uri else '',
-                uri[1:]))
+    def get_classdef(self, name, url):
+        return u"class GC%s(_GoComics%s):\n    path = %r" % (
+            name, 'Es' if 'espanol/' in url else '', url[1:])
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    GoComicsUpdater(__file__).run()
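
GoComics listing links are site-absolute, so url[1:] drops the leading slash before the value becomes a path attribute, and an espanol/ prefix selects the Spanish base class. A sketch with invented paths:

    updater = GoComicsUpdater(__file__)
    print(updater.get_classdef('SomeComic', '/somecomic'))
    # class GCSomeComic(_GoComics):
    #     path = 'somecomic'
    print(updater.get_classdef('AlgunComic', '/espanol/alguncomic'))
    # class GCAlgunComic(_GoComicsEs):
    #     path = 'espanol/alguncomic'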

View file

@@ -18,7 +18,7 @@ import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -108,8 +108,8 @@ def has_comic(name):
         ("GoComics/%s" % name).lower(),
         ("ComicGenesis/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False

View file

@@ -5,11 +5,114 @@
 from __future__ import absolute_import, division, print_function
 
 import os
 import re
 import sys
 import json
 import codecs
 
-from dosagelib.util import unescape
+import requests
+from lxml import html
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import unescape, get_page
+from dosagelib import scraper
+
+
+def first_lower(x):
+    return x[0].lower()
+
+
+class ComicListUpdater(object):
+    dup_templates = ()
+    excluded_comics = ()
+
+    def __init__(self, name):
+        self.json = name.replace(".py", ".json")
+        self.session = requests.Session()
+
+    def get_url(self, url):
+        """Get an HTML page and parse it with LXML."""
+        print("Parsing", url, file=sys.stderr)
+        try:
+            data = html.document_fromstring(get_page(url, self.session).text)
+            data.make_links_absolute(url)
+            return data
+        except IOError as msg:
+            print("ERROR:", msg, file=sys.stderr)
+            raise
+
+    def should_skip(self, name):
+        if name in self.excluded_comics:
+            return True
+        if contains_case_insensitive(self.res, name):
+            # we cannot handle two comics that only differ in case
+            print("INFO: skipping possible duplicate", repr(name),
+                  file=sys.stderr)
+            return True
+        return False
+
+    def get_results(self):
+        """Collect comics and save dictionary in JSON file."""
+        self.res = {}
+        self.collect_results()
+        if not self.res:
+            print("ERROR:", "did not match any comics", file=sys.stderr)
+            return
+        with codecs.open(self.json, 'wb', 'utf-8') as f:
+            json.dump(self.res, f, sort_keys=True, indent=2,
+                      separators=(',', ': '))
+
+    def add_comic(self, name, data, count=None):
+        """Add a collected comic with a specific number of comics."""
+        name = format_name(name)
+        if not self.should_skip(name):
+            self.res[name] = {'count': count, 'data': data}
+
+    def collect_results(self):
+        raise NotImplementedError
+
+    def print_results(self, args):
+        """Print all comics that have at least the given number of minimum
+        comic strips."""
+        min_comics, filename = args
+        min_comics = int(min_comics)
+        with codecs.open(filename, 'a', 'utf-8') as fp:
+            with codecs.open(self.json, 'rb', 'utf-8') as f:
+                data = json.load(f)
+            for name, entry in sorted(data.items(), key=first_lower):
+                count = entry['count']
+                if count and count < min_comics:
+                    continue
+                dup = self.find_dups(name)
+                if dup is not None:
+                    fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
+                else:
+                    fp.write(u"\n\n%s\n" %
+                             self.get_classdef(truncate_name(name),
+                                               entry['data']))
+
+    def find_dups(self, name):
+        """Check if comic name already exists."""
+        names = [(tmpl % name).lower() for tmpl in self.dup_templates]
+        if names:
+            for scraperobj in scraper.get_scrapers():
+                lname = scraperobj.name.lower()
+                if lname in names or lname == name.lower():
+                    return scraperobj.name
+        return None
+
+    def get_classdef(self, name, data):
+        raise NotImplementedError
+
+    def run(self):
+        if len(sys.argv) > 1:
+            self.print_results(sys.argv[1:])
+        else:
+            self.get_results()
 
 
 def contains_case_insensitive(adict, akey):
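
This base class is the heart of the commit: fetching, duplicate detection, exclusion, JSON persistence, and the two-phase command line now live here once. A minimal sketch of a new updater built on it; the site, selector, and class names below are invented:

    from scriptutil import ComicListUpdater


    class ExampleUpdater(ComicListUpdater):
        # Scraper modules that may already carry the same comic.
        dup_templates = ('GoComics/%s',)

        def collect_results(self):
            # get_url() fetches and parses a page; add_comic() applies
            # name formatting, exclusions and duplicate detection.
            data = self.get_url('http://comics.example.com/all')
            for link in data.cssselect('a.comic'):
                self.add_comic(link.text, link.attrib['href'])

        def get_classdef(self, name, data):
            return u"class %s(_Example):\n    path = %r" % (name, data)


    if __name__ == '__main__':
        ExampleUpdater(__file__).run()

Run without arguments, run() scrapes into example.json; run with a minimum strip count and an output file, it appends one class definition per surviving comic.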

View file

@@ -22,7 +22,7 @@ import requests
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
 from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
@@ -328,8 +328,8 @@ def get_results():
 def has_comic(name):
     """Check if comic name already exists."""
     cname = name.lower()
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname == cname:
             return True
     return False

View file

@@ -9,77 +9,40 @@ further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
-
-import requests
-from lxml import html
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
-from dosagelib.util import get_page
-from scriptutil import (save_result, load_result, truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
+from scriptutil import ComicListUpdater
 
-def find_first(session, url):
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return url
-    firstlinks = data.cssselect('a.comic-nav-first')
-    if not firstlinks:
-        print("INFO No first link on »%s«, already first page?" % (url))
-        return url
-    return firstlinks[0].attrib['href']
 
+class WebComicFactoryUpdater(ComicListUpdater):
+
+    def find_first(self, url):
+        data = self.get_url(url)
+        firstlinks = data.cssselect('a.comic-nav-first')
+        if not firstlinks:
+            print("INFO:", "No first link on »%s«, already first page?" %
+                  (url))
+            return url
+        return firstlinks[0].attrib['href']
 
-def get_results():
-    """Parse start page for supported comics."""
-    res = {}
-    url = 'http://www.thewebcomicfactory.com/'
-    session = requests.Session()
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return {}
-    for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
-        comicname = comicdiv.cssselect('h2')[0]
-        comiclink = comicdiv.cssselect('a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comicname.text)
-        if 'comic-color-key' in comicurl:
-            continue
-        comicurl = find_first(session, comicurl)
-        res[name] = comicurl
-    save_result(res, json_file)
+    def collect_results(self):
+        """Parse start page for supported comics."""
+        url = 'http://www.thewebcomicfactory.com/'
+        data = self.get_url(url)
+        for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
+            comicname = comicdiv.cssselect('h2')[0]
+            comiclink = comicdiv.cssselect('a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comicname.text
+            if 'comic-color-key' in comicurl:
+                continue
+            comicurl = self.find_first(comicurl)
+            self.add_comic(name, comicurl)
 
-def first_lower(x):
-    return x[0].lower()
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, url in sorted(data.items(), key=first_lower):
-            fp.write(u"\n\nclass %s(_WebcomicFactory):\n    url = %r\n" % (
-                truncate_name(name), str(url)))
-            fp.write(u"    firstStripUrl = url\n")
+    def get_classdef(self, name, url):
+        return (u"class %s(_WebcomicFactory):\n    url = %r\n" % (name, url) +
+                u"    firstStripUrl = url")
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    WebComicFactoryUpdater(__file__).run()
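
Here get_classdef() emits two attributes, reusing url as the first strip since find_first() already resolved it during collection. A sketch with an invented comic:

    updater = WebComicFactoryUpdater(__file__)
    print(updater.get_classdef(
        'SomeComic', 'http://www.thewebcomicfactory.com/comic/some-comic/'))
    # class SomeComic(_WebcomicFactory):
    #     url = 'http://www.thewebcomicfactory.com/comic/some-comic/'
    #     firstStripUrl = url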