Refactor update helpers: Remove duplicate code.

Tobias Gruetzmacher 2016-04-14 22:22:37 +02:00
parent 497653c448
commit dab5aef094
9 changed files with 344 additions and 497 deletions
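The diffs below replace the helper functions that every updater script duplicated (handle_url, get_results, find_dups, print_results) with a shared ComicListUpdater base class in scriptutil.py. As a minimal sketch of the resulting pattern — the class and method names follow the diffs in this commit, while ExampleUpdater, the URL, and the CSS selector are purely illustrative:

from scriptutil import ComicListUpdater


class ExampleUpdater(ComicListUpdater):
    # scraper-name templates that find_dups() checks against existing modules
    dup_templates = ("GoComics/%s",)
    # comic names to skip entirely
    excluded_comics = ("SomeBrokenComic",)

    def collect_results(self):
        """Crawl the listing page and register every comic found."""
        data = self.get_url('http://www.example.com/comics')
        for link in data.cssselect('a.comic'):
            self.add_comic(link.text, link.attrib['href'])

    def get_classdef(self, name, data):
        """Emit the generated scraper class for one collected comic."""
        return u"class %s(_Example):\n    path = %r" % (name, data)


if __name__ == '__main__':
    ExampleUpdater(__file__).run()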

@@ -9,97 +9,35 @@ processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
"HagartheHorrible", # better source available
]
from scriptutil import ComicListUpdater
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
class ArcamaxUpdater(ComicListUpdater):
dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
"KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
for comiclink in data.cssselect('a.comic-icon'):
path = comiclink.attrib['href']
name = format_name(comiclink.attrib['title'])
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
continue
res[name] = path.rsplit('/', 2)[1]
if not res:
print("ERROR:", "did not match any comics", file=sys.stderr)
# names of comics to exclude
excluded_comics = (
"HagartheHorrible", # better source available
)
def handle_url(self, url):
"""Parse one search result page."""
data = self.get_url(url)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
session = requests.Session()
handle_url('http://www.arcamax.com/comics', session, res)
save_result(res, json_file)
for comiclink in data.cssselect('a.comic-icon'):
path = comiclink.attrib['href']
name = comiclink.attrib['title']
self.add_comic(name, path.rsplit('/', 2)[1])
def find_dups(name):
"""Check if comic name already exists."""
names = [
("Creators/%s" % name).lower(),
("DrunkDuck/%s" % name).lower(),
("GoComics/%s" % name).lower(),
("KeenSpot/%s" % name).lower(),
("ComicGenesis/%s" % name).lower(),
("SmackJeeves/%s" % name).lower(),
]
for scraperobj in get_scrapers():
lname = scraperobj.name.lower()
if lname in names or lname == name.lower():
return scraperobj.name
return None
def collect_results(self):
"""Parse all search result pages."""
self.handle_url('http://www.arcamax.com/comics')
def first_lower(x):
return x[0].lower()
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp:
data = load_result(json_file)
for name, path in sorted(data.items(), key=first_lower):
dup = find_dups(name)
if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else:
fp.write(u"\n\nclass %s(_Arcamax):\n path = %r\n" % (
truncate_name(name), path))
def get_classdef(self, name, entry):
return u"class %s(_Arcamax):\n path = %r" % (name, entry)
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
ArcamaxUpdater(__file__).run()

@@ -9,213 +9,148 @@ processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
# Absolute minimum number of pages a comic may have (restrict search space)
MIN_COMICS = 90
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
# unsuitable navigation
"AlfdisAndGunnora",
"AnAmericanNerdInAnimatedTokyo",
"AngryAlien",
"BoozerAndStoner",
"Bonejangles",
"ConradStory",
"Crossing",
"ChristianHumberReloaded",
"CorkAndBlotto",
"Democomix",
"ErraticBeatComics",
"EnergyWielders",
"EvilBearorg",
"Fiascos",
"FateOfTheBlueStar",
"FPK",
"Fanartgyle",
"FrigginRandom",
"GoodbyeKitty",
"HighlyExperiMental",
"IfAndCanBeFlowers",
"JournalismStory",
"JohnsonSuperior",
"Keel",
"JudgeDredBasset",
"LomeathAndHuilii",
"MNPB",
"LucidsDream",
"MadDog",
"Minebreakers",
"MoonlightValley",
"MyImmortalFool",
"NATO",
"NothingFits",
"OptimisticFishermenAndPessimisticFishermen",
"Old2G",
"NothingFitsArtBlog",
"OutToLunchTheStingRayWhoreStory",
"Pandemonium",
"Pewfell",
"ProjectX",
"Ratantia",
"RealLifeTrips",
"Sandgate",
"Secondpuberty",
"Seconds",
"SlightlyEccentricOrigins",
"StardustTheCat",
"StrangerThanFiction",
"TalamakGreatAdventure",
"TheBattalion",
"TheDailyProblem",
"TheMansionOfE",
"ThePainter",
"TheSeekers",
"TheTrialsOfKlahadOfTheAbyss",
"TheStickmen",
"ThornsInOurSide",
"TopHeavyVeryBustyPinUpsForAdults",
"USBUnlimitedSimulatedBody",
"TylerHumanRecycler",
"UAF",
"WhenPigsFly",
"YeOldeLegotimeTheatre",
# no content
"Angst",
# images gone
"BaseballCapsAndTiaras",
"CROSSWORLDSNEXUS",
"Fathead",
"KevinZombie",
"KindergardenCrisIs",
"NoSongsForTheDead",
"RequiemShadowbornPariah",
"TezzleAndZeek",
# broken HTML
"CrossingOver",
# unique html
"IKilledTheHero",
"PowerOfPower",
"Schizmatic",
"WakeTheSleepers",
"WeightOfEternity",
]
from scriptutil import ComicListUpdater
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
class ComicFuryUpdater(ComicListUpdater):
# Absolute minimum number of pages a comic may have (restrict search space)
MIN_COMICS = 90
num = 999
for comicdiv in data.cssselect('div.searchresult'):
comiclink = comicdiv.cssselect('h3 a')[0]
comicurl = comiclink.attrib['href']
name = format_name(comiclink.text)
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
continue
dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
"KeenSpot/%s", "SmackJeeves/%s", "Arcamax/%s")
info = comicdiv.cssselect('span.comicinfo')
# find out how many images this comic has
num = int(info[1].text.strip())
# find activity
active = info[6].text.strip().lower() == "active"
lang = info[7].text.strip().lower()
res[name] = [comicurl, num, active, lang]
# names of comics to exclude
excluded_comics = (
# unsuitable navigation
"AlfdisAndGunnora",
"AnAmericanNerdInAnimatedTokyo",
"AngryAlien",
"BoozerAndStoner",
"Bonejangles",
"ConradStory",
"Crossing",
"ChristianHumberReloaded",
"CorkAndBlotto",
"Democomix",
"ErraticBeatComics",
"EnergyWielders",
"EvilBearorg",
"Fiascos",
"FateOfTheBlueStar",
"FPK",
"Fanartgyle",
"FrigginRandom",
"GoodbyeKitty",
"HighlyExperiMental",
"IfAndCanBeFlowers",
"JournalismStory",
"JohnsonSuperior",
"Keel",
"JudgeDredBasset",
"LomeathAndHuilii",
"MNPB",
"LucidsDream",
"MadDog",
"Minebreakers",
"MoonlightValley",
"MyImmortalFool",
"NATO",
"NothingFits",
"OptimisticFishermenAndPessimisticFishermen",
"Old2G",
"NothingFitsArtBlog",
"OutToLunchTheStingRayWhoreStory",
"Pandemonium",
"Pewfell",
"ProjectX",
"Ratantia",
"RealLifeTrips",
"Sandgate",
"Secondpuberty",
"Seconds",
"SlightlyEccentricOrigins",
"StardustTheCat",
"StrangerThanFiction",
"TalamakGreatAdventure",
"TheBattalion",
"TheDailyProblem",
"TheMansionOfE",
"ThePainter",
"TheSeekers",
"TheTrialsOfKlahadOfTheAbyss",
"TheStickmen",
"ThornsInOurSide",
"TopHeavyVeryBustyPinUpsForAdults",
"USBUnlimitedSimulatedBody",
"TylerHumanRecycler",
"UAF",
"WhenPigsFly",
"YeOldeLegotimeTheatre",
return num
# no content
"Angst",
# images gone
"BaseballCapsAndTiaras",
"CROSSWORLDSNEXUS",
"Fathead",
"KevinZombie",
"KindergardenCrisIs",
"NoSongsForTheDead",
"RequiemShadowbornPariah",
"TezzleAndZeek",
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
session = requests.Session()
# Sort by page count, so we can abort when we get under some threshold.
baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
'&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
'&all_st=1&all_la=1&page=%d')
last_count = 999
page = 1
print("Parsing search result pages...", file=sys.stderr)
while last_count >= MIN_COMICS:
last_count = handle_url(baseUrl % page, session, res)
page += 1
print(last_count, file=sys.stderr, end=" ")
save_result(res, json_file)
# broken HTML
"CrossingOver",
# unique html
"IKilledTheHero",
"PowerOfPower",
"Schizmatic",
"WakeTheSleepers",
"WeightOfEternity",
)
def find_dups(name):
"""Check if comic name already exists."""
names = [
("Creators/%s" % name).lower(),
("DrunkDuck/%s" % name).lower(),
("GoComics/%s" % name).lower(),
("KeenSpot/%s" % name).lower(),
("SmackJeeves/%s" % name).lower(),
("Arcamax/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
if lname in names:
return scraperclass.getName().lower()
return None
def handle_url(self, url):
"""Parse one search result page."""
data = self.get_url(url)
count = 999
for comicdiv in data.cssselect('div.searchresult'):
comiclink = comicdiv.cssselect('h3 a')[0]
comicurl = comiclink.attrib['href']
name = comiclink.text
def first_lower(x):
return x[0].lower()
info = comicdiv.cssselect('span.comicinfo')
# find out how many images this comic has
count = int(info[1].text.strip())
# find activity
active = info[6].text.strip().lower() == "active"
lang = info[7].text.strip().lower()
self.add_comic(name, (comicurl, active, lang), count)
return count
def print_results(args):
"""Print all comics that have at least the given number of minimum
comic strips."""
min_comics, filename = args
min_comics = int(min_comics)
with codecs.open(filename, 'a', 'utf-8') as fp:
data = load_result(json_file)
for name, entry in sorted(data.items(), key=first_lower):
url, num, active, lang = entry
if name in exclude_comics:
fp.write(u"# %s is excluded\n" % name)
continue
if num < min_comics:
continue
dup = find_dups(name)
if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else:
fp.write(u"class CF%s(_ComicFury):\n url = %r\n\n\n" % (
truncate_name(name), str(url)))
def collect_results(self):
"""Parse all search result pages."""
# Sort by page count, so we can abort when we get under some threshold.
baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
'all_ge=1&all_st=1&all_la=1&page=%d')
last_count = 999
page = 1
print("Parsing search result pages...", file=sys.stderr)
while last_count >= self.MIN_COMICS:
last_count = self.handle_url(baseUrl % page)
page += 1
print(last_count, file=sys.stderr, end=" ")
def get_classdef(self, name, entry):
url, active, lang = entry
return u"class CF%s(_ComicFury):\n url = %r" % (name, url)
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
ComicFuryUpdater(__file__).run()

@@ -18,7 +18,7 @@ import requests
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
@@ -435,8 +435,8 @@ def has_comic(name):
("Creators/%s" % name).lower(),
("GoComics/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
for scraperobj in get_scrapers():
lname = scraperobj.name.lower()
if lname in names:
return True
return False

@@ -8,88 +8,37 @@ for further processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
'Doodles', # no images
]
from scriptutil import ComicListUpdater
def handle_url(url, session, res):
"""Parse one listing page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
class CreatorsUpdater(ComicListUpdater):
dup_templates = ('GoComics/%s',)
for comicdiv in data.cssselect('ul.all-test li'):
comiclink = comicdiv.cssselect('a')[0]
comicurl = comiclink.attrib['href']
name = format_name(comicdiv.cssselect('p strong')[0].text)
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
continue
# names of comics to exclude
excluded_comics = (
# no images
'Doodles',
)
res[name] = comicurl.rsplit('/', 1)[1]
def handle_url(self, url):
"""Parse one listing page."""
data = self.get_url(url)
for comicdiv in data.cssselect('ul.all-test li'):
comiclink = comicdiv.cssselect('a')[0]
comicurl = comiclink.attrib['href']
name = comicdiv.cssselect('p strong')[0].text
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
sess = requests.Session()
handle_url('https://www.creators.com/categories/comics/all', sess, res)
handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
save_result(res, json_file)
self.add_comic(name, comicurl.rsplit('/', 1)[1])
def collect_results(self):
"""Parse all search result pages."""
self.handle_url('https://www.creators.com/categories/comics/all')
self.handle_url('https://www.creators.com/categories/cartoons/all')
def has_gocomics_comic(name):
"""Test if comic name already exists."""
cname = "Gocomics/%s" % name
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
if lname == cname.lower():
return True
return False
def print_results(args):
"""Print comics."""
min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp:
for name, path in sorted(load_result(json_file).items()):
lang = 'Es' if name.lower().endswith('spanish') else ''
if has_gocomics_comic(name):
fp.write(u'# %s has a duplicate in gocomics\n' %
truncate_name(name))
else:
fp.write(u"class %s(_Creators%s):\n path = %r\n\n\n" %
(truncate_name(name), lang, path))
def get_classdef(self, name, data):
lang = 'Es' if name.lower().endswith('spanish') else ''
return u"class %s(_Creators%s):\n path = %r" % (name, lang, data)
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
CreatorsUpdater(__file__).run()

@@ -9,86 +9,45 @@ processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from scriptutil import contains_case_insensitive, format_name, save_result, load_result, truncate_name
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
# "coming soon"
"Angryprogrammer",
"Guinness",
"Jabberwoncky",
"RandysRationale"
"SignsOfOurTimes",
"TheGagwriter",
"Yaoyao",
# duplicate
"SaturdayMorningBreakfastCereal",
]
from scriptutil import ComicListUpdater
def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
class GoComicsUpdater(ComicListUpdater):
# names of comics to exclude
excluded_comics = [
# "coming soon"
"Angryprogrammer",
"Guinness",
"Jabberwoncky",
"RandysRationale"
"SignsOfOurTimes",
"TheGagwriter",
"Yaoyao",
for comiclink in data.cssselect('a.alpha_list'):
link = comiclink.attrib['href']
name = format_name(comiclink.text)
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
continue
res[name] = link
# duplicate
"SaturdayMorningBreakfastCereal",
]
def handle_url(self, url):
"""Parse one search result page."""
data = self.get_url(url)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> uri}
res = {}
session = requests.Session()
handle_url('http://www.gocomics.com/features', session, res)
handle_url('http://www.gocomics.com/explore/espanol', session, res)
handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
save_result(res, json_file)
for comiclink in data.cssselect('a.alpha_list'):
link = comiclink.attrib['href']
name = comiclink.text
self.add_comic(name, link)
def collect_results(self):
"""Parse all listing pages."""
self.handle_url('http://www.gocomics.com/features')
self.handle_url('http://www.gocomics.com/explore/espanol')
self.handle_url('http://www.gocomics.com/explore/editorial_list')
self.handle_url('http://www.gocomics.com/explore/sherpa_list')
def first_lower(x):
return x[0].lower()
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp:
data = load_result(json_file)
for name, uri in sorted(data.items(), key=first_lower):
if name in exclude_comics:
print("Excluded " + name)
continue
fp.write(u"\n\nclass GC%s(_GoComics%s):\n path = %r\n" % (
truncate_name(name), 'Es' if 'espanol/' in uri else '',
uri[1:]))
def get_classdef(self, name, url):
return u"class GC%s(_GoComics%s):\n path = %r" % (
name, 'Es' if 'espanol/' in url else '', url[1:])
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
GoComicsUpdater(__file__).run()

@@ -18,7 +18,7 @@ import requests
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page, tagre, check_robotstxt
from dosagelib.scraper import get_scraperclasses
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
@@ -108,8 +108,8 @@ def has_comic(name):
("GoComics/%s" % name).lower(),
("ComicGenesis/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
for scraperobj in get_scrapers():
lname = scraperobj.name.lower()
if lname in names:
return True
return False

@@ -5,11 +5,114 @@
from __future__ import absolute_import, division, print_function
import os
import re
import sys
import json
import codecs
from dosagelib.util import unescape
import requests
from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import unescape, get_page
from dosagelib import scraper
def first_lower(x):
return x[0].lower()
class ComicListUpdater(object):
dup_templates = ()
excluded_comics = ()
def __init__(self, name):
self.json = name.replace(".py", ".json")
self.session = requests.Session()
def get_url(self, url):
"""Get an HTML page and parse it with LXML."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, self.session).text)
data.make_links_absolute(url)
return data
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
raise
def should_skip(self, name):
if name in self.excluded_comics:
return True
if contains_case_insensitive(self.res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
return True
return False
def get_results(self):
"""Collect comics and save dictionary in JSON file."""
self.res = {}
self.collect_results()
if not self.res:
print("ERROR:", "did not match any comics", file=sys.stderr)
return
with codecs.open(self.json, 'wb', 'utf-8') as f:
json.dump(self.res, f, sort_keys=True, indent=2,
separators=(',', ': '))
def add_comic(self, name, data, count=None):
"""Add a collected comic with a specific number of comics."""
name = format_name(name)
if not self.should_skip(name):
self.res[name] = {'count': count, 'data': data}
def collect_results(self):
raise NotImplementedError
def print_results(self, args):
"""Print all comics that have at least the given number of minimum
comic strips."""
min_comics, filename = args
min_comics = int(min_comics)
with codecs.open(filename, 'a', 'utf-8') as fp:
with codecs.open(self.json, 'rb', 'utf-8') as f:
data = json.load(f)
for name, entry in sorted(data.items(), key=first_lower):
count = entry['count']
if count and count < min_comics:
continue
dup = self.find_dups(name)
if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else:
fp.write(u"\n\n%s\n" %
self.get_classdef(truncate_name(name),
entry['data']))
def find_dups(self, name):
"""Check if comic name already exists."""
names = [(tmpl % name).lower() for tmpl in self.dup_templates]
if names:
for scraperobj in scraper.get_scrapers():
lname = scraperobj.name.lower()
if lname in names or lname == name.lower():
return scraperobj.name
return None
def get_classdef(self, name, data):
raise NotImplementedError
def run(self):
if len(sys.argv) > 1:
self.print_results(sys.argv[1:])
else:
self.get_results()
def contains_case_insensitive(adict, akey):
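A usage note on the base class above (my reading of run() and print_results(), not something the commit message states): each updater script is meant to be invoked twice. Without arguments it crawls its listing pages and caches {name: {'count': ..., 'data': ...}} in the script's sibling .json file; given a minimum strip count and an output filename, it re-reads that cache and appends one generated class definition per comic, skipping comics below the threshold and names that find_dups() matches against already-supported scrapers. Illustrative invocation (the script path is an assumption):

python scripts/example.py                   # crawl and write scripts/example.json
python scripts/example.py 100 generated.py  # append class defs for comics with >= 100 strips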

@@ -22,7 +22,7 @@ import requests
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scraperclasses
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
@@ -328,8 +328,8 @@ def get_results():
def has_comic(name):
"""Check if comic name already exists."""
cname = name.lower()
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
for scraperobj in get_scrapers():
lname = scraperobj.name.lower()
if lname == cname:
return True
return False

@@ -9,77 +9,40 @@ further processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import sys
import os
import requests
from lxml import html
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
from dosagelib.util import get_page
from scriptutil import (save_result, load_result, truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
from scriptutil import ComicListUpdater
def find_first(session, url):
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return url
firstlinks = data.cssselect('a.comic-nav-first')
if not firstlinks:
print("INFO No first link on »%s«, already first page?" % (url))
return url
return firstlinks[0].attrib['href']
class WebComicFactoryUpdater(ComicListUpdater):
def find_first(self, url):
data = self.get_url(url)
def get_results():
"""Parse start page for supported comics."""
res = {}
url = 'http://www.thewebcomicfactory.com/'
session = requests.Session()
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return {}
firstlinks = data.cssselect('a.comic-nav-first')
if not firstlinks:
print("INFO:", "No first link on »%s«, already first page?" %
(url))
return url
return firstlinks[0].attrib['href']
for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
comicname = comicdiv.cssselect('h2')[0]
comiclink = comicdiv.cssselect('a')[0]
comicurl = comiclink.attrib['href']
name = format_name(comicname.text)
if 'comic-color-key' in comicurl:
continue
comicurl = find_first(session, comicurl)
res[name] = comicurl
def collect_results(self):
"""Parse start page for supported comics."""
url = 'http://www.thewebcomicfactory.com/'
data = self.get_url(url)
save_result(res, json_file)
for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
comicname = comicdiv.cssselect('h2')[0]
comiclink = comicdiv.cssselect('a')[0]
comicurl = comiclink.attrib['href']
name = comicname.text
if 'comic-color-key' in comicurl:
continue
comicurl = self.find_first(comicurl)
self.add_comic(name, comicurl)
def first_lower(x):
return x[0].lower()
def print_results(args):
"""Print all comics."""
min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp:
data = load_result(json_file)
for name, url in sorted(data.items(), key=first_lower):
fp.write(u"\n\nclass %s(_WebcomicFactory):\n url = %r\n" % (
truncate_name(name), str(url)))
fp.write(u" firstStripUrl = url\n")
def get_classdef(self, name, url):
return (u"class %s(_WebcomicFactory):\n url = %r\n" % (name, url) +
u" firstStripUrl = url")
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
WebComicFactoryUpdater(__file__).run()