diff --git a/scripts/arcamax.py b/scripts/arcamax.py
index 646237630..62c353a5c 100755
--- a/scripts/arcamax.py
+++ b/scripts/arcamax.py
@@ -9,97 +9,35 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
-
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    "HagartheHorrible",  # better source available
-]
+from scriptutil import ComicListUpdater
 
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+class ArcamaxUpdater(ComicListUpdater):
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
 
-    for comiclink in data.cssselect('a.comic-icon'):
-        path = comiclink.attrib['href']
-        name = format_name(comiclink.attrib['title'])
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = path.rsplit('/', 2)[1]
-    if not res:
-        print("ERROR:", "did not match any comics", file=sys.stderr)
+    # names of comics to exclude
+    excluded_comics = (
+        "HagartheHorrible",  # better source available
+    )
 
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.arcamax.com/comics', session, res)
-    save_result(res, json_file)
+        for comiclink in data.cssselect('a.comic-icon'):
+            path = comiclink.attrib['href']
+            name = comiclink.attrib['title']
+            self.add_comic(name, path.rsplit('/', 2)[1])
 
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('http://www.arcamax.com/comics')
 
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names or lname == name.lower():
-            return scraperobj.name
-    return None
-
-
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics that have at least the given minimum number of comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, path in sorted(data.items(), key=first_lower):
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"\n\nclass %s(_Arcamax):\n    path = %r\n" % (
-                    truncate_name(name), path))
+    def get_classdef(self, name, entry):
+        return u"class %s(_Arcamax):\n    path = %r" % (name, entry)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ArcamaxUpdater(__file__).run()
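The rewritten scripts no longer format output themselves; `print_results` in the new base class (see scriptutil.py further below) wraps whatever `get_classdef` returns. For reference, a sketch of what `ArcamaxUpdater.get_classdef` would emit for a single cached entry — the comic name and path here are illustrative, not crawled values:

```python
# Hypothetical cache entry: name "Garfield", data u'garfield'.
# print_results() would append to the output file:

class Garfield(_Arcamax):
    path = u'garfield'
```
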
diff --git a/scripts/comicfury.py b/scripts/comicfury.py
index 52fc48fe4..fef200c64 100755
--- a/scripts/comicfury.py
+++ b/scripts/comicfury.py
@@ -9,213 +9,148 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
 import sys
-import os
 
-import requests
-from lxml import html
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-# Absolute minumum number of pages a comic may have (restrict search space)
-MIN_COMICS = 90
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    # unsuitable navigation
-    "AlfdisAndGunnora",
-    "AnAmericanNerdInAnimatedTokyo",
-    "AngryAlien",
-    "BoozerAndStoner",
-    "Bonejangles",
-    "ConradStory",
-    "Crossing",
-    "ChristianHumberReloaded",
-    "CorkAndBlotto",
-    "Democomix",
-    "ErraticBeatComics",
-    "EnergyWielders",
-    "EvilBearorg",
-    "Fiascos",
-    "FateOfTheBlueStar",
-    "FPK",
-    "Fanartgyle",
-    "FrigginRandom",
-    "GoodbyeKitty",
-    "HighlyExperiMental",
-    "IfAndCanBeFlowers",
-    "JournalismStory",
-    "JohnsonSuperior",
-    "Keel",
-    "JudgeDredBasset",
-    "LomeathAndHuilii",
-    "MNPB",
-    "LucidsDream",
-    "MadDog",
-    "Minebreakers",
-    "MoonlightValley",
-    "MyImmortalFool",
-    "NATO",
-    "NothingFits",
-    "OptimisticFishermenAndPessimisticFishermen",
-    "Old2G",
-    "NothingFitsArtBlog",
-    "OutToLunchTheStingRayWhoreStory",
-    "Pandemonium",
-    "Pewfell",
-    "ProjectX",
-    "Ratantia",
-    "RealLifeTrips",
-    "Sandgate",
-    "Secondpuberty",
-    "Seconds",
-    "SlightlyEccentricOrigins",
-    "StardustTheCat",
-    "StrangerThanFiction",
-    "TalamakGreatAdventure",
-    "TheBattalion",
-    "TheDailyProblem",
-    "TheMansionOfE",
-    "ThePainter",
-    "TheSeekers",
-    "TheTrialsOfKlahadOfTheAbyss",
-    "TheStickmen",
-    "ThornsInOurSide",
-    "TopHeavyVeryBustyPinUpsForAdults",
-    "USBUnlimitedSimulatedBody",
-    "TylerHumanRecycler",
-    "UAF",
-    "WhenPigsFly",
-    "YeOldeLegotimeTheatre",
-
-    # no content
-    "Angst",
-
-    # images gone
-    "BaseballCapsAndTiaras",
-    "CROSSWORLDSNEXUS",
-    "Fathead",
-    "KevinZombie",
-    "KindergardenCrisIs",
-    "NoSongsForTheDead",
-    "RequiemShadowbornPariah",
-    "TezzleAndZeek",
-
-    # broken HTML
-    "CrossingOver",
-
-    # unique html
-    "IKilledTheHero",
-    "PowerOfPower",
-    "Schizmatic",
-    "WakeTheSleepers",
-    "WeightOfEternity",
-]
+from scriptutil import ComicListUpdater
 
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+class ComicFuryUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
 
-    num = 999
-    for comicdiv in data.cssselect('div.searchresult'):
-        comiclink = comicdiv.cssselect('h3 a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "SmackJeeves/%s", "Arcamax/%s")
 
-        info = comicdiv.cssselect('span.comicinfo')
-        # find out how many images this comic has
-        num = int(info[1].text.strip())
-        # find activity
-        active = info[6].text.strip().lower() == "active"
-        lang = info[7].text.strip().lower()
-        res[name] = [comicurl, num, active, lang]
+    # names of comics to exclude
+    excluded_comics = (
+        # unsuitable navigation
+        "AlfdisAndGunnora",
+        "AnAmericanNerdInAnimatedTokyo",
+        "AngryAlien",
+        "BoozerAndStoner",
+        "Bonejangles",
+        "ConradStory",
+        "Crossing",
+        "ChristianHumberReloaded",
+        "CorkAndBlotto",
+        "Democomix",
+        "ErraticBeatComics",
+        "EnergyWielders",
+        "EvilBearorg",
+        "Fiascos",
+        "FateOfTheBlueStar",
+        "FPK",
+        "Fanartgyle",
+        "FrigginRandom",
+        "GoodbyeKitty",
+        "HighlyExperiMental",
+        "IfAndCanBeFlowers",
+        "JournalismStory",
+        "JohnsonSuperior",
+        "Keel",
+        "JudgeDredBasset",
+        "LomeathAndHuilii",
+        "MNPB",
+        "LucidsDream",
+        "MadDog",
+        "Minebreakers",
+        "MoonlightValley",
+        "MyImmortalFool",
+        "NATO",
+        "NothingFits",
+        "OptimisticFishermenAndPessimisticFishermen",
+        "Old2G",
+        "NothingFitsArtBlog",
+        "OutToLunchTheStingRayWhoreStory",
+        "Pandemonium",
+        "Pewfell",
+        "ProjectX",
+        "Ratantia",
+        "RealLifeTrips",
+        "Sandgate",
+        "Secondpuberty",
+        "Seconds",
+        "SlightlyEccentricOrigins",
+        "StardustTheCat",
+        "StrangerThanFiction",
+        "TalamakGreatAdventure",
+        "TheBattalion",
+        "TheDailyProblem",
+        "TheMansionOfE",
+        "ThePainter",
+        "TheSeekers",
+        "TheTrialsOfKlahadOfTheAbyss",
+        "TheStickmen",
+        "ThornsInOurSide",
+        "TopHeavyVeryBustyPinUpsForAdults",
+        "USBUnlimitedSimulatedBody",
+        "TylerHumanRecycler",
+        "UAF",
+        "WhenPigsFly",
+        "YeOldeLegotimeTheatre",
 
-    return num
+        # no content
+        "Angst",
 
+        # images gone
+        "BaseballCapsAndTiaras",
+        "CROSSWORLDSNEXUS",
+        "Fathead",
+        "KevinZombie",
+        "KindergardenCrisIs",
+        "NoSongsForTheDead",
+        "RequiemShadowbornPariah",
+        "TezzleAndZeek",
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    # Sort by page count, so we can abort when we get under some threshold.
-    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
-               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
-               '&all_st=1&all_la=1&page=%d')
-    last_count = 999
-    page = 1
-    print("Parsing search result pages...", file=sys.stderr)
-    while last_count >= MIN_COMICS:
-        last_count = handle_url(baseUrl % page, session, res)
-        page += 1
-        print(last_count, file=sys.stderr, end=" ")
-    save_result(res, json_file)
+        # broken HTML
+        "CrossingOver",
 
+        # unique html
+        "IKilledTheHero",
+        "PowerOfPower",
+        "Schizmatic",
+        "WakeTheSleepers",
+        "WeightOfEternity",
+    )
 
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-        ("Arcamax/%s" % name).lower(),
-    ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname in names:
-            return scraperclass.getName().lower()
-    return None
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
 
+        count = 999
+        for comicdiv in data.cssselect('div.searchresult'):
+            comiclink = comicdiv.cssselect('h3 a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comiclink.text
 
-def first_lower(x):
-    return x[0].lower()
+            info = comicdiv.cssselect('span.comicinfo')
+            # find out how many images this comic has
+            count = int(info[1].text.strip())
+            # find activity
+            active = info[6].text.strip().lower() == "active"
+            lang = info[7].text.strip().lower()
+            self.add_comic(name, (comicurl, active, lang), count)
+        return count
 
 
-def print_results(args):
-    """Print all comics that have at least the given number of minimum
-    comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, entry in sorted(data.items(), key=first_lower):
-            url, num, active, lang = entry
-            if name in exclude_comics:
-                fp.write(u"# %s is excluded\n" % name)
-                continue
-            if num < min_comics:
-                continue
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (
-                    truncate_name(name), str(url)))
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by page count, so we can abort when we get under some threshold.
+        baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
+                   'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
+                   'all_ge=1&all_st=1&all_la=1&page=%d')
+        last_count = 999
+        page = 1
+        print("Parsing search result pages...", file=sys.stderr)
+        while last_count >= self.MIN_COMICS:
+            last_count = self.handle_url(baseUrl % page)
+            page += 1
+            print(last_count, file=sys.stderr, end=" ")
 
+    def get_classdef(self, name, entry):
+        url, active, lang = entry
+        return u"class CF%s(_ComicFury):\n    url = %r" % (name, url)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ComicFuryUpdater(__file__).run()
diff --git a/scripts/comicgenesis.py b/scripts/comicgenesis.py
index d3fc969ce..ee0b59792 100755
--- a/scripts/comicgenesis.py
+++ b/scripts/comicgenesis.py
@@ -18,7 +18,7 @@ import requests
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
 
@@ -435,8 +435,8 @@ def has_comic(name):
         ("Creators/%s" % name).lower(),
         ("GoComics/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False
diff --git a/scripts/creators.py b/scripts/creators.py
index e7bfcae02..dd1de7408 100755
--- a/scripts/creators.py
+++ b/scripts/creators.py
@@ -8,88 +8,37 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
-
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    'Doodles',  # no images
-]
+from scriptutil import ComicListUpdater
 
 
-def handle_url(url, session, res):
-    """Parse one listing page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+class CreatorsUpdater(ComicListUpdater):
+    dup_templates = ('GoComics/%s',)
 
-    for comicdiv in data.cssselect('ul.all-test li'):
-        comiclink = comicdiv.cssselect('a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comicdiv.cssselect('p strong')[0].text)
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
+    # names of comics to exclude
+    excluded_comics = (
+        # no images
+        'Doodles',
+    )
 
-        res[name] = comicurl.rsplit('/', 1)[1]
+    def handle_url(self, url):
+        """Parse one listing page."""
+        data = self.get_url(url)
 
+        for comicdiv in data.cssselect('ul.all-test li'):
+            comiclink = comicdiv.cssselect('a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comicdiv.cssselect('p strong')[0].text
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    sess = requests.Session()
-    handle_url('https://www.creators.com/categories/comics/all', sess, res)
-    handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
-    save_result(res, json_file)
+            self.add_comic(name, comicurl.rsplit('/', 1)[1])
 
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('https://www.creators.com/categories/comics/all')
+        self.handle_url('https://www.creators.com/categories/cartoons/all')
 
-def has_gocomics_comic(name):
-    """Test if comic name already exists."""
-    cname = "Gocomics/%s" % name
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname == cname.lower():
-            return True
-    return False
-
-
-def print_results(args):
-    """Print comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, path in sorted(load_result(json_file).items()):
-            lang = 'Es' if name.lower().endswith('spanish') else ''
-            if has_gocomics_comic(name):
-                fp.write(u'# %s has a duplicate in gocomics\n' %
-                         truncate_name(name))
-            else:
-                fp.write(u"class %s(_Creators%s):\n    path = %r\n\n\n" %
-                         (truncate_name(name), lang, path))
-
+    def get_classdef(self, name, data):
+        lang = 'Es' if name.lower().endswith('spanish') else ''
+        return u"class %s(_Creators%s):\n    path = %r" % (name, lang, data)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    CreatorsUpdater(__file__).run()
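The comicgenesis.py hunks above (and the keenspot.py and smackjeeves.py hunks below) carry the same mechanical API migration: the plugin registry now hands out scraper instances with a plain `name` attribute instead of classes with a `getName()` method. A minimal self-contained sketch of the new-style lookup, assuming the usual `sys.path` setup these scripts perform (the `names` value is illustrative):

```python
from dosagelib.scraper import get_scrapers


def has_comic(names):
    """Return True if a registered scraper matches one of the names."""
    # old style:  for scraperclass in get_scraperclasses():
    # old style:      lname = scraperclass.getName().lower()
    for scraperobj in get_scrapers():
        lname = scraperobj.name.lower()
        if lname in names:
            return True
    return False
```
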
diff --git a/scripts/gocomics.py b/scripts/gocomics.py
index 5098e85d9..994bdda70 100755
--- a/scripts/gocomics.py
+++ b/scripts/gocomics.py
@@ -9,86 +9,45 @@ processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import sys
-import os
-
-import requests
-from lxml import html
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from scriptutil import contains_case_insensitive, format_name, save_result, load_result, truncate_name
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    # "coming soon"
-    "Angryprogrammer",
-    "Guinness",
-    "Jabberwoncky",
-    "RandysRationale"
-    "SignsOfOurTimes",
-    "TheGagwriter",
-    "Yaoyao",
-
-    # duplicate
-    "SaturdayMorningBreakfastCereal",
-]
+from scriptutil import ComicListUpdater
 
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+class GoComicsUpdater(ComicListUpdater):
+    # names of comics to exclude
+    excluded_comics = [
+        # "coming soon"
+        "Angryprogrammer",
+        "Guinness",
+        "Jabberwoncky",
+        "RandysRationale",
+        "SignsOfOurTimes",
+        "TheGagwriter",
+        "Yaoyao",
 
-    for comiclink in data.cssselect('a.alpha_list'):
-        link = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = link
+        # duplicate
+        "SaturdayMorningBreakfastCereal",
+    ]
 
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
 
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> uri}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.gocomics.com/features', session, res)
-    handle_url('http://www.gocomics.com/explore/espanol', session, res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
-    save_result(res, json_file)
+        for comiclink in data.cssselect('a.alpha_list'):
+            link = comiclink.attrib['href']
+            name = comiclink.text
+            self.add_comic(name, link)
 
+    def collect_results(self):
+        """Parse all listing pages."""
+        self.handle_url('http://www.gocomics.com/features')
+        self.handle_url('http://www.gocomics.com/explore/espanol')
+        self.handle_url('http://www.gocomics.com/explore/editorial_list')
+        self.handle_url('http://www.gocomics.com/explore/sherpa_list')
 
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics that have at least the given minimum number of comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, uri in sorted(data.items(), key=first_lower):
-            if name in exclude_comics:
-                print("Excluded " + name)
-                continue
-            fp.write(u"\n\nclass GC%s(_GoComics%s):\n    path = %r\n" % (
-                truncate_name(name), 'Es' if 'espanol/' in uri else '',
-                uri[1:]))
+    def get_classdef(self, name, url):
+        return u"class GC%s(_GoComics%s):\n    path = %r" % (
+            name, 'Es' if 'espanol/' in url else '', url[1:])
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    GoComicsUpdater(__file__).run()
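`GoComicsUpdater.get_classdef` picks the `_GoComicsEs` base for entries collected from the espanol listing and drops the leading slash from the stored link. An illustrative result (the comic name and path are hypothetical, not crawled values):

```python
# For a cached entry ("Baldo", u'/espanol/baldoespanol') the script
# would append something like:

class GCBaldo(_GoComicsEs):
    path = u'espanol/baldoespanol'
```
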
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index ada3f92ca..3891c82b6 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -18,7 +18,7 @@ import requests
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)
@@ -108,8 +108,8 @@ def has_comic(name):
         ("GoComics/%s" % name).lower(),
         ("ComicGenesis/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False
diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py
index ca72d9ce9..d38192f31 100644
--- a/scripts/scriptutil.py
+++ b/scripts/scriptutil.py
@@ -5,11 +5,114 @@
 from __future__ import absolute_import, division, print_function
 
+import os
 import re
+import sys
 import json
 import codecs
 
-from dosagelib.util import unescape
+import requests
+from lxml import html
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
+
+from dosagelib.util import unescape, get_page
+from dosagelib import scraper
+
+
+def first_lower(x):
+    return x[0].lower()
+
+
+class ComicListUpdater(object):
+    dup_templates = ()
+    excluded_comics = ()
+
+    def __init__(self, name):
+        self.json = name.replace(".py", ".json")
+        self.session = requests.Session()
+
+    def get_url(self, url):
+        """Get an HTML page and parse it with LXML."""
+        print("Parsing", url, file=sys.stderr)
+        try:
+            data = html.document_fromstring(get_page(url, self.session).text)
+            data.make_links_absolute(url)
+            return data
+        except IOError as msg:
+            print("ERROR:", msg, file=sys.stderr)
+            raise
+
+    def should_skip(self, name):
+        if name in self.excluded_comics:
+            return True
+        if contains_case_insensitive(self.res, name):
+            # we cannot handle two comics that only differ in case
+            print("INFO: skipping possible duplicate", repr(name),
+                  file=sys.stderr)
+            return True
+        return False
+
+    def get_results(self):
+        """Collect comics and save dictionary in JSON file."""
+        self.res = {}
+        self.collect_results()
+
+        if not self.res:
+            print("ERROR:", "did not match any comics", file=sys.stderr)
+            return
+
+        with codecs.open(self.json, 'wb', 'utf-8') as f:
+            json.dump(self.res, f, sort_keys=True, indent=2,
+                      separators=(',', ': '))
+
+    def add_comic(self, name, data, count=None):
+        """Add a collected comic with a specific number of comics."""
+        name = format_name(name)
+        if not self.should_skip(name):
+            self.res[name] = {'count': count, 'data': data}
+
+    def collect_results(self):
+        raise NotImplementedError
+
+    def print_results(self, args):
+        """Print all comics that have at least the given minimum number
+        of comic strips."""
+        min_comics, filename = args
+        min_comics = int(min_comics)
+        with codecs.open(filename, 'a', 'utf-8') as fp:
+            with codecs.open(self.json, 'rb', 'utf-8') as f:
+                data = json.load(f)
+            for name, entry in sorted(data.items(), key=first_lower):
+                count = entry['count']
+                if count and count < min_comics:
+                    continue
+                dup = self.find_dups(name)
+                if dup is not None:
+                    fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
+                else:
+                    fp.write(u"\n\n%s\n" %
+                             self.get_classdef(truncate_name(name),
+                                               entry['data']))
+
+    def find_dups(self, name):
+        """Check if comic name already exists."""
+        names = [(tmpl % name).lower() for tmpl in self.dup_templates]
+        if names:
+            for scraperobj in scraper.get_scrapers():
+                lname = scraperobj.name.lower()
+                if lname in names or lname == name.lower():
+                    return scraperobj.name
+        return None
+
+    def get_classdef(self, name, data):
+        raise NotImplementedError
+
+    def run(self):
+        if len(sys.argv) > 1:
+            self.print_results(sys.argv[1:])
+        else:
+            self.get_results()
 
 
 def contains_case_insensitive(adict, akey):
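Taken together, `ComicListUpdater` reduces a new site script to four pieces: optional `dup_templates`/`excluded_comics` class data, a `collect_results()` that calls `get_url()` and `add_comic()`, and a `get_classdef()`. A minimal sketch of a new updater against this base class — the site URL, CSS selector, and class names are hypothetical:

```python
from scriptutil import ComicListUpdater


class ExampleUpdater(ComicListUpdater):
    # report a duplicate instead of emitting a class if GoComics
    # already hosts a comic with the same name
    dup_templates = ("GoComics/%s",)

    def collect_results(self):
        # one listing page; add_comic() handles name formatting,
        # exclusions and case-insensitive duplicate detection
        data = self.get_url('http://comics.example.com/all')
        for link in data.cssselect('a.comic'):
            self.add_comic(link.text, link.attrib['href'])

    def get_classdef(self, name, data):
        return u"class %s(_Example):\n    url = %r" % (name, data)


if __name__ == '__main__':
    ExampleUpdater(__file__).run()
```
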
diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index 9761aeba9..0cc36e320 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -22,7 +22,7 @@ import requests
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
 from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
 
@@ -328,8 +328,8 @@ def get_results():
 def has_comic(name):
     """Check if comic name already exists."""
     cname = name.lower()
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname == cname:
             return True
     return False
""" from __future__ import absolute_import, division, print_function -import codecs -import sys -import os -import requests -from lxml import html - -sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa -from dosagelib.util import get_page -from scriptutil import (save_result, load_result, truncate_name, format_name) - -json_file = __file__.replace(".py", ".json") +from scriptutil import ComicListUpdater -def find_first(session, url): - print("Parsing", url, file=sys.stderr) - try: - data = html.document_fromstring(get_page(url, session).text) - data.make_links_absolute(url) - except IOError as msg: - print("ERROR:", msg, file=sys.stderr) - return url - firstlinks = data.cssselect('a.comic-nav-first') - if not firstlinks: - print("INFO No first link on »%s«, already first page?" % (url)) - return url - return firstlinks[0].attrib['href'] +class WebComicFactoryUpdater(ComicListUpdater): + def find_first(self, url): + data = self.get_url(url) -def get_results(): - """Parse start page for supported comics.""" - res = {} - url = 'http://www.thewebcomicfactory.com/' - session = requests.Session() - print("Parsing", url, file=sys.stderr) - try: - data = html.document_fromstring(get_page(url, session).text) - data.make_links_absolute(url) - except IOError as msg: - print("ERROR:", msg, file=sys.stderr) - return {} + firstlinks = data.cssselect('a.comic-nav-first') + if not firstlinks: + print("INFO:", "No first link on »%s«, already first page?" % + (url)) + return url + return firstlinks[0].attrib['href'] - for comicdiv in data.cssselect('div.ceo_thumbnail_widget'): - comicname = comicdiv.cssselect('h2')[0] - comiclink = comicdiv.cssselect('a')[0] - comicurl = comiclink.attrib['href'] - name = format_name(comicname.text) - if 'comic-color-key' in comicurl: - continue - comicurl = find_first(session, comicurl) - res[name] = comicurl + def collect_results(self): + """Parse start page for supported comics.""" + url = 'http://www.thewebcomicfactory.com/' + data = self.get_url(url) - save_result(res, json_file) + for comicdiv in data.cssselect('div.ceo_thumbnail_widget'): + comicname = comicdiv.cssselect('h2')[0] + comiclink = comicdiv.cssselect('a')[0] + comicurl = comiclink.attrib['href'] + name = comicname.text + if 'comic-color-key' in comicurl: + continue + comicurl = self.find_first(comicurl) + self.add_comic(name, comicurl) - -def first_lower(x): - return x[0].lower() - - -def print_results(args): - """Print all comics.""" - min_comics, filename = args - with codecs.open(filename, 'a', 'utf-8') as fp: - data = load_result(json_file) - for name, url in sorted(data.items(), key=first_lower): - fp.write(u"\n\nclass %s(_WebcomicFactory):\n url = %r\n" % ( - truncate_name(name), str(url))) - fp.write(u" firstStripUrl = url\n") + def get_classdef(self, name, url): + return (u"class %s(_WebcomicFactory):\n url = %r\n" % (name, url) + + u" firstStripUrl = url") if __name__ == '__main__': - if len(sys.argv) > 1: - print_results(sys.argv[1:]) - else: - get_results() + WebComicFactoryUpdater(__file__).run()