From 1d2e1f2dd17babb2968be916822a93dfb97e40ad Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Wed, 20 Apr 2016 23:48:29 +0200
Subject: [PATCH] Update SmackJeeves update helper.

Don't use it right now; it adds a HUGE number of comics.
---
 scripts/smackjeeves.py | 479 +++++++++++++----------------------------
 1 file changed, 148 insertions(+), 331 deletions(-)

diff --git a/scripts/smackjeeves.py b/scripts/smackjeeves.py
index 0cc36e320..04a1ba32a 100755
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -9,354 +9,171 @@
 for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import re
 import sys
-import os
 
 try:
-    from urllib.parse import urljoin
+    from urllib.parse import urlsplit
 except ImportError:
-    from urlparse import urljoin
+    from urlparse import urlsplit
 
-import requests
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
-from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+from scriptutil import ComicListUpdater
 
 
-json_file = __file__.replace(".py", ".json")
+class SmackJeevesUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restricts search space)
+    MIN_COMICS = 90
+    ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
 
-# names of comics to exclude
-exclude_comics = [
-    "4plyKamalsHead", # does not follow standard layout
-    "9Lives", # missing images
-    "ADifferentPerspective", # does not follow standard layout
-    "AFairlyTwistedRealitySuper", # does not follow standard layout
-    "Ahoge", # does not follow standard layout
-    "AngelJunkPileFelix", # images are 403 forbidden
-    "AntavioussGenLab", # images are 403 forbidden
-    "AreyougayJohnny", # does not follow standard layout
-    "Authorbattlesthevideogame", # missing images
-    "BambooArmonicKnightsGuild", # missing previous link
-    "BassLegends", # does not follow standard layout
-    "BreIshurnasspritesandstuff", # comic moved
-    "CatboyattheCon", # missing images
-    "Comatose", # does not follow standard layout
-    "ContraandtheSpamDump", # missing images
-    "ClubLove", # does not follow standard layout
-    "Darkkyosshorts", # missing images
-    "DeSTRESS", # does not follow standard layout
-    "DollarStoreCaviar", # broken images
-    "DreamCatcher", # does not follow standard layout
-    "EdgeofDecember", # missing images
-    "FroakieShocaiz", # unsuitable navigation
-    "Fumiko", # does not follow standard layout
-    "FurryExperience", # timeout
-    "GART", # does not follow standard layout
-    "GarytheAlchemist", # does not follow standard layout
-    "GBAsCrib", # timeout
-    "HAndJ", # missing images
-    "HEARD", # missing images
-    "Indigo", # broken domain name
-    "IwillbenapoSpamDump", # missing images
-    "ItoshiisCrazyNuzlockeAdventures", # does not follow standard layout
-    "JennyHaniver", # does not follow standard layout
-    "KiLAiLO", # does not follow standard layout
-    "KirbysoftheAlternateDimension", # missing images
-    "Letsreviewshallwe", # missing images
-    "LoudEra", # does not follow standard layout
-    "LunarHill", # does not follow standard layout
-    "Mafiagame", # does not follow standard layout
-    "MegaManSpriteExpo", # missing images
-    "MyLifewithFelENESPANOL", # does not follow standard layout
-    "MylifewithFel", # does not follow standard layout
-    "NegativeZen", # does not follow standard layout
-    "Nemutionpobae", # does not follow standard layout
-    "NightShot", # does not follow standard layout
-    "NormalIsBoring", # does not follow standard layout
-    "Okamirai", # images are 403 forbidden
-    "OmnisSpriteShowcase", # missing images
-    "OpticalDisarray", # does not follow standard layout
-    "PicturesofYou", # does not follow standard layout
-    "PiecesofBrokenGlass", # broken images
-    "PlatonicManagementDilemma", # missing images
-    "Pornjunkiesstrip", # does not follow standard layout
-    "PrettyUgly", # does not follow standard layout
-    "Project217", # does not follow standard layout
-    "RemmyzRandomz", # does not follow standard layout
-    "Ribon", # does not follow standard layout
-    "RubysWorld", # does not follow standard layout
-    "SecretSanta2011", # missing images
-    "ShinkaTheLastEevee", # does not follow standard layout
-    "SimplePixel", # does not follow standard layout
-    "SJArtCollab", # missing images
-    "SladesMansionofawesomeness", # does not follow standard layout
-    "SlightlyDifferent", # missing images
-    "SpaceSchool", # does not follow standard layout
-    "SushiGummy", # does not follow standard layout
-    "TheAfterSubtract", # does not follow standard layout
-    "ThePokemonArtBox", # does not follow standard layout
-    "THEVOIDWEBCOMIC", # does not follow standard layout
-    "TC2KsPokemobians", # does not follow standard layout
-    "ThreadCrashers", # has no previous comic link
-    "ToDefeatThemAll", # does not follow standard layout
-    "TotallyKotor", # missing images
-    "TwoKeys", # does not follow standard layout
-    "Vbcomics", # does not follow standard layout
-    "WerewolfRichard", # does not follow standard layout
-    "WinterMelody", # missing images
-]
+    # names of comics to exclude
+    excluded_comics = (
+        # comic moved/we have a better module
+        "Amya",
+        "Carciphona",
+        "Footloose",
+        "TitleUnrelated",
+
+        # does not follow standard layout
+        "300DaysOfSyao",
+        "ADifferentPerspective",
+        "Captor",
+        "ClubLove",
+        "Comatose",
+        "DeSTRESS",
+        "DreamCatcher",
+        "Fumiko",
+        "GART",
+        "GarytheAlchemist",
+        "ItoshiisCrazyNuzlockeAdventures",
+        "JennyHaniver",
+        "KiLAiLO",
+        "LoudEra",
+        "LunarHill",
+        "Mafiagame",
+        "MylifewithFel",
+        "MyLifewithFelENESPANOL",
+        "NegativeZen",
+        "Nemutionpobae",
+        "NightShot",
+        "NormalIsBoring",
+        "OpticalDisarray",
+        "PicturesofYou",
+        "Pornjunkiesstrip",
+        "PrettyUgly",
+        "Project217",
+        "RemmyzRandomz",
+        "Ribon",
+        "RubysWorld",
+        "ShinkaTheLastEevee",
+        "SimplePixel",
+        "SladesMansionofawesomeness",
+        "SpaceSchool",
+        "SushiGummy",
+        "TC2KsPokemobians",
+        "TheAfterSubtract",
+        "ThePokemonArtBox",
+        "THEVOIDWEBCOMIC",
+        "ToDefeatThemAll",
+        "TwoKeys",
+        "Vbcomics",
+        "WerewolfRichard",
 
-# the latest URL of some comics repeats the previous URL
-# flag this so the bounceStart uses the correct URL
-repeat_comics = [
-    "1009sSpritersVacation",
-    "22Special22Care",
-    "2Kingdoms",
-    "2Masters",
-    "AbbimaysRandomness",
-    "AdaLeeComesOn",
-    "AdventuresofMitch",
-    "AkumaKisei",
-    "ALaMode",
-    "AnimalLoversYuriCollab",
-    "Area9",
-    "AStrangeTypeofLove",
-    "Autophobia",
-    "BearlyAbel",
-    "BeCarefreeWithMeSoon",
-    "BlindandBlue",
-    "BlueStreak",
-    "BlueWell",
-    "BlueYonder",
-    "Border",
-    "BoyLessons",
-    "Boywithasecret",
-    "BreakFreemagazine",
-    "BrightStars",
-    "ByTheBook",
-    "ClairetheFlare",
-    "CloeRemembrance",
-    "ComicFullofSprites",
-    "CrappilyDrawnMinicomics",
-    "CupidsaMoron",
-    "D00R",
-    "DeathNoteIridescent",
-    "DemonEater",
-    "DenizensAttention",
-    "DevilsCake",
-    "Dreamcatchers",
-    "EmeraldNuzlocke",
-    "EonsAgo",
-    "ERRORERROR",
-    "EvilPlan",
-    "FailureConfetti",
-    "FlyorFail",
-    "ForestHill",
-    "FrobertTheDemon",
-    "GarytheAlchemist",
-    "GhostsTaleACrossover",
-    "Glasshearts",
-    "GoldenSunGenerationsAftermathVolume1",
-    "GoldenSunGenerationsColossoVolume6",
-    "GuardiansoftheGalaxialSpaceways",
-    "HatShop",
-    "HDMTHCOMICS",
-    "Helix",
-    "Hephaestus",
-    "HolyBlasphemy",
-    "HopeForABreeze",
-    "Hotarugari",
-    "InsideOuTAYuriTale",
-    "Insomanywords",
-    "INUSITADOONLINE",
-    "ItsCharacterDevelopment",
-    "JosephAndYusra",
-    "JustAnotherDay",
-    "KasaKeira",
-    "KirbyAdventure",
-    "KirbyandtheDarkKnight",
-    "KirbyFunfestTheOriginals",
-    "KirbysofTHEVOID",
-    "KuroiHitsuji",
-    "KuroShouri",
-    "LandoftheSky",
-    "LeCirquedObscure",
-    "LethalDose",
-    "LOGOS",
-    "LostLove",
-    "LsEmpire",
-    "MariovsSonicvsMegaMan",
-    "Mega",
-    "MementoMori",
-    "Mokepon",
-    "MrGrimmsCircusofHorrors",
-    "MyFakeHeart",
-    "MyFriendScotty",
-    "MYth",
-    "NemesisKatharsis",
-    "NiceKitty",
-    "Nutshel",
-    "OptimalClutter",
-    "Panacea",
-    "PhilosophicalPenisJokes",
-    "PrettyUgly",
-    "PSY",
-    "PTO",
-    "RainLGBT",
-    "ReidyandFriendsShowcase",
-    "RubysWorld",
-    "SallySprocketAndPistonPete",
-    "SimonSues",
-    "SimpleBear",
-    "SmallPressAdventures",
-    "SonicWorldAdventure",
-    "SoulGuardian",
-    "SPOON",
-    "STASonictheAdventure",
-    "Stay",
-    "StellaInChrome",
-    "StrangersandFriends",
-    "SunmeetsMoon",
-    "TAG",
-    "TaikiTheWebcomic",
-    "TechnicolorLondon",
-    "TEN",
-    "ThatWasntThereYesterday",
-    "TheAntihero",
-    "TheBrideoftheShark",
-    "TheCafedAlizee",
-    "TheEssyaneWarriors",
-    "ThehumanBEing",
-    "TheKwiddexProtocol",
-    "TheLegendofZeldaMaidenoftheMoon",
-    "ThePirateBalthasar",
-    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
-    "TheReborn",
-    "TheTytonNuzlockeChallengeEmeraldEdition",
-    "ToD",
-    "TPTruePower",
-    "TwoKeys",
-    "UndertheSkin",
-    "WelcometoFreakshow",
-    "Whenweweresilent",
-    "WhiteHeart",
-    "Yaoishereforareason",
-    "Zodiac",
-]
+        # has no previous comic link
+        "ThreadCrashers",
+        "AchievementStuck",
+
+        # images are 403 forbidden
+        "AngelJunkPileFelix",
+        "AntavioussGenLab",
+        "Okamirai",
 
-# links to last valid strips
-url_overrides = {
-}
+        # missing images
+        "CatboyattheCon",
+        "ContraandtheSpamDump",
+        "Darkkyosshorts",
+        "DollarStoreCaviar",
+        "EdgeofDecember",
+        "HAndJ",
+        "HEARD",
+        "IwillbenapoSpamDump",
+        "KirbysoftheAlternateDimension",
+        "Letsreviewshallwe",
+        "MegaManSpriteExpo",
+        "OmnisSpriteShowcase",
+        "PiecesofBrokenGlass",
+        "PlatonicManagementDilemma",
+        "SecretSanta2011",
+        "SerendipityAnEquestrianTale",
+        "SJArtCollab",
+        "SlightlyDifferent",
+        "TheAttackoftheRecoloursSeason1",
+        "TotallyKotor",
+        "WinterMelody",
+        "ZonowTheHedgehog",
+
+        # missing previous link
+        "BambooArmonicKnightsGuild",
 
-# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
-                                after="site_banner") +
-                          tagre("img", "title", r'([^"]+)'))
-url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
-num_matcher = re.compile(r'50%">\s+(\d+)\s+')
-adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
+        # broken host name
+        "Razor",
+    )
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
 
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in page_matcher.finditer(data):
-        page_url = match.group(1)
-        page_url = urljoin(url, page_url)
-        name = format_name(match.group(2))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end + 300]),
-                  file=sys.stderr)
-            continue
-        num = int(mo.group(1))
-        # search for url in extra page
-        print("Getting", page_url)
-        try:
-            data2 = get_page(page_url, session).text
-        except IOError as msg:
-            print("ERROR:", msg, file=sys.stderr)
-            return
-        mo = url_matcher.search(data2)
-        if not mo:
-            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
-            continue
-        comic_url = mo.group(1)
-        # search for adult flag
-        adult = adult_matcher.search(data2[end:])
-        bounce = name not in repeat_comics
-        res[name] = [
-            url_overrides.get(name, comic_url), num, bool(adult), bounce
-        ]
-
-
-def get_results():
-    """Parse all search result pages."""
-    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
-    session = requests.Session()
-    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
-    res = {}
-    # a search for an empty string returned 286 result pages
-    result_pages = 286
-    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
-    for i in range(0, result_pages):
-        print(i + 1, file=sys.stderr, end=" ")
-        handle_url(base % (i * 12), session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    cname = name.lower()
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname == cname:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url, num, adult, bounce = entry
-            if num < min_comics:
-                continue
-            if has_comic(name):
-                prefix = u'#'
+        num = 999
+        for comicdiv in data.cssselect(
+                'div#webcomic_search_results div.full_banner_div'):
+            page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
+            name = comicdiv.cssselect('img.banny')
+            if name:
+                name = name[0].attrib['title']
             else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
-                prefix, str(truncate_name(name)), str(url), adult, bounce
-            ))
+                name = comicdiv.cssselect('h2')[0].text
+            # find out how many images this comic has
+            mo = comicdiv.cssselect('span.small-meter')
+            if not mo:
+                print("ERROR matching number of comics", file=sys.stderr)
+                continue
+            num = int(mo[0].text.strip())
+            # search for url in extra page
+            data2 = self.get_url(page_url)
+            mo = data2.cssselect('div#quick_reading_links a:last-child')
+            if not mo:
+                print("ERROR matching comic URL", file=sys.stderr)
+                continue
+            comic_url = mo[0].attrib['href']
+            # search for adult flag
+            adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
+            self.add_comic(name, (comic_url, bool(adult)), num)
+        next_url = data.cssselect(
+            "div.search_nav td:last-child a")[0].attrib['href']
+        return (next_url, num)
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by number of comics, so we can abort when we get under some
+        # threshold.
+        next_url = (
+            "http://www.smackjeeves.com/search.php?submit=1" +
+            "&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
+            "&last_update=6&style_all=on&genre_all=on&format_all=on")
+        last_count = 999
+        while last_count >= self.MIN_COMICS:
+            print(last_count, file=sys.stderr, end=" ")
+            next_url, last_count = self.handle_url(next_url)
+
+    def get_classdef(self, name, data):
+        sub, top = urlsplit(data[0]).hostname.split('.', 1)
+        cl = u"class SJ%s(_SmackJeeves):" % name
+        if top.lower() == "smackjeeves.com":
+            cl += "\n    sub = '%s'" % sub
+        else:
+            cl += "\n    host = '%s.%s'" % (sub, top)
+        if data[1]:
+            cl += "\n    adult = True"
+        return cl
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    SmackJeevesUpdater(__file__).run()
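
For reference, the get_classdef() method added above emits one Dosage module class definition per comic found. A hypothetical sketch of its output, assuming one comic hosted on a smackjeeves.com subdomain with the adult flag set and one hosted on a custom domain (the comic names and hosts below are illustrative, not taken from an actual run):

    # get_classdef("ExampleComic", ("http://examplecomic.smackjeeves.com/comics/", True))
    # would return the source for:
    class SJExampleComic(_SmackJeeves):
        sub = 'examplecomic'
        adult = True

    # get_classdef("OtherComic", ("http://www.othercomic.com/comics/", False))
    # would return the source for:
    class SJOtherComic(_SmackJeeves):
        host = 'www.othercomic.com'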