Update SmackJeeves update helper.

Don't use it right now: it adds a HUGE number of comics.
2016-04-20 23:48:29 +02:00 · 2016-04-20 23:48:29 +02:00 · 1d2e1f2dd1
commit 1d2e1f2dd1
parent fe51a449df
1 changed files with 148 additions and 331 deletions
--- a/scripts/smackjeeves.py
+++ b/scripts/smackjeeves.py
@@ -9,354 +9,171 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import re
 import sys
-import os
 try:
-    from urllib.parse import urljoin
+    from urllib.parse import urlsplit
 except ImportError:
-    from urlparse import urljoin
+    from urlparse import urlsplit

-import requests
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
-from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+from scriptutil import ComicListUpdater


-json_file = __file__.replace(".py", ".json")
+class SmackJeevesUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90

+    ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'

-# names of comics to exclude
-exclude_comics = [
-    "4plyKamalsHead", # does not follow standard layout
-    "9Lives", # missing images
-    "ADifferentPerspective", # does not follow standard layout
-    "AFairlyTwistedRealitySuper", # does not follow standard layout
-    "Ahoge", # does not follow standard layout
-    "AngelJunkPileFelix", # images are 403 forbidden
-    "AntavioussGenLab", # images are 403 forbidden
-    "AreyougayJohnny", # does not follow standard layout
-    "Authorbattlesthevideogame", # missing images
-    "BambooArmonicKnightsGuild", # missing previous link
-    "BassLegends", # does not follow standard layout
-    "BreIshurnasspritesandstuff", # comic moved
-    "CatboyattheCon", # missing images
-    "Comatose", # does not follow standard layout
-    "ContraandtheSpamDump", # missing images
-    "ClubLove", # does not follow standard layout
-    "Darkkyosshorts", # missing images
-    "DeSTRESS", # does not follow standard layout
-    "DollarStoreCaviar", # broken images
-    "DreamCatcher", # does not follow standard layout
-    "EdgeofDecember", # missing images
-    "FroakieShocaiz", # unsuitable navigation
-    "Fumiko", # does not follow standard layout
-    "FurryExperience", # timeout
-    "GART", # does not follow standard layout
-    "GarytheAlchemist", # does not follow standard layout
-    "GBAsCrib", # timeout
-    "HAndJ", # missing images
-    "HEARD", # missing images
-    "Indigo", # broken domain name
-    "IwillbenapoSpamDump", # missing images
-    "ItoshiisCrazyNuzlockeAdventures", # does not follow standard layout
-    "JennyHaniver", # does not follow standard layout
-    "KiLAiLO", # does not follow standard layout
-    "KirbysoftheAlternateDimension", # missing images
-    "Letsreviewshallwe", # missing images
-    "LoudEra", # does not follow standard layout
-    "LunarHill", # does not follow standard layout
-    "Mafiagame", # does not follow standard layout
-    "MegaManSpriteExpo", # missing images
-    "MyLifewithFelENESPANOL", # does not follow standard layout
-    "MylifewithFel", # does not follow standard layout
-    "NegativeZen", # does not follow standard layout
-    "Nemutionpobae", # does not follow standard layout
-    "NightShot", # does not follow standard layout
-    "NormalIsBoring", # does not follow standard layout
-    "Okamirai", # images are 403 forbidden
-    "OmnisSpriteShowcase", # missing images
-    "OpticalDisarray", # does not follow standard layout
-    "PicturesofYou", # does not follow standard layout
-    "PiecesofBrokenGlass", # broken images
-    "PlatonicManagementDilemma", # missing images
-    "Pornjunkiesstrip", # does not follow standard layout
-    "PrettyUgly", # does not follow standard layout
-    "Project217", # does not follow standard layout
-    "RemmyzRandomz", # does not follow standard layout
-    "Ribon", # does not follow standard layout
-    "RubysWorld", # does not follow standard layout
-    "SecretSanta2011", # missing images
-    "ShinkaTheLastEevee", # does not follow standard layout
-    "SimplePixel", # does not follow standard layout
-    "SJArtCollab", # missing images
-    "SladesMansionofawesomeness", # does not follow standard layout
-    "SlightlyDifferent", # missing images
-    "SpaceSchool", # does not follow standard layout
-    "SushiGummy", # does not follow standard layout
-    "TheAfterSubtract", # does not follow standard layout
-    "ThePokemonArtBox", # does not follow standard layout
-    "THEVOIDWEBCOMIC", # does not follow standard layout
-    "TC2KsPokemobians", # does not follow standard layout
-    "ThreadCrashers", # has no previous comic link
-    "ToDefeatThemAll", # does not follow standard layout
-    "TotallyKotor", # missing images
-    "TwoKeys", # does not follow standard layout
-    "Vbcomics", # does not follow standard layout
-    "WerewolfRichard", # does not follow standard layout
-    "WinterMelody", # missing images
-]
+    # names of comics to exclude
+    excluded_comics = (
+        # comic moved/we have a better module
+        "Amya",
+        "Carciphona",
+        "Footloose",
+        "TitleUnrelated",

-
-# the latest URL of some comics repeats the previous URL
-# flag this so the bounceStart uses the correct URL
-repeat_comics = [
-    "1009sSpritersVacation",
-    "22Special22Care",
-    "2Kingdoms",
-    "2Masters",
-    "AbbimaysRandomness",
-    "AdaLeeComesOn",
-    "AdventuresofMitch",
-    "AkumaKisei",
-    "ALaMode",
-    "AnimalLoversYuriCollab",
-    "Area9",
-    "AStrangeTypeofLove",
-    "Autophobia",
-    "BearlyAbel",
-    "BeCarefreeWithMeSoon",
-    "BlindandBlue",
-    "BlueStreak",
-    "BlueWell",
-    "BlueYonder",
-    "Border",
-    "BoyLessons",
-    "Boywithasecret",
-    "BreakFreemagazine",
-    "BrightStars",
-    "ByTheBook",
-    "ClairetheFlare",
-    "CloeRemembrance",
-    "ComicFullofSprites",
-    "CrappilyDrawnMinicomics",
-    "CupidsaMoron",
-    "D00R",
-    "DeathNoteIridescent",
-    "DemonEater",
-    "DenizensAttention",
-    "DevilsCake",
-    "Dreamcatchers",
-    "EmeraldNuzlocke",
-    "EonsAgo",
-    "ERRORERROR",
-    "EvilPlan",
-    "FailureConfetti",
-    "FlyorFail",
-    "ForestHill",
-    "FrobertTheDemon",
+        # does not follow standard layout
+        "300DaysOfSyao",
+        "ADifferentPerspective",
+        "Captor",
+        "ClubLove",
+        "Comatose",
+        "DeSTRESS",
+        "DreamCatcher",
+        "Fumiko",
+        "GART",
        "GarytheAlchemist",
-    "GhostsTaleACrossover",
-    "Glasshearts",
-    "GoldenSunGenerationsAftermathVolume1",
-    "GoldenSunGenerationsColossoVolume6",
-    "GuardiansoftheGalaxialSpaceways",
-    "HatShop",
-    "HDMTHCOMICS",
-    "Helix",
-    "Hephaestus",
-    "HolyBlasphemy",
-    "HopeForABreeze",
-    "Hotarugari",
-    "InsideOuTAYuriTale",
-    "Insomanywords",
-    "INUSITADOONLINE",
-    "ItsCharacterDevelopment",
-    "JosephAndYusra",
-    "JustAnotherDay",
-    "KasaKeira",
-    "KirbyAdventure",
-    "KirbyandtheDarkKnight",
-    "KirbyFunfestTheOriginals",
-    "KirbysofTHEVOID",
-    "KuroiHitsuji",
-    "KuroShouri",
-    "LandoftheSky",
-    "LeCirquedObscure",
-    "LethalDose",
-    "LOGOS",
-    "LostLove",
-    "LsEmpire",
-    "MariovsSonicvsMegaMan",
-    "Mega",
-    "MementoMori",
-    "Mokepon",
-    "MrGrimmsCircusofHorrors",
-    "MyFakeHeart",
-    "MyFriendScotty",
-    "MYth",
-    "NemesisKatharsis",
-    "NiceKitty",
-    "Nutshel",
-    "OptimalClutter",
-    "Panacea",
-    "PhilosophicalPenisJokes",
+        "ItoshiisCrazyNuzlockeAdventures",
+        "JennyHaniver",
+        "KiLAiLO",
+        "LoudEra",
+        "LunarHill",
+        "Mafiagame",
+        "MylifewithFel",
+        "MyLifewithFelENESPANOL",
+        "NegativeZen",
+        "Nemutionpobae",
+        "NightShot",
+        "NormalIsBoring",
+        "OpticalDisarray",
+        "PicturesofYou",
+        "Pornjunkiesstrip",
        "PrettyUgly",
-    "PSY",
-    "PTO",
-    "RainLGBT",
-    "ReidyandFriendsShowcase",
+        "Project217",
+        "RemmyzRandomz",
+        "Ribon",
        "RubysWorld",
-    "SallySprocketAndPistonPete",
-    "SimonSues",
-    "SimpleBear",
-    "SmallPressAdventures",
-    "SonicWorldAdventure",
-    "SoulGuardian",
-    "SPOON",
-    "STASonictheAdventure",
-    "Stay",
-    "StellaInChrome",
-    "StrangersandFriends",
-    "SunmeetsMoon",
-    "TAG",
-    "TaikiTheWebcomic",
-    "TechnicolorLondon",
-    "TEN",
-    "ThatWasntThereYesterday",
-    "TheAntihero",
-    "TheBrideoftheShark",
-    "TheCafedAlizee",
-    "TheEssyaneWarriors",
-    "ThehumanBEing",
-    "TheKwiddexProtocol",
-    "TheLegendofZeldaMaidenoftheMoon",
-    "ThePirateBalthasar",
-    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
-    "TheReborn",
-    "TheTytonNuzlockeChallengeEmeraldEdition",
-    "ToD",
-    "TPTruePower",
+        "ShinkaTheLastEevee",
+        "SimplePixel",
+        "SladesMansionofawesomeness",
+        "SpaceSchool",
+        "SushiGummy",
+        "TC2KsPokemobians",
+        "TheAfterSubtract",
+        "ThePokemonArtBox",
+        "THEVOIDWEBCOMIC",
+        "ToDefeatThemAll",
        "TwoKeys",
-    "UndertheSkin",
-    "WelcometoFreakshow",
-    "Whenweweresilent",
-    "WhiteHeart",
-    "Yaoishereforareason",
-    "Zodiac",
-]
+        "Vbcomics",
+        "WerewolfRichard",

+        # has no previous comic link
+        "ThreadCrashers",
+        "AchievementStuck",

-# links to last valid strips
-url_overrides = {
-}
+        # images are 403 forbidden
+        "AngelJunkPileFelix",
+        "AntavioussGenLab",
+        "Okamirai",

+        # missing images
+        "CatboyattheCon",
+        "ContraandtheSpamDump",
+        "Darkkyosshorts",
+        "DollarStoreCaviar",
+        "EdgeofDecember",
+        "HAndJ",
+        "HEARD",
+        "IwillbenapoSpamDump",
+        "KirbysoftheAlternateDimension",
+        "Letsreviewshallwe",
+        "MegaManSpriteExpo",
+        "OmnisSpriteShowcase",
+        "PiecesofBrokenGlass",
+        "PlatonicManagementDilemma",
+        "SecretSanta2011",
+        "SerendipityAnEquestrianTale",
+        "SJArtCollab",
+        "SlightlyDifferent",
+        "TheAttackoftheRecoloursSeason1",
+        "TotallyKotor",
+        "WinterMelody",
+        "ZonowTheHedgehog",

-# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
-                                after="site_banner") +
-                          tagre("img", "title", r'([^"]+)'))
-url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
-num_matcher = re.compile(r'50%">\s+(\d+)\s+')
-adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
+        # missing previous link
+        "BambooArmonicKnightsGuild",

+        # broken host name
+        "Razor",
+    )

-def handle_url(url, session, res):
+    def handle_url(self, url):
        """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in page_matcher.finditer(data):
-        page_url = match.group(1)
-        page_url = urljoin(url, page_url)
-        name = format_name(match.group(2))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end + 300]),
-                  file=sys.stderr)
-            continue
-        num = int(mo.group(1))
-        # search for url in extra page
-        print("Getting", page_url)
-        try:
-            data2 = get_page(page_url, session).text
-        except IOError as msg:
-            print("ERROR:", msg, file=sys.stderr)
-            return
-        mo = url_matcher.search(data2)
-        if not mo:
-            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
-            continue
-        comic_url = mo.group(1)
-        # search for adult flag
-        adult = adult_matcher.search(data2[end:])
-        bounce = name not in repeat_comics
-        res[name] = [
-          url_overrides.get(name, comic_url), num, bool(adult), bounce
-        ]
+        data = self.get_url(url)

-
-def get_results():
-    """Parse all search result pages."""
-    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
-    session = requests.Session()
-    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
-    res = {}
-    # a search for an empty string returned 286 result pages
-    result_pages = 286
-    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
-    for i in range(0, result_pages):
-        print(i + 1, file=sys.stderr, end=" ")
-        handle_url(base % (i * 12), session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    cname = name.lower()
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname == cname:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url, num, adult, bounce = entry
-            if num < min_comics:
-                continue
-            if has_comic(name):
-                prefix = u'#'
+        num = 999
+        for comicdiv in data.cssselect(
+                'div#webcomic_search_results div.full_banner_div'):
+            page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
+            name = comicdiv.cssselect('img.banny')
+            if name:
+                name = name[0].attrib['title']
            else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
-              prefix, str(truncate_name(name)), str(url), adult, bounce
-            ))
+                name = comicdiv.cssselect('h2')[0].text
+            # find out how many images this comic has
+            mo = comicdiv.cssselect('span.small-meter')
+            if not mo:
+                print("ERROR matching number of comics", file=sys.stderr)
+                continue
+            num = int(mo[0].text.strip())
+            # search for url in extra page
+            data2 = self.get_url(page_url)
+            mo = data2.cssselect('div#quick_reading_links a:last-child')
+            if not mo:
+                print("ERROR matching comic URL", file=sys.stderr)
+                continue
+            comic_url = mo[0].attrib['href']
+            # search for adult flag
+            adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
+            self.add_comic(name, (comic_url, bool(adult)), num)

+        next_url = data.cssselect(
+            "div.search_nav td:last-child a")[0].attrib['href']
+        return (next_url, num)
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by number of comics, so we can abort when we get under some
+        # threshold.
+        next_url = (
+            "http://www.smackjeeves.com/search.php?submit=1" +
+            "&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
+            "&last_update=6&style_all=on&genre_all=on&format_all=on")
+        last_count = 999
+        while last_count >= self.MIN_COMICS:
+            print(last_count, file=sys.stderr, end=" ")
+            next_url, last_count = self.handle_url(next_url)
+
+    def get_classdef(self, name, data):
+        sub, top = urlsplit(data[0]).hostname.split('.', 1)
+        cl = u"class SJ%s(_SmackJeeves):" % name
+        if top.lower() == "smackjeeves.com":
+            cl += "\n    sub = '%s'" % sub
+        else:
+            cl += "\n    host = '%s.%s'" % (sub, top)
+        if data[1]:
+            cl += "\n    adult = True"
+        return cl

 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    SmackJeevesUpdater(__file__).run()