#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
"""
from __future__ import absolute_import, division, print_function

import codecs
import os
import re
import sys

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

import requests

# add the parent directory to sys.path so that dosagelib is importable
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa

from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)

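# the crawl results are cached in a JSON file next to this script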
json_file = __file__.replace(".py", ".json")

# names of comics to exclude
exclude_comics = [
    "4plyKamalsHead",  # does not follow standard layout
    "9Lives",  # missing images
    "ADifferentPerspective",  # does not follow standard layout
    "AFairlyTwistedRealitySuper",  # does not follow standard layout
    "Ahoge",  # does not follow standard layout
    "AngelJunkPileFelix",  # images are 403 forbidden
    "AntavioussGenLab",  # images are 403 forbidden
    "AreyougayJohnny",  # does not follow standard layout
    "Authorbattlesthevideogame",  # missing images
    "BambooArmonicKnightsGuild",  # missing previous link
    "BassLegends",  # does not follow standard layout
    "BreIshurnasspritesandstuff",  # comic moved
    "CatboyattheCon",  # missing images
    "Comatose",  # does not follow standard layout
    "ContraandtheSpamDump",  # missing images
    "ClubLove",  # does not follow standard layout
    "Darkkyosshorts",  # missing images
    "DeSTRESS",  # does not follow standard layout
    "DollarStoreCaviar",  # broken images
    "DreamCatcher",  # does not follow standard layout
    "EdgeofDecember",  # missing images
    "FroakieShocaiz",  # unsuitable navigation
    "Fumiko",  # does not follow standard layout
    "FurryExperience",  # timeout
    "GART",  # does not follow standard layout
    "GarytheAlchemist",  # does not follow standard layout
    "GBAsCrib",  # timeout
    "HAndJ",  # missing images
    "HEARD",  # missing images
    "Indigo",  # broken domain name
    "IwillbenapoSpamDump",  # missing images
    "ItoshiisCrazyNuzlockeAdventures",  # does not follow standard layout
    "JennyHaniver",  # does not follow standard layout
    "KiLAiLO",  # does not follow standard layout
    "KirbysoftheAlternateDimension",  # missing images
    "Letsreviewshallwe",  # missing images
    "LoudEra",  # does not follow standard layout
    "LunarHill",  # does not follow standard layout
    "Mafiagame",  # does not follow standard layout
    "MegaManSpriteExpo",  # missing images
    "MyLifewithFelENESPANOL",  # does not follow standard layout
    "MylifewithFel",  # does not follow standard layout
    "NegativeZen",  # does not follow standard layout
    "Nemutionpobae",  # does not follow standard layout
    "NightShot",  # does not follow standard layout
    "NormalIsBoring",  # does not follow standard layout
    "Okamirai",  # images are 403 forbidden
    "OmnisSpriteShowcase",  # missing images
    "OpticalDisarray",  # does not follow standard layout
    "PicturesofYou",  # does not follow standard layout
    "PiecesofBrokenGlass",  # broken images
    "PlatonicManagementDilemma",  # missing images
    "Pornjunkiesstrip",  # does not follow standard layout
    "PrettyUgly",  # does not follow standard layout
    "Project217",  # does not follow standard layout
    "RemmyzRandomz",  # does not follow standard layout
    "Ribon",  # does not follow standard layout
    "RubysWorld",  # does not follow standard layout
    "SecretSanta2011",  # missing images
    "ShinkaTheLastEevee",  # does not follow standard layout
    "SimplePixel",  # does not follow standard layout
    "SJArtCollab",  # missing images
    "SladesMansionofawesomeness",  # does not follow standard layout
    "SlightlyDifferent",  # missing images
    "SpaceSchool",  # does not follow standard layout
    "SushiGummy",  # does not follow standard layout
    "TheAfterSubtract",  # does not follow standard layout
    "ThePokemonArtBox",  # does not follow standard layout
    "THEVOIDWEBCOMIC",  # does not follow standard layout
    "TC2KsPokemobians",  # does not follow standard layout
    "ThreadCrashers",  # has no previous comic link
    "ToDefeatThemAll",  # does not follow standard layout
    "TotallyKotor",  # missing images
    "TwoKeys",  # does not follow standard layout
    "Vbcomics",  # does not follow standard layout
    "WerewolfRichard",  # does not follow standard layout
    "WinterMelody",  # missing images
]

# the latest URL of some comics repeats the previous URL;
# flag this so that bounceStart uses the correct URL
repeat_comics = [
    "1009sSpritersVacation",
    "22Special22Care",
    "2Kingdoms",
    "2Masters",
    "AbbimaysRandomness",
    "AdaLeeComesOn",
    "AdventuresofMitch",
    "AkumaKisei",
    "ALaMode",
    "AnimalLoversYuriCollab",
    "Area9",
    "AStrangeTypeofLove",
    "Autophobia",
    "BearlyAbel",
    "BeCarefreeWithMeSoon",
    "BlindandBlue",
    "BlueStreak",
    "BlueWell",
    "BlueYonder",
    "Border",
    "BoyLessons",
    "Boywithasecret",
    "BreakFreemagazine",
    "BrightStars",
    "ByTheBook",
    "ClairetheFlare",
    "CloeRemembrance",
    "ComicFullofSprites",
    "CrappilyDrawnMinicomics",
    "CupidsaMoron",
    "D00R",
    "DeathNoteIridescent",
    "DemonEater",
    "DenizensAttention",
    "DevilsCake",
    "Dreamcatchers",
    "EmeraldNuzlocke",
    "EonsAgo",
    "ERRORERROR",
    "EvilPlan",
    "FailureConfetti",
    "FlyorFail",
    "ForestHill",
    "FrobertTheDemon",
    "GarytheAlchemist",
    "GhostsTaleACrossover",
    "Glasshearts",
    "GoldenSunGenerationsAftermathVolume1",
    "GoldenSunGenerationsColossoVolume6",
    "GuardiansoftheGalaxialSpaceways",
    "HatShop",
    "HDMTHCOMICS",
    "Helix",
    "Hephaestus",
    "HolyBlasphemy",
    "HopeForABreeze",
    "Hotarugari",
    "InsideOuTAYuriTale",
    "Insomanywords",
    "INUSITADOONLINE",
    "ItsCharacterDevelopment",
    "JosephAndYusra",
    "JustAnotherDay",
    "KasaKeira",
    "KirbyAdventure",
    "KirbyandtheDarkKnight",
    "KirbyFunfestTheOriginals",
    "KirbysofTHEVOID",
    "KuroiHitsuji",
    "KuroShouri",
    "LandoftheSky",
    "LeCirquedObscure",
    "LethalDose",
    "LOGOS",
    "LostLove",
    "LsEmpire",
    "MariovsSonicvsMegaMan",
    "Mega",
    "MementoMori",
    "Mokepon",
    "MrGrimmsCircusofHorrors",
    "MyFakeHeart",
    "MyFriendScotty",
    "MYth",
    "NemesisKatharsis",
    "NiceKitty",
    "Nutshel",
    "OptimalClutter",
    "Panacea",
    "PhilosophicalPenisJokes",
    "PrettyUgly",
    "PSY",
    "PTO",
    "RainLGBT",
    "ReidyandFriendsShowcase",
    "RubysWorld",
    "SallySprocketAndPistonPete",
    "SimonSues",
    "SimpleBear",
    "SmallPressAdventures",
    "SonicWorldAdventure",
    "SoulGuardian",
    "SPOON",
    "STASonictheAdventure",
    "Stay",
    "StellaInChrome",
    "StrangersandFriends",
    "SunmeetsMoon",
    "TAG",
    "TaikiTheWebcomic",
    "TechnicolorLondon",
    "TEN",
    "ThatWasntThereYesterday",
    "TheAntihero",
    "TheBrideoftheShark",
    "TheCafedAlizee",
    "TheEssyaneWarriors",
    "ThehumanBEing",
    "TheKwiddexProtocol",
    "TheLegendofZeldaMaidenoftheMoon",
    "ThePirateBalthasar",
    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
    "TheReborn",
    "TheTytonNuzlockeChallengeEmeraldEdition",
    "ToD",
    "TPTruePower",
    "TwoKeys",
    "UndertheSkin",
    "WelcometoFreakshow",
    "Whenweweresilent",
    "WhiteHeart",
    "Yaoishereforareason",
    "Zodiac",
]

# links to last valid strips
url_overrides = {
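    # hypothetical example entry:
    # "SomeComic": "http://somecomic.smackjeeves.com/comics/123456/",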
}

# page_matcher finds the link to a comic's profile page together with the
# banner image whose title attribute holds the comic name
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
                                after="site_banner") +
                          tagre("img", "title", r'([^"]+)'))
# url_matcher extracts the "Latest Comic" link from a comic's profile page
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') +
                         "Latest Comic")
# num_matcher picks the number of published strips from the table cell
# following the profile link on a search result page
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
# adult_matcher detects the mature content marker image
adult_matcher = re.compile(tagre(
    "img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))


def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = urljoin(url, match.group(1))
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]),
                  file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for the comic URL on the profile page
        print("Getting", page_url, file=sys.stderr)
        try:
            data2 = get_page(page_url, session).text
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]),
                  file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search the whole profile page for the adult content flag
        adult = adult_matcher.search(data2)
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, bool(adult), bounce
        ]


def get_results():
    """Parse all search result pages."""
    base = ("http://www.smackjeeves.com/search.php?submit=Search+for+"
            "Webcomics&search_mode=webcomics&comic_title=&special=all&"
            "last_update=3&style_all=on&genre_all=on&format_all=on&"
            "sort_by=2&start=%d")
    session = requests.Session()
    # store info in a dictionary {name -> [url, number of strips, adult flag,
    # bounce flag]}
    res = {}
    # a search for an empty string returned 286 result pages
    result_pages = 286
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(result_pages):
        print(i + 1, file=sys.stderr, end=" ")
        # the "start" parameter advances by twelve entries per result page
        handle_url(base % (i * 12), session, res)
    save_result(res, json_file)


def has_comic(name):
    """Check if a comic scraper with the given name already exists."""
    cname = name.lower()
    for scraperobj in get_scrapers():
        lname = scraperobj.name.lower()
        if lname == cname:
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given minimum number of strips."""
    min_comics, filename = args
    min_comics = int(min_comics)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            url, num, adult, bounce = entry
            if num < min_comics:
                continue
            # comment out comics that already have a scraper
            if has_comic(name):
                prefix = u'#'
            else:
                prefix = u''
            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
                prefix, str(truncate_name(name)), str(url), adult, bounce
            ))
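

# Run without arguments to crawl all search result pages and save the data
# to the JSON file. Run with <min_comics> <filename> to append an add(...)
# line to <filename> for every crawled comic with at least <min_comics>
# strips.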
if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()