#!/usr/bin/env python
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import codecs
import re
import sys
import os
import urlparse
import requests
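
# make dosagelib importable when this script is run from the scripts directory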
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, tagre
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name, format_description
json_file = __file__.replace(".py", ".json")

# names of comics to exclude; these are the format_name()-normalized titles
exclude_comics = [
    "4plyKamalsHead", # does not follow standard layout
    "9Lives", # missing images
    "ADifferentPerspective", # does not follow standard layout
    "AFairlyTwistedRealitySuper", # does not follow standard layout
    "Ahoge", # does not follow standard layout
    "AngelJunkPileFelix", # images are 403 forbidden
    "AntavioussGenLab", # images are 403 forbidden
    "AreyougayJohnny", # does not follow standard layout
    "Authorbattlesthevideogame", # missing images
    "BambooArmonicKnightsGuild", # missing previous link
    "BassLegends", # does not follow standard layout
    "BreIshurnasspritesandstuff", # comic moved
    "CatboyattheCon", # missing images
    "Comatose", # does not follow standard layout
    "ContraandtheSpamDump", # missing images
    "ClubLove", # does not follow standard layout
    "Darkkyosshorts", # missing images
    "DeSTRESS", # does not follow standard layout
    "DollarStoreCaviar", # broken images
    "DreamCatcher", # does not follow standard layout
    "EdgeofDecember", # missing images
    "FroakieShocaiz", # unsuitable navigation
    "Fumiko", # does not follow standard layout
    "FurryExperience", # timeout
    "GART", # does not follow standard layout
    "GarytheAlchemist", # does not follow standard layout
    "GBAsCrib", # timeout
    "HAndJ", # missing images
    "HEARD", # missing images
    "Indigo", # broken domain name
    "IwillbenapoSpamDump", # missing images
    "ItoshiisCrazyNuzlockeAdventures", # does not follow standard layout
    "JennyHaniver", # does not follow standard layout
    "KiLAiLO", # does not follow standard layout
    "KirbysoftheAlternateDimension", # missing images
    "Letsreviewshallwe", # missing images
    "LoudEra", # does not follow standard layout
    "LunarHill", # does not follow standard layout
    "Mafiagame", # does not follow standard layout
    "MegaManSpriteExpo", # missing images
    "MyLifewithFelENESPANOL", # does not follow standard layout
    "MylifewithFel", # does not follow standard layout
    "NegativeZen", # does not follow standard layout
    "Nemutionpobae", # does not follow standard layout
    "NightShot", # does not follow standard layout
    "NormalIsBoring", # does not follow standard layout
    "Okamirai", # images are 403 forbidden
    "OmnisSpriteShowcase", # missing images
    "OpticalDisarray", # does not follow standard layout
    "PicturesofYou", # does not follow standard layout
    "PiecesofBrokenGlass", # broken images
    "PlatonicManagementDilemma", # missing images
    "Pornjunkiesstrip", # does not follow standard layout
    "PrettyUgly", # does not follow standard layout
    "Project217", # does not follow standard layout
    "RemmyzRandomz", # does not follow standard layout
    "Ribon", # does not follow standard layout
    "RubysWorld", # does not follow standard layout
    "SecretSanta2011", # missing images
    "ShinkaTheLastEevee", # does not follow standard layout
    "SimplePixel", # does not follow standard layout
    "SJArtCollab", # missing images
    "SladesMansionofawesomeness", # does not follow standard layout
    "SlightlyDifferent", # missing images
    "SpaceSchool", # does not follow standard layout
    "SushiGummy", # does not follow standard layout
    "TheAfterSubtract", # does not follow standard layout
    "ThePokemonArtBox", # does not follow standard layout
    "THEVOIDWEBCOMIC", # does not follow standard layout
    "TC2KsPokemobians", # does not follow standard layout
    "ThreadCrashers", # has no previous comic link
    "ToDefeatThemAll", # does not follow standard layout
    "TotallyKotor", # missing images
    "TwoKeys", # does not follow standard layout
    "Vbcomics", # does not follow standard layout
    "WerewolfRichard", # does not follow standard layout
    "WinterMelody", # missing images
]

# the latest URL of some comics repeats the previous URL;
# flag these so bounceStart uses the correct URL
repeat_comics = [
"1009sSpritersVacation",
"22Special22Care",
"2Kingdoms",
"2Masters",
"AbbimaysRandomness",
"AdaLeeComesOn",
"AdventuresofMitch",
"AkumaKisei",
"ALaMode",
"AnimalLoversYuriCollab",
"Area9",
"AStrangeTypeofLove",
"Autophobia",
"BearlyAbel",
"BeCarefreeWithMeSoon",
"BlindandBlue",
"BlueStreak",
"BlueWell",
"BlueYonder",
"Border",
"BoyLessons",
"Boywithasecret",
"BreakFreemagazine",
"BrightStars",
"ByTheBook",
"ClairetheFlare",
"CloeRemembrance",
"ComicFullofSprites",
"CrappilyDrawnMinicomics",
"CupidsaMoron",
"D00R",
"DeathNoteIridescent",
"DemonEater",
"DenizensAttention",
"DevilsCake",
"Dreamcatchers",
"EmeraldNuzlocke",
"EonsAgo",
"ERRORERROR",
"EvilPlan",
"FailureConfetti",
"FlyorFail",
"ForestHill",
"FrobertTheDemon",
"GarytheAlchemist",
"GhostsTaleACrossover",
"Glasshearts",
"GoldenSunGenerationsAftermathVolume1",
"GoldenSunGenerationsColossoVolume6",
"GuardiansoftheGalaxialSpaceways",
"HatShop",
"HDMTHCOMICS",
"Helix",
"Hephaestus",
"HolyBlasphemy",
"HopeForABreeze",
"Hotarugari",
"InsideOuTAYuriTale",
"Insomanywords",
"INUSITADOONLINE",
"ItsCharacterDevelopment",
"JosephAndYusra",
"JustAnotherDay",
"KasaKeira",
"KirbyAdventure",
"KirbyandtheDarkKnight",
"KirbyFunfestTheOriginals",
"KirbysofTHEVOID",
"KuroiHitsuji",
"KuroShouri",
"LandoftheSky",
"LeCirquedObscure",
"LethalDose",
"LOGOS",
"LostLove",
"LsEmpire",
"MariovsSonicvsMegaMan",
"Mega",
"MementoMori",
"Mokepon",
"MrGrimmsCircusofHorrors",
"MyFakeHeart",
"MyFriendScotty",
"MYth",
"NemesisKatharsis",
"NiceKitty",
"Nutshel",
"OptimalClutter",
"Panacea",
"PhilosophicalPenisJokes",
"PrettyUgly",
"PSY",
"PTO",
"RainLGBT",
"ReidyandFriendsShowcase",
"RubysWorld",
"SallySprocketAndPistonPete",
"SimonSues",
"SimpleBear",
"SmallPressAdventures",
"SonicWorldAdventure",
"SoulGuardian",
"SPOON",
"STASonictheAdventure",
"Stay",
"StellaInChrome",
"StrangersandFriends",
"SunmeetsMoon",
"TAG",
"TaikiTheWebcomic",
"TechnicolorLondon",
"TEN",
"ThatWasntThereYesterday",
"TheAntihero",
"TheBrideoftheShark",
"TheCafedAlizee",
"TheEssyaneWarriors",
"ThehumanBEing",
"TheKwiddexProtocol",
"TheLegendofZeldaMaidenoftheMoon",
"ThePirateBalthasar",
"TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
"TheReborn",
"TheTytonNuzlockeChallengeEmeraldEdition",
"ToD",
"TPTruePower",
"TwoKeys",
"UndertheSkin",
"WelcometoFreakshow",
"Whenweweresilent",
"WhiteHeart",
"Yaoishereforareason",
"Zodiac",
]

# links to last valid strips
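# (an entry here replaces the comic URL parsed in handle_url)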
url_overrides = {
}

# HTML content matchers, built with dosagelib's tagre() tag-regex helper
# link to a comic profile page, followed by the banner image whose title is the comic name
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
                          tagre("img", "title", r'([^"]+)'))
# link to the latest comic strip on the profile page
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
# number of comic strips, taken from the search result table
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
# the comic description on the profile page
desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
# the mature content warning image
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))

def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for the comic URL on the profile page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url, session)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = format_description(mo.group(1))
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        # entry: [comic URL, number of strips, description, adult flag, bounce flag]
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]

def get_results():
    """Parse all search result pages."""
    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
    session = requests.Session()
    # store info in a dictionary
    # {name -> [url, number of strips, description, adult flag, bounce flag]}
    res = {}
    # a search for an empty string returned 286 result pages
    result_pages = 286
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(0, result_pages):
        print(i+1, file=sys.stderr, end=" ")
        # each result page lists twelve comics, so the start offset advances by 12
        handle_url(base % (i*12), session, res)
    save_result(res, json_file)

def has_comic(name):
    """Check if a comic scraper with the given name already exists."""
    cname = name.lower()
    for scraperclass in get_scraperclasses():
        lname = scraperclass.getName().lower()
        if lname == cname:
            return True
    return False

def print_results(args):
    """Write an add() line for each comic with at least the given minimum number of strips."""
    min_comics, filename = args
    min_comics = int(min_comics)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            url, num, desc, adult, bounce = entry
            if num < min_comics:
                continue
            # comment out entries for which a scraper already exists
            if has_comic(name):
                prefix = u'#'
            else:
                prefix = u''
            fp.write(u"%sadd(%r, %r, %r, %s, %s)\n" % (
                prefix, str(truncate_name(name)), str(url), desc, adult, bounce
            ))
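
# Each emitted line has the form (illustrative values):
#   add('ComicName', 'http://example.smackjeeves.com/comics/', u'Description', False, True)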

if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()