Update SmackJeeves update helper.
Don't use it right now; it adds a HUGE number of comics.
This commit is contained in:
parent
fe51a449df
commit
1d2e1f2dd1
1 changed file with 148 additions and 331 deletions
|
@ -9,354 +9,171 @@ for further processing.
|
|||
"""
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
try:
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urlsplit
|
||||
except ImportError:
|
||||
from urlparse import urljoin
|
||||
from urlparse import urlsplit
|
||||
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
|
||||
from dosagelib.util import get_page, tagre
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
||||
from scriptutil import ComicListUpdater
|
||||
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
class SmackJeevesUpdater(ComicListUpdater):
|
||||
# Absolute minumum number of pages a comic may have (restrict search space)
|
||||
MIN_COMICS = 90
|
||||
|
||||
ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
|
||||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"4plyKamalsHead", # does not follow standard layout
|
||||
"9Lives", # missing images
|
||||
"ADifferentPerspective", # does not follow standard layout
|
||||
"AFairlyTwistedRealitySuper", # does not follow standard layout
|
||||
"Ahoge", # does not follow standard layout
|
||||
"AngelJunkPileFelix", # images are 403 forbidden
|
||||
"AntavioussGenLab", # images are 403 forbidden
|
||||
"AreyougayJohnny", # does not follow standard layout
|
||||
"Authorbattlesthevideogame", # missing images
|
||||
"BambooArmonicKnightsGuild", # missing previous link
|
||||
"BassLegends", # does not follow standard layout
|
||||
"BreIshurnasspritesandstuff", # comic moved
|
||||
"CatboyattheCon", # missing images
|
||||
"Comatose", # does not follow standard layout
|
||||
"ContraandtheSpamDump", # missing images
|
||||
"ClubLove", # does not follow standard layout
|
||||
"Darkkyosshorts", # missing images
|
||||
"DeSTRESS", # does not follow standard layout
|
||||
"DollarStoreCaviar", # broken images
|
||||
"DreamCatcher", # does not follow standard layout
|
||||
"EdgeofDecember", # missing images
|
||||
"FroakieShocaiz", # unsuitable navigation
|
||||
"Fumiko", # does not follow standard layout
|
||||
"FurryExperience", # timeout
|
||||
"GART", # does not follow standard layout
|
||||
"GarytheAlchemist", # does not follow standard layout
|
||||
"GBAsCrib", # timeout
|
||||
"HAndJ", # missing images
|
||||
"HEARD", # missing images
|
||||
"Indigo", # broken domain name
|
||||
"IwillbenapoSpamDump", # missing images
|
||||
"ItoshiisCrazyNuzlockeAdventures", # does not follow standard layout
|
||||
"JennyHaniver", # does not follow standard layout
|
||||
"KiLAiLO", # does not follow standard layout
|
||||
"KirbysoftheAlternateDimension", # missing images
|
||||
"Letsreviewshallwe", # missing images
|
||||
"LoudEra", # does not follow standard layout
|
||||
"LunarHill", # does not follow standard layout
|
||||
"Mafiagame", # does not follow standard layout
|
||||
"MegaManSpriteExpo", # missing images
|
||||
"MyLifewithFelENESPANOL", # does not follow standard layout
|
||||
"MylifewithFel", # does not follow standard layout
|
||||
"NegativeZen", # does not follow standard layout
|
||||
"Nemutionpobae", # does not follow standard layout
|
||||
"NightShot", # does not follow standard layout
|
||||
"NormalIsBoring", # does not follow standard layout
|
||||
"Okamirai", # images are 403 forbidden
|
||||
"OmnisSpriteShowcase", # missing images
|
||||
"OpticalDisarray", # does not follow standard layout
|
||||
"PicturesofYou", # does not follow standard layout
|
||||
"PiecesofBrokenGlass", # broken images
|
||||
"PlatonicManagementDilemma", # missing images
|
||||
"Pornjunkiesstrip", # does not follow standard layout
|
||||
"PrettyUgly", # does not follow standard layout
|
||||
"Project217", # does not follow standard layout
|
||||
"RemmyzRandomz", # does not follow standard layout
|
||||
"Ribon", # does not follow standard layout
|
||||
"RubysWorld", # does not follow standard layout
|
||||
"SecretSanta2011", # missing images
|
||||
"ShinkaTheLastEevee", # does not follow standard layout
|
||||
"SimplePixel", # does not follow standard layout
|
||||
"SJArtCollab", # missing images
|
||||
"SladesMansionofawesomeness", # does not follow standard layout
|
||||
"SlightlyDifferent", # missing images
|
||||
"SpaceSchool", # does not follow standard layout
|
||||
"SushiGummy", # does not follow standard layout
|
||||
"TheAfterSubtract", # does not follow standard layout
|
||||
"ThePokemonArtBox", # does not follow standard layout
|
||||
"THEVOIDWEBCOMIC", # does not follow standard layout
|
||||
"TC2KsPokemobians", # does not follow standard layout
|
||||
"ThreadCrashers", # has no previous comic link
|
||||
"ToDefeatThemAll", # does not follow standard layout
|
||||
"TotallyKotor", # missing images
|
||||
"TwoKeys", # does not follow standard layout
|
||||
"Vbcomics", # does not follow standard layout
|
||||
"WerewolfRichard", # does not follow standard layout
|
||||
"WinterMelody", # missing images
|
||||
]
|
||||
# names of comics to exclude
|
||||
excluded_comics = (
|
||||
# comic moved/we have a better module
|
||||
"Amya",
|
||||
"Carciphona",
|
||||
"Footloose",
|
||||
"TitleUnrelated",
|
||||
|
||||
# does not follow standard layout
|
||||
"300DaysOfSyao",
|
||||
"ADifferentPerspective",
|
||||
"Captor",
|
||||
"ClubLove",
|
||||
"Comatose",
|
||||
"DeSTRESS",
|
||||
"DreamCatcher",
|
||||
"Fumiko",
|
||||
"GART",
|
||||
"GarytheAlchemist",
|
||||
"ItoshiisCrazyNuzlockeAdventures",
|
||||
"JennyHaniver",
|
||||
"KiLAiLO",
|
||||
"LoudEra",
|
||||
"LunarHill",
|
||||
"Mafiagame",
|
||||
"MylifewithFel",
|
||||
"MyLifewithFelENESPANOL",
|
||||
"NegativeZen",
|
||||
"Nemutionpobae",
|
||||
"NightShot",
|
||||
"NormalIsBoring",
|
||||
"OpticalDisarray",
|
||||
"PicturesofYou",
|
||||
"Pornjunkiesstrip",
|
||||
"PrettyUgly",
|
||||
"Project217",
|
||||
"RemmyzRandomz",
|
||||
"Ribon",
|
||||
"RubysWorld",
|
||||
"ShinkaTheLastEevee",
|
||||
"SimplePixel",
|
||||
"SladesMansionofawesomeness",
|
||||
"SpaceSchool",
|
||||
"SushiGummy",
|
||||
"TC2KsPokemobians",
|
||||
"TheAfterSubtract",
|
||||
"ThePokemonArtBox",
|
||||
"THEVOIDWEBCOMIC",
|
||||
"ToDefeatThemAll",
|
||||
"TwoKeys",
|
||||
"Vbcomics",
|
||||
"WerewolfRichard",
|
||||
|
||||
# the latest URL of some comics repeats the previous URL
|
||||
# flag this so the bounceStart uses the correct URL
|
||||
repeat_comics = [
|
||||
"1009sSpritersVacation",
|
||||
"22Special22Care",
|
||||
"2Kingdoms",
|
||||
"2Masters",
|
||||
"AbbimaysRandomness",
|
||||
"AdaLeeComesOn",
|
||||
"AdventuresofMitch",
|
||||
"AkumaKisei",
|
||||
"ALaMode",
|
||||
"AnimalLoversYuriCollab",
|
||||
"Area9",
|
||||
"AStrangeTypeofLove",
|
||||
"Autophobia",
|
||||
"BearlyAbel",
|
||||
"BeCarefreeWithMeSoon",
|
||||
"BlindandBlue",
|
||||
"BlueStreak",
|
||||
"BlueWell",
|
||||
"BlueYonder",
|
||||
"Border",
|
||||
"BoyLessons",
|
||||
"Boywithasecret",
|
||||
"BreakFreemagazine",
|
||||
"BrightStars",
|
||||
"ByTheBook",
|
||||
"ClairetheFlare",
|
||||
"CloeRemembrance",
|
||||
"ComicFullofSprites",
|
||||
"CrappilyDrawnMinicomics",
|
||||
"CupidsaMoron",
|
||||
"D00R",
|
||||
"DeathNoteIridescent",
|
||||
"DemonEater",
|
||||
"DenizensAttention",
|
||||
"DevilsCake",
|
||||
"Dreamcatchers",
|
||||
"EmeraldNuzlocke",
|
||||
"EonsAgo",
|
||||
"ERRORERROR",
|
||||
"EvilPlan",
|
||||
"FailureConfetti",
|
||||
"FlyorFail",
|
||||
"ForestHill",
|
||||
"FrobertTheDemon",
|
||||
"GarytheAlchemist",
|
||||
"GhostsTaleACrossover",
|
||||
"Glasshearts",
|
||||
"GoldenSunGenerationsAftermathVolume1",
|
||||
"GoldenSunGenerationsColossoVolume6",
|
||||
"GuardiansoftheGalaxialSpaceways",
|
||||
"HatShop",
|
||||
"HDMTHCOMICS",
|
||||
"Helix",
|
||||
"Hephaestus",
|
||||
"HolyBlasphemy",
|
||||
"HopeForABreeze",
|
||||
"Hotarugari",
|
||||
"InsideOuTAYuriTale",
|
||||
"Insomanywords",
|
||||
"INUSITADOONLINE",
|
||||
"ItsCharacterDevelopment",
|
||||
"JosephAndYusra",
|
||||
"JustAnotherDay",
|
||||
"KasaKeira",
|
||||
"KirbyAdventure",
|
||||
"KirbyandtheDarkKnight",
|
||||
"KirbyFunfestTheOriginals",
|
||||
"KirbysofTHEVOID",
|
||||
"KuroiHitsuji",
|
||||
"KuroShouri",
|
||||
"LandoftheSky",
|
||||
"LeCirquedObscure",
|
||||
"LethalDose",
|
||||
"LOGOS",
|
||||
"LostLove",
|
||||
"LsEmpire",
|
||||
"MariovsSonicvsMegaMan",
|
||||
"Mega",
|
||||
"MementoMori",
|
||||
"Mokepon",
|
||||
"MrGrimmsCircusofHorrors",
|
||||
"MyFakeHeart",
|
||||
"MyFriendScotty",
|
||||
"MYth",
|
||||
"NemesisKatharsis",
|
||||
"NiceKitty",
|
||||
"Nutshel",
|
||||
"OptimalClutter",
|
||||
"Panacea",
|
||||
"PhilosophicalPenisJokes",
|
||||
"PrettyUgly",
|
||||
"PSY",
|
||||
"PTO",
|
||||
"RainLGBT",
|
||||
"ReidyandFriendsShowcase",
|
||||
"RubysWorld",
|
||||
"SallySprocketAndPistonPete",
|
||||
"SimonSues",
|
||||
"SimpleBear",
|
||||
"SmallPressAdventures",
|
||||
"SonicWorldAdventure",
|
||||
"SoulGuardian",
|
||||
"SPOON",
|
||||
"STASonictheAdventure",
|
||||
"Stay",
|
||||
"StellaInChrome",
|
||||
"StrangersandFriends",
|
||||
"SunmeetsMoon",
|
||||
"TAG",
|
||||
"TaikiTheWebcomic",
|
||||
"TechnicolorLondon",
|
||||
"TEN",
|
||||
"ThatWasntThereYesterday",
|
||||
"TheAntihero",
|
||||
"TheBrideoftheShark",
|
||||
"TheCafedAlizee",
|
||||
"TheEssyaneWarriors",
|
||||
"ThehumanBEing",
|
||||
"TheKwiddexProtocol",
|
||||
"TheLegendofZeldaMaidenoftheMoon",
|
||||
"ThePirateBalthasar",
|
||||
"TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
|
||||
"TheReborn",
|
||||
"TheTytonNuzlockeChallengeEmeraldEdition",
|
||||
"ToD",
|
||||
"TPTruePower",
|
||||
"TwoKeys",
|
||||
"UndertheSkin",
|
||||
"WelcometoFreakshow",
|
||||
"Whenweweresilent",
|
||||
"WhiteHeart",
|
||||
"Yaoishereforareason",
|
||||
"Zodiac",
|
||||
]
|
||||
# has no previous comic link
|
||||
"ThreadCrashers",
|
||||
"AchievementStuck",
|
||||
|
||||
# images are 403 forbidden
|
||||
"AngelJunkPileFelix",
|
||||
"AntavioussGenLab",
|
||||
"Okamirai",
|
||||
|
||||
# links to last valid strips
|
||||
url_overrides = {
|
||||
}
|
||||
# missing images
|
||||
"CatboyattheCon",
|
||||
"ContraandtheSpamDump",
|
||||
"Darkkyosshorts",
|
||||
"DollarStoreCaviar",
|
||||
"EdgeofDecember",
|
||||
"HAndJ",
|
||||
"HEARD",
|
||||
"IwillbenapoSpamDump",
|
||||
"KirbysoftheAlternateDimension",
|
||||
"Letsreviewshallwe",
|
||||
"MegaManSpriteExpo",
|
||||
"OmnisSpriteShowcase",
|
||||
"PiecesofBrokenGlass",
|
||||
"PlatonicManagementDilemma",
|
||||
"SecretSanta2011",
|
||||
"SerendipityAnEquestrianTale",
|
||||
"SJArtCollab",
|
||||
"SlightlyDifferent",
|
||||
"TheAttackoftheRecoloursSeason1",
|
||||
"TotallyKotor",
|
||||
"WinterMelody",
|
||||
"ZonowTheHedgehog",
|
||||
|
||||
# missing previous link
|
||||
"BambooArmonicKnightsGuild",
|
||||
|
||||
# HTML content matcher
|
||||
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
|
||||
after="site_banner") +
|
||||
tagre("img", "title", r'([^"]+)'))
|
||||
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
|
||||
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
|
||||
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
|
||||
# broken host name
|
||||
"Razor",
|
||||
)
|
||||
|
||||
def handle_url(self, url):
|
||||
"""Parse one search result page."""
|
||||
data = self.get_url(url)
|
||||
|
||||
def handle_url(url, session, res):
|
||||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = get_page(url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in page_matcher.finditer(data):
|
||||
page_url = match.group(1)
|
||||
page_url = urljoin(url, page_url)
|
||||
name = format_name(match.group(2))
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
||||
continue
|
||||
# find out how many images this comic has
|
||||
end = match.end()
|
||||
mo = num_matcher.search(data[end:])
|
||||
if not mo:
|
||||
print("ERROR matching number:", repr(data[end:end + 300]),
|
||||
file=sys.stderr)
|
||||
continue
|
||||
num = int(mo.group(1))
|
||||
# search for url in extra page
|
||||
print("Getting", page_url)
|
||||
try:
|
||||
data2 = get_page(page_url, session).text
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
mo = url_matcher.search(data2)
|
||||
if not mo:
|
||||
print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
|
||||
continue
|
||||
comic_url = mo.group(1)
|
||||
# search for adult flag
|
||||
adult = adult_matcher.search(data2[end:])
|
||||
bounce = name not in repeat_comics
|
||||
res[name] = [
|
||||
url_overrides.get(name, comic_url), num, bool(adult), bounce
|
||||
]
|
||||
|
||||
|
||||
def get_results():
|
||||
"""Parse all search result pages."""
|
||||
base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
|
||||
session = requests.Session()
|
||||
# store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
|
||||
res = {}
|
||||
# a search for an empty string returned 286 result pages
|
||||
result_pages = 286
|
||||
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
|
||||
for i in range(0, result_pages):
|
||||
print(i + 1, file=sys.stderr, end=" ")
|
||||
handle_url(base % (i * 12), session, res)
|
||||
save_result(res, json_file)
|
||||
|
||||
|
||||
def has_comic(name):
|
||||
"""Check if comic name already exists."""
|
||||
cname = name.lower()
|
||||
for scraperobj in get_scrapers():
|
||||
lname = scraperobj.name.lower()
|
||||
if lname == cname:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def print_results(args):
|
||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
||||
min_comics, filename = args
|
||||
min_comics = int(min_comics)
|
||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
||||
for name, entry in sorted(load_result(json_file).items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
url, num, adult, bounce = entry
|
||||
if num < min_comics:
|
||||
continue
|
||||
if has_comic(name):
|
||||
prefix = u'#'
|
||||
num = 999
|
||||
for comicdiv in data.cssselect(
|
||||
'div#webcomic_search_results div.full_banner_div'):
|
||||
page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
|
||||
name = comicdiv.cssselect('img.banny')
|
||||
if name:
|
||||
name = name[0].attrib['title']
|
||||
else:
|
||||
prefix = u''
|
||||
fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
|
||||
prefix, str(truncate_name(name)), str(url), adult, bounce
|
||||
))
|
||||
name = comicdiv.cssselect('h2')[0].text
|
||||
# find out how many images this comic has
|
||||
mo = comicdiv.cssselect('span.small-meter')
|
||||
if not mo:
|
||||
print("ERROR matching number of comics", file=sys.stderr)
|
||||
continue
|
||||
num = int(mo[0].text.strip())
|
||||
# search for url in extra page
|
||||
data2 = self.get_url(page_url)
|
||||
mo = data2.cssselect('div#quick_reading_links a:last-child')
|
||||
if not mo:
|
||||
print("ERROR matching comic URL", file=sys.stderr)
|
||||
continue
|
||||
comic_url = mo[0].attrib['href']
|
||||
# search for adult flag
|
||||
adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
|
||||
self.add_comic(name, (comic_url, bool(adult)), num)
|
||||
|
||||
next_url = data.cssselect(
|
||||
"div.search_nav td:last-child a")[0].attrib['href']
|
||||
return (next_url, num)
|
||||
|
||||
def collect_results(self):
|
||||
"""Parse all search result pages."""
|
||||
# Sort by number of comics, so we can abort when we get under some
|
||||
# threshold.
|
||||
next_url = (
|
||||
"http://www.smackjeeves.com/search.php?submit=1" +
|
||||
"&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
|
||||
"&last_update=6&style_all=on&genre_all=on&format_all=on")
|
||||
last_count = 999
|
||||
while last_count >= self.MIN_COMICS:
|
||||
print(last_count, file=sys.stderr, end=" ")
|
||||
next_url, last_count = self.handle_url(next_url)
|
||||
|
||||
def get_classdef(self, name, data):
|
||||
sub, top = urlsplit(data[0]).hostname.split('.', 1)
|
||||
cl = u"class SJ%s(_SmackJeeves):" % name
|
||||
if top.lower() == "smackjeeves.com":
|
||||
cl += "\n sub = '%s'" % sub
|
||||
else:
|
||||
cl += "\n host = '%s.%s'" % (sub, top)
|
||||
if data[1]:
|
||||
cl += "\n adult = True"
|
||||
return cl
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) > 1:
|
||||
print_results(sys.argv[1:])
|
||||
else:
|
||||
get_results()
|
||||
SmackJeevesUpdater(__file__).run()
|
||||
|
|
Loading…
Reference in a new issue