Update ComicFury comics. (+871, -245)

- Remove make_scraper magic
- Switch to HTML parser
- Update parsing of comic listing.
2016-03-17 00:44:06 +01:00 · 2016-03-17 00:44:06 +01:00 · 552f29e5fc
commit 552f29e5fc
parent 6727e9b559
3 changed files with 4169 additions and 677 deletions
--- a/dosagelib/plugins/comicfury.py
+++ b/dosagelib/plugins/comicfury.py
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -1,6 +1,7 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
 # Copyright (C) 2015-2016 Tobias Gruetzmacher
 from re import compile, escape
@@ -50,16 +51,6 @@ class Damonk(_BasicScraper):
    help = 'Index format: yyyymmdd'
 # XXX disallowed /search by robots.txt
 class _DandyAndCompany(_BasicScraper):
    url = 'http://www.dandyandcompany.com/'
    stripUrl = None
    multipleImagesPerStrip = True
    imageSearch = compile(tagre("a", "href", r'(http://\d+\.bp\.blogspot\.com/[^"]+)', after="imageanchor"))
    prevSearch = compile(tagre("a", "href", r"([^']+)", quote="'", after="Older Posts"))
    help = 'Index format: none'
 class DangerouslyChloe(_BasicScraper):
    url = 'http://www.dangerouslychloe.com/'
    stripUrl = url + 'strips-dc/%s'
--- a/scripts/comicfury.py
+++ b/scripts/comicfury.py
@@ -1,253 +1,118 @@
 #!/usr/bin/env python
 # Copyright (C) 2013-2014 Bastian Kleineidam
 # Copyright (C) 2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get ComicFury comics and save the info in a JSON file for further
 processing.
 """
-from __future__ import print_function
+from __future__ import print_function, absolute_import
 import codecs
 import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+from lxml import html
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import getPageContent
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)
 # Absolute minimum number of pages a comic may have (restrict search space)
 MIN_COMICS = 90
 json_file = __file__.replace(".py", ".json")
 url_matcher = re.compile(r'<h3><a href="([^"]+)">')
 num_matcher = re.compile(r'<b>Comics:</b> <span class="comicinfo">(\d+)</span>')
 genre_matcher = re.compile(r'<b>Genre:</b> <span class="comicinfo">([^<]+)</span>')
 activity_matcher = re.compile(r'<b>Activity status:</b> <span class="comicinfo">([^<]+)</span>')
 # names of comics to exclude
 exclude_comics = [
-    "1000", # unsuitable navigation
+    # unsuitable navigation
-    "12yearsofmissj", # unsuitable navigation
+    "AlfdisAndGunnora",
-    "3DGlasses", # unsuitable navigation
+    "AnAmericanNerdinAnimatedTokyo",
-    "30Days", # unsuitable navigation
+    "AngryAlien",
-    "6tsc", # unsuitable navigation
+    "BoozerAndStoner",
-    "Abyss", # unsuitable navigation
+    "Bonejangles",
-    "Acelestialstory", # unsuitable navigation
+    "ConradStory",
-    "Actdr", # unsuitable navigation
+    "Crossing",
-    "Aerosol", # unsuitable navigation
+    "ChristianHumberReloaded",
-    "Ahtiventures", # unsuitable navigation
+    "CorkandBlotto",
-    "Alienirony", # unsuitable navigation
+    "Democomix",
-    "Aloonaticstale", # unsuitable navigation
+    "ErraticBeatComics",
-    "Amity", # unsuitable navigation
+    "EnergyWielders",
-    "Angelguardian", # unsuitable navigation
+    "EvilBearorg",
-    "AngelguardianEspanol", # unsuitable navigation
+    "Fiascos",
-    "Angryalien", # unsuitable navigation
+    "FateoftheBlueStar",
-    "Animangitis", # unsuitable navigation
+    "FPK",
-    "Archininja", # unsuitable navigation
+    "Fanartgyle",
-    "Arveytoonz", # unsuitable navigation
+    "FrigginRandom",
-    "AsperitasAstraalia", # unsuitable navigation
+    "GoodbyeKitty",
-    "AttackoftheRobofemoids", # unsuitable navigation
+    "HighlyExperiMental",
-    "Auriga", # unsuitable navigation
+    "IfAndCanBeFlowers",
-    "Bedlam", # unsuitable navigation
+    "JournalismStory",
-    "BITCHSquad", # missing images
+    "JohnsonSuperior",
-    "Bidoof", # unsuitable navigation
+    "Keel",
-    "Blobworld", # unsuitable navigation
+    "JudgeDredBasset",
-    "BlockTales", # unsuitable navigation
+    "LomeathAndHuilii",
-    "Bobcomix", # unsuitable navigation
+    "MNPB",
-    "Bonejangles", # unsuitable navigation
+    "LucidsDream",
-    "BookOfLiesComic", # unsuitable navigation
+    "MadDog",
-    "BoozerandStoner", # unsuitable navigation
+    "Minebreakers",
-    "Boyaurus", # unsuitable navigation
+    "Moonlightvalley",
-    "Brainfood", # unsuitable navigation
+    "MyImmortalFool",
-    "Bromosworld", # unsuitable navigation
+    "NATO",
-    "BulletMythology", # unsuitable navigation
+    "NothingFits",
-    "Bunnysher", # page moved
+    "OptimisticFishermenandPessimisticFishermen",
-    "BUXY", # unsuitable navigation
+    "Old2G",
-    "CafeGruesome", # unsuitable navigation
+    "NothingFitsArtBlog",
-    "Castofmadness", # unsuitable navigation
+    "OutToLunchTheStingRayWhoreStory",
-    "Chanpuru", # unsuitable navigation
+    "Pandemonium",
-    "Christmaswithmaddog", # unsuitable navigation
+    "Pewfell",
-    "ChroniclesOfLillian", # unsuitable navigation
+    "ProjectX",
-    "Comicshortsmain", # unsuitable navigation
+    "Ratantia",
-    "Conrads", # unsuitable navigation
+    "RealLifeTrips",
-    "ConradTheCaterpillar", # unsuitable navigation
+    "Sandgate",
-    "ConsequencesOfChoice", # unsuitable navigation
+    "Secondpuberty",
-    "CoolYuleComics", # unsuitable navigation
+    "Seconds",
-    "Crossworldsnexus", # unsuitable navigation
+    "SlightlyEccentricOrigins",
-    "Colorforce", # unsuitable navigation
+    "StardusttheCat",
-    "Coolstorybro", # unsuitable navigation
+    "StrangerthanFiction",
-    "Crepusculars", # unsuitable navigation
+    "TalamakGreatAdventure",
-    "CtrlZ", # unsuitable navigation
+    "TheBattalion",
-    "DeadNight", # unsuitable navigation
+    "TheDailyProblem",
-    "Democomix", # unsuitable navigation
+    "TheMansionofE",
-    "Dinosaurkingdom", # unsuitable navigation
+    "ThePainter",
-    "Donutsforsharks", # unsuitable navigation
+    "TheSeekers",
-    "Dotcomic", # unsuitable navigation
+    "TheTrialsofKlahadoftheAbyss",
-    "Droned", # unsuitable navigation
+    "TheStickmen",
-    "Druids", # unsuitable navigation
+    "ThornsInOurSide",
-    "Effingukookoo", # unsuitable navigation
+    "TopHeavyVeryBustyPinUpsForAdults",
-    "Elijahandazuuclassic", # unsuitable navigation
+    "USBUnlimitedsimulatedbody",
-    "ErraticBeat", # unsuitable navigation
+    "TylerHumanRecycler",
-    "ErraticE", # unsuitable navigation
+    "UAF",
-    "EternalKnights", # unsuitable navigation
+    "WhenPigsFly",
-    "Evilbear", # unsuitable navigation
+    "YeOldeLegotimeTheatre",
-    "Ewmic", # unsuitable navigation
+
-    "Fannicklas", # unsuitable navigation
+    # no content
-    "Fateofthebluestar", # unsuitable navigation
+    "Angst",
-    "Fishbowl", # unsuitable navigation
+
-    "Foe", # unsuitable navigation
+    # images gone
-    "Foreignterritory", # unsuitable navigation
+    "BaseballCapsandTiaras",
-    "Freakingawfulpuns", # page is gone
+    "CROSSWORLDSNEXUS",
-    "Frigginrandom", # unsuitable navigation
+    "Fathead",
-    "Frostfire", # unsuitable navigation
+    "KevinZombie",
-    "Furnerdy", # unsuitable navigation
+    "KindergardenCrisIs",
-    "Fuzzylittleninjas", # unsuitable navigation
+    "NoSongsForTheDead",
-    "Garfieldminusjon", # unsuitable navigation
+    "RequiemShadowbornPariah",
-    "Gatito", # unsuitable navigation
+    "TezzleandZeek",
-    "Gbksayonara", # unsuitable navigation
+
-    "Gillimurphyorig", # unsuitable navigation
+    # broken HTML
-    "Gratz", # unsuitable navigation
+    "CrossingOver",
-    "Greygaroutopheavyartwork", # unsuitable navigation
+
-    "GrimReaperSchool", # unsuitable navigation
+    # unique html
-    "Goldrush", # unsuitable navigation
+    "IKilledtheHero",
-    "GRIND", # unsuitable navigation
+    "PowerofPower",
-    "Haywire", # unsuitable navigation
+    "Schizmatic",
-    "Hallodri", # unsuitable navigation
+    "WaketheSleepers",
-    "Harrysorehead", # unsuitable navigation
+    "WeightofEternity",
    "HazSci", # unsuitable navigation
    "Hellboundarchive", # unsuitable navigation
    "Herecomesskeeter", # unsuitable navigation
    "Highlyexperimental", # unsuitable navigation
    "Holycowcomics", # unsuitable navigation
    "Hourlykelly", # unsuitable navigation
    "Houseescapeold", # unsuitable navigation
    "Horizongakuen", # unsuitable navigation
    "Icannotdraw", # unsuitable navigation
    "Ign", # unsuitable navigation
    "Illusionoftime", # unsuitable navigation
    "InsideOuT", # unsuitable navigation
    "Introvert", # unsuitable navigation
    "Immortalfool", # unsuitable navigation
    "Insectia", # unsuitable navigation
    "Jackitandfriends", # unsuitable navigation
    "Jenffersshow5", # unsuitable navigation
    "Johnsonsuperior", # unsuitable navigation
    "Joostdailies", # unsuitable navigation
    "Journ", # unsuitable navigation
    "JourneyToRaifina", # unsuitable navigation
    "Junk", # unsuitable navigation
    "Kaze", # unsuitable navigation
    "Kmlssticks", # unsuitable navigation
    "KiLAiLO", # unsuitable navigation
    "Kingdomprettycure", # unsuitable navigation
    "Kmfe", # unsuitable navigation
    "Lately", # unsuitable navigation
    "Legendoftheredphantom", # unsuitable navigation
    "LiteBites", # unsuitable navigation
    "Littlephoenix", # unsuitable navigation
    "Llwhoelterran", # unsuitable navigation
    "Lomeathandhuilii", # unsuitable navigation
    "Longandexcitingjourney", # unsuitable navigation
    "Lovekillsslowly", # unsuitable navigation
    "Mannack", # unsuitable navigation
    "Mars", # unsuitable navigation
    "MaskoftheAryans", # unsuitable navigation
    "Megamaiden", # unsuitable navigation
    "Minebreakers", # unsuitable navigation
    "Minecraft2b2t", # unsuitable navigation
    "Mischeif", # unsuitable navigation
    "Mitadakesaga", # unsuitable navigation
    "Mlpfib", # unsuitable navigation
    "Monsterloverdp", # unsuitable navigation
    "MoonlightValley", # unsuitable navigation
    "MurghComics", # unsuitable navigation
    "MVPL", # unsuitable navigation
    "Monobow", # unsuitable navigation
    "Mytvisevil", # unsuitable navigation
    "Natao", # unsuitable navigation
    "Nemution", # unsuitable navigation
    "NMG", # unsuitable navigation
    "Noche", # unsuitable navigation
    "Noprrkele", # unsuitable navigation
    "Nothingfits", # unsuitable navigation
    "Nothingfitsartblog", # unsuitable navigation
    "NotYoursAmI", # unsuitable navigation
    "Oeight", # unsuitable navigation
    "Ofpf", # unsuitable navigation
    "Old2g", # unsuitable navigation
    "Outtolunch", # unsuitable navigation
    "Parisel313", # unsuitable navigation
    "Patchworkpeople", # unsuitable navigation
    "Pewfell", # unsuitable navigation
    "Phoenix", # unsuitable navigation
    "Pi5a", # unsuitable navigation
    "Pokemonwarpers", # unsuitable navigation
    "Princeofcats", # unsuitable navigation
    "Princess", # unsuitable navigation
    "ProjectX", # unsuitable navigation
    "ReadershipofOne", # unsuitable navigation
    "Rebuildofgenericmanga", # unsuitable navigation
    "Queenie", # unsuitable navigation
    "Rain", # unsuitable navigation
    "Ratantia", # unsuitable navigation
    "Rath", # unsuitable navigation
    "RawLatex", # unsuitable navigation
    "Remnants", # unsuitable navigation
    "Requiem", # unsuitable navigation
    "Retrofiyora", # unsuitable navigation
    "Rexfordavenue", # unsuitable navigation
    "Rocr", # unsuitable navigation
    "Rosie", # unsuitable navigation
    "S", # unsuitable navigation
    "Sandgate", # unsuitable navigation
    "Shadowstories", # unsuitable navigation
    "Sigh", # unsuitable navigation
    "Sleazyspacesage", # unsuitable navigation
    "Slightlyeccentric", # unsuitable navigation
    "Slightlyeccentricorigins", # unsuitable navigation
    "Smbhax", # unsuitable navigation
    "SpiritSquire1", # unsuitable navigation
    "Stampedegirl", # unsuitable navigation
    "Stardustthecat", # unsuitable navigation
    "Sticklife", # unsuitable navigation
    "StickMisadventures", # unsuitable navigation
    "Stinkoman", # unsuitable navigation
    "StrangerThanFiction", # unsuitable navigation
    "SundaySmash", # unsuitable navigation
    "Superproultimatewrestling", # unsuitable navigation
    "Sweetcheeriosandorangejuice", # unsuitable navigation
    "Synapticisms", # unsuitable navigation
    "Talesofspoons", # unsuitable navigation
    "Terwilligers", # unsuitable navigation
    "Thedevilshorn", # unsuitable navigation
    "TheEntity", # unsuitable navigation
    "Theworldjumper", # unsuitable navigation
    "TheWorldofUh", # unsuitable navigation
    "Thewriter13", # unsuitable navigation
    "ToC", # unsuitable navigation
    "TOGM", # unsuitable navigation
    "Townburgcity", # unsuitable navigation
    "Tuhinaloota", # unsuitable navigation
    "Tezzleandzeek", # unsuitable navigation
    "TheDragonFistsofSmortySmythe", # unsuitable navigation
    "Theredeemers", # unsuitable navigation
    "Thestickmen", # unsuitable navigation
    "Thingsthatannoyme", # unsuitable navigation
    "ThornsInOurSide", # unsuitable navigation
    "Two_Rooks", # unsuitable navigation
    "Unichat", # unsuitable navigation
    "UFPA", # unsuitable navigation
    "V4", # unsuitable navigation
    "Verboten", # unsuitable navigation
    "Warg", # unsuitable navigation
    "Warrior27", # unsuitable navigation
    "Wastedpotential", # unsuitable navigation
    "Wcf", # unsuitable navigation
    "Whoseline", # unsuitable navigation
    "WindRiders", # unsuitable navigation
    "WitchesTeaParty", # unsuitable navigation
    "Woohooligan", # unsuitable navigation
    "Xenozone", # unsuitable navigation
    "XWingAlliance", # unsuitable navigation
    "Yppcomic", # unsuitable navigation
    "Zeroeffort", # unsuitable navigation
 ]
@@ -255,35 +120,32 @@ def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
-        data = getPageContent(url, session)
+        data = html.document_fromstring(getPageContent(url, session))
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
-    for match in url_matcher.finditer(data):
+
-        comicurl = match.group(1)
+    num = 999
-        name = format_name(comicurl.split('.', 1)[0][7:])
+    for comicdiv in data.cssselect('div.searchresult'):
-        if name in exclude_comics:
+        comiclink = comicdiv.cssselect('h3 a')[0]
-            continue
+        comicurl = comiclink.attrib['href']
        name = format_name(comiclink.text)
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
+            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        info = comicdiv.cssselect('span.comicinfo')
        # find out how many images this comic has
-        end = match.end()
+        num = int(info[1].text.strip())
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # find activity
-        mo = activity_matcher.search(data[end:])
+        active = info[6].text.strip().lower() == "active"
-        if not mo:
+        lang = info[7].text.strip().lower()
-            print("ERROR matching activity:", repr(data[end:end+300]), file=sys.stderr)
+        res[name] = [comicurl, num, active, lang]
-            continue
+
-        active = mo.group(1).lower() == "active"
+    return num
        res[name] = [comicurl, num, active]
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)
 def get_results():
@@ -291,15 +153,21 @@ def get_results():
    # store info in a dictionary {name -> shortname}
    res = {}
    session = requests.Session()
-    baseUrl = 'http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1&all_la=1&page='
+    # Sort by page count, so we can abort when we get under some threshold.
-    pages = 382
+    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
-    for i in range(1, pages+1):
+               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
-        url = baseUrl + str(i)
+               '&all_st=1&all_la=1&page=%d')
-        handle_url(url, session, res)
+    last_count = 999
    page = 1
    print("Parsing search result pages...", file=sys.stderr)
    while last_count >= MIN_COMICS:
        last_count = handle_url(baseUrl % page, session, res)
        page += 1
        print(last_count, file=sys.stderr, end=" ")
    save_result(res, json_file)
-def has_comic(name):
+def find_dups(name):
    """Check if comic name already exists."""
    names = [
        ("Creators/%s" % name).lower(),
@@ -312,28 +180,29 @@ def has_comic(name):
    for scraperclass in get_scraperclasses():
        lname = scraperclass.getName().lower()
        if lname in names:
-            return True
+            return scraperclass.getName().lower()
-    return False
+    return None
 def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
+    """Print all comics that have at least the given number of minimum
    comic strips."""
    min_comics, filename = args
    min_comics = int(min_comics)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, entry in sorted(load_result(json_file).items()):
            url, num, active, lang = entry
            if name in exclude_comics:
                fp.write(u"# %s is excluded\n" % name)
                continue
            url, num, active = entry
            if num < min_comics:
                continue
-            if has_comic(name):
+            dup = find_dups(name)
-                prefix = u'#'
+            if dup is not None:
                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
            else:
-                prefix = u''
+                fp.write(u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (
-            fp.write(u"%sadd(%r, %r)\n" % (
+                         truncate_name(name), str(url)))
              prefix, str(truncate_name(name)), str(url)
            ))
 if __name__ == '__main__':