2013-02-13 16:53:36 +00:00
|
|
|
#!/usr/bin/env python
|
2014-01-05 15:50:57 +00:00
|
|
|
# Copyright (C) 2013-2014 Bastian Kleineidam
|
2016-03-16 23:44:06 +00:00
|
|
|
# Copyright (C) 2016 Tobias Gruetzmacher
|
2013-02-13 16:53:36 +00:00
|
|
|
"""
|
2016-03-16 23:44:06 +00:00
|
|
|
Script to get ComicFury comics and save the info in a JSON file for further
|
|
|
|
processing.
|
2013-02-13 16:53:36 +00:00
|
|
|
"""
|
2016-03-16 23:44:06 +00:00
|
|
|
from __future__ import print_function, absolute_import
|
2013-05-22 20:29:03 +00:00
|
|
|
import codecs
|
2013-02-13 16:53:36 +00:00
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import requests
|
2016-03-16 23:44:06 +00:00
|
|
|
from lxml import html
|
|
|
|
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
|
2013-02-13 16:53:36 +00:00
|
|
|
from dosagelib.util import getPageContent
|
|
|
|
from dosagelib.scraper import get_scraperclasses
|
2016-03-16 23:44:06 +00:00
|
|
|
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
|
|
|
truncate_name, format_name)
|
2013-02-13 16:53:36 +00:00
|
|
|
|
2016-03-16 23:44:06 +00:00
|
|
|
# Absolute minimum number of pages a comic may have (restrict search space)
MIN_COMICS = 90

# JSON result file lives next to this script, with the same basename.
json_file = __file__.replace(".py", ".json")
|
2013-02-13 16:53:36 +00:00
|
|
|
|
|
|
|
# names of comics to exclude from the generated module
# (grouped by the reason they cannot be scraped)
exclude_comics = [
    # unsuitable navigation
    "AlfdisAndGunnora",
    "AnAmericanNerdinAnimatedTokyo",
    "AngryAlien",
    "BoozerAndStoner",
    "Bonejangles",
    "ConradStory",
    "Crossing",
    "ChristianHumberReloaded",
    "CorkandBlotto",
    "Democomix",
    "ErraticBeatComics",
    "EnergyWielders",
    "EvilBearorg",
    "Fiascos",
    "FateoftheBlueStar",
    "FPK",
    "Fanartgyle",
    "FrigginRandom",
    "GoodbyeKitty",
    "HighlyExperiMental",
    "IfAndCanBeFlowers",
    "JournalismStory",
    "JohnsonSuperior",
    "Keel",
    "JudgeDredBasset",
    "LomeathAndHuilii",
    "MNPB",
    "LucidsDream",
    "MadDog",
    "Minebreakers",
    "Moonlightvalley",
    "MyImmortalFool",
    "NATO",
    "NothingFits",
    "OptimisticFishermenandPessimisticFishermen",
    "Old2G",
    "NothingFitsArtBlog",
    "OutToLunchTheStingRayWhoreStory",
    "Pandemonium",
    "Pewfell",
    "ProjectX",
    "Ratantia",
    "RealLifeTrips",
    "Sandgate",
    "Secondpuberty",
    "Seconds",
    "SlightlyEccentricOrigins",
    "StardusttheCat",
    "StrangerthanFiction",
    "TalamakGreatAdventure",
    "TheBattalion",
    "TheDailyProblem",
    "TheMansionofE",
    "ThePainter",
    "TheSeekers",
    "TheTrialsofKlahadoftheAbyss",
    "TheStickmen",
    "ThornsInOurSide",
    "TopHeavyVeryBustyPinUpsForAdults",
    "USBUnlimitedsimulatedbody",
    "TylerHumanRecycler",
    "UAF",
    "WhenPigsFly",
    "YeOldeLegotimeTheatre",

    # no content
    "Angst",

    # images gone
    "BaseballCapsandTiaras",
    "CROSSWORLDSNEXUS",
    "Fathead",
    "KevinZombie",
    "KindergardenCrisIs",
    "NoSongsForTheDead",
    "RequiemShadowbornPariah",
    "TezzleandZeek",

    # broken HTML
    "CrossingOver",

    # unique html
    "IKilledtheHero",
    "PowerofPower",
    "Schizmatic",
    "WaketheSleepers",
    "WeightofEternity",
]
|
|
|
|
|
|
|
|
|
|
|
|
def handle_url(url, session, res):
    """Parse one search result page.

    Adds an entry ``[comicurl, num, active, lang]`` to *res* for every
    comic found on the page and returns the strip count of the last comic
    seen.  Since the search is sorted by page count (descending), that
    value is the page minimum and lets the caller stop paging early.

    @param url: URL of the search result page
    @param session: requests session used by getPageContent
    @param res: dict mapping comic name -> [url, num, active, lang]
    @return: strip count of the last comic on the page, or 0 on a fetch
      error so the caller's numeric loop condition terminates cleanly
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data = html.document_fromstring(getPageContent(url, session))
        data.make_links_absolute(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        # Return 0 instead of falling through with None: the caller
        # compares the result with MIN_COMICS, and None >= int raises
        # TypeError on Python 3.  Returning 0 stops the paging loop.
        return 0

    num = 999
    for comicdiv in data.cssselect('div.searchresult'):
        comiclink = comicdiv.cssselect('h3 a')[0]
        comicurl = comiclink.attrib['href']
        name = format_name(comiclink.text)
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue

        info = comicdiv.cssselect('span.comicinfo')
        # find out how many images this comic has
        num = int(info[1].text.strip())
        # find activity
        active = info[6].text.strip().lower() == "active"
        lang = info[7].text.strip().lower()
        res[name] = [comicurl, num, active, lang]

    return num
|
2013-02-13 16:53:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_results():
    """Parse all search result pages and save the scraped data.

    Walks the ComicFury search pages (sorted by strip count, descending)
    until a page's minimum strip count falls below MIN_COMICS, then
    stores the collected mapping in json_file via save_result().
    """
    # store info in a dictionary {name -> [url, num, active, lang]}
    res = {}
    session = requests.Session()
    # Sort by page count, so we can abort when we get under some threshold.
    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
               '&all_st=1&all_la=1&page=%d')
    last_count = 999
    page = 1
    print("Parsing search result pages...", file=sys.stderr)
    # Guard against handle_url returning None on a fetch error: on
    # Python 3, None >= int raises TypeError and would abort the run
    # without saving anything.
    while last_count is not None and last_count >= MIN_COMICS:
        last_count = handle_url(baseUrl % page, session, res)
        page += 1
        print(last_count, file=sys.stderr, end=" ")
    save_result(res, json_file)
|
|
|
|
|
|
|
|
|
2016-03-16 23:44:06 +00:00
|
|
|
def find_dups(name):
    """Check if comic name already exists.

    Looks for the given name under the prefixes of other comic modules;
    returns the lowercased scraper name of the first match, or None.
    """
    prefixes = ("Creators", "DrunkDuck", "GoComics", "KeenSpot",
                "SmackJeeves", "Arcamax")
    candidates = set(
        ("%s/%s" % (prefix, name)).lower() for prefix in prefixes)
    for scraperclass in get_scraperclasses():
        lname = scraperclass.getName().lower()
        if lname in candidates:
            return lname
    return None
|
2013-02-13 16:53:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
def print_results(args):
    """Print all comics that have at least the given number of minimum
    comic strips."""
    min_comics, filename = args
    threshold = int(min_comics)
    data = load_result(json_file)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name in sorted(data):
            url, num, active, lang = data[name]
            if name in exclude_comics:
                fp.write(u"# %s is excluded\n" % name)
                continue
            if num < threshold:
                continue
            dup = find_dups(name)
            if dup is None:
                # emit a class stub for the generated module
                fp.write(u"class CF%s(_ComicFury):\n url = %r\n\n\n" % (
                    truncate_name(name), str(url)))
            else:
                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
|
2013-02-13 16:53:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # With CLI arguments: write class stubs from the saved JSON;
    # without: scrape the search result pages and save the JSON.
    if len(sys.argv) <= 1:
        get_results()
    else:
        print_results(sys.argv[1:])
|