# dosage/scripts/mkdescription.py
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam
from __future__ import print_function
import sys
import os
# for dosage import
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.scraper import get_scraperclasses
from scriptutil import save_result, load_result
from bs4 import BeautifulSoup
import requests
# User-Agent: Iceweasel (Firefox) 15.02 (Debian)
# Sent with every request so sites serve the same content as a real browser.
UserAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20120724 Debian Iceweasel/15.02"
# Result cache lives next to this script (e.g. mkdescription.json).
json_file = __file__.replace(".py", ".json")
def get_scraper_url(scraperclass):
    """Return the scraper's base URL if it defines one, else its starter URL."""
    try:
        return scraperclass.baseUrl
    except AttributeError:
        return scraperclass.url
def classname(clazz):
    """Return the bare name of *clazz* (used as sort key and dict key)."""
    name = clazz.__name__
    return name
def elem_text(elem, sep=u" "):
    """Join all whitespace-stripped text fragments of a BeautifulSoup
    element with *sep*."""
    fragments = list(elem.stripped_strings)
    return sep.join(fragments)
def get_description(url, lang):
    """Fetch *url* and extract a short page description.

    Tries, in order: the og:description meta tag, any meta name="description"
    tag, and finally the page <title> text.
    Returns None on a request error or when nothing was found, and u"" on a
    non-OK HTTP status.  The *lang* argument is currently unused.
    """
    headers = {'User-Agent': UserAgent}
    try:
        req = requests.get(url, headers=headers)
    except Exception as msg:
        print("Error: %s" % msg)
        return None
    if req.status_code != requests.codes.ok:
        print("WARN: HTTP %d" % req.status_code)
        return u""
    # Name the parser explicitly: without it bs4 picks whatever parser is
    # installed (and warns), making results machine-dependent.
    doc = BeautifulSoup(req.text, "html.parser")
    elem = doc.find("meta", dict(property="og:description"))
    # Guard the attribute access: a tag without a content attribute would
    # otherwise raise KeyError.
    if elem and elem.has_attr("content"):
        return elem["content"]
    for elem in doc.find_all("meta", dict(name="description")):
        # BUG FIX: the original '"content" in elem' tested the tag's
        # *children* (Tag.__contains__ checks .contents), not its attributes,
        # so this branch could never match; check the attribute instead.
        if elem.has_attr("content"):
            return elem["content"]
    elem = doc.find('title')
    if elem:
        return elem_text(elem)
    return None
def main(args):
    """Get scraper descriptions from google results."""
    # Resume from a previous run if a cache file exists.
    result = load_result(json_file) if os.path.isfile(json_file) else {}
    # Optional first CLI argument: class name to start from (skip earlier ones).
    tofind = args[0] if args else None
    for scraperclass in sorted(get_scraperclasses(), key=classname):
        key = classname(scraperclass)
        if tofind and key != tofind:
            continue
        tofind = None
        # Skip sub-scrapers (names containing an underscore).
        if '_' in key:
            continue
        print(key)
        # Nothing to do when a description is already known or cached.
        if scraperclass.description or key in result:
            continue
        url = get_scraper_url(scraperclass)
        print(url)
        description = get_description(url, scraperclass.lang)
        if not description:
            print("No description found")
            continue
        print(description)
        # Persist after every hit so an interrupted run loses nothing.
        result[key] = dict(description=description,
                           module=scraperclass.__module__, url=url)
        save_result(result, json_file)
    return 0
# Script entry point: forward CLI args (optional class name to resume from)
# and exit with main()'s status code.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))