pylookup.py

#!/usr/bin/env python2

"""
Pylookup is to lookup entries from python documentation, especially within
emacs. Pylookup adopts most of ideas from haddoc, lovely toolkit by Martin
Blais.

(usage)
  ./pylookup.py -l ljust
  ./pylookup.py -u http://docs.python.org

"""

from __future__ import with_statement

import os
import sys
import re
try:
    import cPickle as pickle
except:
    import pickle
import formatter

from os.path import join, dirname, exists, abspath, expanduser
from contextlib import closing

if sys.version_info[0] == 3:
    import html.parser    as htmllib
    import urllib.parse   as urlparse
    import urllib.request as urllib
else:
    import htmllib, urllib, urlparse

VERBOSE = False
FORMATS = {
             "Emacs" : "{entry}\t({desc})\t[{book}];{url}",
             "Terminal" : "{entry}\t({desc})\t[{book}]\n{url}"
           }

def build_book(s, num):
    """
    Build book identifier from `s`, with `num` links.
    """
    for matcher, replacement in (("library", "lib"),
                                ("c-api", "api"),
                                ("reference", "ref"),
                                ("", "etc")):
        if matcher in s:
            return replacement if num == 1 else "%s/%d" % (replacement, num)

def trim(s):
    """
    Add any globle filtering rules here
    """
    s = s.replace( "Python Enhancement Proposals!", "")
    s = s.replace( "PEP ", "PEP-")
    return s

class Element(object):
    def __init__(self, entry, desc, book, url):
        self.book = book
        self.url = url
        self.desc = desc
        self.entry = entry

    def __format__(self, format_spec):
        return format_spec.format(entry=self.entry, desc=self.desc,
                                  book=self.book, url=self.url)

    def match_insensitive(self, key):
        """
        Match key case insensitive against entry and desc.

        `key` : Lowercase string.
        """
        return key in self.entry.lower() or key in self.desc.lower()

    def match_sensitive(self, key):
        """
        Match key case sensitive against entry and desc.

        `key` : Lowercase string.
        """
        return key in self.entry or key in self.desc

    def match_in_entry_insensitive(self, key):
        """
        Match key case insensitive against entry.

        `key` : Lowercase string.
        """
        return key in self.entry.lower()

    def match_in_entry_sensitive(self, key):
        """
        Match key case sensitive against entry.

        `key` : Lowercase string.
        """
        return key in self.entry


def get_matcher(insensitive=True, desc=True):
    """
    Get `Element.match_*` function.

    >>> get_matcher(0, 0)
    <unbound method Element.match_in_entry_sensitive>
    >>> get_matcher(1, 0)
    <unbound method Element.match_in_entry_insensitive>
    >>> get_matcher(0, 1)
    <unbound method Element.match_sensitive>
    >>> get_matcher(1, 1)
    <unbound method Element.match_insensitive>

    """
    _sensitive = "_insensitive" if insensitive else "_sensitive"
    _in_entry = "" if desc else "_in_entry"
    return getattr(Element, "match{0}{1}".format(_in_entry, _sensitive))


class IndexProcessor( htmllib.HTMLParser ):
    """
    Extract the index links from a Python HTML documentation index.
    """

    def __init__( self, writer, dirn):
        htmllib.HTMLParser.__init__( self, formatter.NullFormatter() )

        self.writer     = writer
        self.dirn       = dirn
        self.entry      = ""
        self.desc       = ""
        self.list_entry = False
        self.one_entry  = False
        self.num_of_a   = 0
        self.desc_cnt   = 0

    def start_dd( self, att ):
        self.list_entry = True

    def end_dd( self ):
        self.list_entry = False

    def start_li( self, att ):
        self.one_entry = True
        self.num_of_a = 0

    def start_a(self, att):
        if self.one_entry:
            self.url = join(self.dirn, dict(att)['href'])
            self.save_bgn()

    def end_a(self):
        global VERBOSE
        if self.one_entry:
            if self.num_of_a == 0 :
                self.desc = self.save_end()

                if VERBOSE:
                    self.desc_cnt += 1
                    if self.desc_cnt % 100 == 0:
                        sys.stdout.write("%04d %s\r" \
                                             % (self.desc_cnt, self.desc.ljust(80)))

                # extract first element
                #  ex) __and__() (in module operator)
                if not self.list_entry :
                    self.entry = re.sub( "\([^)]+\)", "", self.desc )

                    # clean up PEP
                    self.entry = trim(self.entry)

                    match = re.search( "\([^)]+\)", self.desc )
                    if match :
                        self.desc = match.group(0)

                self.desc = trim(re.sub( "[()]", "", self.desc ))

            self.num_of_a += 1
            book = build_book(self.url, self.num_of_a)
            e = Element(self.entry, self.desc, book, self.url)

            self.writer(e)

def update(db, urls, append=False):
    """Update database with entries from urls.

    `db` : filename to database
    `urls` : list of URL
    `append` : append to db
    """
    mode = "ab" if append else "wb"
    with open(db, mode) as f:
        writer = lambda e: pickle.dump(e, f)
        for url in urls:
            # detech 'file' or 'url' schemes
            parsed = urlparse.urlparse(url)
            if not parsed.scheme or parsed.scheme == "file":
                dst = abspath(expanduser(parsed.path))
                if not os.path.exists(dst):
                    print("Error: %s doesn't exist" % dst)
                    exit(1)
                url = "file://%s" % dst
            else:
                url = parsed.geturl()

            potential_urls = []
            if url.endswith('.html'):
                potential_urls.append(url)
            else:
                # guess index URLs
                # for stdlib, this is genindex-all.html
                # for django, numpy, etc. it's genindex.html
                url = url.rstrip("/")
                potential_urls.append(url + "/genindex-all.html")
                potential_urls.append(url + "/genindex.html")

            success = False
            for index_url in potential_urls:
                try:
                    print "Wait for a few seconds..."
                    print "Fetching index from '%s'" % index_url

                    index = urllib.urlopen(index_url).read()
                    if not issubclass(type(index), str):
                        index = index.decode()

                    parser = IndexProcessor(writer, dirname(index_url))
                    with closing(parser):
                        parser.feed(index)

                    # success, we don't need to try other potential urls
                    print "Loaded index from '%s'" % index_url
                    success = True
                    break
                except IOError:
                    print "Error: fetching file from '%s'" % index_url

            if not success:
                print "Failed to load index for input '%s'" % url


def lookup(db, key, format_spec, out=sys.stdout, insensitive=True, desc=True):
    """Lookup key from database and print to out.

    `db` : filename to database
    `key` : key to lookup
    `out` : file-like to write to
    `insensitive` : lookup key case insensitive
    """
    matcher = get_matcher(insensitive, desc)
    if insensitive:
        key = key.lower()
    with open(db, "rb") as f:
        try:
            while True:
                e = pickle.load(f)
                if matcher(e, key):
                    out.write('%s\n' % format(e, format_spec))
        except EOFError:
            pass

def cache(db, out=sys.stdout):
    """Print unique entries from db to out.

    `db` : filename to database
    `out` : file-like to write to
    """
    with open(db, "rb") as f:
        keys = set()
        try:
            while True:
                e = pickle.load(f)
                k = e.entry
                k = re.sub( "\([^)]*\)", "", k )
                k = re.sub( "\[[^]]*\]", "", k )
                keys.add(k)
        except EOFError:
            pass
        for k in keys:
            out.write('%s\n' % k)

if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser( __doc__.strip() )
    parser.add_option( "-d", "--db",
                       help="database name",
                       dest="db", default="pylookup.db" )
    parser.add_option( "-l", "--lookup",
                       help="keyword to search",
                       dest="key" )
    parser.add_option( "-u", "--update",
                       help="update url or path",
                       action="append", type="str", dest="url" )
    parser.add_option( "-c", "--cache" ,
                       help="extract keywords, internally used",
                       action="store_true", default=False, dest="cache")
    parser.add_option( "-a", "--append",
                       help="append to the db from multiple sources",
                       action="store_true", default=False, dest="append")
    parser.add_option( "-f", "--format",
                       help="type of output formatting, valid: Emacs, Terminal",
                       choices=["Emacs", "Terminal"],
                       default="Terminal", dest="format")
    parser.add_option( "-i", "--insensitive", default=1, choices=['0', '1'],
                       help="SEARCH OPTION: insensitive search "
                       "(valid: 0, 1; default: %default)")
    parser.add_option( "-s", "--desc", default=1, choices=['0', '1'],
                       help="SEARCH OPTION: include description field "
                       "(valid: 0, 1; default: %default)")
    parser.add_option("-v", "--verbose",
                      help="verbose", action="store_true",
                      dest="verbose", default=False)
    ( opts, args ) = parser.parse_args()

    VERBOSE = opts.verbose
    if opts.url:
        update(opts.db, opts.url, opts.append)
    if opts.cache:
        cache(opts.db)
    if opts.key:
        lookup(opts.db, opts.key, FORMATS[opts.format],
               insensitive=int(opts.insensitive), desc=int(opts.desc))