pinyin-comp

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
complete path by acronym of pinyin initials

a fork of chsdir by shaozx@gmail.com
(http://code.google.com/p/easyscripts/wiki/chsdir)

用拼音补全命令行中的中文名称和路径

实验目录：$ ls ./
SVN培训  全球眼  浙江建行  浙江农信

使用: (输完后按 TAB 键自动补全)
    cd S     <tab>             进入[SVN培训]
    cd q     <tab>             进入[全球眼]
    cd z     <tab>             自动补全[浙江]
    cd zj    <tab><tab>        提示[浙江建行 浙江农信]备选
    cd zj1   <tab>             进入[浙江建行]
    cd zj2   <tab>             进入[浙江农信]
    cd 浙江j <tab>             进入[浙江建行]
    cd zjj   <tab>             进入[浙江建行]


如需模糊拼音支持，定义环境变量FUZZY
例如，对 n/l 进行模糊匹配：export FUZZY="{ 'n':'l' }"
多个匹配对之间用逗号隔开： export FUZZY="{ 'n':'l','f':'p' }"

Changelog

    2009-05-08修订  取不到拼音的汉字匹配任何字符
    2009-05-07新增  名称中有中文全角,可以用英文符号补全
    2009-05-06修订  解决数字序号定位时借位的问题
    2009-08-30新增  增加多音字支持，比如“音乐”
                    ((遇到有未被收入的多音字，请邮件联系我))
    2009-08-31新增  增加模糊拼音支持，比如'n'->'l'
                    ((需手工增加环境变量，配置方法见后面说明))

    2010-11-17 fork :D

    2010-11-18 use a pre-generated pinyin table to get pinyin initials;
               it makes the logic cleaner and easier to maintain,
               though it introduces an extra delay which users can hardly feel.

"""

import os
import sys
import locale

import pinyin

# Encoding used to decode command-line text and to encode the replies.
# locale.getdefaultlocale() reports None for the encoding when it cannot be
# determined and raises ValueError for a locale name it does not recognise;
# fall back to UTF-8 in both cases so that unicodelize()/stringlize() always
# get a usable codec name instead of failing with a TypeError.
try:
    _, default_encoding = locale.getdefaultlocale()
except ValueError:
    _, default_encoding = None, None

if not default_encoding:
    default_encoding = "utf-8"


# Full-width (double-width) punctuation mapped to the single-width ASCII
# character that stands in for it when acronyms are built.
DOUBLE_WIDTH = {
    u"～": u"~",
    u"！": u"!",
    u"＠": u"@",
    u"＃": u"#",
    u"＄": u"$",
    u"％": u"%",
    u"＆": u"&",
    u"＊": u"*",
    u"（": u"(",
    u"）": u")",
    u"＿": u"_",
    u"－": u"-",
    u"＋": u"+",
    u"［": u"[",
    u"］": u"]",
    u"＜": u"<",
    u"＞": u">",
    u"？": u"?",
    u"，": u",",
    u"。": u".",
    u"／": u"/",
    # Ideographic comma. It used to map to the byte string "u" (apparently a
    # misplaced unicode prefix), which made it match the letter "u"; it now
    # maps to the ASCII comma like the full-width comma above.
    u"、": u",",
}

# fuzzy pinyin
# FUZZY maps a pinyin initial to the initial it should be treated as,
# e.g. {'n': 'l'}.  It is read from the FUZZY environment variable and stays
# empty when the variable is unset, empty, or cannot be used as a mapping.
FUZZY = {}
_fuzzy_spec = os.getenv("FUZZY")
if _fuzzy_spec:
    try:
        # SECURITY NOTE(review): eval() runs arbitrary Python taken from the
        # environment.  Kept for backward compatibility with existing
        # settings; ast.literal_eval() would be the safe replacement if only
        # dict literals need to be supported.
        FUZZY = eval(_fuzzy_spec)
        # duck-type check: a value without keys() is not a mapping
        FUZZY.keys()
    except Exception:
        # malformed or non-mapping value: disable fuzzy matching
        FUZZY = {}

def transform_double_width(uni_char):
    """Return the single-width equivalent of a double-width character.

    The lookup is done in DOUBLE_WIDTH; a character that has no entry there
    is returned unchanged.
    """
    return DOUBLE_WIDTH.get(uni_char, uni_char)

def fuzzynize(pinyin):
    """Return the fuzzy replacement configured for ``pinyin`` in FUZZY.

    A value that is not a key of FUZZY is returned unchanged.
    """
    try:
        replacement = FUZZY[pinyin]
    except KeyError:
        replacement = pinyin
    return replacement

def get_pinyin_initials(uni_char):
    """Return the pinyin initial(s) of ``uni_char`` joined into one string.

    The initials come from the ``pinyin.pinyin_initial`` table; a character
    that is not in the table is returned unchanged.
    """
    try:
        initials = pinyin.pinyin_initial[uni_char]
    except KeyError:
        return uni_char
    return "".join(initials)

def acronymize(uni_char):
    """Return the acronym text for a single unicode character.

    * A double-width character is first folded to its single-width form.
    * An ASCII character is returned as it is.
    * A character with several pinyin initials (polyphone) yields all of
      them wrapped in backticks.
    * Anything else yields its single initial passed through fuzzynize().
    """
    single_width = transform_double_width(uni_char)

    # ASCII needs no translation at all
    if single_width < u"\x80":
        return single_width

    initials = get_pinyin_initials(single_width)

    # the common case: exactly one candidate
    if len(initials) <= 1:
        return fuzzynize(initials)

    # polyphone: keep every candidate, delimited by backticks
    return u"`%s`" % initials

def get_acronym(text):
    """Return the acronym of ``text`` as a unicode string.

    The text is converted with unicodelize() and the per-character results
    of acronymize() are concatenated in order.
    """
    pieces = [acronymize(char) for char in unicodelize(text)]
    return u"".join(pieces)

def unicodelize(text):
    """Decode ``text`` to unicode with default_encoding when possible.

    Unicode input, and input that cannot be decoded, is returned unchanged.
    """
    if isinstance(text, unicode):
        return text

    try:
        decoded = unicode(text, default_encoding)
    except UnicodeDecodeError:
        return text
    return decoded

def stringlize(text):
    """Encode unicode ``text`` with default_encoding when possible.

    Non-unicode input, and input that cannot be encoded, is returned
    unchanged.
    """
    if not isinstance(text, unicode):
        return text

    try:
        encoded = text.encode(default_encoding)
    except UnicodeEncodeError:
        return text
    return encoded

def expand_leading_tilda(path):
    """Return ``path`` with a leading ``~`` or ``~user`` component expanded.

    Thin wrapper around os.path.expanduser().
    """
    expanded = os.path.expanduser(path)
    return expanded

def match_acronym(basename_acronym, entry_acronym):
    """Tell whether a typed acronym is a prefix match for an entry acronym.

    Both arguments are acronym strings in the form built by get_acronym():
    one character per source character, except that a character with
    several pinyin initials appears as those initials wrapped in backticks.

    A typed character matches an ordinary entry character when the two are
    equal or when the typed character is "?".  It matches a backtick group
    when it equals one of the initials inside the group.

    Returns True when every typed character was matched (so an empty typed
    acronym matches any entry), False otherwise.
    """
    typed_len = len(basename_acronym)
    entry_len = len(entry_acronym)
    i = j = 0

    while i < typed_len and j < entry_len:
        typed = basename_acronym[i]

        # A backtick opens a polyphone group only when a closing backtick
        # follows; a stray one is compared like any other character instead
        # of raising ValueError.
        group_end = -1
        if entry_acronym[j] == "`":
            group_end = entry_acronym.find("`", j + 1)

        if group_end != -1:
            # search strictly between the two backticks
            if entry_acronym.find(typed, j + 1, group_end) == -1:
                break
            j = group_end + 1
        else:
            if typed != entry_acronym[j] and typed != "?":
                break
            j += 1
        i += 1

    return i == typed_len


def select_reply(reply, index):
    """Return the text to print for the sorted candidate list ``reply``.

    ``index`` is the 1-based position requested through a trailing digit, or
    None.  With an index only that candidate is returned, and an index past
    the end selects the last candidate.  Without an index, or when there are
    no candidates, all candidates are returned joined by newlines.
    """
    if index and reply:
        return reply[min(index, len(reply)) - 1]
    return "\n".join(reply)


if __name__ == '__main__':

    # usage: <script> <dirattr> <already_input_part>
    if len(sys.argv) != 3:
        sys.exit(1)

    dironly = sys.argv[1]

    # drop backslashes from the typed text, expand ~ and decode it
    path = sys.argv[2].replace("\\", "")
    path = expand_leading_tilda(path)
    path = unicodelize(path)

    index = None
    effective_path = path

    # deal with special form such as 'xxx/zj1': a trailing digit 1-9 picks
    # the n-th candidate and is not part of the name being completed
    if len(path) > 1 and '0' < path[-1] <= '9':
        index = int(path[-1])
        effective_path = path[:-1]

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)
    effective_basename = os.path.basename(effective_path)

    if not dirname:
        dirname = u"./"

    # get all top-level subentries (non-recursive)
    try:
        entries = os.listdir(dirname)
    except OSError:
        sys.exit(0)

    # if an entry with the exact basename already exists, do nothing
    if basename in entries or effective_basename in entries:
        sys.exit(0)

    # Support fuzzy pinyin.  Only the part being completed is fuzzied: the
    # directory part must stay exactly as typed because it is handed to
    # os.listdir() and compared with real entry names above.
    typed = "".join([fuzzynize(x) for x in effective_basename])
    basename_acronym = get_acronym(typed).replace("\\", "")

    reply = []

    for entry in entries:

        try:
            entry_acronym = get_acronym(entry).replace("\\", "")
        except UnicodeError:
            # a name that cannot be converted must not abort the whole
            # completion; just leave it out
            continue

        # skip entries whose acronym is the name itself: there is nothing
        # to translate, so they are not offered as candidates here
        if entry_acronym == entry:
            continue

        if not match_acronym(basename_acronym, entry_acronym):
            continue

        candidate = os.path.join(dirname, entry)
        # Hide only a leading "./".  Replacing every "./" would also damage
        # paths such as "../name" or directories whose name ends with a dot.
        if candidate.startswith("./"):
            candidate = candidate[2:]

        # if the caller is only interested in folders
        if dironly == "x-d" and not os.path.isdir(candidate):
            continue

        reply.append(candidate)

    try:
        locale.setlocale(locale.LC_ALL, "")
    except locale.Error:
        # the configured locale is not supported; sort with whatever locale
        # is already active
        pass

    # when dealing with the outer world, always use the native encoding
    reply = [stringlize(x) for x in reply]
    reply.sort(key=locale.strxfrm)

    print ( select_reply(reply, index) )