diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f64ceca --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +tokenizers/jieba.beleve/ \ No newline at end of file diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go index cf68075..92bd5fe 100644 --- a/analyse/tag_extracker.go +++ b/analyse/tag_extracker.go @@ -74,7 +74,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error { func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { freqMap := make(map[string]float64) - for w := range t.seg.Cut(sentence, true) { + for _, w := range t.seg.Cut(sentence, true) { w = strings.TrimSpace(w) if utf8.RuneCountInString(w) < 2 { continue diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go index d09a139..ae6679c 100644 --- a/finalseg/finalseg.go +++ b/finalseg/finalseg.go @@ -10,88 +10,86 @@ var ( reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`) ) -func cutHan(sentence string) chan string { - result := make(chan string) - go func() { - runes := []rune(sentence) - _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) - begin, next := 0, 0 - for i, char := range runes { - pos := posList[i] - switch pos { - case 'B': - begin = i - case 'E': - result <- string(runes[begin : i+1]) - next = i + 1 - case 'S': - result <- string(char) - next = i + 1 - } - } - if next < len(runes) { - result <- string(runes[next:]) +func cutHan(sentence string) []string { + result := make([]string, 0, 10) + + runes := []rune(sentence) + _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'}) + begin, next := 0, 0 + for i, char := range runes { + pos := posList[i] + switch pos { + case 'B': + begin = i + case 'E': + result = append(result, string(runes[begin:i+1])) + next = i + 1 + case 'S': + result = append(result, string(char)) + next = i + 1 } - close(result) - }() + } + if next < len(runes) { + result = append(result, string(runes[next:])) + } + return result } // Cut cuts sentence into words using Hidden Markov Model with Viterbi // algorithm. It is used by Jiebago for unknonw words. -func Cut(sentence string) chan string { - result := make(chan string) +func Cut(sentence string) []string { + result := make([]string, 0, 10) s := sentence var hans string var hanLoc []int var nonhanLoc []int - go func() { - for { - hanLoc = reHan.FindStringIndex(s) - if hanLoc == nil { - if len(s) == 0 { - break - } - } else if hanLoc[0] == 0 { - hans = s[hanLoc[0]:hanLoc[1]] - s = s[hanLoc[1]:] - for han := range cutHan(hans) { - result <- han - } - continue + + for { + hanLoc = reHan.FindStringIndex(s) + if hanLoc == nil { + if len(s) == 0 { + break } - nonhanLoc = reSkip.FindStringIndex(s) - if nonhanLoc == nil { - if len(s) == 0 { - break - } - } else if nonhanLoc[0] == 0 { - nonhans := s[nonhanLoc[0]:nonhanLoc[1]] - s = s[nonhanLoc[1]:] - if nonhans != "" { - result <- nonhans - continue - } + } else if hanLoc[0] == 0 { + hans = s[hanLoc[0]:hanLoc[1]] + s = s[hanLoc[1]:] + for _, han := range cutHan(hans) { + result = append(result, han) } - var loc []int - if hanLoc == nil && nonhanLoc == nil { - if len(s) > 0 { - result <- s - break - } - } else if hanLoc == nil { - loc = nonhanLoc - } else if nonhanLoc == nil { - loc = hanLoc - } else if hanLoc[0] < nonhanLoc[0] { - loc = hanLoc - } else { - loc = nonhanLoc + continue + } + nonhanLoc = reSkip.FindStringIndex(s) + if nonhanLoc == nil { + if len(s) == 0 { + break + } + } else if nonhanLoc[0] == 0 { + nonhans := s[nonhanLoc[0]:nonhanLoc[1]] + s = s[nonhanLoc[1]:] + if nonhans != "" { + result = append(result, nonhans) + continue } - result <- s[:loc[0]] - s = s[loc[0]:] } - close(result) - }() + var loc []int + if hanLoc == nil && nonhanLoc == nil { + if len(s) > 0 { + result = append(result, s) + break + } + } else if hanLoc == nil { + loc = nonhanLoc + } else if nonhanLoc == nil { + loc = hanLoc + } else if hanLoc[0] < nonhanLoc[0] { + loc = hanLoc + } else { + loc = nonhanLoc + } + result = append(result, s[:loc[0]]) + s = s[loc[0]:] + } + return result } diff --git a/jieba.go b/jieba.go index efa04c1..169bc47 100644 --- a/jieba.go +++ b/jieba.go @@ -72,7 +72,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 { } } else { word := words[0] - for segment := range seg.Cut(word, false) { + for _, segment := range seg.Cut(word, false) { if freq, ok := seg.dict.Frequency(segment); ok { frequency *= freq } @@ -165,95 +165,98 @@ func (seg *Segmenter) calc(runes []rune) map[int]route { return rs } -type cutFunc func(sentence string) <-chan string - -func (seg *Segmenter) cutDAG(sentence string) <-chan string { - result := make(chan string) - go func() { - runes := []rune(sentence) - routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 - frag := runes[x:y] - if y-x == 1 { - buf = append(buf, frag...) - } else { - if len(buf) > 0 { - bufString := string(buf) - if len(buf) == 1 { - result <- bufString +// ratio words and letters in an article commonly +const ( + RatioLetterWord float32 = 1.5 + RatioLetterWordFull float32 = 1 +) + +type cutFunc func(sentence string) []string + +func (seg *Segmenter) cutDAG(sentence string) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) + + runes := []rune(sentence) + routes := seg.calc(runes) + var y int + length := len(runes) + var buf []rune + for x := 0; x < length; { + y = routes[x].index + 1 + frag := runes[x:y] + if y-x == 1 { + buf = append(buf, frag...) + } else { + if len(buf) > 0 { + bufString := string(buf) + if len(buf) == 1 { + result = append(result, bufString) + } else { + if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { + for _, x := range finalseg.Cut(bufString) { + result = append(result, x) + } } else { - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { - for x := range finalseg.Cut(bufString) { - result <- x - } - } else { - for _, elem := range buf { - result <- string(elem) - } + for _, elem := range buf { + result = append(result, string(elem)) } } - buf = make([]rune, 0) } - result <- string(frag) + buf = make([]rune, 0) } - x = y + result = append(result, string(frag)) } + x = y + } - if len(buf) > 0 { - bufString := string(buf) - if len(buf) == 1 { - result <- bufString + if len(buf) > 0 { + bufString := string(buf) + if len(buf) == 1 { + result = append(result, bufString) + } else { + if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { + for _, t := range finalseg.Cut(bufString) { + result = append(result, t) + } } else { - if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { - for t := range finalseg.Cut(bufString) { - result <- t - } - } else { - for _, elem := range buf { - result <- string(elem) - } + for _, elem := range buf { + result = append(result, string(elem)) } } } - close(result) - }() + } + return result } -func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { - result := make(chan string) - - go func() { - runes := []rune(sentence) - routes := seg.calc(runes) - var y int - length := len(runes) - var buf []rune - for x := 0; x < length; { - y = routes[x].index + 1 - frag := runes[x:y] - if reEng.MatchString(string(frag)) && len(frag) == 1 { - buf = append(buf, frag...) - x = y - continue - } - if len(buf) > 0 { - result <- string(buf) - buf = make([]rune, 0) - } - result <- string(frag) +func (seg *Segmenter) cutDAGNoHMM(sentence string) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) + + runes := []rune(sentence) + routes := seg.calc(runes) + var y int + length := len(runes) + var buf []rune + for x := 0; x < length; { + y = routes[x].index + 1 + frag := runes[x:y] + if reEng.MatchString(string(frag)) && len(frag) == 1 { + buf = append(buf, frag...) x = y + continue } if len(buf) > 0 { - result <- string(buf) + result = append(result, string(buf)) buf = make([]rune, 0) } - close(result) - }() + result = append(result, string(frag)) + x = y + } + if len(buf) > 0 { + result = append(result, string(buf)) + buf = make([]rune, 0) + } + return result } @@ -261,8 +264,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { // Parameter hmm controls whether to use the Hidden Markov Model. // Accurate mode attempts to cut the sentence into the most accurate // segmentations, which is suitable for text analysis. -func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { - result := make(chan string) +func (seg *Segmenter) Cut(sentence string, hmm bool) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) var cut cutFunc if hmm { cut = seg.cutDAG @@ -270,84 +273,80 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { cut = seg.cutDAGNoHMM } - go func() { - for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { - if len(block) == 0 { - continue + for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { + if len(block) == 0 { + continue + } + if reHanDefault.MatchString(block) { + for _, x := range cut(block) { + result = append(result, x) } - if reHanDefault.MatchString(block) { - for x := range cut(block) { - result <- x - } + continue + } + for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) { + if reSkipDefault.MatchString(subBlock) { + result = append(result, subBlock) continue } - for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) { - if reSkipDefault.MatchString(subBlock) { - result <- subBlock - continue - } - for _, r := range subBlock { - result <- string(r) - } + for _, r := range subBlock { + result = append(result, string(r)) } } - close(result) - }() + } + return result } -func (seg *Segmenter) cutAll(sentence string) <-chan string { - result := make(chan string) - go func() { - runes := []rune(sentence) - dag := seg.dag(runes) - start := -1 - ks := make([]int, len(dag)) - for k := range dag { - ks[k] = k +func (seg *Segmenter) cutAll(sentence string) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1) + + runes := []rune(sentence) + dag := seg.dag(runes) + start := -1 + ks := make([]int, len(dag)) + for k := range dag { + ks[k] = k + } + var l []int + for k := range ks { + l = dag[k] + if len(l) == 1 && k > start { + result = append(result, string(runes[k:l[0]+1])) + start = l[0] + continue } - var l []int - for k := range ks { - l = dag[k] - if len(l) == 1 && k > start { - result <- string(runes[k : l[0]+1]) - start = l[0] - continue - } - for _, j := range l { - if j > k { - result <- string(runes[k : j+1]) - start = j - } + for _, j := range l { + if j > k { + result = append(result, string(runes[k:j+1])) + start = j } } - close(result) - }() + } + return result } // CutAll cuts a sentence into words using full mode. // Full mode gets all the possible words from the sentence. // Fast but not accurate. -func (seg *Segmenter) CutAll(sentence string) <-chan string { - result := make(chan string) - go func() { - for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { - if len(block) == 0 { - continue - } - if reHanCutAll.MatchString(block) { - for x := range seg.cutAll(block) { - result <- x - } - continue - } - for _, subBlock := range reSkipCutAll.Split(block, -1) { - result <- subBlock +func (seg *Segmenter) CutAll(sentence string) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1) + + for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { + if len(block) == 0 { + continue + } + if reHanCutAll.MatchString(block) { + for _, x := range seg.cutAll(block) { + result = append(result, x) } + continue + } + for _, subBlock := range reSkipCutAll.Split(block, -1) { + result = append(result, subBlock) } - close(result) - }() + } + return result } @@ -355,26 +354,25 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string { // Search engine mode, based on the accurate mode, attempts to cut long words // into several short words, which can raise the recall rate. // Suitable for search engines. -func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { - result := make(chan string) - go func() { - for word := range seg.Cut(sentence, hmm) { - runes := []rune(word) - for _, increment := range []int{2, 3} { - if len(runes) <= increment { - continue - } - var gram string - for i := 0; i < len(runes)-increment+1; i++ { - gram = string(runes[i : i+increment]) - if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { - result <- gram - } +func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string { + result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1) + + for _, word := range seg.Cut(sentence, hmm) { + runes := []rune(word) + for _, increment := range []int{2, 3} { + if len(runes) <= increment { + continue + } + var gram string + for i := 0; i < len(runes)-increment+1; i++ { + gram = string(runes[i : i+increment]) + if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { + result = append(result, gram) } } - result <- word } - close(result) - }() + result = append(result, word) + } + return result } diff --git a/jieba_test.go b/jieba_test.go index 6203848..c7d7d72 100644 --- a/jieba_test.go +++ b/jieba_test.go @@ -619,23 +619,15 @@ func init() { seg.LoadDictionary("dict.txt") } -func chanToArray(ch <-chan string) []string { - var result []string - for word := range ch { - result = append(result, word) - } - return result -} - func TestCutDAG(t *testing.T) { - result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) + result := seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?") if len(result) != 11 { t.Fatal(result) } } func TestCutDAGNoHmm(t *testing.T) { - result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) + result := seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?") if len(result) != 11 { t.Fatal(result) } @@ -644,7 +636,7 @@ func TestCutDAGNoHmm(t *testing.T) { func TestDefaultCut(t *testing.T) { var result []string for index, content := range testContents { - result = chanToArray(seg.Cut(content, true)) + result = seg.Cut(content, true) if len(result) != len(defaultCutResult[index]) { t.Errorf("default cut for %s length should be %d not %d\n", content, len(defaultCutResult[index]), len(result)) @@ -662,7 +654,7 @@ func TestDefaultCut(t *testing.T) { func TestCutAll(t *testing.T) { var result []string for index, content := range testContents { - result = chanToArray(seg.CutAll(content)) + result = seg.CutAll(content) if len(result) != len(cutAllResult[index]) { t.Errorf("cut all for %s length should be %d not %d\n", content, len(cutAllResult[index]), len(result)) @@ -680,7 +672,7 @@ func TestCutAll(t *testing.T) { func TestDefaultCutNoHMM(t *testing.T) { var result []string for index, content := range testContents { - result = chanToArray(seg.Cut(content, false)) + result = seg.Cut(content, false) if len(result) != len(defaultCutNoHMMResult[index]) { t.Fatalf("default cut no hmm for %s length should be %d not %d\n", content, len(defaultCutNoHMMResult[index]), len(result)) @@ -696,7 +688,7 @@ func TestDefaultCutNoHMM(t *testing.T) { func TestCutForSearch(t *testing.T) { var result []string for index, content := range testContents { - result = chanToArray(seg.CutForSearch(content, true)) + result = seg.CutForSearch(content, true) if len(result) != len(cutForSearchResult[index]) { t.Fatalf("cut for search for %s length should be %d not %d\n", content, len(cutForSearchResult[index]), len(result)) @@ -708,7 +700,7 @@ func TestCutForSearch(t *testing.T) { } } for index, content := range testContents { - result = chanToArray(seg.CutForSearch(content, false)) + result = seg.CutForSearch(content, false) if len(result) != len(cutForSearchNoHMMResult[index]) { t.Fatalf("cut for search no hmm for %s length should be %d not %d\n", content, len(cutForSearchNoHMMResult[index]), len(result)) @@ -725,7 +717,7 @@ func TestLoadDictionary(t *testing.T) { var result []string seg.LoadDictionary("foobar.txt") for index, content := range testContents { - result = chanToArray(seg.Cut(content, true)) + result = seg.Cut(content, true) if len(result) != len(userDictCutResult[index]) { t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n", content, len(userDictCutResult[index]), len(result)) @@ -745,7 +737,7 @@ func TestLoadUserDictionary(t *testing.T) { sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"} - words := chanToArray(seg.Cut(sentence, true)) + words := seg.Cut(sentence, true) if len(words) != len(result) { t.Fatal(len(words)) } @@ -757,7 +749,7 @@ func TestLoadUserDictionary(t *testing.T) { sentence = "easy_install is great" result = []string{"easy_install", " ", "is", " ", "great"} - words = chanToArray(seg.Cut(sentence, true)) + words = seg.Cut(sentence, true) if len(words) != len(result) { t.Fatal(len(words)) } @@ -769,7 +761,7 @@ func TestLoadUserDictionary(t *testing.T) { sentence = "python 的正则表达式是好用的" result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"} - words = chanToArray(seg.Cut(sentence, true)) + words = seg.Cut(sentence, true) if len(words) != len(result) { t.Fatal(words) t.Fatal(result) @@ -786,7 +778,7 @@ func BenchmarkCutNoHMM(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.Cut(sentence, false)) + seg.Cut(sentence, false) } } @@ -794,7 +786,7 @@ func BenchmarkCut(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.Cut(sentence, true)) + seg.Cut(sentence, true) } } @@ -802,7 +794,7 @@ func BenchmarkCutAll(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.CutAll(sentence)) + seg.CutAll(sentence) } } @@ -810,7 +802,7 @@ func BenchmarkCutForSearchNoHMM(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.CutForSearch(sentence, false)) + seg.CutForSearch(sentence, false) } } @@ -818,6 +810,6 @@ func BenchmarkCutForSearch(b *testing.B) { sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" b.ResetTimer() for i := 0; i < b.N; i++ { - chanToArray(seg.CutForSearch(sentence, true)) + seg.CutForSearch(sentence, true) } } diff --git a/tokenizers/analyzer.go b/tokenizers/analyzer.go new file mode 100644 index 0000000..6af133a --- /dev/null +++ b/tokenizers/analyzer.go @@ -0,0 +1,30 @@ +package tokenizers + +import ( + "errors" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +type JiebaAnalyzer struct { +} + +func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + tokenizerName, ok := config["tokenizer"].(string) + if !ok { + return nil, errors.New("must specify tokenizer") + } + tokenizer, err := cache.TokenizerNamed(tokenizerName) + if err != nil { + return nil, err + } + alz := &analysis.Analyzer{ + Tokenizer: tokenizer, + } + return alz, nil +} + +func init() { + registry.RegisterAnalyzer("jieba", analyzerConstructor) +} diff --git a/tokenizers/example_bleve_test.go b/tokenizers/example_bleve_test.go index f5da6b4..766b566 100644 --- a/tokenizers/example_bleve_test.go +++ b/tokenizers/example_bleve_test.go @@ -101,26 +101,26 @@ func Example_beleveSearch() { // Output: // Result of "水果世博园": 2 matches: // 1. Doc 3, (1.099550) - // Name: 买水果然后来世博园。 + // Name: 买水果然后来世博园。 // 2. Doc 2, (0.031941) - // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 + // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 // Result of "你": 1 matches: // 1. Doc 2, (0.391161) - // Name: The second one 中文测试中文 is even more interesting! 吃水果 + // Name: The second one 中文测试中文 is even more interesting! 吃水果 // Result of "first": 1 matches: // 1. Doc 1, (0.512150) - // Name: This is the first document we’ve added + // Name: This is the first document we’ve added // Result of "中文": 1 matches: // 1. Doc 2, (0.553186) - // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 + // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 // Result of "交换机": 2 matches: // 1. Doc 4, (0.608495) - // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 + // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 // 2. Doc 5, (0.086700) - // Name: 咱俩交换一下吧。 + // Name: 咱俩交换一下吧。 // Result of "交换": 2 matches: // 1. Doc 5, (0.534158) - // Name: 咱俩交换一下吧。 + // Name: 咱俩交换一下吧。 // 2. Doc 4, (0.296297) - // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 + // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 } diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go index 6311b5d..fad5c73 100644 --- a/tokenizers/tokenizer.go +++ b/tokenizers/tokenizer.go @@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { pos := 1 var width int var gram string - for word := range jt.seg.Cut(string(input), jt.hmm) { + for _, word := range jt.seg.Cut(string(input), jt.hmm) { if jt.searchMode { runes := []rune(word) width = len(runes)