diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f64ceca
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+tokenizers/jieba.beleve/
\ No newline at end of file
diff --git a/analyse/tag_extracker.go b/analyse/tag_extracker.go
index cf68075..92bd5fe 100644
--- a/analyse/tag_extracker.go
+++ b/analyse/tag_extracker.go
@@ -74,7 +74,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64)
- for w := range t.seg.Cut(sentence, true) {
+ for _, w := range t.seg.Cut(sentence, true) {
w = strings.TrimSpace(w)
if utf8.RuneCountInString(w) < 2 {
continue
diff --git a/finalseg/finalseg.go b/finalseg/finalseg.go
index d09a139..ae6679c 100644
--- a/finalseg/finalseg.go
+++ b/finalseg/finalseg.go
@@ -10,88 +10,86 @@ var (
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)
-func cutHan(sentence string) chan string {
- result := make(chan string)
- go func() {
- runes := []rune(sentence)
- _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
- begin, next := 0, 0
- for i, char := range runes {
- pos := posList[i]
- switch pos {
- case 'B':
- begin = i
- case 'E':
- result <- string(runes[begin : i+1])
- next = i + 1
- case 'S':
- result <- string(char)
- next = i + 1
- }
- }
- if next < len(runes) {
- result <- string(runes[next:])
+func cutHan(sentence string) []string {
+ result := make([]string, 0, 10)
+
+ runes := []rune(sentence)
+ _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
+ begin, next := 0, 0
+ for i, char := range runes {
+ pos := posList[i]
+ switch pos {
+ case 'B':
+ begin = i
+ case 'E':
+ result = append(result, string(runes[begin:i+1]))
+ next = i + 1
+ case 'S':
+ result = append(result, string(char))
+ next = i + 1
}
- close(result)
- }()
+ }
+ if next < len(runes) {
+ result = append(result, string(runes[next:]))
+ }
+
return result
}
// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknonw words.
-func Cut(sentence string) chan string {
- result := make(chan string)
+func Cut(sentence string) []string {
+ result := make([]string, 0, 10)
s := sentence
var hans string
var hanLoc []int
var nonhanLoc []int
- go func() {
- for {
- hanLoc = reHan.FindStringIndex(s)
- if hanLoc == nil {
- if len(s) == 0 {
- break
- }
- } else if hanLoc[0] == 0 {
- hans = s[hanLoc[0]:hanLoc[1]]
- s = s[hanLoc[1]:]
- for han := range cutHan(hans) {
- result <- han
- }
- continue
+
+ for {
+ hanLoc = reHan.FindStringIndex(s)
+ if hanLoc == nil {
+ if len(s) == 0 {
+ break
}
- nonhanLoc = reSkip.FindStringIndex(s)
- if nonhanLoc == nil {
- if len(s) == 0 {
- break
- }
- } else if nonhanLoc[0] == 0 {
- nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
- s = s[nonhanLoc[1]:]
- if nonhans != "" {
- result <- nonhans
- continue
- }
+ } else if hanLoc[0] == 0 {
+ hans = s[hanLoc[0]:hanLoc[1]]
+ s = s[hanLoc[1]:]
+ for _, han := range cutHan(hans) {
+ result = append(result, han)
}
- var loc []int
- if hanLoc == nil && nonhanLoc == nil {
- if len(s) > 0 {
- result <- s
- break
- }
- } else if hanLoc == nil {
- loc = nonhanLoc
- } else if nonhanLoc == nil {
- loc = hanLoc
- } else if hanLoc[0] < nonhanLoc[0] {
- loc = hanLoc
- } else {
- loc = nonhanLoc
+ continue
+ }
+ nonhanLoc = reSkip.FindStringIndex(s)
+ if nonhanLoc == nil {
+ if len(s) == 0 {
+ break
+ }
+ } else if nonhanLoc[0] == 0 {
+ nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
+ s = s[nonhanLoc[1]:]
+ if nonhans != "" {
+ result = append(result, nonhans)
+ continue
}
- result <- s[:loc[0]]
- s = s[loc[0]:]
}
- close(result)
- }()
+ var loc []int
+ if hanLoc == nil && nonhanLoc == nil {
+ if len(s) > 0 {
+ result = append(result, s)
+ break
+ }
+ } else if hanLoc == nil {
+ loc = nonhanLoc
+ } else if nonhanLoc == nil {
+ loc = hanLoc
+ } else if hanLoc[0] < nonhanLoc[0] {
+ loc = hanLoc
+ } else {
+ loc = nonhanLoc
+ }
+ result = append(result, s[:loc[0]])
+ s = s[loc[0]:]
+ }
+
return result
}
diff --git a/jieba.go b/jieba.go
index efa04c1..169bc47 100644
--- a/jieba.go
+++ b/jieba.go
@@ -72,7 +72,7 @@ func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
}
} else {
word := words[0]
- for segment := range seg.Cut(word, false) {
+ for _, segment := range seg.Cut(word, false) {
if freq, ok := seg.dict.Frequency(segment); ok {
frequency *= freq
}
@@ -165,95 +165,98 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
return rs
}
-type cutFunc func(sentence string) <-chan string
-
-func (seg *Segmenter) cutDAG(sentence string) <-chan string {
- result := make(chan string)
- go func() {
- runes := []rune(sentence)
- routes := seg.calc(runes)
- var y int
- length := len(runes)
- var buf []rune
- for x := 0; x < length; {
- y = routes[x].index + 1
- frag := runes[x:y]
- if y-x == 1 {
- buf = append(buf, frag...)
- } else {
- if len(buf) > 0 {
- bufString := string(buf)
- if len(buf) == 1 {
- result <- bufString
+// Approximate average ratio of input bytes per output word, used to pre-size result slices.
+const (
+ RatioLetterWord float32 = 1.5
+ RatioLetterWordFull float32 = 1
+)
+
+type cutFunc func(sentence string) []string
+
+func (seg *Segmenter) cutDAG(sentence string) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
+
+ runes := []rune(sentence)
+ routes := seg.calc(runes)
+ var y int
+ length := len(runes)
+ var buf []rune
+ for x := 0; x < length; {
+ y = routes[x].index + 1
+ frag := runes[x:y]
+ if y-x == 1 {
+ buf = append(buf, frag...)
+ } else {
+ if len(buf) > 0 {
+ bufString := string(buf)
+ if len(buf) == 1 {
+ result = append(result, bufString)
+ } else {
+ if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
+ for _, x := range finalseg.Cut(bufString) {
+ result = append(result, x)
+ }
} else {
- if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
- for x := range finalseg.Cut(bufString) {
- result <- x
- }
- } else {
- for _, elem := range buf {
- result <- string(elem)
- }
+ for _, elem := range buf {
+ result = append(result, string(elem))
}
}
- buf = make([]rune, 0)
}
- result <- string(frag)
+ buf = make([]rune, 0)
}
- x = y
+ result = append(result, string(frag))
}
+ x = y
+ }
- if len(buf) > 0 {
- bufString := string(buf)
- if len(buf) == 1 {
- result <- bufString
+ if len(buf) > 0 {
+ bufString := string(buf)
+ if len(buf) == 1 {
+ result = append(result, bufString)
+ } else {
+ if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
+ for _, t := range finalseg.Cut(bufString) {
+ result = append(result, t)
+ }
} else {
- if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
- for t := range finalseg.Cut(bufString) {
- result <- t
- }
- } else {
- for _, elem := range buf {
- result <- string(elem)
- }
+ for _, elem := range buf {
+ result = append(result, string(elem))
}
}
}
- close(result)
- }()
+ }
+
return result
}
-func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
- result := make(chan string)
-
- go func() {
- runes := []rune(sentence)
- routes := seg.calc(runes)
- var y int
- length := len(runes)
- var buf []rune
- for x := 0; x < length; {
- y = routes[x].index + 1
- frag := runes[x:y]
- if reEng.MatchString(string(frag)) && len(frag) == 1 {
- buf = append(buf, frag...)
- x = y
- continue
- }
- if len(buf) > 0 {
- result <- string(buf)
- buf = make([]rune, 0)
- }
- result <- string(frag)
+func (seg *Segmenter) cutDAGNoHMM(sentence string) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
+
+ runes := []rune(sentence)
+ routes := seg.calc(runes)
+ var y int
+ length := len(runes)
+ var buf []rune
+ for x := 0; x < length; {
+ y = routes[x].index + 1
+ frag := runes[x:y]
+ if reEng.MatchString(string(frag)) && len(frag) == 1 {
+ buf = append(buf, frag...)
x = y
+ continue
}
if len(buf) > 0 {
- result <- string(buf)
+ result = append(result, string(buf))
buf = make([]rune, 0)
}
- close(result)
- }()
+ result = append(result, string(frag))
+ x = y
+ }
+ if len(buf) > 0 {
+ result = append(result, string(buf))
+ buf = make([]rune, 0)
+ }
+
return result
}
@@ -261,8 +264,8 @@ func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
// Parameter hmm controls whether to use the Hidden Markov Model.
// Accurate mode attempts to cut the sentence into the most accurate
// segmentations, which is suitable for text analysis.
-func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
- result := make(chan string)
+func (seg *Segmenter) Cut(sentence string, hmm bool) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
var cut cutFunc
if hmm {
cut = seg.cutDAG
@@ -270,84 +273,80 @@ func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
cut = seg.cutDAGNoHMM
}
- go func() {
- for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
- if len(block) == 0 {
- continue
+ for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
+ if len(block) == 0 {
+ continue
+ }
+ if reHanDefault.MatchString(block) {
+ for _, x := range cut(block) {
+ result = append(result, x)
}
- if reHanDefault.MatchString(block) {
- for x := range cut(block) {
- result <- x
- }
+ continue
+ }
+ for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
+ if reSkipDefault.MatchString(subBlock) {
+ result = append(result, subBlock)
continue
}
- for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
- if reSkipDefault.MatchString(subBlock) {
- result <- subBlock
- continue
- }
- for _, r := range subBlock {
- result <- string(r)
- }
+ for _, r := range subBlock {
+ result = append(result, string(r))
}
}
- close(result)
- }()
+ }
+
return result
}
-func (seg *Segmenter) cutAll(sentence string) <-chan string {
- result := make(chan string)
- go func() {
- runes := []rune(sentence)
- dag := seg.dag(runes)
- start := -1
- ks := make([]int, len(dag))
- for k := range dag {
- ks[k] = k
+func (seg *Segmenter) cutAll(sentence string) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWord)+1)
+
+ runes := []rune(sentence)
+ dag := seg.dag(runes)
+ start := -1
+ ks := make([]int, len(dag))
+ for k := range dag {
+ ks[k] = k
+ }
+ var l []int
+ for k := range ks {
+ l = dag[k]
+ if len(l) == 1 && k > start {
+ result = append(result, string(runes[k:l[0]+1]))
+ start = l[0]
+ continue
}
- var l []int
- for k := range ks {
- l = dag[k]
- if len(l) == 1 && k > start {
- result <- string(runes[k : l[0]+1])
- start = l[0]
- continue
- }
- for _, j := range l {
- if j > k {
- result <- string(runes[k : j+1])
- start = j
- }
+ for _, j := range l {
+ if j > k {
+ result = append(result, string(runes[k:j+1]))
+ start = j
}
}
- close(result)
- }()
+ }
+
return result
}
// CutAll cuts a sentence into words using full mode.
// Full mode gets all the possible words from the sentence.
// Fast but not accurate.
-func (seg *Segmenter) CutAll(sentence string) <-chan string {
- result := make(chan string)
- go func() {
- for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
- if len(block) == 0 {
- continue
- }
- if reHanCutAll.MatchString(block) {
- for x := range seg.cutAll(block) {
- result <- x
- }
- continue
- }
- for _, subBlock := range reSkipCutAll.Split(block, -1) {
- result <- subBlock
+func (seg *Segmenter) CutAll(sentence string) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
+
+ for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
+ if len(block) == 0 {
+ continue
+ }
+ if reHanCutAll.MatchString(block) {
+ for _, x := range seg.cutAll(block) {
+ result = append(result, x)
}
+ continue
+ }
+ for _, subBlock := range reSkipCutAll.Split(block, -1) {
+ result = append(result, subBlock)
}
- close(result)
- }()
+ }
+
return result
}
@@ -355,26 +354,25 @@ func (seg *Segmenter) CutAll(sentence string) <-chan string {
// Search engine mode, based on the accurate mode, attempts to cut long words
// into several short words, which can raise the recall rate.
// Suitable for search engines.
-func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
- result := make(chan string)
- go func() {
- for word := range seg.Cut(sentence, hmm) {
- runes := []rune(word)
- for _, increment := range []int{2, 3} {
- if len(runes) <= increment {
- continue
- }
- var gram string
- for i := 0; i < len(runes)-increment+1; i++ {
- gram = string(runes[i : i+increment])
- if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
- result <- gram
- }
+func (seg *Segmenter) CutForSearch(sentence string, hmm bool) []string {
+ result := make([]string, 0, int(float32(len(sentence))/RatioLetterWordFull)+1)
+
+ for _, word := range seg.Cut(sentence, hmm) {
+ runes := []rune(word)
+ for _, increment := range []int{2, 3} {
+ if len(runes) <= increment {
+ continue
+ }
+ var gram string
+ for i := 0; i < len(runes)-increment+1; i++ {
+ gram = string(runes[i : i+increment])
+ if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
+ result = append(result, gram)
}
}
- result <- word
}
- close(result)
- }()
+ result = append(result, word)
+ }
+
return result
}
diff --git a/jieba_test.go b/jieba_test.go
index 6203848..c7d7d72 100644
--- a/jieba_test.go
+++ b/jieba_test.go
@@ -619,23 +619,15 @@ func init() {
seg.LoadDictionary("dict.txt")
}
-func chanToArray(ch <-chan string) []string {
- var result []string
- for word := range ch {
- result = append(result, word)
- }
- return result
-}
-
func TestCutDAG(t *testing.T) {
- result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
+ result := seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")
if len(result) != 11 {
t.Fatal(result)
}
}
func TestCutDAGNoHmm(t *testing.T) {
- result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
+ result := seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")
if len(result) != 11 {
t.Fatal(result)
}
@@ -644,7 +636,7 @@ func TestCutDAGNoHmm(t *testing.T) {
func TestDefaultCut(t *testing.T) {
var result []string
for index, content := range testContents {
- result = chanToArray(seg.Cut(content, true))
+ result = seg.Cut(content, true)
if len(result) != len(defaultCutResult[index]) {
t.Errorf("default cut for %s length should be %d not %d\n",
content, len(defaultCutResult[index]), len(result))
@@ -662,7 +654,7 @@ func TestDefaultCut(t *testing.T) {
func TestCutAll(t *testing.T) {
var result []string
for index, content := range testContents {
- result = chanToArray(seg.CutAll(content))
+ result = seg.CutAll(content)
if len(result) != len(cutAllResult[index]) {
t.Errorf("cut all for %s length should be %d not %d\n",
content, len(cutAllResult[index]), len(result))
@@ -680,7 +672,7 @@ func TestCutAll(t *testing.T) {
func TestDefaultCutNoHMM(t *testing.T) {
var result []string
for index, content := range testContents {
- result = chanToArray(seg.Cut(content, false))
+ result = seg.Cut(content, false)
if len(result) != len(defaultCutNoHMMResult[index]) {
t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
content, len(defaultCutNoHMMResult[index]), len(result))
@@ -696,7 +688,7 @@ func TestDefaultCutNoHMM(t *testing.T) {
func TestCutForSearch(t *testing.T) {
var result []string
for index, content := range testContents {
- result = chanToArray(seg.CutForSearch(content, true))
+ result = seg.CutForSearch(content, true)
if len(result) != len(cutForSearchResult[index]) {
t.Fatalf("cut for search for %s length should be %d not %d\n",
content, len(cutForSearchResult[index]), len(result))
@@ -708,7 +700,7 @@ func TestCutForSearch(t *testing.T) {
}
}
for index, content := range testContents {
- result = chanToArray(seg.CutForSearch(content, false))
+ result = seg.CutForSearch(content, false)
if len(result) != len(cutForSearchNoHMMResult[index]) {
t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
content, len(cutForSearchNoHMMResult[index]), len(result))
@@ -725,7 +717,7 @@ func TestLoadDictionary(t *testing.T) {
var result []string
seg.LoadDictionary("foobar.txt")
for index, content := range testContents {
- result = chanToArray(seg.Cut(content, true))
+ result = seg.Cut(content, true)
if len(result) != len(userDictCutResult[index]) {
t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
content, len(userDictCutResult[index]), len(result))
@@ -745,7 +737,7 @@ func TestLoadUserDictionary(t *testing.T) {
sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
- words := chanToArray(seg.Cut(sentence, true))
+ words := seg.Cut(sentence, true)
if len(words) != len(result) {
t.Fatal(len(words))
}
@@ -757,7 +749,7 @@ func TestLoadUserDictionary(t *testing.T) {
sentence = "easy_install is great"
result = []string{"easy_install", " ", "is", " ", "great"}
- words = chanToArray(seg.Cut(sentence, true))
+ words = seg.Cut(sentence, true)
if len(words) != len(result) {
t.Fatal(len(words))
}
@@ -769,7 +761,7 @@ func TestLoadUserDictionary(t *testing.T) {
sentence = "python 的正则表达式是好用的"
result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
- words = chanToArray(seg.Cut(sentence, true))
+ words = seg.Cut(sentence, true)
if len(words) != len(result) {
t.Fatal(words)
t.Fatal(result)
@@ -786,7 +778,7 @@ func BenchmarkCutNoHMM(b *testing.B) {
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
b.ResetTimer()
for i := 0; i < b.N; i++ {
- chanToArray(seg.Cut(sentence, false))
+ seg.Cut(sentence, false)
}
}
@@ -794,7 +786,7 @@ func BenchmarkCut(b *testing.B) {
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
b.ResetTimer()
for i := 0; i < b.N; i++ {
- chanToArray(seg.Cut(sentence, true))
+ seg.Cut(sentence, true)
}
}
@@ -802,7 +794,7 @@ func BenchmarkCutAll(b *testing.B) {
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
b.ResetTimer()
for i := 0; i < b.N; i++ {
- chanToArray(seg.CutAll(sentence))
+ seg.CutAll(sentence)
}
}
@@ -810,7 +802,7 @@ func BenchmarkCutForSearchNoHMM(b *testing.B) {
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
b.ResetTimer()
for i := 0; i < b.N; i++ {
- chanToArray(seg.CutForSearch(sentence, false))
+ seg.CutForSearch(sentence, false)
}
}
@@ -818,6 +810,6 @@ func BenchmarkCutForSearch(b *testing.B) {
sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
b.ResetTimer()
for i := 0; i < b.N; i++ {
- chanToArray(seg.CutForSearch(sentence, true))
+ seg.CutForSearch(sentence, true)
}
}
diff --git a/tokenizers/analyzer.go b/tokenizers/analyzer.go
new file mode 100644
index 0000000..6af133a
--- /dev/null
+++ b/tokenizers/analyzer.go
@@ -0,0 +1,30 @@
+package tokenizers
+
+import (
+ "errors"
+
+ "github.com/blevesearch/bleve/analysis"
+ "github.com/blevesearch/bleve/registry"
+)
+
+type JiebaAnalyzer struct {
+}
+
+func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+ tokenizerName, ok := config["tokenizer"].(string)
+ if !ok {
+ return nil, errors.New("must specify tokenizer")
+ }
+ tokenizer, err := cache.TokenizerNamed(tokenizerName)
+ if err != nil {
+ return nil, err
+ }
+ alz := &analysis.Analyzer{
+ Tokenizer: tokenizer,
+ }
+ return alz, nil
+}
+
+func init() {
+ registry.RegisterAnalyzer("jieba", analyzerConstructor)
+}
diff --git a/tokenizers/example_bleve_test.go b/tokenizers/example_bleve_test.go
index f5da6b4..766b566 100644
--- a/tokenizers/example_bleve_test.go
+++ b/tokenizers/example_bleve_test.go
@@ -101,26 +101,26 @@ func Example_beleveSearch() {
// Output:
// Result of "水果世博园": 2 matches:
// 1. Doc 3, (1.099550)
- // Name: 买水果然后来世博园。
+ // Name: 买水果然后来世博园。
// 2. Doc 2, (0.031941)
- // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
+ // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
// Result of "你": 1 matches:
// 1. Doc 2, (0.391161)
- // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
+ // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
// Result of "first": 1 matches:
// 1. Doc 1, (0.512150)
- // Name: This is the first document we’ve added
+ // Name: This is the first document we’ve added
// Result of "中文": 1 matches:
// 1. Doc 2, (0.553186)
- // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
+ // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
// Result of "交换机": 2 matches:
// 1. Doc 4, (0.608495)
- // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
+ // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
// 2. Doc 5, (0.086700)
- // Name: 咱俩交换一下吧。
+ // Name: 咱俩交换一下吧。
// Result of "交换": 2 matches:
// 1. Doc 5, (0.534158)
- // Name: 咱俩交换一下吧。
+ // Name: 咱俩交换一下吧。
// 2. Doc 4, (0.296297)
- // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
+ // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
}
diff --git a/tokenizers/tokenizer.go b/tokenizers/tokenizer.go
index 6311b5d..fad5c73 100644
--- a/tokenizers/tokenizer.go
+++ b/tokenizers/tokenizer.go
@@ -60,7 +60,7 @@ func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
pos := 1
var width int
var gram string
- for word := range jt.seg.Cut(string(input), jt.hmm) {
+ for _, word := range jt.seg.Cut(string(input), jt.hmm) {
if jt.searchMode {
runes := []rune(word)
width = len(runes)