Skip to content

Commit 10459e0

Browse files
authored
Merge pull request #182 from CocaineCong/feature-bm25
refactor: extract the segment with weight module
2 parents 2e51ba3 + bdb140e commit 10459e0

File tree

4 files changed

+49
-45
lines changed

4 files changed

+49
-45
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,4 @@ _testmain.go
7070
.glide/
7171
examples/dict/embed/embed
7272
examples/dict/embed/main
73+
oryxBuildBinary

hmm/idf/tag_extracker.go

+6-40
Original file line numberDiff line numberDiff line change
@@ -6,43 +6,9 @@ import (
66
"unicode/utf8"
77

88
"github.com/go-ego/gse"
9+
"github.com/go-ego/gse/hmm/segment"
910
)
1011

11-
// Segment type a word with weight.
12-
type Segment struct {
13-
text string
14-
weight float64
15-
}
16-
17-
// Text return the segment's text.
18-
func (s Segment) Text() string {
19-
return s.text
20-
}
21-
22-
// Weight return the segment's weight.
23-
func (s Segment) Weight() float64 {
24-
return s.weight
25-
}
26-
27-
// Segments type a slice of Segment.
28-
type Segments []Segment
29-
30-
func (ss Segments) Len() int {
31-
return len(ss)
32-
}
33-
34-
func (ss Segments) Less(i, j int) bool {
35-
if ss[i].weight == ss[j].weight {
36-
return ss[i].text < ss[j].text
37-
}
38-
39-
return ss[i].weight < ss[j].weight
40-
}
41-
42-
func (ss Segments) Swap(i, j int) {
43-
ss[i], ss[j] = ss[j], ss[i]
44-
}
45-
4612
// TagExtracter is a struct for extracting tags.
4713
type TagExtracter struct {
4814
seg gse.Segmenter
@@ -82,7 +48,7 @@ func (t *TagExtracter) LoadStopWords(fileName ...string) error {
8248
}
8349

8450
// ExtractTags extracts the topK key words from text.
85-
func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) {
51+
func (t *TagExtracter) ExtractTags(text string, topK int) (tags segment.Segments) {
8652
freqMap := make(map[string]float64)
8753

8854
for _, w := range t.seg.Cut(text, true) {
@@ -110,13 +76,13 @@ func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) {
11076
freqMap[k] = v / total
11177
}
11278

113-
ws := make(Segments, 0)
114-
var s Segment
79+
ws := make(segment.Segments, 0)
80+
var s segment.Segment
11581
for k, v := range freqMap {
11682
if freq, _, ok := t.Idf.Freq(k); ok {
117-
s = Segment{text: k, weight: freq * v}
83+
s = segment.Segment{Text: k, Weight: freq * v}
11884
} else {
119-
s = Segment{text: k, weight: t.Idf.median * v}
85+
s = segment.Segment{Text: k, Weight: t.Idf.median * v}
12086
}
12187
ws = append(ws, s)
12288
}

hmm/idf/textrank.go

+6-5
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/go-ego/gse"
88
"github.com/go-ego/gse/hmm/pos"
9+
"github.com/go-ego/gse/hmm/segment"
910
)
1011

1112
const dampingFactor = 0.85
@@ -81,7 +82,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
8182
edge{start: end, end: start, weight: weight})
8283
}
8384

84-
func (u *undirectWeightedGraph) rank() Segments {
85+
func (u *undirectWeightedGraph) rank() segment.Segments {
8586
if !sort.IsSorted(u.keys) {
8687
sort.Sort(u.keys)
8788
}
@@ -124,10 +125,10 @@ func (u *undirectWeightedGraph) rank() Segments {
124125
}
125126
}
126127

127-
result := make(Segments, 0)
128+
result := make(segment.Segments, 0)
128129
for n, w := range ws {
129130
result = append(result,
130-
Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)},
131+
segment.Segment{Text: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)},
131132
)
132133
}
133134

@@ -137,7 +138,7 @@ func (u *undirectWeightedGraph) rank() Segments {
137138

138139
// TextRankWithPOS extracts keywords from text using TextRank algorithm.
139140
// Parameter allowPOS allows a []string pos list.
140-
func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) Segments {
141+
func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) segment.Segments {
141142
posFilt := make(map[string]int)
142143
for _, pos1 := range allowPOS {
143144
posFilt[pos1] = 1
@@ -181,6 +182,6 @@ func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) S
181182

182183
// TextRank extracts keywords from text using TextRank algorithm.
183184
// Parameter topK specify how many top keywords to be returned at most.
184-
func (t *TextRanker) TextRank(text string, topK int) Segments {
185+
func (t *TextRanker) TextRank(text string, topK int) segment.Segments {
185186
return t.TextRankWithPOS(text, topK, defaultAllowPOS)
186187
}

hmm/segment/segment.go

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package segment
2+
3+
// Segment is a word paired with its weight.
4+
type Segment struct {
5+
Text string
6+
Weight float64
7+
}
8+
9+
// GetText returns the segment's text.
10+
func (s Segment) GetText() string {
11+
return s.Text
12+
}
13+
14+
// GetWeight returns the segment's weight.
15+
func (s Segment) GetWeight() float64 {
16+
return s.Weight
17+
}
18+
19+
// Segments is a slice of Segment; its Len, Less, and Swap methods order it by weight, then by text.
20+
type Segments []Segment
21+
22+
func (ss Segments) Len() int {
23+
return len(ss)
24+
}
25+
26+
func (ss Segments) Less(i, j int) bool {
27+
if ss[i].Weight == ss[j].Weight {
28+
return ss[i].Text < ss[j].Text
29+
}
30+
31+
return ss[i].Weight < ss[j].Weight
32+
}
33+
34+
func (ss Segments) Swap(i, j int) {
35+
ss[i], ss[j] = ss[j], ss[i]
36+
}

0 commit comments

Comments
 (0)