
Commit 8756fbd

refactor to build tags
Signed-off-by: Alexander Bezzubov <[email protected]>
1 parent 553399e

10 files changed: +141 -114 lines
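The refactoring replaces a cgo tokenizer that lived directly in internal/tokenizer with a build-tag switch: the pure-Go implementation compiles by default, and the flex/cgo one is selected with `go build -tags flex`. For background, a build constraint is a `// +build` comment placed before the package clause and separated from it by a blank line. A minimal sketch of the two-variant pattern this commit applies (file, package, and function names are hypothetical):

    // impl_default.go: compiled unless the flex tag is set.

    // +build !flex

    package impl

    // Which reports the implementation compiled into the build.
    func Which() string { return "pure Go" }

    // impl_cgo.go: compiled only under `go build -tags flex`.

    // +build flex

    package impl

    func Which() string { return "cgo/flex" }

Exactly one of the two files enters any given build, so callers keep using a single name (Which here, Tokenize in this commit) no matter which implementation is linked in.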

.gitignore (+1)

@@ -8,3 +8,4 @@ Makefile.main
 build/
 vendor/
 java/lib/
+.vscode/

internal/tokenizer/common.go (+6)

@@ -0,0 +1,6 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
+package tokenizer
+
+const byteLimit = 100000
File renamed without changes.
File renamed without changes.
File renamed without changes.

internal/tokenizer/flex/tokenize_c.go (+91)

@@ -0,0 +1,91 @@
+package flex
+
+// #include <stdlib.h>
+// #include "linguist.h"
+// #include "lex.linguist_yy.h"
+// int linguist_yywrap(yyscan_t yyscanner) {
+//   return 1;
+// }
+import "C"
+import "unsafe"
+
+// TokenizeC is only calling a C-flex based tokenizer from linguist
+func TokenizeC(content []byte) []string {
+    cs := C.CBytes(content)
+    defer C.free(unsafe.Pointer(cs))
+    // C.tokenizer_extract_tokens((*C.char)(cs))
+    return nil
+}
+
+const maxTokenLen = 32
+
+
+// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
+func TokenizeFlex(content []byte) []string {
+    var buf C.YY_BUFFER_STATE
+    var scanner C.yyscan_t
+    var extra C.struct_tokenizer_extra
+    // var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
+    // var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
+    var _len C.ulong
+    var r C.int
+
+    _len = C.ulong(len(content))
+    cs := C.CBytes(content)
+    defer C.free(unsafe.Pointer(cs))
+
+    C.linguist_yylex_init_extra(&extra, &scanner)
+    buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
+
+
+    ary := []string{}
+    for {
+        extra._type = C.NO_ACTION
+        extra.token = nil
+        r = C.linguist_yylex(scanner)
+        switch (extra._type) {
+        case C.NO_ACTION:
+            break
+        case C.REGULAR_TOKEN:
+            _len = C.strlen(extra.token)
+            if (_len <= maxTokenLen) {
+                ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
+                //rb_ary_push(ary, rb_str_new(extra.token, len))
+            }
+            C.free(unsafe.Pointer(extra.token))
+            break
+        case C.SHEBANG_TOKEN:
+            _len = C.strlen(extra.token)
+            if (_len <= maxTokenLen) {
+                s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
+                ary = append(ary, s)
+                //s = rb_str_new2("SHEBANG#!");
+                //rb_str_cat(s, extra.token, len);
+                //rb_ary_push(ary, s);
+            }
+            C.free(unsafe.Pointer(extra.token))
+            break
+        case C.SGML_TOKEN:
+            _len = C.strlen(extra.token)
+            if (_len <= maxTokenLen) {
+                s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
+                ary = append(ary, s)
+                //s = rb_str_new(extra.token, len);
+                //rb_str_cat2(s, ">");
+                //rb_ary_push(ary, s);
+            }
+            C.free(unsafe.Pointer(extra.token))
+            break
+        }
+        if r == 0 {
+            break
+        }
+    }
+
+    C.linguist_yy_delete_buffer(buf, scanner)
+    C.linguist_yylex_destroy(scanner)
+    // C.free(unsafe.Pointer(extra))
+    // C.free(unsafe.Pointer(scanner))
+
+    return ary
+}
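The new file isolates all cgo usage behind the flex package. The memory discipline it follows is the standard cgo one: C.CBytes allocates a C copy of the input that the Go side must free, and each extra.token the scanner produces is C memory that is copied into a Go string with C.GoStringN before being released with C.free. A self-contained sketch of that round trip (illustrative only, not part of the commit):

    package main

    // #include <stdlib.h>
    import "C"

    import "fmt"

    // roundTrip copies a Go byte slice into malloc'd C memory and back,
    // mirroring how TokenizeFlex hands input to the scanner and copies
    // tokens out before freeing them.
    func roundTrip(content []byte) string {
        cs := C.CBytes(content) // C copy; the Go side owns and must free it
        defer C.free(cs)
        // Copy back into garbage-collected Go memory; the result stays
        // valid after cs is freed.
        return C.GoStringN((*C.char)(cs), C.int(len(content)))
    }

    func main() {
        fmt.Println(roundTrip([]byte("SHEBANG#!sh")))
    }

The copy at the boundary is what lets the returned strings outlive the freed C buffers.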
+25

@@ -0,0 +1,25 @@
+package flex
+
+// import (
+//     "testing"
+
+//     "gopkg.in/src-d/enry.v1/internal/tokenizer"
+// )
+
+// func BenchmarkTokenizerC(b *testing.B) {
+//     b.ReportAllocs()
+//     for i := 0; i < b.N; i++ {
+//         for _, test := range tokenizer.Tests {
+//             TokenizeC(test.content)
+//         }
+//     }
+// }
+
+// func BenchmarkTokenizerFlex(b *testing.B) {
+//     b.ReportAllocs()
+//     for i := 0; i < b.N; i++ {
+//         for _, test := range tokenizer.Tests {
+//             TokenizeFlex(test.content)
+//         }
+//     }
+// }
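These benchmarks arrive commented out, and they could not compile even if uncommented: `Tests` is declared in tokenize_test.go, and identifiers defined in _test.go files are compiled only into their own package's test binary, so they cannot be imported from package flex; under `-tags flex` the import would additionally form a cycle, since tokenizer then imports flex. A hypothetical way to share the fixtures would be a regular, non-test file in package tokenizer with exported fields:

    // testdata.go (hypothetical): a regular file, so other packages can
    // import it; fields are exported so they are reachable cross-package.
    package tokenizer

    // TestCase is one shared tokenizer fixture.
    type TestCase struct {
        Name     string
        Content  []byte
        Expected []string
    }

    // Tests would hold the fixtures shared by tests and benchmarks.
    var Tests []TestCase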

internal/tokenizer/tokenize.go (+2 -5)

@@ -1,6 +1,5 @@
-// Package tokenizer implements file tokenization used by the enry content
-// classifier. This package is an implementation detail of enry and should not
-// be imported by other packages.
+// +build !flex
+
 package tokenizer
 
 import (
@@ -9,8 +8,6 @@ import (
     "gopkg.in/src-d/enry.v1/regex"
 )
 
-const byteLimit = 100000
-
 // Tokenize returns language-agnostic lexical tokens from content. The tokens
 // returned should match what the Linguist library returns. At most the first
 // 100KB of content are tokenized.
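With the !flex constraint here and the flex constraint on tokenize_c.go below, a plain `go build` or `go test` exercises this pure-Go path, while `go build -tags flex` (or `go test -tags flex`) swaps in the cgo-backed implementation. The package comment and the byteLimit constant moved to common.go, which is compiled under either tag.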

internal/tokenizer/tokenize_c.go (+10 -85)

@@ -1,91 +1,16 @@
-package tokenizer
-
-// #include <stdlib.h>
-// #include "linguist.h"
-// #include "lex.linguist_yy.h"
-// int linguist_yywrap(yyscan_t yyscanner) {
-//   return 1;
-// }
-import "C"
-import "unsafe"
-
-// TokenizeC is only calling a C-flex based tokenizer from linguist
-func TokenizeC(content []byte) []string {
-    cs := C.CBytes(content)
-    defer C.free(unsafe.Pointer(cs))
-    // C.tokenizer_extract_tokens((*C.char)(cs))
-    return nil
-}
-
-const maxTokenLen = 32
+// +build flex
 
+package tokenizer
 
-// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
-func TokenizeFlex(content []byte) []string {
-    var buf C.YY_BUFFER_STATE
-    var scanner C.yyscan_t
-    var extra C.struct_tokenizer_extra
-    // var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
-    // var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
-    var _len C.ulong
-    var r C.int
-
-    _len = C.ulong(len(content))
-    cs := C.CBytes(content)
-    defer C.free(unsafe.Pointer(cs))
-
-    C.linguist_yylex_init_extra(&extra, &scanner)
-    buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
-
+import "gopkg.in/src-d/enry.v1/internal/tokenizer/flex"
 
-    ary := []string{}
-    for {
-        extra._type = C.NO_ACTION
-        extra.token = nil
-        r = C.linguist_yylex(scanner)
-        switch (extra._type) {
-        case C.NO_ACTION:
-            break
-        case C.REGULAR_TOKEN:
-            _len = C.strlen(extra.token)
-            if (_len <= maxTokenLen) {
-                ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
-                //rb_ary_push(ary, rb_str_new(extra.token, len))
-            }
-            C.free(unsafe.Pointer(extra.token))
-            break
-        case C.SHEBANG_TOKEN:
-            _len = C.strlen(extra.token)
-            if (_len <= maxTokenLen) {
-                s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
-                ary = append(ary, s)
-                //s = rb_str_new2("SHEBANG#!");
-                //rb_str_cat(s, extra.token, len);
-                //rb_ary_push(ary, s);
-            }
-            C.free(unsafe.Pointer(extra.token))
-            break
-        case C.SGML_TOKEN:
-            _len = C.strlen(extra.token)
-            if (_len <= maxTokenLen) {
-                s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
-                ary = append(ary, s)
-                //s = rb_str_new(extra.token, len);
-                //rb_str_cat2(s, ">");
-                //rb_ary_push(ary, s);
-            }
-            C.free(unsafe.Pointer(extra.token))
-            break
-        }
-        if r == 0 {
-            break
-        }
+// Tokenize returns language-agnostic lexical tokens from content. The tokens
+// returned should match what the Linguist library returns. At most the first
+// 100KB of content are tokenized.
+func Tokenize(content []byte) []string {
+    if len(content) > byteLimit {
+        content = content[:byteLimit]
     }
 
-    C.linguist_yy_delete_buffer(buf, scanner)
-    C.linguist_yylex_destroy(scanner)
-    // C.free(unsafe.Pointer(extra))
-    // C.free(unsafe.Pointer(scanner))
-
-    return ary
+    return flex.TokenizeFlex(content)
 }
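Under the flex tag, Tokenize becomes a thin wrapper: the same byteLimit truncation as the pure-Go version, followed by a call into flex.TokenizeFlex, so the exported contract is unchanged. A hypothetical test of that shared contract (not part of the commit; it would live in package tokenizer and pass under either tag):

    package tokenizer

    import (
        "bytes"
        "reflect"
        "testing"
    )

    // TestTokenizeByteLimit checks that bytes beyond byteLimit (100000)
    // never influence the result, whichever implementation is built.
    func TestTokenizeByteLimit(t *testing.T) {
        big := bytes.Repeat([]byte("word "), 40000) // 200000 bytes, twice the limit
        want := Tokenize(big[:byteLimit])
        if got := Tokenize(big); !reflect.DeepEqual(got, want) {
            t.Fatalf("content beyond byteLimit changed the tokens")
        }
    }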

internal/tokenizer/tokenize_test.go (+6 -24)

@@ -91,7 +91,7 @@ var (
         "-", "|", "+", "&&", "<", "<", "-", "!", "!", "!", "=", "=", "!", ":", "=", ":", "=", ",", ",", "=", ">", ">", "=", "=", "=", "=", ">",
         "'", ",", ">", "=", ">", "=", "=", ">", "=", ">", ":", ">", "=", ">"}
 
-    tests = []struct {
+    Tests = []struct {
         name     string
         content  []byte
         expected []string
@@ -101,10 +101,10 @@
 )
 
 func TestTokenize(t *testing.T) {
-    for _, test := range tests {
+    for _, test := range Tests {
         t.Run(test.name, func(t *testing.T) {
             before := string(test.content)
-            tokens := TokenizeFlex(test.content)
+            tokens := Tokenize(test.content)
             after := string(test.content)
             require.Equal(t, before, after, "the input slice was modified")
             require.Equal(t, len(test.expected), len(tokens), fmt.Sprintf("token' slice length = %v, want %v", len(test.expected), len(tokens)))
@@ -118,39 +118,21 @@ func TestTokenize(t *testing.T) {
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
     b.ReportAllocs()
     for i := 0; i < b.N; i++ {
-        for _, test := range tests {
+        for _, test := range Tests {
             test.content = append([]byte(nil), test.content...)
         }
     }
 }
 
-func BenchmarkTokenizerGo(b *testing.B) {
+func BenchmarkTokenizer(b *testing.B) {
     b.ReportAllocs()
     for i := 0; i < b.N; i++ {
-        for _, test := range tests {
+        for _, test := range Tests {
             Tokenize(test.content)
         }
     }
 }
 
-func BenchmarkTokenizerC(b *testing.B) {
-    b.ReportAllocs()
-    for i := 0; i < b.N; i++ {
-        for _, test := range tests {
-            TokenizeC(test.content)
-        }
-    }
-}
-
-func BenchmarkTokenizerFlex(b *testing.B) {
-    b.ReportAllocs()
-    for i := 0; i < b.N; i++ {
-        for _, test := range tests {
-            TokenizeFlex(test.content)
-        }
-    }
-}
-
 //TODO(bzz): introduce tokenizer benchmark suit
 // baseline - just read the files
 // RE2
