Skip to content

Commit ae559a9

Browse files
committed
Add ISO-2022 charset recognizers
1 parent b1ae293 commit ae559a9

File tree

2 files changed

+105
-3
lines changed

2 files changed

+105
-3
lines changed

2022.go

+102
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,102 @@
1+
package chardet
2+
3+
import (
4+
"bytes"
5+
)
6+
7+
// recognizer2022 scores how likely a byte stream is to be encoded in one
// particular ISO-2022 variant, based on the escape sequences it contains.
type recognizer2022 struct {
	// charset is the charset name reported in the match result.
	charset string

	// escapes lists the escape sequences accepted for this charset. Each
	// entry holds only the bytes that follow the ESC (0x1B) byte; the ESC
	// itself is not stored.
	escapes [][]byte
}
11+
12+
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
13+
return recognizerOutput{
14+
Charset: r.charset,
15+
Confidence: r.matchConfidence(input.input),
16+
}
17+
}
18+
19+
func (r *recognizer2022) matchConfidence(input []byte) int {
20+
var hits, misses, shifts int
21+
input:
22+
for i := 0; i < len(input); i++ {
23+
c := input[i]
24+
if c == 0x1B {
25+
for _, esc := range r.escapes {
26+
if bytes.HasPrefix(input[i+1:], esc) {
27+
hits++
28+
i += len(esc)
29+
continue input
30+
}
31+
}
32+
misses++
33+
} else if c == 0x0E || c == 0x0F {
34+
shifts++
35+
}
36+
}
37+
if hits == 0 {
38+
return 0
39+
}
40+
quality := (100*hits - 100*misses) / (hits + misses)
41+
if hits+shifts < 5 {
42+
quality -= (5 - (hits + shifts)) * 10
43+
}
44+
if quality < 0 {
45+
quality = 0
46+
}
47+
return quality
48+
}
49+
50+
// escapeSequences_2022JP lists the escape sequences accepted for ISO-2022-JP.
// Each entry is the ASCII text that follows the ESC (0x1B) byte.
var escapeSequences_2022JP = [][]byte{
	[]byte("$(C"), // ESC $ ( C  KS X 1001:1992
	[]byte("$(D"), // ESC $ ( D  JIS X 0212-1990
	[]byte("$@"),  // ESC $ @    JIS C 6226-1978
	[]byte("$A"),  // ESC $ A    GB 2312-80
	[]byte("$B"),  // ESC $ B    JIS X 0208-1983
	[]byte("&@"),  // ESC & @    JIS X 0208 1990, 1997
	[]byte("(B"),  // ESC ( B    ASCII
	[]byte("(H"),  // ESC ( H    JIS-Roman
	[]byte("(I"),  // ESC ( I    Half-width katakana
	[]byte("(J"),  // ESC ( J    JIS-Roman
	[]byte(".A"),  // ESC . A    ISO 8859-1
	[]byte(".F"),  // ESC . F    ISO 8859-7
}
64+
65+
// escapeSequences_2022KR lists the escape sequences accepted for ISO-2022-KR.
// Each entry is the ASCII text that follows the ESC (0x1B) byte.
var escapeSequences_2022KR = [][]byte{
	[]byte("$)C"), // ESC $ ) C
}
68+
69+
// escapeSequences_2022CN lists the escape sequences accepted for ISO-2022-CN.
// Each entry is the ASCII text that follows the ESC (0x1B) byte.
var escapeSequences_2022CN = [][]byte{
	[]byte("$)A"), // ESC $ ) A  GB 2312-80
	[]byte("$)G"), // ESC $ ) G  CNS 11643-1992 Plane 1
	[]byte("$*H"), // ESC $ * H  CNS 11643-1992 Plane 2
	[]byte("$)E"), // ESC $ ) E  ISO-IR-165
	[]byte("$+I"), // ESC $ + I  CNS 11643-1992 Plane 3
	[]byte("$+J"), // ESC $ + J  CNS 11643-1992 Plane 4
	[]byte("$+K"), // ESC $ + K  CNS 11643-1992 Plane 5
	[]byte("$+L"), // ESC $ + L  CNS 11643-1992 Plane 6
	[]byte("$+M"), // ESC $ + M  CNS 11643-1992 Plane 7
	[]byte("N"),   // ESC N      SS2
	[]byte("O"),   // ESC O      SS3
}
82+
83+
func newRecognizer_2022JP() *recognizer2022 {
84+
return &recognizer2022{
85+
"ISO-2022-JP",
86+
escapeSequences_2022JP,
87+
}
88+
}
89+
90+
func newRecognizer_2022KR() *recognizer2022 {
91+
return &recognizer2022{
92+
"ISO-2022-KR",
93+
escapeSequences_2022KR,
94+
}
95+
}
96+
97+
func newRecognizer_2022CN() *recognizer2022 {
98+
return &recognizer2022{
99+
"ISO-2022-CN",
100+
escapeSequences_2022CN,
101+
}
102+
}

detector.go

+3-3
Original file line number | Diff line number | Diff line change
@@ -52,9 +52,9 @@ var recognizers = []recognizer{
5252
newRecognizer_euc_kr(),
5353
newRecognizer_big5(),
5454

55-
// newRecognizer_2022JP(),
56-
// newRecognizer_2022KR(),
57-
// newRecognizer_2022CN(),
55+
newRecognizer_2022JP(),
56+
newRecognizer_2022KR(),
57+
newRecognizer_2022CN(),
5858

5959
newRecognizer_IBM424_he_rtl(),
6060
newRecognizer_IBM424_he_ltr(),

0 commit comments

Comments
 (0)