-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentofu.ts
213 lines (184 loc) · 6.19 KB
/
entofu.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/**
* Reference implementation of the Entofu algorithm (Base262144).
*
* The encoding produces valid unassigned 4-byte Unicode characters (tofus).
*
* Bit distribution of a tofu: 111100xx 10zzzzzz 10zzzzzz 10zzzzzz
* - xx = 10: regular tofu (first byte = F2)
* - xx = 01: special tofu, unpadded terminal tofu (first byte = F1)
* - xx = 11: special tofu, padded terminal tofu or noncharacter substitute tofu (first byte = F3)
* - z…: data bits
*/
// --- Bit widths -------------------------------------------------------------
// Size of one input byte, and the number of payload bits a single tofu carries
// (three continuation bytes × 6 data bits each).
const BITS_PER_BYTE = 8
const BITS_PER_TOFU = 18

// --- Lead (first) bytes -----------------------------------------------------
// Common prefix of every tofu lead byte; the two low bits select the tofu kind.
const UNICODE_LEAD = 0b11110000 // 0xF0
// Unpadded terminal tofu.
const UNICODE_LEAD_TERMINAL = 0b11110001 // 0xF1
// Regular (non-terminal) tofu.
const UNICODE_LEAD_REGULAR = 0b11110010 // 0xF2
// Padded terminal tofu and noncharacter substitute tofu share the same lead byte.
const UNICODE_LEAD_TERMINAL_PADDED = 0b11110011 // 0xF3
const UNICODE_LEAD_NONCHAR = 0b11110011 // 0xF3

// --- Continuation bytes -----------------------------------------------------
// Marker bits of a continuation byte; the low six bits hold data.
const UNICODE_CONTINUATION = 0b10000000 // 0x80
// Second byte that identifies a noncharacter substitute tofu.
const UNICODE_PLANE_NONCHAR = 0b10010000 // 0x90
/** Used as a mask on the second byte: matches any value whose low nibble is F, not only 8F. */
const UNICODE_PLANE_XF = 0b10001111 // 0x8F
/** Used as a mask on the last byte: matches both BE and BF. */
const UNICODE_CONTINUATION_BE = 0b10111110 // 0xBE
const UNICODE_CONTINUATION_BF = 0b10111111 // 0xBF
/**
 * Turns plain text into an Entofu string.
 *
 * The text is serialized to UTF-8 bytes, which are then passed to `stringify`.
 * @param input Text.
 * @returns Entofu encoded text.
 */
export function entofu(input: string): string {
  const encoder = new TextEncoder()
  return stringify(encoder.encode(input))
}
/**
 * Recovers the original text from an Entofu string.
 *
 * The tofu string is passed to `parse`, and the resulting bytes are
 * interpreted as UTF-8.
 * @param input Entofu encoded text.
 * @returns Text.
 */
export function detofu(input: string): string {
  const bytes = parse(input)
  const decoder = new TextDecoder('utf8')
  return decoder.decode(bytes)
}
/**
 * Encodes binary data and returns the result as a JavaScript (UTF-16) string.
 *
 * The bytes are run through `encode`, and the UTF-8 tofu bytes it produces are
 * decoded into a string.
 * @param input Binary data.
 * @returns Entofu encoded data (as a string).
 */
export function stringify(input: Uint8Array): string {
  const tofuBytes = encode(input)
  const decoder = new TextDecoder('utf8')
  return decoder.decode(tofuBytes)
}
/**
 * Decodes a tofu string (a JavaScript, UTF-16 string) back into binary data.
 *
 * The string is serialized to UTF-8 bytes, which are then passed to `decode`.
 * @param input Entofu encoded data (as a string).
 * @returns Binary data.
 */
export function parse(input: string): Uint8Array {
  const encoder = new TextEncoder()
  return decode(encoder.encode(input))
}
/**
 * Encodes binary data into tofu bytes (UTF-8).
 *
 * The input is consumed as a bit stream, six bits per continuation byte and
 * therefore 18 bits per 4-byte tofu. The last tofu is marked as terminal and,
 * when it carries fewer than 12 bits of real data, is right-aligned and padded.
 * Any tofu that would be a Unicode noncharacter is replaced by a substitute.
 * @param input Binary data.
 * @returns Entofu encoded data as UTF-8 byte array.
 */
export function encode(input: Uint8Array): Uint8Array {
  const totalBits = input.byteLength * BITS_PER_BYTE
  const tofuCount = Math.ceil(totalBits / BITS_PER_TOFU)
  const result = new Uint8Array(tofuCount * 4)
  // Number of real data bits in the last tofu (0 means it is completely full).
  const tailBits = totalBits % BITS_PER_TOFU

  let cursor = 0 // position of the next input byte to read
  let pending = 0 // bits read from the input but not yet emitted
  let pendingBits = 0 // how many low bits of `pending` are valid

  for (let n = 0; n < tofuCount; n++) {
    const tofu = new Uint8Array(4)
    tofu[0] = UNICODE_LEAD_REGULAR

    for (let slot = 1; slot < 4; slot++) {
      // Top up the bit buffer; reading past the end of the input yields zero bits.
      if (pendingBits < 6) {
        const next = cursor < input.length ? input[cursor] : 0
        cursor += 1
        pending = (pending << BITS_PER_BYTE) | next
        pendingBits += BITS_PER_BYTE
      }
      // After the top-up at least six bits are always available: emit the top six.
      pendingBits -= 6
      tofu[slot] = UNICODE_CONTINUATION | (pending >> pendingBits)
      pending &= (1 << pendingBits) - 1
    }

    // The last tofu is always terminal; short tails are additionally padded.
    if (n === tofuCount - 1) {
      if (tailBits === 0 || tailBits >= 12) {
        tofu[0] = UNICODE_LEAD_TERMINAL
      } else {
        tofu[0] = UNICODE_LEAD_TERMINAL_PADDED
        if (tailBits < 6) {
          // One data byte: move it to the last slot; low bit of byte 1 set.
          tofu.set([UNICODE_CONTINUATION | 1, UNICODE_CONTINUATION, tofu[1]], 1)
        } else {
          // Two data bytes: shift them one slot right; low bit of byte 1 clear.
          tofu.set([UNICODE_CONTINUATION, tofu[1], tofu[2]], 1)
        }
      }
    }

    // Replace noncharacters by a substitute tofu that records what was replaced.
    if (isNoncharacter(tofu)) {
      const special = tofu[0] & 1
      const plane = (tofu[1] >> 4) & 0b11
      const noncharacter = tofu[3] & 1
      tofu.set([
        UNICODE_LEAD_NONCHAR,
        UNICODE_PLANE_NONCHAR,
        UNICODE_CONTINUATION,
        UNICODE_CONTINUATION | noncharacter | (plane << 1) | (special << 3),
      ])
    }

    result.set(tofu, n * 4)
  }

  return result
}
/**
 * Decodes tofu bytes (UTF-8) into binary data.
 *
 * The input is processed four bytes (one tofu) at a time. Raw noncharacter
 * tofus are skipped, noncharacter substitute tofus are expanded back to the
 * tofu they stand for, and the padding bytes of a padded terminal tofu are
 * ignored. The six data bits of every remaining continuation byte are appended
 * to a bit buffer from which whole bytes are emitted.
 *
 * `input` is never modified: each tofu is copied before it is rewritten.
 * @param input Entofu encoded data as a UTF-8 byte array.
 * @returns Binary data.
 * @throws Error if a tofu's first byte is not in the range F0–F3, or if one of
 * its payload bytes is missing or is not a continuation byte (10xxxxxx).
 */
export function decode(input: Uint8Array): Uint8Array {
  // Upper bound for the output size; the actual size may be smaller.
  let length = Math.ceil(((input.length / 4) * BITS_PER_TOFU) / BITS_PER_BYTE)
  let output = new Uint8Array(length)
  let index = 0
  let buffer = 0
  let count = 0
  for (let offset = 0; offset < input.length; offset += 4) {
    // `slice` copies, whereas `subarray` would return a view onto `input`;
    // the substitute expansion below writes into `tofu` and must not leak
    // into the caller's array.
    let tofu = input.slice(offset, offset + 4)
    if ((tofu[0] & 0b11111100) !== UNICODE_LEAD) throw new Error(`Invalid leading byte at ${offset}`)
    if (isNoncharacter(tofu)) continue
    // Bytes to skip (for padded terminal tofus)
    let skip = 0
    // Special tofu (terminal/noncharacter)
    if ((tofu[0] & 1) === 1) {
      // Padded/noncharacter
      if (((tofu[0] >> 1) & 1) === 1) {
        // Noncharacter
        if (tofu[1] === UNICODE_PLANE_NONCHAR) {
          // Rebuild the tofu that the substitute stands for.
          let noncharacter = tofu[3] & 1
          let plane = (tofu[3] >> 1) & 0b11
          let special = (tofu[3] >> 3) & 1
          tofu[0] = UNICODE_LEAD | (special ? 0b01 : 0b10)
          tofu[1] = UNICODE_CONTINUATION | (plane << 4) | 0b1111
          tofu[2] = UNICODE_CONTINUATION | 0b111111
          tofu[3] = UNICODE_CONTINUATION | 0b111110 | noncharacter
        } else {
          // Padded terminal tofu: the low bit of byte 1 tells whether one or
          // two leading payload bytes are padding.
          skip = 1 + (tofu[1] & 1)
        }
      }
    }
    for (let byte = 1 + skip; byte <= 3; byte++) {
      if ((tofu[byte] & 0b11000000) !== UNICODE_CONTINUATION)
        throw new Error(`Invalid continuation byte at ${offset + byte}`)
      // Fill the bit buffer from the tofu byte's data
      let bits = tofu[byte] & 0b111111
      buffer = (buffer << 6) | bits
      count += 6
      // Extract bytes as they become available
      while (count >= 8) {
        let data = (buffer >> (count - 8)) & 0xff
        output[index++] = data
        count -= 8
      }
      // Drop the bits that were already emitted so the buffer stays small
      // instead of relying on 32-bit shift wrap-around.
      buffer &= (1 << count) - 1
    }
  }
  // Cap the output to only extracted bytes (may be < length)
  return output.subarray(0, index)
}
/**
 * Tells whether a 4-byte UTF-8 sequence encodes a noncharacter, i.e. a code
 * point whose low 16 bits are FFFE or FFFF.
 *
 * Only bytes 1–3 are inspected; the lead byte is not looked at.
 * @param tofu A UTF-8 code point (4 bytes).
 * @returns `true` if the low nibble of byte 1 is F, byte 2 is BF, and byte 3
 * is BE or BF.
 */
function isNoncharacter(tofu: Uint8Array): boolean {
  if (tofu[2] !== UNICODE_CONTINUATION_BF) return false
  // Masking with BE ignores the lowest bit, so both BE and BF pass.
  if ((tofu[3] & UNICODE_CONTINUATION_BE) !== UNICODE_CONTINUATION_BE) return false
  // Masking with 8F keeps only the marker bit and the low nibble.
  return (tofu[1] & UNICODE_PLANE_XF) === UNICODE_PLANE_XF
}