Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(text): handle code points > U+FFFF in levenshteinDistance #6014

Merged
merged 4 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 58 additions & 30 deletions text/levenshtein_distance.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,23 @@ const { ceil } = Math;

// This implements Myers' bit-vector algorithm as described here:
// https://dl.acm.org/doi/pdf/10.1145/316542.316550
const peq = new Uint32Array(0x10000);
// Match-mask table indexed by Unicode code point. The code space runs from
// U+0000 to U+10FFFF inclusive, i.e. 0x110000 distinct values, so the table
// needs 0x110000 slots. With only 0x10ffff slots, index 0x10ffff (U+10FFFF)
// is out of bounds and writes to it are silently dropped by the typed array.
const peq = new Uint32Array(0x110000);
lionel-rowe marked this conversation as resolved.
Show resolved Hide resolved

function myers32(t: string, p: string): number {
const n = t.length;
const m = p.length;
for (let i = 0; i < m; i++) {
peq[p.charCodeAt(i)]! |= 1 << i;
const n = unicodeStrLen(t);
const m = unicodeStrLen(p);
for (let i = 0; i < m;) {
const cp = p.codePointAt(i)!;
peq[cp]! |= 1 << i;
i += cp > 0xffff ? 2 : 1;
}
const last = m - 1;
let pv = -1;
let mv = 0;
let score = m;
for (let j = 0; j < n; j++) {
const eq = peq[t.charCodeAt(j)]!;
for (let j = 0; j < n;) {
const cp = t.codePointAt(j)!;
const eq = peq[cp]!;
const xv = eq | mv;
const xh = (((eq & pv) + pv) ^ pv) | eq;
let ph = mv | ~(xh | pv);
Expand All @@ -29,31 +32,38 @@ function myers32(t: string, p: string): number {
mh = mh << 1;
pv = mh | ~(xv | ph);
mv = ph & xv;

j += cp > 0xffff ? 2 : 1;
}
for (let i = 0; i < m; i++) {
peq[p.charCodeAt(i)] = 0;
for (let i = 0; i < m;) {
const cp = p.codePointAt(i)!;
peq[cp] = 0;
i += cp > 0xffff ? 2 : 1;
}
return score;
}

function myersX(t: string, p: string): number {
const n = t.length;
const m = p.length;
const n = unicodeStrLen(t);
const m = unicodeStrLen(p);
// Initialize the horizontal deltas to +1.
const h = new Int8Array(n).fill(1);
const bmax = ceil(m / 32) - 1;
// Process the blocks row by row so that we can use the fixed-size peq array.
for (let b = 0; b < bmax; b++) {
const start = b * 32;
const end = (b + 1) * 32;
for (let i = start; i < end; i++) {
peq[p.charCodeAt(i)]! |= 1 << i;
for (let i = start; i < end;) {
const cp = p.codePointAt(i)!;
peq[cp]! |= 1 << i;
i += cp > 0xffff ? 2 : 1;
}
let pv = -1;
let mv = 0;
for (let j = 0; j < n; j++) {
for (let j = 0; j < n;) {
const hin = h[j]!;
let eq = peq[t.charCodeAt(j)]!;
const cp = t.codePointAt(j)!;
let eq = peq[cp]!;
const xv = eq | mv;
eq |= hin >>> 31;
const xh = (((eq & pv) + pv) ^ pv) | eq;
Expand All @@ -64,22 +74,30 @@ function myersX(t: string, p: string): number {
mh = (mh << 1) | (hin >>> 31);
pv = mh | ~(xv | ph);
mv = ph & xv;

j += cp > 0xffff ? 2 : 1;
}
for (let i = start; i < end; i++) {
peq[p.charCodeAt(i)] = 0;
for (let i = start; i < end;) {
const cp = p.codePointAt(i)!;
peq[cp] = 0;

i += cp > 0xffff ? 2 : 1;
}
}
const start = bmax * 32;
for (let i = start; i < m; i++) {
peq[p.charCodeAt(i)]! |= 1 << i;
for (let i = start; i < m;) {
const cp = p.codePointAt(i)!;
peq[cp]! |= 1 << i;
i += cp > 0xffff ? 2 : 1;
}
const last = m - 1;
let pv = -1;
let mv = 0;
let score = m;
for (let j = 0; j < n; j++) {
for (let j = 0; j < n;) {
const hin = h[j]!;
let eq = peq[t.charCodeAt(j)]!;
const cp = t.codePointAt(j)!;
let eq = peq[cp]!;
const xv = eq | mv;
eq |= hin >>> 31;
const xh = (((eq & pv) + pv) ^ pv) | eq;
Expand All @@ -90,9 +108,13 @@ function myersX(t: string, p: string): number {
mh = (mh << 1) | (hin >>> 31);
pv = mh | ~(xv | ph);
mv = ph & xv;

j += cp > 0xffff ? 2 : 1;
}
for (let i = start; i < m; i++) {
peq[p.charCodeAt(i)] = 0;
for (let i = start; i < m;) {
const cp = p.codePointAt(i)!;
peq[cp] = 0;
i += cp > 0xffff ? 2 : 1;
}
return score;
}
Expand All @@ -119,13 +141,19 @@ function myersX(t: string, p: string): number {
* @returns The Levenshtein distance between the two strings.
*/
export function levenshteinDistance(str1: string, str2: string): number {
if (str1.length < str2.length) {
const tmp = str1;
str1 = str2;
str2 = tmp;
let strLen1 = unicodeStrLen(str1);
let strLen2 = unicodeStrLen(str2);

if (strLen1 < strLen2) {
[str1, str2] = [str2, str1];
[strLen1, strLen2] = [strLen2, strLen1];
}
if (str2.length === 0) {
return str1.length;
if (str2 === "") {
return strLen1;
}
return str2.length <= 32 ? myers32(str1, str2) : myersX(str1, str2);
return strLen2 <= 32 ? myers32(str1, str2) : myersX(str1, str2);
}

/**
 * Counts the Unicode code points in a string.
 *
 * A high surrogate (U+D800–U+DBFF) immediately followed by a low surrogate
 * (U+DC00–U+DFFF) counts as one code point. Every other UTF-16 code unit,
 * including an unpaired surrogate, counts as one on its own.
 *
 * The count is made directly over the code units, so no intermediate string
 * is allocated and no regular expression is evaluated.
 *
 * @param str The string to measure.
 * @returns The number of code points in `str`.
 */
function unicodeStrLen(str: string): number {
  const units = str.length;
  let count = 0;
  for (let i = 0; i < units; i++) {
    const unit = str.charCodeAt(i);
    if (unit >= 0xd800 && unit <= 0xdbff && i + 1 < units) {
      const next = str.charCodeAt(i + 1);
      // A well-formed surrogate pair is a single code point: skip its low half.
      if (next >= 0xdc00 && next <= 0xdfff) i++;
    }
    count++;
  }
  return count;
}
31 changes: 31 additions & 0 deletions text/levenshtein_distance_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,34 @@ Deno.test("levenshteinDistance() handles long strings", () => {
30,
);
});

Deno.test("levenshteinDistance() handles code points above U+FFFF", async (t) => {
  // Runs every input pair with both sides repeated `count` times and expects
  // a distance of `count`. Repeating the empty string leaves it empty.
  const checkAll = (count: number) => {
    const cases: [string, string][] = [
      ["💩", "x"],
      ["💩", ""],
      ["x", "💩"],
      ["", "💩"],
      // first surrogate same
      ["💩", "💫"],
      // both surrogates different
      ["💩", "🦄"],
      // max cp
      ["\u{10FFFE}", "\u{10FFFF}"],
    ];
    for (const [left, right] of cases) {
      assertEquals(
        levenshteinDistance(left.repeat(count), right.repeat(count)),
        count,
      );
    }
  };

  // A single repetition keeps both inputs on the short-pattern path.
  await t.step("`myers32` fast path", () => {
    checkAll(1);
  });

  // 33 repetitions match the lengths the original `myersX` assertions used.
  await t.step("`myersX` path", () => {
    checkAll(33);
  });
});