diff --git a/src/core/series.ts b/src/core/series.ts index 46d3a7cb..b4328084 100644 --- a/src/core/series.ts +++ b/src/core/series.ts @@ -764,8 +764,12 @@ export class Series { let nanCount = 0; let allNumeric = true; - // Single pass: partition NaN/null and initialise AoS radix entries for finite numerics. - // fvals is indexed by compact slot (finCount) so reads and writes are sequential. + // Clear histograms before the init loop so we can accumulate them inline. + _rxHisto.fill(0); + + // Single pass: partition NaN/null, initialise AoS radix entries for finite + // numerics, and accumulate all 8 histograms simultaneously — eliminating the + // separate O(n) histogram scan that the previous implementation required. for (let i = 0; i < n; i++) { const v = vals[i]; if (v === null || v === undefined || (typeof v === "number" && Number.isNaN(v))) { @@ -791,6 +795,24 @@ export class Series { _rxA[base] = i; _rxA[base + 1] = lo; _rxA[base + 2] = hi; + // Accumulate all 8 histogram passes inline — no second scan needed. + let idx: number; + idx = lo & 0xff; + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 256 + ((lo >>> 8) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 512 + ((lo >>> 16) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 768 + ((lo >>> 24) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 1024 + (hi & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 1280 + ((hi >>> 8) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 1536 + ((hi >>> 16) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; + idx = 1792 + ((hi >>> 24) & 0xff); + _rxHisto[idx] = _rxHisto[idx]! + 1; } else { allNumeric = false; } @@ -807,35 +829,9 @@ export class Series { if (allNumeric && finCount > 0) { // ── LSD radix sort: 8 passes × 8 bits over IEEE-754 transformed keys ── - // _rxA is already initialised by the merged loop above. + // _rxA and _rxHisto are already initialised by the merged loop above. // AoS layout: srcBuf[i*3]=origIdx, srcBuf[i*3+1]=loKey, srcBuf[i*3+2]=hiKey. - // Pre-compute all 8 histograms in a single O(n) scan, saving 7 redundant - // count loops. _rxHisto[pass*256 + byte] = count of elements with that byte. - _rxHisto.fill(0); - for (let i = 0; i < finCount; i++) { - const si = i * 3; - const lo = srcBuf[si + 1]!; - const hi = srcBuf[si + 2]!; - let idx: number; - idx = 0 * 256 + (lo & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 1 * 256 + ((lo >>> 8) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 2 * 256 + ((lo >>> 16) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 3 * 256 + ((lo >>> 24) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 4 * 256 + (hi & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 5 * 256 + ((hi >>> 8) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 6 * 256 + ((hi >>> 16) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - idx = 7 * 256 + ((hi >>> 24) & 0xff); - _rxHisto[idx] = _rxHisto[idx]! + 1; - } - // Convert each histogram to an exclusive prefix sum (cumulative offsets). for (let pass = 0; pass < 8; pass++) { const base = pass * 256; @@ -855,8 +851,8 @@ export class Series { const keyOff = pass < 4 ? 1 : 2; const shift = (pass % 4) * 8; const histoBase = pass * 256; - for (let i = 0; i < finCount; i++) { - const si = i * 3; + // Use accumulated stride counter (si += 3) to avoid i*3 multiply per element. + for (let i = 0, si = 0; i < finCount; i++, si += 3) { const bucket = (srcBuf[si + keyOff]! >>> shift) & 0xff; const p = _rxHisto[histoBase + bucket]!; _rxHisto[histoBase + bucket] = p + 1; @@ -904,15 +900,15 @@ export class Series { } if (allNumeric) { if (ascending) { - for (let i = 0; i < finCount; i++) { - const idx = srcBuf[i * 3]!; + for (let i = 0, si = 0; i < finCount; i++, si += 3) { + const idx = srcBuf[si]!; perm[pos] = idx; outData[pos] = vals[idx] as T; pos = pos + 1; } } else { - for (let i = finCount - 1; i >= 0; i--) { - const idx = srcBuf[i * 3]!; + for (let i = finCount - 1, si = (finCount - 1) * 3; i >= 0; i--, si -= 3) { + const idx = srcBuf[si]!; perm[pos] = idx; outData[pos] = vals[idx] as T; pos = pos + 1; @@ -929,15 +925,15 @@ export class Series { } else { if (allNumeric) { if (ascending) { - for (let i = 0; i < finCount; i++) { - const idx = srcBuf[i * 3]!; + for (let i = 0, si = 0; i < finCount; i++, si += 3) { + const idx = srcBuf[si]!; perm[pos] = idx; outData[pos] = vals[idx] as T; pos = pos + 1; } } else { - for (let i = finCount - 1; i >= 0; i--) { - const idx = srcBuf[i * 3]!; + for (let i = finCount - 1, si = (finCount - 1) * 3; i >= 0; i--, si -= 3) { + const idx = srcBuf[si]!; perm[pos] = idx; outData[pos] = vals[idx] as T; pos = pos + 1; @@ -959,9 +955,16 @@ export class Series { } } + // RangeIndex fast path: for a default 0-based RangeIndex the output index is + // just perm itself — skip 100k bounds-checked at() calls from index.take(). + const outIndex: Index