Skip to content

Commit 0f884a2

Browse files
mbostock and Fil authored
support Apache Arrow as a normalized data representation (#2115)
* columnar support for arrow tables
* defer reading the values until they're actually requested (which is often not the case)
* tests
* comment
* fix apache arrow dates (alternative to #2096)
* add test snapshot
* fix test wrt apache/arrow#40718
* arrow table data; fix BigInt coercion
* more arrow support
* arrow date hint; fix BigInt coercion
* inline floater
* shorten slightly
* valueof tests; better arrow coercion
* Arrow-aware stack transform
* a few more dataify
* fix merge conflict
* fix Plot.find and stack customOrder
* handle Arrow in a few more places

---------

Co-authored-by: Philippe Rivière <[email protected]>
1 parent c911b1a commit 0f884a2

26 files changed

+1682 -71 lines changed

package.json

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
"@types/node": "^20.5.0",
5757
"@typescript-eslint/eslint-plugin": "^7.2.0",
5858
"@typescript-eslint/parser": "^7.2.0",
59+
"apache-arrow": "^16.0.2",
5960
"c8": "^9.1.0",
6061
"canvas": "^2.0.0",
6162
"d3-geo-projection": "^4.0.0",

src/interactions/pointer.js

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import {pointer as pointof} from "d3";
22
import {composeRender} from "../mark.js";
3+
import {isArray} from "../options.js";
34
import {applyFrameAnchor} from "../style.js";
45

56
const states = new WeakMap();
@@ -126,7 +127,11 @@ function pointerK(kx, ky, {x, y, px, py, maxRadius = 40, channels, render, ...op
126127

127128
// Dispatch the value. When simultaneously exiting this facet and
128129
// entering a new one, prioritize the entering facet.
129-
if (!(i == null && facetState?.size > 1)) context.dispatchValue(i == null ? null : data[i]);
130+
if (!(i == null && facetState?.size > 1)) {
131+
const value = i == null ? null : isArray(data) ? data[i] : data.get(i);
132+
context.dispatchValue(value);
133+
}
134+
130135
return r;
131136
}
132137

src/mark.d.ts

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ export type TipPointer = "x" | "y" | "xy";
3232
*
3333
* - an array, typed array, or other iterable
3434
* - an object with a length property and indexed values
35+
* - an Apache Arrow Table
3536
*/
3637
export type Data = Iterable<any> | ArrayLike<any>;
3738

src/mark.js

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {channelDomain, createChannels, valueObject} from "./channel.js";
22
import {defined} from "./defined.js";
33
import {maybeFacetAnchor} from "./facet.js";
44
import {maybeClip, maybeNamed, maybeValue} from "./options.js";
5-
import {arrayify, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
5+
import {dataify, isDomainSort, isObject, isOptions, keyword, range, singleton} from "./options.js";
66
import {project} from "./projection.js";
77
import {maybeClassName, styles} from "./style.js";
88
import {basic, initializer} from "./transforms/basic.js";
@@ -89,10 +89,10 @@ export class Mark {
8989
}
9090
}
9191
initialize(facets, facetChannels, plotOptions) {
92-
let data = arrayify(this.data);
92+
let data = dataify(this.data);
9393
if (facets === undefined && data != null) facets = [range(data)];
9494
const originalFacets = facets;
95-
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = arrayify(data));
95+
if (this.transform != null) ({facets, data} = this.transform(data, facets, plotOptions)), (data = dataify(data));
9696
if (facets !== undefined) facets.original = originalFacets; // needed to read facetChannels
9797
const channels = createChannels(this.channels, data);
9898
if (this.sort != null) channelDomain(data, facets, channels, facetChannels, this.sort); // mutates facetChannels!

src/options.js

+75-17
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,36 @@ import {timeInterval, utcInterval} from "./time.js";
77
export const TypedArray = Object.getPrototypeOf(Uint8Array);
88
const objectToString = Object.prototype.toString;
99

10+
export function isArray(value) {
11+
return value instanceof Array || value instanceof TypedArray;
12+
}
13+
14+
function isNumberArray(value) {
15+
return value instanceof TypedArray && !isBigIntArray(value);
16+
}
17+
18+
function isNumberType(type) {
19+
return type?.prototype instanceof TypedArray && !isBigIntType(type);
20+
}
21+
22+
function isBigIntArray(value) {
23+
return value instanceof BigInt64Array || value instanceof BigUint64Array;
24+
}
25+
26+
function isBigIntType(type) {
27+
return type === BigInt64Array || type === BigUint64Array;
28+
}
29+
1030
// If a reindex is attached to the data, channel values expressed as arrays will
1131
// be reindexed when the channels are instantiated. See exclusiveFacets.
1232
export const reindex = Symbol("reindex");
1333

1434
export function valueof(data, value, type) {
1535
const valueType = typeof value;
1636
return valueType === "string"
17-
? maybeTypedMap(data, field(value), type)
37+
? isArrowTable(data)
38+
? maybeTypedArrowify(data.getChild(value), type)
39+
: maybeTypedMap(data, field(value), type)
1840
: valueType === "function"
1941
? maybeTypedMap(data, value, type)
2042
: valueType === "number" || value instanceof Date || valueType === "boolean"
@@ -29,21 +51,25 @@ function maybeTake(values, index) {
2951
}
3052

3153
function maybeTypedMap(data, f, type) {
32-
return map(data, type?.prototype instanceof TypedArray ? floater(f) : f, type);
54+
return map(data, isNumberType(type) ? (d, i) => coerceNumber(f(d, i)) : f, type); // allow conversion from BigInt
3355
}
3456

3557
function maybeTypedArrayify(data, type) {
3658
return type === undefined
3759
? arrayify(data) // preserve undefined type
60+
: isArrowVector(data)
61+
? maybeTypedArrowify(data, type)
3862
: data instanceof type
3963
? data
40-
: type.prototype instanceof TypedArray && !(data instanceof TypedArray)
41-
? type.from(data, coerceNumber)
42-
: type.from(data);
64+
: type.from(data, isNumberType(type) && !isNumberArray(data) ? coerceNumber : undefined);
4365
}
4466

45-
function floater(f) {
46-
return (d, i) => coerceNumber(f(d, i));
67+
function maybeTypedArrowify(vector, type) {
68+
return vector == null
69+
? vector
70+
: (type === undefined || type === Array) && isArrowDateType(vector.type)
71+
? coerceDates(vector.toArray())
72+
: maybeTypedArrayify(vector.toArray(), type);
4773
}
4874

4975
export const singleton = [null]; // for data-less decoration marks, e.g. frame
@@ -70,7 +96,7 @@ export function percentile(reduce) {
7096

7197
// If the values are specified as a typed array, no coercion is required.
7298
export function coerceNumbers(values) {
73-
return values instanceof TypedArray ? values : map(values, coerceNumber, Float64Array);
99+
return isNumberArray(values) ? values : map(values, coerceNumber, Float64Array);
74100
}
75101

76102
// Unlike Mark’s number, here we want to convert null and undefined to NaN since
@@ -95,7 +121,7 @@ export function coerceDate(x) {
95121
? x
96122
: typeof x === "string"
97123
? isoParse(x)
98-
: x == null || isNaN((x = +x))
124+
: x == null || isNaN((x = Number(x))) // allow conversion from BigInt
99125
? undefined
100126
: new Date(x);
101127
}
@@ -130,9 +156,15 @@ export function keyword(input, name, allowed) {
130156
return i;
131157
}
132158

159+
// Like arrayify, but also allows data to be an Apache Arrow Table.
160+
export function dataify(data) {
161+
return isArrowTable(data) ? data : arrayify(data);
162+
}
163+
133164
// Promotes the specified data to an array as needed.
134165
export function arrayify(values) {
135-
if (values == null || values instanceof Array || values instanceof TypedArray) return values;
166+
if (values == null || isArray(values)) return values;
167+
if (isArrowVector(values)) return maybeTypedArrowify(values);
136168
switch (values.type) {
137169
case "FeatureCollection":
138170
return values.features;
@@ -233,22 +265,21 @@ export function maybeZ({z, fill, stroke} = {}) {
233265
return z;
234266
}
235267

268+
export function lengthof(data) {
269+
return isArray(data) ? data.length : data?.numRows;
270+
}
271+
236272
// Returns a Uint32Array with elements [0, 1, 2, … data.length - 1].
237273
export function range(data) {
238-
const n = data.length;
274+
const n = lengthof(data);
239275
const r = new Uint32Array(n);
240276
for (let i = 0; i < n; ++i) r[i] = i;
241277
return r;
242278
}
243279

244-
// Returns a filtered range of data given the test function.
245-
export function where(data, test) {
246-
return range(data).filter((i) => test(data[i], i, data));
247-
}
248-
249280
// Returns an array [values[index[0]], values[index[1]], …].
250281
export function take(values, index) {
251-
return map(index, (i) => values[i], values.constructor);
282+
return isArray(values) ? map(index, (i) => values[i], values.constructor) : map(index, (i) => values.at(i));
252283
}
253284

254285
// If f does not take exactly one argument, wraps it in a function that uses take.
@@ -575,3 +606,30 @@ export function maybeClip(clip) {
575606
else if (clip != null) clip = keyword(clip, "clip", ["frame", "sphere"]);
576607
return clip;
577608
}
609+
610+
// https://github.com/observablehq/stdlib/blob/746ca2e69135df6178e4f3a17244def35d8d6b20/src/arrow.js#L4C1-L17C1
611+
function isArrowTable(value) {
612+
return (
613+
value &&
614+
typeof value.getChild === "function" &&
615+
typeof value.toArray === "function" &&
616+
value.schema &&
617+
Array.isArray(value.schema.fields)
618+
);
619+
}
620+
621+
function isArrowVector(value) {
622+
return value && typeof value.toArray === "function" && value.type;
623+
}
624+
625+
// Apache Arrow now represents dates as numbers. We currently only support
626+
// implicit coercion to JavaScript Date objects when the numbers represent
627+
// milliseconds since Unix epoch.
628+
function isArrowDateType(type) {
629+
return (
630+
type &&
631+
(type.typeId === 8 || // date
632+
type.typeId === 10) && // timestamp
633+
type.unit === 1 // millisecond
634+
);
635+
}

src/plot.js

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {axisFx, axisFy, axisX, axisY, gridFx, gridFy, gridX, gridY} from "./mark
1010
import {frame} from "./marks/frame.js";
1111
import {tip} from "./marks/tip.js";
1212
import {isColor, isIterable, isNone, isScaleOptions} from "./options.js";
13-
import {arrayify, map, yes, maybeIntervalTransform, subarray} from "./options.js";
13+
import {dataify, lengthof, map, yes, maybeIntervalTransform, subarray} from "./options.js";
1414
import {createProjection, getGeometryChannels, hasProjection} from "./projection.js";
1515
import {createScales, createScaleFunctions, autoScaleRange, exposeScales} from "./scales.js";
1616
import {innerDimensions, outerDimensions} from "./scales.js";
@@ -459,7 +459,7 @@ function maybeTopFacet(facet, options) {
459459
if (facet == null) return;
460460
const {x, y} = facet;
461461
if (x == null && y == null) return;
462-
const data = arrayify(facet.data);
462+
const data = dataify(facet.data);
463463
if (data == null) throw new Error("missing facet data");
464464
const channels = {};
465465
if (x != null) channels.fx = createChannel(data, {value: x, scale: "fx"});
@@ -478,7 +478,7 @@ function maybeMarkFacet(mark, topFacetState, options) {
478478
// here with maybeTopFacet that we could reduce.
479479
const {fx, fy} = mark;
480480
if (fx != null || fy != null) {
481-
const data = arrayify(mark.data ?? fx ?? fy);
481+
const data = dataify(mark.data ?? fx ?? fy);
482482
if (data === undefined) throw new Error(`missing facet data in ${mark.ariaLabel}`);
483483
if (data === null) return; // ignore channel definitions if no data is provided TODO this right?
484484
const channels = {};
@@ -500,7 +500,7 @@ function maybeMarkFacet(mark, topFacetState, options) {
500500
if (
501501
data.length > 0 &&
502502
(groups.size > 1 || (groups.size === 1 && channels.fx && channels.fy && [...groups][0][1].size > 1)) &&
503-
arrayify(mark.data)?.length === data.length
503+
lengthof(dataify(mark.data)) === lengthof(data)
504504
) {
505505
warn(
506506
`Warning: the ${mark.ariaLabel} mark appears to use faceted data, but isn’t faceted. The mark data has the same length as the facet data and the mark facet option is "auto", but the mark data and facet data are distinct. If this mark should be faceted, set the mark facet option to true; otherwise, suppress this warning by setting the mark facet option to false.`

src/transforms/basic.js

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import {randomLcg} from "d3";
22
import {ascendingDefined, descendingDefined} from "../defined.js";
3-
import {arrayify, isDomainSort, isOptions, maybeValue, valueof} from "../options.js";
3+
import {isArray, isDomainSort, isOptions} from "../options.js";
4+
import {dataify, maybeValue, valueof} from "../options.js";
45

56
export function basic({filter: f1, sort: s1, reverse: r1, transform: t1, initializer: i1, ...options} = {}, transform) {
67
// If both t1 and t2 are defined, returns a composite transform that first
@@ -40,7 +41,7 @@ function composeTransform(t1, t2) {
4041
if (t2 == null) return t1 === null ? undefined : t1;
4142
return function (data, facets, plotOptions) {
4243
({data, facets} = t1.call(this, data, facets, plotOptions));
43-
return t2.call(this, arrayify(data), facets, plotOptions);
44+
return t2.call(this, dataify(data), facets, plotOptions);
4445
};
4546
}
4647

@@ -101,7 +102,9 @@ function sortTransform(value) {
101102

102103
function sortData(compare) {
103104
return (data, facets) => {
104-
const compareData = (i, j) => compare(data[i], data[j]);
105+
const compareData = isArray(data)
106+
? (i, j) => compare(data[i], data[j])
107+
: (i, j) => compare(data.get(i), data.get(j));
105108
return {data, facets: facets.map((I) => I.slice().sort(compareData))};
106109
};
107110
}

src/transforms/exclusiveFacets.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
import {reindex, slice} from "../options.js";
1+
import {lengthof, reindex, slice} from "../options.js";
22

33
export function exclusiveFacets(data, facets) {
44
if (facets.length === 1) return {data, facets}; // only one facet; trivially exclusive
55

6-
const n = data.length;
6+
const n = lengthof(data);
77
const O = new Uint8Array(n);
88
let overlaps = 0;
99

src/transforms/group.js

+6-34
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,9 @@
1-
import {
2-
InternSet,
3-
deviation,
4-
group as grouper,
5-
max,
6-
maxIndex,
7-
mean,
8-
median,
9-
min,
10-
minIndex,
11-
mode,
12-
rollup,
13-
sort,
14-
sum,
15-
variance
16-
} from "d3";
1+
import {InternSet, group as grouper, rollup, sort} from "d3";
2+
import {deviation, max, maxIndex, mean, median, min, minIndex, mode, sum, variance} from "d3";
173
import {ascendingDefined} from "../defined.js";
18-
import {
19-
column,
20-
identity,
21-
isObject,
22-
isTemporal,
23-
labelof,
24-
maybeApplyInterval,
25-
maybeColorChannel,
26-
maybeColumn,
27-
maybeInput,
28-
maybeTuple,
29-
percentile,
30-
range,
31-
second,
32-
take,
33-
valueof
34-
} from "../options.js";
4+
import {maybeApplyInterval, maybeColorChannel, maybeColumn, maybeInput, maybeTuple} from "../options.js";
5+
import {isArray, isObject, isTemporal} from "../options.js";
6+
import {column, identity, labelof, percentile, range, second, take, valueof} from "../options.js";
357
import {basic} from "./basic.js";
368

379
// Group on {z, fill, stroke}.
@@ -444,7 +416,7 @@ export function find(test) {
444416
if (typeof test !== "function") throw new Error(`invalid test function: ${test}`);
445417
return {
446418
reduceIndex(I, V, {data}) {
447-
return V[I.find((i) => test(data[i], i, data))];
419+
return V[I.find(isArray(data) ? (i) => test(data[i], i, data) : (i) => test(data.get(i), i, data))];
448420
}
449421
};
450422
}

src/transforms/stack.js

+6-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {InternMap, cumsum, greatest, group, groupSort, max, min, rollup, sum} fr
22
import {ascendingDefined, descendingDefined} from "../defined.js";
33
import {withTip} from "../mark.js";
44
import {maybeApplyInterval, maybeColumn, maybeZ, maybeZero} from "../options.js";
5-
import {column, field, mid, one, range, valueof} from "../options.js";
5+
import {column, field, isArray, lengthof, mid, one, range, valueof} from "../options.js";
66
import {basic} from "./basic.js";
77
import {exclusiveFacets} from "./exclusiveFacets.js";
88

@@ -91,7 +91,7 @@ function stack(x, y = one, kx, ky, {offset, order, reverse}, options) {
9191
const Y = valueof(data, y, Float64Array);
9292
const Z = valueof(data, z);
9393
const compare = order && order(data, X, Y, Z);
94-
const n = data.length;
94+
const n = lengthof(data);
9595
const Y1 = setY1(new Float64Array(n));
9696
const Y2 = setY2(new Float64Array(n));
9797
const facetstacks = [];
@@ -252,7 +252,7 @@ function maybeOrder(order, offset, ky) {
252252
return orderAccessor(field(order));
253253
}
254254
if (typeof order === "function") return (order.length === 1 ? orderAccessor : orderComparator)(order);
255-
if (Array.isArray(order)) return orderGiven(order);
255+
if (isArray(order)) return orderGiven(order);
256256
throw new Error(`invalid order: ${order}`);
257257
}
258258

@@ -327,7 +327,9 @@ function orderAccessor(f) {
327327
}
328328

329329
function orderComparator(f) {
330-
return (data) => (i, j) => f(data[i], data[j]);
330+
return (data) => {
331+
return isArray(data) ? (i, j) => f(data[i], data[j]) : (i, j) => f(data.get(i), data.get(j));
332+
};
331333
}
332334

333335
function orderGiven(domain) {

0 commit comments

Comments
 (0)