diff --git a/README.md b/README.md index ace82cd..64b9f33 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ Flechette performs fast extraction of data columns in the Arrow binary IPC forma In the process of developing multiple data analysis packages that consume Arrow data (including Arquero, Mosaic, and Vega), we've had to develop workarounds for the performance and correctness of the Arrow JavaScript reference implementation. Instead of workarounds, Flechette addresses these issues head-on. -* _Speed_. Flechette provides faster decoding. Across varied datasets, initial performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, and 5-9x faster row object extraction. +* _Speed_. Flechette provides faster decoding. Initial performance tests show 1.3-1.6x faster value iteration, 2-2.5x faster random value access, 2-7x faster array extraction, and 7-11x faster row object extraction. * _Size_. Flechette is ~17k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS. * _Coverage_. Flechette supports data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), list views, and run-end encoded data. -* _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data. +* _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, numbers vs. bigint values for 64-bit integer data, and vanilla JS objects vs. optimized proxy objects for structs. * _Simplicity_. Our goal is to provide a smaller, simpler code base in the hope that it will make it easier for ourselves and others to improve the library. 
If you'd like to see support for additional Arrow data types or features, please [file an issue](https://github.com/uwdata/flechette/issues) or [open a pull request](https://github.com/uwdata/flechette/pulls). @@ -61,7 +61,7 @@ const time0 = table.getChild('time').at(0); // { delay: Int16Array, distance: Int16Array, time: Float32Array } const columns = table.toColumns(); -// convert Arrow data to an array of standard JS objects +// convert Arrow data to an array of JS objects // [ { delay: 14, distance: 405, time: 0.01666666753590107 }, ... ] const objects = table.toArray(); @@ -72,13 +72,14 @@ const subtable = table.select(['delay', 'time']); ### Customize Data Extraction -Data extraction can be customized using options provided to the table generation method. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, and map-typed data is returned as an array of [key, value] pairs. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. +Data extraction can be customized using options provided to the table generation method. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, map-typed data is returned as an array of [key, value] pairs, and struct/row objects are returned as vanilla JS objects with extracted property values. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. 
```js const table = tableFromIPC(ipc, { - useDate: true, // map temporal data to Date objects - useBigInt: true, // use BigInt, do not coerce to number - useMap: true // create Map objects for [key, value] pair lists + useDate: true, // use Date objects for temporal (Date, Timestamp) data + useBigInt: true, // use BigInt for large integers, do not coerce to number + useMap: true, // use Map objects for [key, value] pair lists + useProxy: true // use zero-copy proxies for struct and table row objects }); ``` diff --git a/perf/perf-test.js b/perf/perf-test.js index e05b8ec..aecafe7 100644 --- a/perf/perf-test.js +++ b/perf/perf-test.js @@ -4,7 +4,7 @@ import { tableFromIPC as flTable } from '../src/index.js'; import { benchmark } from './util.js'; // table creation -const fl = bytes => flTable(bytes, { useBigInt: true }); +const fl = bytes => flTable(bytes, { useProxy: true, useBigInt: true }); const aa = bytes => aaTable(bytes); // parse ipc data to columns diff --git a/src/batch.js b/src/batch.js index 7548f66..c8c2d93 100644 --- a/src/batch.js +++ b/src/batch.js @@ -733,18 +733,20 @@ export class DenseUnionBatch extends SparseUnionBatch { */ export class StructBatch extends ArrayBatch { /** - * Create a new column batch. + * Create a new struct batch. 
* @param {object} options * @param {number} options.length The length of the batch * @param {number} options.nullCount The null value count * @param {Uint8Array} [options.validity] Validity bitmap buffer * @param {Batch[]} options.children Children batches * @param {string[]} options.names Child batch names + * @param {(names: string[], batches: Batch[]) => + * (index: number) => Record<string, any>} options.factory + * Struct object factory creation method */ - constructor({ names, ...rest }) { + constructor({ names, factory, ...rest }) { super(rest); - /** @type {string[]} */ - this.names = names; + this.factory = factory(names, this.children); } /** @@ -752,13 +754,7 @@ export class StructBatch extends ArrayBatch { * @returns {Record<string, any>} */ value(index) { - const { children, names } = this; - const n = names.length; - const struct = {}; - for (let i = 0; i < n; ++i) { - struct[names[i]] = children[i].at(index); - } - return struct; + return this.factory(index); } } diff --git a/src/table-from-ipc.js b/src/table-from-ipc.js index 371a0c2..dcded69 100644 --- a/src/table-from-ipc.js +++ b/src/table-from-ipc.js @@ -42,7 +42,7 @@ import { } from './constants.js'; import { parseIPC } from './parse-ipc.js'; import { Table } from './table.js'; -import { keyFor } from './util.js'; +import { keyFor, objectFactory, proxyFactory } from './util.js'; /** * Decode [Apache Arrow IPC data][1] and return a new Table. The input binary @@ -105,7 +105,11 @@ export function createTable(data, options = {}) { fields.forEach((f, i) => cols[i].add(visit(f.type, ctx))); } - return new Table(schema, cols.map(c => c.done())); + return new Table( + schema, + cols.map(c => c.done()), + options.useProxy ? 
proxyFactory : objectFactory + ); } /** @@ -145,7 +149,7 @@ function contextGenerator(options, version, dictionaryMap) { */ function visit(type, ctx) { const { typeId, bitWidth, precision, scale, stride, unit } = type; - const { useBigInt, useDate, useMap } = ctx.options; + const { useBigInt, useDate, useMap, useProxy } = ctx.options; // no field node, no buffers if (typeId === Type.Null) { @@ -246,7 +250,8 @@ function visit(type, ctx) { // validity and children case Type.FixedSizeList: return kids(FixedListBatch, { stride }); case Type.Struct: return kids(StructBatch, { - names: type.children.map(child => child.name) + names: type.children.map(child => child.name), + factory: useProxy ? proxyFactory : objectFactory }); // children only diff --git a/src/table.js b/src/table.js index 1e8f2cc..f15e856 100644 --- a/src/table.js +++ b/src/table.js @@ -1,4 +1,4 @@ -import { bisect } from './util.js'; +import { bisect, objectFactory } from './util.js'; /** * A table consists of a collection of named columns (or 'children'). @@ -12,8 +12,12 @@ export class Table { * Create a new table with the given schema and columns (children). * @param {import('./types.js').Schema} schema The table schema. * @param {import('./column.js').Column[]} children The table columns. + * @param {(names: string[], batches: import('./batch.js').Batch[]) => + * (index: number) => Record<string, any>} [factoryMethod] + * Row object factory creation method. By default, vanilla JS objects + * are used, with property values extracted from Arrow data. */ - constructor(schema, children) { + constructor(schema, children, factoryMethod = objectFactory) { /** @readonly */ this.schema = schema; /** @readonly */ @@ -23,6 +27,19 @@ export class Table { * @readonly */ this.children = children; + + // lazily created row object generators + const gen = []; + + /** + * Returns a row object generator for the given batch index. + * @private + * @readonly + * @param {number} b The batch index. 
+ * @returns {(index: number) => Record<string, any>} + */ + this.factory = (b) => gen[b] ?? + (gen[b] = factoryMethod(this.names, children.map(c => c.data[b]))); } /** @@ -117,12 +134,13 @@ export class Table { * @returns {Record<string, any>[]} */ toArray() { - const { children, numRows, names } = this; + const { children, factory, numRows } = this; const data = children[0]?.data ?? []; const output = Array(numRows); for (let b = 0, row = -1; b < data.length; ++b) { + const f = factory(b); for (let i = 0; i < data[b].length; ++i) { - output[++row] = rowObject(names, children, b, i); + output[++row] = f(i); } } return output; @@ -133,11 +151,12 @@ export class Table { * @returns {Generator<Record<string, any>, any, null>} */ *[Symbol.iterator]() { - const { children, names } = this; + const { children, factory } = this; const data = children[0]?.data ?? []; for (let b = 0; b < data.length; ++b) { + const f = factory(b); for (let i = 0; i < data[b].length; ++i) { - yield rowObject(names, children, b, i); + yield f(i); } } } @@ -148,11 +167,11 @@ export class Table { * @returns {Record<string, any>} The row object. */ at(index) { - const { names, children, numRows } = this; + const { children, factory, numRows } = this; if (index < 0 || index >= numRows) return null; const [{ offsets }] = children; - const i = bisect(offsets, index) - 1; - return rowObject(names, children, i, index - offsets[i]); + const b = bisect(offsets, index) - 1; + return factory(b)(index - offsets[b]); } /** @@ -171,11 +190,3 @@ function renameField(field, name) { ? { ...field, name } : field; } - -function rowObject(names, children, batch, index) { - const o = {}; - for (let j = 0; j < names.length; ++j) { - o[names[j]] = children[j].data[batch].at(index); - } - return o; -} diff --git a/src/types.ts b/src/types.ts index 1ce706f..5e4786b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -303,4 +303,12 @@ export interface ExtractionOptions { * both `Map` and `Object.fromEntries` (default). 
*/ useMap?: boolean; + /** + * If true, extract Arrow 'Struct' values and table row objects using + * zero-copy proxy objects that extract data from underlying Arrow batches. + * The proxy objects can improve performance and reduce memory usage, but + * do not support property enumeration (`Object.keys`, `Object.values`, + * `Object.entries`) or spreading (`{ ...object }`). + */ + useProxy?: boolean; } diff --git a/src/util.js b/src/util.js index b323b72..f434614 100644 --- a/src/util.js +++ b/src/util.js @@ -87,6 +87,59 @@ export function bisect(offsets, index) { return a; } +export const RowIndex = Symbol('rowIndex'); + +/** + * Returns a row proxy object factory. The resulting method takes a + * batch-level row index as input and returns an object that proxies + * access to underlying batches. + * @param {string[]} names The column (property) names + * @param {import('./batch.js').Batch[]} batches The value batches. + * @returns {(index: number) => Record<string, any>} + */ +export function proxyFactory(names, batches) { + class RowObject { + constructor(index) { + this[RowIndex] = index; + } + }; + + // prototype for row proxy objects + const proto = RowObject.prototype; + + for (let i = 0; i < names.length; ++i) { + // skip duplicated column names + if (Object.hasOwn(proto, names[i])) continue; + + // add a getter method for the current batch + const batch = batches[i]; + Object.defineProperty(proto, names[i], { + get() { return batch.at(this[RowIndex]); }, + enumerable: true + }); + } + + return (index) => new RowObject(index); +} + +/** + * Returns a row object factory. The resulting method takes a + * batch-level row index as input and returns an object whose property + * values have been extracted from the batches. + * @param {string[]} names The column (property) names + * @param {import('./batch.js').Batch[]} batches The value batches. 
+ * @returns {(index: number) => Record<string, any>} + */ +export function objectFactory(names, batches) { + return (index) => { + const r = {}; + for (let i = 0; i < names.length; ++i) { + r[names[i]] = batches[i].at(index); + } + return r; + } +} + // -- flatbuffer utilities ----- /** diff --git a/test/table-from-ipc-test.js b/test/table-from-ipc-test.js index faf52da..0917de5 100644 --- a/test/table-from-ipc-test.js +++ b/test/table-from-ipc-test.js @@ -2,6 +2,7 @@ import assert from 'node:assert'; import { readFile } from 'node:fs/promises'; import { arrowFromDuckDB, arrowQuery } from './util/arrow-from-duckdb.js'; import { tableFromIPC } from '../src/index.js'; +import { RowIndex } from '../src/util.js'; const toDate = v => new Date(v); const toBigInt = v => BigInt(v); @@ -255,6 +256,26 @@ describe('tableFromIPC', () => { await valueTest([ {a: ['a', 'b'], b: Math.E}, {a: ['c', 'd'], b: Math.PI} ]); }); + it('decodes struct data with useProxy', async () => { + const values = [ {a: 1, b: 'foo'}, null, {a: 2, b: 'baz'} ]; + const bytes = await arrowFromDuckDB(values); + const column = tableFromIPC(bytes, { useProxy: true }).getChild('value'); + const proxies = column.toArray(); + assert.strictEqual(proxies[0][RowIndex], 0); + for (let i = 0; i < values.length; ++i) { + const proxy = proxies[i]; + const value = values[i]; + if (value === null) { + assert.strictEqual(proxy, value); + } else { + assert.ok(proxy[RowIndex] >= 0); + for (const key of Object.keys(value)) { + assert.strictEqual(proxy[key], value[key]); + } + } + } + }); + it('decodes run-end-encoded data', async () => { const buf = await readFile(`test/data/runendencoded.arrows`); const table = tableFromIPC(new Uint8Array(buf));