From e71cc5efd522715ef9032bdff72e50c99db0d6d5 Mon Sep 17 00:00:00 2001 From: jheer Date: Wed, 11 Sep 2024 13:19:35 -0700 Subject: [PATCH] feat: Add struct/row object proxy, update docs. --- README.md | 11 +++-- docs/api/data-types.md | 4 +- docs/api/index.md | 6 ++- docs/api/table.md | 2 +- docs/index.md | 9 +++-- perf/decode-perf.js | 2 +- src/batch-type.js | 6 +-- src/batch.js | 24 +++++++---- src/build/table-from-arrays.js | 2 +- src/build/table-from-columns.js | 14 ++++--- src/decode/table-from-ipc.js | 2 +- src/table.js | 56 ++++++++++++++++--------- src/types.ts | 12 ++++++ src/util/struct.js | 72 +++++++++++++++++++++++++++++++++ test/table-from-ipc-test.js | 11 +++++ 15 files changed, 181 insertions(+), 52 deletions(-) create mode 100644 src/util/struct.js diff --git a/README.md b/README.md index 13e4a03..70b9c8d 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,15 @@ Flechette performs fast extraction and encoding of data columns in the Arrow binary IPC format, supporting ingestion of Arrow data from sources such as [DuckDB](https://duckdb.org/) and Arrow use in JavaScript data analysis tools like [Arquero](https://github.com/uwdata/arquero), [Mosaic](https://github.com/uwdata/mosaic), [Observable Plot](https://observablehq.com/plot/), and [Vega-Lite](https://vega.github.io/vega-lite/). +For documentation, see the [**API Reference**](https://idl.uw.edu/flechette/api). + ## Why Flechette? In the process of developing multiple data analysis packages that consume Arrow data (including Arquero, Mosaic, and Vega), we've had to develop workarounds for the performance and correctness of the Arrow JavaScript reference implementation. Instead of workarounds, Flechette addresses these issues head-on. -* _Speed_. Flechette provides better performance. Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 5-9x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. +* _Speed_. Flechette provides better performance. 
Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 7-11x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. -* _Size_. Flechette is smaller: ~42k minified (~13k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so you only pay for what you need in your own bundles. +* _Size_. Flechette is smaller: ~42k minified (~14k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so only pay for what you need in your own bundles. * _Coverage_. Flechette supports data types unsupported by the reference implementation, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), run-end encoded data, binary views, and list views. @@ -108,7 +110,7 @@ const ipcTyped = tableToIPC(tableTyped, { format: 'file' }); ### Customize Data Extraction -Data extraction can be customized using options provided to the table generation method. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, and map-typed data is returned as an array of [key, value] pairs. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. +Data extraction can be customized using options provided to table generation methods. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, map-typed data is returned as an array of [key, value] pairs, and struct/row objects are returned as vanilla JS objects with extracted property values. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. 
```js const table = tableFromIPC(ipc, { @@ -116,10 +118,11 @@ const table = tableFromIPC(ipc, { useDecimalBigInt: true, // use BigInt for decimals, do not coerce to number useBigInt: true, // use BigInt for 64-bit ints, do not coerce to number useMap: true // create Map objects for [key, value] pair lists + useProxy: true // use zero-copy proxies for struct and table row objects }); ``` -The same extraction options can be passed to `tableFromArrays`. +The same extraction options can be passed to `tableFromArrays`. For more, see the [**API Reference**](https://idl.uw.edu/flechette/api). ## Build Instructions diff --git a/docs/api/data-types.md b/docs/api/data-types.md index 726b6b0..d113e92 100644 --- a/docs/api/data-types.md +++ b/docs/api/data-types.md @@ -7,7 +7,7 @@ title: Flechette API Reference ## Data Type Overview -The table below provides an overview of all data types supported by the Apache Arrow format and how Flechette maps them to JavaScript types. The table indicates if Flechette can read the type (via [`tableFromIPC`](/flechette/api/#tableFromIPC)), write the type (via [`tableToIPC`](/flechette/api/#tableToIPC)), and build the type from JavaScript values (via [`tableFromArrays`](/flechette/api/#tableFromArrays) or [`columnFromArray`](/flechette/api/#tableFromArray)). +The table below provides an overview of all data types supported by the Apache Arrow format and how Flechette maps them to JavaScript types. The table indicates if Flechette can read the type (via [`tableFromIPC`](/flechette/api/#tableFromIPC)), write the type (via [`tableToIPC`](/flechette/api/#tableToIPC)), and build the type from JavaScript values (via [`tableFromArrays`](/flechette/api/#tableFromArrays) or [`columnFromArray`](/flechette/api/#columnFromArray)). | Id | Data Type | Read? | Write? | Build? | JavaScript Type | | --: | ----------------------------------- | :---: | :----: | :----: | --------------- | @@ -341,7 +341,7 @@ Extracted JavaScript values depend on the child types. 
* *mode* (`number`): The union mode. One of `UnionMode.Sparse` or `UnionMode.Dense`.
* *children* (`(DataType[] | Field)[]`): The children fields or data types. Types are mapped to nullable fields with no metadata.
* *typeIds* (`number[]`): Children type ids, in the same order as the children types. Type ids provide a level of indirection over children types. If not provided, the children indices are used as the type ids.
-* *typeIdForValue* (`(value: any, index: number) => number`): A function that takes an arbitrary value and a row index and returns a correponding union type id. This function is required to build union-typed data with [`tableFromArrays`](/flechette/api/#tableFromArrays) or [`columnFromArray`](/flechette/api/#tableFromArray).
+* *typeIdForValue* (`(value: any, index: number) => number`): A function that takes an arbitrary value and a row index and returns a corresponding union type id. This function is required to build union-typed data with [`tableFromArrays`](/flechette/api/#tableFromArrays) or [`columnFromArray`](/flechette/api/#columnFromArray).

### FixedSizeBinary

diff --git a/docs/api/index.md b/docs/api/index.md
index aab82f4..ec345fc 100644
--- a/docs/api/index.md
+++ b/docs/api/index.md
@@ -18,12 +18,13 @@ title: Flechette API Reference

Decode [Apache Arrow IPC data](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc) and return a new [`Table`](table). The input binary data may be either an `ArrayBuffer` or `Uint8Array`. For Arrow data in the [IPC 'stream' format](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format), an array of `Uint8Array` values is also supported.

-* *data* (`ArrayBuffer` | `Uint8Array` | `Uint8Array[]`): The source byte buffer, or an array of buffers. If an array, each byte array may contain one or more self-contained messages. Messages may NOT span multiple byte arrays.
+* *data* (`ArrayBuffer` \| `Uint8Array` \| `Uint8Array[]`): The source byte buffer, or an array of buffers. If an array, each byte array may contain one or more self-contained messages. Messages may NOT span multiple byte arrays. * *options* (`ExtractionOptions`): Options for controlling how values are transformed when extracted from an Arrow binary representation. * *useDate* (`boolean`): If true, extract dates and timestamps as JavaScript `Date` objects Otherwise, return numerical timestamp values (default). * *useDecimalBigInt* (`boolean`): If true, extract decimal-type data as BigInt values, where fractional digits are scaled to integers. Otherwise, return converted floating-point numbers (default). * *useBigInt* (`boolean`): If true, extract 64-bit integers as JavaScript `BigInt` values Otherwise, coerce long integers to JavaScript number values (default). * *useMap* (`boolean`): If true, extract Arrow 'Map' values as JavaScript `Map` instances Otherwise, return an array of [key, value] pairs compatible with both `Map` and `Object.fromEntries` (default). + * *useProxy* (`boolean`): If true, extract Arrow 'Struct' values and table row objects using zero-copy proxy objects that extract data from underlying Arrow batches. The proxy objects can improve performance and reduce memory usage, but do not support property enumeration (`Object.keys`, `Object.values`, `Object.entries`) or spreading (`{ ...object }`). *Examples* @@ -134,11 +135,12 @@ const col = columnFromArray( ```
# -tableFromColumns(columns[, type, options]) +tableFromColumns(columns[, useProxy]) Create a new table from a collection of columns. This method is useful for creating new tables using one or more pre-existing column instances. Otherwise, [`tableFromArrays`](#tableFromArrays) should be preferred. Input columns are assumed to have the same record batch sizes and non-conflicting dictionary ids. * *data* (`object | array`): The input columns as an object with name keys, or an array of [name, column] pairs. +* *useProxy* (`boolean`): Flag indicating if row proxy objects should be used to represent table rows (default `false`). Typically this should match the value of the `useProxy` extraction option used for column generation. *Examples* diff --git a/docs/api/table.md b/docs/api/table.md index 6c5fb30..9f1610b 100644 --- a/docs/api/table.md +++ b/docs/api/table.md @@ -7,7 +7,7 @@ title: Flechette API Reference ## Table Class -A table consisting of named columns (or 'children'). To extract table data directly to JavaScript values, use [`toColumns()`](#toColumns) to produce an object that maps column names to extracted value arrays, or [`toArray()`](#toArray) to extract an array of row objects. Tables are [iterable](#iterator), iterating over row objects. While `toArray()` and [table iterators](#iterator) enable convenient use by tools that expect row objects, column-oriented processing is more efficient and thus recommended. Use [`getChild`](#getChild) or [`getChildAt`](#getChildAt) to access a specific [`Column`](column). +A table consists of named data [columns](#column) (or 'children'). To extract table data directly to JavaScript values, use [`toColumns()`](#toColumns) to produce an object that maps column names to extracted value arrays, or [`toArray()`](#toArray) to extract an array of row objects. Tables are [iterable](#iterator), iterating over row objects. 
While `toArray()` and [table iterators](#iterator) enable convenient use by tools that expect row objects, column-oriented processing is more efficient and thus recommended. Use [`getChild`](#getChild) or [`getChildAt`](#getChildAt) to access a specific [`Column`](column). * [constructor](#constructor) * [numCols](#numCols) diff --git a/docs/index.md b/docs/index.md index 2908f61..2871e0e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,15 +4,15 @@ Flechette performs fast extraction and encoding of data columns in the Arrow binary IPC format, supporting ingestion of Arrow data from sources such as [DuckDB](https://duckdb.org/) and Arrow use in JavaScript data analysis tools like [Arquero](https://github.com/uwdata/arquero), [Mosaic](https://github.com/uwdata/mosaic), [Observable Plot](https://observablehq.com/plot/), and [Vega-Lite](https://vega.github.io/vega-lite/). -[**API Reference**](api) +For documentation, see the [**API Reference**](api). ## Why Flechette? In the process of developing multiple data analysis packages that consume Arrow data (including Arquero, Mosaic, and Vega), we've had to develop workarounds for the performance and correctness of the Arrow JavaScript reference implementation. Instead of workarounds, Flechette addresses these issues head-on. -* _Speed_. Flechette provides better performance. Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 5-9x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. +* _Speed_. Flechette provides better performance. Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 7-11x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. -* _Size_. Flechette is smaller: ~42k minified (~13k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so you only pay for what you need in your own bundles. +* _Size_. 
Flechette is smaller: ~42k minified (~14k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so only pay for what you need in your own bundles. * _Coverage_. Flechette supports data types unsupported by the reference implementation, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), run-end encoded data, binary views, and list views. @@ -110,7 +110,7 @@ const ipcTyped = tableToIPC(tableTyped, { format: 'file' }); ### Customize Data Extraction -Data extraction can be customized using options provided to the table generation method. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, and map-typed data is returned as an array of [key, value] pairs. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. +Data extraction can be customized using options provided to table generation methods. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, map-typed data is returned as an array of [key, value] pairs, and struct/row objects are returned as vanilla JS objects with extracted property values. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. 
```js const table = tableFromIPC(ipc, { @@ -118,6 +118,7 @@ const table = tableFromIPC(ipc, { useDecimalBigInt: true, // use BigInt for decimals, do not coerce to number useBigInt: true, // use BigInt for 64-bit ints, do not coerce to number useMap: true // create Map objects for [key, value] pair lists + useProxy: true // use zero-copy proxies for struct and table row objects }); ``` diff --git a/perf/decode-perf.js b/perf/decode-perf.js index c6d90ed..86607ea 100644 --- a/perf/decode-perf.js +++ b/perf/decode-perf.js @@ -4,7 +4,7 @@ import { tableFromIPC as flTable } from '../src/index.js'; import { benchmark } from './util.js'; // table creation -const fl = bytes => flTable(bytes, { useBigInt: true }); +const fl = bytes => flTable(bytes, { useBigInt: true, useProxy: true }); const aa = bytes => aaTable(bytes); // decode ipc data to columns diff --git a/src/batch-type.js b/src/batch-type.js index a2b6aaa..115e632 100644 --- a/src/batch-type.js +++ b/src/batch-type.js @@ -1,10 +1,10 @@ -import { BinaryBatch, BinaryViewBatch, BoolBatch, DateBatch, DateDayBatch, DateDayMillisecondBatch, DecimalBigIntBatch, DecimalNumberBatch, DenseUnionBatch, DictionaryBatch, DirectBatch, FixedBinaryBatch, FixedListBatch, Float16Batch, Int64Batch, IntervalDayTimeBatch, IntervalMonthDayNanoBatch, IntervalYearMonthBatch, LargeBinaryBatch, LargeListBatch, LargeListViewBatch, LargeUtf8Batch, ListBatch, ListViewBatch, MapBatch, MapEntryBatch, NullBatch, RunEndEncodedBatch, SparseUnionBatch, StructBatch, TimestampMicrosecondBatch, TimestampMillisecondBatch, TimestampNanosecondBatch, TimestampSecondBatch, Utf8Batch, Utf8ViewBatch } from './batch.js'; +import { BinaryBatch, BinaryViewBatch, BoolBatch, DateBatch, DateDayBatch, DateDayMillisecondBatch, DecimalBigIntBatch, DecimalNumberBatch, DenseUnionBatch, DictionaryBatch, DirectBatch, FixedBinaryBatch, FixedListBatch, Float16Batch, Int64Batch, IntervalDayTimeBatch, IntervalMonthDayNanoBatch, IntervalYearMonthBatch, LargeBinaryBatch, 
LargeListBatch, LargeListViewBatch, LargeUtf8Batch, ListBatch, ListViewBatch, MapBatch, MapEntryBatch, NullBatch, RunEndEncodedBatch, SparseUnionBatch, StructBatch, StructProxyBatch, TimestampMicrosecondBatch, TimestampMillisecondBatch, TimestampNanosecondBatch, TimestampSecondBatch, Utf8Batch, Utf8ViewBatch } from './batch.js'; import { DateUnit, IntervalUnit, TimeUnit, Type } from './constants.js'; import { invalidDataType } from './data-types.js'; export function batchType(type, options = {}) { const { typeId, bitWidth, precision, unit } = type; - const { useBigInt, useDate, useDecimalBigInt, useMap } = options; + const { useBigInt, useDate, useDecimalBigInt, useMap, useProxy } = options; switch (typeId) { case Type.Null: return NullBatch; @@ -47,7 +47,7 @@ export function batchType(type, options = {}) { case Type.ListView: return ListViewBatch; case Type.LargeListView: return LargeListViewBatch; case Type.FixedSizeList: return FixedListBatch; - case Type.Struct: return StructBatch; + case Type.Struct: return useProxy ? StructProxyBatch : StructBatch; case Type.RunEndEncoded: return RunEndEncodedBatch; case Type.Dictionary: return DictionaryBatch; case Type.Union: return type.mode ? 
DenseUnionBatch : SparseUnionBatch; diff --git a/src/batch.js b/src/batch.js index 3db8069..6f2d6ef 100644 --- a/src/batch.js +++ b/src/batch.js @@ -2,6 +2,7 @@ import { bisect, float64Array } from './util/arrays.js'; import { divide, fromDecimal128, fromDecimal256, toNumber } from './util/numbers.js'; import { decodeBit, readInt32, readInt64 } from './util/read.js'; import { decodeUtf8 } from './util/strings.js'; +import { objectFactory, proxyFactory } from './util/struct.js'; /** * Check if the input is a batch that supports direct access to @@ -730,11 +731,12 @@ export class DenseUnionBatch extends SparseUnionBatch { * @extends {ArrayBatch>} */ export class StructBatch extends ArrayBatch { - constructor(options) { + constructor(options, factory = objectFactory) { super(options); /** @type {string[]} */ // @ts-ignore this.names = this.type.children.map(child => child.name); + this.factory = factory(this.names, this.children); } /** @@ -742,13 +744,19 @@ export class StructBatch extends ArrayBatch { * @returns {Record} */ value(index) { - const { children, names } = this; - const n = names.length; - const struct = {}; - for (let i = 0; i < n; ++i) { - struct[names[i]] = children[i].at(index); - } - return struct; + return this.factory(index); + } +} + +/** + * A batch of struct values, containing a set of named properties. + * Structs are returned as proxy objects that extract data directly + * from underlying Arrow batches. 
+ * @extends {StructBatch} + */ +export class StructProxyBatch extends StructBatch { + constructor(options) { + super(options, proxyFactory); } } diff --git a/src/build/table-from-arrays.js b/src/build/table-from-arrays.js index bcb063c..319721c 100644 --- a/src/build/table-from-arrays.js +++ b/src/build/table-from-arrays.js @@ -19,5 +19,5 @@ export function tableFromArrays(data, options = {}) { /** @type {[string, import('../column.js').Column]} */ ( [ name, columnFromArray(array, types[name], opt, ctx)] )); - return tableFromColumns(columns); + return tableFromColumns(columns, options.useProxy); } diff --git a/src/build/table-from-columns.js b/src/build/table-from-columns.js index a738762..5a04c86 100644 --- a/src/build/table-from-columns.js +++ b/src/build/table-from-columns.js @@ -6,11 +6,13 @@ import { Table } from '../table.js'; * Create a new table from a collection of columns. Columns are assumed * to have the same record batch sizes and consistent dictionary ids. * @param {[string, import('../column.js').Column][] -* | Record} data The columns, -* as an object with name keys, or an array of [name, column] pairs. -* @returns {Table} The new table. -*/ -export function tableFromColumns(data) { + * | Record} data The columns, + * as an object with name keys, or an array of [name, column] pairs. + * @param {boolean} [useProxy] Flag indicating if row proxy + * objects should be used to represent table rows (default `false`). + * @returns {Table} The new table. + */ +export function tableFromColumns(data, useProxy) { const fields = []; const dictionaryTypes = new Map; const entries = Array.isArray(data) ? 
data : Object.entries(data); @@ -39,5 +41,5 @@ export function tableFromColumns(data) { dictionaryTypes }; - return new Table(schema, columns); + return new Table(schema, columns, useProxy); } diff --git a/src/decode/table-from-ipc.js b/src/decode/table-from-ipc.js index 7b90cdc..2e09801 100644 --- a/src/decode/table-from-ipc.js +++ b/src/decode/table-from-ipc.js @@ -67,7 +67,7 @@ export function createTable(data, options = {}) { fields.forEach((f, i) => cols[i].add(visit(f.type, ctx))); } - return new Table(schema, cols.map(c => c.done())); + return new Table(schema, cols.map(c => c.done()), options.useProxy); } /** diff --git a/src/table.js b/src/table.js index 5ec759e..14b6cb0 100644 --- a/src/table.js +++ b/src/table.js @@ -1,4 +1,5 @@ import { bisect } from './util/arrays.js'; +import { objectFactory, proxyFactory } from './util/struct.js'; /** * A table consists of a collection of named columns (or 'children'). @@ -12,17 +13,39 @@ export class Table { * Create a new table with the given schema and columns (children). * @param {import('./types.js').Schema} schema The table schema. * @param {import('./column.js').Column[]} children The table columns. + * @param {boolean} [useProxy=false] Flag indicating if row proxy + * objects should be used to represent table rows (default `false`). */ - constructor(schema, children) { + constructor(schema, children, useProxy = false) { + const names = schema.fields.map(f => f.name); + /** @readonly */ this.schema = schema; /** @readonly */ - this.names = schema.fields.map(f => f.name); + this.names = names; /** * @type {import('./column.js').Column[]} * @readonly */ this.children = children; + /** + * @type {import('./types.js').StructFactory} + * @readonly + */ + this.factory = useProxy ? proxyFactory : objectFactory; + + // lazily created row object generators + const gen = []; + + /** + * Returns a row object generator for the given batch index. + * @private + * @readonly + * @param {number} b The batch index. 
+ * @returns {(index: number) => Record} + */ + this.getFactory = b => gen[b] + ?? (gen[b] = this.factory(names, children.map(c => c.data[b]))); } /** @@ -75,14 +98,15 @@ export class Table { * @returns {Table} A new table with columns at the specified indices. */ selectAt(indices, as = []) { - const { children, schema } = this; + const { children, factory, schema } = this; const { fields } = schema; return new Table( { ...schema, fields: indices.map((i, j) => renameField(fields[i], as[j])) }, - indices.map(i => children[i]) + indices.map(i => children[i]), + factory === proxyFactory ); } @@ -117,12 +141,13 @@ export class Table { * @returns {Record[]} */ toArray() { - const { children, numRows, names } = this; + const { children, getFactory, numRows } = this; const data = children[0]?.data ?? []; const output = Array(numRows); for (let b = 0, row = -1; b < data.length; ++b) { + const f = getFactory(b); for (let i = 0; i < data[b].length; ++i) { - output[++row] = rowObject(names, children, b, i); + output[++row] = f(i); } } return output; @@ -133,11 +158,12 @@ export class Table { * @returns {Generator, any, null>} */ *[Symbol.iterator]() { - const { children, names } = this; + const { children, getFactory } = this; const data = children[0]?.data ?? []; for (let b = 0; b < data.length; ++b) { + const f = getFactory(b); for (let i = 0; i < data[b].length; ++i) { - yield rowObject(names, children, b, i); + yield f(i); } } } @@ -148,11 +174,11 @@ export class Table { * @returns {Record} The row object. */ at(index) { - const { names, children, numRows } = this; + const { children, getFactory, numRows } = this; if (index < 0 || index >= numRows) return null; const [{ offsets }] = children; - const i = bisect(offsets, index) - 1; - return rowObject(names, children, i, index - offsets[i]); + const b = bisect(offsets, index) - 1; + return getFactory(b)(index - offsets[b]); } /** @@ -171,11 +197,3 @@ function renameField(field, name) { ? 
{ ...field, name } : field; } - -function rowObject(names, children, batch, index) { - const o = {}; - for (let j = 0; j < names.length; ++j) { - o[names[j]] = children[j].data[batch].at(index); - } - return o; -} diff --git a/src/types.ts b/src/types.ts index cf12927..71b8cfa 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,3 +1,4 @@ +import { Batch } from './batch.js'; import { Version, Endianness, @@ -91,6 +92,9 @@ export interface ValueArray extends ArrayLike, Iterable { slice(start?: number, end?: number): ValueArray; } +/** Struct/row object factory method. */ +export type StructFactory = (names: string[], batches: Batch[]) => (index: number) => Record; + /** Custom metadata. */ export type Metadata = Map; @@ -321,6 +325,14 @@ export interface ExtractionOptions { * both `Map` and `Object.fromEntries` (default). */ useMap?: boolean; + /** + * If true, extract Arrow 'Struct' values and table row objects using + * zero-copy proxy objects that extract data from underlying Arrow batches. + * The proxy objects can improve performance and reduce memory usage, but + * do not support property enumeration (`Object.keys`, `Object.values`, + * `Object.entries`) or spreading (`{ ...object }`). + */ + useProxy?: boolean; } /** diff --git a/src/util/struct.js b/src/util/struct.js new file mode 100644 index 0000000..147a629 --- /dev/null +++ b/src/util/struct.js @@ -0,0 +1,72 @@ +export const RowIndex = Symbol('rowIndex'); + +/** + * Returns a row proxy object factory. The resulting method takes a + * batch-level row index as input and returns an object that proxies + * access to underlying batches. + * @param {string[]} names The column (property) names + * @param {import('../batch.js').Batch[]} batches The value batches. + * @returns {(index: number) => Record} + */ +export function proxyFactory(names, batches) { + class RowObject { + /** + * Create a new proxy row object representing a struct or table row. + * @param {number} index The record batch row index. 
+ */ + constructor(index) { + this[RowIndex] = index; + } + + /** + * Return a JSON-compatible object representation. + */ + toJSON() { + return structObject(names, batches, this[RowIndex]); + } + }; + + // prototype for row proxy objects + const proto = RowObject.prototype; + + for (let i = 0; i < names.length; ++i) { + // skip duplicated column names + if (Object.hasOwn(proto, names[i])) continue; + + // add a getter method for the current batch + const batch = batches[i]; + Object.defineProperty(proto, names[i], { + get() { return batch.at(this[RowIndex]); }, + enumerable: true + }); + } + + return index => new RowObject(index); +} + +/** + * Returns a row object factory. The resulting method takes a + * batch-level row index as input and returns an object whose property + * values have been extracted from the batches. + * @param {string[]} names The column (property) names + * @param {import('../batch.js').Batch[]} batches The value batches. + * @returns {(index: number) => Record} + */ +export function objectFactory(names, batches) { + return index => structObject(names, batches, index); +} + +/** + * Return a vanilla object representing a struct (row object) type. + * @param {string[]} names The column (property) names + * @param {import('../batch.js').Batch[]} batches The value batches. + * @param {number} index The record batch row index. 
+ * @returns {Record} + */ +export function structObject(names, batches, index) { + const obj = {}; + for (let i = 0; i < names.length; ++i) { + obj[names[i]] = batches[i].at(index); + } + return obj; +} diff --git a/test/table-from-ipc-test.js b/test/table-from-ipc-test.js index 26e216f..24c8c8f 100644 --- a/test/table-from-ipc-test.js +++ b/test/table-from-ipc-test.js @@ -2,6 +2,7 @@ import assert from 'node:assert'; import { tableFromIPC } from '../src/index.js'; import { arrowFromDuckDB } from './util/arrow-from-duckdb.js'; import { binaryView, bool, dateDay, decimal, empty, fixedListInt32, fixedListUtf8, float32, float64, int16, int32, int64, int8, intervalMonthDayNano, largeListView, listInt32, listUtf8, listView, map, runEndEncoded32, runEndEncoded64, struct, timestampMicrosecond, timestampMillisecond, timestampNanosecond, timestampSecond, uint16, uint32, uint64, uint8, union, utf8, utf8View } from './util/data.js'; +import { RowIndex } from '../src/util/struct.js'; const toBigInt = v => BigInt(v); const toDate = v => new Date(v); @@ -102,6 +103,16 @@ describe('tableFromIPC', () => { it('decodes struct data', () => test(struct)); + it('decodes struct data with useProxy', async () => { + const data = await struct(); + for (const { bytes, values } of data) { + const column = tableFromIPC(bytes, { useProxy: true }).getChildAt(0); + const proxies = column.toArray(); + assert.strictEqual(proxies.every(p => p === null || p[RowIndex] >= 0), true); + assert.deepStrictEqual(proxies.map(p => p ? p.toJSON() : null), values); + } + }); + it('decodes run-end-encoded data with 32-bit run ends', async () => { const data = await runEndEncoded32(); for (const { bytes, runs, values } of data) {