diff --git a/docs/api/data-types.md b/docs/api/data-types.md index bd456e9..00c99fc 100644 --- a/docs/api/data-types.md +++ b/docs/api/data-types.md @@ -85,14 +85,14 @@ Create a new field instance for use in a schema or type definition. A field repr ### Dictionary <hr/><a id="dictionary" href="#dictionary">#</a> -<b>dictionary</b>(<i>type</i>[, <i>indexType</i>, <i>id</i>, <i>ordered</i>]) +<b>dictionary</b>(<i>type</i>[, <i>indexType</i>, <i>ordered</i>, <i>id</i>]) -Create a Dictionary data type instance. A dictionary type consists of a dictionary of values (which may be of any type) and corresponding integer indices that reference those values. If values are repeated, a dictionary encoding can provide substantial space savings. In the IPC format, dictionary indices reside alongside other columns in a record batch, while dictionary values are written to special dictionary batches, linked by a unique dictionary *id*. Internally Flechette extracts dictionary values upfront; while this incurs some initial overhead, it enables fast subsequent lookups. +Create a Dictionary data type instance. A dictionary type consists of a dictionary of values (which may be of any type) and corresponding integer indices that reference those values. If values are repeated, a dictionary encoding can provide substantial space savings. In the IPC format, dictionary indices reside alongside other columns in a record batch, while dictionary values are written to special dictionary batches, linked by a unique dictionary *id* assigned at encoding time. Internally Flechette extracts dictionary values immediately upon decoding; while this incurs some initial overhead, it enables fast subsequent lookups. * *type* (`DataType`): The data type of dictionary values. * *indexType* (`DataType`): The data type of dictionary indices. Must be an integer type (default [`int32`](#int32)). -* *id* (`number`): The dictionary id, should be unique in a table. Defaults to `-1`, but is set to a proper id if the type is passed through [`tableFromArrays`](/flechette/api/#tableFromArrays). * *ordered* (`boolean`): Indicates if dictionary values are ordered (default `false`). +* *id* (`number`): Optional dictionary id. The default value (-1) indicates that the dictionary applies to a single column only. Provide an explicit id in order to reuse a dictionary across columns when building, in which case each distinct dictionary *must* be assigned its own unique id. All dictionary ids are later resolved (possibly to new values) upon IPC encoding. ### Null diff --git a/docs/api/index.md b/docs/api/index.md index 0a438bf..1ca5c2e 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -137,7 +137,7 @@ const col = columnFromArray( <hr/><a id="tableFromColumns" href="#tableFromColumns">#</a> <b>tableFromColumns</b>(<i>columns</i>[, <i>useProxy</i>]) -Create a new table from a collection of columns. This method is useful for creating new tables using one or more pre-existing column instances. Otherwise, [`tableFromArrays`](#tableFromArrays) should be preferred. Input columns are assumed to have the same record batch sizes and non-conflicting dictionary ids. +Create a new table from a collection of columns. This method is useful for creating new tables using one or more pre-existing column instances. Otherwise, [`tableFromArrays`](#tableFromArrays) should be preferred. Input columns are assumed to have the same record batch sizes. * *data* (`object | array`): The input columns as an object with name keys, or an array of [name, column] pairs.
* *useProxy* (`boolean`): Flag indicating if row proxy objects should be used to represent table rows (default `false`). Typically this should match the value of the `useProxy` extraction option used for column generation. diff --git a/src/build/builder.js b/src/build/builder.js index 0c9cbfe..6a207e7 100644 --- a/src/build/builder.js +++ b/src/build/builder.js @@ -6,7 +6,7 @@ import { toBigInt, toDateDay, toFloat16, toTimestamp } from '../util/numbers.js' import { BinaryBuilder } from './builders/binary.js'; import { BoolBuilder } from './builders/bool.js'; import { DecimalBuilder } from './builders/decimal.js'; -import { DictionaryBuilder, dictionaryValues } from './builders/dictionary.js'; +import { DictionaryBuilder, dictionaryContext } from './builders/dictionary.js'; import { FixedSizeBinaryBuilder } from './builders/fixed-size-binary.js'; import { FixedSizeListBuilder } from './builders/fixed-size-list.js'; import { IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder } from './builders/interval.js'; @@ -19,36 +19,20 @@ import { Utf8Builder } from './builders/utf8.js'; import { DirectBuilder, Int64Builder, TransformBuilder } from './builders/values.js'; /** - * Create a new context object for shared builder state. + * Create a context object for shared builder state. * @param {import('../types.js').ExtractionOptions} [options] * Batch extraction options. - * @param {Map<number, ReturnType<dictionaryValues>>} [dictMap] - * A map of dictionary ids to value builder helpers. + * @param {ReturnType<dictionaryContext>} [dictionaries] + * Context object for tracking dictionaries. */ -export function builderContext(options, dictMap = new Map) { - let dictId = 0; +export function builderContext( + options = {}, + dictionaries = dictionaryContext() +) { return { - batchType(type) { - return batchType(type, options); - }, - dictionary(type, id) { - let dict; - if (id != null) { - dict = dictMap.get(id); - } else { - while (dictMap.has(dictId + 1)) ++dictId; - id = dictId; - } - if (!dict) { - dictMap.set(id, dict = dictionaryValues(id, type, this)); - } - return dict; - }, - finish() { - for (const dict of dictMap.values()) { - dict.finish(options); - } - } + batchType: type => batchType(type, options), + dictionary(type) { return dictionaries.get(type, this); }, + finish: () => dictionaries.finish(options) }; } diff --git a/src/build/builders/dictionary.js b/src/build/builders/dictionary.js index 2720378..987f058 100644 --- a/src/build/builders/dictionary.js +++ b/src/build/builders/dictionary.js @@ -6,21 +6,58 @@ import { builder } from '../builder.js'; import { ValidityBuilder } from './validity.js'; /** - * Builder helped for creating dictionary values. - * @param {number} id The dictionary id. + * Create a context object for managing dictionary builders. + */ +export function dictionaryContext() { + const idMap = new Map; + const dicts = new Set; + return { + /** + * Get a dictionary values builder for the given dictionary type. + * @param {import('../../types.js').DictionaryType} type + * The dictionary type. + * @param {*} ctx The builder context.
+ * @returns {ReturnType<dictionaryValues>} + */ + get(type, ctx) { + // if a dictionary has a non-negative id, assume it was set + // intentionally and track it for potential reuse across columns + // otherwise the dictionary is used for a single column only + const id = type.id; + if (id >= 0 && idMap.has(id)) { + return idMap.get(id); + } else { + const dict = dictionaryValues(type, ctx); + if (id >= 0) idMap.set(id, dict); + dicts.add(dict); + return dict; + } + }, + /** + * Finish building dictionary values columns and assign them to + * their corresponding dictionary batches. + * @param {import('../../types.js').ExtractionOptions} options + */ + finish(options) { + dicts.forEach(dict => dict.finish(options)); + } + }; +} + +/** + * Builder helper for creating dictionary values. * @param {import('../../types.js').DictionaryType} type * The dictionary data type. - * @param {*} ctx - * @returns + * @param {ReturnType<import('../builder.js').builderContext>} ctx + * The builder context. */ -export function dictionaryValues(id, type, ctx) { +export function dictionaryValues(type, ctx) { const keys = Object.create(null); const values = builder(type.dictionary, ctx); const batches = []; values.init(); let index = -1; - type.id = id; return { type, diff --git a/src/build/column-from-array.js b/src/build/column-from-array.js index be0e8a6..9e8b7da 100644 --- a/src/build/column-from-array.js +++ b/src/build/column-from-array.js @@ -1,10 +1,8 @@ import { float32Array, float64Array, int16Array, int32Array, int64Array, int8Array, isInt64ArrayType, isTypedArray, uint16Array, uint32Array, uint64Array, uint8Array } from '../util/arrays.js'; -import { DirectBatch, Int64Batch, NullBatch } from '../batch.js'; +import { DirectBatch, Int64Batch } from '../batch.js'; import { Column } from '../column.js'; import { float32, float64, int16, int32, int64, int8, uint16, uint32, uint64, uint8 } from '../data-types.js'; -import { inferType } from './infer-type.js'; -import { builder, builderContext } from './builder.js'; -import { Type } from '../constants.js'; +import { columnFromValues } from './column-from-values.js'; /** * Create a new column from a provided data array. @@ -14,25 +12,20 @@ import { Type } from '../constants.js'; * If not specified, type inference is attempted. * @param {import('../types.js').ColumnBuilderOptions} [options] * Builder options for the generated column. - * @param {ReturnType<import('./builder.js').builderContext>} [ctx] + * @param {ReturnType<import('./builders/dictionary.js').dictionaryContext>} [dicts] * Builder context object, for internal use only. * @returns {Column<T>} The generated column. */ -export function columnFromArray(data, type, options = {}, ctx) { - if (!type) { - if (isTypedArray(data)) { - return columnFromTypedArray(data, options); - } else { - type = inferType(data); - } - } - return columnFromValues(data, type, options, ctx); +export function columnFromArray(data, type, options = {}, dicts) { + return !type && isTypedArray(data) + ? columnFromTypedArray(data, options) + : columnFromValues(data.length, v => data.forEach(v), type, options, dicts); } /** * Create a new column from a typed array input. * @template T - * @param {import('../types.js').TypedArray} values + * @param {import('../types.js').TypedArray} values The input data. * @param {import('../types.js').ColumnBuilderOptions} options * Builder options for the generated column. * @returns {Column<T>} The generated column. 
@@ -62,52 +55,6 @@ function columnFromTypedArray(values, { maxBatchRows, useBigInt }) { return new Column(batches); } -/** - * Build a column by iterating over the provided values array. - * @template T - * @param {Array | import('../types.js').TypedArray} values The input data. - * @param {import('../types.js').DataType} type The column data type. - * @param {import('../types.js').ColumnBuilderOptions} [options] - * Builder options for the generated column. - * @param {ReturnType<import('./builder.js').builderContext>} [ctx] - * Builder context object, for internal use only. - * @returns {Column<T>} The generated column. - */ -function columnFromValues(values, type, options, ctx) { - const { maxBatchRows, ...opt } = options; - const length = values.length; - const limit = Math.min(maxBatchRows || Infinity, length); - - // if null type, generate batches and exit early - if (type.typeId === Type.Null) { - return new Column(nullBatches(type, length, limit)); - } - - const data = []; - ctx ??= builderContext(opt); - const b = builder(type, ctx).init(); - const next = b => data.push(b.batch()); - const numBatches = Math.floor(length / limit); - - let idx = 0; - let row = 0; - for (let i = 0; i < numBatches; ++i) { - for (row = 0; row < limit; ++row) { - b.set(values[idx++], row); - } - next(b); - } - for (row = 0; idx < length; ++idx) { - b.set(values[idx], row++); - } - if (row) next(b); - - // resolve dictionaries - ctx.finish(); - - return new Column(data); -} - /** * Return an Arrow data type for a given typed array type. * @param {import('../types.js').TypedArrayConstructor} arrayType @@ -128,22 +75,3 @@ function typeForTypedArray(arrayType) { case uint64Array: return uint64(); } } - -/** - * Create null batches with the given batch size limit. - * @param {import('../types.js').NullType} type The null data type. - * @param {number} length The total column length. - * @param {number} limit The maximum batch size. - * @returns {import('../batch.js').NullBatch[]} The null batches. - */ -function nullBatches(type, length, limit) { - const data = []; - const batch = length => new NullBatch({ length, nullCount: length, type }); - const numBatches = Math.floor(length / limit); - for (let i = 0; i < numBatches; ++i) { - data.push(batch(limit)); - } - const rem = length % limit; - if (rem) data.push(batch(rem)); - return data; -} diff --git a/src/build/column-from-values.js b/src/build/column-from-values.js new file mode 100644 index 0000000..a72f750 --- /dev/null +++ b/src/build/column-from-values.js @@ -0,0 +1,69 @@ +import { NullBatch } from '../batch.js'; +import { Column } from '../column.js'; +import { inferType } from './infer-type.js'; +import { builder, builderContext } from './builder.js'; +import { Type } from '../constants.js'; + +/** + * Create a new column by iterating over provided values. + * @template T + * @param {number} length The input data length. + * @param {(visitor: (value: any) => void) => void} visit + * A function that applies a callback to successive data values. + * @param {import('../types.js').DataType} [type] The data type. If not specified, type inference is attempted. + * @param {import('../types.js').ColumnBuilderOptions} [options] + * Builder options for the generated column. + * @param {ReturnType< + * import('./builders/dictionary.js').dictionaryContext + * >} [dicts] Builder context object, for internal use only. + * @returns {Column<T>} The generated column.
+ */ +export function columnFromValues(length, visit, type, options = {}, dicts) { + type ??= inferType(visit); + const { maxBatchRows, ...opt } = options; + const limit = Math.min(maxBatchRows || Infinity, length); + + // if null type, generate batches and exit early + if (type.typeId === Type.Null) { + return new Column(nullBatches(type, length, limit)); + } + + const ctx = builderContext(opt, dicts); + const b = builder(type, ctx).init(); + const data = []; + const next = b => data.push(b.batch()); + + let row = 0; + visit(value => { + b.set(value, row++); + if (row >= limit) { + next(b); + row = 0; + } + }); + if (row) next(b); + + // resolve dictionaries + ctx.finish(); + + return new Column(data); +} + +/** + * Create null batches with the given batch size limit. + * @param {import('../types.js').NullType} type The null data type. + * @param {number} length The total column length. + * @param {number} limit The maximum batch size. + * @returns {import('../batch.js').NullBatch[]} The null batches. + */ +function nullBatches(type, length, limit) { + const data = []; + const batch = length => new NullBatch({ length, nullCount: length, type }); + const numBatches = Math.floor(length / limit); + for (let i = 0; i < numBatches; ++i) { + data.push(batch(limit)); + } + const rem = length % limit; + if (rem) data.push(batch(rem)); + return data; +} diff --git a/src/build/infer-type.js b/src/build/infer-type.js index a66ee17..2deaea5 100644 --- a/src/build/infer-type.js +++ b/src/build/infer-type.js @@ -3,14 +3,13 @@ import { isArray } from '../util/arrays.js'; /** - * Infer the data type for a given input array. - * @param {import('../types.js').ValueArray} data The data array. + * Infer the data type for values produced by a visitor function. + * @param {(visitor: (value: any) => void) => void} visit + * A function that applies a callback to successive data values. * @returns {import('../types.js').DataType} The data type. */ -export function inferType(data) { +export function inferType(visit) { const profile = profiler(); - for (let i = 0; i < data.length; ++i) { - profile.add(data[i]); - } + visit(value => profile.add(value)); return profile.type(); } diff --git a/src/build/table-from-arrays.js b/src/build/table-from-arrays.js index 319721c..26bf25a 100644 --- a/src/build/table-from-arrays.js +++ b/src/build/table-from-arrays.js @@ -1,4 +1,4 @@ -import { builderContext } from './builder.js'; +import { dictionaryContext } from './builders/dictionary.js'; import { columnFromArray } from './column-from-array.js'; import { tableFromColumns } from './table-from-columns.js'; @@ -13,11 +13,11 @@ import { tableFromColumns } from './table-from-columns.js'; */ export function tableFromArrays(data, options = {}) { const { types = {}, ...opt } = options; - const ctx = builderContext(); + const dicts = dictionaryContext(); const entries = Array.isArray(data) ?
data : Object.entries(data); const columns = entries.map(([name, array]) => /** @type {[string, import('../column.js').Column]} */ ( - [ name, columnFromArray(array, types[name], opt, ctx)] + [ name, columnFromArray(array, types[name], opt, dicts)] )); return tableFromColumns(columns, options.useProxy); } diff --git a/src/build/table-from-columns.js b/src/build/table-from-columns.js index 5a04c86..f73c6e2 100644 --- a/src/build/table-from-columns.js +++ b/src/build/table-from-columns.js @@ -1,10 +1,10 @@ -import { Endianness, Type, Version } from '../constants.js'; +import { Endianness, Version } from '../constants.js'; import { field } from '../data-types.js'; import { Table } from '../table.js'; /** * Create a new table from a collection of columns. Columns are assumed - * to have the same record batch sizes and consistent dictionary ids. + * to have the same record batch sizes. * @param {[string, import('../column.js').Column][] * | Record<string, import('../column.js').Column>} data The columns, * as an object with name keys, or an array of [name, column] pairs. @@ -14,21 +14,13 @@ import { Table } from '../table.js'; */ export function tableFromColumns(data, useProxy) { const fields = []; - const dictionaryTypes = new Map; const entries = Array.isArray(data) ? data : Object.entries(data); const length = entries[0]?.[1].length; + const columns = entries.map(([name, col]) => { if (col.length !== length) { throw new Error('All columns must have the same length.'); } - const type = col.type; - if (type.typeId === Type.Dictionary) { - const dict = dictionaryTypes.get(type.id); - if (dict && dict !== type.dictionary) { - throw new Error('Same id used across different dictionaries.'); - } - dictionaryTypes.set(type.id, type.dictionary); - } fields.push(field(name, col.type)); return col; }); @@ -37,8 +29,7 @@ version: Version.V5, endianness: Endianness.Little, fields, - metadata: null, - dictionaryTypes + metadata: null }; return new Table(schema, columns, useProxy); diff --git a/src/data-types.js b/src/data-types.js index aa7bf4c..1653fb7 100644 --- a/src/data-types.js +++ b/src/data-types.js @@ -85,17 +85,21 @@ const basicType = (typeId) => ({ typeId }); * values. * @param {import('./types.js').IntType} [indexType] The data type of * dictionary indices. Must be an integer type (default `int32`). - * @param {number} [id=-1] The dictionary id, should be unique in a table. * @param {boolean} [ordered=false] Indicates if dictionary values are * ordered (default `false`). + * @param {number} [id=-1] The dictionary id. The default value (-1) indicates + * the dictionary applies to a single column only. Provide an explicit id in + * order to reuse a dictionary across columns when building, in which case + * each distinct dictionary *must* be assigned its own unique id. All dictionary + * ids are later resolved (possibly to new values) upon IPC encoding.
* @returns {import('./types.js').DictionaryType} */ -export const dictionary = (type, indexType, id = -1, ordered = false) => ({ +export const dictionary = (type, indexType, ordered = false, id = -1) => ({ typeId: Type.Dictionary, dictionary: type, indices: indexType || int32(), - id, - ordered + ordered, + id }); /** diff --git a/src/decode/schema.js b/src/decode/schema.js index 1a8df94..62d1d47 100644 --- a/src/decode/schema.js +++ b/src/decode/schema.js @@ -12,7 +12,6 @@ import { decodeMetadata } from './metadata.js'; * @returns {import('../types.js').Schema} The schema */ export function decodeSchema(buf, index, version) { - const dictionaryTypes = new Map; // 4: endianness (int16) // 6: fields (vector) // 8: metadata (vector) @@ -21,25 +20,22 @@ export function decodeSchema(buf, index, version) { return { version, endianness: /** @type {import('../types.js').Endianness_} */ (get(4, readInt16, 0)), - fields: get(6, (buf, off) => decodeSchemaFields(buf, off, dictionaryTypes), []), - metadata: get(8, decodeMetadata), - dictionaryTypes + fields: get(6, decodeSchemaFields, []), + metadata: get(8, decodeMetadata) }; } /** * @returns {import('../types.js').Field[] | null} */ -function decodeSchemaFields(buf, fieldsOffset, dictionaryTypes) { - return readVector(buf, fieldsOffset, 4, - (buf, pos) => decodeField(buf, pos, dictionaryTypes) - ); +function decodeSchemaFields(buf, fieldsOffset) { + return readVector(buf, fieldsOffset, 4, decodeField); } /** * @returns {import('../types.js').Field} */ -function decodeField(buf, index, dictionaryTypes) { +function decodeField(buf, index) { // 4: name (string) // 6: nullable (bool) // 8: type id (uint8) @@ -51,22 +47,12 @@ function decodeField(buf, index, dictionaryTypes) { const typeId = get(8, readUint8, Type.NONE); const typeOffset = get(10, readOffset, 0); const dict = get(12, decodeDictionary); - const children = get(14, (buf, off) => decodeFieldChildren(buf, off, dictionaryTypes)); + const children = get(14, (buf, off) => decodeFieldChildren(buf, off)); - let type; + let type = decodeDataType(buf, typeOffset, typeId, children); if (dict) { - const { id } = dict; - let dictType = dictionaryTypes.get(id); - if (!dictType) { - // if dictionary encoded and the first time we've seen this id, decode - // the type and children fields and add to the dictionary map. - dictType = decodeDataType(buf, typeOffset, typeId, children); - dictionaryTypes.set(id, dictType); - } - dict.dictionary = dictType; + dict.dictionary = type; type = dict; - } else { - type = decodeDataType(buf, typeOffset, typeId, children); } return { @@ -80,10 +66,8 @@ function decodeField(buf, index, dictionaryTypes) { /** * @returns {import('../types.js').Field[] | null} */ -function decodeFieldChildren(buf, fieldOffset, dictionaryTypes) { - const children = readVector(buf, fieldOffset, 4, - (buf, pos) => decodeField(buf, pos, dictionaryTypes) - ); +function decodeFieldChildren(buf, fieldOffset) { + const children = readVector(buf, fieldOffset, 4, decodeField); return children.length ? 
children : null; } @@ -102,8 +86,8 @@ function decodeDictionary(buf, index) { return dictionary( null, // data type will be populated by caller get(6, decodeInt, int32()), // index type + get(8, readBoolean, false), // ordered get(4, readInt64, 0), // id - get(8, readBoolean, false) // ordered ); } diff --git a/src/decode/table-from-ipc.js b/src/decode/table-from-ipc.js index 2e09801..bcecdc8 100644 --- a/src/decode/table-from-ipc.js +++ b/src/decode/table-from-ipc.js @@ -37,11 +37,20 @@ export function tableFromIPC(data, options) { */ export function createTable(data, options = {}) { const { schema = { fields: [] }, dictionaries, records } = data; - const { version, fields, dictionaryTypes } = schema; + const { version, fields } = schema; const dictionaryMap = new Map; const context = contextGenerator(options, version, dictionaryMap); - // decode dictionaries + // build dictionary type map + const dictionaryTypes = new Map; + visitSchemaFields(schema, field => { + const type = field.type; + if (type.typeId === Type.Dictionary) { + dictionaryTypes.set(type.id, type.dictionary); + } + }); + + // decode dictionaries, build dictionary column map const dicts = new Map; for (const dict of dictionaries) { const { id, data, isDelta, body } = dict; @@ -70,6 +79,21 @@ return new Table(schema, cols.map(c => c.done()), options.useProxy); } +/** + * Visit all fields within a schema. + * @param {import('../types.js').Schema} schema + * @param {(field: import('../types.js').Field) => void} visitor + */ +function visitSchemaFields(schema, visitor) { + schema.fields.forEach(function visitField(field) { + visitor(field); + // @ts-ignore + field.type.dictionary?.children?.forEach(visitField); + // @ts-ignore + field.type.children?.forEach(visitField); + }); +} + /** * Context object generator for field visitation and buffer definition. */ diff --git a/src/encode/table-to-ipc.js b/src/encode/table-to-ipc.js index 6f9b110..3419545 100644 --- a/src/encode/table-to-ipc.js +++ b/src/encode/table-to-ipc.js @@ -16,14 +16,17 @@ export function tableToIPC(table, options) { if (typeof options === 'string') { options = { format: options }; } - const schema = table.schema; const columns = table.children; - const dictionaries = assembleDictionaryBatches(columns); + const { dictionaries, idMap } = assembleDictionaryBatches(columns); const records = assembleRecordBatches(columns); + const schema = assembleSchema(table.schema, idMap); const data = { schema, dictionaries, records }; return encodeIPC(data, options).finish(); } +/** + * Create a new assembly context. + */ function assembleContext() { let byteLength = 0; const nodes = []; @@ -74,37 +77,107 @@ } /** - * @param {import('../column.js').Column[]} columns - * @returns {import('../types.js').DictionaryBatch[]} + * Assemble dictionary batches and their unique ids. + * @param {import('../column.js').Column[]} columns The table columns. + * @returns {{ + * dictionaries: import('../types.js').DictionaryBatch[], + * idMap: Map<import('../types.js').DataType, number> + * }} + * The assembled dictionary batches and a map from dictionary value + * types to dictionary ids.
*/ function assembleDictionaryBatches(columns) { const dictionaries = []; - const seen = new Set; - - for (const col of columns) { - const { type } = col; - if (type.typeId !== -1) continue; - if (seen.has(type.id)) continue; - seen.add(type.id); - - // pass dictionary and deltas as-is - // @ts-ignore - const dict = col.data[0].dictionary; - for (let i = 0; i < dict.data.length; ++i) { - dictionaries.push({ - id: type.id, - isDelta: i > 0, - data: assembleRecordBatch([dict], i) - }); + const dictMap = new Map; + const idMap = new Map; + let id = -1; + + // track dictionaries, key by dictionary column, assign ids + const visitor = dictionaryColumn => { + if (!dictMap.has(dictionaryColumn)) { + dictMap.set(dictionaryColumn, ++id); + for (let i = 0; i < dictionaryColumn.data.length; ++i) { + dictionaries.push({ + id, + isDelta: i > 0, + data: assembleRecordBatch([dictionaryColumn], i) + }); + } + idMap.set(dictionaryColumn.type, id); + } else { + idMap.set(dictionaryColumn.type, dictMap.get(dictionaryColumn)); } + }; + + // recurse through column batches to find dictionaries + // it is sufficient to visit the first batch only, + // as all batches have the same dictionary column + columns.forEach(col => visitDictionaries(col.data[0], visitor)); + + return { dictionaries, idMap }; +} + +/** + * Traverse column batches to visit dictionary columns. + * @param {import('../batch.js').Batch} batch + * @param {(column: import('../column.js').Column) => void} visitor + */ +function visitDictionaries(batch, visitor) { + if (batch?.type.typeId === Type.Dictionary) { + // @ts-ignore - batch has type DictionaryBatch + const dictionary = batch.dictionary; + visitor(dictionary); + visitDictionaries(dictionary.data[0], visitor); } + batch?.children?.forEach(child => visitDictionaries(child, visitor)); +} + +/** + * Assemble a schema with resolved dictionary ids. + * @param {import('../types.js').Schema} schema The schema. + * @param {Map<import('../types.js').DataType, number>} idMap A map + * from dictionary value types to dictionary ids. + * @returns {import('../types.js').Schema} A new schema with resolved + * dictionary ids. If there are no dictionaries, the input schema is + * returned unchanged. + */ +function assembleSchema(schema, idMap) { + // early exit if no dictionaries + if (!idMap.size) return schema; - return dictionaries; + const visit = type => { + if (type.typeId === Type.Dictionary) { + type.id = idMap.get(type.dictionary); // lookup and set id + visitDictType(type); + } + if (type.children) { + (type.children = type.children.slice()).forEach(visitFields); + } + }; + + // visit a field in a field array + const visitFields = (field, index, array) => { + const type = { ...field.type }; + array[index] = { ...field, type }; + visit(type); + }; + + // visit a dictionary values type + const visitDictType = (parentType) => { + const type = { ...parentType.dictionary }; + parentType.dictionary = type; + visit(type); + }; + + schema = { ...schema, fields: schema.fields.slice() }; + schema.fields.forEach(visitFields); + return schema; } /** - * @param {import('../column.js').Column[]} columns - * @returns {import('../types.js').RecordBatch[]} + * Assemble record batches with marshalled buffers. + * @param {import('../column.js').Column[]} columns The table columns. + * @returns {import('../types.js').RecordBatch[]} The assembled record batches. 
*/ function assembleRecordBatches(columns) { return (columns[0]?.data || []) @@ -112,8 +185,10 @@ .map((_, index) => assembleRecordBatch(columns, index)); } /** - * @param {import('../column.js').Column[]} columns - * @returns {import('../types.js').RecordBatch} + * Assemble a record batch with marshalled buffers. + * @param {import('../column.js').Column[]} columns The table columns. + * @param {number} batchIndex The batch index. + * @returns {import('../types.js').RecordBatch} The assembled record batch. */ function assembleRecordBatch(columns, batchIndex = 0) { const ctx = assembleContext(); @@ -124,10 +199,10 @@ } /** - * Visit a column batch, assembling buffer information. - * @param {import('../types.js').DataType} type - * @param {import('../batch.js').Batch} batch - * @param {ReturnType<assembleContext>} ctx + * Visit a column batch, assembling buffer data. + * @param {import('../types.js').DataType} type The data type. + * @param {import('../batch.js').Batch} batch The column batch. + * @param {ReturnType<assembleContext>} ctx The assembly context. */ function visit(type, batch, ctx) { const { typeId } = type; diff --git a/src/index.js b/src/index.js index ed21313..8f7eb0c 100644 --- a/src/index.js +++ b/src/index.js @@ -40,12 +40,14 @@ export { largeListView } from './data-types.js'; +export { Batch } from './batch.js'; export { Column } from './column.js'; export { Table } from './table.js'; -export { Batch } from './batch.js'; export { batchType } from './batch-type.js'; export { tableFromIPC } from './decode/table-from-ipc.js'; export { tableToIPC } from './encode/table-to-ipc.js'; export { tableFromArrays } from './build/table-from-arrays.js'; export { tableFromColumns } from './build/table-from-columns.js'; export { columnFromArray } from './build/column-from-array.js'; +export { columnFromValues } from './build/column-from-values.js'; +export { dictionaryContext } from './build/builders/dictionary.js'; diff --git a/src/types.ts index 71b8cfa..f2ca5e4 100644 --- a/src/types.ts +++ b/src/types.ts @@ -105,8 +105,7 @@ export interface Schema { version?: Version_; endianness?: Endianness_; fields: Field[]; - metadata?: Metadata | null; - dictionaryTypes?: Map<number, DataType>; + metadata?: Metadata | null; } /** diff --git a/test/infer-type-test.js b/test/infer-type-test.js index 1a3fc57..95e81ca 100644 --- a/test/infer-type-test.js +++ b/test/infer-type-test.js @@ -6,46 +6,50 @@ function matches(actual, expect) { assert.deepStrictEqual(actual, expect); } +function infer(values) { + return inferType(visitor => values.forEach(visitor)); +} + describe('inferType', () => { it('infers integer types', () => { - matches(inferType([1, 2, 3]), int8()); - matches(inferType([1e3, 2e3, 3e3]), int16()); - matches(inferType([1e6, 2e6, 3e6]), int32()); - matches(inferType([1n, 2n, 3n]), int64()); + matches(infer([1, 2, 3]), int8()); + matches(infer([1e3, 2e3, 3e3]), int16()); + matches(infer([1e6, 2e6, 3e6]), int32()); + matches(infer([1n, 2n, 3n]), int64()); - matches(inferType([-1, 2, 3]), int8()); - matches(inferType([-1e3, 2e3, 3e3]), int16()); - matches(inferType([-1e6, 2e6, 3e6]), int32()); - matches(inferType([-1n, 2n, 3n]), int64()); + matches(infer([-1, 2, 3]), int8()); + matches(infer([-1e3, 2e3, 3e3]), int16()); + matches(infer([-1e6, 2e6, 3e6]), int32()); + matches(infer([-1n, 2n, 3n]), int64()); - matches(inferType([1, 2, null, undefined, 3]), int8()); - matches(inferType([1e3, 2e3, null, undefined, 3e3]), int16()); -
matches(inferType([1e6, 2e6, null, undefined, 3e6]), int32()); - matches(inferType([1n, 2n, null, undefined, 3n]), int64()); + matches(infer([1, 2, null, undefined, 3]), int8()); + matches(infer([1e3, 2e3, null, undefined, 3e3]), int16()); + matches(infer([1e6, 2e6, null, undefined, 3e6]), int32()); + matches(infer([1n, 2n, null, undefined, 3n]), int64()); }); it('infers float types', () => { - matches(inferType([1.1, 2.2, 3.3]), float64()); - matches(inferType([-1.1, 2.2, 3.3]), float64()); - matches(inferType([1, 2, 3.3]), float64()); - matches(inferType([1, 2, NaN]), float64()); - matches(inferType([NaN, null, undefined, NaN]), float64()); - matches(inferType([Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER]), float64()); + matches(infer([1.1, 2.2, 3.3]), float64()); + matches(infer([-1.1, 2.2, 3.3]), float64()); + matches(infer([1, 2, 3.3]), float64()); + matches(infer([1, 2, NaN]), float64()); + matches(infer([NaN, null, undefined, NaN]), float64()); + matches(infer([Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER]), float64()); }); it('infers utf8 dictionary types', () => { const type = dictionary(utf8(), int32()); - matches(inferType(['foo', 'bar', 'baz']), type); - matches(inferType(['foo', 'bar', null, undefined, 'baz']), type); + matches(infer(['foo', 'bar', 'baz']), type); + matches(infer(['foo', 'bar', null, undefined, 'baz']), type); }); it('infers bool types', () => { - matches(inferType([true, false, true]), bool()); - matches(inferType([true, false, null, undefined, true]), bool()); + matches(infer([true, false, true]), bool()); + matches(infer([true, false, null, undefined, true]), bool()); }); it('infers date day types', () => { - matches(inferType([ + matches(infer([ new Date(Date.UTC(2000, 1, 2)), new Date(Date.UTC(2006, 3, 20)), null, @@ -55,7 +59,7 @@ describe('inferType', () => { it('infers timestamp types', () => { matches( - inferType([ + infer([ new Date(Date.UTC(2000, 1, 2)), new Date(Date.UTC(2006, 3, 20)), null, @@ -67,14 +71,14 @@ describe('inferType', () => { }); it('infers list types', () => { - matches(inferType([[1, 2], [3, 4]]), list(int8())); - matches(inferType([[true, null, false], null, undefined, [false, undefined, true]]), list(bool())); - matches(inferType([['foo', 'bar', null], null, ['bar', 'baz']]), list(dictionary(utf8(), int32()))); + matches(infer([[1, 2], [3, 4]]), list(int8())); + matches(infer([[true, null, false], null, undefined, [false, undefined, true]]), list(bool())); + matches(infer([['foo', 'bar', null], null, ['bar', 'baz']]), list(dictionary(utf8(), int32()))); }); it('infers struct types', () => { matches( - inferType([ + infer([ { foo: 1, bar: [1.1, 2.2] }, { foo: null, bar: [2.2, null, 3.3] }, null, @@ -86,10 +90,10 @@ describe('inferType', () => { }); it('throws on bigints that exceed 64 bits', () => { - assert.throws(() => inferType([(1n << 200n)])); + assert.throws(() => infer([(1n << 200n)])); }); it('throws on mixed types', () => { - assert.throws(() => inferType([1, true, 'foo'])); + assert.throws(() => infer([1, true, 'foo'])); }); }); diff --git a/test/table-to-ipc-test.js b/test/table-to-ipc-test.js index 8281a57..b5cb697 100644 --- a/test/table-to-ipc-test.js +++ b/test/table-to-ipc-test.js @@ -32,7 +32,6 @@ function testEncode(bytes) { // ensure complete schema, override version const schema = { - dictionaryTypes: new Map, endianness: 0, metadata: null, ...table.schema, diff --git a/test/util/decimal.js b/test/util/decimal.js index 080730d..801237e 100644 --- a/test/util/decimal.js +++ b/test/util/decimal.js @@ 
-11,8 +11,7 @@ export function decimalDataToEncode() { nullable: true, metadata: null }], - metadata: null, - dictionaryTypes: new Map + metadata: null }, records: [{ length: 3,
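
Below are usage sketches for the reworked APIs. They assume the top-level exports added in `src/index.js` above and the `@uwdata/flechette` package name; column names and data values are illustrative, not from the diff.

First, the reordered `dictionary` signature via the documented `tableFromArrays` path. A non-negative id marks a dictionary for reuse across columns:

```js
import { dictionary, tableFromArrays, utf8 } from '@uwdata/flechette';

// new argument order: dictionary(type, indexType, ordered, id);
// indexType is omitted here, so it defaults to int32
const shared = dictionary(utf8(), undefined, false, 1);

// both columns are assigned the shared dictionary type, so they are
// routed through a single dictionary values builder during the build
const table = tableFromArrays(
  { u: ['foo', 'bar', 'bar'], v: ['bar', 'baz', 'foo'] },
  { types: { u: shared, v: shared } }
);
```

When such a table is encoded via `tableToIPC`, the shared dictionary should be written once and both schema fields resolved to the same id by `assembleSchema`.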
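The newly exported `dictionaryContext` supports the same sharing when columns are built separately and then combined with `tableFromColumns`. A sketch under the same assumptions; note that the trailing context argument of `columnFromArray` is documented as internal, so this mirrors what `tableFromArrays` does internally rather than a settled public contract:

```js
import {
  columnFromArray, dictionary, dictionaryContext,
  int32, tableFromColumns, utf8
} from '@uwdata/flechette';

// a shared context, keyed by the explicit dictionary id (1)
const type = dictionary(utf8(), int32(), false, 1);
const dicts = dictionaryContext();

// both columns funnel their values through one dictionary builder
const a = columnFromArray(['foo', 'bar', 'foo'], type, {}, dicts);
const b = columnFromArray(['baz', 'foo', 'baz'], type, {}, dicts);
const table = tableFromColumns({ a, b });
```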
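Finally, a sketch of the new `columnFromValues` export, which pulls values through a visitor callback instead of indexing an array, so non-array sources need not be materialized first. The Map source and batch size here are illustrative; omitting the type triggers inference, which runs the visitor once over the values:

```js
import { columnFromValues } from '@uwdata/flechette';

// source values held in a Map rather than an array
const source = new Map([['a', 1], ['b', 2], ['c', 3]]);

const col = columnFromValues(
  source.size,                            // number of values
  visit => source.forEach(v => visit(v)), // apply callback to each value
  undefined,                              // no type: inferred by visitation
  { maxBatchRows: 1000 }                  // optional batch size limit
);
```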