diff --git a/docs/api/data-types.md b/docs/api/data-types.md
index bd456e9..00c99fc 100644
--- a/docs/api/data-types.md
+++ b/docs/api/data-types.md
@@ -85,14 +85,14 @@ Create a new field instance for use in a schema or type definition. A field repr
### Dictionary
#
-dictionary(type[, indexType, id, ordered])
+dictionary(type[, indexType, ordered, id])
-Create a Dictionary data type instance. A dictionary type consists of a dictionary of values (which may be of any type) and corresponding integer indices that reference those values. If values are repeated, a dictionary encoding can provide substantial space savings. In the IPC format, dictionary indices reside alongside other columns in a record batch, while dictionary values are written to special dictionary batches, linked by a unique dictionary *id*. Internally Flechette extracts dictionary values upfront; while this incurs some initial overhead, it enables fast subsequent lookups.
+Create a Dictionary data type instance. A dictionary type consists of a dictionary of values (which may be of any type) and corresponding integer indices that reference those values. If values are repeated, a dictionary encoding can provide substantial space savings. In the IPC format, dictionary indices reside alongside other columns in a record batch, while dictionary values are written to special dictionary batches, linked by a unique dictionary *id* assigned at encoding time. Internally, Flechette extracts dictionary values immediately upon decoding; while this incurs some initial overhead, it enables fast subsequent lookups.
* *type* (`DataType`): The data type of dictionary values.
* *indexType* (`DataType`): The data type of dictionary indices. Must be an integer type (default [`int32`](#int32)).
-* *id* (`number`): The dictionary id, should be unique in a table. Defaults to `-1`, but is set to a proper id if the type is passed through [`tableFromArrays`](/flechette/api/#tableFromArrays).
* *ordered* (`boolean`): Indicates if dictionary values are ordered (default `false`).
+* *id* (`number`): Optional dictionary id. The default value (-1) indicates that the dictionary applies to a single column only. Provide an explicit id to reuse a dictionary across columns when building, in which case distinct dictionaries *must* be assigned unique ids. All dictionary ids are later resolved (possibly to new values) upon IPC encoding. See the example below.
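+
+For example, here is a minimal sketch that reuses one dictionary across two columns by assigning an explicit id (the value `1` is arbitrary), passing the shared type via the `types` option of [`tableFromArrays`](/flechette/api/#tableFromArrays):
+
+```js
+import { dictionary, int32, tableFromArrays, utf8 } from '@uwdata/flechette';
+
+// a dictionary type with an explicit id, shared by both columns below
+const type = dictionary(utf8(), int32(), false, 1);
+const table = tableFromArrays(
+  { a: ['foo', 'bar', 'foo'], b: ['bar', 'baz', 'bar'] },
+  { types: { a: type, b: type } } // assign the shared type to each column
+);
+```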
### Null
diff --git a/docs/api/index.md b/docs/api/index.md
index 0a438bf..1ca5c2e 100644
--- a/docs/api/index.md
+++ b/docs/api/index.md
@@ -137,7 +137,7 @@ const col = columnFromArray(
#
tableFromColumns(columns[, useProxy])
-Create a new table from a collection of columns. This method is useful for creating new tables using one or more pre-existing column instances. Otherwise, [`tableFromArrays`](#tableFromArrays) should be preferred. Input columns are assumed to have the same record batch sizes and non-conflicting dictionary ids.
+Create a new table from a collection of columns. This method is useful for creating new tables using one or more pre-existing column instances. Otherwise, [`tableFromArrays`](#tableFromArrays) should be preferred. Input columns are assumed to have the same record batch sizes.
* *data* (`object | array`): The input columns as an object with name keys, or an array of [name, column] pairs.
* *useProxy* (`boolean`): Flag indicating if row proxy objects should be used to represent table rows (default `false`). Typically this should match the value of the `useProxy` extraction option used for column generation.
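+
+For example, here is a minimal sketch that combines two pre-existing columns, built here with [`columnFromArray`](#columnFromArray):
+
+```js
+import { columnFromArray, tableFromColumns } from '@uwdata/flechette';
+
+const foo = columnFromArray([1, 2, 3]);
+const bar = columnFromArray(['a', 'b', 'c']);
+const table = tableFromColumns({ foo, bar });
+```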
diff --git a/src/build/builder.js b/src/build/builder.js
index 0c9cbfe..6a207e7 100644
--- a/src/build/builder.js
+++ b/src/build/builder.js
@@ -6,7 +6,7 @@ import { toBigInt, toDateDay, toFloat16, toTimestamp } from '../util/numbers.js'
import { BinaryBuilder } from './builders/binary.js';
import { BoolBuilder } from './builders/bool.js';
import { DecimalBuilder } from './builders/decimal.js';
-import { DictionaryBuilder, dictionaryValues } from './builders/dictionary.js';
+import { DictionaryBuilder, dictionaryContext } from './builders/dictionary.js';
import { FixedSizeBinaryBuilder } from './builders/fixed-size-binary.js';
import { FixedSizeListBuilder } from './builders/fixed-size-list.js';
import { IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder } from './builders/interval.js';
@@ -19,36 +19,20 @@ import { Utf8Builder } from './builders/utf8.js';
import { DirectBuilder, Int64Builder, TransformBuilder } from './builders/values.js';
/**
- * Create a new context object for shared builder state.
+ * Create a context object for shared builder state.
* @param {import('../types.js').ExtractionOptions} [options]
* Batch extraction options.
- * @param {Map<number, ReturnType<dictionaryValues>>} [dictMap]
- * A map of dictionary ids to value builder helpers.
+ * @param {ReturnType<dictionaryContext>} [dictionaries]
+ * Context object for tracking dictionaries.
*/
-export function builderContext(options, dictMap = new Map) {
- let dictId = 0;
+export function builderContext(
+ options = {},
+ dictionaries = dictionaryContext()
+) {
return {
- batchType(type) {
- return batchType(type, options);
- },
- dictionary(type, id) {
- let dict;
- if (id != null) {
- dict = dictMap.get(id);
- } else {
- while (dictMap.has(dictId + 1)) ++dictId;
- id = dictId;
- }
- if (!dict) {
- dictMap.set(id, dict = dictionaryValues(id, type, this));
- }
- return dict;
- },
- finish() {
- for (const dict of dictMap.values()) {
- dict.finish(options);
- }
- }
+ batchType: type => batchType(type, options),
+ dictionary(type) { return dictionaries.get(type, this); },
+ finish: () => dictionaries.finish(options)
};
}
diff --git a/src/build/builders/dictionary.js b/src/build/builders/dictionary.js
index 2720378..987f058 100644
--- a/src/build/builders/dictionary.js
+++ b/src/build/builders/dictionary.js
@@ -6,21 +6,58 @@ import { builder } from '../builder.js';
import { ValidityBuilder } from './validity.js';
/**
- * Builder helped for creating dictionary values.
- * @param {number} id The dictionary id.
+ * Create a context object for managing dictionary builders.
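+ *
+ * @example
+ * // a sketch: one dictionary context shared across column builds,
+ * // mirroring how tableFromArrays threads a single context through
+ * // its columns (columnFromArray accepts it as a final argument)
+ * const dicts = dictionaryContext();
+ * const a = columnFromArray(['x', 'y'], null, {}, dicts);
+ * const b = columnFromArray(['y', 'z'], null, {}, dicts);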
+ */
+export function dictionaryContext() {
+ const idMap = new Map;
+ const dicts = new Set;
+ return {
+ /**
+ * Get a dictionary values builder for the given dictionary type.
+ * @param {import('../../types.js').DictionaryType} type
+ * The dictionary type.
+ * @param {*} ctx The builder context.
+     * @returns {ReturnType<dictionaryValues>}
+ */
+ get(type, ctx) {
+      // if a dictionary has a non-negative id, assume it was set
+      // intentionally and track it for potential reuse across columns;
+      // otherwise, the dictionary is used for a single column only
+ const id = type.id;
+ if (id >= 0 && idMap.has(id)) {
+ return idMap.get(id);
+ } else {
+ const dict = dictionaryValues(type, ctx);
+ if (id >= 0) idMap.set(id, dict);
+ dicts.add(dict);
+ return dict;
+ }
+ },
+ /**
+ * Finish building dictionary values columns and assign them to
+ * their corresponding dictionary batches.
+ * @param {import('../../types.js').ExtractionOptions} options
+ */
+ finish(options) {
+ dicts.forEach(dict => dict.finish(options));
+ }
+ };
+}
+
+/**
+ * Builder helper for creating dictionary values.
* @param {import('../../types.js').DictionaryType} type
* The dictionary data type.
- * @param {*} ctx
- * @returns
+ * @param {ReturnType<import('../builder.js').builderContext>} ctx
+ * The builder context.
*/
-export function dictionaryValues(id, type, ctx) {
+export function dictionaryValues(type, ctx) {
const keys = Object.create(null);
const values = builder(type.dictionary, ctx);
const batches = [];
values.init();
let index = -1;
- type.id = id;
return {
type,
diff --git a/src/build/column-from-array.js b/src/build/column-from-array.js
index be0e8a6..9e8b7da 100644
--- a/src/build/column-from-array.js
+++ b/src/build/column-from-array.js
@@ -1,10 +1,8 @@
import { float32Array, float64Array, int16Array, int32Array, int64Array, int8Array, isInt64ArrayType, isTypedArray, uint16Array, uint32Array, uint64Array, uint8Array } from '../util/arrays.js';
-import { DirectBatch, Int64Batch, NullBatch } from '../batch.js';
+import { DirectBatch, Int64Batch } from '../batch.js';
import { Column } from '../column.js';
import { float32, float64, int16, int32, int64, int8, uint16, uint32, uint64, uint8 } from '../data-types.js';
-import { inferType } from './infer-type.js';
-import { builder, builderContext } from './builder.js';
-import { Type } from '../constants.js';
+import { columnFromValues } from './column-from-values.js';
/**
* Create a new column from a provided data array.
@@ -14,25 +12,20 @@ import { Type } from '../constants.js';
* If not specified, type inference is attempted.
* @param {import('../types.js').ColumnBuilderOptions} [options]
* Builder options for the generated column.
- * @param {ReturnType<import('./builder.js').builderContext>} [ctx]
+ * @param {ReturnType<import('./builders/dictionary.js').dictionaryContext>} [dicts]
* Builder context object, for internal use only.
* @returns {Column} The generated column.
*/
-export function columnFromArray(data, type, options = {}, ctx) {
- if (!type) {
- if (isTypedArray(data)) {
- return columnFromTypedArray(data, options);
- } else {
- type = inferType(data);
- }
- }
- return columnFromValues(data, type, options, ctx);
+export function columnFromArray(data, type, options = {}, dicts) {
+ return !type && isTypedArray(data)
+ ? columnFromTypedArray(data, options)
+ : columnFromValues(data.length, v => data.forEach(v), type, options, dicts);
}
/**
* Create a new column from a typed array input.
* @template T
- * @param {import('../types.js').TypedArray} values
+ * @param {import('../types.js').TypedArray} values The input data.
* @param {import('../types.js').ColumnBuilderOptions} options
* Builder options for the generated column.
* @returns {Column} The generated column.
@@ -62,52 +55,6 @@ function columnFromTypedArray(values, { maxBatchRows, useBigInt }) {
return new Column(batches);
}
-/**
- * Build a column by iterating over the provided values array.
- * @template T
- * @param {Array | import('../types.js').TypedArray} values The input data.
- * @param {import('../types.js').DataType} type The column data type.
- * @param {import('../types.js').ColumnBuilderOptions} [options]
- * Builder options for the generated column.
- * @param {ReturnType<import('./builder.js').builderContext>} [ctx]
- * Builder context object, for internal use only.
- * @returns {Column} The generated column.
- */
-function columnFromValues(values, type, options, ctx) {
- const { maxBatchRows, ...opt } = options;
- const length = values.length;
- const limit = Math.min(maxBatchRows || Infinity, length);
-
- // if null type, generate batches and exit early
- if (type.typeId === Type.Null) {
- return new Column(nullBatches(type, length, limit));
- }
-
- const data = [];
- ctx ??= builderContext(opt);
- const b = builder(type, ctx).init();
- const next = b => data.push(b.batch());
- const numBatches = Math.floor(length / limit);
-
- let idx = 0;
- let row = 0;
- for (let i = 0; i < numBatches; ++i) {
- for (row = 0; row < limit; ++row) {
- b.set(values[idx++], row);
- }
- next(b);
- }
- for (row = 0; idx < length; ++idx) {
- b.set(values[idx], row++);
- }
- if (row) next(b);
-
- // resolve dictionaries
- ctx.finish();
-
- return new Column(data);
-}
-
/**
* Return an Arrow data type for a given typed array type.
* @param {import('../types.js').TypedArrayConstructor} arrayType
@@ -128,22 +75,3 @@ function typeForTypedArray(arrayType) {
case uint64Array: return uint64();
}
}
-
-/**
- * Create null batches with the given batch size limit.
- * @param {import('../types.js').NullType} type The null data type.
- * @param {number} length The total column length.
- * @param {number} limit The maximum batch size.
- * @returns {import('../batch.js').NullBatch[]} The null batches.
- */
-function nullBatches(type, length, limit) {
- const data = [];
- const batch = length => new NullBatch({ length, nullCount: length, type });
- const numBatches = Math.floor(length / limit);
- for (let i = 0; i < numBatches; ++i) {
- data.push(batch(limit));
- }
- const rem = length % limit;
- if (rem) data.push(batch(rem));
- return data;
-}
diff --git a/src/build/column-from-values.js b/src/build/column-from-values.js
new file mode 100644
index 0000000..a72f750
--- /dev/null
+++ b/src/build/column-from-values.js
@@ -0,0 +1,69 @@
+import { NullBatch } from '../batch.js';
+import { Column } from '../column.js';
+import { inferType } from './infer-type.js';
+import { builder, builderContext } from './builder.js';
+import { Type } from '../constants.js';
+
+/**
+ * Create a new column by iterating over provided values.
+ * @template T
+ * @param {number} length The input data length.
+ * @param {(visitor: (value: any) => void) => void} visit
+ * A function that applies a callback to successive data values.
+ * @param {import('../types.js').DataType} type The data type.
+ * @param {import('../types.js').ColumnBuilderOptions} [options]
+ * Builder options for the generated column.
+ * @param {ReturnType<
+ * import('./builders/dictionary.js').dictionaryContext
+ * >} [dicts] Dictionary context object, for internal use only.
+ * @returns {Column} The generated column.
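+ * @example
+ * // a sketch: build a column by visiting an existing array,
+ * // letting the column type be inferred from the visited values
+ * const data = [1, 2, 3];
+ * const col = columnFromValues(data.length, visit => data.forEach(visit));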
+ */
+export function columnFromValues(length, visit, type, options, dicts) {
+ type ??= inferType(visit);
+ const { maxBatchRows, ...opt } = options;
+ const limit = Math.min(maxBatchRows || Infinity, length);
+
+ // if null type, generate batches and exit early
+ if (type.typeId === Type.Null) {
+ return new Column(nullBatches(type, length, limit));
+ }
+
+ const ctx = builderContext(opt, dicts);
+ const b = builder(type, ctx).init();
+ const data = [];
+ const next = b => data.push(b.batch());
+
+ let row = 0;
+ visit(value => {
+ b.set(value, row++);
+ if (row >= limit) {
+ next(b);
+ row = 0;
+ }
+ });
+ if (row) next(b);
+
+ // resolve dictionaries
+ ctx.finish();
+
+ return new Column(data);
+}
+
+/**
+ * Create null batches with the given batch size limit.
+ * @param {import('../types.js').NullType} type The null data type.
+ * @param {number} length The total column length.
+ * @param {number} limit The maximum batch size.
+ * @returns {import('../batch.js').NullBatch[]} The null batches.
+ */
+function nullBatches(type, length, limit) {
+ const data = [];
+ const batch = length => new NullBatch({ length, nullCount: length, type });
+ const numBatches = Math.floor(length / limit);
+ for (let i = 0; i < numBatches; ++i) {
+ data.push(batch(limit));
+ }
+ const rem = length % limit;
+ if (rem) data.push(batch(rem));
+ return data;
+}
diff --git a/src/build/infer-type.js b/src/build/infer-type.js
index a66ee17..2deaea5 100644
--- a/src/build/infer-type.js
+++ b/src/build/infer-type.js
@@ -3,14 +3,13 @@ import { isArray } from '../util/arrays.js';
/**
* Infer the data type for a given input array.
- * @param {import('../types.js').ValueArray} data The data array.
+ * @param {(visitor: (value: any) => void) => void} visit
+ * A function that applies a callback to successive data values.
* @returns {import('../types.js').DataType} The data type.
*/
-export function inferType(data) {
+export function inferType(visit) {
const profile = profiler();
- for (let i = 0; i < data.length; ++i) {
- profile.add(data[i]);
- }
+ visit(value => profile.add(value));
return profile.type();
}
diff --git a/src/build/table-from-arrays.js b/src/build/table-from-arrays.js
index 319721c..26bf25a 100644
--- a/src/build/table-from-arrays.js
+++ b/src/build/table-from-arrays.js
@@ -1,4 +1,4 @@
-import { builderContext } from './builder.js';
+import { dictionaryContext } from './builders/dictionary.js';
import { columnFromArray } from './column-from-array.js';
import { tableFromColumns } from './table-from-columns.js';
@@ -13,11 +13,11 @@ import { tableFromColumns } from './table-from-columns.js';
*/
export function tableFromArrays(data, options = {}) {
const { types = {}, ...opt } = options;
- const ctx = builderContext();
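+  // a single dictionary context shared by all columns of the table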
+ const dicts = dictionaryContext();
const entries = Array.isArray(data) ? data : Object.entries(data);
const columns = entries.map(([name, array]) =>
/** @type {[string, import('../column.js').Column]} */ (
- [ name, columnFromArray(array, types[name], opt, ctx)]
+ [ name, columnFromArray(array, types[name], opt, dicts)]
));
return tableFromColumns(columns, options.useProxy);
}
diff --git a/src/build/table-from-columns.js b/src/build/table-from-columns.js
index 5a04c86..f73c6e2 100644
--- a/src/build/table-from-columns.js
+++ b/src/build/table-from-columns.js
@@ -1,10 +1,10 @@
-import { Endianness, Type, Version } from '../constants.js';
+import { Endianness, Version } from '../constants.js';
import { field } from '../data-types.js';
import { Table } from '../table.js';
/**
* Create a new table from a collection of columns. Columns are assumed
- * to have the same record batch sizes and consistent dictionary ids.
+ * to have the same record batch sizes.
* @param {[string, import('../column.js').Column][]
 *  | Record<string, import('../column.js').Column>} data The columns,
* as an object with name keys, or an array of [name, column] pairs.
@@ -14,21 +14,13 @@ import { Table } from '../table.js';
*/
export function tableFromColumns(data, useProxy) {
const fields = [];
- const dictionaryTypes = new Map;
const entries = Array.isArray(data) ? data : Object.entries(data);
const length = entries[0]?.[1].length;
+
const columns = entries.map(([name, col]) => {
if (col.length !== length) {
throw new Error('All columns must have the same length.');
}
- const type = col.type;
- if (type.typeId === Type.Dictionary) {
- const dict = dictionaryTypes.get(type.id);
- if (dict && dict !== type.dictionary) {
- throw new Error('Same id used across different dictionaries.');
- }
- dictionaryTypes.set(type.id, type.dictionary);
- }
fields.push(field(name, col.type));
return col;
});
@@ -37,8 +29,7 @@ export function tableFromColumns(data, useProxy) {
version: Version.V5,
endianness: Endianness.Little,
fields,
- metadata: null,
- dictionaryTypes
+ metadata: null
};
return new Table(schema, columns, useProxy);
diff --git a/src/data-types.js b/src/data-types.js
index aa7bf4c..1653fb7 100644
--- a/src/data-types.js
+++ b/src/data-types.js
@@ -85,17 +85,21 @@ const basicType = (typeId) => ({ typeId });
* values.
* @param {import('./types.js').IntType} [indexType] The data type of
* dictionary indices. Must be an integer type (default `int32`).
- * @param {number} [id=-1] The dictionary id, should be unique in a table.
* @param {boolean} [ordered=false] Indicates if dictionary values are
* ordered (default `false`).
+ * @param {number} [id=-1] The dictionary id. The default value (-1) indicates
+ * that the dictionary applies to a single column only. Provide an explicit id
+ * to reuse a dictionary across columns when building, in which case distinct
+ * dictionaries *must* be assigned unique ids. All dictionary ids are later
+ * resolved (possibly to new values) upon IPC encoding.
* @returns {import('./types.js').DictionaryType}
*/
-export const dictionary = (type, indexType, id = -1, ordered = false) => ({
+export const dictionary = (type, indexType, ordered = false, id = -1) => ({
typeId: Type.Dictionary,
dictionary: type,
indices: indexType || int32(),
- id,
- ordered
+ ordered,
+ id
});
/**
diff --git a/src/decode/schema.js b/src/decode/schema.js
index 1a8df94..62d1d47 100644
--- a/src/decode/schema.js
+++ b/src/decode/schema.js
@@ -12,7 +12,6 @@ import { decodeMetadata } from './metadata.js';
* @returns {import('../types.js').Schema} The schema
*/
export function decodeSchema(buf, index, version) {
- const dictionaryTypes = new Map;
// 4: endianness (int16)
// 6: fields (vector)
// 8: metadata (vector)
@@ -21,25 +20,22 @@ export function decodeSchema(buf, index, version) {
return {
version,
endianness: /** @type {import('../types.js').Endianness_} */ (get(4, readInt16, 0)),
- fields: get(6, (buf, off) => decodeSchemaFields(buf, off, dictionaryTypes), []),
- metadata: get(8, decodeMetadata),
- dictionaryTypes
+ fields: get(6, decodeSchemaFields, []),
+ metadata: get(8, decodeMetadata)
};
}
/**
* @returns {import('../types.js').Field[] | null}
*/
-function decodeSchemaFields(buf, fieldsOffset, dictionaryTypes) {
- return readVector(buf, fieldsOffset, 4,
- (buf, pos) => decodeField(buf, pos, dictionaryTypes)
- );
+function decodeSchemaFields(buf, fieldsOffset) {
+ return readVector(buf, fieldsOffset, 4, decodeField);
}
/**
* @returns {import('../types.js').Field}
*/
-function decodeField(buf, index, dictionaryTypes) {
+function decodeField(buf, index) {
// 4: name (string)
// 6: nullable (bool)
// 8: type id (uint8)
@@ -51,22 +47,12 @@ function decodeField(buf, index, dictionaryTypes) {
const typeId = get(8, readUint8, Type.NONE);
const typeOffset = get(10, readOffset, 0);
const dict = get(12, decodeDictionary);
- const children = get(14, (buf, off) => decodeFieldChildren(buf, off, dictionaryTypes));
+ const children = get(14, (buf, off) => decodeFieldChildren(buf, off));
- let type;
+ let type = decodeDataType(buf, typeOffset, typeId, children);
if (dict) {
- const { id } = dict;
- let dictType = dictionaryTypes.get(id);
- if (!dictType) {
- // if dictionary encoded and the first time we've seen this id, decode
- // the type and children fields and add to the dictionary map.
- dictType = decodeDataType(buf, typeOffset, typeId, children);
- dictionaryTypes.set(id, dictType);
- }
- dict.dictionary = dictType;
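+    // dictionary-encoded field: the decoded type is the values type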
+ dict.dictionary = type;
type = dict;
- } else {
- type = decodeDataType(buf, typeOffset, typeId, children);
}
return {
@@ -80,10 +66,8 @@ function decodeField(buf, index, dictionaryTypes) {
/**
* @returns {import('../types.js').Field[] | null}
*/
-function decodeFieldChildren(buf, fieldOffset, dictionaryTypes) {
- const children = readVector(buf, fieldOffset, 4,
- (buf, pos) => decodeField(buf, pos, dictionaryTypes)
- );
+function decodeFieldChildren(buf, fieldOffset) {
+ const children = readVector(buf, fieldOffset, 4, decodeField);
return children.length ? children : null;
}
@@ -102,8 +86,8 @@ function decodeDictionary(buf, index) {
return dictionary(
null, // data type will be populated by caller
get(6, decodeInt, int32()), // index type
+ get(8, readBoolean, false), // ordered
get(4, readInt64, 0), // id
- get(8, readBoolean, false) // ordered
);
}
diff --git a/src/decode/table-from-ipc.js b/src/decode/table-from-ipc.js
index 2e09801..bcecdc8 100644
--- a/src/decode/table-from-ipc.js
+++ b/src/decode/table-from-ipc.js
@@ -37,11 +37,20 @@ export function tableFromIPC(data, options) {
*/
export function createTable(data, options = {}) {
const { schema = { fields: [] }, dictionaries, records } = data;
- const { version, fields, dictionaryTypes } = schema;
+ const { version, fields } = schema;
const dictionaryMap = new Map;
const context = contextGenerator(options, version, dictionaryMap);
- // decode dictionaries
+ // build dictionary type map
+ const dictionaryTypes = new Map;
+ visitSchemaFields(schema, field => {
+ const type = field.type;
+ if (type.typeId === Type.Dictionary) {
+ dictionaryTypes.set(type.id, type.dictionary);
+ }
+ });
+
+ // decode dictionaries, build dictionary column map
const dicts = new Map;
for (const dict of dictionaries) {
const { id, data, isDelta, body } = dict;
@@ -70,6 +79,21 @@ export function createTable(data, options = {}) {
return new Table(schema, cols.map(c => c.done()), options.useProxy);
}
+/**
+ * Visit all fields within a schema.
+ * @param {import('../types.js').Schema} schema
+ * @param {(field: import('../types.js').Field) => void} visitor
+ */
+function visitSchemaFields(schema, visitor) {
+ schema.fields.forEach(function visitField(field) {
+ visitor(field);
+ // @ts-ignore
+ field.type.dictionary?.children?.forEach(visitField);
+ // @ts-ignore
+ field.type.children?.forEach(visitField);
+ });
+}
+
/**
* Context object generator for field visitation and buffer definition.
*/
diff --git a/src/encode/table-to-ipc.js b/src/encode/table-to-ipc.js
index 6f9b110..3419545 100644
--- a/src/encode/table-to-ipc.js
+++ b/src/encode/table-to-ipc.js
@@ -16,14 +16,17 @@ export function tableToIPC(table, options) {
if (typeof options === 'string') {
options = { format: options };
}
- const schema = table.schema;
const columns = table.children;
- const dictionaries = assembleDictionaryBatches(columns);
+ const { dictionaries, idMap } = assembleDictionaryBatches(columns);
const records = assembleRecordBatches(columns);
+ const schema = assembleSchema(table.schema, idMap);
const data = { schema, dictionaries, records };
return encodeIPC(data, options).finish();
}
+/**
+ * Create a new assembly context.
+ */
function assembleContext() {
let byteLength = 0;
const nodes = [];
@@ -74,37 +77,107 @@ function assembleContext() {
}
/**
- * @param {import('../column.js').Column[]} columns
- * @returns {import('../types.js').DictionaryBatch[]}
+ * Assemble dictionary batches and their unique ids.
+ * @param {import('../column.js').Column[]} columns The table columns.
+ * @returns {{
+ * dictionaries: import('../types.js').DictionaryBatch[],
+ *   idMap: Map<import('../types.js').DataType, number>
+ * }}
+ *  The assembled dictionary batches and a map from dictionary value
+ *  types to dictionary ids.
*/
function assembleDictionaryBatches(columns) {
const dictionaries = [];
- const seen = new Set;
-
- for (const col of columns) {
- const { type } = col;
- if (type.typeId !== -1) continue;
- if (seen.has(type.id)) continue;
- seen.add(type.id);
-
- // pass dictionary and deltas as-is
- // @ts-ignore
- const dict = col.data[0].dictionary;
- for (let i = 0; i < dict.data.length; ++i) {
- dictionaries.push({
- id: type.id,
- isDelta: i > 0,
- data: assembleRecordBatch([dict], i)
- });
+ const dictMap = new Map;
+ const idMap = new Map;
+ let id = -1;
+
+ // track dictionaries, key by dictionary column, assign ids
+ const visitor = dictionaryColumn => {
+ if (!dictMap.has(dictionaryColumn)) {
+ dictMap.set(dictionaryColumn, ++id);
+ for (let i = 0; i < dictionaryColumn.data.length; ++i) {
+ dictionaries.push({
+ id,
+ isDelta: i > 0,
+ data: assembleRecordBatch([dictionaryColumn], i)
+ });
+ }
+ idMap.set(dictionaryColumn.type, id);
+ } else {
+ idMap.set(dictionaryColumn.type, dictMap.get(dictionaryColumn));
}
+ };
+
+ // recurse through column batches to find dictionaries
+ // it is sufficient to visit the first batch only,
+ // as all batches have the same dictionary column
+ columns.forEach(col => visitDictionaries(col.data[0], visitor));
+
+ return { dictionaries, idMap };
+}
+
+/**
+ * Traverse column batches to visit dictionary columns.
+ * @param {import('../batch.js').Batch} batch
+ * @param {(column: import('../column.js').Column) => void} visitor
+ */
+function visitDictionaries(batch, visitor) {
+ if (batch?.type.typeId === Type.Dictionary) {
+ // @ts-ignore - batch has type DictionaryBatch
+ const dictionary = batch.dictionary;
+ visitor(dictionary);
+ visitDictionaries(dictionary.data[0], visitor);
}
+ batch?.children?.forEach(child => visitDictionaries(child, visitor));
+}
+
+/**
+ * Assemble a schema with resolved dictionary ids.
+ * @param {import('../types.js').Schema} schema The schema.
+ * @param {Map<import('../types.js').DataType, number>} idMap A map
+ * from dictionary value types to dictionary ids.
+ * @returns {import('../types.js').Schema} A new schema with resolved
+ * dictionary ids. If there are no dictionaries, the input schema is
+ * returned unchanged.
+ */
+function assembleSchema(schema, idMap) {
+ // early exit if no dictionaries
+ if (!idMap.size) return schema;
- return dictionaries;
+ const visit = type => {
+ if (type.typeId === Type.Dictionary) {
+ type.id = idMap.get(type.dictionary); // lookup and set id
+ visitDictType(type);
+ }
+ if (type.children) {
+ (type.children = type.children.slice()).forEach(visitFields);
+ }
+ };
+
+ // visit a field in a field array
+ const visitFields = (field, index, array) => {
+ const type = { ...field.type };
+ array[index] = { ...field, type };
+ visit(type);
+ };
+
+ // visit a dictionary values type
+ const visitDictType = (parentType) => {
+ const type = { ...parentType.dictionary };
+ parentType.dictionary = type;
+ visit(type);
+ };
+
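+  // copy the schema and its fields so the input schema is not mutated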
+ schema = { ...schema, fields: schema.fields.slice() };
+ schema.fields.forEach(visitFields);
+ return schema;
}
/**
- * @param {import('../column.js').Column[]} columns
- * @returns {import('../types.js').RecordBatch[]}
+ * Assemble record batches with marshalled buffers.
+ * @param {import('../column.js').Column[]} columns The table columns.
+ * @returns {import('../types.js').RecordBatch[]} The assembled record batches.
*/
function assembleRecordBatches(columns) {
return (columns[0]?.data || [])
@@ -112,8 +185,10 @@ function assembleRecordBatches(columns) {
}
/**
- * @param {import('../column.js').Column[]} columns
- * @returns {import('../types.js').RecordBatch}
+ * Assemble a record batch with marshalled buffers.
+ * @param {import('../column.js').Column[]} columns The table columns.
+ * @param {number} batchIndex The batch index.
+ * @returns {import('../types.js').RecordBatch} The assembled record batch.
*/
function assembleRecordBatch(columns, batchIndex = 0) {
const ctx = assembleContext();
@@ -124,10 +199,10 @@ function assembleRecordBatch(columns, batchIndex = 0) {
}
/**
- * Visit a column batch, assembling buffer information.
- * @param {import('../types.js').DataType} type
- * @param {import('../batch.js').Batch} batch
- * @param {ReturnType} ctx
+ * Visit a column batch, assembling buffer data.
+ * @param {import('../types.js').DataType} type The data type.
+ * @param {import('../batch.js').Batch} batch The column batch.
+ * @param {ReturnType} ctx The assembly context.
*/
function visit(type, batch, ctx) {
const { typeId } = type;
diff --git a/src/index.js b/src/index.js
index ed21313..8f7eb0c 100644
--- a/src/index.js
+++ b/src/index.js
@@ -40,12 +40,14 @@ export {
largeListView
} from './data-types.js';
+export { Batch } from './batch.js';
export { Column } from './column.js';
export { Table } from './table.js';
-export { Batch } from './batch.js';
export { batchType } from './batch-type.js';
export { tableFromIPC } from './decode/table-from-ipc.js';
export { tableToIPC } from './encode/table-to-ipc.js';
export { tableFromArrays } from './build/table-from-arrays.js';
export { tableFromColumns } from './build/table-from-columns.js';
export { columnFromArray } from './build/column-from-array.js';
+export { columnFromValues } from './build/column-from-values.js';
+export { dictionaryContext } from './build/builders/dictionary.js';
diff --git a/src/types.ts b/src/types.ts
index 71b8cfa..f2ca5e4 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -105,8 +105,7 @@ export interface Schema {
version?: Version_;
endianness?: Endianness_;
fields: Field[];
- metadata?: Metadata | null;
-  dictionaryTypes?: Map<number, DataType>;
+  metadata?: Metadata | null;
}
/**
diff --git a/test/infer-type-test.js b/test/infer-type-test.js
index 1a3fc57..95e81ca 100644
--- a/test/infer-type-test.js
+++ b/test/infer-type-test.js
@@ -6,46 +6,50 @@ function matches(actual, expect) {
assert.deepStrictEqual(actual, expect);
}
+function infer(values) {
+ return inferType(visitor => values.forEach(visitor));
+}
+
describe('inferType', () => {
it('infers integer types', () => {
- matches(inferType([1, 2, 3]), int8());
- matches(inferType([1e3, 2e3, 3e3]), int16());
- matches(inferType([1e6, 2e6, 3e6]), int32());
- matches(inferType([1n, 2n, 3n]), int64());
+ matches(infer([1, 2, 3]), int8());
+ matches(infer([1e3, 2e3, 3e3]), int16());
+ matches(infer([1e6, 2e6, 3e6]), int32());
+ matches(infer([1n, 2n, 3n]), int64());
- matches(inferType([-1, 2, 3]), int8());
- matches(inferType([-1e3, 2e3, 3e3]), int16());
- matches(inferType([-1e6, 2e6, 3e6]), int32());
- matches(inferType([-1n, 2n, 3n]), int64());
+ matches(infer([-1, 2, 3]), int8());
+ matches(infer([-1e3, 2e3, 3e3]), int16());
+ matches(infer([-1e6, 2e6, 3e6]), int32());
+ matches(infer([-1n, 2n, 3n]), int64());
- matches(inferType([1, 2, null, undefined, 3]), int8());
- matches(inferType([1e3, 2e3, null, undefined, 3e3]), int16());
- matches(inferType([1e6, 2e6, null, undefined, 3e6]), int32());
- matches(inferType([1n, 2n, null, undefined, 3n]), int64());
+ matches(infer([1, 2, null, undefined, 3]), int8());
+ matches(infer([1e3, 2e3, null, undefined, 3e3]), int16());
+ matches(infer([1e6, 2e6, null, undefined, 3e6]), int32());
+ matches(infer([1n, 2n, null, undefined, 3n]), int64());
});
it('infers float types', () => {
- matches(inferType([1.1, 2.2, 3.3]), float64());
- matches(inferType([-1.1, 2.2, 3.3]), float64());
- matches(inferType([1, 2, 3.3]), float64());
- matches(inferType([1, 2, NaN]), float64());
- matches(inferType([NaN, null, undefined, NaN]), float64());
- matches(inferType([Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER]), float64());
+ matches(infer([1.1, 2.2, 3.3]), float64());
+ matches(infer([-1.1, 2.2, 3.3]), float64());
+ matches(infer([1, 2, 3.3]), float64());
+ matches(infer([1, 2, NaN]), float64());
+ matches(infer([NaN, null, undefined, NaN]), float64());
+ matches(infer([Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER]), float64());
});
it('infers utf8 dictionary types', () => {
const type = dictionary(utf8(), int32());
- matches(inferType(['foo', 'bar', 'baz']), type);
- matches(inferType(['foo', 'bar', null, undefined, 'baz']), type);
+ matches(infer(['foo', 'bar', 'baz']), type);
+ matches(infer(['foo', 'bar', null, undefined, 'baz']), type);
});
it('infers bool types', () => {
- matches(inferType([true, false, true]), bool());
- matches(inferType([true, false, null, undefined, true]), bool());
+ matches(infer([true, false, true]), bool());
+ matches(infer([true, false, null, undefined, true]), bool());
});
it('infers date day types', () => {
- matches(inferType([
+ matches(infer([
new Date(Date.UTC(2000, 1, 2)),
new Date(Date.UTC(2006, 3, 20)),
null,
@@ -55,7 +59,7 @@ describe('inferType', () => {
it('infers timestamp types', () => {
matches(
- inferType([
+ infer([
new Date(Date.UTC(2000, 1, 2)),
new Date(Date.UTC(2006, 3, 20)),
null,
@@ -67,14 +71,14 @@ describe('inferType', () => {
});
it('infers list types', () => {
- matches(inferType([[1, 2], [3, 4]]), list(int8()));
- matches(inferType([[true, null, false], null, undefined, [false, undefined, true]]), list(bool()));
- matches(inferType([['foo', 'bar', null], null, ['bar', 'baz']]), list(dictionary(utf8(), int32())));
+ matches(infer([[1, 2], [3, 4]]), list(int8()));
+ matches(infer([[true, null, false], null, undefined, [false, undefined, true]]), list(bool()));
+ matches(infer([['foo', 'bar', null], null, ['bar', 'baz']]), list(dictionary(utf8(), int32())));
});
it('infers struct types', () => {
matches(
- inferType([
+ infer([
{ foo: 1, bar: [1.1, 2.2] },
{ foo: null, bar: [2.2, null, 3.3] },
null,
@@ -86,10 +90,10 @@ describe('inferType', () => {
});
it('throws on bigints that exceed 64 bits', () => {
- assert.throws(() => inferType([(1n << 200n)]));
+ assert.throws(() => infer([(1n << 200n)]));
});
it('throws on mixed types', () => {
- assert.throws(() => inferType([1, true, 'foo']));
+ assert.throws(() => infer([1, true, 'foo']));
});
});
diff --git a/test/table-to-ipc-test.js b/test/table-to-ipc-test.js
index 8281a57..b5cb697 100644
--- a/test/table-to-ipc-test.js
+++ b/test/table-to-ipc-test.js
@@ -32,7 +32,6 @@ function testEncode(bytes) {
// ensure complete schema, override version
const schema = {
- dictionaryTypes: new Map,
endianness: 0,
metadata: null,
...table.schema,
diff --git a/test/util/decimal.js b/test/util/decimal.js
index 080730d..801237e 100644
--- a/test/util/decimal.js
+++ b/test/util/decimal.js
@@ -11,8 +11,7 @@ export function decimalDataToEncode() {
nullable: true,
metadata: null
}],
- metadata: null,
- dictionaryTypes: new Map
+ metadata: null
},
records: [{
length: 3,