Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RunEndEncoded type support. #10

Merged
merged 2 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ In the process of developing multiple data analysis packages that consume Arrow

* _Speed_. Flechette provides faster decoding. Across varied datasets, initial performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, and 5-9x faster row object extraction.

* _Size_. Flechette is ~16k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.
* _Size_. Flechette is ~17k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.

* _Coverage_. Flechette supports multiple data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion and support for month/day/nanosecond time intervals (as used, for example, by DuckDB).
* _Coverage_. Flechette supports data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), list views, and run-end encoded data.

* _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data.

Expand Down
23 changes: 20 additions & 3 deletions src/batch.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { float64 } from './array-types.js';
import { decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';
import { bisect, decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';

/**
* Check if the input is a batch that supports direct access to
Expand Down Expand Up @@ -548,7 +548,7 @@ export class ListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {number} */ (this.offsets[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
return this.children[0].slice(a, b);
}
}
Expand All @@ -567,7 +567,7 @@ export class LargeListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {bigint} */ (this.offsets[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
return this.children[0].slice(toNumber(a), toNumber(b));
}
}
Expand Down Expand Up @@ -762,6 +762,23 @@ export class StructBatch extends ArrayBatch {
}
}

/**
* A batch of run-end-encoded values.
* @template T
* @extends {ArrayBatch<T>}
*/
export class RunEndEncodedBatch extends ArrayBatch {
  /**
   * Look up the decoded value for a given row index.
   * @param {number} index The value index.
   */
  value(index) {
    const [runEnds, vals] = this.children;
    // Binary search the run-ends child to find which run contains this
    // row index, then return the matching entry of the values child.
    const run = bisect(
      /** @type {import('./types.js').IntegerArray} */ (runEnds.values),
      index
    );
    return vals.at(run);
  }
}

/**
* A batch of dictionary-encoded values.
* @template T
Expand Down
4 changes: 2 additions & 2 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { isDirectBatch } from './batch.js';
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* Build up a column from batches.
Expand Down Expand Up @@ -107,7 +107,7 @@ export class Column {
// NOTE: if there is only one batch, this method is replaced with an
// optimized version in the Column constructor.
const { data, offsets } = this;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return data[i]?.at(index - offsets[i]); // undefined if out of range
}

Expand Down
6 changes: 0 additions & 6 deletions src/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,6 @@ export const Type = /** @type {const} */ ({
* must be a 16/32/64-bit integer array; each entry encodes the index at
* which the run for the corresponding entry in the values child array
* ends. Like list/struct types, the value array can be of any type.
*
* Not currently supported by Flechette.
*/
RunEndEncoded: 22,
/**
Expand Down Expand Up @@ -276,15 +274,11 @@ export const Type = /** @type {const} */ ({
* Represents the same logical types that List can, but contains offsets and
* sizes allowing for writes in any order and sharing of child values among
* list values.
*
* Not currently supported by Flechette.
*/
ListView: 25,
/**
* Same as ListView, but with 64-bit offsets and sizes, allowing
* representation of extremely large data values.
*
* Not currently supported by Flechette.
*/
LargeListView: 26
});
Expand Down
2 changes: 2 additions & 0 deletions src/decode/data-type.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ export function decodeDataType(buf, index, typeId, children) {
case Type.LargeListView:
return { typeId, children: [children?.[0]], offsets: int64 };
case Type.Struct:
case Type.RunEndEncoded:
// @ts-ignore
return { typeId, children };
case Type.Int:
return decodeInt(buf, index);
Expand Down
7 changes: 7 additions & 0 deletions src/table-from-ipc.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
MapBatch,
MapEntryBatch,
NullBatch,
RunEndEncodedBatch,
SparseUnionBatch,
StructBatch,
TimestampMicrosecondBatch,
Expand Down Expand Up @@ -234,6 +235,12 @@ function visit(type, ctx) {
names: type.children.map(child => child.name)
});

// children only
case Type.RunEndEncoded: return new RunEndEncodedBatch({
...node,
children: ctx.visitAll(type.children)
});

// dictionary
case Type.Dictionary: {
const { id, keys } = type;
Expand Down
4 changes: 2 additions & 2 deletions src/table.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* A table consists of a collection of named columns (or 'children').
Expand Down Expand Up @@ -151,7 +151,7 @@ export class Table {
const { names, children, numRows } = this;
if (index < 0 || index >= numRows) return null;
const [{ offsets }] = children;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return rowObject(names, children, i, index - offsets[i]);
}

Expand Down
4 changes: 4 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ export type LargeUtf8Type = { typeId: 20, offsets: BigInt64ArrayConstructor };
/** List data type with 64-bit integer offsets for larger data. */
export type LargeListType = { typeId: 21, children: [Field], offsets: BigInt64ArrayConstructor };

/** RunEndEncoded data type. */
export type RunEndEncodedType = { typeId: 22, children: [Field, Field] };

/** ListView data type. */
export type ListViewType = { typeId: 25, children: [Field], offsets: Int32ArrayConstructor };

Expand Down Expand Up @@ -221,6 +224,7 @@ export type DataType =
| LargeBinaryType
| LargeUtf8Type
| LargeListType
| RunEndEncodedType
| ListViewType
| LargeListViewType
| DictionaryType;
Expand Down
37 changes: 22 additions & 15 deletions src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,32 @@ export function divide(num, div) {

/**
* Determine the correct index into an offset array for a given
* full column row index.
* @param {Int32Array} offsets The offsets array.
* full column row index. Assumes offset indices can be manipulated
* as 32-bit signed integers.
* @param {import("./types.js").IntegerArray} offsets The offsets array.
* @param {number} index The full column row index.
*/
export function bisectOffsets(offsets, index) {
// binary search for batch index
// we use a fast unsigned bit shift for division by two
// this assumes offsets.length <= Math.pow(2, 31), which seems safe
// otherwise that is a whole lotta record batches to handle in JS...
export function bisect(offsets, index) {
  // Upper-bound binary search: return the first position whose offset
  // value exceeds the given row index. The two loop variants below are
  // identical except for how the midpoint is computed.
  let lo = 0;
  let hi = offsets.length;
  if (hi <= 2147483648) { // 2 ** 31
    // Fast path: the array length fits within a 32-bit signed integer,
    // so an unsigned right shift safely halves (lo + hi).
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (offsets[mid] <= index) lo = mid + 1;
      else hi = mid;
    }
  } else {
    // Slow path: lengths beyond 2 ** 31 would overflow 32-bit shift
    // arithmetic, so divide and truncate instead.
    while (lo < hi) {
      const mid = Math.trunc((lo + hi) / 2);
      if (offsets[mid] <= index) lo = mid + 1;
      else hi = mid;
    }
  }
  return lo;
}

// -- flatbuffer utilities -----
Expand Down
Binary file added test/data/runendencoded.arrows
Binary file not shown.
Binary file added test/data/runendencoded64.arrows
Binary file not shown.
20 changes: 20 additions & 0 deletions test/table-from-ipc-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,26 @@ describe('tableFromIPC', () => {
await valueTest([ {a: ['a', 'b'], b: Math.E}, {a: ['c', 'd'], b: Math.PI} ]);
});

it('decodes run-end-encoded data', async () => {
  // Decode an IPC stream whose 'value' column is run-end encoded.
  const bytes = new Uint8Array(await readFile(`test/data/runendencoded.arrows`));
  const column = tableFromIPC(bytes).getChild('value');
  // Inspect the raw encoded batch: run-end indices and distinct values.
  const [batch] = column.data;
  const [runs, vals] = batch.children;
  assert.deepStrictEqual([...runs], [2, 3, 4, 6, 8, 9]);
  assert.deepStrictEqual([...vals], ['foo', null, 'bar', 'baz', null, 'foo']);
  // The decoded column should expand the runs back to per-row values.
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes run-end-encoded data with 64-bit run ends', async () => {
  // Same as above, but the run-ends child uses 64-bit integers, so the
  // decoder is asked to surface them as bigint values.
  const bytes = new Uint8Array(await readFile(`test/data/runendencoded64.arrows`));
  const column = tableFromIPC(bytes, { useBigInt: true }).getChild('value');
  const [batch] = column.data;
  const [runs, vals] = batch.children;
  assert.deepStrictEqual([...runs], [2n, 3n, 4n, 6n, 8n, 9n]);
  assert.deepStrictEqual([...vals], ['foo', null, 'bar', 'baz', null, 'foo']);
  // Decoded per-row values are identical to the 32-bit run-end case.
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes empty data', async () => {
// For empty result sets, DuckDB node only returns a zero byte
// Other variants may include a schema message
Expand Down