Skip to content

Commit

Permalink
Add RunEndEncoded type support. (#10)
Browse files Browse the repository at this point in the history
* feat: Add RunEndEncoded type support.

* docs: Update comments regarding type support.
  • Loading branch information
jheer authored Aug 23, 2024
1 parent 8a7002a commit 4a1bc1c
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 30 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ In the process of developing multiple data analysis packages that consume Arrow

* _Speed_. Flechette provides faster decoding. Across varied datasets, initial performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, and 5-9x faster row object extraction.

* _Size_. Flechette is ~16k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.
* _Size_. Flechette is ~17k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.

* _Coverage_. Flechette supports multiple data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion and support for month/day/nanosecond time intervals (as used, for example, by DuckDB).
* _Coverage_. Flechette supports data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), list views, and run-end encoded data.

* _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data.

Expand Down
23 changes: 20 additions & 3 deletions src/batch.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { float64 } from './array-types.js';
import { decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';
import { bisect, decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';

/**
* Check if the input is a batch that supports direct access to
Expand Down Expand Up @@ -548,7 +548,7 @@ export class ListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {number} */ (this.offsets[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
return this.children[0].slice(a, b);
}
}
Expand All @@ -567,7 +567,7 @@ export class LargeListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {bigint} */ (this.offsets[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
return this.children[0].slice(toNumber(a), toNumber(b));
}
}
Expand Down Expand Up @@ -762,6 +762,23 @@ export class StructBatch extends ArrayBatch {
}
}

/**
 * A batch of run-end-encoded values. The first child batch carries the
 * integer run ends (the exclusive end index of each run); the second
 * child batch carries the corresponding run values.
 * @template T
 * @extends {ArrayBatch<T>}
 */
export class RunEndEncodedBatch extends ArrayBatch {
  /**
   * @param {number} index The value index.
   */
  value(index) {
    // Binary-search the run ends to find which run contains this index,
    // then look up that run's value in the values child.
    const runEnds = /** @type {import('./types.js').IntegerArray} */ (
      this.children[0].values
    );
    const run = bisect(runEnds, index);
    return this.children[1].at(run);
  }
}

/**
* A batch of dictionary-encoded values.
* @template T
Expand Down
4 changes: 2 additions & 2 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { isDirectBatch } from './batch.js';
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* Build up a column from batches.
Expand Down Expand Up @@ -107,7 +107,7 @@ export class Column {
// NOTE: if there is only one batch, this method is replaced with an
// optimized version in the Column constructor.
const { data, offsets } = this;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return data[i]?.at(index - offsets[i]); // undefined if out of range
}

Expand Down
6 changes: 0 additions & 6 deletions src/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,6 @@ export const Type = /** @type {const} */ ({
* must be a 16/32/64-bit integer array which encodes the indices at which
* the run with the value in each corresponding index in the values child
* array ends. Like list/struct types, the value array can be of any type.
*
* Not currently supported by Flechette.
*/
RunEndEncoded: 22,
/**
Expand Down Expand Up @@ -276,15 +274,11 @@ export const Type = /** @type {const} */ ({
* Represents the same logical types that List can, but contains offsets and
* sizes allowing for writes in any order and sharing of child values among
* list values.
*
* Not currently supported by Flechette.
*/
ListView: 25,
/**
* Same as ListView, but with 64-bit offsets and sizes, allowing to represent
* extremely large data values.
*
* Not currently supported by Flechette.
*/
LargeListView: 26
});
Expand Down
2 changes: 2 additions & 0 deletions src/decode/data-type.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ export function decodeDataType(buf, index, typeId, children) {
case Type.LargeListView:
return { typeId, children: [children?.[0]], offsets: int64 };
case Type.Struct:
case Type.RunEndEncoded:
// @ts-ignore
return { typeId, children };
case Type.Int:
return decodeInt(buf, index);
Expand Down
7 changes: 7 additions & 0 deletions src/table-from-ipc.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
MapBatch,
MapEntryBatch,
NullBatch,
RunEndEncodedBatch,
SparseUnionBatch,
StructBatch,
TimestampMicrosecondBatch,
Expand Down Expand Up @@ -234,6 +235,12 @@ function visit(type, ctx) {
names: type.children.map(child => child.name)
});

// children only
case Type.RunEndEncoded: return new RunEndEncodedBatch({
...node,
children: ctx.visitAll(type.children)
});

// dictionary
case Type.Dictionary: {
const { id, keys } = type;
Expand Down
4 changes: 2 additions & 2 deletions src/table.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* A table consists of a collection of named columns (or 'children').
Expand Down Expand Up @@ -151,7 +151,7 @@ export class Table {
const { names, children, numRows } = this;
if (index < 0 || index >= numRows) return null;
const [{ offsets }] = children;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return rowObject(names, children, i, index - offsets[i]);
}

Expand Down
4 changes: 4 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ export type LargeUtf8Type = { typeId: 20, offsets: BigInt64ArrayConstructor };
/** List data type with 64-bit integer offsets for larger data. */
export type LargeListType = { typeId: 21, children: [Field], offsets: BigInt64ArrayConstructor };

/** RunEndEncoded data type. */
export type RunEndEncodedType = { typeId: 22, children: [Field, Field] };

/** ListView data type. */
export type ListViewType = { typeId: 25, children: [Field], offsets: Int32ArrayConstructor };

Expand Down Expand Up @@ -221,6 +224,7 @@ export type DataType =
| LargeBinaryType
| LargeUtf8Type
| LargeListType
| RunEndEncodedType
| ListViewType
| LargeListViewType
| DictionaryType;
Expand Down
37 changes: 22 additions & 15 deletions src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,32 @@ export function divide(num, div) {

/**
 * Determine the correct index into an offset array for a given
 * full column row index. Performs a binary search and returns the
 * upper bound: the number of offset entries that are <= index.
 * Callers locating a containing batch subtract one from the result;
 * run-end lookups use the result directly. Assumes offset indices
 * can be manipulated as 32-bit signed integers when the array
 * length permits.
 * @param {import("./types.js").IntegerArray} offsets The offsets array.
 * @param {number} index The full column row index.
 * @returns {number} The upper-bound position within the offsets array.
 */
export function bisect(offsets, index) {
  let a = 0;
  let b = offsets.length;
  if (b <= 2147483648) { // 2 ** 31
    // fast version, use unsigned bit shift
    // array length fits within 32-bit signed integer
    do {
      const mid = (a + b) >>> 1;
      if (offsets[mid] <= index) a = mid + 1;
      else b = mid;
    } while (a < b);
  } else {
    // slow version, use division and truncate
    // array length exceeds 32-bit signed integer
    do {
      const mid = Math.trunc((a + b) / 2);
      if (offsets[mid] <= index) a = mid + 1;
      else b = mid;
    } while (a < b);
  }
  return a;
}

// -- flatbuffer utilities -----
Expand Down
Binary file added test/data/runendencoded.arrows
Binary file not shown.
Binary file added test/data/runendencoded64.arrows
Binary file not shown.
20 changes: 20 additions & 0 deletions test/table-from-ipc-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,26 @@ describe('tableFromIPC', () => {
await valueTest([ {a: ['a', 'b'], b: Math.E}, {a: ['c', 'd'], b: Math.PI} ]);
});

it('decodes run-end-encoded data', async () => {
  // Load a REE-typed IPC stream and check both the raw run/value
  // children and the logically expanded column values.
  const bytes = new Uint8Array(await readFile('test/data/runendencoded.arrows'));
  const column = tableFromIPC(bytes).getChild('value');
  const [{ children: [runs, vals] }] = column.data;
  assert.deepStrictEqual(Array.from(runs), [2, 3, 4, 6, 8, 9]);
  assert.deepStrictEqual(Array.from(vals), ['foo', null, 'bar', 'baz', null, 'foo']);
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes run-end-encoded data with 64-bit run ends', async () => {
  // Same as above, but run ends are 64-bit; with useBigInt enabled the
  // raw run ends surface as bigints while decoded values are unchanged.
  const bytes = new Uint8Array(await readFile('test/data/runendencoded64.arrows'));
  const column = tableFromIPC(bytes, { useBigInt: true }).getChild('value');
  const [{ children: [runs, vals] }] = column.data;
  assert.deepStrictEqual(Array.from(runs), [2n, 3n, 4n, 6n, 8n, 9n]);
  assert.deepStrictEqual(Array.from(vals), ['foo', null, 'bar', 'baz', null, 'foo']);
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes empty data', async () => {
// For empty result sets, DuckDB node only returns a zero byte
// Other variants may include a schema message
Expand Down

0 comments on commit 4a1bc1c

Please sign in to comment.