Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RunEndEncoded type support. #10

Merged
merged 2 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ In the process of developing multiple data analysis packages that consume Arrow

* _Speed_. Flechette provides faster decoding. Across varied datasets, initial performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, and 5-9x faster row object extraction.

* _Size_. Flechette is ~16k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.
* _Size_. Flechette is ~17k minified (~6k gzip'd), versus 163k minified (~43k gzip'd) for Arrow JS.

* _Coverage_. Flechette supports multiple data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion and support for month/day/nanosecond time intervals (as used, for example, by DuckDB).
* _Coverage_. Flechette supports data types unsupported by the reference implementation at the time of writing, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB, for example), list views, and run-end encoded data.

* _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data.

Expand Down
23 changes: 20 additions & 3 deletions src/batch.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { float64 } from './array-types.js';
import { decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';
import { bisect, decodeBit, decodeUtf8, divide, readInt32, readInt64AsNum, toNumber } from './util.js';

/**
* Check if the input is a batch that supports direct access to
Expand Down Expand Up @@ -548,7 +548,7 @@ export class ListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {number} */ (this.offsets[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
const b = a + /** @type {number} */ (this.sizes[index]);
return this.children[0].slice(a, b);
}
}
Expand All @@ -567,7 +567,7 @@ export class LargeListViewBatch extends ArrayBatch {
*/
value(index) {
const a = /** @type {bigint} */ (this.offsets[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
const b = a + /** @type {bigint} */ (this.sizes[index]);
return this.children[0].slice(toNumber(a), toNumber(b));
}
}
Expand Down Expand Up @@ -762,6 +762,23 @@ export class StructBatch extends ArrayBatch {
}
}

/**
* A batch of run-end-encoded values.
* @template T
* @extends {ArrayBatch<T>}
*/
export class RunEndEncodedBatch extends ArrayBatch {
  /**
   * Look up the decoded value for a given row index.
   * @param {number} index The value index.
   */
  value(index) {
    const [runEnds, vals] = this.children;
    // Binary search the run-ends child to find which run contains this
    // row index, then return the matching entry of the values child.
    const run = bisect(
      /** @type {import('./types.js').IntegerArray} */ (runEnds.values),
      index
    );
    return vals.at(run);
  }
}

/**
* A batch of dictionary-encoded values.
* @template T
Expand Down
4 changes: 2 additions & 2 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { isDirectBatch } from './batch.js';
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* Build up a column from batches.
Expand Down Expand Up @@ -107,7 +107,7 @@ export class Column {
// NOTE: if there is only one batch, this method is replaced with an
// optimized version in the Column constructor.
const { data, offsets } = this;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return data[i]?.at(index - offsets[i]); // undefined if out of range
}

Expand Down
6 changes: 0 additions & 6 deletions src/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,6 @@ export const Type = /** @type {const} */ ({
* must be a 16/32/64-bit integer array; each entry encodes the index at
* which the run for the corresponding entry in the values child array
* ends. Like list/struct types, the value array can be of any type.
*
* Not currently supported by Flechette.
*/
RunEndEncoded: 22,
/**
Expand Down Expand Up @@ -276,15 +274,11 @@ export const Type = /** @type {const} */ ({
* Represents the same logical types that List can, but contains offsets and
* sizes allowing for writes in any order and sharing of child values among
* list values.
*
* Not currently supported by Flechette.
*/
ListView: 25,
/**
* Same as ListView, but with 64-bit offsets and sizes, allowing
* representation of extremely large data values.
*
* Not currently supported by Flechette.
*/
LargeListView: 26
});
Expand Down
2 changes: 2 additions & 0 deletions src/decode/data-type.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ export function decodeDataType(buf, index, typeId, children) {
case Type.LargeListView:
return { typeId, children: [children?.[0]], offsets: int64 };
case Type.Struct:
case Type.RunEndEncoded:
// @ts-ignore
return { typeId, children };
case Type.Int:
return decodeInt(buf, index);
Expand Down
7 changes: 7 additions & 0 deletions src/table-from-ipc.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
MapBatch,
MapEntryBatch,
NullBatch,
RunEndEncodedBatch,
SparseUnionBatch,
StructBatch,
TimestampMicrosecondBatch,
Expand Down Expand Up @@ -234,6 +235,12 @@ function visit(type, ctx) {
names: type.children.map(child => child.name)
});

// children only
case Type.RunEndEncoded: return new RunEndEncodedBatch({
...node,
children: ctx.visitAll(type.children)
});

// dictionary
case Type.Dictionary: {
const { id, keys } = type;
Expand Down
4 changes: 2 additions & 2 deletions src/table.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { bisectOffsets } from './util.js';
import { bisect } from './util.js';

/**
* A table consists of a collection of named columns (or 'children').
Expand Down Expand Up @@ -151,7 +151,7 @@ export class Table {
const { names, children, numRows } = this;
if (index < 0 || index >= numRows) return null;
const [{ offsets }] = children;
const i = bisectOffsets(offsets, index);
const i = bisect(offsets, index) - 1;
return rowObject(names, children, i, index - offsets[i]);
}

Expand Down
4 changes: 4 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ export type LargeUtf8Type = { typeId: 20, offsets: BigInt64ArrayConstructor };
/** List data type with 64-bit integer offsets for larger data. */
export type LargeListType = { typeId: 21, children: [Field], offsets: BigInt64ArrayConstructor };

/** RunEndEncoded data type. */
export type RunEndEncodedType = { typeId: 22, children: [Field, Field] };

/** ListView data type. */
export type ListViewType = { typeId: 25, children: [Field], offsets: Int32ArrayConstructor };

Expand Down Expand Up @@ -221,6 +224,7 @@ export type DataType =
| LargeBinaryType
| LargeUtf8Type
| LargeListType
| RunEndEncodedType
| ListViewType
| LargeListViewType
| DictionaryType;
Expand Down
37 changes: 22 additions & 15 deletions src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,32 @@ export function divide(num, div) {

/**
* Determine the correct index into an offset array for a given
* full column row index.
* @param {Int32Array} offsets The offsets array.
* full column row index. Assumes offset indices can be manipulated
* as 32-bit signed integers.
* @param {import("./types.js").IntegerArray} offsets The offsets array.
* @param {number} index The full column row index.
*/
export function bisectOffsets(offsets, index) {
// binary search for batch index
// we use a fast unsigned bit shift for division by two
// this assumes offsets.length <= Math.pow(2, 31), which seems safe
// otherwise that is a whole lotta record batches to handle in JS...
export function bisect(offsets, index) {
  // Upper-bound binary search: return the first position whose offset
  // value exceeds the given row index. The two loop variants below are
  // identical except for how the midpoint is computed.
  let lo = 0;
  let hi = offsets.length;
  if (hi <= 2147483648) { // 2 ** 31
    // Fast path: the array length fits within a 32-bit signed integer,
    // so an unsigned right shift safely halves (lo + hi).
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (offsets[mid] <= index) lo = mid + 1;
      else hi = mid;
    }
  } else {
    // Slow path: lengths beyond 2 ** 31 would overflow 32-bit shift
    // arithmetic, so divide and truncate instead.
    while (lo < hi) {
      const mid = Math.trunc((lo + hi) / 2);
      if (offsets[mid] <= index) lo = mid + 1;
      else hi = mid;
    }
  }
  return lo;
}

// -- flatbuffer utilities -----
Expand Down
Binary file added test/data/runendencoded.arrows
Binary file not shown.
Binary file added test/data/runendencoded64.arrows
Binary file not shown.
20 changes: 20 additions & 0 deletions test/table-from-ipc-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,26 @@ describe('tableFromIPC', () => {
await valueTest([ {a: ['a', 'b'], b: Math.E}, {a: ['c', 'd'], b: Math.PI} ]);
});

it('decodes run-end-encoded data', async () => {
  // Decode an IPC stream whose 'value' column is run-end encoded.
  const bytes = new Uint8Array(await readFile(`test/data/runendencoded.arrows`));
  const column = tableFromIPC(bytes).getChild('value');
  // Inspect the raw encoded batch: run-end indices and distinct values.
  const [batch] = column.data;
  const [runs, vals] = batch.children;
  assert.deepStrictEqual([...runs], [2, 3, 4, 6, 8, 9]);
  assert.deepStrictEqual([...vals], ['foo', null, 'bar', 'baz', null, 'foo']);
  // The decoded column should expand the runs back to per-row values.
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes run-end-encoded data with 64-bit run ends', async () => {
  // Same as above, but the run-ends child uses 64-bit integers, so the
  // decoder is asked to surface them as bigint values.
  const bytes = new Uint8Array(await readFile(`test/data/runendencoded64.arrows`));
  const column = tableFromIPC(bytes, { useBigInt: true }).getChild('value');
  const [batch] = column.data;
  const [runs, vals] = batch.children;
  assert.deepStrictEqual([...runs], [2n, 3n, 4n, 6n, 8n, 9n]);
  assert.deepStrictEqual([...vals], ['foo', null, 'bar', 'baz', null, 'foo']);
  // Decoded per-row values are identical to the 32-bit run-end case.
  compare(column, ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo']);
});

it('decodes empty data', async () => {
// For empty result sets, DuckDB node only returns a zero byte
// Other variants may include a schema message
Expand Down