Skip to content

Commit

Permalink
Add random access for row objects, get methods for compatibility (#5)
Browse files Browse the repository at this point in the history
* feat: Add get methods, inc. row object.

* test: Add random access perf tests.
  • Loading branch information
jheer authored Aug 14, 2024
1 parent cae4e17 commit c8ec41f
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 45 deletions.
15 changes: 15 additions & 0 deletions perf/perf-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,20 @@ function iterateValues(table) {
names.forEach(name => Array.from(table.getChild(name)));
}

// Benchmark body: look up every cell of the table by (column, row) index.
// Each lookup goes through `col.at(i)`; with multiple record batches this
// is slower, due to the need for binary search over the offsets array.
function randomAccess(table) {
  const { numRows: rowCount, numCols: colCount } = table;
  // one slot per column; holds the value most recently read from that column
  const sink = Array(colCount);
  let c = 0;
  while (c < colCount) {
    const column = table.getChildAt(c);
    let r = 0;
    while (r < rowCount) {
      sink[c] = column.at(r);
      r += 1;
    }
    c += 1;
  }
}

// generate row objects, access each property
function visitObjects(table) {
const nr = table.numRows;
Expand Down Expand Up @@ -58,6 +72,7 @@ async function run(file) {
trial('Parse Table from IPC', file, bytes, parseIPC, 10);
trial('Extract Arrays', file, bytes, extractArrays, 10);
trial('Iterate Values', file, bytes, iterateValues, 10);
trial('Random Access', file, bytes, randomAccess, 10);
trial('Visit Row Objects', file, bytes, visitObjects, 5);
console.log();
}
Expand Down
27 changes: 14 additions & 13 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { isDirectBatch } from './batch.js';
import { bisectOffsets } from './util.js';

/**
* Build up a column from batches.
Expand Down Expand Up @@ -99,25 +100,25 @@ export class Column {
* lookup less efficient than a standard array access. If making a full
* scan of a column, consider extracting arrays via `toArray()` or using an
* iterator (`for (const value of column) {...}`).
* @param {number} index The index
* @param {number} index The row index.
* @returns {T | null} The value.
*/
at(index) {
// NOTE: if there is only one batch, this method is replaced with an
// optimized version within the Column constructor.
// optimized version in the Column constructor.
const { data, offsets } = this;
const i = bisectOffsets(offsets, index);
return data[i]?.at(index - offsets[i]); // undefined if out of range
}

// binary search for batch index
let a = 0;
let b = offsets.length;
do {
const mid = (a + b) >>> 1;
if (offsets[mid] <= index) a = mid + 1;
else b = mid;
} while (a < b);

// returns undefined if index is out of range
return data[--a]?.at(index - offsets[a]);
/**
* Return the column value at the given index. This method is the same as
* `at()` and is provided for better compatibility with Apache Arrow JS.
* It adds no logic of its own: the index is passed to `at()` unchanged and
* the result of `at()` is returned unchanged.
* @param {number} index The row index.
* @returns {T | null} The value.
*/
get(index) {
// pure alias: forward to at() so both entry points always behave the same
return this.at(index);
}

/**
Expand Down
70 changes: 43 additions & 27 deletions src/table.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { bisectOffsets } from './util.js';

/**
* A table consists of a collection of named columns (or 'children').
* To work with table data directly in JavaScript, use `toColumns()`
Expand Down Expand Up @@ -110,42 +112,57 @@ export class Table {
return cols;
}

/**
* Return an array of objects representing the rows of this table.
* @returns {Record<string, any>[]}
*/
toArray() {
const { children, numRows, names } = this;
const data = children[0]?.data ?? [];
const output = Array(numRows);
for (let b = 0, row = -1; b < data.length; ++b) {
for (let i = 0; i < data[b].length; ++i) {
output[++row] = rowObject(names, children, b, i);
}
}
return output;
}

/**
* Return an iterator over objects representing the rows of this table.
* @returns {Generator<Record<string, any>, any, null>}
*/
*[Symbol.iterator]() {
const { children, names } = this;
const batches = children[0]?.data.length ?? 0;
// for each batch...
for (let b = 0; b < batches; ++b) {
const data = children.map(c => c.data[b]);
const rows = data[0].length;
// for each row...
for (let i = 0; i < rows; ++i) {
yield rowObject(names, data, i);
const data = children[0]?.data ?? [];
for (let b = 0; b < data.length; ++b) {
for (let i = 0; i < data[b].length; ++i) {
yield rowObject(names, children, b, i);
}
}
}

/**
* Return an array of objects representing the rows of this table.
* @returns {Record<string, any>[]}
* Return a row object for the given index.
* @param {number} index The row index.
* @returns {Record<string, any>} The row object.
*/
toArray() {
const { children, numRows, names } = this;
const batches = children[0]?.data.length ?? 0;
const output = Array(numRows);
// for each batch...
for (let b = 0, row = -1; b < batches; ++b) {
const data = children.map(c => c.data[b]);
const rows = data?.[0].length;
// for each row...
for (let i = 0; i < rows; ++i) {
output[++row] = rowObject(names, data, i);
}
}
return output;
at(index) {
const { names, children, numRows } = this;
if (index < 0 || index >= numRows) return null;
const [{ offsets }] = children;
const i = bisectOffsets(offsets, index);
return rowObject(names, children, i, index - offsets[i]);
}

/**
* Return a row object for the given index. This method is the same as
* `at()` and is provided for better compatibility with Apache Arrow JS.
* It adds no logic of its own: the index is passed to `at()` unchanged and
* the result of `at()` is returned unchanged.
* @param {number} index The row index.
* @returns {Record<string, any> | null} The result of `at(index)`: the row
*  object, or `null` when `at()` rejects the index as out of range.
*/
get(index) {
// pure alias: forward to at() so both entry points always behave the same
return this.at(index);
}
}

Expand All @@ -155,11 +172,10 @@ function renameField(field, name) {
: field;
}

function rowObject(names, data, index) {
function rowObject(names, children, batch, index) {
const o = {};
// for each column...
for (let j = 0; j < names.length; ++j) {
o[names[j]] = data[j].at(index);
o[names[j]] = children[j].data[batch].at(index);
}
return o;
}
23 changes: 23 additions & 0 deletions src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,29 @@ export function divide(num, div) {
return toNumber(num / div) + toNumber(num % div) / toNumber(div);
}

/**
 * Map a full-column row index to a position in an offsets array.
 * The result is the position of the last entry that is less than or
 * equal to the row index (offsets are assumed to be sorted ascending).
 * @param {Int32Array} offsets The offsets array.
 * @param {number} index The full column row index.
 * @returns {number} The offsets array index, or -1 if no entry is less
 *  than or equal to the row index.
 */
export function bisectOffsets(offsets, index) {
  // Binary search over the half-open range [lo, hi).
  // The midpoint uses an unsigned right shift as a fast divide-by-two,
  // which assumes offsets.length <= Math.pow(2, 31). That seems safe:
  // anything larger is far more record batches than JS should handle.
  let lo = 0;
  let hi = offsets.length;
  do {
    const mid = (lo + hi) >>> 1;
    const atOrBefore = offsets[mid] <= index;
    if (atOrBefore) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  } while (lo < hi);

  // lo is one past the last entry <= index; step back to that entry
  return lo - 1;
}

// -- flatbuffer utilities -----

/**
Expand Down
24 changes: 19 additions & 5 deletions test/table-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,42 @@ const values = [
const table = tableFromIPC(await arrowFromDuckDB(values));

describe('Table', () => {
it('provides row count', async () => {
it('provides row count', () => {
assert.deepStrictEqual(table.numRows, 3);
});

it('provides column count', async () => {
it('provides column count', () => {
assert.deepStrictEqual(table.numCols, 1);
});

it('provides child column accessors', async () => {
it('provides child column accessors', () => {
const col = table.getChild('value');
assert.strictEqual(col, table.getChildAt(0));
assert.deepStrictEqual(col.toArray(), values);
});

it('provides object array', async () => {
it('provides object array', () => {
assert.deepStrictEqual(table.toArray(), values.map(value => ({ value })));
});

it('provides column array map', async () => {
it('provides column array map', () => {
assert.deepStrictEqual(table.toColumns(), { value: values });
});

it('provides random access via at/get', () => {
const idx = [0, 1, 2];

// table object random access
const obj = values.map(value => ({ value }));
assert.deepStrictEqual(idx.map(i => table.at(i)), obj);
assert.deepStrictEqual(idx.map(i => table.get(i)), obj);

// column value random access
const col = table.getChildAt(0);
assert.deepStrictEqual(idx.map(i => col.at(i)), values);
assert.deepStrictEqual(idx.map(i => col.get(i)), values);
});

it('provides select by index', async () => {
const sel = table.selectAt([0, 0]);
const col = table.getChild('value');
Expand Down

0 comments on commit c8ec41f

Please sign in to comment.