diff --git a/perf/perf-test.js b/perf/perf-test.js index 9582ef4..e05b8ec 100644 --- a/perf/perf-test.js +++ b/perf/perf-test.js @@ -28,6 +28,20 @@ function iterateValues(table) { names.forEach(name => Array.from(table.getChild(name))); } +// random access to each column value +// this will be slower if there are multiple record batches +// due to the need for binary search over the offsets array +function randomAccess(table) { + const { numRows, numCols } = table; + const vals = Array(numCols); + for (let j = 0; j < numCols; ++j) { + const col = table.getChildAt(j); + for (let i = 0; i < numRows; ++i) { + vals[j] = col.at(i); + } + } +} + // generate row objects, access each property function visitObjects(table) { const nr = table.numRows; @@ -58,6 +72,7 @@ async function run(file) { trial('Parse Table from IPC', file, bytes, parseIPC, 10); trial('Extract Arrays', file, bytes, extractArrays, 10); trial('Iterate Values', file, bytes, iterateValues, 10); + trial('Random Access', file, bytes, randomAccess, 10); trial('Visit Row Objects', file, bytes, visitObjects, 5); console.log(); } diff --git a/src/column.js b/src/column.js index a8550f3..90ce805 100644 --- a/src/column.js +++ b/src/column.js @@ -1,4 +1,5 @@ import { isDirectBatch } from './batch.js'; +import { bisectOffsets } from './util.js'; /** * Build up a column from batches. @@ -99,25 +100,25 @@ export class Column { * lookup less efficient than a standard array access. If making a full * scan of a column, consider extracting arrays via `toArray()` or using an * iterator (`for (const value of column) {...}`). - * @param {number} index The index + * @param {number} index The row index. * @returns {T | null} The value. */ at(index) { // NOTE: if there is only one batch, this method is replaced with an - // optimized version within the Column constructor. + // optimized version in the Column constructor. const { data, offsets } = this; + const i = bisectOffsets(offsets, index); + return data[i]?.at(index - offsets[i]); // undefined if out of range + } - // binary search for batch index - let a = 0; - let b = offsets.length; - do { - const mid = (a + b) >>> 1; - if (offsets[mid] <= index) a = mid + 1; - else b = mid; - } while (a < b); - - // returns undefined if index is out of range - return data[--a]?.at(index - offsets[a]); + /** + * Return the column value at the given index. This method is the same as + * `at()` and is provided for better compatibility with Apache Arrow JS. + * @param {number} index The row index. + * @returns {T | null} The value. + */ + get(index) { + return this.at(index); } /** diff --git a/src/table.js b/src/table.js index 85a40d3..1bd5bc6 100644 --- a/src/table.js +++ b/src/table.js @@ -1,3 +1,5 @@ +import { bisectOffsets } from './util.js'; + /** * A table consists of a collection of named columns (or 'children'). * To work with table data directly in JavaScript, usse `toColumns()` @@ -110,42 +112,57 @@ export class Table { return cols; } + /** + * Return an array of objects representing the rows of this table. + * @returns {Record[]} + */ + toArray() { + const { children, numRows, names } = this; + const data = children[0]?.data ?? []; + const output = Array(numRows); + for (let b = 0, row = -1; b < data.length; ++b) { + for (let i = 0; i < data[b].length; ++i) { + output[++row] = rowObject(names, children, b, i); + } + } + return output; + } + /** * Return an iterator over objects representing the rows of this table. * @returns {Generator, any, null>} */ *[Symbol.iterator]() { const { children, names } = this; - const batches = children[0]?.data.length ?? 0; - // for each batch... - for (let b = 0; b < batches; ++b) { - const data = children.map(c => c.data[b]); - const rows = data[0].length; - // for each row... - for (let i = 0; i < rows; ++i) { - yield rowObject(names, data, i); + const data = children[0]?.data ?? []; + for (let b = 0; b < data.length; ++b) { + for (let i = 0; i < data[b].length; ++i) { + yield rowObject(names, children, b, i); } } } /** - * Return an array of objects representing the rows of this table. - * @returns {Record[]} + * Return a row object for the given index. + * @param {number} index The row index. + * @returns {Record} The row object. */ - toArray() { - const { children, numRows, names } = this; - const batches = children[0]?.data.length ?? 0; - const output = Array(numRows); - // for each batch... - for (let b = 0, row = -1; b < batches; ++b) { - const data = children.map(c => c.data[b]); - const rows = data?.[0].length; - // for each row... - for (let i = 0; i < rows; ++i) { - output[++row] = rowObject(names, data, i); - } - } - return output; + at(index) { + const { names, children, numRows } = this; + if (index < 0 || index >= numRows) return null; + const [{ offsets }] = children; + const i = bisectOffsets(offsets, index); + return rowObject(names, children, i, index - offsets[i]); + } + + /** + * Return a row object for the given index. This method is the same as + * `at()` and is provided for better compatibility with Apache Arrow JS. + * @param {number} index The row index. + * @returns {Record} The row object. + */ + get(index) { + return this.at(index); } } @@ -155,11 +172,10 @@ function renameField(field, name) { : field; } -function rowObject(names, data, index) { +function rowObject(names, children, batch, index) { const o = {}; - // for each column... for (let j = 0; j < names.length; ++j) { - o[names[j]] = data[j].at(index); + o[names[j]] = children[j].data[batch].at(index); } return o; } diff --git a/src/util.js b/src/util.js index ca4d0eb..906ac91 100644 --- a/src/util.js +++ b/src/util.js @@ -57,6 +57,29 @@ export function divide(num, div) { return toNumber(num / div) + toNumber(num % div) / toNumber(div); } +/** + * Determine the correct index into an offset array for a given + * full column row index. + * @param {Int32Array} offsets The offsets array. + * @param {number} index The full column row index. + */ +export function bisectOffsets(offsets, index) { + // binary search for batch index + // we use a fast unsigned bit shift for division by two + // this assumes offsets.length <= Math.pow(2, 31), which seems safe + // otherwise that is a whole lotta record batches to handle in JS... + let a = 0; + let b = offsets.length; + do { + const mid = (a + b) >>> 1; + if (offsets[mid] <= index) a = mid + 1; + else b = mid; + } while (a < b); + + // decrement to the desired offset array index + return --a; +} + // -- flatbuffer utilities ----- /** diff --git a/test/table-test.js b/test/table-test.js index bd8aade..872e915 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -12,28 +12,42 @@ const values = [ const table = tableFromIPC(await arrowFromDuckDB(values)); describe('Table', () => { - it('provides row count', async () => { + it('provides row count', () => { assert.deepStrictEqual(table.numRows, 3); }); - it('provides column count', async () => { + it('provides column count', () => { assert.deepStrictEqual(table.numCols, 1); }); - it('provides child column accessors', async () => { + it('provides child column accessors', () => { const col = table.getChild('value'); assert.strictEqual(col, table.getChildAt(0)); assert.deepStrictEqual(col.toArray(), values); }); - it('provides object array', async () => { + it('provides object array', () => { assert.deepStrictEqual(table.toArray(), values.map(value => ({ value }))); }); - it('provides column array map', async () => { + it('provides column array map', () => { assert.deepStrictEqual(table.toColumns(), { value: values }); }); + it('provides random access via at/get', () => { + const idx = [0, 1, 2]; + + // table object random access + const obj = values.map(value => ({ value })); + assert.deepStrictEqual(idx.map(i => table.at(i)), obj); + assert.deepStrictEqual(idx.map(i => table.get(i)), obj); + + // column value random access + const col = table.getChildAt(0); + assert.deepStrictEqual(idx.map(i => col.at(i)), values); + assert.deepStrictEqual(idx.map(i => col.get(i)), values); + }); + it('provides select by index', async () => { const sel = table.selectAt([0, 0]); const col = table.getChild('value');