Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions packages/core/services/DatabaseService/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ const PRE_DEFINED_COLUMNS = Object.values(PreDefinedColumn);

const DATA_SOURCE_COLUMN = "Data source";

// Suffix appended to every DuckDB file-handle name so that a short name like
// "foo" can never prefix-match a longer name like "foo2".
// See https://github.com/duckdb/duckdb-wasm/issues/2227
//
// This is a workaround, not a complete fix: a collision is still possible if a
// user uploads a file whose name already ends with this suffix (e.g.
// "foo-bff-filehandle.parquet"). A proper fix requires an upstream change in
// duckdb-wasm to use exact-match lookups for registered file handles.
const FILE_HANDLE_SUFFIX = "-bff-filehandle";

/**
 * Build the DuckDB file-handle name for a data source.
 *
 * @param name - The data source name.
 * @returns `name` with `FILE_HANDLE_SUFFIX` appended; the input is not
 *          inspected or normalized in any way.
 */
function fileHandleName(name: string): string {
    return `${name}${FILE_HANDLE_SUFFIX}`;
}

// Map each actual column name to the predefined column name when they fuzzy-match.
function getActualToPreDefinedColumnMap(columns: string[]): Map<string, string> {
const map = new Map<string, string>();
Expand Down Expand Up @@ -213,9 +226,11 @@ export default abstract class DatabaseService {
throw new Error("Database failed to initialize");
}

const handle = fileHandleName(name);

if (uri instanceof File) {
await this.database.registerFileHandle(
name,
handle,
uri,
duckdb.DuckDBDataProtocol.BROWSER_FILEREADER,
true
Expand All @@ -225,29 +240,29 @@ export default abstract class DatabaseService {
? duckdb.DuckDBDataProtocol.S3
: duckdb.DuckDBDataProtocol.HTTP;

await this.database.registerFileURL(name, uri, protocol, false);
await this.database.registerFileURL(handle, uri, protocol, false);
}

if (type === "parquet") {
await this.createParquetDirectView(name);
} else if (type === "json") {
await this.execute(`CREATE TABLE "${name}" AS FROM read_json_auto('${name}');`);
await this.execute(`CREATE TABLE "${name}" AS FROM read_json_auto('${handle}');`);
} else {
// Default to CSV. Use sample_size=-1 to scan the full file before deciding column
// types, eliminating "first N rows look numeric, later rows have strings" failures.
// Fall back to all_varchar=true if type inference fails (e.g. truly mixed-type column)
// so the file always loads successfully.
try {
await this.execute(
`CREATE TABLE "${name}" AS FROM read_csv_auto('${name}', header=true, sample_size=-1);`
`CREATE TABLE "${name}" AS FROM read_csv_auto('${handle}', header=true, sample_size=-1);`
);
} catch {
console.warn(
`Failed to infer column types for CSV "${name}"; falling back to all_varchar=true. All columns will be loaded as strings.`
);
await this.execute(`DROP TABLE IF EXISTS "${name}"`);
await this.execute(
`CREATE TABLE "${name}" AS FROM read_csv_auto('${name}', header=true, all_varchar=true);`
`CREATE TABLE "${name}" AS FROM read_csv_auto('${handle}', header=true, all_varchar=true);`
);
}
}
Expand Down Expand Up @@ -798,7 +813,7 @@ export default abstract class DatabaseService {
selectParts.push(`"filename" AS "${DATA_SOURCE_COLUMN}"`);
}
// 4. Create the view for this data source
const quotedNames = sourceNames.map((name) => `'${name}'`).join(", ");
const quotedNames = sourceNames.map((name) => `'${fileHandleName(name)}'`).join(", ");
const createViewSql = `CREATE VIEW "${aggregateName}"
AS SELECT ${selectParts.join(", ")}
FROM parquet_scan(ARRAY[${quotedNames}], union_by_name = true);`;
Expand Down Expand Up @@ -1187,7 +1202,7 @@ export default abstract class DatabaseService {
// Return the column names of the parquet source `name`: runs a DESCRIBE over
// a parquet_scan of the source and collects the "column_name" field of every
// resulting row.
// Similar to getColumnsOnDataSource below, but suitable for use during the
// data source preparation step.
private async getRawParquetColumns(name: string): Promise<string[]> {
const sql = `DESCRIBE SELECT * FROM parquet_scan("${name}")`;
const sql = `DESCRIBE SELECT * FROM parquet_scan("${fileHandleName(name)}")`;
const rows = await this.query(sql).promise;
return rows.map((row) => row["column_name"] as string);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ describe("DatabaseService", () => {

public query(sql: string): { promise: Promise<{ [key: string]: any }[]> } {
const parquetDescribeMatch = sql.match(
/DESCRIBE SELECT \* FROM parquet_scan\("(.+)"\)/
/DESCRIBE SELECT \* FROM parquet_scan\("(.+)-bff-filehandle"\)/
);
if (parquetDescribeMatch) {
const sourceName = parquetDescribeMatch[1];
Expand Down Expand Up @@ -351,6 +351,27 @@ describe("DatabaseService", () => {
);
});

it("uses suffixed file handle names in parquet_scan to avoid prefix collisions", async () => {
    // Regression guard for duckdb-wasm#2227: when "foo" and "foo2" are
    // registered under their bare names, DuckDB prefix-matches "foo"
    // against "foo2".
    // Only the generated SQL is inspected here (the suffix must be present);
    // whether collisions are actually prevented is a DuckDB-wasm
    // integration concern.
    const db = new MockAggregateParquetDatabaseService({
        foo: ["file_path"],
        foo2: ["file_path"],
    });

    await db.prepareDataSources([
        { name: "foo", type: "parquet", uri: "https://example.com/foo.parquet" },
        { name: "foo2", type: "parquet", uri: "https://example.com/foo2.parquet" },
    ]);

    // The aggregate view is the statement that feeds handle names to parquet_scan.
    const viewStatement = db.executedSQL.find((statement) => statement.includes("CREATE VIEW"));
    expect(viewStatement).to.not.be.undefined;
    expect(viewStatement).to.match(/parquet_scan\(ARRAY\[.*'foo-bff-filehandle'.*]/);
    expect(viewStatement).to.match(/parquet_scan\(ARRAY\[.*'foo2-bff-filehandle'.*]/);
});

it("creates aggregate parquet view using union_by_name and data source projection", async () => {
const service = new MockAggregateParquetDatabaseService({
"a.parquet": ["file_path"],
Expand Down
Loading