Skip to content

query flattengin benchamarking script #108

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
119 changes: 119 additions & 0 deletions meerkat-node/src/__tests__/benchmarking/benchmarking.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import { duckdbExec } from "../../duckdb-exec";
const {
PARQUET_FILE_PATH,
getOldQueries,
getNewQueries
} = require('./constants')

const TABLE_NAME = 'benchmarking'

const RUNS_PER_QUERY = 20
const EXECUTION_INDEX = 5;

type RunInfo = {
end: number;
start: number;
}

const getParquetSQL = (parquetPath: string, TABLE_NAME: string) => `CREATE TABLE ${TABLE_NAME} AS SELECT * FROM read_parquet('${parquetPath}');` as const

const getAverageDuration = (results: { end: number, start: number }[]) => {
let totalDuration = 0;
results.forEach((result) => {
const duration = result.end - result.start;
totalDuration += duration;
})
const avgDuration = totalDuration / results.length;
return avgDuration;
}

const getAverageImprovement = (oldRuns: RunInfo[], newRuns: RunInfo[]) => {
const oldAvgDuration = getAverageDuration(oldRuns)
const newAvgDuration = getAverageDuration(newRuns)
// Calculate improvement
const improvement = ((oldAvgDuration - newAvgDuration) / oldAvgDuration) * 100;
return {
oldAvgDuration,
newAvgDuration,
improvement
}
}


const getPercentileItem = (runs: RunInfo[], percentile: number) => {
const sortedRuns = runs.sort((a, b) => a.end - a.start - (b.end - b.start))
const p90Index = Math.floor(runs.length * percentile)
return sortedRuns[p90Index]
}

const getPercentileImprovement = (oldRuns: RunInfo[], newRuns: RunInfo[], percentile: number) => {
const { end: oldEndP90, start: oldStartP90 } = getPercentileItem(oldRuns, percentile)
const { end: newEndP90, start: newStartP90 } = getPercentileItem(newRuns, percentile)
const oldDuration = oldEndP90 - oldStartP90
const newDuration = newEndP90 - newStartP90
const improvement = ((oldDuration - newDuration) / oldDuration) * 100;
return {
oldDuration,
newDuration,
improvement
}
}

const runQueryWithTiming = async (sql: string, runs: number): Promise<RunInfo[]> => {
const runsArray = new Array(runs).fill(undefined)
const dataPromises = runsArray.map(async () => {
const start = performance.now();
await duckdbExec(sql);
const end = performance.now();
return {
end,
start
}
})
const results = await Promise.all(dataPromises);
return results

}


const benchmarkQueries = async (tableName: string, runsPerQuery: number) => {
const oldQueries = getOldQueries(tableName);
const newQueries = getNewQueries(tableName);

const oldQuery = oldQueries[EXECUTION_INDEX];
const newQuery = newQueries[EXECUTION_INDEX];

console.log({ oldQuery, newQuery })
// Run old query multiple times and get average
const oldRuns = await runQueryWithTiming(oldQuery.sql, runsPerQuery);

// Run new query multiple times and get average
const newRuns = await runQueryWithTiming(newQuery.sql, runsPerQuery);

const avgImprovement = getAverageImprovement(oldRuns, newRuns)
const p50Improvement = getPercentileImprovement(oldRuns, newRuns, 0.5)
const p90Improvement = getPercentileImprovement(oldRuns, newRuns, 0.9)


return {
avg: avgImprovement,
p50: p50Improvement,
p90: p90Improvement
}

}



describe('benchmarking', () => {
beforeAll(async () => {
const sql = getParquetSQL(PARQUET_FILE_PATH, TABLE_NAME);
await duckdbExec(sql);
})

it('should be able to run benchmarks', async () => {
const result = await benchmarkQueries(TABLE_NAME, RUNS_PER_QUERY)
console.log(result)
})
})

182 changes: 182 additions & 0 deletions meerkat-node/src/__tests__/benchmarking/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@



const getOldQueries = (
tableName
) => {
return [
{
name: "Simple COUNT",
sql: `
SELECT COUNT(*) FROM (
SELECT * FROM (
SELECT * FROM ${tableName}
)
)`
},
{
name: "Aggregation with GROUP BY",
sql: `
SELECT ${tableName}__customer_id, SUM(total_amount) AS total_spent FROM (
SELECT *, customer_id as ${tableName}__customer_id FROM (
SELECT * FROM ${tableName}
)
) GROUP BY ${tableName}__customer_id ORDER BY total_spent DESC`
},
{
name: "Multiple Aggregations with GROUP BY",
sql: `SELECT product_id, product_name, SUM(quantity) AS total_quantity, AVG(product_price) AS avg_price FROM (
SELECT *, product_id AS ${tableName}__product_id FROM (
SELECT * FROM ${tableName}
)
) GROUP BY product_id, product_name ORDER BY total_quantity DESC`
},
{
name: "Date Truncation and Aggregation",
sql: `
SELECT ${tableName}__month, SUM(total_amount) AS monthly_revenue FROM (
SELECT *, DATE_TRUNC('month', order_date) AS ${tableName}__month FROM (
SELECT * FROM ${tableName}
)
) GROUP BY ${tableName}__month ORDER BY ${tableName}__month`
},
{
name: "Window Function",
sql: `
SELECT
${tableName}__order_id,
${tableName}__order_date,
${tableName}__total_amount,
SUM(total_amount) OVER (ORDER BY order_date) AS ${tableName}__cumulative_revenue
FROM (
SELECT *, order_id AS ${tableName}__order_id, order_date AS ${tableName}__order_date, total_amount AS ${tableName}__total_amount FROM (
SELECT * FROM ${tableName}
)
)
ORDER BY ${tableName}__order_date
LIMIT 100
`
}, {
name: 'filters',
sql: `
SELECT
DATE_TRUNC('month', ${tableName}.order_date) AS ${tableName}__order_month,
${tableName}__customer_id,
${tableName}__product_id,
${tableName}__product_name,
COUNT(DISTINCT ${tableName}.order_id) AS ${tableName}__total_orders,
SUM(${tableName}.quantity) AS ${tableName}__quantity,
AVG(${tableName}.product_price) AS ${tableName}__avg_product_price,
FROM (
select
*,
${tableName}.customer_id AS ${tableName}__customer_id,
${tableName}.product_id AS ${tableName}__product_id,
${tableName}.product_name AS ${tableName}__product_name,
${tableName}.order_date AS ${tableName}__order_date,
from (
select * from ${tableName}
) AS ${tableName}
) AS ${tableName}
WHERE
${tableName}__order_date BETWEEN '2023-01-01 00:00:00' AND '2023-06-30 23:59:59'
AND ${tableName}__product_id BETWEEN 100 AND 200
AND ${tableName}__product_name LIKE '%Premium%'
GROUP BY
${tableName}__order_month,
${tableName}__customer_id,
${tableName}__product_id,
${tableName}__product_name
HAVING
${tableName}__total_orders > 10
ORDER BY
${tableName}__total_orders DESC
`
}

];
}

const getNewQueries = (tableName: string) => {
return [
{
name: "Simple COUNT",
sql: `SELECT COUNT(*) FROM ${tableName}`
},
{
name: "Aggregation with GROUP BY",
sql: `SELECT customer_id, SUM(total_amount) AS total_spent FROM ${tableName} GROUP BY customer_id ORDER BY total_spent DESC`
},
{
name: "Multiple Aggregations with GROUP BY",
sql: `SELECT product_id, product_name, SUM(quantity) AS total_quantity, AVG(product_price) AS avg_price FROM ${tableName} GROUP BY product_id, product_name ORDER BY total_quantity DESC`
},
{
name: "Date Truncation and Aggregation",
sql: `SELECT DATE_TRUNC('month', order_date) AS month, SUM(total_amount) AS monthly_revenue FROM ${tableName} GROUP BY month ORDER BY month`
},
{
name: "Window Function",
sql: `
SELECT
order_id,
order_date,
total_amount,
SUM(total_amount) OVER (ORDER BY order_date) AS cumulative_revenue
FROM ${tableName}
ORDER BY order_date
LIMIT 100
`
},
{
name: 'filters',
sql: `
SELECT
${tableName}__order_month,
${tableName}.customer_id,
${tableName}.product_id,
${tableName}.product_name,
COUNT(DISTINCT ${tableName}.order_id) AS benchmarking__total_orders,
SUM(${tableName}.quantity) AS benchmarking__quantity,
AVG(${tableName}.product_price) AS benchmarking__avg_product_price
FROM
(
SELECT
DATE_TRUNC('month', ${tableName}.order_date) AS ${tableName}__order_month,
${tableName}.order_date,
${tableName}.customer_id,
${tableName}.product_id,
${tableName}.product_name,
${tableName}.order_id,
${tableName}.quantity,
${tableName}.product_price
from ${tableName}
) AS ${tableName}
WHERE
${tableName}__order_month BETWEEN '2023-01-01' AND '2023-06-30'
AND ${tableName}.product_id BETWEEN 100 AND 200
AND ${tableName}.product_name LIKE '%Premium%'
GROUP BY
${tableName}__order_month,
${tableName}.customer_id,
${tableName}.product_id,
${tableName}.product_name
HAVING
COUNT(DISTINCT ${tableName}.order_id) > 10
ORDER BY
COUNT(DISTINCT ${tableName}.order_id) DESC`
}
];
}



module.exports = {
PARQUET_FILE_PATH: '/Users/zaidjan/Documents/Projects/meerkat/meerkat-node/src/__tests__/benchmarking/benchmarking.parquet',
getOldQueries,
getNewQueries
}




Loading
Loading