diff --git a/benchmark_data_tools/pytest.ini b/benchmark_data_tools/pytest.ini
new file mode 100644
index 00000000..f84387f8
--- /dev/null
+++ b/benchmark_data_tools/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+addopts = -ra --durations=10
+
diff --git a/benchmark_data_tools/tests/conftest.py b/benchmark_data_tools/tests/conftest.py
new file mode 100644
index 00000000..3f37aa8c
--- /dev/null
+++ b/benchmark_data_tools/tests/conftest.py
@@ -0,0 +1,63 @@
+import os
+import signal
+import sys
+import contextlib
+import pytest
+import importlib
+
+
+DEFAULT_TIMEOUT_SECS = int(os.environ.get("PYTEST_DEFAULT_TIMEOUT", 10))
+
+
+def _install_alarm(timeout: int):
+    def _handler(signum, frame):
+        raise TimeoutError(f"Test timed out after {timeout}s")
+
+    prev_handler = signal.getsignal(signal.SIGALRM)
+    signal.signal(signal.SIGALRM, _handler)
+    # setitimer allows fractional seconds; here integer seconds are fine
+    signal.setitimer(signal.ITIMER_REAL, timeout)
+    return prev_handler
+
+
+@pytest.fixture(autouse=True)
+def per_test_timeout():
+    # If the pytest-timeout plugin is loaded, rely on it for per-test timeouts
+    if "pytest_timeout" in sys.modules:
+        yield
+        return
+    # Fallback: POSIX-only SIGALRM based timeout
+    if os.name != "posix" or not hasattr(signal, "SIGALRM"):
+        yield
+        return
+    prev = _install_alarm(DEFAULT_TIMEOUT_SECS)
+    try:
+        yield
+    finally:
+        # cancel alarm and restore previous handler
+        with contextlib.suppress(Exception):
+            signal.setitimer(signal.ITIMER_REAL, 0)
+            signal.signal(signal.SIGALRM, prev)
+
+
+@pytest.fixture(autouse=True)
+def clean_duckdb_catalog():
+    """Ensure DuckDB starts each test with an empty catalog.
+
+    Tests that import duckdb will share a process-global connection state.
+    Drop any existing tables between tests to avoid cross-test contamination.
+ """ + yield + try: + duckdb = importlib.import_module("duckdb") + except Exception: + return + try: + tables = duckdb.sql("SHOW TABLES").fetchall() + except Exception: + return + for (tbl,) in tables: + with contextlib.suppress(Exception): + duckdb.sql(f"DROP TABLE IF EXISTS {tbl}") + + diff --git a/benchmark_data_tools/tests/test_duckdb_utils.py b/benchmark_data_tools/tests/test_duckdb_utils.py new file mode 100644 index 00000000..02d7183c --- /dev/null +++ b/benchmark_data_tools/tests/test_duckdb_utils.py @@ -0,0 +1,57 @@ +import json +from pathlib import Path +import sys + +import pytest + +# Add repo 'velox-testing' root to sys.path to import modules +sys.path.insert(0, str(Path(__file__).resolve().parents[2])) # repo root (velox-testing) +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # benchmark_data_tools dir for 'duckdb_utils' + +from benchmark_data_tools.duckdb_utils import is_decimal_column +from benchmark_data_tools.generate_data_files import ( + write_metadata, + rearrange_directory, + get_column_projection, +) + + +def test_is_decimal_column(): + assert is_decimal_column("DECIMAL(10,2)") + assert is_decimal_column("DECIMAL(38,18)") + assert not is_decimal_column("DOUBLE") + assert not is_decimal_column("VARCHAR") + + +def test_write_metadata(tmp_path): + write_metadata(str(tmp_path), 0.01) + p = tmp_path / "metadata.json" + assert p.exists() + meta = json.loads(p.read_text()) + assert meta["scale_factor"] == 0.01 + + +def test_rearrange_directory_moves_partitions(tmp_path): + raw = tmp_path / "raw" + (raw / "part-1").mkdir(parents=True) + # Simulate two tables + (raw / "part-1" / "orders.parquet").write_bytes(b"") + (raw / "part-1" / "customer.parquet").write_bytes(b"") + + rearrange_directory(str(raw), 1) + + assert not (raw / "part-1").exists() + assert (raw / "orders" / "orders-1.parquet").exists() + assert (raw / "customer" / "customer-1.parquet").exists() + + +def test_get_column_projection_converts_decimal(): + # column metadata rows from duckdb DESCRIBE: (name, type, ...) 
+ dec_col = ("price", "DECIMAL(10,2)") + dbl_col = ("qty", "DOUBLE") + assert ( + get_column_projection(dec_col, True) + == "CAST(price AS DOUBLE) AS price" + ) + assert get_column_projection(dbl_col, True) == "qty" + diff --git a/benchmark_data_tools/tests/test_generate_data_files.py b/benchmark_data_tools/tests/test_generate_data_files.py new file mode 100644 index 00000000..d4370de2 --- /dev/null +++ b/benchmark_data_tools/tests/test_generate_data_files.py @@ -0,0 +1,243 @@ +import json +import subprocess +import sys +from pathlib import Path + +import pytest +from types import SimpleNamespace +import sys as _sys + +# Allow direct imports of the module under test +_sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + + +def _script_path(name: str) -> str: + return str(Path(__file__).resolve().parents[1] / name) + + +def _duckdb_ext_available(ext: str) -> bool: + try: + import duckdb # noqa: F401 + subprocess.run( + [ + sys.executable, + "-c", + f"import duckdb; duckdb.sql('INSTALL {ext}; LOAD {ext};')", + ], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return True + except Exception: + return False + + +def test_help_exits_zero(): + script = _script_path("generate_data_files.py") + proc = subprocess.run([sys.executable, script, "-h"], text=True, stdout=subprocess.PIPE) + assert proc.returncode == 0 + assert "Generate benchmark parquet data files" in proc.stdout or "usage" in proc.stdout + + +@pytest.mark.skipif(not _duckdb_ext_available("tpch"), reason="duckdb tpch extension not available") +def test_generate_tpch_duckdb_small(tmp_path): + script = _script_path("generate_data_files.py") + out = tmp_path / "tpch_sf0001" + args = [ + sys.executable, + script, + "-b", + "tpch", + "-d", + str(out), + "-s", + "0.001", + "--use-duckdb", + "-j", + "1", + ] + proc = subprocess.run(args, text=True, capture_output=True) + assert proc.returncode == 0 + # Expect metadata and at least one table dir + meta = out / "metadata.json" + assert meta.exists() + data = json.loads(meta.read_text()) + assert float(data["scale_factor"]) == pytest.approx(0.001) + # Find any subdir containing parquet + has_any = False + for p in out.iterdir(): + if p.is_dir() and any(x.suffix == ".parquet" for x in p.glob("*.parquet")): + has_any = True + break + assert has_any, "expected at least one parquet file to be written" + + +@pytest.mark.skipif(not _duckdb_ext_available("tpch"), reason="duckdb tpch extension not available") +def test_verbose_and_overwrite(tmp_path): + script = _script_path("generate_data_files.py") + out = tmp_path / "tpch_sf0001" + out.mkdir(parents=True) + # Pre-create a file that should be removed since script recreates directory + (out / "old.txt").write_text("old") + args = [ + sys.executable, + script, + "-b", + "tpch", + "-d", + str(out), + "-s", + "0.001", + "--use-duckdb", + "-v", + ] + proc = subprocess.run(args, text=True, capture_output=True) + assert proc.returncode == 0 + # Directory should exist and old file should be gone + assert out.exists() + assert not (out / "old.txt").exists() + # Verbose path prints "generating with duckdb" + assert "generating with duckdb" in (proc.stdout + proc.stderr) + + +@pytest.mark.skipif( + not (_duckdb_ext_available("tpch") and pytest.importorskip("pyarrow", reason="pyarrow required")), + reason="duckdb tpch extension or pyarrow not available", +) +def test_convert_decimals_to_floats_no_decimal_types(tmp_path): + import pyarrow.parquet as pq + + script = _script_path("generate_data_files.py") + out = tmp_path / 
"tpch_sf0001" + args = [ + sys.executable, + script, + "-b", + "tpch", + "-d", + str(out), + "-s", + "0.001", + "--use-duckdb", + "-c", + ] + proc = subprocess.run(args, text=True, capture_output=True) + assert proc.returncode == 0 + # Inspect a known table with DECIMALs in TPCH (e.g., lineitem) + lineitem = out / "lineitem" / "lineitem.parquet" + # Some small scales might not include all tables; fall back to any table + target = lineitem if lineitem.exists() else next(out.glob("*/*.parquet")) + schema = pq.read_schema(target) + # Ensure no decimal types remain after conversion + assert all("decimal" not in str(f.type).lower() for f in schema) + + +@pytest.mark.skipif(not _duckdb_ext_available("tpcds"), reason="duckdb tpcds extension not available") +def test_tpcds_schema_with_zero_scale(): + import duckdb + + # Generate only schema with zero scale (fast); do not write files + duckdb.sql("INSTALL tpcds; LOAD tpcds; CALL dsdgen(sf=0);") + tables = duckdb.sql("SHOW TABLES").fetchall() + # Expect a reasonable number of tables present + assert len(tables) >= 5 + # Check that DESCRIBE works for one known table + table_name = tables[0][0] + desc = duckdb.sql(f"DESCRIBE {table_name}").fetchall() + assert len(desc) > 0 + + +def test_invalid_missing_required_args(tmp_path): + script = _script_path("generate_data_files.py") + # Missing benchmark type + proc = subprocess.run( + [sys.executable, script, "-d", str(tmp_path / "x"), "-s", "0.1"], + text=True, + capture_output=True, + ) + assert proc.returncode != 0 + # Missing data dir + proc = subprocess.run( + [sys.executable, script, "-b", "tpch", "-s", "0.1"], + text=True, + capture_output=True, + ) + assert proc.returncode != 0 + # Missing scale factor + proc = subprocess.run( + [sys.executable, script, "-b", "tpch", "-d", str(tmp_path / "y")], + text=True, + capture_output=True, + ) + assert proc.returncode != 0 + + +@pytest.mark.skipif(not _duckdb_ext_available("tpch"), reason="duckdb tpch extension not available") +def test_extra_options_accepted(tmp_path): + script = _script_path("generate_data_files.py") + out = tmp_path / "tpch_sf0001" + # Options --max-rows-per-file and -j are relevant to tpchgen path, but should be accepted with duckdb + proc = subprocess.run( + [ + sys.executable, + script, + "-b", + "tpch", + "-d", + str(out), + "-s", + "0.001", + "--use-duckdb", + "--max-rows-per-file", + "1000", + "-j", + "2", + ], + text=True, + capture_output=True, + ) + assert proc.returncode == 0 + + +def test_tpchgen_partitions_count_monkeypatched(tmp_path, monkeypatch): + # Import the module under test for monkeypatching + import generate_data_files as gdf + from pathlib import Path as _Path + + out_dir = tmp_path / "tpch_partitions" + + # Provide a fixed partition mapping to avoid duckdb dependency + monkeypatch.setattr( + gdf, + "get_table_sf_ratios", + lambda scale_factor, max_rows: {"orders": 3, "customer": 2}, + ) + + # Replace the partition generator to create placeholder parquet files + def fake_generate_partition(table, partition, raw_data_path, scale_factor, num_partitions, verbose): + pdir = _Path(raw_data_path) / f"part-{partition}" + pdir.mkdir(parents=True, exist_ok=True) + (_Path(pdir) / f"{table}.parquet").write_text("") + + monkeypatch.setattr(gdf, "generate_partition", fake_generate_partition) + + args = SimpleNamespace( + data_dir_path=str(out_dir), + scale_factor=1, + max_rows_per_file=1_000_000, + num_threads=2, + verbose=False, + convert_decimals_to_floats=False, + benchmark_type="tpch", + ) + + 
gdf.generate_data_files_with_tpchgen(args) + + # After rearrange_directory, each table dir should contain one file per partition + orders = list((out_dir / "orders").glob("*.parquet")) + customer = list((out_dir / "customer").glob("*.parquet")) + assert len(orders) == 3 + assert len(customer) == 2 + + diff --git a/benchmark_data_tools/tests/test_generate_query_file.py b/benchmark_data_tools/tests/test_generate_query_file.py new file mode 100644 index 00000000..1dc94c01 --- /dev/null +++ b/benchmark_data_tools/tests/test_generate_query_file.py @@ -0,0 +1,53 @@ +import json +import subprocess +import sys +from pathlib import Path + +import pytest + + +def _script_path(name: str) -> str: + return str(Path(__file__).resolve().parents[1] / name) + + +def _duckdb_ext_available(ext: str) -> bool: + try: + import duckdb # noqa: F401 + subprocess.run( + [sys.executable, "-c", f"import duckdb; duckdb.sql('INSTALL {ext}; LOAD {ext};')"], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return True + except Exception: + return False + + +@pytest.mark.parametrize("benchmark,expected_count", [("tpch", 22), ("tpcds", 99)]) +def test_generate_queries_counts(tmp_path, benchmark, expected_count): + if not _duckdb_ext_available(benchmark): + pytest.skip(f"duckdb {benchmark} extension not available") + script = _script_path("generate_query_file.py") + out_dir = tmp_path / f"queries_{benchmark}" + proc = subprocess.run( + [sys.executable, script, "--benchmark-type", benchmark, "--queries-dir-path", str(out_dir)], + text=True, + ) + assert proc.returncode == 0 + qf = out_dir / "queries.json" + assert qf.exists() + data = json.loads(qf.read_text()) + assert len(data) == expected_count + # Keys are Q1..Qn + assert all(k.startswith("Q") for k in data.keys()) + + +def test_help_exits_zero(): + script = _script_path("generate_query_file.py") + proc = subprocess.run([sys.executable, script, "-h"], text=True, stdout=subprocess.PIPE) + assert proc.returncode == 0 + assert "Usage" in proc.stdout or "usage" in proc.stdout + + + diff --git a/benchmark_data_tools/tests/test_generate_table_schemas.py b/benchmark_data_tools/tests/test_generate_table_schemas.py new file mode 100644 index 00000000..31a913c6 --- /dev/null +++ b/benchmark_data_tools/tests/test_generate_table_schemas.py @@ -0,0 +1,84 @@ +import subprocess +import sys +from pathlib import Path + +import duckdb +import pytest +import sys as _sys + +# Ensure module import from benchmark_data_tools directory +_sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + + +def _script_path(name: str) -> str: + return str(Path(__file__).resolve().parents[1] / name) + + +def test_help_exits_zero(): + script = _script_path("generate_table_schemas.py") + proc = subprocess.run([sys.executable, script, "-h"], text=True, stdout=subprocess.PIPE) + assert proc.returncode == 0 + assert "Generate benchmark table schemas" in proc.stdout or "usage" in proc.stdout + + +def test_generate_schemas_tpch_not_null(tmp_path, monkeypatch): + # Import module to test its functions directly + import generate_table_schemas as gts + + data_dir = tmp_path / "data" + schemas_dir = tmp_path / "schemas" + # Simulate two table directories in data_dir + (data_dir / "orders").mkdir(parents=True) + (data_dir / "customer").mkdir(parents=True) + + # Monkeypatch duck utils to create simple in-memory tables with NOT NULL + def fake_create_not_null_table(table_name, data_path): + duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") + duckdb.sql( + f"CREATE TABLE {table_name} (id BIGINT 
NOT NULL, val DOUBLE NOT NULL)" + ) + + def fake_create_table(table_name, data_path): + duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") + duckdb.sql(f"CREATE TABLE {table_name} (id BIGINT, val DOUBLE)") + + monkeypatch.setattr(gts.duck, "create_not_null_table", fake_create_not_null_table) + monkeypatch.setattr(gts.duck, "create_table", fake_create_table) + + # Generate schema files for tpch (expects NOT NULL columns) + gts.generate_table_schemas("tpch", str(schemas_dir), str(data_dir), verbose=False) + + # Check that schema files were written and include NOT NULL + for tbl in ["orders", "customer"]: + p = schemas_dir / f"{tbl}.sql" + assert p.exists() + sql = p.read_text() + assert "CREATE TABLE hive.{schema}." in sql + assert "NOT NULL" in sql + + +def test_generate_schemas_tpcds_nullable(tmp_path, monkeypatch): + import generate_table_schemas as gts + + data_dir = tmp_path / "data" + schemas_dir = tmp_path / "schemas" + (data_dir / "store_sales").mkdir(parents=True) + + # Ensure clean state (previous test may have created tables) + for (tbl,) in duckdb.sql("SHOW TABLES").fetchall(): + duckdb.sql(f"DROP TABLE IF EXISTS {tbl}") + + def fake_create_table(table_name, data_path): + duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") + duckdb.sql(f"CREATE TABLE {table_name} (k INTEGER, v VARCHAR)") + + monkeypatch.setattr(gts.duck, "create_table", fake_create_table) + # tpch path uses create_not_null_table, tpcds uses create_table + gts.generate_table_schemas("tpcds", str(schemas_dir), str(data_dir), verbose=False) + + p = schemas_dir / "store_sales.sql" + assert p.exists() + sql = p.read_text() + # Columns should not be forced NOT NULL for tpcds + assert "NOT NULL" not in sql + diff --git a/presto/scripts/setup_benchmark_data_and_tables.sh b/presto/scripts/setup_benchmark_data_and_tables.sh index cca91898..2a5f538e 100755 --- a/presto/scripts/setup_benchmark_data_and_tables.sh +++ b/presto/scripts/setup_benchmark_data_and_tables.sh @@ -16,6 +16,9 @@ set -e +SCRIPT_PATH=$(dirname -- "${BASH_SOURCE[0]}") +source "${SCRIPT_PATH}/../../scripts/helper_function.sh" + SCRIPT_DESCRIPTION="This script generates benchmark data and sets up related tables under the given schema name. Generated data will reside under the PRESTO_DATA_DIR path in a directory with name that matches the value set for the --data-dir-name argument." 
@@ -34,7 +37,7 @@ function extra_options_parser() { SCRIPT_EXTRA_OPTIONS_UNKNOWN_ARG=false return 0 else - echo "Error: --scale-factor requires a value" + echo_error "Error: --scale-factor requires a value" return 1 fi shift 2 diff --git a/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh b/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh index 6ad30aa3..f457c269 100644 --- a/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh +++ b/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh @@ -16,6 +16,9 @@ set -e +SCRIPT_PATH=$(dirname -- "${BASH_SOURCE[0]}") +source "${SCRIPT_PATH}/../../scripts/helper_function.sh" + if [[ -z $SCRIPT_DESCRIPTION ]]; then echo "Internal error: SCRIPT_DESCRIPTION must be set" exit 1 @@ -51,15 +54,12 @@ EOF } if [[ -z $PRESTO_DATA_DIR ]]; then - echo "Error: PRESTO_DATA_DIR must be set to the directory path that contains the benchmark data directories" print_help - exit 1 + echo_error "Error: PRESTO_DATA_DIR must be set to the directory path that contains the benchmark data directories" fi source ./common_functions.sh -wait_for_worker_node_registration - parse_args() { while [[ $# -gt 0 ]]; do case $1 in @@ -72,8 +72,7 @@ parse_args() { BENCHMARK_TYPE=$2 shift 2 else - echo "Error: --benchmark-type requires a value" - exit 1 + echo_error "Error: --benchmark-type requires a value" fi ;; -s|--schema-name) @@ -81,8 +80,7 @@ parse_args() { SCHEMA_NAME=$2 shift 2 else - echo "Error: --schema-name requires a value" - exit 1 + echo_error "Error: --schema-name requires a value" fi ;; -d|--data-dir-name) @@ -90,8 +88,7 @@ parse_args() { DATA_DIR_NAME=$2 shift 2 else - echo "Error: --data-dir-name requires a value" - exit 1 + echo_error "Error: --data-dir-name requires a value" fi ;; *) @@ -104,9 +101,8 @@ parse_args() { fi if [[ "$SCRIPT_EXTRA_OPTIONS_UNKNOWN_ARG" == "true" ]]; then - echo "Error: Unknown argument $1" print_help - exit 1 + echo_error "Error: Unknown argument $1" fi ;; esac @@ -115,20 +111,19 @@ parse_args() { parse_args "$@" +wait_for_worker_node_registration + if [[ -z ${BENCHMARK_TYPE} || ! ${BENCHMARK_TYPE} =~ ^tpc(h|ds)$ ]]; then - echo "Error: A valid benchmark type (tpch or tpcds) is required. Use the -b or --benchmark-type argument." print_help - exit 1 + echo_error "Error: A valid benchmark type (tpch or tpcds) is required. Use the -b or --benchmark-type argument." fi if [[ -z ${SCHEMA_NAME} ]]; then - echo "Error: Schema name is required. Use the -s or --schema-name argument." print_help - exit 1 + echo_error "Error: Schema name is required. Use the -s or --schema-name argument." fi if [[ -z ${DATA_DIR_NAME} ]]; then - echo "Error: Data directory name is required. Use the -d or --data-dir-name argument." print_help - exit 1 + echo_error "Error: Data directory name is required. Use the -d or --data-dir-name argument." fi diff --git a/presto/scripts/setup_benchmark_tables.sh b/presto/scripts/setup_benchmark_tables.sh index b7723c75..7bed2d3b 100755 --- a/presto/scripts/setup_benchmark_tables.sh +++ b/presto/scripts/setup_benchmark_tables.sh @@ -22,11 +22,11 @@ that matches the value set for the --data-dir-name argument." SCRIPT_EXAMPLE_ARGS="-b tpch -s my_tpch_sf100 -d sf100" +source ../../scripts/helper_function.sh source ./setup_benchmark_helper_check_instance_and_parse_args.sh if [[ ! 
-d ${PRESTO_DATA_DIR}/${DATA_DIR_NAME} ]]; then - echo "Error: Benchmark data must already exist inside: ${PRESTO_DATA_DIR}/${DATA_DIR_NAME}" - exit 1 + echo_error "Error: Benchmark data must already exist inside: ${PRESTO_DATA_DIR}/${DATA_DIR_NAME}" fi SCHEMA_GEN_SCRIPT_PATH=$(readlink -f ../../benchmark_data_tools/generate_table_schemas.py) diff --git a/presto/tests/setup_benchmark_data_and_tables.bats b/presto/tests/setup_benchmark_data_and_tables.bats new file mode 100644 index 00000000..79f3fc7f --- /dev/null +++ b/presto/tests/setup_benchmark_data_and_tables.bats @@ -0,0 +1,119 @@ +#!/usr/bin/env bats + +# Tests for presto/scripts/setup_benchmark_data_and_tables.sh + +setup() { + export TEST_ROOT="$BATS_TEST_TMPDIR/work" + export LOG_DIR="$BATS_TEST_TMPDIR/logs" + mkdir -p "$TEST_ROOT/velox-testing/scripts" "$TEST_ROOT/velox-testing/presto/scripts" \ + "$TEST_ROOT/velox-testing/benchmark_data_tools" "$LOG_DIR" "$BATS_TEST_TMPDIR/stubs" + + # Copy needed helpers verbatim + cp -f "$BATS_TEST_DIRNAME/../../scripts/helper_function.sh" \ + "$TEST_ROOT/velox-testing/scripts/helper_function.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/common_functions.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/common_functions.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/setup_benchmark_helper_check_instance_and_parse_args.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/setup_benchmark_data_and_tables.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_data_and_tables.sh" + + # Stubs: curl and jq to satisfy wait_for_worker_node_registration() + cat > "$BATS_TEST_TMPDIR/stubs/curl" <<'SH' +#!/usr/bin/env bash +# Simulate successful response and write a non-empty JSON array +for ((i=1; i<=$#; i++)); do + if [[ "${!i}" == "-o" ]]; then + outvar=$((i+1)); out=${!outvar}; echo '[1]' > "$out"; break + fi +done +exit 0 +SH + chmod +x "$BATS_TEST_TMPDIR/stubs/curl" + + cat > "$BATS_TEST_TMPDIR/stubs/jq" <<'SH' +#!/usr/bin/env bash +# Return length 1 to indicate non-empty +echo 1 +SH + chmod +x "$BATS_TEST_TMPDIR/stubs/jq" + + # Stub: ../../scripts/run_py_script.sh (relative to presto/scripts) + cat > "$TEST_ROOT/velox-testing/scripts/run_py_script.sh" <<'SH' +#!/usr/bin/env bash +set -euo pipefail +echo "[run_py_script] $*" >> "${RUN_LOG:?}" +exit 0 +SH + chmod +x "$TEST_ROOT/velox-testing/scripts/run_py_script.sh" + + # Stub: ./setup_benchmark_tables.sh (same dir as script) + cat > "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_tables.sh" <<'SH' +#!/usr/bin/env bash +set -euo pipefail +echo "[setup_benchmark_tables] $*" >> "${RUN_LOG:?}" +exit 0 +SH + chmod +x "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_tables.sh" + + export PATH="$BATS_TEST_TMPDIR/stubs:$PATH" + export PRESTO_DATA_DIR="$BATS_TEST_TMPDIR/presto_data" + mkdir -p "$PRESTO_DATA_DIR" + export RUN_LOG="$LOG_DIR/calls.log" + : > "$RUN_LOG" +} + +@test "prints help with --help" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_data_and_tables.sh --help + [ "$status" -eq 0 ] + [[ "$output" == *"Usage:"* ]] +} + +@test "fails when PRESTO_DATA_DIR is unset" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run env -u PRESTO_DATA_DIR ./setup_benchmark_data_and_tables.sh -h + [ "$status" -ne 0 ] + [[ "$output" == *"PRESTO_DATA_DIR must be set"* ]] +} + +@test "validates required args and invokes downstream scripts (tpch)" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run 
./setup_benchmark_data_and_tables.sh -b tpch -s my_schema -d sf100 -f 100 -c + [ "$status" -eq 0 ] + + # Check run_py_script was called with expected args (order-sensitive subset) + run grep -F "[run_py_script] -p" "$RUN_LOG" + [ "$status" -eq 0 ] + [[ "$output" == *"--benchmark-type tpch"* ]] + [[ "$output" == *"--data-dir-path ${PRESTO_DATA_DIR}/sf100"* ]] + [[ "$output" == *"--scale-factor 100"* ]] + [[ "$output" == *"--convert-decimals-to-floats"* ]] + + # Check setup_benchmark_tables invoked with expected args + run grep -F "[setup_benchmark_tables]" "$RUN_LOG" + [ "$status" -eq 0 ] + [[ "$output" == *"-b tpch"* ]] + [[ "$output" == *"-s my_schema"* ]] + [[ "$output" == *"-d sf100"* ]] +} + +@test "rejects invalid benchmark type" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_data_and_tables.sh -b foo -s s -d d -f 1 + [ "$status" -ne 0 ] + [[ "$output" == *"A valid benchmark type"* ]] +} + +@test "requires schema and data dir" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_data_and_tables.sh -b tpch -d sf1 -f 1 + [ "$status" -ne 0 ] + [[ "$output" == *"Schema name is required"* ]] + + run ./setup_benchmark_data_and_tables.sh -b tpch -s myschema -f 1 + [ "$status" -ne 0 ] + [[ "$output" == *"Data directory name is required"* ]] +} + + diff --git a/presto/tests/setup_benchmark_tables.bats b/presto/tests/setup_benchmark_tables.bats new file mode 100644 index 00000000..1efb6a3e --- /dev/null +++ b/presto/tests/setup_benchmark_tables.bats @@ -0,0 +1,94 @@ +#!/usr/bin/env bats + +# Tests for presto/scripts/setup_benchmark_tables.sh + +setup() { + export TEST_ROOT="$BATS_TEST_TMPDIR/work" + export LOG_DIR="$BATS_TEST_TMPDIR/logs" + mkdir -p "$TEST_ROOT/velox-testing/scripts" "$TEST_ROOT/velox-testing/presto/scripts" \ + "$TEST_ROOT/velox-testing/benchmark_data_tools" "$LOG_DIR" "$BATS_TEST_TMPDIR/stubs" + + # Copy needed helpers + cp -f "$BATS_TEST_DIRNAME/../../scripts/helper_function.sh" \ + "$TEST_ROOT/velox-testing/scripts/helper_function.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/common_functions.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/common_functions.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/setup_benchmark_helper_check_instance_and_parse_args.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh" + cp -f "$BATS_TEST_DIRNAME/../scripts/setup_benchmark_tables.sh" \ + "$TEST_ROOT/velox-testing/presto/scripts/setup_benchmark_tables.sh" + + # Stubs for curl/jq + cat > "$BATS_TEST_TMPDIR/stubs/curl" <<'SH' +#!/usr/bin/env bash +for ((i=1; i<=$#; i++)); do + if [[ "${!i}" == "-o" ]]; then outvar=$((i+1)); out=${!outvar}; echo '[1]' > "$out"; break; fi +done +exit 0 +SH + chmod +x "$BATS_TEST_TMPDIR/stubs/curl" + echo -e '#!/usr/bin/env bash\necho 1' > "$BATS_TEST_TMPDIR/stubs/jq" + chmod +x "$BATS_TEST_TMPDIR/stubs/jq" + + # Stub run_py_script to log calls + cat > "$TEST_ROOT/velox-testing/scripts/run_py_script.sh" <<'SH' +#!/usr/bin/env bash +set -euo pipefail +echo "[run_py_script] $*" >> "${RUN_LOG:?}" +exit 0 +SH + chmod +x "$TEST_ROOT/velox-testing/scripts/run_py_script.sh" + + # Create expected python scripts/requirements so readlink -f succeeds + mkdir -p "$TEST_ROOT/velox-testing/benchmark_data_tools" + mkdir -p "$TEST_ROOT/velox-testing/presto/testing/integration_tests" + echo '#!/usr/bin/env python3' > "$TEST_ROOT/velox-testing/benchmark_data_tools/generate_table_schemas.py" + echo '#!/usr/bin/env python3' > 
"$TEST_ROOT/velox-testing/presto/testing/integration_tests/create_hive_tables.py" + echo '# requirements' > "$TEST_ROOT/velox-testing/presto/testing/requirements.txt" + + export PATH="$BATS_TEST_TMPDIR/stubs:$PATH" + export PRESTO_DATA_DIR="$BATS_TEST_TMPDIR/presto_data" + mkdir -p "$PRESTO_DATA_DIR/sf10" + export RUN_LOG="$LOG_DIR/calls.log" + : > "$RUN_LOG" +} + +@test "prints help with --help" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_tables.sh --help + [ "$status" -eq 0 ] + [[ "$output" == *"Usage:"* ]] +} + +@test "fails if data dir does not exist" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_tables.sh -b tpch -s myschema -d missing + [ "$status" -ne 0 ] + [[ "$output" == *"Benchmark data must already exist"* ]] +} + +@test "invokes schema and create tables scripts with expected args" { + cd "$TEST_ROOT/velox-testing/presto/scripts" + run ./setup_benchmark_tables.sh -b tpch -s myschema -d sf10 + [ "$status" -eq 0 ] + run grep -F "[run_py_script] -p" "$RUN_LOG" + [ "$status" -eq 0 ] + # Two invocations expected (schema gen, create tables) + run grep -F "[run_py_script] -p" "$RUN_LOG" + [ "$status" -eq 0 ] + + run grep -c "\[run_py_script] -p" "$RUN_LOG" + [ "$status" -eq 0 ] + [ "$output" -eq 2 ] + + # Check presence of key args in logs (no redirections inside `run`) + run grep -F -- "--benchmark-type tpch" "$RUN_LOG" + [ "$status" -eq 0 ] + + run grep -F -- "--schema-name myschema" "$RUN_LOG" + [ "$status" -eq 0 ] + + run grep -F -- "--data-dir-name sf10" "$RUN_LOG" +} + + diff --git a/scripts/helper_function.sh b/scripts/helper_function.sh new file mode 100644 index 00000000..e42f587b --- /dev/null +++ b/scripts/helper_function.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# --- Print error messages in red ---
+function echo_error() {
+    echo -e "\033[0;31m$*\033[0m" >&2
+    exit 1
+}
+
+# --- Print warning messages in yellow ---
+function echo_warning() {
+    echo -e "\033[0;33m$*\033[0m" >&2
+}
+
+# --- Print success messages in green ---
+function echo_success() {
+    echo -e "\033[0;32m$*\033[0m"
+}
diff --git a/scripts/tests/run_py_script.bats b/scripts/tests/run_py_script.bats
new file mode 100644
index 00000000..afc01dba
--- /dev/null
+++ b/scripts/tests/run_py_script.bats
@@ -0,0 +1,67 @@
+#!/usr/bin/env bats
+
+# Tests for scripts/run_py_script.sh
+
+setup() {
+    export TEST_ROOT="$BATS_TEST_TMPDIR/work"
+    mkdir -p "$TEST_ROOT/velox-testing/scripts" "$BATS_TEST_TMPDIR/stubs"
+    cp -f "$BATS_TEST_DIRNAME/../run_py_script.sh" "$TEST_ROOT/velox-testing/scripts/run_py_script.sh"
+
+    # Stub py_env_functions to avoid real venv work
+    cat > "$TEST_ROOT/velox-testing/scripts/py_env_functions.sh" <<'SH'
+#!/usr/bin/env bash
+init_python_virtual_env(){ echo "init_python_virtual_env"; }
+delete_python_virtual_env(){ echo "delete_python_virtual_env"; }
+SH
+
+    # Stub pip and python
+    cat > "$BATS_TEST_TMPDIR/stubs/pip" <<'SH'
+#!/usr/bin/env bash
+echo "pip $*" >> "${RUN_LOG:?}"
+exit 0
+SH
+    chmod +x "$BATS_TEST_TMPDIR/stubs/pip"
+
+    cat > "$BATS_TEST_TMPDIR/stubs/python" <<'SH'
+#!/usr/bin/env bash
+echo "python $*" >> "${RUN_LOG:?}"
+exit 0
+SH
+    chmod +x "$BATS_TEST_TMPDIR/stubs/python"
+
+    export PATH="$BATS_TEST_TMPDIR/stubs:$PATH"
+    export RUN_LOG="$BATS_TEST_TMPDIR/calls.log"
+    : > "$RUN_LOG"
+}
+
+@test "prints help with --help" {
+    cd "$TEST_ROOT/velox-testing/scripts"
+    run ./run_py_script.sh --help
+    [ "$status" -eq 0 ]
+    [[ "$output" == *"Usage:"* ]]
+}
+
+@test "requires --python-script-path" {
+    cd "$TEST_ROOT/velox-testing/scripts"
+    run ./run_py_script.sh
+    [ "$status" -ne 0 ]
+    [[ "$output" == *"--python-script-path must be set"* ]]
+}
+
+@test "uses default requirements path and passes through args" {
+    cd "$TEST_ROOT/velox-testing/scripts"
+    # Create a dummy python script and requirements beside it
+    mkdir -p "$BATS_TEST_TMPDIR/py"
+    echo -e "print('ok')" > "$BATS_TEST_TMPDIR/py/tool.py"
+    echo -e "# requirements" > "$BATS_TEST_TMPDIR/py/requirements.txt"
+    run ./run_py_script.sh -p "$BATS_TEST_TMPDIR/py/tool.py" --foo bar --baz
+    [ "$status" -eq 0 ]
+    run grep -F "pip install -q -r $BATS_TEST_TMPDIR/py/requirements.txt" "$RUN_LOG"
+    [ "$status" -eq 0 ]
+    run grep -F "python $BATS_TEST_TMPDIR/py/tool.py --foo bar --baz" "$RUN_LOG"
+    [ "$status" -eq 0 ]
+}
+
+
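
Note: one way to run the suites added by this patch locally, assuming pytest and bats-core are installed and the commands are run from the repo root, is:

    cd benchmark_data_tools && pytest      # picks up pytest.ini (-ra --durations=10)
    bats presto/tests scripts/tests        # bats suites for the shell scripts

Tests that depend on the DuckDB tpch/tpcds extensions or on pyarrow skip themselves when those are unavailable.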