2 changes: 1 addition & 1 deletion docs/source/tutorials/create-a-new-workflow.md
@@ -286,7 +286,7 @@ uv pip install -e examples/documentation_guides/workflows/text_file_ingest
Run the workflow with the following command:
```bash
nat run --config_file examples/documentation_guides/workflows/text_file_ingest/configs/config.yml \
--input "What does DOCA GPUNetIO to remove the CPU from the critical path?"
--input "What does DOCA GPUNetIO do to remove the CPU from the critical path?"
```

If successful, you should receive output similar to the following:
@@ -94,4 +94,6 @@ async def test_token_usage_tool(df_path: Path):
@pytest.mark.usefixtures("nvidia_api_key")
async def test_full_workflow():
config_file: Path = locate_example_config(ProfilerAgentConfig)
await run_workflow(config_file, "Is the product of 33 * 4 greater than the current hour of the day?", "yes")
await run_workflow(config_file=config_file,
question="Is the product of 33 * 4 greater than the current hour of the day?",
expected_answer="yes")
4 changes: 2 additions & 2 deletions examples/agents/tests/test_agents.py
@@ -62,7 +62,7 @@ def rewoo_answer_fixture(request: pytest.FixtureRequest, rewoo_data: list[dict])
indirect=True)
async def test_rewoo_full_workflow(rewoo_question: str, rewoo_answer: str):
config_file = os.path.join(AGENTS_DIR, "rewoo/configs/config.yml")
await run_workflow(config_file, rewoo_question, rewoo_answer)
await run_workflow(config_file=config_file, question=rewoo_question, expected_answer=rewoo_answer)


@pytest.mark.slow
@@ -79,4 +79,4 @@ async def test_rewoo_full_workflow(rewoo_question: str, rewoo_answer: str):
],
ids=["mixture_of_agents", "react", "react-reasoning", "tool_calling", "tool_calling-reasoning"])
async def test_agent_full_workflow(config_file: str, question: str, answer: str):
await run_workflow(config_file, question, answer)
await run_workflow(config_file=config_file, question=question, expected_answer=answer)
@@ -37,4 +37,4 @@ async def test_full_workflow(milvus_uri: str) -> None:
config.retrievers['retriever'].uri = HttpUrl(url=milvus_uri)

# Unfortunately the workflow itself returns inconsistent results
await run_workflow(None, "List 5 subspecies of Aardvark?", "Aardvark", config=config)
await run_workflow(config=config, question="List 5 subspecies of Aardvark?", expected_answer="Aardvark")
4 changes: 2 additions & 2 deletions examples/documentation_guides/tests/test_custom_workflow.py
@@ -44,7 +44,7 @@ def answer_fixture() -> str:
@pytest.mark.usefixtures("nvidia_api_key")
async def test_custom_full_workflow(custom_workflow_dir: Path, question: str, answer: str):
config_file = custom_workflow_dir / "custom_config.yml"
await run_workflow(config_file, question, answer)
await run_workflow(config_file=config_file, question=question, expected_answer=answer)


@pytest.mark.slow
@@ -53,4 +53,4 @@ async def test_custom_full_workflow(custom_workflow_dir: Path, question: str, answer: str):
async def test_search_full_workflow(custom_workflow_dir: Path, question: str, answer: str):
# Technically this is the same as the custom workflow test, but it requires a second key
config_file = custom_workflow_dir / "search_config.yml"
await run_workflow(config_file, question, answer)
await run_workflow(config_file=config_file, question=question, expected_answer=answer)
67 changes: 67 additions & 0 deletions examples/documentation_guides/tests/test_text_file_ingest.py
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import sys
from collections.abc import Generator
from pathlib import Path

import pytest

from nat.test.utils import locate_example_config
from nat.test.utils import run_workflow

logger = logging.getLogger(__name__)


@pytest.fixture(name="text_file_ingest_dir", scope="session")
def text_file_ingest_dir_fixture(workflows_dir: Path) -> Path:
text_file_ingest = workflows_dir / "text_file_ingest"
assert text_file_ingest.exists(), f"Could not find text_file_ingest example at {text_file_ingest}"
return text_file_ingest


@pytest.fixture(name="src_dir", scope="session", autouse=True)
def src_dir_fixture(text_file_ingest_dir: Path) -> Path:
src_dir = text_file_ingest_dir / "src"
assert src_dir.exists(), f"Could not find text_file_ingest src at {src_dir}"

return src_dir


@pytest.fixture(name="add_src_dir_to_path", scope="session")
def add_src_dir_to_path_fixture(src_dir: Path) -> Generator[str]:
# Since this is a documentation guide, it is not installed by default, so we need to manually append it to the path
abs_src_dir = str(src_dir.absolute())
if abs_src_dir not in sys.path:
added = True
sys.path.append(abs_src_dir)
else:
added = False

yield abs_src_dir

if added:
sys.path.remove(abs_src_dir)


@pytest.mark.integration
@pytest.mark.usefixtures("nvidia_api_key", "add_src_dir_to_path")
async def test_text_file_ingest_full_workflow():
from text_file_ingest.text_file_ingest_function import TextFileIngestFunctionConfig
config_file = locate_example_config(TextFileIngestFunctionConfig)
await run_workflow(config_file=config_file,
question="What does DOCA GPUNetIO do to remove the CPU from the critical path?",
expected_answer="GPUDirect")
@@ -29,19 +29,6 @@ llms:
_type: nim
model_name: meta/llama-3.1-70b-instruct
temperature: 0.0
nim_rag_eval_llm:
_type: nim
model_name: meta/llama-3.1-70b-instruct
max_tokens: 8
nim_rag_eval_large_llm:
_type: nim
model_name: meta/llama-3.1-70b-instruct
max_tokens: 2048
nim_trajectory_eval_llm:
_type: nim
model_name: meta/llama-3.1-70b-instruct
temperature: 0.0
max_tokens: 1024

embedders:
nv-embedqa-e5-v5:
@@ -54,36 +41,3 @@ workflow:
llm_name: nim_llm
verbose: true
parse_agent_response_max_retries: 3

eval:
general:
output_dir: .tmp/nat/examples/getting_started/simple_web_query/
dataset:
_type: json
file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json
profiler:
fit_model: True

evaluators:
rag_accuracy:
_type: ragas
metric: AnswerAccuracy
llm_name: nim_rag_eval_llm
rag_groundedness:
_type: ragas
metric: ResponseGroundedness
llm_name: nim_rag_eval_llm
rag_relevance:
_type: ragas
metric: ContextRelevance
llm_name: nim_rag_eval_llm
rag_factual_correctness:
_type: ragas
metric:
FactualCorrectness:
kwargs:
mode: precision
llm_name: nim_rag_eval_large_llm # requires more tokens
trajectory:
_type: trajectory
llm_name: nim_trajectory_eval_llm
@@ -0,0 +1,63 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import pytest

from nat.test.utils import locate_example_config
from nat.test.utils import run_workflow

logger = logging.getLogger(__name__)


@pytest.mark.integration
@pytest.mark.usefixtures("nvidia_api_key")
async def test_run_full_workflow():
from nat.runtime.loader import load_config
from nat_email_phishing_analyzer.register import EmailPhishingAnalyzerConfig

config_file: Path = locate_example_config(EmailPhishingAnalyzerConfig)
config = load_config(config_file)

# Unfortunately the workflow itself returns inconsistent results
await run_workflow(
config=config,
question=(
"Dear [Customer], Thank you for your purchase on [Date]. We have processed a refund of $[Amount] to your "
"account. Please provide your account and routing numbers so we can complete the transaction. Thank you, "
"[Your Company]"),
expected_answer="likely")


@pytest.mark.skip(reason="This test gets rate limited potentially issue #842 and does not complete")
@pytest.mark.integration
@pytest.mark.usefixtures("nvidia_api_key", "require_nest_asyncio")
async def test_optimize_full_workflow(capsys):
from nat.data_models.config import Config
from nat.data_models.optimizer import OptimizerRunConfig
from nat.profiler.parameter_optimization.optimizer_runtime import optimize_config
from nat_email_phishing_analyzer.register import EmailPhishingAnalyzerConfig

config_file: Path = locate_example_config(EmailPhishingAnalyzerConfig, "config_optimizer.yml")
config = OptimizerRunConfig(config_file=config_file,
dataset=None,
override=(('eval.general.max_concurrency', '1'), ('optimizer.numeric.n_trials', '1')))
optimized_config = await optimize_config(config)
assert isinstance(optimized_config, Config)
captured_output = capsys.readouterr()

assert "All optimization phases complete" in captured_output.out
@@ -47,6 +47,8 @@ Install this evaluation example:
uv pip install -e examples/evaluation_and_profiling/simple_calculator_eval
```

> **Note**: If you encounter rate limiting (`[429] Too Many Requests`) during evaluation, try lowering the `eval.general.max_concurrency` value (for example, to `1`), either directly in the YAML or on the command line with `--override eval.general.max_concurrency 1`.
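
For reference, a minimal sketch of the same setting applied directly in the evaluation config YAML. Only the `eval.general.max_concurrency` key path is taken from the note above; the surrounding file contents are assumed and omitted here.

```yaml
eval:
  general:
    # Run evaluation requests one at a time to avoid 429 rate-limit errors.
    # Assumed excerpt: the rest of the example's eval config is unchanged.
    max_concurrency: 1
```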

## Run the Workflow

### Running Evaluation
@@ -72,7 +72,7 @@ workflow:

eval:
general:
output_dir: .tmp/nat/examples/getting_started/simple_web_query
output_dir: .tmp/nat/examples/getting_started/simple_calculator
dataset:
_type: json
file_path: examples/getting_started/simple_calculator/data/simple_calculator.json
@@ -0,0 +1,79 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import pytest

from nat.eval.evaluate import EvaluationRun
from nat.eval.evaluate import EvaluationRunConfig
from nat.test.utils import locate_example_config
from nat.test.utils import validate_workflow_output

logger = logging.getLogger(__name__)


@pytest.mark.integration
@pytest.mark.usefixtures("nvidia_api_key")
async def test_eval():
"""
1. nat-eval writes the workflow output to workflow_output.json
2. nat-eval creates a file with scores for each evaluation metric.
3. This test audits -
    a. the generated workflow output file
    b. the presence of the tuneable evaluator output file
"""
import nat_simple_calculator_eval

# Get config dynamically
config_file: Path = locate_example_config(nat_simple_calculator_eval, "config-tunable-rag-eval.yml")

# Create the configuration object for running the evaluation, single rep using config-tunable-rag-eval.yml
# WIP: skip test if eval config is not present
config = EvaluationRunConfig(
config_file=config_file,
dataset=None,
result_json_path="$",
skip_workflow=False,
skip_completed_entries=False,
endpoint=None,
endpoint_timeout=30,
reps=1,
override=(('eval.general.max_concurrency', '1'), ),
)

# Run evaluation
eval_runner = EvaluationRun(config=config)
output = await eval_runner.run_and_evaluate()

# Ensure the workflow was not interrupted
assert not output.workflow_interrupted, "The workflow was interrupted"

# Look for the tuneable_eval_output file
tuneable_eval_output: Path | None = None

for output_file in output.evaluator_output_files:
assert output_file.exists()
output_file_str = str(output_file)
if "tuneable_eval_output" in output_file_str:
tuneable_eval_output = output_file

# Validate the workflow output
assert output.workflow_output_file, "The workflow_output.json file was not created"
validate_workflow_output(output.workflow_output_file)

# Verify that at least one tuneable_eval_output file is present
assert tuneable_eval_output, "Expected output file does not exist"
@@ -19,39 +19,14 @@

import pytest

import nat_simple_web_query_eval
from nat.eval.evaluate import EvaluationRun
from nat.eval.evaluate import EvaluationRunConfig
from nat.test.utils import locate_example_config
from nat.test.utils import validate_workflow_output

logger = logging.getLogger(__name__)


def validate_workflow_output(workflow_output_file: Path):
"""
Validate the contents of the workflow output file.
WIP: output format should be published as a schema and this validation should be done against that schema.
"""
# Ensure the workflow_output.json file was created
assert workflow_output_file.exists(), "The workflow_output.json file was not created"

# Read and validate the workflow_output.json file
try:
with open(workflow_output_file, encoding="utf-8") as f:
result_json = json.load(f)
except json.JSONDecodeError:
pytest.fail("Failed to parse workflow_output.json as valid JSON")

assert isinstance(result_json, list), "The workflow_output.json file is not a list"
assert len(result_json) > 0, "The workflow_output.json file is empty"
assert isinstance(result_json[0], dict), "The workflow_output.json file is not a list of dictionaries"

# Ensure required keys exist
required_keys = ["id", "question", "answer", "generated_answer", "intermediate_steps"]
for key in required_keys:
assert all(item.get(key) for item in result_json), f"The '{key}' key is missing in workflow_output.json"


def validate_rag_accuracy(rag_metric_output_file: Path, score: float):
"""
1. Validate the contents of the rag evaluator output file.
@@ -110,6 +85,8 @@ async def test_eval():
a. the rag accuracy metric
b. the trajectory score (if present)
"""
import nat_simple_web_query_eval

# Get config dynamically
config_file: Path = locate_example_config(nat_simple_web_query_eval, "eval_config.yml")
