
Commit 2f6ff54

TensorRT-LLM import fix, and aot_joint_export exposed as an explicit setting in dynamo.compile
- TRT-LLM installation utilities and test cases added
- Added the option in _compiler.py
- Changes in the TRT-LLM loading tool: removed install_wget, install_unzip, install_mpi
- Further changes to error logging in the TRT-LLM installation tool
- Moved load_tensorrt_llm to dynamo/utils.py
- Corrected a misprint in the TRT-LLM load
- Used a Python library for the download to make it platform agnostic
- Updated the DLL file path for Windows
- Corrected a non-critical lint error
- Included the version in versions.txt
- Linting error fixes and rebase fix
- Removed the Platform enum from converter_utils.py
- Addressed review comments: tmp dir for wheel download and wheel extraction, variable for py_version
- Checks for Windows, where the NCCL backend is not supported
- Added checks for Windows and Jetson devices
- Kept the extracted files and deleted the downloaded file; restructured the test
- Modified the error warning for missing libmpi libs
- Removed redundant initializations
- Added tests in CI
- Corrected the skip-test condition
- Installed MPI libs for Linux x86
- Added SBSA to the supported platforms for TRT-LLM libs and installed MPI libs for the distributed tests
- Used the Python platform package for platform detection
1 parent 530a2cd commit 2f6ff54

File tree

11 files changed: +386 −164 lines


.github/workflows/build-test-linux-aarch64.yml

Lines changed: 35 additions & 0 deletions
@@ -356,6 +356,41 @@ jobs:
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
             popd

+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    if: false
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 33 additions & 0 deletions
@@ -337,6 +337,39 @@ jobs:
             python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
             popd

+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true
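
The new tests-py-distributed jobs above run the NCCL-ops distributed tests with TRT-LLM plugins enabled. Below is a minimal sketch of reproducing roughly the same invocation on a local Linux checkout; it assumes torch_tensorrt is already built and MPI libraries are installed, and the subprocess wrapper itself is illustrative rather than part of this commit.

# Illustrative local reproduction of the CI distributed test step (not from the commit).
# Assumes an existing torch_tensorrt build and system-wide MPI libraries.
import os
import subprocess

env = dict(os.environ)
env.update(
    {
        "USE_HOST_DEPS": "1",       # mirrors the workflow script
        "CI_BUILD": "1",
        "USE_TRTLLM_PLUGINS": "1",  # opt in to the TRT-LLM plugin path
    }
)

# Same test target the workflow runs: tests/py/dynamo/distributed/test_nccl_ops.py
subprocess.run(
    ["python", "-m", "pytest", "-ra", "distributed/test_nccl_ops.py"],
    cwd="tests/py/dynamo",
    env=env,
    check=True,
)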

dev_dep_versions.yml

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 __cuda_version__: "12.8"
 __tensorrt_version__: "10.12.0"
+__tensorrt_llm_version__: "0.17.0.post1"

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 0 deletions
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -176,6 +177,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -339,6 +341,7 @@ def cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     # disable the following settings is not supported for cross compilation for windows feature
@@ -439,6 +442,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -515,6 +519,7 @@ def compile(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -688,6 +693,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
@@ -1051,6 +1057,7 @@ def convert_exported_program_to_serialized_trt_engine(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1114,6 +1121,7 @@ def convert_exported_program_to_serialized_trt_engine(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
     """
@@ -1236,6 +1244,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }

     settings = CompilationSettings(**compilation_options)
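
For reference, the new flag threads through compile(), cross_compile_for_windows(), and convert_exported_program_to_serialized_trt_engine() into CompilationSettings. A minimal, hypothetical usage sketch follows; the toy module, shapes, and device are placeholders and are not taken from the commit.

# Hypothetical example of the new setting; the model and inputs are placeholders.
import torch
import torch_tensorrt

class ToyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)

model = ToyModel().eval().cuda()
inputs = (torch.randn(2, 4, device="cuda"),)
exported = torch.export.export(model, inputs)

# use_distributed_mode_trace routes tracing through aot_autograd; per the added
# docstring it is intended for graphs that carry DTensors / distributed tensors.
trt_gm = torch_tensorrt.dynamo.compile(
    exported,
    inputs=list(inputs),
    use_distributed_mode_trace=True,
)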

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 0 additions & 65 deletions
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -1120,69 +1118,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement


-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops",
-            )
-            return False
-
-        _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}")
-        try:
-            # Load the shared library
-            handle = ctypes.CDLL(plugin_lib_path)
-            _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}")
-        except OSError as e_os_error:
-            _LOGGER.error(
-                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
-                f"Ensure the path is correct and the library is compatible",
-                exc_info=e_os_error,
-            )
-            return False
-
-        try:
-            # Configure plugin initialization arguments
-            handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-            handle.initTrtLlmPlugins.restype = ctypes.c_bool
-        except AttributeError as e_plugin_unavailable:
-            _LOGGER.warning(
-                "Unable to initialize the TensorRT-LLM plugin library",
-                exc_info=e_plugin_unavailable,
-            )
-            return False
-
-        try:
-            # Initialize the plugin
-            TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
-            if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
-                _LOGGER.info("TensorRT-LLM plugin successfully initialized")
-                return True
-            else:
-                _LOGGER.warning("TensorRT-LLM plugin library failed in initialization")
-                return False
-        except Exception as e_initialization_error:
-            _LOGGER.warning(
-                "Exception occurred during TensorRT-LLM plugin library initialization",
-                exc_info=e_initialization_error,
-            )
-            return False
-        return False
-
-
 def promote_trt_tensors_to_same_dtype(
     ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str
 ) -> tuple[TRTTensor, TRTTensor]:
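
Per the commit message, the deleted helper is relocated to py/torch_tensorrt/dynamo/utils.py as load_tensorrt_llm_for_nccl (see the import change in custom_ops_converters.py below). For reference, here is a condensed, standalone sketch of the fallback pattern the removed code implemented: ctypes-loading libnvinfer_plugin_tensorrt_llm.so from TRTLLM_PLUGINS_PATH and calling initTrtLlmPlugins. The path value is a placeholder.

# Condensed restatement of the removed fallback logic (for reference only);
# the plugin path below is a placeholder for a real TensorRT-LLM install.
import ctypes
import os

plugin_lib_path = os.environ.get(
    "TRTLLM_PLUGINS_PATH", "/path/to/libnvinfer_plugin_tensorrt_llm.so"
)

handle = ctypes.CDLL(plugin_lib_path)  # raises OSError if the .so cannot be loaded
handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
handle.initTrtLlmPlugins.restype = ctypes.c_bool

# Returns True when the plugins register under the "tensorrt_llm" namespace.
ok = handle.initTrtLlmPlugins(None, b"tensorrt_llm")
print("TRT-LLM plugins initialized:", bool(ok))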

py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py

Lines changed: 6 additions & 6 deletions
@@ -11,15 +11,15 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     dynamo_tensorrt_converter,
 )
-from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm
+from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
+    tensorrt_fused_nccl_all_gather_op,
+    tensorrt_fused_nccl_reduce_scatter_op,
+)
+from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl

 _LOGGER: logging.Logger = logging.getLogger(__name__)

-if load_tensorrt_llm():
-    from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
-        tensorrt_fused_nccl_all_gather_op,
-        tensorrt_fused_nccl_reduce_scatter_op,
-    )
+if load_tensorrt_llm_for_nccl():

     @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
     def fused_nccl_gather(
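
As the guard in this hunk shows, the fused NCCL converters are now registered only when load_tensorrt_llm_for_nccl() (moved into dynamo/utils.py by this commit) reports success. A small sketch of probing that helper directly follows; it assumes only that the function returns a boolean, as its predecessor load_tensorrt_llm() did.

# Sketch: probe whether TRT-LLM plugins are available before relying on the
# fused NCCL converters. Assumes only a boolean return, matching the removed
# load_tensorrt_llm() helper.
import logging

from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl

logging.basicConfig(level=logging.INFO)

if load_tensorrt_llm_for_nccl():
    print("fused all_gather / reduce_scatter converters will be registered")
else:
    print("TensorRT-LLM plugins unavailable; distributed NCCL converters are skipped")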
