Commit 75b6210

Kaiyu/update main (NVIDIA#5)
* Update
* Update
1 parent 4941ad2 commit 75b6210


421 files changed, +1905283 -1537434 lines changed

README.md (+209 -231)

Large diffs are not rendered by default.

benchmarks/cpp/README.md (+5)

@@ -9,6 +9,11 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
 
+Windows users: Follow the
+[`Windows installation document`](../../../windows/README.md)
+instead, and be sure to set DLL paths as specified in
+[Extra Steps for C++ Runtime Usage](../../../windows/README.md#extra-steps-for-c-runtime-usage).
+
 After that, you can build benchmarking source code for C++ runtime
 ```
 cd cpp/build

benchmarks/cpp/gptManagerBenchmark.cpp (+17 -9)

@@ -275,7 +275,8 @@ class GptServer
     GptServer(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth,
         batch_scheduler::SchedulerPolicy schedulerPolicy, std::optional<int32_t> maxNumSequences,
         std::optional<int32_t> maxTokensInPagedKvCache, std::optional<float> kvCacheFreeGpuMemFraction,
-        std::optional<bool> enableTrtOverlap, std::shared_ptr<Recorder> recorder)
+        std::optional<bool> enableTrtOverlap, std::shared_ptr<Recorder> recorder,
+        std::optional<uint64_t> terminateReqId)
     {
         const TrtGptModelOptionalParams& optionalParams = TrtGptModelOptionalParams(
             maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap);
@@ -285,8 +286,9 @@ class GptServer
             [this](uint64_t requestId, std::list<NamedTensor> response_tensors, bool final_response,
                 const std::string& errMsg)
             { return sendResponse(requestId, response_tensors, final_response, errMsg); },
-            nullptr, nullptr, optionalParams);
+            nullptr, nullptr, optionalParams, terminateReqId);
         mRecorder = recorder;
+        mTerminateReqId = terminateReqId;
     }
 
     ~GptServer()
@@ -298,7 +300,7 @@ class GptServer
     {
         // Create InferenceRequest from a set of tensors
         auto request = std::make_shared<InferenceRequest>(requestId);
-        if (requestId == -1)
+        if (requestId == mTerminateReqId)
         {
             mWorkItemsQueue.push(request, requestId);
             return;
@@ -430,6 +432,7 @@ class GptServer
     std::shared_ptr<GptManager> mBatchManager;
     std::shared_ptr<Recorder> mRecorder;
     WorkItemsQueue mWorkItemsQueue;
+    std::optional<uint64_t> mTerminateReqId;
 
 }; // class GptServer
 
@@ -479,11 +482,7 @@ void benchmarkGptManager(std::string const& modelName, std::filesystem::path con
         TLLM_LOG_ERROR(errStr);
     }
 
-    const int maxBeamWidth = 1;
-    auto recorder = std::make_shared<Recorder>();
-    auto gptServer = std::make_shared<GptServer>(engineDir, modelType, maxBeamWidth, schedulerPolicy, maxNumSequences,
-        maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap, recorder);
-
+    // Load dataset
     auto dataset = parseDataset(datasetPath);
     std::vector<std::vector<NamedTensor>> tensors_list;
     const auto num_samples = dataset.first.size();
@@ -499,6 +498,12 @@ void benchmarkGptManager(std::string const& modelName, std::filesystem::path con
         tensors_list.push_back(tensors);
     }
 
+    const int maxBeamWidth = 1;
+    auto recorder = std::make_shared<Recorder>();
+    uint64_t terminateReqId = num_samples + 1;
+    auto gptServer = std::make_shared<GptServer>(engineDir, modelType, maxBeamWidth, schedulerPolicy, maxNumSequences,
+        maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap, recorder, terminateReqId);
+
     if (worldConfig.getRank() == 0)
     {
         recorder->initialize();
@@ -510,8 +515,11 @@ void benchmarkGptManager(std::string const& modelName, std::filesystem::path con
         recorder->finalize();
         recorder->calculateMetrics();
         recorder->report();
-        gptServer->enqueue({}, -1, false);
+        // Send terminateReqId to terminate servers on all ranks
+        // Sever on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases
+        gptServer->enqueue({}, terminateReqId, false);
     }
+    // Wait until benchmarking is done and batch manager is terminated
     gptServer->waitBatchManager();
 }
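
The terminateReqId plumbing above is a sentinel-shutdown handshake: the benchmark's real samples use at most num_samples distinct request ids, so num_samples + 1 appears to be chosen as an id that cannot collide with them, and enqueueing a request with that id is what tells each rank's server loop to stop. A minimal sketch of the same pattern, written in Python with invented names purely for illustration (this is not the GptManager code):

import queue
import threading

def serve(work_items, terminate_req_id):
    # Consume request ids until the sentinel arrives, then shut down.
    while True:
        request_id = work_items.get()
        if request_id == terminate_req_id:
            break  # sentinel request: no work to do, just stop the serving loop
        # ... schedule and execute the real request here ...

num_samples = 4
terminate_req_id = num_samples + 1  # cannot collide with the per-sample ids

work_items = queue.Queue()
server = threading.Thread(target=serve, args=(work_items, terminate_req_id))
server.start()

for request_id in range(1, num_samples + 1):
    work_items.put(request_id)    # real benchmark requests
work_items.put(terminate_req_id)  # analogous to gptServer->enqueue({}, terminateReqId, false)
server.join()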

benchmarks/cpp/gptSessionBenchmark.cpp (+14 -3)

@@ -36,7 +36,8 @@ namespace
 {
 void benchmarkGptSession(std::string const& modelName, std::filesystem::path const& dataPath,
     std::vector<int> const& batchSizes, std::vector<std::vector<int>> const& inOutLen,
-    std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration, bool cudaGraphMode)
+    std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration,
+    std::optional<SizeType> numMicroBatches, bool cudaGraphMode)
 {
     auto const json = GptJsonConfig::parse(dataPath / "config.json");
     auto const modelConfig = json.getModelConfig();
@@ -73,7 +74,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
 
     for (auto const batchSize : batchSizes)
     {
-        session.setup(batchSize, beamWidth, maxInputLength + maxNewTokens, decoderPerRequest);
+        session.setup(
+            batchSize, beamWidth, maxInputLength + maxNewTokens, decoderPerRequest, std::nullopt, numMicroBatches);
 
         std::vector<SizeType> inputLenghtsHost(batchSize, maxInputLength);
         auto inputLenghts
@@ -163,6 +165,8 @@ int main(int argc, char* argv[])
         cxxopts::value<int>()->default_value("10"));
     options.add_options()("duration", "Minimal duration of iterations to measure in seconds.",
         cxxopts::value<int>()->default_value("60"));
+    options.add_options()(
+        "num_micro_batches", "Number of micro batches if enabling pipeline parallelism.", cxxopts::value<int>());
 
     options.add_options()("enable_cuda_graph", "Execute GPT session with CUDA graph.");
 
@@ -235,6 +239,13 @@ int main(int argc, char* argv[])
         return 1;
     }
 
+    // Argument: Number of micro batches
+    std::optional<SizeType> numMicroBatches{std::nullopt};
+    if (result.count("num_micro_batches"))
+    {
+        numMicroBatches = result["num_micro_batches"].as<int>();
+    }
+
     // Argument: Enable CUDA graph
     auto enableCudaGraph = result.count("enable_cuda_graph") > 0;
 
@@ -244,7 +255,7 @@ int main(int argc, char* argv[])
     {
        benchmarkGptSession(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes,
            inOutLen, logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(),
-            enableCudaGraph);
+            numMicroBatches, enableCudaGraph);
     }
     catch (const std::exception& e)
     {
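
The new --num_micro_batches option matters when the engine is built with pipeline parallelism: splitting each batch into several micro batches lets pipeline stages on different GPUs work on different micro batches concurrently instead of idling. As a rough illustration of the splitting step only (a hypothetical helper in Python, not TensorRT-LLM code), assuming a batch is just a list of requests:

def split_into_micro_batches(batch, num_micro_batches):
    # Split a batch into num_micro_batches roughly equal, contiguous chunks.
    chunk = (len(batch) + num_micro_batches - 1) // num_micro_batches
    return [batch[i:i + chunk] for i in range(0, len(batch), chunk)]

print(split_into_micro_batches(list(range(8)), num_micro_batches=4))
# [[0, 1], [2, 3], [4, 5], [6, 7]]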

benchmarks/python/all_reduce.py (+137)

@@ -0,0 +1,137 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+
+import tensorrt as trt
+import torch
+from cuda import cuda, cudart
+from mpi4py import MPI
+from polygraphy.backend.trt import CreateConfig, EngineFromNetwork
+
+import tensorrt_llm as tllm
+from tensorrt_llm import Mapping, Tensor
+from tensorrt_llm._ipc_utils import IpcMemory, peer_access
+from tensorrt_llm.functional import AllReduceStrategy, allreduce
+
+
+def allreduce_benchmark(dtype: str, test_range: str = "10,10000000,10"):
+    tllm.logger.set_level('error')
+    world_size = tllm.mpi_world_size()
+    rank = tllm.mpi_rank()
+
+    torch.cuda.set_device(rank)
+    cudart.cudaSetDevice(rank)
+
+    mapping = Mapping(world_size, rank, world_size, world_size)
+
+    if world_size == 1:
+        raise RuntimeError("Benchmark must run with mpi_world_size > 1")
+
+    ipc_barriers_in = IpcMemory(
+        mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size)
+    ipc_barriers_out = IpcMemory(
+        mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size)
+    torch_dtype = tllm._utils.str_dtype_to_torch(dtype)
+
+    min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
+    inner_loop = 1000
+
+    size = min_size
+    while size < max_size:
+        ipc_buffers = IpcMemory(mapping, size * 4)
+        workspace = torch.tensor(ipc_buffers.serialize() +
+                                 ipc_barriers_in.serialize() +
+                                 ipc_barriers_out.serialize(),
+                                 dtype=torch.int64,
+                                 device="cpu")
+
+        input = torch.zeros(size, dtype=torch_dtype, device="cuda")
+
+        for strategy in [
+                AllReduceStrategy.RING, AllReduceStrategy.ONESHOT,
+                AllReduceStrategy.TWOSHOT
+        ]:
+            builder = tllm.Builder()
+            net = builder.create_network()
+            net.plugin_config.set_nccl_plugin(dtype)
+
+            with tllm.net_guard(net):
+                network = tllm.default_trtnet()
+
+                x = Tensor(name='x',
+                           shape=input.shape,
+                           dtype=tllm.str_dtype_to_trt(dtype))
+
+                w = Tensor(name='workspace',
+                           shape=workspace.shape,
+                           dtype=trt.int64)
+
+                current = x
+                for i in range(inner_loop):
+                    current = allreduce(
+                        current, mapping.tp_group,
+                        w if strategy != AllReduceStrategy.RING else None, i,
+                        strategy)
+                output = current.trt_tensor
+
+                output.name = 'output'
+                output.dtype = tllm.str_dtype_to_trt(dtype)
+                network.mark_output(output)
+
+            build_engine = EngineFromNetwork(
+                (builder.trt_builder, net.trt_network),
+                config=CreateConfig(
+                    fp16=(dtype == 'float16'),
+                    bf16=(dtype == 'bfloat16'),
+                    precision_constraints='obey',
+                ))
+
+            output = torch.zeros_like(input)
+
+            stream = torch.cuda.current_stream()
+            feed_dict = {'x': input, 'workspace': workspace}
+
+            session = tllm.runtime.Session.from_engine(build_engine())
+            _, start = cuda.cuEventCreate(0)
+            _, stop = cuda.cuEventCreate(0)
+            with peer_access(mapping):
+                MPI.COMM_WORLD.barrier()
+
+                cuda.cuEventRecord(start, stream.cuda_stream)
+                session.run(inputs=feed_dict,
+                            outputs={"output": output},
+                            stream=stream.cuda_stream)
+                cuda.cuEventRecord(stop, stream.cuda_stream)
+                torch.cuda.synchronize()
+                _, ms = cuda.cuEventElapsedTime(start, stop)
+
+            if mapping.rank == 0:
+                print(f"{size=}, {strategy=}, {ms=}")
+        size *= ratio
+        if mapping.rank == 0:
+            print("")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--dtype", "-t", default="float16")
+    parser.add_argument("--range",
+                        "-r",
+                        default="256,25600000,10",
+                        help="min_size,max_size,multiplicative_ratio")
+    args = parser.parse_args()
+
+    allreduce_benchmark(args.dtype, args.range)
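
The --range argument ("min_size,max_size,multiplicative_ratio") drives the sweep in allreduce_benchmark: the message size starts at min_size elements and is multiplied by ratio until it reaches max_size. A standalone sketch of the same loop, showing the sizes the default 256,25600000,10 visits:

min_size, max_size, ratio = (int(i) for i in "256,25600000,10".split(","))
sizes = []
size = min_size
while size < max_size:
    sizes.append(size)
    size *= ratio
print(sizes)  # [256, 2560, 25600, 256000, 2560000]

Since the script raises an error unless tllm.mpi_world_size() > 1, it is evidently meant to be started through an MPI launcher, e.g. something along the lines of mpirun -n 2 python benchmarks/python/all_reduce.py --dtype float16; the exact launch command is an assumption, not part of this commit.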

benchmarks/python/benchmark.py (+10 -3)

@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+import multiprocessing as mp
 from multiprocessing import Process, Queue
 from time import time
 
@@ -38,11 +39,12 @@ def parse_arguments():
         '--mode',
         type=str,
         default="plugin",
-        choices=['ootb', 'plugin'],
+        choices=['ootb', 'plugin', 'ootb-except-mha'],
         help=
         ('Choose mode between ootb/plugin. '
          '\"ootb\" means the engines will be built without any plugins, '
-         'while \"plugin\" means the engines will be built with tuned recipe of using plugins.'
+         '\"plugin\" means the engines will be built with tuned recipe of using plugins.'
+         '\"ootb-except-mha\" means the engines will be built with only attention plugins.'
         ))
 
     parser.add_argument('--batch_size',
@@ -298,12 +300,16 @@ def main(args):
             )
 
     except Exception as e:
+        print("Found exception during benchmarking", e.with_traceback())
         p.kill()
         raise e
-
+    logger.debug("Sending signal to mem monitor process, start")
     q1.put(1)
+    logger.debug("Sending signal to mem monitor process, done")
     peak_gpu_used = q2.get()
+    logger.debug("Get peak gpu memory usage from mem monitor process, done")
     p.join()
+    logger.debug("Memory monitor process joined")
 
     latency = round(sum(latencies) / iter_idx, 3)
     latencies.sort()
@@ -318,5 +324,6 @@ def main(args):
 
 
 if __name__ == '__main__':
+    mp.set_start_method('spawn')
     args = parse_arguments()
     main(args)
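
The mp.set_start_method('spawn') call pairs with the Process/Queue memory monitor used in main(): once the parent process has initialized CUDA, a fork-started child can inherit an unusable CUDA context, whereas spawn starts the child in a fresh interpreter. A stripped-down sketch of that monitor pattern, with illustrative names only (not the repository's monitor code):

import multiprocessing as mp
import time

def monitor(q_stop, q_result):
    peak = 0
    while q_stop.empty():    # run until the parent signals us to stop
        peak = max(peak, 0)  # a real monitor would poll GPU memory here
        time.sleep(0.1)
    q_result.put(peak)       # hand the measurement back to the parent

if __name__ == '__main__':
    mp.set_start_method('spawn')  # child starts clean, no inherited CUDA state
    q_stop, q_result = mp.Queue(), mp.Queue()
    p = mp.Process(target=monitor, args=(q_stop, q_result))
    p.start()
    # ... run the benchmarked workload in the parent process ...
    q_stop.put(1)            # mirrors q1.put(1) above
    peak = q_result.get()    # mirrors peak_gpu_used = q2.get()
    p.join()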

benchmarks/python/gpt_benchmark.py (+14 -14)

@@ -57,7 +57,7 @@ def __init__(self,
         self.refit = refit
         self.num_beams = num_beams
         self.build_time = 0
-        self.mode = mode  # plugin or ootb
+        self.mode = mode  # plugin or ootb or ootb-except-mha
         self.fuse_bias = True
 
         self.cuda_graph_mode = kwargs.get('enable_cuda_graph', False)
@@ -83,17 +83,20 @@ def __init__(self,
         self.per_token = False
         self.per_channel = False
 
-        is_plugin_mode = mode == 'plugin'
-        plg_dtype = dtype if is_plugin_mode else False
-        self.use_gpt_attention_plugin = plg_dtype
-        self.use_gemm_plugin = plg_dtype
+        use_mha_plugin = mode == 'plugin' or mode == 'ootb-except-mha'
+        mha_plg_dtype = dtype if use_mha_plugin else False
+        use_non_mha_plugin = mode == 'plugin'
+        non_mha_plg_dtype = dtype if use_mha_plugin else False
+
+        self.use_gpt_attention_plugin = mha_plg_dtype
+        self.use_gemm_plugin = non_mha_plg_dtype
         # Starting TRT9.1 OOTB norm layer sees improvement over plugin norm layer
         self.use_layernorm_plugin = False
         self.use_rmsnorm_plugin = False
-        self.use_lookup_plugin = plg_dtype
-        self.enable_context_fmha = True
+        self.use_lookup_plugin = non_mha_plg_dtype
+        self.enable_context_fmha = use_mha_plugin
         self.quant_mode = QuantMode(0)
-        self.remove_input_padding = is_plugin_mode
+        self.remove_input_padding = use_non_mha_plugin
 
         for key, value in get_build_config(model_name).items():
             setattr(self, key, value)
@@ -135,8 +138,6 @@ def __init__(self,
             self.quant_mode = self.quant_mode.set_fp8_qdq()
 
         if self.fp8_kv_cache:
-            # Watch out, enable_fp8 and fp8_kv_cache are not exclusive
-            assert self.use_gpt_attention_plugin, "GPT attention plugin needed"
             self.quant_mode = self.quant_mode.set_fp8_kv_cache()
 
         engine_buffer = self.build()
@@ -151,7 +152,9 @@ def __init__(self,
             num_layers=self.num_layers,
             gpt_attention_plugin=self.use_gpt_attention_plugin,
             remove_input_padding=self.remove_input_padding,
-            quant_mode=self.quant_mode)
+            quant_mode=self.quant_mode,
+            use_custom_all_reduce=self.enable_custom_all_reduce,
+        )
         if model_name == 'chatglm_6b':
             self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
                 end_id=130005,
@@ -392,9 +395,6 @@ def build(self):
             network.plugin_config.set_smooth_quant_gemm_plugin(dtype=self.dtype)
             network.plugin_config.set_layernorm_quantization_plugin(
                 dtype=self.dtype)
-            # FIXME(nkorobov)
-            # See https://nvbugs/4164762
-            # See https://nvbugs/4174113
             network.plugin_config.set_quantize_tensor_plugin()
             network.plugin_config.set_quantize_per_token_plugin()
         elif self.use_weight_only:
