Commit b777bd6

kaiyux, RunningLeon, Tlntin, JamesTheZ, and ngoanpv authored
Update TensorRT-LLM (NVIDIA#1725)
* Update TensorRT-LLM

---------

Co-authored-by: RunningLeon <[email protected]>
Co-authored-by: Tlntin <[email protected]>
Co-authored-by: ZHENG, Zhen <[email protected]>
Co-authored-by: Pham Van Ngoan <[email protected]>
Co-authored-by: Nathan Price <[email protected]>
Co-authored-by: Tushar Goel <[email protected]>
Co-authored-by: Mati <[email protected]>
1 parent f430a4b commit b777bd6

File tree: 368 files changed, +21445 −8977 lines


.gitignore

+1

@@ -6,6 +6,7 @@ __pycache__/
 *.nsys-rep
 .VSCodeCounter
 build*/
+!builders/
 *.egg-info/
 .coverage
 *.onnx

.pre-commit-config.yaml

+1 −1

@@ -46,5 +46,5 @@ repos:
         args:
           - --skip=".git,3rdparty"
           - --exclude-file=examples/whisper/tokenizer.py
-          - --ignore-words-list=rouge,inout,atleast,strat,nd
+          - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile
         exclude: 'tests/llm-test-defs/turtle/test_input_files'

README.md

+3

@@ -75,3 +75,6 @@ To get started with TensorRT-LLM, visit our documentation:
 - [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
 - [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
 - [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)
+
+## Community
+- [Model zoo](https://huggingface.co/TheFloat16) (generated by TRT-LLM rel 0.9 a9356d4b7610330e89c1010f342a9ac644215c52)

benchmarks/cpp/README.md

+6 −8

@@ -210,8 +210,10 @@ TP=2
 PP=1
 MAX_LEN=1024
 MAX_BATCH=32
-MAX_LORA_RANK=32
+NUM_LAYERS=40
+MAX_LORA_RANK=64
 NUM_LORA_MODS=7
+EOS_ID=2

 SOURCE_LORA=chinese-llama-2-lora-13b
 CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -234,7 +236,7 @@ ${HOME}/.local/bin/trtllm-build \
     --gemm_plugin float16 \
     --lora_plugin float16 \
     --use_paged_context_fmha enable \
-    --lora_target_modules attn_qkv \
+    --lora_target_modules attn_q attn_k attn_v attn_dense mlp_h_to_4h mlp_4h_to_h mlp_gate \
     --max_lora_rank ${MAX_LORA_RANK}

 NUM_LORAS=(8 16 24 32 64 128 256)
@@ -252,8 +254,6 @@ mkdir -p $EG_DIR/data
 # Prepare dataset without lora_task_id
 python benchmarks/cpp/prepare_dataset.py \
     --output "${EG_DIR}/data/token-norm-dist.json" \
-    --request-rate -1 \
-    --time-delay-dist constant \
     --tokenizer $TOKENIZER \
     token-norm-dist \
     --num-requests $NUM_REQUESTS \
@@ -263,8 +263,6 @@ python benchmarks/cpp/prepare_dataset.py \
 for nloras in ${NUM_LORAS[@]}; do
     python benchmarks/cpp/prepare_dataset.py \
         --output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
-        --request-rate -1 \
-        --time-delay-dist constant \
         --rand-task-id 0 $(( $nloras - 1 )) \
         --tokenizer $TOKENIZER \
         token-norm-dist \
@@ -292,7 +290,7 @@ mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \

 # Now run inference with various numbers or loras
 # The host cache is set large enough to hold all the LoRAs in lora_dir
-# GPU cache is set to hold 32 LoRAs
+# GPU cache is set to hold 16 LoRAs
 # This benchmark will preload all the LoRAs into the host cache
 # We run inference on a range of active LoRAs exercising different cache miss rates.
 for nloras in ${NUM_LORAS[@]}; do
@@ -303,7 +301,7 @@ for nloras in ${NUM_LORAS[@]}; do
     --type IFB \
     --dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
     --lora_host_cache_bytes 8589934592 \
-    --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+    --lora_num_device_mod_layers $(( 16 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
     --kv_cache_free_gpu_mem_fraction 0.80 \
     --log_level info \
     --eos_id ${EOS_ID} \
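
For readers following the updated LoRA benchmark settings above, here is a minimal sketch (not part of the commit) of the arithmetic behind `--lora_num_device_mod_layers`, using the values the script now sets (NUM_LAYERS=40, NUM_LORA_MODS=7, MAX_LORA_RANK=64); the helper name is mine.

```python
# Hypothetical helper mirroring the shell arithmetic in the benchmark command above.
def lora_device_mod_layers(num_loras_cached: int,
                           num_layers: int = 40,
                           num_lora_mods: int = 7,
                           max_lora_rank: int = 64) -> int:
    """LoRA (module x layer x rank) slots needed to keep `num_loras_cached` adapters on the GPU."""
    return num_loras_cached * num_layers * num_lora_mods * max_lora_rank

print(lora_device_mod_layers(16))  # GPU cache sized for 16 LoRAs: 16 * 40 * 7 * 64 = 286720
```

With the multiplier reduced from 32 to 16, the device cache holds at most 16 adapters, so the runs with more active LoRAs exercise higher cache-miss rates, as the README comments describe.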

benchmarks/cpp/gptManagerBenchmark.cpp

+26 −6

@@ -458,10 +458,6 @@ class Recorder
     {
         this->recordEnd(requestId, hasError);

-        if (mRespJsonFile.empty())
-            return;
-        int32_t outputSeqLen;
-
         for (auto& tensor : responseTensors)
         {
             if (tensor.name == inference_request::kOutputIdsTensorName)
@@ -471,7 +467,7 @@ class Recorder
             else if (tensor.name == inference_request::kSequenceLengthTensorName)
             {
                 // Tensor of shape nBeams, and we only need the first one
-                outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
+                int32_t outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
                 if (mOutputHasInput)
                 {
                     int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
@@ -482,6 +478,30 @@
             }
         }
     }

+    void recordEnd(uint64_t requestId, texec::Response const& response)
+    {
+
+        this->recordEnd(requestId, response.hasError());
+
+        // Get the actual output length
+        if (!response.hasError())
+        {
+            auto outputTokenIds = response.getResult().outputTokenIds;
+
+            int32_t outSeqLen = 0;
+            for (auto const& beam : outputTokenIds)
+            {
+                outSeqLen = std::max(static_cast<int32_t>(beam.size()), outSeqLen);
+            }
+            if (mOutputHasInput)
+            {
+                int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
+                outSeqLen -= inputSeqLen;
+            }
+            mRequestBenchInfos[requestId].outputLength = outSeqLen;
+        }
+    }
+
     float calcPercentile(std::vector<float> const& latencies, int percentile)
     {
         int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
@@ -827,7 +847,7 @@ class ExecutorServer
                 numFinished++;
                 if (!warmup)
                 {
-                    mRecorder->recordEnd(reqId, response.hasError());
+                    mRecorder->recordEnd(reqId, response);
                 }
             }
         }
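
As a plain-language companion to the new `recordEnd(uint64_t, texec::Response const&)` overload above, here is a small Python restatement (mine, not the benchmark code) of its output-length accounting: the longest beam determines the reported length, and the prompt length is subtracted when the output echoes the input.

```python
def output_length(output_token_ids, input_length, output_has_input):
    # Longest beam decides the reported output length, as in the C++ overload above.
    out_seq_len = max((len(beam) for beam in output_token_ids), default=0)
    if output_has_input:
        # Outputs that echo the prompt have the prompt length removed.
        out_seq_len -= input_length
    return out_seq_len

# Two beams, 3-token prompt echoed in the output -> reported length 2
print(output_length([[1, 2, 3, 4, 5], [1, 2, 3, 4]], input_length=3, output_has_input=True))
```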

benchmarks/cpp/gptSessionBenchmark.cpp

+3

@@ -34,6 +34,7 @@
 #include <NvInfer.h>
 #include <atomic>
 #include <chrono>
+#include <cuda_profiler_api.h>
 #include <cxxopts.hpp>
 #include <future>
 #include <sstream>
@@ -213,6 +214,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
     std::vector<float> latencies;
     std::vector<float> generationTimes;
     auto generationProfiler = std::make_shared<GptSession::GenerationProfiler>();
+    cudaProfilerStart();
     while (iterIdx < numRuns)
     {
         auto const start = std::chrono::steady_clock::now();
@@ -242,6 +244,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             break;
         }
     }
+    cudaProfilerStop();

     TLLM_LOG_INFO(memoryCounter.toString());
     done = true;

benchmarks/python/benchmark.py

−4

@@ -198,10 +198,6 @@ def parse_arguments():
         help=
         'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
     )
-    parser.add_argument('--strongly_typed',
-                        default=False,
-                        action='store_true',
-                        help='This option will reduce the building time.')
     parser.add_argument(
         '--gpu_weights_percent',
         type=str,

benchmarks/python/build.py

+21 −29

@@ -151,10 +151,6 @@ def parse_arguments():
                         default=False,
                         action='store_true',
                         help="Build engines serially")
-    parser.add_argument('--strongly_typed',
-                        default=False,
-                        action='store_true',
-                        help='This option will reduce the building time.')
     parser.add_argument(
         '--multiple_profiles',
         default=False,
@@ -251,9 +247,6 @@ def build_gpt(args):
    if not args.serial_build:
        torch.cuda.set_device(runtime_rank)

-    strongly_typed = args.strongly_typed
-    if args.quantization is not None and "fp8" in args.quantization:
-        strongly_typed = True
    num_kv_heads = build_config['num_heads'] \
        if build_config['num_kv_heads'] is None else build_config['num_kv_heads']
    apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
        quant_mode=quant_mode,
        use_refit=False,
        opt_level=build_config['builder_opt'],
-        strongly_typed=strongly_typed,
+        strongly_typed=True,
        weight_streaming=is_weight_streaming,
        **builder_config_extra_kwargs)
    engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -363,8 +356,10 @@ def build_gpt(args):
            'apply_query_key_layer_scaling':
            builder_config.apply_query_key_layer_scaling,
            'rotary_pct': build_config['rotary_pct'],
-            'moe_num_experts': build_config["moe_num_experts"],
-            'moe_top_k': build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.GPTForCausalLM(config)
@@ -399,7 +394,7 @@ def build_gpt(args):
    elif family == "llama":
        config = {
            'architecture':
-            'LLaMAForCausalLM',
+            'LlamaForCausalLM',
            'dtype':
            args.dtype,
            'num_hidden_layers':
@@ -430,10 +425,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            }
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)
@@ -602,9 +597,6 @@ def build_gpt(args):
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)
-        tensorrt_llm_model = optimize_model(
-            tensorrt_llm_model,
-            use_parallel_embedding=config.use_parallel_embedding)
    elif family == "falcon":
        config = {
            'architecture':
@@ -696,7 +688,7 @@ def build_gpt(args):
    elif family == "internlm":
        config = {
            'architecture':
-            'LLaMAForCausalLM',
+            'LlamaForCausalLM',
            'dtype':
            args.dtype,
            'num_hidden_layers':
@@ -778,10 +770,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
            'qwen_type':
            'qwen',
        }
@@ -821,10 +813,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
            'qwen_type':
            'qwen2',
        }
@@ -1029,7 +1021,7 @@ def build_bert(args):
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        opt_level=build_config['builder_opt'],
-        strongly_typed=args.strongly_typed,
+        strongly_typed=True,
        weight_streaming=is_weight_streaming,
    )
    engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
        cross_attention=(component == 'decoder'),
        has_position_embedding=has_position_embedding,
        has_token_type_embedding=False,  # by default
-        strongly_typed=False,  # by default
+        strongly_typed=True,
        gather_all_token_logits=False,  # by default
        int8=(quant_mode.has_act_and_weight_quant()
              or quant_mode.is_int8_weight_only()),
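
The recurring edit in build.py above is a config-schema change: the flat `moe_num_experts` / `moe_top_k` keys become a nested `moe` dict before the config is handed to `PretrainedConfig.from_dict`. A short before/after sketch (the sample values are mine):

```python
# Hypothetical build_config values, for illustration only.
build_config = {"moe_num_experts": 8, "moe_top_k": 2}

# Old layout (removed by this commit): flat keys at the top level of the config dict.
old_style = {
    "moe_num_experts": build_config["moe_num_experts"],
    "moe_top_k": build_config["moe_top_k"],
}

# New layout: a nested 'moe' section, as now written for the GPT, LLaMA, and Qwen branches.
new_style = {
    "moe": {
        "num_experts": build_config["moe_num_experts"],
        "top_k": build_config["moe_top_k"],
    },
}
```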

benchmarks/python/check_accuracy_mlperf.py

+16 −3

@@ -1,4 +1,5 @@
 import json
+import os
 from enum import Enum

 import evaluate
@@ -82,9 +83,11 @@ def calculate_toks_per_sample(preds, eos_id):
     return avg_len / num_samples


-def calculate_rouge_score(preds, targets):
+def calculate_rouge_score(preds, targets, rouge_dir=None):
     print("Calculating ROUGE scores...")
-    metric = evaluate.load("rouge")
+    rouge_dir = rouge_dir if rouge_dir and os.path.exists(
+        rouge_dir) else "rouge"
+    metric = evaluate.load(rouge_dir)
     preds, targets = postprocess_text(preds, targets[0:len(preds)])
     result = metric.compute(predictions=preds,
                             references=targets,
@@ -114,6 +117,15 @@ def parse_arguments():
     parser.add_argument("--base_model",
                         type=str,
                         help="Location of the model used (to create tokenizer)")
+
+    parser.add_argument(
+        '--rouge_dir',
+        default=None,
+        type=str,
+        help=
+        "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
+    )
+
     args = parser.parse_args()

     return args
@@ -146,7 +158,8 @@ def main():
     tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token)

     pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
-    achieved_scores = calculate_rouge_score(pred_texts, target_texts)
+    achieved_scores = calculate_rouge_score(pred_texts, target_texts,
+                                            args.rouge_dir)

     achieved_scores['tokens_per_sample'] = tps_score
     targets = ACCURACY_TARGETS[model]
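
A hedged usage sketch of the new `--rouge_dir` option: when the given directory exists it is passed to `evaluate.load` so the metric comes from a local copy, otherwise the script falls back to the plain `"rouge"` name and lets `evaluate` fetch it from the Hugging Face Hub. The path below is illustrative only.

```python
import os

import evaluate


def load_rouge(rouge_dir=None):
    # Same fallback as the change in check_accuracy_mlperf.py: prefer a local,
    # cached copy of the rouge metric; otherwise let `evaluate` pull it from HF.
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
    return evaluate.load(rouge_dir)


# e.g. invoked as: --rouge_dir /path/to/local/rouge  (illustrative path)
metric = load_rouge("/path/to/local/rouge")
```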

benchmarks/python/gpt_benchmark.py

+4 −8

@@ -279,14 +279,10 @@ def check_memory(self, io_shapes: list, raise_exception=False):
             self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
         # when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
         if not self.use_gpt_attention_plugin:
-            if os.getenv('TRTLLM_DISABLE_OOTB_KVCACHE_REUSE') != 'ON':
-                local_n_layer = ceil(self.build_config.num_layers /
-                                     self.runtime_mapping.pp_size)
-                kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
-                    local_n_layer + 1)
-            else:
-                # without reusing, we need one for past as engine inputs, one for present as engine outputs.
-                kv_cache_size_in_bytes *= 2
+            local_n_layer = ceil(self.build_config.num_layers /
+                                 self.runtime_mapping.pp_size)
+            kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
+                local_n_layer + 1)

         kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
                                                    "MiB")

benchmarks/suite/tensorrt_llm_bench/utils/enums.py

+1 −3

@@ -51,9 +51,7 @@ def get_build_options(self, dtype: str) -> List[str]:
             List[str]: A list of command line arguments to be added to build
             commands.
         """
-        if self.value == self.FP8:
-            return ["--strongly_typed"]
-        else:
+        if not self.value == self.FP8:
             return ["--gemm_plugin", dtype]

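
Finally, a simplified sketch (assumptions mine) of what `get_build_options` does after this change: FP8 builds no longer emit `--strongly_typed`, consistent with strong typing becoming the build-time default elsewhere in this commit, while other dtypes still request the GEMM plugin. The original method implicitly returns None in the FP8 case; the stand-in below returns an empty list for clarity.

```python
from typing import List


def get_build_options(dtype: str, is_fp8: bool) -> List[str]:
    # Simplified stand-in for the enum method shown above.
    if not is_fp8:
        return ["--gemm_plugin", dtype]
    return []


print(get_build_options("float16", is_fp8=False))  # ['--gemm_plugin', 'float16']
print(get_build_options("float16", is_fp8=True))   # []
```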
