Commit b777bd6

kaiyux, RunningLeon, Tlntin, JamesTheZ, and ngoanpv authored
Update TensorRT-LLM (NVIDIA#1725)
* Update TensorRT-LLM

---------

Co-authored-by: RunningLeon <[email protected]>
Co-authored-by: Tlntin <[email protected]>
Co-authored-by: ZHENG, Zhen <[email protected]>
Co-authored-by: Pham Van Ngoan <[email protected]>
Co-authored-by: Nathan Price <[email protected]>
Co-authored-by: Tushar Goel <[email protected]>
Co-authored-by: Mati <[email protected]>
1 parent f430a4b commit b777bd6

File tree: 368 files changed, +21445 −8977 lines


.gitignore

+1

@@ -6,6 +6,7 @@ __pycache__/
 *.nsys-rep
 .VSCodeCounter
 build*/
+!builders/
 *.egg-info/
 .coverage
 *.onnx

.pre-commit-config.yaml

+1 −1

@@ -46,5 +46,5 @@ repos:
         args:
           - --skip=".git,3rdparty"
           - --exclude-file=examples/whisper/tokenizer.py
-          - --ignore-words-list=rouge,inout,atleast,strat,nd
+          - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile
         exclude: 'tests/llm-test-defs/turtle/test_input_files'

README.md

+3

@@ -75,3 +75,6 @@ To get started with TensorRT-LLM, visit our documentation:
 - [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
 - [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
 - [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)
+
+## Community
+- [Model zoo](https://huggingface.co/TheFloat16) (generated by TRT-LLM rel 0.9 a9356d4b7610330e89c1010f342a9ac644215c52)

benchmarks/cpp/README.md

+6 −8

@@ -210,8 +210,10 @@ TP=2
 PP=1
 MAX_LEN=1024
 MAX_BATCH=32
-MAX_LORA_RANK=32
+NUM_LAYERS=40
+MAX_LORA_RANK=64
 NUM_LORA_MODS=7
+EOS_ID=2

 SOURCE_LORA=chinese-llama-2-lora-13b
 CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -234,7 +236,7 @@ ${HOME}/.local/bin/trtllm-build \
     --gemm_plugin float16 \
     --lora_plugin float16 \
     --use_paged_context_fmha enable \
-    --lora_target_modules attn_qkv \
+    --lora_target_modules attn_q attn_k attn_v attn_dense mlp_h_to_4h mlp_4h_to_h mlp_gate \
     --max_lora_rank ${MAX_LORA_RANK}

 NUM_LORAS=(8 16 24 32 64 128 256)
@@ -252,8 +254,6 @@ mkdir -p $EG_DIR/data
 # Prepare dataset without lora_task_id
 python benchmarks/cpp/prepare_dataset.py \
     --output "${EG_DIR}/data/token-norm-dist.json" \
-    --request-rate -1 \
-    --time-delay-dist constant \
     --tokenizer $TOKENIZER \
     token-norm-dist \
     --num-requests $NUM_REQUESTS \
@@ -263,8 +263,6 @@ python benchmarks/cpp/prepare_dataset.py \
 for nloras in ${NUM_LORAS[@]}; do
     python benchmarks/cpp/prepare_dataset.py \
         --output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
-        --request-rate -1 \
-        --time-delay-dist constant \
         --rand-task-id 0 $(( $nloras - 1 )) \
         --tokenizer $TOKENIZER \
         token-norm-dist \
@@ -292,7 +290,7 @@ mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \

 # Now run inference with various numbers or loras
 # The host cache is set large enough to hold all the LoRAs in lora_dir
-# GPU cache is set to hold 32 LoRAs
+# GPU cache is set to hold 16 LoRAs
 # This benchmark will preload all the LoRAs into the host cache
 # We run inference on a range of active LoRAs exercising different cache miss rates.
 for nloras in ${NUM_LORAS[@]}; do
@@ -303,7 +301,7 @@ for nloras in ${NUM_LORAS[@]}; do
     --type IFB \
     --dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
     --lora_host_cache_bytes 8589934592 \
-    --lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
+    --lora_num_device_mod_layers $(( 16 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
     --kv_cache_free_gpu_mem_fraction 0.80 \
     --log_level info \
     --eos_id ${EOS_ID} \
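
For readers following the updated LoRA benchmark settings above, here is a minimal sketch (not part of the commit) of the arithmetic behind `--lora_num_device_mod_layers`, using the values the script now sets (NUM_LAYERS=40, NUM_LORA_MODS=7, MAX_LORA_RANK=64); the helper name is mine.

```python
# Hypothetical helper mirroring the shell arithmetic in the benchmark command above.
def lora_device_mod_layers(num_loras_cached: int,
                           num_layers: int = 40,
                           num_lora_mods: int = 7,
                           max_lora_rank: int = 64) -> int:
    """LoRA (module x layer x rank) slots needed to keep `num_loras_cached` adapters on the GPU."""
    return num_loras_cached * num_layers * num_lora_mods * max_lora_rank

print(lora_device_mod_layers(16))  # GPU cache sized for 16 LoRAs: 16 * 40 * 7 * 64 = 286720
```

With the multiplier reduced from 32 to 16, the device cache holds at most 16 adapters, so the runs with more active LoRAs exercise higher cache-miss rates, as the README comments describe.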

benchmarks/cpp/gptManagerBenchmark.cpp

+26 −6

@@ -458,10 +458,6 @@ class Recorder
     {
         this->recordEnd(requestId, hasError);

-        if (mRespJsonFile.empty())
-            return;
-        int32_t outputSeqLen;
-
         for (auto& tensor : responseTensors)
         {
             if (tensor.name == inference_request::kOutputIdsTensorName)
@@ -471,7 +467,7 @@ class Recorder
             else if (tensor.name == inference_request::kSequenceLengthTensorName)
             {
                 // Tensor of shape nBeams, and we only need the first one
-                outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
+                int32_t outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
                 if (mOutputHasInput)
                 {
                     int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
@@ -482,6 +478,30 @@
             }
         }
     }

+    void recordEnd(uint64_t requestId, texec::Response const& response)
+    {
+
+        this->recordEnd(requestId, response.hasError());
+
+        // Get the actual output length
+        if (!response.hasError())
+        {
+            auto outputTokenIds = response.getResult().outputTokenIds;
+
+            int32_t outSeqLen = 0;
+            for (auto const& beam : outputTokenIds)
+            {
+                outSeqLen = std::max(static_cast<int32_t>(beam.size()), outSeqLen);
+            }
+            if (mOutputHasInput)
+            {
+                int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
+                outSeqLen -= inputSeqLen;
+            }
+            mRequestBenchInfos[requestId].outputLength = outSeqLen;
+        }
+    }
+
     float calcPercentile(std::vector<float> const& latencies, int percentile)
     {
         int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
@@ -827,7 +847,7 @@ class ExecutorServer
                 numFinished++;
                 if (!warmup)
                 {
-                    mRecorder->recordEnd(reqId, response.hasError());
+                    mRecorder->recordEnd(reqId, response);
                 }
             }
         }
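
As a plain-language companion to the new `recordEnd(uint64_t, texec::Response const&)` overload above, here is a small Python restatement (mine, not the benchmark code) of its output-length accounting: the longest beam determines the reported length, and the prompt length is subtracted when the output echoes the input.

```python
def output_length(output_token_ids, input_length, output_has_input):
    # Longest beam decides the reported output length, as in the C++ overload above.
    out_seq_len = max((len(beam) for beam in output_token_ids), default=0)
    if output_has_input:
        # Outputs that echo the prompt have the prompt length removed.
        out_seq_len -= input_length
    return out_seq_len

# Two beams, 3-token prompt echoed in the output -> reported length 2
print(output_length([[1, 2, 3, 4, 5], [1, 2, 3, 4]], input_length=3, output_has_input=True))
```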

benchmarks/cpp/gptSessionBenchmark.cpp

+3

@@ -34,6 +34,7 @@
 #include <NvInfer.h>
 #include <atomic>
 #include <chrono>
+#include <cuda_profiler_api.h>
 #include <cxxopts.hpp>
 #include <future>
 #include <sstream>
@@ -213,6 +214,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
     std::vector<float> latencies;
     std::vector<float> generationTimes;
     auto generationProfiler = std::make_shared<GptSession::GenerationProfiler>();
+    cudaProfilerStart();
     while (iterIdx < numRuns)
     {
         auto const start = std::chrono::steady_clock::now();
@@ -242,6 +244,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             break;
         }
     }
+    cudaProfilerStop();

     TLLM_LOG_INFO(memoryCounter.toString());
     done = true;

benchmarks/python/benchmark.py

−4

@@ -198,10 +198,6 @@ def parse_arguments():
         help=
         'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
     )
-    parser.add_argument('--strongly_typed',
-                        default=False,
-                        action='store_true',
-                        help='This option will reduce the building time.')
     parser.add_argument(
         '--gpu_weights_percent',
         type=str,

benchmarks/python/build.py

+21 −29

@@ -151,10 +151,6 @@ def parse_arguments():
                         default=False,
                         action='store_true',
                         help="Build engines serially")
-    parser.add_argument('--strongly_typed',
-                        default=False,
-                        action='store_true',
-                        help='This option will reduce the building time.')
     parser.add_argument(
         '--multiple_profiles',
         default=False,
@@ -251,9 +247,6 @@ def build_gpt(args):
    if not args.serial_build:
        torch.cuda.set_device(runtime_rank)

-    strongly_typed = args.strongly_typed
-    if args.quantization is not None and "fp8" in args.quantization:
-        strongly_typed = True
    num_kv_heads = build_config['num_heads'] \
        if build_config['num_kv_heads'] is None else build_config['num_kv_heads']
    apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
        quant_mode=quant_mode,
        use_refit=False,
        opt_level=build_config['builder_opt'],
-        strongly_typed=strongly_typed,
+        strongly_typed=True,
        weight_streaming=is_weight_streaming,
        **builder_config_extra_kwargs)
    engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -363,8 +356,10 @@ def build_gpt(args):
            'apply_query_key_layer_scaling':
            builder_config.apply_query_key_layer_scaling,
            'rotary_pct': build_config['rotary_pct'],
-            'moe_num_experts': build_config["moe_num_experts"],
-            'moe_top_k': build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.GPTForCausalLM(config)
@@ -399,7 +394,7 @@ def build_gpt(args):
    elif family == "llama":
        config = {
            'architecture':
-            'LLaMAForCausalLM',
+            'LlamaForCausalLM',
            'dtype':
            args.dtype,
            'num_hidden_layers':
@@ -430,10 +425,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            }
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)
@@ -602,9 +597,6 @@ def build_gpt(args):
        }
        config = PretrainedConfig.from_dict(config)
        tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)
-        tensorrt_llm_model = optimize_model(
-            tensorrt_llm_model,
-            use_parallel_embedding=config.use_parallel_embedding)
    elif family == "falcon":
        config = {
            'architecture':
@@ -696,7 +688,7 @@ def build_gpt(args):
    elif family == "internlm":
        config = {
            'architecture':
-            'LLaMAForCausalLM',
+            'LlamaForCausalLM',
            'dtype':
            args.dtype,
            'num_hidden_layers':
@@ -778,10 +770,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
            'qwen_type':
            'qwen',
        }
@@ -821,10 +813,10 @@ def build_gpt(args):
                'world_size': world_size,
                'tp_size': world_size
            },
-            'moe_num_experts':
-            build_config["moe_num_experts"],
-            'moe_top_k':
-            build_config["moe_top_k"],
+            'moe': {
+                'num_experts': build_config["moe_num_experts"],
+                'top_k': build_config["moe_top_k"],
+            },
            'qwen_type':
            'qwen2',
        }
@@ -1029,7 +1021,7 @@ def build_bert(args):
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        opt_level=build_config['builder_opt'],
-        strongly_typed=args.strongly_typed,
+        strongly_typed=True,
        weight_streaming=is_weight_streaming,
    )
    engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
        cross_attention=(component == 'decoder'),
        has_position_embedding=has_position_embedding,
        has_token_type_embedding=False,  # by default
-        strongly_typed=False,  # by default
+        strongly_typed=True,
        gather_all_token_logits=False,  # by default
        int8=(quant_mode.has_act_and_weight_quant()
              or quant_mode.is_int8_weight_only()),
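
The recurring edit in build.py above is a config-schema change: the flat `moe_num_experts` / `moe_top_k` keys become a nested `moe` dict before the config is handed to `PretrainedConfig.from_dict`. A short before/after sketch (the sample values are mine):

```python
# Hypothetical build_config values, for illustration only.
build_config = {"moe_num_experts": 8, "moe_top_k": 2}

# Old layout (removed by this commit): flat keys at the top level of the config dict.
old_style = {
    "moe_num_experts": build_config["moe_num_experts"],
    "moe_top_k": build_config["moe_top_k"],
}

# New layout: a nested 'moe' section, as now written for the GPT, LLaMA, and Qwen branches.
new_style = {
    "moe": {
        "num_experts": build_config["moe_num_experts"],
        "top_k": build_config["moe_top_k"],
    },
}
```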

benchmarks/python/check_accuracy_mlperf.py

+16 −3

@@ -1,4 +1,5 @@
 import json
+import os
 from enum import Enum

 import evaluate
@@ -82,9 +83,11 @@ def calculate_toks_per_sample(preds, eos_id):
     return avg_len / num_samples


-def calculate_rouge_score(preds, targets):
+def calculate_rouge_score(preds, targets, rouge_dir=None):
     print("Calculating ROUGE scores...")
-    metric = evaluate.load("rouge")
+    rouge_dir = rouge_dir if rouge_dir and os.path.exists(
+        rouge_dir) else "rouge"
+    metric = evaluate.load(rouge_dir)
     preds, targets = postprocess_text(preds, targets[0:len(preds)])
     result = metric.compute(predictions=preds,
                             references=targets,
@@ -114,6 +117,15 @@ def parse_arguments():
     parser.add_argument("--base_model",
                         type=str,
                         help="Location of the model used (to create tokenizer)")
+
+    parser.add_argument(
+        '--rouge_dir',
+        default=None,
+        type=str,
+        help=
+        "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
+    )
+
     args = parser.parse_args()

     return args
@@ -146,7 +158,8 @@ def main():
     tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token)

     pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
-    achieved_scores = calculate_rouge_score(pred_texts, target_texts)
+    achieved_scores = calculate_rouge_score(pred_texts, target_texts,
+                                            args.rouge_dir)

     achieved_scores['tokens_per_sample'] = tps_score
     targets = ACCURACY_TARGETS[model]
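
A hedged usage sketch of the new `--rouge_dir` option: when the given directory exists it is passed to `evaluate.load` so the metric comes from a local copy, otherwise the script falls back to the plain `"rouge"` name and lets `evaluate` fetch it from the Hugging Face Hub. The path below is illustrative only.

```python
import os

import evaluate


def load_rouge(rouge_dir=None):
    # Same fallback as the change in check_accuracy_mlperf.py: prefer a local,
    # cached copy of the rouge metric; otherwise let `evaluate` pull it from HF.
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
    return evaluate.load(rouge_dir)


# e.g. invoked as: --rouge_dir /path/to/local/rouge  (illustrative path)
metric = load_rouge("/path/to/local/rouge")
```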

benchmarks/python/gpt_benchmark.py

+4 −8

@@ -279,14 +279,10 @@ def check_memory(self, io_shapes: list, raise_exception=False):
             self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
         # when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
         if not self.use_gpt_attention_plugin:
-            if os.getenv('TRTLLM_DISABLE_OOTB_KVCACHE_REUSE') != 'ON':
-                local_n_layer = ceil(self.build_config.num_layers /
-                                     self.runtime_mapping.pp_size)
-                kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
-                    local_n_layer + 1)
-            else:
-                # without reusing, we need one for past as engine inputs, one for present as engine outputs.
-                kv_cache_size_in_bytes *= 2
+            local_n_layer = ceil(self.build_config.num_layers /
+                                 self.runtime_mapping.pp_size)
+            kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
+                local_n_layer + 1)

         kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
                                                    "MiB")

benchmarks/suite/tensorrt_llm_bench/utils/enums.py

+1 −3

@@ -51,9 +51,7 @@ def get_build_options(self, dtype: str) -> List[str]:
             List[str]: A list of command line arguments to be added to build
             commands.
         """
-        if self.value == self.FP8:
-            return ["--strongly_typed"]
-        else:
+        if not self.value == self.FP8:
             return ["--gemm_plugin", dtype]

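
Finally, a simplified sketch (assumptions mine) of what `get_build_options` does after this change: FP8 builds no longer emit `--strongly_typed`, consistent with strong typing becoming the build-time default elsewhere in this commit, while other dtypes still request the GEMM plugin. The original method implicitly returns None in the FP8 case; the stand-in below returns an empty list for clarity.

```python
from typing import List


def get_build_options(dtype: str, is_fp8: bool) -> List[str]:
    # Simplified stand-in for the enum method shown above.
    if not is_fp8:
        return ["--gemm_plugin", dtype]
    return []


print(get_build_options("float16", is_fp8=False))  # ['--gemm_plugin', 'float16']
print(get_build_options("float16", is_fp8=True))   # []
```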
