@@ -151,10 +151,6 @@ def parse_arguments():
151
151
default = False ,
152
152
action = 'store_true' ,
153
153
help = "Build engines serially" )
154
- parser .add_argument ('--strongly_typed' ,
155
- default = False ,
156
- action = 'store_true' ,
157
- help = 'This option will reduce the building time.' )
158
154
parser .add_argument (
159
155
'--multiple_profiles' ,
160
156
default = False ,
@@ -251,9 +247,6 @@ def build_gpt(args):
251
247
if not args .serial_build :
252
248
torch .cuda .set_device (runtime_rank )
253
249
254
- strongly_typed = args .strongly_typed
255
- if args .quantization is not None and "fp8" in args .quantization :
256
- strongly_typed = True
257
250
num_kv_heads = build_config ['num_heads' ] \
258
251
if build_config ['num_kv_heads' ] is None else build_config ['num_kv_heads' ]
259
252
apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
321
314
quant_mode = quant_mode ,
322
315
use_refit = False ,
323
316
opt_level = build_config ['builder_opt' ],
324
- strongly_typed = strongly_typed ,
317
+ strongly_typed = True ,
325
318
weight_streaming = is_weight_streaming ,
326
319
** builder_config_extra_kwargs )
327
320
engine_name = get_engine_name (args .model , args .dtype , world_size ,
@@ -363,8 +356,10 @@ def build_gpt(args):
363
356
'apply_query_key_layer_scaling' :
364
357
builder_config .apply_query_key_layer_scaling ,
365
358
'rotary_pct' : build_config ['rotary_pct' ],
366
- 'moe_num_experts' : build_config ["moe_num_experts" ],
367
- 'moe_top_k' : build_config ["moe_top_k" ],
359
+ 'moe' : {
360
+ 'num_experts' : build_config ["moe_num_experts" ],
361
+ 'top_k' : build_config ["moe_top_k" ],
362
+ },
368
363
}
369
364
config = PretrainedConfig .from_dict (config )
370
365
tensorrt_llm_model = tensorrt_llm .models .GPTForCausalLM (config )
@@ -399,7 +394,7 @@ def build_gpt(args):
399
394
elif family == "llama" :
400
395
config = {
401
396
'architecture' :
402
- 'LLaMAForCausalLM ' ,
397
+ 'LlamaForCausalLM ' ,
403
398
'dtype' :
404
399
args .dtype ,
405
400
'num_hidden_layers' :
@@ -430,10 +425,10 @@ def build_gpt(args):
430
425
'world_size' : world_size ,
431
426
'tp_size' : world_size
432
427
},
433
- 'moe_num_experts' :
434
- build_config ["moe_num_experts" ],
435
- 'moe_top_k' :
436
- build_config [ "moe_top_k" ],
428
+ 'moe' : {
429
+ 'num_experts' : build_config ["moe_num_experts" ],
430
+ 'top_k' : build_config [ "moe_top_k" ],
431
+ }
437
432
}
438
433
config = PretrainedConfig .from_dict (config )
439
434
tensorrt_llm_model = tensorrt_llm .models .LLaMAForCausalLM (config )
@@ -602,9 +597,6 @@ def build_gpt(args):
602
597
}
603
598
config = PretrainedConfig .from_dict (config )
604
599
tensorrt_llm_model = tensorrt_llm .models .BloomForCausalLM (config )
605
- tensorrt_llm_model = optimize_model (
606
- tensorrt_llm_model ,
607
- use_parallel_embedding = config .use_parallel_embedding )
608
600
elif family == "falcon" :
609
601
config = {
610
602
'architecture' :
@@ -696,7 +688,7 @@ def build_gpt(args):
696
688
elif family == "internlm" :
697
689
config = {
698
690
'architecture' :
699
- 'LLaMAForCausalLM ' ,
691
+ 'LlamaForCausalLM ' ,
700
692
'dtype' :
701
693
args .dtype ,
702
694
'num_hidden_layers' :
@@ -778,10 +770,10 @@ def build_gpt(args):
778
770
'world_size' : world_size ,
779
771
'tp_size' : world_size
780
772
},
781
- 'moe_num_experts' :
782
- build_config ["moe_num_experts" ],
783
- 'moe_top_k' :
784
- build_config [ "moe_top_k" ] ,
773
+ 'moe' : {
774
+ 'num_experts' : build_config ["moe_num_experts" ],
775
+ 'top_k' : build_config [ "moe_top_k" ],
776
+ } ,
785
777
'qwen_type' :
786
778
'qwen' ,
787
779
}
@@ -821,10 +813,10 @@ def build_gpt(args):
821
813
'world_size' : world_size ,
822
814
'tp_size' : world_size
823
815
},
824
- 'moe_num_experts' :
825
- build_config ["moe_num_experts" ],
826
- 'moe_top_k' :
827
- build_config [ "moe_top_k" ] ,
816
+ 'moe' : {
817
+ 'num_experts' : build_config ["moe_num_experts" ],
818
+ 'top_k' : build_config [ "moe_top_k" ],
819
+ } ,
828
820
'qwen_type' :
829
821
'qwen2' ,
830
822
}
@@ -1029,7 +1021,7 @@ def build_bert(args):
1029
1021
max_batch_size = max_batch_size ,
1030
1022
max_input_len = max_input_len ,
1031
1023
opt_level = build_config ['builder_opt' ],
1032
- strongly_typed = args . strongly_typed ,
1024
+ strongly_typed = True ,
1033
1025
weight_streaming = is_weight_streaming ,
1034
1026
)
1035
1027
engine_name = get_engine_name (args .model , args .dtype , world_size ,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
1207
1199
cross_attention = (component == 'decoder' ),
1208
1200
has_position_embedding = has_position_embedding ,
1209
1201
has_token_type_embedding = False , # by default
1210
- strongly_typed = False , # by default
1202
+ strongly_typed = True ,
1211
1203
gather_all_token_logits = False , # by default
1212
1204
int8 = (quant_mode .has_act_and_weight_quant ()
1213
1205
or quant_mode .is_int8_weight_only ()),
0 commit comments