@@ -465,20 +465,23 @@ export class MLCEngine implements MLCEngineInterface {
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<ChatCompletionChunk, void, void>;
   asyncGenerate(
     request: CompletionCreateParamsStreaming,
     model: string,
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<Completion, void, void>;
   async *asyncGenerate(
     request: ChatCompletionRequestStreaming | CompletionCreateParamsStreaming,
     model: string,
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<ChatCompletionChunk | Completion, void, void> {
     // Since it is an async generator, we need to do fine-grained try-catch to ensure lock is
     // released only when errors occur. Then release at the very end when no error occurs.
@@ -678,18 +681,39 @@ export class MLCEngine implements MLCEngineInterface {
 
     // 4. Usage chunk
     if (request.stream_options?.include_usage) {
+      const usedGrammar =
+        "response_format" in request &&
+        (request.response_format?.type === "grammar" ||
+          request.response_format?.type === "json_object");
       const completion_tokens = pipeline.getCurRoundDecodingTotalTokens();
       const prompt_tokens = pipeline.getCurRoundPrefillTotalTokens();
       const prefill_tokens_per_s = pipeline.getCurRoundPrefillTokensPerSec();
       const decode_tokens_per_s = pipeline.getCurRoundDecodingTokensPerSec();
+      const grammar_init_s = pipeline.getCurRoundGrammarInitTotalTime();
+      const prefill_time = pipeline.getCurRoundPrefillTotalTime();
+      const decode_time = pipeline.getCurRoundDecodingTotalTime();
+      const grammar_per_token_s =
+        pipeline.getCurRoundGrammarPerTokenTotalTime();
+      const defaultExtra = {
+        e2e_latency_s: (Date.now() - timeReceived) / 1000,
+        prefill_tokens_per_s: prefill_tokens_per_s,
+        decode_tokens_per_s: decode_tokens_per_s,
+        time_to_first_token_s: prefill_time,
+        time_per_output_token_s: decode_time / completion_tokens,
+      };
       const usage: CompletionUsage = {
         completion_tokens: completion_tokens,
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
-        extra: {
-          prefill_tokens_per_s: prefill_tokens_per_s,
-          decode_tokens_per_s: decode_tokens_per_s,
-        },
+        extra: usedGrammar
+          ? {
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
+            }
+          : defaultExtra,
       };
       if (isChatCompletion) {
         const usageChunk: ChatCompletionChunk = {
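Note: a minimal consumer-side sketch of these new streaming usage fields, assuming the public @mlc-ai/web-llm API; the model id and function name are illustrative, not taken from this diff:

import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function reportStreamingStats() {
  // Hypothetical model id, for illustration only.
  const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC");
  const chunks = await engine.chat.completions.create({
    messages: [{ role: "user", content: "Hello!" }],
    stream: true,
    // Ask the engine to emit the final usage chunk built above.
    stream_options: { include_usage: true },
  });
  for await (const chunk of chunks) {
    if (chunk.usage) {
      // extra now carries the timing fields added in this patch.
      const extra = chunk.usage.extra as Record<string, number>;
      console.log("e2e latency (s):", extra.e2e_latency_s);
      console.log("time to first token (s):", extra.time_to_first_token_s);
      console.log("time per output token (s):", extra.time_per_output_token_s);
    }
  }
}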
@@ -745,6 +769,7 @@ export class MLCEngine implements MLCEngineInterface {
   async chatCompletion(
     request: ChatCompletionRequest,
   ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion> {
+    const timeReceived = Date.now();
     // 0. Check model loaded and preprocess inputs
     const [selectedModelId, selectedPipeline, selectedChatConfig] =
       this.getLLMStates("ChatCompletionRequest", request.model);
@@ -766,6 +791,7 @@ export class MLCEngine implements MLCEngineInterface {
       logprobs: request.logprobs,
       top_logprobs: request.top_logprobs,
       response_format: request.response_format,
+      ignore_eos: request.ignore_eos,
     };
 
     // 0.5 Block wait until this pipeline finishes all previous requests
@@ -780,6 +806,7 @@ export class MLCEngine implements MLCEngineInterface {
         selectedPipeline,
         selectedChatConfig,
         genConfig,
+        timeReceived,
       );
     }
 
@@ -796,6 +823,8 @@ export class MLCEngine implements MLCEngineInterface {
     let prompt_tokens = 0;
     let prefill_time = 0;
     let decode_time = 0;
+    let grammar_init_s = 0;
+    let grammar_per_token_s = 0;
     for (let i = 0; i < n; i++) {
       let outputMessage: string;
       if (this.interruptSignal) {
@@ -852,8 +881,21 @@ export class MLCEngine implements MLCEngineInterface {
       prompt_tokens += selectedPipeline.getCurRoundPrefillTotalTokens();
       prefill_time += selectedPipeline.getCurRoundPrefillTotalTime();
       decode_time += selectedPipeline.getCurRoundDecodingTotalTime();
+      grammar_init_s += selectedPipeline.getCurRoundGrammarInitTotalTime();
+      grammar_per_token_s +=
+        selectedPipeline.getCurRoundGrammarPerTokenTotalTime();
     }
-
+    const usedGrammar =
+      "response_format" in request &&
+      (request.response_format?.type === "grammar" ||
+        request.response_format?.type === "json_object");
+    const defaultExtra = {
+      e2e_latency_s: (Date.now() - timeReceived) / 1000,
+      prefill_tokens_per_s: prompt_tokens / prefill_time,
+      decode_tokens_per_s: completion_tokens / decode_time,
+      time_to_first_token_s: prefill_time,
+      time_per_output_token_s: decode_time / completion_tokens,
+    };
     const response: ChatCompletion = {
       id: crypto.randomUUID(),
       choices: choices,
@@ -864,10 +906,15 @@ export class MLCEngine implements MLCEngineInterface {
         completion_tokens: completion_tokens,
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
-        extra: {
-          prefill_tokens_per_s: prompt_tokens / prefill_time,
-          decode_tokens_per_s: completion_tokens / decode_time,
-        },
+        extra: usedGrammar
+          ? {
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
+            }
+          : defaultExtra,
       } as CompletionUsage,
     };
 
@@ -901,6 +948,8 @@ export class MLCEngine implements MLCEngineInterface {
   async completion(
     request: CompletionCreateParams,
   ): Promise<AsyncIterable<Completion> | Completion> {
+    const timeReceived = Date.now();
+
     // 0. Check model loaded and preprocess inputs
     const [selectedModelId, selectedPipeline, selectedChatConfig] =
       this.getLLMStates("CompletionCreateParams", request.model);
@@ -915,6 +964,7 @@ export class MLCEngine implements MLCEngineInterface {
       logit_bias: request.logit_bias,
       logprobs: request.logprobs,
       top_logprobs: request.top_logprobs,
+      ignore_eos: request.ignore_eos,
     };
 
     // 0.5 Block wait until this pipeline finishes all previous requests
@@ -929,6 +979,7 @@ export class MLCEngine implements MLCEngineInterface {
         selectedPipeline,
         selectedChatConfig,
         genConfig,
+        timeReceived,
       );
     }
 
@@ -989,8 +1040,11 @@ export class MLCEngine implements MLCEngineInterface {
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
         extra: {
+          e2e_latency_s: (Date.now() - timeReceived) / 1000,
           prefill_tokens_per_s: prompt_tokens / prefill_time,
           decode_tokens_per_s: completion_tokens / decode_time,
+          time_to_first_token_s: prefill_time,
+          time_per_output_token_s: decode_time / completion_tokens,
         },
       } as CompletionUsage,
     };
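Note: for the non-streaming path, a similar sketch (again with an illustrative model id) of where the grammar timing fields surface when response_format requests JSON output:

import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function reportJsonModeStats() {
  // Hypothetical model id, for illustration only.
  const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC");
  const reply = await engine.chat.completions.create({
    messages: [{ role: "user", content: "Give me a JSON object with one key." }],
    // json_object takes the usedGrammar branch, so extra also includes
    // grammar_init_s and the per-token grammar overhead.
    response_format: { type: "json_object" },
  });
  const extra = reply.usage?.extra as Record<string, number> | undefined;
  console.log("grammar init (s):", extra?.grammar_init_s);
  console.log("grammar overhead per token (s):", extra?.grammar_per_token_s);
  console.log("decode tokens/s:", extra?.decode_tokens_per_s);
}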