-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_dynamic.py
executable file
·184 lines (168 loc) · 6.87 KB
/
run_dynamic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# !/usr/bin/env python3
import os
from time import perf_counter
import json
import onnx
import onnxruntime
import numpy as np
from openvino.runtime import Core, Model, Tensor, PartialShape, Type, ProfilingInfo, properties
from openvino.runtime import opset8 as opset
from openvino.runtime.op import Constant, Parameter, tensor_iterator
from openvino.runtime.passes import Manager
from openvino.runtime.utils.types import get_dtype
import openvino as ov
print_config = True
enable_onnx_profiling = True
enable_openvino_profiling = False
enable_save_data = False
MAX_SEQ = 64
#MAX_SEQ = 12
#MAX_SEQ = 13
bos_token_id = 0
eos_token_id = 2
def onnx_infer(sess, input_ids, position_ids, attention_mask, past_key_values):
"""
"""
input_dict = {
"input_ids": input_ids,
"position_ids": position_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values
}
output_name_list = []
for node in sess.get_outputs():
output_name_list.append(node.name)
res = sess.run(output_name_list, input_dict)
return res
def get_token(next_tokens_scores):
"""
简单实现,用以验证模型。
只保证输入输出与原始模型一致。
"""
def softmax(x):
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum(axis=-1)
probs = softmax(next_tokens_scores)
probs = np.log(probs)
next_tokens = np.argmax(probs, axis=-1).reshape([-1, 1])
return next_tokens
def save_data(engine, q_mode, cur_len, outputs):
folder_name = f"./result/{engine}_{q_mode}_{cur_len}"
if not os.path.exists(folder_name):
os.makedirs(folder_name)
np.save(f"{folder_name}/logits", outputs[0][:, -1, :])
np.save(f"{folder_name}/past_key_values", outputs[1])
def main(engine = "onnx", q_mode = False):
print(f"Engine: {engine}")
# text = "Hello, it is a great day! nice to"
input_ids = np.array([[1, 16644, 31844, 357, 322, 260, 1014, 1124, 31905, 4504, 289]], dtype=np.int64)
position_ids = np.array([[7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87]], dtype=np.int64)
cur_len = input_ids.shape[-1] # 11
attention_mask = np.ones([1, 1, cur_len, cur_len], dtype=np.float32) * np.finfo(np.float32).min
attention_mask = np.triu(attention_mask, 1) # 返回输入矩阵 input 的上三角部分,其余部分被设为 0。
past_key_values = np.zeros([31, 2, 0, 24, 128], "float32") # 实际上是个空矩阵
if engine == "onnx":
sess_options = None
if enable_onnx_profiling:
sess_options = onnxruntime.SessionOptions()
sess_options.enable_profiling = True
sess_options.profile_file_prefix = "/home/openvino-ci-58/meiyang/work/ernie3.5se/dynamic/onnx_profiling/onnx_profiling"
model_file = "/mnt/llm_irs/paddle/dynamic_model_onnx/model.onnx"
sess = onnxruntime.InferenceSession(model_file, sess_options)
elif engine == "ov":
if q_mode:
model_file = f"/mnt/llm_irs/paddle/{q_mode}/model.xml"
else:
model_file = "/mnt/llm_irs/paddle/ir/model.xml"
core = ov.Core()
model = core.read_model(model_file)
latency = {'PERFORMANCE_HINT': 'LATENCY',
'SCHEDULING_CORE_TYPE': 'PCORE_ONLY',
'ENABLE_HYPER_THREADING': 'YES',
}
if enable_openvino_profiling:
latency['PERF_COUNT'] = 'YES'
compiled_model = core.compile_model(model, 'CPU', latency)
if print_config:
keys = compiled_model.get_property(properties.supported_properties())
print("Model:")
for k in keys:
skip_keys = ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', properties.supported_properties())
if k not in skip_keys:
value = compiled_model.get_property(k)
print(f' {k}: {value}')
#if k == properties.device.properties():
#for device_key in value.keys():
# print(f' {device_key}:')
# for k2, value2 in value.get(device_key).items():
# if k2 not in skip_keys:
# print(f' {k2}: {value2}')
ireq = compiled_model.create_infer_request()
print("load success!")
# 超过最大生成长度 或者 遇到结束符
infer_time = []
nodes_list = []
while cur_len < MAX_SEQ:
start = perf_counter()
if engine == "onnx":
outputs = onnx_infer(
sess,
input_ids,
position_ids,
attention_mask,
past_key_values
)
elif engine == "ov":
inputs = {
"input_ids": input_ids,
"position_ids": position_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values
}
outputs = ireq.infer(inputs)
if enable_openvino_profiling:
pi_list = []
for profile_info in ireq.get_profiling_info():
status = "OPTIMIZED_OUT"
if profile_info.status == ProfilingInfo.Status.EXECUTED:
status = "EXECUTED"
elif profile_info.status == ProfilingInfo.Status.NOT_RUN:
status = "NOT_RUN"
pi = {"name": profile_info.node_name,
"cpu_time": profile_info.cpu_time.microseconds/1000.0,
"real_time": profile_info.real_time.microseconds/1000.0,
"node_type": profile_info.node_type,
"exec_type": profile_info.exec_type,
"status": status}
pi_list.append(pi)
nodes_list.append({"nodes": pi_list})
elapsed = perf_counter() - start
infer_time.append(elapsed)
cur_len += 1
logits = outputs[0][:, -1, :]
past_key_values = outputs[1]
next_token = get_token(logits) # (1, 1)
print("next_token: ", next_token)
# 结束符
if next_token == eos_token_id:
print("eos")
break
input_ids = next_token
position_ids = np.array([[cur_len - 1]], dtype=np.int64)
attention_mask = np.zeros([1, 1, 1, cur_len], dtype=np.float32)
if enable_save_data:
save_data(engine, q_mode, cur_len, outputs)
if engine == "ov" and enable_openvino_profiling:
dp = {"detailed_performance": nodes_list, "report_type": "detailed"}
with open("benchmark_detailed_counters_report.json", "w") as f:
json.dump(dp, f)
#print(infer_time)
infer_time.pop(0)
if len(infer_time):
avg_time = sum(infer_time) / len(infer_time)
print(avg_time)
if __name__ == '__main__':
#main("onnx")
main("ov")
#main("ov", "INT8")
#main("ov", "INT4_SYM")