From 01d2d8311f11ca9b574737b53e2afdd9b33f11af Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 19 Apr 2024 19:49:47 +0800
Subject: [PATCH 01/28] ILUVATAT-CoRE

---
 .../backends/ILUVATAR/ILUVATAR.json           |    1 +
 .../backends/ILUVATAR/README.zh_CN.md         |   88 +
 .../general_perf/backends/ILUVATAR/common.py  |  273 ++
 .../ILUVATAR/compile_backend_iluvatar.py      |  207 ++
 .../backends/ILUVATAR/optimizer/__init__.py   |    0
 .../ILUVATAR/optimizer/onnx_model_bert.py     |  578 ++++
 .../ILUVATAR/optimizer/onnx_model_t5.py       |  519 ++++
 .../ILUVATAR/optimizer/onnx_model_yolo.py     |  114 +
 .../backends/ILUVATAR/optimizer/optimizer.md  |   51 +
 .../backends/ILUVATAR/optimizer/optimizer.py  |  188 ++
 .../ILUVATAR/optimizer/passes/__init__.py     |    0
 .../ILUVATAR/optimizer/passes/float16.py      |  394 +++
 .../optimizer/passes/fuse_series_bias_add.py  |   65 +
 .../passes/fusion_albert_attention.py         |  602 ++++
 .../optimizer/passes/fusion_attention.py      |  571 ++++
 .../ILUVATAR/optimizer/passes/fusion_base.py  |   82 +
 .../optimizer/passes/fusion_biasgelu.py       |   66 +
 .../optimizer/passes/fusion_customfc.py       |  279 ++
 .../passes/fusion_disentangled_attention.py   |  109 +
 .../optimizer/passes/fusion_embedlayer.py     |  703 +++++
 .../optimizer/passes/fusion_fastgelu.py       |  404 +++
 .../passes/fusion_format_roformer.py          |  113 +
 .../ILUVATAR/optimizer/passes/fusion_gelu.py  |  333 +++
 .../passes/fusion_gelu_approximation.py       |   27 +
 .../optimizer/passes/fusion_gpt_attention.py  |  473 ++++
 .../passes/fusion_gpt_attention_megatron.py   |  292 ++
 .../passes/fusion_gpt_attention_no_past.py    |  252 ++
 .../optimizer/passes/fusion_layernorm.py      |  296 ++
 .../optimizer/passes/fusion_options.py        |  167 ++
 .../passes/fusion_qordered_attention.py       |  421 +++
 .../optimizer/passes/fusion_qordered_gelu.py  |  117 +
 .../passes/fusion_qordered_layernorm.py       |  121 +
 .../passes/fusion_qordered_matmul.py          |  217 ++
 .../optimizer/passes/fusion_reshape.py        |  175 ++
 .../optimizer/passes/fusion_rms_norm.py       |  155 ++
 .../ILUVATAR/optimizer/passes/fusion_shape.py |  110 +
 .../optimizer/passes/fusion_skiplayernorm.py  |  212 ++
 .../passes/fusion_swinl_attention.py          |  321 +++
 .../optimizer/passes/fusion_t5_attention.py   |  312 +++
 .../ILUVATAR/optimizer/passes/fusion_utils.py |  240 ++
 .../passes/fusion_videobert_attention.py      |  306 +++
 .../optimizer/passes/fusion_xsoftmax.py       |   83 +
 .../optimizer/passes/fusion_yolov5_decoder.py |  131 +
 .../ILUVATAR/optimizer/passes/onnx_model.py   | 1166 ++++++++
 .../optimizer/passes/shape_infer_helper.py    |  122 +
 .../optimizer/passes/symbolic_shape_infer.py  | 2431 +++++++++++++++++
 .../ILUVATAR/optimizer/requirements.txt       |    3 +
 .../backends/ILUVATAR/requirements.txt        |    4 +
 .../ILUVATAR/runtime_backend_iluvatar.py      |  308 +++
 49 files changed, 14202 insertions(+)
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/ILUVATAR.json
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/common.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/ILUVATAR.json b/byte_infer_perf/general_perf/backends/ILUVATAR/ILUVATAR.json
new file mode 100755
index 000000000..0637a088a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/ILUVATAR.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
new file mode 100755
index 000000000..016f01309
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -0,0 +1,88 @@
+"""
+    操作说明：如果不想跑CPU端的性能、精度、数值指标，可以执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）;
+             如果模型提供了pt、pb格式的优先选择torch的配置进行测试；
+
+    功能实现：
+        1、pt、pb模型转换在compile模块预处理过程中实现；
+        2、在天数智芯BI-150显卡上，调用推理引擎tensorrt进行推理，一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化；
+    
+    环境准备：
+        1、sdk版本：http://sw.iluvatar.ai/download/corex/daily_packages/latest/x86_64/bi150/sdk/corex-installer-linux64-3.4.0.20240418.74_x86_64_10.2.run
+        2、ixrt版本：http://sw.iluvatar.ai/download/corex/daily_packages/latest/x86_64/bi150/apps/py3.10/ixrt-0.9.1+corex.3.4.0.20240418.71-cp310-cp310-linux_x86_64.whl
+
+    遗留问题：
+        1、roformer、conformer、widedeep模型做了特殊处理，目前还不能做到加载模型预处理的onnx模型直接进行推理，研发还在继续优化
+"""
+
+"""
+    ******************下面简单的说明11个小模型是如何测试与测试报告生成的*****************
+    整个代码运行过程中，主要是从workloads目录下加载对应的模型的配置，主要有test_perf、test_accuracy、test_numeric三项测试内容，用户可以根据自己的需要选择开启与否；
+    一般情况下采用字节默认的配置项即可；
+
+    cd ByteMLPerf/byte_infer_perf;
+    1、bert模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/bert-torch-fp32/
+
+    2、albert模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/albert-torch-fp32/
+
+    3、deberta模型：
+           ***给定的pt模型转成onnx后输入只有2个，因此这里做了特殊处理；目前还不能直接用optimizer脚本优化后的onnx进行推理，这里给出了该模型的优化流程，但实际使用的是处理好的onnx：
+              deberta-base-squad-sim_end.onnx，将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
+           ***
+           其次，需要修改model_zoo下面的general_perf/model_zoo/deberta-torch-fp32.json里面输入的个数，去掉token_type_ids.1相关的配置
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/deberta-torch-fp32/
+
+    4、roberta模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/roberta-torch-fp32/
+
+    5、videobert模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/videobert-onnx-fp32
+    
+    6、widedeep模型：
+        ***该模型经过了特殊的处理，需要采用的onnx模型：widedeep_dynamicshape_sim.onnx；将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+        ***
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/widedeep-tf-fp32
+
+    7、swin-transformer模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/swin-large-torch-fp32
+
+    8、resnet50模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/resnet50-torch-fp32
+
+    9、yolov5模型：
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+
+    10、conformer模型：
+        ***该onnx模型的transpose算子逻辑有问题，因此做了特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx；
+           将其放到：general_perf/model_zoo/popular/open_conformer/ 
+        ***
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
+
+    11、roformer模型：
+        ***********该模型暂时没有解决，等待后续解决了再修改代码，再进行测试***********
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/roformer-tf-fp32
+"""
+
+"""
+    ****************大模型操作流程******
+    1. 进入ByteMLPerf目录
+    2. 执行
+        1）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chatglm2-torch-fp16-6b --hardware_type ILU, 得到chatglm2-torch-fp16-6b的精度和性能数据
+
+        2）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU, 得到chinese-llama2-torch-fp16-13b的精度和性能数据
+
+    3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件。
+"""
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
new file mode 100755
index 000000000..ca7dfa573
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -0,0 +1,273 @@
+import random
+import torch
+import time
+import ctypes
+import argparse
+import numpy as np
+from os.path import join, dirname, exists
+
+import tensorrt
+from tensorrt import Dims
+import pycuda.driver as cuda
+from cuda import cuda,cudart
+import threading
+import time
+
+
+def setup_seed(seed):
+    """Seed torch (CPU and all CUDA devices), NumPy and `random` with the same value."""
+    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
+        seed_fn(seed)
+    # Also switch on the cuDNN deterministic flag.
+    torch.backends.cudnn.deterministic = True
+
+
+def load_ixrt_plugin(logger=None, namespace="", dynamic_path=""):
+    """Load the plugin library with RTLD_GLOBAL, then call tensorrt.init_libnvinfer_plugins."""
+    logger = tensorrt.Logger(tensorrt.Logger.INFO) if logger is None else logger  # built per call, not at import
+    if not dynamic_path:
+        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    tensorrt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
+
+
+def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
+    """Parse onnx_model_path, build an FP16 engine with a per-model shape profile, write it to engine_path."""
+    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+    builder = tensorrt.Builder(IXRT_LOGGER)
+    EXPLICIT_BATCH = 1 << int(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(EXPLICIT_BATCH)
+    build_config = builder.create_builder_config()
+
+    profile = builder.create_optimization_profile()
+
+    if model_name == 'resnet50':
+        profile.set_shape(
+                "input", Dims([1, 3, 224, 224]), Dims([32, 3, 224, 224]), Dims([MaxBatchSize, 3, 224, 224]))
+
+    elif model_name == 'videobert':
+        profile.set_shape(
+            "image", Dims([1, 3, 224, 224]), Dims([32, 3, 224, 224]), Dims([MaxBatchSize, 3, 224, 224]))
+        profile.set_shape(
+            "text", Dims([100, 77]), Dims([100, 77]), Dims([100, 77]))
+
+    elif model_name == 'yolov5':
+        profile.set_shape(
+                "images", Dims([1, 3, 640, 640]), Dims([32, 3, 640, 640]), Dims([MaxBatchSize, 3, 640, 640]))
+
+    elif model_name in ('bert', 'albert', 'roberta'):
+        profile.set_shape(
+            "input_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))
+        profile.set_shape(
+            "attention_mask.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))
+        profile.set_shape(
+            "token_type_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))
+
+    elif model_name == 'deberta':
+        profile.set_shape(
+            "input_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))
+        profile.set_shape(
+            "attention_mask.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))
+
+    elif model_name == 'widedeep':
+        profile.set_shape(
+            "new_numeric_placeholder:0", Dims([MaxBatchSize, 13]), Dims([MaxBatchSize, 13]), Dims([MaxBatchSize, 13]))
+        profile.set_shape(
+            "new_categorical_placeholder:0", Dims([MaxBatchSize * 26, 2]), Dims([MaxBatchSize * 26, 2]), Dims([MaxBatchSize * 26, 2]))
+        profile.set_shape(
+            "import/head/predictions/zeros_like:0", Dims([MaxBatchSize, 1]), Dims([MaxBatchSize, 1]), Dims([MaxBatchSize, 1]))
+
+    elif model_name == 'conformer':
+        profile.set_shape(
+            "src", Dims([1, 3, 64, 512]), Dims([16, 3, 64, 512]), Dims([MaxBatchSize, 3, 64, 512]))
+        profile.set_shape(
+            "src_pad_mask", Dims([1, 128]), Dims([16, 128]), Dims([MaxBatchSize, 128]))
+
+    elif model_name == 'roformer':
+        profile.set_shape(
+            "input_segment:0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
+        profile.set_shape(
+            "input_token:0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
+
+    elif model_name == 'swin':
+        profile.set_shape(
+                "pixel_values.1", Dims([1, 3, 384, 384]), Dims([32, 3, 384, 384]), Dims([MaxBatchSize, 3, 384, 384]))
+
+    # Any other model_name leaves the profile empty, and the loop below leaves the
+    # parsed input shapes untouched.
+
+    build_config.add_optimization_profile(profile)
+
+    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+    parser.parse_from_file(onnx_model_path)  # NOTE(review): the return value is not checked
+    build_config.set_flag(tensorrt.BuilderFlag.FP16)
+
+    # set dynamic shape
+    num_inputs = network.num_inputs
+
+    for i in range(num_inputs):
+        if model_name == 'resnet50':
+            input_tensor = network.get_input(i)
+            input_tensor.shape = Dims([-1, 3, 224, 224])
+
+        elif model_name == 'videobert':
+            input_tensor = network.get_input(i)
+            if i == 0:
+                input_tensor.shape = Dims([-1, 3, 224, 224])
+            else:
+                input_tensor.shape = Dims([100, 77])
+
+        elif model_name == 'yolov5':
+            input_tensor = network.get_input(i)
+            input_tensor.shape = Dims([-1, 3, 640, 640])
+            network.get_input(i).dtype = tensorrt.float16
+
+        elif model_name in ('bert', 'albert', 'roberta', 'deberta'):
+            input_tensor = network.get_input(i)
+            input_tensor.shape = Dims([-1, 384])
+
+        elif model_name == 'widedeep':
+            input_tensor = network.get_input(i)
+            if i == 0:
+                input_tensor.shape = Dims([-26, 2])  # NOTE(review): -26 rather than -1 - confirm this is intended
+            elif i == 1:
+                input_tensor.shape = Dims([-1, 13])
+            else:
+                input_tensor.shape = Dims([-1, 1])
+
+        elif model_name == 'conformer':
+            input_tensor = network.get_input(i)
+            if i == 0:
+                input_tensor.shape = Dims([-1, 3, 64, 512])
+            else:
+                input_tensor.shape = Dims([-1, 128])
+
+        elif model_name == 'roformer':
+            input_tensor = network.get_input(i)
+            input_tensor.shape = Dims([32, 1024])  # NOTE(review): fixed batch 32 although the profile spans 1..MaxBatchSize
+
+        elif model_name == 'swin':
+            input_tensor = network.get_input(i)
+            input_tensor.shape = Dims([-1, 3, 384, 384])
+
+    plan = builder.build_serialized_network(network, build_config)
+    if plan is None:  # check before opening the file so a failed build leaves no empty engine behind
+        raise RuntimeError(f"Failed to build engine from {onnx_model_path}")
+
+    with open(engine_path, "wb") as f:
+        f.write(plan)
+
+    print("***Build dynamic shape engine success!***")
+
+
+def init_by_tensorrt(engine_path):
+    """Deserialize the engine stored at engine_path and return (engine, execution context)."""
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+    # One Runtime only: the `with ... as runtime` instance used to be shadowed at once and never used.
+    with open(engine_path, "rb") as f:
+        runtime = tensorrt.Runtime(logger)
+        assert runtime
+        engine = runtime.deserialize_cuda_engine(f.read())
+        assert engine
+        context = engine.create_execution_context()
+        assert context
+
+    return engine, context
+
+
+def setup_io_bindings(engine, context):
+    """Allocate one device buffer per engine binding.
+
+    Returns (inputs, outputs, allocations): two lists of binding dicts with the keys
+    index/name/dtype/shape/allocation/nbytes, plus every device pointer in
+    binding order.
+
+    Raises RuntimeError (via checkCudaErrors) if cudaMalloc fails.
+    """
+    inputs = []
+    outputs = []
+    allocations = []
+
+    for i in range(engine.num_bindings):
+        np_dtype = np.dtype(tensorrt.nptype(engine.get_binding_dtype(i)))
+        shape = context.get_binding_shape(i)
+
+        # Byte size = item size multiplied by every dimension.
+        # NOTE(review): a dimension still reported as -1 would make this negative - confirm shapes are resolved first.
+        size = np_dtype.itemsize
+        for s in shape:
+            size *= s
+
+        # checkCudaErrors reports the CUDA error name and, unlike `assert`, is not stripped by `python -O`.
+        allocation = checkCudaErrors(cudart.cudaMalloc(size))
+
+        binding = {
+            "index": i,
+            "name": engine.get_binding_name(i),
+            "dtype": np_dtype,
+            "shape": list(shape),
+            "allocation": allocation,
+            "nbytes": size
+        }
+
+        allocations.append(allocation)
+
+        if engine.binding_is_input(i):
+            inputs.append(binding)
+        else:
+            outputs.append(binding)
+
+    return inputs, outputs, allocations
+
+
+# Multi-device helper: a Task selects device_id, calls load_fun(bs), and run() appends benchmark_fun(dataset) to performance_reports.
+class Task:
+    def __init__(self, bs, dataset, device_id, load_fun, benchmark_fun, performance_reports, lock) -> None:
+        self.dataset = dataset
+        self.benchmark_fun = benchmark_fun
+        self.device_id = device_id
+        self.performance_reports = performance_reports
+        checkCudaErrors(cudart.cudaSetDevice(device_id))  # make device_id current before load_fun runs
+        load_fun(bs)  # presumably loads the model for batch size bs - confirm with the caller
+        self.lock = lock  # stored only; no method of this class uses it
+
+    def run(self):
+        checkCudaErrors(cudart.cudaSetDevice(self.device_id))  # re-select the device; presumably run() is called from a TaskThread - confirm
+        batch_reports = self.benchmark_fun(self.dataset)
+        self.performance_reports.append(batch_reports)
+
+
+class TaskThread(threading.Thread):
+    """Thread whose run() calls func(*args)."""
+    def __init__(self, func, args):
+        super().__init__()
+        self.func, self.args = func, args
+
+    def run(self):
+        self.func(*self.args)
+
+
+def _cudaGetErrorEnum(error):
+    """Return the name of a cuda.CUresult or cudart.cudaError_t value; raise RuntimeError for other types."""
+    if isinstance(error, cuda.CUresult):
+        status, label = cuda.cuGetErrorName(error)
+        return label if status == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
+    if isinstance(error, cudart.cudaError_t):
+        return cudart.cudaGetErrorName(error)[1]
+    raise RuntimeError('Unknown error type: {}'.format(error))
+
+
+def checkCudaErrors(result):
+    """Raise RuntimeError when result[0] is a non-zero status; otherwise return the remaining values unwrapped."""
+    status = result[0]
+    if status.value:
+        raise RuntimeError("CUDA error code={}({})".format(status.value, _cudaGetErrorEnum(status)))
+    payload = result[1:]
+    if not payload:
+        return None
+    return payload[0] if len(payload) == 1 else payload
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
new file mode 100755
index 000000000..0c0f62994
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -0,0 +1,207 @@
+# Copyright 2023 Graphcore Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+import subprocess
+
+import tensorrt
+
+from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
+load_ixrt_plugin()
+
+from general_perf.backends.ILUVATAR.common import build_engine
+from general_perf.backends.ILUVATAR.optimizer.passes import *
+from general_perf.tools.torch_to_onnx import torch_to_onnx
+from general_perf.tools.saved_to_onnx import savedmodel_to_onnx
+from general_perf.model_zoo import *
+from general_perf.backends import compile_backend
+
+log = logging.getLogger("CompileBackendILUVATAR")
+
+
+class CompileBackendILUVATAR(compile_backend.CompileBackend):
+    """Compile backend for the ILUVATAR hardware type; compile() builds engine files through build_engine."""
+    def __init__(self):
+        super(CompileBackendILUVATAR, self).__init__()
+        self.hardware_type = "ILUVATAR"
+        self.need_reload = False
+        self.model_runtimes = []
+        self.model_config = None
+
+    def version(self) -> str:
+        """Return compile backend version details."""
+        return tensorrt.__version__
+
+    def compile(self, configs, dataloader=None):
+        """Build the engine file(s) for configs['model_info'] and return the compile report dict.
+
+        The model family is the text before the first "-" in the model name; it selects the
+        ONNX/engine paths and the shape profile applied by build_engine. widedeep gets one
+        engine per entry of configs['workload']['batch_sizes']. `dataloader` is not used.
+        """
+        model = configs['model_info']['model']
+        model_name = configs['model_info']['model'].split("-")[0]
+        model_path = configs['model_info']['model_path']
+        MaxBatchSize = configs['model_info']['max_batch_size']
+
+        # Default ONNX model path and engine path for each model family
+        if model_name in ('videobert', 'conformer'):
+            onnx_model_path = model_path.split(".")[0] + "_end.onnx"
+            engine_path = model_path.split(".")[0] + "_end.engine"
+
+        elif model_name == 'yolov5':
+            onnx_model_path = model_path.split(".")[0] + "_sim.onnx"
+            # Argument list + check_call: paths are not shell-parsed and a failed onnxsim run raises.
+            subprocess.check_call(["onnxsim", model_path, onnx_model_path])
+            engine_path = model_path.split(".")[0] + "_sim.engine"
+
+        elif model_name == 'widedeep':
+            onnx_model_path = model_path + "/" + model + "_end.onnx"
+            engine_path = model_path + "/" + model + "_end.engine"
+
+        elif model_name == 'roformer':
+            onnx_model_path = model_path + "/" + model + ".onnx"
+            engine_path = model_path + "/" + model + ".engine"
+
+        elif model_name in ('bert', 'albert', 'roberta', 'deberta', 'swin'):
+            onnx_model_path = os.path.dirname(model_path) + "/" + model + "_end.onnx"
+            engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine"
+
+        else:
+            onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx"
+            engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
+
+        # model preprocessing
+        # self.get_onnx(configs)
+
+        # build engine
+        if model_name == 'widedeep':
+            for bs in configs['workload']['batch_sizes']:
+                onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim.onnx"
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(bs) + ".engine"
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=bs)
+
+        # elif model_name == 'roformer':
+        #     # onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-bs32.onnx"
+        #     # engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(32) + ".engine"
+        #     # build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=32)
+        #     for bs in configs['workload']['batch_sizes']:
+        #         onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-bs32_bak.onnx"
+        #         engine_paths = "general_perf/general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(bs) + ".engine" 
+        #         build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_paths, MaxBatchSize=bs)
+
+        elif model_name == 'conformer':
+            onnx_model_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end.onnx"
+            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"
+            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+
+        elif model_name == 'deberta':
+            onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end.onnx"
+            # NOTE(review): the engine is written under open_conformer/, not open_deberta/ - confirm the runtime loads it from there.
+            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"
+            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+
+        else:
+            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+
+        result = {
+            "model": configs['model_info']['model'],
+            "model_name": configs['model_info']['model'].split("-")[0],
+            "model_path": configs['model_info']['model_path'],
+            "framework": configs['model_info']['framework'],
+            "compile_precision": configs['model_info']['model_precision'],
+            "input_type": configs['model_info']['input_type'].split(","),
+            "max_batch_size": configs['model_info']['max_batch_size'],
+            "compile_status": "success",
+            "sg_percent": 100,
+            "segments": [
+                {
+                    "sg_idx": 0,
+                    "is_fallback": False,
+                    "input_tensor_map": configs['model_info']['input_shape'],
+                    "output_tensor_map": configs['model_info']['outputs'],
+                    "compiled_model": [
+                        {
+                            "compiled_bs": 1,
+                            "compiled_obj": configs['model_info']['model_path'],
+                        },
+                    ],
+                },
+            ],
+        }
+
+        self.configs = result
+        self.workload = configs['workload']
+        self.model_info = configs['model_info']
+
+        for key, value in result.items():
+            print('{key}: {value}'.format(key=key, value=value))
+
+        return result
+
+    def get_interact_profile(self, configs):
+        """
+            Collect information for core engine to let user interactively fill in configurations.
+        """
+        return []
+
+    def get_best_batch_size(self):
+        """Get Best Batch Size for the model.
+
+        This backend does not compute one and always returns None.
+        """
+        return None
+
+    def get_onnx(self, configs):
+        """Export the model to ONNX and, for selected model families, run optimizer.py on the result.
+
+        Conversion happens only for the 'Pytorch' and 'Tensorflow' frameworks.
+        """
+        model = configs['model_info']['model']
+        model_name = configs['model_info']['model'].split("-")[0]
+        model_path = configs['model_info']['model_path']
+
+        # model save location
+        if model_name in ('videobert', 'conformer', 'yolov5'):
+            onnx_model_path = model_path 
+
+        elif model_name in ('widedeep', 'roformer'):
+            onnx_model_path = model_path + "/" + model + ".onnx"
+
+        else:
+            onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx"
+
+        framework = configs['model_info']['framework']
+
+        if framework == 'Pytorch':
+            torch_to_onnx(model_path=model_path, output_path=onnx_model_path)
+            print("***Convert pt model to onnx model success!***")
+
+        if framework == 'Tensorflow':
+            savedmodel_to_onnx(model_path=model_path, output_path=onnx_model_path)
+            print("***Convert pb model to onnx model success!***")
+
+        # Convert ONNX model to plugin operator model
+        """
+            ***********待处理问题记录************
+            conformer 模型不能利用optimizer.py脚本转换, 因为attention比较特殊, 利用处理好的onnx模型进行测试;
+            roformer  模型目前没有实现通过加载固定shape的onnx, 生成不同的batch的engine实现动态shape推理;
+            widedeep  模型目前对原始的onnx暂时不支持直接动态shape推理, 对模型做了一系列处理, 并且不需要进行optimizer.py脚本处理, 直接加载处理好的onnx模型;
+        """        
+        if model_name in ('bert', 'albert', 'roberta', 'deberta', 'videobert', 'swin'):
+            # Argument list + check_call: the path is not shell-parsed and a failing optimizer run raises.
+            cmd = ["python3", "general_perf/backends/ILUVATAR/optimizer/optimizer.py", "--onnx", onnx_model_path]
+            subprocess.check_call(cmd)
+            print("***Convert onnx model to plugin operator model success!***")
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
new file mode 100755
index 000000000..c5ca9cfb5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
@@ -0,0 +1,578 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Optional
+
+import onnx
+from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
+from passes.fuse_series_bias_add import FusionSerialBiasAdd
+from passes.fusion_albert_attention import FusionAlbertAttention
+from passes.fusion_attention import AttentionMask, FusionAttention
+from passes.fusion_biasgelu import FusionBiasGelu
+from passes.fusion_customfc import (
+    FusionCustomFC,
+    FusionCustomFCActivation,
+    FusionCustomFCGPT2,
+)
+from passes.fusion_disentangled_attention import FusionDisentangledAttention
+from passes.fusion_embedlayer import FusionEmbedLayerNormalization
+from passes.fusion_fastgelu import FusionFastGelu
+from passes.fusion_format_roformer import (
+    FusionFormatInvalidMask,
+    FusionRemoveUselessElementwise,
+)
+from passes.fusion_gelu import FusionGelu
+from passes.fusion_gelu_approximation import FusionGeluApproximation
+from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
+from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
+from passes.fusion_options import FusionOptions
+from passes.fusion_qordered_attention import FusionQOrderedAttention
+from passes.fusion_qordered_gelu import FusionQOrderedGelu
+from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
+from passes.fusion_qordered_matmul import FusionQOrderedMatMul
+from passes.fusion_reshape import FusionReshape
+from passes.fusion_shape import FusionShape
+from passes.fusion_skiplayernorm import (
+    FusionBiasSkipLayerNormalization,
+    FusionSkipLayerNormalization,
+)
+from passes.fusion_swinl_attention import FusionSwinLAttention
+from passes.fusion_utils import FusionUtils
+from passes.fusion_videobert_attention import FusionVideoBertAttention
+from passes.fusion_xsoftmax import FusionXSoftmax
+from passes.onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class BertOptimizationOptions(FusionOptions):
+    """Deprecated subclass of FusionOptions that logs a warning when constructed."""
+
+    def __init__(self, model_type):
+        logger.warning(
+            "BertOptimizationOptions is deprecated. Please use FusionOptions instead."
+        )
+        super().__init__(model_type)
+
+
+class BertOnnxModel(OnnxModel):
+    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
+        """Wrap a BERT ONNX model and prepare the attention fusion helpers.
+
+        Args:
+            model (ModelProto): the ONNX model to optimize.
+            num_heads (int, optional): attention head count; 0 (default) means detect automatically.
+            hidden_size (int, optional): hidden dimension; 0 (default) means detect automatically.
+        """
+        # Either both values are left at 0, or hidden_size divides evenly by num_heads.
+        auto_detect = num_heads == 0 and hidden_size == 0
+        assert auto_detect or (num_heads > 0 and hidden_size % num_heads == 0)
+
+        super().__init__(model)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        # The mask helper is shared by the regular and the QOrdered attention fusions.
+        mask = AttentionMask(self)
+        self.attention_mask = mask
+        self.attention_fusion = FusionAttention(self, hidden_size, num_heads, mask)
+        self.qordered_attention_fusion = FusionQOrderedAttention(
+            self, hidden_size, num_heads, mask
+        )
+        self.utils = FusionUtils(self)
+
+    def fuse_attention(self):
+        """Apply every attention fusion pass to this model, one after another."""
+        self.attention_fusion.apply()
+        albert = FusionAlbertAttention(self, self.hidden_size, self.num_heads, self.attention_mask)
+        albert.apply()
+        # These passes are constructed from the model alone.
+        for fusion_cls in (FusionVideoBertAttention, FusionSwinLAttention, FusionGptAttentionNoPast):
+            fusion_cls(self).apply()
+
+        # Only relevant in models with Q-DQ nodes
+        self.qordered_attention_fusion.apply()
+
+    def fuse_format_roformer(self):
+        """Apply FusionRemoveUselessElementwise, then FusionFormatInvalidMask."""
+        for fusion_cls in (FusionRemoveUselessElementwise, FusionFormatInvalidMask):
+            fusion_cls(self).apply()
+
+    def fuse_custom_fc(self):
+        """Apply the FusionCustomFC pass to this model."""
+        FusionCustomFC(self).apply()
+
+    def fuse_custom_fc_activation(self):
+        """Apply the FusionCustomFCActivation pass to this model."""
+        FusionCustomFCActivation(self).apply()
+
+    def fuse_custom_fc_gpt2_classify(self):
+        """Apply the FusionCustomFCGPT2 pass to this model."""
+        FusionCustomFCGPT2(self).apply()
+
+    def fuse_swinT_serial_bias_add(self):
+        """Apply the FusionSerialBiasAdd pass to this model."""
+        FusionSerialBiasAdd(self).apply()
+
+    def fuse_gelu(self):
+        """Apply the Gelu, FastGelu and QOrderedGelu fusion passes in that order."""
+        FusionGelu(self).apply()
+        FusionFastGelu(self).apply()
+
+        # Only relevant in models with Q-DQ nodes
+        qordered_gelu_fusion = FusionQOrderedGelu(self)
+        qordered_gelu_fusion.apply()
+
+    def fuse_bias_gelu(self, is_fastgelu):
+        """Apply FusionBiasGelu; is_fastgelu is forwarded to the pass unchanged."""
+        FusionBiasGelu(self, is_fastgelu).apply()
+
+    def fuse_custom_xsoftmax(self):
+        """Apply the FusionXSoftmax pass to this model."""
+        FusionXSoftmax(self).apply()
+
+    def fuse_disentangled_attention(self):
+        """Apply the FusionDisentangledAttention pass to this model."""
+        FusionDisentangledAttention(self).apply()
+
+    def gelu_approximation(self):
+        """Apply the FusionGeluApproximation pass to this model."""
+        FusionGeluApproximation(self).apply()
+
+    def fuse_add_bias_skip_layer_norm(self):
+        """Apply the FusionBiasSkipLayerNormalization pass to this model."""
+        FusionBiasSkipLayerNormalization(self).apply()
+
+    def fuse_reshape(self):
+        """Apply the FusionReshape pass to this model."""
+        FusionReshape(self).apply()
+
+    def fuse_shape(self):
+        """Apply the FusionShape pass to this model."""
+        FusionShape(self).apply()
+
+    def fuse_embed_layer(self):
+        """Apply the FusionEmbedLayerNormalization pass to this model."""
+        FusionEmbedLayerNormalization(self).apply()
+
+    def fuse_layer_norm(self):
+        """Apply the layer normalization fusion passes, one after another.
+
+        Order: FusionLayerNormalization, FusionLayerNormalizationTF, then the QOrdered variant.
+        """
+        FusionLayerNormalization(self).apply()
+        FusionLayerNormalizationTF(self).apply()
+
+        # Only relevant in models with Q-DQ nodes
+        FusionQOrderedLayerNormalization(self).apply()
+
+    def fuse_skip_layer_norm(self):
+        """Apply the FusionSkipLayerNormalization pass to this model."""
+        FusionSkipLayerNormalization(self).apply()
+
+    # Only relevant in models with Q-DQ nodes
+    def fuse_qordered_mamtul(self):
+        """Apply the FusionQOrderedMatMul pass to this model."""
+        FusionQOrderedMatMul(self).apply()
+
+    def get_graph_inputs_from_node_type(self, op_type: str, input_indices: List[int], casted: bool):
+        """
+        Collect graph inputs feeding nodes of one op type (like EmbedLayerNormalization or Attention).
+        casted=False keeps direct graph inputs; casted=True keeps graph inputs that arrive through a Cast.
+        """
+        producers = self.output_name_to_node()
+        found = []
+        for node in self.get_nodes_by_op_type(op_type):
+            for index in input_indices:
+                if index >= len(node.input):
+                    continue
+                name = node.input[index]
+                if self.find_graph_input(name):
+                    if not casted:
+                        found.append(name)
+                    continue
+                if name not in producers:
+                    continue
+                parent = producers[name]
+                is_cast_of_graph_input = (
+                    parent.op_type == "Cast"
+                    and self.find_graph_input(parent.input[0]) is not None
+                )
+                if is_cast_of_graph_input and casted:
+                    found.append(parent.input[0])
+        return found
+
+    def get_graph_inputs_from_fused_nodes(self, casted: bool):
+        """Return graph inputs feeding fused EmbedLayerNormalization and Attention nodes."""
+        # The index lists select which input slots of each fused op type are inspected.
+        embed = self.get_graph_inputs_from_node_type("EmbedLayerNormalization", [0, 1, 7], casted)
+        attention = self.get_graph_inputs_from_node_type("Attention", [3], casted)
+        return embed + attention
+
+    def change_graph_input_type(
+        self,
+        graph: GraphProto,
+        graph_input: ValueInfoProto,
+        new_type: int = TensorProto.INT32,
+    ):
+        """Change the element type of one graph input, adding or removing Cast nodes as needed.
+
+        Args:
+            graph (GraphProto): graph that receives the new Cast node and its value info
+            graph_input (ValueInfoProto): input of the graph; its elem_type is updated in place
+            new_type (int, optional): new data type. Defaults to TensorProto.INT32.
+
+        Returns:
+            NodeProto: the Cast node that was added, or None if no Cast node was added.
+            List[NodeProto]: Cast nodes that have been removed.
+        """
+        assert isinstance(graph, GraphProto)
+        assert isinstance(graph_input, ValueInfoProto)
+        assert self.find_graph_input(graph_input.name)
+        # Nothing to change when the input already has the requested element type.
+        if graph_input.type.tensor_type.elem_type == int(new_type):
+            return None, []
+
+        new_cast_node = None
+        nodes_to_remove = []
+
+        input_name_to_nodes = self.input_name_to_nodes()
+        if graph_input.name in input_name_to_nodes:
+            nodes = input_name_to_nodes[graph_input.name]
+
+            # Children that are not Cast nodes get one new Cast back to the input's original data type.
+            nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
+            if nodes_not_cast:
+                node_name = self.create_node_name("Cast")
+                output_name = node_name + "_" + graph_input.name
+                new_value_info = graph.value_info.add()
+                new_value_info.CopyFrom(graph_input)  # copied before elem_type is changed below
+                new_value_info.name = output_name
+                new_cast_node = helper.make_node(
+                    "Cast",
+                    [graph_input.name],
+                    [output_name],
+                    to=int(graph_input.type.tensor_type.elem_type),
+                    name=node_name,
+                )
+                graph.node.extend([new_cast_node])
+
+                for node in nodes_not_cast:
+                    OnnxModel.replace_node_input(node, graph_input.name, output_name)
+            # Cast children need no new Cast; consumers of a Cast to new_type are rewired to the graph input.
+            # NOTE(review): every Cast child whose output is not a graph output is removed, including Casts
+            # to other types whose consumers were not rewired - confirm this is intended.
+            nodes_cast = [node for node in nodes if node.op_type == "Cast"]
+            for node in nodes_cast:
+                if OnnxModel.get_node_attribute(node, "to") == int(new_type):
+                    self.replace_input_of_all_nodes(node.output[0], graph_input.name)
+                if not self.find_graph_output(node.output[0]):
+                    nodes_to_remove.append(node)
+            if nodes_to_remove:
+                self.remove_nodes(nodes_to_remove)
+
+        graph_input.type.tensor_type.elem_type = int(new_type)
+        return new_cast_node, nodes_to_remove
+
+    def change_graph_inputs_to_int32(self):
+        """Switch every graph input to int32 and log how many Cast nodes were added and removed."""
+        add_cast_count = 0
+        remove_cast_count = 0
+        graph = self.graph()
+        # change_graph_input_type returns the Cast it inserted (or None) and the Casts it removed.
+        for graph_input in graph.input:
+            inserted_cast, dropped_casts = self.change_graph_input_type(
+                graph, graph_input, TensorProto.INT32
+            )
+            add_cast_count += 1 if inserted_cast else 0
+            remove_cast_count += len(dropped_casts)
+        logger.info(
+            f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
+        )
+
+    def use_dynamic_axes(self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"):
+        """
+        Update input and output shape to use dynamic axes.
+
+        Dim 0 of every graph input found by get_graph_inputs_from_fused_nodes and of every graph
+        output is named dynamic_batch_dim; dim 1 of those inputs is named dynamic_seq_len unless it is None.
+        """
+        bert_graph_inputs = set(
+            self.get_graph_inputs_from_fused_nodes(casted=True)
+            + self.get_graph_inputs_from_fused_nodes(casted=False)
+        )
+
+        for graph_input in self.model.graph.input:
+            if graph_input.name not in bert_graph_inputs:
+                continue
+            dims = graph_input.type.tensor_type.shape.dim
+            dims[0].dim_param = dynamic_batch_dim
+            if dynamic_seq_len is not None:
+                dims[1].dim_param = dynamic_seq_len
+
+        for graph_output in self.model.graph.output:
+            graph_output.type.tensor_type.shape.dim[0].dim_param = dynamic_batch_dim
+
+    def preprocess(self):
+        """Preprocessing step of optimize(): delegate to adjust_reshape_and_expand()."""
+        self.adjust_reshape_and_expand()
+
+    def adjust_reshape_and_expand(self):
+        """Drop Reshape nodes with an empty constant shape and shortcut Slice->Reshape->Expand->Expand paths."""
+        nodes_to_remove = []
+        for node in self.nodes():
+            if node.op_type == "Reshape":
+                # Clean up unnecessary Reshape nodes: when the constant "shape" input holds no data,
+                # rewire the consumers to the Reshape's data input and mark the node for removal.
+                reshape_shape = self.get_constant_value(node.input[1])
+                if reshape_shape is not None and reshape_shape.size == 0:
+                    nodes_to_remove.extend([node])
+                    self.replace_input_of_all_nodes(node.output[0], node.input[0])
+                    continue
+
+                # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
+                # changing current reshape's input to output of slice. The bypassed nodes are not removed here.
+                reshape_path = self.match_parent_path(
+                    node,
+                    ["Expand", "Expand", "Reshape", "Slice"],
+                    [0, 0, 0, 0],
+                    self.output_name_to_node(),
+                )
+                if reshape_path is not None:
+                    expand_node = reshape_path[-3]  # index -3: the second "Expand" of the 4-node path
+                    expand_shape_value = self.get_constant_value(expand_node.input[1])
+                    reshape_before_expand = reshape_path[-2]
+                    shape_value = self.get_constant_value(
+                        reshape_before_expand.input[1]
+                    )
+                    # Bypass only when both shapes are constants of length 2 and 1 whose sizes match.
+                    slice_node = reshape_path[-1]
+                    if (
+                        expand_shape_value is not None
+                        and shape_value is not None
+                        and len(expand_shape_value) == 2
+                        and len(shape_value) == 1
+                        and expand_shape_value[1] == shape_value[0]
+                    ):
+                        node.input[0] = slice_node.output[0]
+
+        if nodes_to_remove:
+            self.remove_nodes(nodes_to_remove)
+            logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
+
+    def clean_graph(self):
+        """Shortcut shape-computing chains on the first graph input and rebuild Attention nodes without that mask path."""
+        output_name_to_node = self.output_name_to_node()
+        nodes_to_remove = []
+        for node in self.nodes():
+            # Before:
+            #  input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
+            #          |                                                     |
+            #          |                                                     v
+            #          +----> Shape --> Gather(indices=1) --> Unsqueeze--->  Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
+            # After:
+            #  input_ids --> Shape                                                  --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
+            # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
+            op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
+            if node.op_type in op_input_id:
+                i = op_input_id[node.op_type]
+                parent_nodes = self.match_parent_path(
+                    node,
+                    [
+                        "Cast",
+                        "ConstantOfShape",
+                        "Concat",
+                        "Unsqueeze",
+                        "Gather",
+                        "Shape",
+                    ],
+                    [i, 0, 0, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    (
+                        cast,
+                        constantOfShape,
+                        concat,
+                        unsqueeze,
+                        gather,
+                        shape,
+                    ) = parent_nodes
+                    if shape.input[0] == self.graph().input[0].name:
+                        constantOfShape.input[0] = shape.output[0]
+                        output_name_to_node = self.output_name_to_node()  # refresh the map after rewiring
+
+            if node.op_type == "Attention":
+                # Before: input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
+                # After: remove this path, and remove the optional mask_index input of Attention node.
+                # NOTE(review): the replacement node only gets num_heads; other attributes are not copied here.
+                parent_nodes = self.match_parent_path(
+                    node,
+                    ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
+                    [3, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    if parent_nodes[-1].input[0] == self.graph().input[0].name:
+                        attention_node = helper.make_node(
+                            "Attention",
+                            inputs=node.input[0 : len(node.input) - 1],
+                            outputs=node.output,
+                            name=node.name + "_remove_mask",
+                        )
+                        attention_node.domain = "com.microsoft"
+                        attention_node.attribute.extend(
+                            [helper.make_attribute("num_heads", self.num_heads)]
+                        )
+                        self.add_node(
+                            attention_node, self.get_graph_by_node(attention_node).name
+                        )
+                        nodes_to_remove.append(node)
+        self.remove_nodes(nodes_to_remove)
+
+    def postprocess(self):
+        self.clean_graph()  # shortcut shape chains and rebuild Attention nodes without removable masks
+        self.prune_graph()  # not defined in this class; presumably inherited from OnnxModel
+
+    def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False):
+        """Apply the fusion passes to this model in place.
+
+        Args:
+            options (FusionOptions, optional): per-pass switches. With None, the passes guarded by
+                "options is None or ..." run and the opt-in passes are skipped.
+            add_dynamic_axes (bool, optional): call use_dynamic_axes() at the end. Defaults to False.
+        """
+        if (options is not None) and not options.enable_shape_inference:
+            self.disable_shape_inference()
+
+        self.utils.remove_identity_nodes()
+
+        # Remove cast nodes that having same data type of input and output based on symbolic shape inference.
+        self.utils.remove_useless_cast_nodes()
+
+        if (options is None) or options.enable_layer_norm:
+            self.fuse_layer_norm()
+
+        if (options is None) or options.enable_gelu:
+            self.fuse_gelu()
+
+        self.preprocess()
+        self.fuse_reshape()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        # Opt-in passes: options may be None, so check it before reading each flag.
+        if options is not None and options.enable_swint_opt:
+            self.fuse_custom_fc()
+            self.fuse_swinT_serial_bias_add()
+        if options is not None and options.enable_format_roformer:
+            self.fuse_format_roformer()
+        if options is not None and options.enable_gpt2_classify:
+            self.fuse_custom_fc_gpt2_classify()
+
+        if (options is None) or options.enable_attention:
+            if options is not None:
+                self.attention_mask.set_mask_format(options.attention_mask_format)
+            self.fuse_attention()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        self.fuse_custom_fc()
+        self.fuse_custom_xsoftmax()
+        self.fuse_disentangled_attention()
+
+        # Perform the MatMul fusion after the Attention fusion as we do not
+        # want to fuse the MatMuls inside the Attention subgraphs
+        if (options is None) or options.enable_qordered_matmul:
+            self.fuse_qordered_mamtul()
+
+        self.fuse_shape()
+
+        if (options is None) or options.enable_embed_layer_norm:
+            self.fuse_embed_layer()
+
+        # Remove reshape nodes that having same shape of input and output based on symbolic shape inference.
+        self.utils.remove_useless_reshape_nodes()
+
+        self.postprocess()
+
+        # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
+        if (options is None) or options.enable_bias_gelu:
+            # Fuse Gelu and Add Bias before it.
+            self.fuse_bias_gelu(is_fastgelu=True)
+            self.fuse_bias_gelu(is_fastgelu=False)
+
+        if (options is None) or options.enable_bias_skip_layer_norm:
+            # Fuse SkipLayerNormalization and Add Bias before it.
+            self.fuse_add_bias_skip_layer_norm()
+
+        if options is not None and options.enable_gelu_approximation:
+            self.gelu_approximation()
+
+        self.fuse_custom_fc_activation()
+        self.remove_unused_constant()
+
+        # Use symbolic batch dimension in input and output.
+        if add_dynamic_axes:
+            self.use_dynamic_axes()
+
+        logger.info(f"opset version: {self.get_opset_version()}")
+
+    def get_fused_operator_statistics(self):
+        """
+        Count the nodes of every fused operator type present in the graph.
+        The counts are logged at INFO level and returned as a dict keyed by operator type.
+        """
+        # Operator types to report; the returned dict keeps this order.
+        fused_op_types = (
+            "EmbedLayerNormalization",
+            "Attention",
+            "QOrderedAttention",
+            "Gelu",
+            "QOrderedGelu",
+            "FastGelu",
+            "BiasGelu",
+            "LayerNormalization",
+            "QOrderedLayerNormalization",
+            "SkipLayerNormalization",
+            "QOrderedMatMul",
+        )
+        op_count = {op: len(self.get_nodes_by_op_type(op)) for op in fused_op_types}
+
+        logger.info(f"Optimized operators:{op_count}")
+        return op_count
+
+    def is_fully_optimized(self):
+        """
+        Check the fused operator counts and report whether every expected fusion happened.
+
+        Returns True only if an embedding layer was fused, at least one attention was fused,
+        there is one Gelu-type node per attention and at least two layer norms per attention.
+        """
+        counts = self.get_fused_operator_statistics()
+        embed = counts["EmbedLayerNormalization"]
+        attention = counts["Attention"] + counts["QOrderedAttention"]
+        gelu = counts["Gelu"] + counts["BiasGelu"] + counts["FastGelu"]
+        layer_norm = counts["LayerNormalization"] + counts["SkipLayerNormalization"]
+
+        # Missing attention fusion is a warning; the other gaps are only logged at debug level.
+        if layer_norm == 0:
+            logger.debug("Layer Normalization not fused")
+
+        if gelu == 0:
+            logger.debug("Gelu/FastGelu not fused")
+
+        if embed == 0:
+            logger.debug("Embed Layer not fused")
+
+        if attention == 0:
+            logger.warning("Attention not fused")
+
+        if embed <= 0 or attention <= 0:
+            return False
+        return attention == gelu and layer_norm >= 2 * attention
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py
new file mode 100755
index 000000000..4b1d6b5fe
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py
@@ -0,0 +1,519 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Optional
+
+import onnx
+from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
+from passes.fusion_attention import AttentionMask, FusionAttention
+from passes.fusion_biasgelu import FusionBiasGelu
+from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
+from passes.fusion_embedlayer import FusionEmbedLayerNormalization
+from passes.fusion_fastgelu import FusionFastGelu
+from passes.fusion_gelu import FusionGelu
+from passes.fusion_gelu_approximation import FusionGeluApproximation
+from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
+from passes.fusion_options import FusionOptions
+from passes.fusion_qordered_attention import FusionQOrderedAttention
+from passes.fusion_qordered_gelu import FusionQOrderedGelu
+from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
+from passes.fusion_qordered_matmul import FusionQOrderedMatMul
+from passes.fusion_reshape import FusionReshape
+from passes.fusion_rms_norm import FusionRMSNorm
+from passes.fusion_shape import FusionShape
+from passes.fusion_skiplayernorm import (
+    FusionBiasSkipLayerNormalization,
+    FusionSkipLayerNormalization,
+)
+from passes.fusion_t5_attention import FusionT5Attention
+from passes.fusion_utils import FusionUtils
+from passes.onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class BertOptimizationOptions(FusionOptions):
+    """Deprecated subclass of FusionOptions that logs a warning when constructed."""
+
+    def __init__(self, model_type):
+        logger.warning(
+            "BertOptimizationOptions is deprecated. Please use FusionOptions instead."
+        )
+        super().__init__(model_type)
+
+
+class T5OnnxModel(OnnxModel):
+    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
+        """Wrap a T5 ONNX model and prepare the attention fusion helpers.
+
+        Args:
+            model (ModelProto): the ONNX model to optimize.
+            num_heads (int, optional): attention head count; 0 (default) means detect automatically.
+            hidden_size (int, optional): hidden dimension; 0 (default) means detect automatically.
+        """
+        # Either both values are left at 0, or hidden_size divides evenly by num_heads.
+        auto_detect = num_heads == 0 and hidden_size == 0
+        assert auto_detect or (num_heads > 0 and hidden_size % num_heads == 0)
+
+        super().__init__(model)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        # The mask helper is shared by the regular and the QOrdered attention fusions.
+        mask = AttentionMask(self)
+        self.attention_mask = mask
+        self.attention_fusion = FusionAttention(self, hidden_size, num_heads, mask)
+        self.qordered_attention_fusion = FusionQOrderedAttention(
+            self, hidden_size, num_heads, mask
+        )
+        self.utils = FusionUtils(self)
+
+    def fuse_custom_fc(self):
+        """Apply the FusionCustomFC pass to this model."""
+        FusionCustomFC(self).apply()
+
+    def fuse_custom_fc_activation(self):
+        """Apply the FusionCustomFCActivation pass to this model."""
+        FusionCustomFCActivation(self).apply()
+
+    def fuse_gelu(self):
+        """Apply the Gelu, FastGelu and QOrderedGelu fusion passes in that order."""
+        FusionGelu(self).apply()
+        FusionFastGelu(self).apply()
+
+        # Only relevant in models with Q-DQ nodes
+        qordered_gelu_fusion = FusionQOrderedGelu(self)
+        qordered_gelu_fusion.apply()
+
+    def fuse_bias_gelu(self, is_fastgelu):
+        """Apply FusionBiasGelu; is_fastgelu is forwarded to the pass unchanged."""
+        FusionBiasGelu(self, is_fastgelu).apply()
+
+    def gelu_approximation(self):
+        """Apply the FusionGeluApproximation pass to this model."""
+        FusionGeluApproximation(self).apply()
+
+    def fuse_add_bias_skip_layer_norm(self):
+        """Apply the FusionBiasSkipLayerNormalization pass to this model."""
+        FusionBiasSkipLayerNormalization(self).apply()
+
+    def fuse_reshape(self):
+        """Apply the FusionReshape pass to this model."""
+        FusionReshape(self).apply()
+
+    def fuse_shape(self):
+        """Apply the FusionShape pass to this model."""
+        FusionShape(self).apply()
+
+    def fuse_embed_layer(self):
+        """Apply the FusionEmbedLayerNormalization pass to this model."""
+        FusionEmbedLayerNormalization(self).apply()
+
+    def fuse_rms_norm(self):
+        """Apply the FusionRMSNorm pass to this model."""
+        FusionRMSNorm(self).apply()
+
+    def fuse_t5_attention(self):
+        """Apply the FusionT5Attention pass to this model."""
+        t5_attention_fusion = FusionT5Attention(self)
+        t5_attention_fusion.apply()
+
+    def fuse_layer_norm(self):
+        """Apply the layer normalization fusion passes, one after another.
+
+        Order: FusionLayerNormalization, FusionLayerNormalizationTF, then the QOrdered variant.
+        """
+        FusionLayerNormalization(self).apply()
+        FusionLayerNormalizationTF(self).apply()
+
+        # Only relevant in models with Q-DQ nodes
+        FusionQOrderedLayerNormalization(self).apply()
+
+    def fuse_skip_layer_norm(self):
+        """Apply the FusionSkipLayerNormalization pass to this model."""
+        FusionSkipLayerNormalization(self).apply()
+
+    # Only relevant in models with Q-DQ nodes
+    def fuse_qordered_mamtul(self):
+        """Apply the FusionQOrderedMatMul pass to this model."""
+        FusionQOrderedMatMul(self).apply()
+
+    def get_graph_inputs_from_node_type(self, op_type: str, input_indices: List[int], casted: bool):
+        """
+        Collect graph inputs feeding nodes of one op type (like EmbedLayerNormalization or Attention).
+        casted=False keeps direct graph inputs; casted=True keeps graph inputs that arrive through a Cast.
+        """
+        producers = self.output_name_to_node()
+        found = []
+        for node in self.get_nodes_by_op_type(op_type):
+            for index in input_indices:
+                if index >= len(node.input):
+                    continue
+                name = node.input[index]
+                if self.find_graph_input(name):
+                    if not casted:
+                        found.append(name)
+                    continue
+                if name not in producers:
+                    continue
+                parent = producers[name]
+                is_cast_of_graph_input = (
+                    parent.op_type == "Cast"
+                    and self.find_graph_input(parent.input[0]) is not None
+                )
+                if is_cast_of_graph_input and casted:
+                    found.append(parent.input[0])
+        return found
+
+    def get_graph_inputs_from_fused_nodes(self, casted: bool):
+        """Return graph inputs feeding fused EmbedLayerNormalization and Attention nodes."""
+        # The index lists select which input slots of each fused op type are inspected.
+        embed = self.get_graph_inputs_from_node_type("EmbedLayerNormalization", [0, 1, 7], casted)
+        attention = self.get_graph_inputs_from_node_type("Attention", [3], casted)
+        return embed + attention
+
+    def change_graph_input_type(
+        self,
+        graph: GraphProto,
+        graph_input: ValueInfoProto,
+        new_type: int = TensorProto.INT32,
+    ):
+        """Change the element type of one graph input, adding or removing Cast nodes as needed.
+
+        Args:
+            graph (GraphProto): graph that receives the new Cast node and its value info
+            graph_input (ValueInfoProto): input of the graph; its elem_type is updated in place
+            new_type (int, optional): new data type. Defaults to TensorProto.INT32.
+
+        Returns:
+            NodeProto: the Cast node that was added, or None if no Cast node was added.
+            List[NodeProto]: Cast nodes that have been removed.
+        """
+        assert isinstance(graph, GraphProto)
+        assert isinstance(graph_input, ValueInfoProto)
+        assert self.find_graph_input(graph_input.name)
+        # Nothing to change when the input already has the requested element type.
+        if graph_input.type.tensor_type.elem_type == int(new_type):
+            return None, []
+
+        new_cast_node = None
+        nodes_to_remove = []
+
+        input_name_to_nodes = self.input_name_to_nodes()
+        if graph_input.name in input_name_to_nodes:
+            nodes = input_name_to_nodes[graph_input.name]
+
+            # Children that are not Cast nodes get one new Cast back to the input's original data type.
+            nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
+            if nodes_not_cast:
+                node_name = self.create_node_name("Cast")
+                output_name = node_name + "_" + graph_input.name
+                new_value_info = graph.value_info.add()
+                new_value_info.CopyFrom(graph_input)  # copied before elem_type is changed below
+                new_value_info.name = output_name
+                new_cast_node = helper.make_node(
+                    "Cast",
+                    [graph_input.name],
+                    [output_name],
+                    to=int(graph_input.type.tensor_type.elem_type),
+                    name=node_name,
+                )
+                graph.node.extend([new_cast_node])
+
+                for node in nodes_not_cast:
+                    OnnxModel.replace_node_input(node, graph_input.name, output_name)
+            # Cast children need no new Cast; consumers of a Cast to new_type are rewired to the graph input.
+            # NOTE(review): every Cast child whose output is not a graph output is removed, including Casts
+            # to other types whose consumers were not rewired - confirm this is intended.
+            nodes_cast = [node for node in nodes if node.op_type == "Cast"]
+            for node in nodes_cast:
+                if OnnxModel.get_node_attribute(node, "to") == int(new_type):
+                    self.replace_input_of_all_nodes(node.output[0], graph_input.name)
+                if not self.find_graph_output(node.output[0]):
+                    nodes_to_remove.append(node)
+            if nodes_to_remove:
+                self.remove_nodes(nodes_to_remove)
+
+        graph_input.type.tensor_type.elem_type = int(new_type)
+        return new_cast_node, nodes_to_remove
+
+    def change_graph_inputs_to_int32(self):
+        """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
+        graph = self.graph()
+        add_cast_count = 0
+        remove_cast_count = 0
+        for graph_input in graph.input:
+            new_node, removed_nodes = self.change_graph_input_type(
+                graph, graph_input, TensorProto.INT32
+            )
+            if new_node:
+                add_cast_count += 1
+            remove_cast_count += len(removed_nodes)
+        logger.info(
+            f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
+        )
+
+    def use_dynamic_axes(
+        self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
+    ):
+        """
+        Update input and output shape to use dynamic axes.
+        """
+        bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
+            casted=True
+        ) + self.get_graph_inputs_from_fused_nodes(casted=False)
+
+        dynamic_batch_inputs = {}
+        for input in self.model.graph.input:
+            if input.name in bert_graph_inputs:
+                dim_proto = input.type.tensor_type.shape.dim[0]
+                dim_proto.dim_param = dynamic_batch_dim
+                if dynamic_seq_len is not None:
+                    dim_proto = input.type.tensor_type.shape.dim[1]
+                    dim_proto.dim_param = dynamic_seq_len
+
+        for output in self.model.graph.output:
+            dim_proto = output.type.tensor_type.shape.dim[0]
+            dim_proto.dim_param = dynamic_batch_dim
+
+    def preprocess(self):
+        self.adjust_reshape_and_expand()
+        return
+
+    def adjust_reshape_and_expand(self):
+        nodes_to_remove = []
+        for node in self.nodes():
+            if node.op_type == "Reshape":
+                # Clean up unneccessary reshape nodes.
+                # Find reshape nodes with no actually data in "shape" attribute and remove.
+                reshape_shape = self.get_constant_value(node.input[1])
+                if reshape_shape is not None and reshape_shape.size == 0:
+                    nodes_to_remove.extend([node])
+                    self.replace_input_of_all_nodes(node.output[0], node.input[0])
+                    continue
+
+                # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
+                # changing current reshape's input to output of slice.
+                reshape_path = self.match_parent_path(
+                    node,
+                    ["Expand", "Expand", "Reshape", "Slice"],
+                    [0, 0, 0, 0],
+                    self.output_name_to_node(),
+                )
+                if reshape_path is not None:
+                    expand_node = reshape_path[-3]
+                    expand_shape_value = self.get_constant_value(expand_node.input[1])
+
+                    reshape_before_expand = reshape_path[-2]
+                    shape_value = self.get_constant_value(
+                        reshape_before_expand.input[1]
+                    )
+
+                    slice_node = reshape_path[-1]
+                    if (
+                        expand_shape_value is not None
+                        and shape_value is not None
+                        and len(expand_shape_value) == 2
+                        and len(shape_value) == 1
+                        and expand_shape_value[1] == shape_value[0]
+                    ):
+                        node.input[0] = slice_node.output[0]
+
+        if nodes_to_remove:
+            self.remove_nodes(nodes_to_remove)
+            logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
+
+    def clean_graph(self):
+        output_name_to_node = self.output_name_to_node()
+        nodes_to_remove = []
+        for node in self.nodes():
+            # Before:
+            #  input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
+            #          |                                                     |
+            #          |                                                     v
+            #          +----> Shape --> Gather(indices=1) --> Unsqueeze--->  Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum
+            # After:
+            #  input_ids --> Shape                                                  --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum
+            # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
+            op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
+            if node.op_type in op_input_id:
+                i = op_input_id[node.op_type]
+                parent_nodes = self.match_parent_path(
+                    node,
+                    [
+                        "Cast",
+                        "ConstantOfShape",
+                        "Concat",
+                        "Unsqueeze",
+                        "Gather",
+                        "Shape",
+                    ],
+                    [i, 0, 0, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    (
+                        cast,
+                        constantOfShape,
+                        concat,
+                        unsqueeze,
+                        gather,
+                        shape,
+                    ) = parent_nodes
+                    if shape.input[0] == self.graph().input[0].name:
+                        constantOfShape.input[0] = shape.output[0]
+                        output_name_to_node = self.output_name_to_node()
+
+            if node.op_type == "Attention":
+                # Before:
+                #   input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
+                # After:
+                #   remove this path, and remove the optional mask_index input of Attention node.
+                parent_nodes = self.match_parent_path(
+                    node,
+                    ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
+                    [3, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    if parent_nodes[-1].input[0] == self.graph().input[0].name:
+                        attention_node = helper.make_node(
+                            "Attention",
+                            inputs=node.input[0 : len(node.input) - 1],
+                            outputs=node.output,
+                            name=node.name + "_remove_mask",
+                        )
+                        attention_node.domain = "com.microsoft"
+                        attention_node.attribute.extend(
+                            [helper.make_attribute("num_heads", self.num_heads)]
+                        )
+                        self.add_node(
+                            attention_node, self.get_graph_by_node(attention_node).name
+                        )
+                        nodes_to_remove.append(node)
+        self.remove_nodes(nodes_to_remove)
+
+    def postprocess(self):
+        self.clean_graph()  # simplify the shape-derived mask subgraphs first
+        self.prune_graph()  # presumably drops nodes left unused afterwards — defined outside this view
+
+    def optimize(
+        self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
+    ):
+        if (options is not None) and not options.enable_shape_inference:
+            self.disable_shape_inference()
+
+        self.utils.remove_identity_nodes()
+
+        # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
+        self.utils.remove_useless_cast_nodes()
+
+        if (options is None) or options.enable_layer_norm:  # options is None -> fusion enabled by default
+            self.fuse_layer_norm()
+
+        if (options is None) or options.enable_gelu:
+            self.fuse_gelu()
+
+        self.preprocess()
+
+        self.fuse_reshape()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        # NOTE(review): the original note said MatMul fusion must follow Attention fusion so that MatMuls inside
+        # Attention subgraphs are not fused, but here fuse_t5_attention() is called after this step — confirm.
+        if (options is None) or options.enable_qordered_matmul:
+            self.fuse_qordered_mamtul()
+
+        self.fuse_shape()
+
+        self.fuse_rms_norm()
+
+        self.fuse_t5_attention()
+
+        if (options is None) or options.enable_embed_layer_norm:
+            self.fuse_embed_layer()
+
+        # Remove Reshape nodes whose input and output have the same shape, based on symbolic shape inference.
+        self.utils.remove_useless_reshape_nodes()
+
+        self.postprocess()
+
+        # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
+        if (options is None) or options.enable_bias_gelu:
+            # Fuse Gelu and Add Bias before it.
+            self.fuse_bias_gelu(is_fastgelu=True)
+            self.fuse_bias_gelu(is_fastgelu=False)
+
+        if (options is None) or options.enable_bias_skip_layer_norm:
+            # Fuse SkipLayerNormalization and Add Bias before it.
+            self.fuse_add_bias_skip_layer_norm()
+
+        if options is not None and options.enable_gelu_approximation:  # opt-in: skipped when options is None
+            self.gelu_approximation()
+
+        self.remove_unused_constant()
+
+        # Use symbolic batch dimension in input and output.
+        if add_dynamic_axes:
+            self.use_dynamic_axes()
+
+        logger.info(f"opset version: {self.get_opset_version()}")
+
+    def get_fused_operator_statistics(self):
+        """
+        Returns node count of fused operators.
+        """
+        op_count = {}
+        ops = [
+            "EmbedLayerNormalization",
+            "Attention",
+            "QOrderedAttention",
+            "Gelu",
+            "QOrderedGelu",
+            "FastGelu",
+            "BiasGelu",
+            "LayerNormalization",
+            "QOrderedLayerNormalization",
+            "SkipLayerNormalization",
+            "QOrderedMatMul",
+        ]
+        for op in ops:
+            nodes = self.get_nodes_by_op_type(op)
+            op_count[op] = len(nodes)
+        logger.info(f"Optimized operators:{op_count}")
+        return op_count
+
+    def is_fully_optimized(self):
+        """
+        Returns True when the model is fully optimized.
+        """
+        op_count = self.get_fused_operator_statistics()
+        embed = op_count["EmbedLayerNormalization"]
+        attention = op_count["Attention"] + op_count["QOrderedAttention"]
+        gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
+        layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
+        is_perfect = (
+            (embed > 0)
+            and (attention > 0)
+            and (attention == gelu)
+            and (layer_norm >= 2 * attention)
+        )
+
+        if layer_norm == 0:
+            logger.debug("Layer Normalization not fused")
+
+        if gelu == 0:
+            logger.debug("Gelu/FastGelu not fused")
+
+        if embed == 0:
+            logger.debug("Embed Layer not fused")
+
+        if attention == 0:
+            logger.warning("Attention not fused")
+
+        return is_perfect
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
new file mode 100755
index 000000000..88e6c99c1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
@@ -0,0 +1,114 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Optional
+
+from onnx import ModelProto
+from passes.fuse_series_bias_add import FusionSerialBiasAdd
+from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation
+from passes.fusion_fastgelu import FusionFastGelu
+from passes.fusion_format_roformer import (
+    FusionFormatInvalidMask,
+    FusionRemoveUselessElementwise,
+)
+from passes.fusion_gelu import FusionGelu
+from passes.fusion_gelu_approximation import FusionGeluApproximation
+from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
+from passes.fusion_options import FusionOptions
+from passes.fusion_qordered_attention import FusionQOrderedAttention
+from passes.fusion_qordered_gelu import FusionQOrderedGelu
+from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
+from passes.fusion_reshape import FusionReshape
+from passes.fusion_shape import FusionShape
+from passes.fusion_utils import FusionUtils
+from passes.fusion_yolov5_decoder import FusionYoloV5Decoder
+from passes.onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class YoloOnnxModel(OnnxModel):
+    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
+        """Initialize YOLO ONNX Model.
+
+        Args:
+            model (ModelProto): the ONNX model
+            num_heads (int, optional): number of attention heads. Only validated here, not stored. Defaults to 0.
+            hidden_size (int, optional): hidden dimension. Only validated here, not stored. Defaults to 0.
+        """
+        assert (num_heads == 0 and hidden_size == 0) or (
+            num_heads > 0 and hidden_size % num_heads == 0
+        )
+        super().__init__(model)
+        self.utils = FusionUtils(self)
+
+    def fuse_format_roformer(self):
+        FusionRemoveUselessElementwise(self).apply()
+        fusion = FusionFormatInvalidMask(self)
+        fusion.apply()
+
+    def fuse_custom_fc(self):
+        fusion = FusionCustomFC(self)
+        fusion.apply()
+
+    def fuse_custom_fc_activation(self):
+        fusion = FusionCustomFCActivation(self)
+        fusion.apply()
+
+    def fuse_swinT_serial_bias_add(self):
+        fusion = FusionSerialBiasAdd(self)
+        fusion.apply()
+
+    def fuse_gelu(self):
+        fusion = FusionGelu(self)
+        fusion.apply()
+        fusion = FusionFastGelu(self)
+        fusion.apply()
+        # Only relevant in models with Q-DQ nodes
+        fusion = FusionQOrderedGelu(self)
+        fusion.apply()
+
+    def fuse_reshape(self):
+        fusion = FusionReshape(self)
+        fusion.apply()
+
+    def fuse_shape(self):
+        fusion = FusionShape(self)
+        fusion.apply()
+
+    def fuse_layer_norm(self):
+        fusion = FusionLayerNormalization(self)
+        fusion.apply()
+
+        fusion = FusionLayerNormalizationTF(self)
+        fusion.apply()
+
+        # Only relevant in models with Q-DQ nodes
+        fusion = FusionQOrderedLayerNormalization(self)
+        fusion.apply()
+
+    def optimize(
+        self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
+    ):
+        if (options is not None) and not options.enable_shape_inference:
+            self.disable_shape_inference()
+
+        self.utils.remove_identity_nodes()
+
+        # Remove Cast nodes whose input and output have the same data type, based on symbolic shape inference.
+        self.utils.remove_useless_cast_nodes()
+
+        if (options is None) or options.enable_layer_norm:  # options is None -> fusion enabled by default
+            self.fuse_layer_norm()
+
+        if (options is None) or options.enable_gelu:
+            self.fuse_gelu()
+
+        self.fuse_reshape()
+
+        FusionYoloV5Decoder(self).apply()  # NOTE: add_dynamic_axes is accepted but never read in this method
+        self.remove_unused_constant()
+        logger.info(f"opset version: {self.get_opset_version()}")
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md
new file mode 100755
index 000000000..dc823d366
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md
@@ -0,0 +1,51 @@
+# IxRT optimizer
+
+## 1. optimizer 简介
+`optimizer` 是一个 ixrt 中集成的图融合工具，用于将onnx图中的op融合成对应的ixrt plugin；
+
+## 2. optimizer 功能说明
+| 功能           | 说明  |
+| -------------- | ---- |
+| 多 batchsize 支持 | 支持设置不同 batchsize 进行推理测试 |
+| 动态图支持 | 支持融合动态图和静态图 |
+| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert等模型 |
+
+## 3. optimizer 运行参数
+| 参数           | 说明  |
+| -------------- | ---- |
+| `--onnx`       | 必选 ，指定要运行的 onnx 模型路径 |
+| `--num_heads`  | 可选 ，指定模型对应Attention模块注意力头的个数 |
+|`--hidden_size`    | 可选 ，指定模型隐藏层的大小 |
+|`--input_shapes` | 可选 ，指定模型输入的静态形状，格式示例：--input_shapes "input_name1:3x224x224, input_name2:3x224x224" |
+| `--dump_onnx` | 可选 ，用于图融合过程中dump出中间的onnx图 |
+|`--model_type`        | 可选 ，可以指定要融合的模型类型，默认是"bert", 可选["bert", "swint", "roformer", "t5", "yolo", "gpt2"]|
+|`--log_level`     |可选 ，指定ixrt运行时显示日志的等级， 可指定为debug、info、error，默认为 info|
+
+
+## 4. 运行示例
+
+###  4.1 示例1：融合albert|videobert|roberta|deberta
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH}
+```
+
+###  4.2 示例2：融合swinL
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint
+```
+
+###  4.3 示例3：融合roformer
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer
+```
+
+### 4.4 精度验证
+
+请参考[高级话题](5_advanced_topics.md)中的<u>精度对比工具</u>一节，了解详细使用方法和原理。
+
+也可以用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md)
+
+具体使用方法可以参考oss/samples
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
new file mode 100755
index 000000000..49ed79498
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
@@ -0,0 +1,188 @@
+import argparse
+import logging
+import time
+from typing import Dict, Optional
+
+import onnx
+from onnx import ModelProto, helper, load_model
+from onnx_model_bert import BertOnnxModel
+from onnx_model_t5 import T5OnnxModel
+from onnx_model_yolo import YoloOnnxModel
+from onnxsim import simplify
+from passes.fusion_options import FusionOptions
+from passes.symbolic_shape_infer import SymbolicShapeInference
+
+logger = logging.getLogger(__name__)
+MODEL_TYPES = {  # model_type -> (optimizer class, transformer class, expected producer name, unused 4th field)
+    "bert": (BertOnnxModel, None, "pytorch", 1),
+    "swint": (BertOnnxModel, None, "pytorch", 1),
+    "roformer": (BertOnnxModel, None, "tf2onnx", 1),
+    "gpt2": (BertOnnxModel, None, "pytorch", 1),
+    "t5": (T5OnnxModel, None, "tf2onnx", 1),
+    "yolo": (YoloOnnxModel, None, "pytorch", 1),
+}
+
+
+def optimize_by_fusion(
+    model: ModelProto,
+    model_type: str = "bert",
+    num_heads: int = 0,
+    hidden_size: int = 0,
+    optimization_options: Optional[FusionOptions] = None,
+):
+    """Optimize Model by graph fusion logic.
+
+    Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable
+    constant folding during exporting ONNX model, or run optimize_by_onnxruntime on the model first like optimize_model.
+
+    For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters.
+
+    Args:
+        model (ModelProto): model object
+        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
+        num_heads (int, optional): number of attention heads. Defaults to 0.
+                                   0 allows detect the parameter from graph automatically (for model_type "bert" only).
+        hidden_size (int, optional): hidden size. Defaults to 0.
+                                     0 allows detect the parameter from graph automatically (for model_type "bert" only).
+        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None.
+
+     Returns:
+        object of an optimizer class.
+    """
+    if model_type != "bert" and (num_heads == 0 or hidden_size == 0):
+        logger.warning(
+            "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'"
+        )
+
+    (optimizer_class, transformer_class, producer, _) = MODEL_TYPES[model_type]
+
+    if model.producer_name and producer != model.producer_name:
+        logger.warning(
+            f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".'
+            "Please specify correct --model_type parameter."
+        )
+
+    if optimization_options is None:
+        optimization_options = FusionOptions(model_type)
+
+    optimizer = optimizer_class(model, num_heads, hidden_size)
+
+    optimizer.optimize(optimization_options)
+
+    optimizer.topological_sort()
+
+    return optimizer, transformer_class
+
+
+def optimize_to_ixrt(args):
+    onnx_name = args.onnx[:-5]
+    model = onnx.load(args.onnx)
+
+    logger.info("simplify..")
+    simplified_model, check = simplify(model)
+    logger.info("simplify model end...")
+    if args.dump_onnx:
+        onnx.save(simplified_model, onnx_name + "_sim.onnx")
+
+    # transfer to static shape and optimize it
+    static_sim_model = simplified_model
+    if args.input_shapes:
+        for input_tensor in simplified_model.graph.input:
+            if input_tensor.name in args.input_shapes.keys():
+                new_shape = args.input_shapes[input_tensor.name]
+                dim_list = []
+                for dim in new_shape:
+                    if isinstance(dim, int):
+                        dim_proto = onnx.TensorShapeProto.Dimension()
+                        dim_proto.dim_value = dim
+                        dim_list.append(dim_proto)
+                    elif isinstance(dim, str):
+                        dim_proto = onnx.TensorShapeProto.Dimension()
+                        dim_proto.dim_param = dim
+                        dim_list.append(dim_proto)
+
+                del input_tensor.type.tensor_type.shape.dim[:]
+                input_tensor.type.tensor_type.shape.dim.extend(dim_list)
+
+    try:
+        static_model = SymbolicShapeInference.infer_shapes(
+            simplified_model, 2**31 - 1, False, False, 3
+        )
+        static_sim_model, check = simplify(static_model)
+        if args.dump_onnx:
+            onnx.save(static_sim_model, onnx_name + "_sim_static_sim.onnx")
+    except Exception as e:
+        static_model = static_sim_model = simplified_model
+
+    if args.dump_onnx:
+        onnx.save(static_model, onnx_name + "_sim_static.onnx")
+
+    logger.info("start fusion..")
+    opt_model, _ = optimize_by_fusion(
+        static_sim_model, args.model_type, args.num_heads, args.hidden_size
+    )
+    opt_model.save_model_to_file(onnx_name + "_end.onnx")
+    logger.info("done..")
+
+
+def parse_params(params_str):
+    params = {}
+    for item in params_str.replace(" ", "").split(","):
+        key, value = item.split(":")
+        params[key] = [int(x) if x.isdigit() else x for x in value.split("x")]
+    return params
+
+
+def args_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--onnx", type=str, default=None, required=False, help="ONNX model file path"
+    )
+    parser.add_argument(
+        "--num_heads",
+        type=int,
+        default=0,
+        help="Used in model optimization. The num of the head used in the network",
+    )
+    parser.add_argument(
+        "--hidden_size",
+        type=int,
+        default=0,
+        help="Used in model optimization. The hidden_size used in the network",
+    )
+    parser.add_argument(
+        "--input_shapes",
+        type=parse_params,
+        help='Static input_shapes to the inference, format is --input_shapes "input_name1:3x224x224, input_name2:3x224x224"',
+    )
+    parser.add_argument(
+        "--dump_onnx",
+        action="store_true",
+        help="Whether to dump onnx",
+    )
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="bert",
+        choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2"],
+        help="Which kind of model to optimize",
+    )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        default="info",
+        choices=["debug", "info", "error"],
+        help="Which kind of model to optimize",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = args_parser()
+    if args.log_level == "info":
+        logging.basicConfig(level=logging.INFO)
+    elif args.log_level == "debug":
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.ERROR)
+    optimize_to_ixrt(args)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py
new file mode 100755
index 000000000..437e72fce
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py
@@ -0,0 +1,394 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
+# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option.
+
+import itertools
+import logging
+from typing import Dict, List
+
+import numpy as np
+import onnx
+from onnx import helper, numpy_helper
+from onnx import onnx_pb as onnx_proto
+from packaging import version
+
+logger = logging.getLogger(__name__)
+
+
+def _npfloat16_to_int(np_list):
+    """
+    Reinterpret each numpy float16 value as its unsigned 16-bit pattern.
+
+    :param np_list: iterable of numpy float16 scalars
+    :return int_list: list of python ints, one bit pattern per input value
+    """
+    return [int(half.view("H")) for half in np_list]
+
+
+def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
+    """
+    Cast a numpy array to float16, clamping magnitudes into the given range first.
+    Values in (0, min_positive_val) become min_positive_val and finite values above
+    max_finite_val become max_finite_val; negative values are handled symmetrically.
+    NaN, 0, inf and -inf fail every strict comparison below and pass through as-is.
+    """
+    too_small_pos = (np_array > 0) & (np_array < min_positive_val)
+    np_array = np.where(too_small_pos, min_positive_val, np_array)
+    too_small_neg = (np_array > -min_positive_val) & (np_array < 0)
+    np_array = np.where(too_small_neg, -min_positive_val, np_array)
+    too_large_pos = (np_array > max_finite_val) & (np_array < float("inf"))
+    np_array = np.where(too_large_pos, max_finite_val, np_array)
+    too_large_neg = (np_array > float("-inf")) & (np_array < -max_finite_val)
+    np_array = np.where(too_large_neg, -max_finite_val, np_array)
+    return np.float16(np_array)
+
+
+def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
+    """Convert a float32 TensorProto to float16 in place.
+
+    Tensors whose data_type is not FLOAT are returned untouched.
+
+    Args:
+        tensor (TensorProto): the tensor to convert.
+        min_positive_val (float, optional): smallest positive magnitude kept. Defaults to 5.96e-08.
+        max_finite_val (float, optional): largest finite magnitude kept. Defaults to 65504.0.
+
+    Raises:
+        ValueError: input type is not TensorProto.
+
+    Returns:
+        TensorProto: the same tensor object, converted when it was FLOAT.
+    """
+    if not isinstance(tensor, onnx_proto.TensorProto):
+        raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
+
+    # Guard clause: only float32 tensors need any work.
+    if tensor.data_type != onnx_proto.TensorProto.FLOAT:
+        return tensor
+
+    tensor.data_type = onnx_proto.TensorProto.FLOAT16
+    if tensor.float_data:
+        # Typed storage: the float16 bit patterns are written to int32_data and float_data is emptied.
+        halves = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val)
+        tensor.int32_data[:] = _npfloat16_to_int(halves)
+        tensor.float_data[:] = []
+    if tensor.raw_data:
+        # Raw storage: decode the bytes as float32, convert, and re-encode as float16 bytes.
+        as_float32 = np.frombuffer(tensor.raw_data, dtype="float32")
+        as_float16 = convert_np_to_float16(as_float32, min_positive_val, max_finite_val)
+        tensor.raw_data = as_float16.tobytes()
+    return tensor
+
+
+def make_value_info_from_tensor(tensor):
+    """Build a tensor value info carrying the tensor's own name, data_type and array shape."""
+    return helper.make_tensor_value_info(tensor.name, tensor.data_type, numpy_helper.to_array(tensor).shape)
+
+
+DEFAULT_OP_BLOCK_LIST = [  # op types convert_float_to_float16 leaves in float32 when op_block_list is None
+    "ArrayFeatureExtractor",
+    "Binarizer",
+    "CastMap",
+    "CategoryMapper",
+    "DictVectorizer",
+    "FeatureVectorizer",
+    "Imputer",
+    "LabelEncoder",
+    "LinearClassifier",
+    "LinearRegressor",
+    "Normalizer",
+    "OneHotEncoder",
+    "SVMClassifier",
+    "SVMRegressor",
+    "Scaler",
+    "TreeEnsembleClassifier",
+    "TreeEnsembleRegressor",
+    "ZipMap",
+    "NonMaxSuppression",
+    "TopK",
+    "RoiAlign",
+    "Resize",
+    "Range",
+    "CumSum",
+    "Min",
+    "Max",
+    "Upsample",
+]
+
+
+class InitializerTracker:
+    """Records which nodes consume an initializer, split by the precision they will run in."""
+
+    def __init__(self, initializer: onnx_proto.TensorProto):
+        self.initializer = initializer
+        # fp32_nodes: consumers on a block list (they stay in float32);
+        # fp16_nodes: every other consumer.
+        self.fp32_nodes, self.fp16_nodes = [], []
+
+    def add_node(self, node: onnx_proto.NodeProto, is_node_blocked):
+        """Append `node` to fp32_nodes when is_node_blocked is truthy, else to fp16_nodes."""
+        bucket = self.fp32_nodes if is_node_blocked else self.fp16_nodes
+        bucket.append(node)
+
+
+def convert_float_to_float16(
+    model,
+    min_positive_val=5.96e-08,
+    max_finite_val=65504.0,
+    keep_io_types=False,
+    disable_shape_infer=False,
+    op_block_list=None,
+    node_block_list=None,
+    force_fp16_initializers=False,
+):
+    """Convert model tensor float type in the ONNX ModelProto input to tensor float16.
+
+    Args:
+        model (ModelProto): The ONNX model to convert.
+        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
+        max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504.
+        keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names.
+                                                          If True, model inputs/outputs should be left as float32. Defaults to False.
+        disable_shape_infer (bool, optional): Skips running onnx shape/type inference. Useful if shape inference has been done. Defaults to False.
+        op_block_list (List[str], optional): List of op types to leave as float32.
+                                             Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default.
+        node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
+        force_fp16_initializers(bool): force converting all float initializers to float16.
+                                       Default to false, which will convert only the one needed to avoid precision loss.
+    Raises:
+        ValueError: input type is not ModelProto. (Out-of-range min_positive_val/max_finite_val trip an assert instead.)
+
+    Returns:
+        ModelProto: converted model (the result of shape inference when it ran, otherwise the input model; edited in place).
+    """
+    assert (
+        min_positive_val >= 5.96e-08
+    ), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
+    assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504"
+
+    func_infer_shape = None
+    if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse("1.2.0"):
+        try:
+            from onnx.shape_inference import infer_shapes
+
+            func_infer_shape = infer_shapes
+        finally:  # no except clause, so an import failure still propagates
+            pass
+
+    if not isinstance(model, onnx_proto.ModelProto):
+        raise ValueError("Expected model type is an ONNX ModelProto but got %s" % type(model))
+
+    # create blocklists; None selects the defaults, and both end up as sets for the membership tests below
+    if op_block_list is None:
+        op_block_list = DEFAULT_OP_BLOCK_LIST
+    if node_block_list is None:
+        node_block_list = []
+    op_block_list = set(op_block_list)
+    node_block_list = set(node_block_list)
+
+    logger.debug(
+        f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}"
+    )
+
+    # create a queue for BFS over ModelProto -> GraphProto -> AttributeProto -> sub-GraphProto
+    queue = []
+    value_info_list = []
+    node_list = []
+    # type inference on input model (skipped when disabled or when func_infer_shape was not set above)
+    if func_infer_shape is not None:
+        model = func_infer_shape(model)
+    queue.append(model)
+    name_mapping = {}
+    graph_io_to_skip = set()
+    io_casts = set()
+
+    fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT]
+    fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT]
+    if isinstance(keep_io_types, list):
+        fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
+        fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
+    elif not keep_io_types:
+        fp32_inputs = []
+        fp32_outputs = []
+
+    for i, n in enumerate(model.graph.input):
+        if n.name in fp32_inputs:
+            output_name = "graph_input_cast_" + str(i)
+            name_mapping[n.name] = output_name
+            graph_io_to_skip.add(n.name)
+
+            node_name = "graph_input_cast" + str(i)
+            new_value_info = model.graph.value_info.add()
+            new_value_info.CopyFrom(n)
+            new_value_info.name = output_name
+            new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
+            # add Cast node (from tensor(float) to tensor(float16)) after graph input
+            new_node = [helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)]
+            model.graph.node.extend(new_node)
+            value_info_list.append(new_value_info)
+            io_casts.add(node_name)
+
+    for i, n in enumerate(model.graph.output):
+        if n.name in fp32_outputs:
+            input_name = "graph_output_cast_" + str(i)
+            name_mapping[n.name] = input_name
+            graph_io_to_skip.add(n.name)
+
+            node_name = "graph_output_cast" + str(i)
+            # add Cast node (from tensor(float16) to tensor(float)) before graph output
+            new_value_info = model.graph.value_info.add()
+            new_value_info.CopyFrom(n)
+            new_value_info.name = input_name
+            new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
+            new_node = [helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)]
+            model.graph.node.extend(new_node)
+            value_info_list.append(new_value_info)
+            io_casts.add(node_name)
+
+    fp32_initializers: Dict[str, InitializerTracker] = {}
+    while queue:
+        next_level = []
+        for q in queue:
+            # if q is model, push q.graph (GraphProto)
+            if isinstance(q, onnx_proto.ModelProto):
+                next_level.append(q.graph)
+            # if q is model.graph, push q.node.attribute (AttributeProto)
+            if isinstance(q, onnx_proto.GraphProto):
+                for n in q.initializer:  # TensorProto type
+                    if n.data_type == onnx_proto.TensorProto.FLOAT:
+                        assert n.name not in fp32_initializers
+                        fp32_initializers[n.name] = InitializerTracker(n)
+
+                for n in q.node:
+                    # skip the graph I/O Cast nodes added above; for every other node, first redirect
+                    # references to the kept-float32 graph inputs/outputs to the renamed cast tensors
+                    if n.name in io_casts:
+                        continue
+                    for i in range(len(n.input)):
+                        if n.input[i] in name_mapping:
+                            n.input[i] = name_mapping[n.input[i]]
+                    for i in range(len(n.output)):
+                        if n.output[i] in name_mapping:
+                            n.output[i] = name_mapping[n.output[i]]
+
+                    is_node_blocked = n.op_type in op_block_list or n.name in node_block_list
+                    for input in n.input:
+                        if input in fp32_initializers:
+                            fp32_initializers[input].add_node(n, is_node_blocked)
+
+                    if is_node_blocked:
+                        node_list.append(n)  # blocked: not converted here; Cast nodes are wrapped around it below
+                    else:
+                        if n.op_type == "Cast":
+                            for attr in n.attribute:
+                                if attr.name == "to" and attr.i == 1:  # a Cast to float becomes a Cast to float16
+                                    attr.i = 10
+                                    break
+                        for attr in n.attribute:
+                            next_level.append(attr)
+            # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
+            # and process node.attribute.t and node.attribute.tensors (TensorProto)
+            if isinstance(q, onnx_proto.AttributeProto):
+                next_level.append(q.g)
+                for n in q.graphs:
+                    next_level.append(n)
+                q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
+                for n in q.tensors:
+                    n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)  # n is converted in place
+            # if q is graph, process input, output and value_info (ValueInfoProto)
+            if isinstance(q, onnx_proto.GraphProto):
+                # Note that float initializers tracked by fp32_initializers will be processed later.
+                # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
+                # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
+                for n in itertools.chain(q.input, q.output, q.value_info):
+                    if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
+                        if n.name not in graph_io_to_skip:
+                            n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
+                            value_info_list.append(n)
+                    if n.type.HasField("sequence_type"):
+                        if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
+                            if n.name not in graph_io_to_skip:
+                                n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
+                                value_info_list.append(n)
+
+        queue = next_level
+
+    for key, value in fp32_initializers.items():
+        # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes.
+        if force_fp16_initializers or value.fp16_nodes:
+            value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val)
+            value_info_list.append(make_value_info_from_tensor(value.initializer))
+            if value.fp32_nodes and not force_fp16_initializers:
+                logger.info(
+                    "initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format(
+                        value.fp16_nodes
+                    )
+                )
+
+    # process the nodes in block list that doesn't support tensor(float16)
+    for node in node_list:
+        # if input's name is in the value_info_list meaning input is tensor(float16) type,
+        # insert a float16 to float Cast node before the node,
+        # change current node's input name and create new value_info for the new name
+        for i in range(len(node.input)):
+            input = node.input[i]
+            for value_info in value_info_list:
+                if input == value_info.name:
+                    # create new value_info for current node's new input name
+                    new_value_info = model.graph.value_info.add()
+                    new_value_info.CopyFrom(value_info)
+                    output_name = node.name + "_input_cast_" + str(i)
+                    new_value_info.name = output_name
+                    new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
+                    # add Cast node (from tensor(float16) to tensor(float)) before current node
+                    node_name = node.name + "_input_cast" + str(i)
+                    new_node = [helper.make_node("Cast", [input], [output_name], to=1, name=node_name)]
+                    model.graph.node.extend(new_node)
+                    # change current node's input name
+                    node.input[i] = output_name
+                    break
+        # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to
+        # float16 Cast node after the node, change current node's output name and create new value_info for the new name
+        for i in range(len(node.output)):
+            output = node.output[i]
+            for value_info in value_info_list:
+                if output == value_info.name:
+                    # create new value_info for current node's new output
+                    new_value_info = model.graph.value_info.add()
+                    new_value_info.CopyFrom(value_info)
+                    input_name = node.name + "_output_cast_" + str(i)
+                    new_value_info.name = input_name
+                    new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT
+                    # add Cast node (from tensor(float) to tensor(float16)) after current node
+                    node_name = node.name + "_output_cast" + str(i)
+                    new_node = [helper.make_node("Cast", [input_name], [output], to=10, name=node_name)]
+                    model.graph.node.extend(new_node)
+                    # change current node's output name
+                    node.output[i] = input_name
+                    break
+    return model
+
+
+def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
+    """Return the largest absolute error introduced by converting a float tensor to float16.
+    Raises ValueError for non-TensorProto or non-FLOAT input, RuntimeError when no inline data is present.
+    """
+    if not isinstance(tensor, onnx_proto.TensorProto):
+        raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
+    if tensor.data_type != onnx_proto.TensorProto.FLOAT:
+        raise ValueError("Expected tensor data type is float.")
+
+    # raw_data takes precedence over float_data when both are populated.
+    if tensor.raw_data:
+        float32_data = np.frombuffer(tensor.raw_data, dtype="float32")
+    elif tensor.float_data:
+        float32_data = np.array(tensor.float_data)
+    else:
+        raise RuntimeError("external data not loaded!")
+
+    round_trip = np.float32(convert_np_to_float16(float32_data, min_positive_val, max_finite_val))
+    return np.amax(np.abs(float32_data - round_trip))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py
new file mode 100755
index 000000000..e1fde76f9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py
@@ -0,0 +1,65 @@
+from logging import getLogger
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+import numpy as np
+import onnx
+
+logger = getLogger(__name__)
+
+
+class FusionSerialBiasAdd(Fusion):
+    """Fold an Add -> Reshape -> Add -> Reshape chain of constant biases into a single Add."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "Add", "Softmax")
+
+    def match_parent_path_from_dict(self, start_node, path_dict):
+        """Return (nodes, key) for the first entry of path_dict that matches, else (None, None)."""
+        for key, path in path_dict.items():
+            matched = self.model.match_parent_path(start_node, path[0], path[1])
+            if matched is not None:
+                return matched, key
+        return None, None
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Replace the bias chain above `node` with one Add of the pre-summed biases.
+
+        Returns without changes when the chain is absent, a bias is not an initializer,
+        or the two bias arrays cannot be squeezed and broadcast together.
+        """
+        paths = {
+            "path1": (["Reshape", "Add", "Reshape", "Add"], [0, 0, 0, 0]),
+        }
+        series_nodes, _ = self.match_parent_path_from_dict(node, paths)
+        if not series_nodes:
+            return
+        last_reshape, add_2nd, _, add_1st = series_nodes
+
+        bias_1st = self.model.get_initializer(add_1st.input[1])
+        bias_2nd = self.model.get_initializer(add_2nd.input[1])
+        if not (bias_1st and bias_2nd):
+            return
+
+        bias_arr_1st = NumpyHelper.to_array(bias_1st)
+        bias_arr_2nd = NumpyHelper.to_array(bias_2nd)
+        try:
+            # squeeze(0) raises ValueError unless axis 0 has size 1, and the sum raises
+            # ValueError for non-broadcastable shapes; either way this is not our pattern.
+            relative_position_bias = bias_arr_1st + bias_arr_2nd.squeeze(0)
+        except (ValueError, TypeError) as e:
+            logger.debug("Two bias are unrelated: %s", e)
+            return
+
+        # Fuse: overwrite the first bias initializer in place with the combined values.
+        add_name = self.model.create_node_name("Add", "Add")
+        bias_1st.CopyFrom(numpy_helper.from_array(relative_position_bias, bias_1st.name))
+        fused_node = helper.make_node(
+            "Add", inputs=[add_1st.input[0], bias_1st.name], outputs=last_reshape.output, name=add_name
+        )
+        fused_node.domain = "com.iluvatar"
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fused_node)
+        self.nodes_to_remove.extend(series_nodes)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py
new file mode 100755
index 000000000..47b8ec777
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py
@@ -0,0 +1,602 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import List, Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_attention import AttentionMask
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+def get_tensor_attr(attrs, attr_name):
+    """Return the tensor of the first attribute named `attr_name` as a numpy array, or None if absent."""
+    matches = (attr for attr in attrs if attr.name == attr_name)
+    found = next(matches, None)
+    # `is None` rather than truthiness: only "no such attribute" yields None.
+    return None if found is None else numpy_helper.to_array(found.t)
+
+
+class FusionAlbertAttention(Fusion):
+    """
+    Fuse Albert subgraph into one Attention node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+        hidden_size: int,
+        num_heads: int,
+        attention_mask: AttentionMask,
+    ):
+        """Store the user-supplied sizes and mask helper after initialising the Fusion base class."""
+        plugin_op = "CustomQKVToContextPluginDynamic_IxRT"
+        norm_ops = ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"]
+        super().__init__(model, plugin_op, norm_ops)
+
+        self.attention_mask = attention_mask
+        # User-specified values; get_num_heads_and_hidden_size falls back to them.
+        self.num_heads, self.hidden_size = num_heads, hidden_size
+
+        # One-shot flags: each mismatch warning is logged at most once.
+        self.hidden_size_warning = True
+        self.num_heads_warning = True
+
+    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
+        """Read num_heads and hidden_size off the shape constant of Q's Reshape node.
+
+        Args:
+            reshape_q (NodeProto): reshape node for Q
+
+        Returns:
+            Tuple[int, int]: detected (num_heads, hidden_size), or the user-specified
+                pair when the shape input is not a usable 4-element initializer.
+        """
+        user_specified = (self.num_heads, self.hidden_size)
+
+        # Reshape fusion is assumed to have run, leaving a shape like [0, 0, num_heads, head_size].
+        shape_name = reshape_q.input[1]
+        q_shape = self.model.get_initializer(shape_name)
+        if q_shape is None:
+            logger.debug(f"{reshape_q.input[1]} is not initializer.")
+            return user_specified
+
+        q_shape_value = NumpyHelper.to_array(q_shape)
+        shape_is_usable = len(q_shape_value) == 4 and q_shape_value[2] > 0 and q_shape_value[3] > 0
+        if not shape_is_usable:
+            logger.debug(
+                f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
+            )
+            return user_specified
+
+        num_heads, head_size = q_shape_value[2], q_shape_value[3]
+        hidden_size = num_heads * head_size
+
+        # Warn once per kind of mismatch; the detected value is what gets returned.
+        if self.num_heads_warning and self.num_heads > 0 and num_heads != self.num_heads:
+            logger.warning(
+                f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value."
+            )
+            self.num_heads_warning = False
+
+        if self.hidden_size_warning and self.hidden_size > 0 and hidden_size != self.hidden_size:
+            logger.warning(
+                f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
+            )
+            self.hidden_size_warning = False
+
+        return num_heads, hidden_size
+
+    def get_add_qk_str(self, add_qk: NodeProto):
+        """Return add_qk's second input name when both inputs have the same inferred shape, else None."""
+        shape_infer = self.model.infer_runtime_shape(update=True)
+        if shape_infer is None:
+            return None
+
+        lhs_shape = shape_infer.get_edge_shape(add_qk.input[0])
+        rhs_shape = shape_infer.get_edge_shape(add_qk.input[1])
+        if lhs_shape is None or rhs_shape is None:
+            logger.debug(f"one of the inputs of {add_qk} is None")
+            return None
+        if lhs_shape != rhs_shape:
+            logger.debug(f"the shape of two inputs of {add_qk} is not same")
+            return None
+
+        # Both shapes are known and equal: hand back the name of the second operand.
+        return add_qk.input[1]
+
+    def create_attention_node(
+        self,
+        mask_index: str,
+        q_matmul: NodeProto,
+        k_matmul: NodeProto,
+        v_matmul: NodeProto,
+        q_add: NodeProto,
+        k_add: NodeProto,
+        v_add: NodeProto,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        add_qk_str: str,
+    ) -> Union[NodeProto, None]:
+        """Create an Attention node.
+
+        Args:
+            mask_index (str): mask input
+            q_matmul (NodeProto): MatMul node in fully connection for Q
+            k_matmul (NodeProto): MatMul node in fully connection for  K
+            v_matmul (NodeProto): MatMul node in fully connection for  V
+            q_add (NodeProto): Add bias node in fully connection for Q
+            k_add (NodeProto): Add bias node in fully connection for K
+            v_add (NodeProto): Add bias node in fully connection for V
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input (str): input name
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None
+
+        q_weight = self.model.get_initializer(q_matmul.input[1])
+        k_weight = self.model.get_initializer(k_matmul.input[1])
+        v_weight = self.model.get_initializer(v_matmul.input[1])
+        q_bias = self.model.get_initializer(
+            q_add.input[1]
+        ) or self.model.get_initializer(q_add.input[0])
+        k_bias = self.model.get_initializer(
+            k_add.input[1]
+        ) or self.model.get_initializer(k_add.input[0])
+        v_bias = self.model.get_initializer(
+            v_add.input[1]
+        ) or self.model.get_initializer(v_add.input[0])
+
+        if q_weight is None:
+            print(
+                f"{q_matmul.input[1]} is not an initializer. "
+                "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
+            )
+            return None
+        if not (k_weight and v_weight and q_bias and k_bias):
+            return None
+
+        qw = NumpyHelper.to_array(q_weight)
+        kw = NumpyHelper.to_array(k_weight)
+        vw = NumpyHelper.to_array(v_weight)
+
+        # assert q and k have same shape as expected
+        assert qw.shape == kw.shape
+
+        qw_in_size = qw.shape[0]
+        kw_in_size = kw.shape[0]
+        vw_in_size = vw.shape[0]
+
+        assert qw_in_size == kw_in_size == vw_in_size
+
+        if hidden_size > 0 and hidden_size != qw_in_size:
+            logger.warning(
+                f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). "
+                "Please provide a correct input hidden size or pass in 0"
+            )
+
+        is_qkv_diff_dims = False
+
+        # All the matrices can have the same shape or q, k matrics can have the same shape with v being different
+        # For 2d weights, the shapes would be [in_size, out_size].
+        # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
+        qw_out_size = np.prod(qw.shape[1:])
+        kw_out_size = np.prod(kw.shape[1:])
+        vw_out_size = np.prod(vw.shape[1:])
+
+        qkv_weight_dim = 0
+        qkv_weight = np.concatenate((qw, kw, vw), axis=1)
+        qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
+
+        qb = NumpyHelper.to_array(q_bias)
+        kb = NumpyHelper.to_array(k_bias)
+        vb = NumpyHelper.to_array(v_bias)
+
+        q_bias_shape = np.prod(qb.shape)
+        k_bias_shape = np.prod(kb.shape)
+        v_bias_shape = np.prod(vb.shape)
+
+        assert q_bias_shape == k_bias_shape == qw_out_size
+        assert v_bias_shape == vw_out_size
+
+        qkv_bias_dim = 0
+        if is_qkv_diff_dims:
+            qkv_bias = np.concatenate((qb, kb, vb), axis=0)
+            qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
+        else:
+            qkv_bias = np.stack((qb, kb, vb), axis=0)
+            qkv_bias_dim = 3 * q_bias_shape
+
+        attention_node_name = self.model.create_node_name("Attention")
+
+        weight = helper.make_tensor(
+            name=attention_node_name + "_qkv_weight",
+            data_type=TensorProto.FLOAT,
+            dims=[qkv_weight_dim, qw_in_size],
+            vals=qkv_weight.transpose(1, 0).flatten().tolist(),
+        )
+
+        # Sometimes weights and bias are stored in fp16
+        if q_weight.data_type == 10:
+            weight.CopyFrom(
+                numpy_helper.from_array(
+                    NumpyHelper.to_array(weight).astype(np.float16), weight.name
+                )
+            )
+        self.model.add_initializer(weight, self.this_graph_name)
+
+        bias = helper.make_tensor(
+            name=attention_node_name + "_qkv_bias",
+            data_type=TensorProto.FLOAT,
+            dims=[qkv_bias_dim],
+            vals=qkv_bias.flatten().tolist(),
+        )
+        if q_bias.data_type == 10:
+            bias.CopyFrom(
+                numpy_helper.from_array(
+                    NumpyHelper.to_array(bias).astype(np.float16), bias.name
+                )
+            )
+        self.model.add_initializer(bias, self.this_graph_name)
+
+        fc_output_tensor = helper.make_tensor_value_info(
+            attention_node_name + "_input", TensorProto.FLOAT, [None, None, None]
+        )
+        fc_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[input],
+            outputs=[fc_output_tensor.name],
+            name=self.model.create_node_name("AttentionFC", "MatMul_AddBias_"),
+        )
+        fc_node.domain = "com.iluvatar"
+        b = NumpyHelper.to_array(bias)
+        fc_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
+        fc_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        fc_node.attribute.extend([helper.make_attribute("W", weight)])
+        fc_node.attribute.extend([helper.make_attribute("B", bias)])
+        fc_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fc_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fc_node.attribute.extend([helper.make_attribute("act_type", -1)])
+        self.node_name_to_graph_name[fc_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fc_node)
+
+        attention_inputs = [fc_node.output[0]]
+        if mask_index is not None:
+            attention_inputs.append(mask_index)
+        else:
+            attention_inputs.append("")
+
+        if add_qk_str is not None:
+            attention_inputs.append("")
+            attention_inputs.append(add_qk_str)
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend(
+            [helper.make_attribute("hidden_size", hidden_size)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
+
+        if is_qkv_diff_dims:
+            attention_node.attribute.extend(
+                [
+                    helper.make_attribute(
+                        "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size]
+                    )
+                ]
+            )
+
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Try to fuse the attention subgraph that feeds normalize_node into the nodes built by create_attention_node; return without changes if no pattern matches.
+        # A plain Add feeding LayerNormalization is treated like SkipLayerNormalization: it may be unfused because its output is used by nodes outside the pattern.
+        start_node = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                start_node = add_before_layernorm
+            else:
+                return
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
+        qkv_nodes = self.model.match_parent_path(
+            start_node,
+            ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
+            [None, None, 0, 0, 0],
+        )
+        if qkv_nodes is None:  # retry with the first Add pinned to input 1 of start_node
+            qkv_nodes = self.model.match_parent_path(
+                start_node,
+                ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
+                [1, None, 0, 0, 0],
+            )
+        einsum_node = None
+        if qkv_nodes is not None:
+            (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
+        else:
+            # Match Albert: an Einsum takes the place of the MatMul + Reshape pair in the output path
+            qkv_nodes = self.model.match_parent_path(
+                start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
+            )
+            if qkv_nodes is not None:
+                (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
+            else:
+                return
+
+        other_inputs = []  # inputs of start_node produced by a node other than the matched attention branch
+        for i, input in enumerate(start_node.input):
+            if input not in output_name_to_node:
+                continue
+
+            if input == qkv_nodes[0].output[0]:
+                continue
+            other_inputs.append(input)
+        if len(other_inputs) != 1:
+            return
+
+        root_input = other_inputs[0]
+        """
+        Match flaubert                     Mask
+                                            |
+        Mul --> LayerNormalization -->  Attention --> MatMul --> Add
+         |                                                        |
+         |                                                        |
+         +---------------------------------------------------------
+        """
+        mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
+        if mul_before_layernorm is not None:
+            mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
+            if mul_children is not None and len(mul_children) == 2:
+                layernorm_node = mul_children[1]
+                if layernorm_node.op_type == "LayerNormalization":
+                    root_input = layernorm_node.output[0]
+                else:
+                    return
+            elif mul_children is not None and len(mul_children) == 5:
+                root_input = mul_before_layernorm.output[0]
+            else:
+                return
+        elif normalize_node.op_type == "LayerNormalization":
+            children = input_name_to_nodes[root_input]
+            for child in children:
+                if child.op_type == "LayerNormalization":
+                    root_input = child.output[0]
+
+        children = input_name_to_nodes[root_input]
+        children_types = [child.op_type for child in children]
+        if children_types.count("MatMul") != 3:  # root input must feed exactly three MatMul nodes (expected: Q, K, V)
+            return
+
+        v_nodes = self.model.match_parent_path(
+            matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
+        )
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        (_, _, add_v, matmul_v) = v_nodes
+
+        is_distill = False
+        is_distill_add = False
+        qk_paths = {  # candidate parent paths from matmul_qkv input 0; tried in insertion order, first match wins
+            "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
+            "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
+            "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
+            "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
+        }
+
+        qk_nodes = None
+        for k, v in qk_paths.items():
+            qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
+            if qk_nodes is None:
+                continue
+            if k == "path3":
+                is_distill = True
+            if k == "path4":
+                is_distill_add = True
+            break
+
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+
+        add_qk = None
+        matmul_qk = None
+        where_qk = None
+        if is_distill:
+            (_, where_qk, matmul_qk, _) = qk_nodes
+        elif is_distill_add:
+            (_, add_qk, where_qk, matmul_qk) = qk_nodes
+        else:
+            (_, add_qk, _, matmul_qk) = qk_nodes
+
+        q_nodes = self.model.match_parent_path(
+            matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]
+        )
+        if q_nodes is None:
+            q_nodes = self.model.match_parent_path(
+                matmul_qk,
+                ["Div", "Transpose", "Reshape", "Add", "MatMul"],
+                [0, 0, 0, 0, None],
+            )
+            if q_nodes is None:
+                logger.debug("fuse_attention: failed to match q path")
+                return
+        reshape_q = q_nodes[-3]  # negative indices work for both the 4-node and the 5-node q path
+        add_q = q_nodes[-2]
+        matmul_q = q_nodes[-1]
+
+        k_nodes = self.model.match_parent_path(
+            matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]
+        )
+        if k_nodes is None:
+            k_nodes = self.model.match_parent_path(
+                matmul_qk,
+                ["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
+                [1, 0, 0, 0, None],
+            )
+            if k_nodes is None:
+                logger.debug("fuse_attention: failed to match k path")
+                return
+        add_k = k_nodes[-2]
+        matmul_k = k_nodes[-1]
+
+        # Note that Cast might be removed by OnnxRuntime, so several alternative mask patterns are matched here.
+        mask_nodes = None
+        add_qk_str = None
+        if is_distill:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                where_qk,
+                [
+                    (["Expand", "Reshape", "Equal"], [0, 0, 0]),
+                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
+                    (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
+                ],
+                output_name_to_node,
+            )
+        elif is_distill_add:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                where_qk,
+                [
+                    (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
+                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
+                ],
+                output_name_to_node,
+            )
+            if add_qk is not None:
+                add_qk_str = self.get_add_qk_str(add_qk)
+                if add_qk_str is None:
+                    logger.debug(
+                        f"fuse_attention: failed to verify shape inference of {add_qk}"
+                    )
+                    return
+        else:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                add_qk,
+                [
+                    (
+                        ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
+                        [None, 0, 1, 0, 0],
+                    ),
+                    (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
+                    (["Mul", "Sub", "Cast", "Unsqueeze"], [None, 0, 1, 0]),
+                ],
+                output_name_to_node,
+            )
+        if mask_nodes is None:
+            logger.debug("fuse_attention: failed to match mask path")
+            return
+
+        if (
+            matmul_v.input[0] == root_input
+            and matmul_q.input[0] == root_input
+            and matmul_k.input[0] == root_input
+        ):
+            # The mask is not converted with attention_mask.process_mask(); the output of the first matched mask node is used directly.
+            if mask_nodes[0].op_type == "Mul":
+                mask_val = self.model.get_initializer(mask_nodes[0].input[1])
+                if mask_val is not None:  # rewrite the Mul constant in place: values <= -100 become -100, all others 0
+                    mask_val_arr = NumpyHelper.to_array(mask_val)
+                    mask_val_arr = np.where(mask_val_arr <= -100, -100, 0.0).astype(
+                        np.float32
+                    )
+                    mask_val.CopyFrom(
+                        numpy_helper.from_array(mask_val_arr, mask_val.name)
+                    )
+            mask_index = mask_nodes[0].output[0]
+
+            attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv
+
+            q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
+            # The number of heads is the same for all paths, so q_num_heads is passed when creating the attention node.
+            # q_hidden_size is passed as the input hidden size; it is used as needed, while the hidden sizes for Q, K are extracted appropriately.
+            new_node = self.create_attention_node(
+                mask_index,
+                matmul_q,
+                matmul_k,
+                matmul_v,
+                add_q,
+                add_k,
+                add_v,
+                q_num_heads,
+                q_hidden_size,
+                root_input,
+                attention_last_node.output[0],
+                add_qk_str,
+            )
+            if new_node is None:
+                return
+
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+            if einsum_node is not None:  # Albert: insert a Reshape to [0, 0, num_heads, head_size] in front of the Einsum
+                unique_index = einsum_node.input[0]
+                new_edge = "edge_modified_" + unique_index
+                shape_tensor = helper.make_tensor(
+                    name="shape_modified_tensor" + unique_index,
+                    data_type=TensorProto.INT64,
+                    dims=[4],
+                    vals=np.int64(
+                        [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]
+                    ).tobytes(),
+                    raw=True,
+                )
+                self.model.add_initializer(shape_tensor, self.this_graph_name)
+                self.model.add_node(
+                    helper.make_node(
+                        "Reshape",
+                        [attention_last_node.output[0], shape_tensor.name],
+                        [new_edge],
+                        "reshape_modified_" + unique_index,
+                    ),
+                    self.this_graph_name,
+                )
+                einsum_node.input[0] = new_edge
+
+            self.nodes_to_remove.extend(
+                [attention_last_node, transpose_qkv, matmul_qkv]
+            )
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes)
+            self.nodes_to_remove.extend(k_nodes)
+            self.nodes_to_remove.extend(v_nodes)
+
+            # Use prune graph to remove mask nodes since they are shared by all attention nodes.
+            # For that reason mask_nodes are deliberately not added to nodes_to_remove here.
+            self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py
new file mode 100755
index 000000000..c75072183
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py
@@ -0,0 +1,571 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+class AttentionMask:
+    """
+    Convert attention mask inputs into the form consumed by fused attention nodes, caching the result per mask input.
+    """
+
+    def __init__(self, model: OnnxModel):
+        self.model = model
+        # A lookup table with mask input as key, and mask index output as value
+        self.mask_indice = {}
+        # A lookup table with mask input as key, and cast (to int32) output as value
+        self.mask_casted = {}
+        self.utils = FusionUtils(model)
+        self.mask_format = AttentionMaskFormat.MaskIndexEnd
+
+    def set_mask_format(self, mask_format: AttentionMaskFormat):
+        self.mask_format = mask_format
+
+    def set_mask_indice(self, mask, mask_index):
+        if mask in self.mask_indice:
+            assert mask_index == self.mask_indice[mask]  # a mask input must always map to the same mask index
+        self.mask_indice[mask] = mask_index
+
+    def get_first_mask(self):
+        assert len(self.mask_indice) > 0
+        return next(iter(self.mask_indice))  # first key, i.e. the first mask input that was registered
+
+    def process_mask(self, input: str) -> str:  # NOTE: returns None when the mask format is NoMask
+        if self.mask_format == AttentionMaskFormat.NoMask:
+            return None
+
+        if input in self.mask_indice:  # already processed: reuse the cached output name
+            return self.mask_indice[input]
+
+        # Add cast to convert int64 to int32
+        if self.model.find_graph_input(input):
+            casted, input_name = self.utils.cast_graph_input_to_int32(input)
+        else:
+            input_name, cast_node = self.utils.cast_input_to_int32(input)
+            casted = True
+
+        if casted:
+            self.mask_casted[input] = input_name
+
+        # Attention supports int32 attention mask (2D) since 1.4.0
+        if self.mask_format == AttentionMaskFormat.AttentionMask:
+            self.mask_indice[input] = input_name
+            return input_name
+
+        # Add a mask processing node to convert attention mask to mask index (1D)
+        output_name = self.model.create_node_name("mask_index")
+        mask_index_node = helper.make_node(
+            "ReduceSum",
+            inputs=[input_name],
+            outputs=[output_name],
+            name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
+        )
+        mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)])
+        self.model.add_node(mask_index_node)
+
+        self.mask_indice[input] = output_name
+        return output_name
+
+
+class FusionAttention(Fusion):
+    """
+    Fuse Attention subgraph into one Attention node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+        hidden_size: int,
+        num_heads: int,
+        attention_mask: AttentionMask,
+    ):
+        super().__init__(model, "Attention", ["SkipLayerNormalization", "LayerNormalization"])
+        self.hidden_size = hidden_size  # user-specified value; fallback when it cannot be detected from the Q Reshape
+        self.num_heads = num_heads  # user-specified value; fallback when it cannot be detected from the Q Reshape
+        self.attention_mask = attention_mask
+
+        # Flags so that each mismatch warning (user-specified vs. detected value) is logged only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
+        """Detect num_heads and hidden_size from a reshape node.
+
+        Args:
+            reshape_q (NodeProto): reshape node for Q
+
+        Returns:
+            Tuple[int, int]: detected num_heads and hidden_size, or the user-specified values if detection fails
+        """
+
+        # We assume that reshape fusion has been done, so the shape is a tensor like [0, 0, num_heads, head_size]
+        q_shape = self.model.get_initializer(reshape_q.input[1])
+        if q_shape is None:
+            logger.debug(f"{reshape_q.input[1]} is not initializer.")
+            return self.num_heads, self.hidden_size  # Fall back to user specified value
+
+        q_shape_value = NumpyHelper.to_array(q_shape)
+        if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
+            logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].")
+            return self.num_heads, self.hidden_size  # Fall back to user specified value
+
+        num_heads = q_shape_value[2]
+        head_size = q_shape_value[3]
+        hidden_size = num_heads * head_size
+
+        if self.num_heads > 0 and num_heads != self.num_heads:  # user value disagrees: warn, detected value still wins
+            if self.num_heads_warning:
+                logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
+                self.num_heads_warning = False  # Do not show the warning more than once
+
+        if self.hidden_size > 0 and hidden_size != self.hidden_size:  # same policy as for num_heads above
+            if self.hidden_size_warning:
+                logger.warning(
+                    f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
+                )
+                self.hidden_size_warning = False  # Do not show the warning more than once
+
+        return num_heads, hidden_size
+
+    def get_add_qk_str(self, add_qk: NodeProto):  # returns add_qk.input[1] if both Add inputs have the same inferred shape, else None
+        shape_infer = self.model.infer_runtime_shape(update=True)
+        if shape_infer is None:  # no shape inference result available
+            return
+
+        input_0_shape = shape_infer.get_edge_shape(add_qk.input[0])
+        input_1_shape = shape_infer.get_edge_shape(add_qk.input[1])
+
+        if input_0_shape is None or input_1_shape is None:
+            logger.debug(f"one of the inputs of {add_qk} is None")
+            return None
+
+        if input_0_shape != input_1_shape:  # the two Add inputs must have identical inferred shapes
+            logger.debug(f"the shape of two inputs of {add_qk} is not same")
+            return None
+
+        return add_qk.input[1]
+
+    def create_attention_node(
+        self,
+        mask_index: str,
+        q_matmul: NodeProto,
+        k_matmul: NodeProto,
+        v_matmul: NodeProto,
+        q_add: NodeProto,
+        k_add: NodeProto,
+        v_add: NodeProto,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        add_qk_str: str,
+    ) -> Union[NodeProto, None]:
+        """Create an Attention node and register its merged QKV weight and bias initializers.
+
+        Args:
+            mask_index (str): mask input
+            q_matmul (NodeProto): MatMul node in fully connection for Q
+            k_matmul (NodeProto): MatMul node in fully connection for  K
+            v_matmul (NodeProto): MatMul node in fully connection for  V
+            q_add (NodeProto): Add bias node in fully connection for Q
+            k_add (NodeProto): Add bias node in fully connection for K
+            v_add (NodeProto): Add bias node in fully connection for V
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input (str): input name
+            output (str): output name
+            add_qk_str (str): name of an extra input appended after an empty placeholder input, or None to omit it
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
+            return None
+
+        q_weight = self.model.get_initializer(q_matmul.input[1])
+        k_weight = self.model.get_initializer(k_matmul.input[1])
+        v_weight = self.model.get_initializer(v_matmul.input[1])
+        q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
+        k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
+        v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
+
+        if q_weight is None:
+            print(
+                f"{q_matmul.input[1]} is not an initializer. "
+                "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
+            )
+            return None
+        # Every remaining weight and bias, including the V bias, must be an initializer; otherwise skip the fusion.
+        if not (k_weight and v_weight and q_bias and k_bias and v_bias):
+            return None
+
+        qw = NumpyHelper.to_array(q_weight)
+        kw = NumpyHelper.to_array(k_weight)
+        vw = NumpyHelper.to_array(v_weight)
+
+        # assert q and k have same shape as expected
+        assert qw.shape == kw.shape
+
+        qw_in_size = qw.shape[0]
+        kw_in_size = kw.shape[0]
+        vw_in_size = vw.shape[0]
+
+        assert qw_in_size == kw_in_size == vw_in_size
+
+        if hidden_size > 0 and hidden_size != qw_in_size:
+            logger.warning(
+                f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). "
+                "Please provide a correct input hidden size or pass in 0"
+            )
+
+        is_qkv_diff_dims = False
+        if qw.shape != vw.shape:
+            is_qkv_diff_dims = True
+
+        # All the matrices can have the same shape or q, k matrices can have the same shape with v being different
+        # For 2d weights, the shapes would be [in_size, out_size].
+        # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
+        qw_out_size = np.prod(qw.shape[1:])
+        kw_out_size = np.prod(kw.shape[1:])
+        vw_out_size = np.prod(vw.shape[1:])
+
+        if is_qkv_diff_dims:
+            qkv_weight = np.concatenate((qw, kw, vw), axis=1)
+            qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
+        else:
+            qkv_weight = np.stack((qw, kw, vw), axis=1)
+            qkv_weight_dim = 3 * qw_out_size
+
+        qb = NumpyHelper.to_array(q_bias)
+        kb = NumpyHelper.to_array(k_bias)
+        vb = NumpyHelper.to_array(v_bias)
+
+        q_bias_shape = np.prod(qb.shape)
+        k_bias_shape = np.prod(kb.shape)
+        v_bias_shape = np.prod(vb.shape)
+
+        assert q_bias_shape == k_bias_shape == qw_out_size
+        assert v_bias_shape == vw_out_size
+
+        if is_qkv_diff_dims:
+            qkv_bias = np.concatenate((qb, kb, vb), axis=0)
+            qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
+        else:
+            qkv_bias = np.stack((qb, kb, vb), axis=0)
+            qkv_bias_dim = 3 * q_bias_shape
+
+        attention_node_name = self.model.create_node_name("Attention")
+
+        weight = helper.make_tensor(
+            name=attention_node_name + "_qkv_weight",
+            data_type=TensorProto.FLOAT,
+            dims=[qw_in_size, qkv_weight_dim],
+            vals=qkv_weight.flatten().tolist(),
+        )
+
+        # Sometimes weights and bias are stored in fp16
+        if q_weight.data_type == TensorProto.FLOAT16:
+            weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
+        self.model.add_initializer(weight, self.this_graph_name)
+
+        bias = helper.make_tensor(
+            name=attention_node_name + "_qkv_bias",
+            data_type=TensorProto.FLOAT,
+            dims=[qkv_bias_dim],
+            vals=qkv_bias.flatten().tolist(),
+        )
+        if q_bias.data_type == TensorProto.FLOAT16:
+            bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
+        self.model.add_initializer(bias, self.this_graph_name)
+
+        attention_inputs = [
+            input,
+            attention_node_name + "_qkv_weight",
+            attention_node_name + "_qkv_bias",
+        ]
+        if mask_index is not None:
+            attention_inputs.append(mask_index)
+        else:
+            attention_inputs.append("")  # empty name keeps the mask input slot when there is no mask
+
+        if add_qk_str is not None:
+            attention_inputs.append("")  # empty placeholder for the input slot that precedes add_qk_str
+            attention_inputs.append(add_qk_str)
+
+        attention_node = helper.make_node(
+            "Attention",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.microsoft"
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+
+        if is_qkv_diff_dims:
+            attention_node.attribute.extend(
+                [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
+            )
+
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
+        # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
+        start_node = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                start_node = add_before_layernorm
+            else:
+                return
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
+        qkv_nodes = self.model.match_parent_path(
+            start_node,
+            ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
+            [None, None, 0, 0, 0],
+        )
+        einsum_node = None
+        if qkv_nodes is not None:
+            (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
+        else:
+            # Match Albert
+            qkv_nodes = self.model.match_parent_path(
+                start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
+            )
+            if qkv_nodes is not None:
+                (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
+            else:
+                return
+
+        other_inputs = []
+        for i, input in enumerate(start_node.input):
+            if input not in output_name_to_node:
+                continue
+
+            if input == qkv_nodes[0].output[0]:
+                continue
+            other_inputs.append(input)
+        if len(other_inputs) != 1:
+            return
+
+        root_input = other_inputs[0]
+        """
+        Match flaubert                     Mask
+                                            |
+        Mul --> LayerNormalization -->  Attention --> MatMul --> Add
+         |                                                        |
+         |                                                        |
+         +---------------------------------------------------------
+        """
+        mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
+        if mul_before_layernorm is not None:
+            mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
+            if mul_children is not None and len(mul_children) == 2:
+                layernorm_node = mul_children[1]
+                if layernorm_node.op_type == "LayerNormalization":
+                    root_input = layernorm_node.output[0]
+                else:
+                    return
+            elif mul_children is not None and len(mul_children) == 5:
+                root_input = mul_before_layernorm.output[0]
+            else:
+                return
+        elif normalize_node.op_type == "LayerNormalization":
+            children = input_name_to_nodes[root_input]
+            for child in children:
+                if child.op_type == "LayerNormalization":
+                    root_input = child.output[0]
+
+        children = input_name_to_nodes[root_input]
+        children_types = [child.op_type for child in children]
+        if children_types.count("MatMul") != 3:
+            return
+
+        v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        (_, _, add_v, matmul_v) = v_nodes
+
+        is_distill = False
+        is_distill_add = False
+        qk_paths = {
+            "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
+            "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
+            "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
+            "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
+        }
+
+        qk_nodes = None
+        for k, v in qk_paths.items():
+            qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
+            if qk_nodes is None:
+                continue
+            if k == "path3":
+                is_distill = True
+            if k == "path4":
+                is_distill_add = True
+            break
+
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+
+        add_qk = None
+        matmul_qk = None
+        where_qk = None
+        if is_distill:
+            (_, where_qk, matmul_qk, _) = qk_nodes
+        elif is_distill_add:
+            (_, add_qk, where_qk, matmul_qk) = qk_nodes
+        else:
+            (_, add_qk, _, matmul_qk) = qk_nodes
+
+        q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None])
+        if q_nodes is None:
+            q_nodes = self.model.match_parent_path(
+                matmul_qk,
+                ["Div", "Transpose", "Reshape", "Add", "MatMul"],
+                [0, 0, 0, 0, None],
+            )
+            if q_nodes is None:
+                logger.debug("fuse_attention: failed to match q path")
+                return
+        reshape_q = q_nodes[-3]
+        add_q = q_nodes[-2]
+        matmul_q = q_nodes[-1]
+
+        k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
+        if k_nodes is None:
+            k_nodes = self.model.match_parent_path(
+                matmul_qk,
+                ["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
+                [1, 0, 0, 0, None],
+            )
+            if k_nodes is None:
+                logger.debug("fuse_attention: failed to match k path")
+                return
+        add_k = k_nodes[-2]
+        matmul_k = k_nodes[-1]
+
+        # Note that Cast might be removed by OnnxRuntime so we match two patterns here.
+        mask_nodes = None
+        add_qk_str = None
+        if is_distill:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                where_qk,
+                [
+                    (["Expand", "Reshape", "Equal"], [0, 0, 0]),
+                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
+                    (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
+                ],
+                output_name_to_node,
+            )
+        elif is_distill_add:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                where_qk,
+                [
+                    (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
+                    (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
+                ],
+                output_name_to_node,
+            )
+            if add_qk is not None:
+                add_qk_str = self.get_add_qk_str(add_qk)
+                if add_qk_str is None:
+                    logger.debug(f"fuse_attention: failed to verify shape inference of {add_qk}")
+                    return
+        else:
+            _, mask_nodes, _ = self.model.match_parent_paths(
+                add_qk,
+                [
+                    (
+                        ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
+                        [None, 0, 1, 0, 0],
+                    ),
+                    (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
+                ],
+                output_name_to_node,
+            )
+        if mask_nodes is None:
+            logger.debug("fuse_attention: failed to match mask path")
+            return
+
+        if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
+            mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
+
+            attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv
+
+            q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
+            # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
+            # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately
+            new_node = self.create_attention_node(
+                mask_index,
+                matmul_q,
+                matmul_k,
+                matmul_v,
+                add_q,
+                add_k,
+                add_v,
+                q_num_heads,
+                q_hidden_size,
+                root_input,
+                attention_last_node.output[0],
+                add_qk_str,
+            )
+            if new_node is None:
+                return
+
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+            if einsum_node is not None:
+                unique_index = einsum_node.input[0]
+                new_edge = "edge_modified_" + unique_index
+                shape_tensor = helper.make_tensor(
+                    name="shape_modified_tensor" + unique_index,
+                    data_type=TensorProto.INT64,
+                    dims=[4],
+                    vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(),
+                    raw=True,
+                )
+                self.model.add_initializer(shape_tensor, self.this_graph_name)
+                self.model.add_node(
+                    helper.make_node(
+                        "Reshape",
+                        [attention_last_node.output[0], shape_tensor.name],
+                        [new_edge],
+                        "reshape_modified_" + unique_index,
+                    ),
+                    self.this_graph_name,
+                )
+                einsum_node.input[0] = new_edge
+
+            self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes)
+            self.nodes_to_remove.extend(k_nodes)
+            self.nodes_to_remove.extend(v_nodes)
+
+            # Use prune graph to remove mask nodes since they are shared by all attention nodes.
+            # self.nodes_to_remove.extend(mask_nodes)
+            self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py
new file mode 100755
index 000000000..aaf742a45
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py
@@ -0,0 +1,82 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import List, Union
+
+from onnx import GraphProto
+
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class Fusion:
+    """Base class for graph fusions: subclasses implement fuse(); apply() drives them."""
+
+    def __init__(
+        self,
+        model: OnnxModel,
+        fused_op_type: str,
+        search_op_types: Union[str, List[str]],
+        description: str = None,
+    ):
+        """Record the model, the fused op type and the op type(s) whose nodes seed a match."""
+        if isinstance(search_op_types, str):
+            search_op_types = [search_op_types]
+        self.search_op_types: List[str] = search_op_types
+        self.fused_op_type: str = fused_op_type
+        label = fused_op_type
+        if description:
+            label = f"{fused_op_type}({description})"
+        self.description: str = label
+        self.model: OnnxModel = model
+        self.nodes_to_remove: List = []
+        self.nodes_to_add: List = []
+        self.prune_graph: bool = False
+        self.node_name_to_graph_name: dict = {}
+        self.this_graph_name: str = None
+        # Subclasses may update fused_count; apply() also counts matching nodes_to_add entries.
+        self.fused_count: int = 0
+
+    def apply(self):
+        """Run fuse() on every node of the searched op types, then commit the queued edits."""
+        logger.debug(f"start {self.description} fusion...")
+        consumers_by_input = self.model.input_name_to_nodes()
+        producer_by_output = self.model.output_name_to_node()
+
+        # This assumes that two search ops will not be fused at same time!
+        for op_type in self.search_op_types:
+            for candidate in self.model.get_nodes_by_op_type(op_type):
+                owner_graph = self.model.get_graph_by_node(candidate)
+                if owner_graph is None:
+                    raise Exception("Can not find node in any graphs")
+                self.this_graph_name = owner_graph.name
+                self.fuse(candidate, consumers_by_input, producer_by_output)
+
+        added_of_type = sum(1 for n in self.nodes_to_add if n.op_type == self.fused_op_type)
+        count = max(self.fused_count, added_of_type)
+        if count > 0:
+            logger.info(f"Fused {self.description} count: {count}")
+
+        self.model.remove_nodes(self.nodes_to_remove)
+        self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name)
+
+        if self.prune_graph:
+            self.model.prune_graph()
+        elif self.nodes_to_remove or self.nodes_to_add:
+            self.model.update_graph()
+
+    def match_parent_path_from_dict(
+        self, start_node, path_dict, output_name_to_node=None, return_indice=None
+    ):
+        """Try each (op_types, parent_indices) entry in order; return (nodes, key) or (None, None)."""
+        for key, spec in path_dict.items():
+            matched = self.model.match_parent_path(
+                start_node, spec[0], spec[1],
+                output_name_to_node=output_name_to_node, return_indice=return_indice,
+            )
+            if matched is not None:
+                return matched, key
+        return None, None
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py
new file mode 100755
index 000000000..8e3406c7f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py
@@ -0,0 +1,66 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionBiasGelu(Fusion):
+    """Fold a 1-D initializer bias Add (fed by MatMul) into the Gelu/FastGelu that follows."""
+
+    def __init__(self, model: OnnxModel, is_fastgelu):
+        """Search FastGelu nodes when is_fastgelu is truthy, otherwise Gelu nodes."""
+        if not is_fastgelu:
+            super().__init__(model, "BiasGelu", "Gelu")
+        else:
+            super().__init__(model, "FastGelu", "FastGelu", "add bias")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Queue replacement of Add(bias) + activation with one com.microsoft fused node."""
+        gelu_op_type = node.op_type
+        fuse_op_type = "FastGelu" if gelu_op_type != "Gelu" else "BiasGelu"
+        if len(node.input) != 1:
+            return
+
+        parents = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None])
+        if parents is None:
+            return
+        add, matmul = parents
+
+        # The first Add input backed by an initializer is taken as the bias.
+        bias_index, bias_weight = -1, None
+        for idx, tensor_name in enumerate(add.input):
+            initializer = self.model.get_initializer(tensor_name)
+            if initializer is not None:
+                bias_index = idx
+                bias_weight = NumpyHelper.to_array(initializer)
+                break
+        # bias should be one dimension
+        if bias_weight is None or len(bias_weight.shape) != 1:
+            return
+
+        subgraph_nodes = [node, add]
+        fusable = self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
+        )
+        if not fusable:
+            return
+        self.nodes_to_remove.extend(subgraph_nodes)
+
+        fused_node = helper.make_node(
+            fuse_op_type,
+            inputs=[matmul.output[0], add.input[bias_index]],
+            outputs=node.output,
+            name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"),
+        )
+        fused_node.domain = "com.microsoft"
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
new file mode 100755
index 000000000..074b6d595
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
@@ -0,0 +1,279 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionCustomFCGPT2(Fusion):
+    """Fuse Reshape -> Gemm -> Reshape into a single CustomFCPluginDynamic_IxRT node."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Reshape"], "gpt2")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Queue the fusion that ends at `node` (the trailing Reshape).
+
+        Returns False when the pattern does not apply and None once a fusion is queued.
+        """
+        nodes = self.model.match_parent_path(node, ["Gemm", "Reshape"], [0, 0])
+        if nodes is None:
+            return False
+        (matmul, reshape_before_matmul) = nodes
+
+        # Gemm's bias input C is optional; without it there is no input[2] to read.
+        if len(matmul.input) < 3:
+            return False
+        matmul_weight = self.model.get_initializer(matmul.input[1])
+        matmul_bias = self.model.get_initializer(matmul.input[2])
+        if matmul_weight is None or matmul_bias is None:
+            return False
+
+        w = NumpyHelper.to_array(matmul_weight)
+        b = NumpyHelper.to_array(matmul_bias)
+        # transpose(1, 0) needs a 2-D weight and out_dims reads b.shape[0]; skip
+        # other ranks instead of raising in the middle of the pass.
+        if w.ndim != 2 or b.ndim == 0:
+            return False
+        # NOTE(review): Gemm transA/transB/alpha/beta are not inspected here - confirm defaults.
+
+        # Transpose the stored weight only the first time this initializer is seen.
+        if matmul_weight.name not in self.model.initializer_visited:
+            self.model.initializer_visited[matmul_weight.name] = True
+            trans_matmul_weight = w.transpose(1, 0)
+            if matmul_weight.data_type == TensorProto.FLOAT16:
+                trans_matmul_weight = trans_matmul_weight.astype(np.float16)
+            matmul_weight.CopyFrom(
+                numpy_helper.from_array(trans_matmul_weight, matmul_weight.name)
+            )
+
+        if matmul_bias.data_type == TensorProto.FLOAT16:
+            b = b.astype(np.float16)
+        matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name))
+
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[reshape_before_matmul.input[0]],
+            outputs=node.output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        plugin_attrs = [("out_dims", b.shape[0]), ("type_id", 2), ("W", matmul_weight), ("B", matmul_bias)]
+        plugin_attrs += [("plugin_namespace", ""), ("plugin_version", "1"), ("act_type", -1)]
+        fused_node.attribute.extend([helper.make_attribute(k, v) for k, v in plugin_attrs])
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fused_node)
+        self.nodes_to_remove.extend([matmul, node, reshape_before_matmul])
+
+
+class FusionCustomFC(Fusion):
+    """Fuse MatMul + bias Add (constant weight and bias) into CustomFCPluginDynamic_IxRT."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"])
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Try the plain MatMul -> Add pattern on `node` (an Add)."""
+        # NOTE(review): fuse_2 (the Roformer variant) is never called from here - intentional?
+        self.fuse_1(node, input_name_to_nodes, output_name_to_node)
+
+    def fuse_1(self, node, input_name_to_nodes, output_name_to_node):
+        """Queue fusion of MatMul -> Add(bias) ending at `node`.
+
+        Returns:
+            True when a fused node was queued, False when the pattern does not apply.
+        """
+        if len(node.input) != 2:
+            return False
+        nodes = self.model.match_parent_path(node, ["MatMul"], [None])
+        if nodes is None:
+            return False
+        matmul = nodes[0]
+
+        matmul_weight = self.model.get_initializer(matmul.input[1])
+        matmul_bias = self.model.get_initializer(
+            node.input[1]
+        ) or self.model.get_initializer(node.input[0])
+        if matmul_weight is None or matmul_bias is None:
+            return False
+
+        w = NumpyHelper.to_array(matmul_weight)
+        b = NumpyHelper.to_array(matmul_bias)
+        # transpose(1, 0) needs a 2-D weight and out_dims reads b.shape[0]; skip
+        # other ranks instead of raising in the middle of the pass.
+        # NOTE(review): a multi-dimensional bias still passes - confirm out_dims.
+        if w.ndim != 2 or b.ndim == 0:
+            return False
+        # Removing the MatMul is only valid when nothing else consumes its output.
+        if not self.model.is_safe_to_fuse_nodes(
+            [matmul, node], [node.output[0]], input_name_to_nodes, output_name_to_node
+        ):
+            return False
+
+        # Transpose the stored weight only the first time this initializer is seen.
+        if matmul_weight.name not in self.model.initializer_visited:
+            self.model.initializer_visited[matmul_weight.name] = True
+            trans_matmul_weight = w.transpose(1, 0)
+            if matmul_weight.data_type == TensorProto.FLOAT16:
+                trans_matmul_weight = trans_matmul_weight.astype(np.float16)
+            matmul_weight.CopyFrom(
+                numpy_helper.from_array(trans_matmul_weight, matmul_weight.name)
+            )
+
+        if matmul_bias.data_type == TensorProto.FLOAT16:
+            b = b.astype(np.float16)
+        matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name))
+
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[matmul.input[0]],
+            outputs=node.output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)])
+        fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)])
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fused_node)
+        self.nodes_to_remove.extend([matmul, node])
+        return True
+
+    # For model Roformer.
+    def fuse_2(self, node, input_name_to_nodes, output_name_to_node):
+        """Queue fusion of MatMul -> Reshape -> Add(bias) ending at `node`.
+
+        Returns:
+            True when a fused node was queued, False when the pattern does not apply.
+        """
+        if len(node.input) != 2:
+            return False
+
+        fc_paths = {
+            "path1": (["Reshape", "MatMul"], [0, 0]),
+            "path2": (["Reshape", "MatMul"], [1, 0]),
+        }
+        nodes, _ = self.match_parent_path_from_dict(node, fc_paths)
+        if nodes is None:
+            return False
+        matmul = nodes[1]
+
+        weight = self.model.get_initializer(matmul.input[1])
+        bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer(
+            node.input[0]
+        )
+        if weight is None or bias is None:
+            return False
+
+        # Collapse the weight to 2-D, keeping its leading dimension.
+        w = NumpyHelper.to_array(weight)
+        weight_dim = np.prod(w.shape[1:])
+        weight_arr = (
+            onnx.numpy_helper.to_array(weight).flatten().reshape(w.shape[0], weight_dim)
+        )
+        # Sometimes weights and bias are stored in fp16
+        if weight.data_type == TensorProto.FLOAT16:
+            weight_arr = weight_arr.astype(np.float16)
+        weight.CopyFrom(numpy_helper.from_array(weight_arr, weight.name))
+        bias_arr = onnx.numpy_helper.to_array(bias).flatten()
+        if bias.data_type == TensorProto.FLOAT16:
+            bias_arr = bias_arr.astype(np.float16)
+        bias.CopyFrom(numpy_helper.from_array(bias_arr, bias.name))
+
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[matmul.input[0]],
+            outputs=node.output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("out_dims", 1)])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 1)])
+        fused_node.attribute.extend([helper.make_attribute("W", weight)])
+        fused_node.attribute.extend([helper.make_attribute("B", bias)])
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        # String "1", as the other plugin nodes built in this file use.
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fused_node)
+
+        self.nodes_to_remove.extend([node, nodes[0], nodes[1]])
+        return True
+
+
+class FusionCustomFCActivation(Fusion):
+    """Fold the activation that follows a CustomFCPluginDynamic_IxRT node into its act_type."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model,
+            "CustomFCPluginDynamic_IxRT",
+            ["Gelu", "Relu", "CustomGeluPluginDynamic_IxRT", "Mul"],
+            "with activation",
+        )
+
+    def _absorb(self, fc_node, node, activation_type, also_removed=()):
+        """Set act_type on fc_node, make it produce node's output and queue the swap."""
+        for attr in fc_node.attribute:
+            if attr.name == "act_type":
+                attr.i = activation_type
+                break
+
+        fc_node.output[0] = node.output[0]
+        # The mutated FC node is queued for removal and for re-adding in the same pass.
+        self.nodes_to_add.append(fc_node)
+        self.nodes_to_remove.extend([node, *also_removed, fc_node])
+        self.node_name_to_graph_name[fc_node.name] = self.this_graph_name
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Fuse an activation rooted at `node` into the FC node that feeds it.
+
+        Handles fc * Sigmoid(Mul(fc, ...)) (act_type 20) and a Gelu / Relu /
+        CustomGeluPluginDynamic_IxRT whose input 0 is the FC output.
+        """
+        if node.op_type == "Mul":
+            return_indice = []
+            nodes = self.model.match_parent_path(
+                node,
+                ["Sigmoid", "Mul", "CustomFCPluginDynamic_IxRT"],
+                [None, 0, 0],
+                return_indice=return_indice,
+            )
+            if nodes is None:
+                return
+            (sigmoid_node, mul_node, custom_fc_node) = nodes
+
+            # The other Mul operand must be produced by the same FC node. It may be a
+            # graph input or initializer with no producer, so look it up with .get().
+            producer = output_name_to_node.get(node.input[1 - return_indice[0]])
+            if producer is None or producer != custom_fc_node:
+                return
+
+            self._absorb(custom_fc_node, node, 20, (sigmoid_node, mul_node))
+            return
+
+        nodes = self.model.match_parent_path(node, ["CustomFCPluginDynamic_IxRT"], [0])
+        if nodes is None:
+            logger.debug("CustomFCActivation: failed to match fc+gelu/relu path")
+            return
+
+        # NOTE(review): other consumers of the FC output are not checked - confirm.
+        # 21 for Gelu, 4 for Relu, 3 for the remaining searched op type.
+        activation_type = {"Gelu": 21, "Relu": 4}.get(node.op_type, 3)
+        self._absorb(nodes[0], node, activation_type)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py
new file mode 100755
index 000000000..04eb863f8
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py
@@ -0,0 +1,109 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Tuple, Union
+
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionDisentangledAttention(Fusion):
+    """
+    Match Disentangled Attention
+        -------------------------------------------
+                                                  |
+        GatherElements          -->   Add  -->   Add  -->
+                                       |
+        GatherElements --> Transpose  ->
+
+    and replace the matched nodes with one DisentangledAttention_IxRT plugin node.
+    """
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "DisentangledAttention_IxRT", "Add")
+
+    def create_disentangled_attention_node(
+        self,
+        inputs: List[str],
+        outputs: List[str],
+    ) -> Union[NodeProto, None]:
+        """Build the DisentangledAttention_IxRT plugin node.
+
+        Args:
+            inputs List[str]: data input names
+            outputs List[str]: data output names
+
+        Returns:
+            Union[NodeProto, None]: the new node; this implementation never returns None.
+        """
+        plugin_node = helper.make_node(
+            "DisentangledAttention_IxRT",
+            inputs=inputs,
+            outputs=outputs,
+            name=self.model.create_node_name("DisentangledAttention"),
+        )
+        plugin_node.domain = "com.iluvatar"
+        # factor and span are fixed constants here, not derived from the matched graph.
+        plugin_attributes = (
+            ("plugin_namespace", ""),
+            ("plugin_version", "1"),
+            ("factor", 0.1),
+            ("span", 512),
+        )
+        for attr_name, attr_value in plugin_attributes:
+            plugin_node.attribute.extend([helper.make_attribute(attr_name, attr_value)])
+        return plugin_node
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Queue the plugin replacement when `node` is the tail Add of the pattern.
+
+        Nothing is queued unless both GatherElements branches match and share the
+        same head Add.
+        """
+        direct_branch = {
+            "path": (["Add", "GatherElements", "MatMul"], [None, None, None]),
+        }
+        transposed_branch = {
+            "path": (
+                ["Add", "Transpose", "GatherElements", "MatMul"],
+                [None, None, None, None],
+            ),
+        }
+
+        nodes1, _ = self.match_parent_path_from_dict(node, direct_branch)
+        nodes2, _ = self.match_parent_path_from_dict(node, transposed_branch)
+        if nodes1 is None or nodes2 is None:
+            return
+        if not (nodes1[0] == nodes2[0]):
+            return
+
+        head_add, first_gather, first_matmul = nodes1
+        _, transpose, second_gather, second_matmul = nodes2
+        tail_add = node
+
+        # Plugin inputs: the tail Add operand that is not the head Add output,
+        # followed by the outputs of the two MatMul nodes.
+        other_operands = [name for name in tail_add.input if name != head_add.output[0]]
+        plugin_inputs = [
+            other_operands[0],
+            first_matmul.output[0],
+            second_matmul.output[0],
+        ]
+        plugin_node = self.create_disentangled_attention_node(
+            plugin_inputs, [tail_add.output[0]]
+        )
+
+        self.nodes_to_add.append(plugin_node)
+        self.node_name_to_graph_name[plugin_node.name] = self.this_graph_name
+        # The two MatMul nodes are kept; only the Add/Gather/Transpose nodes go.
+        self.nodes_to_remove.extend(
+            [tail_add, head_add, first_gather, transpose, second_gather]
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py
new file mode 100755
index 000000000..90bddbf89
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py
@@ -0,0 +1,703 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Dict, List, Tuple, Union
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import NodeProto, TensorProto, helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionEmbedLayerNoMask(Fusion):
+    """
+    Fuse embedding layer into one node (EmbedLayerNormalization).
+    It supports the following model types: BERT, DistilBert, ALBert.
+    """
+
+    def __init__(self, model: OnnxModel, description: str = "no mask"):
+        """Initialize the base Fusion for EmbedLayerNormalization plus the helpers used while matching."""
+        layernorm_op_types = ["LayerNormalization", "SkipLayerNormalization"]
+        super().__init__(model, "EmbedLayerNormalization", layernorm_op_types, description)
+        self.utils = FusionUtils(model)
+        # Result of runtime shape inference; check_embedding skips its shape checks when this is None.
+        self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True)
+        # Attention node found by check_attention_subgraph and fused node built by
+        # create_fused_node. FusionEmbedLayerNormalization resets both before each
+        # fuse call and reads them afterwards.
+        self.attention = None
+        self.embed_node = None
+
+    def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeProto]]:
+        """Return the Gather parents feeding inputs 0 and 1 of `add`, or None if either is missing."""
+        gathers = []
+        for input_index in (0, 1):
+            matched = self.model.match_parent_path(add, ["Gather"], [input_index])
+            if matched is None:
+                # Stop at the first input that is not produced by a Gather.
+                return None
+            gathers.append(matched[0])
+        return gathers[0], gathers[1]
+
+    def check_attention_subgraph(
+        self,
+        layernorm: NodeProto,
+        input_name_to_nodes: Dict[str, List[NodeProto]],
+        is_distil_bert: bool,
+    ) -> bool:
+        """Check that LayerNormalization has a child of Attention node or subgraph like Attention.
+
+        Args:
+            layernorm (NodeProto): LayerNormalization node
+            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
+            is_distil_bert (bool): whether it is DistilBert or not
+
+        Returns:
+            bool: whether there is Attention node or subgraph like Attention
+        """
+        self.attention = self.model.find_first_child_by_type(
+            layernorm, "Attention", input_name_to_nodes, recursive=False
+        )
+        if self.attention is None:
+            # In case user disables attention fusion, check whether subgraph looks like Attention.
+            if layernorm.output[0] not in input_name_to_nodes:
+                return False
+            children = input_name_to_nodes[layernorm.output[0]]
+            # Start from the direct children so children_types is bound on every path below.
+            children_types = sorted([child.op_type for child in children])
+
+            # For Albert, there is MatMul+Add after embedding layer before attention.
+            if len(children) == 1 and children[0].op_type == "MatMul" and children[0].output[0] in input_name_to_nodes:
+                grandchildren = input_name_to_nodes[children[0].output[0]]
+                if (
+                    len(grandchildren) == 1
+                    and grandchildren[0].op_type == "Add"
+                    and grandchildren[0].output[0] in input_name_to_nodes
+                ):
+                    nodes = input_name_to_nodes[grandchildren[0].output[0]]
+                    for node in nodes:
+                        if node.op_type == "Attention":
+                            self.attention = node
+                            return True
+                    children_types = sorted([child.op_type for child in nodes])
+
+            # Two Shape nodes might be merged by ORT
+            if is_distil_bert:
+                # SkipLayerNormalization might exist when model has been optimized by ORT first.
+                if (
+                    children_types != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"]
+                    and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"]
+                    and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"]
+                ):
+                    logger.debug("No Attention like subgraph in children of LayerNormalization")
+                    return False
+            else:
+                if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [
+                    "MatMul",
+                    "MatMul",
+                    "MatMul",
+                    "SkipLayerNormalization",
+                ]:
+                    logger.debug("No Attention like subgraph in children of LayerNormalization")
+                    return False
+        return True
+
+    def match_position_embedding_distilbert(self, position_embedding_gather, input_ids, output_name_to_node):
+        r"""Match position embedding path from input_ids to Gather for DistilBert.
+
+        Pattern is like the following:
+                 (input_ids)
+                      |
+                     Shape
+                       |   \
+                       |    Gather (indices=1)
+                       |       |
+                       |      Cast (optional)
+                       |       |
+                       |      Range (start=0, end=*, delta=1)
+                       |       |
+                       |    Unsqueeze
+                       |    /
+                      Expand
+                        |
+                      Gather
+        """
+        # Input 1 of Expand must trace back to Shape, either directly or through Where and Reshape.
+        path1 = self.model.match_parent_path(position_embedding_gather, ["Expand", "Shape"], [1, 1])
+        if path1 is None:
+            path1 = self.model.match_parent_path(
+                position_embedding_gather,
+                ["Expand", "Where", "Reshape", "Shape"],
+                [1, 1, 2, 0],
+            )
+            if path1 is None:
+                return False
+
+        expand, shape = path1[0], path1[-1]
+        if shape.input[0] != input_ids:
+            return False
+
+        _, path2, _ = self.model.match_parent_paths(
+            expand,
+            [
+                (["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]),
+                (["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]),
+            ],
+            output_name_to_node,
+        )
+        if path2 is None:
+            return False
+
+        range_node = path2[1]
+        if not (
+            self.utils.check_node_input_value(range_node, 0, 0) and self.utils.check_node_input_value(range_node, 2, 1)
+        ):
+            return False
+
+        gather_node = path2[-2]
+        if not (self.utils.check_node_input_value(gather_node, 1, 1)):
+            return False
+
+        shape_node = path2[-1]
+        if shape_node.input[0] != input_ids:
+            return False
+
+        return True
+
+    def match_position_embedding_roberta(self, position_embedding_gather, input_ids, output_name_to_node):
+        """Match position embedding path from input_ids to Gather for Roberta (disabled: always returns False).
+
+        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
+          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
+                                                |                              ^
+                                                V                              |
+                                                +------------------------------+
+
+        Roberta new pattern from transformers v4.9:
+           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
+                                                |                                           ^
+                                                V                                           |
+                                                +-------------------------------------------+
+        NOTE: the matching code below is part of this docstring and is never executed.
+        start_node = position_embedding_gather
+        start_index = 1
+
+        # match optional Cast node.
+        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
+        if parent is None:
+            return
+        if parent.op_type == "Cast":
+            if OnnxModel.get_node_attribute(parent, "to") != 7:
+                return
+            start_node = parent
+            start_index = 0
+
+        i, path, return_indices = self.model.match_parent_paths(
+            start_node,
+            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
+              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
+            output_name_to_node)
+
+        if path is not None:
+            # constant input of Add shall be 1.
+            i, value = self.model.get_constant_input(path[0])
+            if value != 1:
+                return False
+
+            _, self.padding_word_id = self.model.get_constant_input(path[-1])
+
+            return input_ids == path[-1].input[0]
+        """
+
+        return False
+
+    def match_position_embedding_bert(self, position_embedding_gather, input_ids, output_name_to_node):
+        r"""Match position embedding path from input_ids to Gather for BERT.
+
+        BERT Embedding Layer Pattern:
+                                    (input_ids)
+                                   /         \
+                                 /          Shape
+                                /              |
+                              /              Gather (indices=1)
+                             /                  |
+                            /                  Add (optional, B=0)
+                           /                    |
+                        Gather (segment_ids) Unsqueeze (axes=0)
+                           \        |           |
+                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
+                              \    /            |
+                                Add          Gather
+                                   \       /
+                                      Add
+                                       |
+                                LayerNormalization
+        """
+        path = self.model.match_parent_path(
+            position_embedding_gather,
+            ["Slice", "Unsqueeze"],
+            [1, 2],
+            output_name_to_node,
+        )
+        if path is None:
+            return False
+
+        slice_node, unsqueeze = path
+        slice_weight = self.model.get_constant_value(slice_node.input[0])
+        if not (
+            slice_weight is not None
+            and len(slice_weight.shape) == 2
+            and slice_weight.shape[0] == 1
+            and self.utils.check_node_input_value(slice_node, 1, [0])
+            and self.utils.check_node_input_value(slice_node, 3, [1])
+            and (len(slice_node.input) == 4 or self.utils.check_node_input_value(slice_node, 4, [1]))
+        ):
+            return False
+
+        opset_version = self.model.get_opset_version()
+        if opset_version < 13:
+            if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
+                return False
+        else:
+            if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
+                return False
+
+        node = self.model.get_parent(unsqueeze, 0, output_name_to_node)
+        if node is None:
+            return False
+        if node.op_type == "Add":
+            if not self.utils.check_node_input_value(node, 1, 0):
+                return False
+            gather = self.model.get_parent(node, 0, output_name_to_node)
+        else:
+            gather = node
+
+        if gather is None or gather.op_type != "Gather":
+            return False
+        if not (self.utils.check_node_input_value(gather, 1, 1)):
+            return False
+
+        shape = self.model.get_parent(gather, 0, output_name_to_node)
+        if shape is None or shape.op_type != "Shape":
+            return False
+
+        return input_ids == shape.input[0]
+
+    def match_position_embedding(self, position_embedding_gather, input_ids, output_name_to_node):
+        """Return True when the position Gather is driven by a BERT or DistilBert position subgraph."""
+        # TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel
+        #       related: https://github.com/huggingface/transformers/issues/10736
+        #       Until then match_position_embedding_roberta is intentionally not tried here.
+        matchers = (
+            self.match_position_embedding_bert,
+            self.match_position_embedding_distilbert,
+        )
+        for matcher in matchers:
+            if matcher(position_embedding_gather, input_ids, output_name_to_node):
+                return True
+        return False
+
+    def check_embedding(self, word_embedding_gather, segment_embedding_gather, position_embedding_gather):
+        """Sanity check of embedding weights, and match hidden_size of weights and shape of inputs."""
+        input_ids = word_embedding_gather.input[1]
+        segment_ids = segment_embedding_gather.input[1] if segment_embedding_gather else None
+        position_ids = position_embedding_gather.input[1]
+
+        if self.shape_infer_helper is not None:
+            input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids)
+            position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids)
+            if not (
+                (input_ids_shape and position_ids_shape)  # unknown shapes reject the fusion instead of asserting
+                and len(input_ids_shape) == 2
+                and len(position_ids_shape) == 2
+                and input_ids_shape[1] == position_ids_shape[1]
+            ):
+                logger.info(
+                    "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format(
+                        input_ids_shape, position_ids_shape
+                    )
+                )
+                return False
+
+            if segment_ids and not self.shape_infer_helper.compare_shape(input_ids, segment_ids):
+                logger.info(
+                    "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format(
+                        input_ids_shape,
+                        self.shape_infer_helper.get_edge_shape(segment_ids),
+                    )
+                )
+                return False
+
+        word_embedding_table = self.model.get_constant_value(word_embedding_gather.input[0])
+        if word_embedding_table is None or len(word_embedding_table.shape) != 2:
+            logger.info("Cannot fuse EmbedLayerNormalization: word embedding table is not expected")
+            return False
+
+        position_embedding_table = self.model.get_constant_value(position_embedding_gather.input[0])
+        if (
+            position_embedding_table is None
+            or len(position_embedding_table.shape) != 2
+            or (word_embedding_table.shape[1] != position_embedding_table.shape[1])
+        ):
+            logger.info("Cannot fuse EmbedLayerNormalization: position embedding table is not expected")
+            return False
+
+        if segment_ids:
+            segment_embedding_table = self.model.get_constant_value(segment_embedding_gather.input[0])
+            if (
+                segment_embedding_table is None
+                or len(segment_embedding_table.shape) != 2
+                or (word_embedding_table.shape[1] != segment_embedding_table.shape[1])
+            ):
+                logger.info("Cannot fuse EmbedLayerNormalization: segment embedding table is not expected")
+                return False
+
+        # In normal case, word embedding table is the largest, and segment embedding table is the smallest, while position embedding table is in between.
+        # TODO: use other information (like initializer names) to identify different embedding weights automatically.
+        if word_embedding_table.shape[0] <= position_embedding_table.shape[0]:
+            logger.warning(
+                f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]}"
+            )
+
+        if segment_ids:
+            if word_embedding_table.shape[0] <= segment_embedding_table.shape[0]:
+                logger.warning(
+                    f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}"
+                )
+
+            if position_embedding_table.shape[0] <= segment_embedding_table.shape[0]:
+                logger.warning(
+                    f"position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}"
+                )
+
+        return True
+
+    def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]:
+        """Make sure a graph input or node input is available as int32.
+
+        A Cast is requested from FusionUtils unless `input_name` is a graph input
+        that is already declared as int32.
+
+        Args:
+            input_name (str): name of graph input or node input
+
+        Returns:
+            int32_output (str): `input_name` itself when no Cast is needed, otherwise
+                the first value returned by FusionUtils.cast_input_to_int32.
+            input_cast_node (Union[None, NodeProto]): second value returned by
+                cast_input_to_int32, or None when no Cast is needed.
+        """
+        graph_input = self.model.find_graph_input(input_name)
+        if graph_input is not None and graph_input.type.tensor_type.elem_type == TensorProto.INT32:
+            # Already an int32 graph input: use it as is, no Cast node involved.
+            return input_name, None
+
+        int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name)
+        return int32_output, input_cast_node
+
+    def create_fused_node(
+        self,
+        input_ids: str,
+        layernorm: NodeProto,
+        word_embedding_gather: NodeProto,
+        position_embedding_gather: NodeProto,
+        segment_embedding_gather: Union[None, NodeProto],
+        position_ids: str = None,
+        embedding_sum_output=False,
+    ):
+        """Create an EmbedLayerNormalization node. Note that segment embedding is optional.
+
+        Besides returning the node, this queues it in self.nodes_to_add, records its
+        graph name in self.node_name_to_graph_name and stores it in self.embed_node.
+
+        Args:
+            input_ids (str): input_ids for word embeddings
+            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
+            word_embedding_gather (NodeProto): the Gather node for word embedding
+            position_embedding_gather (NodeProto): the Gather node for position embedding
+            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.
+            position_ids (str): optional position ids; when given, they are added as an
+                extra input after an empty mask input.
+            embedding_sum_output (bool): whether to declare an extra "_embedding_sum"
+                output on the fused node.
+
+        Returns:
+            NodeProto: the EmbedLayerNormalization node created.
+        """
+        # Ids are cast to int32 first (cast_to_int32 leaves int32 graph inputs untouched).
+        input_ids, _ = self.cast_to_int32(input_ids)
+
+        node_name = self.model.create_node_name("EmbedLayerNormalization")
+
+        # gamma and beta are inputs 1 and 2 of LayerNormalization, 2 and 3 of SkipLayerNormalization.
+        weight_index = 1 if layernorm.op_type == "LayerNormalization" else 2
+        gamma = layernorm.input[weight_index]
+        beta = layernorm.input[weight_index + 1]
+
+        if segment_embedding_gather is None:
+            # no segment embedding: both segment slots are left empty
+            segment_ids = ""
+            segment_embedding_table = ""
+        else:
+            # segment ids go through the same int32 cast as input_ids
+            segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1])
+            segment_embedding_table = segment_embedding_gather.input[0]
+
+        embed_node_inputs = [
+            input_ids,
+            segment_ids,
+            word_embedding_gather.input[0],
+            position_embedding_gather.input[0],
+            segment_embedding_table,
+            gamma,
+            beta,
+        ]
+
+        if position_ids is not None:
+            position_ids, _ = self.cast_to_int32(position_ids)
+            # Adding an empty input for mask before position_ids, so position_ids
+            # ends up two slots after beta.
+            embed_node_inputs.extend(["", position_ids])
+
+        # The second output is named as a dummy mask index;
+        # FusionEmbedLayerNormalization may later replace it with a real mask index.
+        embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"]
+        if embedding_sum_output:
+            embed_node_outputs.append(node_name + "_embedding_sum")
+
+        embed_node = helper.make_node(
+            "EmbedLayerNormalization",
+            embed_node_inputs,
+            outputs=embed_node_outputs,
+            name=node_name,
+        )
+
+        embed_node.domain = "com.microsoft"
+
+        # Pass attribute "epsilon" from normalize node to EmbedLayerNormalization.
+        for attribute in layernorm.attribute:
+            if attribute.name == "epsilon":
+                embed_node.attribute.extend([attribute])
+
+        # Set default value to 1e-12 if no attribute is found.
+        # OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later.
+        if len(embed_node.attribute) == 0:
+            embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)])
+
+        # Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add.
+        self.node_name_to_graph_name[embed_node.name] = self.this_graph_name
+        self.nodes_to_add.append(embed_node)
+
+        self.embed_node = embed_node
+        return embed_node
+
+    def finish_fusion(self, layernorm, embed_node):
+        self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
+        # Request graph pruning so that nodes that are no longer needed get removed.
+        self.prune_graph = True
+
+    def is_embedding_sum_needed(self, add_before_layer_norm):
+        """Tell whether the Add before layer norm feeds more than one node.
+
+        Args:
+            add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph
+
+        Returns:
+            bool: True when the Add has more than one child, in which case the embedding
+                sum must be exposed as an extra output of the embed layer norm node
+        """
+        consumers = self.model.get_children(add_before_layer_norm)
+        consumer_count = len(consumers)
+        return consumer_count > 1
+
+    def fuse_gpt2(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node):
+        """Fuse embedding layer for GPT-2; return True when the fused node was created."""
+        # gpt2 has no segment embedding, subgraph pattern is like
+        #     input_ids  position_ids
+        #        |        |
+        #     Gather    Gather
+        #          \   /
+        #           Add _ _ _ _ _
+        #            |           |
+        #    LayerNormalization  |
+        #            |           |
+        #         Attention      |
+        #            |           |
+        #          Matmul        |
+        #            |          /
+        #           Add        /
+        #             \       /
+        #                Add
+        gathers = self.match_two_gather(add_before_layernorm)
+        if gathers is None:
+            return False
+        word_embedding_gather, position_embedding_gather = gathers
+
+        input_ids = word_embedding_gather.input[1]
+        position_ids = position_embedding_gather.input[1]
+        embedding_sum = add_before_layernorm.output[0]
+
+        if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False):
+            return False
+
+        # None stands for the segment embedding Gather, which gpt2 does not have.
+        if not self.check_embedding(word_embedding_gather, None, position_embedding_gather):
+            return False
+
+        # When the Add has more than one child (the residual branch in the sketch above),
+        # the fused node exposes the embedding sum as an extra output.
+        optional_embedding_sum_output = bool(self.is_embedding_sum_needed(add_before_layernorm))
+
+        # make the fused node
+        embed_node = self.create_fused_node(
+            input_ids,
+            layernorm,
+            word_embedding_gather,
+            position_embedding_gather,
+            None,
+            position_ids,
+            optional_embedding_sum_output,
+        )
+
+        # Rewire consumers of the LayerNormalization output, and of the Add output when it is exposed.
+        self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0])
+        if optional_embedding_sum_output:
+            self.model.replace_input_of_all_nodes(embedding_sum, embed_node.output[2])
+
+        return True
+
+    def fuse_distilbert(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node):
+        """Fuse embedding layer for DistilBert
+
+        Args:
+            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
+            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
+            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
+            output_name_to_node (Dict[str, NodeProto]): map from output name to node
+
+        Returns:
+            bool: True when the fused node was created, False when the pattern did not match
+        """
+        # DistilBert has no segment embedding, subgraph pattern is like
+        #       input_ids
+        #        |      \
+        #        |     (position_embedding_subgraph)
+        #        |        |
+        #     Gather    Gather
+        #          \   /
+        #           Add
+        #            |
+        #    LayerNormalization
+        gathers = self.match_two_gather(add_before_layernorm)
+        if gathers is None:
+            return False
+        word_embedding_gather, position_embedding_gather = gathers
+        input_ids = word_embedding_gather.input[1]
+
+        # All three checks must pass; they run in this order and stop at the first failure.
+        pattern_matched = (
+            self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=True)
+            and self.match_position_embedding(position_embedding_gather, input_ids, output_name_to_node)
+            and self.check_embedding(word_embedding_gather, None, position_embedding_gather)
+        )
+        if not pattern_matched:
+            return False
+
+        embed_node = self.create_fused_node(input_ids, layernorm, word_embedding_gather, position_embedding_gather, None)
+        self.finish_fusion(layernorm, embed_node)
+        return True
+
+    def fuse_bert(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node):
+        """Fuse embedding layer for Bert
+
+        Args:
+            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
+            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
+            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
+            output_name_to_node (Dict[str, NodeProto]): map from output name to node
+
+        Returns:
+            bool: True when the fused node was created, False when the pattern did not match
+        """
+        inner_add_path = self.model.match_parent_path(add_before_layernorm, ["Add"], [0])
+        if inner_add_path is None:
+            return False
+
+        # The inner Add is expected to combine two Gather nodes (tried as word and segment embeddings).
+        gathers = self.match_two_gather(inner_add_path[0])
+        if gathers is None:
+            return False
+        word_embedding_gather, segment_embedding_gather = gathers
+        input_ids = word_embedding_gather.input[1]
+
+        if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False):
+            return False
+
+        # The third embedding Gather is looked up on input 1 of the Add before layernorm.
+        position_embedding_path = self.model.match_parent_path(add_before_layernorm, ["Gather"], [1])
+        if position_embedding_path is None:
+            return False
+        position_embedding_gather = position_embedding_path[0]
+
+        if not self.match_position_embedding(position_embedding_gather, input_ids, output_name_to_node):
+            # Retry with the other Gather: position and segment may be switched.
+            if not self.match_position_embedding(segment_embedding_gather, input_ids, output_name_to_node):
+                return False
+            segment_embedding_gather, position_embedding_gather = (
+                position_embedding_gather,
+                segment_embedding_gather,
+            )
+
+        if not self.check_embedding(word_embedding_gather, segment_embedding_gather, position_embedding_gather):
+            return False
+
+        embed_node = self.create_fused_node(
+            input_ids, layernorm, word_embedding_gather, position_embedding_gather, segment_embedding_gather
+        )
+        self.finish_fusion(layernorm, embed_node)
+        return True
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Try to fuse the embedding layer that ends at `node`.
+
+        The GPT-2, DistilBert and BERT patterns are tried in that order; the first success wins.
+        """
+        if node.op_type == "LayerNormalization":
+            first_add_path = self.model.match_parent_path(node, ["Add"], [0])
+            if first_add_path is None:
+                return
+            add_before_layernorm = first_add_path[0]
+        else:  # SkipLayerNormalization
+            add_before_layernorm = node  # Add is fused into SkipLayerNormalization
+
+        # Each helper returns True once it has created the fused node.
+        for try_fuse in (self.fuse_gpt2, self.fuse_distilbert, self.fuse_bert):
+            if try_fuse(node, add_before_layernorm, input_name_to_nodes, output_name_to_node):
+                return
+
+
+class FusionEmbedLayerNormalization(FusionEmbedLayerNoMask):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "with mask")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        # Reset attention and embed_node so that we know fusion is successful when they are not None.
+        self.attention = self.embed_node = None
+        super().fuse(node, input_name_to_nodes, output_name_to_node)
+        # The mask index is read from input 3 of Attention, so make sure that input exists first.
+        if self.attention and self.embed_node and len(self.attention.input) > 3:
+            mask_index = self.attention.input[3]
+            mask_node = output_name_to_node.get(mask_index)
+            if mask_node is not None and mask_node.op_type == "ReduceSum":
+                self.nodes_to_remove.append(mask_node)
+                # Mask is input 7: fill the empty slot kept before position_ids, else append it.
+                if len(self.embed_node.input) > 7:
+                    self.embed_node.input[7] = mask_node.input[0]
+                else:
+                    self.embed_node.input.append(mask_node.input[0])
+                self.embed_node.output[1] = mask_index
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py
new file mode 100755
index 000000000..0e24a9dd7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py
@@ -0,0 +1,404 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import Dict, Optional
+
+from onnx import helper
+
+from .fusion_base import Fusion
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionFastGelu(Fusion):
+    def __init__(self, model: OnnxModel):  # fusion is anchored on Tanh nodes (see fuse(), which receives tanh_node)
+        super().__init__(model, "CustomGeluPluginDynamic_IxRT", "Tanh")  # presumably (fused op type, op type to search) -- TODO confirm
+
+    def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict):  # try each tanh-Gelu pattern in order
+        if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node):  # stop at the first pattern that fuses
+            return
+
+        if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node):  # TensorFlow-style pattern
+            return
+
+        if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node):  # OpenAI / Megatron-style pattern
+            return
+
+    def fuse_1(
+        self, tanh_node, input_name_to_nodes, output_name_to_node
+    ) -> Optional[bool]:
+        """
+        Fuse Gelu with tanh into one CustomGeluPluginDynamic_IxRT node; returns True on success, None otherwise:
+              +---------------------------+
+              |                           |
+              |                           v
+            [root] --> Pow --> Mul -----> Add  --> Mul --> Tanh --> Add --> Mul
+              |       (Y=3)   (B=0.0447...)       (B=0.7978...)    (B=1)     ^
+              |                                                              |
+              +------> Mul(B=0.5)--------------------------------------------+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+        if tanh_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[tanh_node.output[0]]  # consumers of the Tanh output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_tanh = children[0]
+
+        if not self.model.has_constant_input(add_after_tanh, 1.0):  # must be tanh(...) + 1
+            return
+
+        if add_after_tanh.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_tanh.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_after_tanh = children[0]  # final Mul; its output becomes the fused node's output
+
+        mul_half = self.model.match_parent(
+            mul_after_tanh, "Mul", None, output_name_to_node
+        )
+        if mul_half is None:
+            return
+
+        i = self.model.find_constant_input(mul_half, 0.5)  # index of the 0.5 constant; negative means not found
+        if i < 0:
+            return
+
+        root_input = mul_half.input[0 if i == 1 else 1]  # the non-constant operand of Mul(0.5) is the Gelu input
+
+        # root_node could be None when root_input is graph input
+        root_node = self.model.get_parent(
+            mul_half, 0 if i == 1 else 1, output_name_to_node
+        )
+
+        mul_before_tanh = self.model.match_parent(
+            tanh_node, "Mul", 0, output_name_to_node
+        )
+        if mul_before_tanh is None:
+            return
+
+        i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001)  # `i` is reused for each constant lookup
+        if i < 0:
+            return
+
+        add_before_tanh = self.model.match_parent(
+            mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node
+        )
+        if add_before_tanh is None:
+            return
+
+        mul_after_pow = self.model.match_parent(
+            add_before_tanh,
+            "Mul",
+            None,
+            output_name_to_node,
+            exclude=[root_node] if root_node else [],
+        )
+        if mul_after_pow is None:
+            return
+
+        i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001)
+        if i < 0:
+            return
+
+        pow = self.model.match_parent(
+            mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node
+        )
+        if pow is None:
+            return
+
+        if not self.model.has_constant_input(pow, 3.0):  # must be a cube
+            return
+
+        if pow.input[0] != root_input:  # Pow must be applied to the same root tensor as Mul(0.5)
+            return
+
+        subgraph_nodes = [
+            mul_after_tanh,
+            mul_half,
+            add_after_tanh,
+            tanh_node,
+            mul_before_tanh,
+            add_before_tanh,
+            mul_after_pow,
+            pow,
+        ]
+        if not self.model.is_safe_to_fuse_nodes(  # presumably rejects subgraphs whose intermediates are used elsewhere -- TODO confirm
+            subgraph_nodes,
+            [mul_after_tanh.output[0]],
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node(
+            "CustomGeluPluginDynamic_IxRT",
+            inputs=[root_input],
+            outputs=mul_after_tanh.output,
+            name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])  # NOTE(review): meaning of type_id=2 is not visible here
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        return True
+
+    def fuse_2(
+        self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict
+    ) -> Optional[bool]:
+        """
+        This pattern is from Tensorflow model.
+        Fuse Gelu with tanh into one CustomGeluPluginDynamic_IxRT node; returns True on success, None otherwise:
+              +---------------------------+
+              |                           |
+              |                           v
+            [root] --> Pow --> Mul -----> Add  --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul-->
+              |       (Y=3)   (B=0.0447...)       (B=0.7978...)    (B=1)                  ^
+              |                                                                           |
+              +---------------------------------------------------------------------------+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+        if tanh_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[tanh_node.output[0]]  # consumers of the Tanh output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_tanh = children[0]
+
+        if not self.model.has_constant_input(add_after_tanh, 1.0):  # must be tanh(...) + 1
+            return
+
+        if add_after_tanh.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_tanh.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_half = children[0]  # here the 0.5 scaling follows the Add directly
+
+        i = self.model.find_constant_input(mul_half, 0.5)  # only existence of the 0.5 constant matters at this point
+        if i < 0:
+            return
+
+        if mul_half.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[mul_half.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_after_mul_half = children[0]  # final Mul; its output becomes the fused node's output
+
+        root_node = self.model.get_parent(
+            mul_after_mul_half,
+            0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1,
+            output_name_to_node,
+        )
+        if root_node is None:  # unlike fuse_1, a root without a producing node is not handled by this pattern
+            return
+
+        mul_before_tanh = self.model.match_parent(
+            tanh_node, "Mul", 0, output_name_to_node
+        )
+        if mul_before_tanh is None:
+            return
+
+        i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001)  # `i` is reused for each constant lookup
+        if i < 0:
+            return
+
+        add_before_tanh = self.model.match_parent(
+            mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node
+        )
+        if add_before_tanh is None:
+            return
+
+        mul_after_pow = self.model.match_parent(
+            add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node]
+        )
+        if mul_after_pow is None:
+            return
+
+        i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001)
+        if i < 0:
+            return
+
+        pow = self.model.match_parent(
+            mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node
+        )
+        if pow is None:
+            return
+
+        if not self.model.has_constant_input(pow, 3.0):  # must be a cube
+            return
+
+        if pow.input[0] != root_node.output[0]:  # Pow must be applied to the root node's first output
+            return
+
+        subgraph_nodes = [
+            mul_after_mul_half,
+            mul_half,
+            add_after_tanh,
+            tanh_node,
+            mul_before_tanh,
+            add_before_tanh,
+            mul_after_pow,
+            pow,
+        ]
+        if not self.model.is_safe_to_fuse_nodes(  # presumably rejects subgraphs whose intermediates are used elsewhere -- TODO confirm
+            subgraph_nodes,
+            [mul_after_mul_half.output[0]],
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node(
+            "CustomGeluPluginDynamic_IxRT",
+            inputs=[root_node.output[0]],
+            outputs=mul_after_mul_half.output,
+            name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])  # NOTE(review): meaning of type_id=2 is not visible here
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        return True
+
+    def fuse_3(
+        self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict
+    ) -> Optional[bool]:
+        """
+        OpenAI's gelu implementation, also used in Megatron:
+           Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))
+
+        Fuse subgraph into a CustomGeluPluginDynamic_IxRT node (returns True on success, None otherwise):
+            +------------ Mul (B=0.79788456) -------------------+
+            |                                                   |
+            +-------------------------------+                   |
+            |                               |                   |
+            |                               v                   v
+          [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul-->
+            |                                                                                 ^
+            |                                                                                 |
+            +-----------> Mul (B=0.5) --------------------------------------------------------+
+        """
+        if tanh_node.output[0] not in input_name_to_nodes:
+            return
+
+        children = input_name_to_nodes[tanh_node.output[0]]  # consumers of the Tanh output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_tanh = children[0]
+
+        if not self.model.has_constant_input(add_after_tanh, 1.0):  # must be tanh(...) + 1
+            return
+
+        if add_after_tanh.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_tanh.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_last = children[0]  # final Mul; its output becomes the fused node's output
+
+        mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node)
+        if mul_half is None:
+            return
+
+        i = self.model.find_constant_input(mul_half, 0.5)  # index of the 0.5 constant; negative means not found
+        if i < 0:
+            return
+
+        root_input = mul_half.input[0 if i == 1 else 1]  # the non-constant operand of Mul(0.5) is the Gelu input
+
+        mul_before_tanh = self.model.match_parent(
+            tanh_node, "Mul", 0, output_name_to_node
+        )
+        if mul_before_tanh is None:
+            return
+
+        add_1 = self.model.match_parent(
+            mul_before_tanh, "Add", None, output_name_to_node
+        )
+        if add_1 is None:
+            return
+        j = self.model.find_constant_input(add_1, 1.0)  # index of the 1.0 constant on the inner Add
+        if j < 0:
+            return
+
+        mul_7978 = self.model.match_parent(
+            mul_before_tanh, "Mul", None, output_name_to_node
+        )
+        if mul_7978 is None:
+            return
+        k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001)
+        if k < 0:
+            return
+        if mul_7978.input[0 if k == 1 else 1] != root_input:  # 0.7978 must scale the root tensor itself
+            return
+
+        mul_before_add_1 = self.model.match_parent(
+            add_1, "Mul", 0 if j == 1 else 1, output_name_to_node
+        )
+        if mul_before_add_1 is None:
+            return
+
+        if mul_before_add_1.input[0] == root_input:  # one operand must be the root; `another` is the other operand's index
+            another = 1
+        elif mul_before_add_1.input[1] == root_input:
+            another = 0
+        else:
+            return
+
+        mul_0447 = self.model.match_parent(
+            mul_before_add_1, "Mul", another, output_name_to_node
+        )
+        if mul_0447 is None:
+            return
+        m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001)
+        if m < 0:
+            return
+
+        if mul_0447.input[0 if m == 1 else 1] != root_input:  # 0.0447 must also scale the root tensor
+            return
+
+        subgraph_nodes = [
+            mul_0447,
+            mul_before_add_1,
+            add_1,
+            mul_before_tanh,
+            tanh_node,
+            add_after_tanh,
+            mul_7978,
+            mul_half,
+            mul_last,
+        ]
+        if not self.model.is_safe_to_fuse_nodes(  # presumably rejects subgraphs whose intermediates are used elsewhere -- TODO confirm
+            subgraph_nodes,
+            [mul_last.output[0]],
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node(
+            "CustomGeluPluginDynamic_IxRT",
+            inputs=[root_input],
+            outputs=mul_last.output,
+            name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])  # NOTE(review): meaning of type_id=2 is not visible here
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        return True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
new file mode 100755
index 000000000..5bd2e0c48
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
@@ -0,0 +1,113 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import math
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+
+
+logger = getLogger(__name__)
+
+class FusionRemoveUselessElementwise(Fusion):
+    """
+    Remove the Add(~1e-12) and Min nodes between GlobalAveragePool and Max feeding a Sqrt (roformer model).
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(model, "Sqrt", "Sqrt")  # anchored on Sqrt nodes; no new op type is created by this pass
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):  # node is the Sqrt anchor
+        paths = {
+            "path1" : (["Max", "Min", "Add", "GlobalAveragePool"], [None, None, None, None]),
+        }
+
+        pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths)  # presumably (matched nodes, dict key) -- TODO confirm
+
+        if pool_nodes is None:
+            logger.debug("GlobalAveragePool: failed searching path after pool node.")
+            return
+
+        max_node = pool_nodes[0]  # matched nodes are listed from the Sqrt upwards
+        min_node = pool_nodes[1]
+        add_node = pool_nodes[2]
+        pool_node = pool_nodes[3]
+        if not self.model.has_constant_input(add_node, 9.999999960041972e-13):  # only strip the tiny epsilon Add
+            return
+
+        if not self.model.has_constant_input(max_node, 0):  # Max must clamp against constant 0
+            return
+
+        max_node.input[list(max_node.input).index(min_node.output[0])] = pool_node.output[0]  # path indices are None, so Min may feed either Max input
+        self.nodes_to_remove.extend([min_node, add_node])
+
+
+class FusionFormatInvalidMask(Fusion):
+    """
+    Rewrite the constant of the mask Mul (Softmax <- Add <- Mul) to -100.0 in roformer model.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(model, "Softmax", ["Softmax"])  # anchored on Softmax nodes
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):  # node is the Softmax anchor
+        nodes = self.model.match_parent_path(
+            node,
+            ["Add", "Mul"],
+            [0, 1],
+        )
+
+        if nodes is None:
+            logger.debug("Roformer: unable to format the mul.")
+            return
+
+        mul_node = nodes[1]  # the Mul feeding input 1 of the Add in front of the Softmax
+
+        inputs = mul_node.input
+        outputs = mul_node.output
+
+        coef0 = self.model.get_initializer(inputs[0])
+        coef1 = self.model.get_initializer(inputs[1])
+        if (coef0 and coef1) or (not coef0 and not coef1):  # proceed only when exactly one Mul operand is an initializer
+            return
+        coef = coef0 if coef0 else coef1
+        coef.CopyFrom(numpy_helper.from_array(np.array([-100.0]).astype(np.float32), coef.name))  # overwrite the initializer in place, keeping its name
+
+        new_node = helper.make_node(  # same inputs/outputs/name as the old Mul; only the domain differs
+            "Mul",
+            inputs = inputs,
+            outputs = outputs,
+            name = mul_node.name,
+        )
+        new_node.domain = "com.iluvatar"
+
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+        self.nodes_to_remove.extend([mul_node])
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py
new file mode 100755
index 000000000..f4c5c7e84
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py
@@ -0,0 +1,333 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import Dict, Optional
+
+from .fusion_base import Fusion
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionGelu(Fusion):
+    def __init__(self, model: OnnxModel):  # fusion is anchored on Erf nodes (see fuse(), which receives erf_node)
+        super().__init__(model, "Gelu", "Erf")  # presumably (fused op type, op type to search) -- TODO confirm
+
+    def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict):  # try each Erf-Gelu pattern in order
+        if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node):  # PyTorch-style (Div by 1.4142)
+            return
+        if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node):  # Keras-style
+            return
+        if self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node):  # TensorFlow-style (Mul by 0.7071)
+            return
+        self.fuse_4(erf_node, input_name_to_nodes, output_name_to_node)  # last attempt; its result is not needed
+
+    def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+        """
+        This pattern is from PyTorch model
+        Fuse Gelu with Erf into one com.microsoft Gelu node; returns True on success, None otherwise:
+        Pattern 1:
+                       +-------Mul(0.5)---------------------+
+                       |                                    |
+                       |                                    v
+                    [root] --> Div -----> Erf  --> Add --> Mul -->
+                              (B=1.4142...)       (1)
+
+        Pattern 2:
+                       +------------------------------------+
+                       |                                    |
+                       |                                    v
+                    [root] --> Div -----> Erf  --> Add --> Mul -->Mul -->
+                              (B=1.4142...)       (1)            (0.5)
+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+        if erf_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[erf_node.output[0]]  # consumers of the Erf output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_erf = children[0]
+
+        if not self.model.has_constant_input(add_after_erf, 1):  # must be erf(...) + 1
+            return
+
+        if add_after_erf.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_erf.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_after_erf = children[0]
+
+        div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
+        if div is None:
+            return
+
+        if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:  # the constant must be the divisor (input 1)
+            return
+
+        subgraph_input = div.input[0]
+
+        another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0  # index of the Mul operand that is not the Add
+        if subgraph_input == mul_after_erf.input[another]:  # pattern 2
+            children = input_name_to_nodes.get(mul_after_erf.output[0], [])  # .get: the Mul output may have no consumer (e.g. graph output)
+            if len(children) != 1 or children[0].op_type != "Mul":
+                return
+            mul_half = children[0]
+            if not self.model.has_constant_input(mul_half, 0.5):
+                return
+            subgraph_output = mul_half.output[0]
+        else:  # pattern 1
+            mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
+            if mul_half is None:
+                return
+
+            if not self.model.has_constant_input(mul_half, 0.5):
+                return
+
+            if subgraph_input not in mul_half.input:  # Mul(0.5) must scale the same root tensor
+                return
+
+            subgraph_output = mul_after_erf.output[0]
+
+        subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output], name=self.model.create_node_name("Gelu"))
+        fused_node.domain = "com.microsoft"
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name  # keyed by node name, hence the fused node must be named
+        return True
+
+    def fuse_2(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+        """
+        This pattern is from Keras model
+        Fuse Gelu with Erf into one com.microsoft Gelu node; returns True on success, None otherwise:
+                       +------------------------------------------+
+                       |                                          |
+                       |                                          v
+                    [root] --> Div -----> Erf  --> Add --> Mul -->Mul
+                              (B=1.4142...)       (A=1)   (A=0.5)
+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+        if erf_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[erf_node.output[0]]  # consumers of the Erf output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_erf = children[0]
+
+        if not self.model.has_constant_input(add_after_erf, 1):  # must be erf(...) + 1
+            return
+
+        if add_after_erf.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_erf.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_after_erf = children[0]
+
+        if not self.model.has_constant_input(mul_after_erf, 0.5):  # in this pattern the 0.5 scaling follows the Add directly
+            return
+
+        if mul_after_erf.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[mul_after_erf.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul = children[0]  # final Mul; its output becomes the fused node's output
+
+        div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node)
+        if div is None:
+            return
+
+        sqrt_node = None
+        if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1:  # divisor is not the literal constant ...
+            sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node)  # ... so accept Sqrt(2.0) as divisor instead
+            if sqrt_node is None:
+                return
+            if not self.model.has_constant_input(sqrt_node, 2.0):
+                return
+
+        root_node = self.model.get_parent(div, 0, output_name_to_node)
+        if root_node is None:
+            return
+
+        if root_node.output[0] not in mul.input:  # the final Mul must multiply by the same root tensor
+            return
+
+        subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
+        if sqrt_node:
+            subgraph_nodes.append(sqrt_node)
+
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]], name=self.model.create_node_name("Gelu"))
+        fused_node.domain = "com.microsoft"
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name  # keyed by node name, hence the fused node must be named
+        return True
+
+    def fuse_3(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+        """
+        This pattern is from TensorFlow model
+        Fuse Gelu with Erf into one com.microsoft Gelu node; returns True on success, None otherwise:
+                       +----------------------------------------------+
+                       |                                              |
+                       |                                              v
+                    [root] --> Mul -----> Erf    -->   Add --> Mul -->Mul
+                               (A=0.7071067690849304)  (B=1)  (B=0.5)
+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+
+        if erf_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[erf_node.output[0]]  # consumers of the Erf output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_erf = children[0]
+
+        if not self.model.has_constant_input(add_after_erf, 1):  # must be erf(...) + 1
+            return
+
+        if add_after_erf.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_erf.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_half = children[0]
+
+        if not self.model.has_constant_input(mul_half, 0.5):
+            return
+
+        first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node)
+        if first_mul is None:
+            return
+
+        i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)  # index of the constant; negative means not found
+        if i < 0:
+            return
+
+        root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node)  # producer of the non-constant operand
+        if root_node is None:
+            return
+
+        if mul_half.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[mul_half.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        last_mul = children[0]  # final Mul; its output becomes the fused node's output
+
+        if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]):
+            return
+
+        subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes,
+            [last_mul.output[0]],
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]], name=self.model.create_node_name("Gelu"))
+        fused_node.domain = "com.microsoft"
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name  # keyed by node name, hence the fused node must be named
+        return True
+
+    def fuse_4(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+        """
+        This pattern is from TensorFlow model
+        Fuse Gelu with Erf into one com.microsoft Gelu node; returns True on success, None otherwise:
+        Pattern 1:
+                       +-------Mul(0.5)---------------------+
+                       |                                    |
+                       |                                    v
+                    [root] --> Mul -----> Erf  --> Add --> Mul -->
+                              (B=0.7071...)       (1)
+
+        Pattern 2:
+                       +------------------------------------+
+                       |                                    |
+                       |                                    v
+                    [root] --> Mul -----> Erf  --> Add --> Mul -->Mul -->
+                              (B=0.7071...)       (1)            (0.5)
+
+        Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
+        """
+        if erf_node.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[erf_node.output[0]]  # consumers of the Erf output
+        if len(children) != 1 or children[0].op_type != "Add":
+            return
+        add_after_erf = children[0]
+
+        if not self.model.has_constant_input(add_after_erf, 1):  # must be erf(...) + 1
+            return
+
+        if add_after_erf.output[0] not in input_name_to_nodes:
+            return
+        children = input_name_to_nodes[add_after_erf.output[0]]
+        if len(children) != 1 or children[0].op_type != "Mul":
+            return
+        mul_after_erf = children[0]
+
+        mul_before_erf = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node)
+        if mul_before_erf is None:
+            return
+
+        if self.model.find_constant_input(mul_before_erf, 0.7071, delta=0.001) != 1:  # the constant must be input 1 of the Mul
+            return
+
+        subgraph_input = mul_before_erf.input[0]
+
+        another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0  # index of the Mul operand that is not the Add
+        if subgraph_input == mul_after_erf.input[another]:  # pattern 2
+            children = input_name_to_nodes.get(mul_after_erf.output[0], [])  # .get: the Mul output may have no consumer (e.g. graph output)
+            if len(children) != 1 or children[0].op_type != "Mul":
+                return
+            mul_half = children[0]
+            if not self.model.has_constant_input(mul_half, 0.5):
+                return
+            subgraph_output = mul_half.output[0]
+        else:  # pattern 1
+            mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
+            if mul_half is None:
+                return
+
+            if not self.model.has_constant_input(mul_half, 0.5):
+                return
+
+            if subgraph_input not in mul_half.input:  # Mul(0.5) must scale the same root tensor
+                return
+
+            subgraph_output = mul_after_erf.output[0]
+
+        subgraph_nodes = [mul_before_erf, erf_node, add_after_erf, mul_after_erf, mul_half]
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node
+        ):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output], name=self.model.create_node_name("Gelu"))
+        fused_node.domain = "com.microsoft"
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name  # keyed by node name, hence the fused node must be named
+        return True
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py
new file mode 100755
index 000000000..35f4b93a7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py
@@ -0,0 +1,27 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+from .fusion_base import Fusion
+from onnx import helper
+from .onnx_model import OnnxModel
+
+
+class FusionGeluApproximation(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        new_node = helper.make_node(
+            "FastGelu",
+            inputs=node.input,
+            outputs=node.output,
+            name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"),
+        )
+        new_node.domain = "com.microsoft"
+        self.nodes_to_remove.append(node)
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py
new file mode 100755
index 000000000..b856dd19d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py
@@ -0,0 +1,473 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+
+import numpy as np
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionGptAttentionPastBase(Fusion):
+    """Base class for GPT Attention Fusion with past state"""
+
+    def __init__(self, model: OnnxModel, num_heads: int):
+        super().__init__(model, "Attention", "LayerNormalization", "with past")
+        self.num_heads = num_heads
+        self.utils = FusionUtils(model)
+        self.casted_attention_mask = {}  # map from name of attention mask to the name that casted to int32
+
+    def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node):
+        # Pattern 1:
+        #                      {past}
+        #                    /        \
+        #                   /          \
+        #    Gather(axes=0, indices=0)  Gather(indices=1)
+        #      |                          |
+        #    Transpose (perm=0,1,3,2)     |
+        #      |                          |
+        #  Concat_k                     Concat_v
+        #      |                        /
+        #  Transpose (perm=0,1,3,2)    /
+        #      |                      /
+        #  Unsqueeze        Unsqueeze
+        #        \        /
+        #         \      /
+        #           Concat
+        #             |
+        #         {present}
+        gather = self.model.get_parent(concat_v, 0, output_name_to_node)
+        if gather.op_type != "Gather":
+            logger.debug("match_past_pattern_1: expect Gather for past")
+            return None
+
+        if not self.model.find_constant_input(gather, 1) == 1:
+            logger.debug("match_past_pattern_1: expect indices=1 for Gather of past")
+            return None
+        past = gather.input[0]
+
+        parent = self.model.get_parent(concat_k, 0, output_name_to_node)
+        if parent.op_type == "Gather":
+            gather_past_k = parent
+        else:
+            past_k_nodes = self.model.match_parent_path(concat_k, ["Transpose", "Gather"], [0, 0])
+            if past_k_nodes is None:
+                logger.debug("match_past_pattern_1: failed match Transpose and Gather")
+                return None
+            gather_past_k = past_k_nodes[-1]
+
+        if not self.model.find_constant_input(gather_past_k, 0) == 1:
+            logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past")
+            return None
+        past_k = gather_past_k.input[0]
+        if past != past_k:
+            logger.debug("match_past_pattern_1: expect past to be same")
+            return None
+
+        return past
+
+    def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node):
+        # Pattern 2:
+        #      Split (QKV)
+        #      / |   |
+        #     /  |   +----------------------+
+        #        |                          |
+        #        |         {past}           |
+        #        |           |              |
+        #      Reshape     Split         Reshape
+        #        |         /    \           |
+        # Transpose_k  Squeeze  Squeeze  Transpose_v
+        #        |      |        \        /
+        #        +------|---+     \      /
+        #               |   |      \    /
+        #              Concat_k   Concat_v
+        #               |            |
+        #          Unsqueeze    Unsqueeze
+        #                \       /
+        #                 Concat
+        #                   |
+        #               {present}
+        #
+        squeeze = self.model.get_parent(concat_v, 0, output_name_to_node)
+        if squeeze.op_type != "Squeeze":
+            logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v")
+            return None
+
+        split = self.model.get_parent(squeeze, 0, output_name_to_node)
+        if split.op_type != "Split":
+            logger.debug("match_past_pattern_2: expect Split for past path")
+            return None
+
+        opset_version = self.model.get_opset_version()
+        if opset_version < 13:
+            if not FusionUtils.check_node_attribute(squeeze, "axes", [0]):
+                logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path")
+                return None
+
+            if not FusionUtils.check_node_attribute(split, "split", [1, 1]):
+                logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
+                return None
+        else:
+            if not self.utils.check_node_input_value(squeeze, 1, [0]):
+                logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path")
+                return None
+
+            if not self.utils.check_node_input_value(split, 1, [1, 1]):
+                logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path")
+                return None
+
+        if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0):
+            logger.debug("match_past_pattern_2: attribute axis of Split are not expected in past path")
+            return None
+        past = split.input[0]
+
+        past_k_nodes = self.model.match_parent_path(concat_k, ["Squeeze", "Split"], [0, 0])
+        if past_k_nodes is None:
+            logger.debug("match_past_pattern_2: failed to match past_k_nodes path")
+            return None
+        past_k = past_k_nodes[-1].input[0]
+
+        if past != past_k:
+            logger.info("match_past_pattern_2: expect past to be same")
+            return None
+
+        return past
+
+    def match_present(self, concat_v, input_name_to_nodes):
+        unsqueeze_present_v = self.model.find_first_child_by_type(
+            concat_v, "Unsqueeze", input_name_to_nodes, recursive=False
+        )
+        if not unsqueeze_present_v:
+            logger.info("expect unsqueeze for present")
+            return None
+        concat_present = self.model.find_first_child_by_type(
+            unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False
+        )
+        if not concat_present:
+            logger.info("expect concat for present")
+            return None
+
+        present = concat_present.output[0]
+        return present
+
+    def cast_attention_mask(self, input_name):
+        if input_name in self.casted_attention_mask:
+            attention_mask_input_name = self.casted_attention_mask[input_name]
+        elif self.model.find_graph_input(input_name):
+            casted, attention_mask_input_name = self.utils.cast_graph_input_to_int32(input_name)
+            self.casted_attention_mask[input_name] = attention_mask_input_name
+        else:
+            attention_mask_input_name, cast_node = self.utils.cast_input_to_int32(input_name)
+            self.casted_attention_mask[input_name] = attention_mask_input_name
+        return attention_mask_input_name
+
+
+class FusionGptAttention(FusionGptAttentionPastBase):
+    """
+    Fuse GPT-2 Attention with past state subgraph into one Attention node.
+    """
+
+    def __init__(self, model: OnnxModel, num_heads: int) -> None:
+        super().__init__(model, num_heads)  # num_heads, utils and the mask-cast cache are all set by the base class
+
+    def create_attention_node(
+        self,
+        fc_weight,
+        fc_bias,
+        gemm_qkv,
+        past,
+        present,
+        input,
+        output,
+        mask,
+        is_unidirectional,
+    ):
+        attention_node_name = self.model.create_node_name("GptAttention")
+        attention_node = helper.make_node(
+            "Attention",
+            inputs=[input, fc_weight, fc_bias, mask, past],
+            outputs=[attention_node_name + "_output", present],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.microsoft"
+        attention_node.attribute.extend(
+            [
+                helper.make_attribute("num_heads", self.num_heads),
+                helper.make_attribute("unidirectional", 1 if is_unidirectional else 0),
+            ]
+        )
+
+        matmul_node = helper.make_node(
+            "MatMul",
+            inputs=[attention_node_name + "_output", gemm_qkv.input[1]],
+            outputs=[attention_node_name + "_matmul_output"],
+            name=attention_node_name + "_matmul",
+        )
+
+        add_node = helper.make_node(
+            "Add",
+            inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]],
+            outputs=[output],
+            name=attention_node_name + "_add",
+        )
+        self.nodes_to_add.extend([attention_node, matmul_node, add_node])
+        self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
+        self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name
+        self.node_name_to_graph_name[add_node.name] = self.this_graph_name
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        past = None
+        present = None
+        return_indice = []
+        qkv_nodes = self.model.match_parent_path(
+            normalize_node,
+            ["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"],
+            [0, None, 0, 0, 0, 0, 0],
+            output_name_to_node=output_name_to_node,
+            return_indice=return_indice,
+        )  # yapf: disable
+        if qkv_nodes is None:
+            return
+        (
+            add_qkv,
+            reshape_qkv,
+            gemm_qkv,
+            reshape_1,
+            reshape_2,
+            transpose_qkv,
+            matmul_qkv,
+        ) = qkv_nodes
+
+        another_input = add_qkv.input[1 - return_indice[0]]
+
+        v_nodes = self.model.match_parent_path(matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        (concat_v, transpose_v, reshape_v, split_fc) = v_nodes
+
+        fc_nodes = self.model.match_parent_path(
+            split_fc,
+            ["Reshape", "Gemm", "Reshape", "LayerNormalization"],
+            [0, 0, 0, 0],
+            output_name_to_node,
+        )
+        if fc_nodes is None:
+            fc_nodes = self.model.match_parent_path(
+                split_fc,
+                ["Add", "MatMul", "LayerNormalization"],
+                [0, None, 0],
+                output_name_to_node,
+            )
+            if fc_nodes is None:
+                logger.debug("fuse_attention: failed to match fc path")
+                return
+            fc_weight = fc_nodes[1].input[1]
+            i, _ = self.model.get_constant_input(fc_nodes[0])
+            fc_bias = fc_nodes[0].input[i]
+        else:
+            fc_weight = fc_nodes[1].input[1]
+            fc_bias = fc_nodes[1].input[2]
+
+        layernorm_before_attention = fc_nodes[-1]
+
+        if not another_input in layernorm_before_attention.input:
+            logger.debug("Add and LayerNormalization shall have one same input")
+            return
+
+        is_unidirectional = True
+        slice_mask = None
+        input_mask_nodes = None
+        concat_k_to_match = None
+        qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0])
+        if qk_nodes is not None:
+            (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes
+            mask_nodes = self.model.match_parent_path(
+                sub_qk,
+                [
+                    "Mul",
+                    "Sub",
+                    "Slice",
+                    "Slice",
+                    "Unsqueeze",
+                    "Sub",
+                    "Squeeze",
+                    "Slice",
+                    "Shape",
+                    "Div",
+                ],
+                [1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
+            )  # yapf: disable
+            if mask_nodes is None:
+                logger.debug("fuse_attention: failed to match unidirectional mask path")
+                return
+            div_mask = mask_nodes[-1]
+            slice_mask = mask_nodes[3]
+
+            if div_qk != div_mask:
+                logger.debug("fuse_attention: skip since div_qk != div_mask")
+                return
+        else:
+            # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0.
+            i, qk_nodes, _ = self.model.match_parent_paths(
+                matmul_qkv,
+                [
+                    (["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]),
+                    (["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]),
+                ],
+                output_name_to_node,
+            )
+            if qk_nodes is None:
+                logger.debug("fuse_attention: failed to match qk nodes")
+                return
+
+            where_qk = qk_nodes[-3]
+            div_qk = qk_nodes[-2]
+            matmul_qk = qk_nodes[-1]
+
+            if i == 1:
+                add_qk = qk_nodes[1]
+                _, input_mask_nodes, _ = self.model.match_parent_paths(
+                    add_qk,
+                    [
+                        (
+                            ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"],
+                            [None, 0, 1, 0, 0, 0],
+                        ),
+                        (
+                            ["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"],
+                            [None, 0, 1, 0, 0],
+                        ),
+                        (
+                            ["Mul", "Sub", "Unsqueeze", "Unsqueeze"],
+                            [None, 0, 1, 0],
+                        ),  # useless cast and reshape are removed.
+                    ],
+                    output_name_to_node,
+                )  # yapf: disable
+                if input_mask_nodes is None:
+                    logger.debug("fuse_attention: failed to match input attention mask path")
+                    return
+
+            mask_nodes = self.model.match_parent_path(
+                where_qk,
+                [
+                    "Cast",
+                    "Slice",
+                    "Slice",
+                    "Unsqueeze",
+                    "Sub",
+                    "Squeeze",
+                    "Slice",
+                    "Shape",
+                ],
+                [0, 0, 0, 1, 0, 0, 0, 0],
+                output_name_to_node,
+            )  # yapf: disable
+            if mask_nodes is None:
+                # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep.
+                logger.debug("fuse_attention: failed to match mask path")
+                return
+
+            slice_mask = mask_nodes[2]
+
+            div_or_concat = self.model.get_parent(mask_nodes[-1], 0, output_name_to_node)
+            if div_or_concat.op_type == "Div":
+                div_mask = div_or_concat
+                if div_qk != div_mask:
+                    logger.debug("fuse_attention: skip since div_qk != div_mask")
+                    return
+            elif div_or_concat.op_type == "Concat":
+                concat_k_to_match = div_or_concat
+            else:
+                logger.debug("fuse_attention: failed to match mask path")
+
+        # Validate that the mask data is either lower triangular (unidirectional) or all ones
+        mask_data = numpy_helper.to_array(self.model.get_initializer(slice_mask.input[0]))
+        if not (
+            len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1) and mask_data.shape[2] == mask_data.shape[3]
+        ):
+            logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW")
+            return
+        if np.allclose(mask_data, np.ones_like(mask_data)):
+            is_unidirectional = False
+        elif not np.allclose(mask_data, np.tril(np.ones_like(mask_data))):
+            logger.debug("fuse_attention: skip since mask is neither lower triangular nor ones")
+            return
+
+        q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0])
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        (transpose_q, reshape_q, split_q) = q_nodes
+        if split_fc != split_q:
+            logger.debug("fuse_attention: skip since split_fc != split_q")
+            return
+
+        k_nodes = self.model.match_parent_path(matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0])
+        if k_nodes is None:
+            # This pattern is from pytorch 1.7.1 and transformers 4.6.1
+            k_nodes = self.model.match_parent_path(
+                matmul_qk,
+                ["Transpose", "Concat", "Transpose", "Reshape", "Split"],
+                [1, 0, 1, 0, 0],
+            )
+            if k_nodes is None:
+                logger.debug("fuse_attention: failed to match k path")
+                return
+            else:
+                (_, concat_k, transpose_k, reshape_k, split_k) = k_nodes
+        else:
+            (concat_k, transpose_k, reshape_k, split_k) = k_nodes
+        if split_fc != split_k:
+            logger.debug("fuse_attention: skip since split_fc != split_k")
+            return
+
+        if concat_k_to_match and concat_k != concat_k_to_match:
+            logger.debug("fuse_attention: skip since concat_k != concat_k_to_match")
+            return
+
+        attention_mask_input_name = ""
+        if input_mask_nodes is not None:
+            input_name = input_mask_nodes[-1].input[0]
+            attention_mask_input_name = self.cast_attention_mask(input_name)
+
+        # Match past and present paths
+        past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or self.match_past_pattern_2(
+            concat_k, concat_v, output_name_to_node
+        )
+        if past is None:
+            logger.info("fuse_attention: failed to match past path")
+            return
+        if not self.model.find_graph_input(past):
+            logger.debug("past is not graph input.")
+            # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input.
+
+        present = self.match_present(concat_v, input_name_to_nodes)
+        if present is None:
+            logger.info("fuse_attention: failed to match present path")
+            return
+        if not self.model.find_graph_output(present):
+            logger.info("expect present to be graph output")
+            return
+
+        self.create_attention_node(
+            fc_weight,
+            fc_bias,
+            gemm_qkv,
+            past,
+            present,
+            layernorm_before_attention.output[0],
+            reshape_qkv.output[0],
+            attention_mask_input_name,
+            is_unidirectional,
+        )
+
+        # we rely on prune_graph() to clean old subgraph nodes:
+        # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv]
+        self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py
new file mode 100755
index 000000000..8510ae429
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py
@@ -0,0 +1,292 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+
+import numpy as np
+from .fusion_base import Fusion
+from .fusion_gpt_attention import FusionGptAttentionPastBase
+from .fusion_utils import FusionUtils
+from onnx import TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+def is_close(value, expected_value):
+    return abs(value - expected_value) <= 1e-6
+
+
+class FusionGptAttentionMegatron(FusionGptAttentionPastBase):
+    """
+    Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node.
+    """
+
+    def __init__(self, model: OnnxModel, num_heads: int) -> None:
+        super().__init__(model, num_heads)  # pure pass-through to FusionGptAttentionPastBase
+
+    def fuse_attention_node(
+        self,
+        matmul_before_split,
+        add_before_split,
+        past,
+        present,
+        input,
+        reshape_qkv,
+        mask,
+    ):
+        attention_node_name = self.model.create_node_name("GptAttention")
+        int32_mask = self.cast_attention_mask(mask)
+        output = reshape_qkv.output[0]
+        i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0
+        attention_node = helper.make_node(
+            "Attention",
+            inputs=[
+                input,
+                matmul_before_split.input[1],
+                add_before_split.input[i],
+                int32_mask,
+                past,
+            ],
+            outputs=[output, present],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.microsoft"
+        attention_node.attribute.extend(
+            [
+                helper.make_attribute("num_heads", self.num_heads),
+                helper.make_attribute("unidirectional", 0),  # unidirectional shall not be ON for 4D attention mask
+            ]
+        )
+
+        nodes_to_add = [attention_node]
+        self.nodes_to_add.extend(nodes_to_add)
+
+        for node in nodes_to_add:
+            self.node_name_to_graph_name[node.name] = self.this_graph_name
+
+        self.nodes_to_remove.append(reshape_qkv)
+
+        # we rely on prune_graph() to clean old subgraph nodes
+        self.prune_graph = True
+
+    def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention):
+        mask_nodes = self.model.match_parent_path(
+            sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0]
+        )  # yapf: disable
+        if mask_nodes is None:
+            logger.debug("fuse_attention: failed to match unidirectional mask path")
+            return None
+        (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes
+
+        if mul_qk.input[1] != last_slice_mask.output[0]:
+            logger.debug("fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]")
+            return None
+
+        if not self.utils.check_node_input_value(mul_mask, 1, 10000.0):
+            logger.debug("fuse_attention failed: mul_mask input 1 is not constant 10000.0")
+            return None
+
+        if not self.utils.check_node_input_value(sub_mask, 0, 1.0):
+            logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0")
+            return None
+
+        if not self.model.find_graph_input(slice_mask.input[0]):
+            logger.info("expect slick_mask input 0 to be graph input")
+            return None
+
+        if not self.utils.check_node_input_value(last_slice_mask, 1, [0]):
+            logger.debug("fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]")
+            return None
+
+        if not self.utils.check_node_input_value(last_slice_mask, 3, [3]):
+            logger.debug("fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]")
+            return False
+
+        if not self.utils.check_node_input_value(last_slice_mask, 4, [1]):
+            logger.debug("fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]")
+            return False
+
+        if not self.utils.check_node_input_value(slice_mask, 3, [2]):
+            logger.debug("fuse_attention failed: slice_mask input 3 (axes) is not constant [2]")
+            return None
+
+        if not self.utils.check_node_input_value(slice_mask, 4, [1]):
+            logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]")
+            return None
+
+        last_slice_path = self.model.match_parent_path(
+            last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
+        )
+        if last_slice_path is None or last_slice_path[-1] != matmul_qk:
+            logger.debug("fuse_attention: failed to match last slice path")
+            return None
+
+        first_slice_path = self.model.match_parent_path(
+            slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0]
+        )
+        if first_slice_path is None or first_slice_path[-1] != matmul_qk:
+            logger.debug("fuse_attention: failed to match first slice path")
+            return None
+
+        first_slice_sub = self.model.match_parent_path(
+            slice_mask,
+            ["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"],
+            [1, 0, 0, 0, 0],
+        )
+        if first_slice_sub is None or first_slice_sub[-1] != matmul_qk:
+            logger.debug("fuse_attention: failed to match last slice sub path")
+            return None
+
+        first_slice_sub_1 = self.model.match_parent_path(
+            slice_mask,
+            ["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"],
+            [1, 0, 1, 0, 0],
+        )
+        if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention:
+            logger.debug("fuse_attention: failed to match last slice sub path 1")
+            return None
+
+        return slice_mask.input[0]
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        past = None
+        present = None
+
+        qkv_nodes = self.model.match_parent_path(
+            normalize_node,
+            ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"],
+            [0, 1, None, 0, 0, 0],
+            output_name_to_node=output_name_to_node,
+        )  # yapf: disable
+        if qkv_nodes is None:
+            return
+        (
+            add_skip,
+            add_after_attention,
+            matmul_after_attention,
+            reshape_qkv,
+            transpose_qkv,
+            matmul_qkv,
+        ) = qkv_nodes
+
+        skip_input = add_skip.input[0]
+
+        v_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            [
+                "Concat",
+                "Transpose",
+                "Reshape",
+                "Split",
+                "Add",
+                "MatMul",
+                "LayerNormalization",
+            ],
+            [1, 1, 0, 0, 0, None, 0],
+        )  # yapf: disable
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        (
+            concat_v,
+            transpose_v,
+            reshape_v,
+            split_v,
+            add_before_split,
+            matmul_before_split,
+            layernorm_before_attention,
+        ) = v_nodes
+        if skip_input != layernorm_before_attention.input[0]:
+            logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]")
+            return
+
+        qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0])
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return None
+        (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes
+        if self.model.get_node_attribute(softmax_qk, "axis") != 3:
+            logger.debug("fuse_attention failed: softmax_qk axis != 3")
+            return None
+
+        attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention)
+
+        q_nodes = self.model.match_parent_path(matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0])
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        (div_q, transpose_q, reshape_q, split_q) = q_nodes
+        if split_v != split_q:
+            logger.debug("fuse_attention: skip since split_v != split_q")
+            return
+
+        k_nodes = self.model.match_parent_path(
+            matmul_qk,
+            ["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"],
+            [1, 0, 0, 1, 0, 0],
+        )
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes
+        if split_v != split_k:
+            logger.debug("fuse_attention: skip since split_v != split_k")
+            return
+
+        i, value = self.model.get_constant_input(reshape_k)
+        if not (
+            isinstance(value, np.ndarray)
+            and list(value.shape) == [4]
+            and value[0] == 0
+            and value[1] == 0
+            and value[2] > 0
+            and value[3] > 0
+        ):
+            logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]")
+            return
+
+        num_heads = value[2]
+        if num_heads != self.num_heads:
+            logger.info(f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}")
+            self.num_heads = num_heads
+
+        hidden_size_per_head = value[3]
+        i, value = self.model.get_constant_input(div_k)
+        expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head)))
+        if not is_close(value, expected_value):
+            logger.debug(f"fuse_attention: div_k value={value} expected={expected_value}")
+            return
+
+        i, value = self.model.get_constant_input(div_q)
+        if not is_close(value, expected_value):
+            logger.debug(f"fuse_attention: div_q value={value} expected={expected_value}")
+            return
+
+        # Match past and present paths
+        past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node)
+        if past is None:
+            logger.debug("fuse_attention: match past failed")
+            return
+        if not self.model.find_graph_input(past):
+            logger.debug("fuse_attention: past is not graph input.")
+            # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input.
+
+        present = self.match_present(concat_v, input_name_to_nodes)
+        if present is None:
+            logger.debug("fuse_attention: match present failed")
+            return
+        if not self.model.find_graph_output(present):
+            logger.info("fuse_attention: expect present to be graph output")
+            return
+
+        self.fuse_attention_node(
+            matmul_before_split,
+            add_before_split,
+            past,
+            present,
+            layernorm_before_attention.output[0],
+            reshape_qkv,
+            attention_mask,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py
new file mode 100755
index 000000000..ca88f144f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py
@@ -0,0 +1,252 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import math
+from logging import getLogger
+from typing import Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionGptAttentionNoPast(Fusion):
+    """
+    Fuse GPT-2 Attention without past state into one Attention node.
+    This does not support attention_mask graph input right now.
+    """
+
+    def __init__(self, model: OnnxModel) -> None:
+        super().__init__(
+            model,
+            "CustomQKVToContextPluginDynamic_IxRT",
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
+            "without past",
+        )
+        self.where_qk_shared = None  # Where node of the first fused match; fuse() reuses it for every fused node
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, div: NodeProto
+    ) -> Tuple[int, int]:
+        """Return (num_heads, hidden_size) for the attention subgraph being fused.
+
+        head_dim is the rounded square of the constant in div.input[1] (presumably sqrt(head_dim));
+        hidden_size is dims[0] of the "W" tensor attribute of custom_fc.
+        Raises ValueError when the Div constant or the "W" attribute cannot be read.
+        """
+        div_initializer = self.model.get_initializer(div.input[1])
+        if div_initializer is None:
+            raise ValueError("Data not found in the div_initializer")
+        # to_array decodes every storage form (float_data, raw_data, ...) with the tensor's own dtype.
+        div_values = numpy_helper.to_array(div_initializer).flatten()
+        if div_values.size == 0:
+            raise ValueError("Data not found in the div_initializer")
+        div_value = float(div_values[0])
+
+        weight_dims = next((list(attr.t.dims) for attr in custom_fc.attribute if attr.name == "W"), None)
+        if not weight_dims:
+            raise ValueError("W attribute not found in the custom FC node")
+        # Round instead of ceil: float error in sqrt(head_dim) ** 2 must not bump head_dim by one.
+        head_dim = int(round(div_value * div_value))
+        hidden_size = weight_dims[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        where_qk: NodeProto,
+    ) -> Union[NodeProto, None]:
+        """Build the fused CustomQKVToContextPluginDynamic_IxRT node in the "com.iluvatar" domain.
+
+        `input` is always the first node input. When `where_qk` is given, its first output is appended
+        as a second input and has_mask / has_qk_bias are set to 1; otherwise both attributes are 0.
+        """
+        attention_inputs = [input]
+        # Default both flags to 0: they used to be assigned only when where_qk was present, so a
+        # call without a mask failed with UnboundLocalError while building the attributes.
+        has_mask = has_qk_bias = 0
+        if where_qk is not None:
+            has_mask = has_qk_bias = 1
+            attention_inputs.append(where_qk.output[0])
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=self.model.create_node_name("Attention"),
+        )
+        attention_node.domain = "com.iluvatar"
+        # (name, value) pairs, listed in the order the attributes are attached to the node.
+        attributes = [
+            ("type_id", 2), ("num_heads", num_heads), ("hidden_size", hidden_size),
+            ("has_mask", has_mask), ("plugin_namespace", ""), ("plugin_version", "1"),
+            ("has_qk_bias", has_qk_bias),
+        ]
+        attention_node.attribute.extend([helper.make_attribute(key, value) for key, value in attributes])
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Fuse the GPT-2 (no past state) attention subgraph feeding `normalize_node` into one plugin node.
+
+        Queues the new node in nodes_to_add and the replaced ones in nodes_to_remove; the first match
+        also rewires its Where node, which is kept as where_qk_shared and reused by later matches.
+        """
+        return_indice = []
+        add_qkv = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                add_qkv = add_before_layernorm
+
+        qkv_paths = {
+            "path1": (
+                ["CustomFCPluginDynamic_IxRT", "Reshape", "Transpose", "MatMul"],
+                [None, 0, 0, 0],
+            ),
+            "path2": (
+                ["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"],
+                [None, 0, 0],
+            ),
+        }
+
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(
+            add_qkv,
+            qkv_paths,
+            output_name_to_node,
+            return_indice,
+        )  # yapf: disable
+
+        if qkv_nodes is None:
+            return
+        reshape_2 = None
+        if qkv_path == "path1":
+            (
+                custom_fc_after_attention,
+                reshape_2,
+                transpose_qkv,
+                matmul_qkv,
+            ) = qkv_nodes
+        else:
+            (
+                custom_fc_after_attention,
+                transpose_qkv,
+                matmul_qkv,
+            ) = qkv_nodes
+
+        another_input = add_qkv.input[1 - return_indice[0]]
+
+        v_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            ["Transpose", "Reshape", "Split", "CustomFCPluginDynamic_IxRT"],
+            [1, 0, 0, 0],
+        )  # yapf: disable
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        (transpose_v, reshape_v, split_v, custom_fc_before_attention) = v_nodes
+
+        layernorm_before_attention = self.model.get_parent(
+            custom_fc_before_attention, 0, output_name_to_node
+        )
+        if layernorm_before_attention is None:
+            # No producer found: return here instead of failing on `.op_type` of None below.
+            logger.debug("failed to get layernorm before gemm. Got None")
+            return
+        if layernorm_before_attention.op_type not in ("LayerNormalization", "Add"):
+            logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
+            return
+
+        # another_input must be consumed by, or (openai-gpt) produced by, the node before the FC.
+        shares_input = another_input in layernorm_before_attention.input
+        if not shares_input and another_input not in layernorm_before_attention.output:
+            logger.debug("Add and LayerNormalization shall have one same input")
+            return
+
+        qk_nodes = self.model.match_parent_path(
+            matmul_qkv, ["Softmax", "Add", "Where", "Div", "MatMul"], [0, None, 0, 1, 0]
+        )
+        if qk_nodes is None:
+            # Nothing to fuse without the QK path; return before matmul_qk (None) is matched against.
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+        (softmax_qk, add_qk, where_qk, div_qk, matmul_qk) = qk_nodes
+        mask_return_indices = []
+        mask_nodes = self.model.match_parent_path(
+            add_qk,
+            ["Mul", "Sub", "Cast", "Unsqueeze"],
+            [None, 0, 1, 0],
+            return_indice=mask_return_indices,
+        )  # yapf: disable
+        if mask_nodes is None:
+            logger.debug("fuse_attention: failed to match mask path")
+            return
+
+        q_nodes = self.model.match_parent_path(
+            matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0]
+        )
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        (transpose_q, reshape_q, split_q) = q_nodes
+        if split_v != split_q:
+            logger.debug("fuse_attention: skip since split_v != split_q")
+            return
+
+        k_nodes = self.model.match_parent_path(
+            matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0]
+        )
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        (transpose_k, reshape_k, split_k) = k_nodes
+        if split_v != split_k:
+            logger.debug("fuse_attention: skip since split_v != split_k")
+            return
+
+        if self.where_qk_shared is None:
+            # First match: rewire this Where node so it can serve as the shared second input of every
+            # fused attention node (see create_attention_node); it is therefore not removed here.
+            where_qk.input[1] = mask_nodes[0].output[0]
+            div_qk.output[0] = where_qk.output[0]
+            add_qk.input[1 - mask_return_indices[0]] = div_qk.output[0]
+            self.where_qk_shared = where_qk
+            self.nodes_to_remove.extend([softmax_qk, add_qk, div_qk, matmul_qk])
+        else:
+            # Later matches reuse the shared Where node, so their own Where is removed too.
+            self.nodes_to_remove.extend(
+                [softmax_qk, add_qk, where_qk, div_qk, matmul_qk]
+            )
+
+        num_heads, hidden_size = self.get_num_heads_and_hidden_size(
+            custom_fc_after_attention, div_qk
+        )
+        new_node = self.create_attention_node(
+            num_heads,
+            hidden_size,
+            custom_fc_before_attention.output[0],
+            transpose_qkv.output[0] if reshape_2 is None else reshape_2.output[0],
+            self.where_qk_shared,
+        )
+
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+        if reshape_2 is not None:
+            self.nodes_to_remove.extend([reshape_2])
+        self.nodes_to_remove.extend([transpose_qkv, matmul_qkv])
+        self.nodes_to_remove.extend(q_nodes)
+        self.nodes_to_remove.extend(k_nodes)
+        # v_nodes[-1] is custom_fc_before_attention, whose output feeds the fused node, so it stays.
+        self.nodes_to_remove.extend(v_nodes[:-1])
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
new file mode 100755
index 000000000..922afa78a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
@@ -0,0 +1,296 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import Dict
+
+from .fusion_base import Fusion
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionLayerNormalization(Fusion):
+    def __init__(self, model: OnnxModel) -> None:
+        super().__init__(model, "LayerNormalization", "ReduceMean")
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        """
+        Fuse Layer Normalization subgraph into one node LayerNormalization:
+              +----------------------+
+              |                      |
+              |                      v
+          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
+                     (axis=2 or -1)  |      (Y=2)   (axis=2 or -1)  (E-6 or E-12 or 0)    ^
+                                     |                                               |
+                                     +-----------------------------------------------+
+
+         It also handles cases of duplicated sub nodes exported from older version of PyTorch:
+              +----------------------+
+              |                      v
+              |           +-------> Sub-----------------------------------------------+
+              |           |                                                           |
+              |           |                                                           v
+          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div  --> Mul --> Add
+              |                      ^
+              |                      |
+              +----------------------+
+        """
+        children = self.model.get_children(node, input_name_to_nodes)
+        if len(children) == 0 or len(children) > 2:
+            return
+
+        root_input = node.input[0]
+
+        if children[0].op_type != "Sub" or children[0].input[0] != root_input:
+            return
+
+        if len(children) == 2 and (children[1].op_type != "Sub" or children[1].input[0] != root_input):
+            return
+
+        div_node = None
+        for child in children:
+            div_node = self.model.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
+            if div_node is not None:
+                break
+        if div_node is None:
+            return
+
+        path_id, parent_nodes, _ = self.model.match_parent_paths(
+            div_node,
+            [
+                (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
+                (["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], [1, 0, 0, 0, 0, 0]),
+            ],
+            output_name_to_node,
+        )
+        if path_id < 0:
+            return
+
+        # The Sub ending the variance path must be one of the Sub children of the first ReduceMean.
+        sub_node = parent_nodes[-1]
+        if sub_node not in children:
+            return
+
+        # Only a small positive constant on the Add before Sqrt is accepted as epsilon.
+        second_add_node = parent_nodes[1]
+        i, add_weight = self.model.get_constant_input(second_add_node)
+        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
+            logger.warning(f"epsilon value is not expeced: {add_weight}")
+            return
+
+        pow_node = parent_nodes[3]
+        if self.model.find_constant_input(pow_node, 2.0) != 1:
+            return
+
+        # .get() keeps a Div / Mul output without consumers from raising KeyError: just skip.
+        div_consumers = input_name_to_nodes.get(div_node.output[0])
+        if not div_consumers or div_consumers[0].op_type != "Mul":
+            return
+        mul_node = div_consumers[0]
+
+        mul_consumers = input_name_to_nodes.get(mul_node.output[0])
+        if not mul_consumers or mul_consumers[0].op_type != "Add":
+            return
+        last_add_node = mul_consumers[0]
+
+        subgraph_nodes = [node]
+        subgraph_nodes.extend(children)
+        subgraph_nodes.extend(parent_nodes[:-1])
+        subgraph_nodes.extend([last_add_node, mul_node, div_node])
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes,
+            last_add_node.output,
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            logger.debug("It is not safe to fuse LayerNormalization node. Skip")
+            return
+
+        weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)]
+        if not self.model.is_constant_with_specified_dimension(weight_input, 1, "layernorm weight"):
+            return
+
+        bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)]
+        if not self.model.is_constant_with_specified_dimension(bias_input, 1, "layernorm bias"):
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+
+        normalize_node = helper.make_node(
+            "LayerNormalization",
+            inputs=[node.input[0], weight_input, bias_input],
+            outputs=[last_add_node.output[0]],
+            name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
+        )
+        normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))])
+        self.nodes_to_add.append(normalize_node)
+        self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
+
+
+class FusionLayerNormalizationTF(Fusion):
+    def __init__(self, model: OnnxModel) -> None:
+        super().__init__(model, "LayerNormalization", "Add", "TF")
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        """
+         Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
+          +------------------------------------+
+          |                                    |
+          |                                    |
+        (Cast_1)                               |
+          |                                    |
+          |                                    v                                           (B)                             (B)             (A)
+         Add --> (Cast_1) --> ReduceMean -->  Sub  --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocal --> Mul --> Mul --> Sub --> Add
+          |                       |                                                                                         |       ^              ^
+          |                       |                                                                                         |       |              |
+          |                       +--------------------------------------------------(Cast_2)-------------------------------|-------+              |
+          |                                                                                                                 v                      |
+          +---------------------------------------------------------------------------------------------------------------> Mul--------------------+
+        """
+        _, parent_nodes, return_indice = self.model.match_parent_paths(
+            node,
+            [
+                (
+                    [
+                        "Sub",
+                        "Mul",
+                        "Mul",
+                        "Reciprocal",
+                        "Sqrt",
+                        "Add",
+                        "ReduceMean",
+                        "Mul",
+                        "Sub",
+                        "ReduceMean",
+                    ],
+                    [1, 1, None, 0, 0, 0, None, 0, 0, None],
+                ),
+                (
+                    [
+                        "Sub",
+                        "Mul",
+                        "Mul",
+                        "Reciprocal",
+                        "Sqrt",
+                        "Add",
+                        "Cast",
+                        "ReduceMean",
+                        "Mul",
+                        "Sub",
+                        "ReduceMean",
+                    ],
+                    [1, 1, None, 0, 0, 0, 0, None, 0, 0, None],
+                ),
+            ],
+            output_name_to_node,
+        )  # yapf: disable
+
+        if parent_nodes is None:
+            return
+
+        # Presumably one index is reported per `None` position of the matched path (three in each).
+        assert len(return_indice) == 3
+        if not (return_indice[0] in [0, 1] and return_indice[1] in [0, 1] and return_indice[2] in [0, 1]):
+            logger.debug(f"return indice is exepected in [0, 1], but got {return_indice}")
+            return
+
+        (
+            sub_node_0,
+            mul_node_0,
+            mul_node_1,
+            reciprocal_node,
+            sqrt_node,
+            add_node_0,
+        ) = parent_nodes[:6]
+        reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:]
+
+        cast_node_3 = None
+        if len(parent_nodes) == 11:
+            cast_node_3 = parent_nodes[6]
+            assert cast_node_3.op_type == "Cast"
+
+        mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node)
+        if mul_node_3 is None:
+            logger.debug("mul_node_3 not found")
+            return
+
+        # With the Cast variant the root is one node further up; a missing producer means no match.
+        node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node)
+        root_node = node_before_reduce
+        if cast_node_3 is not None and node_before_reduce is not None:
+            root_node = self.model.get_parent(node_before_reduce, 0, output_name_to_node)
+        if root_node is None:
+            logger.debug("root node is none")
+            return
+
+        i, epsilon = self.model.get_constant_input(add_node_0)
+        if epsilon is None or epsilon <= 0 or (epsilon > 1.0e-5 and cast_node_3 is None):
+            logger.debug("epsilon is not matched")
+            return
+
+        if cast_node_3 is None and (
+            reduce_mean_node_1.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
+        ):
+            logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
+            return
+
+        if cast_node_3 is not None and (
+            node_before_reduce.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
+        ):
+            logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
+            return
+
+        # mul_node_2 must multiply one tensor by itself (the squaring step between Sub and ReduceMean).
+        if mul_node_2.input[0] != mul_node_2.input[1]:
+            logger.debug("mul_node_2 shall have two same inputs")
+            return
+
+        subgraph_nodes = [
+            node,
+            sub_node_0,
+            mul_node_0,
+            mul_node_1,
+            reciprocal_node,
+            sqrt_node,
+            add_node_0,
+            reduce_mean_node_0,
+            mul_node_2,
+            sub_node_1,
+            reduce_mean_node_1,
+            mul_node_3,
+        ]
+
+        if cast_node_3 is not None:
+            cast_node_2 = self.model.match_parent(mul_node_0, "Cast", 0, output_name_to_node)
+            if cast_node_2 is None:
+                logger.debug("cast_node_2 not found")
+                return
+            subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3])
+
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes,
+            node.output,
+            self.model.input_name_to_nodes(),
+            self.model.output_name_to_node(),
+        ):
+            logger.debug("not safe to fuse layer normalization")
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+
+        weight_input = mul_node_1.input[1]
+        bias_input = sub_node_0.input[0]
+
+        # The epsilon constant read from add_node_0 is carried over as the fused node's attribute.
+        fused_node = helper.make_node(
+            "LayerNormalization",
+            inputs=[mul_node_3.input[0], weight_input, bias_input],
+            outputs=[node.output[0]],
+            name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
+        )
+        fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))])
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
new file mode 100755
index 000000000..af315ce4f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
@@ -0,0 +1,167 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from argparse import ArgumentParser
+
+
+class AttentionMaskFormat:  # integer codes stored in FusionOptions.attention_mask_format
+    MaskIndexEnd = 0  # chosen by FusionOptions.use_raw_attention_mask(False), i.e. --use_mask_index
+    MaskIndexEndAndStart = 1  # not selected anywhere in this module
+    AttentionMask = 2  # raw attention mask; the FusionOptions default
+    NoMask = 3  # chosen by FusionOptions.disable_attention_mask(), i.e. --no_attention_mask
+
+
+class FusionOptions:
+    """Options of fusion in graph optimization"""
+
+    def __init__(self, model_type):
+        """Set every fusion switch to its default, adjusted for `model_type`.
+
+        "gpt2" turns skip-layer-norm fusion off and gpt2_classify on; "swint" and "roformer" each
+        turn on their own pass; any other value keeps the plain defaults.
+        """
+        is_gpt2 = model_type == "gpt2"
+
+        self.enable_gelu = True
+        self.enable_layer_norm = True
+        self.enable_attention = True
+        self.enable_skip_layer_norm = not is_gpt2
+        self.enable_embed_layer_norm = True
+        self.enable_bias_skip_layer_norm = True
+        self.enable_bias_gelu = True
+        self.enable_gelu_approximation = False
+        self.enable_qordered_matmul = True
+        self.enable_shape_inference = True
+        self.attention_mask_format = AttentionMaskFormat.AttentionMask
+
+        # Model specific switches, each tied to exactly one model_type value.
+        self.enable_gpt2_classify = is_gpt2
+        self.enable_swint_opt = model_type == "swint"
+        self.enable_format_roformer = model_type == "roformer"
+
+    def use_raw_attention_mask(self, use_raw_mask=True):
+        """Select the AttentionMask format, or MaskIndexEnd when `use_raw_mask` is falsy."""
+        self.attention_mask_format = (
+            AttentionMaskFormat.AttentionMask if use_raw_mask else AttentionMaskFormat.MaskIndexEnd
+        )
+
+    def disable_attention_mask(self) -> None:
+        self.attention_mask_format = AttentionMaskFormat.NoMask
+
+    @staticmethod
+    def parse(args):
+        """Build the FusionOptions for `args.model_type`, then apply the switches found in `args`.
+
+        `args` must expose `model_type` and every flag that add_arguments registers.
+        """
+        options = FusionOptions(args.model_type)
+        switches = (
+            ("disable_gelu", "enable_gelu", False),
+            ("disable_layer_norm", "enable_layer_norm", False),
+            ("disable_attention", "enable_attention", False),
+            ("disable_skip_layer_norm", "enable_skip_layer_norm", False),
+            ("disable_embed_layer_norm", "enable_embed_layer_norm", False),
+            ("disable_bias_skip_layer_norm", "enable_bias_skip_layer_norm", False),
+            ("disable_bias_gelu", "enable_bias_gelu", False),
+            ("enable_gelu_approximation", "enable_gelu_approximation", True),
+            ("disable_shape_inference", "enable_shape_inference", False),
+        )
+        for argument_name, option_name, value_when_set in switches:
+            if getattr(args, argument_name):
+                setattr(options, option_name, value_when_set)
+        if args.use_mask_index:
+            options.use_raw_attention_mask(False)
+        if args.no_attention_mask:
+            options.disable_attention_mask()
+        return options
+
+    @staticmethod
+    def add_arguments(parser: ArgumentParser) -> None:  # each switch: optional store_true flag, default False
+        parser.add_argument(
+            "--disable_attention",
+            required=False,
+            action="store_true",
+            help="disable Attention fusion",
+        )
+        parser.set_defaults(disable_attention=False)
+
+        parser.add_argument(
+            "--disable_skip_layer_norm",
+            required=False,
+            action="store_true",
+            help="disable SkipLayerNormalization fusion",
+        )
+        parser.set_defaults(disable_skip_layer_norm=False)
+
+        parser.add_argument(
+            "--disable_embed_layer_norm",
+            required=False,
+            action="store_true",
+            help="disable EmbedLayerNormalization fusion",
+        )
+        parser.set_defaults(disable_embed_layer_norm=False)
+
+        parser.add_argument(
+            "--disable_bias_skip_layer_norm",
+            required=False,
+            action="store_true",
+            help="disable Add Bias and SkipLayerNormalization fusion",
+        )
+        parser.set_defaults(disable_bias_skip_layer_norm=False)
+
+        parser.add_argument(
+            "--disable_bias_gelu",
+            required=False,
+            action="store_true",
+            help="disable Add Bias and Gelu/FastGelu fusion",
+        )
+        parser.set_defaults(disable_bias_gelu=False)
+
+        parser.add_argument(
+            "--disable_layer_norm",
+            required=False,
+            action="store_true",
+            help="disable LayerNormalization fusion",
+        )
+        parser.set_defaults(disable_layer_norm=False)
+
+        parser.add_argument(
+            "--disable_gelu",
+            required=False,
+            action="store_true",
+            help="disable Gelu fusion",
+        )
+        parser.set_defaults(disable_gelu=False)
+
+        parser.add_argument(
+            "--enable_gelu_approximation",
+            required=False,
+            action="store_true",
+            help="enable Gelu/BiasGelu to FastGelu conversion",
+        )
+        parser.set_defaults(enable_gelu_approximation=False)
+
+        parser.add_argument(
+            "--disable_shape_inference",
+            required=False,
+            action="store_true",
+            help="disable symbolic shape inference",
+        )
+        parser.set_defaults(disable_shape_inference=False)
+
+        parser.add_argument(
+            "--use_mask_index",
+            required=False,
+            action="store_true",
+            help="use mask index instead of raw attention mask in attention operator",
+        )
+        parser.set_defaults(use_mask_index=False)
+
+        parser.add_argument(
+            "--no_attention_mask",
+            required=False,
+            action="store_true",
+            help="no attention mask. Only works for model_type=bert",
+        )
+        parser.set_defaults(no_attention_mask=False)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py
new file mode 100755
index 000000000..b9b502acb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py
@@ -0,0 +1,421 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Tuple
+
+import numpy as np
+from .fusion_attention import AttentionMask
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils, NumpyHelper
+from onnx import NodeProto, helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionQOrderedAttention(Fusion):
+    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, attention_mask: AttentionMask):
+        """Create the QOrderedAttention fusion, searched from QOrderedLayerNormalization nodes.
+
+        hidden_size and num_heads are the user-specified values, used when they cannot be read from the graph.
+        """
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.attention_mask = attention_mask
+        # One-shot warning switches read by get_num_heads_and_hidden_size(). They were never
+        # initialized, so the first detected/user-specified mismatch raised AttributeError.
+        self.num_heads_warning = self.hidden_size_warning = True
+        super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization")
+
+    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
+        """Derive num_heads and hidden_size from the shape operand of Q's Reshape node.
+
+        Reshape fusion is assumed to have run already, so the shape should be a constant
+        like [0, 0, num_heads, head_size]. Whenever it cannot be read in that form, the
+        user-specified values stored on this instance are returned unchanged.
+
+        Args:
+            reshape_q (NodeProto): reshape node for Q
+        Returns:
+            Tuple[int, int]: num_heads and hidden_size
+        """
+        user_specified = (self.num_heads, self.hidden_size)
+        shape_input = reshape_q.input[1]
+        shape_tensor = self.model.get_initializer(shape_input)
+        if shape_tensor is None:
+            logger.debug(f"{shape_input} is not initializer.")
+            # The shape may instead be produced by a Constant node feeding the Reshape.
+            # TODO: Investigate why FusionAttention doesn't have such logic
+            constant_path = self.model.match_parent_path(reshape_q, ["Constant"], [1])
+            if constant_path is None:
+                return user_specified
+
+            shape_constant = constant_path[0]
+            if len(shape_constant.attribute) != 1:
+                return user_specified
+
+            # The single attribute is taken to be the tensor-valued one.
+            shape_tensor = shape_constant.attribute[0].t
+
+        dims = NumpyHelper.to_array(shape_tensor)
+        if len(dims) != 4 or dims[2] <= 0 or dims[3] <= 0:
+            logger.debug(f"q_shape_value={dims}. Expected value are like [0, 0, num_heads, head_size].")
+            return user_specified
+
+        num_heads, head_size = dims[2], dims[3]
+        hidden_size = num_heads * head_size
+
+        # Report each kind of disagreement with the user-specified values at most once.
+        if self.num_heads > 0 and num_heads != self.num_heads and self.num_heads_warning:
+            logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
+            self.num_heads_warning = False
+
+        if self.hidden_size > 0 and hidden_size != self.hidden_size and self.hidden_size_warning:
+            logger.warning(
+                f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
+            )
+            self.hidden_size_warning = False
+
+        return num_heads, hidden_size
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Fuse the Q/DQ attention subgraph feeding normalize_node into a single QOrderedAttention node.
+
+        The pattern is matched backwards from the Add in front of normalize_node. Whenever a part
+        of it does not match, the method returns before any change is made to the graph.
+        """
+        add_before_layernorm = self.model.match_parent_path(normalize_node, ["QuantizeLinear", "Add"], [0, 0])
+        if add_before_layernorm is None:
+            return
+
+        start_node = add_before_layernorm[-1]
+
+        # Input QDQ nodes
+        dequantize_input = self.model.match_parent_path(
+            start_node,
+            ["DequantizeLinear"],
+            [None],
+        )
+
+        if dequantize_input is None:
+            logger.debug("fuse_qordered_attention: failed to match input qdq nodes path")
+            return
+
+        dequantize_input = dequantize_input[-1]
+
+        # QKV nodes
+        qkv_nodes = self.model.match_parent_path(
+            start_node,
+            ["Add", "MatMul", "Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear", "MatMul"],
+            [None, None, 0, 0, 0, 0, 0],
+        )
+
+        if qkv_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match qkv path")
+            return
+
+        (_, projection_matmul, reshape_qkv, transpose_qkv, dequantize_qkv, quantize_qkv, matmul_qkv) = qkv_nodes
+
+        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model):
+            return
+
+        # Identify the root input to the Attention node: the single operand of the Add that is
+        # produced by another node and is not the output of the attention branch itself.
+        attention_branch_output = qkv_nodes[0].output[0]
+
+        other_inputs = [
+            name
+            for name in start_node.input
+            if name in output_name_to_node and name != attention_branch_output
+        ]
+
+        # Anything other than exactly one candidate means this is not the expected pattern
+        if len(other_inputs) != 1:
+            return
+
+        root_input = other_inputs[0]
+
+        # V nodes
+        v_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
+            [1, 0, 0, 0, 0, None],
+        )
+
+        if v_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match v path")
+            return
+
+        (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes
+
+        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model):
+            return
+
+        # V MatMul weight
+        dequantize_v_matmul_weight = self.model.match_parent_path(matmul_v, ["DequantizeLinear"], [1])
+
+        if dequantize_v_matmul_weight is None:
+            logger.debug("fuse_qordered_attention: failed to match v path")
+            return
+
+        dequantize_v_matmul_weight = dequantize_v_matmul_weight[0]
+
+        if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None:
+            return
+
+        # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
+        # Per-channel scales are supported for weights alone
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_v_matmul_weight, self.model, False):
+            return
+
+        # QK nodes
+        qk_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            [
+                "DequantizeLinear",
+                "QuantizeLinear",
+                "Softmax",
+                "Add",
+                "Div",
+                "DequantizeLinear",
+                "QuantizeLinear",
+                "MatMul",
+            ],
+            [0, 0, 0, 0, None, 0, 0, 0],
+        )
+
+        if qk_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match qk path")
+            return
+
+        (
+            dequantize_qk_softmax,
+            quantize_qk_softmax,
+            softmax_qk,
+            add_qk,
+            div_qk,
+            dequantize_qk,
+            quantize_qk,
+            matmul_qk,
+        ) = qk_nodes
+
+        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model):
+            return
+
+        # Q nodes
+        q_nodes = self.model.match_parent_path(
+            matmul_qk,
+            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
+            [0, 0, 0, 0, 0, None],
+        )
+
+        if q_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match q path")
+            return
+
+        (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes
+
+        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model):
+            return
+
+        # Q MatMul weight
+        dequantize_q_matmul_weight = self.model.match_parent_path(matmul_q, ["DequantizeLinear"], [1])
+
+        if dequantize_q_matmul_weight is None:
+            logger.debug("fuse_qordered_attention: failed to match q path")
+            return
+
+        dequantize_q_matmul_weight = dequantize_q_matmul_weight[0]
+
+        if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None:
+            return
+
+        # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
+        # Per-channel scales are supported for weights alone
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_q_matmul_weight, self.model, False):
+            return
+
+        # K nodes
+        k_nodes = self.model.match_parent_path(
+            matmul_qk,
+            ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
+            [1, 0, 0, 0, 0, None],
+        )
+
+        if k_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match k path")
+            return
+
+        (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes
+
+        # Make sure the Q/DQ has the proper zero points and constant per-tensor scales
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model):
+            return
+
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model):
+            return
+
+        # K MatMul weight
+        dequantize_k_matmul_weight = self.model.match_parent_path(matmul_k, ["DequantizeLinear"], [1])
+
+        if dequantize_k_matmul_weight is None:
+            logger.debug("fuse_qordered_attention: failed to match k path")
+            return
+
+        dequantize_k_matmul_weight = dequantize_k_matmul_weight[0]
+
+        if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None:
+            return
+
+        # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
+        # Per-channel scales are supported for weights alone
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_k_matmul_weight, self.model, False):
+            return
+
+        # Mask nodes
+        mask_nodes = self.model.match_parent_path(
+            add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]
+        )
+
+        if mask_nodes is None:
+            logger.debug("fuse_qordered_attention: failed to match mask_nodes path")
+            return
+
+        # Ascertain `qkv_hidden_sizes` attribute value
+        q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
+        k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
+        v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0])
+
+        # The weights are read and later transposed in place as initializers. Skip the fusion
+        # rather than fail on a None if one of them is not stored as an initializer.
+        if q_weight is None or k_weight is None or v_weight is None:
+            return
+        qw_out_size, kw_out_size, vw_out_size = (
+            int(np.prod(NumpyHelper.to_array(weight).shape[1:])) for weight in (q_weight, k_weight, v_weight)
+        )
+
+        # Form QOrderedAttention node
+        if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
+            mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
+
+            # Ascertain `num_heads` (the detected hidden size is not needed here)
+            num_heads, _ = self.get_num_heads_and_hidden_size(reshape_q)
+
+            # Formulate the inputs
+            # Actual quantized input
+            attention_inputs = [dequantize_input.input[0]]
+            attention_inputs.append(dequantize_input.input[1])
+
+            attention_inputs.append(dequantize_q.input[1])
+            attention_inputs.append(dequantize_k.input[1])
+            attention_inputs.append(dequantize_v.input[1])
+
+            attention_inputs.append(dequantize_q_matmul_weight.input[0])
+            attention_inputs.append(dequantize_k_matmul_weight.input[0])
+            attention_inputs.append(dequantize_v_matmul_weight.input[0])
+
+            attention_inputs.append(dequantize_q_matmul_weight.input[1])
+            attention_inputs.append(dequantize_k_matmul_weight.input[1])
+            attention_inputs.append(dequantize_v_matmul_weight.input[1])
+
+            if self.model.get_initializer(add_q.input[0]):
+                attention_inputs.append(add_q.input[0])
+            else:  # second input is the constant bias
+                attention_inputs.append(add_q.input[1])
+
+            if self.model.get_initializer(add_k.input[0]):
+                attention_inputs.append(add_k.input[0])
+            else:  # second input is the constant bias
+                attention_inputs.append(add_k.input[1])
+
+            if self.model.get_initializer(add_v.input[0]):
+                attention_inputs.append(add_v.input[0])
+            else:  # second input is the constant bias
+                attention_inputs.append(add_v.input[1])
+
+            attention_inputs.append(quantize_qk.input[1])
+            attention_inputs.append(quantize_qk_softmax.input[1])
+            attention_inputs.append(dequantize_qkv.input[1])
+
+            # Mask input
+            if mask_index is not None:
+                attention_inputs.append(mask_index)
+            else:
+                attention_inputs.append("")
+
+            # The MatMul weight 'B' and 'bias' need some post-processing
+            # Transpose weight 'B' from order ROW to order COL
+            # This offline transpose is needed only while using the CUDA EP
+            # TODO: Make this fusion logic EP-agnostic ?
+            #
+            # The initializers already fetched above for `qkv_hidden_sizes` are reused here rather than
+            # looked up a second time. The helper's return value is not used, so it is expected to
+            # rewrite each tensor in place.
+            # NOTE(review): a weight initializer shared between Q, K and V would be transposed more than
+            # once - confirm this cannot happen for the supported models.
+            for weight_tensor in (q_weight, k_weight, v_weight):
+                FusionUtils.transpose_2d_int8_tensor(weight_tensor)
+
+            # Name and create Attention node
+            attention_node_name = self.model.create_node_name("QOrderedAttention")
+
+            attention_node = helper.make_node(
+                "QOrderedAttention",
+                inputs=attention_inputs,
+                outputs=[reshape_qkv.output[0]],
+                name=attention_node_name,
+            )
+
+            self.model.replace_node_input(dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0])
+            self.model.replace_node_input(projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0])
+
+            attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+            attention_node.attribute.extend([helper.make_attribute("order_input", 1)])
+            attention_node.attribute.extend([helper.make_attribute("order_weight", 0)])
+            attention_node.attribute.extend([helper.make_attribute("order_output", 1)])
+            attention_node.attribute.extend(
+                [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
+            )
+
+            attention_node.domain = "com.microsoft"
+
+            self.nodes_to_add.append(attention_node)
+            self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
+
+            self.nodes_to_remove.extend([reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv])
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes)
+            self.nodes_to_remove.extend(k_nodes)
+            self.nodes_to_remove.extend(v_nodes)
+            self.nodes_to_remove.extend(
+                [dequantize_q_matmul_weight, dequantize_k_matmul_weight, dequantize_v_matmul_weight]
+            )
+
+            # Use prune graph to remove mask nodes since they are shared by all attention nodes.
+            # self.nodes_to_remove.extend(mask_nodes)
+            self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py
new file mode 100755
index 000000000..3ce59f784
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py
@@ -0,0 +1,117 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Dict
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionQOrderedGelu(Fusion):
+    """Fusion pass for quantized Gelu/FastGelu.
+
+    Collapses DequantizeLinear -> Gelu/FastGelu -> QuantizeLinear into one QOrderedGelu node.
+    """
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"])
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        """Replace a quantized Gelu/FastGelu subgraph with a single QOrderedGelu node.
+
+        Matched pattern (`node` is the Gelu or FastGelu):
+            -> quantized input -> DQ -> Gelu (or FastGelu) -> Q ->
+
+        Result:
+            -> QOrderedGelu ->
+
+        Besides the QuantizeLinear, `node` may have one extra consumer, a Shape node; it is
+        rewired to read the QuantizeLinear output. Any other layout is left untouched.
+        """
+        consumers = self.model.get_children(node, input_name_to_nodes)
+        consumer_types = [consumer.op_type for consumer in consumers]
+
+        # Accept exactly [QuantizeLinear] or [QuantizeLinear, Shape], in that order
+        if consumer_types not in (["QuantizeLinear"], ["QuantizeLinear", "Shape"]):
+            return
+
+        quantize_node = consumers[0]
+        shape_node = consumers[1] if len(consumers) == 2 else None
+
+        # The Q node must pass the common Q/DQ fusion checks
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_node, self.model):
+            return
+
+        # The first input to Gelu should flow through a DequantizeLinear node
+        path_id, parent_nodes, _ = self.model.match_parent_paths(
+            node,
+            [(["DequantizeLinear"], [0])],
+            output_name_to_node,
+        )
+        if path_id < 0:
+            return
+
+        dequantize_node = parent_nodes[0]
+
+        # ... and so must the DQ node
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node, self.model):
+            return
+
+        # Nodes replaced by the fused op: Gelu/FastGelu plus the Q and DQ nodes around it
+        subgraph_nodes = [node, quantize_node, dequantize_node]
+
+        # With a Shape consumer present, the Gelu output is listed as well, presumably so that
+        # the safety check tolerates it; the Shape node itself is rewired further below.
+        if shape_node is not None:
+            kept_outputs = [node.output[0], quantize_node.output[0]]
+        else:
+            kept_outputs = quantize_node.output
+
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes,
+            kept_outputs,
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            logger.debug("It is not safe to fuse QOrderedGelu node. Skip")
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+
+        # Inputs: the quantized tensor and scale consumed by DQ, then the scale used by Q
+        fused_inputs = [
+            dequantize_node.input[0],
+            dequantize_node.input[1],
+            quantize_node.input[1],
+        ]
+        ordered_gelu_node = helper.make_node(
+            "QOrderedGelu",
+            inputs=fused_inputs,
+            outputs=[quantize_node.output[0]],
+            name=self.model.create_node_name("QOrderedGelu", name_prefix="QOrderedGelu"),
+        )
+
+        # Arrange the downstream Shape's input to be fed from the
+        # downstream QuantizeLinear node, so that fusion will
+        # be deemed safe
+        if shape_node is not None:
+            self.model.replace_node_input(shape_node, shape_node.input[0], quantize_node.output[0])
+
+        # TODO: We only support CuBlasLt order ORDER_ROW for now.
+        # Once we start supporting other data ordering format(s), we
+        # will support user configuring the data ordering for the op.
+        ordered_gelu_node.attribute.extend(
+            [helper.make_attribute("order_X", 1), helper.make_attribute("order_Y", 1)]
+        )
+
+        ordered_gelu_node.domain = "com.microsoft"
+
+        self.nodes_to_add.append(ordered_gelu_node)
+        self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py
new file mode 100755
index 000000000..08def4a20
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py
@@ -0,0 +1,121 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import Dict
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionQOrderedLayerNormalization(Fusion):
+    """Fuses DequantizeLinear -> LayerNormalization -> QuantizeLinear into QOrderedLayerNormalization."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization")
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        """Replace a quantized LayerNormalization subgraph with QOrderedLayerNormalization.
+
+        Matched pattern (`node` is the LayerNormalization):
+            quantized input -> DQ
+                               |
+            (other inputs) -> LayerNormalization -> Q ->
+
+        Result:
+            (quantized input + other inputs) -> QOrderedLayerNormalization ->
+
+        The upstream DQ node is removed only when LayerNormalization is its sole consumer.
+        """
+        consumers = self.model.get_children(node, input_name_to_nodes)
+        consumer_types = [consumer.op_type for consumer in consumers]
+
+        # Accept exactly [QuantizeLinear] or [QuantizeLinear, Shape], in that order
+        if consumer_types not in (["QuantizeLinear"], ["QuantizeLinear", "Shape"]):
+            return
+
+        quantize_node = consumers[0]
+        shape_node = consumers[1] if len(consumers) == 2 else None
+
+        # The Q node must pass the common Q/DQ fusion checks
+        if not FusionUtils.check_qdq_node_for_fusion(quantize_node, self.model):
+            return
+
+        # The first input to LayerNormalization should flow through a DequantizeLinear node
+        path_id, parent_nodes, _ = self.model.match_parent_paths(
+            node,
+            [(["DequantizeLinear"], [0])],
+            output_name_to_node,
+        )
+        if path_id < 0:
+            return
+
+        dequantize_node = parent_nodes[0]
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node, self.model):
+            return
+
+        # Nodes replaced by the fused op: LayerNormalization and the Q node after it
+        subgraph_nodes = [node, quantize_node]
+
+        # In GPT2, the DQ node will be feeding a residual downstream Add and hence,
+        # we do not want to remove it
+        dequantize_consumers = self.model.get_children(dequantize_node, input_name_to_nodes)
+        if len(dequantize_consumers) == 1:
+            subgraph_nodes.append(dequantize_node)
+
+        # With a Shape consumer present, the LayerNormalization output is listed as well, presumably
+        # so that the safety check tolerates it; the Shape node itself is rewired further below.
+        if shape_node is not None:
+            kept_outputs = [node.output[0], quantize_node.output[0]]
+        else:
+            kept_outputs = quantize_node.output
+
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes,
+            kept_outputs,
+            input_name_to_nodes,
+            output_name_to_node,
+        ):
+            logger.debug("It is not safe to fuse QOrderedLayerNormalization node. Skip")
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+
+        # Inputs: the quantized tensor and scale consumed by DQ, the second and third
+        # LayerNormalization inputs, then the scale used by Q
+        fused_inputs = [
+            dequantize_node.input[0],
+            dequantize_node.input[1],
+            node.input[1],
+            node.input[2],
+            quantize_node.input[1],
+        ]
+        normalize_node = helper.make_node(
+            "QOrderedLayerNormalization",
+            inputs=fused_inputs,
+            outputs=[quantize_node.output[0]],
+            name=self.model.create_node_name("QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization"),
+        )
+
+        # Arrange the downstream Shape's input to be fed from the
+        # downstream QuantizeLinear node, so that fusion will
+        # be deemed safe
+        if shape_node is not None:
+            self.model.replace_node_input(shape_node, shape_node.input[0], quantize_node.output[0])
+
+        # TODO: We only support CuBlasLt order ORDER_ROW for now.
+        # Once we start supporting other data ordering format(s), we
+        # will support user configuring the data ordering for the op.
+        normalize_node.attribute.extend(
+            [helper.make_attribute("order_X", 1), helper.make_attribute("order_Y", 1)]
+        )
+
+        normalize_node.domain = "com.microsoft"
+
+        self.nodes_to_add.append(normalize_node)
+        self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py
new file mode 100755
index 000000000..de0196c53
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py
@@ -0,0 +1,217 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Dict
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionQOrderedMatMul(Fusion):
+    def __init__(self, model: OnnxModel) -> None:
+        super().__init__(model, "QOrderedMatMul", "MatMul")  # presumably: fused op type, then op type to search
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        matmul_children = self.model.get_children(node, input_name_to_nodes)
+
+        # Should only have 1 child - Bias Add
+        if len(matmul_children) != 1 or matmul_children[0].op_type != "Add":
+            return
+
+        bias_add_node = matmul_children[0]
+
+        # At least one of the inputs to the Bias Add node must be a constant
+        bias_add_node_index = 0
+        if (
+            self.model.get_constant_value(bias_add_node.input[0]) is None
+            and self.model.get_constant_value(bias_add_node.input[1]) is None
+        ):
+            return
+
+        if self.model.get_constant_value(bias_add_node.input[0]) is None:
+            bias_add_node_index = 1
+
+        bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes)
+
+        if len(bias_add_children) != 1:
+            return
+
+        bias_add_child = bias_add_children[0]
+
+        # Bias Add can have another Add downstream (Residual Add layer)
+        residual_add_node = None
+
+        downstream_quantize_node = None
+
+        if bias_add_child.op_type == "Add":
+            residual_add_node = bias_add_child
+
+            residual_add_children = self.model.get_children(residual_add_node, input_name_to_nodes)
+
+            if len(residual_add_children) != 1 or residual_add_children[0].op_type != "QuantizeLinear":
+                return
+
+            downstream_quantize_node = residual_add_children[0]
+
+        elif bias_add_child.op_type == "QuantizeLinear":
+            downstream_quantize_node = bias_add_child
+
+        else:
+            return
+
+        # Make sure the downstream QuantizeLinear has the proper zero points and scales
+        if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model):
+            return
+
+        # The first input to MatMul should flow through a DequantizeLinear node
+        first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
+            node,
+            [(["DequantizeLinear"], [0])],
+            output_name_to_node,
+        )
+
+        # If Attention is not fused, this is the pattern to look for
+        # leading up to the MatMul
+        reshape_node_0 = None
+        transpose_node_0 = None
+        if first_path_id < 0:
+            first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
+                node,
+                [(["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], [0, 0, 0, 0])],
+                output_name_to_node,
+            )
+
+            if first_path_id < 0:
+                return
+
+            reshape_node_0 = first_input_parent_nodes[0]
+            transpose_node_0 = first_input_parent_nodes[1]
+            dequantize_node_0 = first_input_parent_nodes[2]
+        else:
+            dequantize_node_0 = first_input_parent_nodes[0]
+
+        # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model):
+            return
+
+        # The second input to MatMul should flow through a DequantizeLinear node
+        dequantize_node_1 = None
+        is_weight_transpose_required = True
+
+        weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
+            node,
+            [(["DequantizeLinear", "QuantizeLinear", "Transpose", "DequantizeLinear"], [1, 0, 0, 0])],
+            output_name_to_node,
+        )
+
+        if weight_path_id < 0:
+            weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
+                node,
+                [(["DequantizeLinear"], [1])],
+                output_name_to_node,
+            )
+
+            if weight_path_id < 0:
+                return
+
+            dequantize_node_1 = weight_nodes[0]
+        else:
+            is_weight_transpose_required = False
+            dequantize_node_1 = weight_nodes[3]
+
+        # Check if weight 'B' is a constant
+        if self.model.get_constant_value(dequantize_node_1.input[0]) is None:
+            return
+
+        # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
+        # Per-channel scales are supported for weights alone
+        if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_1, self.model, False):
+            return
+
+        # Make sure the upstream flow into the Residual Add node flows through a DQ node
+        residual_add_dequantize_node = None
+
+        if residual_add_node is not None:
+            residual_path_id, residual_input_parent_nodes, _ = self.model.match_parent_paths(
+                residual_add_node,
+                [
+                    (["DequantizeLinear"], [1]),
+                ],
+                output_name_to_node,
+            )
+
+            if residual_path_id < 0:
+                return
+
+            residual_add_dequantize_node = residual_input_parent_nodes[0]
+
+        # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales
+        if residual_add_dequantize_node is not None and not FusionUtils.check_qdq_node_for_fusion(
+            residual_add_dequantize_node, self.model
+        ):
+            return
+
+        # Subgraph nodes to be fused
+        subgraph_nodes = [node, bias_add_node]  # MatMul + Bias Add
+
+        if residual_add_node is not None:
+            subgraph_nodes.extend([residual_add_node])  # Residual Add
+
+        subgraph_nodes.extend(weight_nodes)
+        subgraph_nodes.extend([downstream_quantize_node])  # Downstream Q node
+
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node
+        ):
+            logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip")
+            return
+
+        # Deal with the case where-in the Attention subgraph is not fused
+        if transpose_node_0 is not None:
+            self.model.replace_node_input(transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0])
+
+        # Make inputs
+        fused_node_inputs = [
+            reshape_node_0.output[0] if reshape_node_0 is not None else dequantize_node_0.input[0],
+            dequantize_node_0.input[1],
+            dequantize_node_1.input[0],
+            dequantize_node_1.input[1],
+            downstream_quantize_node.input[1],
+            bias_add_node.input[bias_add_node_index],
+        ]
+
+        if residual_add_node is not None:
+            fused_node_inputs.append(residual_add_dequantize_node.input[0])
+            fused_node_inputs.append(residual_add_dequantize_node.input[1])
+
+        # The MatMul weight 'B' and 'bias' need some post-processing
+        # Transpose weight 'B' from order ROW to order COL
+        # This offline transpose is needed only while using the CUDA EP
+        # TODO: Make this fusion logic EP-agnostic ?
+        if is_weight_transpose_required:
+            weight_tensor = self.model.get_initializer(dequantize_node_1.input[0])
+            FusionUtils.transpose_2d_int8_tensor(weight_tensor)
+
+        fused_node = helper.make_node(
+            "QOrderedMatMul",
+            inputs=fused_node_inputs,
+            outputs=[downstream_quantize_node.output[0]],
+            name=self.model.create_node_name("QOrderedMatMul", name_prefix="QOrderedMatMul"),
+        )
+
+        fused_node.attribute.extend([helper.make_attribute("order_A", 1)])
+        fused_node.attribute.extend([helper.make_attribute("order_B", 0)])
+        fused_node.attribute.extend([helper.make_attribute("order_Y", 1)])
+
+        fused_node.domain = "com.microsoft"
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py
new file mode 100755
index 000000000..d2b46c16c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py
@@ -0,0 +1,175 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+import numpy as np
+from .fusion_base import Fusion
+from onnx import TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionReshape(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "Reshape", "Reshape")
+        self.prune_graph: bool = False
+
+    def replace_reshape_node(self, shape, reshape_node, concat_node):
+        shape_value = np.asarray(shape, dtype=np.int64)
+        constant_shape_name = self.model.create_node_name("Constant", "constant_shape")
+        new_node = helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[constant_shape_name],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.INT64,
+                dims=shape_value.shape,
+                vals=bytes(shape_value),
+                raw=True,
+            ),
+        )
+        reshape_node.input[1] = constant_shape_name
+        reshape_node.name = self.model.create_node_name("Reshape", "Reshape_Fuse")
+        self.nodes_to_remove.extend([concat_node])
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+    def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node):
+        if reshape_node.input[1] not in output_name_to_node:
+            return
+
+        concat_node = output_name_to_node[reshape_node.input[1]]
+        if concat_node.op_type != "Concat" or len(concat_node.input) < 3 or len(concat_node.input) > 4:
+            return
+
+        path0 = self.model.match_parent_path(
+            concat_node,
+            ["Unsqueeze", "Gather", "Shape"],
+            [0, 0, 0],
+            output_name_to_node,
+        )
+        if path0 is None:
+            return
+
+        (unsqueeze_0, gather_0, shape_0) = path0
+
+        path1 = self.model.match_parent_path(
+            concat_node,
+            ["Unsqueeze", "Gather", "Shape"],
+            [1, 0, 0],
+            output_name_to_node,
+        )
+        if path1 is None:
+            return
+        (unsqueeze_1, gather_1, shape_1) = path1
+
+        shape = []
+        gather_value = self.model.get_constant_value(gather_0.input[1])
+        if gather_value == 0:
+            shape.append(0)
+
+        gather_value = self.model.get_constant_value(gather_1.input[1])
+        if gather_value == 1:
+            shape.append(0)
+
+        if len(shape) != 2:
+            return
+
+        path2 = []
+        path3 = []
+        shape_nodes = [shape_0, shape_1]
+        if len(concat_node.input) == 3 and self.model.get_initializer(concat_node.input[2]) is None:
+            path2 = self.model.match_parent_path(
+                concat_node,
+                ["Unsqueeze", "Mul", "Gather", "Shape"],
+                [2, 0, 0, 0],
+                output_name_to_node,
+            )
+            if path2 is None:
+                path2 = self.model.match_parent_path(
+                    concat_node,
+                    ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"],
+                    [2, 0, 0, 0, 0],
+                    output_name_to_node,
+                )  # GPT2 exported by PyTorch 1.4 with opset_version=11
+                if path2 is None:
+                    return
+
+            path3 = self.model.match_parent_path(
+                concat_node,
+                ["Unsqueeze", "Mul", "Gather", "Shape"],
+                [2, 0, 1, 0],
+                output_name_to_node,
+            )
+            if path3 is None:
+                path3 = self.model.match_parent_path(
+                    concat_node,
+                    ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"],
+                    [2, 0, 1, 0, 0],
+                    output_name_to_node,
+                )  # GPT2 exported by PyTorch 1.4 with opset_version=11
+                if path3 is None:
+                    return
+
+            shape_nodes.extend([path2[-1], path3[-1]])
+            shape.append(-1)
+        elif len(concat_node.input) > 2:
+            concat_2 = self.model.get_initializer(concat_node.input[2])
+            if concat_2 is None:
+                return
+            concat_value = numpy_helper.to_array(concat_2)
+            if isinstance(concat_value, list):
+                shape.extend(concat_value)
+            else:
+                shape.append(concat_value)
+
+        if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None:
+            if -1 in shape:
+                return
+
+            path2 = self.model.match_parent_path(
+                concat_node,
+                ["Unsqueeze", "Div", "Gather", "Shape"],
+                [3, 0, 0, 0],
+                output_name_to_node,
+            )
+            if path2 is None:
+                path2 = self.model.match_parent_path(
+                    concat_node,
+                    ["Unsqueeze", "Div", "Squeeze", "Slice", "Shape"],
+                    [3, 0, 0, 0, 0],
+                    output_name_to_node,
+                )  # GPT2 exported by PyTorch 1.4 with opset_version=11
+                if path2 is None:
+                    return
+            shape_nodes.extend([path2[-1]])
+            shape.append(-1)
+        elif len(concat_node.input) > 3:
+            concat_3 = self.model.get_initializer(concat_node.input[3])
+            if concat_3 is None:
+                return
+
+            concat_value = numpy_helper.to_array(concat_3)
+            if isinstance(concat_value, list):
+                shape.extend(concat_value)
+            else:
+                shape.append(concat_value)
+
+        root_input = reshape_node.input[0]
+        same_shape_input = True
+        for shape_node in shape_nodes:
+            if shape_node.input[0] != root_input:
+                same_shape_input = False
+
+        if not same_shape_input:
+            return
+
+        self.replace_reshape_node(shape, reshape_node, concat_node)
+
+        # TODO(tlwu): Subgraph blocks pruning un-used nodes. Add code to remove un-used nodes safely.
+        self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py
new file mode 100755
index 000000000..c831f15c5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py
@@ -0,0 +1,155 @@
+import logging
+from typing import Dict
+
+from onnx import helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = logging.getLogger(__name__)
+
+
+class FusionRMSNorm(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "RMSNorm", "Mul")
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        if node.op_type != "Mul":
+            return
+
+        sim_ln_nodes = None
+        # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary):
+        # DD = Pow(D, 2)
+        # Var = ReduceMean(DD)
+        # VarEps = Add(Var, epsilon)
+        # StdDev = Sqrt(VarEps)
+        # InvStdDev = Div(1, StdDev)
+        # Normalized = Mul(D, InvStdDev)
+        # NormalizedScaled = Mul(Normalized, Scale)
+
+        #                              RMSNorm
+        #          +-------------------------------------------------------+
+        #          |                                                       |
+        # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
+        #                                                                  |
+        #                                                                 node
+        sim_ln_nodes_1 = self.model.match_parent_path(
+            node,
+            ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"],
+            [1, 1, 1, 0, 0, 0, 0],
+        )
+        #                                RMSNorm
+        #             +-------------------------------------------------------+
+        #             |                                                       |
+        # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
+        #                                                                     |
+        #                                                                    node
+        sim_ln_nodes_2 = self.model.match_parent_path(
+            node,
+            ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"],
+            [1, 1, 1, 0, 0, 0, 0],
+        )
+
+        # For LLaMA from Microsoft custom export:
+        # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1
+        #
+        #                              RMSNorm
+        #          +-------------------------------------------------------+
+        #          |                                                       |
+        # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul
+        #                                                                  |
+        #                                                                 node
+        sim_ln_nodes_3 = self.model.match_parent_path(
+            node,
+            ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"],
+            [0, 1, 1, 0, 0, 0, 0],
+        )
+
+        # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3
+        #
+        #                                  RMSNorm
+        #                  +-----------------------------------------------+
+        #                  |                                               |
+        # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul
+        #                                                                  |
+        #                                                                 node
+        sim_ln_nodes_4 = self.model.match_parent_path(
+            node,
+            ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"],
+            [0, 1, 1, 0, 0, 0],
+        )
+
+        add_node, pow_node = None, None
+        if sim_ln_nodes_1 is not None:
+            sim_ln_nodes = sim_ln_nodes_1
+            add_node = sim_ln_nodes[3]
+            pow_node = sim_ln_nodes[-2]
+        elif sim_ln_nodes_2 is not None:
+            sim_ln_nodes = sim_ln_nodes_2
+            add_node = sim_ln_nodes[3]
+            pow_node = sim_ln_nodes[-2]
+        elif sim_ln_nodes_3 is not None:
+            sim_ln_nodes = sim_ln_nodes_3
+            add_node = sim_ln_nodes[3]
+            pow_node = sim_ln_nodes[-2]
+        elif sim_ln_nodes_4 is not None:
+            sim_ln_nodes = sim_ln_nodes_4
+            add_node = sim_ln_nodes[3]
+            pow_node = sim_ln_nodes[-1]
+            # Verify that parent input to Pow node is graph_input
+            if pow_node.input[0] not in self.model.get_graphs_input_names():
+                return
+        else:
+            return
+
+        layernorm_weight_index = (
+            1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0
+        )
+        starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4
+
+        if self.model.find_constant_input(pow_node, 2.0) != 1:
+            return
+
+        root_input = pow_node.input[0]
+        if root_input != sim_ln_nodes[0].input[0]:
+            return
+
+        i, add_weight = self.model.get_constant_input(add_node)
+        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
+            logger.warning(f"epsilon value is not expected: {add_weight}")
+            return
+
+        self.nodes_to_remove.extend(
+            sim_ln_nodes[:-1] if not starts_with_graph_input else sim_ln_nodes
+        )
+        self.nodes_to_remove.append(node)
+
+        normalize_node = helper.make_node(
+            "RMSNormPluginDynamic_IxRT",
+            inputs=[root_input, node.input[layernorm_weight_index]],
+            outputs=[node.output[0]],
+            name=self.model.create_node_name(
+                "RMSNormPluginDynamic_IxRT", name_prefix="RMSNorm_"
+            ),
+        )
+
+        normalize_node.domain = "com.iluvatar"
+        normalize_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        normalize_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        normalize_node.attribute.extend(
+            [helper.make_attribute("epsilon", float(add_weight))]
+        )
+        normalize_node.attribute.extend([helper.make_attribute("axis", -1)])
+        normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)])
+        gamma_data = self.model.get_initializer(normalize_node.input[1])
+        gamma_data_np = NumpyHelper.to_array(gamma_data)
+        normalize_node.attribute.extend(
+            [helper.make_attribute("hidden_size", int(gamma_data_np.shape[0]))]
+        )
+
+        normalize_node.attribute.extend([helper.make_attribute("gamma", gamma_data)])
+
+        self.nodes_to_add.append(normalize_node)
+        self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
+        return True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py
new file mode 100755
index 000000000..b47be680f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py
@@ -0,0 +1,110 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Dict, List, Union
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils
+from onnx import NodeProto, TensorProto
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionShape(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "Shape", "Concat")
+        self.utils = FusionUtils(model)
+        self.shape_infer = None
+        self.shape_infer_done = False
+
+    def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]:
+        if tensor_proto.type.tensor_type.HasField("shape"):
+            return len(tensor_proto.type.tensor_type.shape.dim)
+        else:
+            return None
+
+    def get_dimensions(self, input_name: str) -> Union[int, None]:
+        graph_input = self.model.find_graph_input(input_name)
+        if graph_input:
+            return self.get_dimensions_from_tensor_proto(graph_input)
+
+        if not self.shape_infer_done:
+            self.shape_infer = self.model.infer_runtime_shape({}, update=True)
+            self.shape_infer_done = True
+
+        if self.shape_infer is not None:
+            return self.get_dimensions_from_tensor_proto(self.shape_infer.known_vi_[input_name])
+
+        return None
+
+    def fuse(
+        self,
+        concat_node: NodeProto,
+        input_name_to_nodes: Dict[str, List[NodeProto]],
+        output_name_to_node: Dict[str, NodeProto],
+    ):
+        """
+        Smplify subgraph like
+
+                   (2d_input)
+                    /       \
+                Shape       shape
+                /             \
+            Gather(indices=0)  Gather(indices=1)
+                |                |
+            Unsqueeze(axes=0)   Unsqueeze(axes=0)
+                   \          /
+                      Concat 
+                        |
+
+        into  (2d_input) --> Shape -->
+        """
+        opset_version = self.model.get_opset_version()
+
+        inputs = len(concat_node.input)
+        root = None
+        shape_output = None
+        for i in range(inputs):
+            path = self.model.match_parent_path(
+                concat_node,
+                ["Unsqueeze", "Gather", "Shape"],
+                [i, 0, 0],
+                output_name_to_node,
+            )
+            if path is None:
+                return
+
+            unsqueeze, gather, shape = path
+            if i == 0:
+                shape_output = shape.output[0]
+            if root is None:
+                root = shape.input[0]
+                if self.get_dimensions(root) != inputs:
+                    return
+            elif shape.input[0] != root:
+                return
+
+            if not FusionUtils.check_node_attribute(unsqueeze, "axis", 0, default_value=0):
+                return
+
+            if opset_version < 13:
+                if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]):
+                    return
+            else:
+                if not self.utils.check_node_input_value(unsqueeze, 1, [0]):
+                    return
+
+            value = self.model.get_constant_value(gather.input[1])
+            from numpy import array_equal, ndarray
+
+            if not (isinstance(value, ndarray) and value.size == 1 and value.item() == i):
+                return
+
+        if self.model.find_graph_output(concat_node.output[0]) is None:
+            self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output)
+            self.fused_count += 1
+            self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py
new file mode 100755
index 000000000..586896446
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py
@@ -0,0 +1,212 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+from onnx import helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionSkipLayerNormalization(Fusion):
+    """
+    Fuse Add + LayerNormalization into one node: SkipLayerNormalization
+    Note: This fusion does not check the input shape of Add and LayerNormalization.
+    """
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model, "CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"
+        )
+        # Update shape inference is needed since other fusions might add new edge which does not have shape info yet.
+        self.shape_infer_helper = self.model.infer_runtime_shape(
+            {"batch_size": 4, "seq_len": 7}, update=True
+        )
+
+        if self.shape_infer_helper is None:
+            # TODO(tianleiwu): support subgraph in shape inference or add broadcasting in SkipLayerNormalization op.
+            logger.warning("symbolic shape inference disabled or failed.")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        add = self.model.get_parent(node, 0, output_name_to_node)
+
+        # In some models there is input_ids->gather->add->LayerNorm and one of input of the
+        # add node is initializer with fixed shape which should not be fused into SkipLayerNorm
+        if add is None:
+            return
+
+        for add_input in add.input:
+            if self.model.get_initializer(add_input) != None:
+                return
+
+        # The number of input node of add should be 2
+        if len(self.model.get_parents(add)) != 2:
+            return
+
+        if self.shape_infer_helper is not None:
+            if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]):
+                logger.debug(
+                    "skip SkipLayerNormalization fusion since shape of inputs (%s, %s) are not same",
+                    add.input[0],
+                    add.input[1],
+                )
+                return
+        else:
+            layernorm_weight = self.model.get_initializer(node.input[1])
+            if layernorm_weight is not None:
+                layernorm_weight_arr = NumpyHelper.to_array(layernorm_weight)
+                hidden_size = layernorm_weight_arr.shape[0]
+            else:
+                logger.debug(
+                    "skip SkipLayerNormalization fusion since symbolic shape inference failed"
+                )
+                return
+
+        # gather_path = self.model.match_parent_path(add, ["Gather"], [None])
+        # if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None:
+        #     if self.model.match_parent_path(gather_path[0], ["ConstantOfShape"], [1]) is None:
+        #         return
+
+        if (
+            add is not None
+            and add.op_type == "Add"
+            and self.model.is_safe_to_fuse_nodes(
+                [add, node], node.output, input_name_to_nodes, output_name_to_node
+            )
+        ):
+            self.nodes_to_remove.extend([add, node])
+
+            inputs = [add.input[0], add.input[1]]
+            normalize_node = helper.make_node(
+                "CustomSkipLayerNormPluginDynamic_IxRT",
+                inputs=inputs,
+                outputs=[node.output[0]],
+                name=self.model.create_node_name(
+                    "SkipLayerNormalization", name_prefix="SkipLayerNorm"
+                ),
+            )
+            normalize_node.domain = "com.iluvatar"
+            if self.shape_infer_helper is not None:
+                hidden_size = self.shape_infer_helper.get_edge_shape(node.input[1])[-1]
+            normalize_node.attribute.extend([helper.make_attribute("ld", hidden_size)])
+            normalize_node.attribute.extend([helper.make_attribute("type_id", 2)])
+            normalize_node.attribute.extend(
+                [
+                    helper.make_attribute(
+                        "beta", self.model.get_initializer(node.input[2])
+                    )
+                ]
+            )
+            normalize_node.attribute.extend(
+                [
+                    helper.make_attribute(
+                        "gamma", self.model.get_initializer(node.input[1])
+                    )
+                ]
+            )
+            normalize_node.attribute.extend(
+                [helper.make_attribute("plugin_namespace", "")]
+            )
+            normalize_node.attribute.extend(
+                [helper.make_attribute("plugin_version", "1")]
+            )
+
+            self.nodes_to_add.append(normalize_node)
+            self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
+
+
+class FusionBiasSkipLayerNormalization(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model,
+            "CustomSkipLayerNormPluginDynamic_IxRT",
+            "SkipLayerNormalization",
+            "add bias",
+        )
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        if len(node.input) != 4:
+            return
+
+        return_indice = []
+        nodes = self.model.match_parent_path(
+            node, ["Add", "MatMul"], [None, None], None, return_indice
+        )
+        if nodes is None:
+            return
+        assert len(return_indice) == 2
+        add_input_index = return_indice[0]
+        if add_input_index >= 2:
+            return
+
+        (add, matmul) = nodes
+
+        # bias should be one dimension
+        bias_index = -1
+        for i, input in enumerate(add.input):
+            initializer = self.model.get_initializer(input)
+            if initializer is None:
+                continue
+            bias_index = i
+            bias_weight = NumpyHelper.to_array(initializer)
+            break
+        if bias_weight is None:
+            logger.debug(f"Bias weight not found")
+            return
+        if len(bias_weight.shape) != 1:
+            logger.debug(f"Bias weight is not 1D")
+            return
+
+        subgraph_nodes = [node, add]
+        if not self.model.is_safe_to_fuse_nodes(
+            subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node
+        ):
+            logger.debug(
+                f"Skip fusing SkipLayerNormalization with Bias since it is not safe"
+            )
+            return
+
+        self.nodes_to_remove.extend(subgraph_nodes)
+        inputs = [
+            node.input[1 - add_input_index],
+            matmul.output[0],
+            node.input[2],
+            node.input[3],
+            add.input[bias_index],
+        ]
+        new_node = helper.make_node(
+            "CustomSkipLayerNormPluginDynamic_IxRT",
+            inputs=inputs,
+            outputs=node.output,
+            name=self.model.create_node_name(
+                "SkipLayerNormalization", "SkipLayerNorm_AddBias_"
+            ),
+        )
+        new_node.domain = "com.iluvatar"
+        hidden_size = self.shape_infer_helper.get_edge_shape(node.input[2])[-1]
+        new_node.attribute.extend([helper.make_attribute("ld", hidden_size)])
+        new_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        new_node.attribute.extend(
+            [helper.make_attribute("beta", self.model.get_initializer(node.input[3]))]
+        )
+        new_node.attribute.extend(
+            [helper.make_attribute("gamma", self.model.get_initializer(node.input[2]))]
+        )
+        new_node.attribute.extend(
+            [
+                helper.make_attribute(
+                    "bias", self.model.get_initializer(add.input[bias_index])
+                )
+            ]
+        )
+        new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+
+        self.nodes_to_add.append(new_node)
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py
new file mode 100755
index 000000000..8edb9a5ad
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py
@@ -0,0 +1,321 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union, List
+
+import numpy as np
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+import onnx
+
+logger = getLogger(__name__)
+
+
+def get_tensor_attr(attrs, attr_name):
+    """Return the tensor (``.t``) of the first attribute named ``attr_name`` as a numpy array, else None."""
+    for attribute in attrs:
+        if attribute.name == attr_name:
+            return numpy_helper.to_array(attribute.t)
+    return None
+
+
+class FusionSwinLAttention(Fusion):
+    """
+    Fuse SwinL subgraph into one Attention node.
+    """
+
+    def __init__(self, model: OnnxModel):
+        """Set up the pass with the IxRT attention op name and the CustomFC op type given to Fusion."""
+        super().__init__(
+            model,
+            "CustomQKVToContextPluginDynamic_IxRT",
+            ["CustomFCPluginDynamic_IxRT"],
+        )
+        # Flags to show warning only once
+        self.num_heads_warning = self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(self, reshape_v: NodeProto) -> Tuple[int, int]:
+        """Detect num_heads and hidden_size around a Reshape node.
+
+        Args:
+            reshape_v (NodeProto): Reshape whose constant shape input (input[1]) gives hidden_size.
+
+        Returns:
+            Tuple[int, int]: num_heads and hidden_size; (0, 0) when undetectable and no fallback is set.
+        """
+        # __init__ does not define num_heads/hidden_size, so read them defensively (no AttributeError).
+        fallback = (getattr(self, "num_heads", 0), getattr(self, "hidden_size", 0))
+        v_shape = self.model.get_initializer(reshape_v.input[1])
+        if v_shape is None:
+            logger.debug(f"{reshape_v.input[1]} is not initializer.")
+            return fallback  # Fall back to user specified value
+
+        v_shape_value = NumpyHelper.to_array(v_shape)
+        if len(v_shape_value) != 3 or (v_shape_value[1] <= 0 or v_shape_value[2] <= 0):
+            logger.debug(f"v_shape_value={v_shape_value}. Expected 3 elements with positive dims 1 and 2.")
+            return fallback  # Fall back to user specified value
+
+        num_heads = 1  # kept when the graph has no value_info entry for the Reshape input
+        for value_info in self.model.graph().value_info:
+            if value_info.name == reshape_v.input[0]:
+                dims = value_info.type.tensor_type.shape.dim
+                num_heads = dims[2].dim_value if len(dims) > 2 else num_heads  # rank < 3: keep default
+                break
+
+        return int(num_heads), int(v_shape_value[2])
+
+    def create_attention_node(
+            self,
+            num_heads: int,
+            hidden_size: int,
+            inputs: List[str],
+            output: str,
+    ) -> Union[NodeProto, None]:
+        """Create the fused CustomQKVToContextPluginDynamic_IxRT attention node.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            inputs (List[str]): input names; callers pass [fused QKV output, relative position bias].
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        # Report invalid head counts as a failed fusion (None) rather than asserting: asserts vanish under -O.
+        if num_heads <= 0:
+            logger.debug(f"num_heads {num_heads} must be positive")
+            return None
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
+            return None
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=inputs,
+            outputs=[output],
+            name=self.model.create_node_name("Attention"),
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([
+            helper.make_attribute("type_id", 2), helper.make_attribute("num_heads", num_heads),
+            helper.make_attribute("hidden_size", hidden_size), helper.make_attribute("has_mask", 1),
+            helper.make_attribute("plugin_namespace", ""), helper.make_attribute("plugin_version", "1"),
+            helper.make_attribute("has_qk_bias", 1),
+        ])
+
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        for matcher in (self.fuse_pattern1, self.fuse_pattern2):  # always try both, pattern1 first
+            matcher(normalize_node, input_name_to_nodes, output_name_to_node)
+
+    def fuse_pattern2(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Fuse Swin-L attention whose Q, K and V each come from their own CustomFC node: the three
+        FCs are merged into one CustomFC and the attention subgraph becomes one plugin node."""
+        logger.debug("fuse swin-L attention pass")
+        # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern
+        start_node = normalize_node
+        qkv_paths = {
+            "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]),
+        }
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+        assert qkv_path == 'path1', 'abnormal qkv path'
+        reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes
+
+        # 2. MatMul as start, go up to find v path
+        v_paths = {
+            "path1": (["Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], [None, 0, 0])
+        }
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+        if not v_nodes:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        assert v_path == 'path1', 'abnormal v path'
+
+        # 3. MatMul as start, go up to find q,k paths
+        # q path
+        q_paths = {
+            "path1": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"],
+                      [None, 0, 0, 0, 0, 0, 0]),
+        }
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths)
+        if not q_nodes:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        assert q_path == 'path1', 'abnormal q paths found'
+
+        # get Add(bias) input name as fused Attention inputs
+        add_op, div_op = q_nodes[1], q_nodes[2]
+        relative_position_bias_name = add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0]
+
+        # k path
+        k_paths = {
+            "path2": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"],
+                      [None, 0, 0, 0, 1, 0, 0])
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths)
+        if not k_nodes:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        assert k_path == 'path2', 'abnormal k paths found'
+        # 4. Fuse 3 CustomFC into one, and fuse attention
+        # Fuse FCs: the W and B attributes of the q, k, v FCs (in that order) are concatenated along axis 0
+        fc_nodes = [q_nodes[-1], k_nodes[-1], v_nodes[-1]]
+        weight = self.fuse_tensor_in_node_attrs(fc_nodes, "W", q_nodes[-1].name + "_Weight")
+        bias = self.fuse_tensor_in_node_attrs(fc_nodes, "B", q_nodes[-1].name + "_Bias")
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[q_nodes[-1].input[0]],
+            outputs=q_nodes[-1].output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("out_dims", numpy_helper.to_array(bias).shape[0])])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        fused_node.attribute.extend([helper.make_attribute("W", weight)])
+        fused_node.attribute.extend([helper.make_attribute("B", bias)])
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
+
+        # Fuse Attention. Nothing is queued until both nodes exist, so a failed fusion leaves no stray FC.
+        num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv)
+        if num_heads <= 0:
+            logger.debug("fuse_attention: failed to detect num_heads")
+            return
+        attention_node = self.create_attention_node(
+            num_heads, hidden_size, [fused_node.output[0], relative_position_bias_name], reshape_qkv.output[0]
+        )
+        if not attention_node:
+            return
+        for new_node in (fused_node, attention_node):
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+        # NOTE(review): the old q/k CustomFC and Reshape nodes are not listed; presumably prune_graph drops them.
+        self.nodes_to_remove.extend([*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes])
+        self.prune_graph = True
+
+    def fuse_pattern1(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Fuse Swin-L attention whose Q, K and V come from one MatMul -> Split -> per-branch bias Add:
+        the MatMul and the three Adds become one CustomFC, the attention subgraph one plugin node."""
+        logger.debug("fuse swin-L attention pass")
+        # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern
+        start_node = normalize_node
+        qkv_paths = {
+            "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]),
+        }
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+        assert qkv_path == 'path1', 'abnormal qkv path'
+        reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes
+
+        # 2. MatMul as start, go up to find v path
+        v_paths = {
+            "path1": (["Transpose", "Reshape", "Add", "Split", "MatMul"], [None, 0, 0, None, 0])
+        }
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+        if not v_nodes:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        assert v_path == 'path1', 'abnormal v path'
+
+        # 3. MatMul as start, go up to find q,k paths
+        # q path
+        q_paths = {
+            "path1": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "Add", "Split", "MatMul"],
+                      [None, 0, 0, 0, 0, 0, 0, None, 0]),
+        }
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths)
+        if not q_nodes:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        assert q_path == 'path1', 'abnormal q paths found'
+
+        # get Add(bias) input name as fused Attention inputs
+        add_op, div_op = q_nodes[1], q_nodes[2]
+        relative_position_bias_name = add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0]
+
+        # k path
+        k_paths = {
+            "path2": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "Add", "Split", "MatMul"],
+                      [None, 0, 0, 0, 1, 0, 0, None, 0])
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths)
+        if not k_nodes:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        assert k_path == 'path2', 'abnormal k paths found'
+        # 4. Attention and CustomFC have been found, now transform the found nodes to two plugin nodes
+        # The 3 paths must share one MatMul and one Split, yet each path must own a distinct bias Add.
+        origin_matmul = q_nodes[-1]
+        fc_add = [q_nodes[-3], k_nodes[-3], v_nodes[-3]]
+        is_same_origin = q_nodes[-1] is k_nodes[-1] is v_nodes[-1]
+        is_same_origin &= q_nodes[-2] is k_nodes[-2] is v_nodes[-2]
+        is_same_origin &= len({id(add) for add in fc_add}) == 3  # previously compared against k's Split
+        if not is_same_origin:
+            logger.debug("swin-L fuse_attention: found qkv path but not has the same origin")
+            return
+        num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv)
+        if num_heads <= 0:
+            logger.debug("swin-L fuse_attention: failed to detect num_heads")
+            return
+
+        # Fuse FC
+        weight = self.model.get_initializer(origin_matmul.input[1])
+        biases = [self.model.get_initializer(i.input[0]) for i in fc_add]
+        if weight is None or any(b is None for b in biases):
+            logger.debug("swin-L: couldn't find weights")
+            return
+        # Transposed copy for the plugin; the graph initializer itself is left untouched.
+        weight_arr = onnx.numpy_helper.to_array(weight).transpose(1, 0)
+        bias_arr = np.concatenate([onnx.numpy_helper.to_array(i) for i in biases], axis=0)
+
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[origin_matmul.input[0]],
+            outputs=fc_add[0].output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        fused_node.attribute.extend([helper.make_attribute("out_dims", bias_arr.shape[0])])
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        fused_node.attribute.extend([helper.make_attribute("W", numpy_helper.from_array(weight_arr))])
+        fused_node.attribute.extend([helper.make_attribute("B", numpy_helper.from_array(bias_arr))])
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
+        # Fuse Attention. Nothing is queued until both nodes exist, so a failed fusion leaves no stray FC.
+        attention_node = self.create_attention_node(
+            num_heads,
+            hidden_size,
+            [fused_node.output[0], relative_position_bias_name],
+            reshape_qkv.output[0],
+        )
+        if not attention_node:
+            return
+        for new_node in (fused_node, attention_node):
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+        self.nodes_to_remove.extend([*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes])
+        self.prune_graph = True
+
+    def fuse_tensor_in_node_attrs(self, fc_nodes, attr_name, tensor_name):
+        """Concatenate (axis 0) the ``attr_name`` tensor attribute of every node into one tensor named ``tensor_name``."""
+        per_node_arrays = [get_tensor_attr(fc.attribute, attr_name) for fc in fc_nodes]
+        merged = np.concatenate(per_node_arrays, axis=0)
+        return numpy_helper.from_array(merged, tensor_name)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py
new file mode 100755
index 000000000..661e83759
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py
@@ -0,0 +1,312 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import math
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+class FusionT5Attention(Fusion):
+    """
+    Fuse T5Attention subgraph into one Attention node.
+    """
+
+    def __init__(self, model: OnnxModel):
+        """Configure the T5 attention fusion pass.
+
+        Passes to Fusion the model, the IxRT attention op name, and the two normalization
+        plugin op types (skip-layernorm and RMS-norm) named below.
+        How Fusion uses these arguments is defined in fusion_base, not here.
+        """
+        norm_plugin_ops = ["CustomSkipLayerNormPluginDynamic_IxRT", "RMSNormPluginDynamic_IxRT"]
+        super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", norm_plugin_ops)
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
+        """Detect num_heads and hidden_size from a reshape node.
+
+        Args:
+            reshape_q (NodeProto): reshape node for Q
+
+        Returns:
+            Tuple[int, int]: num_heads and hidden_size as plain ints, or (0, 0) when they cannot be read.
+        """
+
+        # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
+        q_shape = self.model.get_initializer(reshape_q.input[1])
+        if q_shape is None:
+            logger.debug(f"{reshape_q.input[1]} is not initializer.")
+            return 0, 0
+
+        q_shape_value = NumpyHelper.to_array(q_shape)
+        if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
+            logger.debug(
+                f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]."
+            )
+            return 0, 0
+
+        # Convert numpy scalars to int so the result matches the annotated Tuple[int, int].
+        num_heads = int(q_shape_value[2])
+        head_size = int(q_shape_value[3])
+
+        return num_heads, num_heads * head_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        matmul_qk_add: NodeProto,
+    ) -> Tuple[Union[NodeProto, None], Union[NodeProto, None]]:
+        """Create the fused attention node, plus a Cast node when the QK bias is a graph edge.
+
+        A constant (initializer) bias is rewritten in place and passed to the plugin by name; any
+        other bias is routed through a new Cast-to-float node whose output becomes the second input.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input (str): input name
+            output (str): output name
+            matmul_qk_add (NodeProto): Add following the QK MatMul, or None when there is none.
+                Its input[1] is used as the bias (NOTE(review): assumes the MatMul feeds input[0]).
+
+        Returns:
+            Tuple: (attention_node, cast_node). cast_node is None when no Cast is needed;
+                (None, None) if failed.
+        """
+        # Failures return a pair too: the caller unpacks two values, so a bare None raised TypeError.
+        if num_heads <= 0:
+            logger.debug(f"num_heads {num_heads} must be positive")
+            return None, None
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None, None
+
+        attention_node_name = self.model.create_node_name("Attention")
+        attention_inputs = [input]
+        cast_node = None
+
+        # With no Add there is no bias input at all; this case used to dereference None.
+        has_qk_bias = 0 if matmul_qk_add is None else 1
+        qk_bias = None
+        if matmul_qk_add is not None:
+            qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
+
+        if qk_bias:
+            # Constant bias: rewrite the initializer in place and pass it to the plugin by name.
+            qk_bias_arr = NumpyHelper.to_array(qk_bias)
+            if len(qk_bias_arr.shape) == 3:
+                qk_bias_arr = qk_bias_arr.squeeze(0)
+            has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
+            if np.any(has_neg_inf):
+                # -inf entries become -100 and all other entries 0.0, stored as float32.
+                qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(
+                    np.float32
+                )
+            qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))
+            attention_inputs.append(qk_bias.name)
+        elif matmul_qk_add is not None:
+            # If the Add's input is not a constant value but an edge, that edge has to be cast to fp32.
+            cast_out_name = attention_node_name + "_fp32_in1"
+            cast_node = helper.make_node(
+                "Cast",
+                inputs=[matmul_qk_add.input[1]],
+                outputs=[cast_out_name],
+                name=self.model.create_node_name("Cast"),
+                to=TensorProto.FLOAT,
+            )
+            self.node_name_to_graph_name[cast_node.name] = self.this_graph_name
+            attention_inputs.append(cast_out_name)
+
+        # has_mask mirrors has_qk_bias: both are 1 exactly when an Add follows the QK MatMul.
+        has_mask = has_qk_bias
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend(
+            [helper.make_attribute("hidden_size", hidden_size)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend(
+            [helper.make_attribute("has_qk_bias", has_qk_bias)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("is_t5_mode", 1)])
+
+        return attention_node, cast_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        """Try to fuse the T5 attention subgraph feeding ``normalize_node`` into one plugin node.
+
+        Walks up from the normalization node (or from the Add in front of an RMSNorm), matches the
+        output projection, V, QK and Q/K paths, and requires that Q, K and V are all split from a
+        single MatMul fed by the root input. On success the new node(s) are queued in
+        ``nodes_to_add`` and the replaced nodes in ``nodes_to_remove``; otherwise nothing is queued.
+        """
+        # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
+        # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
+        start_node = normalize_node
+        if normalize_node.op_type == "RMSNormPluginDynamic_IxRT":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                start_node = add_before_layernorm
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
+        qkv_paths = {
+            "path1": (["MatMul", "Reshape", "Transpose", "MatMul"], [0, 0, 0, 0]),
+            "path2": (["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0]),
+        }
+
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+        (atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
+
+        # Besides the attention output, exactly one more produced tensor must feed start_node: the root input.
+        other_inputs = [
+            input_name
+            for input_name in start_node.input
+            if input_name in output_name_to_node and input_name != qkv_nodes[0].output[0]
+        ]
+        if len(other_inputs) != 1:
+            return
+        root_input = other_inputs[0]
+
+        # Match T5
+        # Add/Gather --> LayerNormalization --> Attention --> Add --> LayerNormalization
+        #  |                                                  |
+        #  |                                                  |
+        #  +---------------------------------------------------
+        # When start_node's parent is a Gather/Add that also feeds an RMSNorm, that RMSNorm's output is the root.
+        transpose_before_layernorm = self.model.match_parent(start_node, "Gather", 0)
+        if transpose_before_layernorm is not None:
+            node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT":
+                    root_input = child.output[0]
+
+        add_before_layernorm = self.model.match_parent(start_node, "Add", None)
+        if add_before_layernorm is not None:
+            node_children = input_name_to_nodes[add_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT":
+                    root_input = child.output[0]
+
+        v_paths = {
+            "path1": (
+                ["Transpose", "Reshape", "Split", "MatMul"],
+                [1, 0, 0, None],
+            )  # T5
+        }
+
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        matmul_in_qkv = v_nodes[-1]
+
+        qk_paths = {
+            "path1": (["Softmax", "MatMul"], [0, 0]),
+            "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]),
+        }
+
+        qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+
+        # path2 has an Add between Softmax and the QK MatMul; path1 has none.
+        matmul_qk = qk_nodes[-1]
+        matmul_qk_add = None if qk_path == "path1" else qk_nodes[1]
+
+        # Q comes in through input 0 of the QK MatMul, K through input 1.
+        q_paths = {"path1": (["Transpose", "Reshape", "Split"], [0, 0, 0])}
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        (_, reshape_q, split_q) = q_nodes
+
+        k_paths = {
+            "path1": (["Transpose", "Reshape", "Split"], [1, 0, 0]),
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        split_k = k_nodes[-1]
+
+        # Q, K and V must all be split from one MatMul that consumes the root input.
+        if not (
+            matmul_in_qkv.input[0] == root_input
+            and split_q.input[0] == matmul_in_qkv.output[0]
+            and split_k.input[0] == matmul_in_qkv.output[0]
+        ):
+            return
+
+        attention_last_node = reshape_qkv
+        num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
+        if num_heads <= 0:
+            # Head count could not be read from the Q reshape: skip instead of failing an assertion.
+            logger.debug("fuse_attention: failed to detect num_heads")
+            return
+
+        created = self.create_attention_node(
+            num_heads,
+            hidden_size,
+            matmul_in_qkv.output[0],
+            attention_last_node.output[0],
+            matmul_qk_add,
+        )
+        # Tolerate a bare None result as well as a (node, cast) pair.
+        new_node, new_cast_node = created if created is not None else (None, None)
+        if new_node is None:
+            return
+
+        self.nodes_to_add.append(new_node)
+        if new_cast_node:
+            self.nodes_to_add.append(new_cast_node)
+
+        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+        self.nodes_to_remove.extend(
+            [attention_last_node, transpose_qkv, matmul_qkv]
+        )
+        self.nodes_to_remove.extend(qk_nodes)
+        self.nodes_to_remove.extend(q_nodes)
+        self.nodes_to_remove.extend(k_nodes)
+        # From the V path only the Transpose and Reshape (v_nodes[:-2]) are added to the removal list.
+        self.nodes_to_remove.extend(v_nodes[:-2])
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py
new file mode 100755
index 000000000..5207f28f0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py
@@ -0,0 +1,240 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from logging import getLogger
+from typing import Tuple
+
+import numpy
+from numpy import array_equal, ndarray
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+from onnx import onnx_pb as onnx_proto
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionUtils:
+    def __init__(self, model: OnnxModel):
+        # Keep a reference to the wrapped model; the instance methods of this class read and
+        # edit the graph through it.
+        self.model: OnnxModel = model
+
+    def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]:
+        graph_input = self.model.find_graph_input(input_name)
+        if graph_input is not None and graph_input.type.tensor_type.elem_type != TensorProto.INT32:
+            cast_output, cast_node = self.cast_input_to_int32(input_name)
+            logger.debug(f"Casted graph input {input_name} to int32")
+            return True, cast_output
+
+        logger.debug(f"Did not cast graph input {input_name} to int32: found {graph_input is not None}")
+        return False, input_name
+
+    def cast_input_to_int32(self, input_name: str):
+        cast_output = input_name + "_int32"
+
+        # Avoid consequent Cast nodes.
+        inputs = [input_name]
+        output_name_to_node = self.model.output_name_to_node()
+        if input_name in output_name_to_node:
+            parent_node = output_name_to_node[input_name]
+            if parent_node and parent_node.op_type == "Cast":
+                inputs = [parent_node.input[0]]
+
+        cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output])
+        cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.INT32))])
+        self.model.add_node(cast_node)
+
+        return cast_output, cast_node
+
+    def remove_cast_int32(self, input_name: str):
+        input_name_to_nodes = self.model.input_name_to_nodes()
+        nodes = input_name_to_nodes[input_name]
+        for node in nodes:
+            if node.op_type == "Cast":
+                is_int32 = False
+                for att in node.attribute:
+                    if att.name == "to" and att.i == int(TensorProto.INT32):
+                        is_int32 = True
+                        break
+                if is_int32:
+                    output_name = node.output[0]
+                    self.model.remove_node(node)
+                    self.model.replace_input_of_all_nodes(output_name, input_name)
+
+    @staticmethod
+    def check_node_attribute(node, attribute_name: str, expected_value, default_value=None):
+        """Verify that a node has expected value for an attribute.
+
+        Args:
+            node (NodeProto): a node to check
+            attribute_name (str): name of attribute
+            expected_value (Any): expected value of the attribute
+            default_value (Any, optional): default value if the attribute does not exist. Defaults to None.
+
+        Returns:
+            bool: whether the check is passed or not
+        """
+        value = default_value
+        for attr in node.attribute:
+            if attr.name == attribute_name:
+                value = helper.get_attribute_value(attr)
+
+        if isinstance(expected_value, list):
+            return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal(
+                expected_value, value, equal_nan=False
+            )
+        else:
+            return value == expected_value
+
+    @staticmethod
+    def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto):
+        """Transpose a 2-D INT8 TensorProto
+        Args:
+            tensor (TensorProto): tensor to be transposed
+        Returns:
+            tensor (TensorProto): transposed tensor
+        """
+        if not isinstance(tensor, onnx_proto.TensorProto):
+            raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
+
+        if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8:
+            raise ValueError("Only INT8 2-D tensors can be transposed")
+
+        if tensor.raw_data:
+            int32_data = numpy.reshape(numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims)
+            int32_transposed_data = numpy.transpose(int32_data, [1, 0])
+            tensor.raw_data = int32_transposed_data.tobytes()
+
+        else:
+            raise ValueError("only raw buffer supported")
+
+        return tensor
+
+    @staticmethod
+    def check_qdq_node_for_fusion(node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True):
+        """Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion.
+           It is a good candidate for fusion if:
+           (1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True`
+           (2) The Q/DQ node should have constant scale
+           (3) The Q/DQ node should have a zero point of 0
+        Args:
+            node (NodeProto): a Q/DQ node to check
+        Returns:
+            bool: whether the check is passed or not
+        """
+        if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}:
+            logger.debug(f"Provided node is not a Q/DQ node. Op Type: {node.op_type}")
+
+        scale = model.get_constant_value(node.input[1])
+
+        # Scale is not constant
+        if scale is None:
+            return False
+
+        # Not per-tensor quantization
+        scale_has_single_element = scale.ndim == 0 or (scale.ndim == 1 and scale.shape[0] == 1)
+        if allow_per_tensor_quantization_only and not scale_has_single_element:
+            return False
+
+        # If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec)
+        if len(node.input) == 2:
+            return True
+
+        # Zero point should be constant and should have a value of 0
+        zero_point = model.get_constant_value(node.input[2])
+
+        # Zero point and scale should have same number of dims
+        if scale.ndim != zero_point.ndim:
+            return False
+
+        # Zero point is not constant or zero point is not zero
+        if zero_point is None:
+            return False
+
+        return numpy.all(zero_point == 0)
+
+    def check_node_input_value(self, node, input_index: int, expected_value):
+        """Verify that a node has expected input value
+
+        Args:
+            node (NodeProto): a node to check
+            input_index (int): index of its input to be verified
+            expected_value (Any): expected value of the input
+
+        Returns:
+            bool: whether the check is passed or not
+        """
+        assert len(node.input) > input_index
+
+        value = self.model.get_constant_value(node.input[input_index])
+
+        if isinstance(expected_value, list):
+            return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal(
+                expected_value, value, equal_nan=False
+            )
+        else:
+            return value == expected_value
+
+    def remove_identity_nodes(self):
+        """Remove Identity nodes, except those right before graph output."""
+        nodes_to_remove = []
+        for node in self.model.nodes():
+            if node.op_type == "Identity":
+                if node.output[0] not in self.model.get_graphs_output_names():
+                    self.model.replace_input_of_all_nodes(node.output[0], node.input[0])
+                    nodes_to_remove.append(node)
+
+        if nodes_to_remove:
+            self.model.remove_nodes(nodes_to_remove)
+            logger.info(f"Removed {len(nodes_to_remove)} Identity nodes")
+
+    def remove_cascaded_cast_nodes(self):
+        """Delegate to `remove_cascaded_cast_nodes` of the wrapped model."""
+        self.model.remove_cascaded_cast_nodes()
+
+    def remove_useless_cast_nodes(self):
+        """Delegate to `remove_useless_cast_nodes` of the wrapped model."""
+        self.model.remove_useless_cast_nodes()
+
+    def remove_useless_reshape_nodes(self):
+        """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape"""
+        shape_infer = self.model.infer_runtime_shape(update=True)
+        if shape_infer is None:
+            return
+
+        nodes_to_remove = []
+        for node in self.model.nodes():
+            if node.op_type == "Reshape":
+                input_shape = shape_infer.get_edge_shape(node.input[0])
+                output_shape = shape_infer.get_edge_shape(node.output[0])
+                if input_shape and output_shape and input_shape == output_shape:
+                    logger.info(
+                        f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}"
+                    )
+                    nodes_to_remove.append(node)
+
+        if nodes_to_remove:
+            graph_input_names = set(self.model.get_graphs_input_names())
+            graph_output_names = set(self.model.get_graphs_output_names())
+            for node in nodes_to_remove:
+                if bool(set(node.output) & graph_output_names):
+                    if not bool(set(node.input) & graph_input_names):
+                        self.model.replace_output_of_all_nodes(node.input[0], node.output[0])
+                    else:
+                        continue
+                else:
+                    self.model.replace_input_of_all_nodes(node.output[0], node.input[0])
+                self.model.remove_node(node)
+
+
+class NumpyHelper:
+    @staticmethod
+    def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray:
+        # When weights are in external data format but not presented, we can still test the optimizer with two changes:
+        # (1) set fill_zeros = True  (2) change load_external_data=False in optimizer.py
+        if fill_zeros:
+            from onnx import mapping
+
+            return ndarray(
+                shape=tensor.dims,
+                dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type],
+            )
+
+        return numpy_helper.to_array(tensor)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py
new file mode 100755
index 000000000..1133877bf
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py
@@ -0,0 +1,306 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+import onnx
+import math
+
+logger = getLogger(__name__)
+
+class FusionVideoBertAttention(Fusion):
+    """
+    Fuse VideoBertAttention subgraph into one Attention node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        # The string passed after `model` is also the op type of the node built by
+        # create_attention_node. The list is forwarded to Fusion.__init__ unchanged
+        # (presumably the op types whose nodes are handed to fuse() -- confirm in fusion_base).
+        super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"])
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(self, atten_matmul: NodeProto, div: NodeProto) -> Tuple[int, int]:
+        """Detect num_heads and hidden_size from a reshape node.
+
+        Args:
+            reshape_q (NodeProto): reshape node for Q
+
+        Returns:
+            Tuple[int, int]: num_heads and hidden_size
+        """
+
+        # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
+        atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
+        div_initializer = self.model.get_initializer(div.input[1])
+        
+        # 检查float_data是否为空
+        if len(div_initializer.float_data) > 0:
+            div_value = div_initializer.float_data[0]
+        else:
+            # 如果float_data为空，尝试其他方式获取数据
+            # 例如，如果数据存储在raw_data中
+            if len(div_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
+                div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the div_initializer")
+            
+        atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape
+        head_dim = math.ceil(div_value*div_value)
+        hidden_size = atten_matul_shape_value[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size 
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        matmul_qk_add: NodeProto
+    ) -> Union[NodeProto, None]:
+        """Create an Attention node.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input (str): input name
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
+            return None
+
+        attention_node_name = self.model.create_node_name("Attention")
+        
+        qk_bias = None
+        has_mask = 0
+        has_qk_bias = 0
+        if matmul_qk_add is not None:
+            has_qk_bias = 1
+            qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
+            qk_bias_arr = NumpyHelper.to_array(qk_bias)
+            if len(qk_bias_arr.shape) == 3:
+                qk_bias_arr = qk_bias_arr.squeeze(0)
+            has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
+            if np.any(has_neg_inf):
+                qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(np.float32)
+            qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))
+        
+        attention_inputs = [
+            input
+        ]
+        
+        if qk_bias is not None:
+            has_mask = 1
+            attention_inputs.append(qk_bias.name)
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)])
+        attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend([helper.make_attribute("has_qk_bias", has_qk_bias)])
+        
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
+        # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
+        start_node = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                start_node = add_before_layernorm
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
+        qkv_paths = {
+            "path1" : (["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [0, None, 0, 0, 0]),
+            "path2" : (["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0]),
+        }
+
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+
+        if qkv_path in ['path1', 'path2']:
+            (_, atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
+
+        other_inputs = []
+        for i, input in enumerate(start_node.input):
+            if input not in output_name_to_node:
+                continue
+
+            if input == qkv_nodes[0].output[0]:
+                continue
+            other_inputs.append(input)
+        if len(other_inputs) != 1:
+            return
+
+        root_input = other_inputs[0]
+        """
+        Match videobert              
+        transpose/Add --> LayerNormalization -->  Attention -->     Add --> LayerNormalization
+         |                                                        |
+         |                                                        |
+         +---------------------------------------------------------
+        """
+        transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0)
+        if transpose_before_layernorm is not None:
+            node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == 'LayerNormalization':
+                    root_input = child.output[0]
+
+        add_before_layernorm = self.model.match_parent(start_node, "Add", None)
+        if add_before_layernorm is not None:
+            node_children = input_name_to_nodes[add_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == 'LayerNormalization':
+                    root_input = child.output[0]
+
+        v_paths = {
+            "path1" : (["Transpose", "Reshape", "Slice", "Add", "MatMul"], [1, 0, 0, 0, None]) # videobert
+        }
+
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+        if v_path == 'path1':
+            (_, _, _, add_in_qkv, matmul_in_qkv) = v_nodes
+
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+        
+        qk_paths = {
+            "path1": (["Softmax", "MatMul"], [0, 0]),
+            "path2": (["Softmax", "Add", "MatMul"], [0, 0, None])
+        }
+
+        qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
+        
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+        
+        matmul_qk_add = None
+        if qk_path == "path1":
+            (_, matmul_qk) = qk_nodes
+        else:
+            (_, matmul_qk_add, matmul_qk) = qk_nodes
+
+        q_paths = {
+            "path1" : (["Transpose", "Reshape", "Slice"], [0, 0, 0]),
+            "path2" : (["Div", "Transpose", "Reshape", "Slice"], [0, 0, 0, 0])
+        }
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        
+        if q_path == 'path1':
+            (_, _, slice_q) = q_nodes
+        else:
+            (div, _, _, slice_q) = q_nodes
+
+        k_paths = {
+            "path1" : (["Transpose", "Reshape", "Slice"], [1, 0, 0]),
+            "path2" : (["Div", "Transpose", "Reshape", "Slice"], [1, 0, 0, 0])
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
+
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        
+        if k_path == 'path1':
+            (_, _, slice_k) = k_nodes
+        else:
+            (div, _, _, slice_k) = k_nodes
+        
+        if matmul_in_qkv.input[0] == root_input and slice_q.input[0] == add_in_qkv.output[0] and slice_k.input[0] == add_in_qkv.output[0]:
+            attention_last_node = reshape_qkv
+
+            num_heads, hidden_size = self.get_num_heads_and_hidden_size(atten_matmul, div)
+            
+            new_node = self.create_attention_node(
+                num_heads,
+                hidden_size,
+                add_in_qkv.output[0],
+                attention_last_node.output[0],
+                matmul_qk_add
+            )
+            if new_node is None:
+                return
+
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+            self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes)
+            self.nodes_to_remove.extend(k_nodes)
+            self.nodes_to_remove.extend(v_nodes[:-2])
+            
+            # fuse head and tail transpose
+            if transpose_before_layernorm is not None:
+                node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
+                for child in node_children:
+                    for i, input in enumerate(child.input):
+                        if child.input[i] == transpose_before_layernorm.output[0]:
+                            child.input[i] = transpose_before_layernorm.input[0]
+                self.nodes_to_remove.extend([transpose_before_layernorm])
+                
+                node = transpose_before_layernorm
+                while True:
+                    found = False
+                    node_children = input_name_to_nodes[node.output[0]]
+                    for child in node_children:
+                        if child is not None and child.op_type in ['SkipLayerNorm', "Add"]:
+                            node = child
+                            found = True
+                            break
+                    if not found:
+                        break
+                node_children = input_name_to_nodes[node.output[0]]
+                if len(node_children) == 1 and node_children[0].op_type == 'Transpose':
+                    transpose_node = node_children[0]
+                    transpose_children = input_name_to_nodes[transpose_node.output[0]]
+                    for i, input in enumerate(transpose_children[0].input):
+                        if transpose_children[0].input[i] == transpose_node.output[0]:
+                            transpose_children[0].input[i] = transpose_node.input[0]
+                    self.nodes_to_remove.extend([transpose_node])
+            # Use prune graph to remove mask nodes since they are shared by all attention nodes.
+            # self.nodes_to_remove.extend(mask_nodes)
+            # self.prune_graph = True
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py
new file mode 100755
index 000000000..85d9cb2d8
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py
@@ -0,0 +1,83 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Tuple, Union
+
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionXSoftmax(Fusion):
+    """
+    Fuse Where + Softmax + Where into one node: XSoftmax
+    """
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "XSoftmax_IxRT", "MatMul")
+
+    def create_xsoftmax_node(
+        self, data_input: str, mask_input: str, output: str
+    ) -> Union[NodeProto, None]:
+        """Create an XSoftmax node.
+
+        Args:
+            data_input (str): data input name
+            mask_input (str): max input name
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        xsoftmax_node_name = self.model.create_node_name("XSoftmax")
+
+        xsoftmax_node = helper.make_node(
+            "XSoftmax_IxRT",
+            inputs=[data_input, mask_input],
+            outputs=[output],
+            name=xsoftmax_node_name,
+        )
+        xsoftmax_node.domain = "com.iluvatar"
+        xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)])
+
+        return xsoftmax_node
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+
+        xsoftmax_paths = {
+            "path": (["Where", "Softmax", "Where", "Add"], [None, None, None, None]),
+        }
+        xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict(
+            node, xsoftmax_paths
+        )
+
+        if xsoftmax_nodes is None:
+            logger.debug("fuse_xsoftmax: failed to match xsoftmax path")
+            return
+        else:
+            (tail_where, softmax, head_where, add) = xsoftmax_nodes
+            where_inputs = [i for i in tail_where.input if i in head_where.input]
+            assert len(where_inputs) == 1
+            mask_input = where_inputs[0]
+            data_input = add.output[0]
+            data_output = tail_where.output[0]
+
+            xsoftmax_node = self.create_xsoftmax_node(
+                data_input, mask_input, data_output
+            )
+
+            self.nodes_to_add.append(xsoftmax_node)
+            self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name
+            self.nodes_to_remove.append(tail_where)
+            self.nodes_to_remove.append(softmax)
+            self.nodes_to_remove.append(head_where)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py
new file mode 100755
index 000000000..ba66693c9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py
@@ -0,0 +1,131 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import List, Tuple, Union
+
+import numpy as np
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+def get_tensor_attr(attrs, attr_name):
+    result = None
+    for i in attrs:
+        if i.name == attr_name:
+            return numpy_helper.to_array(i.t)
+    return result
+
+
+class FusionYoloV5Decoder(Fusion):
+    """
+    Fuse SwinL subgraph into one Attention node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(model, "YoloV5Decoder", ["Reshape"])
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        short_path = ["Concat", "Slice", "Sigmoid", "Transpose", "Reshape"]
+        paths = [
+            (["Concat", "Unsqueeze", "Gather", "Shape"], [1] + [None] * 3),
+            (
+                ["Concat", "Mul", "Add", "Sub", "Mul", "Slice", "Sigmoid", "Transpose"],
+                [0, 0] + [None] * 6,
+            ),
+            (
+                ["Concat", "Mul", "Pow", "Mul", "Slice", "Sigmoid", "Transpose"],
+                [0, 1] + [None] * 5,
+            ),
+            (short_path, [None] * 5),
+            (short_path + ["Concat", "Unsqueeze", "Gather", "Shape"], [None] * 9),
+        ]
+        paths_found = []
+        nodes_names_found = set()
+        nodes_found = []
+        for path_i in paths:
+            nodes = self.model.match_parent_path(normalize_node, path_i[0], path_i[1])
+            paths_found.append(nodes)
+            if nodes:
+                for n in nodes:
+                    if n.name not in nodes_names_found:
+                        nodes_names_found.add(n.name)
+                        nodes_found.append(n)
+        if not all(paths_found):
+            return
+        shape_node = paths_found[-1][-1]
+        params = self._find_yolov5_decoder_params(paths_found)
+        self._fuse_node(
+            inputs=shape_node.input, outputs=normalize_node.output, params=params
+        )
+        self.nodes_to_remove.extend(nodes_found)
+        self._delete_extra_output_edges(paths_found)
+        self.prune_graph = True
+
+    def _fuse_node(self, inputs, outputs, params):
+        fused_node = helper.make_node(
+            "YoloV5Decoder",
+            inputs=inputs,
+            outputs=outputs,
+            name=self.model.create_node_name("YoloV5Decoder"),
+        )
+        fused_node.attribute.extend(params)
+        self.nodes_to_add.append(fused_node)
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+
+    def _delete_extra_output_edges(self, paths_found):
+        transpose_node = paths_found[2][-1]
+        assert transpose_node.op_type == "Transpose"
+        out_edge = transpose_node.output[0]
+        for item in self.model.graph().output:
+            if item.name == out_edge:
+                self.model.graph().output.remove(item)
+                logger.warning(f"Output: {out_edge} is useless in graph, delete it")
+                return
+
+    def _find_yolov5_decoder_params(self, paths_found):
+        # num_class
+        concat_op = paths_found[0][0]
+        assert concat_op.op_type == "Concat"
+        num_class_arr = self.model.get_initializer(concat_op.input[2], True)
+        assert num_class_arr
+        num_class = (num_class_arr - 5).tolist()[0]
+        num_class = helper.make_attribute("num_class", num_class)
+
+        # stride
+        mul_op = paths_found[1][1]
+        assert mul_op.op_type == "Mul"
+        input_arrs = self.model.get_initializer_input_edges(mul_op.name, True)
+        assert len(input_arrs) == 1
+        stride = input_arrs[0].tolist()
+        stride = helper.make_attribute("stride", stride)
+
+        # anchor
+        mul_op = paths_found[2][1]
+        assert mul_op.op_type == "Mul"
+        anchor = self.model.get_initializer_input_edges(mul_op.name, True)
+        assert len(anchor) == 1
+        anchor = anchor[0]
+        anchor = anchor[0, :, 0, 0, :] if len(anchor.shape) == 5 else anchor[:, 0, 0, :]
+        anchor = helper.make_attribute("anchor", list(anchor.flatten()))
+
+        # fast_impl
+        fast_impl = helper.make_attribute("faster_impl", 1)
+
+        return [num_class, stride, anchor, fast_impl]
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py
new file mode 100755
index 000000000..b176058c9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py
@@ -0,0 +1,1166 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import logging
+import os
+import sys
+from collections import deque
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+from onnx import (
+    AttributeProto,
+    GraphProto,
+    ModelProto,
+    NodeProto,
+    TensorProto,
+    helper,
+    numpy_helper,
+    save_model,
+)
+
+from .float16 import convert_float_to_float16
+from .shape_infer_helper import SymbolicShapeInferenceHelper
+
+logger = logging.getLogger(__name__)
+
+
+class OnnxModel:
+    def __init__(self, model):
+        """Wrap ``model`` and start with an empty initializer-visited record.
+
+        Args:
+            model: the model object handed to ``initialize``.
+        """
+        self.initialize(model)
+        visited: Dict[str, bool] = {}
+        self.initializer_visited = visited
+
+    def initialize(self, model):
+        """Bind the wrapper to ``model`` and reset all state derived from it.
+
+        Args:
+            model: the model object to wrap.
+        """
+        self.model: ModelProto = model
+        # Filled lazily by graphs(); None means "not computed yet".
+        self.all_graphs: Optional[List[GraphProto]] = None
+        self.enable_shape_infer: bool = True
+        self.shape_infer_helper: SymbolicShapeInferenceHelper = None
+        # key is node name prefix, value is the last suffix generated
+        self._node_name_suffix: Dict[str, int] = {}
+
+    def disable_shape_inference(self):
+        """Switch shape inference off; ``infer_runtime_shape`` then returns None."""
+        self.enable_shape_infer = False
+
+    def infer_runtime_shape(self, dynamic_axis_mapping=None, update=False):
+        """Run symbolic shape inference and return the helper holding the result.
+
+        Args:
+            dynamic_axis_mapping (dict, optional): passed to the helper's ``infer``.
+                Defaults to an empty dict.
+            update (bool): when True, a new helper is created for the current model
+                even if one already exists.
+
+        Returns:
+            The shape inference helper when inference succeeded, otherwise None
+            (inference disabled, reported as unsuccessful, or raised).
+        """
+        if not self.enable_shape_infer:
+            return None
+
+        # A fresh dict per call instead of one mutable default shared by all calls.
+        if dynamic_axis_mapping is None:
+            dynamic_axis_mapping = {}
+
+        if self.shape_infer_helper is None or update:
+            self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model)
+
+        try:
+            if self.shape_infer_helper.infer(dynamic_axis_mapping):
+                return self.shape_infer_helper
+        except Exception:
+            # Catch Exception rather than everything so that KeyboardInterrupt and
+            # SystemExit still propagate.
+            # disable shape inference to suppress same error message.
+            self.enable_shape_infer = False
+            print("failed in shape inference", sys.exc_info()[0])
+
+        return None
+
+    def input_name_to_nodes(self):
+        """Map every input name to the list of nodes consuming it.
+
+        A node that lists the same name twice appears twice in that name's list.
+        """
+        consumers = {}
+        for node in self.nodes():
+            for name in node.input:
+                consumers.setdefault(name, []).append(node)
+        return consumers
+
+    def output_name_to_node(self):
+        """Map every output name to the node producing it (the last one wins)."""
+        return {name: node for node in self.nodes() for name in node.output}
+
+    def nodes(self):
+        """Return a new list with the nodes of all graphs, in ``graphs()`` order."""
+        return [node for graph in self.graphs() for node in graph.node]
+
+    def graph(self):
+        """Return the top-level graph of the wrapped model."""
+        top_level = self.model.graph
+        return top_level
+
+    def graphs(self):
+        """Return the main graph followed by all nested subgraphs, breadth-first.
+
+        Subgraphs are discovered through node attributes of type GRAPH and GRAPHS.
+        The list is computed on first use and cached in ``self.all_graphs``; it is
+        returned as-is on later calls until ``initialize`` resets it.
+        """
+        if self.all_graphs is not None:
+            return self.all_graphs
+
+        # Collect into a local list and publish it only once the walk has finished,
+        # so a failing assertion cannot leave a partial list in the cache.
+        found = []
+        pending = deque([self.model.graph])
+        while pending:
+            graph = pending.popleft()  # O(1), unlike list.pop(0)
+            found.append(graph)
+            for node in graph.node:
+                for attr in node.attribute:
+                    if attr.type == AttributeProto.AttributeType.GRAPH:
+                        assert isinstance(attr.g, GraphProto)
+                        pending.append(attr.g)
+                    elif attr.type == AttributeProto.AttributeType.GRAPHS:
+                        for g in attr.graphs:
+                            assert isinstance(g, GraphProto)
+                            pending.append(g)
+
+        self.all_graphs = found
+        return found
+
+    def get_graphs_input_names(self):
+        """Return the input names of all graphs (main graph and subgraphs)."""
+        return [
+            graph_input.name
+            for graph in self.graphs()
+            for graph_input in graph.input
+        ]
+
+    def get_graphs_output_names(self):
+        """Return the output names of all graphs (main graph and subgraphs)."""
+        return [
+            graph_output.name
+            for graph in self.graphs()
+            for graph_output in graph.output
+        ]
+
+    def get_graph_by_node(self, node):
+        """Return the first graph whose node list contains ``node``, or None."""
+        return next((graph for graph in self.graphs() if node in graph.node), None)
+
+    def get_graph_by_name(self, graph_name):
+        """Return the first graph named ``graph_name``, or None when none matches."""
+        matches = (graph for graph in self.graphs() if graph_name == graph.name)
+        return next(matches, None)
+
+    def get_topological_insert_id(self, graph, outputs):
+        """Return the index of the first node in ``graph`` consuming one of ``outputs``.
+
+        When no node consumes any of them, the node count is returned, i.e. the
+        position just past the end.
+        """
+        for position, node in enumerate(graph.node):
+            if any(name in outputs for name in node.input):
+                return position
+        return len(graph.node)
+
+    def remove_node(self, node):
+        """Remove ``node`` from every graph that contains it (one removal per graph)."""
+        for graph in self.graphs():
+            members = graph.node
+            if node not in members:
+                continue
+            members.remove(node)
+
+    def remove_nodes(self, nodes_to_remove):
+        """Remove each node in ``nodes_to_remove`` via ``remove_node``."""
+        for doomed in nodes_to_remove:
+            self.remove_node(doomed)
+
+    def add_node(self, node, graph_name=None):
+        """Add ``node`` to the main graph or to the graph named ``graph_name``.
+
+        In the main graph the node is appended. In any other graph it is inserted
+        before the first node that consumes one of its outputs.
+        """
+        main_graph = self.model.graph
+        if graph_name is not None and graph_name != main_graph.name:
+            target = self.get_graph_by_name(graph_name)
+            position = self.get_topological_insert_id(target, node.output)
+            target.node.insert(position, node)
+            return
+        main_graph.node.extend([node])
+
+    def add_nodes(self, nodes_to_add, node_name_to_graph_name=None):
+        """Add several nodes.
+
+        Without ``node_name_to_graph_name`` all nodes are appended to the main graph.
+        Otherwise each node goes, via ``add_node``, to the graph that the mapping
+        gives for the node's name.
+        """
+        if node_name_to_graph_name is None:
+            self.model.graph.node.extend(nodes_to_add)
+            return
+        for node in nodes_to_add:
+            self.add_node(node, node_name_to_graph_name[node.name])
+
+    def add_initializer(self, tensor, graph_name=None):
+        """Append ``tensor`` to the initializers of the main or the named graph."""
+        main_graph = self.model.graph
+        use_main = graph_name is None or graph_name == main_graph.name
+        target = main_graph if use_main else self.get_graph_by_name(graph_name)
+        target.initializer.extend([tensor])
+
+    def add_input(self, input, graph_name=None):
+        """Append ``input`` to the inputs of the main graph or of the named graph."""
+        main_graph = self.model.graph
+        use_main = graph_name is None or graph_name == main_graph.name
+        target = main_graph if use_main else self.get_graph_by_name(graph_name)
+        target.input.extend([input])
+
+    @staticmethod
+    def replace_node_input(node, old_input_name, new_input_name):
+        """Rename every input of ``node`` equal to ``old_input_name`` in place."""
+        assert isinstance(old_input_name, str)
+        assert isinstance(new_input_name, str)
+        for position, name in enumerate(node.input):
+            if name == old_input_name:
+                node.input[position] = new_input_name
+
+    def replace_input_of_all_nodes(self, old_input_name, new_input_name):
+        """Rename an input on every node of the main graph (subgraphs untouched)."""
+        main_graph_nodes = self.model.graph.node
+        for node in main_graph_nodes:
+            OnnxModel.replace_node_input(node, old_input_name, new_input_name)
+
+    @staticmethod
+    def replace_node_output(node, old_output_name, new_output_name):
+        """Rename every output of ``node`` equal to ``old_output_name`` in place."""
+        assert isinstance(old_output_name, str)
+        assert isinstance(new_output_name, str)
+        for position, name in enumerate(node.output):
+            if name == old_output_name:
+                node.output[position] = new_output_name
+
+    def replace_output_of_all_nodes(self, old_output_name, new_output_name):
+        """Rename an output on every node of the main graph (subgraphs untouched)."""
+        main_graph_nodes = self.model.graph.node
+        for node in main_graph_nodes:
+            OnnxModel.replace_node_output(node, old_output_name, new_output_name)
+
+    def get_initializer(self, name, return_np_array=False):
+        """Look up an initializer by name across all graphs.
+
+        Args:
+            name: initializer name to search for.
+            return_np_array (bool): convert the tensor with ``numpy_helper.to_array``
+                before returning it.
+
+        Returns:
+            The first matching tensor (or its array form), or None when not found.
+        """
+        for graph in self.graphs():
+            for tensor in graph.initializer:
+                if tensor.name != name:
+                    continue
+                if return_np_array:
+                    return numpy_helper.to_array(tensor)
+                return tensor
+        return None
+
+    def get_node(self, op_name):
+        """Return the first node, over all graphs, named ``op_name``; else None."""
+        candidates = (
+            n for graph in self.graphs() for n in graph.node if n.name == op_name
+        )
+        return next(candidates, None)
+
+    def get_initializer_input_edges(self, op_name, return_np_array=False):
+        """Return the initializers feeding the node named ``op_name``.
+
+        Args:
+            op_name: name of the node; it must exist (asserted).
+            return_np_array (bool): convert each tensor with
+                ``numpy_helper.to_array``.
+
+        Returns:
+            list: one entry per node input that is an initializer, in input order.
+        """
+        by_name = {}
+        for graph in self.graphs():
+            for tensor in graph.initializer:
+                by_name[tensor.name] = tensor
+
+        node = self.get_node(op_name)
+        assert node
+
+        found = [by_name[name] for name in node.input if name in by_name]
+        if return_np_array:
+            return [numpy_helper.to_array(tensor) for tensor in found]
+        return found
+
+    def get_nodes_by_op_type(self, op_type):
+        """Return all nodes (over all graphs) whose ``op_type`` equals ``op_type``."""
+        return [node for node in self.nodes() if node.op_type == op_type]
+
+    def get_children(self, node, input_name_to_nodes=None):
+        """Return the nodes consuming any output of ``node``.
+
+        Args:
+            node: the producer node.
+            input_name_to_nodes (dict, optional): input name -> consumer nodes;
+                computed from the model when omitted.
+        """
+        if input_name_to_nodes is None:
+            input_name_to_nodes = self.input_name_to_nodes()
+
+        return [
+            child
+            for edge in node.output
+            if edge in input_name_to_nodes
+            for child in input_name_to_nodes[edge]
+        ]
+
+    def get_parents(self, node, output_name_to_node=None):
+        """Return the producer node of each input of ``node`` that has one.
+
+        Args:
+            node: the consumer node.
+            output_name_to_node (dict, optional): output name -> producer node;
+                computed from the model when omitted.
+        """
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        return [
+            output_name_to_node[name]
+            for name in node.input
+            if name in output_name_to_node
+        ]
+
+    def get_parent(self, node, i, output_name_to_node=None):
+        """Return the node producing input ``i`` of ``node``.
+
+        Returns None when ``i`` is past the last input or when that input is not
+        produced by any node.
+        """
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        if i >= len(node.input):
+            return None
+
+        edge = node.input[i]
+        return output_name_to_node[edge] if edge in output_name_to_node else None
+
+    def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]):
+        """
+        Find the first parent of ``node`` that has the requested op_type.
+
+        Args:
+            node: the node whose inputs are scanned, in input order.
+            parent_op_type (str): required op_type of the parent.
+            output_name_to_node (dict): output name -> producer node.
+            exclude (list): nodes that must not be returned as the parent.
+
+        Returns:
+            parent: The matched parent node. None if not found.
+            index: The input index of matched parent node. None if not found.
+        """
+        for index, input_name in enumerate(node.input):
+            if input_name not in output_name_to_node:
+                continue
+            parent = output_name_to_node[input_name]
+            if parent.op_type != parent_op_type or parent in exclude:
+                logger.debug(
+                    f"To find first {parent_op_type}, current {parent.op_type}"
+                )
+                continue
+            return parent, index
+        return None, None
+
+    def match_parent(
+        self,
+        node,
+        parent_op_type,
+        input_index=None,
+        output_name_to_node=None,
+        exclude=[],
+        return_indice=None,
+    ):
+        """
+        Find a parent of ``node`` with the requested op_type, optionally at a fixed input.
+        When input_index is None, the first matching parent is used and the input index
+        it was found at (None when nothing matched) is appended to return_indice.
+
+        Args:
+            node: the node whose parent is looked up. Must not be None.
+            parent_op_type (str): required op_type of the parent.
+            input_index (int or None): only look at this input of ``node``.
+            output_name_to_node (dict): output name -> producer node; computed when omitted.
+            exclude (list): nodes that must not be returned as the parent.
+            return_indice (list): receives the input index when input_index is None.
+
+        Returns:
+            parent: The matched parent node, or None.
+        """
+        assert node is not None
+        assert input_index is None or input_index >= 0
+
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        # No index constraint: take the first parent that fits.
+        if input_index is None:
+            parent, index = self.match_first_parent(
+                node, parent_op_type, output_name_to_node, exclude
+            )
+            if return_indice is not None:
+                return_indice.append(index)
+            return parent
+
+        if input_index >= len(node.input):
+            logger.debug(f"input_index {input_index} >= node inputs {len(node.input)}")
+            return None
+
+        parent = self.get_parent(node, input_index, output_name_to_node)
+        if parent is None:
+            return None
+
+        if parent.op_type == parent_op_type and parent not in exclude:
+            return parent
+
+        logger.debug(f"Expect {parent_op_type}, Got {parent.op_type}")
+        return None
+
+    def match_parent_paths(self, node, paths, output_name_to_node):
+        """Try several candidate parent paths and report the first that matches.
+
+        Args:
+            node: the node the paths start from.
+            paths: sequence of ``(parent_op_types, parent_input_index)`` pairs, each
+                a list or tuple.
+            output_name_to_node (dict): output name -> producer node.
+
+        Returns:
+            ``(index, matched_nodes, return_indice)`` for the first matching path,
+            or ``(-1, None, None)`` when none matches.
+        """
+        for path_index, path in enumerate(paths):
+            assert isinstance(path, (list, tuple))
+            indices = []
+            op_types, input_indices = path[0], path[1]
+            matched = self.match_parent_path(
+                node, op_types, input_indices, output_name_to_node, indices
+            )
+            if matched:
+                return path_index, matched, indices
+        return -1, None, None
+
+    def match_parent_path(
+        self,
+        node,
+        parent_op_types,
+        parent_input_index,
+        output_name_to_node=None,
+        return_indice=None,
+    ):
+        """
+        Walk up from ``node`` through a chain of parents with the given op types.
+        Step ``i`` looks for a parent of op type ``parent_op_types[i]`` at input
+        ``parent_input_index[i]`` of the node matched in the previous step. Where
+        the index is None, the first matching parent is taken and its input index
+        is appended to return_indice.
+
+        Args:
+            node: the node the walk starts from.
+            parent_op_types (list): required op_type for each step.
+            parent_input_index (list): required input index for each step. None means no constraint.
+            output_name_to_node (dict): output name -> producer node; computed when omitted.
+            return_indice (list): receives input indices of unconstrained steps.
+
+        Returns:
+            parents: list of matched parent nodes, or None as soon as a step fails.
+        """
+        assert len(parent_input_index) == len(parent_op_types)
+
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        chain = []
+        cursor = node
+        for i, op_type in enumerate(parent_op_types):
+            step = self.match_parent(
+                cursor,
+                op_type,
+                parent_input_index[i],
+                output_name_to_node,
+                exclude=[],
+                return_indice=return_indice,
+            )
+            if step is None:
+                logger.debug(
+                    f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}",
+                    stack_info=True,
+                )
+                return None
+
+            chain.append(step)
+            cursor = step
+
+        return chain
+
+    def find_first_child_by_type(
+        self, node, child_type, input_name_to_nodes=None, recursive=True
+    ):
+        """Breadth-first search below ``node`` for a node of op type ``child_type``.
+
+        Args:
+            node: the node whose descendants are searched.
+            child_type (str): op_type to look for.
+            input_name_to_nodes (dict, optional): input name -> consumer nodes;
+                computed once from the model when omitted.
+            recursive (bool): when False only direct children are examined.
+
+        Returns:
+            The first matching node found, or None.
+        """
+        # Build the consumer map a single time; otherwise get_children would
+        # rebuild it for every node visited during the search.
+        if input_name_to_nodes is None:
+            input_name_to_nodes = self.input_name_to_nodes()
+
+        dq = deque(self.get_children(node, input_name_to_nodes))
+        while dq:
+            current_node = dq.pop()
+            if current_node.op_type == child_type:
+                return current_node
+
+            if recursive:
+                # extendleft pushes one by one to the left: same order as before.
+                dq.extendleft(self.get_children(current_node, input_name_to_nodes))
+
+        return None
+
+    def find_first_parent_by_type(
+        self, node, parent_type, output_name_to_node=None, recursive=True
+    ):
+        """Breadth-first search above ``node`` for a node of op type ``parent_type``.
+
+        Args:
+            node: the node whose ancestors are searched.
+            parent_type (str): op_type to look for.
+            output_name_to_node (dict, optional): output name -> producer node;
+                computed from the model when omitted.
+            recursive (bool): when False only direct parents are examined.
+
+        Returns:
+            The first matching node found, or None.
+        """
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        pending = deque(self.get_parents(node, output_name_to_node))
+        while pending:
+            candidate = pending.pop()
+            if candidate.op_type == parent_type:
+                return candidate
+            if not recursive:
+                continue
+            for grandparent in self.get_parents(candidate, output_name_to_node):
+                pending.appendleft(grandparent)
+
+        return None
+
+    def get_constant_value(self, output_name):
+        """Return the array behind ``output_name`` if it is a constant, else None.
+
+        The name is looked up first as the output of a Constant node (its ``value``
+        attribute), then as an initializer.
+        """
+        for node in self.get_nodes_by_op_type("Constant"):
+            if node.output[0] != output_name:
+                continue
+            for att in node.attribute:
+                if att.name == "value":
+                    return numpy_helper.to_array(att.t)
+
+        # Fall back to initializer since constant folding might have been applied.
+        initializer = self.get_initializer(output_name)
+        if initializer is None:
+            return None
+        return numpy_helper.to_array(initializer)
+
+    def get_constant_input(self, node):
+        """Return ``(index, value)`` of the first constant input of ``node``.
+
+        Returns ``(None, None)`` when no input has a constant value.
+        """
+        for position, name in enumerate(node.input):
+            value = self.get_constant_value(name)
+            if value is None:
+                continue
+            return position, value
+
+        return None, None
+
+    def find_constant_input(self, node, expected_value, delta=0.000001):
+        """Return the index of the first constant input if it is a scalar near a value.
+
+        Only the first constant input of ``node`` is considered. It matches when it
+        holds exactly one element that differs from ``expected_value`` by less than
+        ``delta``.
+
+        Returns:
+            The input index on a match, otherwise -1.
+        """
+        i, value = self.get_constant_input(node)
+        if value is None or value.size != 1:
+            return -1
+        return i if abs(value - expected_value) < delta else -1
+
+    def is_constant_with_specified_dimension(
+        self, output_name, dimensions, description
+    ):
+        """Check that ``output_name`` is a constant with exactly ``dimensions`` axes.
+
+        Args:
+            output_name: name of the tensor to check.
+            dimensions (int): required number of dimensions.
+            description: text prefixed to the debug message when the check fails.
+
+        Returns:
+            bool: True when the constant exists and has the required rank.
+        """
+        value = self.get_constant_value(output_name)
+        if value is None:
+            logger.debug(f"{description} {output_name} is not initializer.")
+            return False
+
+        if len(value.shape) == dimensions:
+            return True
+
+        logger.debug(
+            f"{description} {output_name} shall have {dimensions} dimensions. Got shape {value.shape}"
+        )
+        return False
+
+    def has_constant_input(self, node, expected_value, delta=0.000001):
+        """Return True when ``find_constant_input`` finds a matching input."""
+        position = self.find_constant_input(node, expected_value, delta)
+        return position >= 0
+
+    def get_children_subgraph_nodes(
+        self, root_node, stop_nodes, input_name_to_nodes=None
+    ):
+        """Collect the nodes reachable downstream of ``root_node``'s first output.
+
+        Args:
+            root_node: the starting node; only its first output is followed.
+            stop_nodes: nodes at which the traversal stops. They are neither
+                returned nor expanded.
+            input_name_to_nodes (dict, optional): input name -> consumer nodes;
+                computed from the model when omitted.
+
+        Returns:
+            list: each reachable node once, in breadth-first discovery order. Empty
+            when the first output has no consumer.
+        """
+        if input_name_to_nodes is None:
+            input_name_to_nodes = self.input_name_to_nodes()
+
+        # An output nobody consumes (e.g. a graph output) has no entry in the map;
+        # treat that as "no children" instead of raising KeyError.
+        root_output = root_node.output[0]
+        if root_output not in input_name_to_nodes:
+            return []
+
+        unique_nodes = []
+
+        dq = deque(input_name_to_nodes[root_output])
+        while dq:
+            current_node = dq.pop()
+            if current_node in stop_nodes or current_node in unique_nodes:
+                continue
+
+            unique_nodes.append(current_node)
+            for output in current_node.output:
+                if output in input_name_to_nodes:
+                    dq.extendleft(input_name_to_nodes[output])
+
+        return unique_nodes
+
+    def tensor_shape_to_list(self, tensor_type):
+        """Convert the shape of ``tensor_type`` to a list with one entry per dimension.
+
+        A dimension contributes its ``dim_value`` when set, else its ``dim_param``
+        (symbolic name) when set, else the placeholder ``"?"``.
+        """
+
+        def describe(dim):
+            if dim.HasField("dim_value"):
+                return dim.dim_value  # known dimension
+            if dim.HasField("dim_param"):
+                return dim.dim_param  # unknown dimension with symbolic name
+            return "?"  # shall not happen
+
+        return [describe(dim) for dim in tensor_type.shape.dim]
+
+    def get_dtype(self, input_or_output: str):
+        """Return the element type recorded for a tensor name in the main graph.
+
+        The name is looked up in ``value_info`` first, then among the graph inputs,
+        then among the graph outputs. Returns None when it is found nowhere.
+        """
+        known_types = {}
+        for info in self.model.graph.value_info:
+            known_types[info.name] = info.type
+
+        if input_or_output in known_types:
+            return known_types[input_or_output].tensor_type.elem_type
+
+        for finder in (self.find_graph_input, self.find_graph_output):
+            found = finder(input_or_output)
+            if found:
+                return found.type.tensor_type.elem_type
+
+        return None
+
+    @staticmethod
+    def get_node_attribute(node: NodeProto, attribute_name: str):
+        """Return the value of the first attribute of ``node`` with the given name.
+
+        Returns None when the node has no attribute of that name.
+        """
+        named = (attr for attr in node.attribute if attr.name == attribute_name)
+        attr = next(named, None)
+        if attr is None:
+            return None
+        return helper.get_attribute_value(attr)
+
+    def remove_cascaded_cast_nodes(self):
+        """Bypass the first Cast of every ``--> Cast --> Cast -->`` pair.
+
+        Each Cast whose first input comes from another Cast is rewired to read that
+        Cast's input directly; ``prune_graph`` then drops nodes left unused.
+        Note that this shall be used carefully since it might introduce semantic change.
+        For example, float -> int -> float could get different value than the original float value.
+        So, it is recommended to be used only in post-processing of mixed precision conversion.
+        """
+        producers = self.output_name_to_node()
+        rewired = 0
+        for node in self.nodes():
+            if node.op_type != "Cast":
+                continue
+            upstream = self.get_parent(node, 0, output_name_to_node=producers)
+            if not upstream or upstream.op_type != "Cast":
+                continue
+            node.input[0] = upstream.input[0]
+            rewired += 1
+
+        if rewired > 0:
+            logger.info("Removed %d cascaded Cast nodes", rewired)
+            self.prune_graph()
+
+    def remove_useless_cast_nodes(self):
+        """Remove cast nodes that are not needed: input and output has same data type.
+
+        Data types come from ``get_dtype`` and, failing that, from the result of
+        symbolic shape inference. Nothing is done when shape inference fails.
+
+        A Cast whose output is a graph output is removed only when its input is not
+        a graph input and the Cast is the sole consumer of that input; the producer
+        is then renamed to emit the graph output directly. Any other Cast is
+        bypassed by pointing its consumers at its input.
+        """
+        shape_infer = self.infer_runtime_shape(update=True)
+        if shape_infer is None:
+            logger.info(
+                "Skip removing useless cast nodes since shape inference failed."
+            )
+            return
+
+        def get_data_type(input_or_output_name):
+            dtype = self.get_dtype(input_or_output_name)
+            if dtype:
+                return dtype
+            # Guard the lookup: a name that shape inference did not record must
+            # mean "unknown type" rather than KeyError.
+            # NOTE(review): assumes known_vi_ is a dict keyed by tensor name - confirm.
+            if input_or_output_name in shape_infer.known_vi_:
+                tensor_type = shape_infer.known_vi_[
+                    input_or_output_name
+                ].type.tensor_type
+                if tensor_type.HasField("elem_type"):
+                    return tensor_type.elem_type
+            return None
+
+        nodes_to_remove = []
+        for node in self.nodes():
+            if node.op_type == "Cast":
+                input_dtype = get_data_type(node.input[0])
+                output_dtype = get_data_type(node.output[0])
+                if input_dtype and input_dtype == output_dtype:
+                    nodes_to_remove.append(node)
+
+        if not nodes_to_remove:
+            return
+
+        graph_input_names = set(self.get_graphs_input_names())
+        graph_output_names = set(self.get_graphs_output_names())
+        removed_count = 0
+        for node in nodes_to_remove:
+            if set(node.output) & graph_output_names:
+                # Renaming the producer's output is only safe when nothing else
+                # reads the old name; recompute consumers since the graph changes.
+                consumers = self.input_name_to_nodes().get(node.input[0], [])
+                if set(node.input) & graph_input_names or len(consumers) != 1:
+                    continue
+                self.replace_output_of_all_nodes(node.input[0], node.output[0])
+            else:
+                self.replace_input_of_all_nodes(node.output[0], node.input[0])
+            self.remove_node(node)
+            removed_count += 1
+
+        # Report what was actually removed, not the number of candidates.
+        if removed_count:
+            logger.info(
+                "Removed %d Cast nodes with output type same as input",
+                removed_count,
+            )
+
+    def convert_model_float32_to_float16(self, cast_input_output=True):
+        """Deprecated wrapper around ``convert_float_to_float16``.
+
+        Logs a deprecation warning, then converts with symbolic shape inference,
+        passing ``cast_input_output`` as ``keep_io_types``.
+        """
+        logger.warning(
+            "The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!"
+        )
+        options = {"keep_io_types": cast_input_output}
+        self.convert_float_to_float16(use_symbolic_shape_infer=True, **options)
+
+    def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
+        """Convert a model to half (default) or mixed precision.
+           To use mixed precision, user need specify which graph inputs, outputs, operator type or list of nodes shall keep in float32.
+           By default, we use symbolic shape inference to get shape and type information. If not, ONNX shape inference will be used.
+           Note that symbolic/ONNX shape inference might fail, and the conversion might not proceed without shape and type information.
+           After conversion the wrapper is re-initialized with the new model, and cascaded and useless Cast nodes are removed.
+
+        Args:
+            use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. Defaults to True.
+            keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names.
+                                                              If True, model inputs/outputs should be left as float32.
+                                                              When omitted, this method sets it to True.
+            op_block_list (List[str], optional): List of operator types to leave as float32.
+            node_block_list (List[str], optional): List of node names to leave as float32.
+            force_fp16_initializers(bool): force converting all float initializers to float16.
+            min_positive_val (float, optional): minimal positive value.
+            max_finite_val (float, optional): maximal finite value.
+
+        The keyword arguments listed above are forwarded to the float16 converter only when given;
+        any other keyword argument is ignored.
+        """
+        kwargs.setdefault("keep_io_types", True)
+
+        model = self.model
+        if use_symbolic_shape_infer:
+            # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference.
+            shape_infer_helper = SymbolicShapeInferenceHelper(model)
+            model = shape_infer_helper.infer_shapes(
+                model, auto_merge=True, guess_output_rank=False
+            )
+
+        forwarded_keys = (
+            "keep_io_types",
+            "min_positive_val",
+            "max_finite_val",
+            "op_block_list",
+            "node_block_list",
+            "force_fp16_initializers",
+        )
+        parameters = {"disable_shape_infer": use_symbolic_shape_infer}
+        for key in forwarded_keys:
+            if key in kwargs:
+                parameters[key] = kwargs[key]
+
+        fp16_model = convert_float_to_float16(model, **parameters)
+        self.initialize(fp16_model)
+
+        self.remove_cascaded_cast_nodes()
+
+        self.remove_useless_cast_nodes()
+
+    def create_node_name(self, op_type, name_prefix=None):
+        """Create a unique node name of the form ``<prefix>_<number>``.
+           The prefix is ``name_prefix`` (an underscore is added unless it already ends
+           with one) or, when that is empty, the operator type.
+           The name will not be duplicated with any name that generated or existed in current graphs.
+        Args:
+            op_type (str): operator type
+            name_prefix (str, optional): prefix of node name. Defaults to None.
+
+        Returns:
+            str: node name
+        """
+        if not name_prefix:
+            prefix = op_type + "_"
+        elif name_prefix.endswith("_"):
+            prefix = name_prefix
+        else:
+            prefix = name_prefix + "_"
+
+        issued = self._node_name_suffix
+        if prefix in issued:
+            suffix = issued[prefix] + 1
+        else:
+            # Check existed node name only once for a prefix as we assume create_node_name is called for every new node in fusion.
+            suffix = 0
+            for node in self.nodes():
+                if not (node.name and node.name.startswith(prefix)):
+                    continue
+                try:
+                    index = int(node.name[len(prefix) :])
+                except ValueError:
+                    continue
+                suffix = max(index + 1, suffix)
+
+        # Record the generated suffix so that we can avoid generating duplicated name.
+        issued[prefix] = suffix
+
+        return prefix + str(suffix)
+
+    def find_graph_input(self, input_name):
+        """Return the main-graph input named ``input_name``, or None."""
+        graph_inputs = self.model.graph.input
+        return next((item for item in graph_inputs if item.name == input_name), None)
+
+    def find_graph_output(self, output_name):
+        """Return the main-graph output named ``output_name``, or None."""
+        graph_outputs = self.model.graph.output
+        return next((item for item in graph_outputs if item.name == output_name), None)
+
+    def get_parent_subgraph_nodes(self, node, stop_nodes, output_name_to_node=None):
+        """Collect the nodes that ``node`` depends on, directly or indirectly.
+
+        Args:
+            node: the node whose ancestors are collected (it is not included itself).
+            stop_nodes: nodes at which the traversal stops. They are neither
+                returned nor expanded.
+            output_name_to_node (dict, optional): output name -> producer node;
+                computed from the model when omitted.
+
+        Returns:
+            list: each ancestor once, in breadth-first discovery order.
+        """
+        if output_name_to_node is None:
+            output_name_to_node = self.output_name_to_node()
+
+        collected = []
+        pending = deque(self.get_parents(node, output_name_to_node))
+        while pending:
+            candidate = pending.pop()
+            if candidate in stop_nodes or candidate in collected:
+                continue
+
+            collected.append(candidate)
+            for name in candidate.input:
+                if name in output_name_to_node:
+                    pending.appendleft(output_name_to_node[name])
+
+        return collected
+
+    def get_graph_inputs(self, current_node, recursive=False):
+        """
+        Find graph inputs that linked to current node.
+
+        Args:
+            current_node: the node whose inputs are examined.
+            recursive (bool): also examine the inputs of all of its ancestors.
+
+        Returns:
+            list: names of main-graph inputs, without duplicates, in discovery order.
+        """
+        scanned = [current_node]
+        if recursive:
+            scanned.extend(self.get_parent_subgraph_nodes(current_node, []))
+
+        graph_inputs = []
+        for node in scanned:
+            for name in node.input:
+                if self.find_graph_input(name) and name not in graph_inputs:
+                    graph_inputs.append(name)
+        return graph_inputs
+
+    @staticmethod
+    def input_index(node_output, child_node):
+        """Return the position of ``node_output`` among ``child_node``'s inputs, or -1."""
+        for position, name in enumerate(child_node.input):
+            if name == node_output:
+                return position
+        return -1
+
+    def remove_unused_constant(self):
+        """Remove Constant nodes whose first output is used by nothing.
+
+        A Constant is kept when some node consumes its output or when the output is
+        itself a graph output.
+        """
+        input_name_to_nodes = self.input_name_to_nodes()
+        # A Constant may feed a graph output directly: no node consumes it, yet
+        # removing it would leave that output without a producer.
+        graph_output_names = set(self.get_graphs_output_names())
+
+        unused_nodes = [
+            node
+            for node in self.nodes()
+            if node.op_type == "Constant"
+            and node.output[0] not in input_name_to_nodes
+            and node.output[0] not in graph_output_names
+        ]
+
+        self.remove_nodes(unused_nodes)
+
+        if unused_nodes:
+            logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}")
+
+    def prune_graph(self, outputs=None):
+        """
+        Prune graph to keep only required outputs. It removes unnecessary inputs and nodes.
+        Nodes that are not linked (directly or indirectly) to any required output are removed.
+        Nothing is done when the model contains subgraphs.
+
+        Args:
+            outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept.
+        """
+        if len(self.graphs()) > 1:
+            logger.debug("Skip prune_graph since graph has subgraph")
+            return
+
+        main_graph = self.model.graph
+        if outputs is None:
+            outputs = [output.name for output in main_graph.output]
+
+        # Gather every node that a retained output depends on.
+        producers = self.output_name_to_node()
+        reachable = []
+        for name in outputs:
+            if name not in producers:
+                continue
+            last_node = producers[name]
+            if last_node in reachable:
+                continue
+            ancestors = self.get_parent_subgraph_nodes(last_node, [])
+            reachable.append(last_node)
+            reachable.extend(ancestors)
+
+        nodes_to_remove = [node for node in main_graph.node if node not in reachable]
+        self.remove_nodes(nodes_to_remove)
+
+        # remove outputs not in list
+        output_to_remove = [
+            output for output in main_graph.output if output.name not in outputs
+        ]
+        for output in output_to_remove:
+            main_graph.output.remove(output)
+
+        # remove inputs not used by any node.
+        consumers = self.input_name_to_nodes()
+        input_to_remove = [
+            graph_input
+            for graph_input in main_graph.input
+            if graph_input.name not in consumers
+        ]
+        for graph_input in input_to_remove:
+            main_graph.input.remove(graph_input)
+
+        if input_to_remove or output_to_remove or nodes_to_remove:
+            logger.info(
+                "Graph pruned: {} inputs, {} outputs and {} nodes are removed".format(
+                    len(input_to_remove), len(output_to_remove), len(nodes_to_remove)
+                )
+            )
+
+        self.update_graph()
+
+    def update_graph(self, verbose=False):
+        graph = self.model.graph
+        # Purpose: drop graph inputs, initializers and Constant nodes that nothing uses any more.
+        remaining_input_names = []
+        for node in graph.node:
+            if node.op_type in ["Loop", "Scan", "If"]:
+                # TODO: handle inner graph; until then the graph is left completely untouched
+                logger.debug(
+                    f"Skip update_graph since graph has operator: {node.op_type}"
+                )
+                return
+            if node.op_type != "Constant":
+                for input_name in node.input:
+                    if input_name not in remaining_input_names:
+                        remaining_input_names.append(input_name)
+        if verbose:
+            logger.debug(f"remaining input names: {remaining_input_names}")
+
+        # remove graph inputs that no non-Constant node consumes
+        inputs_to_remove = []
+        for input in graph.input:
+            if input.name not in remaining_input_names:
+                inputs_to_remove.append(input)
+        for input in inputs_to_remove:
+            graph.input.remove(input)
+
+        names_to_remove = [input.name for input in inputs_to_remove]
+        logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}")
+
+        # remove initializers that no node consumes and that find_graph_output() does not report
+        weights_to_remove = []
+        weights_to_keep = []
+        for initializer in graph.initializer:
+            if (
+                initializer.name not in remaining_input_names
+                and not self.find_graph_output(initializer.name)
+            ):
+                weights_to_remove.append(initializer)
+            else:
+                weights_to_keep.append(initializer.name)
+        for initializer in weights_to_remove:
+            graph.initializer.remove(initializer)
+
+        names_to_remove = [initializer.name for initializer in weights_to_remove]
+        logger.debug(
+            f"remove {len(weights_to_remove)} unused initializers: {names_to_remove}"
+        )
+        if verbose:
+            logger.debug(f"remaining initializers:{weights_to_keep}")
+        # finally drop Constant nodes whose output is no longer consumed
+        self.remove_unused_constant()
+
+    def is_safe_to_fuse_nodes(
+        self, nodes_to_remove, keep_outputs, input_name_to_nodes, output_name_to_node
+    ):
+        """Return False when an output of a removed node (outside keep_outputs) still feeds a node that stays."""
+        for candidate in nodes_to_remove:
+            # Outputs listed in keep_outputs are exempt from the check.
+            for output_to_remove in (name for name in candidate.output if name not in keep_outputs):
+                for impacted_node in input_name_to_nodes.get(output_to_remove, ()):
+                    if impacted_node in nodes_to_remove:
+                        continue
+                    # A consumer outside the fused set would lose its input.
+                    logger.debug(
+                        f"it is not safe to remove nodes since output {output_to_remove} is used by {impacted_node}"
+                    )
+                    return False
+        return True
+
+    @staticmethod
+    def graph_topological_sort(graph):  # reorders graph.node in place; RuntimeError if nodes are left over
+        deps_count = [0] * len(graph.node)  # dependency count of each node
+        deps_to_nodes = {}  # input name -> indices of the nodes consuming it
+        sorted_nodes = []  # initialize sorted_nodes
+        for node_idx, node in enumerate(graph.node):
+            # CANNOT use len(node.input) directly because input can be optional
+            deps_count[node_idx] = sum(1 for _ in node.input if _)
+            if deps_count[node_idx] == 0:  # Constant doesn't depend on any inputs
+                sorted_nodes.append(graph.node[node_idx])
+                continue
+
+            for input_name in node.input:
+                if input_name not in deps_to_nodes:
+                    deps_to_nodes[input_name] = [node_idx]
+                else:
+                    deps_to_nodes[input_name].append(node_idx)
+
+        # Note: this logic only applies to top level graph since a sub graph could use initializer from parent graph
+        initializer_names = [init.name for init in graph.initializer]
+        graph_input_names = [input.name for input in graph.input]
+        input_names = initializer_names + graph_input_names
+        input_names.sort()
+        prev_input_name = None
+        for input_name in input_names:
+            if prev_input_name == input_name:  # sorted list, so duplicates are adjacent: count each name once
+                continue
+
+            prev_input_name = input_name
+            if input_name in deps_to_nodes:
+                for node_idx in deps_to_nodes[input_name]:
+                    deps_count[node_idx] = deps_count[node_idx] - 1
+                    if deps_count[node_idx] == 0:
+                        sorted_nodes.append(graph.node[node_idx])
+        # Walk sorted_nodes like a queue: each placed node releases the consumers of its outputs.
+        start = 0
+        end = len(sorted_nodes)
+
+        while start < end:
+            for output in sorted_nodes[start].output:
+                if output in deps_to_nodes:
+                    for node_idx in deps_to_nodes[output]:
+                        deps_count[node_idx] = deps_count[node_idx] - 1
+                        if deps_count[node_idx] == 0:
+                            sorted_nodes.append(graph.node[node_idx])
+                            end = end + 1
+            start = start + 1
+        # Nodes never released still have unmet dependencies (a cycle, or an input nothing provides).
+        if end != len(graph.node):
+            raise RuntimeError(
+                f"Graph is not a DAG: end={end}, len(graph.node)={len(graph.node)}, graph.node[end]={graph.node[end]}"
+            )
+
+        graph.ClearField("node")
+        graph.node.extend(sorted_nodes)
+
+    def topological_sort(self):
+        # Sorts the nodes of the main graph only; the reordering is done in place.
+        # TODO: support graph_topological_sort() in subgraphs, i.e. call it for
+        # every graph returned by self.graphs() instead of just self.model.graph.
+        OnnxModel.graph_topological_sort(self.model.graph)
+
+    @staticmethod
+    def save(
+        model,
+        output_path,
+        save_as_external_data=False,
+        all_tensors_to_one_file=True,
+        size_threshold=1024,
+        convert_attribute=False,
+    ):
+        """Write model to output_path, optionally storing tensors as external data.
+
+        With save_as_external_data, an existing output_path is deleted first. For a single
+        data file (output_path + ".data") a stale copy is deleted as well; otherwise the
+        output directory must be empty, or RuntimeError is raised.
+        """
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        if not save_as_external_data:
+            save_model(model, output_path)
+            return
+
+        # Save model to external data, which is needed for model size > 2GB
+        external_data_path = output_path + ".data"
+        location = Path(external_data_path).name if all_tensors_to_one_file else None
+
+        if os.path.exists(output_path):
+            logger.info(f"Delete the existed onnx file: {output_path}")
+            os.remove(output_path)
+        if not all_tensors_to_one_file:
+            if os.listdir(output_dir):
+                raise RuntimeError(
+                    f"Output directory ({output_dir}) for external data is not empty."
+                )
+        elif os.path.exists(external_data_path):
+            # Delete the external data file. Otherwise, data will be appended to existing file.
+            logger.info(f"Delete the existed external data file: {external_data_path}")
+            os.remove(external_data_path)
+
+        save_model(
+            model,
+            output_path,
+            save_as_external_data=True,
+            all_tensors_to_one_file=all_tensors_to_one_file,
+            location=location,
+            size_threshold=size_threshold,
+            convert_attribute=convert_attribute,
+        )
+
+    def save_model_to_file(
+        self, output_path, use_external_data_format=False, all_tensors_to_one_file=True
+    ):
+        """Sort the graph topologically, then write the model to output_path.
+
+        A path ending in ".json" receives the text form of the model (for testing small
+        models); any other path is written through OnnxModel.save.
+        """
+        logger.info(f"Sort graphs in topological order")
+        self.topological_sort()
+        if output_path.endswith(".json"):
+            with open(output_path, "w") as out:
+                # Fixed: this used the undefined name `model` and raised NameError.
+                out.write(str(self.model))
+        else:
+            OnnxModel.save(self.model, output_path, use_external_data_format, all_tensors_to_one_file)
+        logger.info(f"Model saved to {output_path}")
+
+    def get_graph_inputs_excluding_initializers(self):
+        """
+        Returns the graph inputs that have no initializer of the same name
+        (older onnx models also list initializers among the graph inputs).
+        """
+        return [
+            graph_input for graph_input in self.model.graph.input
+            if self.get_initializer(graph_input.name) is None
+        ]
+
+    def get_opset_version(self):
+        """Get opset version of the default onnx domain.
+
+        Returns:
+            int: version of the first opset import whose domain is "" or "ai.onnx".
+
+        Raises:
+            RuntimeError: ONNX model has no opset for default domain.
+        """
+        matching = (entry.version for entry in self.model.opset_import if entry.domain in ("", "ai.onnx"))
+        for found in matching:
+            return found
+        raise RuntimeError("ONNX model has no opset for default domain")
+
+    @staticmethod
+    def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool:
+        """Returns True when two tensors have same value.
+           Note that name can be different.
+
+        Args:
+            tensor1 (TensorProto): initializer 1
+            tensor2 (TensorProto): initializer 2
+
+        Returns:
+            bool: True when two initializers have the same data type, dims and element values.
+        """
+        if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims:
+            return False
+        if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"):
+            return tensor1.raw_data == tensor2.raw_data
+        return bool((numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all())  # array -> one bool
+
+    def remove_duplicated_initializer(self):
+        """Remove initializers with duplicated values, and only keep the first one.
+        It could help reduce size of models (like ALBert) with shared weights.
+        Consumers of a duplicate are redirected to the first initializer holding the same value.
+        Note: this function does not process subgraph.
+        """
+        if len(self.graphs()) > 1:
+            logger.warning("remove_duplicated_initializer does not process subgraphs.")
+
+        initializers = self.model.graph.initializer
+        initializer_count = len(initializers)
+
+        # same[j] is the index of the first initializer holding the same value as j, or -1.
+        same = [-1] * initializer_count
+        for i in range(initializer_count - 1):
+            if same[i] >= 0:
+                continue
+            for j in range(i + 1, initializer_count):
+                # Skip entries already matched to an earlier initializer: no need to compare again.
+                if same[j] < 0 and OnnxModel.has_same_value(initializers[i], initializers[j]):
+                    same[j] = i
+
+        count = 0
+        for i, first in enumerate(same):
+            if first >= 0:
+                count += 1
+                # Redirect consumers to the kept copy; update_graph() then drops the unused one.
+                self.replace_input_of_all_nodes(initializers[i].name, initializers[first].name)
+
+        if count > 0:
+            self.update_graph()
+            logger.info(f"Removed {count} initializers with duplicated value")
+
+    def add_prefix_to_names(self, prefix: str):
+        """Add prefix to initializer or intermediate outputs in graph. Main graph inputs and outputs are excluded.
+        It could help avoid conflicting in name of node_args when merging two graphs.
+        A name is also left unchanged when its prefixed form would equal a main graph input or output name.
+        Note: this function does not process subgraph.
+        """
+        if len(self.graphs()) > 1:
+            logger.warning("add_prefix_to_names does not process subgraphs.")
+
+        graph = self.model.graph
+        # Exclude the names of inputs and outputs of main graph (but not subgraphs)
+        excluded = {i.name for i in graph.input} | {o.name for o in graph.output}
+
+        def renamed(name):
+            """Return the prefixed name, or name itself when it must not be renamed."""
+            # An empty name marks an omitted optional input/output and must stay empty.
+            if not name or name in excluded or prefix + name in excluded:
+                return name
+            return prefix + name
+
+        for initializer in graph.initializer:
+            initializer.name = renamed(initializer.name)
+
+        # update names of node inputs and outputs
+        for node in graph.node:
+            for j in range(len(node.input)):
+                node.input[j] = renamed(node.input[j])
+            for j in range(len(node.output)):
+                node.output[j] = renamed(node.output[j])
+
+        # value_info follows exactly the same rule as node inputs/outputs; otherwise an entry
+        # could be renamed while the tensor it describes keeps its old name.
+        for value_info in graph.value_info:
+            value_info.name = renamed(value_info.name)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py
new file mode 100755
index 000000000..111444028
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py
@@ -0,0 +1,122 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import logging
+import os
+import sys
+from typing import Dict
+
+# In ORT Package the symbolic_shape_infer.py is in ../tools
+file_path = os.path.dirname(__file__)
+if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")):
+    sys.path.append(os.path.join(file_path, "../tools"))
+else:
+    sys.path.append(os.path.join(file_path, ".."))
+
+from .symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy
+
+logger = logging.getLogger(__name__)
+
+
+class SymbolicShapeInferenceHelper(SymbolicShapeInference):
+    """Symbolic shape inference that can substitute integers for named dynamic axes."""
+
+    def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_output_rank=False):
+        super().__init__(int_max, auto_merge, guess_output_rank, verbose)
+        self.model_ = model
+        self.all_shapes_inferred_: bool = False
+        self.is_inferred_: bool = False
+        self.dynamic_axis_mapping_: Dict[str, int] = {}
+
+    def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128):
+        """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided.
+
+        Args:
+            dynamic_axis_mapping (Dict[str, int]): name of dynamic axis -> value, like {"batch_size" : 4}
+            max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 128.
+
+        Returns:
+            bool: whether all shapes has been inferred or not.
+        """
+        assert dynamic_axis_mapping is not None
+
+        if self.is_inferred_ and self.dynamic_axis_mapping_ == dynamic_axis_mapping:
+            return self.all_shapes_inferred_
+
+        # Keep a private copy: if the caller later mutates its dict, the cache check above must notice.
+        self.dynamic_axis_mapping_ = dict(dynamic_axis_mapping)
+        self._preprocess(self.model_)
+
+        count = 0
+        while self.run_:
+            logger.debug(f"shape infer run {count}")
+            self.all_shapes_inferred_ = self._infer_impl()
+            count += 1
+            if max_runs > 0 and count >= max_runs:
+                break
+
+        self.is_inferred_ = True
+        return self.all_shapes_inferred_
+
+    def _get_sympy_shape(self, node, idx):
+        """Override it to ensure shape inference by giving the actual value of dynamic axis."""
+        sympy_shape = []
+        shape = self._get_shape(node, idx)
+        if shape:
+            for dim in shape:
+                if isinstance(dim, str):
+                    if dim in self.dynamic_axis_mapping_:
+                        sympy_shape.append(self.dynamic_axis_mapping_[dim])
+                    elif dim in self.symbolic_dims_:
+                        sympy_shape.append(self.symbolic_dims_[dim])
+                    else:
+                        sympy_shape.append(sympy.Symbol(dim, integer=True))
+                else:
+                    assert dim is not None
+                    sympy_shape.append(dim)
+        return sympy_shape
+
+    def get_edge_shape(self, edge):
+        """Get shape of an edge, with mapped dynamic axes replaced by their integer value.
+
+        Args:
+            edge (str): name of edge
+
+        Returns:
+            Optional[List[int]]: the shape, or None if shape is unknown
+        """
+        assert self.all_shapes_inferred_
+        if edge not in self.known_vi_:
+            logger.warning("Cannot retrieve the shape of " + str(edge))
+            return None
+
+        type_proto = self.known_vi_[edge].type
+        shape = get_shape_from_type_proto(type_proto)
+        if shape is not None:
+            for i, dim in enumerate(shape):
+                if isinstance(dim, str) and dim in self.dynamic_axis_mapping_:
+                    shape[i] = self.dynamic_axis_mapping_[dim]
+
+        return shape
+
+    def compare_shape(self, edge, edge_other):
+        """Compare shape of two edges.
+
+        Args:
+            edge (str): name of edge
+            edge_other (str): name of another edge
+
+        Raises:
+            Exception: At least one shape is missed for edges to compare
+
+        Returns:
+            bool: whether the shape is same or not
+        """
+        assert self.all_shapes_inferred_
+        shape = self.get_edge_shape(edge)
+        shape_other = self.get_edge_shape(edge_other)
+        if shape is None or shape_other is None:
+            raise Exception("At least one shape is missed for edges to compare")
+        return shape == shape_other
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py
new file mode 100755
index 000000000..e5157f90e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py
@@ -0,0 +1,2431 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# -*- coding: UTF-8 -*-
+import argparse
+import logging
+
+import numpy as np
+import onnx
+import sympy
+from onnx import helper, numpy_helper, shape_inference
+from packaging import version
+
+assert version.parse(onnx.__version__) >= version.parse("1.8.0")
+
+logger = logging.getLogger(__name__)
+
+
+def get_attribute(node, attr_name, default_value=None):
+    """Return the value of the first attribute of node named attr_name, else default_value."""
+    match = next((attr for attr in node.attribute if attr.name == attr_name), None)
+    # The default is returned untouched; only a found attribute is decoded.
+    return default_value if match is None else helper.get_attribute_value(match)
+
+
+def get_dim_from_proto(dim):  # value of whichever "value" field is set on the dimension, or None if none is
+    return None if dim.WhichOneof("value") is None else getattr(dim, dim.WhichOneof("value"))
+
+
+def is_sequence(type_proto):
+    """Tell whether type_proto holds a sequence type; only tensor and sequence types are accepted."""
+    assert type_proto.WhichOneof("value") in ["tensor_type", "sequence_type"]
+    return type_proto.WhichOneof("value") == "sequence_type"
+
+
+def get_shape_from_type_proto(type_proto):
+    """Return the dims of a tensor type as a list, or None when the type carries no shape field."""
+    assert not is_sequence(type_proto)
+    if not type_proto.tensor_type.HasField("shape"):
+        return None  # note no shape is different from shape without dim (scalar)
+    return [get_dim_from_proto(d) for d in type_proto.tensor_type.shape.dim]
+
+
+def get_shape_from_value_info(vi):
+    """Return the shape stored in a value info, looking at the element type for sequences.
+
+    None is returned when the type is unset or a sequence's elements are not tensors.
+    """
+    if vi.type.WhichOneof("value") is None:
+        return None
+    if not is_sequence(vi.type):
+        return get_shape_from_type_proto(vi.type)
+    elem_type = vi.type.sequence_type.elem_type
+    return get_shape_from_type_proto(elem_type) if elem_type.WhichOneof("value") == "tensor_type" else None
+
+
+def make_named_value_info(name):  # returns a ValueInfoProto with only its name assigned
+    vi = onnx.ValueInfoProto()
+    vi.name = name
+    return vi
+
+
+def get_shape_from_sympy_shape(sympy_shape):  # None stays None, literal dims become int, the rest str
+    return [(dim if dim is None else int(dim) if is_literal(dim) else str(dim)) for dim in sympy_shape]
+
+
+def is_literal(dim):  # exact int-like types, or any object whose is_number attribute is truthy
+    return type(dim) in (int, np.int64, np.int32, sympy.Integer) or getattr(dim, "is_number", False)
+
+
+def handle_negative_axis(axis, rank):
+    assert -rank <= axis < rank  # axis must address one of the rank dimensions
+    return axis + rank if axis < 0 else axis
+
+
+def get_opset(mp, domain=None):
+    """Return the version of the first opset import in one of the given domains, or None.
+
+    domain may be one name or a list of names; a falsy value selects "", "onnx" and "ai.onnx".
+    """
+    wanted = domain or ["", "onnx", "ai.onnx"]
+    if type(wanted) is not list:
+        wanted = [wanted]
+    return next((entry.version for entry in mp.opset_import if entry.domain in wanted), None)
+
+
+def as_scalar(x):
+    """Unwrap a one-element list or a numpy array to its single item; return anything else as is."""
+    if type(x) == np.ndarray:
+        return x.item()
+    if type(x) != list:
+        return x
+    assert len(x) == 1
+    return x[0]
+
+
+def as_list(x, keep_none):
+    """Return x as a list: lists pass through, arrays are converted, other values are wrapped.
+
+    None is returned as None instead of [None] when keep_none is true.
+    """
+    if type(x) == list or (keep_none and x is None):
+        return x
+    # list() iterates the array's first axis, so elements keep their numpy types.
+    return list(x) if type(x) == np.ndarray else [x]
+
+
+def sympy_reduce_product(x):
+    """Multiply the items of a list starting from sympy.Integer(1); a non-list is returned unchanged."""
+    if type(x) != list:
+        return x
+    product = sympy.Integer(1)
+    for factor in x:
+        product = product * factor
+    return product
+
+
+class SymbolicShapeInference:
+    def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
+        self.dispatcher_ = {  # op type name -> handler method (presumably looked up per node; lookup not shown here)
+            "Add": self._infer_symbolic_compute_ops,
+            "ArrayFeatureExtractor": self._infer_ArrayFeatureExtractor,
+            "AveragePool": self._infer_Pool,
+            "BatchNormalization": self._infer_BatchNormalization,
+            "Cast": self._infer_Cast,
+            "CategoryMapper": self._infer_CategoryMapper,
+            "Compress": self._infer_Compress,
+            "Concat": self._infer_Concat,
+            "ConcatFromSequence": self._infer_ConcatFromSequence,
+            "Constant": self._infer_Constant,
+            "ConstantOfShape": self._infer_ConstantOfShape,
+            "Conv": self._infer_Conv,
+            "CumSum": self._pass_on_shape_and_type,
+            "Div": self._infer_symbolic_compute_ops,
+            "Einsum": self._infer_Einsum,
+            "Expand": self._infer_Expand,
+            "Equal": self._infer_symbolic_compute_ops,
+            "Floor": self._infer_symbolic_compute_ops,
+            "Gather": self._infer_Gather,
+            "GatherElements": self._infer_GatherElements,
+            "GatherND": self._infer_GatherND,
+            "Identity": self._pass_on_shape_and_type,
+            "If": self._infer_If,
+            "Loop": self._infer_Loop,
+            "MatMul": self._infer_MatMul,
+            "MatMulInteger16": self._infer_MatMulInteger,
+            "MaxPool": self._infer_Pool,
+            "Max": self._infer_symbolic_compute_ops,
+            "Min": self._infer_symbolic_compute_ops,
+            "Mul": self._infer_symbolic_compute_ops,
+            "NonMaxSuppression": self._infer_NonMaxSuppression,
+            "NonZero": self._infer_NonZero,
+            "OneHot": self._infer_OneHot,
+            "Pad": self._infer_Pad,
+            "Range": self._infer_Range,
+            "Reciprocal": self._pass_on_shape_and_type,
+            "ReduceSum": self._infer_ReduceSum,
+            "ReduceProd": self._infer_ReduceProd,
+            "Reshape": self._infer_Reshape,
+            "Resize": self._infer_Resize,
+            "Round": self._pass_on_shape_and_type,
+            "Scan": self._infer_Scan,
+            "ScatterElements": self._infer_ScatterElements,
+            "SequenceAt": self._infer_SequenceAt,
+            "SequenceInsert": self._infer_SequenceInsert,
+            "Shape": self._infer_Shape,
+            "Size": self._infer_Size,
+            "Slice": self._infer_Slice,
+            "SoftmaxCrossEntropyLoss": self._infer_SoftmaxCrossEntropyLoss,
+            "SoftmaxCrossEntropyLossInternal": self._infer_SoftmaxCrossEntropyLoss,
+            "NegativeLogLikelihoodLossInternal": self._infer_SoftmaxCrossEntropyLoss,
+            "Split": self._infer_Split,
+            "SplitToSequence": self._infer_SplitToSequence,
+            "Squeeze": self._infer_Squeeze,
+            "Sub": self._infer_symbolic_compute_ops,
+            "Tile": self._infer_Tile,
+            "TopK": self._infer_TopK,
+            "Transpose": self._infer_Transpose,
+            "Unsqueeze": self._infer_Unsqueeze,
+            "Where": self._infer_symbolic_compute_ops,
+            "ZipMap": self._infer_ZipMap,
+            "Neg": self._infer_symbolic_compute_ops,
+            # contrib ops:
+            "Attention": self._infer_Attention,
+            "BiasGelu": self._infer_BiasGelu,
+            "EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
+            "FastGelu": self._infer_FastGelu,
+            "Gelu": self._infer_Gelu,
+            "LayerNormalization": self._infer_LayerNormalization,
+            "LongformerAttention": self._infer_LongformerAttention,
+            "PythonOp": self._infer_PythonOp,
+            "SkipLayerNormalization": self._infer_SkipLayerNormalization,
+        }
+        self.aten_op_dispatcher_ = {  # second handler table; keys look like ATen operator names (assumption)
+            "embedding": self._infer_Gather,
+            "bitwise_or": self._infer_aten_bitwise_or,
+            "diagonal": self._infer_aten_diagonal,
+            "max_pool2d_with_indices": self._infer_aten_pool2d,
+            "max": self._infer_aten_minmax,
+            "min": self._infer_aten_minmax,
+            "multinomial": self._infer_aten_multinomial,
+            "unfold": self._infer_aten_unfold,
+            "argmax": self._infer_aten_argmax,
+            "avg_pool2d": self._infer_aten_pool2d,
+            "_adaptive_avg_pool2d": self._infer_aten_pool2d,
+            "numpy_T": self._infer_Transpose,
+        }
+        self.run_ = True  # loop flag: callers keep invoking _infer_impl() while this is true
+        self.suggested_merge_ = {}  # symbol -> symbol or int that should replace it (filled by _add_suggested_merge)
+        self.symbolic_dims_ = {}  # dim name -> sympy object standing for that dim
+        self.input_symbols_ = {}  # symbols that _add_suggested_merge prefers as merge targets
+        self.auto_merge_ = auto_merge
+        self.guess_output_rank_ = guess_output_rank
+        self.verbose_ = verbose
+        self.int_max_ = int_max
+        self.subgraph_id_ = 0
+        self.prefix_ = prefix
+
+    def _add_suggested_merge(self, symbols, apply=False):  # record that all of `symbols` denote the same dim
+        assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols])
+        symbols = set(symbols)
+        for k, v in self.suggested_merge_.items():  # replace symbols that already have a recorded merge target
+            if k in symbols:
+                symbols.remove(k)
+                symbols.add(v)
+        map_to = None
+        # if there is literal, map to it first
+        for s in symbols:
+            if is_literal(s):
+                map_to = s
+                break
+        # when no literals, map to input symbolic dims, then existing symbolic dims
+        if map_to is None:
+            for s in symbols:
+                if s in self.input_symbols_:
+                    map_to = s
+                    break
+        if map_to is None:
+            for s in symbols:
+                if type(self.symbolic_dims_[s]) == sympy.Symbol:  # exactly a Symbol, not another sympy type
+                    map_to = s
+                    break
+        # when nothing to map to, use the shorter one
+        if map_to is None:
+            if self.verbose_ > 0:
+                logger.warning("Potential unsafe merge between symbolic expressions: ({})".format(",".join(symbols)))
+            symbols_list = list(symbols)
+            lens = [len(s) for s in symbols_list]
+            map_to = symbols_list[lens.index(min(lens))]
+            symbols.remove(map_to)
+        # point every other symbol, and every entry that already mapped to it, at map_to
+        for s in symbols:
+            if s == map_to:
+                continue
+            if is_literal(map_to) and is_literal(s):
+                assert int(map_to) == int(s)
+            self.suggested_merge_[s] = int(map_to) if is_literal(map_to) else map_to
+            for k, v in self.suggested_merge_.items():  # only existing keys are reassigned while iterating
+                if v == s:
+                    self.suggested_merge_[k] = map_to
+        if apply and self.auto_merge_:
+            self._apply_suggested_merge()
+
+    def _apply_suggested_merge(self, graph_input_only=False):  # rewrite dim_param entries per suggested_merge_
+        if not self.suggested_merge_:
+            return
+        targets = list(self.out_mp_.graph.input) + ([] if graph_input_only else list(self.out_mp_.graph.value_info))
+        for dim in (d for vi in targets for d in vi.type.tensor_type.shape.dim):
+            if dim.dim_param in self.suggested_merge_:
+                merged = self.suggested_merge_[dim.dim_param]
+                if is_literal(merged):
+                    dim.dim_value = int(merged)
+                else:
+                    dim.dim_param = merged
+
+    def _preprocess(self, in_mp):
+        """Copy in_mp into out_mp_ and index its graph inputs and initializers by name.
+
+        known_vi_ starts with the graph inputs, then gains a tensor value info per initializer
+        built from its data type and dims (an initializer replaces an input of the same name).
+        """
+        self.out_mp_ = onnx.ModelProto()
+        self.out_mp_.CopyFrom(in_mp)
+        graph = self.out_mp_.graph
+        self.graph_inputs_ = {graph_input.name: graph_input for graph_input in graph.input}
+        self.initializers_ = {initializer.name: initializer for initializer in graph.initializer}
+        self.known_vi_ = dict(self.graph_inputs_)
+        # Describe every initializer with a value info made from its own type and dims.
+        for initializer in graph.initializer:
+            self.known_vi_[initializer.name] = helper.make_tensor_value_info(
+                initializer.name, initializer.data_type, list(initializer.dims)
+            )
+
+    def _merge_symbols(self, dims):  # returns the single dim that `dims` agree on, or None when they cannot be merged
+        if not all([type(d) == str for d in dims]):  # at least one dim is not a string
+            if self.auto_merge_:
+                unique_dims = list(set(dims))
+                is_int = [is_literal(d) for d in unique_dims]
+                assert sum(is_int) <= 1  # if there are more than 1 unique ints, something is wrong
+                if sum(is_int) == 1:
+                    int_dim = is_int.index(1)  # position of the only literal in unique_dims
+                    if self.verbose_ > 0:
+                        logger.debug(
+                            "dim {} has been merged with value {}".format(
+                                unique_dims[:int_dim] + unique_dims[int_dim + 1 :],
+                                unique_dims[int_dim],
+                            )
+                        )
+                    self._check_merged_dims(unique_dims, allow_broadcast=False)
+                    return unique_dims[int_dim]
+                else:
+                    if self.verbose_ > 0:
+                        logger.debug("dim {} has been mergd with dim {}".format(unique_dims[1:], unique_dims[0]))
+                    return dims[0]  # NOTE(review): the log names unique_dims[0] but dims[0] is returned
+            else:
+                return None
+        if all([d == dims[0] for d in dims]):  # all-string case: identical names need no merge
+            return dims[0]
+        merged = [self.suggested_merge_[d] if d in self.suggested_merge_ else d for d in dims]
+        if all([d == merged[0] for d in merged]):  # equal once recorded merges are applied
+            assert merged[0] in self.symbolic_dims_
+            return merged[0]
+        else:
+            return None
+
+    # broadcast from right to left, and merge symbolic dims if needed; returns the resulting shape as a list
+    def _broadcast_shapes(self, shape1, shape2):
+        new_shape = []
+        rank1 = len(shape1)
+        rank2 = len(shape2)
+        new_rank = max(rank1, rank2)
+        for i in range(new_rank):
+            dim1 = shape1[rank1 - 1 - i] if i < rank1 else 1  # a missing leading dim is treated as 1
+            dim2 = shape2[rank2 - 1 - i] if i < rank2 else 1
+            if dim1 == 1 or dim1 == dim2:
+                new_dim = dim2
+            elif dim2 == 1:
+                new_dim = dim1
+            else:
+                new_dim = self._merge_symbols([dim1, dim2])
+                if not new_dim:  # no merge found: new_dim stays falsy and is still stored below
+                    # warning about unsupported broadcast when not auto merge
+                    # note that auto merge has the risk of incorrectly merge symbols while one of them being 1
+                    # for example, 'a' = 1, 'b' = 5 at runtime is valid broadcasting, but with auto merge 'a' == 'b'
+                    if self.auto_merge_:
+                        self._add_suggested_merge([dim1, dim2], apply=True)
+                    else:
+                        logger.warning("unsupported broadcast between " + str(dim1) + " " + str(dim2))
+            new_shape = [new_dim] + new_shape  # prepend, because dims are visited right to left
+        return new_shape
+
+    def _get_shape(self, node, idx):
+        name = node.input[idx]
+        if name in self.known_vi_:
+            vi = self.known_vi_[name]
+            return get_shape_from_value_info(vi)
+        else:
+            assert name in self.initializers_
+            return list(self.initializers_[name].dims)
+
+    def _get_shape_rank(self, node, idx):  # number of dims in the shape of node.input[idx]
+        return len(self._get_shape(node, idx))
+
+    def _get_sympy_shape(self, node, idx):
+        sympy_shape = []
+        for d in self._get_shape(node, idx):
+            if type(d) == str:
+                sympy_shape.append(
+                    self.symbolic_dims_[d]
+                    if d in self.symbolic_dims_
+                    else sympy.Symbol(d, integer=True, nonnegative=True)
+                )
+            else:
+                assert None != d
+                sympy_shape.append(d)
+        return sympy_shape
+
+    def _get_value(self, node, idx):  # value of node.input[idx]; sympy_data_ takes precedence over initializers_
+        name = node.input[idx]
+        assert name in self.sympy_data_ or name in self.initializers_  # callers must only ask for known values
+        return self.sympy_data_[name] if name in self.sympy_data_ else numpy_helper.to_array(self.initializers_[name])
+
+    def _try_get_value(self, node, idx):
+        if idx >= len(node.input):
+            return None
+        name = node.input[idx]
+        if name in self.sympy_data_ or name in self.initializers_:
+            return self._get_value(node, idx)
+        return None
+
+    def _update_computed_dims(self, new_sympy_shape):
+        for i, new_dim in enumerate(new_sympy_shape):
+            if not is_literal(new_dim) and not type(new_dim) == str:
+                str_dim = str(new_dim)
+                if str_dim in self.suggested_merge_:
+                    if is_literal(self.suggested_merge_[str_dim]):
+                        continue  # no need to create dim for literals
+                    new_sympy_shape[i] = self.symbolic_dims_[self.suggested_merge_[str_dim]]
+                else:
+                    # add new_dim if it's a computational expression
+                    if not str(new_dim) in self.symbolic_dims_:
+                        self.symbolic_dims_[str(new_dim)] = new_dim
+
+    def _onnx_infer_single_node(self, node):
+        # skip onnx shape inference for some ops, as they are handled in _infer_*
+        skip_infer = node.op_type in [
+            "If",
+            "Loop",
+            "Scan",
+            "SplitToSequence",
+            "ZipMap",  # contrib ops
+            "Attention",
+            "BiasGelu",
+            "EmbedLayerNormalization",
+            "FastGelu",
+            "Gelu",
+            "LayerNormalization",
+            "LongformerAttention",
+            "SkipLayerNormalization",
+            "PythonOp",
+        ]
+
+        if not skip_infer:
+            # Only pass initializers that satisfy the following condition:
+            # (1) Operator need value of some input for shape inference.
+            #     For example, Unsqueeze in opset 13 uses the axes input to calculate shape of output.
+            # (2) opset version >= 9. In older version, initializer is required in graph input by onnx spec.
+            # (3) The initializer is not in graph input. The means the node input is "constant" in inference.
+            initializers = []
+            if (get_opset(self.out_mp_) >= 9) and node.op_type in ["Unsqueeze"]:
+                initializers = [
+                    self.initializers_[name]
+                    for name in node.input
+                    if (name in self.initializers_ and name not in self.graph_inputs_)
+                ]
+
+            # run single node inference with self.known_vi_ shapes
+            tmp_graph = helper.make_graph(
+                [node],
+                "tmp",
+                [self.known_vi_[i] for i in node.input if i],
+                [make_named_value_info(i) for i in node.output],
+                initializers,
+            )
+
+            self.tmp_mp_.graph.CopyFrom(tmp_graph)
+
+            self.tmp_mp_ = shape_inference.infer_shapes(self.tmp_mp_)
+
+        for i_o in range(len(node.output)):
+            o = node.output[i_o]
+            vi = self.out_mp_.graph.value_info.add()
+            if not skip_infer:
+                vi.CopyFrom(self.tmp_mp_.graph.output[i_o])
+            else:
+                vi.name = o
+            self.known_vi_[o] = vi
+
+    def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph_id=True):
+        if self.verbose_ > 2:
+            logger.debug(
+                "Inferencing subgraph of node {} with output({}...): {}".format(node.name, node.output[0], node.op_type)
+            )
+        # node inputs are not passed directly to the subgraph
+        # it's up to the node dispatcher to prepare subgraph input
+        # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape
+        # besides, inputs in subgraph could shadow implicit inputs
+        subgraph_inputs = set([i.name for i in list(subgraph.initializer) + list(subgraph.input)])
+        subgraph_implicit_input = set([name for name in self.known_vi_.keys() if not name in subgraph_inputs])
+        tmp_graph = helper.make_graph(
+            list(subgraph.node),
+            "tmp",
+            list(subgraph.input) + [self.known_vi_[i] for i in subgraph_implicit_input],
+            [make_named_value_info(i.name) for i in subgraph.output],
+        )
+        tmp_graph.initializer.extend([i for i in self.out_mp_.graph.initializer if i.name in subgraph_implicit_input])
+        tmp_graph.initializer.extend(subgraph.initializer)
+        self.tmp_mp_.graph.CopyFrom(tmp_graph)
+
+        symbolic_shape_inference = SymbolicShapeInference(
+            self.int_max_,
+            self.auto_merge_,
+            self.guess_output_rank_,
+            self.verbose_,
+            prefix=self.prefix_ + "_" + str(self.subgraph_id_),
+        )
+        if inc_subgraph_id:
+            self.subgraph_id_ += 1
+
+        all_shapes_inferred = False
+        symbolic_shape_inference._preprocess(self.tmp_mp_)
+        symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy()
+        while symbolic_shape_inference.run_:
+            all_shapes_inferred = symbolic_shape_inference._infer_impl(self.sympy_data_.copy())
+        symbolic_shape_inference._update_output_from_vi()
+        if use_node_input:
+            # if subgraph uses node input, it needs to update to merged dims
+            subgraph.ClearField("input")
+            subgraph.input.extend(symbolic_shape_inference.out_mp_.graph.input[: len(node.input)])
+        subgraph.ClearField("output")
+        subgraph.output.extend(symbolic_shape_inference.out_mp_.graph.output)
+        subgraph.ClearField("value_info")
+        subgraph.value_info.extend(symbolic_shape_inference.out_mp_.graph.value_info)
+        subgraph.ClearField("node")
+        subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node)
+        # for new symbolic dims from subgraph output, add to main graph symbolic dims
+        subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output]
+        subgraph_new_symbolic_dims = set(
+            [d for s in subgraph_shapes if s for d in s if type(d) == str and not d in self.symbolic_dims_]
+        )
+        new_dims = {}
+        for d in subgraph_new_symbolic_dims:
+            assert d in symbolic_shape_inference.symbolic_dims_
+            new_dims[d] = symbolic_shape_inference.symbolic_dims_[d]
+        self.symbolic_dims_.update(new_dims)
+        return symbolic_shape_inference
+
+    def _get_int_values(self, node, broadcast=False):
+        values = [self._try_get_value(node, i) for i in range(len(node.input))]
+        if all([v is not None for v in values]):
+            # some shape compute is in floating point, cast to int for sympy
+            for i, v in enumerate(values):
+                if type(v) != np.ndarray:
+                    continue
+                if len(v.shape) > 1:
+                    new_v = None  # ignore value for rank > 1
+                elif len(v.shape) == 0:
+                    new_v = int(v.item())
+                else:
+                    assert len(v.shape) == 1
+                    new_v = [int(vv) for vv in v]
+                values[i] = new_v
+        values_len = [len(v) if type(v) == list else 0 for v in values]
+        max_len = max(values_len)
+        if max_len >= 1 and broadcast:
+            # broadcast
+            for i, v in enumerate(values):
+                if v is None:
+                    continue  # don't broadcast if value is unknown
+                if type(v) == list:
+                    if len(v) < max_len:
+                        values[i] = v * max_len
+                    else:
+                        assert len(v) == max_len
+                else:
+                    values[i] = [v] * max_len
+        return values
+
+    def _compute_on_sympy_data(self, node, op_func):
+        assert len(node.output) == 1
+        values = self._get_int_values(node, broadcast=True)
+        if all([v is not None for v in values]):
+            is_list = [type(v) == list for v in values]
+            as_list = any(is_list)
+            if as_list:
+                self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)]
+            else:
+                self.sympy_data_[node.output[0]] = op_func(values)
+
+    def _pass_on_sympy_data(self, node):
+        assert len(node.input) == 1 or node.op_type in [
+            "Reshape",
+            "Unsqueeze",
+            "Squeeze",
+        ]
+        self._compute_on_sympy_data(node, lambda x: x[0])
+
+    def _pass_on_shape_and_type(self, node):
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                self._get_shape(node, 0),
+            )
+        )
+
+    def _new_symbolic_dim(self, prefix, dim):
+        new_dim = "{}_d{}".format(prefix, dim)
+        if new_dim in self.suggested_merge_:
+            v = self.suggested_merge_[new_dim]
+            new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v
+        else:
+            new_symbolic_dim = sympy.Symbol(new_dim, integer=True, nonnegative=True)
+            self.symbolic_dims_[new_dim] = new_symbolic_dim
+        return new_symbolic_dim
+
+    def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0):
+        return self._new_symbolic_dim(
+            "{}{}_{}_o{}_".format(
+                node.op_type,
+                self.prefix_,
+                list(self.out_mp_.graph.node).index(node),
+                out_idx,
+            ),
+            dim,
+        )
+
+    def _new_symbolic_shape(self, rank, node, out_idx=0):  # list of `rank` symbolic dims for output out_idx of node
+        return [self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank)]
+
+    def _compute_conv_pool_shape(self, node):  # returns the output sympy shape, derived from input 0's shape
+        sympy_shape = self._get_sympy_shape(node, 0)
+        if len(node.input) > 1:  # a second input is read as the weight W
+            W_shape = self._get_sympy_shape(node, 1)
+            rank = len(W_shape) - 2  # number of spatial axes
+            kernel_shape = W_shape[-rank:]
+            sympy_shape[1] = W_shape[0]  # dim 1 of the result is taken from W's first dim
+        else:
+            W_shape = None
+            kernel_shape = get_attribute(node, "kernel_shape")  # no weight input: kernel comes from the attribute
+            rank = len(kernel_shape)
+
+        assert len(sympy_shape) == rank + 2
+
+        # symbolic shape inference is only needed when the input has symbolic dims in the spatial axes
+        is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]]
+
+        if not any(is_symbolic_dims):  # all spatial dims are literal: try the shape already recorded for output 0
+            shape = get_shape_from_value_info(self.known_vi_[node.output[0]])
+            if len(shape) > 0:
+                assert len(sympy_shape) == len(shape)
+                sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]]
+                return sympy_shape
+
+        dilations = get_attribute(node, "dilations", [1] * rank)
+        strides = get_attribute(node, "strides", [1] * rank)
+        effective_kernel_shape = [(k - 1) * d + 1 for k, d in zip(kernel_shape, dilations)]  # kernel extent with dilation
+        pads = get_attribute(node, "pads")
+        if pads is None:  # no explicit pads: total padding per axis is derived from auto_pad
+            pads = [0] * (2 * rank)
+            auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8")
+            if auto_pad != "VALID" and auto_pad != "NOTSET":
+                try:
+                    residual = [sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides)]
+                    total_pads = [
+                        max(0, (k - s) if r == 0 else (k - r))
+                        for k, s, r in zip(effective_kernel_shape, strides, residual)
+                    ]
+                except TypeError:  # sympy may throw TypeError: cannot determine truth value of Relational
+                    total_pads = [
+                        max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides)
+                    ]  # assuming no residual if sympy throws error
+            elif auto_pad == "VALID":
+                total_pads = []  # empty list: the loop below adds no padding
+            else:
+                total_pads = [0] * rank
+        else:
+            assert len(pads) == 2 * rank
+            total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:])]  # first half + second half of pads, per axis
+
+        ceil_mode = get_attribute(node, "ceil_mode", 0)
+        for i in range(rank):  # rewrite each spatial dim in place with its output size
+            effective_input_size = sympy_shape[-rank + i]
+            if len(total_pads) > 0:
+                effective_input_size = effective_input_size + total_pads[i]
+            if ceil_mode:
+                strided_kernel_positions = sympy.ceiling(
+                    (effective_input_size - effective_kernel_shape[i]) / strides[i]
+                )
+            else:
+                strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) // strides[i]
+            sympy_shape[-rank + i] = strided_kernel_positions + 1
+        return sympy_shape
+
+    def _check_merged_dims(self, dims, allow_broadcast=True):
+        if allow_broadcast:
+            dims = [d for d in dims if not (is_literal(d) and int(d) <= 1)]
+        if not all([d == dims[0] for d in dims]):
+            self._add_suggested_merge(dims, apply=True)
+
+    def _compute_matmul_shape(self, node, output_dtype=None):
+        lhs_shape = self._get_shape(node, 0)
+        rhs_shape = self._get_shape(node, 1)
+        lhs_rank = len(lhs_shape)
+        rhs_rank = len(rhs_shape)
+        lhs_reduce_dim = 0
+        rhs_reduce_dim = 0
+        assert lhs_rank > 0 and rhs_rank > 0
+        if lhs_rank == 1 and rhs_rank == 1:
+            new_shape = []
+        elif lhs_rank == 1:
+            rhs_reduce_dim = -2
+            new_shape = rhs_shape[:rhs_reduce_dim] + [rhs_shape[-1]]
+        elif rhs_rank == 1:
+            lhs_reduce_dim = -1
+            new_shape = lhs_shape[:lhs_reduce_dim]
+        else:
+            lhs_reduce_dim = -1
+            rhs_reduce_dim = -2
+            new_shape = self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]) + [lhs_shape[-2]] + [rhs_shape[-1]]
+        # merge reduce dim
+        self._check_merged_dims(
+            [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]],
+            allow_broadcast=False,
+        )
+        if output_dtype is None:
+            # infer output_dtype from input type when not specified
+            output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape))
+
+    def _fuse_tensor_type(self, node, out_idx, dst_type, src_type):
+        """
+        update dst_tensor_type to be compatible with src_tensor_type when dimension mismatches
+        """
+        dst_tensor_type = (
+            dst_type.sequence_type.elem_type.tensor_type if is_sequence(dst_type) else dst_type.tensor_type
+        )
+        src_tensor_type = (
+            src_type.sequence_type.elem_type.tensor_type if is_sequence(src_type) else src_type.tensor_type
+        )
+        if dst_tensor_type.elem_type != src_tensor_type.elem_type:
+            node_id = node.name if node.name else node.op_type
+            raise ValueError(
+                f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: "
+                f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs "
+                f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}"
+            )
+        if dst_tensor_type.HasField("shape"):
+            for di, ds in enumerate(zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim)):
+                if ds[0] != ds[1]:
+                    # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type
+                    # for sequence_type, clear the dimension
+                    new_dim = onnx.TensorShapeProto.Dimension()
+                    if not is_sequence(dst_type):
+                        new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, out_idx, di))
+                    dst_tensor_type.shape.dim[di].CopyFrom(new_dim)
+        else:
+            dst_tensor_type.CopyFrom(src_tensor_type)
+
+    def _infer_ArrayFeatureExtractor(self, node):
+        data_shape = self._get_shape(node, 0)
+        indices_shape = self._get_shape(node, 1)
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                data_shape[:-1] + indices_shape,
+            )
+        )
+
+    def _infer_symbolic_compute_ops(self, node):
+        funcs = {
+            "Add": lambda l: l[0] + l[1],
+            "Div": lambda l: l[0] // l[1],  # integer div in sympy
+            "Equal": lambda l: l[0] == l[1],
+            "Floor": lambda l: sympy.floor(l[0]),
+            "Max": lambda l: l[1]
+            if is_literal(l[0]) and int(l[0]) < -self.int_max_
+            else (l[0] if is_literal(l[1]) and int(l[1]) < -self.int_max_ else sympy.Max(l[0], l[1])),
+            "Min": lambda l: l[1]
+            if is_literal(l[0]) and int(l[0]) > self.int_max_
+            else (l[0] if is_literal(l[1]) and int(l[1]) > self.int_max_ else sympy.Min(l[0], l[1])),
+            "Mul": lambda l: l[0] * l[1],
+            "Sub": lambda l: l[0] - l[1],
+            "Where": lambda l: l[1] if l[0] else l[2],
+            "Neg": lambda l: -l[0],
+        }
+        assert node.op_type in funcs
+        self._compute_on_sympy_data(node, funcs[node.op_type])
+
+    def _infer_Cast(self, node):  # only forwards known sympy data of the input to the output
+        self._pass_on_sympy_data(node)
+
+    def _infer_CategoryMapper(self, node):
+        input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        if input_type == onnx.TensorProto.STRING:
+            output_type = onnx.TensorProto.INT64
+        else:
+            output_type = onnx.TensorProto.STRING
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_type, self._get_shape(node, 0)))
+
+    def _infer_Compress(self, node):
+        input_shape = self._get_shape(node, 0)
+        # create a new symbolic dimension for Compress output
+        compress_len = str(self._new_symbolic_dim_from_output(node))
+        axis = get_attribute(node, "axis")
+        if axis == None:
+            # when axis is not specified, input is flattened before compress so output is 1D
+            output_shape = [compress_len]
+        else:
+            output_shape = input_shape
+            output_shape[handle_negative_axis(axis, len(input_shape))] = compress_len
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                output_shape,
+            )
+        )
+
+    def _infer_Concat(self, node):
+        if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]):
+            values = self._get_int_values(node)
+            if all([v is not None for v in values]):
+                assert 0 == get_attribute(node, "axis")
+                self.sympy_data_[node.output[0]] = []
+                for i in range(len(node.input)):
+                    value = values[i]
+                    if type(value) == list:
+                        self.sympy_data_[node.output[0]].extend(value)
+                    else:
+                        self.sympy_data_[node.output[0]].append(value)
+
+        sympy_shape = self._get_sympy_shape(node, 0)
+        axis = handle_negative_axis(get_attribute(node, "axis"), len(sympy_shape))
+        for i_idx in range(1, len(node.input)):
+            input_shape = self._get_sympy_shape(node, i_idx)
+            if input_shape:
+                sympy_shape[axis] = sympy_shape[axis] + input_shape[axis]
+        self._update_computed_dims(sympy_shape)
+        # merge symbolic dims for non-concat axes
+        for d in range(len(sympy_shape)):
+            if d == axis:
+                continue
+            dims = [self._get_shape(node, i_idx)[d] for i_idx in range(len(node.input)) if self._get_shape(node, i_idx)]
+            if all([d == dims[0] for d in dims]):
+                continue
+            merged = self._merge_symbols(dims)
+            if type(merged) == str:
+                sympy_shape[d] = self.symbolic_dims_[merged] if merged else None
+            else:
+                sympy_shape[d] = merged
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                get_shape_from_sympy_shape(sympy_shape),
+            )
+        )
+
+    def _infer_ConcatFromSequence(self, node):
+        seq_shape = self._get_shape(node, 0)
+        new_axis = 1 if get_attribute(node, "new_axis") else 0
+        axis = handle_negative_axis(get_attribute(node, "axis"), len(seq_shape) + new_axis)
+        concat_dim = str(self._new_symbolic_dim_from_output(node, 0, axis))
+        new_shape = seq_shape
+        if new_axis:
+            new_shape = seq_shape[:axis] + [concat_dim] + seq_shape[axis:]
+        else:
+            new_shape[axis] = concat_dim
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.sequence_type.elem_type.tensor_type.elem_type,
+                new_shape,
+            )
+        )
+
+    def _infer_Constant(self, node):  # records the node's "value" attribute in sympy_data_ for its output
+        t = get_attribute(node, "value")
+        self.sympy_data_[node.output[0]] = numpy_helper.to_array(t)  # stored as converted by numpy_helper.to_array
+
+    def _infer_ConstantOfShape(self, node):
+        sympy_shape = self._get_int_values(node)[0]
+        vi = self.known_vi_[node.output[0]]
+        if sympy_shape is not None:
+            if type(sympy_shape) != list:
+                sympy_shape = [sympy_shape]
+            self._update_computed_dims(sympy_shape)
+            # update sympy data if output type is int, and shape is known
+            if vi.type.tensor_type.elem_type == onnx.TensorProto.INT64 and all([is_literal(x) for x in sympy_shape]):
+                self.sympy_data_[node.output[0]] = np.ones(
+                    [int(x) for x in sympy_shape], dtype=np.int64
+                ) * numpy_helper.to_array(get_attribute(node, "value", 0))
+        else:
+            # create new dynamic shape
+            # note input0 is a 1D vector of shape, the new symbolic shape has the rank of the shape vector length
+            sympy_shape = self._new_symbolic_shape(self._get_shape(node, 0)[0], node)
+
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                vi.type.tensor_type.elem_type,
+                get_shape_from_sympy_shape(sympy_shape),
+            )
+        )
+
+    def _infer_Conv(self, node):
+        sympy_shape = self._compute_conv_pool_shape(node)
+        self._update_computed_dims(sympy_shape)
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                vi.type.tensor_type.elem_type,
+                get_shape_from_sympy_shape(sympy_shape),
+            )
+        )
+
+    def _infer_Einsum(self, node):  # output shape from the "equation" attribute (bytes) and the operand shapes
+        # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275
+        equation = get_attribute(node, "equation")
+        equation = equation.replace(b" ", b"")
+        mid_index = equation.find(b"->")  # -1 when the equation has no explicit output part
+        left_equation = equation[:mid_index] if mid_index != -1 else equation
+
+        num_operands = 0
+        num_ellipsis = 0
+        num_ellipsis_indices = 0
+
+        letter_to_dim = {}  # keys are byte values (ints), since indexing bytes yields ints
+
+        terms = left_equation.split(b",")
+        for term in terms:
+            ellipsis_index = term.find(b"...")
+            shape = self._get_shape(node, num_operands)
+            rank = len(shape)
+            if ellipsis_index != -1:
+                if num_ellipsis == 0:
+                    num_ellipsis_indices = rank - len(term) + 3  # dims covered by "..." in the first term that has one
+                num_ellipsis = num_ellipsis + 1
+            for i in range(1, rank + 1):  # letters and dims are matched from the right
+                letter = term[-i]  # NOTE(review): looks like this can go out of range when rank > len(term) - confirm
+                if letter != 46:  # letter != b'.'
+                    dim = shape[-i]
+                    if letter not in letter_to_dim.keys():
+                        letter_to_dim[letter] = dim
+                    elif type(dim) != sympy.Symbol:
+                        letter_to_dim[letter] = dim
+            num_operands = num_operands + 1
+
+        new_sympy_shape = []
+        from collections import OrderedDict
+
+        num_letter_occurrences = OrderedDict()
+        if mid_index != -1:
+            right_equation = equation[mid_index + 2 :]
+            right_ellipsis_index = right_equation.find(b"...")
+            if right_ellipsis_index != -1:
+                for i in range(num_ellipsis_indices):
+                    new_sympy_shape.append(shape[i])  # NOTE(review): `shape` is the last operand's shape here - confirm intent
+            for c in right_equation:
+                if c != 46:  # c != b'.'
+                    new_sympy_shape.append(letter_to_dim[c])
+        else:
+            for i in range(num_ellipsis_indices):
+                new_sympy_shape.append(shape[i])
+            for c in left_equation:
+                if c != 44 and c != 46:  # c != b',' and c != b'.':
+                    if c in num_letter_occurrences:
+                        num_letter_occurrences[c] = num_letter_occurrences[c] + 1
+                    else:
+                        num_letter_occurrences[c] = 1
+            for key, value in num_letter_occurrences.items():
+                if value == 1:  # only letters that occur exactly once on the left are kept
+                    new_sympy_shape.append(letter_to_dim[key])
+
+        output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape))
+
+    def _infer_Expand(self, node):
+        expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True)
+        if expand_to_shape is not None:
+            # new_shape's dim can come from shape value
+            self._update_computed_dims(expand_to_shape)
+            shape = self._get_shape(node, 0)
+            new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape))
+            vi = self.known_vi_[node.output[0]]
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                    new_shape,
+                )
+            )
+
+    def _infer_Gather(self, node):
+        data_shape = self._get_shape(node, 0)
+        axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape))
+        indices_shape = self._get_shape(node, 1)
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                data_shape[:axis] + indices_shape + data_shape[axis + 1 :],
+            )
+        )
+        # for 1D input, do some sympy compute
+        if node.input[0] in self.sympy_data_ and len(data_shape) == 1 and 0 == get_attribute(node, "axis", 0):
+            idx = self._try_get_value(node, 1)
+            if idx is not None:
+                data = self.sympy_data_[node.input[0]]
+                if type(data) == list:
+                    if type(idx) == np.ndarray and len(idx.shape) == 1:
+                        self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx]
+                    else:
+                        self.sympy_data_[node.output[0]] = data[int(idx)]
+                else:
+                    assert idx == 0 or idx == -1
+                    self.sympy_data_[node.output[0]] = data
+
+    def _infer_GatherElements(self, node):
+        indices_shape = self._get_shape(node, 1)
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                indices_shape,
+            )
+        )
+
+    def _infer_GatherND(self, node):
+        data_shape = self._get_shape(node, 0)
+        data_rank = len(data_shape)
+        indices_shape = self._get_shape(node, 1)
+        indices_rank = len(indices_shape)
+        last_index_dimension = indices_shape[-1]
+        assert is_literal(last_index_dimension) and last_index_dimension <= data_rank
+        new_shape = indices_shape[:-1] + data_shape[last_index_dimension:]
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                new_shape,
+            )
+        )
+
+    def _infer_If(self, node):
+        # special case for constant condition, in case there are mismatching shape from the non-executed branch
+        subgraphs = [
+            get_attribute(node, "then_branch"),
+            get_attribute(node, "else_branch"),
+        ]
+        cond = self._try_get_value(node, 0)
+        if cond is not None:
+            if as_scalar(cond) > 0:
+                subgraphs[1].CopyFrom(subgraphs[0])
+            else:
+                subgraphs[0].CopyFrom(subgraphs[1])
+
+        for i_sub, subgraph in enumerate(subgraphs):
+            subgraph_infer = self._onnx_infer_subgraph(node, subgraph, use_node_input=False)
+            for i_out in range(len(node.output)):
+                vi = self.known_vi_[node.output[i_out]]
+                if i_sub == 0:
+                    vi.CopyFrom(subgraph.output[i_out])
+                    vi.name = node.output[i_out]
+                else:
+                    self._fuse_tensor_type(node, i_out, vi.type, subgraph.output[i_out].type)
+
+                # pass on sympy data from subgraph, if cond is constant
+                if cond is not None and i_sub == (0 if as_scalar(cond) > 0 else 1):
+                    if subgraph.output[i_out].name in subgraph_infer.sympy_data_:
+                        self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[subgraph.output[i_out].name]
+
+    def _infer_Loop(self, node):
+        """Infer Loop outputs from the body subgraph, re-running inference if loop-carried shapes change."""
+        subgraph = get_attribute(node, "body")
+        assert len(subgraph.input) == len(node.input)
+        num_loop_carried = len(node.input) - 2  # minus the length and initial loop condition
+        # seed each body input with the matching node input's value info, keeping the body's own name
+        for i, si in enumerate(subgraph.input):
+            si_name = si.name
+            si.CopyFrom(self.known_vi_[node.input[i]])
+            si.name = si_name
+
+        self._onnx_infer_subgraph(node, subgraph)
+
+        # check subgraph input/output for shape changes in loop carried variables
+        # for tensor_type, create new symbolic dim when changing, i.e., output = Concat(input, a)
+        # for sequence_type, propagate from output to input when the sequence's tensor shape contains None
+        need_second_infer = False
+        for i_out in range(1, num_loop_carried + 1):
+            so = subgraph.output[i_out]
+            so_shape = get_shape_from_value_info(so)
+            if is_sequence(so.type):
+                if so_shape and None in so_shape:
+                    # copy shape from output to input
+                    # note that loop input is [loop_len, cond, input_0, input_1, ...]
+                    # while loop output is [cond, output_0, output_1, ...]
+                    subgraph.input[i_out + 1].type.sequence_type.elem_type.CopyFrom(so.type.sequence_type.elem_type)
+                    need_second_infer = True
+            else:
+                si = subgraph.input[i_out + 1]
+                si_shape = get_shape_from_value_info(si)
+                for di, dims in enumerate(zip(si_shape, so_shape)):
+                    if dims[0] != dims[1]:  # this dim differs between the carried input and output
+                        new_dim = onnx.TensorShapeProto.Dimension()
+                        new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, i_out, di))
+                        si.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
+                        so.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
+                        need_second_infer = True
+
+        if need_second_infer:  # set by the sequence case and by the changed-tensor-dim case above
+            if self.verbose_ > 2:
+                logger.debug(
+                    "Rerun Loop: {}({}...), because of sequence in loop carried variables".format(
+                        node.name, node.output[0]
+                    )
+                )
+            self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False)
+
+        # create a new symbolic dimension for iteration dependent dimension
+        loop_iter_dim = str(self._new_symbolic_dim_from_output(node))
+        for i in range(len(node.output)):
+            vi = self.known_vi_[node.output[i]]
+            vi.CopyFrom(subgraph.output[i + 1])  # first subgraph output is condition, not in node output
+            if i >= num_loop_carried:  # outputs past the loop-carried ones get loop_iter_dim prepended
+                assert not is_sequence(vi.type)  # TODO: handle loop accumulation in sequence_type
+                subgraph_vi_dim = subgraph.output[i + 1].type.tensor_type.shape.dim
+                vi.type.tensor_type.shape.ClearField("dim")
+                vi_dim = vi.type.tensor_type.shape.dim
+                vi_dim.add().dim_param = loop_iter_dim
+                vi_dim.extend(list(subgraph_vi_dim))
+            vi.name = node.output[i]
+
+    def _infer_MatMul(self, node):  # MatMul: output shape comes from the shared matmul helper
+        self._compute_matmul_shape(node)
+
+    def _infer_MatMulInteger(self, node):  # same helper as MatMul; INT32 is presumably the output element type
+        self._compute_matmul_shape(node, onnx.TensorProto.INT32)
+
+    def _infer_NonMaxSuppression(self, node):
+        selected = str(self._new_symbolic_dim_from_output(node))
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [selected, 3]))
+
+    def _infer_NonZero(self, node):
+        input_rank = self._get_shape_rank(node, 0)
+        # create a new symbolic dimension for NonZero output
+        nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1))
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len]))
+
+    def _infer_OneHot(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        depth = self._try_get_value(node, 1)
+        axis = get_attribute(node, "axis", -1)
+        axis = handle_negative_axis(axis, len(sympy_shape) + 1)
+        new_shape = get_shape_from_sympy_shape(
+            sympy_shape[:axis]
+            + [self._new_symbolic_dim_from_output(node) if not is_literal(depth) else depth]
+            + sympy_shape[axis:]
+        )
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[2]].type.tensor_type.elem_type,
+                new_shape,
+            )
+        )
+
+    def _infer_Pad(self, node):
+        if get_opset(self.out_mp_) <= 10:
+            pads = get_attribute(node, "pads")
+        else:
+            pads = self._try_get_value(node, 1)
+
+        sympy_shape = self._get_sympy_shape(node, 0)
+        rank = len(sympy_shape)
+
+        if pads is not None:
+            assert len(pads) == 2 * rank
+            new_sympy_shape = [
+                d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])
+            ]
+            self._update_computed_dims(new_sympy_shape)
+        else:
+            # dynamic pads, create new symbolic dimensions
+            new_sympy_shape = self._new_symbolic_shape(rank, node)
+        output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape))
+        )
+
+    def _infer_Pool(self, node):
+        sympy_shape = self._compute_conv_pool_shape(node)
+        self._update_computed_dims(sympy_shape)
+        for o in node.output:
+            if not o:
+                continue
+            vi = self.known_vi_[o]
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    o,
+                    vi.type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(sympy_shape),
+                )
+            )
+
+    def _infer_aten_bitwise_or(self, node):
+        shape0 = self._get_shape(node, 0)
+        shape1 = self._get_shape(node, 1)
+        new_shape = self._broadcast_shapes(shape0, shape1)
+        t0 = self.known_vi_[node.input[0]]
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], t0.type.tensor_type.elem_type, new_shape))
+
+    def _infer_aten_diagonal(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        rank = len(sympy_shape)
+        offset = self._try_get_value(node, 1)
+        dim1 = self._try_get_value(node, 2)
+        dim2 = self._try_get_value(node, 3)
+
+        assert offset is not None and dim1 is not None and dim2 is not None
+        dim1 = handle_negative_axis(dim1, rank)
+        dim2 = handle_negative_axis(dim2, rank)
+
+        new_shape = []
+        for dim, val in enumerate(sympy_shape):
+            if dim not in [dim1, dim2]:
+                new_shape.append(val)
+
+        shape1 = sympy_shape[dim1]
+        shape2 = sympy_shape[dim2]
+        if offset >= 0:
+            diag_shape = sympy.Max(0, sympy.Min(shape1, shape2 - offset))
+        else:
+            diag_shape = sympy.Max(0, sympy.Min(shape1 + offset, shape2))
+        new_shape.append(diag_shape)
+
+        if node.output[0]:
+            vi = self.known_vi_[node.output[0]]
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(new_shape),
+                )
+            )
+
+    def _infer_aten_multinomial(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        rank = len(sympy_shape)
+        assert rank in [1, 2]
+        num_samples = self._try_get_value(node, 1)
+        di = rank - 1
+        last_dim = num_samples if num_samples else str(self._new_symbolic_dim_from_output(node, 0, di))
+        output_shape = sympy_shape[:-1] + [last_dim]
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                onnx.TensorProto.INT64,
+                get_shape_from_sympy_shape(output_shape),
+            )
+        )
+
+    def _infer_aten_pool2d(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        assert len(sympy_shape) == 4
+        sympy_shape[-2:] = [self._new_symbolic_dim_from_output(node, 0, i) for i in [2, 3]]
+        self._update_computed_dims(sympy_shape)
+        for i, o in enumerate(node.output):
+            if not o:
+                continue
+            vi = self.known_vi_[o]
+            elem_type = onnx.TensorProto.INT64 if i == 1 else self.known_vi_[node.input[0]].type.tensor_type.elem_type
+            vi.CopyFrom(helper.make_tensor_value_info(o, elem_type, get_shape_from_sympy_shape(sympy_shape)))
+
+    def _infer_aten_minmax(self, node):
+        vi = self.known_vi_[node.output[0]]
+        if len(node.input) == 1:
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, []
+                )
+            )
+        else:
+            assert len(node.input) == 3
+            keepdim = self._try_get_value(node, 2)
+            assert keepdim is not None  # can only handle known keepdim case.
+            dim = self._try_get_value(node, 1)
+            if dim is None:
+                rank = self._get_shape_rank(node, 0)
+                output_shape = self._new_symbolic_shape(rank if keepdim else rank - 1, node)
+            else:
+                shape = self._get_sympy_shape(node, 0)
+                dim = handle_negative_axis(dim, len(shape))
+                output_shape = shape[:dim]
+                if keepdim:
+                    output_shape += [1]
+                output_shape += shape[dim + 1 :]
+
+            output_shape = get_shape_from_sympy_shape(output_shape)
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, output_shape
+                )
+            )
+            vi1 = self.known_vi_[node.output[1]]
+            vi1.CopyFrom(helper.make_tensor_value_info(node.output[1], onnx.TensorProto.INT64, output_shape))
+
+    def _infer_aten_unfold(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        dimension = self._try_get_value(node, 1)
+        size = self._try_get_value(node, 2)
+        step = self._try_get_value(node, 3)
+        if dimension is not None and size is not None and step is not None:
+            assert dimension < len(sympy_shape)
+            sympy_shape[dimension] = (sympy_shape[dimension] - size) // step + 1
+            sympy_shape.append(size)
+        else:
+            rank = len(sympy_shape)
+            sympy_shape = self._new_symbolic_shape(rank + 1, node)
+        self._update_computed_dims(sympy_shape)
+        if node.output[0]:
+            vi = self.known_vi_[node.output[0]]
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(sympy_shape),
+                )
+            )
+
+    def _infer_aten_argmax(self, node):
+        new_shape = None
+        if node.input[1] == "":
+            # The argmax of the flattened input is returned.
+            new_shape = []
+        else:
+            dim = self._try_get_value(node, 1)
+            keepdim = self._try_get_value(node, 2)
+            if keepdim is not None:
+                sympy_shape = self._get_sympy_shape(node, 0)
+                if dim is not None:
+                    dim = handle_negative_axis(dim, len(sympy_shape))
+                    if keepdim:
+                        sympy_shape[dim] = 1
+                    else:
+                        del sympy_shape[dim]
+                else:
+                    rank = len(sympy_shape)
+                    sympy_shape = self._new_symbolic_shape(rank if keepdim else rank - 1, node)
+                self._update_computed_dims(sympy_shape)
+                new_shape = get_shape_from_sympy_shape(sympy_shape)
+        if node.output[0] and new_shape is not None:
+            vi = self.known_vi_[node.output[0]]
+            vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, new_shape))
+
+    def _infer_BatchNormalization(self, node):
+        self._propagate_shape_and_type(node)
+
+        # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop
+        for i in [1, 2, 3, 4]:
+            if i < len(node.output) and node.output[i] != "":
+                # all of these parameters have the same shape as the 1st input
+                self._propagate_shape_and_type(node, input_index=1, output_index=i)
+
+    def _infer_Range(self, node):
+        vi = self.known_vi_[node.output[0]]
+        input_data = self._get_int_values(node)
+        if all([i is not None for i in input_data]):
+            start = as_scalar(input_data[0])
+            limit = as_scalar(input_data[1])
+            delta = as_scalar(input_data[2])
+            new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start) / delta), 0)]
+        else:
+            new_sympy_shape = [self._new_symbolic_dim_from_output(node)]
+        self._update_computed_dims(new_sympy_shape)
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                get_shape_from_sympy_shape(new_sympy_shape),
+            )
+        )
+
+    def _infer_ReduceSum(self, node):
+        keep_dims = get_attribute(node, "keepdims", 1)
+        if get_opset(self.out_mp_) >= 13 and len(node.input) > 1:
+            # ReduceSum changes axes to input[1] in opset 13
+            axes = self._try_get_value(node, 1)
+            vi = self.known_vi_[node.output[0]]
+            if axes is None:
+                assert keep_dims  # can only handle keep_dims==True when axes is unknown, by generating new ranks
+                vi.CopyFrom(
+                    helper.make_tensor_value_info(
+                        node.output[0],
+                        self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                        get_shape_from_sympy_shape(self._new_symbolic_shape(self._get_shape_rank(node, 0), node)),
+                    )
+                )
+            else:
+                shape = self._get_shape(node, 0)
+                output_shape = []
+                axes = [handle_negative_axis(a, len(shape)) for a in axes]
+                for i, d in enumerate(shape):
+                    if i in axes:
+                        if keep_dims:
+                            output_shape.append(1)
+                    else:
+                        output_shape.append(d)
+                vi.CopyFrom(
+                    helper.make_tensor_value_info(
+                        node.output[0],
+                        self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                        output_shape,
+                    )
+                )
+
+    def _infer_ReduceProd(self, node):
+        axes = get_attribute(node, "axes")
+        keep_dims = get_attribute(node, "keepdims", 1)
+        if keep_dims == 0 and axes == [0]:
+            data = self._get_int_values(node)[0]
+            if data is not None:
+                self.sympy_data_[node.output[0]] = sympy_reduce_product(data)
+
+    def _infer_Reshape(self, node):
+        shape_value = self._try_get_value(node, 1)
+        vi = self.known_vi_[node.output[0]]
+        if shape_value is None:
+            shape_shape = self._get_shape(node, 1)
+            assert len(shape_shape) == 1
+            shape_rank = shape_shape[0]
+            assert is_literal(shape_rank)
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    vi.type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(self._new_symbolic_shape(shape_rank, node)),
+                )
+            )
+        else:
+            input_sympy_shape = self._get_sympy_shape(node, 0)
+            total = int(1)
+            for d in input_sympy_shape:
+                total = total * d
+            new_sympy_shape = []
+            deferred_dim_idx = -1
+            non_deferred_size = int(1)
+            for i, d in enumerate(shape_value):
+                if type(d) == sympy.Symbol:
+                    new_sympy_shape.append(d)
+                elif d == 0:
+                    new_sympy_shape.append(input_sympy_shape[i])
+                    non_deferred_size = non_deferred_size * input_sympy_shape[i]
+                else:
+                    new_sympy_shape.append(d)
+                if d == -1:
+                    deferred_dim_idx = i
+                elif d != 0:
+                    non_deferred_size = non_deferred_size * d
+
+            assert new_sympy_shape.count(-1) < 2
+            if -1 in new_sympy_shape:
+                new_dim = total // non_deferred_size
+                new_sympy_shape[deferred_dim_idx] = new_dim
+
+            self._update_computed_dims(new_sympy_shape)
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    vi.type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(new_sympy_shape),
+                )
+            )
+
+        self._pass_on_sympy_data(node)
+
+    def _infer_Resize(self, node):
+        vi = self.known_vi_[node.output[0]]
+        input_sympy_shape = self._get_sympy_shape(node, 0)
+        if get_opset(self.out_mp_) <= 10:
+            scales = self._try_get_value(node, 1)
+            if scales is not None:
+                new_sympy_shape = [sympy.simplify(sympy.floor(d * s)) for d, s in zip(input_sympy_shape, scales)]
+                self._update_computed_dims(new_sympy_shape)
+                vi.CopyFrom(
+                    helper.make_tensor_value_info(
+                        node.output[0],
+                        self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                        get_shape_from_sympy_shape(new_sympy_shape),
+                    )
+                )
+        else:
+            roi = self._try_get_value(node, 1)
+            scales = self._try_get_value(node, 2)
+            sizes = self._try_get_value(node, 3)
+            if sizes is not None:
+                new_sympy_shape = [sympy.simplify(sympy.floor(s)) for s in sizes]
+                self._update_computed_dims(new_sympy_shape)
+            elif scales is not None:
+                rank = len(scales)
+                if get_attribute(node, "coordinate_transformation_mode") == "tf_crop_and_resize":
+                    assert len(roi) == 2 * rank
+                    roi_start = list(roi)[:rank]
+                    roi_end = list(roi)[rank:]
+                else:
+                    roi_start = [0] * rank
+                    roi_end = [1] * rank
+                scales = list(scales)
+                new_sympy_shape = [
+                    sympy.simplify(sympy.floor(d * (end - start) * scale))
+                    for d, start, end, scale in zip(input_sympy_shape, roi_start, roi_end, scales)
+                ]
+                self._update_computed_dims(new_sympy_shape)
+            else:
+                new_sympy_shape = self._new_symbolic_shape(self._get_shape_rank(node, 0), node)
+
+            vi.CopyFrom(
+                helper.make_tensor_value_info(
+                    node.output[0],
+                    self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                    get_shape_from_sympy_shape(new_sympy_shape),
+                )
+            )
+
+    def _infer_Scan(self, node):
+        """Infer Scan outputs by running inference on the body with the scan axis removed from scan inputs."""
+        subgraph = get_attribute(node, "body")
+        num_scan_inputs = get_attribute(node, "num_scan_inputs")
+        scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs)
+        num_scan_states = len(node.input) - num_scan_inputs  # node inputs are the states followed by the scan inputs
+        scan_input_axes = [
+            handle_negative_axis(ax, self._get_shape_rank(node, i + num_scan_states))
+            for i, ax in enumerate(scan_input_axes)
+        ]
+        # The body may list extra optional inputs (also initializers) that the node does not feed; skip them.
+        assert len(subgraph.input) >= len(node.input)
+        subgraph_inputs = subgraph.input[: len(node.input)]
+        for i, si in enumerate(subgraph_inputs):
+            subgraph_name = si.name
+            si.CopyFrom(self.known_vi_[node.input[i]])
+            if i >= num_scan_states:  # scan input: drop its scan axis from the body input's shape
+                scan_input_dim = si.type.tensor_type.shape.dim
+                scan_input_dim.remove(scan_input_dim[scan_input_axes[i - num_scan_states]])
+            si.name = subgraph_name  # keep the body's own input name
+        self._onnx_infer_subgraph(node, subgraph)
+        num_scan_outputs = len(node.output) - num_scan_states
+        scan_output_axes = get_attribute(node, "scan_output_axes", [0] * num_scan_outputs)
+        scan_input_dim = get_shape_from_type_proto(self.known_vi_[node.input[-1]].type)[scan_input_axes[-1]]
+        for i, o in enumerate(node.output):
+            vi = self.known_vi_[o]
+            if i >= num_scan_states:  # scan output: insert the scan dimension at its output axis
+                shape = get_shape_from_type_proto(subgraph.output[i].type)
+                new_dim = handle_negative_axis(scan_output_axes[i - num_scan_states], len(shape) + 1)
+                shape = shape[:new_dim] + [scan_input_dim] + shape[new_dim:]
+                vi.CopyFrom(helper.make_tensor_value_info(o, subgraph.output[i].type.tensor_type.elem_type, shape))
+            else:
+                vi.CopyFrom(subgraph.output[i])
+            vi.name = o
+
+    def _infer_ScatterElements(self, node):
+        data_shape = self._get_shape(node, 0)
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                self.known_vi_[node.input[0]].type.tensor_type.elem_type,
+                data_shape,
+            )
+        )
+
+    def _infer_SequenceAt(self, node):
+        # need to create new symbolic dimension if sequence shape has None:
+        seq_shape = self._get_shape(node, 0)
+        vi = self.known_vi_[node.output[0]]
+        if seq_shape is not None:
+            for di, d in enumerate(seq_shape):
+                if d is not None:
+                    continue
+                new_dim = onnx.TensorShapeProto.Dimension()
+                new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di))
+                vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim)
+
+    def _infer_SequenceInsert(self, node):
+        # workaround bug in onnx's shape inference
+        vi_seq = self.known_vi_[node.input[0]]
+        vi_tensor = self.known_vi_[node.input[1]]
+        vi_out_seq = self.known_vi_[node.output[0]]
+        vi_out_seq.CopyFrom(vi_seq)
+        vi_out_seq.name = node.output[0]
+        self._fuse_tensor_type(node, 0, vi_out_seq.type, vi_tensor.type)
+
+    def _infer_Shape(self, node):  # record input 0's sympy shape as the sympy data of output 0
+        self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0)
+
+    def _infer_Size(self, node):
+        sympy_shape = self._get_sympy_shape(node, 0)
+        self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape)
+        self.known_vi_[node.output[0]].CopyFrom(
+            helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [])
+        )
+
+    def _infer_Slice(self, node):
+        def less_equal(x, y):
+            try:
+                return bool(x <= y)
+            except TypeError:
+                pass
+            try:
+                return bool(y >= x)
+            except TypeError:
+                pass
+            try:
+                return bool(-x >= -y)
+            except TypeError:
+                pass
+            try:
+                return bool(-y <= -x)
+            except TypeError:
+                # the last attempt; this may raise TypeError
+                return bool(y - x >= 0)
+
+        def handle_negative_index(index, bound):
+            """normalizes a negative index to be in [0, bound)"""
+            try:
+                if not less_equal(0, index):
+                    if is_literal(index) and index <= -self.int_max_:
+                        # this case is handled separately
+                        return index
+                    return bound + index
+            except TypeError:
+                logger.warning("Cannot determine if {} < 0".format(index))
+            return index
+
+        if get_opset(self.out_mp_) <= 9:
+            axes = get_attribute(node, "axes")
+            starts = get_attribute(node, "starts")
+            ends = get_attribute(node, "ends")
+            if not axes:
+                axes = list(range(len(starts)))
+            steps = [1] * len(axes)
+        else:
+            starts = as_list(self._try_get_value(node, 1), keep_none=True)
+            ends = as_list(self._try_get_value(node, 2), keep_none=True)
+            axes = self._try_get_value(node, 3)
+            steps = self._try_get_value(node, 4)
+            if axes is None and not (starts is None and ends is None):
+                axes = list(range(0, len(starts if starts is not None else ends)))
+            if steps is None and not (starts is None and ends is None):
+                steps = [1] * len(starts if starts is not None else ends)
+            axes = as_list(axes, keep_none=True)
+            steps = as_list(steps, keep_none=True)
+
+        new_sympy_shape = self._get_sympy_shape(node, 0)
+        if starts is None or ends is None:
+            if axes is None:
+                for i in range(len(new_sympy_shape)):
+                    new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i)
+            else:
+                new_sympy_shape = get_shape_from_sympy_shape(new_sympy_shape)
+                for i in axes:
+                    new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i)
+        else:
+            for i, s, e, t in zip(axes, starts, ends, steps):
+                e = handle_negative_index(e, new_sympy_shape[i])
+                if is_literal(e):
+                    if e >= self.int_max_:
+                        e = new_sympy_shape[i]
+                    elif e <= -self.int_max_:
+                        e = 0 if s > 0 else -1
+                    elif is_literal(new_sympy_shape[i]):
+                        if e < 0:
+                            e = max(0, e + new_sympy_shape[i])
+                        e = min(e, new_sympy_shape[i])
+                    else:
+                        if e > 0:
+                            e = (
+                                sympy.Min(e, new_sympy_shape[i]) if e > 1 else e
+                            )  # special case for slicing first to make computation easier
+                else:
+                    if is_literal(new_sympy_shape[i]):
+                        e = sympy.Min(e, new_sympy_shape[i])
+                    else:
+                        try:
+                            if not less_equal(e, new_sympy_shape[i]):
+                                e = new_sympy_shape[i]
+                        except Exception:
+                            logger.warning(
+                                "Unable to determine if {} <= {}, treat as equal".format(e, new_sympy_shape[i])
+                            )
+                            e = new_sympy_shape[i]
+
+                s = handle_negative_index(s, new_sympy_shape[i])
+                if is_literal(new_sympy_shape[i]) and is_literal(s):
+                    s = max(0, min(s, new_sympy_shape[i]))
+
+                new_sympy_shape[i] = sympy.simplify((e - s + t + (-1 if t > 0 else 1)) // t)
+
+            self._update_computed_dims(new_sympy_shape)
+
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(
+            helper.make_tensor_value_info(
+                node.output[0],
+                vi.type.tensor_type.elem_type,
+                get_shape_from_sympy_shape(new_sympy_shape),
+            )
+        )
+
+        # handle sympy_data if needed, for slice in shape computation
+        if (
+            node.input[0] in self.sympy_data_
+            and [0] == axes
+            and len(starts) == 1
+            and len(ends) == 1
+            and len(steps) == 1
+        ):
+            input_sympy_data = self.sympy_data_[node.input[0]]
+            if type(input_sympy_data) == list or (
+                type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1
+            ):
+                self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]]
+
+    def _infer_SoftmaxCrossEntropyLoss(self, node):
+        vi = self.known_vi_[node.output[0]]
+        elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi.type.tensor_type.elem_type = elem_type
+        vi.type.tensor_type.shape.CopyFrom(onnx.TensorShapeProto())
+
+        if len(node.output) > 1:
+            data_shape = self._get_shape(node, 0)
+            vi = self.known_vi_[node.output[1]]
+            vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape))
+
+    def _infer_Split_Common(self, node, make_value_info_func):
+        """Shape every split output: input 0's shape with the `axis` dim replaced by that piece's size.
+
+        Piece sizes come from the "split" attribute; when it is absent or empty the axis dim is
+        divided evenly across the node outputs. `make_value_info_func` builds each value info.
+        """
+        in_dims = self._get_sympy_shape(node, 0)
+        axis = handle_negative_axis(get_attribute(node, "axis", 0), len(in_dims))
+        sizes = get_attribute(node, "split")
+        if sizes:
+            sizes = [sympy.Integer(size) for size in sizes]
+        else:
+            n_out = len(node.output)
+            sizes = [in_dims[axis] / sympy.Integer(n_out)] * n_out
+            self._update_computed_dims(sizes)
+        for idx, size in enumerate(sizes):
+            out_vi = self.known_vi_[node.output[idx]]
+            out_dims = in_dims[:axis] + [size] + in_dims[axis + 1 :]
+            dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+            out_vi.CopyFrom(make_value_info_func(node.output[idx], dtype, get_shape_from_sympy_shape(out_dims)))
+            self.known_vi_[out_vi.name] = out_vi
+
+    def _infer_Split(self, node):  # Split: outputs are built with helper.make_tensor_value_info
+        self._infer_Split_Common(node, helper.make_tensor_value_info)
+
+    def _infer_SplitToSequence(self, node):  # same logic, built with helper.make_sequence_value_info
+        self._infer_Split_Common(node, helper.make_sequence_value_info)
+
+    def _infer_Squeeze(self, node):
+        """Infer the Squeeze output shape by removing the squeezed axes from input 0's shape.
+
+        Axes come from the "axes" attribute below opset 13 and from input 1 otherwise. Without
+        axes every dim equal to 1 is removed; symbolic dims are then assumed to differ from 1.
+        """
+        in_shape = self._get_shape(node, 0)
+        opset = get_opset(self.out_mp_)
+        # the opset decides where 'axes' lives; the other location must be unused
+        if opset < 13:
+            axes = get_attribute(node, "axes")
+            assert self._try_get_value(node, 1) is None
+        else:
+            axes = self._try_get_value(node, 1)
+            assert get_attribute(node, "axes") is None
+
+        if axes is not None:
+            axes = [handle_negative_axis(a, len(in_shape)) for a in axes]
+            out_shape = []
+            for pos, dim in enumerate(in_shape):
+                if pos not in axes:
+                    out_shape.append(dim)
+                    continue
+                # a squeezed axis must be literally 1, or symbolic (then assumed to be 1)
+                is_symbolic = type(dim) is not int
+                assert dim == 1 or is_symbolic
+                if self.verbose_ > 0 and is_symbolic:
+                    logger.debug(
+                        f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. "
+                        + f"Assuming the dimension '{dim}' at index {pos} of the input to be equal to 1."
+                    )
+        else:
+            # No axes given (attribute or input): the Squeeze op removes every axis of size 1.
+            out_shape = [dim for dim in in_shape if dim != 1]
+            if self.verbose_ > 0:
+                symbolic = [dim for dim in in_shape if type(dim) is not int]
+                if symbolic:
+                    logger.debug(
+                        f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. "
+                        + f"Assuming the following dimensions are never equal to 1: {symbolic}"
+                    )
+
+        # the output keeps input 0's element type
+        out_vi = self.known_vi_[node.output[0]]
+        dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        out_vi.CopyFrom(helper.make_tensor_value_info(node.output[0], dtype, out_shape))
+        self._pass_on_sympy_data(node)
+
+    def _infer_Tile(self, node):
+        """Infer the Tile output shape: each dim of input 0 multiplied by its repeat count.
+
+        When the repeats (input 1) have no known value, the output gets a new symbolic shape
+        with the rank of input 0. The output's existing element type is kept.
+        """
+        repeats = self._try_get_value(node, 1)
+        if repeats is None:
+            tiled_shape = self._new_symbolic_shape(self._get_shape_rank(node, 0), node)
+        else:
+            # repeats[i] scales dim i; indexing (not zip) still fails if repeats is too short
+            tiled_shape = [dim * repeats[i] for i, dim in enumerate(self._get_sympy_shape(node, 0))]
+            self._update_computed_dims(tiled_shape)
+
+        out_vi = self.known_vi_[node.output[0]]
+        out_dtype = out_vi.type.tensor_type.elem_type
+        out_vi.CopyFrom(
+            helper.make_tensor_value_info(node.output[0], out_dtype, get_shape_from_sympy_shape(tiled_shape))
+        )
+
+    def _infer_TopK(self, node):
+        """Infer TopK output shapes: input 0's shape with the `axis` dim replaced by k.
+
+        k is the "k" attribute up to opset 9 and input 1 afterwards; when its value is not
+        known a new symbolic dim is used instead. Every output gets the same shape.
+        """
+        rank = self._get_shape_rank(node, 0)
+        axis = handle_negative_axis(get_attribute(node, "axis", -1), rank)
+        new_shape = self._get_shape(node, 0)
+        if get_opset(self.out_mp_) <= 9:
+            k = get_attribute(node, "k")
+        else:
+            k = self._get_int_values(node)[1]
+
+        # identity test: `== None` would compare element-wise if k were a numpy array
+        k = self._new_symbolic_dim_from_output(node) if k is None else as_scalar(k)
+        if type(k) in [int, str]:
+            new_shape[axis] = k
+        else:
+            # k may have been computed in sympy_data, so record it as a computed dim as it enters the shape
+            new_sympy_shape = self._get_sympy_shape(node, 0)
+            new_sympy_shape[axis] = k
+            self._update_computed_dims(new_sympy_shape)
+            new_shape = get_shape_from_sympy_shape(new_sympy_shape)
+
+        for out_name in node.output:
+            vi = self.known_vi_[out_name]
+            vi.CopyFrom(helper.make_tensor_value_info(out_name, vi.type.tensor_type.elem_type, new_shape))
+
+    def _infer_Transpose(self, node):
+        """Propagate sympy data through Transpose; does nothing unless input 0 has sympy data."""
+        if node.input[0] not in self.sympy_data_:
+            return
+        data_shape = self._get_shape(node, 0)
+        perm = get_attribute(node, "perm", reversed(list(range(len(data_shape)))))
+        values = np.array(self.sympy_data_[node.input[0]]).reshape(*data_shape)
+        self.sympy_data_[node.output[0]] = np.transpose(values, axes=tuple(perm)).flatten().tolist()
+
+    def _infer_Unsqueeze(self, node):
+        """Infer the Unsqueeze output shape by inserting size-1 dims at the requested axes.
+
+        Axes come from the "axes" attribute below opset 13 and from input 1 otherwise; negative
+        axes are resolved against the output rank.
+        """
+        in_shape = self._get_shape(node, 0)
+        opset = get_opset(self.out_mp_)
+        # the opset decides where 'axes' lives; the other location must be unused
+        if opset < 13:
+            axes = get_attribute(node, "axes")
+            assert self._try_get_value(node, 1) is None
+        else:
+            axes = self._try_get_value(node, 1)
+            assert get_attribute(node, "axes") is None
+
+        out_rank = len(in_shape) + len(axes)
+        axes = [handle_negative_axis(a, out_rank) for a in axes]
+
+        # walk the output positions, consuming one input dim wherever no new axis is inserted
+        out_shape = []
+        consumed = 0
+        for pos in range(out_rank):
+            if pos in axes:
+                out_shape.append(1)
+                continue
+            out_shape.append(in_shape[consumed])
+            consumed += 1
+
+        out_vi = self.known_vi_[node.output[0]]
+        dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        out_vi.CopyFrom(helper.make_tensor_value_info(node.output[0], dtype, out_shape))
+
+        self._pass_on_sympy_data(node)
+
+    def _infer_ZipMap(self, node):
+        """Type the ZipMap output as a sequence of maps keyed by class label with float tensor values."""
+        if get_attribute(node, "classlabels_int64s") is not None:
+            key_type = onnx.TensorProto.INT64
+        elif get_attribute(node, "classlabels_strings") is not None:
+            key_type = onnx.TensorProto.STRING
+        else:
+            key_type = None
+        assert key_type is not None
+        seq_of_maps = onnx.ValueInfoProto()
+        seq_of_maps.name = node.output[0]
+        seq_of_maps.type.sequence_type.elem_type.map_type.key_type = key_type
+        seq_of_maps.type.sequence_type.elem_type.map_type.value_type.tensor_type.elem_type = onnx.TensorProto.FLOAT
+        self.known_vi_[node.output[0]].CopyFrom(seq_of_maps)
+
+    def _infer_Attention(self, node):
+        """Infer the Attention output shape and, when a second output exists, the present-state shape."""
+        shape = self._get_shape(node, 0)
+        shape_bias = self._get_shape(node, 2)
+        assert len(shape) == 3 and len(shape_bias) == 1
+        qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes")
+        if qkv_hidden_sizes_attr is not None:
+            assert len(qkv_hidden_sizes_attr) == 3
+            shape[2] = int(qkv_hidden_sizes_attr[2])
+        else:
+            shape[2] = int(shape_bias[0] / 3)
+        output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape))
+
+        if len(node.output) > 1:
+            # past: (2, batch_size, num_heads, past_sequence_length, head_size); present has the same layout with
+            # total_sequence_length = sequence_length + past_sequence_length in dim 3 (a 2-D/3-D mask's last dim).
+            # mask (input 3) and past (input 4) are optional, so a missing one is treated as an empty shape.
+            input_shape = self._get_shape(node, 0)
+            past_shape = self._get_shape(node, 4) if len(node.input) > 4 and node.input[4] else []
+            mask_shape = self._get_shape(node, 3) if len(node.input) > 3 and node.input[3] else []
+            if len(past_shape) == 5:
+                if len(mask_shape) in [2, 3]:
+                    past_shape[3] = mask_shape[-1]
+                elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int):
+                    past_shape[3] = input_shape[1] + past_shape[3]
+                else:
+                    past_shape[3] = f"{past_shape[3]}+{input_shape[1]}"
+                vi = self.known_vi_[node.output[1]]
+                vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, past_shape))
+
+    def _infer_BiasGelu(self, node):  # output 0 takes input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_FastGelu(self, node):  # output 0 takes input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_Gelu(self, node):  # output 0 takes input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_LayerNormalization(self, node):  # only output 0 is set: input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_LongformerAttention(self, node):  # output 0 takes input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_EmbedLayerNormalization(self, node):
+        """Infer EmbedLayerNormalization outputs from the input ids (input 0) and word embedding (input 2)."""
+        ids_shape = self._get_shape(node, 0)
+        embedding_shape = self._get_shape(node, 2)
+        assert len(ids_shape) == 2 and len(embedding_shape) == 2
+        embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type
+        # main output: the ids shape followed by the embedding's second dim
+        embedded_shape = [*ids_shape, embedding_shape[1]]
+
+        def assign(out_idx, dtype, shape):
+            name = node.output[out_idx]
+            self.known_vi_[name].CopyFrom(helper.make_tensor_value_info(name, dtype, shape))
+
+        assign(0, embedding_dtype, embedded_shape)
+        # mask index: INT32, its single dim is the first dim of the ids
+        assign(1, onnx.TensorProto.INT32, [ids_shape[0]])
+        if len(node.output) > 2:
+            # optional output of the add done before layer normalization; same shape as output 0
+            assign(2, embedding_dtype, embedded_shape)
+
+    def _infer_SkipLayerNormalization(self, node):  # only output 0 is set: input 0's shape and element type
+        self._propagate_shape_and_type(node)
+
+    def _infer_PythonOp(self, node):
+        """Infer PythonOp outputs from the node's attributes.
+
+        "output_tensor_types" and "output_tensor_ranks" must both be present and non-empty.
+        """
+        dtypes = get_attribute(node, "output_tensor_types")
+        assert dtypes
+        ranks = get_attribute(node, "output_tensor_ranks")
+        assert ranks
+
+        # Output 0 is autograd's context and is set separately: INT64 with an empty shape.
+        ctx_name = node.output[0]
+        self.known_vi_[ctx_name].CopyFrom(helper.make_tensor_value_info(ctx_name, onnx.TensorProto.INT64, []))
+
+        # The outputs after the context are tensors. Their ranks are assumed to be fixed for
+        # different model inputs, so each gets a new symbolic shape of the declared rank.
+        for idx, out_name in enumerate(node.output[1:]):
+            out_vi = self.known_vi_[out_name]
+            dims = get_shape_from_sympy_shape(self._new_symbolic_shape(ranks[idx], node))
+            out_vi.CopyFrom(helper.make_tensor_value_info(out_name, dtypes[idx], dims))
+
+    def _propagate_shape_and_type(self, node, input_index=0, output_index=0):  # copy shape + dtype, input -> output
+        src_shape = self._get_shape(node, input_index)
+        src_dtype = self.known_vi_[node.input[input_index]].type.tensor_type.elem_type
+        dst_name = node.output[output_index]
+        self.known_vi_[dst_name].CopyFrom(helper.make_tensor_value_info(dst_name, src_dtype, src_shape))
+
+    def _is_none_dim(self, dim_value):
+        """Return True for a string dim containing "unk__" that is not a key of self.symbolic_dims_."""
+        # "unk__<index>" is what onnx >= 1.11.0 uses instead of None for an uncertain dim
+        return (
+            type(dim_value) is str
+            and "unk__" in dim_value
+            and dim_value not in self.symbolic_dims_
+        )
+
+    def _is_shape_contains_none_dim(self, out_shape):
+        """Return the first dim of `out_shape` that `_is_none_dim` accepts, or None when there is none."""
+        # callers use the result both as a truth value and as the dim to look up with list.index
+        placeholders = (dim for dim in out_shape if self._is_none_dim(dim))
+        return next(placeholders, None)
+
+    def _infer_impl(self, start_sympy_data=None):
+        """Run one inference pass over the graph nodes.
+
+        Returns True when all nodes were handled, False when it stopped at an incomplete output shape.
+        """
+        self.sympy_data_ = start_sympy_data or {}
+        self.out_mp_.graph.ClearField("value_info")
+        self._apply_suggested_merge(graph_input_only=True)
+        self.input_symbols_ = set()
+        for i in self.out_mp_.graph.input:
+            input_shape = get_shape_from_value_info(i)
+            if input_shape is None:
+                continue
+
+            if is_sequence(i.type):
+                input_dims = i.type.sequence_type.elem_type.tensor_type.shape.dim
+            else:
+                input_dims = i.type.tensor_type.shape.dim
+
+            for i_dim, dim in enumerate(input_shape):
+                if dim is None:
+                    # some models use None for symbolic dim in input, replace it with a string
+                    input_dims[i_dim].dim_param = str(self._new_symbolic_dim(i.name, i_dim))
+
+            self.input_symbols_.update([d for d in input_shape if type(d) == str])
+
+        for s in self.input_symbols_:
+            if s in self.suggested_merge_:
+                s_merge = self.suggested_merge_[s]
+                assert s_merge in self.symbolic_dims_
+                self.symbolic_dims_[s] = self.symbolic_dims_[s_merge]
+            else:
+                # Since inputs are not produced by other ops, we can assume positivity
+                self.symbolic_dims_[s] = sympy.Symbol(s, integer=True, positive=True)
+        # create a temporary ModelProto for single node inference
+        # note that we remove initializer to have faster inference
+        # for tensor ops like Reshape/Tile/Expand that read initializer, we need to do sympy computation based inference anyways
+        self.tmp_mp_ = onnx.ModelProto()
+        self.tmp_mp_.CopyFrom(self.out_mp_)
+        self.tmp_mp_.graph.ClearField("initializer")
+
+        # compute prerequisite for node for topological sort
+        # node with subgraphs may have dependency on implicit inputs, which will affect topological sort
+        prereq_for_node = {}  # map from node to all its inputs, including implicit ones in subgraph
+
+        def get_prereq(node):
+            names = set(i for i in node.input if i)
+            subgraphs = []
+            if "If" == node.op_type:
+                subgraphs = [get_attribute(node, "then_branch"), get_attribute(node, "else_branch")]
+            elif node.op_type in ["Loop", "Scan"]:
+                subgraphs = [get_attribute(node, "body")]
+            for g in subgraphs:
+                g_outputs_and_initializers = {i.name for i in g.initializer}
+                g_prereq = set()
+                for n in g.node:
+                    g_outputs_and_initializers.update(n.output)
+                for n in g.node:
+                    g_prereq.update([i for i in get_prereq(n) if i not in g_outputs_and_initializers])
+                names.update(g_prereq)
+                # remove subgraph inputs from g_prereq since those are local-only
+                for i in g.input:
+                    if i.name in names:
+                        names.remove(i.name)
+            return names
+
+        for n in self.tmp_mp_.graph.node:
+            prereq_for_node[n.output[0]] = get_prereq(n)
+
+        # topological sort nodes, note there might be dead nodes so we check if all graph outputs are reached to terminate
+        sorted_nodes = []
+        sorted_known_vi = set([i.name for i in list(self.out_mp_.graph.input) + list(self.out_mp_.graph.initializer)])
+        if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]):
+            # Loop/Scan will have some graph output in graph inputs, so don't do topological sort
+            sorted_nodes = self.out_mp_.graph.node
+        else:
+            while not all([o.name in sorted_known_vi for o in self.out_mp_.graph.output]):
+                old_sorted_nodes_len = len(sorted_nodes)
+                for node in self.out_mp_.graph.node:
+                    if (node.output[0] not in sorted_known_vi) and all(
+                        [i in sorted_known_vi for i in prereq_for_node[node.output[0]] if i]
+                    ):
+                        sorted_known_vi.update(node.output)
+                        sorted_nodes.append(node)
+                if old_sorted_nodes_len == len(sorted_nodes) and not all(
+                    [o.name in sorted_known_vi for o in self.out_mp_.graph.output]
+                ):
+                    raise Exception("Invalid model with cyclic graph")
+
+        for node in sorted_nodes:
+            assert all([i in self.known_vi_ for i in node.input if i])
+            self._onnx_infer_single_node(node)
+            known_aten_op = False
+            if node.op_type in self.dispatcher_:
+                self.dispatcher_[node.op_type](node)
+            elif node.op_type in ["ConvTranspose"]:
+                # onnx shape inference ops like ConvTranspose may have empty shape for symbolic input
+                # before adding symbolic compute for them
+                # mark the output type as UNDEFINED to allow guessing of rank
+                vi = self.known_vi_[node.output[0]]
+                if len(vi.type.tensor_type.shape.dim) == 0:
+                    vi.type.tensor_type.elem_type = onnx.TensorProto.UNDEFINED
+            elif node.op_type == "ATen" and node.domain == "org.pytorch.aten":
+                for attr in node.attribute:
+                    # TODO: Is overload_name needed?
+                    if attr.name == "operator":
+                        aten_op_name = attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s
+                        if aten_op_name in self.aten_op_dispatcher_:
+                            known_aten_op = True
+                            self.aten_op_dispatcher_[aten_op_name](node)
+                        break
+
+            if self.verbose_ > 2:
+                logger.debug(node.op_type + ": " + node.name)
+                for i, name in enumerate(node.input):
+                    logger.debug(
+                        "  Input {}: {} {}".format(i, name, "initializer" if name in self.initializers_ else "")
+                    )
+
+            # onnx automatically merge dims with value, i.e. Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb']
+            # symbolic shape inference needs to apply merge of 'aaa' -> 1000 in this case
+            if node.op_type in [
+                "Add",
+                "Sub",
+                "Mul",
+                "Div",
+                "MatMul",
+                "MatMulInteger",
+                "MatMulInteger16",
+                "Where",
+                "Sum",
+            ]:
+                vi = self.known_vi_[node.output[0]]
+                out_rank = len(get_shape_from_type_proto(vi.type))
+                in_shapes = [self._get_shape(node, i) for i in range(len(node.input))]
+                for d in range(out_rank - (2 if node.op_type in ["MatMul", "MatMulInteger", "MatMulInteger16"] else 0)):
+                    in_dims = [s[len(s) - out_rank + d] for s in in_shapes if len(s) + d >= out_rank]
+                    if len(in_dims) > 1:
+                        self._check_merged_dims(in_dims, allow_broadcast=True)
+
+            for i_o in range(len(node.output)):
+                vi = self.known_vi_[node.output[i_o]]
+                out_type = vi.type
+                out_type_kind = out_type.WhichOneof("value")
+
+                # do not process shape for non-tensors
+                if out_type_kind not in ["tensor_type", "sparse_tensor_type", None]:
+                    if self.verbose_ > 2:
+                        if out_type_kind == "sequence_type":
+                            seq_cls_type = out_type.sequence_type.elem_type.WhichOneof("value")
+                            if "tensor_type" == seq_cls_type:
+                                logger.debug(
+                                    "  {}: sequence of {} {}".format(
+                                        node.output[i_o],
+                                        str(get_shape_from_value_info(vi)),
+                                        onnx.TensorProto.DataType.Name(
+                                            vi.type.sequence_type.elem_type.tensor_type.elem_type
+                                        ),
+                                    )
+                                )
+                            else:
+                                logger.debug("  {}: sequence of {}".format(node.output[i_o], seq_cls_type))
+                        else:
+                            logger.debug("  {}: {}".format(node.output[i_o], out_type_kind))
+                    continue
+
+                out_shape = get_shape_from_value_info(vi)
+                out_type_undefined = out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED
+                if self.verbose_ > 2:
+                    logger.debug(
+                        "  {}: {} {}".format(
+                            node.output[i_o],
+                            str(out_shape),
+                            onnx.TensorProto.DataType.Name(vi.type.tensor_type.elem_type),
+                        )
+                    )
+                    if node.output[i_o] in self.sympy_data_:
+                        logger.debug("  Sympy Data: " + str(self.sympy_data_[node.output[i_o]]))
+
+                # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain
+                if (
+                    out_shape is not None and (None in out_shape or self._is_shape_contains_none_dim(out_shape))
+                ) or out_type_undefined:
+                    if self.auto_merge_:
+                        if node.op_type in [
+                            "Add",
+                            "Sub",
+                            "Mul",
+                            "Div",
+                            "MatMul",
+                            "MatMulInteger",
+                            "MatMulInteger16",
+                            "Concat",
+                            "Where",
+                            "Sum",
+                            "Equal",
+                            "Less",
+                            "Greater",
+                            "LessOrEqual",
+                            "GreaterOrEqual",
+                            "Min",
+                            "Max",
+                        ]:
+                            shapes = [self._get_shape(node, i) for i in range(len(node.input))]
+                            if node.op_type in ["MatMul", "MatMulInteger", "MatMulInteger16"]:
+                                if None in out_shape or self._is_shape_contains_none_dim(out_shape):
+                                    if None in out_shape:
+                                        idx = out_shape.index(None)
+                                    else:
+                                        idx = out_shape.index(self._is_shape_contains_none_dim(out_shape))
+                                    dim_idx = [len(s) - len(out_shape) + idx for s in shapes]
+                                    # only support auto merge for MatMul for dim < rank-2 when rank > 2
+                                    assert len(shapes[0]) > 2 and dim_idx[0] < len(shapes[0]) - 2
+                                    assert len(shapes[1]) > 2 and dim_idx[1] < len(shapes[1]) - 2
+                        elif node.op_type == "Expand":
+                            # auto merge for cases like Expand([min(batch, 1), min(seq, 512)], [batch, seq])
+                            shapes = [
+                                self._get_shape(node, 0),
+                                self._get_value(node, 1),
+                            ]
+                        else:
+                            shapes = []
+
+                        if shapes:
+                            for idx in range(len(out_shape)):
+                                if out_shape[idx] is not None and not self._is_none_dim(out_shape[idx]):
+                                    continue
+                                # note that the broadcasting rule aligns from right to left
+                                # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge
+                                dim_idx = [len(s) - len(out_shape) + idx for s in shapes]
+                                if len(dim_idx) > 0:
+                                    self._add_suggested_merge(
+                                        [
+                                            s[i] if is_literal(s[i]) else str(s[i])
+                                            for s, i in zip(shapes, dim_idx)
+                                            if i >= 0
+                                        ]
+                                    )
+                            self.run_ = True
+                        else:
+                            self.run_ = False
+                    else:
+                        self.run_ = False
+
+                    # create new dynamic dims for ops not handled by symbolic shape inference
+                    if not self.run_ and node.op_type not in self.dispatcher_ and not known_aten_op:
+                        is_unknown_op = out_type_undefined and (out_shape is None or len(out_shape) == 0)
+                        if is_unknown_op:
+                            # unknown op to ONNX, maybe from higher opset or other domain
+                            # only guess the output rank from input 0 when using guess_output_rank option
+                            out_rank = self._get_shape_rank(node, 0) if self.guess_output_rank_ else -1
+                        else:
+                            # valid ONNX op, but not handled by symbolic shape inference, just assign dynamic shape
+                            out_rank = len(out_shape)
+
+                        if out_rank >= 0:
+                            new_shape = self._new_symbolic_shape(out_rank, node, i_o)
+                            if out_type_undefined:
+                                # guess output data type from input vi if not defined
+                                out_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+                            else:
+                                # otherwise, use original data type
+                                out_dtype = vi.type.tensor_type.elem_type
+                            vi.CopyFrom(
+                                helper.make_tensor_value_info(
+                                    vi.name,
+                                    out_dtype,
+                                    get_shape_from_sympy_shape(new_shape),
+                                )
+                            )
+
+                            if self.verbose_ > 0:
+                                if is_unknown_op:
+                                    logger.debug(
+                                        "Possible unknown op: {} node: {}, guessing {} shape".format(
+                                            node.op_type, node.name, vi.name
+                                        )
+                                    )
+                                if self.verbose_ > 2:
+                                    logger.debug(
+                                        "  {}: {} {}".format(
+                                            node.output[i_o],
+                                            str(new_shape),
+                                            vi.type.tensor_type.elem_type,
+                                        )
+                                    )
+
+                            self.run_ = True
+                            continue  # continue the inference after guess, no need to stop as no merge is needed
+
+                    if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined:
+                        logger.debug("Stopping at incomplete shape inference at " + node.op_type + ": " + node.name)
+                        logger.debug("node inputs:")
+                        for i in node.input:
+                            if i in self.known_vi_:
+                                logger.debug(self.known_vi_[i])
+                            else:  # e.g. the empty name of an omitted optional input
+                                logger.debug(f"not in known_vi_ for {i}")
+                        logger.debug("node outputs:")
+                        for o in node.output:
+                            logger.debug(self.known_vi_[o])
+                        if self.auto_merge_ and not out_type_undefined:
+                            logger.debug("Merging: " + str(self.suggested_merge_))
+                    return False
+
+        self.run_ = False
+        return True
+
+    def _update_output_from_vi(self):  # refresh each graph output that has an entry in known_vi_
+        known_outputs = (o for o in self.out_mp_.graph.output if o.name in self.known_vi_)
+        for graph_output in known_outputs:
+            graph_output.CopyFrom(self.known_vi_[graph_output.name])
+
+    @staticmethod
+    def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0):
+        opset = get_opset(in_mp)
+        if not opset or opset < 7:  # missing or too-old opset: warn and return None
+            logger.warning("Only support models of onnx opset 7 and above.")
+            return None
+        inference = SymbolicShapeInference(int_max, auto_merge, guess_output_rank, verbose)
+        inference._preprocess(in_mp)
+        completed = False
+        while inference.run_:  # repeat passes for as long as run_ is set
+            completed = inference._infer_impl()
+        inference._update_output_from_vi()
+        if not completed:
+            raise Exception("Incomplete symbolic shape inference")
+        return inference.out_mp_
+
+
+def parse_arguments():
+    """Parse this script's command-line options and return the argparse namespace."""
+    switch = {"action": "store_true", "default": False}  # shared by the on/off flags
+    options = {
+        "--input": {
+            "required": True,
+            "help": "The input model file",
+        },
+        "--output": {
+            "help": "The output model file",
+        },
+        "--auto_merge": {
+            "help": "Automatically merge symbolic dims when confliction happens",
+            **switch,
+        },
+        "--int_max": {
+            "help": "maximum value for integer to be treated as boundless for ops like slice",
+            "type": int,
+            "default": 2**31 - 1,
+        },
+        "--guess_output_rank": {
+            "help": "guess output rank to be the same as input 0 for unknown ops",
+            **switch,
+        },
+        "--verbose": {
+            "help": "Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
+            "type": int,
+            "default": 0,
+        },
+        "--save_as_external_data": {
+            "help": "Saving an ONNX model to external data",
+            **switch,
+        },
+        "--all_tensors_to_one_file": {
+            "help": "Saving all the external data to one file",
+            **switch,
+        },
+        "--external_data_location": {
+            "help": "The file location to save the external file",
+            "default": "./",
+        },
+        "--external_data_size_threshold": {
+            "help": "The size threshold for external data",
+            "type": int,
+            "default": 1024,
+        },
+    }
+
+    parser = argparse.ArgumentParser()
+    for flag, spec in options.items():
+        parser.add_argument(flag, **spec)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: infer shapes for --input and, when --output is given, save the result.
+    args = parse_arguments()
+    logger.info("input model: " + args.input)
+    if args.output:
+        logger.info("output model " + args.output)
+    logger.info("Doing symbolic shape inference...")
+    out_mp = SymbolicShapeInference.infer_shapes(
+        onnx.load(args.input),
+        int_max=args.int_max,
+        auto_merge=args.auto_merge,
+        guess_output_rank=args.guess_output_rank,
+        verbose=args.verbose,
+    )
+    if args.output and out_mp:
+        if not args.save_as_external_data:
+            onnx.save(out_mp, args.output)
+        else:
+            external_data_options = {
+                "save_as_external_data": True,
+                "all_tensors_to_one_file": args.all_tensors_to_one_file,
+                "location": args.external_data_location,
+                "size_threshold": args.external_data_size_threshold,
+                "convert_attribute": False,
+            }
+            onnx.save_model(out_mp, args.output, **external_data_options)
+        logger.info("Done!")
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt
new file mode 100755
index 000000000..422887817
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt
@@ -0,0 +1,3 @@
+sympy
+packaging
+onnxsim
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
new file mode 100755
index 000000000..5c4a8abca
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
@@ -0,0 +1,4 @@
+transformers==4.35.2
+datasets==2.14.7
+onnx==1.15.0
+pandas==2.1.3
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
new file mode 100755
index 000000000..b132626e0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -0,0 +1,308 @@
+# Copyright 2023 Graphcore Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import torch
+import logging
+import numpy as np
+from tqdm import tqdm
+import threading
+
+from general_perf.backends import runtime_backend
+from general_perf.backends.ILUVATAR.common import init_by_tensorrt, setup_io_bindings
+from general_perf.backends.ILUVATAR.common import Task, TaskThread, _cudaGetErrorEnum, checkCudaErrors
+from tensorrt import Dims
+from cuda import cuda, cudart
+
+from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
+load_ixrt_plugin()
+
+log = logging.getLogger("RuntimeBackendILUVATAR")
+
+pt_dtype_map = {
+    "FLOAT32": torch.float32,
+    "FLOAT16": torch.float16,
+    "INT8": torch.int8,
+    "LONG": torch.long,
+    "INT64": torch.int64,
+    "BOOL": torch.bool
+}
+
+INPUT_TYPE = {  # config dtype name -> NumPy dtype used when generating fake samples
+    "UINT8": np.uint8,
+    "FLOAT32": np.float32,
+    "LONG": np.int64,  # np.long alias was removed in NumPy 1.24; int64 matches torch.long
+    "INT32": np.int32,
+    "INT64": np.int64,
+    "BOOL": np.bool_  # np.bool alias was removed in NumPy 1.24; np.bool_ is the scalar type
+}
+
+class RuntimeBackendILUVATAR(runtime_backend.RuntimeBackend):
+    def __init__(self):
+        super(RuntimeBackendILUVATAR, self).__init__()
+        self.hardware_type = "ILUVATAR"
+        self.need_reload = False
+        self.model_runtimes = []
+        self.configs = None
+        self.engine = None
+        self.context = None
+        self.batch_size = -1
+        self.workload = None
+
+    # Dual-core inference of Tian SoC BI-150 graphics card
+    def benchmark(self, dataloader):
+        performance_reports = []
+        merged_dict = {}
+        
+        workers = []
+        lock = threading.Lock()
+        for i in range(2):
+            device_id = i
+            task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
+
+            work = TaskThread(task.run, [])
+            workers.append(work)
+            work.start()
+            work.join()
+            
+        del self.engine
+        del self.context
+
+        if len(performance_reports[0]) == len(performance_reports[1]):
+            if performance_reports[0].keys() == performance_reports[1].keys():
+
+                qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS']
+                avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
+                p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
+
+                merged_dict['BS'] = performance_reports[0]['BS']
+                merged_dict['QPS'] = qps
+                merged_dict['AVG Latency'] = avg_latency
+                merged_dict["P99 Latency"] = p99_latency
+
+        return merged_dict  
+
+    def predict(self, feeds):
+        # The deberta model is currently unable to undergo accuracy testing temporarily
+        input_tensors = []
+        i = 0
+
+        model_name = self.configs["model"].split("-")[0]
+        if model_name == 'deberta':
+            keys = list(feeds.keys())
+            input_ids = torch.tensor(feeds[keys[0]], dtype=pt_dtype_map[self.input_type[0]])
+            attention_mask = torch.tensor(feeds[keys[1]], dtype=pt_dtype_map[self.input_type[1]])
+            input_tensors = [input_ids, attention_mask]
+
+        else:
+            for key, _ in feeds.items():
+                tmp_tensor = torch.tensor(feeds[key],
+                                    dtype=pt_dtype_map[self.input_type[i]])
+                input_tensors.append(tmp_tensor)
+                i += 1
+
+        # ixrt inference
+        engine = self.engine
+        assert engine
+        context = self.context
+        assert context
+
+        # set dynamic shape
+        input_tensor_map = self.configs["segments"][0]["input_tensor_map"]
+        input_shape = input_tensor_map.values()
+
+        i = 0
+        for input_name, _ in input_tensor_map.items():
+            if model_name == 'widedeep':
+                input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32))
+                input_names = [
+                    "new_categorical_placeholder:0",
+                    "new_numeric_placeholder:0",
+                    "import/head/predictions/zeros_like:0"
+                ]
+                for input_name in input_names:
+                    if input_name == 'new_categorical_placeholder:0':
+                        input_shape = input_tensors[0].shape
+                    if input_name == 'new_numeric_placeholder:0':
+                        input_shape = input_tensors[1].shape
+                    if input_name == 'import/head/predictions/zeros_like:0':
+                        input_shape = input_tensors[2].shape
+                
+                    input_idx = engine.get_binding_index(input_name)
+                    context.set_binding_shape(input_idx, Dims(input_shape))
+            else:
+                input_shape = input_tensors[i].shape
+                input_idx = engine.get_binding_index(input_name)
+                context.set_binding_shape(input_idx, Dims(input_shape))
+                i += 1
+        
+        # Setup I/O bindings
+        inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+        # Prepare the output data
+        outputs_list = []
+        for i in range(len(outputs)):
+            output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
+            outputs_list.append(output)
+
+        data_batch_list = []
+        for i in range(len(input_tensors)):
+            data_batch = np.ascontiguousarray(input_tensors[i])
+            data_batch_list.append(data_batch)
+
+        # H2D: host to device
+        for i in range(len(inputs)):
+            (err, ) = cudart.cudaMemcpy(
+                        inputs[i]["allocation"],
+                        data_batch_list[i],
+                        inputs[i]["nbytes"],
+                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
+            )
+
+        context.execute_v2(allocations)
+        
+        # D2H: device to host
+        for i in range(len(outputs)):
+            (err, )= cudart.cudaMemcpy(outputs_list[i], 
+                        outputs[i]["allocation"], 
+                        outputs[i]["nbytes"], 
+                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+            )
+           
+        # Free Gpu Memory
+        # cuda-python
+        for i in range(len(inputs)):
+            err, = cudart.cudaFree(inputs[i]["allocation"])
+            assert err == cudart.cudaError_t.cudaSuccess
+
+        for i in range(len(outputs)):
+            err, = cudart.cudaFree(outputs[i]["allocation"])
+            assert err == cudart.cudaError_t.cudaSuccess
+        
+        result = {}
+
+        output_tensor_map = self.configs["segments"][0]["output_tensor_map"]
+        output_name = output_tensor_map.split(",")
+
+        for i in range(len(output_name)):
+            result[output_name[i]] = outputs_list[i]
+        
+        if model_name == 'videobert':
+            return outputs_list
+        else:
+            return result
+    
+    def benchmark_interact(self, dataloader):
+        batch_size = self.get_loaded_batch_size()
+        iterations = self.workload['iterations']
+        times_range = []
+        report = {}
+        report["BS"] = batch_size
+
+        test_data = self._get_fake_samples(batch_size=batch_size,
+                        shape=self.configs['segments'][0]['input_tensor_map'],
+                        input_type=self.configs['input_type'])
+
+        for _ in range(30):
+            self.predict(test_data)
+
+        for _ in range(iterations):
+            start_time = time.time()
+            self.predict(test_data)
+            end_time = time.time()
+            times_range.append(end_time - start_time)
+
+        times_range.sort()
+        tail_latency = round(
+            times_range[int(len(times_range) * 0.99)] * 1000, 2)
+        avg_latency = round(sum(times_range) / iterations * 1000, 2)
+        qps = int(1000.0 * self.batch_size / avg_latency)
+
+        log.info(
+            'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
+            format(self.batch_size, qps, avg_latency, tail_latency))
+
+        report['QPS'] = qps
+        report['AVG Latency'] = avg_latency
+        report['P99 Latency'] = tail_latency
+
+        return report
+
+    def get_loaded_batch_size(self):
+        # return self.workload['batch_sizes'][0]
+        return self.batch_size
+
+    def load(self, batch_size) -> None:
+        # load engine
+        model = self.configs['model']
+        model_name = self.configs['model'].split("-")[0]
+        model_path = self.configs['model_path']
+
+        if model_name == 'videobert' or model_name == 'conformer':
+            engine_path = model_path.split(".")[0] + "_end.engine"
+
+        elif model_name == 'yolov5':
+            engine_path = model_path.split(".")[0] + "_sim.engine"
+
+        elif model_name == 'widedeep':
+            engine_path = model_path + "/" + model + "_end.engine"
+        
+        elif model_name == 'roformer':
+            engine_path = model_path + "/" + model + ".engine"
+        
+        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+            engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
+
+        else:
+            engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
+        
+        # **************to do*************
+        if model_name == 'widedeep':      
+            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(batch_size) + ".engine"
+
+        if model_name == 'conformer':
+            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
+        
+        # if model_name == 'roformer':
+        #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
+        
+        if model_name == 'deberta':
+            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
+
+        engine, context = init_by_tensorrt(engine_path)
+
+        self.input_type = self.configs['input_type']
+        
+        self.batch_size = batch_size
+        self.model_runtimes = []
+        self.engine = engine
+        self.context = context
+
+    def _get_fake_samples(self, batch_size, shape, input_type):
+        data = {}
+        if input_type:
+            i = 0
+            for key, val in shape.items():
+                if key != "text":
+                    val = [val[0] * batch_size] + val[1:]
+                    data[key] = np.random.random(size=val).astype(
+                        INPUT_TYPE[input_type[i]])
+                else:
+                    data[key] = np.random.random(size=val).astype(
+                        INPUT_TYPE[input_type[i]])
+                i += 1
+            return data
+        else:
+            raise ValueError("Please provide input type")
\ No newline at end of file

From 54a81c8d7a66ca653ccb993866605a65f36e96d3 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Sat, 20 Apr 2024 17:06:51 +0800
Subject: [PATCH 02/28] update reports

---
 .../ILUVATAR/compile_backend_iluvatar.py      |  4 +--
 .../ILUVATAR/runtime_backend_iluvatar.py      | 33 +++++++++++++++++--
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 0c0f62994..8609bab7a 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -77,7 +77,7 @@ def compile(self, configs, dataloader=None):
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
 
         # model preprocessing
-        # self.get_onnx(configs)
+        self.get_onnx(configs)
 
         # build engine
         if model_name == 'widedeep':
@@ -118,7 +118,7 @@ def compile(self, configs, dataloader=None):
             "framework": 
                 configs['model_info']['framework'],
             "compile_precision": 
-                configs['model_info']['model_precision'],
+                configs['model_info']['model_precision'].replace('FP32', 'FP16'),
             "input_type": 
                 configs['model_info']['input_type'].split(","),
             "max_batch_size": 
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index b132626e0..09dd3d3e7 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -60,6 +60,8 @@ def __init__(self):
         self.context = None
         self.batch_size = -1
         self.workload = None
+        self.predict_fps = None
+        self.predict_time = None
 
     # Dual-core inference of Tian SoC BI-150 graphics card
     def benchmark(self, dataloader):
@@ -87,11 +89,19 @@ def benchmark(self, dataloader):
                 avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
                 p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
 
+                predict_qps = performance_reports[0]['predict QPS'] + performance_reports[1]['predict QPS']
+                predict_avg_latency = round(((performance_reports[0]['predict AVG Latency'] + performance_reports[1]['predict AVG Latency']) / 2.0), 2)
+                predict_p99_latency = round(((performance_reports[0]['predict P99 Latency'] + performance_reports[1]['predict P99 Latency']) / 2.0), 2)
+
                 merged_dict['BS'] = performance_reports[0]['BS']
                 merged_dict['QPS'] = qps
                 merged_dict['AVG Latency'] = avg_latency
                 merged_dict["P99 Latency"] = p99_latency
 
+                merged_dict['predict QPS'] = predict_qps
+                merged_dict['predict AVG Latency'] = predict_avg_latency
+                merged_dict["predict P99 Latency"] = predict_p99_latency
+
         return merged_dict  
 
     def predict(self, feeds):
@@ -170,8 +180,12 @@ def predict(self, feeds):
                         inputs[i]["nbytes"],
                         cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
             )
-
+        
+        starttime = time.time()
         context.execute_v2(allocations)
+        endtime = time.time()
+
+        self.predict_time = endtime - starttime
         
         # D2H: device to host
         for i in range(len(outputs)):
@@ -208,6 +222,7 @@ def benchmark_interact(self, dataloader):
         batch_size = self.get_loaded_batch_size()
         iterations = self.workload['iterations']
         times_range = []
+        predict_range = []
         report = {}
         report["BS"] = batch_size
 
@@ -223,6 +238,7 @@ def benchmark_interact(self, dataloader):
             self.predict(test_data)
             end_time = time.time()
             times_range.append(end_time - start_time)
+            predict_range.append(self.predict_time)
 
         times_range.sort()
         tail_latency = round(
@@ -230,6 +246,12 @@ def benchmark_interact(self, dataloader):
         avg_latency = round(sum(times_range) / iterations * 1000, 2)
         qps = int(1000.0 * self.batch_size / avg_latency)
 
+        predict_range.sort()
+        predict_tail_latency = round(
+            predict_range[int(len(predict_range) * 0.99)] * 1000, 2)
+        predict_avg_latency = round(sum(predict_range) / iterations * 1000, 2)
+        fps = int(1000.0 * batch_size / predict_avg_latency)
+
         log.info(
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
             format(self.batch_size, qps, avg_latency, tail_latency))
@@ -238,6 +260,10 @@ def benchmark_interact(self, dataloader):
         report['AVG Latency'] = avg_latency
         report['P99 Latency'] = tail_latency
 
+        report['predict QPS'] = fps
+        report['predict AVG Latency'] = predict_avg_latency
+        report['predict P99 Latency'] = predict_tail_latency
+
         return report
 
     def get_loaded_batch_size(self):
@@ -249,6 +275,7 @@ def load(self, batch_size) -> None:
         model = self.configs['model']
         model_name = self.configs['model'].split("-")[0]
         model_path = self.configs['model_path']
+        self.model_runtimes = []
 
         if model_name == 'videobert' or model_name == 'conformer':
             engine_path = model_path.split(".")[0] + "_end.engine"
@@ -283,6 +310,8 @@ def load(self, batch_size) -> None:
 
         engine, context = init_by_tensorrt(engine_path)
 
+        self.model_runtimes.append(engine)
+
         self.input_type = self.configs['input_type']
         
         self.batch_size = batch_size
@@ -305,4 +334,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
\ No newline at end of file
+            raise ValueError("Please provide input type")

From 74b954ace9848bafa6365fb5e0d5ebe7e5d68f27 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Mon, 22 Apr 2024 15:15:40 +0800
Subject: [PATCH 03/28] update optimiser functions

---
 .../ILUVATAR/compile_backend_iluvatar.py      | 32 +++++++++++++------
 .../ILUVATAR/runtime_backend_iluvatar.py      |  9 +++---
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 8609bab7a..86c1ed9a3 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -50,15 +50,9 @@ def compile(self, configs, dataloader=None):
         MaxBatchSize = configs['model_info']['max_batch_size']
 
         # call the ONNX model and the compiled engine file
-        if model_name == 'videobert' or model_name == 'conformer':
+        if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
             onnx_model_path = model_path.split(".")[0] + "_end.onnx"
             engine_path = model_path.split(".")[0] + "_end.engine"
-        
-        elif model_name == 'yolov5':
-            onnx_model_path = model_path.split(".")[0] + "_sim.onnx"
-            cmd = f'onnxsim {model_path} {onnx_model_path}'
-            subprocess.call(cmd, shell=True)
-            engine_path = model_path.split(".")[0] + "_sim.engine"
 
         elif model_name == 'widedeep':
             onnx_model_path = model_path + "/" + model + "_end.onnx"
@@ -192,16 +186,34 @@ def get_onnx(self, configs):
             savedmodel_to_onnx(model_path=model_path, output_path=onnx_model_path)
             print("***Convert pb model to onnx model success!***")
 
-        # Convert ONNX model to plugin operator model
+        # Convert ONNX model to plugin operator model: Support fusion of dynamic and static graphs
         """
-            ***********待处理问题记录************
+            *********************待处理问题记录: 后续会更新进展************************
             conformer 模型不能利用optimizer.py脚本转换, 因为attention比较特殊, 利用处理好的onnx模型进行测试;
             roformer  模型目前没有实现通过加载固定shape的onnx, 生成不同的batch的engine实现动态shape推理;
             widedeep  模型目前对原始的onnx暂时不支持直接动态shape推理, 对模型做了一系列处理, 并且不需要进行optimizer.py脚本处理, 直接加载处理好的onnx模型;
         """        
         if model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or \
-            model_name == 'videobert' or model_name == 'swin':
+            model_name == 'videobert':
             
             cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path}'
             subprocess.call(cmd, shell=True)
             print("***Convert onnx model to plugin operator model success!***")
+
+        elif model_name == 'swin':
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type swint'
+            subprocess.call(cmd, shell=True)
+            print("***Convert onnx model to plugin operator model success!***")
+
+        elif model_name == 'yolov5':
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type yolo'
+            subprocess.call(cmd, shell=True)
+            print("***Convert onnx model to plugin operator model success!***")
+
+        elif model_name == 'roformer':
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type roformer'
+            subprocess.call(cmd, shell=True)
+            print("***Convert onnx model to plugin operator model success!***")
+        
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 09dd3d3e7..6d06f0257 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -211,6 +211,10 @@ def predict(self, feeds):
         output_name = output_tensor_map.split(",")
 
         for i in range(len(output_name)):
+            if model_name == 'yolov5':
+                result[output_name[0]] = outputs_list[0]
+                break
+
             result[output_name[i]] = outputs_list[i]
         
         if model_name == 'videobert':
@@ -277,12 +281,9 @@ def load(self, batch_size) -> None:
         model_path = self.configs['model_path']
         self.model_runtimes = []
 
-        if model_name == 'videobert' or model_name == 'conformer':
+        if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
             engine_path = model_path.split(".")[0] + "_end.engine"
 
-        elif model_name == 'yolov5':
-            engine_path = model_path.split(".")[0] + "_sim.engine"
-
         elif model_name == 'widedeep':
             engine_path = model_path + "/" + model + "_end.engine"
         

From 709acaa5e7f4a62960c0cea2395063753ba97747 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 25 Apr 2024 11:16:55 +0800
Subject: [PATCH 04/28] add single-core of gpt2

---
 .../general_perf/backends/ILUVATAR/common.py  |  25 +-
 .../ILUVATAR/compile_backend_iluvatar.py      |  25 +-
 .../ILUVATAR/runtime_backend_iluvatar.py      | 390 ++++++++++--------
 3 files changed, 259 insertions(+), 181 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
index ca7dfa573..fa5603427 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -1,8 +1,7 @@
+import os
 import random
 import torch
-import time
 import ctypes
-import argparse
 import numpy as np
 from os.path import join, dirname, exists
 
@@ -11,7 +10,9 @@
 import pycuda.driver as cuda
 from cuda import cuda,cudart
 import threading
-import time
+
+import tvm
+from general_perf.backends.ILUVATAR.utils.import_model import import_model_to_igie
 
 
 def setup_seed(seed):
@@ -74,11 +75,11 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
     
     elif model_name == 'widedeep':
         profile.set_shape(
-            "new_numeric_placeholder:0", Dims([MaxBatchSize, 13]), Dims([MaxBatchSize, 13]), Dims([MaxBatchSize, 13]))
+            "new_numeric_placeholder:0", Dims([1, 13]), Dims([16, 13]), Dims([MaxBatchSize, 13]))
         profile.set_shape(
-            "new_categorical_placeholder:0", Dims([MaxBatchSize * 26, 2]), Dims([MaxBatchSize * 26, 2]), Dims([MaxBatchSize * 26, 2]))
+            "new_categorical_placeholder:0", Dims([1 * 26, 2]), Dims([16 * 26, 2]), Dims([MaxBatchSize * 26, 2]))
         profile.set_shape(
-            "import/head/predictions/zeros_like:0", Dims([MaxBatchSize, 1]), Dims([MaxBatchSize, 1]), Dims([MaxBatchSize, 1]))
+            "import/head/predictions/zeros_like:0", Dims([1, 1]), Dims([16, 1]), Dims([MaxBatchSize, 1]))
         
     elif model_name == 'conformer':
         profile.set_shape(
@@ -164,6 +165,16 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
     print("***Build dynamic shape engine success!***")
 
 
+def build_igie_engine(model_name, model_path, input_dict, model_framework, precision, engine_path):
+    if not os.path.exists(engine_path):
+        target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer")
+        mod, params = import_model_to_igie(model_path, input_dict, model_framework)
+        lib = tvm.relay.build(mod, target=target, params=params, precision=precision, verbose=False)
+        lib.export_library(engine_path)
+    else:
+        pass
+
+
 def init_by_tensorrt(engine_path):
     datatype = tensorrt.DataType.FLOAT
     host_mem = tensorrt.IHostMemory
@@ -235,6 +246,7 @@ def __init__(self, bs, dataset, device_id, load_fun, benchmark_fun, performance_
         checkCudaErrors(cudart.cudaSetDevice(device_id))
         load_fun(bs)
         self.lock = lock
+        
 
     def run(self):
         checkCudaErrors(cudart.cudaSetDevice(self.device_id))
@@ -271,3 +283,4 @@ def checkCudaErrors(result):
         return result[1]
     else:
         return result[1:]
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 86c1ed9a3..2f81557d8 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -21,7 +21,7 @@
 from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
 load_ixrt_plugin()
 
-from general_perf.backends.ILUVATAR.common import build_engine
+from general_perf.backends.ILUVATAR.common import build_engine, build_igie_engine
 from general_perf.backends.ILUVATAR.optimizer.passes import *
 from general_perf.tools.torch_to_onnx import torch_to_onnx
 from general_perf.tools.saved_to_onnx import savedmodel_to_onnx
@@ -75,10 +75,9 @@ def compile(self, configs, dataloader=None):
 
         # build engine
         if model_name == 'widedeep':
-            for bs in configs['workload']['batch_sizes']:
-                onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim.onnx"
-                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(bs) + ".engine"    
-                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=bs)
+            onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape.onnx"
+            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"    
+            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
         
         # elif model_name == 'roformer':
         #     # onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-bs32.onnx"
@@ -99,6 +98,18 @@ def compile(self, configs, dataloader=None):
             engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"    
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
+        elif model_name == 'gpt2':
+            for bs in configs['workload']['batch_sizes']:
+                onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx"
+                engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(bs) + ".so" 
+
+                for key, val in configs['model_info']['input_shape'].items():
+                    input_dict = {}
+                    val = val = [val[0] * bs] + val[1:] 
+                    input_dict[key] = val
+                    
+                build_igie_engine(model_name=model_name, model_path=onnx_model_path, input_dict=input_dict, model_framework='onnx', precision='fp16', engine_path=engine_path)
+
         else:
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
@@ -214,6 +225,6 @@ def get_onnx(self, configs):
             cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type roformer'
             subprocess.call(cmd, shell=True)
             print("***Convert onnx model to plugin operator model success!***")
-        
-
 
+        else:
+            pass
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 6d06f0257..a79ac6ef3 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -20,8 +20,10 @@
 from tqdm import tqdm
 import threading
 
+import tvm
 from general_perf.backends import runtime_backend
 from general_perf.backends.ILUVATAR.common import init_by_tensorrt, setup_io_bindings
+from general_perf.backends.ILUVATAR.utils import get_target
 from general_perf.backends.ILUVATAR.common import Task, TaskThread, _cudaGetErrorEnum, checkCudaErrors
 from tensorrt import Dims
 from cuda import cuda, cudart
@@ -67,42 +69,65 @@ def __init__(self):
     def benchmark(self, dataloader):
         performance_reports = []
         merged_dict = {}
+        model_name = self.configs["model"].split("-")[0]
         
         workers = []
         lock = threading.Lock()
-        for i in range(2):
-            device_id = i
-            task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
-
-            work = TaskThread(task.run, [])
-            workers.append(work)
-            work.start()
-            work.join()
+        if model_name != 'gpt2':
+            for i in range(2):
+                device_id = i
+                task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
+
+                work = TaskThread(task.run, [])
+                workers.append(work)
+                work.start()
+                work.join()
+                
+            del self.engine
+            del self.context
             
-        del self.engine
-        del self.context
-
-        if len(performance_reports[0]) == len(performance_reports[1]):
-            if performance_reports[0].keys() == performance_reports[1].keys():
-
-                qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS']
-                avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
-                p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
-
-                predict_qps = performance_reports[0]['predict QPS'] + performance_reports[1]['predict QPS']
-                predict_avg_latency = round(((performance_reports[0]['predict AVG Latency'] + performance_reports[1]['predict AVG Latency']) / 2.0), 2)
-                predict_p99_latency = round(((performance_reports[0]['predict P99 Latency'] + performance_reports[1]['predict P99 Latency']) / 2.0), 2)
-
-                merged_dict['BS'] = performance_reports[0]['BS']
-                merged_dict['QPS'] = qps
-                merged_dict['AVG Latency'] = avg_latency
-                merged_dict["P99 Latency"] = p99_latency
+        else:
+            # ****to do******
+            for i in range(1):
+                device_id = i
+                task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
+
+                work = TaskThread(task.run, [])
+                workers.append(work)
+                work.start()
+                work.join()
+
+        if model_name != 'gpt2':
+            if len(performance_reports[0]) == len(performance_reports[1]):
+                if performance_reports[0].keys() == performance_reports[1].keys():
+
+                    qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS']
+                    avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
+                    p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
+
+                    predict_qps = performance_reports[0]['predict QPS'] + performance_reports[1]['predict QPS']
+                    predict_avg_latency = round(((performance_reports[0]['predict AVG Latency'] + performance_reports[1]['predict AVG Latency']) / 2.0), 2)
+                    predict_p99_latency = round(((performance_reports[0]['predict P99 Latency'] + performance_reports[1]['predict P99 Latency']) / 2.0), 2)
+
+                    merged_dict['BS'] = performance_reports[0]['BS']
+                    merged_dict['QPS'] = qps
+                    merged_dict['AVG Latency'] = avg_latency
+                    merged_dict["P99 Latency"] = p99_latency
+
+                    merged_dict['predict QPS'] = predict_qps
+                    merged_dict['predict AVG Latency'] = predict_avg_latency
+                    merged_dict["predict P99 Latency"] = predict_p99_latency
+                    
+            return merged_dict
+        
+        else:
+            merged_dict['BS'] = performance_reports[0]['BS']
+            merged_dict['QPS'] = performance_reports[0]['QPS']
+            merged_dict['AVG Latency'] = performance_reports[0]['AVG Latency']
+            merged_dict["P99 Latency"] = performance_reports[0]["P99 Latency"]
 
-                merged_dict['predict QPS'] = predict_qps
-                merged_dict['predict AVG Latency'] = predict_avg_latency
-                merged_dict["predict P99 Latency"] = predict_p99_latency
+            return merged_dict
 
-        return merged_dict  
 
     def predict(self, feeds):
         # The deberta model is currently unable to undergo accuracy testing temporarily
@@ -110,121 +135,133 @@ def predict(self, feeds):
         i = 0
 
         model_name = self.configs["model"].split("-")[0]
-        if model_name == 'deberta':
-            keys = list(feeds.keys())
-            input_ids = torch.tensor(feeds[keys[0]], dtype=pt_dtype_map[self.input_type[0]])
-            attention_mask = torch.tensor(feeds[keys[1]], dtype=pt_dtype_map[self.input_type[1]])
-            input_tensors = [input_ids, attention_mask]
-
-        else:
-            for key, _ in feeds.items():
-                tmp_tensor = torch.tensor(feeds[key],
-                                    dtype=pt_dtype_map[self.input_type[i]])
-                input_tensors.append(tmp_tensor)
-                i += 1
-
-        # ixrt inference
-        engine = self.engine
-        assert engine
-        context = self.context
-        assert context
+        if model_name != 'gpt2':
+            if model_name == 'deberta':
+                keys = list(feeds.keys())
+                input_ids = torch.tensor(feeds[keys[0]], dtype=pt_dtype_map[self.input_type[0]])
+                attention_mask = torch.tensor(feeds[keys[1]], dtype=pt_dtype_map[self.input_type[1]])
+                input_tensors = [input_ids, attention_mask]
 
-        # set dynamic shape
-        input_tensor_map = self.configs["segments"][0]["input_tensor_map"]
-        input_shape = input_tensor_map.values()
+            else:
+                for key, _ in feeds.items():
+                    tmp_tensor = torch.tensor(feeds[key],
+                                        dtype=pt_dtype_map[self.input_type[i]])
+                    input_tensors.append(tmp_tensor)
+                    i += 1
+
+            # ixrt inference
+            engine = self.engine
+            assert engine
+            context = self.context
+            assert context
+
+            # set dynamic shape
+            input_tensor_map = self.configs["segments"][0]["input_tensor_map"]
+            input_shape = input_tensor_map.values()
 
-        i = 0
-        for input_name, _ in input_tensor_map.items():
-            if model_name == 'widedeep':
-                input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32))
-                input_names = [
-                    "new_categorical_placeholder:0",
-                    "new_numeric_placeholder:0",
-                    "import/head/predictions/zeros_like:0"
-                ]
-                for input_name in input_names:
-                    if input_name == 'new_categorical_placeholder:0':
-                        input_shape = input_tensors[0].shape
-                    if input_name == 'new_numeric_placeholder:0':
-                        input_shape = input_tensors[1].shape
-                    if input_name == 'import/head/predictions/zeros_like:0':
-                        input_shape = input_tensors[2].shape
-                
+            i = 0
+            for input_name, _ in input_tensor_map.items():
+                if model_name == 'widedeep':
+                    input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32))
+                    input_names = [
+                        "new_categorical_placeholder:0",
+                        "new_numeric_placeholder:0",
+                        "import/head/predictions/zeros_like:0"
+                    ]
+                    for input_name in input_names:
+                        if input_name == 'new_categorical_placeholder:0':
+                            input_shape = input_tensors[0].shape
+                        if input_name == 'new_numeric_placeholder:0':
+                            input_shape = input_tensors[1].shape
+                        if input_name == 'import/head/predictions/zeros_like:0':
+                            input_shape = input_tensors[2].shape
+                    
+                        input_idx = engine.get_binding_index(input_name)
+                        context.set_binding_shape(input_idx, Dims(input_shape))
+                else:
+                    input_shape = input_tensors[i].shape
                     input_idx = engine.get_binding_index(input_name)
                     context.set_binding_shape(input_idx, Dims(input_shape))
-            else:
-                input_shape = input_tensors[i].shape
-                input_idx = engine.get_binding_index(input_name)
-                context.set_binding_shape(input_idx, Dims(input_shape))
-                i += 1
-        
-        # Setup I/O bindings
-        inputs, outputs, allocations = setup_io_bindings(engine, context)
-
-        # Prepare the output data
-        outputs_list = []
-        for i in range(len(outputs)):
-            output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
-            outputs_list.append(output)
-
-        data_batch_list = []
-        for i in range(len(input_tensors)):
-            data_batch = np.ascontiguousarray(input_tensors[i])
-            data_batch_list.append(data_batch)
-
-        # H2D: host to device
-        for i in range(len(inputs)):
-            (err, ) = cudart.cudaMemcpy(
-                        inputs[i]["allocation"],
-                        data_batch_list[i],
-                        inputs[i]["nbytes"],
-                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-            )
-        
-        starttime = time.time()
-        context.execute_v2(allocations)
-        endtime = time.time()
+                    i += 1
+            
+            # Setup I/O bindings
+            inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+            # Prepare the output data
+            outputs_list = []
+            for i in range(len(outputs)):
+                output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
+                outputs_list.append(output)
+
+            data_batch_list = []
+            for i in range(len(input_tensors)):
+                data_batch = np.ascontiguousarray(input_tensors[i])
+                data_batch_list.append(data_batch)
+
+            # H2D: host to device
+            for i in range(len(inputs)):
+                (err, ) = cudart.cudaMemcpy(
+                            inputs[i]["allocation"],
+                            data_batch_list[i],
+                            inputs[i]["nbytes"],
+                            cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
+                )
+            
+            starttime = time.time()
+            context.execute_v2(allocations)
+            endtime = time.time()
 
-        self.predict_time = endtime - starttime
-        
-        # D2H: device to host
-        for i in range(len(outputs)):
-            (err, )= cudart.cudaMemcpy(outputs_list[i], 
-                        outputs[i]["allocation"], 
-                        outputs[i]["nbytes"], 
-                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-            )
-           
-        # Free Gpu Memory
-        # cuda-python
-        for i in range(len(inputs)):
-            err, = cudart.cudaFree(inputs[i]["allocation"])
-            assert err == cudart.cudaError_t.cudaSuccess
-
-        for i in range(len(outputs)):
-            err, = cudart.cudaFree(outputs[i]["allocation"])
-            assert err == cudart.cudaError_t.cudaSuccess
-        
-        result = {}
+            self.predict_time = endtime - starttime
+            
+            # D2H: device to host
+            for i in range(len(outputs)):
+                (err, )= cudart.cudaMemcpy(outputs_list[i], 
+                            outputs[i]["allocation"], 
+                            outputs[i]["nbytes"], 
+                            cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+                )
+            
+            # Free Gpu Memory
+            # cuda-python
+            for i in range(len(inputs)):
+                err, = cudart.cudaFree(inputs[i]["allocation"])
+                assert err == cudart.cudaError_t.cudaSuccess
+
+            for i in range(len(outputs)):
+                err, = cudart.cudaFree(outputs[i]["allocation"])
+                assert err == cudart.cudaError_t.cudaSuccess
+            
+            result = {}
 
-        output_tensor_map = self.configs["segments"][0]["output_tensor_map"]
-        output_name = output_tensor_map.split(",")
+            output_tensor_map = self.configs["segments"][0]["output_tensor_map"]
+            output_name = output_tensor_map.split(",")
 
-        for i in range(len(output_name)):
-            if model_name == 'yolov5':
-                result[output_name[0]] = outputs_list[0]
-                break
+            for i in range(len(output_name)):
+                if model_name == 'yolov5':
+                    result[output_name[0]] = outputs_list[0]
+                    break
 
-            result[output_name[i]] = outputs_list[i]
-        
+                result[output_name[i]] = outputs_list[i]
+
+        else:
+            result = None
+            self.predict_igie(feeds)
+            
         if model_name == 'videobert':
             return outputs_list
         else:
             return result
     
+    def predict_igie(self, dataloader):
+        self.module_igie.set_input("input_ids", tvm.nd.array(dataloader["input_ids"].astype('int64'), self.device))
+        self.module_igie.run()
+        output = None   # self.module_igie.get_output(0).numpy()
+        return output
+    
     def benchmark_interact(self, dataloader):
         batch_size = self.get_loaded_batch_size()
         iterations = self.workload['iterations']
+        model_name = self.configs["model"].split("-")[0]
         times_range = []
         predict_range = []
         report = {}
@@ -250,11 +287,14 @@ def benchmark_interact(self, dataloader):
         avg_latency = round(sum(times_range) / iterations * 1000, 2)
         qps = int(1000.0 * self.batch_size / avg_latency)
 
-        predict_range.sort()
-        predict_tail_latency = round(
-            predict_range[int(len(predict_range) * 0.99)] * 1000, 2)
-        predict_avg_latency = round(sum(predict_range) / iterations * 1000, 2)
-        fps = int(1000.0 * batch_size / predict_avg_latency)
+        if model_name != 'gpt2':
+            predict_range.sort()
+            predict_tail_latency = round(
+                predict_range[int(len(predict_range) * 0.99)] * 1000, 2)
+            predict_avg_latency = round(sum(predict_range) / iterations * 1000, 2)
+            fps = int(1000.0 * batch_size / predict_avg_latency)
+        else:
+            pass
 
         log.info(
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
@@ -264,9 +304,12 @@ def benchmark_interact(self, dataloader):
         report['AVG Latency'] = avg_latency
         report['P99 Latency'] = tail_latency
 
-        report['predict QPS'] = fps
-        report['predict AVG Latency'] = predict_avg_latency
-        report['predict P99 Latency'] = predict_tail_latency
+        if model_name != 'gpt2':
+            report['predict QPS'] = fps
+            report['predict AVG Latency'] = predict_avg_latency
+            report['predict P99 Latency'] = predict_tail_latency
+        else:
+            pass
 
         return report
 
@@ -281,44 +324,55 @@ def load(self, batch_size) -> None:
         model_path = self.configs['model_path']
         self.model_runtimes = []
 
-        if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
-            engine_path = model_path.split(".")[0] + "_end.engine"
+        if model_name != 'gpt2':
+            if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
+                engine_path = model_path.split(".")[0] + "_end.engine"
 
-        elif model_name == 'widedeep':
-            engine_path = model_path + "/" + model + "_end.engine"
-        
-        elif model_name == 'roformer':
-            engine_path = model_path + "/" + model + ".engine"
-        
-        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
-            engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
+            elif model_name == 'widedeep':
+                engine_path = model_path + "/" + model + "_end.engine"
+            
+            elif model_name == 'roformer':
+                engine_path = model_path + "/" + model + ".engine"
+            
+            elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+                engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
 
-        else:
-            engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
-        
-        # **************to do*************
-        if model_name == 'widedeep':      
-            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(batch_size) + ".engine"
+            else:
+                engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
+            
+            # **************to do*************
+            if model_name == 'widedeep':      
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"
 
-        if model_name == 'conformer':
-            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
-        
-        # if model_name == 'roformer':
-        #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
-        
-        if model_name == 'deberta':
-            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
+            if model_name == 'conformer':
+                engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
+            
+            # if model_name == 'roformer':
+            #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
+            
+            if model_name == 'deberta':
+                engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
 
-        engine, context = init_by_tensorrt(engine_path)
+            engine, context = init_by_tensorrt(engine_path)
 
-        self.model_runtimes.append(engine)
+            self.model_runtimes.append(engine)
 
-        self.input_type = self.configs['input_type']
+            self.input_type = self.configs['input_type']
+            
+            self.batch_size = batch_size
+            self.model_runtimes = []
+            self.engine = engine
+            self.context = context
         
-        self.batch_size = batch_size
-        self.model_runtimes = []
-        self.engine = engine
-        self.context = context
+        else:
+            _, device = get_target('iluvatar_with_all_libs')
+            engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(batch_size) + ".so" 
+            lib = tvm.runtime.load_module(engine_path)
+            module_igie = tvm.contrib.graph_executor.GraphModule(lib["default"](device))
+
+            self.module_igie = module_igie
+            self.device = device
+            self.batch_size = batch_size
 
     def _get_fake_samples(self, batch_size, shape, input_type):
         data = {}

From 91981c9b5a25e9f5774ff46773255fbfedd5ba93 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 25 Apr 2024 21:33:04 +0800
Subject: [PATCH 05/28] update and Optimization

---
 .../general_perf/backends/ILUVATAR/common.py  |   7 +-
 .../ILUVATAR/runtime_backend_iluvatar.py      | 167 +++--
 .../backends/ILUVATAR/utils/__init__.py       |  20 +
 .../backends/ILUVATAR/utils/argument.py       | 331 ++++++++++
 .../backends/ILUVATAR/utils/coco_metric.py    | 622 ++++++++++++++++++
 .../backends/ILUVATAR/utils/compile_engine.py |  19 +
 .../backends/ILUVATAR/utils/dataloader.py     | 595 +++++++++++++++++
 .../ILUVATAR/utils/fastCoCoeval/__init__.py   |   9 +
 .../utils/fastCoCoeval/cocoeval/cocoeval.cpp  | 502 ++++++++++++++
 .../utils/fastCoCoeval/cocoeval/cocoeval.h    |  98 +++
 .../utils/fastCoCoeval/fast_coco_eval_api.py  | 154 +++++
 .../ILUVATAR/utils/fastCoCoeval/jit_ops.py    | 179 +++++
 .../backends/ILUVATAR/utils/file.py           |  20 +
 .../ILUVATAR/utils/imagenet_metric.py         |  23 +
 .../backends/ILUVATAR/utils/import_model.py   | 113 ++++
 .../backends/ILUVATAR/utils/mod_rewriter.py   |  81 +++
 .../ILUVATAR/utils/onnx_rewrite_batch_size.py | 113 ++++
 .../backends/ILUVATAR/utils/onnx_util.py      | 130 ++++
 .../backends/ILUVATAR/utils/quantization.py   | 531 +++++++++++++++
 .../backends/ILUVATAR/utils/stauts_checker.py |  21 +
 .../backends/ILUVATAR/utils/target.py         |  24 +
 .../backends/ILUVATAR/utils/timer.py          |  81 +++
 22 files changed, 3747 insertions(+), 93 deletions(-)
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
index fa5603427..9d9a1a5d9 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -238,14 +238,17 @@ def setup_io_bindings(engine, context):
 
 # multi cores inference codes
 class Task:
-    def __init__(self, bs, dataset, device_id, load_fun, benchmark_fun, performance_reports, lock) -> None:
+    def __init__(self, bs, dataset, device_id, load_fun, benchmark_fun, performance_reports, lock, framework) -> None:
         self.dataset = dataset
         self.benchmark_fun = benchmark_fun
         self.device_id = device_id
         self.performance_reports = performance_reports
         checkCudaErrors(cudart.cudaSetDevice(device_id))
-        load_fun(bs)
+        if framework != 'gpt2':
+            load_fun(bs)
+
         self.lock = lock
+        self.module = None
         
 
     def run(self):
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index a79ac6ef3..6b07ccdd1 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -64,6 +64,7 @@ def __init__(self):
         self.workload = None
         self.predict_fps = None
         self.predict_time = None
+        self.task = None
 
     # Dual-core inference of Tian SoC BI-150 graphics card
     def benchmark(self, dataloader):
@@ -73,61 +74,41 @@ def benchmark(self, dataloader):
         
         workers = []
         lock = threading.Lock()
+        for i in range(2):
+            device_id = i
+            self.task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock, framework=model_name)
+
+            work = TaskThread(self.task.run, [])
+            workers.append(work)
+            work.start()
+            work.join()
+        
         if model_name != 'gpt2':
-            for i in range(2):
-                device_id = i
-                task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
-
-                work = TaskThread(task.run, [])
-                workers.append(work)
-                work.start()
-                work.join()
-                
             del self.engine
             del self.context
             
-        else:
-            # ****to do******
-            for i in range(1):
-                device_id = i
-                task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock)
+        if len(performance_reports[0]) == len(performance_reports[1]):
+            if performance_reports[0].keys() == performance_reports[1].keys():
 
-                work = TaskThread(task.run, [])
-                workers.append(work)
-                work.start()
-                work.join()
+                qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS']
+                avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
+                p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
 
-        if model_name != 'gpt2':
-            if len(performance_reports[0]) == len(performance_reports[1]):
-                if performance_reports[0].keys() == performance_reports[1].keys():
-
-                    qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS']
-                    avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2)
-                    p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2)
+                merged_dict['BS'] = performance_reports[0]['BS']
+                merged_dict['QPS'] = qps
+                merged_dict['AVG Latency'] = avg_latency
+                merged_dict["P99 Latency"] = p99_latency
 
+                if model_name != 'gpt2':
                     predict_qps = performance_reports[0]['predict QPS'] + performance_reports[1]['predict QPS']
                     predict_avg_latency = round(((performance_reports[0]['predict AVG Latency'] + performance_reports[1]['predict AVG Latency']) / 2.0), 2)
                     predict_p99_latency = round(((performance_reports[0]['predict P99 Latency'] + performance_reports[1]['predict P99 Latency']) / 2.0), 2)
 
-                    merged_dict['BS'] = performance_reports[0]['BS']
-                    merged_dict['QPS'] = qps
-                    merged_dict['AVG Latency'] = avg_latency
-                    merged_dict["P99 Latency"] = p99_latency
-
                     merged_dict['predict QPS'] = predict_qps
                     merged_dict['predict AVG Latency'] = predict_avg_latency
                     merged_dict["predict P99 Latency"] = predict_p99_latency
-                    
-            return merged_dict
-        
-        else:
-            merged_dict['BS'] = performance_reports[0]['BS']
-            merged_dict['QPS'] = performance_reports[0]['QPS']
-            merged_dict['AVG Latency'] = performance_reports[0]['AVG Latency']
-            merged_dict["P99 Latency"] = performance_reports[0]["P99 Latency"]
-
-            return merged_dict
-
+                
+        return merged_dict
 
     def predict(self, feeds):
         # The deberta model is currently unable to undergo accuracy testing temporarily
@@ -253,9 +234,10 @@ def predict(self, feeds):
             return result
     
     def predict_igie(self, dataloader):
-        self.module_igie.set_input("input_ids", tvm.nd.array(dataloader["input_ids"].astype('int64'), self.device))
-        self.module_igie.run()
-        output = None   # self.module_igie.get_output(0).numpy()
+        self.task.module.set_input("input_ids", tvm.nd.array(dataloader["input_ids"].astype('int64'), self.device))
+        self.task.module.run()
+        output = self.task.module.get_output(0)
+
         return output
     
     def benchmark_interact(self, dataloader):
@@ -267,6 +249,9 @@ def benchmark_interact(self, dataloader):
         report = {}
         report["BS"] = batch_size
 
+        if model_name == 'gpt2':
+            self.load_igie(batch_size)
+
         test_data = self._get_fake_samples(batch_size=batch_size,
                         shape=self.configs['segments'][0]['input_tensor_map'],
                         input_type=self.configs['input_type'])
@@ -293,8 +278,6 @@ def benchmark_interact(self, dataloader):
                 predict_range[int(len(predict_range) * 0.99)] * 1000, 2)
             predict_avg_latency = round(sum(predict_range) / iterations * 1000, 2)
             fps = int(1000.0 * batch_size / predict_avg_latency)
-        else:
-            pass
 
         log.info(
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
@@ -308,8 +291,6 @@ def benchmark_interact(self, dataloader):
             report['predict QPS'] = fps
             report['predict AVG Latency'] = predict_avg_latency
             report['predict P99 Latency'] = predict_tail_latency
-        else:
-            pass
 
         return report
 
@@ -322,57 +303,61 @@ def load(self, batch_size) -> None:
         model = self.configs['model']
         model_name = self.configs['model'].split("-")[0]
         model_path = self.configs['model_path']
-        self.model_runtimes = []
-
-        if model_name != 'gpt2':
-            if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
-                engine_path = model_path.split(".")[0] + "_end.engine"
+        
+        if model_name == 'gpt2':
+            self.batch_size = batch_size
+            return
+        
+        if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
+            engine_path = model_path.split(".")[0] + "_end.engine"
 
-            elif model_name == 'widedeep':
-                engine_path = model_path + "/" + model + "_end.engine"
-            
-            elif model_name == 'roformer':
-                engine_path = model_path + "/" + model + ".engine"
-            
-            elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
-                engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
+        elif model_name == 'widedeep':
+            engine_path = model_path + "/" + model + "_end.engine"
+        
+        elif model_name == 'roformer':
+            engine_path = model_path + "/" + model + ".engine"
+        
+        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+            engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
 
-            else:
-                engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
-            
-            # **************to do*************
-            if model_name == 'widedeep':      
-                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"
+        else:
+            engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
+        
+        # **************to do*************
+        if model_name == 'widedeep':      
+            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"
 
-            if model_name == 'conformer':
-                engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
-            
-            # if model_name == 'roformer':
-            #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
-            
-            if model_name == 'deberta':
-                engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
+        if model_name == 'conformer':
+            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
+        
+        # if model_name == 'roformer':
+        #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
+        
+        if model_name == 'deberta':
+            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
 
-            engine, context = init_by_tensorrt(engine_path)
+        engine, context = init_by_tensorrt(engine_path)
 
-            self.model_runtimes.append(engine)
+        self.model_runtimes.append(engine)
 
-            self.input_type = self.configs['input_type']
-            
-            self.batch_size = batch_size
-            self.model_runtimes = []
-            self.engine = engine
-            self.context = context
+        self.input_type = self.configs['input_type']
         
-        else:
-            _, device = get_target('iluvatar_with_all_libs')
-            engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(batch_size) + ".so" 
-            lib = tvm.runtime.load_module(engine_path)
-            module_igie = tvm.contrib.graph_executor.GraphModule(lib["default"](device))
+        self.batch_size = batch_size
+        self.engine = engine
+        self.context = context         
+    
+    def load_igie(self, batch_size):  # load the compiled .so for this batch size into a TVM graph executor
+        model = self.configs['model']
+        model_path = self.configs['model_path']
 
-            self.module_igie = module_igie
-            self.device = device
-            self.batch_size = batch_size
+        target, _ = get_target('iluvatar_with_all_libs')  # only the target is kept; the second result is discarded
+        device = tvm.device(target.kind.name, self.task.device_id)  # device is built from this task's device_id
+        engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(batch_size) + ".so"  # "<model>_bs<N>.so" next to the model file
+        lib = tvm.runtime.load_module(engine_path)
+        self.task.module = tvm.contrib.graph_executor.GraphModule(lib["default"](device))  # predict_igie runs this module
+
+        self.device = device
+        self.batch_size = batch_size
 
     def _get_fake_samples(self, batch_size, shape, input_type):
         data = {}
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py
new file mode 100755
index 000000000..4f6b31079
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py
@@ -0,0 +1,20 @@
+from .file import load_json, save_json
+from .timer import Timer
+
+
+from .argument import get_args
+from .import_model import import_model_to_igie
+from .target import get_target
+
+from .dataloader import get_dataloader_from_args, download_builtin_data
+
+
+from .imagenet_metric import get_topk_accuracy
+from .coco_metric import COCO2017Evaluator, COCO2017EvaluatorForYolox, COCO2017EvaluatorForYolov4
+
+from .quantization import igie_quantize_model_from_args, onnx_quantize_model_from_args
+
+from .mod_rewriter import modify_seq_len_for_nlp
+from .stauts_checker import check_status
+
+from .compile_engine import compile_engine_from_args
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py
new file mode 100755
index 000000000..4c2f253ff
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py
@@ -0,0 +1,331 @@
+import argparse
+import os
+import sys
+import json
+from numbers import Number
+
+def to_bool(value):
+    # Normalize a bool, string or number to a Python bool; any other type maps to False.
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value.lower() in ("yes", "true", "t", "1")
+    if isinstance(value, Number):
+        return value != 0
+    return False
+
+
+def get_args_parser():
+    """Build the argparse parser holding every IGIE compile / test command line option."""
+    parser = argparse.ArgumentParser()
+
+    # always required
+    parser.add_argument("--model_path",
+                        type=str,
+                        required=True,
+                        help="model path or model name in torchvision")
+
+    parser.add_argument("--input",
+                        type=str,
+                        required=True,
+                        dest="input",
+                        nargs='+',
+                        help="""
+                            input name and shape/dtype, format should be input_name:input_shape or input_name:input_shape/dtype,
+                            and use space to connect multiple inputs,
+                            if dtype is not given, we assume the dtype is float32
+                            single input case: --input input1:1,3,224,224
+                            multiple inputs case: --input input1:32,3,224,224 input2:32,100
+                            multiple inputs with different dtype case: --input input1:32,3,224,224/float32 input2:32,100/int64
+                            """)
+
+    parser.add_argument("--precision",
+                        type=str,
+                        choices=["fp32", "fp16", "int8"],
+                        required=True,
+                        help="model inference precision")
+
+    ## common optional
+    parser.add_argument("--target",
+                        type=str,
+                        choices=["llvm", "iluvatar", "iluvatar_with_cudnn_cublas",  "iluvatar_with_ixinfer", "iluvatar_with_all_libs"],
+                        default="iluvatar_with_all_libs",
+                        help="""IGIE compile target
+                            llvm: cpu only
+                            iluvatar: gpu without any other acceleration library
+                            iluvatar_with_cudnn_cublas: gpu with all acceleration library cudnn/cublas
+                            iluvatar_with_ixinfer: gpu with all acceleration library ixinfer
+                            iluvatar_with_all_libs: gpu with all acceleration library cudnn/cublas/ixinfer
+                            """)
+
+    parser.add_argument("--engine_path",
+                        type=str,
+                        default=None,
+                        help="save path of engine, save in pwd if not provided")
+
+    parser.add_argument("--warmup",
+                        type=int,
+                        default=3,
+                        help="number of warmup before test")
+
+    # parser.add_argument("--test_count",
+    #                     type=int,
+    #                     default=None,
+    #                     help="number of batch to test, test all batch if not specified")
+
+    parser.add_argument("--verbose",
+                        type=to_bool,
+                        default=False,
+                        help="dump igie mod to file if is True")
+
+    parser.add_argument("--num_workers",
+                        type=int,
+                        default=16,
+                        help="number of workers used in pytorch dataloader")
+
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=None,
+                        help="""model batch size for dataloader,
+                            use the first dimension of the first input when not specified 
+                            this argument will be useful for multi-input case:
+                            e.g. input_ids:1000,22 pixel_values:32,3,224,224 attention_mask:1000,22
+                            """)
+
+    ## dataset
+    parser.add_argument("--use_imagenet",
+                        type=to_bool,
+                        default=False,
+                        help="use imagenet val dataset for calibration and test")
+
+    parser.add_argument("--use_coco2017",
+                        type=to_bool,
+                        default=False,
+                        help="use coco2017 val dataset for calibration and test")
+
+    # parser.add_argument("--custom_data_path",
+    #                     type=str,
+    #                     default=None,
+    #                     help="user-provided custom data path to define user's dataloader"
+    #                     )
+
+    parser.add_argument("--input_layout",
+                        type=str,
+                        choices=["NHWC", "NCHW"],
+                        default="NCHW",
+                        help="model input layout, only works for cv model")
+
+    parser.add_argument("--calibration_file_path",
+                        type=str,
+                        default=None,
+                        help="user-provided calibration npy data path, only used for calibration")
+
+    ## custom quantization config
+    parser.add_argument("--automatic_yolo_quantization",
+                        type=to_bool,
+                        default=False,
+                        help="automatically find the best strategy for yolo by skipping the yolo detect node quantization")
+
+    parser.add_argument("--quantization_config_path",
+                        type=str,
+                        default=None,
+                        help="quantization config path for onnxruntime, should be a json file, refer to igie-doc for more information")
+
+
+    ## accuracy / performance targets and run mode
+    parser.add_argument("--acc_target",
+                        type=float,
+                        default=None,
+                        help="Model inference Accuracy target.")
+
+    parser.add_argument("--fps_target",
+                        type=float,
+                        default=None,
+                        help="Model inference FPS target.")
+
+    parser.add_argument("--perf_only",
+                        type=to_bool,
+                        default=False,
+                        help="run performance test only")
+
+    parser.add_argument('--just_export',
+                        type=to_bool,
+                        default=False,
+                        help="just export engine and return")
+
+    ## other custom option
+
+    parser.add_argument("--custom_option",
+                        type=str,
+                        default=None,
+                        dest="custom_option",
+                        nargs='+',
+                        help="""
+                            user-provided custom key:value option, use space to connect multiple option,
+                            bool value will be cast to Python bool type automatically,
+                            single option case: --custom_option my_data_path:/local/data
+                            multiple option case: --custom_option my_data_path:/local/data use_optionA:True
+                            """)
+
+
+    return parser
+
+
+
+def _parse_framework(args_dict):
+    """Infer the model's source framework and store it in args_dict["model_framework"]."""
+    model_path_or_name = args_dict["model_path"]
+
+    # NOTE(chen.chen):
+    # An existing path is classified by its file suffix,
+    # e.g. model.onnx, model.pb, model.pt.
+    # Any other suffix on an existing path is rejected with ValueError.
+    suffix_to_framework = {
+        ".onnx": "onnx",
+        ".pb": "tensorflow",
+        ".pt": "pytorch",
+    }
+
+    if os.path.exists(model_path_or_name):
+        ext = os.path.splitext(model_path_or_name)[1]
+        if ext not in suffix_to_framework:
+            raise ValueError(f"{ext} is not supported yet")
+        framework = suffix_to_framework[ext]
+
+    elif os.path.exists(f"{model_path_or_name}.pdmodel"):
+        # NOTE(chen.chen)
+        # A paddle model is passed as a path prefix, so the path itself
+        # does not exist; it is recognised by its "<prefix>.pdmodel" file.
+        framework = "paddle"
+
+    else:
+        # NOTE(chen.chen):
+        # torchvision pretrained models are supported as well:
+        # a value that is neither an existing path nor a paddle prefix
+        # is taken to be a torchvision model name,
+        # e.g. --model_path resnet50
+        framework = "pytorch"
+
+    # Written last, so args_dict stays untouched when the suffix check raises;
+    # the key is always set on the non-raising paths.
+    args_dict["model_framework"] = framework
+
+        
+
+def _parse_input(args_dict):
+    """Expand the raw ``--input`` specs into name/shape/dtype entries of args_dict.
+
+    Each spec is "name:shape" or "name:shape/dtype"; a missing dtype means
+    float32 and an "fp" prefix is spelled out as "float" (fp16 -> float16).
+    The batch size defaults to the first dimension of the first input.
+    """
+    names, shapes, dtypes = [], [], []
+    shape_by_name = {}
+
+    for spec in args_dict.pop("input"):
+        # rsplit keeps any ":" inside the input name intact
+        name, shape_dtype = spec.rsplit(":", 1)
+        if "/" in shape_dtype:
+            shape_text, dtype = shape_dtype.split("/")
+            dtype = dtype.replace("fp", "float")
+        else:
+            shape_text, dtype = shape_dtype, "float32"
+
+        shape = tuple(int(dim) for dim in shape_text.split(","))
+        names.append(name)
+        shapes.append(shape)
+        dtypes.append(dtype)
+        shape_by_name[name] = shape
+
+    args_dict["input_dict"] = shape_by_name
+    args_dict["input_name_list"] = names
+    args_dict["input_shape_list"] = shapes
+    args_dict["input_dtype_list"] = dtypes
+    if args_dict["batch_size"] is None:
+        args_dict["batch_size"] = shapes[0][0] if shapes else None
+
+
+def _parse_engine_path(args_dict):  # default engine file name: "<model stem>_batchsize_<bs>_<precision>.so"
+    if args_dict["engine_path"] is None:
+        model_base_name, _ = os.path.splitext(os.path.basename(args_dict["model_path"]))
+        args_dict["engine_path"] = f"{model_base_name}_batchsize_{args_dict['batch_size']}_{args_dict['precision']}.so"
+    assert args_dict["engine_path"].endswith("so")
+
+   
+def _parse_custom_option(args_dict):
+    """Parse the ``key:value`` custom options into args_dict["custom_option"].
+
+    "true"/"false" (any case) become bools, values containing "," become lists,
+    and args_dict["required_pass"] is always set to a list.
+    """
+    bool_literals = {"true": True, "false": False}
+    parsed = {}
+    if args_dict["custom_option"] is not None:
+        for item in args_dict.pop("custom_option"):
+            key, raw = item.split(":", 1)
+            lowered = raw.lower()
+            if lowered in bool_literals:
+                parsed[key] = bool_literals[lowered]
+            else:
+                parsed[key] = raw.split(",") if "," in raw else raw
+
+    required_pass = parsed.get("required_pass", [])
+    # a single pass name is wrapped so that the stored value is always a list
+    args_dict["required_pass"] = required_pass if isinstance(required_pass, list) else [required_pass]
+    args_dict["custom_option"] = parsed
+
+
+def _parse_dataset(args_dict):  # without a built-in dataset (imagenet / coco2017) perf_only is forced to True
+    args_dict["use_builtin_data"] = has_builtin_data = args_dict["use_imagenet"] or args_dict["use_coco2017"]
+    if not has_builtin_data:
+        args_dict["perf_only"] = True
+
+def _parse_quantization_config(args_dict):
+    """Load the optional quantization JSON file into args_dict["quantization_config"]."""
+    config_path = args_dict["quantization_config_path"]
+    if config_path is None:
+        # No file given: fall back to an empty config.
+        args_dict["quantization_config"] = {}
+        return
+
+    assert os.path.exists(config_path)
+    with open(config_path, "r") as config_file:
+        args_dict["quantization_config"] = json.load(config_file)
+
+
+
+def get_args(return_dict=False):
+    """Parse the command line, derive the dependent options and return them (dict if return_dict, else Namespace)."""
+    # Tuple comparison rejects every interpreter older than 3.7 (the old major/minor test let 3.0-3.6 through).
+    if sys.version_info < (3, 7):
+        raise ValueError(f"need at least python3.7, got {sys.version}")
+
+    args_dict = vars(get_args_parser().parse_args())
+    # Order matters: _parse_engine_path uses the batch size that _parse_input fills in.
+    _parse_framework(args_dict)
+    _parse_input(args_dict)
+    _parse_engine_path(args_dict)
+    _parse_quantization_config(args_dict)
+    _parse_dataset(args_dict)
+    _parse_custom_option(args_dict)
+
+    from pprint import pprint
+    pprint(args_dict, indent=2)
+    if return_dict:
+        return args_dict
+    return argparse.Namespace(**args_dict)
+    
+
+
+if __name__ == "__main__":
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224 --precision=int8
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444/float32 input2:32,100/int64 --precision=int8
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444/float32 input2:32,100/fp16 --precision=int8
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8 --custom_option my_data_path:/local/data use_optionA:True
+    # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8 --custom_option my_data_path:/local/data use_optionA:True required_pass:pass1,pass2,pass3
+    args = get_args(return_dict=True)
+    # get_args() has already pretty-printed the dict (indent=2); this prints the returned dict once more
+    from pprint import pprint
+    pprint(args)
+    
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py
new file mode 100755
index 000000000..a4e468bb7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py
@@ -0,0 +1,622 @@
+import os
+import json
+import cv2
+import numpy as np
+
+import torch
+import torchvision
+from pycocotools.coco import COCO
+
+def get_coco_accuracy(pred_json, ann_json):
+    """Run COCO bbox evaluation of pred_json against ann_json, print the summary and return the stats."""
+    coco = COCO(annotation_file=ann_json)
+    coco_pred = coco.loadRes(pred_json)
+    try:
+        from .fastCoCoeval.fast_coco_eval_api import COCOeval_opt as COCOeval
+        coco_evaluator = COCOeval(cocoGt=coco, cocoDt=coco_pred, iouType="bbox")
+    except Exception:  # any fast-path failure falls back to pycocotools; Ctrl-C / SystemExit still propagate
+        from pycocotools.cocoeval import COCOeval
+        print("Can't import fastCoCoeval, Using PyCoCcotools API ...")
+        coco_evaluator = COCOeval(cocoGt=coco, cocoDt=coco_pred, iouType="bbox")
+    coco_evaluator.evaluate()
+    coco_evaluator.accumulate()
+    coco_evaluator.summarize()
+    return coco_evaluator.stats
+
+coco80_to_coco91 = [  # list position = contiguous class index, value = category id (presumably the sparse COCO "91" numbering - confirm)
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+    23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+    46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+    65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88,
+    89, 90
+]
+
+coco80_to_coco91_dict = {idx: i for idx, i in enumerate(coco80_to_coco91)}  # contiguous index -> category id
+coco91_to_coco80_dict = {i: idx for idx, i in enumerate(coco80_to_coco91)}  # category id -> contiguous index (inverse map)
+
+
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114)):
+    # Resize im keeping its aspect ratio, then pad with `color` up to new_shape; returns (image, ratio, (dw, dh))
+    shape = im.shape[:2]  # current shape [height, width]
+    if isinstance(new_shape, int):
+        new_shape = (new_shape, new_shape)
+
+    # Scale ratio (new / old): the smaller of the two axis ratios, so the resized image fits inside new_shape
+    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+
+    # Compute padding
+    ratio = r, r  # width, height ratios
+    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # resized (width, height) before padding
+    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[
+        1]  # wh padding
+
+    dw /= 2  # divide padding into 2 sides
+    dh /= 2
+
+    if shape[::-1] != new_unpad:  # resize
+        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # -/+0.1 splits an odd total padding as floor / ceil
+    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+    im = cv2.copyMakeBorder(im,
+                            top,
+                            bottom,
+                            left,
+                            right,
+                            cv2.BORDER_CONSTANT,
+                            value=color)  # add border
+    return im, ratio, (dw, dh)
+
+
+def box_area(box):  # box = xyxy(4,n): rows are x1, y1, x2, y2
+    width, height = box[2] - box[0], box[3] - box[1]
+    return width * height
+
+
+def box_iou(box1, box2, eps=1e-7):
+    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+    """
+    Return intersection-over-union (Jaccard index) of boxes.
+    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+    Arguments:
+        box1 (Tensor[N, 4])
+        box2 (Tensor[M, 4])
+        eps (float): added to the denominator so that it cannot be zero
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix of pairwise IoU values for every element in box1 and box2
+    """
+
+    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
+    (a1, a2), (b1, b2) = box1[:, None].chunk(2, 2), box2.chunk(2, 1)  # split each box into its (x1, y1) and (x2, y2) halves
+    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)
+
+    # IoU = inter / (area1 + area2 - inter)
+    return inter / (box_area(box1.T)[:, None] + box_area(box2.T) - inter + eps)
+
+
+def xyxy2xywh(x):
+    # nx4 corners [x1, y1, x2, y2] -> [x center, y center, w, h]; works on a copy, x itself is not modified
+    is_tensor = isinstance(x, torch.Tensor)
+    out = x.clone() if is_tensor else np.copy(x)
+    x1, y1, x2, y2 = x[:, 0], x[:, 1], x[:, 2], x[:, 3]
+    out[:, 0], out[:, 1] = (x1 + x2) / 2, (y1 + y2) / 2  # box center
+    out[:, 2], out[:, 3] = x2 - x1, y2 - y1  # width, height
+    return out
+
+
+def xywh2xyxy(x):
+    # nx4 [x center, y center, w, h] -> corners [x1, y1, x2, y2]; works on a copy, x itself is not modified
+    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    cx, cy = x[:, 0], x[:, 1]
+    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
+    out[:, 0], out[:, 1] = cx - half_w, cy - half_h  # top left x, y
+    out[:, 2], out[:, 3] = cx + half_w, cy + half_h  # bottom right x, y
+    return out
+
+
+def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
+    # nx4 normalized [x center, y center, w, h] -> pixel corners [x1, y1, x2, y2], scaled by w/h and shifted by the pads
+    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    cx, cy = x[:, 0], x[:, 1]
+    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2
+    out[:, 0], out[:, 1] = w * (cx - half_w) + padw, h * (cy - half_h) + padh  # top left x, y
+    out[:, 2], out[:, 3] = w * (cx + half_w) + padw, h * (cy + half_h) + padh  # bottom right x, y
+    return out
+
+
+def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
+    # nx4 pixel corners [x1, y1, x2, y2] -> [x center, y center, w, h] divided by the image width / height
+    if clip:
+        clip_boxes(x, (h - eps, w - eps))  # warning: inplace clip
+
+    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    x1, y1, x2, y2 = x[:, 0], x[:, 1], x[:, 2], x[:, 3]
+    out[:, 0], out[:, 1] = ((x1 + x2) / 2) / w, ((y1 + y2) / 2) / h  # x center, y center
+    out[:, 2], out[:, 3] = (x2 - x1) / w, (y2 - y1) / h  # width, height
+    return out
+
+
+def xyn2xy(x, w=640, h=640, padw=0, padh=0):
+    # Convert normalized segments into pixel segments, shape (n,2); works on a copy of x
+    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    xs, ys = x[:, 0], x[:, 1]
+    out[:, 0], out[:, 1] = w * xs + padw, h * ys + padh  # pixel x, pixel y
+    return out
+
+
+def segment2box(segment, width=640, height=640):
+    # Convert 1 segment ((n,2) xy points) to 1 xyxy box from the points inside the image; zeros((1, 4)) if none are
+    x, y = segment.T  # segment xy
+    inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
+    x, y = x[inside], y[inside]
+    # Emptiness is tested by length: any(x) treated a segment whose inside points all have x == 0 as empty
+    return np.array([x.min(), y.min(), x.max(), y.max()]) if len(x) else np.zeros((1, 4))  # xyxy
+
+
+def segments2boxes(segments):
+    # Convert segment labels to box labels: every (n,2) xy segment becomes its bounding box, returned as xywh
+    def corners(seg):
+        xs, ys = seg.T  # segment xy
+        return [xs.min(), ys.min(), xs.max(), ys.max()]  # xyxy
+
+    return xyxy2xywh(np.array([corners(seg) for seg in segments]))  # xywh
+
+
+def resample_segments(segments, n=1000):
+    # Up-sample every (m,2) segment to n points along its closed outline; the list is updated in place and returned
+    for idx, seg in enumerate(segments):
+        closed = np.concatenate((seg, seg[0:1, :]), axis=0)  # append the first point again
+        count = len(closed)
+        sample_at = np.linspace(0, count - 1, n)
+        knots = np.arange(count)
+        columns = [np.interp(sample_at, knots, closed[:, axis]) for axis in range(2)]
+        segments[idx] = np.concatenate(columns).reshape(2, -1).T  # segment xy
+    return segments
+
+
+def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
+    # Rescale boxes (xyxy) in place from img1_shape to img0_shape, clip them to img0_shape and return them
+    if ratio_pad is not None:
+        gain, pad = ratio_pad[0][0], ratio_pad[1]
+    else:  # calculate from img0_shape
+        gain = min(img1_shape[0] / img0_shape[0],
+                   img1_shape[1] / img0_shape[1])  # gain = img1 / img0
+        pad_w = (img1_shape[1] - img0_shape[1] * gain) / 2
+        pad_h = (img1_shape[0] - img0_shape[0] * gain) / 2
+        pad = pad_w, pad_h  # wh padding
+
+    boxes[:, [0, 2]] -= pad[0]  # x padding
+    boxes[:, [1, 3]] -= pad[1]  # y padding
+    boxes[:, :4] /= gain
+    clip_boxes(boxes, img0_shape)
+    return boxes
+
+
+def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None):
+    # Rescale segment coords (xy) in place from img1_shape to img0_shape, clip them to img0_shape and return them
+    if ratio_pad is not None:
+        gain, pad = ratio_pad[0][0], ratio_pad[1]
+    else:  # calculate from img0_shape
+        gain = min(img1_shape[0] / img0_shape[0],
+                   img1_shape[1] / img0_shape[1])  # gain = img1 / img0
+        pad_w = (img1_shape[1] - img0_shape[1] * gain) / 2
+        pad_h = (img1_shape[0] - img0_shape[0] * gain) / 2
+        pad = pad_w, pad_h  # wh padding
+
+    segments[:, 0] -= pad[0]  # x padding
+    segments[:, 1] -= pad[1]  # y padding
+    segments /= gain
+    clip_segments(segments, img0_shape)
+    return segments
+
+
+def clip_boxes(boxes, shape):  # clip boxes (xyxy) in place to the image shape (height, width)
+    height, width = shape[0], shape[1]
+    if not isinstance(boxes, torch.Tensor):  # np.array (faster grouped)
+        boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, width)  # x1, x2
+        boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, height)  # y1, y2
+        return
+    boxes[:, 0].clamp_(0, width)  # x1 (tensor: faster individually)
+    boxes[:, 1].clamp_(0, height)  # y1
+    boxes[:, 2].clamp_(0, width)  # x2
+    boxes[:, 3].clamp_(0, height)  # y2
+
+
+def clip_segments(boxes, shape):  # clip segments (xy1,xy2,...) in place to the image shape (height, width)
+    height, width = shape[0], shape[1]
+    if not isinstance(boxes, torch.Tensor):  # np.array (faster grouped)
+        boxes[:, 0] = boxes[:, 0].clip(0, width)  # x
+        boxes[:, 1] = boxes[:, 1].clip(0, height)  # y
+        return
+    boxes[:, 0].clamp_(0, width)  # x (tensor: faster individually)
+    boxes[:, 1].clamp_(0, height)  # y
+
+
+def non_max_suppression(
+        prediction,
+        conf_thres=0.25,
+        iou_thres=0.45,
+        classes=None,
+        agnostic=False,
+        multi_label=True,
+        labels=(),
+        max_det=300,
+        nm=0,  # number of masks
+):
+    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections
+
+    Returns:
+         list of detections, one (n, 6 + nm) tensor per image [xyxy, conf, cls, mask columns]
+    """
+
+    if isinstance(
+            prediction, (list, tuple)
+    ):  # YOLOv5 model in validation mode, output = (inference_out, loss_out)
+        prediction = prediction[0]  # select only inference output
+
+    bs = prediction.shape[0]  # batch size
+    nc = prediction.shape[2] - nm - 5  # number of classes
+    xc = prediction[..., 4] > conf_thres  # candidates
+
+    # Checks
+    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
+    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
+
+    # Settings
+    # min_wh = 2  # (pixels) minimum box width and height
+    max_wh = 7680  # (pixels) maximum box width and height
+    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
+    time_limit = 0.5 + 0.05 * bs  # seconds to quit after (computed but never checked in this function)
+    redundant = True  # require redundant detections
+    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
+    merge = False  # use merge-NMS
+
+    # t = time.time()
+    mi = 5 + nc  # mask start index
+    output = [torch.zeros((0, 6 + nm))] * bs  # default per image: an empty result (the same tensor object repeated)
+    for xi, x in enumerate(prediction):  # image index, image inference
+        # Apply constraints
+        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
+        x = x[xc[xi]]  # confidence
+
+        # Cat apriori labels if autolabelling
+        if labels and len(labels[xi]):
+            lb = labels[xi]
+            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
+            v[:, :4] = lb[:, 1:5]  # box
+            v[:, 4] = 1.0  # conf
+            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
+            x = torch.cat((x, v), 0)
+
+        # If none remain process next image
+        if not x.shape[0]:
+            continue
+
+        # Compute conf
+        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
+
+        # Box/Mask
+        box = xywh2xyxy(
+            x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2)
+        mask = x[:, mi:]  # zero columns if no masks
+
+        # Detections matrix nx6 (xyxy, conf, cls)
+        if multi_label:
+            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
+            x = torch.cat(
+                (box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
+        else:  # best class only
+            conf, j = x[:, 5:mi].max(1, keepdim=True)
+            x = torch.cat((box, conf, j.float(), mask),
+                          1)[conf.view(-1) > conf_thres]
+
+        # Filter by class
+        if classes is not None:
+            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+
+        # Apply finite constraint
+        # if not torch.isfinite(x).all():
+        #     x = x[torch.isfinite(x).all(1)]
+
+        # Check shape
+        n = x.shape[0]  # number of boxes
+        if not n:  # no boxes
+            continue
+        elif n > max_nms:  # excess boxes
+            x = x[x[:, 4].argsort(
+                descending=True)[:max_nms]]  # sort by confidence
+        else:
+            x = x[x[:, 4].argsort(descending=True)]  # sort by confidence
+
+        # Batched NMS
+        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
+        boxes, scores = x[:, :4] + c, x[:,
+                                        4]  # boxes (offset by class), scores
+        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
+        if i.shape[0] > max_det:  # limit detections
+            i = i[:max_det]
+        if merge and (1 < n <
+                      3E3):  # Merge NMS (boxes merged using weighted mean); disabled here because merge is False
+            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
+            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
+            weights = iou * scores[None]  # box weights
+            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
+                1, keepdim=True)  # merged boxes
+            if redundant:
+                i = i[iou.sum(1) > 1]  # require redundancy
+
+        output[xi] = x[i]
+    return output
+
+
+
+
+# NOTE(chen.chen):
+# This only works for COCO2017 val using pycocotools;
+# we may need some abstraction here for generic COCO-like datasets.
+class COCO2017Evaluator:
+    """Accumulate detections as COCO-style JSON records and score them via get_coco_accuracy."""
+    def __init__(self,
+                 label_path,
+                 image_size=640,
+                 with_nms=False,
+                 conf_thres=0.001,
+                 iou_thres=0.65):
+        self.with_nms = with_nms  # when True, evaluate() expects already-suppressed boxes plus nms_count
+        self.conf_thres = conf_thres
+        self.iou_thres = iou_thres
+        self.label_path = label_path  # handed to get_coco_accuracy in summary()
+        self.image_size = image_size
+        self.jdict = []  # per-detection result dicts, filled by evaluate() and dumped by summary()
+
+        # iou vector for mAP@0.5:0.95
+        self.iouv = torch.linspace(0.5, 0.95, 10)
+        self.niou = self.iouv.numel()
+
+    def evaluate(self, pred, all_inputs, nms_count=None):
+        """Convert one batch of predictions into COCO result dicts appended to self.jdict."""
+        im = all_inputs[0]
+        targets = all_inputs[1]
+        paths = all_inputs[2]
+        shapes = all_inputs[3]
+        _, _, height, width = im.shape
+        targets[:, 2:] *= np.array((width, height, width, height))  # scales the caller's array in place; targets is not read again here
+
+        if self.with_nms:
+            assert nms_count is not None
+            tmp_out = []
+            for boxes, count in zip(pred, nms_count):
+                count = count[0]  # used below as the number of leading rows to keep
+                boxes = boxes[:count, :]
+                boxes_cp = boxes.copy()  # snapshot so the two columns can be swapped in place
+                # (x1,y1,x2,y2,class_id,score)
+                # To (x1,y1,x2,y2,score,class_id)
+                boxes[:, 4] = boxes_cp[:, 5]
+                boxes[:, 5] = boxes_cp[:, 4]
+                tmp_out.append(torch.from_numpy(boxes))
+            pred = tmp_out
+        else:
+            pred = torch.from_numpy(pred)
+            pred = non_max_suppression(pred, self.conf_thres, self.iou_thres)
+        for idx, det in enumerate(pred):
+            img_path = paths[idx]
+
+            predn = det
+            shape = shapes[idx][0]
+            scale_boxes(im[idx].shape[1:], predn[:, :4], shape, shapes[idx][1])  # native-space pred
+
+            self._save_one_json(predn, self.jdict, img_path, coco80_to_coco91)  # append to COCO-JSON dictionary
+
+    def _save_one_json(self, predn, jdict, path, class_map):
+        """Append one result dict per row of predn to jdict; the image id is the integer file-name stem."""
+        # Save one JSON result in the format
+        # {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}
+        image_id = int(os.path.splitext(os.path.basename(path))[0])
+        box = xyxy2xywh(predn[:, :4])  # presumably centre-based (cx, cy, w, h) - confirm against xyxy2xywh
+        box[:, :2] -= box[:, 2:] / 2  # move the first two columns back by half of the last two
+        for p, b in zip(predn.tolist(), box.tolist()):
+            jdict.append({
+                'image_id': image_id,
+                'category_id': class_map[int(p[5])],
+                'bbox': [round(x, 3) for x in b],
+                'score': round(p[4], 5)
+            })
+
+    def summary(self):
+        """Dump self.jdict to a JSON file and return get_coco_accuracy's result; ValueError if jdict is empty."""
+        if len(self.jdict):
+            pred_json = os.path.join("coco2017_predictions.json")  # relative path: written to the current working directory
+            with open(pred_json, 'w') as f:
+                json.dump(self.jdict, f)
+            result = get_coco_accuracy(pred_json, self.label_path)
+        else:
+            raise ValueError("can not find generated json dict for pycocotools")
+        return result
+
+# coco2017 val evaluator For Yolox
+class COCO2017EvaluatorForYolox(COCO2017Evaluator):
+    """COCO evaluator for raw YOLOX head outputs: decode grid offsets, run NMS, record COCO JSON."""
+    def evaluate(self, pred, all_inputs, nms_count=None):
+        """Decode and suppress one batch, appending its detections to self.jdict."""
+        # all_inputs is (images, image paths, original (h, w) per image); nms_count exists only
+        # for signature parity with COCO2017Evaluator.evaluate and is ignored here.
+        im = all_inputs[0]
+        img_path = all_inputs[1]
+        img_info = all_inputs[2]
+
+        _, _, height, width = im.shape
+        img_size = [height, width]
+        # Detect() rewrites pred in place before it is wrapped as a tensor.
+        pred = torch.from_numpy(self.Detect(pred, img_size=img_size))
+        nms_outputs = self.postprocess(
+                    pred, conf_thre=self.conf_thres, nms_thre=self.iou_thres
+                )
+
+        for (output, org_img, path) in zip(nms_outputs, img_info, img_path):
+            if output is None:  # postprocess leaves None for images without detections
+                continue
+
+            bboxes = output[:, 0:4]
+            img_h, img_w = org_img
+            # Map boxes back to the original image using the smaller of the two resize ratios.
+            scale = min(img_size[0] / float(img_h), img_size[1] / float(img_w))
+            bboxes /= scale
+            cls = output[:, 6]
+            scores = output[:, 4] * output[:, 5]  # obj_conf * class_conf
+
+            bboxes = self._xyxy2xywh(bboxes)
+            self._save_one_json(bboxes, cls, scores, self.jdict, path, coco80_to_coco91)
+
+    def Detect(self, outputs, img_size):
+        """Turn per-cell offsets into absolute xy/wh for strides 8/16/32; modifies and returns outputs."""
+        grids = []
+        expanded_strides = []
+        strides = [8, 16, 32]
+
+        hsizes = [img_size[0] // stride for stride in strides]
+        wsizes = [img_size[1] // stride for stride in strides]
+
+        for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+            xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+            grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+            grids.append(grid)
+            shape = grid.shape[:2]
+            expanded_strides.append(np.full((*shape, 1), stride))
+
+        grids = np.concatenate(grids, 1)
+        expanded_strides = np.concatenate(expanded_strides, 1)
+        outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides  # cell offset -> pixel centre
+        outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides  # log size -> pixel size
+
+        return outputs
+
+    def postprocess(self, prediction, num_classes=80, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
+        """Confidence-filter and NMS each image; return one (n, 7) tensor or None per image."""
+        # Convert (cx, cy, w, h) to corner form, written back into prediction in place.
+        box_corner = prediction.new(prediction.shape)
+        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+        prediction[:, :, :4] = box_corner[:, :, :4]
+
+        output = [None for _ in range(len(prediction))]
+
+        for i, image_pred in enumerate(prediction):
+            # If none are remaining => process next image
+            if not image_pred.size(0):
+                continue
+            # Get score and class with highest confidence
+            class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
+
+            # squeeze(1) keeps the mask 1-D for a single row; a bare squeeze() gave a 0-d mask that adds a dim when indexing.
+            conf_mask = image_pred[:, 4] * class_conf.squeeze(1) >= conf_thre
+            # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+            detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
+            detections = detections[conf_mask]
+
+            if not detections.size(0):
+                continue
+            if class_agnostic:
+                nms_out_index = torchvision.ops.nms(
+                    detections[:, :4],
+                    detections[:, 4] * detections[:, 5],
+                    nms_thre,
+                )
+            else:
+                nms_out_index = torchvision.ops.batched_nms(
+                    detections[:, :4],
+                    detections[:, 4] * detections[:, 5],
+                    detections[:, 6],
+                    nms_thre,
+                )
+            output[i] = detections[nms_out_index]  # each slot is written at most once, so no concatenation is needed
+
+        return output
+
+    def _xyxy2xywh(self, bboxes):
+        """Convert (x1, y1, x2, y2) rows to (x1, y1, w, h) in place and return the same tensor."""
+        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+        bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+        return bboxes
+
+    def _save_one_json(self, bboxes, class_, scores, jdict, path, class_map):
+        """Append one COCO result dict per detection; the image id is the integer file-name stem."""
+        image_id = int(os.path.splitext(os.path.basename(path))[0])
+        for box, score, cls in zip(bboxes.numpy().tolist(), scores.numpy().tolist(), class_.numpy().tolist()):
+            jdict.append({
+                'image_id': image_id,
+                'category_id': class_map[int(cls)],
+                'bbox': box,
+                'score': score
+            })
+
+
+# coco2017 val evaluator For Yolov4
+class COCO2017EvaluatorForYolov4(COCO2017EvaluatorForYolox):
+    """COCO evaluator for YOLOv4 outputs delivered as separate box and per-class confidence arrays."""
+    def evaluate(self, pred, all_inputs, nms_count=None):
+        """Suppress one batch and append its detections to self.jdict."""
+        # all_inputs is (images, image paths, original (h, w) per image); only the last two are
+        # read here. nms_count exists only for parity with COCO2017Evaluator.evaluate.
+        img_path = all_inputs[1]
+        img_info = all_inputs[2]
+
+        boxes = torch.squeeze(torch.from_numpy(pred[0]), dim=2)
+        confs = torch.from_numpy(pred[1])
+        detections = torch.cat((boxes, confs.float()), 2)
+
+        nms_outputs = self.postprocess(
+            detections, conf_thre=self.conf_thres, nms_thre=self.iou_thres
+        )
+
+        for (output, org_img, path) in zip(nms_outputs, img_info, img_path):
+            if output is None:  # postprocess leaves None for images without detections
+                continue
+            bboxes = output[:, 0:4]
+            img_h, img_w = org_img
+            # Scale x by the original width and y by the original height (boxes presumably normalized to [0, 1]).
+            bboxes[:, 0] *= img_w
+            bboxes[:, 2] *= img_w
+            bboxes[:, 1] *= img_h
+            bboxes[:, 3] *= img_h
+
+            cls = output[:, 5]
+            scores = output[:, 4]
+            bboxes = self._xyxy2xywh(bboxes)
+            self._save_one_json(bboxes, cls, scores, self.jdict, path, coco80_to_coco91)
+
+    def postprocess(self, prediction, num_classes=80, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
+        """Confidence-filter and NMS each image; return one (n, 6) tensor or None per image."""
+        output = [None for _ in range(len(prediction))]
+
+        for i, image_pred in enumerate(prediction):
+            # If none are remaining => process next image
+            if not image_pred.size(0):
+                continue
+            # Get score and class with highest confidence
+            class_conf, class_pred = torch.max(image_pred[:, 4: 4 + num_classes], 1, keepdim=True)
+
+            # squeeze(1) keeps the mask 1-D for a single row; a bare squeeze() gave a 0-d mask that adds a dim when indexing.
+            conf_mask = class_conf.squeeze(1) >= conf_thre
+            # Detections ordered as (x1, y1, x2, y2, class_conf, class_pred)
+            detections = torch.cat((image_pred[:, :4], class_conf, class_pred.float()), 1)
+            detections = detections[conf_mask]
+
+            if not detections.size(0):
+                continue
+            if class_agnostic:
+                nms_out_index = torchvision.ops.nms(
+                    detections[:, :4],
+                    detections[:, 4],
+                    nms_thre,
+                )
+            else:
+                nms_out_index = torchvision.ops.batched_nms(
+                    detections[:, :4],
+                    detections[:, 4],
+                    detections[:, 5],
+                    nms_thre,
+                )
+            output[i] = detections[nms_out_index]  # each slot is written at most once, so no concatenation is needed
+
+        return output
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py
new file mode 100755
index 000000000..e310f6b8b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py
@@ -0,0 +1,19 @@
+import os
+import tvm
+
+from .import_model import import_model_to_igie
+from .target import get_target
+
+
+# A simple wrapper that compiles the engine (or loads a cached one) and returns the graph module.
+def compile_engine_from_args(args):
+    """Build the engine at args.engine_path, or load it if the file exists, and return a GraphModule."""
+    target, device = get_target(args.target)
+    if not os.path.exists(args.engine_path):
+        mod, params = import_model_to_igie(args.model_path, args.input_dict, args.model_framework)
+        lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision, verbose=args.verbose, required_pass=args.required_pass)
+        lib.export_library(args.engine_path)  # written to disk so later calls take the load branch
+    else:
+        lib = tvm.runtime.load_module(args.engine_path)  # reuse the previously exported engine
+    module = tvm.contrib.graph_executor.GraphModule(lib["default"](device))
+    return module
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py
new file mode 100755
index 000000000..8a01ef7e7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py
@@ -0,0 +1,595 @@
+import os
+import numpy as np
+from PIL import Image
+from collections import defaultdict
+
+import tensorflow as tf
+try:
+    tf = tf.compat.v1
+except (ImportError, AttributeError):  # a missing tf.compat.v1 surfaces as AttributeError, not ImportError
+    pass  # no compat layer available: keep using the module exactly as imported
+tf.enable_eager_execution()
+
+import torch
+import torchvision
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+
+from pycocotools.coco import COCO
+
+from .coco_metric import *
+
+_igie_cache_dir = os.path.expanduser("~/.igie_cache")
+_bulitin_data_url = "http://10.113.3.3/data/CI_DATA/ci_data.tar.gz"
+_builtin_data_path = os.path.join(_igie_cache_dir, "modelzoo_data")
+_symbolic_link_data_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data")
+
+
+### Tensorflow image pre-process function
+def _mean_image_subtraction(image, means):
+    """Subtract means[i] from channel i of a rank-3 image; ValueError on rank or channel-count mismatch."""
+    if image.get_shape().ndims != 3:
+        raise ValueError('Input must be of size [height, width, C>0]')
+    num_channels = image.get_shape().as_list()[-1]
+    if len(means) != num_channels:
+        raise ValueError('len(means) must match the number of channels')
+    channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image)  # one piece per channel
+    for i in range(num_channels):
+        channels[i] -= means[i]
+    return tf.concat(axis=2, values=channels)  # reassemble along the channel axis
+
+def _central_crop(image, crop_height, crop_width):
+    """Return the crop_height x crop_width window centred in image, keeping every channel."""
+    shape = tf.shape(image)
+    height, width = shape[0], shape[1]
+    amount_to_be_cropped_h = (height - crop_height)
+    crop_top = amount_to_be_cropped_h // 2
+    amount_to_be_cropped_w = (width - crop_width)
+    crop_left = amount_to_be_cropped_w // 2
+    return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])  # size -1: all remaining channels
+
+def _aspect_preserving_resize(image, resize_min):
+    """Resize image so that its smaller side becomes resize_min, keeping the original aspect ratio.
+    """
+    shape = tf.shape(image)
+    height, width = shape[0], shape[1]
+    new_height, new_width = _smallest_size_at_least(height, width, resize_min)
+    return _resize_image(image, new_height, new_width)
+
+def _smallest_size_at_least(height, width, resize_min):
+    """Return int32 (new_height, new_width) scaled by resize_min / min(height, width)."""
+    resize_min = tf.cast(resize_min, tf.float32)
+    height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)  # float math for the ratio below
+    smaller_dim = tf.minimum(height, width)
+    scale_ratio = resize_min / smaller_dim
+    # Convert back to ints to make heights and widths that TF ops will accept.
+    new_height = tf.cast(height * scale_ratio, tf.int32)
+    new_width = tf.cast(width * scale_ratio, tf.int32)
+    return new_height, new_width
+
+def _resize_image(image, height, width):  # bilinear resize to exactly (height, width)
+    return tf.image.resize(image, [height, width], method=tf.image.ResizeMethod.BILINEAR, preserve_aspect_ratio=False)
+
+
+
+### Pytorch image pre-process function
+def _torch_imagenet_preprocess(image_path):
+    """Load image_path as RGB and return it resized, centre-cropped to 224 and normalized as a tensor."""
+    img = Image.open(image_path).convert('RGB')
+    _PYTORCH_IMAGENET_PREPROCESS = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
+                                    0.229, 0.224, 0.225]),
+        ]
+    )
+    img = _PYTORCH_IMAGENET_PREPROCESS(img)
+    return img
+
+
+### Tensorflow image pre-process function
+def _tf_imagenet_preprocess(image_path):
+    """Load image_path as RGB and return a 224x224 HWC tensor with per-channel means subtracted."""
+    img = Image.open(image_path).convert('RGB')
+    _TF_IMAGENET_PREPROCESS = transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+        ]
+    )
+    img = _TF_IMAGENET_PREPROCESS(img)
+    img *= 255.0  # rescale by 255 so the values match the scale of the means below
+    assert len(img.shape) == 3
+    img = transforms.Normalize(mean=[123.68, 116.78, 103.94], std=[1, 1, 1])(img)  # std of 1: mean subtraction only
+    img = img.permute((1, 2, 0)) # CHW -> HWC
+
+    return img
+
+
+class ImageNetDataset(torch.utils.data.Dataset):
+    """ImageNet validation images stored as <image_dir_path>/<sub-directory>/<image file>.
+
+    Labels are looked up by file name in a "<file name> <integer label>" text file
+    (val.txt inside image_dir_path unless label_dir_path is given).
+    """
+    def __init__(self, image_dir_path, label_dir_path="", layout="NHWC", image_size=(224, 224)):
+        super().__init__()
+        self.image_dir_path = image_dir_path
+        self.label_dir_path = label_dir_path
+        self.layout = layout
+        # image_size is only recorded; the preprocess helpers use a fixed 224x224 crop.
+        # A one-element size means a square image: store the number, not the sequence.
+        if len(image_size) == 1:
+            self.image_height = self.image_width = image_size[0]
+        if len(image_size) == 2:
+            self.image_height = image_size[0]
+            self.image_width = image_size[1]
+        assert self.layout in ["NHWC", "NCHW"], f"layout should be NHWC or NCHW, got {self.layout} "
+        self.img_list = os.listdir(self.image_dir_path)
+        self.label_dict = self.get_label_dict()
+
+        # Only files inside sub-directories are collected; files directly in image_dir_path are skipped.
+        self.images = []
+        self.length = 0
+        for image_dir in self.img_list:
+            image_path = os.path.join(self.image_dir_path, image_dir)
+            if os.path.isdir(image_path):
+                for image in os.listdir(image_path):
+                    self.images.append(os.path.join(image_path, image))
+                    self.length += 1
+
+    def __getitem__(self, index):
+        """Return (preprocessed image tensor, integer label); KeyError if the file has no label entry."""
+        ## NHWC pre-process for tensorflow
+        if self.layout == "NHWC":
+            processed_image = _tf_imagenet_preprocess(self.images[index])
+        ## NCHW pre-process for Pytorch
+        elif self.layout == "NCHW":
+            processed_image = _torch_imagenet_preprocess(self.images[index])
+        else:
+            raise ValueError("Unsupported data layout")
+
+        # basename copes with the platform's path separator, unlike splitting on '/'.
+        image_name = os.path.basename(self.images[index]).strip()
+        label = self.label_dict[image_name]
+
+        return processed_image, label
+
+    def __len__(self):
+        return self.length
+
+    def get_label_dict(self):
+        """Parse the label file into {file name: int label}; return {} when the file does not exist."""
+        image_label = {}
+        label_path = os.path.join(self.image_dir_path, 'val.txt')
+        if self.label_dir_path != "":
+            label_path = self.label_dir_path
+        if os.path.exists(label_path):
+            with open(label_path, 'r') as file:
+                lines = file.readlines()
+            for line in lines:
+                fields = line.split()  # any run of whitespace separates name and label
+                if len(fields) >= 2:  # skip blank or malformed lines instead of raising IndexError
+                    image_label[fields[0]] = int(fields[1])
+
+        return image_label
+
+def get_imagenet_dataloader(data_path, batch_size, num_workers, model_framework, input_layout):
+    """Return a DataLoader over ImageNet validation data; an incomplete final batch is dropped."""
+    if model_framework == "tensorflow":
+        val_dir = os.path.join(data_path, "val")  # this branch reads <data_path>/val through ImageNetDataset
+        dataset = ImageNetDataset(val_dir, layout="NHWC")  # input_layout is not consulted on this branch
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers, drop_last=True)
+    else:
+        assert input_layout == "NCHW"
+        val_dir = os.path.join(data_path, 'validation')
+        assert os.path.isdir(val_dir), f"{val_dir} does not exist, please specify correct data path"
+        # Other frameworks read <data_path>/validation through torchvision's ImageFolder.
+        dataset = torchvision.datasets.ImageFolder(
+            val_dir,
+            transforms.Compose(
+                [
+                    transforms.Resize(256, interpolation=InterpolationMode.BILINEAR),
+                    transforms.CenterCrop(224),
+                    transforms.PILToTensor(),
+                    transforms.ConvertImageDtype(torch.float),
+                    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+                ]
+            )
+        )
+
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers, drop_last=True)
+
+    return dataloader
+
+class COCO2017Dataset(torch.utils.data.Dataset):
+    """COCO 2017 detection dataset yielding letterboxed images with normalized xywh labels."""
+    def __init__(self,
+                 image_dir_path,
+                 label_json_path,
+                 image_size=640,
+                 pad_color=114,
+                 val_mode=True,
+                 input_layout="NCHW"):
+        self.image_dir_path = image_dir_path
+        self.label_json_path = label_json_path
+        self.image_size = image_size
+        self.pad_color = pad_color
+        self.val_mode = val_mode
+        self.input_layout = input_layout
+
+        self.coco = COCO(annotation_file=self.label_json_path)
+        if self.val_mode:
+            self.img_ids = list(sorted(self.coco.imgs.keys()))  # 5000
+        else:  # train mode need images with labels
+            self.img_ids = sorted(list(self.coco.imgToAnns.keys()))  # 4952
+
+    def __len__(self):
+        return len(self.img_ids)
+
+    def __getitem__(self, index):
+        """Return (image scaled to [0, 1], (n, 6) labels, image path, shapes) for one sample."""
+        img_path = self._get_image_path(index)
+        img, (h0, w0), (h, w) = self._load_image(index)
+        img, ratio, pad = letterbox(img,
+                                    self.image_size,
+                                    color=(self.pad_color, self.pad_color, self.pad_color))
+        shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling
+
+        # load label, reusing the size read above so the image is not decoded a second time
+        raw_label = self._load_json_label(index, orig_hw=(h0, w0))
+        # normalized xywh to pixel xyxy format
+        raw_label[:, 1:] = xywhn2xyxy(raw_label[:, 1:],
+                                      ratio[0] * w,
+                                      ratio[1] * h,
+                                      padw=pad[0],
+                                      padh=pad[1])
+
+        raw_label[:, 1:] = xyxy2xywhn(raw_label[:, 1:],
+                                      w=img.shape[1],
+                                      h=img.shape[0],
+                                      clip=True,
+                                      eps=1E-3)
+
+        nl = len(raw_label)  # number of labels
+        labels_out = np.zeros((nl, 6))  # column 0 stays 0 here; collate_fn writes the batch index into it
+        labels_out[:, 1:] = raw_label
+
+        # Convert
+        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+        img = np.ascontiguousarray(img) / 255.0  # 0~1 np array
+        if self.input_layout == "NHWC":
+            img = img.transpose((1, 2, 0))
+
+        return img, labels_out, img_path, shapes
+
+    def _get_image_path(self, index):
+        idx = self.img_ids[index]
+        path = self.coco.loadImgs(idx)[0]["file_name"]
+        img_path = os.path.join(self.image_dir_path, path)
+        return img_path
+
+    def _load_image(self, index):
+        """Read the image, resize so its longer side equals image_size; FileNotFoundError if unreadable."""
+        img_path = self._get_image_path(index)
+        im = cv2.imread(img_path)  # BGR
+        if im is None:  # cv2.imread reports a missing or unreadable file by returning None
+            raise FileNotFoundError(f"file {img_path} not found")
+        h0, w0 = im.shape[:2]  # orig hw
+        r = self.image_size / max(h0, w0)  # ratio
+        if r != 1:  # if sizes are not equal
+            im = cv2.resize(im, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_LINEAR)
+        return im.astype("float32"), (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
+
+    def _load_json_label(self, index, orig_hw=None):
+        """Return (n, 5) rows of (coco80 class, normalized xywh); orig_hw=(h, w) avoids re-reading the image."""
+        h0, w0 = orig_hw if orig_hw is not None else self._load_image(index)[1]
+        idx = self.img_ids[index]
+        ann_ids = self.coco.getAnnIds(imgIds=idx)
+        targets = self.coco.loadAnns(ids=ann_ids)
+
+        labels = []
+        for target in targets:
+            cat = target["category_id"]
+            coco80_cat = coco91_to_coco80_dict[cat]
+            cat = np.array([[coco80_cat]])
+            x, y, w, h = target["bbox"]
+            x1, y1, x2, y2 = x, y, int(x + w), int(y + h)
+            xyxy = np.array([[x1, y1, x2, y2]])
+            xywhn = xyxy2xywhn(xyxy, w0, h0)
+            labels.append(np.hstack((cat, xywhn)))
+
+        if labels:
+            labels = np.vstack(labels)
+        else:
+            if self.val_mode:
+                # for some image without label: a single all-zero placeholder row
+                labels = np.zeros((1, 5))
+            else:
+                raise ValueError(f"image id {idx} has no labels; set val_mode = True to use images without labels")
+
+        return labels
+
+    @staticmethod
+    def collate_fn(batch):
+        im, label, path, shapes = zip(*batch)
+        for i, lb in enumerate(label):
+            lb[:, 0] = i
+        return np.concatenate([i[None] for i in im], axis=0), np.concatenate(label, 0), path, shapes
+
+# Datasets just for Yolox
+class COCO2017DatasetForYolox(COCO2017Dataset):
+    """COCO 2017 images resized and padded with 114 for YOLOX; no labels are returned."""
+    def __getitem__(self, index):
+        """Return (padded float32 image, image path, original (height, width))."""
+        img_path = self._get_image_path(index)
+        img = self._load_image(img_path)
+        img, r = self.preproc(img, input_size=self.image_size)  # r is the original (h, w) tuple, not a ratio
+        return img, img_path, r
+
+    def _load_image(self, img_path):  # takes a path here, unlike the base class which takes an index
+        img = cv2.imread(img_path)
+        assert img is not None, f"file {img_path} not found"
+
+        return img
+
+    def preproc(self, img, input_size, swap=(2, 0, 1)):
+        if len(img.shape) == 3:
+            padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+        else:
+            padded_img = np.ones(input_size, dtype=np.uint8) * 114
+        # Resize by the smaller ratio so the image fits, then paste it into the top-left of the 114-filled canvas.
+        org_img = (img.shape[0], img.shape[1])
+        r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+        resized_img = cv2.resize(
+            img,
+            (int(img.shape[1] * r), int(img.shape[0] * r)),
+            interpolation=cv2.INTER_LINEAR,
+        ).astype(np.uint8)
+        padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+        padded_img = padded_img.transpose(swap)
+        padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+        return padded_img, org_img
+
+    @staticmethod
+    def collate_fn(batch):
+        im, img_path, r = zip(*batch)
+        return np.concatenate([i[None] for i in im], axis=0), img_path, r
+
+# Datasets just for Yolov4
+class COCO2017DatasetForYolov4(COCO2017DatasetForYolox):
+    def preproc(self, img, input_size, swap=(2, 0, 1)):  # plain resize to input_size, no padding
+        org_img = (img.shape[0], img.shape[1])  # original (height, width), returned alongside the image
+        img_ = cv2.resize(img, (input_size[0], input_size[1]))  # aspect ratio is not preserved
+        img_ = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
+        img_ = img_.transpose(swap) / 255.0  # reorder axes with swap and scale to [0, 1]
+        img_ = np.ascontiguousarray(img_, dtype=np.float32)
+        return img_, org_img
+    
+def get_coco2017_dataloader(data_path, label_path, batch_size, image_size, num_workers, model_framework, input_layout, custom_option=None):
+    """Return a DataLoader (last batch kept) over COCO 2017, using the dataset class chosen by custom_option."""
+    # TODO(chen.chen)
+    # we only support pytorch-like coco2017 data preprocess
+    # some problems may occur when the data preprocess is different, e.g. tensorflow
+    assert model_framework != "tensorflow"
+    if custom_option == 'yolox':
+        dataset = COCO2017DatasetForYolox(data_path, label_path, image_size=(image_size, image_size), input_layout=input_layout)
+    elif custom_option == 'yolov4':
+        dataset = COCO2017DatasetForYolov4(data_path, label_path, image_size=(image_size, image_size), input_layout=input_layout)
+    else:
+        dataset = COCO2017Dataset(data_path, label_path, image_size, input_layout=input_layout)
+    # NOTE(chen.chen)
+    # we should validate all images in the datasets to use pycocotools
+    # so we do not drop last batch which maybe smaller than a normal batch
+    # you should pad the batch dimension in the outside
+    dataloader = torch.utils.data.DataLoader(dataset,
+                                            batch_size=batch_size,
+                                            drop_last=False,
+                                            num_workers=num_workers,
+                                            collate_fn=dataset.collate_fn)
+
+    return dataloader
+
+
+class FakeDataSet(torch.utils.data.Dataset):
+    """Synthetic dataset producing random arrays that match the configured input shapes and dtypes."""
+    def __init__(self, input_name_list, input_shape_list, input_dtype_list):
+        self.input_name_list = input_name_list
+        self.input_shape_list = input_shape_list
+        self.input_dtype_list = input_dtype_list
+        self.max_length = 100000  # fixed nominal length reported by __len__
+
+    def __len__(self):
+        return self.max_length
+
+    def __getitem__(self, _):
+        """Return one random sample per input; the first entry of each shape is dropped (shape[1:])."""
+        input_data = []
+        for shape, dtype in zip(self.input_shape_list, self.input_dtype_list):
+            if dtype.startswith("float"):
+                data = np.random.randn(*shape[1:]).astype(dtype)
+            elif dtype.startswith("int"):
+                data = np.random.randint(0, 10, shape[1:]).astype(dtype)  # integers in [0, 10)
+            else:
+                raise ValueError(f"unsupported dtype: {dtype}")
+            input_data.append(data)
+
+        return tuple(input_data)
+
+    @staticmethod
+    def collate_fn(batch):
+        """Stack the samples of each input along a new leading axis and return them as a tuple."""
+        batch_input_data = []
+        for i in zip(*batch):
+            data = np.concatenate([j[np.newaxis,:] for j in i], axis=0)
+            batch_input_data.append(data)
+        return tuple(batch_input_data)
+
+class NumpyDataSet(torch.utils.data.Dataset):
+    """Dataset backed by a .npy file (one array per input) or a raw .data dump of packed batches."""
+    def __init__(self, input_name_list, input_shape_list, input_dtype_list, path):
+        self.input_name_list = input_name_list
+        self.input_shape_list = input_shape_list
+        self.input_dtype_list = input_dtype_list
+        self.path = path
+
+        self.ext = os.path.splitext(self.path)[-1]
+        assert self.ext.endswith(".npy") or self.ext.endswith(".data")
+
+        # Accepted dtype spellings, including the short "fp32"/"fp16" aliases numpy does not know.
+        self.dtype_size_map = {
+            "fp32": np.dtype("float32"),
+            "float32": np.dtype("float32"),
+            "fp16": np.dtype("float16"),
+            "float16": np.dtype("float16"),
+            "int8": np.dtype("int8")
+        }
+
+        self._process_numpy_data()
+
+    def _process_numpy_data(self):
+        """Load self.path into self.data (one array per input, samples on axis 0) and set self.length."""
+        if self.ext.endswith(".npy"):
+            self.total_data_number = len(self.input_name_list)
+            # NOTE: allow_pickle=True can execute code stored in the file; only load trusted .npy files.
+            self.data = np.load(self.path, allow_pickle=True)
+            assert len(self.data) == self.total_data_number, f"np data length should be {self.total_data_number}, got {len(self.data)}"
+            self.length = self.data[0].shape[0]
+
+        elif self.ext.endswith(".data"):
+            with open(self.path, mode='rb') as f:
+                calibrate_data = f.read()
+
+            total_bytes = 0
+            input_size_list = []
+            for shape, dtype in zip(self.input_shape_list, self.input_dtype_list):
+                size = int(np.prod(shape)) * self.dtype_size_map[dtype].itemsize
+                input_size_list.append(size)
+                total_bytes += size
+
+            assert (len(calibrate_data) % total_bytes == 0), f"calibrate_data size({len(calibrate_data)}) don't match one batch size({total_bytes}) multiple."
+
+            index = 0
+            npy_data_dict = defaultdict(list)
+            while index < len(calibrate_data):
+                for name, shape, dtype, size in zip(self.input_name_list, self.input_shape_list, self.input_dtype_list, input_size_list):
+                    # Resolve the dtype through the map so aliases such as "fp32" decode correctly.
+                    np_dtype = self.dtype_size_map[dtype]
+                    data = np.frombuffer(calibrate_data[index: index + size], dtype=np_dtype).reshape(shape)
+                    npy_data_dict[name].append(data)
+                    index += size
+
+            self.data = [np.concatenate(v, axis=0) for v in npy_data_dict.values()]
+            self.length = self.data[0].shape[0]
+        else:
+            raise ValueError(f"unsupported data file extension: {self.ext}")
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, index):
+        """Return the index-th sample of every input as a tuple."""
+        return tuple(i[index] for i in self.data)
+
+    @staticmethod
+    def collate_fn(batch):
+        """Stack the samples of each input along a new leading axis and return them as a tuple."""
+        batch_input_data = []
+        for i in zip(*batch):
+            data = np.concatenate([j[np.newaxis,:] for j in i], axis=0)
+            batch_input_data.append(data)
+        return tuple(batch_input_data)
+
+def download_builtin_data():
+    if not os.path.exists(_builtin_data_path):
+        if not os.path.exists(_igie_cache_dir):
+            os.makedirs(_igie_cache_dir)
+
+        pwd = os.getcwd()
+        os.chdir(_igie_cache_dir)
+        
+        cmd = f"wget {_bulitin_data_url}"
+        os.system(cmd)
+
+        cmd = f"tar -xzf ci_data.tar.gz"
+        os.system(cmd)
+                
+        os.chdir(pwd)
+    
+    if os.path.exists(_builtin_data_path) and not os.path.exists(_symbolic_link_data_path):
+        cmd = f"ln -s {_builtin_data_path} {_symbolic_link_data_path}"
+        os.system(cmd)
+        
+    print(f"Use builtin dataset path: {_builtin_data_path}")
+        
+
+def get_dataloader_from_args(args):
+    ## use built-in dataset
+    if args.use_builtin_data:
+        download_builtin_data()
+ 
+        if args.use_imagenet:
+            args.data_path = os.path.join(_builtin_data_path, "datasets", "imagenet")
+            
+            return get_imagenet_dataloader(args.data_path, args.batch_size, args.num_workers, args.model_framework, args.input_layout)
+            
+        elif args.use_coco2017:
+            args.data_path = os.path.join(_builtin_data_path, "datasets", "coco", "images", "val2017")
+            args.label_path = os.path.join(_builtin_data_path, "datasets", "coco", "annotations", "instances_val2017.json")
+
+            input_shape = args.input_shape_list[0]            
+            assert len(input_shape) == 4, f"input should be a 4d tensor, format as NCHW or NHWC, got {len(input_shape)}"
+            if args.input_layout == "NCHW":
+                assert input_shape[2] == input_shape[3], f"HW should be the same, got {input_shape[2]} and {input_shape[3]}"
+                args.image_size = input_shape[2]
+            else: #NHWC
+                assert input_shape[1] == input_shape[2], f"HW should be the same, got {input_shape[1]} and {input_shape[2]}"
+                args.image_size = input_shape[1]
+
+            # use custom option do preprocessing
+            if args.custom_option is not None  and 'process' in args.custom_option:
+                return get_coco2017_dataloader(args.data_path, args.label_path, args.batch_size, args.image_size, args.num_workers, args.model_framework, args.input_layout, args.custom_option['process'])
+            else:   
+                return get_coco2017_dataloader(args.data_path, args.label_path, args.batch_size, args.image_size, args.num_workers, args.model_framework, args.input_layout)
+            
+    
+    elif args.calibration_file_path is not None:
+        ## NOTE(chen.chen)
+        ## user-provided dataset, just use it as calibration data
+        ## we support two format .npy and .data
+        
+        ## if extetion is .npy, it should be a single npy file,
+        ## each input should be saved in a np.ndarray which has beed preprocessed
+        ## e.g. for two inputs model
+        ## the npy should be a list of two array, the shape of each array is like below
+        ## ((100, 3, 224, 224), (100, 1000))
+        
+        ## if extension is .data, we will call np.frombuffer to load the data
+        ## this is for paddle-igie compatibility and only support single input now
+        
+        
+        calibration_file_path = args.calibration_file_path
+        assert os.path.exists(calibration_file_path), f"can not find calibration file:{calibration_file_path}"
+        ext = os.path.splitext(calibration_file_path)[-1]
+        
+        assert ext in [".npy", ".data"], f"unspported calibration file format {ext}, it should be .npy or .data"
+        
+        dataset = NumpyDataSet(args.input_name_list, args.input_shape_list, args.input_dtype_list, calibration_file_path)
+        
+        dataloader = torch.utils.data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, drop_last=True, collate_fn=dataset.collate_fn)
+          
+        return dataloader
+    
+    else:
+        ## NOTE(chen.chen)
+        ## use fake data for calibration, just used for perf test
+        ## here we should know the shape/dtype info of the input to generate the fake input data
+        dataset = FakeDataSet(args.input_name_list, args.input_shape_list, args.input_dtype_list)
+        dataloader = torch.utils.data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, drop_last=True, collate_fn=dataset.collate_fn)
+
+        return dataloader
+    
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py
new file mode 100755
index 000000000..8229884b1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py
@@ -0,0 +1,9 @@
+# import torch first to make jit op work without `ImportError of libc10.so`
+import torch
+
+from .jit_ops import FastCOCOEvalOp, JitOp
+
+try:
+    from .fast_coco_eval_api import COCOeval_opt
+except ImportError:  #  exception will be raised when users build yolox from source
+    pass
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp
new file mode 100755
index 000000000..2e63bc995
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp
@@ -0,0 +1,502 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "cocoeval.h"
+#include <time.h>
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+
+using namespace pybind11::literals;
+
+namespace COCOeval {
+
+// Fill *detection_sorted_indices with the indices of detection_instances
+// ordered from highest score to lowest, such that
+// detection_instances[detection_sorted_indices[t]].score >=
+// detection_instances[detection_sorted_indices[t+1]].score.  A stable sort is
+// used to match the original COCO API
+void SortInstancesByDetectionScore(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    std::vector<uint64_t>* detection_sorted_indices) {
+  std::vector<uint64_t>& order = *detection_sorted_indices;
+  order.resize(detection_instances.size());
+  std::iota(order.begin(), order.end(), 0);
+  // true when the detection at index lhs must come before the one at rhs
+  const auto by_score_desc = [&detection_instances](size_t lhs, size_t rhs) {
+    return detection_instances[lhs].score > detection_instances[rhs].score;
+  };
+  std::stable_sort(order.begin(), order.end(), by_score_desc);
+}
+
+// Partition the ground truth objects based on whether or not to ignore them.
+// (*ignores)[j] is set when instance j is flagged ignore or its area is outside
+// area_range; *ground_truth_sorted_indices lists the indices of the instances
+// that are not ignored first (the order inside each group is kept)
+void SortInstancesByIgnore(
+    const std::array<double, 2>& area_range,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    std::vector<uint64_t>* ground_truth_sorted_indices,
+    std::vector<bool>* ignores) {
+  ignores->clear();
+  ignores->reserve(ground_truth_instances.size());
+  // iterate by const reference; the annotations do not need to be copied
+  for (const auto& o : ground_truth_instances) {
+    ignores->push_back(
+        o.ignore || o.area < area_range[0] || o.area > area_range[1]);
+  }
+
+  std::vector<uint64_t>& order = *ground_truth_sorted_indices;
+  order.resize(ground_truth_instances.size());
+  std::iota(order.begin(), order.end(), 0);
+  // false sorts before true, so the instances that are not ignored come first
+  std::stable_sort(
+      order.begin(), order.end(), [&ignores](size_t j1, size_t j2) {
+        return (int)(*ignores)[j1] < (int)(*ignores)[j2];
+      });
+}
+
+// For each IOU threshold, greedily match each detected instance to a ground
+// truth instance (if possible) and store the results into *results
+void MatchDetectionsToGroundTruth(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    const std::vector<uint64_t>& ground_truth_sorted_indices,
+    const std::vector<bool>& ignores,
+    const std::vector<std::vector<double>>& ious,
+    const std::vector<double>& iou_thresholds,
+    const std::array<double, 2>& area_range,
+    ImageEvaluation* results) {
+  // Initialize memory to store return data matches and ignore
+  const int num_iou_thresholds = iou_thresholds.size();
+  const int num_ground_truth = ground_truth_sorted_indices.size();
+  const int num_detections = detection_sorted_indices.size();
+  std::vector<uint64_t> ground_truth_matches(
+      num_iou_thresholds * num_ground_truth, 0); // 0: not matched yet
+  std::vector<uint64_t>& detection_matches = results->detection_matches;
+  std::vector<bool>& detection_ignores = results->detection_ignores;
+  std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
+  detection_matches.resize(num_iou_thresholds * num_detections, 0);
+  detection_ignores.resize(num_iou_thresholds * num_detections, false);
+  ground_truth_ignores.resize(num_ground_truth);
+  for (auto g = 0; g < num_ground_truth; ++g) {
+    ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
+  }
+
+  for (auto t = 0; t < num_iou_thresholds; ++t) {
+    for (auto d = 0; d < num_detections; ++d) {
+      // information about best match so far (match=-1 -> unmatched)
+      double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
+      int match = -1;
+      for (auto g = 0; g < num_ground_truth; ++g) {
+        // if this ground truth instance is already matched and not a
+        // crowd, it cannot be matched to another detection
+        if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
+            !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
+          continue;
+        }
+
+        // if detected instance matched to a regular ground truth
+        // instance, we can break on the first ground truth instance
+        // tagged as ignore (because they are sorted by the ignore tag)
+        if (match >= 0 && !ground_truth_ignores[match] &&
+            ground_truth_ignores[g]) {
+          break;
+        }
+
+        // if IOU overlap is the best so far, store the match appropriately
+        if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
+          best_iou = ious[d][ground_truth_sorted_indices[g]];
+          match = g;
+        }
+      }
+      // if match was made, store id of match for both detection and
+      // ground truth
+      if (match >= 0) {
+        detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
+        detection_matches[t * num_detections + d] =
+            ground_truth_instances[ground_truth_sorted_indices[match]].id;
+        ground_truth_matches[t * num_ground_truth + match] =
+            detection_instances[detection_sorted_indices[d]].id;
+      }
+
+      // set unmatched detections outside of area range to ignore
+      const InstanceAnnotation& detection =
+          detection_instances[detection_sorted_indices[d]];
+      detection_ignores[t * num_detections + d] =
+          detection_ignores[t * num_detections + d] ||
+          (detection_matches[t * num_detections + d] == 0 &&
+           (detection.area < area_range[0] || detection.area > area_range[1]));
+    }
+  }
+
+  // store detection score results
+  results->detection_scores.resize(detection_sorted_indices.size());
+  for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
+    results->detection_scores[d] =
+        detection_instances[detection_sorted_indices[d]].score;
+  }
+}
+
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges,
+    int max_detections,
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances) {
+  const int num_area_ranges = area_ranges.size();
+  const int num_images = image_category_ground_truth_instances.size();
+  const int num_categories =
+      image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
+  std::vector<uint64_t> detection_sorted_indices;
+  std::vector<uint64_t> ground_truth_sorted_indices;
+  std::vector<bool> ignores; // refilled for every area range
+  std::vector<ImageEvaluation> results_all(
+      num_images * num_area_ranges * num_categories);
+
+  // Store results for each image, category, and area range combination. Results
+  // for each IOU threshold are packed into the same ImageEvaluation object
+  for (auto i = 0; i < num_images; ++i) {
+    for (auto c = 0; c < num_categories; ++c) {
+      const std::vector<InstanceAnnotation>& ground_truth_instances =
+          image_category_ground_truth_instances[i][c];
+      const std::vector<InstanceAnnotation>& detection_instances =
+          image_category_detection_instances[i][c];
+
+      SortInstancesByDetectionScore(
+          detection_instances, &detection_sorted_indices);
+      if ((int)detection_sorted_indices.size() > max_detections) {
+        detection_sorted_indices.resize(max_detections); // keep top scores
+      }
+
+      for (size_t a = 0; a < area_ranges.size(); ++a) {
+        SortInstancesByIgnore(
+            area_ranges[a],
+            ground_truth_instances,
+            &ground_truth_sorted_indices,
+            &ignores);
+        // results are laid out category-major, then area range, then image
+        MatchDetectionsToGroundTruth(
+            detection_instances,
+            detection_sorted_indices,
+            ground_truth_instances,
+            ground_truth_sorted_indices,
+            ignores,
+            image_category_ious[i][c],
+            iou_thresholds,
+            area_ranges[a],
+            &results_all
+                [c * num_area_ranges * num_images + a * num_images + i]);
+      }
+    }
+  }
+
+  return results_all;
+}
+
+// Convert a python list to a std::vector<T>, casting each element to T
+template <typename T>
+std::vector<T> list_to_vec(const py::list& l) {
+  std::vector<T> converted(py::len(l));
+  for (size_t pos = 0; pos < converted.size(); ++pos) {
+    converted[pos] = l[pos].cast<T>();
+  }
+  return converted;
+}
+
+// Helper function to Accumulate()
+// Considers the evaluation results applicable to a particular category, area
+// range, and max_detections parameter setting, which begin at
+// evaluations[evaluation_index].  Extracts a sorted list of length n of all
+// applicable detection instances concatenated across all images in the dataset,
+// which are represented by the outputs evaluation_indices, detection_scores,
+// image_detection_indices, and detection_sorted_indices--all of which are
+// length n. evaluation_indices[i] stores the applicable index into
+// evaluations[] for instance i, which has detection score detection_scores[i],
+// and is the image_detection_indices[i]'th of the list of detections for the
+// image containing i.  detection_sorted_indices[] defines a sorted permutation
+// of the 3 other outputs.  Returns the number of non-ignored ground truths
+int BuildSortedDetectionList(
+    const std::vector<ImageEvaluation>& evaluations,
+    const int64_t evaluation_index,
+    const int64_t num_images,
+    const int max_detections,
+    std::vector<uint64_t>* evaluation_indices,
+    std::vector<double>* detection_scores,
+    std::vector<uint64_t>* detection_sorted_indices,
+    std::vector<uint64_t>* image_detection_indices) {
+  assert(evaluations.size() >= evaluation_index + num_images);
+
+  // Extract a list of object instances of the applicable category, area
+  // range, and max detections requirements such that they can be sorted
+  image_detection_indices->clear();
+  evaluation_indices->clear();
+  detection_scores->clear();
+  image_detection_indices->reserve(num_images * max_detections);
+  evaluation_indices->reserve(num_images * max_detections);
+  detection_scores->reserve(num_images * max_detections);
+  int num_valid_ground_truth = 0;
+  for (auto i = 0; i < num_images; ++i) {
+    const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
+
+    for (int d = 0;
+         d < (int)evaluation.detection_scores.size() && d < max_detections;
+         ++d) { // detected instances
+      evaluation_indices->push_back(evaluation_index + i);
+      image_detection_indices->push_back(d);
+      detection_scores->push_back(evaluation.detection_scores[d]);
+    }
+    for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
+      if (!ground_truth_ignore) {
+        ++num_valid_ground_truth;
+      }
+    }
+  }
+
+  // Sort detections by decreasing score, using stable sort to match
+  // python implementation
+  detection_sorted_indices->resize(detection_scores->size());
+  std::iota(
+      detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
+  std::stable_sort(
+      detection_sorted_indices->begin(),
+      detection_sorted_indices->end(),
+      [&detection_scores](size_t j1, size_t j2) {
+        return (*detection_scores)[j1] > (*detection_scores)[j2];
+      });
+
+  return num_valid_ground_truth;
+}
+
+// Helper function to Accumulate()
+// Compute a precision recall curve given a sorted list of detected instances
+// encoded in evaluations, evaluation_indices, detection_scores,
+// detection_sorted_indices, image_detection_indices (see
+// BuildSortedDetectionList()). Using vectors precisions and recalls
+// as temporary storage, output the results into precisions_out, recalls_out,
+// and scores_out, which are large buffers containing many precision/recall
+// curves for all possible parameter settings, with precisions_out_index and
+// recalls_out_index defining the applicable indices to store results.
+void ComputePrecisionRecallCurve(
+    const int64_t precisions_out_index,
+    const int64_t precisions_out_stride,
+    const int64_t recalls_out_index,
+    const std::vector<double>& recall_thresholds,
+    const int iou_threshold_index,
+    const int num_iou_thresholds,
+    const int num_valid_ground_truth,
+    const std::vector<ImageEvaluation>& evaluations,
+    const std::vector<uint64_t>& evaluation_indices,
+    const std::vector<double>& detection_scores,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<uint64_t>& image_detection_indices,
+    std::vector<double>* precisions,
+    std::vector<double>* recalls,
+    std::vector<double>* precisions_out,
+    std::vector<double>* scores_out,
+    std::vector<double>* recalls_out) {
+  assert(recalls_out->size() > recalls_out_index);
+
+  // Compute precision/recall for each instance in the sorted list of detections
+  int64_t true_positives_sum = 0, false_positives_sum = 0;
+  precisions->clear();
+  recalls->clear();
+  precisions->reserve(detection_sorted_indices.size());
+  recalls->reserve(detection_sorted_indices.size());
+  assert(!evaluations.empty() || detection_sorted_indices.empty());
+  for (auto detection_sorted_index : detection_sorted_indices) {
+    const ImageEvaluation& evaluation =
+        evaluations[evaluation_indices[detection_sorted_index]];
+    const auto num_detections =
+        evaluation.detection_matches.size() / num_iou_thresholds;
+    const auto detection_index = iou_threshold_index * num_detections +
+        image_detection_indices[detection_sorted_index];
+    assert(evaluation.detection_matches.size() > detection_index);
+    assert(evaluation.detection_ignores.size() > detection_index);
+    const int64_t detection_match =
+        evaluation.detection_matches[detection_index];
+    const bool detection_ignores =
+        evaluation.detection_ignores[detection_index];
+    const auto true_positive = detection_match > 0 && !detection_ignores;
+    const auto false_positive = detection_match == 0 && !detection_ignores;
+    if (true_positive) {
+      ++true_positives_sum;
+    }
+    if (false_positive) {
+      ++false_positives_sum;
+    }
+
+    const double recall =
+        static_cast<double>(true_positives_sum) / num_valid_ground_truth;
+    recalls->push_back(recall);
+    const int64_t num_valid_detections =
+        true_positives_sum + false_positives_sum;
+    const double precision = num_valid_detections > 0
+        ? static_cast<double>(true_positives_sum) / num_valid_detections
+        : 0.0;
+    precisions->push_back(precision);
+  }
+
+  (*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
+  // make the precision curve non-increasing: p[i-1] = max(p[i-1], p[i])
+  for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
+    if ((*precisions)[i] > (*precisions)[i - 1]) {
+      (*precisions)[i - 1] = (*precisions)[i];
+    }
+  }
+
+  // Sample the per instance precision/recall list at each recall threshold
+  for (size_t r = 0; r < recall_thresholds.size(); ++r) {
+    // first index in recalls >= recall_thresholds[r]
+    std::vector<double>::iterator low = std::lower_bound(
+        recalls->begin(), recalls->end(), recall_thresholds[r]);
+    size_t precisions_index = low - recalls->begin();
+
+    const auto results_ind = precisions_out_index + r * precisions_out_stride;
+    assert(results_ind < precisions_out->size());
+    assert(results_ind < scores_out->size());
+    if (precisions_index < precisions->size()) {
+      (*precisions_out)[results_ind] = (*precisions)[precisions_index];
+      (*scores_out)[results_ind] =
+          detection_scores[detection_sorted_indices[precisions_index]];
+    } else {
+      (*precisions_out)[results_ind] = 0;
+      (*scores_out)[results_ind] = 0;
+    }
+  }
+}
+// Accumulate the per-image evaluations into precision/recall/score tables for
+// every IOU threshold, recall threshold, category, area range and max-dets
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evaluations) {
+  const std::vector<double> recall_thresholds =
+      list_to_vec<double>(params.attr("recThrs"));
+  const std::vector<int> max_detections =
+      list_to_vec<int>(params.attr("maxDets"));
+  const int num_iou_thresholds = py::len(params.attr("iouThrs"));
+  const int num_recall_thresholds = py::len(params.attr("recThrs"));
+  const int num_categories = params.attr("useCats").cast<int>() == 1
+      ? py::len(params.attr("catIds"))
+      : 1;
+  const int num_area_ranges = py::len(params.attr("areaRng"));
+  const int num_max_detections = py::len(params.attr("maxDets"));
+  const int num_images = py::len(params.attr("imgIds"));
+
+  std::vector<double> precisions_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+  std::vector<double> recalls_out(
+      num_iou_thresholds * num_categories * num_area_ranges *
+          num_max_detections,
+      -1);
+  std::vector<double> scores_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+
+  // Consider the list of all detected instances in the entire dataset in one
+  // large list.  evaluation_indices, detection_scores,
+  // image_detection_indices, and detection_sorted_indices all have the same
+  // length as this list, such that each entry corresponds to one detected
+  // instance
+  std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
+  std::vector<double> detection_scores; // detection scores of each instance
+  std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
+                                                  // instances in the dataset
+  std::vector<uint64_t>
+      image_detection_indices; // indices into the list of detected instances in
+                               // the same image as each instance
+  std::vector<double> precisions, recalls;
+
+  for (auto c = 0; c < num_categories; ++c) {
+    for (auto a = 0; a < num_area_ranges; ++a) {
+      for (auto m = 0; m < num_max_detections; ++m) {
+        // The COCO PythonAPI assumes evaluations[] (the return value of
+        // COCOeval::EvaluateImages() is one long list storing results for each
+        // combination of category, area range, and image id, with categories in
+        // the outermost loop and images in the innermost loop.
+        const int64_t evaluations_index =
+            c * num_area_ranges * num_images + a * num_images;
+        int num_valid_ground_truth = BuildSortedDetectionList(
+            evaluations,
+            evaluations_index,
+            num_images,
+            max_detections[m],
+            &evaluation_indices,
+            &detection_scores,
+            &detection_sorted_indices,
+            &image_detection_indices);
+        if (num_valid_ground_truth == 0) {
+          continue;
+        }
+        for (auto t = 0; t < num_iou_thresholds; ++t) {
+          // recalls_out is a flattened vectors representing a
+          // num_iou_thresholds X num_categories X num_area_ranges X
+          // num_max_detections matrix
+          const int64_t recalls_out_index =
+              t * num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          // precisions_out and scores_out are flattened vectors
+          // representing a num_iou_thresholds X num_recall_thresholds X
+          // num_categories X num_area_ranges X num_max_detections matrix
+          const int64_t precisions_out_stride =
+              num_categories * num_area_ranges * num_max_detections;
+          const int64_t precisions_out_index = t * num_recall_thresholds *
+                  num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          ComputePrecisionRecallCurve(
+              precisions_out_index,
+              precisions_out_stride,
+              recalls_out_index,
+              recall_thresholds,
+              t,
+              num_iou_thresholds,
+              num_valid_ground_truth,
+              evaluations,
+              evaluation_indices,
+              detection_scores,
+              detection_sorted_indices,
+              image_detection_indices,
+              &precisions,
+              &recalls,
+              &precisions_out,
+              &scores_out,
+              &recalls_out);
+        }
+      }
+    }
+  }
+  // local time of this call, formatted into buffer and returned as "date"
+  time_t rawtime;
+  struct tm local_time;
+  std::array<char, 200> buffer;
+  time(&rawtime);
+#ifdef _WIN32
+  localtime_s(&local_time, &rawtime);
+#else
+  localtime_r(&rawtime, &local_time);
+#endif
+  // "%H:%M:%S": the minutes directive had been replaced by an identifier
+  strftime(buffer.data(), buffer.size(), "%Y-%m-%d %H:%M:%S", &local_time);
+  return py::dict(
+      "params"_a = params,
+      "counts"_a = std::vector<int64_t>({num_iou_thresholds,
+                                         num_recall_thresholds,
+                                         num_categories,
+                                         num_area_ranges,
+                                         num_max_detections}),
+      "date"_a = buffer,
+      "precision"_a = precisions_out,
+      "recall"_a = recalls_out,
+      "scores"_a = scores_out);
+}
+
+} // namespace COCOeval
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h
new file mode 100755
index 000000000..dbf5aab4b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h
@@ -0,0 +1,98 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace COCOeval {
+
+// Annotation data for a single object instance in an image
+struct InstanceAnnotation {
+  InstanceAnnotation(
+      uint64_t id,
+      double score,
+      double area,
+      bool is_crowd,
+      bool ignore)
+      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
+  uint64_t id; // written to the match tables; 0 there means "unmatched"
+  double score = 0.; // detections are ranked by this value
+  double area = 0.; // compared against the evaluated area range
+  bool is_crowd = false; // a matched crowd ground truth may be matched again
+  bool ignore = false; // ground truth flagged to be ignored
+};
+
+// Stores intermediate results for evaluating detection results for a single
+// image that has D detected instances and G ground truth instances. This stores
+// matches between detected and ground truth instances
+struct ImageEvaluation {
+  // Per IOU threshold t and detection d (index t * D + d): the id of the
+  // matched ground truth instance, or 0 if unmatched
+  std::vector<uint64_t> detection_matches;
+
+  // The detection score of each of the D detected instances
+  std::vector<double> detection_scores;
+
+  // Marks whether or not each of G instances was ignored from evaluation (e.g.,
+  // because it's outside area_range)
+  std::vector<bool> ground_truth_ignores;
+
+  // Per IOU threshold t and detection d (index t * D + d): whether the
+  // detection was ignored from evaluation (e.g., outside the area range)
+  std::vector<bool> detection_ignores;
+};
+
+template <class T>
+using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
+
+// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg().  For each
+// combination of image, category, area range settings, and IOU thresholds to
+// evaluate, it matches detected instances to ground truth instances and stores
+// the results into a vector of ImageEvaluation results, which will be
+// interpreted by the COCOeval::Accumulate() function to produce precision-recall
+// curves.  The parameters of nested vectors have the following semantics:
+//   image_category_ious[i][c][d][g] is the intersection over union of the d'th
+//     detected instance and g'th ground truth instance of
+//     category category_ids[c] in image image_ids[i]
+//   image_category_ground_truth_instances[i][c] is a vector of ground truth
+//     instances in image image_ids[i] of category category_ids[c]
+//   image_category_detection_instances[i][c] is a vector of detected
+//     instances in image image_ids[i] of category category_ids[c]
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
+    int max_detections,
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances);
+
+// C++ implementation of COCOeval.accumulate(), which generates precision
+// recall curves for each set of category, IOU threshold, detection area range,
+// and max number of detections parameters.  It is assumed that the parameter
+// evaluations is the return value of the function COCOeval::EvaluateImages(),
+// which was called with the same parameter settings params
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evalutations);
+
+} // namespace COCOeval
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // python bindings for the evaluator
+{ // NOTE(review): lives in a header; assumes a single including .cpp - confirm
+    m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+    m.def(
+        "COCOevalEvaluateImages",
+        &COCOeval::EvaluateImages,
+        "COCOeval::EvaluateImages");
+    pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
+        .def(pybind11::init<uint64_t, double, double, bool, bool>());
+    pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
+        .def(pybind11::init<>());
+}
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py
new file mode 100755
index 000000000..374031ab8
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# This file comes from
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import copy
+import time
+
+import numpy as np
+from pycocotools.cocoeval import COCOeval
+
+from .jit_ops import FastCOCOEvalOp
+
+
+class COCOeval_opt(COCOeval):
+    """
+    This is a slightly modified version of the original COCO API, where the functions evaluateImg()
+    and accumulate() are implemented in C++ to speedup evaluation
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        try:  # FastCOCOEvalOp().load() imports the pre-built op or JIT-compiles it
+            self.module = FastCOCOEvalOp().load()
+        except Exception as err:  # still ImportError for callers, now with the real cause chained
+            raise ImportError("failed to load the fast_cocoeval C++ extension") from err
+        
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results in self._evalImgs_cpp, a
+        datastructure that isn't readable from Python but is used by a c++ implementation of
+        accumulate().  Unlike the original COCO PythonAPI, we don't populate the datastructure
+        self.evalImgs because this datastructure is a computational bottleneck.
+        :return: None
+        """
+        tic = time.time()
+
+        print("Running per image evaluation...")
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+            print(
+                "useSegm (deprecated) is not None. Running {} evaluation".format(
+                    p.iouType
+                )
+            )
+        print("Evaluate annotation type *{}*".format(p.iouType))
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType == "segm" or p.iouType == "bbox":
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":  # NOTE(review): any other iouType leaves computeIoU unbound below
+            computeIoU = self.computeOks
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId)
+            for imgId in p.imgIds
+            for catId in catIds
+        }
+
+        maxDet = p.maxDets[-1]
+
+        # <<<< Beginning of code differences with original COCO API
+        def convert_instances_to_cpp(instances, is_det=False):
+            # Convert annotations for a list of instances in an image to a format that's fast
+            # to access in C++
+            instances_cpp = []
+            for instance in instances:
+                instance_cpp = self.module.InstanceAnnotation(
+                    int(instance["id"]),
+                    instance["score"] if is_det else instance.get("score", 0.0),
+                    instance["area"],
+                    bool(instance.get("iscrowd", 0)),
+                    bool(instance.get("ignore", 0)),
+                )
+                instances_cpp.append(instance_cpp)
+            return instances_cpp
+
+        # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
+        ground_truth_instances = [
+            [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
+            for imgId in p.imgIds
+        ]
+        detected_instances = [
+            [
+                convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
+                for catId in p.catIds
+            ]
+            for imgId in p.imgIds
+        ]
+        ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
+
+        if not p.useCats:
+            # For each image, flatten per-category lists into a single list
+            ground_truth_instances = [
+                [[o for c in i for o in c]] for i in ground_truth_instances
+            ]
+            detected_instances = [
+                [[o for c in i for o in c]] for i in detected_instances
+            ]
+
+        # Call C++ implementation of self.evaluateImgs()
+        self._evalImgs_cpp = self.module.COCOevalEvaluateImages(
+            p.areaRng,
+            maxDet,
+            p.iouThrs,
+            ious,
+            ground_truth_instances,
+            detected_instances,
+        )
+        self._evalImgs = None
+
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
+        # >>>> End of code differences with original COCO API
+
+    def accumulate(self):
+        """
+        Accumulate per image evaluation results and store the result in self.eval.  Does not
+        support changing parameter settings from those used by self.evaluate()
+
+        :raises RuntimeError: if evaluate() has not been run on this instance yet
+        """
+        print("Accumulating evaluation results...")
+        tic = time.time()
+        if not hasattr(self, "_evalImgs_cpp"):
+            # Fail fast: continuing would only die on the missing attribute below.
+            raise RuntimeError("Please run evaluate() first")
+
+        self.eval = self.module.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
+        counts = self.eval["counts"]
+
+        # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
+        self.eval["recall"] = np.array(self.eval["recall"]).reshape(counts[:1] + counts[2:])
+
+        # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
+        # num_area_ranges X num_max_detections
+        self.eval["precision"] = np.array(self.eval["precision"]).reshape(counts)
+        self.eval["scores"] = np.array(self.eval["scores"]).reshape(counts)
+        toc = time.time()
+        print(
+            "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py
new file mode 100755
index 000000000..cce3195ff
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py
@@ -0,0 +1,179 @@
+import glob
+import importlib
+import os
+import sys
+import time
+from typing import List
+from torch import distributed as dist
+from contextlib import contextmanager
+
+__all__ = ["JitOp", "FastCOCOEvalOp"]
+
+_LOCAL_PROCESS_GROUP = None
+
+def get_rank() -> int:
+    """Return the global rank of this process, or 0 outside distributed runs."""
+    # Short-circuit keeps is_initialized() from being called when the
+    # distributed package is unavailable.
+    distributed_ready = dist.is_available() and dist.is_initialized()
+    return dist.get_rank() if distributed_ready else 0
+
+def get_local_rank() -> int:
+    """
+    Returns:
+        The rank of the current process within the local (per-machine) process group.
+        Falls back to get_rank() when no local group has been registered,
+        and to 0 when torch.distributed is unavailable or uninitialized.
+    """
+    local_group = _LOCAL_PROCESS_GROUP
+    if local_group is None:
+        return get_rank()
+
+    distributed_ready = dist.is_available() and dist.is_initialized()
+    return dist.get_rank(group=local_group) if distributed_ready else 0
+
+@contextmanager
+def wait_for_the_master(local_rank: int = None):
+    """
+    Hold every non-master process at a barrier while the master runs the body.
+
+    Args:
+        local_rank (int): rank of the calling process. Defaults to None,
+            in which case get_local_rank() is used.
+    """
+    rank = get_local_rank() if local_rank is None else local_rank
+
+    # Non-master ranks block here until the master reaches its own barrier
+    # call after the managed body has finished.
+    if rank > 0:
+        dist.barrier()
+    yield
+    if rank != 0:
+        return
+    # Master: release the waiting ranks, but only when a process group
+    # exists (single-process runs have nothing to synchronize).
+    if dist.is_available() and dist.is_initialized():
+        dist.barrier()
+
+class JitOp:
+    """
+    Just-in-time compilation of ops.
+
+    Some code of `JitOp` is inspired by `deepspeed.op_builder`,
+    check the following link for more details:
+    https://github.com/microsoft/DeepSpeed/blob/master/op_builder/builder.py
+    """
+
+    def __init__(self, name):
+        self.name = name
+
+    def absolute_name(self) -> str:
+        """Get the module name that load() tries to import when the op is pre-installed."""
+        pass  # returns None here; subclasses provide the real name
+
+    def sources(self) -> List:
+        """Get path list of source files of op.
+
+        NOTE: the path should be relative to root of package during building,
+            Otherwise, exception will be raised when building package.
+            However, for runtime building, path will be absolute.
+        """
+        pass  # returns None here; subclasses provide the real list
+
+    def include_dirs(self) -> List:
+        """
+        Get list of include paths, relative to root of package.
+
+        NOTE: the path should be relative to root of package.
+            Otherwise, exception will be raised when building package.
+        """
+        return []
+
+    def define_macros(self) -> List:
+        """Get list of macros to define for op"""
+        return []
+
+    def cxx_args(self) -> List:
+        """Get optional list of compiler flags to forward"""
+        args = ["-O2"] if sys.platform == "win32" else ["-O3", "-std=c++14", "-g", "-Wno-reorder"]
+        return args
+
+    def nvcc_args(self) -> List:
+        """Get optional list of compiler flags to forward to nvcc when building CUDA sources"""
+        args = [
+            "-O3", "--use_fast_math",
+            "-std=c++17" if sys.platform == "win32" else "-std=c++14",
+            "-U__CUDA_NO_HALF_OPERATORS__",
+            "-U__CUDA_NO_HALF_CONVERSIONS__",
+            "-U__CUDA_NO_HALF2_OPERATORS__",
+        ]
+        return args
+
+    def build_op(self):  # describe the op as a torch CppExtension built from the hooks above
+        from torch.utils.cpp_extension import CppExtension
+        return CppExtension(
+            name=self.absolute_name(),
+            sources=self.sources(),
+            include_dirs=self.include_dirs(),
+            define_macros=self.define_macros(),
+            extra_compile_args={
+                "cxx": self.cxx_args(),
+            },
+        )
+
+    def load(self, verbose=False):  # return the op module: import it if possible, else JIT-build it
+        try:
+            # try to import op from pre-installed package
+            return importlib.import_module(self.absolute_name())
+        except Exception:  # op not compiled, jit load
+            with wait_for_the_master():  # to avoid race condition
+                return self.jit_load(verbose)
+
+    def jit_load(self, verbose=False):  # build with torch.utils.cpp_extension.load and return the module
+        from torch.utils.cpp_extension import load
+        try:
+            import ninja  # noqa
+        except ImportError:  # only reported when verbose; the build below is attempted either way
+            if verbose:
+                print(
+                    f"Ninja is not installed, fall back to normal installation for {self.name}."
+                )
+
+        build_tik = time.time()
+        # build op and load
+        op_module = load(
+            name=self.name,
+            sources=self.sources(),
+            extra_cflags=self.cxx_args(),
+            extra_cuda_cflags=self.nvcc_args(),
+            verbose=verbose,
+        )
+        build_duration = time.time() - build_tik
+        if verbose:
+            print(f"Load {self.name} op in {build_duration:.3f}s.")
+        return op_module
+
+    def clear_dynamic_library(self):
+        """Remove dynamic library files generated by JIT compilation."""
+        module = self.load()  # note: this imports (or builds) the op first to learn its file path
+        os.remove(module.__file__)
+
+
+class FastCOCOEvalOp(JitOp):
+    """JitOp for the C++ COCO evaluation sources found under cocoeval/."""
+    def __init__(self, name="fast_cocoeval"):
+        super().__init__(name=name)
+
+    def absolute_name(self):
+        return "fastCoCoeval.{}".format(self.name)
+
+    def sources(self):
+        # Relative lookup first; if it finds nothing, glob next to this file instead.
+        relative_hits = glob.glob(os.path.join("fastCoCoeval", "cocoeval", "*.cpp"))
+        if relative_hits:
+            return relative_hits
+        here = os.path.abspath(os.path.dirname(__file__))
+        return glob.glob(os.path.join(here, "cocoeval", "*.cpp"))
+
+    def include_dirs(self):
+        return [os.path.join("fastCoCoeval", "cocoeval")]
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py
new file mode 100755
index 000000000..5b413788a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py
@@ -0,0 +1,20 @@
+import os
+import json
+
+def download_file(src_url, save_path):  # fetch src_url into save_path unless it already exists
+    if not os.path.exists(save_path):
+        import subprocess  # local import; an argv list keeps src_url away from the shell
+        if subprocess.run(["wget", "-O", f"{save_path}.part", src_url]).returncode == 0:
+            os.replace(f"{save_path}.part", save_path)  # publish only complete downloads
+    assert os.path.exists(save_path)
+    
+
+def load_json(path):
+    """Parse the JSON document stored at ``path`` and return the decoded object."""
+    with open(path, mode="r") as json_file:
+        return json.load(json_file)
+
+
+def save_json(data, path):
+    with open(path, mode="w") as out_file:  # creates or truncates ``path``
+        json.dump(data, out_file, indent=4)  # pretty-printed with 4-space indentation
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py
new file mode 100755
index 000000000..d034bc0a1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py
@@ -0,0 +1,23 @@
+import torch
+import numpy as np
+
+
+def get_topk_accuracy(pred, label):
+    """Count top-1 and top-5 hits of ``pred`` against ``label``.
+
+    NumPy inputs are converted to tensors first. Returns the pair
+    ``(top1_hits, top5_hits)`` as raw counts (not ratios); a top-1 hit
+    is also counted as a top-5 hit.
+    """
+    pred = torch.from_numpy(pred) if isinstance(pred, np.ndarray) else pred
+    label = torch.from_numpy(label) if isinstance(label, np.ndarray) else label
+
+    top1_hits = top5_hits = 0
+    for row in range(len(label)):
+        target, scores = label[row], pred[row].float()
+        if target == torch.topk(scores, 1).indices.data:
+            top1_hits, top5_hits = top1_hits + 1, top5_hits + 1
+        # topk(5) is only evaluated when the best guess missed.
+        elif target in torch.topk(scores, 5).indices.data:
+            top5_hits += 1
+    return top1_hits, top5_hits
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py
new file mode 100755
index 000000000..f0b5e12c5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py
@@ -0,0 +1,113 @@
+import os
+import shutil
+import onnx
+import torch
+import torchvision
+import tensorflow as tf
+try:
+    tf_compat_v1 = tf.compat.v1
+except ImportError:
+    tf_compat_v1 = tf
+
+import tvm
+from tvm import relay
+import tvm.relay.testing.tf as tf_testing
+from .onnx_util import get_batch_size, rewrite_int64_input_to_int32
+from .onnx_rewrite_batch_size import rewrite_batch_size
+from .argument import to_bool
+from tvm.relay.transform.iluvatar import SimplifyGraph
+
+def import_model_to_igie(model_path_or_name, input_dict, model_framework):
+    """Import a model into TVM Relay and return ``(mod, params)``.
+
+    ``model_path_or_name`` is a model file for ``model_framework`` ("onnx",
+    "pytorch", "tensorflow" or "paddle") or, if no such path exists, a
+    torchvision model name. ``input_dict`` maps input names to shapes.
+    Raises ValueError for an unsupported framework or a failed torchvision lookup.
+    """
+    base_name = os.path.splitext(os.path.split(model_path_or_name)[1])[0]
+    cache_hash = f"{base_name}_cache_dir"
+    mod_path = os.path.join(cache_hash, "mod.cache")
+    params_path = os.path.join(cache_hash, "params.cache")
+
+    # find cached mod and params
+    if os.path.exists(cache_hash) and to_bool(os.environ.get("IGIE_USE_CACHE", False)):
+        with open(mod_path, "r") as mod_file:
+            mod = tvm.parser.fromtext(mod_file.read())
+        with open(params_path, "rb") as params_file:
+            params = relay.load_param_dict(params_file.read())
+        # NOTE(review): a cache hit returns without the SimplifyGraph call below — confirm intended
+        return mod, params
+
+    paddle_dir_path = os.path.split(model_path_or_name)[0]
+    if os.path.exists(model_path_or_name) or os.path.exists(paddle_dir_path):
+        if model_framework == "onnx":
+            batch_size = list(input_dict.values())[0][0]
+            model_path = model_path_or_name
+
+            # we don't want to handle multi_input case here,
+            # e.g. input_ids:1000,22 pixel_values:32,3,224,224 attention_mask:1000,22 for clip model
+            if len(input_dict) == 1:
+                batch_size_from_model = get_batch_size(model_path_or_name)
+                if isinstance(batch_size_from_model, int) and batch_size_from_model != batch_size:
+                    model_path = f"{model_path[:-5]}_rewrite_b{batch_size}.onnx"
+                    rewrite_batch_size(model_path_or_name, batch_size, save_model_path=model_path)
+
+            model = onnx.load(model_path)
+            # model = rewrite_int64_input_to_int32(model)
+            mod, params = relay.frontend.from_onnx(model, input_dict, freeze_params=True)
+
+        elif model_framework == "pytorch":
+            scripted_model = torch.jit.load(model_path_or_name).eval()
+            input_infos = [(k, v) for k, v in input_dict.items()]
+            mod, params = relay.frontend.from_pytorch(scripted_model, input_infos=input_infos)
+
+        elif model_framework == "tensorflow":
+            with tf_compat_v1.gfile.GFile(model_path_or_name, "rb") as f:
+                graph_def = tf_compat_v1.GraphDef()
+                graph_def.ParseFromString(f.read())
+                graph_def = tf_testing.ProcessGraphDefParam(graph_def)
+            mod, params = relay.frontend.from_tensorflow(graph_def, shape=input_dict)
+
+        elif model_framework == "paddle":
+            import paddle
+            model = paddle.jit.load(model_path_or_name)
+            mod, params = relay.frontend.from_paddle(model, input_dict)
+        else:
+            raise ValueError(f"framwork {model_framework} is not supported yet")
+
+    else:
+        # In this case we will try to find from torchvision
+        # e.g. model_path_or_name="resnet18"
+        try:
+            import ssl
+            # NOTE(review): disables HTTPS certificate verification process-wide — confirm acceptable
+            ssl._create_default_https_context = ssl._create_unverified_context
+            model = getattr(torchvision.models, model_path_or_name)(pretrained=True).eval()
+        except Exception as err:  # was a bare except, which also trapped KeyboardInterrupt/SystemExit
+            raise ValueError(f"can not find model {model_path_or_name} from torchvision and current working directory") from err
+
+        input_datas = []
+        for shape in input_dict.values():
+            # currently torchvision model should always use float32 input
+            input_datas.append(torch.randn(shape))
+
+        scripted_model = torch.jit.trace(model, tuple(input_datas)).eval()
+        input_infos = [(k, v) for k, v in input_dict.items()]
+        mod, params = relay.frontend.from_pytorch(scripted_model, input_infos=input_infos)
+
+    # save cache
+    if to_bool(os.environ.get("IGIE_USE_CACHE", False)):
+        if os.path.exists(cache_hash):
+            shutil.rmtree(cache_hash)
+        os.makedirs(cache_hash)
+        # mod_path / params_path were computed at the top from the same cache_hash
+        with open(mod_path, "w") as mod_file:
+            mod_file.write(mod.astext())
+        with open(params_path, "wb") as params_file:
+            params_file.write(relay.save_param_dict(params))
+
+    # need SimplifyGraph mod when importing onnx models, especially the model contains Q/DQ node
+    mod = SimplifyGraph(mod, params)
+
+    return mod, params
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py
new file mode 100755
index 000000000..452916efb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py
@@ -0,0 +1,81 @@
+import tvm
+from tvm import relay
+from tvm.relay import Expr
+from tvm.relay.dataflow_pattern import wildcard, is_constant, is_op, DFPatternCallback, rewrite
+from tvm.relay.expr_functor import ExprMutator
+
+#TODO(chen.chen): we should move this class to igie repo
+class MainFunctionParamsRewriter(ExprMutator):
+    def __init__(self, target_input_dict, preprocess_rewriter=None):  # target_input_dict: {input name: [shape] or [shape, dtype]}
+        self.target_input = target_input_dict
+        self.preprocess_rewriter = preprocess_rewriter
+        self.target_input_name_list = list(self.target_input.keys())
+        # The name list is used for the validation in visit_function.
+        super().__init__()
+
+    def visit_function(self, fn):
+        params = [self.visit(i) for i in fn.params]
+        body  = self.visit(fn.body)
+        # Every requested input name must be one of the function's parameters.
+        original_input_name_list = [param.name_hint for param in params]
+        assert len(set(self.target_input_name_list) - set(original_input_name_list)) == 0, f"invalid target_input_name: {set(self.target_input_name_list) - set(original_input_name_list)}"
+        # Rebuild the parameter list; only inputs named in target_input are replaced.
+        new_params = []
+        bind = {}
+        for param in params:
+            old_param = param
+            name = param.name_hint
+            # Default: keep the parameter as it is.
+            new_param = old_param
+            if name in self.target_input:
+                shape = self.target_input[name][0]
+                if len(self.target_input[name]) == 2:
+                    dtype = self.target_input[name][1]
+                else:
+                    dtype = old_param.type_annotation.dtype  # no dtype given: reuse the existing one
+                new_param = relay.var(name_hint=name, shape=shape, dtype=dtype)
+
+            new_params.append(new_param)
+            bind[old_param] = new_param
+        # bind() receives the old -> new parameter mapping built above.
+        new_body = relay.expr.bind(body, bind)
+        # ret_type is passed as None rather than copied from fn.
+        new_function = relay.Function(params=new_params,
+                                      body=new_body,
+                                      ret_type=None,
+                                      type_params=fn.type_params,
+                                      attrs=fn.attrs)
+        return new_function
+
+    def __call__(self, mod):  # optional pattern rewrite first, then the parameter rewrite of mod["main"]
+        if self.preprocess_rewriter:
+            mod["main"] = rewrite(self.preprocess_rewriter, mod["main"])
+        mod["main"] = self.visit(mod["main"])
+        return mod
+    
+    
+# TODO(chen.chen) this function is designed for bert model, but it doesn't work now
+# the reason is that, position_embedding is fixed when mod is generated from onnx
+# e.g. the meta[relay.Constant][51] is fixed as 256
+# even if we rewrite the seq_len to 384, the InferType will fail for %9 = add(%8, meta[relay.Constant][51] /* ty=Tensor[(1, 256, 768), float32] */)
+
+# def @main(%input_ids: Tensor[(8, 256), int64], %attention_mask: Tensor[(8, 256), int64], %token_type_ids: Tensor[(8, 256), int64]) -> (Tensor[(8, 256), float32], Tensor[(8, 256), float32]) {
+#   %0 = less(%input_ids, 0 /* ty=int64 */) /* ty=Tensor[(8, 256), bool] */;
+#   %1 = add(%input_ids, 30522 /* ty=int64 */) /* ty=Tensor[(8, 256), int64] */;
+#   %2 = where(%0, %1, %input_ids) /* ty=Tensor[(8, 256), int64] */;
+#   %3 = less(%token_type_ids, 0 /* ty=int64 */) /* ty=Tensor[(8, 256), bool] */;
+#   %4 = add(%token_type_ids, 2 /* ty=int64 */) /* ty=Tensor[(8, 256), int64] */;
+#   %5 = where(%3, %4, %token_type_ids) /* ty=Tensor[(8, 256), int64] */;
+#   %6 = take(meta[relay.Constant][49] /* ty=Tensor[(30522, 768), float32] */, %2, axis=0) /* ty=Tensor[(8, 256, 768), float32] */;
+#   %7 = take(meta[relay.Constant][50] /* ty=Tensor[(2, 768), float32] */, %5, axis=0) /* ty=Tensor[(8, 256, 768), float32] */;
+#   %8 = add(%6, %7) /* ty=Tensor[(8, 256, 768), float32] */;
+#   %9 = add(%8, meta[relay.Constant][51] /* ty=Tensor[(1, 256, 768), float32] */) /* ty=Tensor[(8, 256, 768), float32] */;
+  
+  
+def modify_seq_len_for_nlp(mod, input_dict, target_seq_len):
+    """Rewrite each listed main-function input to shape (shape[0], target_seq_len)."""
+    # Shape-only entries (one-element lists) make the rewriter keep each input's dtype.
+    resized = {name: [(dims[0], target_seq_len)] for name, dims in input_dict.items()}
+    # InferType runs before the rewrite, presumably so parameter type annotations are populated.
+    typed_mod = relay.transform.InferType()(mod)
+    return MainFunctionParamsRewriter(target_input_dict=resized)(typed_mod)
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py
new file mode 100755
index 000000000..5332febfb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py
@@ -0,0 +1,113 @@
+"""
+rewrite src onnx model and infer shape if possible, currently supports
+
+1. rewrite batch_size, e.g 1x3x640x640 -> 32x3x640x640
+
+Attention:
+1. all inputs/outputs batch size dim will be modified together, which means some NLP/Audio sequence models will introduce problems
+
+
+"""
+import onnx
+from onnx import OperatorSetIdProto
+import onnx.numpy_helper
+
+import onnxoptimizer
+from onnxsim import simplify
+
+from .onnx_util import get_batch_size, rewrite_tensor_batch_size
+
+def rewrite_batch_size(model,
+                       batch_size,
+                       modify_reshape_dim=True,
+                       save_model_path=None):
+    """Rewrite dimension 0 of every graph input/output to ``batch_size``.
+
+    ``model`` is an ONNX model or a path to one. With ``modify_reshape_dim``
+    the first entry of int64 initializers used as a Reshape shape (input 1) or
+    as Resize input 3 is overwritten as well. The model is simplified,
+    shape-inferred and checked, saved to ``save_model_path`` if given, and returned.
+    """
+    if isinstance(model, str):
+        model = onnx.load(model)
+
+    ## there is a issue that when the onnx model comes from tf,
+    ## some shape info is stored as constant node's output instead of initializer
+    passes = [
+        "extract_constant_to_initializer", "eliminate_unused_initializer"
+    ]
+    model = onnxoptimizer.optimize(model, passes)
+
+    # to support qlinear op if the opset_import is not supported
+    # if we have some other domains need to import, add them here
+    ms_opset = OperatorSetIdProto()
+    ms_opset.domain = "com.microsoft"
+    ms_opset.version = 1
+
+    ori_opset_import = model.opset_import
+    if ms_opset not in ori_opset_import:
+        ori_opset_import.append(ms_opset)
+
+    model, check = simplify(model)
+    assert check, "Simplified ONNX model could not be validated"
+
+    graph = model.graph
+    initializer = graph.initializer
+    inputs = graph.input
+    outputs = graph.output
+    nodes = graph.node
+
+    ori_batch_size = get_batch_size(model)
+
+    ## in case that some onnx model inputs contain initializers' shape info, we will remove them to avoid rewriting input failure
+    initializer_names = {i.name for i in initializer}
+    import copy
+    tmp_inputs = copy.deepcopy(inputs)
+    for i in tmp_inputs:
+        if i.name in initializer_names:
+            inputs.remove(i)
+
+    for i in inputs:
+        rewrite_tensor_batch_size(i, batch_size)
+
+    for i in outputs:
+        rewrite_tensor_batch_size(i, batch_size)
+
+    ## we may need to modify reshape initializer if we modify input batchsize
+    ## this code only works when the target shape is fixed, and occurs as a input initializer in the node
+    ## so this may introduce some other problems when the purpose of reshape operations are totally different
+    if modify_reshape_dim:
+        reshape_input = set()
+        for i in nodes:
+            if i.op_type == "Reshape":
+                # Only input 1 is the target shape; input 0 is the data tensor and
+                # must not be rewritten even when it is an int64 initializer.
+                reshape_input.update(i.input[1:2])
+            if i.op_type == "Resize" and len(i.input) == 4:
+                reshape_input.add(i.input[3])
+        for idx, i in enumerate(initializer):
+            if i.name in reshape_input:
+                shape = onnx.numpy_helper.to_array(i).copy()
+                if shape.dtype == "int64" and shape.size:  # an empty shape tensor has no entry 0
+                    shape[0] = batch_size
+                    initializer[idx].CopyFrom(
+                        onnx.numpy_helper.from_array(shape, i.name))
+
+    for i in graph.value_info:
+        if i.type.tensor_type.shape.dim:
+            if i.type.tensor_type.shape.dim[0].dim_value == ori_batch_size:
+                i.type.tensor_type.shape.dim[0].dim_value = batch_size
+
+    model, check = simplify(model)
+    assert check, "Simplified ONNX model could not be validated"
+
+    model = onnx.shape_inference.infer_shapes(model,
+                                              check_type=True,
+                                              strict_mode=True,
+                                              data_prop=True)
+    onnx.checker.check_model(model)
+
+    if save_model_path:
+        onnx.save(model, save_model_path)
+    return model
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py
new file mode 100755
index 000000000..968236472
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py
@@ -0,0 +1,130 @@
+import onnx
+from collections import defaultdict
+
+import onnx
+import os
+
+## FYI
+ONNX_DTYPE = {
+    0: onnx.TensorProto.FLOAT,  # NOTE(review): ONNX defines elem_type 0 as UNDEFINED; FLOAT looks like a fallback — confirm
+    1: onnx.TensorProto.FLOAT,
+    2: onnx.TensorProto.UINT8,
+    3: onnx.TensorProto.INT8,
+    4: onnx.TensorProto.UINT16,
+    5: onnx.TensorProto.INT16,
+    6: onnx.TensorProto.INT32,
+    7: onnx.TensorProto.INT64,
+    8: onnx.TensorProto.STRING,
+    9: onnx.TensorProto.BOOL,
+}
+
+
+def rewrite_tensor_dim(tensor, dim_value_dict):
+    if isinstance(dim_value_dict, list):  # a list means {position: value}
+        dim_value_dict = dict(enumerate(dim_value_dict))
+    dims = tensor.type.tensor_type.shape.dim
+    for axis, new_value in dim_value_dict.items():
+        if not isinstance(new_value, str):
+            dims[axis].dim_value = new_value
+            continue
+        dims[axis].dim_param = "batch"  # NOTE(review): any str value becomes the fixed name "batch" — confirm
+
+
+def rewrite_tensor_batch_size(tensor, batch_size):
+    """Set dimension 0 of ``tensor`` to ``batch_size`` (via rewrite_tensor_dim)."""
+    batch_dim_only = {0: batch_size}
+    rewrite_tensor_dim(tensor, batch_dim_only)
+
+
+def get_tensor_dim(tensor):
+    """Return the dims of ``tensor`` as a list, one entry per axis.
+
+    An axis contributes its ``dim_value`` when that is truthy and its
+    ``dim_param`` otherwise (so a zero ``dim_value`` yields ``dim_param``).
+    """
+    return [
+        axis.dim_value if axis.dim_value else axis.dim_param
+        for axis in tensor.type.tensor_type.shape.dim
+    ]
+
+
+def get_tensor_name(tensor):  # thin accessor for the tensor's ``name`` attribute
+    return getattr(tensor, "name")
+
+
+def nchw_dim_to_nhwc_dim(dim_list):
+    """Reorder a 4-entry [N, C, H, W] dim list into [N, H, W, C]."""
+    assert len(dim_list) == 4
+    return [dim_list[axis] for axis in (0, 2, 3, 1)]
+
+
+def get_input_number(model):
+    """Return how many entries ``model.graph.input`` has (a str is loaded as a path)."""
+    loaded = onnx.load(model) if isinstance(model, str) else model
+    graph_inputs = loaded.graph.input
+    return len(graph_inputs)
+
+def get_batch_size(model):
+    """Return entry 0 of get_tensor_dim() for the first graph input (a str is loaded as a path)."""
+    loaded = onnx.load(model) if isinstance(model, str) else model
+    first_input = loaded.graph.input[0]
+    return get_tensor_dim(first_input)[0]
+
+
+def count_op_type(model):
+    """Count how often each ``op_type`` occurs among ``model.graph.node``.
+
+    A str ``model`` is loaded with ``onnx.load`` first. The result is a
+    ``defaultdict(int)`` keyed by op_type.
+    """
+    graph_model = onnx.load(model) if isinstance(model, str) else model
+    histogram = defaultdict(int)
+    for graph_node in graph_model.graph.node:
+        histogram[graph_node.op_type] = histogram[graph_node.op_type] + 1
+    return histogram
+
+
+def contain_qlinear_opearator(onnx_model):
+    """Tell whether any graph node is a QLinear*/QGemm* operator.
+
+    The op_type prefix is matched case-insensitively; a str argument is
+    loaded with ``onnx.load`` first.
+    """
+    graph_model = onnx.load(onnx_model) if isinstance(onnx_model, str) else onnx_model
+    return any(
+        graph_node.op_type.lower().startswith(("qlinear", "qgemm"))
+        for graph_node in graph_model.graph.node
+    )
+
+
+def get_all_node_name(model, exclude_constant=False, pretty_print=False):
+    """Return the sorted names of the nodes in ``model.graph.node``.
+
+    ``exclude_constant`` drops nodes whose op_type is "Constant";
+    ``pretty_print`` additionally prints the names as a bracketed,
+    double-quoted list with one name per line. A str ``model`` is
+    loaded with ``onnx.load`` first.
+    """
+    graph_model = onnx.load(model) if isinstance(model, str) else model
+    sorted_names = sorted(
+        graph_node.name
+        for graph_node in graph_model.graph.node
+        if not (exclude_constant and graph_node.op_type == "Constant")
+    )
+    if pretty_print:
+        quoted = ",\n".join('"{}"'.format(node_name) for node_name in sorted_names)
+        print("[\n{}\n]".format(quoted))
+    return sorted_names
+
+def rewrite_int64_input_to_int32(model):
+    """Retype every INT64 graph input of ``model`` as INT32, in place.
+
+    The model is validated with ``onnx.checker.check_model`` afterwards
+    and returned. No debug output is produced and no debugger is
+    entered, so the function is safe to call from non-interactive runs.
+    """
+    for graph_input in model.graph.input:
+        if graph_input.type.tensor_type.elem_type == onnx.TensorProto.INT64:  # enum value 7
+            graph_input.type.tensor_type.elem_type = onnx.TensorProto.INT32  # enum value 6
+    onnx.checker.check_model(model)
+    return model
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py
new file mode 100755
index 000000000..2490c6643
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py
@@ -0,0 +1,531 @@
+import os
+import psutil
+from itertools import permutations
+import numpy as np
+
+import tvm
+from tvm import relay
+
+import onnx
+import onnx.helper as onnx_helper
+import onnxoptimizer
+from onnxsim import simplify
+from onnxruntime.quantization import (CalibrationDataReader, QuantFormat,
+                                      quantize_static, QuantType,
+                                      CalibrationMethod)
+
+from .onnx_util import contain_qlinear_opearator, rewrite_tensor_dim
+from .onnx_rewrite_batch_size import rewrite_batch_size
+from .dataloader import get_dataloader_from_args
+
+class Node:
+    """Lightweight view of one graph node plus its producer/consumer links."""
+    def __init__(self, name, op_type, input, output):
+        self.name = name
+        self.op_type = op_type
+        self.input = input
+        self.output = output
+        # Filled in by ``connect``: producers of our inputs / consumers of our outputs.
+        self.from_node = []
+        self.to_node = []
+
+    def __repr__(self) -> str:
+        return f"{self.name} [{self.op_type}], input = {self.input}, output = {self.output}"
+
+    @staticmethod
+    def connect(node_list):
+        for consumer, producer in permutations(node_list, 2):
+            consumer._connect(producer)
+
+    def _connect(self, node):
+        """Record ``node`` as a producer once per output of it that we consume."""
+        if node in self.from_node or node in self.to_node:
+            return
+        consumed = set(self.input)
+        for output in node.output:
+            if output in consumed:
+                node.to_node.append(self)
+                self.from_node.append(node)
+
+class Model:
+    """Preprocessed ONNX model bundled with a linked list of ``Node`` views."""
+
+    @staticmethod
+    def add_ms_opset_domain(model,
+                            ms_opset_domain="com.microsoft",
+                            ms_opset_version=1):
+        """Append the given opset import to ``model`` unless its domain is already listed."""
+        already_imported = any(
+            opset.domain == ms_opset_domain for opset in model.opset_import
+        )
+        if not already_imported:
+            model.opset_import.append(
+                onnx_helper.make_operatorsetid(ms_opset_domain, ms_opset_version)
+            )
+        return model
+
+    @staticmethod
+    def preprocess_onnx(model):
+        """Add the default opset import, run onnxoptimizer passes, then onnxsim.
+
+        Raises ``AssertionError`` when ``simplify`` reports a failed check.
+        """
+        model = Model.add_ms_opset_domain(model)
+
+        skipped_passes = [
+            # NOTE(chen.chen): the following passes cause some error, need to debug
+            "lift_lexical_references",
+            "split_init",
+            "split_predict",
+
+            # we do not want to rename anything
+            "rename_input_output",
+            "set_unique_name_for_nodes"
+        ]
+        enabled_passes = [
+            name for name in onnxoptimizer.get_available_passes()
+            if name not in skipped_passes
+        ]
+        model = onnxoptimizer.optimize(model, enabled_passes)
+
+        model, check = simplify(model)
+        assert check, "Simplified ONNX model could not be validated"
+        return model
+
+    def __init__(self, model):
+        """Load (when given a path) and preprocess ``model``, then link its nodes."""
+        if isinstance(model, str):
+            model = onnx.load(model)
+        self.model = Model.preprocess_onnx(model)
+        self.graph = self.model.graph
+        self.nodes = self.graph.node
+        self.node_list = [Node(n.name, n.op_type, n.input, n.output) for n in self.nodes]
+        Node.connect(self.node_list)
+        
+    
+    
+def _iter_downstream(node):
+    """Yield each node reachable from ``node`` through ``to_node`` links exactly once."""
+    # Tracking visited nodes keeps the walk linear on graphs with re-converging
+    # branches; ids are used so the nodes themselves need not be hashable.
+    seen = set()
+    pending = list(node.to_node)
+    while pending:
+        current = pending.pop()
+        if id(current) in seen:
+            continue
+        seen.add(id(current))
+        pending.extend(current.to_node)
+        yield current
+
+
+def find_detect_node(model):
+    """Return sorted, de-duplicated names of Add/Mul/Concat nodes after the last Convs.
+
+    A "last" Conv is one with no other Conv reachable downstream; every
+    Add/Mul/Concat reachable from such a Conv is reported. ``model`` is a
+    ``Model`` or a path used to build one.
+    """
+    if isinstance(model, str):
+        model = Model(model)
+    assert isinstance(model, Model)
+
+    # find last conv nodes before detect
+    last_conv = [
+        node for node in reversed(model.node_list)
+        if node.op_type == "Conv"
+        and not any(n.op_type == "Conv" for n in _iter_downstream(node))
+    ]
+
+    exclude_detect_node_type = [
+        "Add", "Mul", "Concat",
+        # "Reshape", "Exp", "Power", "Slice", "Split" ## these node will not be quantized actually
+    ]
+    # Collect matching nodes downstream of every last Conv (set => no duplicates).
+    exclude_detect_node_name = {
+        n.name
+        for conv in last_conv
+        for n in _iter_downstream(conv)
+        if n.op_type in exclude_detect_node_type
+    }
+
+    return sorted(exclude_detect_node_name)
+
+
+def find_unsupported_node(model):
+    """Return names of nodes whose op type is in the igie-unsupported list (Softmax, Gemm).
+
+    ``model`` is a ``Model`` or a path used to build one.
+    """
+    if isinstance(model, str):
+        model = Model(model)
+    assert isinstance(model, Model)
+
+    igie_not_supported_node_type = [
+        "Softmax",
+        "Gemm", # igie onnx frontend error for mobilenetv2
+    ]
+    return [
+        candidate.name
+        for candidate in model.node_list
+        if candidate.op_type in igie_not_supported_node_type
+    ]
+
+
+def find_group_conv_node(model):
+    """Return names of Conv nodes carrying a ``group`` attribute other than 1.
+
+    ``model`` is a ``Model`` or a path used to build one.
+    """
+    if isinstance(model, str):
+        model = Model(model)
+    assert isinstance(model, Model)
+
+    grouped = []
+    for conv in (n for n in model.graph.node if n.op_type == "Conv"):
+        grouped.extend(
+            conv.name for attr in conv.attribute
+            if attr.name == "group" and attr.i != 1
+        )
+    return grouped
+
+class BaseDataReader(CalibrationDataReader):
+    """Calibration reader over a re-iterable dataloader with a sample budget.
+
+    Subclasses implement ``get_next``; iteration restarts via ``rewind``.
+    """
+
+    def __init__(self, dataloader, cnt_limit=500):
+        # pytorch-like dataloader
+        self.dataloader = dataloader
+        self.cnt = 0
+        self.cnt_limit = cnt_limit
+        self.rewind()
+
+    def get_next(self):
+        raise NotImplementedError
+
+    def reset_dataloader(self):
+        self.dataloader_iter = iter(self.dataloader)
+
+    def rewind(self):
+        self.reset_dataloader()
+        self.cnt = 0
+
+    def set_dataloader(self, dataloader):
+        self.dataloader = dataloader
+        self.rewind()
+
+    def should_stop(self, memory_upper_bound=80):
+        """Return True (without counting) once memory or the sample budget is exhausted."""
+        over_memory = BaseDataReader._exceed_memory_upper_bound(
+            upper_bound=memory_upper_bound)
+        if over_memory or self.cnt + 1 > self.cnt_limit:
+            return True
+        self.cnt += 1
+        return False
+
+    def get_next_data(self):
+        """Return the next batch, restarting the iterator once when it runs dry."""
+        data = next(self.dataloader_iter, None)
+        if data is not None:
+            return data
+        self.reset_dataloader()
+        return next(self.dataloader_iter, None)
+
+    @staticmethod
+    def _exceed_memory_upper_bound(upper_bound=90):
+        # upper_bound in [0, 100]
+        return psutil.virtual_memory().percent >= upper_bound
+
+class ONNXDataReader(BaseDataReader):
+    """Serves batches as ``{input name: np.ndarray}``; ``get_next`` returns None when done."""
+    def __init__(self, input_name_list, dataloader, cnt_limit=500):
+        self.input_name_list = input_name_list
+        super().__init__(dataloader, cnt_limit)
+
+    def get_next(self):
+        if self.should_stop(memory_upper_bound=90):
+            return None
+        print(f"onnx calibration data count: {self.cnt}")
+        all_input = self.get_next_data()
+        if all_input is None:  # empty dataloader: stop instead of failing on len(None)
+            return None
+        # NOTE(chen.chen): all_input is assumed to hold the tensors in input_name_list order
+        assert len(all_input) >= len(self.input_name_list)
+        return {k: np.array(v) for k, v in zip(self.input_name_list, all_input)}
+            
+
+def fill_onnx_input_shape(model_path, input_shape_list, model_save_path=None):
+    """Rewrite each graph input's shape, preprocess the model and save it.
+
+    Returns the save path; the default drops the last five characters of
+    ``model_path`` (presumably ".onnx") and appends "_fill_input.onnx".
+    """
+    model = onnx.load(model_path)
+    inputs = model.graph.input
+    assert len(inputs) == len(input_shape_list), f"input number error, should be {len(inputs)}, got {len(input_shape_list)}"
+    for graph_input, new_shape in zip(inputs, input_shape_list):
+        rewrite_tensor_dim(graph_input, new_shape)
+
+    save_to = f"{model_path[:-5]}_fill_input.onnx" if model_save_path is None else model_save_path
+    onnx.save(Model.preprocess_onnx(model), save_to)
+    return save_to
+
+
+def onnx_quantize_model_from_args(args):
+    """Statically quantize ``args.model_path`` with onnxruntime and return the new path.
+
+    Options come from ``args.quantization_config["onnx"]``; an unknown
+    ``quant_format`` or ``calibrate_method`` raises ``ValueError``. A model that
+    already holds QLinear/QGemm operators is returned as-is.
+    """
+    ori_model_path = args.model_path
+    assert ori_model_path.endswith(".onnx")
+
+    # NOTE(chen.chen)
+    # we should just rewrite input_shape here since some batch_size dim of reshape op is fixed
+    # ori_model_path = fill_onnx_input_shape(ori_model_path, args.input_shape_list)
+
+    # skip model which has been quantized
+    if contain_qlinear_opearator(ori_model_path):
+        return ori_model_path
+
+    # NOTE(chen.chen)
+    # if user has not specified the quantization_config
+    # we should have a default config here
+    config = args.quantization_config.get("onnx", {})
+
+    quant_format_by_name = {
+        "qdq": QuantFormat.QDQ,
+        "qoperator": QuantFormat.QOperator,
+    }
+    quant_format = config.get("quant_format", "qoperator").lower()
+    if quant_format not in quant_format_by_name:
+        raise ValueError(f"invalid quant_format: {quant_format}")
+    quant_format = quant_format_by_name[quant_format]
+
+    op_types_to_quantize = config.get("op_types_to_quantize", [])
+    per_channel = config.get("per_channel", False)
+    reduce_range = config.get("reduce_range", False)
+    nodes_to_quantize = config.get("nodes_to_quantize", [])
+    # Copy: the list is extended below and must not leak back into the caller's config.
+    nodes_to_exclude = list(config.get("nodes_to_exclude", []))
+    skip_group_conv_layer = config.get("skip_group_conv_layer", False)
+
+    if args.automatic_yolo_quantization:
+        yolo_detect_nodes = find_detect_node(ori_model_path)
+        nodes_to_exclude.extend([i for i in yolo_detect_nodes if i not in nodes_to_exclude])
+
+    if skip_group_conv_layer:
+        group_conv_node = find_group_conv_node(ori_model_path)
+        print(group_conv_node)
+        nodes_to_exclude.extend([i for i in group_conv_node if i not in nodes_to_exclude])
+
+    unsupport_node = find_unsupported_node(ori_model_path)
+    nodes_to_exclude.extend([i for i in unsupport_node if i not in nodes_to_exclude])
+
+    calibrate_method_by_name = {
+        "minmax": CalibrationMethod.MinMax,
+        "entropy": CalibrationMethod.Entropy,
+        "percentile": CalibrationMethod.Percentile,
+    }
+    calibrate_method = config.get("calibrate_method", "percentile").lower()
+    if calibrate_method not in calibrate_method_by_name:
+        raise ValueError(f"invalid calibrate_method: {calibrate_method}")
+    calibrate_method = calibrate_method_by_name[calibrate_method]
+
+    quant_model_path = f"{os.path.split(ori_model_path)[1][:-5]}_quant.onnx"
+
+    ## NOTE(chen.chen)
+    ## for memory issue, we will try to change the batchsize of model to 1 during quantization
+    ## but it only works for simple cv model
+    ## we reserve a field for user to control this behavior to avoid some strange batch-rewriting result
+    memory_efficient_quant = config.get("memory_efficient_quant", True)
+    batch_size = args.batch_size
+    if memory_efficient_quant:
+        model_input = ori_model_path[:-5] + "_b1.onnx"
+        rewrite_batch_size(ori_model_path, batch_size=1, save_model_path=model_input)
+    else:
+        model_input = ori_model_path
+
+    calibrate_data_count = config.get("calibrate_data_count", 20)
+    try:
+        if memory_efficient_quant:
+            args.batch_size = 1
+        dataloader = get_dataloader_from_args(args)
+        datareader = ONNXDataReader(args.input_name_list, dataloader, calibrate_data_count)
+    finally:
+        # Hand ``args`` back with the caller's batch size even if the dataloader fails.
+        args.batch_size = batch_size
+
+    if args.verbose:
+        print("onnx quanziation config:")
+        print("model_input: ", model_input)
+        print("model_output: ", quant_model_path)
+        print("quant_format: ", quant_format)
+        print("op_types_to_quantize: ", op_types_to_quantize)
+        print("per_channel: ", per_channel)
+        print("reduce_range: ", reduce_range)
+        print("nodes_to_quantize: ", nodes_to_quantize)
+        print("nodes_to_exclude: ", nodes_to_exclude)
+        print("calibrate_method: ", calibrate_method)
+        print("skip_group_conv_layer: ", skip_group_conv_layer)
+
+    symmetric_quantize(
+        model_input=model_input,
+        model_output=quant_model_path,
+        calibration_data_reader=datareader,
+        quant_format=quant_format,
+        op_types_to_quantize=op_types_to_quantize,
+        per_channel=per_channel,
+        reduce_range=reduce_range,
+        nodes_to_quantize=nodes_to_quantize,
+        nodes_to_exclude=nodes_to_exclude,
+        calibrate_method=calibrate_method)
+
+    ## NOTE(chen.chen): rewrite the batchsize back to the origin batchsize
+    if memory_efficient_quant:
+        rewrite_batch_size(quant_model_path, batch_size=args.batch_size, save_model_path=quant_model_path)
+    return quant_model_path
+
+
+
+
+def igie_calibrate_dataset(dataloader, input_name_list, calibrate_data_count=3):
+    """Collect up to ``calibrate_data_count`` batches as ``{input name: data}`` dicts.
+
+    Each batch is paired positionally with ``input_name_list``; surplus items
+    on either side are dropped by ``zip``.
+    """
+    calibration_data_list = []
+    for batch_index, batch in enumerate(dataloader):
+        if batch_index >= calibrate_data_count:
+            break
+        calibration_data_list.append({name: data for data, name in zip(batch, input_name_list)})
+    return calibration_data_list
+
+def igie_quantize_model_from_args(mod, params, args):
+    """Quantize a relay module with ``relay.quantize`` and return ``(mod, params)``.
+
+    Options come from ``args.quantization_config["igie"]``. "global_scale"
+    mode runs without a dataset; "percentile" and "kl_divergence" calibrate on
+    batches from ``get_dataloader_from_args``. Other modes raise ``ValueError``.
+    """
+    # NOTE(chen.chen)
+    # we need to remove unused function for tensorflow
+    from tvm.relay.transform.iluvatar import SimplifyGraph
+    mod = SimplifyGraph(mod, params)
+
+    config = args.quantization_config.get("igie", {})
+
+    model_file_name = os.path.split(args.model_path)[1]
+    base_name = os.path.splitext(model_file_name)[0]
+
+    # The default scale file name is built from the model file stem and the target.
+    scale_file_path = config.get("scale_file_path", "")
+    if scale_file_path == "":
+        scale_file_path = f"quantize_scale_file_{base_name}_{args.target}.npy"
+    calibrate_mode = config.get("calibrate_mode", "percentile")
+    weight_scale = config.get("weight_scale", "max")
+
+    # The first conv layer is always skipped unless targeting "iluvatar_with_all_libs".
+    skip_first_conv_layer = (
+        args.target != "iluvatar_with_all_libs"
+        or config.get("skip_first_conv_layer", False)
+    )
+    skip_conv_layers = [0] if skip_first_conv_layer else None
+
+    skip_dense_layer = config.get("skip_dense_layer", False)
+    calibrate_chunk_by = config.get("calibrate_chunk_by", -1)
+    skip_group_conv_layer = config.get("skip_group_conv_layer", False)
+    global_scale = config.get("global_scale", 0.8)
+    calibrate_data_count = config.get("calibrate_data_count", 3)
+
+    if args.verbose:
+        print("igie quanziation config:")
+        print("calibrate_mode: ", calibrate_mode)
+        print("weight_scale: ", weight_scale)
+        print("scale_file_path: ", scale_file_path)
+        print("skip_dense_layer: ", skip_dense_layer)
+        print("skip_first_conv_layer: ", skip_first_conv_layer)
+        print("skip_group_conv_layer: ", skip_group_conv_layer)
+        print("calibrate_chunk_by: ", calibrate_chunk_by)
+        print("global_scale: ", global_scale)
+        print("calibrate_data_count: ", calibrate_data_count)
+
+    if calibrate_mode == "global_scale":
+        # NOTE(review): skip_group_conv_layer is only forwarded in the data-driven branch.
+        with tvm.transform.PassContext(opt_level=3), relay.quantize.qconfig(
+                calibrate_mode=calibrate_mode,
+                global_scale=global_scale,
+                skip_conv_layers=skip_conv_layers,
+                skip_dense_layer=skip_dense_layer):
+            mod = relay.quantize.quantize(mod, params)
+
+    elif calibrate_mode in ("percentile", "kl_divergence"):
+        # Data-driven modes calibrate on at most ``calibrate_data_count`` batches.
+        dataloader = get_dataloader_from_args(args)
+        dataset = igie_calibrate_dataset(dataloader, args.input_name_list, calibrate_data_count)
+
+        with tvm.transform.PassContext(opt_level=3), relay.quantize.qconfig(
+                calibrate_mode=calibrate_mode,
+                weight_scale=weight_scale,
+                skip_conv_layers=skip_conv_layers,
+                skip_dense_layer=skip_dense_layer,
+                calibrate_chunk_by=calibrate_chunk_by,
+                import_scale_file=scale_file_path,
+                skip_group_conv_layers=skip_group_conv_layer):
+            mod = relay.quantize.quantize(mod, params, dataset=dataset)
+
+    else:
+        raise ValueError(f"unsupported calibrate_mode: {calibrate_mode}")
+
+    return mod, params
+
+
+
+
+def _modify_symmetric(extra_options):
+    """Force the symmetric activation and weight quantization flags on.
+
+    A given dict is updated in place and returned; ``None`` yields a new dict.
+    """
+    options = {} if extra_options is None else extra_options
+    options["ActivationSymmetric"] = options["WeightSymmetric"] = True
+    return options
+
+
+
+def symmetric_quantize(
+    model_input,
+    model_output,
+    calibration_data_reader: CalibrationDataReader,
+    quant_format=QuantFormat.QOperator,
+    op_types_to_quantize=None,
+    per_channel=False,
+    reduce_range=False,
+    nodes_to_quantize=None,
+    nodes_to_exclude=None,
+    optimize_model=False,
+    calibrate_method=CalibrationMethod.Percentile,
+    extra_options=None,
+):
+    """Run ``quantize_static`` with QInt8 activations/weights and symmetric flags set."""
+    # The symmetric flags are forced on whatever ``extra_options`` the caller passed.
+    extra_options = _modify_symmetric(extra_options)
+    assert quant_format in (QuantFormat.QOperator, QuantFormat.QDQ)
+    quantize_static(
+        model_input, model_output,
+        calibration_data_reader=calibration_data_reader,
+        quant_format=quant_format,
+        op_types_to_quantize=op_types_to_quantize,
+        per_channel=per_channel, reduce_range=reduce_range,
+        activation_type=QuantType.QInt8, weight_type=QuantType.QInt8,
+        nodes_to_quantize=nodes_to_quantize, nodes_to_exclude=nodes_to_exclude,
+        optimize_model=optimize_model,
+        use_external_data_format=False,
+        calibrate_method=calibrate_method,
+        extra_options=extra_options,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py
new file mode 100755
index 000000000..907288df1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py
@@ -0,0 +1,21 @@
+def check_status(result_dict, args):
+    """Compare measured accuracy/FPS against the optional targets on ``args``.
+
+    A target of ``None`` is not checked. Prints the verdict and raises
+    ``SystemExit(1)`` when a result is below its target.
+    """
+    targets = (("acc_target", "acc_result"), ("fps_target", "fps_result"))
+    is_valid = True
+    for target_name, result_key in targets:
+        target = getattr(args, target_name)
+        if target is not None and result_dict[result_key] < target:
+            print(f"Expected {target_name} is {target}, got {result_dict[result_key]}")
+            is_valid = False
+
+    if not is_valid:
+        print("\n====Test failed!====\n")
+        # ``exit`` is a ``site`` helper meant for interactive use; raise directly.
+        raise SystemExit(1)
+    print("\n====Test Success!====\n")
+    
+    
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py
new file mode 100755
index 000000000..2df46829c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py
@@ -0,0 +1,24 @@
+import tvm
+
+def get_target(target_name):
+    """Build the TVM target named ``target_name`` together with its device 0.
+
+    Returns ``(target, device)``; an unknown name raises ``Exception``.
+    """
+    iluvatar_libs = {
+        "iluvatar": None,
+        "iluvatar_with_cudnn_cublas": "-libs=cudnn,cublas",
+        "iluvatar_with_ixinfer": "-libs=ixinfer",
+        "iluvatar_with_all_libs": "-libs=cudnn,cublas,ixinfer",
+    }
+    if target_name == "llvm":
+        target = tvm.target.Target(target_name)
+    elif target_name not in iluvatar_libs:
+        raise Exception(f"Unsupport Target name: {target_name}!")
+    elif iluvatar_libs[target_name] is None:
+        target = tvm.target.iluvatar(model="MR")
+    else:
+        target = tvm.target.iluvatar(model="MR", options=iluvatar_libs[target_name])
+
+    device = tvm.device(target.kind.name, 0)
+    return target, device
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py
new file mode 100755
index 000000000..ed0ad0f73
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py
@@ -0,0 +1,81 @@
+import tvm
+import time
+from contextlib import contextmanager
+
+# TVM global functions ("profiling.*") used by Timer, looked up once at import time.
+_get_timer = tvm.get_global_func("profiling.get_timer")
+_start = tvm.get_global_func("profiling.start")
+_stop = tvm.get_global_func("profiling.stop")
+_elapse_time = tvm.get_global_func("profiling.elapse_time")
+
+
+class Timer:
+    """Stopwatch built on the ``profiling.*`` TVM functions bound at module level.
+
+    Durations are stored in milliseconds; ``start``/``stop`` require a device.
+    """
+
+    def __init__(self, device=None):
+        self.last_duration = 0  # ms
+        self.duration_list = []  # ms
+        self.start_cnt = 0
+        self.end_cnt = 0
+        self.device = device
+        self._timer = _get_timer(device) if device is not None else None
+
+    def total_duration(self):
+        return sum(self.duration_list)
+
+    def _update(self, duration):
+        self.last_duration = duration
+        self.duration_list.append(duration)
+
+    def start(self):
+        """Sync the device, then start the timer."""
+        assert self._timer is not None
+        self.start_cnt += 1
+        self.device.sync()
+        _start(self._timer)
+
+    def stop(self):
+        """Stop the timer and record the elapsed time; asserts it pairs with a ``start``."""
+        assert self._timer is not None
+        self.end_cnt += 1
+        assert self.end_cnt == self.start_cnt
+        _stop(self._timer)
+        self._update(_elapse_time(self._timer) / 1e6)  ## ns / 1e6 -> ms
+
+
+
+    # @contextmanager
+    # def timeit_sync(self, device, use_host_time=False):
+    #     # NOTE(chen.chen)
+    #     # not works as expected when use device timer
+    #     # it seems python contextmanager always use host time?
+    #     if use_host_time:
+    #         device.sync()
+    #         t1 = time.time()
+
+    #         yield
+
+    #         device.sync()
+    #         t2 = time.time()
+    #         self._update((t2 - t1) * 1e3)  ## s * 1e3 -> ms
+    #     else:
+    #         timer = _get_timer(device)
+    #         device.sync()
+    #         _start(timer)
+
+    #         yield
+
+    #         _stop(timer)
+    #         self._update(_elapse_time(timer) / 1e6)  ## ns / 1e6 -> ms
+
+    # @contextmanager
+    # def timeit(self):
+    #     t1 = time.time()
+
+    #     yield
+
+    #     t2 = time.time()
+    #     self._update((t2 - t1) * 1e3)  ## s * 1e3 -> ms

From 30e6b30b5bb0f343a129dcaad67eb2508ddf84ad Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 26 Apr 2024 14:18:36 +0800
Subject: [PATCH 06/28] update README

---
 .../general_perf/backends/ILUVATAR/README.zh_CN.md       | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 016f01309..c60d5ea5f 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -46,7 +46,7 @@
         生成的测试报告在：general_perf/reports/ILUVATAR/videobert-onnx-fp32
     
     6、widedeep模型：
-        ***该模型经过了特殊的处理，需要采用的onnx模型：widedeep_dynamicshape_sim.onnx；将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+        ***该模型经过了特殊的处理，需要采用的onnx模型：widedeep_dynamicshape.onnx；将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
         ***
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
         生成的测试报告在：general_perf/reports/ILUVATAR/widedeep-tf-fp32
@@ -74,6 +74,11 @@
         ***********该模型暂时没有解决，等待后续解决了再修改代码，再进行测试***********
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
         生成的测试报告在：general_perf/reports/ILUVATAR/roformer-tf-fp32
+
+    12、gpt2模型：
+        *******在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32
+        生成的测试报告在：general_perf/reports/ILUVATAR/gpt2-torch-fp32
 """
 
 """
@@ -85,4 +90,4 @@
         2）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU, 得到chinese-llama2-torch-fp16-13b的精度和性能数据
 
     3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件。
-"""
\ No newline at end of file
+"""

From 2ff176dff0b707f329d369822061b9b4947133cb Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 26 Apr 2024 15:47:42 +0800
Subject: [PATCH 07/28] update

---
 .../backends/ILUVATAR/README.zh_CN.md         |  1 -
 .../ILUVATAR/compile_backend_iluvatar.py      |  3 ++-
 .../ILUVATAR/runtime_backend_iluvatar.py      | 20 ++++++++++++++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index c60d5ea5f..98204b120 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -32,7 +32,6 @@
            ***给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；目前不能直接使用optimizer脚本优化后的onnx直接进行推理，我们把这个模型优化流程给出了，但是实际上使用了处理好的onnx：
               deberta-base-squad-sim_end.onnx，将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
            ***
-           其次，需要修改model_zoo下面的general_perf/model_zoo/deberta-torch-fp32.json里面输入的个数，去掉token_type_ids.1相关的配置
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
         生成的测试报告在：general_perf/reports/ILUVATAR/deberta-torch-fp32/
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 2f81557d8..b81510a76 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -71,7 +71,8 @@ def compile(self, configs, dataloader=None):
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
 
         # model preprocessing
-        self.get_onnx(configs)
+        if model_name != 'deberta':
+            self.get_onnx(configs)
 
         # build engine
         if model_name == 'widedeep':
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 6b07ccdd1..47eeb1607 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -159,6 +159,21 @@ def predict(self, feeds):
                     
                         input_idx = engine.get_binding_index(input_name)
                         context.set_binding_shape(input_idx, Dims(input_shape))
+
+                elif model_name == 'deberta':
+                    input_names = [
+                        "input_ids.1",
+                        "attention_mask.1",
+                    ]
+                    for input_name in input_names:
+                        if input_name == 'input_ids.1':
+                            input_shape = input_tensors[0].shape
+                        if input_name == 'attention_mask.1':
+                            input_shape = input_tensors[1].shape
+                    
+                        input_idx = engine.get_binding_index(input_name)
+                        context.set_binding_shape(input_idx, Dims(input_shape))
+
                 else:
                     input_shape = input_tensors[i].shape
                     input_idx = engine.get_binding_index(input_name)
@@ -225,11 +240,14 @@ def predict(self, feeds):
                 result[output_name[i]] = outputs_list[i]
 
         else:
-            result = None
             self.predict_igie(feeds)
             
         if model_name == 'videobert':
             return outputs_list
+        
+        elif model_name == 'gpt2':
+            return None
+        
         else:
             return result
     

From d5055d35851f4d1c6b685f372b96bc7bfa890f1a Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 26 Apr 2024 16:53:04 +0800
Subject: [PATCH 08/28] update

---
 .../backends/ILUVATAR/README.zh_CN.md         | 97 ++++++++++++-------
 1 file changed, 62 insertions(+), 35 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 98204b120..21c5d05c6 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -1,92 +1,119 @@
 """
-    操作说明：如果不想跑CPU端的性能、精度、数值指标，可以执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）;
+    ****************************************操作说明*********************************
+    如果不想跑CPU端的性能、精度、数值指标对比，可以直接执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）
              如果模型提供了pt、pb格式的优先选择torch的配置进行测试；
+             如果执行整个pipeline，需要执行：python3 launch.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）（跑cpu结果会很耗时）
 
     功能实现：
         1、pt、pb模型转换在compile模块预处理过程中实现；
         2、在天数智芯BI-150显卡上，调用推理引擎tensorrt进行推理，一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化；
     
     环境准备：
-        1、sdk版本：http://sw.iluvatar.ai/download/corex/daily_packages/latest/x86_64/bi150/sdk/corex-installer-linux64-3.4.0.20240418.74_x86_64_10.2.run
-        2、ixrt版本：http://sw.iluvatar.ai/download/corex/daily_packages/latest/x86_64/bi150/apps/py3.10/ixrt-0.9.1+corex.3.4.0.20240418.71-cp310-cp310-linux_x86_64.whl
+        1、sdk版本：由天数智芯工程师提供
+        2、ixrt版本：由天数智芯工程师提供
 
     遗留问题：
-        1、roformer、conformer、widedeep模型做了特殊处理，目前还不能做到加载模型预处理的onnx模型直接进行推理，研发还在继续优化
+        1、roformer模型暂时还不支持动态shape推理，因此本次暂不提交
 """
 
+
 """
-    ******************下面简单的说明11个小模型是如何测试与测试报告生成的*****************
+    ***************************11个小模型的测试与测试报告生成的操作方法****************************
     整个代码运行过程中，主要是从workloads目录下加载对应的模型的配置，主要有test_perf、test_accuracy、test_numeric三项测试内容，用户可以根据自己的需要选择开启与否；
-    一般情况下采用字节默认的配置项即可；
+    一般情况下采用字节默认的配置项即可；需要特别修改的配置下面会进行说明
+
+    输出性能文档里面涉及的字段说明：
+        1、QPS、AVG Latency、P99 Latency：这3个指标是走字节框架，采用天数智芯的推理引擎IxRT会计算H2D、D2H的时间，也就是数据在不同的设备（CPU、GPU）之间传输耗时；
+        2、predict QPS、predict AVG Latency、predict P99 Latency：这部分指标把上面一步计算H2D、D2H的耗时剔除出去了，因此可以看做纯推理耗时，这个耗时可以与利用
+           ixerexec命令跑出来的结果做一定的对比，但是不一定完全对齐，因为走整个框架代码肯定会导致一部分性能损失
+
+
+    cd ByteMLPerf/byte_infer_perf
 
-    cd ByteMLPerf/byte_infer_perf;
     1、bert模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/bert-torch-fp32/
+        生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32/
 
     2、albert模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/albert-torch-fp32/
+        生成的测试报告位置：general_perf/reports/ILUVATAR/albert-torch-fp32/
 
     3、debert模型：
-           ***给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；目前不能直接使用optimizer脚本优化后的onnx直接进行推理，我们把这个模型优化流程给出了，但是实际上使用了处理好的onnx：
-              deberta-base-squad-sim_end.onnx，将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
-           ***
+           给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-base-squad-sim_end.onnx
+           将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
+
+           下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                    cd files/yudefu/ ; get deberta-base-squad-sim_end.onnx
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/deberta-torch-fp32/
+        生成的测试报告位置：general_perf/reports/ILUVATAR/deberta-torch-fp32/
 
     4、roberta模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/roberta-torch-fp32/
+        生成的测试报告位置：general_perf/reports/ILUVATAR/roberta-torch-fp32/
 
     5、videobert模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/videobert-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/videobert-onnx-fp32
     
     6、widedeep模型：
-        ***该模型经过了特殊的处理，需要采用的onnx模型：widedeep_dynamicshape.onnx；将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
-        ***
+           该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape.onnx；
+           将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+
+           下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+           cd files/yudefu/ ; get widedeep_dynamicshape.onnx
+        
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/widedeep-tf-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
 
     7、swin-transformer模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/swin-large-torch-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/swin-large-torch-fp32
 
     8、resnet50模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/resnet50-torch-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
 
     9、yolov5模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
 
     10、conformer模型：
-        ***该onnx模型的transpose算子的逻辑是有问题，做了特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx；
-           将其放到：general_perf/model_zoo/popular/open_conformer/ 
-        ***
+            该onnx模型的transpose算子实现逻辑需要特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx
+            将其放到：general_perf/model_zoo/popular/open_conformer/ 
+
+            下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+            cd files/yudefu/ ; get conformer_encoder_optimizer_end.onnx
+        
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
 
     11、roformer模型：
-        ***********该模型暂时没有解决，等待后续解决了再修改代码，再进行测试***********
+            该模型暂时没有解决，等待后续解决了再提供测试说明
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/roformer-tf-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/roformer-tf-fp32
 
     12、gpt2模型：
-        *******在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
+            在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32
-        生成的测试报告在：general_perf/reports/ILUVATAR/gpt2-torch-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/gpt2-torch-fp32
 """
 
 """
-    ****************大模型操作流程******
-    1. 进入ByteMLPerf目录
-    2. 执行
-        1）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chatglm2-torch-fp16-6b --hardware_type ILU, 得到chatglm2-torch-fp16-6b的精度和性能数据
+    ***************************大模型操作流程********************
+    说明：
+        此部分侵入了字节代码框架，因此需要重新重构，暂时不需要进行测试
+
+    操作流程：
+        1. 进入ByteMLPerf目录
+        2. 执行
+            1）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chatglm2-torch-fp16-6b --hardware_type ILU, 
+               得到chatglm2-torch-fp16-6b的精度和性能数据
 
-        2）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU, 得到chinese-llama2-torch-fp16-13b的精度和性能数据
+            2）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU,
+               得到 chinese-llama2-torch-fp16-13b的精度和性能数据
 
-    3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件。
+        3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件
 """

From 94bdf134998d254885fe7e26156226c210c0172c Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 26 Apr 2024 19:20:18 +0800
Subject: [PATCH 09/28] update README.zh_CN.md

---
 .../backends/ILUVATAR/README.zh_CN.md         | 136 ++++++++++++++++--
 1 file changed, 122 insertions(+), 14 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 21c5d05c6..8af0ed6b4 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -27,46 +27,142 @@
         2、predict QPS、predict AVG Latency、predict P99 Latency：这部分指标把上面一步计算H2D、D2H的耗时剔除出去了，因此可以看做纯推理耗时，这个耗时可以与利用
            ixerexec命令跑出来的结果做一定的对比，但是不一定完全对齐，因为走整个框架代码肯定会导致一部分性能损失
 
+    数据集、模型准备：
+        cd ByteMLPerf/byte_infer_perf/general_perf
+
+        bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad
+        bash general_perf/prepare_model_and_dataset.sh resnet50-torch-fp32 open_imagenet
+        bash general_perf/prepare_model_and_dataset.sh widedeep-tf-fp32 open_criteo_kaggle
+        bash general_perf/prepare_model_and_dataset.sh albert-torch-fp32
+        bash general_perf/prepare_model_and_dataset.sh roformer-tf-fp32 open_cail2019
+        bash general_perf/prepare_model_and_dataset.sh videobert-onnx-fp32 open_cifar
+        bash general_perf/prepare_model_and_dataset.sh yolov5-onnx-fp32 
+        bash general_perf/prepare_model_and_dataset.sh conformer-encoder-onnx-fp32
+        bash general_perf/prepare_model_and_dataset.sh roberta-torch-fp32
+        bash general_perf/prepare_model_and_dataset.sh deberta-torch-fp32 
+        bash general_perf/prepare_model_and_dataset.sh swin-large-torch-fp32
+        bash general_perf/prepare_model_and_dataset.sh gpt2-torch-fp32 
+
+        上面的模型与数据集下载完毕后会生成在：general_perf/general_perf，需要把该目录下的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面
+        如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可；
+
+
+    测试开始：
 
     cd ByteMLPerf/byte_infer_perf
 
     1、bert模型：
+        测试过程中如果缺少：dev-v1.1.json、vocab.txt，按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/open_squad ; get dev-v1.1.json; get vocab.txt
+                 exit
+
+        移动：mv dev-v1.1.json vocab.txt general_perf/datasets/open_squad/;
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32/
 
     2、albert模型：
+        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
+        
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get albert.rar
+                 exit
+
+        mkdir -p madlag/albert-base-v2-squad;
+        解压：unrar x albert.rar madlag/albert-base-v2-squad;
+
+        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
+        AutoTokenizer.from_pretrained("madlag/albert-base-v2-squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/madlag/albert-base-v2-squad")  (注意绝对路径根据实际情况修改，需要在ByteMLPerf前面再加一个当前目录最上层的路径，下同)
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/albert-torch-fp32/
 
     3、debert模型：
-           给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-base-squad-sim_end.onnx
-           将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
+        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get deberta.rar
+                 exit
+
+        mkdir -p Palak/microsoft_deberta-base_squad;
+        解压：unrar x deberta.rar Palak/microsoft_deberta-base_squad;
 
-           下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                    cd files/yudefu/ ; get deberta-base-squad-sim_end.onnx
+        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
+        AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/Palak/microsoft_deberta-base_squad")
+
+        给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-base-squad-sim_end.onnx
+        将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get deberta-base-squad-sim_end.onnx
+                 exit
+        
+        移动：mv deberta-base-squad-sim_end.onnx general_perf/model_zoo/popular/open_deberta/
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/deberta-torch-fp32/
 
     4、roberta模型：
+        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get roberta.rar
+                 exit
+
+        mkdir -p csarron/roberta-base-squad-v1;
+        解压：unrar x roberta.rar csarron/roberta-base-squad-v1;
+
+        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
+        AutoTokenizer.from_pretrained("csarron/roberta-base-squad-v1") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/csarron/roberta-base-squad-v1")
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/roberta-torch-fp32/
 
     5、videobert模型：
+        测试过程中如果在 open_cifar 数据集中缺少某些文件，可以按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/open_cifar ; get cifar-100-python.tar.gz
+                 exit
+
+        解压：tar -zxvf cifar-100-python.tar.gz; mv cifar-100-python general_perf/datasets/open_cifar
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/videobert-onnx-fp32
     
     6、widedeep模型：
-           该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape.onnx；
-           将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+        测试过程中如果在 open_criteo_kaggle 数据集中缺少：eval.csv、categorical.npy、label.npy、numeric.npy，可以按照下面的操作进行下载
+        （根据缺少的文件进行下载即可，不需要的可以不下载，下同）
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/open_criteo_kaggle ; get eval.csv； get categorical.npy；get label.npy； get numeric.npy
+                 exit
+
+        移动：mv eval.csv categorical.npy label.npy numeric.npy general_perf/datasets/open_criteo_kaggle;
+
+        该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape.onnx；
+        将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
 
-           下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-           cd files/yudefu/ ; get widedeep_dynamicshape.onnx
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get widedeep_dynamicshape.onnx
+                 exit
+        
+        移动：mv widedeep_dynamicshape.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
         
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
 
     7、swin-transformer模型：
+        测试过程中如果缺少：open_imagenet下面相关的文件或者数据集，按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/open_imagenet ; get ILSVRC2012_img_val.tar.gz; get val_map.txt
+                 exit
+        
+        解压：tar -zxvf ILSVRC2012_img_val.tar.gz; mv ILSVRC2012_img_val val_map.txt general_perf/datasets/open_imagenet
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/swin-large-torch-fp32
 
@@ -79,23 +175,35 @@
         生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
 
     10、conformer模型：
-            该onnx模型的transpose算子实现逻辑需要特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx
-            将其放到：general_perf/model_zoo/popular/open_conformer/ 
+        该onnx模型的transpose算子实现逻辑需要特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx
+        将其放到：general_perf/model_zoo/popular/open_conformer/ 
 
-            下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-            cd files/yudefu/ ; get conformer_encoder_optimizer_end.onnx
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/ ; get conformer_encoder_optimizer_end.onnx
+                 exit
+        
+        移动：mv conformer_encoder_optimizer_end.onnx general_perf/model_zoo/popular/open_conformer/ 
         
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
 
     11、roformer模型：
-            该模型暂时没有解决，等待后续解决了再提供测试说明
+        该模型暂时没有解决，等待后续解决了再提供测试说明
+
+        测试过程中如果缺少：open_cail2019下面相关的文件或者数据集，按照下面的操作进行下载
+
+        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
+                 cd files/yudefu/open_cail2019 ; get batch_segment_ids.npy； get batch_token_ids.npy； 
+                    get label.py； get test.json；get vocab.txt
+                  exit
+
+        移动：mv batch_segment_ids.npy batch_token_ids.npy label.py test.json vocab.txt general_perf/datasets/open_cail2019
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/roformer-tf-fp32
 
     12、gpt2模型：
-            在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
+        在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/gpt2-torch-fp32

From f10ee9efe7bb458a30263203ffac22c66abad387 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Sat, 11 May 2024 14:17:53 +0800
Subject: [PATCH 10/28] add stable diffusion models

---
 .../backends/ILUVATAR/README.zh_CN.md         | 43 ++++++++++++
 .../ILUVATAR/compile_backend_iluvatar.py      |  7 +-
 .../ILUVATAR/runtime_backend_iluvatar.py      | 67 +++++++++++++++++--
 3 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 8af0ed6b4..6caa35876 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -225,3 +225,46 @@
 
         3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件
 """
+
+"""
+    ***************************Stable Diffusion模型操作流程********************
+    环境准备：官方的onnx2torch有bug存在，所以需要安装天数智芯适配版本的onnx2torch，采用pytorch推理框架
+
+    操作过程：
+        1、cd ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch
+        2、执行：python3 setup.py install
+        3、cd -
+
+        数据集、模型准备：
+        cd ByteMLPerf/byte_infer_perf/general_perf
+
+        bash general_perf/prepare_model_and_dataset.sh vae-encoder-onnx-fp32
+
+        上面的模型与数据集下载完毕后会生成在：general_perf/general_perf，需要把该目录下的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面
+        如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可；
+
+    测试开始：
+
+    cd ByteMLPerf/byte_infer_perf
+
+    1、vae-decoder模型:
+        注意事项：由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置
+            "batch_sizes":[4,8], "test_numeric": false, 
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-decoder-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/vae-decoder-onnx-fp32
+
+    2、vae-encoder模型：
+        注意事项：由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置
+            "batch_sizes":[4,8], "test_numeric": false, 
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-encoder-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/vae-encoder-onnx-fp32
+
+    3、clip模型：
+        注意事项：为了实现性能测试, 因此需要修改workloads下面的模型启动配置
+            "test_numeric": false, 
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task clip-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/clip-onnx-fp32
+"""
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index b81510a76..8c20da5db 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -110,7 +110,10 @@ def compile(self, configs, dataloader=None):
                     input_dict[key] = val
                     
                 build_igie_engine(model_name=model_name, model_path=onnx_model_path, input_dict=input_dict, model_framework='onnx', precision='fp16', engine_path=engine_path)
-
+        
+        elif model == 'vae-decoder-onnx-fp32' or model == 'vae-encoder-onnx-fp32' or model == 'clip-onnx-fp32':
+            pass
+        
         else:
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
@@ -228,4 +231,4 @@ def get_onnx(self, configs):
             print("***Convert onnx model to plugin operator model success!***")
 
         else:
-            pass
+            pass
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 47eeb1607..2f97bed40 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -37,6 +37,7 @@
     "FLOAT32": torch.float32,
     "FLOAT16": torch.float16,
     "INT8": torch.int8,
+    "INT32":torch.int32,
     "LONG": torch.long,
     "INT64": torch.int64,
     "BOOL": torch.bool
@@ -66,6 +67,12 @@ def __init__(self):
         self.predict_time = None
         self.task = None
 
+    def isSDmodel(self, model_name):
+        """Return True when model_name is one of the three Stable Diffusion workload names."""
+        sd_models = ('vae-decoder-onnx-fp32', 'vae-encoder-onnx-fp32', 'clip-onnx-fp32')
+        is_sd = model_name in sd_models
+        return is_sd
+
     # Dual-core inference of Tian SoC BI-150 graphics card
     def benchmark(self, dataloader):
         performance_reports = []
@@ -84,8 +91,9 @@ def benchmark(self, dataloader):
             work.join()
         
         if model_name != 'gpt2':
-            del self.engine
-            del self.context
+            if not self.isSDmodel(self.configs["model"]):
+                del self.engine
+                del self.context
             
         if len(performance_reports[0]) == len(performance_reports[1]):
             if performance_reports[0].keys() == performance_reports[1].keys():
@@ -116,7 +124,15 @@ def predict(self, feeds):
         i = 0
 
         model_name = self.configs["model"].split("-")[0]
-        if model_name != 'gpt2':
+        if self.isSDmodel(self.configs["model"]):
+            for key, _ in feeds.items():
+                tmp_tensor = torch.tensor(feeds[key],
+                                    dtype=pt_dtype_map[self.input_type[i]])
+                input_tensors.append(tmp_tensor)
+                i += 1
+            self.predict_sd(input_tensors)
+            return
+        elif model_name != 'gpt2':
             if model_name == 'deberta':
                 keys = list(feeds.keys())
                 input_ids = torch.tensor(feeds[keys[0]], dtype=pt_dtype_map[self.input_type[0]])
@@ -238,7 +254,6 @@ def predict(self, feeds):
                     break
 
                 result[output_name[i]] = outputs_list[i]
-
         else:
             self.predict_igie(feeds)
             
@@ -269,7 +284,10 @@ def benchmark_interact(self, dataloader):
 
         if model_name == 'gpt2':
             self.load_igie(batch_size)
+        elif self.isSDmodel(self.configs["model"]):
+            self.load_sd(batch_size)   
 
+         
         test_data = self._get_fake_samples(batch_size=batch_size,
                         shape=self.configs['segments'][0]['input_tensor_map'],
                         input_type=self.configs['input_type'])
@@ -300,6 +318,11 @@ def benchmark_interact(self, dataloader):
         log.info(
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
             format(self.batch_size, qps, avg_latency, tail_latency))
+        
+        log.info(
+            'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'.
+            format(self.batch_size, fps, predict_avg_latency, tail_latency))
+
 
         report['QPS'] = qps
         report['AVG Latency'] = avg_latency
@@ -325,6 +348,10 @@ def load(self, batch_size) -> None:
         if model_name == 'gpt2':
             self.batch_size = batch_size
             return
+        elif self.isSDmodel(model):
+            self.batch_size = batch_size
+            #self.load_sd(batch_size)
+            return
         
         if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
             engine_path = model_path.split(".")[0] + "_end.engine"
@@ -364,6 +391,36 @@ def load(self, batch_size) -> None:
         self.engine = engine
         self.context = context         
     
+
+    def load_sd(self, batch_size):
+        """Convert the ONNX model at configs['model_path'] to a torch module kept on self.model_sd.
+
+        The module is moved to CUDA when torch reports it available, otherwise to the CPU.
+        Also stores configs['input_type'] and batch_size on the instance.
+        """
+        model_path = self.configs['model_path']
+
+        # Local import: within this method only onnx2torch's convert() is needed.
+        from onnx2torch import convert
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model_sd = convert(model_path).to(device)
+        self.input_type = self.configs['input_type']
+        self.batch_size = batch_size
+
+    def predict_sd(self, dataloader):
+        """Run model_sd on dataloader[0], store the elapsed seconds in self.predict_time, return the output."""
+        use_cuda = torch.cuda.is_available()  # same device choice as load_sd, so CPU-only hosts work
+        sync = torch.cuda.synchronize if use_cuda else (lambda: None)
+        self.model_sd = self.model_sd.eval()
+        inputs = dataloader[0].to('cuda' if use_cuda else 'cpu')
+        sync()  # drain pending GPU work so it is not billed to this call
+        starttime = time.time()
+        out = self.model_sd(inputs)
+        sync()  # wait for the forward pass to finish before stopping the clock
+        self.predict_time = time.time() - starttime
+        return out
+
     def load_igie(self, batch_size):
         model = self.configs['model']
         model_path = self.configs['model_path']
@@ -392,4 +449,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
+            raise ValueError("Please provide input type")
\ No newline at end of file

From ac66bf879ebf021ab5c45e51456269de61310721 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Sat, 11 May 2024 16:31:40 +0800
Subject: [PATCH 11/28] adding vllm_inference

---
 .../backends/ILUVATAR/iluvatar_engine.py      | 170 ++++++++++++++++++
 .../ILUVATAR/iluvatar_process_messager.py     |  49 +++++
 .../backends/ILUVATAR/iluvatar_sampler.py     |  32 ++++
 .../backends/ILUVATAR/iluvatar_scheduler.py   |  89 +++++++++
 .../backends/ILUVATAR/model_impl/__init__.py  |   8 +
 .../llm_perf/backends/ILUVATAR/setup.py       |  27 +++
 6 files changed, 375 insertions(+)
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_engine.py
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_process_messager.py
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_sampler.py
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_scheduler.py
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/model_impl/__init__.py
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/setup.py

diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_engine.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_engine.py
new file mode 100755
index 000000000..e9da6a2f3
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_engine.py
@@ -0,0 +1,170 @@
+import os
+import json
+import asyncio
+from typing import Dict, List
+
+import torch
+
+from llm_perf.core.generation import GenerateRequest
+from llm_perf.core.engine import CoreEngine
+from llm_perf.backends.ILUVATAR.iluvatar_process_messager import IluvatarMultiProcessMsgr
+from llm_perf.utils.logger import logger
+
+from vllm.utils import Counter, random_uuid
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.arg_utils import AsyncEngineArgs
+
+
+class IluvatarEngine(CoreEngine):
+    
+    class Packet(CoreEngine.Packet):
+        def __init__(self, request: GenerateRequest):
+            CoreEngine.Packet.__init__(self, request)
+
+            self.generation_start_time = None
+
+        def _is_finished(self) -> bool:
+            return self.is_finished()
+
+        @staticmethod
+        def prepare_inputs(
+            batch: List[CoreEngine.Packet],
+            **kwargs
+        ) -> Dict:
+            model_config = kwargs["model_config"]
+            pad_token_id = kwargs["pad_token_id"]
+
+            all_input_ids = []
+            all_position_ids = []
+
+            max_seq_len = -1
+            for packet in batch:
+                cur_id_len = len(packet.request.input_ids) + len(packet.generate_ids)
+                max_seq_len = cur_id_len if cur_id_len > max_seq_len else max_seq_len
+
+            for packet in batch:
+                cur_id_len = len(packet.request.input_ids) + len(packet.generate_ids)
+                pad_len = max_seq_len - cur_id_len
+                input_ids = (
+                    packet.request.input_ids + 
+                    packet.generate_ids + 
+                    [pad_token_id] * pad_len
+                )
+                all_input_ids.append(input_ids)
+                all_position_ids.append([i for i in range(max_seq_len)])
+
+            model_inputs = {
+                "past_key_values": None, 
+                "attention_mask": None, 
+                "use_cache": None
+            }
+            model_inputs["input_ids"] = all_input_ids
+            model_inputs["position_ids"] = all_position_ids
+
+            model_name = model_config['model_name']
+            if model_name == 'chatglm2':
+                model_inputs["return_last_logit"] = False
+            return model_inputs
+
+
+    def __init__(
+        self, model_config, pad_token_id, 
+        **kwarg
+    ) -> None:
+        super().__init__()
+
+        self.model_config = model_config
+        self.pad_token_id = pad_token_id
+        self.engine = None
+        
+        # set up environ
+        self.setup()
+        
+        # init multiprocessr msgr
+        if self.world_size > 1:
+            self.mlp_manager = IluvatarMultiProcessMsgr(
+                self.local_rank, self.world_size, "MultiProcessMsgr"
+            )
+
+
+    def setup(self):
+        # init distributed env if needed
+        os.environ["TORCHELASTIC_USE_AGENT_STORE"] = "False"
+
+        self.world_size = int(os.environ.get("WORLD_SIZE", "1"))
+        self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        
+        model = self.model_config['model_path']
+        tokenizer = self.model_config["tokenizer"]["path"]
+        llm_engine = self.load_model(model, tokenizer)
+        self.engine = llm_engine
+
+
+    def load_model(self, model, tokenizer):
+        self.request_counter = Counter()
+        
+        # Create the AsyncLLMEngine
+        engine_args = AsyncEngineArgs(model=model, tokenizer=tokenizer, trust_remote_code=True,)
+        llm_engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+        return llm_engine
+    
+
+    def broadcast_inputs(self, *args):
+        if self.world_size <= 1:
+            return args
+        
+        if self.local_rank == 0:
+            self.mlp_manager.broadcast(args)
+            return args
+        else:
+            inputs = self.mlp_manager.receive()
+            return inputs
+
+
+    def prepare_inputs(self, batch: List[CoreEngine.Packet]) -> Dict:
+        model_inputs = IluvatarEngine.Packet.prepare_inputs(
+            batch, 
+            model_config=self.model_config, 
+            pad_token_id=self.pad_token_id
+        )
+        return model_inputs
+    
+
+    async def generate(self, samplingparams, request_id, input):  
+        async for output in self.engine.generate(None, samplingparams, request_id, input):
+            result = output.outputs[0] 
+            ret = {"token_ids": result.token_ids, "finish_reason":result.finish_reason}  
+            yield json.dumps(ret).encode("utf-8") 
+
+
+    async def consume_stream(self, samplingparams, input):
+        """Advance one generate() stream per prompt in lockstep.
+
+        Yields one list per step holding the decoded JSON dict of every stream; iteration
+        ends as soon as any stream is exhausted (that partial step is dropped).
+        """
+        handler_list = [self.generate(samplingparams, str(random_uuid()), i) for i in input]
+        while handler_list:  # an empty batch yields nothing instead of looping forever
+            data = list()
+            try:
+                for h in handler_list:
+                    # generate() yields UTF-8 JSON bytes; json.loads accepts bytes directly.
+                    data.append(json.loads(await anext(h)))
+            except StopAsyncIteration:
+                # Catch exhaustion only, so engine errors and task cancellation propagate.
+                break
+            yield data
+
+
+    async def do_inference(self, packets: List[CoreEngine.Packet], sampler):
+        # set device
+        torch.cuda.set_device(self.local_rank)
+
+        # prepare inputs for each process
+        model_inputs = self.prepare_inputs(packets) if self.local_rank == 0 else None
+        model_inputs = self.broadcast_inputs(model_inputs)[0]
+
+        # AsyncLLMEngine
+        async for i in self.consume_stream(sampler,model_inputs["input_ids"]):
+            yield i
diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_process_messager.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_process_messager.py
new file mode 100755
index 000000000..ca8525b47
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_process_messager.py
@@ -0,0 +1,49 @@
+import queue
+from multiprocessing import managers
+
+import torch
+
+from llm_perf.core.engine import MultiProcessMsgr
+
+
+class IluvatarMultiProcessMsgr(MultiProcessMsgr, managers.BaseManager):
+    """Rank-0-to-worker messenger using one queue per worker rank, shared through a BaseManager."""
+    def __init__(self, local_rank: int, world_size: int, name: str):
+        self.rank = local_rank
+        self.world_size = world_size
+
+        def make_message_queue(rank):
+            if rank != 0:  # non-zero ranks register the name only, without a factory
+                return None
+            new_queue = queue.Queue()
+            return lambda: new_queue  # every call hands back the same queue object
+        for i in range(1, world_size):  # one registered accessor per worker rank, before start()/connect()
+            self.register(f"message_queue_{i}", callable=make_message_queue(local_rank))
+        if local_rank == 0:
+            super().__init__(authkey=name.encode("utf-8"))
+            self.start()
+            addr = [self.address]  # address of the manager just started on rank 0
+            torch.distributed.broadcast_object_list(addr, device=f"cuda:{local_rank}")
+            self.msg_queue_list = [
+                getattr(self, f"message_queue_{rank}")()
+                for rank in range(1, world_size)
+            ]
+        else:
+            addr = [None]  # filled in by the broadcast below
+            torch.distributed.broadcast_object_list(addr, device=f"cuda:{local_rank}")
+            super().__init__(address=addr[0], authkey=name.encode("utf-8"))
+            self.connect()
+            self.msg_queue = getattr(self, f"message_queue_{local_rank}")()
+
+    def broadcast(self, obj):
+        assert (
+            self.rank == 0
+        ), f"InterProcessMessager broadcast_message only allow rank0 to call!"
+        for rank in range(1, self.world_size):  # put obj on every worker rank's queue
+            self.msg_queue_list[rank - 1].put(obj)
+
+    def receive(self):
+        assert (
+            self.rank > 0
+        ), f"InterProcessMessager receive_message don't allow rank0 to call!"
+        return self.msg_queue.get()  # blocks until an object is available on this rank's queue
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_sampler.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_sampler.py
new file mode 100755
index 000000000..166a9dbcf
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_sampler.py
@@ -0,0 +1,32 @@
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+
+from llm_perf.core.generation import GenerateResult
+from llm_perf.core.engine import CoreEngine
+from llm_perf.core.sampler import CoreSampler
+
+from vllm import SamplingParams
+
+
+class IluvatarSampler(CoreSampler):
+    """Sampler shim: only ``sampling`` is implemented; it builds vLLM ``SamplingParams``."""
+    def __init__(self) -> None:
+        super().__init__()
+
+    # Not implemented for this backend.
+    def sample(self, packets: List[CoreEngine.Packet], logits: torch.FloatTensor) -> List[int]:
+        raise NotImplementedError
+
+    # Not implemented for this backend.
+    def postprocess(
+        self,
+        packets: List[CoreEngine.Packet],
+        infer_outputs: Dict[str, torch.FloatTensor],
+        next_tokens: List[int],
+    ) -> List[GenerateResult]:
+        raise NotImplementedError
+
+    # Decoding settings are fixed here; only the token budget (max_tokens) varies per call.
+    def sampling(self, max_new_tokens):
+        return SamplingParams(n=1, temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=max_new_tokens, ignore_eos=True)
diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_scheduler.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_scheduler.py
new file mode 100755
index 000000000..f89146ec4
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/iluvatar_scheduler.py
@@ -0,0 +1,89 @@
+import asyncio
+from typing import List
+
+import torch
+
+from llm_perf.core.engine import CoreEngine
+from llm_perf.core.sampler import CoreSampler
+from llm_perf.core.scheduler import CoreScheduler
+from llm_perf.backends.ILUVATAR.iluvatar_engine import IluvatarEngine
+from llm_perf.backends.ILUVATAR.iluvatar_sampler import IluvatarSampler
+from llm_perf.utils.logger import logger
+from llm_perf.core.generation import GenerateResult
+
+
+class IluvatarScheduler(CoreScheduler):
+    """Scheduler that batches queued packets and streams engine results back into them."""
+
+    def __init__(
+        self,
+        engine: CoreEngine,
+        sampler: CoreSampler,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            engine=engine,
+            sampler=sampler,
+            packet_cls=IluvatarEngine.Packet,
+            **kwargs
+        )
+        # Cap used by select_batch; None (no cap) when the caller did not pass one.
+        self.max_batch_size = kwargs.get("max_batch_size")
+
+    @torch.inference_mode()
+    def scheduler_loop(self):
+        """Loop forever: select a batch, run it through the engine, carry over unfinished packets."""
+        batch: List[CoreEngine.Packet] = []
+        while True:
+            # 1. select batch --> batch
+            batch = self.select_batch(batch)
+            if not batch:
+                with self.packet_queue.not_empty:
+                    self.packet_queue.not_empty.wait(0.1)
+                continue
+
+            logger.debug(f"get batch size: {len(batch)}")
+
+            # 2. AsyncLLMEngine: the first packet's max_new_tokens is applied to the whole batch
+            max_new_tokens = batch[0].request.generate_config.max_new_tokens
+            sampling = self.sampler.sampling(max_new_tokens)
+            asyncio.run(self.inference(batch, sampling))
+
+            # 3. is not finished -> remain
+            batch = [packet for packet in batch if not packet.is_finished()]
+
+    def select_batch(self, batch):
+        """Return ``batch`` plus packets pulled from the queue until it is empty or the cap is hit."""
+        batching_size: int = len(batch)
+        new_select_packets: List[CoreEngine.Packet] = []
+
+        while not self.packet_queue.empty():
+            if batching_size == self.max_batch_size:
+                break
+
+            batching_size += 1
+            new_select_packets.append(self.packet_queue.get())
+
+        return batch + new_select_packets
+
+    async def inference(self, batch, sampler):
+        """Apply each step of ``engine.do_inference`` to the packets, matched by position.
+
+        ``sampler`` is forwarded unchanged to ``engine.do_inference``.
+        """
+        async for results in self.engine.do_inference(batch, sampler):
+            for j, result in enumerate(results):
+                token = result["token_ids"][-1]
+
+                # Any finish reason reported by the engine is recorded as "max_length";
+                # an empty string means the packet is still generating.
+                if result["finish_reason"] is None:
+                    gen_res = GenerateResult(token, "")
+                else:
+                    gen_res = GenerateResult(token, "max_length")
+
+                batch[j].add_result(gen_res)
+
+                # A non-empty finish reason marks the packet as done.
+                if gen_res.finish_reason:
+                    batch[j].finish()
diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/model_impl/__init__.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/model_impl/__init__.py
new file mode 100755
index 000000000..12e048c40
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/model_impl/__init__.py
@@ -0,0 +1,8 @@
+## __all__ is a dict:
+##   key is model_name in `model_zoo/chatglm-xx.json`
+##   value is vendor specify model impl
+# __all__ = {
+#     "chatglm" : ChatGLMForConditionalGeneration,
+#     "chatglm2" : ChatGLM2ForConditionalGeneration
+# }
+__all__ = {}
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/setup.py b/byte_infer_perf/llm_perf/backends/ILUVATAR/setup.py
new file mode 100755
index 000000000..a92c43ba6
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/setup.py
@@ -0,0 +1,27 @@
+from typing import Any, Dict
+
+from llm_perf.core.scheduler import CoreScheduler
+from llm_perf.backends.ILUVATAR.iluvatar_engine import IluvatarEngine
+from llm_perf.backends.ILUVATAR.iluvatar_sampler import IluvatarSampler
+from llm_perf.backends.ILUVATAR.iluvatar_scheduler import IluvatarScheduler
+from llm_perf.utils.logger import logger
+
+def setup_scheduler(
+    model_config: Dict[str, Any], 
+    pad_token_id, max_batch_size, 
+    **kwargs
+) -> CoreScheduler:
+    """Build the Iluvatar engine and sampler and wire them into a scheduler.
+
+    ``pad_token_id`` goes to the engine and ``max_batch_size`` to the
+    scheduler; extra keyword arguments are accepted but not used here.
+    """
+    backend = IluvatarEngine(model_config, pad_token_id)
+    token_sampler = IluvatarSampler()
+
+    # The scheduler receives both collaborators plus the batch-size cap.
+    return IluvatarScheduler(
+        engine=backend,
+        sampler=token_sampler,
+        max_batch_size=max_batch_size,
+    )

From 5a93a12bfe5204ef70bb7c8ac8a3d6124fb883ba Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Mon, 13 May 2024 10:20:42 +0800
Subject: [PATCH 12/28] adding the vllm-framework

---
 .../backends/ILUVATAR/README_LLM.zh_CN.md     | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100755 byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md

diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md b/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md
new file mode 100755
index 000000000..5aef4f2e0
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md
@@ -0,0 +1,26 @@
+"""
+    ***************************大模型操作流程-VLLM框架********************
+    说明：
+        此部分代码未侵入框架代码，由于vllm框架未实现精度测试，因此精度测试可以沿用GPU的backends；其次，vllm的tp定义目前与框架定义的tp含义不一样，
+        因此chatglm2、llama2模型的workloads配置里面的tp=2暂时不考虑，待后续商定好解决方案再继续
+
+    环境准备：
+        需要提前下载天数智芯适配的vllm安装包到测试环境下，为了方便看输出日志，省掉不必要的信息，安装完毕后，请注释掉：
+        /usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 内部函数async def add_request 下面的logger.info输出日志
+
+    数据集模型准备：
+        bash prepare_model.sh chatglm2-torch-fp16-6b 注意这里会把chatglm、chatglm2、llama2的数据集、模型都下载下来，我们只需要关注chatglm2、llama2模型，
+	在模型这两个模型放到modelzoo/sota 目录下
+
+    测试开始：
+
+    cd ByteMLPerf/byte_infer_perf
+        
+    1、chatglm2模型：
+        执行：python3 llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type ILUVATAR 
+        生成的测试报告位置：llm_perf/reports/ILUVATAR/chatglm2-torch-fp16-6b
+    
+    2、llama2模型：
+        执行：python3 llm_perf/launch.py --task chinese-llama2-torch-fp16-13b --hardware_type ILUVATAR
+        生成的测试报告位置：llm_perf/reports/ILUVATAR/chinese-llama2-torch-fp16-13b
+"""

From 0f0fcc2f1f56ba226f131c88604eea8cc8ddeb24 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Mon, 13 May 2024 10:24:36 +0800
Subject: [PATCH 13/28] update

---
 .../llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md           | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md b/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md
index 5aef4f2e0..78a1ca23f 100755
--- a/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md
+++ b/byte_infer_perf/llm_perf/backends/ILUVATAR/README_LLM.zh_CN.md
@@ -9,8 +9,9 @@
         /usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 内部函数async def add_request 下面的logger.info输出日志
 
     数据集模型准备：
-        bash prepare_model.sh chatglm2-torch-fp16-6b 注意这里会把chatglm、chatglm2、llama2的数据集、模型都下载下来，我们只需要关注chatglm2、llama2模型，
-	在模型这两个模型放到modelzoo/sota 目录下
+        bash prepare_model.sh chatglm2-torch-fp16-6b 
+	bash prepare_model.sh chinese-llama2-torch-fp16-13b
+	将这两个模型放到modelzoo/sota 目录下
 
     测试开始：
 

From 11226477ae7f930ae7cc1024765d00ead76ef1b2 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Mon, 13 May 2024 19:12:00 +0800
Subject: [PATCH 14/28] cudart.cudaDeviceSynchronize()

---
 .../backends/ILUVATAR/runtime_backend_iluvatar.py             | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 2f97bed40..b96c64259 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -219,8 +219,10 @@ def predict(self, feeds):
                             cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
                 )
             
+            cudart.cudaDeviceSynchronize()
             starttime = time.time()
             context.execute_v2(allocations)
+            cudart.cudaDeviceSynchronize()
             endtime = time.time()
 
             self.predict_time = endtime - starttime
@@ -449,4 +451,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
\ No newline at end of file
+            raise ValueError("Please provide input type")

From 3b3f1910df7990cccd2e23d4516a8d0c4e4f624c Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 16 May 2024 15:21:22 +0800
Subject: [PATCH 15/28] update readme

---
 .../backends/ILUVATAR/README.zh_CN.md         | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 6caa35876..c6a2fef32 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -267,4 +267,86 @@
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task clip-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/clip-onnx-fp32
+"""
+
+
+"""
+    ***************************大模型操作流程-VLLM框架********************
+    说明：
+        此部分代码未侵入框架代码，由于vllm框架未实现精度测试，因此精度测试可以沿用GPU的backends；其次，vllm的tp定义目前与框架定义的tp含义不一样，
+        因此chatglm2、llama2模型的workloads配置里面的tp=2暂时不考虑，待后续商定好解决方案再继续
+
+    环境准备：
+        需要提前下载天数智芯适配的vllm安装包到测试环境下，为了方便看输出日志，省掉不必要的信息，安装完毕后，请注释掉：
+        /usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 内部函数async def add_request 下面的logger.info输出日志
+
+    测试开始：
+
+    cd ByteMLPerf/byte_infer_perf
+        
+    1、chatglm2模型：
+        执行：python3 llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type ILUVATAR 
+        生成的测试报告位置：llm_perf/reports/ILUVATAR/chatglm2-torch-fp16-6b
+    
+    2、llama2模型：
+        执行：python3 llm_perf/launch.py --task chinese-llama2-torch-fp16-13b --hardware_type ILUVATAR
+        生成的测试报告位置：llm_perf/reports/ILUVATAR/chinese-llama2-torch-fp16-13b
+"""
+
+
+"""
+    **************************部分小模型的int8精度推理测试************************
+    说明：
+        字节目前想验证部分小模型的int8精度推理的性能，因此需要基于ixrt（tensorrt）推理引擎进行适配支持
+        目前需要验证的小模型包括：resnet50、yolov5、widedeep、bert
+    
+    环境准备：不需要特别准备，之前如果测试过小模型的性能，相关的环境已经存在了；
+
+    测试开始：
+
+    cd ByteMLPerf/byte_infer_perf
+
+    1、bert模型：
+        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
+
+        下载方式：
+            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
+            cd yudefu  get quantized_Resnet50.onnx  exit退出
+            mv quantized_Resnet50.onnx general_perf/model_zoo/regular/open_resnet50
+
+        代码更改：
+            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
+            build_config.set_flag(tensorrt.BuilderFlag.INT8)
+
+            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 第118行添加以下的代码：
+            onnx_model_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50.onnx"
+            engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
+
+            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py load函数部分添加以下的代码（大概在370行）：
+            engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
+
+    2、yolov5模型：
+        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
+
+        下载方式：
+            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
+            cd yudefu  get quantized_yolov5s.onnx  exit退出
+            mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/
+
+        代码更改：
+            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
+            build_config.set_flag(tensorrt.BuilderFlag.INT8)
+
+            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 第118行添加以下的代码：
+            onnx_model_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s.onnx"
+            engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
+
+            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py load函数部分添加以下的代码（大概在359行）：
+            engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
 """
\ No newline at end of file

From 548a1b90dad32cfcfeca9595d8e04e06db1b2824 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Tue, 21 May 2024 14:15:12 +0800
Subject: [PATCH 16/28] update code 0521

---
 .../backends/ILUVATAR/README.zh_CN.md         |  56 ++++-
 .../ILUVATAR/compile_backend_iluvatar.py      |  11 +-
 .../backends/ILUVATAR/requirements.txt        |  21 +-
 .../ILUVATAR/runtime_backend_iluvatar.py      | 210 ++++++++++++++++--
 byte_infer_perf/llm_perf/launch.py            |   2 +-
 5 files changed, 257 insertions(+), 43 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index c6a2fef32..c4f0e9dbc 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -299,6 +299,9 @@
     说明：
         字节目前想验证部分小模型的int8精度推理的性能，因此需要基于ixrt（tensorrt）推理引擎进行适配支持
         目前需要验证的小模型包括：resnet50、yolov5、widedeep、bert
+
+        注意如果在测试bert的int8推理时，报错，可能是sdk、ixrt版本问题导致；需要升级；
+        生成的报告，并没有更改里面的精度标识，这里只是给出一个测试case，因此并没有将这部分代码加到代码中
     
     环境准备：不需要特别准备，之前如果测试过小模型的性能，相关的环境已经存在了；
 
@@ -306,7 +309,7 @@
 
     cd ByteMLPerf/byte_infer_perf
 
-    1、bert模型：
+    1、resnet50 模型：
         模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
 
         下载方式：
@@ -318,17 +321,19 @@
             1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
             build_config.set_flag(tensorrt.BuilderFlag.INT8)
 
-            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 第118行添加以下的代码：
+            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 添加以下的代码：
             onnx_model_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50.onnx"
             engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
+            （在 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize) 前面加上面两行）
 
-            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py load函数部分添加以下的代码（大概在370行）：
+            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 最后一个else 添加以下的代码：
             engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
+            （注释掉 engine_path = os.path.dirname(model_path) + "/" + model + ".engine"）
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
 
-    2、yolov5模型：
+    2、yolov5 模型：
         模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
 
         下载方式：
@@ -340,13 +345,52 @@
             1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
             build_config.set_flag(tensorrt.BuilderFlag.INT8)
 
-            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 第118行添加以下的代码：
+            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 添加以下的代码：
             onnx_model_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s.onnx"
             engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
+           （在 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize) 前面加上面两行）
 
-            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py load函数部分添加以下的代码（大概在359行）：
+            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 添加以下的代码：
             engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
+           （在 if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5': 下面添加；
+             注释掉：engine_path = model_path.split(".")[0] + "_end.engine"）
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+
+    3、bert 模型：
+        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型；该模型直接拿生成好的engine进行推理
+
+        下载方式：
+            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
+            cd yudefu  get bert_zijie_int8_b196.engine  exit退出
+            mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/
+
+        代码更改：
+            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
+            build_config.set_flag(tensorrt.BuilderFlag.INT8)
+
+            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 做以下操作：
+            注释掉 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+            因为这里直接加载已经生成的engine，不需要进行compile生成；这里可以加一个输出：
+                print("\n****bert-int8推理直接采用加载生成好的engine, 不需要进行编译！****") 看程序走到哪里
+
+            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 添加以下的代码：
+            engine_path = "general_perf/model_zoo/regular/open_bert/bert_zijie_int8_b196.engine"
+           （在 elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+             注释掉：engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine"）
+
+             第二个还需要修改函数 predict_dump 以下四行代码：
+             input_shape = input_tensors[i].shape
+             input_idx = engine.get_binding_index(input_name)
+             context.set_binding_shape(input_idx, Dims(input_shape))
+             i += 1
+             更改为：
+             input_shape = input_tensors[i].shape
+             for binding in range(3):
+                 context.set_binding_shape(binding, Dims(input_shape))
+             i += 1
+
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32
 """
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 8c20da5db..93590c0d6 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -80,15 +80,6 @@ def compile(self, configs, dataloader=None):
             engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"    
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
         
-        # elif model_name == 'roformer':
-        #     # onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-bs32.onnx"
-        #     # engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(32) + ".engine"
-        #     # build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=32)
-        #     for bs in configs['workload']['batch_sizes']:
-        #         onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-bs32_bak.onnx"
-        #         engine_paths = "general_perf/general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(bs) + ".engine" 
-        #         build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_paths, MaxBatchSize=bs)
-        
         elif model_name == 'conformer':
             onnx_model_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end.onnx"
             engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
@@ -96,7 +87,7 @@ def compile(self, configs, dataloader=None):
 
         elif model_name == 'deberta':
             onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"    
+            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end" + ".engine"    
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
         elif model_name == 'gpt2':
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
index 5c4a8abca..396998600 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
@@ -1,4 +1,19 @@
-transformers==4.35.2
-datasets==2.14.7
 onnx==1.15.0
-pandas==2.1.3
\ No newline at end of file
+onnxsim==0.4.36
+tf2onnx==1.16.1
+onnxruntime
+onnxoptimizer==0.3.13
+bert-tensorflow==1.0.1
+
+pandas==2.1.1
+numpy==1.23.0
+matplotlib
+scikit-learn
+opencv-python==4.6.0.66
+opencv-python-headless
+tokenization==1.0.7
+tokenizers==0.13.3
+sentencepiece==0.1.96
+typing_extensions==4.10.0
+
+py-libnuma==1.2
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index b96c64259..d8abd805d 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -24,9 +24,10 @@
 from general_perf.backends import runtime_backend
 from general_perf.backends.ILUVATAR.common import init_by_tensorrt, setup_io_bindings
 from general_perf.backends.ILUVATAR.utils import get_target
-from general_perf.backends.ILUVATAR.common import Task, TaskThread, _cudaGetErrorEnum, checkCudaErrors
+from general_perf.backends.ILUVATAR.common import Task, TaskThread
 from tensorrt import Dims
 from cuda import cuda, cudart
+import numa
 
 from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
 load_ixrt_plugin()
@@ -66,6 +67,11 @@ def __init__(self):
         self.predict_fps = None
         self.predict_time = None
         self.task = None
+        self.inputs = None
+        self.outputs = None
+        self.allocations = None
+        numa.memory.set_local_alloc()
+        numa.schedule.run_on_nodes(0)
 
     def isSDmodel(self, model_name):
         result = False
@@ -117,6 +123,159 @@ def benchmark(self, dataloader):
                     merged_dict["predict P99 Latency"] = predict_p99_latency
                 
         return merged_dict
+    
+    def init_allocs(self):
+        # Free the cached device buffers so the next get_allocs() call rebuilds the bindings.
+        if self.inputs is not None:
+            for group in (self.inputs, self.outputs):
+                for binding in group:
+                    err, = cudart.cudaFree(binding["allocation"])
+                    assert err == cudart.cudaError_t.cudaSuccess
+            # Clear every cached handle so no freed device pointer stays reachable.
+            self.inputs = None
+            self.outputs = self.allocations = None
+
+    def get_allocs(self):
+        if self.inputs is None:  # bindings are created once and reused until init_allocs() clears self.inputs
+            self.inputs, self.outputs, self.allocations = setup_io_bindings(self.engine, self.context)
+        return self.inputs, self.outputs, self.allocations
+
+    def predict_dump(self, feeds):
+        """Set the engine's input binding shapes for ``feeds`` and build host-side input/output buffers."""
+        input_tensors = []
+        i = 0
+        model_name = self.configs["model"].split("-")[0]
+        # NOTE(review): for 'gpt2' nothing below runs, so the final return references unassigned names.
+        if model_name != 'gpt2':
+            if model_name == 'deberta':
+                keys = list(feeds.keys())
+                input_ids = feeds[keys[0]]
+                attention_mask = feeds[keys[1]]
+                input_tensors = [input_ids, attention_mask]  # only the first two feeds are used
+
+            else:
+                for key, _ in feeds.items():
+                    input_tensors.append(feeds[key])
+                    i += 1
+
+            # ixrt inference
+            engine = self.engine
+            assert engine
+            context = self.context
+            assert context
+
+            # set dynamic shape
+            input_tensor_map = self.configs["segments"][0]["input_tensor_map"]
+            input_shape = input_tensor_map.values()  # reassigned before each set_binding_shape call below
+
+            i = 0
+            for input_name, _ in input_tensor_map.items():
+                if model_name == 'widedeep':
+                    # NOTE(review): appended on every outer iteration — confirm this is intended
+                    input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32))
+                    input_names = [
+                        "new_categorical_placeholder:0",
+                        "new_numeric_placeholder:0",
+                        "import/head/predictions/zeros_like:0"
+                    ]
+                    for input_name in input_names:  # shadows the outer loop variable
+                        if input_name == 'new_categorical_placeholder:0':
+                            input_shape = input_tensors[0].shape
+                        if input_name == 'new_numeric_placeholder:0':
+                            input_shape = input_tensors[1].shape
+                        if input_name == 'import/head/predictions/zeros_like:0':
+                            input_shape = input_tensors[2].shape
+                    
+                        input_idx = engine.get_binding_index(input_name)
+                        context.set_binding_shape(input_idx, Dims(input_shape))
+
+                elif model_name == 'deberta':
+                    input_names = [
+                        "input_ids.1",
+                        "attention_mask.1",
+                    ]
+                    for input_name in input_names:  # shadows the outer loop variable
+                        if input_name == 'input_ids.1':
+                            input_shape = input_tensors[0].shape
+                        if input_name == 'attention_mask.1':
+                            input_shape = input_tensors[1].shape
+                    
+                        input_idx = engine.get_binding_index(input_name)
+                        context.set_binding_shape(input_idx, Dims(input_shape))
+
+                else:
+                    input_shape = input_tensors[i].shape  # i-th feed pairs with the i-th map entry
+                    input_idx = engine.get_binding_index(input_name)
+                    context.set_binding_shape(input_idx, Dims(input_shape))
+                    i += 1
+            
+            # Setup I/O bindings
+            inputs, outputs, allocations = self.get_allocs()
+
+            # Prepare the output data
+            outputs_list = []
+            for i in range(len(outputs)):
+                output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
+                outputs_list.append(output)
+
+            data_batch_list = []
+            for i in range(len(input_tensors)):
+                data_batch = np.ascontiguousarray(input_tensors[i])
+                data_batch_list.append(data_batch)
+
+        return input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list
+
+    def predict_timing(self, input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list):
+        """Copy inputs to the device, time ``execute_v2``, copy outputs back and map them to output names."""
+        model_name = self.configs["model"].split("-")[0]
+        # H2D: host to device
+        for i in range(len(inputs)):
+            (err, ) = cudart.cudaHostRegister(data_batch_list[i], inputs[i]["nbytes"], 2)
+        # NOTE(review): err is captured by every CUDA call in this method but never checked.
+        for i in range(len(inputs)):
+            (err, ) = cudart.cudaMemcpy(
+                        inputs[i]["allocation"],
+                        data_batch_list[i],
+                        inputs[i]["nbytes"],
+                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
+            )
+
+        for i in range(len(inputs)):
+            (err, ) = cudart.cudaHostUnregister(data_batch_list[i])
+        # Only the execute_v2 call sits inside the timed window.
+        starttime = time.time()
+        context.execute_v2(allocations)
+        endtime = time.time()
+
+        self.predict_time = endtime - starttime
+        
+        # D2H: device to host
+        for i in range(len(outputs)):
+            (err, )= cudart.cudaMemcpy(outputs_list[i], 
+                        outputs[i]["allocation"], 
+                        outputs[i]["nbytes"], 
+                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+            )
+        
+        result = {}
+
+        output_tensor_map = self.configs["segments"][0]["output_tensor_map"]
+        output_name = output_tensor_map.split(",")  # comma-separated output names from the config
+
+        for i in range(len(output_name)):
+            if model_name == 'yolov5':  # yolov5 keeps only the first output under the first name
+                result[output_name[0]] = outputs_list[0]
+                break
+
+            result[output_name[i]] = outputs_list[i]
+        
+        if model_name == 'videobert':  # videobert gets the raw output list instead of the dict
+            return outputs_list
+        
+        elif model_name == 'gpt2':
+            return None
+        
+        else:
+            return result
 
     def predict(self, feeds):
         # The deberta model is currently unable to undergo accuracy testing temporarily
@@ -130,8 +289,10 @@ def predict(self, feeds):
                                     dtype=pt_dtype_map[self.input_type[i]])
                 input_tensors.append(tmp_tensor)
                 i += 1
+
             self.predict_sd(input_tensors)
             return
+        
         elif model_name != 'gpt2':
             if model_name == 'deberta':
                 keys = list(feeds.keys())
@@ -197,7 +358,7 @@ def predict(self, feeds):
                     i += 1
             
             # Setup I/O bindings
-            inputs, outputs, allocations = setup_io_bindings(engine, context)
+            inputs, outputs, allocations = self.get_allocs()
 
             # Prepare the output data
             outputs_list = []
@@ -211,6 +372,9 @@ def predict(self, feeds):
                 data_batch_list.append(data_batch)
 
             # H2D: host to device
+            for i in range(len(inputs)):
+                (err, ) = cudart.cudaHostRegister(data_batch_list[i], inputs[i]["nbytes"], 2)
+
             for i in range(len(inputs)):
                 (err, ) = cudart.cudaMemcpy(
                             inputs[i]["allocation"],
@@ -219,10 +383,11 @@ def predict(self, feeds):
                             cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
                 )
             
-            cudart.cudaDeviceSynchronize()
+            for i in range(len(inputs)):
+                (err, ) = cudart.cudaHostUnregister(data_batch_list[i])
+
             starttime = time.time()
             context.execute_v2(allocations)
-            cudart.cudaDeviceSynchronize()
             endtime = time.time()
 
             self.predict_time = endtime - starttime
@@ -237,13 +402,7 @@ def predict(self, feeds):
             
             # Free Gpu Memory
             # cuda-python
-            for i in range(len(inputs)):
-                err, = cudart.cudaFree(inputs[i]["allocation"])
-                assert err == cudart.cudaError_t.cudaSuccess
-
-            for i in range(len(outputs)):
-                err, = cudart.cudaFree(outputs[i]["allocation"])
-                assert err == cudart.cudaError_t.cudaSuccess
+            self.init_allocs()
             
             result = {}
 
@@ -288,22 +447,29 @@ def benchmark_interact(self, dataloader):
             self.load_igie(batch_size)
         elif self.isSDmodel(self.configs["model"]):
             self.load_sd(batch_size)   
-
-         
+    
         test_data = self._get_fake_samples(batch_size=batch_size,
                         shape=self.configs['segments'][0]['input_tensor_map'],
                         input_type=self.configs['input_type'])
+        
+        # Free Gpu Memory
+        # cuda-python
+        self.init_allocs()
 
         for _ in range(30):
             self.predict(test_data)
 
         for _ in range(iterations):
+            input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list = self.predict_dump(test_data)
+
             start_time = time.time()
-            self.predict(test_data)
+            self.predict_timing(input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list)
             end_time = time.time()
+
             times_range.append(end_time - start_time)
-            predict_range.append(self.predict_time)
+            predict_range.append(self.predict_time)           
 
+            
         times_range.sort()
         tail_latency = round(
             times_range[int(len(times_range) * 0.99)] * 1000, 2)
@@ -321,9 +487,9 @@ def benchmark_interact(self, dataloader):
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
             format(self.batch_size, qps, avg_latency, tail_latency))
         
-        log.info(
-            'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'.
-            format(self.batch_size, fps, predict_avg_latency, tail_latency))
+        # log.info(
+        #     'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'.
+        #     format(self.batch_size, fps, predict_avg_latency, tail_latency))
 
 
         report['QPS'] = qps
@@ -350,6 +516,7 @@ def load(self, batch_size) -> None:
         if model_name == 'gpt2':
             self.batch_size = batch_size
             return
+        
         elif self.isSDmodel(model):
             self.batch_size = batch_size
             #self.load_sd(batch_size)
@@ -377,11 +544,8 @@ def load(self, batch_size) -> None:
         if model_name == 'conformer':
             engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
         
-        # if model_name == 'roformer':
-        #     engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen-sim-modified-" + str(batch_size) + ".engine" 
-        
         if model_name == 'deberta':
-            engine_path = "general_perf/model_zoo/popular/open_conformer/deberta-base-squad-sim_end" + ".engine"   
+            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end" + ".engine"   
 
         engine, context = init_by_tensorrt(engine_path)
 
@@ -451,4 +615,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
+            raise ValueError("Please provide input type")
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/launch.py b/byte_infer_perf/llm_perf/launch.py
index 42e7fbc85..a753ce0ee 100644
--- a/byte_infer_perf/llm_perf/launch.py
+++ b/byte_infer_perf/llm_perf/launch.py
@@ -22,7 +22,7 @@
 
 
 # ${prj_root}/
-BYTE_MLPERF_ROOT = os.path.byte_infer_perfdirname(os.path.dirname(os.path.abspath(__file__)))
+BYTE_MLPERF_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 os.chdir(BYTE_MLPERF_ROOT)
 sys.path.insert(0, BYTE_MLPERF_ROOT)
 

From d8b8ca23700b5a2194e3bbd79f4a47fdc238547d Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Wed, 22 May 2024 11:37:03 +0800
Subject: [PATCH 17/28] update readme

---
 .../general_perf/backends/ILUVATAR/README.zh_CN.md        | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index c4f0e9dbc..9e527d38e 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -364,7 +364,7 @@
         下载方式：
             sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
             cd yudefu  get bert_zijie_int8_b196.engine  exit退出
-            mv quantized_yolov5s.onnx general_perf/model_zoo/regular/open_bert/
+            mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/
 
         代码更改：
             1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
@@ -391,6 +391,12 @@
                  context.set_binding_shape(binding, Dims(input_shape))
             i += 1
 
+            第三需要更改的地方：将函数predict_timing 里面的 result[output_name[i]] = outputs_list[i] 改成：result[output_name[i]] = outputs_list[0]
+
+            精度测试时还需要更改下面的地方：函数predict 里面的 result[output_name[i]] = outputs_list[i] 改成：
+                result[output_name[0]] = outputs_list[0][:,:,0]
+                result[output_name[1]] = outputs_list[0][:,:,1]
+
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32
 """
\ No newline at end of file

From de18cd6363197c7cfd61bf8f6cad2fc0eb609101 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 23 May 2024 15:28:56 +0800
Subject: [PATCH 18/28] update code 0523

---
 .../backends/ILUVATAR/README.zh_CN.md         | 112 ++++++------------
 .../general_perf/backends/ILUVATAR/common.py  |   7 +-
 .../ILUVATAR/compile_backend_iluvatar.py      |  54 +++++----
 .../ILUVATAR/runtime_backend_iluvatar.py      |  19 ++-
 4 files changed, 72 insertions(+), 120 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 9e527d38e..0d35e41d9 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -9,11 +9,8 @@
         2、在天数智芯BI-150显卡上，调用推理引擎tensorrt进行推理，一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化；
     
     环境准备：
-        1、sdk版本：由天数智芯工程师提供
+        1、sdk版本： 由天数智芯工程师提供
         2、ixrt版本：由天数智芯工程师提供
-
-    遗留问题：
-        1、roformer模型暂时还不支持动态shape推理，因此本次暂不提交
 """
 
 
@@ -51,28 +48,20 @@
 
     cd ByteMLPerf/byte_infer_perf
 
-    1、bert模型：
-        测试过程中如果缺少：dev-v1.1.json、vocab.txt，按照下面的操作进行下载
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/open_squad ; get dev-v1.1.json; get vocab.txt
-                 exit
-
-        移动：mv dev-v1.1.json vocab.txt general_perf/datasets/open_squad/;
+    备注：由于sftp机器崩溃，文件全部丢失，因此已有的获取数据方式可能不存在了
 
+    1、bert模型：
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32/
 
     2、albert模型：
         测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
         
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get albert.rar
+        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果连接不上用ip替换：10.160.20.60）  密码：123..com
+                 get /upload/3-app/byteperf/madlag.tar
+                 tar -zxvf madlag.tar
                  exit
 
-        mkdir -p madlag/albert-base-v2-squad;
-        解压：unrar x albert.rar madlag/albert-base-v2-squad;
-
         接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
         AutoTokenizer.from_pretrained("madlag/albert-base-v2-squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/madlag/albert-base-v2-squad")  (注意绝对路径根据实际情况修改，需要在ByteMLPerf前面在加一个当前目录最上层的路径，下同)
 
@@ -82,24 +71,22 @@
     3、debert模型：
         测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
 
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get deberta.rar
+        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果连接不上用ip替换：10.160.20.60）  密码：123..com
+                 get /upload/3-app/byteperf/Palak.tar
+                 tar -zxvf Palak.tar
                  exit
 
-        mkdir -p Palak/microsoft_deberta-base_squad;
-        解压：unrar x deberta.rar Palak/microsoft_deberta-base_squad;
-
         接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
         AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/Palak/microsoft_deberta-base_squad")
 
-        给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-base-squad-sim_end.onnx
+        给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-sim-drop-clip-drop-invaild-cast.onnx
         将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
 
         下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get deberta-base-squad-sim_end.onnx
+                 cd yudefu/bytedance_perf ; get deberta-sim-drop-clip-drop-invaild-cast.onnx
                  exit
         
-        移动：mv deberta-base-squad-sim_end.onnx general_perf/model_zoo/popular/open_deberta/
+        移动：mv deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/deberta-torch-fp32/
@@ -107,13 +94,11 @@
     4、roberta模型：
         测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
 
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get roberta.rar
+        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果连接不上用ip替换：10.160.20.60）  密码：123..com
+                 get /upload/3-app/byteperf/csarron.tar
+                 tar -zxvf csarron.tar
                  exit
 
-        mkdir -p csarron/roberta-base-squad-v1;
-        解压：unrar x roberta.rar csarron/roberta-base-squad-v1;
-
         接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
         AutoTokenizer.from_pretrained("csarron/roberta-base-squad-v1") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/csarron/roberta-base-squad-v1")
 
@@ -121,48 +106,23 @@
         生成的测试报告位置：general_perf/reports/ILUVATAR/roberta-torch-fp32/
 
     5、videobert模型：
-        测试过程中如果在 open_cifar 数据集中缺少某些文件，可以按照下面的操作进行下载
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/open_cifar ; get cifar-100-python.tar.gz
-                 exit
-
-        解压：tar -zxvf cifar-100-python.tar.gz； mv cifar-100-python general_perf/datasets/open_cifar
-
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/videobert-onnx-fp32
     
     6、widedeep模型：
-        测试过程中如果在 open_criteo_kaggle 数据集中缺少：eval.csv、categorical.npy、label.npy、numeric.npy，可以按照下面的操作进行下载
-        （根据缺少的文件进行下载即可，不需要的可以不下载，下同）
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/open_criteo_kaggle ; get eval.csv； get categorical.npy；get label.npy； get numeric.npy
-                 exit
-
-        移动：mv eval.csv categorical.npy label.npy numeric.npy general_perf/datasets/open_criteo_kaggle;
-
-        该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape.onnx；
+        该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape_new.onnx；
         将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
 
         下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get widedeep_dynamicshape.onnx
+                 cd yudefu/bytedance_perf ; get widedeep_dynamicshape_new.onnx
                  exit
         
-        移动：mv widedeep_dynamicshape.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+        移动：mv widedeep_dynamicshape_new.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
         
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
 
     7、swin-transformer模型：
-        测试过程中如果缺少：open_imagenet下面相关的文件或者数据集，按照下面的操作进行下载
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/open_imagenet ; get ILSVRC2012_img_val.tar.gz; get val_map.txt
-                 exit
-        
-        解压：tar -zxvf ILSVRC2012_img_val.tar.gz；mv ILSVRC2012_img_val val_map.txt general_perf/datasets/open_imagenet
-
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/swin-large-torch-fp32
 
@@ -175,30 +135,18 @@
         生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
 
     10、conformer模型：
-        该onnx模型的transpose算子实现逻辑需要特殊处理；采用处理好的onnx模型：conformer_encoder_optimizer_end.onnx
-        将其放到：general_perf/model_zoo/popular/open_conformer/ 
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/ ; get conformer_encoder_optimizer_end.onnx
-                 exit
-        
-        移动：mv conformer_encoder_optimizer_end.onnx general_perf/model_zoo/popular/open_conformer/ 
-        
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
 
     11、roformer模型：
-        该模型暂时没有解决，等待后续解决了再提供测试说明
-
-        测试过程中如果缺少：open_cail2019下面相关的文件或者数据集，按照下面的操作进行下载
+        该模型经过了特殊的处理，需要采用处理好的onnx模型：roformer_frozen.onnx；
+        将其放到：general_perf/model_zoo/popular/open_roformer/ 
 
         下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd files/yudefu/open_cail2019 ; get batch_segment_ids.npy； get batch_token_ids.npy； 
-                    get label.py； get test.json；get vocab.txt
-                  exit
-
-        移动：mv batch_segment_ids.npy batch_token_ids.npy label.py test.json vocab.txt general_perf/datasets/open_cail2019
-
+                 cd yudefu/bytedance_perf ; get roformer_frozen.onnx
+                 exit
+        
+        移动：mv roformer_frozen.onnx general_perf/model_zoo/popular/open_roformer/ 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/roformer-tf-fp32
 
@@ -314,7 +262,9 @@
 
         下载方式：
             sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu  get quantized_Resnet50.onnx  exit退出
+            cd yudefu/bytedance_perf  
+            get quantized_Resnet50.onnx  
+            exit
             mv quantized_Resnet50.onnx general_perf/model_zoo/regular/open_resnet50
 
         代码更改：
@@ -338,7 +288,9 @@
 
         下载方式：
             sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu  get quantized_yolov5s.onnx  exit退出
+            cd yudefu/bytedance_perf  
+            get quantized_yolov5s.onnx 
+            exit
             mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/
 
         代码更改：
@@ -363,7 +315,9 @@
 
         下载方式：
             sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu  get bert_zijie_int8_b196.engine  exit退出
+            cd yudefu/bytedance_perf  
+            get bert_zijie_int8_b196.engine  
+            exit
             mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/
 
         代码更改：
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
index 9d9a1a5d9..1b871ec13 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -89,9 +89,9 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
         
     elif model_name == 'roformer':
         profile.set_shape(
-            "input_segment:0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
+            "input_segment0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
         profile.set_shape(
-            "input_token:0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
+            "input_token0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024]))
         
     elif model_name == 'swin':
         profile.set_shape(
@@ -148,7 +148,7 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
         
         elif model_name == 'roformer':
             input_tensor = network.get_input(i)
-            input_tensor.shape = Dims([32, 1024])
+            input_tensor.shape = Dims([-1, 1024])
 
         elif model_name == 'swin':
             input_tensor = network.get_input(i)
@@ -286,4 +286,3 @@ def checkCudaErrors(result):
         return result[1]
     else:
         return result[1:]
-
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 93590c0d6..2391b9123 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -54,15 +54,12 @@ def compile(self, configs, dataloader=None):
             onnx_model_path = model_path.split(".")[0] + "_end.onnx"
             engine_path = model_path.split(".")[0] + "_end.engine"
 
-        elif model_name == 'widedeep':
+        elif model_name == 'widedeep' or model_name == 'roformer':
             onnx_model_path = model_path + "/" + model + "_end.onnx"
             engine_path = model_path + "/" + model + "_end.engine"
-        
-        elif model_name == 'roformer':
-            onnx_model_path = model_path + "/" + model + ".onnx"
-            engine_path = model_path + "/" + model + ".engine"
 
-        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \
+             or model_name == 'resnet50':
             onnx_model_path = os.path.dirname(model_path) + "/" + model + "_end.onnx"
             engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine"
         
@@ -71,23 +68,22 @@ def compile(self, configs, dataloader=None):
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
 
         # model preprocessing
-        if model_name != 'deberta':
-            self.get_onnx(configs)
+        self.get_onnx(configs)
 
         # build engine
         if model_name == 'widedeep':
-            onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape.onnx"
-            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"    
-            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
-        
-        elif model_name == 'conformer':
-            onnx_model_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
+            onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new.onnx"
+            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new" + ".engine"    
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
         elif model_name == 'deberta':
-            onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end" + ".engine"    
+            onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end.onnx"
+            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"    
+            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+
+        elif model_name == 'roformer':
+            onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end.onnx"
+            engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"    
             build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
 
         elif model_name == 'gpt2':
@@ -193,19 +189,19 @@ def get_onnx(self, configs):
             print("***Convert pb model to onnx model success!***")
 
         # Convert ONNX model to plugin operator model: Support fusion of dynamic and static graphs
-        """
-            *********************待处理问题记录: 后续会更新进展************************
-            conformer 模型不能利用optimizer.py脚本转换, 因为attention比较特殊, 利用处理好的onnx模型进行测试;
-            roformer  模型目前没有实现通过加载固定shape的onnx, 生成不同的batch的engine实现动态shape推理;
-            widedeep  模型目前对原始的onnx暂时不支持直接动态shape推理, 对模型做了一系列处理, 并且不需要进行optimizer.py脚本处理, 直接加载处理好的onnx模型;
-        """        
-        if model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or \
-            model_name == 'videobert':
+        if model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or \
+            model_name == 'videobert' or model_name == 'resnet50' or model_name == 'widedeep':
             
             cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path}'
             subprocess.call(cmd, shell=True)
             print("***Convert onnx model to plugin operator model success!***")
 
+        elif model_name == 'deberta':
+            onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast.onnx"
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path}'
+            subprocess.call(cmd, shell=True)
+            print("***Convert onnx model to plugin operator model success!***")
+
         elif model_name == 'swin':
             cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type swint'
             subprocess.call(cmd, shell=True)
@@ -217,7 +213,13 @@ def get_onnx(self, configs):
             print("***Convert onnx model to plugin operator model success!***")
 
         elif model_name == 'roformer':
-            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type roformer'
+            onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen.onnx"
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type roformer --input_shapes input_segment0:bsx1024,input_token0:bsx1024'
+            subprocess.call(cmd, shell=True)
+            print("***Convert onnx model to plugin operator model success!***")
+
+        elif model_name == 'conformer':
+            cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type conformer --hidden_size 512 --num_heads 8'
             subprocess.call(cmd, shell=True)
             print("***Convert onnx model to plugin operator model success!***")
 
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index d8abd805d..d9c814941 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -525,27 +525,24 @@ def load(self, batch_size) -> None:
         if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
             engine_path = model_path.split(".")[0] + "_end.engine"
 
-        elif model_name == 'widedeep':
+        elif model_name == 'widedeep' or model_name == 'roformer':
             engine_path = model_path + "/" + model + "_end.engine"
-        
-        elif model_name == 'roformer':
-            engine_path = model_path + "/" + model + ".engine"
-        
-        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
+                
+        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \
+             or model_name == 'resnet50':
             engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
 
         else:
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
         
-        # **************to do*************
         if model_name == 'widedeep':      
-            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"
+            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new" + ".engine"
 
-        if model_name == 'conformer':
-            engine_path = "general_perf/model_zoo/popular/open_conformer/conformer_encoder_optimizer_end" + ".engine"    
+        if model_name == 'roformer':
+            engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"     
         
         if model_name == 'deberta':
-            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-base-squad-sim_end" + ".engine"   
+            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"
 
         engine, context = init_by_tensorrt(engine_path)
 

From 4725fde40617ef31813e885cf2e53a9d837bd9a2 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 23 May 2024 15:36:28 +0800
Subject: [PATCH 19/28] adding optimizer

---
 .../ILUVATAR/optimizer/dltest/README.md       |  44 ++
 .../optimizer/dltest/dltest/__init__.py       |   1 +
 .../optimizer/dltest/dltest/cli/__init__.py   |   0
 .../optimizer/dltest/dltest/cli/assert_cli.py | 209 +++++++
 .../optimizer/dltest/dltest/cli/check_cli.py  |  56 ++
 .../dltest/dltest/cli/entry_points.py         |  35 ++
 .../dltest/dltest/cli/fetch_log_cli.py        | 115 ++++
 .../dltest/dltest/cli/log_comparator_cli.py   |  69 +++
 .../dltest/dltest/cli/log_parser_cli.py       |  35 ++
 .../dltest/dltest/cli/model_validator_cli.py  | 153 +++++
 .../optimizer/dltest/dltest/log_comparator.py | 101 +++
 .../optimizer/dltest/dltest/log_parser.py     | 185 ++++++
 .../dltest/dltest/model_compare_config.py     | 306 ++++++++++
 .../optimizer/dltest/dltest/utils/__init__.py |   0
 .../optimizer/dltest/dltest/utils/base_cli.py |  44 ++
 .../optimizer/dltest/dltest/utils/get_env.py  |  65 ++
 .../optimizer/dltest/dltest/utils/iluvatar.py |  32 +
 .../dltest/dltest/utils/infer_args.py         |  87 +++
 .../optimizer/dltest/dltest/utils/misc.py     |  41 ++
 .../dltest/dltest/utils/real_tempfile.py      |  64 ++
 .../dltest/dltest/utils/subprocess_tools.py   |  84 +++
 .../ILUVATAR/optimizer/dltest/setup.py        |  27 +
 .../ILUVATAR/optimizer/onnx_model_bert.py     |  12 +-
 .../optimizer/onnx_model_conformer.py         | 576 ++++++++++++++++++
 .../ILUVATAR/optimizer/onnx_model_roformer.py | 540 ++++++++++++++++
 .../ILUVATAR/optimizer/onnx_model_yolo.py     |   2 +-
 .../backends/ILUVATAR/optimizer/optimizer.py  |  13 +-
 .../passes/fusion_conformer_attention.py      | 150 +++++
 .../passes/fusion_conformer_xsoftmax.py       | 129 ++++
 .../optimizer/passes/fusion_customfc.py       | 209 ++++---
 .../passes/fusion_format_roformer.py          |  64 +-
 .../optimizer/passes/fusion_layernorm.py      | 237 ++++++-
 .../optimizer/passes/fusion_options.py        |   3 +
 .../passes/fusion_roformer_attention.py       | 368 +++++++++++
 .../ILUVATAR/optimizer/passes/fusion_rope.py  |  83 +++
 .../optimizer/passes/fusion_splitQKV.py       | 109 ++++
 .../optimizer/passes/fusion_vit_attention.py  | 354 +++++++++++
 37 files changed, 4468 insertions(+), 134 deletions(-)
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py
 create mode 100755 byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
new file mode 100755
index 000000000..65175643c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
@@ -0,0 +1,44 @@
+## CI Test tool for IxRT
+
+### 1. Install dltest tool
+    
+    python setup.py develop
+
+### 2. Usage
+
+#### 2.1 Fetch log
+
+Command:
+
+```shell
+ixdltest-fetch args_or_pipe ${log_path}
+```
+
+Arguments:
+
+- p or patterns, The pattern of fetch log;
+- pn or pattern_names, The name of pattern;
+- use_re, Whether use regular expression;
+- d or nearest_distance, default=10, The nearest distance of matched pattern;
+- start_flag, The flag of start to record log;
+- end_flag, The flag of stop to record log;
+- split_pattern, The pattern used to match a line; if the line matches, `split_sep` is used to split it;
+- split_sep, The separator used to split the line;
+- split_idx, The index of split line;
+- saved, Save result to path;
+- log, Log path.
+
+Example
+Analyse from file
+```
+$ ixdltest-fetch run.log -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
+{'results': [{'Throughput': [188.5461778786721]}]}
+- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
+```
+
+Analyse from command line pipe
+```
+$ cat run.log | ixdltest-fetch -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
+{'results': [{'Throughput': [188.5461778786721]}]}
+- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
+```
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
new file mode 100644
index 000000000..5458f3166
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
@@ -0,0 +1 @@
+from .utils.infer_args import show_infer_arguments
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
new file mode 100644
index 000000000..ca6e197c0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+import os
+from typing import List, Iterable, Optional
+
+from dltest.cli.log_parser_cli import LogParserCLI
+from dltest.log_parser import LogParser
+from dltest.model_compare_config import get_compare_config_with_full_path
+from dltest.utils.misc import get_full_path
+from dltest.utils.subprocess_tools import get_output
+from dltest.model_compare_config import ComparatorConfig
+
+
+FRAMEWORKS = list(ComparatorConfig.get_frameworks())
+
+REMAINDER = '...'
+
+assertion_expr_factory = dict(
+    eq = "a == b",
+    ne = "a != b",
+    ge = "a >= b",
+    le = "a <= b",
+    gt = "a > b",
+    lt = "a < b",
+)
+
+
+class AssertCLI(LogParserCLI):
+
+    def command_name(self):
+        return "assert"
+
+    def predefine_args(self):
+        super(AssertCLI, self).predefine_args()
+        self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None,
+                                 help='It is used in assertion expression.')
+        self.parser.add_argument('--print_result', action="store_true", default=False,
+                                 help='Whether print result')
+        self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'],
+                                 help='The method of capture output')
+        # FIXME: Use a store_true action to replace this integer flag
+        self.parser.add_argument('--only_last', type=int, default=0,
+                                 help='Whether use the last result to compare')
+        self.parser.add_argument('--expr', type=str, default="ge",
+                                 help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" +
+                                 ", or a executable code, such as `a > b`, `a > 1`, ...")
+        self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False,
+                                 help='Whether use predefined args of parser.')
+        self.parser.add_argument('--log', type=str, default=None, help="Log path")
+        self.parser.add_argument("--run_script", default=[], nargs=REMAINDER)
+
+    def parse_args(self, *args, **kwargs):
+        """Parse CLI arguments and validate the assertion-related options.
+
+        Raises ValueError if neither a script nor a log is given, or if a
+        predefined comparison key is used without `--assertion_second_value`."""
+        # Forward the caller's arguments; they used to be silently dropped.
+        args = super(AssertCLI, self).parse_args(*args, **kwargs)
+        args.only_last = args.only_last > 0
+        if len(args.run_script) == 0 and args.log is None:
+            raise ValueError("The one of `--run_script` or `--log` must be given.")
+        if args.assertion_second_value is None:
+            if args.expr is None:
+                raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.")
+            if args.expr in assertion_expr_factory:
+                raise ValueError("The comparison operators depend on the argument `assertion_second_value`.")
+        return args
+
+    def create_parser(self, args):
+        if args.use_predefined_parser_rules:
+            script_path = self._get_script_path(args.run_script)
+            config = get_compare_config_with_full_path(script_path, to_dict=False)
+
+            return LogParser(
+                patterns=config.patterns, pattern_names=config.pattern_names,
+                use_re=config.use_re, nearest_distance=config.nearest_distance,
+                start_line_pattern_flag=config.start_line_pattern_flag,
+                end_line_pattern_flag=config.end_line_pattern_flag,
+                split_pattern=config.split_pattern,
+                split_sep=config.split_sep,
+                split_idx=config.split_idx
+            )
+
+        return LogParser(
+            patterns=args.patterns, pattern_names=args.pattern_names,
+            use_re=args.use_re, nearest_distance=args.nearest_distance,
+            start_line_pattern_flag=args.start_flag,
+            end_line_pattern_flag=args.end_flag,
+            split_pattern=args.split_pattern,
+            split_sep=args.split_sep,
+            split_idx=args.split_idx
+        )
+
+    def run(self):
+        args = self.parse_args()
+        parser = self.create_parser(args)
+
+        if args.print_result:
+            print(args)
+
+        output = self.get_log(args)
+        parsed_logs = self.parser_log(parser, output, args)
+        self.check_logs(parsed_logs, args)
+
+    def get_log(self, args):
+        """Return the lines of `args.log`, or the result of `get_output` when a run script is given."""
+        if len(args.run_script) != 0:
+            return get_output(args.run_script, capture_output_method=args.capture_output)
+        try:
+            with open(args.log) as f:
+                return f.readlines()
+        except Exception:  # a bare except also swallowed KeyboardInterrupt/SystemExit
+            print(f"ERROR: Read log fail in {args.log}")
+            exit(1)
+
+    def parser_log(self, parser, output, args) -> List[float]:
+        """Parse `output` with `parser` and reduce each parsed entry to one value."""
+        results = parser.parse(output)
+        if args.only_last:
+            results = results[-1:]
+        if len(results) == 0:
+            raise ValueError("The parsed results is empty, please check patterns.")
+        if isinstance(results[0], dict):
+            if len(results[0]) == 0:
+                raise ValueError("The parsed results is empty, please check patterns.")
+            # Keep only the values stored under the first key of the first entry.
+            key = list(results[0].keys())[0]
+            results = [result[key] for result in results]
+        # When the entries are iterable, keep just their first element.
+        if isinstance(results[0], Iterable):
+            results = [result[0] for result in results]
+        return results
+
+    def check_logs(self, parsed_logs, args):
+        """Evaluate the assertion for every parsed value, print SUCCESS/FAIL and exit (0 or 1)."""
+        if args.print_result:
+            print("Parsed result:", parsed_logs)
+        assertion_expr = assertion_expr_factory.get(args.expr, args.expr)  # unknown keys are used verbatim
+        # SECURITY: `--expr` is passed to eval() as Python code; never feed it untrusted input.
+        assert_results = []
+        b = args.assertion_second_value
+        for a in parsed_logs:
+            assert_results.append(eval(assertion_expr))  # the expression may reference `a` and `b`
+
+        if args.print_result:
+            print("The result of assertion expression:", assert_results)
+        # NOTE(review): a single passing value yields SUCCESS (any, not all) - confirm this is intended.
+        if any(assert_results):
+            print("SUCCESS")
+            exit(0)
+        print("FAIL")
+        exit(1)
+
+    def _get_script_path(self, run_script: List[str]):
+        # Find shell script by current run_script
+        def _find_real_shell_script(cmd: List[str]):
+            for i, field in enumerate(cmd):
+                if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS:
+                    return field
+
+        real_shell_script = _find_real_shell_script(run_script)
+
+        # Find shell script by parent process
+        if real_shell_script is None:
+            ppid = os.getppid()
+            import psutil
+            pproc = psutil.Process(ppid)
+            pproc_cmd = pproc.cmdline()
+            real_shell_script = _find_real_shell_script(pproc_cmd)
+
+        if real_shell_script is not None:
+            real_shell_script = self._get_script_abs_path(real_shell_script)
+            return real_shell_script
+
+        raise RuntimeError("The script is not named correctly, " + \
+                           "please use a script name ending with the framework, " + \
+                           f"got `{' '.join(run_script)}`, " + \
+                           "e.g. train_resnet50_torch.sh")
+
+    def _get_framework(self, shell_script: str) -> Optional[str]:
+        try:  # e.g. "train_resnet50_torch.sh" -> "torch"
+            return shell_script.split('.')[-2].split('_')[-1]
+        except (IndexError, AttributeError):  # no extension, or not a string
+            return None
+
+    def _get_script_abs_path(self, run_script):
+        real_run_script = os.path.realpath(run_script)
+        if os.path.exists(real_run_script):
+            return real_run_script
+
+        if "MODEL_DIR" in os.environ:
+            return os.path.join(os.environ["MODEL_DIR"], run_script)
+
+        if "OLDPWD" in os.environ:
+            real_run_script = os.path.join(os.environ["OLDPWD"], run_script)
+            if os.path.exists(real_run_script):
+                return real_run_script
+
+        raise FileNotFoundError("Not found running script path, " + \
+                                "please set environment variable `MODEL_DIR`, " + \
+                                "e.g /path/to/deeplearningsamples/executables/resnet.")
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py
new file mode 100644
index 000000000..b40f3a72f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py
@@ -0,0 +1,56 @@
+import os
+
+from .assert_cli import AssertCLI
+from ..utils.subprocess_tools import execute_shell
+
+RUN_MODE_KEY = "RUN_MODE"
+RUN_MODE_STRICT = "strict"
+
+
+class CheckCli(AssertCLI):
+
+    def __init__(self, *args, **kwargs):
+        super(CheckCli, self).__init__(*args, **kwargs)
+        self.args = None
+
+    def command_name(self):
+        return "check"
+
+    def predefine_args(self):
+        self.parser.add_argument("--check_mode", type=str, default="no",
+                                 choices=["all", "strict", "nonstrict", "no"],
+                                 help="which running mode needs to be checked")
+        self.parser.add_argument("--nonstrict_mode_args", type=str, default="",
+                                 help="the arguments are used with nonstric testing")
+        super(CheckCli, self).predefine_args()
+
+    def parse_args(self, *args, **kwargs):
+        """Parse the arguments once and cache the result on `self.args`."""
+        if self.args is None:
+            args = super(CheckCli, self).parse_args(*args, **kwargs)
+            args.use_predefined_parser_rules = True
+            # split() without a separator yields [] for the default "", so no
+            # empty-string argument reaches the command (split(" ") gave [""]).
+            args.nonstrict_mode_args = args.nonstrict_mode_args.split()
+            if not self.is_strict_testing():
+                args.run_script.extend(args.nonstrict_mode_args)
+            if args.check_mode == "all":
+                args.check_mode = self.current_running_mode()
+            self.args = args
+        return self.args
+
+    def run(self):
+        args = self.parse_args()
+        if args.check_mode == self.current_running_mode():
+            return super(CheckCli, self).run()
+        else:
+            res = execute_shell(args.run_script)
+            exit(res.returncode)
+
+    def current_running_mode(self):
+        return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT)
+
+    def is_strict_testing(self):
+        return self.current_running_mode() == RUN_MODE_STRICT
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py
new file mode 100644
index 000000000..3451623d5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from dltest.cli.assert_cli import AssertCLI
+from dltest.cli.log_comparator_cli import LogComparatorCLI
+from dltest.cli.model_validator_cli import ModelValidatorCLI
+from dltest.cli.fetch_log_cli import FetchLog
+from dltest.cli.check_cli import CheckCli
+
+
+#log_comparator_cli = LogComparatorCLI()
+#model_validator_cli = ModelValidatorCLI()
+fetch_log_cli = FetchLog()
+#assert_cli = AssertCLI()
+#check_cli = CheckCli()
+
+
+def make_execute_path():
+    """Build `ixdltest-<command>=<module>:<variable>` specs for every module global named `*_cli`."""
+    module_path = "dltest.cli.entry_points"
+    module_globals = globals()
+    return [
+        f"ixdltest-{module_globals[cli_var].command_name()}={module_path}:{cli_var}"
+        for cli_var in module_globals
+        if cli_var.endswith('_cli')
+    ]
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py
new file mode 100644
index 000000000..0059cecf7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import json
+import sys
+from typing import Mapping
+from os.path import basename, join, exists, expanduser, dirname
+
+from dltest.log_parser import LogParser
+from dltest.cli.log_parser_cli import LogParserCLI
+from dltest.utils.iluvatar import get_iluvatar_card_type, IluvatarGPU
+
+
+
+
+def parse_target(target):
+    """Parse "name:value,..." into {name: float}; non-string or empty
+    input (e.g. the unset float default -1.) yields an empty dict."""
+    if not isinstance(target, str) or not target.strip():
+        return {}
+    pairs = [item.split(":") for item in target.split(",")]
+    if any(len(pair) != 2 for pair in pairs):
+        raise ValueError(f"Invalid target `{target}`, expected `name:value[,name:value]`.")
+    return {key: float(value) for key, value in pairs}
+        
+
+def load_json(file):
+    """Load a JSON file (with `~` expanded), or return an empty dict when it does not exist."""
+    file_path = expanduser(file)
+    # Check whether the file exists.
+    if exists(file_path):
+        with open(file_path, 'r') as file:
+            data = json.load(file)
+    else:
+        # Missing file: start from an empty mapping (nothing is written to disk here).
+        data = {}
+
+    return data
+
+def process_results(results):
+    """Flatten the parsed entries into {name: first matched value}; later entries win."""
+    flattened = {}
+    for entry in results["results"]:
+        flattened.update((name, values[0]) for name, values in entry.items())
+    return flattened
+
+class FetchLog(LogParserCLI):
+
+    def command_name(self):
+        return "fetch"
+
+    def predefine_args(self):
+        super(FetchLog, self).predefine_args()
+        self.parser.add_argument('log', nargs='?', type=str, help="Log path")
+        self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
+        self.parser.add_argument('--saved_entry', type=str, default=None, help='Save to path')
+        self.parser.add_argument('-t_bi150','--target_bi150', type=str, default=-1.)
+        self.parser.add_argument('-t_mr100','--target_mr100', type=str, default=-1.)
+        self.parser.add_argument('-t_mr50','--target_mr50', type=str, default=-1.)
+
+    def run(self):
+        args = self.parse_args()
+        parser = LogParser(
+            patterns=args.patterns, pattern_names=args.pattern_names,
+            use_re=args.use_re, nearest_distance=args.nearest_distance,
+            start_line_pattern_flag=args.start_flag,
+            end_line_pattern_flag=args.end_flag,
+            split_pattern=args.split_pattern,
+            split_sep=args.split_sep,
+            split_idx=args.split_idx
+        )
+
+        results = parser.parse(args.log)
+        if not isinstance(results, Mapping):
+            results = dict(results=results)
+        results = process_results(results)
+        print(results)
+
+        if args.saved is not None:
+            saved = load_json(args.saved)
+            if not args.saved_entry:
+                raise Exception("You need to use --saved_entry to specify entry name of the result")
+
+            saved[args.saved_entry] = results
+            with open(args.saved, 'w') as f:
+                json.dump(saved, f, indent=4)
+        self.compare_results(args, results)
+
+
+    def compare_results(self, args, results):
+        card = get_iluvatar_card_type()
+        if card == IluvatarGPU.UNKNOWN:
+            print("Not known which card is used, can you use ixsmi in the environment?")
+            return
+        user_target = getattr(args, 'target_'+card.name.lower(), "")
+        user_target = parse_target(user_target)
+
+        is_expected = True
+        for key, target in user_target.items():
+            if key not in results:
+                continue
+            if results[key]<target:
+                is_expected = False
+                print(f"- Check {key} on {card.name} failed (result vs target): {results[key]}<{target}")
+            else:
+                print(f"- Check {key} on {card.name} passed (result vs target): {results[key]}>={target}")
+        if not is_expected:
+            sys.exit(1)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
new file mode 100644
index 000000000..a5863b56d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import json
+from pprint import pprint
+
+from dltest.cli.log_parser_cli import LogParserCLI
+from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS
+
+
+class LogComparatorCLI(LogParserCLI):
+
+    def command_name(self):
+        return "compare"
+
+    def predefine_args(self):
+        super(LogComparatorCLI, self).predefine_args()
+        self.parser.add_argument('--log1', type=str, help="First log")
+        self.parser.add_argument('--log2', type=str, help="Second log")
+        self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold")
+        self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare')
+        self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
+        self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result')
+        self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2')
+
+    def parse_args(self, *args, **kwargs):
+        args = super(LogComparatorCLI, self).parse_args(*args, **kwargs)
+        args.only_last = args.only_last >= 1
+
+        return args
+
+    def run(self):
+        """Compare `--log1` with `--log2`, print SUCCESS/FAIL and optionally save the details as JSON."""
+        args = self.parse_args()
+        satisfied, results = compare_logs_with_paths(
+            log1=args.log1, log2=args.log2,
+            threshold=args.threshold,
+            patterns=args.patterns, pattern_names=args.pattern_names,
+            use_re=args.use_re, nearest_distance=args.nearest_distance,
+            start_line_pattern_flag=args.start_flag,
+            end_line_pattern_flag=args.end_flag,
+            only_last=args.only_last,
+            split_pattern=args.split_pattern,
+            split_sep=args.split_sep,
+            split_idx=args.split_idx,
+            # Honour the CLI flag; it was hard-coded to True, which made the option dead.
+            allow_greater_than=args.allow_greater_than
+        )
+        if args.print_result:
+            pprint(results)
+
+        if satisfied:
+            print("SUCCESS")
+        else:
+            print("FAIL")
+        if args.saved is not None:
+            with open(args.saved, 'w') as f:
+                json.dump(results, f)
+
+
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py
new file mode 100644
index 000000000..7263543ef
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import json
+from typing import Mapping
+
+from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS
+from dltest.utils.base_cli import BaseCLI
+
+
+class LogParserCLI(BaseCLI):
+
+    def predefine_args(self):
+        self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns')
+        self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern')
+        self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression')
+        self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern')
+        self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log')
+        self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log')
+        self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line')
+        self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line')
+        self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line')
+
+    def parse_args(self, *args, **kwargs):
+        args = super(LogParserCLI, self).parse_args(*args, **kwargs)
+
+        return args
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py
new file mode 100644
index 000000000..1c4f68c58
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import json
+import os
+import os.path as ospath
+from pprint import pprint
+from typing import List, Union
+
+from dltest.utils.base_cli import BaseCLI
+from dltest.utils.get_env import get_gpu_type
+from dltest.utils.misc import get_full_path
+from dltest.model_compare_config import get_compare_config_with_full_path
+from dltest.log_comparator import compare_logs_with_paths
+from dltest.utils.subprocess_tools import get_output
+
+
+REMAINDER = '...'
+
+
+class ModelValidatorCLI(BaseCLI):
+
+    def command_name(self):
+        return "validate"
+
+    def predefine_args(self):
+        super(ModelValidatorCLI, self).predefine_args()
+        self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log")
+        self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
+        self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared")
+        self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result')
+        self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output')
+        self.parser.add_argument("run_script", nargs=REMAINDER)
+
+    def parse_args(self, *args, **kwargs):
+        """Parse the arguments; print an error and exit(1) when no run script was given."""
+        args = super(ModelValidatorCLI, self).parse_args(*args, **kwargs)
+        if len(args.run_script) == 0:
+            print("ERROR: Invalid run_script")
+            exit(1)
+        return args
+
+    def run(self):
+        args = self.parse_args()
+        output = self._run_script(args.run_script, capture_output_method=args.capture_output)
+        self.compare_logs(
+            output, args.compare_log, args.run_script,
+            args.saved, args.with_exit_code,
+            args.print_result
+        )
+
+    def compare_logs(self, output: List, compare_log: str,
+                     run_script: List[str], saved: str=None,
+                     with_exit_code: int=1, print_result=False):
+        script_path = self._get_script_path(run_script)
+        script_path = get_full_path(script_path)
+        compare_args = get_compare_config_with_full_path(script_path)
+
+        if compare_log is None:
+            epoch = self._get_epoch(run_script)
+            script_name = ospath.basename(script_path)
+            dist_tag = self._get_dist_tag(script_name)
+            compare_log = self._find_comparable_log(script_path, epoch, dist_tag)
+
+            if not ospath.exists(compare_log):
+                print(f"ERROR: {compare_log} not exist. Or please use argument `l` to locate log.")
+                exit(1)
+
+        compare_args['log1'] = output
+        compare_args['log2'] = compare_log
+
+        satisfied, results = compare_logs_with_paths(**compare_args)
+
+        if print_result:
+            pprint(results)
+
+        if satisfied:
+            print("SUCCESS")
+        else:
+            print("FAIL")
+
+        if saved is not None:
+            with open(saved, 'w') as f:
+                json.dump(results, f)
+
+        if with_exit_code:
+            if satisfied:
+                exit(0)
+            else:
+                exit(1)
+
+    def _run_script(self, command: List, capture_output_method: str='tempfile'):
+        return get_output(command, capture_output_method=capture_output_method)
+
+    def _get_script_path(self, run_script: List[str]):
+        for i, field in enumerate(run_script):
+            if field.endswith('.py') or field.endswith('.sh'):
+                return field
+
+        raise RuntimeError("Not found the name of script, " +
+                           "only support python or `sh` script, but got {}.".format(run_script))
+
+    def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str):
+        gpu_type = get_gpu_type().lower()
+
+        # Get the platform of trained log
+        if gpu_type == "nv":
+            gpu_type = 'bi'
+        else:
+            gpu_type = 'nv'
+
+        script_path = get_full_path(script_path)
+        project_dir = self._get_project_dir(script_path)
+        script_name = ospath.basename(script_path)
+
+        log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log"
+
+        return log_path
+
+
+    def _get_epoch(self, run_script: List[str]):
+        """Return the value of the first option containing `--epoch`, or 'default' if none has a value."""
+        for i, field in enumerate(run_script):
+            if "--epoch" in field:
+                if "=" in field:
+                    return field.split("=")[1]
+                if i + 1 < len(run_script):  # guard: option given as the last token
+                    return run_script[i + 1]
+        return 'default'
+
+    def _get_dist_tag(self, script_name: str):
+        """Return ".<n>card" for script names containing `_dist_` or `_multigpu_`, else ""."""
+        try:
+            import torch
+            num_gpus = torch.cuda.device_count()
+        except Exception:  # torch missing or unusable; a bare except also hid Ctrl-C
+            num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all")
+        if '_dist_' in script_name or '_multigpu_' in script_name:
+            return f".{num_gpus}card"
+        return ""
+
+    def _get_project_dir(self, abs_path):
+        abs_path = ospath.abspath(abs_path)
+        script_dir = ospath.dirname(abs_path)
+        executables_dir = ospath.dirname(script_dir)
+        return ospath.dirname(executables_dir)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py
new file mode 100644
index 000000000..8a633b63b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from typing import List, Mapping, Union, Tuple
+from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS
+
+LogLines = List[Mapping]
+CompareResult = Tuple[bool, Union[List, Mapping]]
+
+
+def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult:
+    """Compute the per-key differences ``value1 - value2`` and check them against thresholds.
+
+    :param value1: metrics of one log line, ``{name: number or list/tuple of numbers}``.
+    :param value2: metrics of the other log line, indexed with the same keys.
+    :param threshold: ``{name: limit}``; a non-mapping value is applied to every key of ``value1``.
+    :param allow_greater_than: when true, a positive difference never counts as a violation.
+    :return: ``(satisfied, {name: difference(s)})``; scalar inputs yield scalar differences.
+    """
+    if isinstance(threshold, Mapping):
+        limits = threshold
+    else:
+        limits = {name: threshold for name in value1.keys()}
+
+    def _violates(delta, limit):
+        # A positive delta is tolerated when ``allow_greater_than`` is set.
+        return abs(delta) > limit and not (allow_greater_than and delta > 0)
+
+    satisfied = True
+    result = dict()
+    for name, limit in limits.items():
+        lhs, rhs = value1[name], value2[name]
+        is_sequence = isinstance(lhs, (tuple, list))
+        if not is_sequence:
+            lhs, rhs = [lhs], [rhs]
+
+        deltas = [a - b for a, b in zip(lhs, rhs)]
+        if satisfied and any(_violates(delta, limit) for delta in deltas):
+            satisfied = False
+        result[name] = deltas if is_sequence else deltas[0]
+
+    return satisfied, result
+
+
+def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult:
+    """Compare two parsed logs line by line; returns ``(satisfied, [errors of each line])``.
+
+    Lines are paired by position and only the common prefix is compared. An empty
+    log yields ``(False, [])``, the same answer ``compare_logs_by_last_result`` gives.
+    """
+    if len(log1) == 0 or len(log2) == 0:
+        return False, []
+    # Walk over the lines themselves; len(log1[0]) would be the metric count of the first line.
+    outcomes = [_compute_errors(a, b, threshold, allow_greater_than=allow_greater_than) for a, b in zip(log1, log2)]
+    return all(ok for ok, _ in outcomes), [errors for _, errors in outcomes]
+
+
+def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult:
+    """Compare only the final parsed line of each log; an empty log gives ``(False, [])``."""
+    both_present = len(log1) != 0 and len(log2) != 0
+    return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than) if both_present else (False, [])
+
+
+def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping],
+                            patterns: List[str],
+                            pattern_names: List[str] = None,
+                            use_re: bool = False,
+                            nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS,
+                            start_line_pattern_flag: str = None,
+                            end_line_pattern_flag: str = None,
+                            only_last: bool=True,
+                            split_pattern: Union[str, List] = None,
+                            split_sep: List = None,
+                            split_idx: List = None,
+                            allow_greater_than: bool = False):
+    """Parse two logs with one shared ``LogParser`` and compare the extracted metrics.
+
+    With ``only_last`` just the final parsed lines are compared, otherwise the logs are
+    compared line by line. The pattern/split arguments are forwarded to ``LogParser``.
+    :return: ``(satisfied, {"log1": parsed, "log2": parsed, "errors": differences})``.
+    """
+    # A single parser instance handles both inputs, so both follow the same rules.
+    parser = LogParser(
+        patterns=patterns, pattern_names=pattern_names, use_re=use_re,
+        nearest_distance=nearest_distance,
+        start_line_pattern_flag=start_line_pattern_flag,
+        end_line_pattern_flag=end_line_pattern_flag,
+        split_pattern=split_pattern, split_sep=split_sep, split_idx=split_idx
+    )
+    parsed1, parsed2 = parser.parse(log1), parser.parse(log2)
+
+    compare = compare_logs_by_last_result if only_last else compare_logs
+    satisfied, errors = compare(parsed1, parsed2, threshold, allow_greater_than=allow_greater_than)
+    return satisfied, dict(log1=parsed1, log2=parsed2, errors=errors)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py
new file mode 100644
index 000000000..663b028a1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from typing import List, Optional, Union, Mapping
+import re
+import sys
+
+
+DEFAULT_NEAREST_MATCH_CHARS = 10
+
+
+def read_file(file):
+    with open(file, 'r') as stream:  # closed automatically when the block exits
+        return list(stream)  # every line of the file, line endings kept
+
+def read_pipe():
+    """Read standard input until EOF and return the lines as a list."""
+    # Iterating the stream yields one line at a time, line ending included.
+    collected = [line for line in sys.stdin]
+    return collected
+
+def postprocess_search_result(results: List[str]) -> List[float]:
+    """Convert every matched string to ``float``; an empty input is returned as is."""
+    # The emptiness guard hands an empty container back untouched.
+    return [float(item) for item in results] if len(results) != 0 else results
+
+
+def extract_nearest_value_by_key_inline(content: str, key: str, nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]:
+    """Return the unsigned numbers found at most ``nearest_distance`` characters after ``key`` (a regex)."""
+    pattern = r"%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance)  # raw: "\s"/"\d" are regex escapes
+    return extract_value_by_pattern_inline(content, pattern)
+
+
+def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]:
+    # Every regex hit is handed to postprocess_search_result for float conversion.
+    return postprocess_search_result(re.findall(pattern, content))
+
+
+def extract_value(content: str, pattern: str,
+                  inline=True, use_re=False,
+                  nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]:
+    """Extract numbers from ``content``: ``pattern`` is a full regex if ``use_re``, else a key to search near."""
+    if not inline:
+        # Only single-line (inline) extraction exists.
+        raise NotImplementedError()
+    if use_re:
+        return extract_value_by_pattern_inline(content, pattern)
+    return extract_nearest_value_by_key_inline(content, pattern, nearest_distance)
+
+
+class LogParser:
+    """Extract numeric metrics from log lines.
+
+    Values are located either by key/regex matching (``patterns``) or by splitting (``split_sep``).
+    """
+
+    def __init__(self,
+                 patterns: List[str]=None,
+                 pattern_names: List[str]=None,
+                 use_re: bool=False,
+                 nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS,
+                 start_line_pattern_flag: str=None,
+                 end_line_pattern_flag: str=None,
+                 split_pattern: Union[str, List]=None,
+                 split_sep: List[str]=None,
+                 split_idx: List[int]=None):
+        """Validate and normalise the options; raises ``ValueError`` on inconsistent arguments."""
+        if patterns is None and split_sep is None:
+            raise ValueError("The one of argument `patterns` or `split_sep` must be given.")
+
+        if pattern_names is not None:
+            if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names):
+                raise ValueError("The length of `pattern_names` argument not equal to `patterns`.")
+            if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names):
+                raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.")
+
+        if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))):
+            raise ValueError("Invalid index to split text, got {}.".format(split_idx))
+
+        if split_sep is not None and split_pattern is None:
+            raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern))
+
+        self.patterns = patterns
+        self.use_re = use_re
+        self.nearest_distance = nearest_distance
+        self.start_line_pattern_flag = start_line_pattern_flag
+        self.end_line_pattern_flag = end_line_pattern_flag
+
+        if split_sep is not None:
+            # Scalars are accepted for either argument; each one is wrapped independently.
+            if not isinstance(split_sep, (tuple, list)):
+                split_sep = [split_sep]
+            if not isinstance(split_idx, (tuple, list)):
+                split_idx = [split_idx]
+
+        self.split_sep = split_sep
+        self.split_idx = split_idx
+
+        if pattern_names is None:
+            pattern_names = split_idx if patterns is None else patterns
+        self.pattern_names = pattern_names
+
+        if not isinstance(split_pattern, (tuple, list)) and split_sep is not None:
+            split_pattern = [split_pattern] * len(split_sep)
+        self.split_pattern = split_pattern
+
+        self.start_record = start_line_pattern_flag is None
+
+    def parse(self, path_or_logs: Union[str, List]) -> List[dict]:
+        """
+        Parse a log given as a file path or as a list/tuple of lines; a falsy argument reads stdin.
+
+        : return: [{matric_name: value}, ...], lines without any extracted value are dropped
+        """
+        if not path_or_logs:
+            lines = read_pipe()
+        elif isinstance(path_or_logs, (list, tuple)):
+            lines = path_or_logs  # already a sequence of lines
+        else:
+            lines = read_file(path_or_logs)
+
+        # Restart the start/end window so a parser reused for a second log begins clean.
+        self.start_record = self.start_line_pattern_flag is None
+        parsed = (self.parse_inline(line) for line in lines)
+        return [result for result in parsed if len(result) != 0]
+
+    def parse_inline(self, line) -> dict:
+        """Parse one line; returns ``{}`` while outside the start/end recording window."""
+        if not self.can_record(line):
+            return {}
+        parse = self._parse_inline_by_match if self.split_sep is None else self._parse_inline_by_split
+        return parse(line)
+
+    def _parse_inline_by_match(self, line: str):
+        """Return ``{name: [floats]}`` for every pattern that yields a value in ``line``."""
+        ret = {}
+        for name, pattern in zip(self.pattern_names, self.patterns):
+            result = extract_value(
+                line, pattern, inline=True, use_re=self.use_re,
+                nearest_distance=self.nearest_distance
+            )
+            if len(result) != 0:
+                ret[name] = result
+        return ret
+
+    def _parse_inline_by_split(self, line: str, to_type=float):
+        """Return ``{name: to_type(field)}`` using field ``idx`` of ``line`` split on ``sep``."""
+        ret = {}
+        for name, sep, idx, pattern in zip(self.pattern_names,
+                                  self.split_sep,
+                                  self.split_idx,
+                                  self.split_pattern):
+            if not self.can_matched(line, pattern):
+                continue
+            # Tabs count as blanks unless the separator itself contains a tab.
+            text = line.strip() if '\t' in sep else line.strip().replace('\t', ' ')
+            segs = [seg for seg in text.split(sep) if seg.strip() != ""]
+            if len(segs) > idx:
+                ret[name] = to_type(segs[idx])
+        return ret
+
+    def can_record(self, line: str):
+        """Update the recording window with ``line`` and tell whether ``line`` should be parsed."""
+        if self.start_line_pattern_flag is None:
+            self.start_record = True
+        elif not self.start_record:
+            self.start_record = self.can_matched(line, self.start_line_pattern_flag)
+
+        if self.start_record:
+            if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag):
+                self.start_record = False
+
+        return self.start_record
+
+    def can_matched(self, content: str, pattern: str):
+        """Tell whether the regex ``pattern`` occurs anywhere in ``content``."""
+        return re.search(pattern, content) is not None
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py
new file mode 100644
index 000000000..8b03a7092
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py
@@ -0,0 +1,306 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import os.path as ospath
+
+from typing import NamedTuple, Union, List, Mapping
+
+from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS
+
+
+class LogComparatorArgs(NamedTuple):  # field names mirror the keyword parameters of compare_logs_with_paths
+    threshold: Union[float, Mapping]  # one limit for all metrics, or a per-metric mapping
+    patterns: List[str] = None  # keys (regexes when use_re) whose nearby number is extracted
+    pattern_names: List[str] = None  # result names, parallel to patterns / split_sep
+    use_re: bool = False
+    nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS
+    start_line_pattern_flag: str = None
+    end_line_pattern_flag: str = None
+    split_pattern: Union[str, List] = None
+    split_sep: List = None
+    split_idx: List = None
+    only_last: bool = True
+    allow_greater_than: bool = True  # NOTE: the compare functions in log_comparator default this to False
+
+    def to_dict(self):
+        return self._asdict()
+
+
+class ArgsModelsTuple(NamedTuple):
+    """Pairs one set of comparator arguments with the model names that use it."""
+    args: LogComparatorArgs
+    models: List[str]
+
+
+class BaseConfig:
+    """Dict-like, read-only view of the attributes declared directly on the concrete subclass."""
+
+    def __getitem__(self, item):
+        return self.__class__.__dict__[item]
+
+    def __getattr__(self, item):
+        try:  # only reached when the normal attribute lookup has failed
+            return self.__class__.__dict__[item]
+        except KeyError:  # AttributeError keeps hasattr() and getattr(obj, name, default) working
+            raise AttributeError(item) from None
+
+    def __iter__(self):  # names of the attributes holding an ArgsModelsTuple
+        return (attr for attr, _ in self.iter_items())
+
+    def iter_items(self):  # (name, ArgsModelsTuple) pairs declared on the concrete subclass
+        return ((attr, value) for attr, value in self.__class__.__dict__.items() if isinstance(value, ArgsModelsTuple))
+
+
+class _TFComparatorConfig(BaseConfig):
+    """Comparator settings for the TensorFlow models: one ``ArgsModelsTuple`` per group of models."""
+    cnn_benchmarks = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="],
+            pattern_names=["Acc@1", "Acc@5"]
+        ),
+        models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"]
+    )
+
+    dist_cnn_becnmarks = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            split_sep=[' ', ' '],
+            split_idx=[9, 10],
+            split_pattern=r"[\s\S]*?images/sec:[\s\S]*?jitter",  # raw string: "\s"/"\S" are regex escapes
+            pattern_names=['Acc@1', 'Acc@5']
+        ),
+        models=[
+            "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist"
+        ]
+    )
+
+    bert = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=["eval_accuracy ="],
+            pattern_names=["Accuracy"]
+        ),
+        models=["bert"]
+    )
+
+    ssd = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=["acc="],
+            pattern_names=["Acc@1"]
+        ),
+        models=["ssd"]
+    )
+
+    yolov3 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.8,  # NOTE(review): 10x the 0.08 used by the other entries -- confirm it is intended
+            patterns=["mAP"]
+        ),
+        models=["yolov3"]
+    )
+
+    vnet = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=["background_dice", "anterior_dice", "posterior_dice"]
+        ),
+        models=["vnet"]
+    )
+
+
+class _TorchComparatorConfig(BaseConfig):
+    """Comparator settings for the PyTorch models: one ``ArgsModelsTuple`` per group of models."""
+
+    classification = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=8.0, patterns=['Acc@1', 'Acc@5'],
+            start_line_pattern_flag="Start training",
+        ),
+        models=[
+            'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2',
+            'vgg', 'resnet50_dali', 'resnext', 'densenet'
+        ]
+    )
+
+    detection = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.03,
+            patterns=[
+                r"Average Precision  \(AP\) @\[ IoU=0.50:0.95 \| area=   all \| maxDets=100 \] ="
+            ],
+            pattern_names=["mAP"],
+            start_line_pattern_flag="IoU metric: bbox",
+            end_line_pattern_flag="IoU metric: segm"
+        ),
+        models=[
+            'maskrcnn', 'retinanet', 'ssd'
+        ]
+    )
+
+    bert_cola = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=['mcc']
+        ),
+        models=['bert_cola']
+    )
+
+    bert_mrpc = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=['acc']
+        ),
+        models=['bert_mrpc']
+    )
+
+    bert_pretrain_apex = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=['eval_mlm_accaracy']
+        ),
+        models=['bert_pretrain_apex']
+    )
+
+    segmentation = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=8.0,
+            patterns=['mean IoU:'],
+            pattern_names=['mIoU']
+        ),
+        models=[
+            'deeplabv3', 'fcn'
+        ]
+    )
+
+    t5 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=5.0,
+            split_pattern=r"eval_bleu[\s\S]*?=",
+            split_sep=["="],
+            split_idx=[1],
+            pattern_names=['EvalBleu']
+        ),
+        models=['t5']
+    )
+
+    yolov3 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=["mAP"]
+        ),
+        models=['yolov3']
+    )
+
+    yolov5 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            patterns=[
+                r"Average Precision  \(AP\) @\[ IoU=0.50:0.95 \| area=   all \| maxDets=100 \] ="
+            ],
+            pattern_names=["mAP"],
+        ),
+        models=['yolov5'],
+    )
+
+    yolov5s_coco128 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            split_pattern=r"[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
+            split_sep=[" ", " "],
+            split_idx=[5, 6],
+            pattern_names=["AP50", "mAP"]
+        ),
+        models=['yolov5s_coco128']
+    )
+
+    centernet_resnet18 = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            split_pattern=r"[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
+            split_sep=[" ", " "],
+            split_idx=[5, 6],
+            pattern_names=["AP50", "mAP"]
+        ),
+        models=['centernet_resnet18']
+    )
+
+    fcos_resnet50_fpn = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.08,
+            split_pattern=r"[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*",
+            split_sep=[" ", " "],
+            split_idx=[5, 6],
+            pattern_names=["AP50", "mAP"]
+        ),
+        models=['fcos_resnet50_fpn']
+    )
+
+    ocr_recognition = ArgsModelsTuple(
+        args=LogComparatorArgs(
+            threshold=0.5,  patterns=["0_word_acc"],
+        ),
+        models=["sar", "satrn"]
+    )
+
+
+
+class ComparatorConfig:
+    """Look up the ``LogComparatorArgs`` registered for a model, per framework ("tf" or "torch")."""
+    _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig())
+
+    @classmethod
+    def get_frameworks(cls) -> List:
+        return list(cls._configs.keys())
+
+    @classmethod
+    def get(cls, tf_or_torch, name, default=None):
+        """Return the args whose model list contains ``name``; else ``default``, or raise ``KeyError``."""
+        for _, comb in cls._configs[tf_or_torch].iter_items():
+            if name in comb.models:
+                return comb.args
+        if default is not None:
+            return default
+        raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch))
+
+    @classmethod
+    def find_config(cls, script_path: str) -> LogComparatorArgs:
+        """Resolve the config from a path like ``<model_dir>/train_<model>_<framework>.py``; ``RuntimeError`` if none."""
+        script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0]
+        # Framework = last "_" token of the file stem, unaffected by dots/underscores in directory names.
+        tf_or_torch = script_name.split('_')[-1]
+
+        if script_name.startswith('train_'):
+            script_name = script_name.replace("train_", "", 1)
+        # Find by the name of script, dropping one trailing "_token" per attempt
+        while script_name not in [None, "", "/", "\\"]:
+            try:
+                return cls.get(tf_or_torch, script_name)
+            except KeyError:
+                pass
+            script_name, sep, _ = script_name.rpartition('_')
+            if not sep:
+                break
+
+        # Find by the name of model's dir
+        model_dir_name = ospath.basename(ospath.dirname(script_path))
+        try:
+            return cls.get(tf_or_torch, model_dir_name)
+        except KeyError:
+            raise RuntimeError("Not found for", script_path)
+
+
+def get_compare_config_with_full_path(script_path: str, to_dict=True):
+    """Find the comparator config for ``script_path``; returned as a dict when ``to_dict`` is true."""
+    found = ComparatorConfig.find_config(script_path)
+    # ``to_dict`` switches between the NamedTuple itself and its dict form.
+    return found.to_dict() if to_dict else found
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py
new file mode 100644
index 000000000..91562d0db
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from argparse import ArgumentParser
+from abc import abstractmethod
+
+
+class BaseCLI:
+    """CLI scaffold: holds an ``ArgumentParser`` and dispatches ``__call__`` to ``run()``."""
+    def __init__(self, parser=None, *args, **kwargs):
+        # Keep a caller-supplied parser; the extra arguments only configure a newly built one.
+        self.parser = parser if parser is not None else ArgumentParser(description=self.description, *args, **kwargs)
+
+    def __call__(self):
+        self.run()
+
+    @property
+    def description(self):
+        return None
+
+    @abstractmethod
+    def command_name(self):
+        pass
+
+    def predefine_args(self):  # called by parse_args() right before parsing
+        pass
+
+    def parse_args(self, *args, **kwargs):
+        self.predefine_args()
+        return self.parser.parse_args(*args, **kwargs)
+
+    @abstractmethod
+    def run(self):
+        pass
+
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py
new file mode 100644
index 000000000..911933312
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+import os
+from collections import defaultdict
+import os.path as osp
+import subprocess
+import sys
+
+
+def get_envinfo():
+    """Collect a description of the runtime environment.
+
+    The returned dict holds the platform, the Python version, CUDA availability (plus CUDA_HOME,
+    the last line of ``nvcc -V`` and the GPU names when CUDA is available), GCC and PyTorch versions.
+    """
+    import torch
+
+    def _shell_output(command):
+        # The commands contain pipes, hence shell=True; the output is decoded and stripped.
+        return subprocess.check_output(command, shell=True).decode('utf-8').strip()
+
+    env_info = {'sys.platform': sys.platform, 'Python': sys.version.replace('\n', '')}
+
+    env_info['CUDA available'] = cuda_available = torch.cuda.is_available()
+    if cuda_available:
+        from torch.utils.cpp_extension import CUDA_HOME
+        env_info['CUDA_HOME'] = CUDA_HOME
+        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
+            try:
+                nvcc_path = osp.join(CUDA_HOME, 'bin/nvcc')
+                env_info['NVCC'] = _shell_output(f'"{nvcc_path}" -V | tail -n1')
+            except subprocess.SubprocessError:
+                env_info['NVCC'] = 'Not Available'
+
+        # One entry per device name, keyed by the comma-joined indices: "GPU 0,1" -> name.
+        ids_by_name = defaultdict(list)
+        for index in range(torch.cuda.device_count()):
+            ids_by_name[torch.cuda.get_device_name(index)].append(str(index))
+        env_info.update({'GPU ' + ','.join(ids): name for name, ids in ids_by_name.items()})
+
+    env_info['GCC'] = _shell_output('gcc --version | head -n1')
+    env_info['PyTorch'] = torch.__version__
+    return env_info
+
+
+def get_gpu_type():
+    """Return "BI" or "NV" for the current device, or the ``DEBUG_GPU_TYPE`` override when set.
+
+    Without an available CUDA device the result is "BI".
+    """
+    if "DEBUG_GPU_TYPE" in os.environ:
+        return os.environ["DEBUG_GPU_TYPE"]
+
+    # Imported only past the override, so the debug switch also works without torch.
+    import torch
+    if not torch.cuda.is_available():
+        return "BI"
+    dev_name = torch.cuda.get_device_name(0)
+    return "BI" if 'IX BI' in dev_name or getattr(torch, "corex", False) else "NV"
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
new file mode 100644
index 000000000..adcdefc52
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py
@@ -0,0 +1,32 @@
+import sys
+import subprocess
+from enum import Enum
+
+__all__ = ["get_iluvatar_card_type", "IluvatarGPU"]
+
+class IluvatarGPU(Enum):  # card types returned by get_iluvatar_card_type()
+    UNKNOWN = -1  # the ixsmi command failed or its output matched no known card name
+    MR50 = 0
+    MR100 = 1
+    BI150 = 2
+
+card_ixsmi_names = {  # substring searched for in the `ixsmi -L` output -> card type
+        "BI150": IluvatarGPU.BI150,
+        "BI-V150": IluvatarGPU.BI150,
+        "MR100": IluvatarGPU.MR100,
+        "MR-V100": IluvatarGPU.MR100,
+        "MR50": IluvatarGPU.MR50,
+        "MR-V50": IluvatarGPU.MR50,
+}
+
+def get_iluvatar_card_type():
+    """Return the ``IluvatarGPU`` type of GPU 0 as listed by ``ixsmi -L``, or ``UNKNOWN``.
+
+    ``UNKNOWN`` covers both a failing command (e.g. grep finds no line) and an unrecognised name.
+    """
+    # Raw string: "\{1,\}" is grep syntax, not a Python escape sequence.
+    command = r'ixsmi -L | grep "GPU \{1,\}0"'
+    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if result.returncode != 0:
+        return IluvatarGPU.UNKNOWN
+    return next((card for name, card in card_ixsmi_names.items() if name in result.stdout), IluvatarGPU.UNKNOWN)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
new file mode 100644
index 000000000..13c1d6c7f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py
@@ -0,0 +1,87 @@
+import os
+
+from typing import Union, List, Dict, Any, Mapping
+from argparse import Namespace, ArgumentParser
+import json
+
+
+def _obj_to_dict(obj) -> Dict:
+    """Best-effort conversion of an arguments container into a dict.
+
+    Mappings are returned unchanged, absl ``FlagValues`` and ``Namespace`` objects are unpacked,
+    lists are merged element by element, other objects expose ``__dict__`` when they have one,
+    and anything else becomes ``{type name: str(obj)}``.
+    """
+    if isinstance(obj, Mapping):
+        return obj
+
+    try:
+        from absl import flags
+        if isinstance(obj, flags.FlagValues):
+            return obj.flag_values_dict()
+    except Exception:  # absl is optional; KeyboardInterrupt/SystemExit are no longer swallowed
+        pass
+    if isinstance(obj, Namespace):
+        return obj.__dict__
+    if isinstance(obj, List):
+        merged = dict()
+        for element in obj:
+            merged.update(_obj_to_dict(element))
+        return merged
+    if hasattr(obj, "__dict__"):
+        return obj.__dict__
+    return {type(obj).__name__: str(obj)}
+
+
+def json_dump_obj(o):
+    """``json.dumps`` fallback: the object's ``__name__`` when it has one, otherwise ``str(o)``."""
+    # Anything carrying a ``__name__`` attribute serialises as that bare name.
+    return o.__name__ if hasattr(o, "__name__") else str(o)
+
+
+def show_infer_arguments(args: Union[List, Dict, Any]):
+    """Print the running arguments as one JSON line, prefixed with ``[RunningArguments]``.
+
+    Nothing is printed unless the environment variable ``SHOW_RUNNING_ARGS`` is set
+    to something other than "0", "f" or "false" (case-insensitive), and, when
+    ``LOCAL_RANK`` is set, only the process with ``LOCAL_RANK == "0"`` prints.
+
+    Example 1: For ArgumentParser
+        >>> parser = ArgumentParser("Test")
+        >>> parser.add_argument("--arg0", type=str)
+        >>> args = parser.parse_args()
+        >>> show_infer_arguments(args)
+
+    Example 2: For dict
+        >>> args = dict(arg=1)
+        >>> show_infer_arguments(args)
+
+    Example 3: For custom object
+        >>> from collections import namedtuple
+        >>> ArgsType = namedtuple("ArgsType", ["arg"])
+        >>> args = ArgsType(arg=123)
+        >>> show_infer_arguments(args)
+
+    Example 4: For absl
+        >>> from absl import flags
+        >>> flags.DEFINE_string("arg", "123", "test")
+        >>> show_infer_arguments(flags.FLAGS)
+
+    Example 5: For multi args
+        >>> args1 = dict(a=1)
+        >>> args2 = dict(b=2)
+        >>> show_infer_arguments([args1, args2])
+    """
+    switch = os.environ.get("SHOW_RUNNING_ARGS")
+    if switch is None or switch.lower() in ["0", "f", "false"]:
+        return
+    # With LOCAL_RANK set, only local rank "0" prints.
+    if os.environ.get("LOCAL_RANK", "0") != "0":
+        return
+    payload = json.dumps(_obj_to_dict(args), default=json_dump_obj)
+    print("[RunningArguments]", payload)
+
+
+if __name__ == '__main__':  # manual smoke run when the module is executed directly
+    os.environ["SHOW_RUNNING_ARGS"] = "1"  # switch the printing on
+    show_infer_arguments([dict(a=1), dict(b=1), object()])  # a list input: two dicts and a bare object
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
new file mode 100644
index 000000000..f8cfacfbc
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+import copy
+import os
+
+
+def get_full_path(fname):
+    """Return ``fname`` unchanged if it starts with "/", else joined onto the current directory."""
+    cwd = os.getcwd()
+    # No normalisation is applied: "." and ".." segments are kept as given.
+    return fname if fname.startswith('/') else os.path.join(cwd, fname)
+
+
+def is_main_proc(rank):  # true when the string form of ``rank`` is "None", "-1" or "0"
+    return str(rank) in ("None", "-1", "0")
+
+
+def main_proc_print(*args, **kwargs):
+    """``print`` only on a main process, as told by the RANK / LOCAL_RANK environment variables.
+
+    Output is produced when neither variable is set, or when at least one of the set
+    variables designates a main process (see ``is_main_proc``). Without the explicit
+    condition a trailing fallback ``print`` would run for every rank and nothing
+    would ever be suppressed.
+    """
+    # Only the variables that are actually present take part in the decision.
+    ranks = [os.environ[name] for name in ("RANK", "LOCAL_RANK") if name in os.environ]
+    if not ranks or any(is_main_proc(rank) for rank in ranks):
+        print(*args, **kwargs)
+
+
+def create_subproc_env():
+    """Return a private copy of the environment with ``USE_DLTEST=1`` for child processes."""
+    # os.environ.copy() gives an independent dict; copy.copy(os.environ) shares state with os.environ.
+    return dict(os.environ.copy(), USE_DLTEST="1")
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py
new file mode 100644
index 000000000..e23230de1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import os
+import os.path as ospath
+from pathlib import Path
+import tempfile
+
+
+class TemporaryFile:
+    """Context manager owning a real on-disk temp file that is removed on exit."""
+
+    def __init__(self, with_open=False, mode='r'):
+        self.name = None
+        self.with_open = with_open
+        self.mode = mode
+        self.file = None
+
+    def create(self):
+        fd, self.name = tempfile.mkstemp()  # atomic, unlike the deprecated mktemp()
+        os.close(fd)  # the file is reopened by name, so do not leak the descriptor
+
+    def delete(self):
+        if self.name is not None and ospath.exists(self.name):
+            os.unlink(self.name)
+
+    def read(self):
+        self._check_file_status()
+        return self.file.read()
+
+    def readlines(self):
+        self._check_file_status()
+        return self.file.readlines()
+
+    def _check_file_status(self):
+        if self.file is None:
+            raise RuntimeError("File is closed, please reopen it.")
+
+    def __enter__(self):
+        self.create()
+        if self.with_open:
+            self.file = open(self.name, mode=self.mode)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.file is not None:
+            self.file.close()
+        self.file = None  # later read() calls now hit the "File is closed" error
+        self.delete()
+
+
+
+
+
+
+
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py
new file mode 100644
index 000000000..135faa89e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+import subprocess
+from typing import Callable, Union, List
+
+from dltest.utils.real_tempfile import TemporaryFile
+from dltest.utils import misc
+
+
+def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs):
+    """Run `command`, echo its merged stdout/stderr line by line, and return the lines.
+
+    Args:
+        command: Command string, or a sequence of parts (space-joined when `shell`).
+        shell: Passed to `subprocess.Popen`; `None` means `True`.
+        callback: Optional callable invoked with a one-element list per output line.
+        *args, **kwargs: Forwarded to `subprocess.Popen`.
+
+    Returns:
+        list[str]: Output lines with trailing whitespace stripped.
+    """
+    if shell is None:
+        shell = True
+    if shell and not isinstance(command, str):
+        command = " ".join(command)
+
+    stream = subprocess.Popen(
+        command, *args, shell=shell,
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs
+    )
+    outputs = []
+    # Iterating the pipe blocks until EOF: every line is decoded, stripped and
+    # reported exactly once, and no empty strings are appended while the child
+    # is idle.  rstrip() is applied to the decoded line, not to the codec name.
+    for raw_line in stream.stdout:
+        outputs.append(raw_line.decode("utf8").rstrip())
+        if callback is not None:
+            callback(outputs[-1:])
+        print(outputs[-1])
+    stream.stdout.close()
+    stream.wait()
+    return outputs
+
+
+def get_output_with_tempfile(command, *args, **kwargs):
+    """Run `command` through the shell, teeing its output to a temp file; return the lines.
+
+    `command` may be a string or a list/tuple of parts and is never mutated
+    (calling `.extend` on it would alter the caller's list and fail for tuples).
+    Extra arguments are forwarded to `subprocess.run`.
+    """
+    parts = [command] if not isinstance(command, (list, tuple)) else list(command)
+    with TemporaryFile(with_open=True) as file:
+        shell_command = " ".join(parts + ['|', 'tee', file.name])
+        subprocess.run(shell_command, stdout=None, stderr=subprocess.STDOUT, shell=True, *args, **kwargs)
+        return file.readlines()
+
+def execute_shell(command, *args, **kwargs):
+    """Run `command` (a string or a list/tuple of parts) through the shell.
+
+    When the caller gives no `env`, the environment from
+    `misc.create_subproc_env()` is used.  Remaining arguments are forwarded to
+    `subprocess.run`, whose `CompletedProcess` result is returned.
+    """
+    if "env" not in kwargs:
+        kwargs["env"] = misc.create_subproc_env()
+    parts = command if isinstance(command, (list, tuple)) else [command]
+    return subprocess.run(" ".join(parts), shell=True, *args, **kwargs)
+
+def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs):
+    """Run `command` and return its output lines via the tempfile (default) or pipe capture."""
+    if "env" not in kwargs:
+        kwargs["env"] = misc.create_subproc_env()
+    # Any value other than "tempfile" selects the pipe-based implementation.
+    capture = get_output_with_tempfile if capture_output_method == "tempfile" else get_output_with_pipe
+    return capture(command, *args, **kwargs)
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py
new file mode 100755
index 000000000..52d5db6f6
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from setuptools import setup, find_packages
+from dltest.cli.entry_points import make_execute_path
+
+setup(
+    name="dltest",
+    version="0.1",
+    description='Iluvatar Corex AI Toolbox',
+    packages=find_packages(exclude=('examples',)),  # 1-tuple; ('examples') is a plain str
+    include_package_data=True,
+    zip_safe=False,
+    entry_points={
+        'console_scripts': make_execute_path(),
+    },
+    install_requires=[
+        'psutil'
+    ]
+)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
index c5ca9cfb5..7c40a978e 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py
@@ -42,6 +42,7 @@
 from passes.fusion_swinl_attention import FusionSwinLAttention
 from passes.fusion_utils import FusionUtils
 from passes.fusion_videobert_attention import FusionVideoBertAttention
+from passes.fusion_vit_attention import FusionVITAttention
 from passes.fusion_xsoftmax import FusionXSoftmax
 from passes.onnx_model import OnnxModel
 
@@ -89,8 +90,8 @@ def fuse_attention(self):
         FusionAlbertAttention(
             self, self.hidden_size, self.num_heads, self.attention_mask
         ).apply()
-        fusion = FusionVideoBertAttention(self)
-        fusion.apply()
+        FusionVideoBertAttention(self).apply()
+        FusionVITAttention(self).apply()
         FusionSwinLAttention(self).apply()
         FusionGptAttentionNoPast(self).apply()
         # Only relevant in models with Q-DQ nodes
@@ -159,7 +160,7 @@ def fuse_embed_layer(self):
         fusion.apply()
 
     def fuse_layer_norm(self):
-        fusion = FusionLayerNormalization(self)
+        fusion = FusionLayerNormalization(self, self.hidden_size)
         fusion.apply()
 
         fusion = FusionLayerNormalizationTF(self)
@@ -468,9 +469,12 @@ def optimize(
         if options.enable_format_roformer:
             self.fuse_format_roformer()
 
-        if options.enable_gpt2_classify:
+        if options.enable_gpt2_classify or options.enable_vit:
             self.fuse_custom_fc_gpt2_classify()
 
+        if options.enable_vit:
+            self.fuse_custom_fc()
+
         if (options is None) or options.enable_attention:
             if options is not None:
                 self.attention_mask.set_mask_format(options.attention_mask_format)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py
new file mode 100755
index 000000000..a250a9ea0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py
@@ -0,0 +1,576 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Optional
+
+import onnx
+from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
+from passes.fuse_series_bias_add import FusionSerialBiasAdd
+from passes.fusion_albert_attention import FusionAlbertAttention
+from passes.fusion_attention import AttentionMask, FusionAttention
+from passes.fusion_biasgelu import FusionBiasGelu
+from passes.fusion_conformer_attention import FusionConformerAttention
+from passes.fusion_conformer_xsoftmax import FusionConformerXSoftmax
+from passes.fusion_customfc import (
+    FusionConformerCustomFCActivation,
+    FusionCustomFC,
+    FusionCustomFCGPT2,
+)
+from passes.fusion_disentangled_attention import FusionDisentangledAttention
+from passes.fusion_embedlayer import FusionEmbedLayerNormalization
+from passes.fusion_fastgelu import FusionFastGelu
+from passes.fusion_format_roformer import (
+    FusionFormatInvalidMask,
+    FusionRemoveUselessElementwise,
+)
+from passes.fusion_gelu import FusionGelu
+from passes.fusion_gelu_approximation import FusionGeluApproximation
+from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast
+from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
+from passes.fusion_options import FusionOptions
+from passes.fusion_qordered_attention import FusionQOrderedAttention
+from passes.fusion_qordered_gelu import FusionQOrderedGelu
+from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
+from passes.fusion_qordered_matmul import FusionQOrderedMatMul
+from passes.fusion_reshape import FusionReshape
+from passes.fusion_shape import FusionShape
+from passes.fusion_skiplayernorm import (
+    FusionBiasSkipLayerNormalization,
+    FusionSkipLayerNormalization,
+)
+from passes.fusion_splitQKV import FusionSplitQKV
+from passes.fusion_swinl_attention import FusionSwinLAttention
+from passes.fusion_utils import FusionUtils
+from passes.fusion_vit_attention import FusionVITAttention
+from passes.onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class ConformerOptimizationOptions(FusionOptions):
+    """Deprecated alias of FusionOptions; logs a deprecation warning when constructed."""
+
+    def __init__(self, model_type):
+        logger.warning(
+            "ConformerOptimizationOptions is deprecated. Please use FusionOptions instead."
+        )
+        super().__init__(model_type)
+
+
+class conformerOnnxModel(OnnxModel):
+    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
+        """Initialize the conformer ONNX model wrapper.
+
+        Args:
+            model (ModelProto): the ONNX model
+            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
+            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
+        """
+        assert (num_heads == 0 and hidden_size == 0) or (
+            num_heads > 0 and hidden_size % num_heads == 0
+        )  # both 0, or hidden_size divisible by a positive num_heads
+
+        super().__init__(model)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        self.attention_mask = AttentionMask(self)
+        self.attention_fusion = FusionAttention(
+            self, self.hidden_size, self.num_heads, self.attention_mask
+        )
+        self.qordered_attention_fusion = FusionQOrderedAttention(
+            self, self.hidden_size, self.num_heads, self.attention_mask
+        )
+        self.utils = FusionUtils(self)
+
+    def fuse_attention(self):
+        """Run conformer attention fusion, then the QOrdered one (Q-DQ models only)."""
+        FusionConformerAttention(self, self.hidden_size, self.num_heads).apply()
+        self.qordered_attention_fusion.apply()
+
+    def fuse_format_roformer(self):
+        """Apply FusionRemoveUselessElementwise, then FusionFormatInvalidMask."""
+        for fusion_cls in (FusionRemoveUselessElementwise, FusionFormatInvalidMask):
+            fusion_cls(self).apply()
+
+    def fuse_custom_fc(self):
+        """Apply the FusionCustomFC pass to this model."""
+        FusionCustomFC(self).apply()
+
+    def fuse_custom_fc_conformer_activation(self):
+        """Apply the FusionConformerCustomFCActivation pass to this model."""
+        FusionConformerCustomFCActivation(self).apply()
+
+    def fuse_custom_fc_gpt2_classify(self):
+        """Apply the FusionCustomFCGPT2 pass to this model."""
+        FusionCustomFCGPT2(self).apply()
+
+    def fuse_swinT_serial_bias_add(self):
+        """Apply the FusionSerialBiasAdd pass to this model."""
+        FusionSerialBiasAdd(self).apply()
+
+    def fuse_gelu(self):
+        """Apply the Gelu and FastGelu fusions, then the QOrdered Gelu fusion.
+
+        The QOrdered pass is only relevant in models with Q-DQ nodes.
+        """
+        FusionGelu(self).apply()
+        FusionFastGelu(self).apply()
+        FusionQOrderedGelu(self).apply()
+
+    def fuse_bias_gelu(self, is_fastgelu):
+        """Apply FusionBiasGelu, forwarding the `is_fastgelu` flag."""
+        FusionBiasGelu(self, is_fastgelu).apply()
+
+    def fuse_custom_xsoftmax(self):
+        """Apply the FusionConformerXSoftmax pass to this model."""
+        FusionConformerXSoftmax(self).apply()
+
+    def fuse_disentangled_attention(self):
+        """Apply the FusionDisentangledAttention pass to this model."""
+        FusionDisentangledAttention(self).apply()
+
+    def gelu_approximation(self):
+        """Apply the FusionGeluApproximation pass to this model."""
+        FusionGeluApproximation(self).apply()
+
+    def fuse_add_bias_skip_layer_norm(self):
+        """Apply the FusionBiasSkipLayerNormalization pass to this model."""
+        FusionBiasSkipLayerNormalization(self).apply()
+
+    def fuse_reshape(self):
+        """Apply the FusionReshape pass to this model."""
+        FusionReshape(self).apply()
+
+    def fuse_shape(self):
+        """Apply the FusionShape pass to this model."""
+        FusionShape(self).apply()
+
+    def fuse_embed_layer(self):
+        """Apply the FusionEmbedLayerNormalization pass to this model."""
+        FusionEmbedLayerNormalization(self).apply()
+
+    def fuse_layer_norm(self):
+        """Apply the LayerNormalization fusion passes in order.
+
+        Runs FusionLayerNormalization (given hidden_size), then
+        FusionLayerNormalizationTF, then FusionQOrderedLayerNormalization; the
+        last one is only relevant in models with Q-DQ nodes.
+        """
+        FusionLayerNormalization(self, self.hidden_size).apply()
+        FusionLayerNormalizationTF(self).apply()
+        FusionQOrderedLayerNormalization(self).apply()
+
+    def fuse_skip_layer_norm(self):
+        """Apply the FusionSkipLayerNormalization pass to this model."""
+        FusionSkipLayerNormalization(self).apply()
+
+    def fuse_split_qkv(self):
+        """Apply FusionSplitQKV with this model's hidden_size and num_heads."""
+        FusionSplitQKV(self, self.hidden_size, self.num_heads).apply()
+
+    # Only relevant in models with Q-DQ nodes
+    def fuse_qordered_mamtul(self):
+        """Apply the FusionQOrderedMatMul pass to this model."""
+        FusionQOrderedMatMul(self).apply()
+
+    def get_graph_inputs_from_node_type(
+        self, op_type: str, input_indices: List[int], casted: bool
+    ):
+        """Collect names of graph inputs consumed by nodes of type `op_type`.
+
+        Only node inputs at `input_indices` are examined.  casted=False returns
+        inputs wired directly to a graph input; casted=True returns the graph
+        inputs that reach the node through a single Cast node instead.
+        """
+        producers = self.output_name_to_node()
+        found = []
+        for node in self.get_nodes_by_op_type(op_type):
+            for index in input_indices:
+                if index >= len(node.input):
+                    continue
+                name = node.input[index]
+                if self.find_graph_input(name):
+                    if not casted:
+                        found.append(name)
+                    continue
+                parent = producers.get(name)
+                if parent is None or parent.op_type != "Cast":
+                    continue
+                if self.find_graph_input(parent.input[0]) is not None and casted:
+                    found.append(parent.input[0])
+        return found
+
+    def get_graph_inputs_from_fused_nodes(self, casted: bool):
+        """Return graph inputs of EmbedLayerNormalization (inputs 0, 1, 7) and Attention (input 3)."""
+        lookup = self.get_graph_inputs_from_node_type
+        embed_inputs = lookup("EmbedLayerNormalization", [0, 1, 7], casted)
+        attention_inputs = lookup("Attention", [3], casted)
+        return embed_inputs + attention_inputs
+
+    def change_graph_input_type(
+        self,
+        graph: GraphProto,
+        graph_input: ValueInfoProto,
+        new_type: int = TensorProto.INT32,
+    ):
+        """Change graph input type, and add Cast node if needed.
+
+        Args:
+            graph (GraphProto): graph
+            graph_input (ValueInfoProto): input of the graph
+            new_type (int, optional): new data type. Defaults to TensorProto.INT32.
+
+        Returns:
+            NodeProto: the new Cast node that was added, or None if no Cast node was added.
+            List[NodeProto]: Cast nodes that have been removed.
+        """
+        assert isinstance(graph, GraphProto)
+        assert isinstance(graph_input, ValueInfoProto)
+        assert self.find_graph_input(graph_input.name)
+
+        if graph_input.type.tensor_type.elem_type == int(new_type):
+            return None, []
+
+        new_cast_node = None
+        nodes_to_remove = []
+
+        input_name_to_nodes = self.input_name_to_nodes()
+        if graph_input.name in input_name_to_nodes:
+            nodes = input_name_to_nodes[graph_input.name]
+
+            # For children that are not Cast nodes, insert one Cast node that converts the input back to its original data type.
+            nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
+            if nodes_not_cast:
+                node_name = self.create_node_name("Cast")
+                output_name = node_name + "_" + graph_input.name
+                new_value_info = graph.value_info.add()
+                new_value_info.CopyFrom(graph_input)
+                new_value_info.name = output_name
+                new_cast_node = helper.make_node(
+                    "Cast",
+                    [graph_input.name],
+                    [output_name],
+                    to=int(graph_input.type.tensor_type.elem_type),
+                    name=node_name,
+                )
+                graph.node.extend([new_cast_node])
+
+                for node in nodes_not_cast:
+                    OnnxModel.replace_node_input(node, graph_input.name, output_name)
+
+            # For children that are Cast nodes, there is no need to insert a Cast.
+            # When a child casts to new_type, its consumers are rewired to read the graph input directly.
+            nodes_cast = [node for node in nodes if node.op_type == "Cast"]
+            for node in nodes_cast:
+                if OnnxModel.get_node_attribute(node, "to") == int(new_type):
+                    self.replace_input_of_all_nodes(node.output[0], graph_input.name)
+                if not self.find_graph_output(node.output[0]):  # NOTE(review): applies to every Cast child, not only casts to new_type - confirm
+                    nodes_to_remove.append(node)
+            if nodes_to_remove:
+                self.remove_nodes(nodes_to_remove)
+
+        graph_input.type.tensor_type.elem_type = int(new_type)
+        return new_cast_node, nodes_to_remove
+
+    def change_graph_inputs_to_int32(self):
+        """Change every graph input to int32, adding Cast nodes where needed,
+        and log how many Cast nodes were added and removed.
+        """
+        graph = self.graph()
+        added, removed = 0, 0
+        for graph_input in graph.input:
+            cast_node, dropped = self.change_graph_input_type(
+                graph, graph_input, TensorProto.INT32
+            )
+            added += 1 if cast_node else 0
+            removed += len(dropped)
+        logger.info(
+            f"Graph inputs are changed to int32. Added {added} Cast nodes, and removed {removed} Cast nodes."
+        )
+
+    def use_dynamic_axes(
+        self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
+    ):
+        """Give fused-node graph inputs and all graph outputs symbolic dimensions.
+
+        Dim 0 becomes `dynamic_batch_dim`; for inputs, dim 1 additionally becomes
+        `dynamic_seq_len` unless that argument is None.
+        """
+        casted_inputs = self.get_graph_inputs_from_fused_nodes(casted=True)
+        direct_inputs = self.get_graph_inputs_from_fused_nodes(casted=False)
+        target_names = casted_inputs + direct_inputs
+
+        for graph_input in self.model.graph.input:
+            if graph_input.name not in target_names:
+                continue
+            dims = graph_input.type.tensor_type.shape.dim
+            dims[0].dim_param = dynamic_batch_dim
+            if dynamic_seq_len is not None:
+                dims[1].dim_param = dynamic_seq_len
+
+        for graph_output in self.model.graph.output:
+            graph_output.type.tensor_type.shape.dim[0].dim_param = dynamic_batch_dim
+
+    def preprocess(self):
+        """Run adjust_reshape_and_expand(); returns None."""
+        self.adjust_reshape_and_expand()
+
+    def adjust_reshape_and_expand(self):
+        """Drop Reshape nodes with an empty constant shape and bypass Slice->Reshape->Expand->Expand chains."""
+        nodes_to_remove = []
+        for node in self.nodes():
+            if node.op_type == "Reshape":
+                # Remove Reshape nodes whose constant "shape" input is empty, rewiring consumers to the Reshape's input.
+                reshape_shape = self.get_constant_value(node.input[1])
+                if reshape_shape is not None and reshape_shape.size == 0:
+                    nodes_to_remove.extend([node])
+                    self.replace_input_of_all_nodes(node.output[0], node.input[0])
+                    continue
+
+                # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
+                # changing current reshape's input to output of slice.
+                reshape_path = self.match_parent_path(
+                    node,
+                    ["Expand", "Expand", "Reshape", "Slice"],
+                    [0, 0, 0, 0],
+                    self.output_name_to_node(),
+                )
+                if reshape_path is not None:
+                    expand_node = reshape_path[-3]
+                    expand_shape_value = self.get_constant_value(expand_node.input[1])
+
+                    reshape_before_expand = reshape_path[-2]
+                    shape_value = self.get_constant_value(
+                        reshape_before_expand.input[1]
+                    )
+
+                    slice_node = reshape_path[-1]
+                    if (
+                        expand_shape_value is not None
+                        and shape_value is not None
+                        and len(expand_shape_value) == 2
+                        and len(shape_value) == 1
+                        and expand_shape_value[1] == shape_value[0]
+                    ):
+                        node.input[0] = slice_node.output[0]
+
+        if nodes_to_remove:
+            self.remove_nodes(nodes_to_remove)
+            logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
+
+    def clean_graph(self):
+        """Shortcut constant mask-shape subgraphs and drop Attention mask inputs built only from graph input 0's shape."""
+        output_name_to_node = self.output_name_to_node()
+        nodes_to_remove = []
+        for node in self.nodes():
+            # Before:
+            #  input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
+            #          |                                                     |
+            #          |                                                     v
+            #          +----> Shape --> Gather(indices=1) --> Unsqueeze--->  Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum
+            # After:
+            #  input_ids --> Shape                                                  --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum
+            # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
+            op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
+            if node.op_type in op_input_id:
+                i = op_input_id[node.op_type]
+                parent_nodes = self.match_parent_path(
+                    node,
+                    [
+                        "Cast",
+                        "ConstantOfShape",
+                        "Concat",
+                        "Unsqueeze",
+                        "Gather",
+                        "Shape",
+                    ],
+                    [i, 0, 0, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    (
+                        cast,
+                        constantOfShape,
+                        concat,
+                        unsqueeze,
+                        gather,
+                        shape,
+                    ) = parent_nodes
+                    if shape.input[0] == self.graph().input[0].name:
+                        constantOfShape.input[0] = shape.output[0]
+                        output_name_to_node = self.output_name_to_node()
+
+            if node.op_type == "Attention":
+                # Before:
+                #   input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
+                # After: this path is removed, along with the optional mask_index input of the Attention node.
+                parent_nodes = self.match_parent_path(
+                    node,
+                    ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
+                    [3, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    if parent_nodes[-1].input[0] == self.graph().input[0].name:
+                        attention_node = helper.make_node(
+                            "Attention",
+                            inputs=node.input[0 : len(node.input) - 1],
+                            outputs=node.output,
+                            name=node.name + "_remove_mask",
+                        )
+                        attention_node.domain = "com.microsoft"
+                        attention_node.attribute.extend(
+                            [helper.make_attribute("num_heads", self.num_heads)]
+                        )  # NOTE(review): only num_heads is set; other attributes of `node` are not copied
+                        self.add_node(
+                            attention_node, self.get_graph_by_node(attention_node).name
+                        )
+                        nodes_to_remove.append(node)
+        self.remove_nodes(nodes_to_remove)
+
+    def postprocess(self):  # runs clean_graph() first, then prune_graph()
+        self.clean_graph()
+        self.prune_graph()
+
+    def optimize(
+        self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
+    ):
+        """Run the conformer fusion pipeline on this model in place.
+
+        Args:
+            options: Fusion switches.  With None, the passes guarded by
+                `options is None or ...` run, and the option-only passes (swinT,
+                roformer format, GPT2-classify/ViT custom FC, Gelu approximation)
+                are skipped rather than raising AttributeError on None.
+            add_dynamic_axes: If True, call `use_dynamic_axes()` at the end.
+        """
+        if (options is not None) and not options.enable_shape_inference:
+            self.disable_shape_inference()
+        self.utils.remove_identity_nodes()
+        # Remove cast nodes that having same data type of input and output based on symbolic shape inference.
+        self.utils.remove_useless_cast_nodes()
+
+        if (options is None) or options.enable_layer_norm:
+            self.fuse_layer_norm()
+        if (options is None) or options.enable_gelu:
+            self.fuse_gelu()
+
+        self.preprocess()
+        self.fuse_reshape()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        if options is not None and options.enable_swint_opt:
+            self.fuse_custom_fc()
+            self.fuse_swinT_serial_bias_add()
+
+        if options is not None and options.enable_format_roformer:
+            self.fuse_format_roformer()
+
+        if options is not None and (options.enable_gpt2_classify or options.enable_vit):
+            self.fuse_custom_fc_gpt2_classify()
+        if options is not None and options.enable_vit:
+            self.fuse_custom_fc()
+
+        self.fuse_custom_fc()
+        self.fuse_custom_xsoftmax()
+        self.fuse_attention()
+        self.fuse_split_qkv()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        # Perform the MatMul fusion after the Attention fusion as we do not
+        # want to fuse the MatMuls inside the Attention subgraphs
+        if (options is None) or options.enable_qordered_matmul:
+            self.fuse_qordered_mamtul()
+
+        self.fuse_shape()
+        if (options is None) or options.enable_embed_layer_norm:
+            self.fuse_embed_layer()
+
+        # Remove reshape nodes that having same shape of input and output based on symbolic shape inference.
+        self.utils.remove_useless_reshape_nodes()
+        self.postprocess()
+
+        # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
+        if (options is None) or options.enable_bias_gelu:
+            # Fuse Gelu and Add Bias before it.
+            self.fuse_bias_gelu(is_fastgelu=True)
+            self.fuse_bias_gelu(is_fastgelu=False)
+
+        if (options is None) or options.enable_bias_skip_layer_norm:
+            # Fuse SkipLayerNormalization and Add Bias before it.
+            self.fuse_add_bias_skip_layer_norm()
+
+        if options is not None and options.enable_gelu_approximation:
+            self.gelu_approximation()
+
+        self.remove_unused_constant()
+        self.fuse_custom_fc_conformer_activation()
+
+        # Use symbolic batch dimension in input and output.
+        if add_dynamic_axes:
+            self.use_dynamic_axes()
+
+        logger.info(f"opset version: {self.get_opset_version()}")
+
+    def get_fused_operator_statistics(self):
+        """Count the nodes of each fused operator type and log the counts.
+
+        Returns a dict mapping operator type name to its node count.
+        """
+        fused_op_types = (
+            "EmbedLayerNormalization",
+            "Attention",
+            "QOrderedAttention",
+            "Gelu",
+            "QOrderedGelu",
+            "FastGelu",
+            "BiasGelu",
+            "LayerNormalization",
+            "QOrderedLayerNormalization",
+            "SkipLayerNormalization",
+            "QOrderedMatMul",
+        )
+        op_count = {
+            op_type: len(self.get_nodes_by_op_type(op_type)) for op_type in fused_op_types
+        }
+        logger.info(f"Optimized operators:{op_count}")
+        return op_count
+
+    def is_fully_optimized(self):
+        """Report whether the fused-operator counts look like a fully fused model.
+
+        True requires an EmbedLayerNormalization node, at least one Attention or
+        QOrderedAttention node, as many Gelu/BiasGelu/FastGelu nodes as attention
+        nodes, and at least two (Skip)LayerNormalization nodes per attention node.
+        Every fusion kind whose count is zero is logged.
+        """
+        counts = self.get_fused_operator_statistics()
+        embed = counts["EmbedLayerNormalization"]
+        attention = counts["Attention"] + counts["QOrderedAttention"]
+        gelu = counts["Gelu"] + counts["BiasGelu"] + counts["FastGelu"]
+        layer_norm = counts["LayerNormalization"] + counts["SkipLayerNormalization"]
+
+        # Checked in this order; only a missing attention fusion is logged at warning level.
+        diagnostics = (
+            (layer_norm, logger.debug, "Layer Normalization not fused"),
+            (gelu, logger.debug, "Gelu/FastGelu not fused"),
+            (embed, logger.debug, "Embed Layer not fused"),
+            (attention, logger.warning, "Attention not fused"),
+        )
+        for count, log, message in diagnostics:
+            if count == 0:
+                log(message)
+
+        # Both an embedding fusion and at least one attention fusion are mandatory.
+        if embed <= 0 or attention <= 0:
+            return False
+        return attention == gelu and layer_norm >= 2 * attention
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py
new file mode 100755
index 000000000..858893199
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py
@@ -0,0 +1,540 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import List, Optional
+
+import onnx
+from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper
+from passes.fuse_series_bias_add import FusionSerialBiasAdd
+from passes.fusion_albert_attention import FusionAlbertAttention
+from passes.fusion_attention import AttentionMask, FusionAttention
+from passes.fusion_biasgelu import FusionBiasGelu
+from passes.fusion_customfc import (
+    FusionCustomFC,
+    FusionCustomFCActivation,
+    FusionCustomFcRoformer,
+)
+from passes.fusion_disentangled_attention import FusionDisentangledAttention
+from passes.fusion_embedlayer import FusionEmbedLayerNormalization
+from passes.fusion_fastgelu import FusionFastGelu
+from passes.fusion_format_roformer import (
+    FusionFormatInvalidMask,
+    FusionRemoveUselessElementwise,
+)
+from passes.fusion_gelu import FusionGelu
+from passes.fusion_gelu_approximation import FusionGeluApproximation
+from passes.fusion_layernorm import (
+    FusionLayerNormalization,
+    FusionLayerNormalizationKeras,
+    FusionLayerNormalizationTF,
+)
+from passes.fusion_options import FusionOptions
+from passes.fusion_qordered_attention import FusionQOrderedAttention
+from passes.fusion_qordered_gelu import FusionQOrderedGelu
+from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization
+from passes.fusion_qordered_matmul import FusionQOrderedMatMul
+from passes.fusion_reshape import FusionReshape
+from passes.fusion_roformer_attention import FusionRoformerCrossAttention
+from passes.fusion_rope import FusionRoPE
+from passes.fusion_shape import FusionShape
+from passes.fusion_skiplayernorm import (
+    FusionBiasSkipLayerNormalization,
+    FusionSkipLayerNormalization,
+)
+from passes.fusion_swinl_attention import FusionSwinLAttention
+from passes.fusion_utils import FusionUtils
+from passes.fusion_videobert_attention import FusionVideoBertAttention
+from passes.fusion_vit_attention import FusionVITAttention
+from passes.fusion_xsoftmax import FusionXSoftmax
+from passes.onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class RoformerOnnxModel(OnnxModel):
+    def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0):
+        """Initialize RoFormer ONNX Model.
+
+        Args:
+            model (ModelProto): the ONNX model
+            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
+            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
+        """
+        # Accept either both values as 0, or a positive num_heads that divides hidden_size evenly.
+        assert (num_heads == 0 and hidden_size == 0) or (
+            num_heads > 0 and hidden_size % num_heads == 0
+        )
+        super().__init__(model)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        self.attention_mask = AttentionMask(self)
+        self.attention_fusion = FusionAttention(
+            self, self.hidden_size, self.num_heads, self.attention_mask
+        )
+        self.qordered_attention_fusion = FusionQOrderedAttention(
+            self, self.hidden_size, self.num_heads, self.attention_mask
+        )
+        self.utils = FusionUtils(self)
+
+    def fuse_attention(self):
+        FusionRoformerCrossAttention(self).apply()
+
+    def fuse_format_roformer(self):
+        # FusionRemoveUselessElementwise(self).apply()
+        fusion = FusionFormatInvalidMask(self)
+        fusion.apply()
+
+    def fuse_custom_fc(self):
+        fusion = FusionCustomFC(self)
+        fusion.apply()
+
+    def fuse_custom_fc_activation(self):
+        fusion = FusionCustomFCActivation(self)
+        fusion.apply()
+
+    def fuse_custom_fc_roformer(self):
+        fusion = FusionCustomFcRoformer(self)
+        fusion.apply()
+
+    def fuse_rope(self):
+        fusion = FusionRoPE(self)
+        fusion.apply()
+
+    def fuse_swinT_serial_bias_add(self):
+        fusion = FusionSerialBiasAdd(self)
+        fusion.apply()
+
+    def fuse_gelu(self):
+        fusion = FusionGelu(self)
+        fusion.apply()
+        fusion = FusionFastGelu(self)
+        fusion.apply()
+        # Only relevant in models with Q-DQ nodes
+        fusion = FusionQOrderedGelu(self)
+        fusion.apply()
+
+    def fuse_bias_gelu(self, is_fastgelu):
+        fusion = FusionBiasGelu(self, is_fastgelu)
+        fusion.apply()
+
+    def gelu_approximation(self):
+        fusion = FusionGeluApproximation(self)
+        fusion.apply()
+
+    def fuse_add_bias_skip_layer_norm(self):
+        fusion = FusionBiasSkipLayerNormalization(self)
+        fusion.apply()
+
+    def fuse_reshape(self):
+        fusion = FusionReshape(self)
+        fusion.apply()
+
+    def fuse_shape(self):
+        fusion = FusionShape(self)
+        fusion.apply()
+
+    def fuse_embed_layer(self):
+        fusion = FusionEmbedLayerNormalization(self)
+        fusion.apply()
+
+    def fuse_layer_norm(self):
+        fusion = FusionLayerNormalizationKeras(self)
+        fusion.apply()
+
+    def fuse_skip_layer_norm(self):
+        fusion = FusionSkipLayerNormalization(self)
+        fusion.apply()
+
+    # Only relevant in models with Q-DQ nodes
+    def fuse_qordered_mamtul(self):
+        fusion = FusionQOrderedMatMul(self)
+        fusion.apply()
+
+    def get_graph_inputs_from_node_type(
+        self, op_type: str, input_indices: List[int], casted: bool
+    ):
+        """
+        Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
+        Returns a list of the graph input names based on the filter whether it is casted or not.
+        """
+        graph_inputs = []
+
+        output_name_to_node = self.output_name_to_node()
+        nodes = self.get_nodes_by_op_type(op_type)
+        for node in nodes:
+            bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)]
+            for bert_input in bert_inputs:
+                if self.find_graph_input(bert_input):
+                    if not casted:
+                        graph_inputs.append(bert_input)
+                elif bert_input in output_name_to_node:
+                    parent = output_name_to_node[bert_input]
+                    if (
+                        parent.op_type == "Cast"
+                        and self.find_graph_input(parent.input[0]) is not None
+                    ):
+                        if casted:
+                            graph_inputs.append(parent.input[0])
+        return graph_inputs
+
+    def get_graph_inputs_from_fused_nodes(self, casted: bool):
+        inputs = self.get_graph_inputs_from_node_type(
+            "EmbedLayerNormalization", [0, 1, 7], casted
+        )
+        inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted)
+        return inputs
+
+    def change_graph_input_type(
+        self,
+        graph: GraphProto,
+        graph_input: ValueInfoProto,
+        new_type: int = TensorProto.INT32,
+    ):
+        """Change graph input type, and add Cast node if needed.
+
+        Args:
+            graph (GraphProto): graph
+            graph_input (ValueInfoProto): input of the graph
+            new_type (int, optional): new data type. Defaults to TensorProto.INT32.
+
+        Returns:
+            NodeProto: the new Cast node that was added. None if no Cast node was added.
+            List[NodeProto]: Cast nodes that have been removed.
+        """
+        assert isinstance(graph, GraphProto)
+        assert isinstance(graph_input, ValueInfoProto)
+        assert self.find_graph_input(graph_input.name)
+
+        if graph_input.type.tensor_type.elem_type == int(new_type):
+            return None, []
+
+        new_cast_node = None
+        nodes_to_remove = []
+
+        input_name_to_nodes = self.input_name_to_nodes()
+        if graph_input.name in input_name_to_nodes:
+            nodes = input_name_to_nodes[graph_input.name]
+
+            # For children that are not Cast nodes, insert a Cast back to the input's original data type.
+            nodes_not_cast = [node for node in nodes if node.op_type != "Cast"]
+            if nodes_not_cast:
+                node_name = self.create_node_name("Cast")
+                output_name = node_name + "_" + graph_input.name
+                new_value_info = graph.value_info.add()
+                new_value_info.CopyFrom(graph_input)
+                new_value_info.name = output_name
+                new_cast_node = helper.make_node(
+                    "Cast",
+                    [graph_input.name],
+                    [output_name],
+                    to=int(graph_input.type.tensor_type.elem_type),
+                    name=node_name,
+                )
+                graph.node.extend([new_cast_node])
+
+                for node in nodes_not_cast:
+                    OnnxModel.replace_node_input(node, graph_input.name, output_name)
+
+            # Cast children need no extra Cast. Only a Cast whose target already equals new_type is redundant:
+            # its consumers are rewired to the graph input and it is removed unless it feeds a graph output.
+            nodes_cast = [node for node in nodes if node.op_type == "Cast"]
+            for node in nodes_cast:
+                if OnnxModel.get_node_attribute(node, "to") == int(new_type):
+                    self.replace_input_of_all_nodes(node.output[0], graph_input.name)
+                    if not self.find_graph_output(node.output[0]):
+                        nodes_to_remove.append(node)
+            if nodes_to_remove:
+                self.remove_nodes(nodes_to_remove)
+
+        graph_input.type.tensor_type.elem_type = int(new_type)
+        return new_cast_node, nodes_to_remove
+
+    def change_graph_inputs_to_int32(self):
+        """Change data type of all graph inputs to int32 type, and add Cast node if needed."""
+        graph = self.graph()
+        add_cast_count = 0
+        remove_cast_count = 0
+        for graph_input in graph.input:
+            new_node, removed_nodes = self.change_graph_input_type(
+                graph, graph_input, TensorProto.INT32
+            )
+            if new_node:
+                add_cast_count += 1
+            remove_cast_count += len(removed_nodes)
+        logger.info(
+            f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes."
+        )
+
+    def use_dynamic_axes(
+        self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len"
+    ):
+        """
+        Set symbolic names on dim 0 (and dim 1) of fused-node graph inputs and on dim 0 of every output.
+        """
+        bert_graph_inputs = self.get_graph_inputs_from_fused_nodes(
+            casted=True
+        ) + self.get_graph_inputs_from_fused_nodes(casted=False)
+
+        # Only graph inputs that feed fused nodes are touched; dim 1 is left alone when dynamic_seq_len is None.
+        for graph_input in self.model.graph.input:
+            if graph_input.name in bert_graph_inputs:
+                dim_proto = graph_input.type.tensor_type.shape.dim[0]
+                dim_proto.dim_param = dynamic_batch_dim
+                if dynamic_seq_len is not None:
+                    dim_proto = graph_input.type.tensor_type.shape.dim[1]
+                    dim_proto.dim_param = dynamic_seq_len
+
+        for output in self.model.graph.output:
+            dim_proto = output.type.tensor_type.shape.dim[0]
+            dim_proto.dim_param = dynamic_batch_dim
+
+    def preprocess(self):
+        self.adjust_reshape_and_expand()
+        return
+
+    def adjust_reshape_and_expand(self):
+        """Remove Reshape nodes with an empty constant shape and bypass Slice->Reshape->Expand->Expand chains."""
+        nodes_to_remove = []
+        for node in self.nodes():
+            if node.op_type == "Reshape":
+                # Clean up unnecessary Reshape nodes.
+                # A Reshape whose constant "shape" input holds no elements is removed and its consumers rewired.
+                reshape_shape = self.get_constant_value(node.input[1])
+                if reshape_shape is not None and reshape_shape.size == 0:
+                    nodes_to_remove.extend([node])
+                    self.replace_input_of_all_nodes(node.output[0], node.input[0])
+                    continue
+
+                # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by
+                # changing current reshape's input to output of slice.
+                reshape_path = self.match_parent_path(
+                    node,
+                    ["Expand", "Expand", "Reshape", "Slice"],
+                    [0, 0, 0, 0],
+                    self.output_name_to_node(),
+                )
+                if reshape_path is not None:
+                    expand_node = reshape_path[-3]
+                    expand_shape_value = self.get_constant_value(expand_node.input[1])
+                    reshape_before_expand = reshape_path[-2]
+                    shape_value = self.get_constant_value(
+                        reshape_before_expand.input[1]
+                    )
+
+                    slice_node = reshape_path[-1]
+                    if (
+                        expand_shape_value is not None
+                        and shape_value is not None
+                        and len(expand_shape_value) == 2
+                        and len(shape_value) == 1
+                        and expand_shape_value[1] == shape_value[0]
+                    ):
+                        node.input[0] = slice_node.output[0]
+
+        if nodes_to_remove:
+            self.remove_nodes(nodes_to_remove)
+            logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}")
+
+    def clean_graph(self):
+        """Shortcut the shape chain built from the first graph input and drop the Attention mask input fed by it."""
+        output_name_to_node = self.output_name_to_node()
+        nodes_to_remove = []
+        for node in self.nodes():
+            # Before:
+            #  input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+
+            #          |                                                     |
+            #          |                                                     v
+            #          +----> Shape --> Gather(indices=1) --> Unsqueeze--->  Concat --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
+            # After:
+            #  input_ids --> Shape                                                  --> ConstantOfShape -->Cast --> EmbedLayerNormalization/ReduceSum
+            # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value)
+            op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3}
+            if node.op_type in op_input_id:
+                i = op_input_id[node.op_type]
+                parent_nodes = self.match_parent_path(
+                    node,
+                    [
+                        "Cast",
+                        "ConstantOfShape",
+                        "Concat",
+                        "Unsqueeze",
+                        "Gather",
+                        "Shape",
+                    ],
+                    [i, 0, 0, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    (
+                        cast,
+                        constantOfShape,
+                        concat,
+                        unsqueeze,
+                        gather,
+                        shape,
+                    ) = parent_nodes
+                    if shape.input[0] == self.graph().input[0].name:
+                        constantOfShape.input[0] = shape.output[0]
+                        output_name_to_node = self.output_name_to_node()
+            if node.op_type == "Attention":
+                # Before:
+                #   input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention
+                # After:
+                #   remove this path, and remove the optional mask_index input of Attention node.
+                parent_nodes = self.match_parent_path(
+                    node,
+                    ["ReduceSum", "Cast", "ConstantOfShape", "Shape"],
+                    [3, 0, 0, 0],
+                    output_name_to_node,
+                )
+                if parent_nodes is not None:
+                    if parent_nodes[-1].input[0] == self.graph().input[0].name:
+                        attention_node = helper.make_node(
+                            "Attention",
+                            inputs=node.input[0 : len(node.input) - 1],
+                            outputs=node.output,
+                            name=node.name + "_remove_mask",
+                        )
+                        attention_node.domain = "com.microsoft"
+                        attention_node.attribute.extend(
+                            [helper.make_attribute("num_heads", self.num_heads)]
+                        )
+                        # Resolve the graph through the node being replaced: the new node is not in any graph yet.
+                        graph_name = self.get_graph_by_node(node).name
+                        self.add_node(attention_node, graph_name)
+                        nodes_to_remove.append(node)
+        self.remove_nodes(nodes_to_remove)
+
+    def postprocess(self):
+        self.clean_graph()
+        self.prune_graph()
+
+    def optimize(
+        self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False
+    ):
+        """Apply the RoFormer graph fusions in a fixed order; ``options=None`` keeps the default-on passes enabled."""
+        if (options is not None) and not options.enable_shape_inference:
+            self.disable_shape_inference()
+        self.utils.remove_identity_nodes()
+
+        # Remove cast nodes that having same data type of input and output based on symbolic shape inference.
+        self.utils.remove_useless_cast_nodes()
+
+        if (options is None) or options.enable_layer_norm:
+            self.fuse_layer_norm()
+
+        if (options is None) or options.enable_gelu:
+            self.fuse_gelu()
+
+        self.preprocess()
+
+        self.fuse_reshape()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+        # options defaults to None, so guard before reading the flag (same pattern as the other passes).
+        if (options is None) or options.enable_format_roformer:
+            self.fuse_format_roformer()
+
+        self.fuse_custom_fc_roformer()
+
+        if (options is None) or options.enable_skip_layer_norm:
+            self.fuse_skip_layer_norm()
+
+        self.fuse_custom_fc()
+
+        if (options is None) or options.enable_attention:
+            if options is not None:
+                self.attention_mask.set_mask_format(options.attention_mask_format)
+            self.fuse_attention()
+
+        self.fuse_rope()
+
+        self.fuse_shape()
+
+        # Remove reshape nodes that having same shape of input and output based on symbolic shape inference.
+        self.utils.remove_useless_reshape_nodes()
+
+        self.postprocess()
+
+        # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization
+        if (options is None) or options.enable_bias_gelu:
+            # Fuse Gelu and Add Bias before it.
+            self.fuse_bias_gelu(is_fastgelu=True)
+            self.fuse_bias_gelu(is_fastgelu=False)
+
+        if (options is None) or options.enable_bias_skip_layer_norm:
+            # Fuse SkipLayerNormalization and Add Bias before it.
+            self.fuse_add_bias_skip_layer_norm()
+
+        if options is not None and options.enable_gelu_approximation:
+            self.gelu_approximation()
+
+        self.fuse_custom_fc_activation()
+
+        self.remove_unused_constant()
+
+        # Use symbolic batch dimension in input and output.
+        if add_dynamic_axes:
+            self.use_dynamic_axes()
+
+        logger.info(f"opset version: {self.get_opset_version()}")
+
+    def get_fused_operator_statistics(self):
+        """
+        Returns node count of fused operators.
+        """
+        op_count = {}
+        ops = [
+            "EmbedLayerNormalization",
+            "Attention",
+            "QOrderedAttention",
+            "Gelu",
+            "QOrderedGelu",
+            "FastGelu",
+            "BiasGelu",
+            "LayerNormalization",
+            "QOrderedLayerNormalization",
+            "SkipLayerNormalization",
+            "QOrderedMatMul",
+        ]
+        for op in ops:
+            nodes = self.get_nodes_by_op_type(op)
+            op_count[op] = len(nodes)
+        logger.info(f"Optimized operators:{op_count}")
+        return op_count
+
+    def is_fully_optimized(self):
+        """
+        Returns True when the model is fully optimized.
+        """
+        op_count = self.get_fused_operator_statistics()
+        embed = op_count["EmbedLayerNormalization"]
+        attention = op_count["Attention"] + op_count["QOrderedAttention"]
+        gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
+        layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
+        is_perfect = (
+            (embed > 0)
+            and (attention > 0)
+            and (attention == gelu)
+            and (layer_norm >= 2 * attention)
+        )
+
+        if layer_norm == 0:
+            logger.debug("Layer Normalization not fused")
+
+        if gelu == 0:
+            logger.debug("Gelu/FastGelu not fused")
+
+        if embed == 0:
+            logger.debug("Embed Layer not fused")
+
+        if attention == 0:
+            logger.warning("Attention not fused")
+
+        return is_perfect
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
index 88e6c99c1..57982d0cc 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py
@@ -80,7 +80,7 @@ def fuse_shape(self):
         fusion.apply()
 
     def fuse_layer_norm(self):
-        fusion = FusionLayerNormalization(self)
+        fusion = FusionLayerNormalization(self, 0)
         fusion.apply()
 
         fusion = FusionLayerNormalizationTF(self)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
index 49ed79498..701bd7a41 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
@@ -6,6 +6,8 @@
 import onnx
 from onnx import ModelProto, helper, load_model
 from onnx_model_bert import BertOnnxModel
+from onnx_model_roformer import RoformerOnnxModel
+from onnx_model_conformer import conformerOnnxModel
 from onnx_model_t5 import T5OnnxModel
 from onnx_model_yolo import YoloOnnxModel
 from onnxsim import simplify
@@ -16,10 +18,12 @@
 MODEL_TYPES = {
     "bert": (BertOnnxModel, None, "pytorch", 1),
     "swint": (BertOnnxModel, None, "pytorch", 1),
-    "roformer": (BertOnnxModel, None, "tf2onnx", 1),
+    "roformer": (RoformerOnnxModel, None, "tf2onnx", 1),
     "gpt2": (BertOnnxModel, None, "pytorch", 1),
     "t5": (T5OnnxModel, None, "tf2onnx", 1),
     "yolo": (YoloOnnxModel, None, "pytorch", 1),
+    "vit": (BertOnnxModel, None, "pytorch", 1),
+    "conformer": (conformerOnnxModel, None, "pytorch", 1),
 }
 
 
@@ -105,8 +109,11 @@ def optimize_to_ixrt(args):
                 input_tensor.type.tensor_type.shape.dim.extend(dim_list)
 
     try:
+        auto_merge = False
+        if args.model_type in ["roformer"]:
+            auto_merge = True
         static_model = SymbolicShapeInference.infer_shapes(
-            simplified_model, 2**31 - 1, False, False, 3
+            simplified_model, 2**31 - 1, auto_merge, False, 3
         )
         static_sim_model, check = simplify(static_model)
         if args.dump_onnx:
@@ -164,7 +171,7 @@ def args_parser():
         "--model_type",
         type=str,
         default="bert",
-        choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2"],
+        choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2", "vit", "conformer"],
         help="Which kind of model to optimize",
     )
     parser.add_argument(
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py
new file mode 100755
index 000000000..e825f95cb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py
@@ -0,0 +1,150 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import math
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+class FusionConformerAttention(Fusion):
+    """
+    Conformer attention fusion helpers targeting the CustomQKVToContextPluginDynamic_IxRT plugin op.
+    """
+
+    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
+        super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["Concat"])
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+
+    def get_num_heads_and_hidden_size(
+        self, atten_matmul: NodeProto, div: NodeProto
+    ) -> Tuple[int, int]:
+        """Detect num_heads and hidden_size from the attention MatMul weight and the scaling Div.
+
+        Args:
+            atten_matmul (NodeProto): MatMul node whose input[1] is a weight initializer
+            div (NodeProto): Div node whose input[1] is a scalar initializer, taken as sqrt(head_dim)
+        Returns:
+            Tuple[int, int]: num_heads and hidden_size
+        """
+
+        # hidden_size is the first dimension of the MatMul weight; head_dim is the squared Div constant.
+        atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1])
+        div_initializer = self.model.get_initializer(div.input[1])
+
+        # Prefer the typed float_data field when it is populated.
+        if len(div_initializer.float_data) > 0:
+            div_value = div_initializer.float_data[0]
+        else:
+            # float_data is empty, so fall back to the packed raw_data bytes,
+            # decoded with the numpy dtype mapped from the tensor's data_type.
+            if len(div_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
+                div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the div_initializer")
+
+        # The stored square root is inexact, so round to nearest: ceil would turn 64.0000001 into 65.
+        head_dim = int(round(float(div_value) ** 2))
+        hidden_size = NumpyHelper.to_array(atten_matul_initializer).shape[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self, num_heads: int, hidden_size: int, inputs: str, outputs: str
+    ) -> Union[NodeProto, None]:
+        """Create a CustomQKVToContextPluginDynamic_IxRT attention plugin node.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            inputs: input names, forwarded unchanged to helper.make_node (presumably a list of names; confirm at callers)
+            outputs: output names, forwarded unchanged to helper.make_node (presumably a list of names; confirm at callers)
+
+        Returns:
+            Union[NodeProto, None]: the node created, or None if hidden_size is not a multiple of num_heads.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None
+
+        attention_node_name = self.model.create_node_name("Attention")
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=inputs,
+            outputs=outputs,
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend(
+            [helper.make_attribute("hidden_size", hidden_size)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)])
+
+        return attention_node
+
+    def fuse_reshape(self, shape_data_name):
+        """Add an INT64 initializer ``shape_data_name`` = [128, -1, head_dim]; 128 is hard-coded (TODO confirm meaning)."""
+        shape_tensor = helper.make_tensor(
+            name=shape_data_name,
+            data_type=TensorProto.INT64,
+            dims=[3],
+            vals=np.int64([128, -1, self.hidden_size // self.num_heads]).tobytes(),
+            raw=True,
+        )
+        self.model.add_initializer(shape_tensor, self.this_graph_name)
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # normalize_node is the candidate node handed in by the Fusion driver (presumably a Concat, per __init__).
+        # When its parents form Unsqueeze <- Mul <- Gather <- Shape <- LayerNormalization, swap in a constant shape.
+        start_node = normalize_node
+
+        paths = {
+            "path": (
+                ["Unsqueeze", "Mul", "Gather", "Shape", "LayerNormalization"],
+                [None, None, None, None, None],
+            ),
+        }
+
+        reshape_nodes, reshape_path = self.match_parent_path_from_dict(
+            start_node, paths
+        )
+        if reshape_nodes is None:
+            return
+
+        self.nodes_to_remove.append(start_node)
+        # Keep the last matched node (LayerNormalization in the path above); drop the rest of the chain.
+        self.nodes_to_remove.extend(reshape_nodes[:-1])
+        self.fuse_reshape(start_node.output[0])
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py
new file mode 100755
index 000000000..78a40973f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py
@@ -0,0 +1,129 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Tuple, Union
+
+import numpy as np
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionConformerXSoftmax(Fusion):
+    """Fuse Expand -> Reshape -> Where -> Add -> Softmax into Reshape + XSoftmax_IxRT + Reshape.
+
+    ``num_heads`` / ``seq_len`` size the Reshape constants; the defaults (8, 128) are the former hard-coded values.
+    """
+
+    def __init__(self, model: OnnxModel, num_heads: int = 8, seq_len: int = 128):
+        super().__init__(model, "XSoftmax_IxRT", "Softmax")
+        self.num_heads = num_heads
+        self.seq_len = seq_len
+
+    def create_xsoftmax_node(
+        self, data_input: str, mask_input: str, output: str
+    ) -> Union[NodeProto, None]:
+        """Create an XSoftmax node and add the Reshape nodes around it to the model.
+
+        Args:
+            data_input (str): data input name
+            mask_input (str): mask input name
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the XSoftmax_IxRT node (the caller queues it for insertion).
+        """
+
+        unique_index = data_input
+        new_edge = "edge_modified_" + unique_index
+        # Reshape target: (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN)
+        shape_tensor = helper.make_tensor(
+            name="shape_modified_tensor_" + unique_index,
+            data_type=TensorProto.INT64,
+            dims=[4],
+            vals=np.int64([-1, self.num_heads, self.seq_len, self.seq_len]).tobytes(),
+            raw=True,
+        )
+        self.model.add_initializer(shape_tensor, self.this_graph_name)
+        self.model.add_node(
+            helper.make_node(
+                "Reshape",
+                [data_input, shape_tensor.name],
+                [new_edge],
+                "reshape_modified_" + unique_index,
+            ),
+            self.this_graph_name,
+        )
+
+        new_edge2 = "edge_modified2_" + unique_index
+        xsoftmax_node_name = self.model.create_node_name("XSoftmax")
+
+        xsoftmax_node = helper.make_node(
+            "XSoftmax_IxRT",
+            inputs=[new_edge, mask_input],
+            outputs=[new_edge2],
+            name=xsoftmax_node_name,
+        )
+        xsoftmax_node.domain = "com.iluvatar"
+        xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)])
+        xsoftmax_node.attribute.extend([helper.make_attribute("is_conformer", 1)])
+        # Reshape target: (BSZ * HEAD_NUM, SEQ_LEN, SEQ_LEN), written to the original output name.
+        shape_tensor2 = helper.make_tensor(
+            name="shape_modified_tensor2_" + unique_index,
+            data_type=TensorProto.INT64,
+            dims=[3],
+            vals=np.int64([-1, self.seq_len, self.seq_len]).tobytes(),
+            raw=True,
+        )
+        self.model.add_initializer(shape_tensor2, self.this_graph_name)
+        self.model.add_node(
+            helper.make_node(
+                "Reshape",
+                [new_edge2, shape_tensor2.name],
+                [output],
+                "reshape_modified2_" + unique_index,
+            ),
+            self.this_graph_name,
+        )
+
+        return xsoftmax_node
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Replace the Softmax ``node`` and its Add/Where/Reshape/Expand parents with the XSoftmax sub-graph."""
+        xsoftmax_paths = {
+            "path": (["Add", "Where", "Reshape", "Expand"], [None, None, None, None]),
+        }
+        xsoftmax_nodes, _ = self.match_parent_path_from_dict(
+            node, xsoftmax_paths
+        )
+
+        if xsoftmax_nodes is None:
+            logger.debug("fuse_xsoftmax: failed to match xsoftmax path")
+            return
+        else:
+            (add_node, where_node, reshape_node, expand_node) = xsoftmax_nodes
+
+            mask_input = expand_node.input[0]
+
+            data_output = node.output[0]
+            # The data input is whichever Add operand does not come from the Where.
+            data_input = add_node.input[0]
+            if where_node.output[0] == add_node.input[0]:
+                data_input = add_node.input[1]
+            xsoftmax_node = self.create_xsoftmax_node(
+                data_input, mask_input, data_output
+            )
+            # Remove the Softmax too: the trailing Reshape now produces its output tensor.
+            self.nodes_to_remove.extend([node, *xsoftmax_nodes])
+            self.nodes_to_add.append(xsoftmax_node)
+            self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
index 074b6d595..e9e401150 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py
@@ -37,7 +37,15 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
         w = NumpyHelper.to_array(matmul_weight)
         b = NumpyHelper.to_array(matmul_bias)
 
-        trans_matmul_weight = w.transpose(1, 0)
+        transB = 0
+        for attr in matmul.attribute:
+            if attr.name == "transB":
+                transB = attr.i
+                break
+
+        trans_matmul_weight = w
+        if transB == 0:
+            trans_matmul_weight = w.transpose(1, 0)
         if matmul_weight.name not in self.model.initializer_visited.keys():
             self.model.initializer_visited[matmul_weight.name] = True
             if matmul_weight.data_type == 10:
@@ -77,6 +85,96 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
         self.nodes_to_remove.extend([matmul, node, reshape_before_matmul])
 
 
+class FusionCustomFcRoformer(Fusion):
+    """Fuse Reshape -> MatMul -> Reshape -> Add (RoFormer) into CustomFCPluginDynamic_IxRT."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"], "roformer fc")
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Fuse the Add ``node`` with its Reshape <- MatMul <- Reshape parent chain.
+
+        The MatMul weight and the Add bias must be initializers. Both are rewritten in
+        place (weight transposed, bias flattened) and stored as the ``W`` / ``B``
+        attributes of the fused node.
+
+        Returns:
+            bool: True if a fused node was queued, False if the pattern did not match.
+        """
+        if len(node.input) != 2:
+            return False
+
+        fc_paths = {
+            "path1": (["Reshape", "MatMul", "Reshape"], [0, 0, 0]),
+            "path2": (["Reshape", "MatMul", "Reshape"], [1, 0, 0]),
+        }
+
+        nodes, _ = self.match_parent_path_from_dict(node, fc_paths)
+        if nodes is None:
+            return False
+
+        reshape_after_matmul = nodes[0]
+        matmul = nodes[1]
+        reshape_before_matmul = nodes[2]
+
+        # Fuse only if value_info reports the same rank before the first Reshape and after
+        # the second one. NOTE(review): two missing entries compare equal (None == None).
+        reshape_before_shape = None
+        reshape_after_shape = None
+        for value_info in self.model.graph().value_info:
+            if value_info.name == reshape_before_matmul.input[0]:
+                reshape_before_shape = len(value_info.type.tensor_type.shape.dim)
+                break
+        for value_info in self.model.graph().value_info:
+            if value_info.name == reshape_after_matmul.output[0]:
+                reshape_after_shape = len(value_info.type.tensor_type.shape.dim)
+                break
+        if reshape_before_shape != reshape_after_shape:
+            return False
+
+        weight = self.model.get_initializer(matmul.input[1])
+        bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer(
+            node.input[0]
+        )
+
+        if weight is None or bias is None:
+            return False
+
+        w = NumpyHelper.to_array(weight)
+        if w.ndim != 2:  # transpose(1, 0) below is only defined for 2-D weights
+            return False
+
+        # from_array() keeps the array dtype, so fp16 tensors stay fp16 without a re-cast.
+        trans_matmul_weight = w.transpose(1, 0)
+        weight.CopyFrom(onnx.numpy_helper.from_array(trans_matmul_weight, weight.name))
+        bias_arr = onnx.numpy_helper.to_array(bias).flatten()
+        bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name))
+
+        fused_node = helper.make_node(
+            "CustomFCPluginDynamic_IxRT",
+            inputs=[reshape_before_matmul.input[0]],
+            outputs=node.output,
+            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
+        )
+        fused_node.domain = "com.iluvatar"
+        # Use the flattened bias length so that e.g. a (1, N) bias still reports N.
+        fused_node.attribute.extend(
+            [helper.make_attribute("out_dims", int(bias_arr.shape[0]))]
+        )
+        fused_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        fused_node.attribute.extend([helper.make_attribute("W", weight)])
+        fused_node.attribute.extend([helper.make_attribute("B", bias)])
+        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
+        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
+        self.nodes_to_add.append(fused_node)
+
+        self.nodes_to_remove.extend([node])
+        self.nodes_to_remove.extend(nodes)
+        return True
+
+
 class FusionCustomFC(Fusion):
     def __init__(self, model: OnnxModel):
         super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"])
@@ -145,77 +243,6 @@ def fuse_1(self, node, input_name_to_nodes, output_name_to_node):
         self.nodes_to_remove.extend([matmul, node])
         return True
 
-    # For model Roformer.
-    def fuse_2(self, node, input_name_to_nodes, output_name_to_node):
-        if len(node.input) != 2:
-            return False
-
-        fc_paths = {
-            "path1": (["Reshape", "MatMul"], [0, 0]),
-            "path2": (["Reshape", "MatMul"], [1, 0]),
-        }
-
-        nodes, paths = self.match_parent_path_from_dict(node, fc_paths)
-        if nodes is None:
-            return False
-
-        reshape_after_matmul = nodes[0]
-        matmul = nodes[1]
-
-        weight = self.model.get_initializer(matmul.input[1])
-        bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer(
-            node.input[0]
-        )
-
-        if weight is None or bias is None:
-            return False
-
-        w = NumpyHelper.to_array(weight)
-        w_in_size = w.shape[0]
-        weight_dim = np.prod(w.shape[1:])
-
-        b = NumpyHelper.to_array(bias)
-        bias_dim = np.prod(b.shape)
-        weight_arr = (
-            onnx.numpy_helper.to_array(weight).flatten().reshape(w_in_size, weight_dim)
-        )
-        weight.CopyFrom(onnx.numpy_helper.from_array(weight_arr, weight.name))
-        # Sometimes weights and bias are stored in fp16
-        if weight.data_type == 10:
-            weight.CopyFrom(
-                numpy_helper.from_array(
-                    NumpyHelper.to_array(weight).astype(np.float16), weight.name
-                )
-            )
-        bias_arr = onnx.numpy_helper.to_array(bias).flatten()
-        bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name))
-        if bias.data_type == 10:
-            bias.CopyFrom(
-                numpy_helper.from_array(
-                    NumpyHelper.to_array(bias).astype(np.float16), bias.name
-                )
-            )
-
-        fused_node = helper.make_node(
-            "CustomFCPluginDynamic_IxRT",
-            inputs=[matmul.input[0]],
-            outputs=node.output,
-            name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"),
-        )
-        fused_node.domain = "com.iluvatar"
-        fused_node.attribute.extend([helper.make_attribute("out_dims", 1)])
-        fused_node.attribute.extend([helper.make_attribute("type_id", 1)])
-        fused_node.attribute.extend([helper.make_attribute("W", weight)])
-        fused_node.attribute.extend([helper.make_attribute("B", bias)])
-        fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
-        fused_node.attribute.extend([helper.make_attribute("plugin_version", 1)])
-        fused_node.attribute.extend([helper.make_attribute("act_type", -1)])
-        self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
-        self.nodes_to_add.append(fused_node)
-
-        self.nodes_to_remove.extend([node, nodes[0], nodes[1]])
-        return True
-
 
 class FusionCustomFCActivation(Fusion):
     def __init__(self, model: OnnxModel):
@@ -277,3 +304,41 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
             self.nodes_to_add.append(fc_node)
             self.nodes_to_remove.extend([node, fc_node])
             self.node_name_to_graph_name[fc_node.name] = self.this_graph_name
+
+
+class FusionConformerCustomFCActivation(Fusion):
+    """Fold a swish (``x * Sigmoid(x)``) that follows a CustomFCPluginDynamic_IxRT node into that node."""
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model,
+            "CustomFCPluginDynamic_IxRT",
+            ["Mul"],
+            "with activation",
+        )
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        """Absorb ``Mul(fc_out, Sigmoid(fc_out))`` into the FC node that produces ``fc_out``."""
+        nodes = self.model.match_parent_path(
+            node,
+            ["Sigmoid", "CustomFCPluginDynamic_IxRT"],
+            [None, 0],
+        )
+        if nodes is None:
+            return
+        (sigmoid_node, custom_fc_node) = nodes
+        # Swish is x * sigmoid(x): the Mul's other operand must be the FC output itself.
+        # Otherwise this is a gate (y * sigmoid(FC(x))) and folding it would drop y.
+        if custom_fc_node.output[0] not in node.input:
+            return
+        # NOTE(review): 20 is presumably the plugin's swish activation id - confirm.
+        activation_type = 20
+        for attr in custom_fc_node.attribute:
+            if attr.name == "act_type":
+                attr.i = activation_type
+                break
+        custom_fc_node.attribute.extend([helper.make_attribute("swish_alpha", 1.0)])
+        custom_fc_node.output[0] = node.output[0]
+        self.nodes_to_add.append(custom_fc_node)
+        self.nodes_to_remove.extend([node, sigmoid_node, custom_fc_node])
+        self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
index 5bd2e0c48..5b6d66ad3 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py
@@ -2,13 +2,13 @@
 # Copyright (c) Microsoft Corporation.  All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
+import math
 from enum import Enum
 from logging import getLogger
 from os import name
 from sys import path
 from typing import Tuple, Union
 
-import math
 import numpy as np
 import onnx
 from onnx import NodeProto, TensorProto, helper, numpy_helper
@@ -18,9 +18,9 @@
 from .fusion_utils import FusionUtils, NumpyHelper
 from .onnx_model import OnnxModel
 
-
 logger = getLogger(__name__)
 
+
 class FusionRemoveUselessElementwise(Fusion):
     """
     Fusion to remove useless elementwise in roformer model.
@@ -38,7 +38,10 @@ def __init__(
 
     def fuse(self, node, input_name_to_nodes, output_name_to_node):
         paths = {
-            "path1" : (["Max", "Min", "Add", "GlobalAveragePool"], [None, None, None, None]),
+            "path1": (
+                ["Max", "Min", "Add", "GlobalAveragePool"],
+                [None, None, None, None],
+            ),
         }
 
         pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths)
@@ -70,44 +73,35 @@ def __init__(
         self,
         model: OnnxModel,
     ):
-        super().__init__(model, "Softmax", ["Softmax"])
+        super().__init__(model, "", ["Greater"])
 
-        # Flags to show warning only once
-        self.num_heads_warning = True
-        self.hidden_size_warning = True
-
-    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+    def fuse(self, start_node, input_name_to_nodes, output_name_to_node):
         nodes = self.model.match_parent_path(
-            node,
-            ["Add", "Mul"],
-            [0, 1],
+            start_node,
+            [
+                "ReduceMin",
+                "Cast",
+                "Concat",
+                "Unsqueeze",
+                "Greater",
+                "ReduceMin",
+                "Cast",
+                "Concat",
+                "Unsqueeze",
+            ],
+            [0, 0, 0, 0, 0, 0, 0, 0, 0],
         )
 
         if nodes is None:
-            logger.debug("Roformer: unable to format the mul.")
+            logger.debug("Roformer: unable to format the mask.")
             return
 
-        mul_node = nodes[1]
-
-        inputs = mul_node.input
-        outputs = mul_node.output
-
-        coef0 = self.model.get_initializer(inputs[0])
-        coef1 = self.model.get_initializer(inputs[1])
-        if (coef0 and coef1) or (not coef0 and not coef1):
-            return
-        coef = coef0 if coef0 else coef1
-        coef.CopyFrom(numpy_helper.from_array(np.array([-100.0]).astype(np.float32), coef.name))
-
-        new_node = helper.make_node(
-            "Mul",
-            inputs = inputs,
-            outputs = outputs,
-            name = mul_node.name,
-        )
-        new_node.domain = "com.iluvatar"
+        unsqueeze_node = nodes[-1]  # last (top-most) node of the matched chain
 
-        self.nodes_to_add.append(new_node)
-        self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+        for consumer in self.model.graph().node:  # rewire every reader of start_node's output
+            for idx, input_name in enumerate(consumer.input):
+                if start_node.output[0] == input_name:
+                    consumer.input[idx] = unsqueeze_node.input[0]
 
-        self.nodes_to_remove.extend([mul_node])
\ No newline at end of file
+        self.nodes_to_remove.extend(nodes)
+        self.nodes_to_remove.extend([start_node])
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
index 922afa78a..727a1aa50 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py
@@ -5,15 +5,18 @@
 from logging import getLogger
 from typing import Dict
 
+import numpy as np
+from onnx import TensorProto, helper
+
 from .fusion_base import Fusion
-from onnx import helper
 from .onnx_model import OnnxModel
 
 logger = getLogger(__name__)
 
 
 class FusionLayerNormalization(Fusion):
-    def __init__(self, model: OnnxModel):
+    def __init__(self, model: OnnxModel, hidden_size):
+        self.hidden_size = hidden_size
         super().__init__(model, "LayerNormalization", "ReduceMean")
 
     def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
@@ -53,7 +56,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
 
         div_node = None
         for child in children:
-            div_node = self.model.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
+            div_node = self.model.find_first_child_by_type(
+                child, "Div", input_name_to_nodes, recursive=False
+            )
             if div_node is not None:
                 break
         if div_node is None:
@@ -87,6 +92,169 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
         if not self.model.find_constant_input(pow_node, 2.0) == 1:
             return
 
+        mul_node = input_name_to_nodes[div_node.output[0]][0]
+        is_not_have_mul_and_add = False
+        is_not_have_mul_and_add_lst_node = None
+        # deal with special case : layernorm do not have mul and add
+        if mul_node.op_type != "Mul" and mul_node.op_type == "MatMul":
+            is_not_have_mul_and_add = True
+            is_not_have_mul_and_add_lst_node = div_node
+        elif mul_node.op_type != "Mul":
+            return
+
+        if is_not_have_mul_and_add:
+            last_add_node = is_not_have_mul_and_add_lst_node
+            if self.hidden_size == 0:
+                print(
+                    "[Error] Please add '--hidden_size' and '--num_head' to fuse layernorm ..."
+                )
+                exit(0)
+
+            subgraph_nodes = [node]
+            subgraph_nodes.extend(children)
+            subgraph_nodes.extend(parent_nodes[:-1])
+            subgraph_nodes.extend([last_add_node])
+            if len(subgraph_nodes) == 7:
+                self.nodes_to_remove.extend(subgraph_nodes)
+            else:
+                return
+
+            norm_name = self.model.create_node_name(
+                "LayerNormalization", name_prefix="LayerNorm"
+            )
+            np_weights = np.ones((self.hidden_size)).astype(np.float32)
+            np_weights_name = norm_name + "_weights"
+            weights_tensor = helper.make_tensor(
+                np_weights_name, TensorProto.FLOAT, np_weights.shape, np_weights
+            )
+            np_bias = np.zeros((self.hidden_size)).astype(np.float32)
+            np_bias_name = norm_name + "_bias"
+            bias_tensor = helper.make_tensor(
+                np_bias_name, TensorProto.FLOAT, np_bias.shape, np_bias
+            )
+            self.model.add_initializer(weights_tensor)
+            self.model.add_initializer(bias_tensor)
+            normalize_node = helper.make_node(
+                "LayerNormalization",
+                inputs=[node.input[0], np_weights_name, np_bias_name],
+                outputs=[last_add_node.output[0]],
+                name=norm_name,
+            )
+            normalize_node.attribute.extend(
+                [helper.make_attribute("epsilon", float(add_weight))]
+            )
+            self.nodes_to_add.append(normalize_node)
+            self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
+        else:
+            last_add_node = input_name_to_nodes[mul_node.output[0]][0]
+            if last_add_node.op_type != "Add":
+                return
+
+            subgraph_nodes = [node]
+            subgraph_nodes.extend(children)
+            subgraph_nodes.extend(parent_nodes[:-1])
+
+            subgraph_nodes.extend([last_add_node, mul_node, div_node])
+            if not self.model.is_safe_to_fuse_nodes(
+                subgraph_nodes,
+                last_add_node.output,
+                input_name_to_nodes,
+                output_name_to_node,
+            ):
+                logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
+                return
+
+            weight_input = mul_node.input[
+                1 - self.model.input_index(div_node.output[0], mul_node)
+            ]
+            if not self.model.is_constant_with_specified_dimension(
+                weight_input, 1, "layernorm weight"
+            ):
+                return
+
+            bias_input = last_add_node.input[
+                1 - self.model.input_index(mul_node.output[0], last_add_node)
+            ]
+            if not self.model.is_constant_with_specified_dimension(
+                bias_input, 1, "layernorm bias"
+            ):
+                return
+
+            self.nodes_to_remove.extend(subgraph_nodes)
+            normalize_node = helper.make_node(
+                "LayerNormalization",
+                inputs=[node.input[0], weight_input, bias_input],
+                outputs=[last_add_node.output[0]],
+                name=self.model.create_node_name(
+                    "LayerNormalization", name_prefix="LayerNorm"
+                ),
+            )
+            normalize_node.attribute.extend(
+                [helper.make_attribute("epsilon", float(add_weight))]
+            )
+            self.nodes_to_add.append(normalize_node)
+            self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
+
+
+class FusionLayerNormalizationKeras(Fusion):
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model, "LayerNormalization", "GlobalAveragePool", "Keras layernorm"
+        )
+
+    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+        """
+          +-------------------------------+
+          |                               |
+          |                               v
+        [Root] -->  GlobalAveragePool-->  Sub  --> Mul --> GlobalAveragePool --> Add/Min/Max --> Sqrt --> Div --> Mul --> Add
+                                           |                                                               ^
+                                           |                                                               |
+                                           +---------------------------------------------------------------+
+        """
+        children = self.model.get_children(node, input_name_to_nodes)
+        # print(len(children))
+        if len(children) != 1:
+            return
+
+        root_input = node.input[0]
+
+        if children[0].op_type != "Sub" or children[0].input[0] != root_input:
+            return
+
+        div_node = None
+        for child in children:
+            div_node = self.model.find_first_child_by_type(
+                child, "Div", input_name_to_nodes, recursive=False
+            )
+            if div_node is not None:
+                break
+        if div_node is None:
+            return
+        # print('div_node_name:', div_node.name)
+        path_id, parent_nodes, _ = self.model.match_parent_paths(
+            div_node,
+            [
+                (
+                    ["Sqrt", "Max", "Min", "Add", "GlobalAveragePool", "Mul", "Sub"],
+                    [1, 0, 0, 0, None, 0, None],
+                ),
+            ],
+            output_name_to_node,
+        )
+        if path_id < 0:
+            return
+
+        sub_node = parent_nodes[-1]
+        if sub_node not in children:
+            return
+
+        second_add_node = parent_nodes[3]
+        i, add_weight = self.model.get_constant_input(second_add_node)
+        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
+            logger.warning(f"epsilon value is not expeced: {add_weight}")
+            return
+
         mul_node = input_name_to_nodes[div_node.output[0]][0]
         if mul_node.op_type != "Mul":
             return
@@ -109,23 +277,34 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             logger.debug(f"It is not safe to fuse LayerNormalization node. Skip")
             return
 
-        weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)]
-        if not self.model.is_constant_with_specified_dimension(weight_input, 1, "layernorm weight"):
+        weight_input = mul_node.input[
+            1 - self.model.input_index(div_node.output[0], mul_node)
+        ]
+        if not self.model.is_constant_with_specified_dimension(
+            weight_input, 1, "layernorm weight"
+        ):
             return
 
-        bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)]
-        if not self.model.is_constant_with_specified_dimension(bias_input, 1, "layernorm bias"):
+        bias_input = last_add_node.input[
+            1 - self.model.input_index(mul_node.output[0], last_add_node)
+        ]
+        if not self.model.is_constant_with_specified_dimension(
+            bias_input, 1, "layernorm bias"
+        ):
             return
 
         self.nodes_to_remove.extend(subgraph_nodes)
-
         normalize_node = helper.make_node(
             "LayerNormalization",
             inputs=[node.input[0], weight_input, bias_input],
             outputs=[last_add_node.output[0]],
-            name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
+            name=self.model.create_node_name(
+                "LayerNormalization", name_prefix="LayerNorm"
+            ),
+        )
+        normalize_node.attribute.extend(
+            [helper.make_attribute("epsilon", float(add_weight))]
         )
-        normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))])
         self.nodes_to_add.append(normalize_node)
         self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
 
@@ -193,8 +372,14 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             return
 
         assert len(return_indice) == 3
-        if not (return_indice[0] in [0, 1] and return_indice[1] in [0, 1] and return_indice[2] in [0, 1]):
-            logger.debug("return indice is exepected in [0, 1], but got {return_indice}")
+        if not (
+            return_indice[0] in [0, 1]
+            and return_indice[1] in [0, 1]
+            and return_indice[2] in [0, 1]
+        ):
+            logger.debug(
+                "return indice is exepected in [0, 1], but got {return_indice}"
+            )
             return
 
         (
@@ -205,7 +390,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             sqrt_node,
             add_node_0,
         ) = parent_nodes[:6]
-        reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:]
+        reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[
+            -4:
+        ]
 
         cast_node_3 = None
         if len(parent_nodes) == 11:
@@ -217,7 +404,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             logger.debug("mul_node_3 not found")
             return
 
-        node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node)
+        node_before_reduce = self.model.get_parent(
+            reduce_mean_node_1, 0, output_name_to_node
+        )
         root_node = (
             node_before_reduce
             if cast_node_3 is None
@@ -228,18 +417,24 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             return
 
         i, epsilon = self.model.get_constant_input(add_node_0)
-        if epsilon is None or epsilon <= 0 or (epsilon > 1.0e-5 and cast_node_3 is None):
+        if (
+            epsilon is None
+            or epsilon <= 0
+            or (epsilon > 1.0e-5 and cast_node_3 is None)
+        ):
             logger.debug("epsilon is not matched")
             return
 
         if cast_node_3 is None and (
-            reduce_mean_node_1.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
+            reduce_mean_node_1.input[0] not in mul_node_3.input
+            or reduce_mean_node_1.input[0] not in sub_node_1.input
         ):
             logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
             return
 
         if cast_node_3 is not None and (
-            node_before_reduce.input[0] not in mul_node_3.input or reduce_mean_node_1.input[0] not in sub_node_1.input
+            node_before_reduce.input[0] not in mul_node_3.input
+            or reduce_mean_node_1.input[0] not in sub_node_1.input
         ):
             logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node")
             return
@@ -264,7 +459,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
         ]
 
         if cast_node_3 is not None:
-            cast_node_2 = self.model.match_parent(mul_node_0, "Cast", 0, output_name_to_node)
+            cast_node_2 = self.model.match_parent(
+                mul_node_0, "Cast", 0, output_name_to_node
+            )
             if cast_node_2 is None:
                 logger.debug("cast_node_2 not found")
                 return
@@ -289,7 +486,9 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
             "LayerNormalization",
             inputs=[mul_node_3.input[0], weight_input, bias_input],
             outputs=[node.output[0]],
-            name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm"),
+            name=self.model.create_node_name(
+                "LayerNormalization", name_prefix="LayerNorm"
+            ),
         )
         fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))])
         self.nodes_to_add.append(fused_node)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
index af315ce4f..e0a1a535b 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py
@@ -30,6 +30,7 @@ def __init__(self, model_type):
         self.enable_swint_opt = False
         self.enable_format_roformer = False
         self.enable_gpt2_classify = False
+        self.enable_vit = False
         self.attention_mask_format = AttentionMaskFormat.AttentionMask
 
         if model_type == "gpt2":
@@ -39,6 +40,8 @@ def __init__(self, model_type):
             self.enable_swint_opt = True
         elif model_type == "roformer":
             self.enable_format_roformer = True
+        elif model_type == "vit":
+            self.enable_vit = True
 
     def use_raw_attention_mask(self, use_raw_mask=True):
         if use_raw_mask:
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py
new file mode 100755
index 000000000..a5079c2d3
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py
@@ -0,0 +1,368 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import math
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+class FusionRoformerCrossAttention(Fusion):
+    """
+    Fuse a RoFormer cross-attention subgraph into one CustomQkvCrossToContext_IxRT node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(
+            model,
+            "CustomQkvCrossToContext_IxRT",  # same op type that create_attention_node() emits
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],  # presumably the op types of the normalize_node handed to fuse() — confirm in Fusion
+        )
+
+        # Flags meant to limit each warning to a single emission; nothing in this class reads them.
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, mul: NodeProto
+    ) -> Tuple[int, int]:
+        mul_initializer = self.model.get_initializer(mul.input[1])  # initializer behind the Mul node's second input
+
+        # Check whether float_data is empty.
+        if len(mul_initializer.float_data) > 0:
+            mul_value = mul_initializer.float_data[0]
+        else:
+            # float_data is empty, so try another way to obtain the value:
+            # for example, the data may be stored in raw_data.
+            if len(mul_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
+                mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the mul_initializer")
+
+        for attr in custom_fc.attribute:  # NOTE(review): tensor_shape is never bound if no "W" attribute exists
+            if attr.name == "W":
+                tensor_value = attr.t
+                tensor_shape = [dim for dim in tensor_value.dims]
+                break
+        head_dim = math.floor(1.0 / (mul_value * mul_value))  # presumably mul_value is 1/sqrt(head_dim) — confirm
+        hidden_size = tensor_shape[0]  # first dimension of the FC weight tensor "W"
+        num_heads = hidden_size // head_dim  # NOTE(review): divides by zero when head_dim floors to 0 (mul_value > 1)
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input_q: str,
+        input_k: str,
+        input_v: str,
+        input_mask: str,
+        output: str,
+        matmul_qk_add: NodeProto,
+    ) -> Union[NodeProto, None]:
+        """Create a CustomQkvCrossToContext_IxRT node in the com.iluvatar domain.
+
+        Args:
+            num_heads (int): number of attention heads; asserted positive, not stored on the node.
+            hidden_size (int): hidden dimension; only checked for divisibility by num_heads.
+            input_q (str): name of the tensor wired to input 0.
+            input_k (str): name of the tensor wired to input 1.
+            input_v (str): name of the tensor wired to input 2.
+            input_mask (str): name of the tensor wired to input 3 (matmul_qk_add is accepted but unused).
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created, or None if hidden_size is not a multiple of num_heads.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None
+
+        attention_node_name = self.model.create_node_name("CrossAttention")
+
+        attention_inputs = [input_q, input_k, input_v, input_mask]
+
+        attention_node = helper.make_node(
+            "CustomQkvCrossToContext_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])  # always 1: a mask input is always supplied
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+
+        return attention_node
+
+    def get_shape(self, edge_name):  # look up edge_name among the graph's value_info entries
+        for info in self.model.graph().value_info:
+            if info.name == edge_name:
+                return info.type.tensor_type.shape.dim  # the protobuf dim list, not a list of ints
+        return None  # edge_name has no value_info entry
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Replace the attention subgraph that feeds normalize_node with Transpose/Cast helpers plus one CustomQkvCrossToContext_IxRT node.
+        # Every failed match below returns before nodes_to_add / nodes_to_remove are touched.
+        start_node = normalize_node
+
+        # The two candidate paths differ only in the first index (0 vs 1) — presumably the start_node input to follow; confirm in Fusion.
+        qkv_paths = {
+            "path1": (
+                [
+                    "CustomFCPluginDynamic_IxRT",
+                    "Reshape",
+                    "Transpose",
+                    "Reshape",
+                    "MatMul",
+                ],
+                [0, 0, 0, 0, 0],
+            ),
+            "path2": (
+                [
+                    "CustomFCPluginDynamic_IxRT",
+                    "Reshape",
+                    "Transpose",
+                    "Reshape",
+                    "MatMul",
+                ],
+                [1, 0, 0, 0, 0],
+            ),
+        }
+        # qkv_path is compared against the dict keys below; qkv_nodes is None when nothing matched.
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+
+        fc_after_atten = None
+        if qkv_path in ["path1", "path2"]:
+            (
+                fc_after_atten,
+                reshape_qkv_2,
+                transpose_qkv,
+                reshape_qkv_1,
+                matmul_qkv,
+            ) = qkv_nodes
+
+        """
+        Match
+        Add --> LayerNormalization -->  Attention -->     Add --> LayerNormalization
+         |                                                        |
+         |                                                        |
+         +---------------------------------------------------------
+        """
+        add_before_layernorm = self.model.match_parent(start_node, "Add", None)
+        if add_before_layernorm is not None:
+            node_children = input_name_to_nodes[add_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == "LayerNormalization":
+                    root_input = child.output[0]  # NOTE(review): root_input is never read in this method
+
+        v_paths = {"path1": (["Reshape", "Transpose", "Reshape"], [1, 0, 0])}
+
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+        if v_path == "path1":
+            (reshape_v, transpose_v, v_reshape) = v_nodes
+
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+
+        qk_paths = {
+            "path1": (
+                ["Softmax", "Add", "Mul", "Mul", "Reshape", "MatMul"],
+                [0, 0, None, None, None, 0],
+            )
+        }
+
+        qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
+
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+        # matmul_qk_add stays None; it is only forwarded to create_attention_node, which ignores it.
+        matmul_qk_add = None
+        if qk_path == "path1":
+            (_, add_mask, mul_mask, mul_qk, reshape_qk, matmul_qk) = qk_nodes
+
+        q_paths = {
+            "path1": (["Transpose", "Add"], [0, 0]),
+        }
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+        # q_nodes is (Transpose, Add); the Add's output feeds the replacement query Transpose below.
+        if q_path == "path1":
+            (q_tranpose, q_add) = q_nodes
+
+        k_paths = {
+            "path1": (["Reshape", "Transpose", "Add"], [1, 0, 0]),
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
+
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+        # k_nodes is (Reshape, Transpose, Add); only the Add's output is reused below.
+        if k_path == "path1":
+            (_, k_transpose, k_add) = k_nodes
+        # The mask path is matched upward from the Add that sits directly before Softmax.
+        mask_paths = {
+            "path1": (
+                ["Mul", "Sub", "Unsqueeze", "Cast", "Greater"],
+                [1, None, 1, 0, 0],
+            )
+        }
+        mask_nodes, mask_path = self.match_parent_path_from_dict(add_mask, mask_paths)
+
+        if mask_nodes is None:
+            logger.debug("fuse_attention: failed to match mask path")
+            return
+        # Of the matched mask nodes, only mask_sub and mask_unsqueeze are used below.
+        (_, mask_sub, mask_unsqueeze, mask_cast, mask_greater) = mask_nodes
+
+        if (
+            self.get_shape(q_add.output[0]) == self.get_shape(k_add.output[0])
+            and self.get_shape(k_add.output[0]) == self.get_shape(v_reshape.output[0])
+            and mul_mask.input[1] in mask_unsqueeze.output
+        ):
+            attention_last_node = reshape_qkv_1  # the fused node takes over this Reshape's output name
+
+            num_heads, hidden_size = self.get_num_heads_and_hidden_size(
+                fc_after_atten, mul_qk
+            )
+
+            q_transpose_type = None
+            q_transpose_name = None
+            for info in self.model.graph().value_info:
+                if info.name == q_tranpose.output[0]:
+                    q_transpose_type = info.type
+                    q_transpose_name = info.name
+                    break
+
+            q_transpose_output = helper.make_value_info(
+                q_transpose_name[:-2] + "_fake_q", q_transpose_type  # NOTE(review): TypeError if the loop above found no value_info (name still None)
+            )
+            q_transpose_node = helper.make_node(
+                "Transpose",
+                inputs=[q_add.output[0]],
+                outputs=[q_transpose_output.name],
+                name=q_transpose_output.name,
+            )
+            q_transpose_node.attribute.extend(
+                [helper.make_attribute("perm", [0, 2, 1, 3])]
+            )
+
+            k_transpose_output = helper.make_value_info(
+                q_transpose_name[:-2] + "_fake_k", q_transpose_type  # K reuses Q's value_info name stem and type
+            )
+            k_transpose_node = helper.make_node(
+                "Transpose",
+                inputs=[k_add.output[0]],
+                outputs=[k_transpose_output.name],
+                name=k_transpose_output.name,
+            )
+            k_transpose_node.attribute.extend(
+                [helper.make_attribute("perm", [0, 2, 1, 3])]
+            )
+
+            v_transpose_output = helper.make_value_info(
+                q_transpose_name[:-2] + "_fake_v", q_transpose_type  # V also reuses Q's value_info name stem and type
+            )
+            v_transpose_node = helper.make_node(
+                "Transpose",
+                inputs=[v_reshape.output[0]],
+                outputs=[v_transpose_output.name],
+                name=v_transpose_output.name,
+            )
+            v_transpose_node.attribute.extend(
+                [helper.make_attribute("perm", [0, 2, 1, 3])]
+            )
+
+            mask_type = None
+            for info in self.model.graph().value_info:
+                if info.name == mask_sub.output[0]:
+                    mask_type = info.type
+                    break
+
+            new_mask_type = onnx.TypeProto()
+            new_mask_type.tensor_type.elem_type = onnx.TensorProto.INT32
+            for dim in mask_type.tensor_type.shape.dim:  # NOTE(review): AttributeError if mask_sub's output has no value_info (mask_type still None)
+                new_dim = new_mask_type.tensor_type.shape.dim.add()
+                new_dim.CopyFrom(dim)
+
+            mask_cast_to_int32_output = helper.make_value_info(
+                mask_sub.name + "_cast_to_int32", new_mask_type
+            )
+            mask_cast_to_int32_node = helper.make_node(
+                "Cast",
+                inputs=[mask_sub.output[0]],
+                outputs=[mask_cast_to_int32_output.name],
+                name=mask_cast_to_int32_output.name,
+            )
+            mask_cast_to_int32_node.attribute.extend([helper.make_attribute("to", 6)])  # 6 is the ONNX TensorProto enum value for INT32
+
+            new_node = self.create_attention_node(
+                num_heads,
+                hidden_size,
+                q_transpose_node.output[0],
+                k_transpose_node.output[0],
+                v_transpose_node.output[0],
+                mask_cast_to_int32_node.output[0],
+                attention_last_node.output[0],
+                matmul_qk_add,
+            )
+            if new_node is None:
+                return
+
+            self.nodes_to_add.extend(
+                [
+                    q_transpose_node,
+                    k_transpose_node,
+                    v_transpose_node,
+                    new_node,
+                    mask_cast_to_int32_node,
+                ]
+            )
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+            self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name
+            self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name
+            self.node_name_to_graph_name[v_transpose_node.name] = self.this_graph_name
+            self.node_name_to_graph_name[
+                mask_cast_to_int32_node.name
+            ] = self.this_graph_name
+
+            self.nodes_to_remove.extend(qkv_nodes[3:])  # reshape_qkv_1 and matmul_qkv
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes[:-1])  # drops the Transpose, keeps q_add
+            self.nodes_to_remove.extend(k_nodes[:-1])  # drops Reshape and Transpose, keeps k_add
+            self.nodes_to_remove.extend(v_nodes[:-1])  # drops reshape_v and transpose_v, keeps v_reshape
+            self.nodes_to_remove.extend([mask_nodes[0]])  # the Mul at the head of the mask path
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py
new file mode 100755
index 000000000..2ca376c39
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py
@@ -0,0 +1,83 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+
+from onnx import helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionRoPE(Fusion):  # replaces a matched subgraph ending at start_node with one CustomRoPEPluginDynamic_IxRT node
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, "CustomRoPEPluginDynamic_IxRT", "Add")
+
+    def fuse(self, start_node, input_name_to_nodes, output_name_to_node):
+        src_paths = {"path1": (["Mul", "Concat", "Split", "Slice"], [0, 1, None, 0])}
+        src_nodes, src_path = self.match_parent_path_from_dict(start_node, src_paths)
+        if src_nodes is None:
+            logger.debug("fuse_rope: failed to match src_node")
+            return
+
+        src_node = src_nodes[0]  # the Mul at the head of the src path
+
+        rotate_paths = {"path1": (["Mul", "Reshape", "Concat"], [1, 0, 0])}
+        rotate_nodes, rotate_path = self.match_parent_path_from_dict(
+            start_node, rotate_paths
+        )
+
+        if rotate_nodes is None:
+            logger.debug("fuse_rope: failed to match rotate_path")
+            return
+
+        concat_node = rotate_nodes[-1]  # the Concat at the tail of the rotate path
+        mul_right_node = rotate_nodes[0]  # the Mul at the head of the rotate path
+
+        odd_paths = {"path1": (["Unsqueeze", "Neg", "Slice", "Reshape"], [0, 0, 0, 0])}
+        odd_nodes, odd_path = self.match_parent_path_from_dict(concat_node, odd_paths)
+
+        if odd_nodes is None:
+            logger.debug("fuse_rope: failed to match odd_path")
+            return
+
+        even_paths = {"path1": (["Unsqueeze", "Slice", "Reshape"], [1, 0, 0])}
+        even_nodes, even_path = self.match_parent_path_from_dict(
+            concat_node, even_paths
+        )
+
+        if even_nodes is None:
+            logger.debug("fuse_rope: failed to match even_path")
+            return
+        reshape_node = even_nodes[-1]  # the Reshape at the tail of the even path
+
+        if reshape_node.output[0] == src_node.input[0]:  # fuse only when the src Mul reads that same Reshape output
+            rope_node_name = self.model.create_node_name("RoPE")
+            rope_node = helper.make_node(
+                "CustomRoPEPluginDynamic_IxRT",
+                inputs=[
+                    reshape_node.output[0],
+                    src_nodes[0].input[1],  # second operand of the src-path Mul
+                    mul_right_node.input[1],  # second operand of the rotate-path Mul
+                ],
+                outputs=[start_node.output[0]],
+                name=rope_node_name,
+            )
+            rope_node.domain = "com.iluvatar"
+            rope_node.attribute.extend([helper.make_attribute("type_id", 2)])
+            rope_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+            rope_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+
+            self.nodes_to_add.append(rope_node)
+            self.node_name_to_graph_name[rope_node.name] = self.this_graph_name
+
+            self.nodes_to_remove.extend([start_node])
+            self.nodes_to_remove.extend([src_nodes[0]])  # only the Mul; the rest of src_nodes is not queued here
+            self.nodes_to_remove.extend(rotate_nodes)
+            self.nodes_to_remove.extend(odd_nodes[:-1])  # keeps the trailing Reshape
+            self.nodes_to_remove.extend(even_nodes[:-1])  # keeps the trailing Reshape
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py
new file mode 100755
index 000000000..a74fe9ee0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py
@@ -0,0 +1,109 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from logging import getLogger
+from typing import Tuple, Union
+
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_utils import NumpyHelper
+from .onnx_model import OnnxModel
+
+logger = getLogger(__name__)
+
+
+class FusionSplitQKV(Fusion):
+    """
+    Replace the three Slice children of a CustomFCPluginDynamic_IxRT node with one SplitQKV_IxRT node.
+    """
+
+    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
+        super().__init__(model, "SplitQKV_IxRT", "MatMul")
+
+        self.hidden_size = hidden_size  # stored but not read anywhere in this class
+        self.num_heads = num_heads  # used for the atten_scale attribute
+
+    def create_splitqkv_node(
+        self, input: str, query_out: str, key_out: str, value_out: str
+    ) -> Union[NodeProto, None]:
+        """Create a SplitQKV_IxRT node (domain com.iluvatar) with one input and three outputs.
+
+        Args:
+            input (str): name of the node's single input tensor
+            query_out, key_out (str): names of the first and second outputs
+            value_out (str): name of the third output
+
+        Returns:
+            NodeProto: the node created; its atten_scale attribute is 1 / self.num_heads.
+        """
+        node_name = self.model.create_node_name("SplitQKV_IxRT")
+
+        new_node = helper.make_node(
+            "SplitQKV_IxRT",
+            inputs=[input],
+            outputs=[query_out, key_out, value_out],
+            name=node_name,
+        )
+        new_node.domain = "com.iluvatar"
+        new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        new_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        new_node.attribute.extend(
+            [helper.make_attribute("atten_scale", 1 / self.num_heads)]
+        )
+
+        return new_node
+
+    def fuse(self, node, input_name_to_nodes, output_name_to_node):
+        # Match the query and key parent paths of `node`; do nothing unless both are found.
+        split_query_paths = {
+            "query_path": (
+                ["Div", "Transpose", "Reshape", "Slice", "CustomFCPluginDynamic_IxRT"],
+                [0, 0, 0, 0, 0],
+            ),
+        }
+
+        split_key_paths = {
+            "key_path": (["Transpose", "Reshape", "Slice"], [1, 0, 0]),
+        }
+
+        q_nodes, q_path = self.match_parent_path_from_dict(node, split_query_paths)
+
+        k_nodes, k_path = self.match_parent_path_from_dict(node, split_key_paths)
+
+        if (q_nodes is not None) and (k_nodes is not None):
+            (
+                q_div_node,
+                q_transpose_node,
+                q_reshape_node,
+                q_slice_node,
+                coustom_fc_node,
+            ) = q_nodes
+            k_transpose_node, k_reshape_node, k_slice_node = k_nodes
+            slice_nodes = self.model.get_children(coustom_fc_node)
+
+            if len(slice_nodes) != 3:
+                return
+            slice_nodes.remove(q_slice_node)
+            slice_nodes.remove(k_slice_node)  # NOTE(review): presumably raises ValueError if the key Slice is not a child of this FC node
+            v_slice_node = slice_nodes[0]  # the one child left after removing the Q and K slices
+
+            node.input[0] = q_div_node.input[0]  # bypass the Div: wire its input straight into `node`
+            new_node = self.create_splitqkv_node(
+                coustom_fc_node.output[0],
+                q_slice_node.output[0],
+                k_slice_node.output[0],
+                v_slice_node.output[0],
+            )
+
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+            self.nodes_to_remove.append(q_slice_node)
+            self.nodes_to_remove.append(k_slice_node)
+            self.nodes_to_remove.append(v_slice_node)
+            self.nodes_to_remove.append(q_div_node)
+
+        else:
+            return
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py
new file mode 100755
index 000000000..e6e16f17a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py
@@ -0,0 +1,354 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import math
+from enum import Enum
+from logging import getLogger
+from os import name
+from sys import path
+from typing import Tuple, Union
+
+import numpy as np
+import onnx
+from onnx import NodeProto, TensorProto, helper, numpy_helper
+
+from .fusion_base import Fusion
+from .fusion_options import AttentionMaskFormat
+from .fusion_utils import FusionUtils, NumpyHelper
+from .onnx_model import OnnxModel
+from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto
+
+logger = getLogger(__name__)
+
+
+class FusionVITAttention(Fusion):
+    """
+    Fuse VITAttention subgraph into one Attention node.
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(
+            model,
+            "CustomQKVToContextPluginDynamic_IxRT",  # same op type that create_attention_node() emits
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],  # presumably the op types of the normalize_node handed to fuse() — confirm in Fusion
+        )
+
+        # Flags to show each warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, mul: NodeProto
+    ) -> Tuple[int, int]:
+        mul_initializer = self.model.get_initializer(mul.input[1])  # initializer behind the Mul node's second input
+
+        # Check whether float_data is empty.
+        if len(mul_initializer.float_data) > 0:
+            mul_value = mul_initializer.float_data[0]
+        else:
+            # float_data is empty, so try another way to obtain the value:
+            # for example, the data may be stored in raw_data.
+            if len(mul_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
+                mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the mul_initializer")
+
+        for attr in custom_fc.attribute:  # NOTE(review): tensor_shape is never bound if no "W" attribute exists
+            if attr.name == "W":
+                tensor_value = attr.t
+                tensor_shape = [dim for dim in tensor_value.dims]
+                break
+        head_dim = math.floor(1.0 / (mul_value * mul_value)) * math.floor(  # NOTE(review): squares floor(1/mul_value**2), unlike the single floor in fusion_roformer_attention.py — confirm intent
+            1.0 / (mul_value * mul_value)
+        )
+        hidden_size = tensor_shape[0]  # first dimension of the FC weight tensor "W"
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        matmul_qk_add: NodeProto,
+    ) -> Union[NodeProto, None]:
+        """Create a CustomQKVToContextPluginDynamic_IxRT node in the com.iluvatar domain.
+
+        Args:
+            num_heads (int): number of attention heads; stored as the num_heads attribute.
+            hidden_size (int): hidden dimension; stored as the hidden_size attribute.
+            input (str), output (str): names of the node's first input and its single output.
+            matmul_qk_add (NodeProto): Add node or None; the initializer on its input 1 is rewritten and appended as a second input.
+
+        Returns:
+            Union[NodeProto, None]: the node created, or None if hidden_size is not a multiple of num_heads.
+        """
+        assert num_heads > 0
+        # Refuse to build the node when hidden_size cannot be split evenly across the heads.
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None
+
+        attention_node_name = self.model.create_node_name("Attention")
+
+        qk_bias = None
+        has_mask = 0
+        has_qk_bias = 0
+        if matmul_qk_add is not None:
+            has_qk_bias = 1
+            qk_bias = self.model.get_initializer(matmul_qk_add.input[1])
+            qk_bias_arr = NumpyHelper.to_array(qk_bias)
+            if len(qk_bias_arr.shape) == 3:
+                qk_bias_arr = qk_bias_arr.squeeze(0)  # drop the leading axis of a rank-3 bias
+            has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0)
+            if np.any(has_neg_inf):
+                qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(  # NOTE(review): every entry that is not -inf becomes 0.0 — confirm intent
+                    np.float32
+                )
+            qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name))  # overwrites the initializer in place, keeping its name
+
+        attention_inputs = [input]
+
+        if qk_bias is not None:
+            has_mask = 1
+            attention_inputs.append(qk_bias.name)
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend(
+            [helper.make_attribute("hidden_size", hidden_size)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend(
+            [helper.make_attribute("has_qk_bias", has_qk_bias)]
+        )
+
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
+        # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
+        start_node = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                start_node = add_before_layernorm
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
+        qkv_paths = {
+            "path1": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [0, 0, 0]),
+            "path2": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [1, 0, 0]),
+        }
+
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths)
+
+        if qkv_nodes is None:
+            logger.debug("fuse_attention: failed to match qkv path")
+            return
+
+        if qkv_path in ["path1", "path2"]:
+            (custom_fc_after_atten, transpose_qkv, matmul_qkv) = qkv_nodes
+
+        other_inputs = []
+        for i, input in enumerate(start_node.input):
+            if input not in output_name_to_node:
+                continue
+
+            if input == qkv_nodes[0].output[0]:
+                continue
+            other_inputs.append(input)
+        if len(other_inputs) != 1:
+            return
+
+        root_input = other_inputs[0]
+        """
+        Match VIT
+        transpose --> LayerNormalization -->  custom_fc -> attention -> Add
+         |                                                                  |
+         |                                                                  |
+         +-------------------------------------------------------------------
+        """
+        transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0)
+        if transpose_before_layernorm is not None:
+            node_children = input_name_to_nodes[transpose_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == "LayerNormalization":
+                    root_input = child.output[0]
+
+        add_before_layernorm = self.model.match_parent(start_node, "Add", None)
+        if add_before_layernorm is not None:
+            node_children = input_name_to_nodes[add_before_layernorm.output[0]]
+            for child in node_children:
+                if child is not None and child.op_type == "LayerNormalization":
+                    root_input = child.output[0]
+
+        # print("root_input: ", root_input, matmul_qkv.name)
+        v_paths = {
+            "path1": (
+                [
+                    "Reshape",
+                    "Transpose",
+                    "Reshape",
+                    "Gather",
+                    "Squeeze",
+                    "Transpose",
+                    "Unsqueeze",
+                    "Reshape",
+                    "CustomFCPluginDynamic_IxRT",
+                ],
+                [1, 0, 0, 0, 0, 0, 0, 0, 0],
+            )  # vit
+        }
+
+        v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths)
+
+        squeeze_input = custom_fc = None
+        if v_path == "path1":
+            (_, _, _, _, squeeze_input, _, _, _, custom_fc) = v_nodes
+
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
+
+        qk_paths = {
+            "path1": (["Softmax", "MatMul"], [0, 0]),
+            "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]),
+        }
+
+        qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths)
+        # print("qk_nodes:", qk_nodes[1].name)
+        if qk_nodes is None:
+            logger.debug("fuse_attention: failed to match qk path")
+            return
+
+        matmul_qk_add = None
+        if qk_path == "path1":
+            (_, matmul_qk) = qk_nodes
+        else:
+            (_, matmul_qk_add, matmul_qk) = qk_nodes
+
+        q_paths = {
+            "path1": (
+                ["Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze"],
+                [0, 0, 0, 0, 0, 0],
+            ),
+        }
+        q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths)
+        # print("q_nodes:", q_nodes[0].name)
+        squeeze_q = mul_q = None
+        if q_path == "path1":
+            squeeze_q = q_nodes[-1]
+            mul_q = q_nodes[0]
+
+        if q_nodes is None:
+            logger.debug("fuse_attention: failed to match q path")
+            return
+
+        k_paths = {
+            "path1": (
+                [
+                    "Mul",
+                    "Transpose",
+                    "Reshape",
+                    "Transpose",
+                    "Reshape",
+                    "Gather",
+                    "Squeeze",
+                ],
+                [1, 0, 0, 0, 0, 0, 0],
+            ),
+        }
+        k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths)
+        # print("k_nodes:", k_nodes[0].name)
+        squeeze_k = None
+        if k_path == "path1":
+            squeeze_k = k_nodes[-1]
+
+        if k_nodes is None:
+            logger.debug("fuse_attention: failed to match k path")
+            return
+
+        if (
+            custom_fc.input[0] == root_input
+            and squeeze_input == squeeze_q
+            and squeeze_input == squeeze_k
+        ):
+            attention_last_node = transpose_qkv
+
+            num_heads, hidden_size = self.get_num_heads_and_hidden_size(
+                custom_fc_after_atten, mul_q
+            )
+
+            new_node = self.create_attention_node(
+                num_heads,
+                hidden_size,
+                custom_fc.output[0],
+                attention_last_node.output[0],
+                matmul_qk_add,
+            )
+            if new_node is None:
+                return
+
+            self.nodes_to_add.append(new_node)
+            self.node_name_to_graph_name[new_node.name] = self.this_graph_name
+
+            self.nodes_to_remove.extend([transpose_qkv, matmul_qkv])
+            self.nodes_to_remove.extend(qk_nodes)
+            self.nodes_to_remove.extend(q_nodes[:-1])
+            self.nodes_to_remove.extend(k_nodes[:-1])
+            self.nodes_to_remove.extend(v_nodes[:-1])
+
+            # fuse head and tail transpose
+            if transpose_before_layernorm is not None:
+                node_children = input_name_to_nodes[
+                    transpose_before_layernorm.output[0]
+                ]
+                for child in node_children:
+                    for i, input in enumerate(child.input):
+                        if child.input[i] == transpose_before_layernorm.output[0]:
+                            child.input[i] = transpose_before_layernorm.input[0]
+                self.nodes_to_remove.extend([transpose_before_layernorm])
+
+                node = transpose_before_layernorm
+                while True:
+                    found = False
+                    node_children = input_name_to_nodes[node.output[0]]
+                    for child in node_children:
+                        if child is not None and child.op_type in [
+                            "SkipLayerNorm",
+                            "Add",
+                        ]:
+                            node = child
+                            found = True
+                            break
+                    if not found:
+                        break
+                node_children = input_name_to_nodes[node.output[0]]
+                if len(node_children) == 1 and node_children[0].op_type == "Transpose":
+                    transpose_node = node_children[0]
+                    transpose_children = input_name_to_nodes[transpose_node.output[0]]
+                    for i, input in enumerate(transpose_children[0].input):
+                        if transpose_children[0].input[i] == transpose_node.output[0]:
+                            transpose_children[0].input[i] = transpose_node.input[0]
+                    self.nodes_to_remove.extend([transpose_node])
+            # Use prune graph to remove mask nodes since they are shared by all attention nodes.
+            # self.nodes_to_remove.extend(mask_nodes)
+            # self.prune_graph = True

From 36e110c0ff1206caae2821855d06eb3a8111661e Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Fri, 24 May 2024 21:48:07 +0800
Subject: [PATCH 20/28] update code 0524

---
 .../backends/ILUVATAR/README.zh_CN.md         | 77 +++++------------
 .../general_perf/backends/ILUVATAR/common.py  |  9 +-
 .../ILUVATAR/compile_backend_iluvatar.py      | 84 ++++++++++++-------
 .../ILUVATAR/runtime_backend_iluvatar.py      | 54 +++++++-----
 4 files changed, 114 insertions(+), 110 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 0d35e41d9..402e2dcec 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -267,50 +267,42 @@
             exit
             mv quantized_Resnet50.onnx general_perf/model_zoo/regular/open_resnet50
 
-        代码更改：
-            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
-            build_config.set_flag(tensorrt.BuilderFlag.INT8)
-
-            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 添加以下的代码：
-            onnx_model_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50.onnx"
-            engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
-            （在 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize) 前面加上面两行）
-
-            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 最后一个else 添加以下的代码：
-            engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" 
-            （注释掉 engine_path = os.path.dirname(model_path) + "/" + model + ".engine"）
+        手动更改配置文件：general_perf/model_zoo/resnet50-torch-fp32.json 中的 model_precision 精度为 INT8
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
 
-    2、yolov5 模型：
+    2、widedeep 模型：
         模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
 
         下载方式：
             sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
             cd yudefu/bytedance_perf  
-            get quantized_yolov5s.onnx 
+            get quantized_widedeep_staticshape.onnx 
             exit
-            mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/
+            mv quantized_widedeep_staticshape.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/
 
-        代码更改：
-            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
-            build_config.set_flag(tensorrt.BuilderFlag.INT8)
+        手动更改配置文件：general_perf/model_zoo/widedeep-tf-fp32.json 中的 model_precision 精度为 INT8
 
-            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 添加以下的代码：
-            onnx_model_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
-           （在 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize) 前面加上面两行）
+        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+        生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
+
+    3、yolov5 模型：
+        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
+
+        下载方式：
+            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
+            cd yudefu/bytedance_perf  
+            get quantized_yolov5s.onnx 
+            exit
+            mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/
 
-            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 添加以下的代码：
-            engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" 
-           （在 if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5': 下面添加；
-             注释掉：engine_path = model_path.split(".")[0] + "_end.engine"）
+        手动更改配置文件：general_perf/model_zoo/yolov5-onnx-fp32.json 中的 model_precision 精度为 INT8
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
 
-    3、bert 模型：
+    4、bert 模型：
         模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型；该模型直接拿生成好的engine进行推理
 
         下载方式：
@@ -320,36 +312,7 @@
             exit
             mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/
 
-        代码更改：
-            1）general_perf/backends/ILUVATAR/common.py 将build_config.set_flag(tensorrt.BuilderFlag.FP16) 更改为：
-            build_config.set_flag(tensorrt.BuilderFlag.INT8)
-
-            2）general_perf/backends/ILUVATAR/compile_backend_iluvatar.py 函数compile 最后一个else 做以下操作：
-            注释掉 build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
-            因为这里直接加载已经生成的engine，不需要进行compile生成；这里可以加一个输出：
-                print("\n****bert-int8推理直接采用加载生成好的engine, 不需要进行编译！****") 看程序走到哪里
-
-            3）general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py 函数load 添加以下的代码：
-            engine_path = "general_perf/model_zoo/regular/open_bert/bert_zijie_int8_b196.engine"
-           （在 elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin':
-             注释掉：engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine"）
-
-             第二个还需要修改函数 predict_dump 以下四行代码：
-             input_shape = input_tensors[i].shape
-             input_idx = engine.get_binding_index(input_name)
-             context.set_binding_shape(input_idx, Dims(input_shape))
-             i += 1
-             更改为：
-             input_shape = input_tensors[i].shape
-             for binding in range(3):
-                 context.set_binding_shape(binding, Dims(input_shape))
-            i += 1
-
-            第三需要更改的地方：将函数predict_timing 里面的 result[output_name[i]] = outputs_list[i] 改成：result[output_name[i]] = outputs_list[0]
-
-            精度测试时还需要更改下面的地方：函数predict 里面的 result[output_name[i]] = outputs_list[i] 改成：
-                result[output_name[0]] = outputs_list[0][:,:,0]
-                result[output_name[1]] = outputs_list[0][:,:,1]
+        手动更改配置文件：general_perf/model_zoo/bert-torch-fp32.json 中的 model_precision 精度为 INT8
 
         执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
         生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
index 1b871ec13..e2dbaa471 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -36,7 +36,7 @@ def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="",
     print(f"Loaded plugin from {dynamic_path}")
 
 
-def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
+def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize, BuildFlag):
     IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
     builder = tensorrt.Builder(IXRT_LOGGER)
     EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -104,7 +104,12 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
 
     parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
     parser.parse_from_file(onnx_model_path)
-    build_config.set_flag(tensorrt.BuilderFlag.FP16)
+    
+    if BuildFlag == 'FP16':
+        build_config.set_flag(tensorrt.BuilderFlag.FP16)
+    
+    if BuildFlag == 'INT8':
+        build_config.set_flag(tensorrt.BuilderFlag.INT8)
 
     # set dynamic shape
     num_inputs = network.num_inputs
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index 2391b9123..f5f08b203 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -71,38 +71,60 @@ def compile(self, configs, dataloader=None):
         self.get_onnx(configs)
 
         # build engine
-        if model_name == 'widedeep':
-            onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new.onnx"
-            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new" + ".engine"    
-            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+        if configs['model_info']['model_precision'].replace('FP32', 'FP16') == 'FP16':
+            precision_flag = "FP16"
+            if model_name == 'widedeep':
+                onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape.onnx"
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16')
+
+            elif model_name == 'deberta':
+                onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end.onnx"
+                engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16')
+
+            elif model_name == 'roformer':
+                onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end.onnx"
+                engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16')
+
+            elif model_name == 'gpt2':
+                for bs in configs['workload']['batch_sizes']:
+                    onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx"
+                    engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(bs) + ".so" 
+
+                    # Build the dict once so every model input is kept, not just the last one.
+                    input_dict = {}
+                    for key, val in configs['model_info']['input_shape'].items():
+                        input_dict[key] = [val[0] * bs] + val[1:]
+                        
+                    build_igie_engine(model_name=model_name, model_path=onnx_model_path, input_dict=input_dict, model_framework='onnx', precision='fp16', engine_path=engine_path)
+            
+            elif model == 'vae-decoder-onnx-fp32' or model == 'vae-encoder-onnx-fp32' or model == 'clip-onnx-fp32':
+                pass
+            
+            else:
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16')
+            
+        if configs['model_info']['model_precision'] == 'INT8':
+            precision_flag = "INT8"
+            if model_name == 'widedeep':
+                onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape.onnx"
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8')
+            
+            if model_name == 'resnet50':
+                onnx_model_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50.onnx"
+                engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8')
 
-        elif model_name == 'deberta':
-            onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"    
-            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+            if model_name == 'yolov5':
+                onnx_model_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s.onnx"
+                engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine"    
+                build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8')
 
-        elif model_name == 'roformer':
-            onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end.onnx"
-            engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"    
-            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
-
-        elif model_name == 'gpt2':
-            for bs in configs['workload']['batch_sizes']:
-                onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx"
-                engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(bs) + ".so" 
-
-                for key, val in configs['model_info']['input_shape'].items():
-                    input_dict = {}
-                    val = val = [val[0] * bs] + val[1:] 
-                    input_dict[key] = val
-                    
-                build_igie_engine(model_name=model_name, model_path=onnx_model_path, input_dict=input_dict, model_framework='onnx', precision='fp16', engine_path=engine_path)
-        
-        elif model == 'vae-decoder-onnx-fp32' or model == 'vae-encoder-onnx-fp32' or model == 'clip-onnx-fp32':
-            pass
-        
-        else:
-            build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize)
+            if model_name == 'bert':
+                print(f"\n==========****bert模型的int8精度推理采用直接加载engine文件, 因此不需要build engine! ****===========")
 
         result = {
             "model": 
@@ -114,7 +136,7 @@ def compile(self, configs, dataloader=None):
             "framework": 
                 configs['model_info']['framework'],
             "compile_precision": 
-                configs['model_info']['model_precision'].replace('FP32', 'FP16'),
+                precision_flag,
             "input_type": 
                 configs['model_info']['input_type'].split(","),
             "max_batch_size": 
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index d9c814941..d15e451e3 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -226,7 +226,7 @@ def predict_dump(self, feeds):
 
     def predict_timing(self, input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list):
         model_name = self.configs["model"].split("-")[0]
-
+        
         # H2D: host to device
         for i in range(len(inputs)):
             (err, ) = cudart.cudaHostRegister(data_batch_list[i], inputs[i]["nbytes"], 2)
@@ -522,27 +522,41 @@ def load(self, batch_size) -> None:
             #self.load_sd(batch_size)
             return
         
-        if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
-            engine_path = model_path.split(".")[0] + "_end.engine"
+        if self.configs['compile_precision'] == 'FP16':
+            if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
+                engine_path = model_path.split(".")[0] + "_end.engine"
 
-        elif model_name == 'widedeep' or model_name == 'roformer':
-            engine_path = model_path + "/" + model + "_end.engine"
-                
-        elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \
-             or model_name == 'resnet50':
-            engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
+            elif model_name == 'widedeep' or model_name == 'roformer':
+                engine_path = model_path + "/" + model + "_end.engine"
+                    
+            elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \
+                or model_name == 'resnet50':
+                engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" 
 
-        else:
-            engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
-        
-        if model_name == 'widedeep':      
-            engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_new" + ".engine"
+            else:
+                engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
+            
+            if model_name == 'widedeep':      
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine"
 
-        if model_name == 'roformer':
-            engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"     
-        
-        if model_name == 'deberta':
-            engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"
+            if model_name == 'roformer':
+                engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine"     
+            
+            if model_name == 'deberta':
+                engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine"
+
+        if self.configs['compile_precision'] == 'INT8':
+            if model_name == 'widedeep':
+                engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape" + ".engine"    
+            
+            if model_name == 'resnet50':
+                engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine"
+
+            if model_name == 'yolov5':
+                engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine"    
+
+            if model_name == 'bert':
+                engine_path = "general_perf/model_zoo/regular/open_bert/bert_zijie_int8_b196.engine"
 
         engine, context = init_by_tensorrt(engine_path)
 
@@ -612,4 +626,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
\ No newline at end of file
+            raise ValueError("Please provide input type")

From 5e0d6eeaa15fe15e8effeb88352deb46f84b954a Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Mon, 27 May 2024 20:05:02 +0800
Subject: [PATCH 21/28] update

---
 .../ILUVATAR/runtime_backend_iluvatar.py         | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index d15e451e3..9e4a522f0 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -285,8 +285,7 @@ def predict(self, feeds):
         model_name = self.configs["model"].split("-")[0]
         if self.isSDmodel(self.configs["model"]):
             for key, _ in feeds.items():
-                tmp_tensor = torch.tensor(feeds[key],
-                                    dtype=pt_dtype_map[self.input_type[i]])
+                tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[i]])
                 input_tensors.append(tmp_tensor)
                 i += 1
 
@@ -296,14 +295,13 @@ def predict(self, feeds):
         elif model_name != 'gpt2':
             if model_name == 'deberta':
                 keys = list(feeds.keys())
-                input_ids = torch.tensor(feeds[keys[0]], dtype=pt_dtype_map[self.input_type[0]])
-                attention_mask = torch.tensor(feeds[keys[1]], dtype=pt_dtype_map[self.input_type[1]])
+                input_ids = np.array(feeds[keys[0]], dtype=INPUT_TYPE[self.input_type[0]])
+                attention_mask = np.array(feeds[keys[1]], dtype=INPUT_TYPE[self.input_type[1]])
                 input_tensors = [input_ids, attention_mask]
 
             else:
                 for key, _ in feeds.items():
-                    tmp_tensor = torch.tensor(feeds[key],
-                                        dtype=pt_dtype_map[self.input_type[i]])
+                    tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[i]])
                     input_tensors.append(tmp_tensor)
                     i += 1
 
@@ -566,8 +564,8 @@ def load(self, batch_size) -> None:
         
         self.batch_size = batch_size
         self.engine = engine
-        self.context = context         
-    
+        self.context = context
+
 
     def load_sd(self, batch_size):
         model_path = self.configs['model_path']
@@ -626,4 +624,4 @@ def _get_fake_samples(self, batch_size, shape, input_type):
                 i += 1
             return data
         else:
-            raise ValueError("Please provide input type")
+            raise ValueError("Please provide input type")
\ No newline at end of file

From 55412b34bd0a11f162dec302590d608b1b3d673c Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Wed, 29 May 2024 19:02:39 +0800
Subject: [PATCH 22/28] update codes 0528

---
 .../general_perf/backends/ILUVATAR/common.py  | 41 +++++++++++---
 .../ILUVATAR/compile_backend_iluvatar.py      | 25 +++++++--
 .../ILUVATAR/runtime_backend_iluvatar.py      | 55 +++++++++++++++----
 3 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
index e2dbaa471..aa0cf2f24 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/common.py
@@ -5,15 +5,16 @@
 import numpy as np
 from os.path import join, dirname, exists
 
-import tensorrt
-from tensorrt import Dims
 import pycuda.driver as cuda
 from cuda import cuda,cudart
 import threading
 
-import tvm
-from general_perf.backends.ILUVATAR.utils.import_model import import_model_to_igie
+import importlib
 
+tensorrt = None      
+Dims = None                                                                           
+                          
+tvm = None  
 
 def setup_seed(seed):
      torch.manual_seed(seed)
@@ -23,7 +24,25 @@ def setup_seed(seed):
      torch.backends.cudnn.deterministic = True
 
 
-def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+def load_ixrt_plugin(logger=None, namespace="", dynamic_path="", model="", precision=""):
+    global tensorrt
+    global Dims
+
+    if tensorrt is not None:
+        return
+    
+    if precision == 'FP16':
+        if model == 'resnet50' or model == 'bert' or model == 'albert' or model == 'deberta' or model == 'yolov5':
+            tensorrt = importlib.import_module("tensorrt_legacy")
+            Dims = getattr(tensorrt, "Dims")
+        else:
+            tensorrt = importlib.import_module("tensorrt")
+            Dims = getattr(tensorrt, "Dims")
+    
+    if precision == 'INT8':
+        tensorrt = importlib.import_module("tensorrt")
+        Dims = getattr(tensorrt, "Dims")
+    
     if not dynamic_path:
         dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
 
@@ -32,7 +51,7 @@ def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="",
             f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
     
     ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
-    tensorrt.init_libnvinfer_plugins(logger, namespace)
+    tensorrt.init_libnvinfer_plugins(logger if logger is not None else tensorrt.Logger(tensorrt.Logger.INFO), namespace)
     print(f"Loaded plugin from {dynamic_path}")
 
 
@@ -171,9 +190,17 @@ def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize, BuildFl
 
 
 def build_igie_engine(model_name, model_path, input_dict, model_framework, precision, engine_path):
+    global tvm
+
+    # No early return when tvm is already imported: every call must still build
+    # the engine for its own engine_path (gpt2 compiles one engine per batch size).
+
     if not os.path.exists(engine_path):
+        tvm = importlib.import_module("tvm")
+        from general_perf.backends.ILUVATAR.utils.import_model import import_model_to_igie
+
         target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer")
-        mod, params = import_model_to_igie(model_path, input_dict, model_framework)
+        mod, params = import_model_to_igie(model_path, input_dict, model_framework, backend='igie')
         lib = tvm.relay.build(mod, target=target, params=params, precision=precision, verbose=False)
         lib.export_library(engine_path)
     else:
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index f5f08b203..c548a649d 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -16,12 +16,9 @@
 import logging
 import subprocess
 
-import tensorrt
-
 from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
-load_ixrt_plugin()
 
-from general_perf.backends.ILUVATAR.common import build_engine, build_igie_engine
+from general_perf.backends.ILUVATAR.common import build_engine
 from general_perf.backends.ILUVATAR.optimizer.passes import *
 from general_perf.tools.torch_to_onnx import torch_to_onnx
 from general_perf.tools.saved_to_onnx import savedmodel_to_onnx
@@ -49,6 +46,22 @@ def compile(self, configs, dataloader=None):
         model_path = configs['model_info']['model_path']
         MaxBatchSize = configs['model_info']['max_batch_size']
 
+        precision = configs['model_info']['model_precision'].replace('FP32', 'FP16')
+
+        if precision == 'FP16':
+            # Models in this tuple use the tensorrt_legacy package for FP16; all are matched on model_name.
+            if model_name in ('resnet50', 'bert', 'albert', 'deberta', 'yolov5'):
+                import tensorrt_legacy as tensorrt
+            else:
+                import tensorrt
+        if precision == 'INT8':
+            import tensorrt
+
+        load_ixrt_plugin(model=model_name, precision=precision)
+
+        if model_name == 'gpt2':
+            from general_perf.backends.ILUVATAR.common import build_igie_engine
+
         # call the ONNX model and the compiled engine file
         if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5':
             onnx_model_path = model_path.split(".")[0] + "_end.onnx"
@@ -68,7 +81,7 @@ def compile(self, configs, dataloader=None):
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
 
         # model preprocessing
-        self.get_onnx(configs)
+        # self.get_onnx(configs)
 
         # build engine
         if configs['model_info']['model_precision'].replace('FP32', 'FP16') == 'FP16':
@@ -246,4 +259,4 @@ def get_onnx(self, configs):
             print("***Convert onnx model to plugin operator model success!***")
 
         else:
-            pass
\ No newline at end of file
+            pass
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 9e4a522f0..543591a31 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -19,21 +19,20 @@
 import numpy as np
 from tqdm import tqdm
 import threading
+import importlib
 
-import tvm
 from general_perf.backends import runtime_backend
 from general_perf.backends.ILUVATAR.common import init_by_tensorrt, setup_io_bindings
-from general_perf.backends.ILUVATAR.utils import get_target
 from general_perf.backends.ILUVATAR.common import Task, TaskThread
-from tensorrt import Dims
 from cuda import cuda, cudart
 import numa
 
 from general_perf.backends.ILUVATAR.common import load_ixrt_plugin
-load_ixrt_plugin()
 
 log = logging.getLogger("RuntimeBackendILUVATAR")
 
+Dims = None
+
 pt_dtype_map = {
     "FLOAT32": torch.float32,
     "FLOAT16": torch.float16,
@@ -285,7 +284,8 @@ def predict(self, feeds):
         model_name = self.configs["model"].split("-")[0]
         if self.isSDmodel(self.configs["model"]):
             for key, _ in feeds.items():
-                tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[i]])
+                tmp_tensor = torch.tensor(feeds[key],
+                                        dtype=pt_dtype_map[self.input_type[i]])
                 input_tensors.append(tmp_tensor)
                 i += 1
 
@@ -300,8 +300,12 @@ def predict(self, feeds):
                 input_tensors = [input_ids, attention_mask]
 
             else:
+                trans_index = [0, 1, 2]
+                if model_name == 'bert' and self.configs['compile_precision'] == 'INT8':
+                    trans_index = [0, 2, 1]
+
                 for key, _ in feeds.items():
-                    tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[i]])
+                    tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[trans_index[i]]])
                     input_tensors.append(tmp_tensor)
                     i += 1
 
@@ -426,6 +430,7 @@ def predict(self, feeds):
             return result
     
     def predict_igie(self, dataloader):
+        tvm = importlib.import_module("tvm")
         self.task.module.set_input("input_ids", tvm.nd.array(dataloader["input_ids"].astype('int64'), self.device))
         self.task.module.run()
         output = self.task.module.get_output(0)
@@ -458,16 +463,21 @@ def benchmark_interact(self, dataloader):
             self.predict(test_data)
 
         for _ in range(iterations):
-            input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list = self.predict_dump(test_data)
+            if model_name != 'gpt2' and model_name != 'vae' and model_name != 'clip':
+                input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list = self.predict_dump(test_data)
 
-            start_time = time.time()
-            self.predict_timing(input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list)
-            end_time = time.time()
+                start_time = time.time()
+                self.predict_timing(input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list)
+                end_time = time.time()
+            
+            else:
+                start_time = time.time()
+                self.predict(test_data)
+                end_time = time.time()
 
             times_range.append(end_time - start_time)
             predict_range.append(self.predict_time)           
 
-            
         times_range.sort()
         tail_latency = round(
             times_range[int(len(times_range) * 0.99)] * 1000, 2)
@@ -506,11 +516,29 @@ def get_loaded_batch_size(self):
         return self.batch_size
 
     def load(self, batch_size) -> None:
+        global Dims
+
         # load engine
         model = self.configs['model']
         model_name = self.configs['model'].split("-")[0]
         model_path = self.configs['model_path']
-        
+
+        precision = self.configs['compile_precision'].replace('FP32', 'FP16')
+
+        if precision == 'FP16':
+            # Models in this tuple take Dims from tensorrt_legacy; all are matched on model_name.
+            if model_name in ('resnet50', 'bert', 'albert', 'deberta', 'yolov5'):
+                mod = importlib.import_module("tensorrt_legacy")
+            else:
+                mod = importlib.import_module("tensorrt")
+            Dims = getattr(mod, "Dims")
+
+        if precision == 'INT8':
+            mod = importlib.import_module("tensorrt")
+            Dims = getattr(mod, "Dims")     
+
+        load_ixrt_plugin(model=model_name, precision=precision)
+
         if model_name == 'gpt2':
             self.batch_size = batch_size
             return
@@ -600,6 +628,9 @@ def load_igie(self, batch_size):
         model = self.configs['model']
         model_path = self.configs['model_path']
 
+        tvm = importlib.import_module("tvm")
+        from general_perf.backends.ILUVATAR.utils import get_target
+
         target, _ = get_target('iluvatar_with_all_libs')
         device = tvm.device(target.kind.name, self.task.device_id)
         engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(batch_size) + ".so"

From 1e95fc8fc43d9bc7ffc0d55583f709c9d0a5201a Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.ai>
Date: Thu, 30 May 2024 15:29:13 +0800
Subject: [PATCH 23/28] update

---
 .../general_perf/backends/ILUVATAR/compile_backend_iluvatar.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
index c548a649d..194e94847 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
@@ -81,7 +81,7 @@ def compile(self, configs, dataloader=None):
             engine_path = os.path.dirname(model_path) + "/" + model + ".engine"
 
         # model preprocessing
-        # self.get_onnx(configs)
+        self.get_onnx(configs)
 
         # build engine
         if configs['model_info']['model_precision'].replace('FP32', 'FP16') == 'FP16':

From 60142a49fd854e8d2dcaaa60108ceaab66f9eecc Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.com>
Date: Wed, 31 Jul 2024 08:50:28 +0000
Subject: [PATCH 24/28] adding workloads and backends

---
 byte_micro_perf/README.md                     |  20 +-
 byte_micro_perf/backends/GPU/backend_gpu.py   | 200 ++++--
 byte_micro_perf/backends/GPU/custom_ops.py    | 119 ++++
 byte_micro_perf/backends/GPU/requirements.txt |   3 +-
 .../backends/ILUVATAR/backend_iluvatar.py     | 280 +++++++++
 .../backends/ILUVATAR/custom_ops.py           | 119 ++++
 .../backends/ILUVATAR/requirements.txt        |   0
 byte_micro_perf/backends/backend.py           | 170 +++--
 byte_micro_perf/backends/module_store.py      | 583 +++++++++++++++---
 byte_micro_perf/backends/utils.py             | 228 +++++--
 byte_micro_perf/compiled_cache.db             | Bin 0 -> 12288 bytes
 byte_micro_perf/core/perf_engine.py           | 173 +++++-
 byte_micro_perf/launch.py                     |  34 +-
 byte_micro_perf/requirements.txt              |   1 +
 byte_micro_perf/workloads/add.json            |  40 +-
 byte_micro_perf/workloads/allgather.json      |  42 +-
 byte_micro_perf/workloads/allreduce.json      |  42 +-
 byte_micro_perf/workloads/alltoall.json       |  86 +--
 byte_micro_perf/workloads/batch_gemm.json     |  32 +
 byte_micro_perf/workloads/broadcast.json      |  44 +-
 byte_micro_perf/workloads/cast.json           |  17 +
 byte_micro_perf/workloads/cos.json            |  23 +-
 byte_micro_perf/workloads/device2host.json    |  39 +-
 byte_micro_perf/workloads/div.json            |  22 +
 byte_micro_perf/workloads/exp.json            |  23 +-
 byte_micro_perf/workloads/exponential.json    |  13 +-
 byte_micro_perf/workloads/gather.json         |  17 +
 byte_micro_perf/workloads/gelu.json           |  23 +-
 byte_micro_perf/workloads/gemm.json           | 129 +---
 byte_micro_perf/workloads/gemv.json           |  22 +
 byte_micro_perf/workloads/group_gemm.json     |  14 +
 byte_micro_perf/workloads/host2device.json    |  38 +-
 byte_micro_perf/workloads/index_add.json      |  21 +
 byte_micro_perf/workloads/indexadd.json       |  31 -
 byte_micro_perf/workloads/layernorm.json      |  40 +-
 byte_micro_perf/workloads/mul.json            |  22 +
 byte_micro_perf/workloads/p2p.json            |  26 +
 byte_micro_perf/workloads/reduce_max.json     |  17 +
 byte_micro_perf/workloads/reduce_min.json     |  17 +
 byte_micro_perf/workloads/reduce_sum.json     |  17 +
 byte_micro_perf/workloads/reducescatter.json  |  42 +-
 byte_micro_perf/workloads/scatter.json        |  17 +
 byte_micro_perf/workloads/silu.json           |  17 +
 byte_micro_perf/workloads/sin.json            |  23 +-
 byte_micro_perf/workloads/softmax.json        |  40 +-
 byte_micro_perf/workloads/sort.json           |  26 +-
 byte_micro_perf/workloads/sub.json            |  22 +
 byte_micro_perf/workloads/swiglu.json         |  17 +
 byte_micro_perf/workloads/unique.json         |  26 +-
 49 files changed, 2077 insertions(+), 940 deletions(-)
 create mode 100644 byte_micro_perf/backends/GPU/custom_ops.py
 create mode 100644 byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
 create mode 100644 byte_micro_perf/backends/ILUVATAR/custom_ops.py
 create mode 100644 byte_micro_perf/backends/ILUVATAR/requirements.txt
 create mode 100644 byte_micro_perf/compiled_cache.db
 create mode 100644 byte_micro_perf/workloads/batch_gemm.json
 create mode 100644 byte_micro_perf/workloads/cast.json
 create mode 100644 byte_micro_perf/workloads/div.json
 create mode 100644 byte_micro_perf/workloads/gather.json
 create mode 100644 byte_micro_perf/workloads/gemv.json
 create mode 100644 byte_micro_perf/workloads/group_gemm.json
 create mode 100644 byte_micro_perf/workloads/index_add.json
 delete mode 100644 byte_micro_perf/workloads/indexadd.json
 create mode 100644 byte_micro_perf/workloads/mul.json
 create mode 100644 byte_micro_perf/workloads/p2p.json
 create mode 100644 byte_micro_perf/workloads/reduce_max.json
 create mode 100644 byte_micro_perf/workloads/reduce_min.json
 create mode 100644 byte_micro_perf/workloads/reduce_sum.json
 create mode 100644 byte_micro_perf/workloads/scatter.json
 create mode 100644 byte_micro_perf/workloads/silu.json
 create mode 100644 byte_micro_perf/workloads/sub.json
 create mode 100644 byte_micro_perf/workloads/swiglu.json

diff --git a/byte_micro_perf/README.md b/byte_micro_perf/README.md
index 0f606fc6f..6033f40e2 100644
--- a/byte_micro_perf/README.md
+++ b/byte_micro_perf/README.md
@@ -46,18 +46,26 @@ Example:
     "Operator": "EXP",
     "Backend": "GPU",
     "Host Info": "Intel(R) Xeon(R) Platinum 8336C CPU @ 2.30GHz",
-    "Device Info": "A100-PCIE-40GB",
+    "Device Info": "NVIDIA A800-SXM4-80GB",
     "Performance": [
         {
             "Dtype": "float32",
-            "Memory Size(MB)": 4.0,
-            "Kernel bandwidth(GB/s)": 271.83,
-            "Bandwidth Utilization(%)": 0.17,
-            "Avg latency(us)": 15.43
+            "Tensor Shapes": [
+                [
+                    256,
+                    8192
+                ]
+            ],
+            "Read IO Size(MB)": 8.0,
+            "Write IO Size(MB)": 8.0,
+            "Memory Size(MB)": 16.0,
+            "Kernel bandwidth(GB/s)": 1790.52,
+            "Bandwidth Utilization(%)": 87.81,
+            "Avg latency(us)": 9.37,
+            "QPS": 27321.24
         }
     ]
 }
-
 ```
 
 ## Trouble Shooting
diff --git a/byte_micro_perf/backends/GPU/backend_gpu.py b/byte_micro_perf/backends/GPU/backend_gpu.py
index 651bb9037..cb40d5ea2 100644
--- a/byte_micro_perf/backends/GPU/backend_gpu.py
+++ b/byte_micro_perf/backends/GPU/backend_gpu.py
@@ -22,8 +22,13 @@
 import torch
 import torch.distributed as dist
 import torch.distributed.distributed_c10d as dist_c10d
+
 from backends.backend import Backend
 from backends.module_store import *
+from backends.utils import get_dtype_bytes
+
+from .custom_ops import GPUGemmOp, GPUBatchGemmOp, GPUGroupGemmOp
+
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("PerfEngine")
@@ -38,7 +43,7 @@ def get_backend_properties(self):
             torch.cuda.get_device_properties(0).total_memory / (1024**3)
         )
 
-        if os.path.exists(self.vendor_path) and (self.vendor_path).endswith(".json"):
+        if self.vendor_path is not None and os.path.exists(self.vendor_path) and (self.vendor_path).endswith(".json"):
             with open(self.vendor_path, "r") as f:
                 self.hw_info_dict = json.load(f)
                 # if the vendor path does not exist, please set this param manaually
@@ -50,12 +55,42 @@ def get_backend_properties(self):
                 )
             )
 
-    def gemm(self):
-        self.op = GemmOp()
 
-    def add(self):
-        self.op = AddOp()
+    # device/host ops
+    def host2device(self):
+        self.op = Host2DeviceOp(torch.device("cuda"))
+
+    def device2host(self):
+        self.op = Device2HostOp()
+
+
+    # communication ops
+    def allreduce(self):
+        self.setup_2d_group()
+        self.op = AllReduceOp(self.group)
+
+    def allgather(self):
+        self.setup_2d_group()
+        self.op = AllGatherOp(self.group)
+
+    def reducescatter(self):
+        self.setup_2d_group()
+        self.op = ReduceScatterOp(self.group)
+
+    def alltoall(self):
+        self.setup_2d_group()
+        self.op = AllToAllOp(self.group)
+
+    def broadcast(self):
+        self.setup_2d_group()
+        self.op = BroadcastOp(self.group)
 
+    def p2p(self):
+        self.setup_2d_group()
+        self.op = P2POp(self.group, self.ranks, self.rank)
+
+    # compute ops
+    # unary ops
     def sin(self):
         self.op = SinOp()
 
@@ -68,75 +103,128 @@ def exp(self):
     def exponential(self):
         self.op = ExponentialOp()
 
+    def silu(self):
+        self.op = SiluOp()
+
     def gelu(self):
         self.op = GeluOp()
 
-    def sort(self):
-        self.op = SortOp()
+    def swiglu(self):
+        self.op = SwiGLUOp()
 
-    def unique(self):
-        self.op = UniqueOp()
+    def cast(self):
+        self.op = CastOp()
 
-    def indexadd(self):
-        self.op = IndexAddOp()
 
-    def softmax(self):
-        self.op = SoftmaxOp()
+    # binary ops
+    def add(self):
+        self.op = AddOp()
+
+    def mul(self):
+        self.op = MulOp()
+
+    def sub(self):
+        self.op = SubOp()
 
+    def div(self):
+        self.op = DivOp()
+
+
+    # reduce ops
     def layernorm(self):
         self.op = LayerNormOp()
 
-    def allreduce(self):
-        self.setup_2d_group()
-        self.op = AllReduceOp(self.group)
+    def softmax(self):
+        self.op = SoftmaxOp()
 
-    def allgather(self):
-        self.setup_2d_group()
-        self.op = AllGatherOp(self.group)
+    def reduce_sum(self):
+        self.op = ReduceSumOp()
 
-    def reducescatter(self):
-        self.setup_2d_group()
-        self.op = ReduceScatterOp(self.group)
+    def reduce_min(self):
+        self.op = ReduceMinOp()
 
-    def alltoall(self):
-        self.setup_2d_group()
-        self.op = AllToAllOp(self.group)
+    def reduce_max(self):
+        self.op = ReduceMaxOp()
 
-    def broadcast(self):
-        self.setup_2d_group()
-        self.op = BroadcastOp(self.group)
 
-    def host2device(self):
-        self.op = Host2DeviceOp(torch.device("cuda"))
+    # index ops
+    def index_add(self):
+        self.op = IndexAddOp()
 
-    def device2host(self):
-        self.op = Device2HostOp()
+    def sort(self):
+        self.op = SortOp()
+
+    def unique(self):
+        self.op = UniqueOp()
+
+    def scatter(self):
+        self.op = ScatterOp()
+    
+    def gather(self):
+        self.op = GatherOp()
+
+    # gemm ops
+    def gemm(self):
+        self.op = GPUGemmOp()
+
+    def gemv(self):
+        self.op = GPUGemmOp()
+
+    def batch_gemm(self):
+        self.op = GPUBatchGemmOp()
 
+    def group_gemm(self):
+        self.op = GPUGroupGemmOp()
+
+
+
+    # create input tensors
     def build_tensor(self, input_shapes, dtype):
-        torch_type = getattr(torch, dtype)
-        if torch_type == torch.int32:
-            dtype_size = torch.iinfo(torch_type).bits // 8
+        torch.cuda.empty_cache()
+        torch_dtype = getattr(torch, dtype)
+
+        # compute size of input and output tensors
+        if hasattr(self.op, "compute_size"):
+            bytes_per_cnt = self.op.compute_size(input_shapes, dtype)
+        # default: input_tensors_size == output_tensor_size, all tensors have same dtype
         else:
-            dtype_size = torch.finfo(torch_type).bits // 8
-        size = sum([math.prod(shape) for shape in input_shapes])
-        data_amount = size * 2 * dtype_size
-        data_cnt = (self.memory_limit - 4) * 1024**3 // data_amount
-        data_cnt = min(data_cnt, self.iterations)
-        input_tensors_list = []
-        for _ in range(data_cnt):
-            input_tensors = [
-                torch.randn(shape).type(torch_type).to(torch.device("cuda"))
-                for shape in input_shapes
-            ]
-            input_tensors_list.append(input_tensors)
+            dtype_size = get_dtype_bytes(dtype)
+            element_num = 2 * sum([math.prod(shape) for shape in input_shapes])
+            bytes_per_cnt = dtype_size * element_num
 
+        # compute max avail tensors for compute
+        avail_bytes = (self.memory_limit - 4) * 1024**3
+        avail_cnts = avail_bytes // bytes_per_cnt
+        max_data_cnt = min(self.iterations, avail_cnts)
+
+        # create input tensors for each op
+        input_tensors_list = []
+        for _ in range(max_data_cnt):
+            # create input tensors
+            if hasattr(self.op, "custom_create_tensors"):
+                input_tensors = self.op.custom_create_tensors(input_shapes, torch_dtype, "cuda")
+                input_tensors_list.append(input_tensors)
+            # default: all input tensors have same dtype
+            else:
+                if torch_dtype in [torch.int8, torch.int32]:
+                    input_tensors = [
+                        torch.randint(-3, 3, size=shape, dtype=torch_dtype, device="cuda")
+                        for shape in input_shapes
+                    ]
+                else:
+                    input_tensors = [
+                        torch.randn(shape, dtype=torch_dtype, device="cuda")
+                        for shape in input_shapes
+                    ]
+                input_tensors_list.append(input_tensors)
         if hasattr(self.op, "process_inputs"):
             input_tensors_list = [
                 self.op.process_inputs(*(input_tensor))
                 for input_tensor in input_tensors_list
             ]
+        return input_tensors_list, max_data_cnt, bytes_per_cnt
+
 
-        return input_tensors_list, max(data_cnt, 1)
 
     def _run_operation(self, operation, inputs):
         result = operation(*inputs)
@@ -150,6 +238,14 @@ def initialize_ccl(self, rank, world_size):
         """
         initialize distributed process groups and relevant ENVs
         """
+        # check device_count
+        device_count = torch.cuda.device_count()
+        if world_size > device_count:
+            world_size = device_count
+        if rank >= world_size:
+            return False
+
+        # set envs
         os.environ["MASTER_ADDR"] = "127.0.0.1"
         os.environ["MASTER_PORT"] = "49373"
         os.environ["LOCAL_RANK"] = str(rank)
@@ -157,6 +253,7 @@ def initialize_ccl(self, rank, world_size):
         os.environ["WORLD_SIZE"] = str(world_size)
 
         torch.cuda.set_device(rank)
+
         # Call the init process
         timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30))
         torch.distributed.init_process_group(
@@ -168,6 +265,7 @@ def initialize_ccl(self, rank, world_size):
         )
         self.setup_2d_group()
         log.warning("DIST: rank {}, world_size {}".format(rank, world_size))
+        return True
 
     def setup_2d_group(self):
         self.rank = dist.get_rank()
@@ -175,9 +273,9 @@ def setup_2d_group(self):
         origin_store_based_barrier = dist_c10d._store_based_barrier
         dist_c10d._store_based_barrier = lambda *a, **kw: None
         self.world_size = dist.get_world_size()
-        ranks = range(0, self.world_size)
-        group = dist.new_group(ranks)
-        if self.rank in ranks:
+        self.ranks = range(0, self.world_size)
+        group = dist.new_group(self.ranks)
+        if self.rank in self.ranks:
             self.group = group
         dist_c10d._store_based_barrier = origin_store_based_barrier
         # wait for all ranks finish group initializing
diff --git a/byte_micro_perf/backends/GPU/custom_ops.py b/byte_micro_perf/backends/GPU/custom_ops.py
new file mode 100644
index 000000000..6f4a6b9ac
--- /dev/null
+++ b/byte_micro_perf/backends/GPU/custom_ops.py
@@ -0,0 +1,119 @@
+from typing import List
+
+import torch
+import cutlass
+
+from backends.module_store import GemmOp, BatchGemmOp, GroupGemmOp
+
+
+# gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16
+# gemm(cutlass) int8 --> int32
+class GPUGemmOp(GemmOp):
+    """GEMM op: int8 inputs run a JIT-built cutlass kernel, other dtypes use torch.mm."""
+
+    def __init__(self):
+        super().__init__()
+        # Build the int8 (int32-accumulate, row-major) kernel once so forward() only has to run it.
+        try:
+            import cutlass
+            dtype = torch.int8
+            accum_dtype = torch.int32
+            self.plan = cutlass.op.Gemm(
+                alpha=1, beta=0,
+                element_A=dtype,
+                element_B=dtype,
+                element_C=accum_dtype,
+                element_D=accum_dtype,
+                layout_A=cutlass.LayoutType.RowMajor,
+                layout_B=cutlass.LayoutType.RowMajor,
+                layout_C=cutlass.LayoutType.RowMajor
+            )
+            self.op = self.plan.construct()
+            self.gemm_op_int8 = cutlass.emit.pytorch(
+                self.op, name='gemm', cc=self.plan.cc,
+                jit=True, sourcedir='out'
+            )
+        except Exception as exc:
+            # Narrowed from a bare except so Ctrl-C/SystemExit pass through; keep the cause chained.
+            self.gemm_op_int8 = None
+            raise Exception("GPUGemmOp cutlass error") from exc
+
+    def forward(self, input_tensor_a : torch.Tensor, input_tensor_b : torch.Tensor):
+        """Multiply the two inputs, dispatching on the dtype of input_tensor_a."""
+        compute_dtype = input_tensor_a.dtype
+        if compute_dtype == torch.int8:
+            output_tensor = self.gemm_op_int8.run(input_tensor_a, input_tensor_b)
+        else:
+            output_tensor = torch.mm(input_tensor_a, input_tensor_b)
+        return output_tensor
+
+
+# batch_gemm(pytorch)   float32/float16/bfloat16 --> float32/float16/bfloat16
+# batch_gemm(cutlass)   int8 --> int32
+class GPUBatchGemmOp(BatchGemmOp):
+    """Batched GEMM op: int8 inputs go through cutlass.op.Gemm, other dtypes use torch.bmm."""
+
+    def __init__(self):
+        super().__init__()
+        # Probe that cutlass is importable; forward() itself uses the module-level import.
+        try:
+            import cutlass
+        except Exception as exc:
+            # Narrowed from a bare except so Ctrl-C/SystemExit pass through; keep the cause chained.
+            raise Exception("GPUBatchGemmOp import cutlass error") from exc
+
+    def forward(self, input_tensor_a : torch.Tensor, input_tensor_b : torch.Tensor):
+        """Multiply the two batched inputs, dispatching on the dtype of input_tensor_a."""
+        compute_dtype = input_tensor_a.dtype
+        if compute_dtype == torch.int8:
+            bs, m, n = input_tensor_a.shape[0], input_tensor_a.shape[1], input_tensor_b.shape[2]
+            # int32 [bs, m, n] buffers passed to cutlass as C and D (D is the returned output).
+            # NOTE(review): the trailing 1, 0 in run() are presumably alpha and beta -- confirm.
+            c_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
+            output_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
+            plan = cutlass.op.Gemm(A=input_tensor_a, B=input_tensor_b, C=c_tensor, D=output_tensor, element_accumulator=cutlass.DataType.s32)
+            plan.run(input_tensor_a, input_tensor_b, c_tensor, output_tensor, 1, 0)
+        else:
+            output_tensor = torch.bmm(input_tensor_a, input_tensor_b)
+        return output_tensor
+
+
+# group_gemm(pytorch)   float32/float16/bfloat16 --> float32/float16/bfloat16
+# group_gemm(cutlass)   int8 --> int32
+class GPUGroupGemmOp(GroupGemmOp):
+    """Grouped GEMM op: int8 inputs run a JIT-built cutlass kernel, other dtypes use a @ b per pair."""
+
+    def __init__(self):
+        super().__init__()
+        try:
+            import cutlass
+            dtype = torch.int8
+            accum_dtype = torch.int32
+            self.plan = cutlass.op.GroupedGemm(
+                alpha=1, beta=0,
+                element_A=dtype,
+                element_B=dtype,
+                element_C=accum_dtype,
+                element_D=accum_dtype,
+                layout_A=cutlass.LayoutType.RowMajor,
+                layout_B=cutlass.LayoutType.RowMajor,
+                layout_C=cutlass.LayoutType.RowMajor
+            )
+            self.op = self.plan.construct()
+            self.gemm_op_int8 = cutlass.emit.pytorch(
+                self.op, name='group_gemm', cc=self.plan.cc,
+                jit=True, sourcedir='out'
+            )
+        except Exception as exc:
+            # Narrowed from a bare except so Ctrl-C/SystemExit pass through; keep the cause chained.
+            self.gemm_op_int8 = None
+            raise Exception("GPUGroupGemmOp cutlass error") from exc
+
+    def forward(self, a_list : List[torch.Tensor], b_list : List[torch.Tensor]):
+        """Multiply each (a, b) pair, dispatching on the dtype of the first tensor in a_list."""
+        compute_dtype = a_list[0].dtype
+        if compute_dtype == torch.int8:
+            output_tensors = self.gemm_op_int8.run(a_list, b_list)
+        else:
+            output_tensors = [a @ b for a, b in zip(a_list, b_list)]
+        return output_tensors
\ No newline at end of file
diff --git a/byte_micro_perf/backends/GPU/requirements.txt b/byte_micro_perf/backends/GPU/requirements.txt
index 846d92140..e45aca82d 100644
--- a/byte_micro_perf/backends/GPU/requirements.txt
+++ b/byte_micro_perf/backends/GPU/requirements.txt
@@ -1 +1,2 @@
-torch==2.1.0
\ No newline at end of file
+torch==2.1.0
+nvidia-cutlass
diff --git a/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py b/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
new file mode 100644
index 000000000..02807ac42
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
@@ -0,0 +1,280 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import math
+import os
+from datetime import timedelta
+from typing import Any, Dict, List
+
+import torch
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as dist_c10d
+
+from backends.backend import Backend
+from backends.module_store import *
+from backends.utils import get_dtype_bytes 
+
+from backends.module_store import GemmOp, GemvOp, BatchGemmOp, GroupGemmOp
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+
+
+class BackendILUVATAR(Backend):
+    def get_device_name(self):
+        return torch.cuda.get_device_name(0)
+
+    def get_backend_properties(self):  # fills memory_limit and, when a vendor JSON is given, hw_info_dict / bandwidth_limit
+        self.memory_limit = int(  # total memory of device 0 in GiB, truncated to an integer
+            torch.cuda.get_device_properties(0).total_memory / (1024**3)
+        )
+        if self.vendor_path is not None and os.path.exists(self.vendor_path) and (self.vendor_path).endswith(".json"):
+            with open(self.vendor_path, "r") as f:
+                self.hw_info_dict = json.load(f)
+                # if the vendor path does not exist, please set this param manually
+                self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"]  # keys mean: memory parameters / memory / memory bandwidth (GB/s)
+        else:
+            log.warning(  # bandwidth_limit and hw_info_dict are left untouched on this path
+                "Vendor_path: [ {} ] was not found or not a full path points to json, please check your path!!! Otherwise, please set the hardware info manaually.".format(
+                    self.vendor_path
+                )
+            )
+  
+    # device/host ops
+    def host2device(self):
+        self.op = Host2DeviceOp(torch.device("cuda"))
+
+    def device2host(self):
+        self.op = Device2HostOp()
+
+
+    # communication ops
+    def allreduce(self):
+        self.setup_2d_group()
+        self.op = AllReduceOp(self.group)
+
+    def allgather(self):
+        self.setup_2d_group()
+        self.op = AllGatherOp(self.group)
+
+    def reducescatter(self):
+        self.setup_2d_group()
+        self.op = ReduceScatterOp(self.group)
+
+    def alltoall(self):
+        self.setup_2d_group()
+        self.op = AllToAllOp(self.group)
+
+    def broadcast(self):
+        self.setup_2d_group()
+        self.op = BroadcastOp(self.group)
+
+    def p2p(self):
+        self.setup_2d_group()
+        self.op = P2POp(self.group, self.ranks, self.rank)
+    
+
+    # compute ops
+    # unary ops
+    def sin(self):
+        self.op = SinOp()
+
+    def cos(self):
+        self.op = CosOp()
+
+    def exp(self):
+        self.op = ExpOp()
+
+    def exponential(self):
+        self.op = ExponentialOp()
+
+    def silu(self):
+        self.op = SiluOp()
+
+    def gelu(self):
+        self.op = GeluOp()
+
+    def swiglu(self):
+        self.op = SwiGLUOp()
+
+    def cast(self):
+        self.op = CastOp()
+
+
+    # binary ops
+    def add(self):
+        self.op = AddOp()
+
+    def mul(self):
+        self.op = MulOp()
+
+    def sub(self):
+        self.op = SubOp()
+
+    def div(self):
+        self.op = DivOp()
+
+
+    # reduce ops
+    def layernorm(self):
+        self.op = LayerNormOp()
+
+    def softmax(self):
+        self.op = SoftmaxOp()
+
+    def reduce_sum(self):
+        self.op = ReduceSumOp()
+
+    def reduce_min(self):
+        self.op = ReduceMinOp()
+
+    def reduce_max(self):
+        self.op = ReduceMaxOp()
+
+
+    # index ops
+    def index_add(self):
+        self.op = IndexAddOp()
+
+    def sort(self):
+        self.op = SortOp()
+
+    def unique(self):
+        self.op = UniqueOp()
+
+    def scatter(self):
+        self.op = ScatterOp()
+
+    def gather(self):
+        self.op = GatherOp()
+
+
+    # gemm ops
+    def gemm(self):
+        self.op = GemmOp()
+
+    def gemv(self):
+        self.op = GemvOp()
+
+    def batch_gemm(self):
+        self.op = BatchGemmOp()
+
+    def group_gemm(self):
+        self.op = GroupGemmOp()
+
+
+    # create input tensors
+    def build_tensor(self, input_shapes, dtype):  # dtype is a torch dtype name, e.g. "float16"
+        torch.cuda.empty_cache()
+        torch_dtype = getattr(torch, dtype)
+
+        # bytes needed by one set of tensors; the op may provide its own estimate
+        if hasattr(self.op, "compute_size"):
+            bytes_per_cnt = self.op.compute_size(input_shapes, dtype)
+        # default: input_tensors_size == output_tensor_size, all tensors have same dtype
+        else:
+            dtype_size = get_dtype_bytes(dtype)
+            element_num = 2 * sum([math.prod(shape) for shape in input_shapes])  # x2 covers the outputs
+            bytes_per_cnt = dtype_size * element_num
+
+        # how many input sets fit in device memory
+        avail_bytes = (self.memory_limit - 4) * 1024**3  # memory_limit is in GiB; 4 GiB is held back
+        avail_cnts = avail_bytes // bytes_per_cnt
+        max_data_cnt = min(self.iterations, avail_cnts)  # never more sets than iterations
+
+        # create max_data_cnt independent input sets
+        input_tensors_list = []
+        for _ in range(max_data_cnt):
+            # the op may build its own inputs (e.g. nested per-problem shapes)
+            if hasattr(self.op, "custom_create_tensors"):
+                input_tensors = self.op.custom_create_tensors(input_shapes, torch_dtype, "cuda")
+                input_tensors_list.append(input_tensors)
+            # default: all input tensors have same dtype
+            else:
+                if torch_dtype in [torch.int8, torch.int32]:
+                    input_tensors = [
+                        torch.randint(-3, 3, size=shape, dtype=torch_dtype, device="cuda")  # integers in [-3, 3)
+                        for shape in input_shapes
+                    ]
+                else:
+                    input_tensors = [
+                        torch.randn(shape, dtype=torch_dtype, device="cuda")
+                        for shape in input_shapes
+                    ]
+                input_tensors_list.append(input_tensors)
+        if hasattr(self.op, "process_inputs"):  # optional per-op reshaping of each input set
+            input_tensors_list = [
+                self.op.process_inputs(*(input_tensor))
+                for input_tensor in input_tensors_list
+            ]
+        return input_tensors_list, max_data_cnt, bytes_per_cnt  # (input sets, number of sets, bytes per set)
+
+
+    def _run_operation(self, operation, inputs):
+        result = operation(*inputs)
+        return result
+
+    def device_synchronize(self):
+        torch.cuda.synchronize()
+        return True
+
+    def initialize_ccl(self, rank, world_size):
+        """
+        Initialize the NCCL process group and its env vars; return False when rank >= usable world size.
+        """
+        # check device_count
+        device_count = torch.cuda.device_count()
+        if world_size > device_count:
+            world_size = device_count  # clamp to the number of visible devices
+        if rank >= world_size:
+            return False  # no device left for this rank
+
+        # set envs
+        os.environ["MASTER_ADDR"] = "127.0.0.1"  # rendezvous on localhost
+        os.environ["MASTER_PORT"] = "49373"
+        os.environ["LOCAL_RANK"] = str(rank)
+        os.environ["RANK"] = str(rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+
+        torch.cuda.set_device(rank)  # device index equals rank
+
+        # Call the init process
+        timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30))  # defaults to 30 seconds
+        torch.distributed.init_process_group(
+            backend="nccl",
+            world_size=world_size,
+            rank=rank,
+            store=None,
+            timeout=timedelta(seconds=timeout_seconds),
+        )
+        self.setup_2d_group()  # sets self.rank / self.world_size / self.ranks / self.group
+        log.warning("DIST: rank {}, world_size {}".format(rank, world_size))
+        return True
+
+    def setup_2d_group(self):
+        """Create ``self.group`` over all ranks, then barrier until every rank has finished."""
+        self.rank = dist.get_rank()
+        torch.cuda.set_device(self.rank)
+        self.world_size = dist.get_world_size()
+        self.ranks = range(0, self.world_size)
+        origin_store_based_barrier = dist_c10d._store_based_barrier
+        dist_c10d._store_based_barrier = lambda *a, **kw: None  # stubbed while the group is built
+        try:
+            self.group = dist.new_group(self.ranks)  # self.ranks spans every rank, so always kept
+        finally:  # restore the real barrier even when new_group raises
+            dist_c10d._store_based_barrier = origin_store_based_barrier
+        torch.distributed.barrier()  # wait for all ranks to finish group initialization
\ No newline at end of file
diff --git a/byte_micro_perf/backends/ILUVATAR/custom_ops.py b/byte_micro_perf/backends/ILUVATAR/custom_ops.py
new file mode 100644
index 000000000..0fcb1dfb6
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/custom_ops.py
@@ -0,0 +1,119 @@
+from typing import List
+
+import torch
+import cutlass
+
+from backends.module_store import GemmOp, BatchGemmOp, GroupGemmOp
+
+
+# gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16
+# gemm(cutlass) int8 --> int32
+class ILUVATARGemmOp(GemmOp):
+    """GEMM: int8 inputs run a JIT-built CUTLASS kernel, other dtypes use ``torch.mm``."""
+
+    def __init__(self):
+        """Build the int8 -> int32 CUTLASS GEMM kernel.
+
+        Raises:
+            Exception: if importing CUTLASS or building the kernel fails; the
+                underlying error stays attached as ``__cause__``.
+        """
+        super().__init__()
+        try:
+            import cutlass
+            self.plan = cutlass.op.Gemm(
+                alpha=1, beta=0,
+                element_A=torch.int8,
+                element_B=torch.int8,
+                element_C=torch.int32,
+                element_D=torch.int32,
+                layout_A=cutlass.LayoutType.RowMajor,
+                layout_B=cutlass.LayoutType.RowMajor,
+                layout_C=cutlass.LayoutType.RowMajor
+            )
+            self.op = self.plan.construct()
+            self.gemm_op_int8 = cutlass.emit.pytorch(
+                self.op, name='gemm', cc=self.plan.cc,
+                jit=True, sourcedir='out'
+            )
+        except Exception as err:
+            # Not a bare except: KeyboardInterrupt / SystemExit must propagate unchanged.
+            self.gemm_op_int8 = None
+            raise Exception("ILUVATARGemmOp cutlass error") from err
+
+    def forward(self, input_tensor_a: torch.Tensor, input_tensor_b: torch.Tensor):
+        """Return the product of the two inputs; the path is chosen from ``input_tensor_a.dtype``."""
+        if input_tensor_a.dtype == torch.int8:
+            return self.gemm_op_int8.run(input_tensor_a, input_tensor_b)
+        return torch.mm(input_tensor_a, input_tensor_b)
+
+
+# batch_gemm(pytorch)   float32/float16/bfloat16 --> float32/float16/bfloat16
+# batch_gemm(cutlass)   int8 --> int32
+class ILUVATARBatchGemmOp(BatchGemmOp):
+    """Batched GEMM: int8 inputs go through ``cutlass.op.Gemm``, other dtypes through ``torch.bmm``."""
+
+    def __init__(self):
+        """Check that CUTLASS is importable; on failure the original error is chained."""
+        super().__init__()
+        try:
+            import cutlass
+        except Exception as err:  # not a bare except: Ctrl-C / SystemExit must propagate
+            raise Exception("ILUVATARBatchGemmOp import cutlass error") from err
+
+    def forward(self, input_tensor_a: torch.Tensor, input_tensor_b: torch.Tensor):
+        """Return the batched product of the two inputs.
+
+        int8 inputs run through ``cutlass.op.Gemm`` into a freshly allocated
+        int32 ``[bs, m, n]`` tensor; every other dtype uses ``torch.bmm``.
+        """
+        if input_tensor_a.dtype != torch.int8:
+            return torch.bmm(input_tensor_a, input_tensor_b)
+        bs, m, n = input_tensor_a.shape[0], input_tensor_a.shape[1], input_tensor_b.shape[2]
+        c_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
+        output_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
+        plan = cutlass.op.Gemm(A=input_tensor_a, B=input_tensor_b, C=c_tensor, D=output_tensor, element_accumulator=cutlass.DataType.s32)
+        # NOTE(review): the trailing 1, 0 are presumably alpha and beta - confirm against the CUTLASS API
+        plan.run(input_tensor_a, input_tensor_b, c_tensor, output_tensor, 1, 0)
+        return output_tensor
+
+
+# group_gemm(pytorch)   float32/float16/bfloat16 --> float32/float16/bfloat16
+# group_gemm(cutlass)   int8 --> int32
+class ILUVATARGroupGemmOp(GroupGemmOp):
+    """Grouped GEMM: int8 inputs run a JIT-built CUTLASS kernel, other dtypes use torch."""
+
+    def __init__(self):
+        """Build the int8 -> int32 CUTLASS grouped-GEMM kernel.
+
+        Raises:
+            Exception: if importing CUTLASS or building the kernel fails; the
+                underlying error stays attached as ``__cause__``.
+        """
+        super().__init__()
+        try:
+            import cutlass
+            self.plan = cutlass.op.GroupedGemm(
+                alpha=1, beta=0,
+                element_A=torch.int8,
+                element_B=torch.int8,
+                element_C=torch.int32,
+                element_D=torch.int32,
+                layout_A=cutlass.LayoutType.RowMajor,
+                layout_B=cutlass.LayoutType.RowMajor,
+                layout_C=cutlass.LayoutType.RowMajor
+            )
+            self.op = self.plan.construct()
+            self.gemm_op_int8 = cutlass.emit.pytorch(
+                self.op, name='group_gemm', cc=self.plan.cc,
+                jit=True, sourcedir='out'
+            )
+        except Exception as err:  # not a bare except: Ctrl-C / SystemExit must propagate
+            self.gemm_op_int8 = None
+            raise Exception("ILUVATARGroupGemmOp cutlass error") from err
+
+    def forward(self, a_list: List[torch.Tensor], b_list: List[torch.Tensor]):
+        """Multiply ``a_list[i]`` by ``b_list[i]``; the path is chosen from ``a_list[0].dtype``."""
+        if a_list[0].dtype == torch.int8:
+            return self.gemm_op_int8.run(a_list, b_list)
+        return [a @ b for a, b in zip(a_list, b_list)]
\ No newline at end of file
diff --git a/byte_micro_perf/backends/ILUVATAR/requirements.txt b/byte_micro_perf/backends/ILUVATAR/requirements.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_micro_perf/backends/backend.py b/byte_micro_perf/backends/backend.py
index a16dcc08d..00ac40f2c 100644
--- a/byte_micro_perf/backends/backend.py
+++ b/byte_micro_perf/backends/backend.py
@@ -15,12 +15,12 @@
 import os
 import time
 import random
+import traceback
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 
 from backends.utils import dump_communication_ops_report, dump_computation_ops_report
 
-
 class Backend(ABC):
     def __init__(self, workload_dict: Dict[str, Any], vendor_path: str):
         self.op_name = workload_dict["operator"]
@@ -28,14 +28,19 @@ def __init__(self, workload_dict: Dict[str, Any], vendor_path: str):
         self.warmup = int(0.1 * workload_dict["iterations"])
         self.vendor_path = vendor_path
         self.op = None
+
         # communication params
         self.rank = None
         self.world_size = None
         self.group = None
+
         # hardware info
         self.hw_info_dict = None
         self.memory_limit = None
         self.bandwidth_limit = None
+        self.get_backend_properties()
+
+        self.target_dtype = None
 
     @abstractmethod
     def get_device_name(self):
@@ -65,12 +70,34 @@ def initialize_ccl(self, rank, world_size):
     def setup_2d_group(self):
         pass
 
-    def gemm(self):
+
+    # communication ops
+    def host2device(self):
         pass
 
-    def add(self):
+    def device2host(self):
+        pass
+
+    def allreduce(self):
+        pass
+
+    def allgather(self):
+        pass
+
+    def reducescatter(self):
+        pass
+
+    def alltoall(self):
         pass
 
+    def broadcast(self):
+        pass
+
+    def p2p(self):
+        pass
+
+    # compute ops
+    # unary ops
     def sin(self):
         pass
 
@@ -83,81 +110,144 @@ def exp(self):
     def exponential(self):
         pass
 
+    def silu(self):
+        pass
+
     def gelu(self):
         pass
 
-    def indexadd(self):
+    def swiglu(self):
         pass
 
-    def sort(self):
+    def cast(self):
         pass
 
-    def unique(self):
+
+    # binary ops
+    def add(self):
         pass
 
-    def softmax(self):
+    def mul(self):
         pass
 
+    def sub(self):
+        pass
+
+    def div(self):
+        pass
+
+
+    # reduce ops
     def layernorm(self):
         pass
 
-    def allreduce(self):
+    def softmax(self):
         pass
 
-    def allgather(self):
+    def reduce_sum(self):
         pass
 
-    def reducescatter(self):
+    def reduce_min(self):
         pass
 
-    def alltoall(self):
+    def reduce_max(self):
         pass
 
-    def broadcast(self):
+
+    # index ops
+    def index_add(self):
         pass
 
-    def host2device(self):
+    def sort(self):
         pass
 
-    def device2host(self):
+    def unique(self):
         pass
 
+    def scatter(self):
+        pass
+        
+    def gather(self):
+        pass
+
+
+    # gemm ops
+    def gemm(self):
+        pass
+
+    def gemv(self):
+        pass
+
+    def batch_gemm(self):
+        pass
+
+    def group_gemm(self):
+        pass
+
+
+    # perf: time self.op on tensors built from input_shapes and dtype
     def perf(self, input_shapes: List[List[int]], dtype):
-        self.get_backend_properties()
+        error = ""
+
+        # create input tensors based on input_shapes and dtype
+        tensor_list, tensor_cnt, tensor_size_perc_cnt = self.build_tensor(
+            input_shapes, dtype
+        )
+
+        if tensor_cnt > 0:
+            try:
+                # random select input tensors
+                input_index_list = [
+                    random.randint(0, tensor_cnt - 1) for _ in range(self.iterations)
+                ]
+
+                # warmup
+                num_warm_up = 10
+                for _ in range(num_warm_up):
+                    self._run_operation(self.op, tensor_list[0])
+
+                # perf
+                self.device_synchronize()
+                start_time = time.perf_counter_ns()
+                for i in range(self.iterations):
+                    self._run_operation(
+                        self.op,
+                        tensor_list[input_index_list[i]]
+                    )
+                self.device_synchronize()
+                end_time = time.perf_counter_ns()
+
+                # time in us
+                total_exec_time = (end_time - start_time) / 1e3
+                latency = round(total_exec_time / self.iterations, 2)
+            except Exception as e:
+                traceback.print_exc()
+                latency = 0
+                error = "RUN_OP_ERROR"
+        else:
+            latency = 0
+            error = "OOM"
 
-        inputs_list, data_cnt = self.build_tensor(input_shapes, dtype)
-        input_index_list = [
-            random.randint(0, data_cnt - 1) for _ in range(self.iterations)
-        ]
-
-        # warmup
-        num_warm_up = 10
-        for _ in range(num_warm_up):
-            self._run_operation(self.op, inputs_list[0])
-
-        # perf
-        self.device_synchronize()
-        start_time = time.perf_counter_ns()
-        for i in range(self.iterations):
-            result = self._run_operation(self.op, inputs_list[input_index_list[i]])
-        self.device_synchronize()
-        end_time = time.perf_counter_ns()
-
-        # time in us
-        exec_time = (end_time - start_time) / 1e3
-        latency = round(exec_time / self.iterations, 2)
-
-        if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast"]:
+        tensor_list = []
+        
+        if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
             report = dump_communication_ops_report(
                 self.op_name,
                 dtype,
                 input_shapes,
                 self.group.size(),
-                self.bandwidth_limit,
+                None,
                 latency,
+                error
             )
         else:
             report = dump_computation_ops_report(
-                self.op_name, dtype, input_shapes, self.bandwidth_limit, latency
+                self.op_name, 
+                dtype, 
+                input_shapes, 
+                self.bandwidth_limit, 
+                latency, 
+                error
             )
         return report
+
diff --git a/byte_micro_perf/backends/module_store.py b/byte_micro_perf/backends/module_store.py
index a8a5ff096..a821ab114 100644
--- a/byte_micro_perf/backends/module_store.py
+++ b/byte_micro_perf/backends/module_store.py
@@ -12,17 +12,299 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import random
+from typing import List
+
 import torch
 import torch.distributed as dist
 
+from .utils import get_dtype_bytes
 
-class AddOp(torch.nn.Module):
+
+class GemmOp(torch.nn.Module):  # 2-D matrix multiply via torch.mm for float32/float16/bfloat16
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def compute_size(self, input_shapes, dtype):  # bytes of one A, B and output D together
+        # input_shapes: [[M, K], [K, N]]; dtype is a torch dtype name
+        torch_dtype = getattr(torch, dtype)
+        a_shape, b_shape = input_shapes
+        M, K = a_shape
+        K, N = b_shape
+        d_shape = [M, N]  # output shape
+        dtype_size = get_dtype_bytes(dtype)
+        input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
+        output_element_num = sum([math.prod(shape) for shape in [d_shape]])
+        if torch_dtype == torch.int8:  # int8: the output is sized with the float32 element width
+            bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num
+        else:  # otherwise inputs and output share one element width
+            bytes_per_cnt = dtype_size * (input_element_num + output_element_num)
+        return bytes_per_cnt
+
+    def forward(self, input_tensor_a, input_tensor_b):
+        compute_dtype = input_tensor_a.dtype  # the first input decides the path
+        output_tensor = None
+        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            output_tensor = torch.mm(input_tensor_a, input_tensor_b)
+        else:  # any other dtype (e.g. int8) is rejected here
+            raise Exception(f"GemmOp with dtype {compute_dtype} is not implemented")
+        return output_tensor
+
+
+class GemvOp(torch.nn.Module):  # same size model and torch.mm path as a 2-D GEMM
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+    
+    def compute_size(self, input_shapes, dtype):  # bytes of one A, B and output D together
+        # input_shapes: [[M, K], [K, N]]; dtype is a torch dtype name
+        torch_dtype = getattr(torch, dtype)
+        a_shape, b_shape = input_shapes
+        M, K = a_shape
+        K, N = b_shape
+        d_shape = [M, N]  # output shape
+        dtype_size = get_dtype_bytes(dtype)
+        input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
+        output_element_num = sum([math.prod(shape) for shape in [d_shape]])
+        if torch_dtype == torch.int8:  # int8: the output is sized with the float32 element width
+            bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num
+        else:  # otherwise inputs and output share one element width
+            bytes_per_cnt = dtype_size * (input_element_num + output_element_num)
+        return bytes_per_cnt
+
+    def forward(self, input_tensor_a, input_tensor_b):
+        compute_dtype = input_tensor_a.dtype  # the first input decides the path
+        output_tensor = None
+        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            output_tensor = torch.mm(input_tensor_a, input_tensor_b)
+        else:  # any other dtype (e.g. int8) is rejected here
+            raise Exception(f"GemvOp with dtype {compute_dtype} is not implemented")
+        return output_tensor
+
+
+class BatchGemmOp(torch.nn.Module):  # batched matrix multiply via torch.bmm for float32/float16/bfloat16
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def compute_size(self, input_shapes, dtype):  # bytes of one A, B and the output buffer(s)
+        # input_shapes: [[bs, M, K], [bs, K, N]]; dtype is a torch dtype name
+        torch_dtype = getattr(torch, dtype)
+        a_shape, b_shape = input_shapes
+        bs, M, K = a_shape
+        bs, K, N = b_shape
+        d_shape = [bs, M, N]  # output shape
+        dtype_size = get_dtype_bytes(dtype)
+        input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
+        output_element_num = sum([math.prod(shape) for shape in [d_shape]])
+        if torch_dtype == torch.int8:  # int8: two int32 output-sized buffers are counted (presumably C and D - confirm)
+            bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("int32") * output_element_num * 2
+        else:  # otherwise inputs and output share one element width
+            bytes_per_cnt = dtype_size * (input_element_num + output_element_num)
+        return bytes_per_cnt
+
+    def forward(self, input_tensor_a, input_tensor_b):
+        compute_dtype = input_tensor_a.dtype  # the first input decides the path
+        output_tensor = None
+        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            output_tensor = torch.bmm(input_tensor_a, input_tensor_b)
+        else:  # any other dtype (e.g. int8) is rejected here
+            raise Exception(f"BatchGemmOp with dtype {compute_dtype} is not implemented")
+        return output_tensor
+
+
+class GroupGemmOp(torch.nn.Module):  # a list of independent 2-D GEMM problems, one torch.mm each
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def compute_size(self, input_shapes, dtype):
+        """
+        Sum the bytes of every problem in the group (inputs plus output).
+
+        input_shapes lists one [a_shape, b_shape] pair per problem, e.g.
+        [[[M1, K1], [K1, N1]], [[M2, K2], [K2, N2]]].
+        """
+        torch_dtype = getattr(torch, dtype)
+        bytes_per_cnt = 0  # running total over all problems
+        for problem_shape in input_shapes:
+            a_shape, b_shape = problem_shape
+            M, K = a_shape
+            K, N = b_shape
+            d_shape = [M, N]  # output shape of this problem
+            dtype_size = get_dtype_bytes(dtype)
+            input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
+            output_element_num = sum([math.prod(shape) for shape in [d_shape]])
+            if torch_dtype == torch.int8:  # int8: the output is sized with the float32 element width
+                bytes_per_cnt += dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num
+            else:  # otherwise inputs and output share one element width
+                bytes_per_cnt += dtype_size * (input_element_num + output_element_num)
+        return bytes_per_cnt
+
+    def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
+        """
+        Create one random (left, right) tensor pair per problem on xpu_device.
+
+        input_shapes: [[[M1, K1], [K1, N1]], [[M2, K2], [K2, N2]]]; returns
+        [left_tensors, right_tensors] as two parallel lists.
+        """
+        left_tensors = []
+        right_tensors = []
+
+        for problem_shape in input_shapes:
+            a_shape, b_shape = problem_shape
+            if torch_dtype in [torch.int8, torch.int32]:  # integer dtypes: values in [-3, 3)
+                left_tensor = torch.randint(-3, 3, size=a_shape, dtype=torch_dtype, device=xpu_device)
+                right_tensor = torch.randint(-3, 3, size=b_shape, dtype=torch_dtype, device=xpu_device)
+            else:  # every other dtype: standard-normal samples
+                left_tensor = torch.randn(a_shape, dtype=torch_dtype, device=xpu_device)
+                right_tensor = torch.randn(b_shape, dtype=torch_dtype, device=xpu_device)
+            left_tensors.append(left_tensor)
+            right_tensors.append(right_tensor)
+
+        return [left_tensors, right_tensors]
+
+    def forward(self, input_tensor_a, input_tensor_b):  # both arguments are lists of tensors
+        compute_dtype = input_tensor_a[0].dtype  # the first left tensor decides the path for the whole group
+        output_tensor_list = []
+        for a, b in zip(input_tensor_a, input_tensor_b):
+            if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                output_tensor = torch.mm(a, b)
+                output_tensor_list.append(output_tensor)
+            else:  # any other dtype (e.g. int8) is rejected here
+                raise Exception(f"GroupGemmOp with dtype {compute_dtype} is not implemented")
+        return output_tensor_list
+
+
+class Host2DeviceOp(torch.nn.Module):  # copies a CPU tensor onto xpu_device
+    def __init__(self, xpu_device):
+        super().__init__()
+        self.xpu_device = xpu_device  # destination of the copy in forward()
+
+    def process_inputs(self, input_tensors):
+        new_inputs = input_tensors.cpu()  # stage the source tensor on the host
+        return [new_inputs]  # single-element argument list
+
+    def forward(self, input_tensors):
+        assert input_tensors.device.type == "cpu"  # input must already be on the host
+        output_xpu = input_tensors.to(self.xpu_device)
+        return output_xpu
+
+
+class Device2HostOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
-    def forward(self, input_tensor_a, input_tensor_b):
-        result = input_tensor_a + input_tensor_b
-        return result
+    def forward(self, input_tensors):
+        assert input_tensors.device.type != "cpu"
+        output_cpu = input_tensors.cpu()
+        return output_cpu
+
+
+class AllReduceOp(torch.nn.Module):  # wraps dist.all_reduce on a fixed process group
+    def __init__(self, group):
+        super().__init__()
+        self.group = group  # process group passed to the collective
+
+    def forward(self, input_tensors):
+        dist.all_reduce(input_tensors, group=self.group)
+        return True  # always True; there is no tensor return value
+
+
+class AllGatherOp(torch.nn.Module):  # wraps dist.all_gather on a fixed process group
+    def __init__(self, group):
+        super().__init__()
+        self.group = group  # process group passed to the collective
+
+    def process_inputs(self, input_tensors):
+        input_tensor_list = list(  # split the tensor into world-size chunks
+            torch.chunk(input_tensors, dist.get_world_size(self.group))
+        )
+        return [input_tensor_list]  # single-element argument list
+
+    def forward(self, input_tensor_list):
+        dist.all_gather(
+            input_tensor_list,  # output list
+            input_tensor_list[dist.get_rank(self.group)],  # this rank's own chunk is the input
+            group=self.group,
+        )
+        return True  # always True; there is no tensor return value
+
+
+class ReduceScatterOp(torch.nn.Module):  # wraps dist.reduce_scatter on a fixed process group
+    def __init__(self, group):
+        super().__init__()
+        self.group = group  # process group passed to the collective
+
+    def process_inputs(self, input_tensors):
+        input_tensor_list = list(  # split the tensor into world-size chunks
+            torch.chunk(input_tensors, dist.get_world_size(self.group))
+        )
+        return [input_tensor_list]  # single-element argument list
+
+    def forward(self, input_tensor_list):
+        dist.reduce_scatter(
+            input_tensor_list[dist.get_rank(self.group)],  # this rank's own chunk receives the output
+            input_tensor_list,  # input list
+            group=self.group,
+        )
+        return True  # always True; there is no tensor return value
+
+
+class AllToAllOp(torch.nn.Module):  # wraps dist.all_to_all on a fixed process group
+    def __init__(self, group):
+        super().__init__()
+        self.group = group  # process group passed to the collective
+
+    def process_inputs(self, input_tensor, output_tensor):
+        input_tensor_list = list(  # world-size chunks of the input tensor
+            torch.chunk(input_tensor, dist.get_world_size(self.group))
+        )
+        output_tensor_list = list(  # world-size chunks of the output tensor
+            torch.chunk(output_tensor, dist.get_world_size(self.group))
+        )
+        return [input_tensor_list, output_tensor_list]  # matches forward()'s argument order
+
+    def forward(self, in_tensors_list, out_tensors_list):
+        dist.all_to_all(out_tensors_list, in_tensors_list, group=self.group)  # output list goes first
+        return True  # always True; there is no tensor return value
+
+
+class BroadcastOp(torch.nn.Module):  # wraps dist.broadcast on a fixed process group
+    def __init__(self, group):
+        super().__init__()
+        self.group = group  # process group passed to the collective
+
+    def forward(self, input_tensors):
+        dist.broadcast(input_tensors, 0, self.group)  # rank 0 is the source
+        return True  # always True; there is no tensor return value
+
+
+class P2POp(torch.nn.Module):  # point-to-point send/recv between neighbouring ranks
+    def __init__(self, group, ranks, rank):
+        super().__init__()
+        self.group = group
+        self.group_size = self.group.size()
+        self.rank = rank  # this process's rank
+        self.ranks = ranks  # indexable sequence of ranks
+        self.rank_size = len(ranks)
+
+    def next_rank(self):
+        return self.ranks[(self.rank + 1) % self.rank_size]  # wraps around at the end
+
+    def prev_rank(self):
+        return self.ranks[(self.rank - 1) % self.rank_size]  # wraps around at the start
+
+    def forward(self, send_tensor, recv_tensor):
+        reqs = []  # outstanding async requests
+        if self.rank != (self.group_size - 1):  # every rank except the last one sends
+            send_req = dist.isend(send_tensor, self.next_rank(), self.group)
+            reqs.append(send_req)
+        if self.rank != 0:  # every rank except rank 0 receives
+            recv_req = dist.irecv(recv_tensor, self.prev_rank(), self.group)
+            reqs.append(recv_req)
+
+        for req in reqs:  # block until all posted transfers complete
+            req.wait()
+        return True  # always True; there is no tensor return value
 
 
 class SinOp(torch.nn.Module):
@@ -43,12 +325,12 @@ def forward(self, input_tensors):
         return result
 
 
-class GeluOp(torch.nn.Module):
+class ExpOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
     def forward(self, input_tensors):
-        result = torch.nn.functional.gelu(input_tensors)
+        result = torch.exp(input_tensors)
         return result
 
 
@@ -61,176 +343,273 @@ def forward(self, input_tensors):
         return result
 
 
-class IndexAddOp(torch.nn.Module):
+class SiluOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
-    def process_inputs(self, input_tensor, source_tensor):
-        index = torch.randint(0, input_tensor.shape[0], (source_tensor.shape[0],)).to(
-            input_tensor.device
-        )
-        return [input_tensor, index, source_tensor]
-
-    def forward(self, input_tensor, index, source_tensor):
-        result = input_tensor.index_add_(0, index, source_tensor)
+    def forward(self, input_tensors):
+        result = torch.nn.functional.silu(input_tensors)
         return result
 
 
-class SortOp(torch.nn.Module):
+class GeluOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
     def forward(self, input_tensors):
-        result = torch.sort(input_tensors)
+        result = torch.nn.functional.gelu(input_tensors)
         return result
 
 
-class UniqueOp(torch.nn.Module):
-    def __init__(self):
+class SwiGLUOp(torch.nn.Module):
+    def __init__(self) -> None:
         super().__init__()
+        self.w = 1
+        self.v = 2
 
     def forward(self, input_tensors):
-        result = torch.unique(input_tensors, return_counts=True)
+        result = (torch.nn.functional.sigmoid(input_tensors) * self.w) + (input_tensors * self.v)
         return result
 
 
-class ExpOp(torch.nn.Module):
+class CastOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
+    def set_dtype(self, src_dtype: str):
+        target_dtype = "bfloat16" if src_dtype == "float32" else "float32"
+        self.target_dtype = target_dtype
+        self.target_torch_dtype = getattr(torch, target_dtype)
+
+    def compute_size(self, input_shapes, dtype):
+        torch_dtype = getattr(torch, dtype)
+        self.set_dtype(dtype)
+        dtype_size = get_dtype_bytes(dtype)
+        target_dtype_size = get_dtype_bytes(self.target_dtype)
+        element_num = sum([math.prod(shape) for shape in input_shapes])
+        bytes_per_cnt = dtype_size * element_num + target_dtype_size * element_num
+        return bytes_per_cnt
+
     def forward(self, input_tensors):
-        result = torch.exp(input_tensors)
+        result = input_tensors.to(self.target_torch_dtype)
         return result
 
 
-class GemmOp(torch.nn.Module):
+class AddOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
     def forward(self, input_tensor_a, input_tensor_b):
-        logits = torch.matmul(input_tensor_a, input_tensor_b)
-        return logits
+        result = input_tensor_a + input_tensor_b
+        return result
 
 
-class SoftmaxOp(torch.nn.Module):
+class MulOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
-    def forward(self, hidden_states):
-        logits = torch.nn.functional.softmax(hidden_states, dim=-1)
-        return logits
+    def forward(self, input_tensor_a, input_tensor_b):
+        result = input_tensor_a * input_tensor_b
+        return result
+
+
+class SubOp(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input_tensor_a, input_tensor_b):
+        result = input_tensor_a - input_tensor_b
+        return result
+
+
+class DivOp(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input_tensor_a, input_tensor_b):
+        result = input_tensor_a / input_tensor_b
+        return result
 
 
 class LayerNormOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
-    def forward(self, hidden_states):
-        logits = torch.nn.functional.layer_norm(
-            hidden_states, (hidden_states.shape[-1],)
+    def forward(self, input_tensors):
+        result = torch.nn.functional.layer_norm(
+            input_tensors, (input_tensors.shape[-1],)
         )
-        return logits
+        return result
 
 
-class AllReduceOp(torch.nn.Module):
-    def __init__(self, group):
+class SoftmaxOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.group = group
 
     def forward(self, input_tensors):
-        dist.all_reduce(input_tensors, group=self.group)
-        return True
+        result = torch.nn.functional.softmax(input_tensors, dim=-1)
+        return result
 
 
-class AllGatherOp(torch.nn.Module):
-    def __init__(self, group):
+class ReduceSumOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.group = group
 
-    def process_inputs(self, input_tensors):
-        input_tensor_list = list(
-            torch.chunk(input_tensors, dist.get_world_size(self.group))
-        )
-        return [input_tensor_list]
-
-    def forward(self, input_tensor_list):
-        dist.all_gather(
-            input_tensor_list,
-            input_tensor_list[dist.get_rank(self.group)],
-            group=self.group,
-        )
-        return True
+    def forward(self, input_tensors):
+        result = torch.sum(input_tensors, dim=-1)
+        return result
 
 
-class ReduceScatterOp(torch.nn.Module):
-    def __init__(self, group):
+class ReduceMinOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.group = group
 
-    def process_inputs(self, input_tensors):
-        input_tensor_list = list(
-            torch.chunk(input_tensors, dist.get_world_size(self.group))
-        )
-        return [input_tensor_list]
+    def forward(self, input_tensors):
+        result = torch.min(input_tensors, dim=-1)
+        return result
 
-    def forward(self, input_tensor_list):
-        dist.reduce_scatter(
-            input_tensor_list[dist.get_rank(self.group)],
-            input_tensor_list,
-            group=self.group,
-        )
-        return True
 
+class ReduceMaxOp(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
 
-class AllToAllOp(torch.nn.Module):
-    def __init__(self, group):
+    def forward(self, input_tensors):
+        result = torch.max(input_tensors, dim=-1)
+        return result
+
+
+class IndexAddOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.group = group
 
-    def process_inputs(self, input_tensor, output_tensor):
-        input_tensor_list = list(
-            torch.chunk(input_tensor, dist.get_world_size(self.group))
-        )
-        output_tensor_list = list(
-            torch.chunk(output_tensor, dist.get_world_size(self.group))
+    def process_inputs(self, input_tensor, source_tensor):
+        index = torch.randint(0, input_tensor.shape[0], (source_tensor.shape[0],)).to(
+            input_tensor.device
         )
-        return [input_tensor_list, output_tensor_list]
+        return [input_tensor, index, source_tensor]
 
-    def forward(self, in_tensors_list, out_tensors_list):
-        dist.all_to_all(out_tensors_list, in_tensors_list, group=self.group)
-        return True
+    def forward(self, input_tensor, index, source_tensor):
+        result = input_tensor.index_add_(0, index, source_tensor)
+        return result
 
 
-class BroadcastOp(torch.nn.Module):
-    def __init__(self, group):
+class SortOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.group = group
 
     def forward(self, input_tensors):
-        dist.broadcast(input_tensors, 0, self.group)
-        return True
-    
+        result = torch.sort(input_tensors)
+        return result
 
-class Device2HostOp(torch.nn.Module):
+
+class UniqueOp(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
     def forward(self, input_tensors):
-        assert input_tensors.device.type != "cpu"
-        output_cpu = input_tensors.cpu()
-        return output_cpu
+        result = torch.unique(input_tensors, return_counts=True)
+        return result
 
 
-class Host2DeviceOp(torch.nn.Module):
-    def __init__(self, xpu_device):
+class ScatterOp(torch.nn.Module):
+    def __init__(self):
         super().__init__()
-        self.xpu_device = xpu_device
 
-    def process_inputs(self, input_tensors):
-        new_inputs = input_tensors.cpu()
-        return [new_inputs]
+    def compute_size(self, input_shapes, dtype):
+        # dst: [batch_size, len], dtype
+        # index: [batch_size, len], int64
+        # src: [batch_size, len], dtype
+        tensor_shape = input_shapes[0]
 
-    def forward(self, input_tensors):
-        assert input_tensors.device.type == "cpu"
-        output_xpu = input_tensors.to(self.xpu_device)
-        return output_xpu
+        tensor_dtype_size = get_dtype_bytes(dtype)
+        index_dtype_size = get_dtype_bytes("int64")
+
+        shape_func = lambda shape: math.prod(shape)
+
+        bytes_per_cnt = (
+            shape_func(tensor_shape) * tensor_dtype_size
+            + shape_func(tensor_shape) * index_dtype_size
+            + shape_func(tensor_shape) * tensor_dtype_size
+        )
+        
+        return bytes_per_cnt
+
+    def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
+        # dst: [batch_size, len], dtype
+        # index: [batch_size, len], int64
+        # src: [batch_size, len], dtype
+        tensor_shape = input_shapes[0]
+
+        dst_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device)
+        src_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device)
+
+        # dim = 0
+        # dst[index[i, j], j] = src[i, j]
+        batch_size = tensor_shape[0]
+        tensor_len = tensor_shape[1]
+
+        index = [i for i in range(batch_size)]
+        random.shuffle(index)
+        index_tensor = torch.cat(
+            [torch.full((1, tensor_len), i, dtype=torch.int64, device=xpu_device) for i in index], 
+            dim=0
+        )
+        
+        return [dst_tensor, index_tensor, src_tensor]
+
+
+    def forward(self, dst_tensor, index_tensor, src_tensor):
+        dst_tensor.scatter_(0, index_tensor, src_tensor)
+        return dst_tensor
+
+
+class GatherOp(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def compute_size(self, input_shapes, dtype):
+        # dst: [batch_size, len], dtype
+        # index: [batch_size, len], int64
+        # src: [batch_size, len], dtype
+        tensor_shape = input_shapes[0]
+
+        tensor_dtype_size = get_dtype_bytes(dtype)
+        index_dtype_size = get_dtype_bytes("int64")
+
+        shape_func = lambda shape: math.prod(shape)
+
+        bytes_per_cnt = (
+            shape_func(tensor_shape) * tensor_dtype_size
+            + shape_func(tensor_shape) * index_dtype_size
+            + shape_func(tensor_shape) * tensor_dtype_size
+        )
+        
+        return bytes_per_cnt
+
+    def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
+        # dst: [batch_size, len], dtype
+        # index: [batch_size, len], int64
+        # src: [batch_size, len], dtype
+        tensor_shape = input_shapes[0]
+
+        dst_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device)
+        src_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device)
+
+        # dim = 0
+        # dst[i, j] = src[index[i, j], j]
+        batch_size = tensor_shape[0]
+        tensor_len = tensor_shape[1]
+
+        index = [i for i in range(batch_size)]
+        random.shuffle(index)
+        index_tensor = torch.cat(
+            [torch.full((1, tensor_len), i, dtype=torch.int64, device=xpu_device) for i in index], 
+            dim=0
+        )
+        
+        return [dst_tensor, index_tensor, src_tensor]
+
+
+    def forward(self, dst_tensor, index_tensor, src_tensor):
+        torch.gather(src_tensor, 0, index_tensor, out=dst_tensor)
+        return dst_tensor
\ No newline at end of file
diff --git a/byte_micro_perf/backends/utils.py b/byte_micro_perf/backends/utils.py
index 345674e4d..3216286ac 100644
--- a/byte_micro_perf/backends/utils.py
+++ b/byte_micro_perf/backends/utils.py
@@ -19,6 +19,93 @@
 import torch
 
 
+def get_dtype_bytes(dtype: str):
+    torch_dtype = getattr(torch, dtype)
+    dtype_size = 0
+    if torch_dtype in [torch.int64, torch.int32, torch.int8]:
+        dtype_size = torch.iinfo(torch_dtype).bits // 8
+    elif torch_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+        dtype_size = torch.finfo(torch_dtype).bits // 8
+    else:
+        # not supported yet
+        pass
+    return dtype_size
+
+
+def get_io_amount(op_name, input_shapes, dtype):  # dtype is a torch dtype *name* (str), e.g. "int8"
+    batch_size = input_shapes[0][0]  # default; overwritten for group_gemm below
+    dtype_size = get_dtype_bytes(dtype)
+    if op_name in ["add", "mul", "sub", "div"]:
+        # c = a + b: read every input, write one tensor shaped like the first input
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = dtype_size * math.prod(input_shapes[0])
+    elif op_name == "gemm":
+        M = input_shapes[0][0]
+        K = input_shapes[0][1]
+        N = input_shapes[1][1]
+        read_io_amount = dtype_size * (M * K + K * N)
+        if dtype != "int8":  # compare names: dtype is a str, never a torch.dtype
+            write_io_amount = dtype_size * (M * N)
+        else:
+            write_io_amount = get_dtype_bytes("int32") * (M * N)
+    elif op_name == "batch_gemm":
+        bs = input_shapes[0][0]
+        M = input_shapes[0][1]
+        K = input_shapes[0][2]
+        N = input_shapes[1][2]
+        read_io_amount = dtype_size * bs * (M * K + K * N)
+        if dtype != "int8":  # int8 inputs are counted with an int32 output
+            write_io_amount = dtype_size * bs * (M * N)
+        else:
+            write_io_amount = get_dtype_bytes("int32") * bs * (M * N)
+    elif op_name == "group_gemm":
+        in_size_list = []
+        out_size_list = []
+        m_list = []
+        for problem_shape in input_shapes:
+            M = problem_shape[0][0]
+            K = problem_shape[0][1]
+            N = problem_shape[1][1]
+            in_size_list.append(M * K + K * N)
+            out_size_list.append(M * N)
+            m_list.append(M)
+        batch_size = sum(m_list)  # batch size of a group is the total of all M
+        read_io_amount = dtype_size * sum(in_size_list)
+        if dtype != "int8":  # int8 inputs are counted with an int32 output
+            write_io_amount = dtype_size * sum(out_size_list)
+        else:
+            write_io_amount = get_dtype_bytes("int32") * sum(out_size_list)
+    elif op_name in ["device2host"]:
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = 0
+    elif op_name in ["host2device"]:
+        read_io_amount = 0
+        write_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+    elif op_name in ["reduce_sum", "reduce_max", "reduce_min"]:
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = dtype_size * sum([math.prod(shape[:-1]) for shape in input_shapes])
+    elif op_name in ["unique", "sort"]:  # was misspelled "unqiue", so unique never matched
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = 2 * dtype_size * sum([math.prod(shape) for shape in input_shapes])
+    elif op_name in ["scatter", "gather"]:
+        tensor_shape = input_shapes[0]
+        read_io_amount = (dtype_size + get_dtype_bytes("int64")) * math.prod(tensor_shape)
+        write_io_amount = dtype_size * math.prod(tensor_shape)
+    elif op_name == "cast":
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = read_io_amount / 2 if dtype == "float32" else read_io_amount * 2
+    elif op_name in ["index_add"]:
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + get_dtype_bytes("int32") * input_shapes[1][0]
+        write_io_amount = dtype_size * math.prod(input_shapes[0])
+    else:
+        read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+        write_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes])
+
+    total_io_amount = read_io_amount + write_io_amount  # all three amounts are byte counts
+
+    return batch_size, total_io_amount, read_io_amount, write_io_amount
+
+
 def dump_communication_ops_report(
     op_name: str,
     dtype: str,
@@ -26,36 +113,53 @@ def dump_communication_ops_report(
     group_size: List[int],
     bandwidth_limit: float,
     latency: float,
+    error: str = ""
 ):
     size = math.prod(input_shapes[0])
-    torch_type = getattr(torch, dtype)
-    if torch_type == torch.int32:
-        dtype_size = torch.iinfo(torch_type).bits // 8
-    else:
-        dtype_size = torch.finfo(torch_type).bits // 8
+    dtype_size = get_dtype_bytes(dtype)
     mb = dtype_size * size / 1024 / 1024
-    algo_bw = dtype_size * size / latency / 1e3
-    bus_bw = algo_bw * (group_size - 1) / group_size
-
-    if op_name == "broadcast":
-        bus_bw = algo_bw
-    if op_name == "allreduce":
-        bus_bw *= 2
-
-    bandwidth_utils = None
-    if bandwidth_limit is not None:
-        bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
-
-    report = {
-        "Dtype": dtype,
-        "Tensor Shapes": input_shapes, 
-        "Memory Size(MB)": round(mb, 2),
-        "Group": group_size,
-        "Kernel bandwidth(GB/s)": round(algo_bw, 2),
-        "Bus bandwidth(GB/s)": round(bus_bw, 2),
-        "Bandwidth Utilization(%)": bandwidth_utils,
-        "Avg latency(us)": round(latency, 2),
-    }
+    if error == "":
+        algo_bw = dtype_size * size / latency / 1e3
+
+        """
+        allreduce:      2 * (group_size - 1) * (tensor_size / group_size)
+        allgather:      1 * (group_size - 1) * (tensor_size / group_size)
+        reducescatter:  1 * (group_size - 1) * (tensor_size / group_size)
+        alltoall:       1 * (group_size - 1) * (tensor_size / group_size)
+        broadcast:      tensor_size
+        p2p:            tensor_size
+        """
+        bus_bw = algo_bw * (group_size - 1) / group_size
+        if op_name in ["broadcast", "p2p"]:
+            bus_bw = algo_bw
+        if op_name == "allreduce":
+            bus_bw *= 2
+
+        bandwidth_utils = None
+        if bandwidth_limit is not None:
+            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
+        report = {
+            "Dtype": str(dtype),
+            "Tensor Shapes": input_shapes,
+            "Memory Size(MB)": round(mb, 2),
+            "Group": group_size,
+            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
+            "Bus bandwidth(GB/s)": round(bus_bw, 2),
+            "Bandwidth Utilization(%)": bandwidth_utils,
+            "Avg latency(us)": round(latency, 2),
+        }
+    else:
+        report = {
+            "Dtype": str(dtype),
+            "Tensor Shapes": input_shapes,
+            "Memory Size(MB)": round(mb, 2),
+            "Group": group_size,
+            "Kernel bandwidth(GB/s)": 0,
+            "Bus bandwidth(GB/s)": 0,
+            "Bandwidth Utilization(%)": None,
+            "Avg latency(us)": 0,
+            "Error": error,
+        }
     return report
 
 
@@ -65,45 +169,39 @@ def dump_computation_ops_report(
     input_shapes: List[List[int]],
     bandwidth_limit: float,
     latency: float,
+    error: str = ""
 ):
-    if op_name == "add":
-        # c = a + b
-        # MAC_total = MAC_a + MAC_b + MAC_c
-        size = sum(
-            [math.prod(shape) for shape in input_shapes], math.prod(input_shapes[0])
-        )
-    elif op_name == "gemm":
-        # c = gemm(a, b)
-        # MAC_total = MAC_a + MAC_b + MAC_c
-        M = input_shapes[0][0]
-        K = input_shapes[0][1]
-        N = input_shapes[1][1]
-        size = M * K + K * N + M * N
-    elif op_name == "unique" or op_name == "device2host" or "host2device":
-        size = sum([math.prod(shape) for shape in input_shapes])
-    else:
-        # out = func(in)
-        # MAC_total = MAC_in + MAC_out
-        size = sum([math.prod(shape) for shape in input_shapes]) * 2
+    batch_size, total_io_amount, read_io_amount, write_io_amount = get_io_amount(op_name, input_shapes, dtype)
+
+    if error == "":
+        qps = round(1000 / latency * batch_size, 2)
+        algo_bw = total_io_amount / latency / 1e3
 
-    torch_type = getattr(torch, dtype)
-    if torch_type == torch.int32:
-        dtype_size = torch.iinfo(torch_type).bits // 8
+        bandwidth_utils = None
+        if bandwidth_limit is not None:
+            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
+        report = {
+            "Dtype": str(dtype),
+            "Tensor Shapes": input_shapes,
+            "Read IO Size(MB)": round(read_io_amount / 1024 / 1024, 2),
+            "Write IO Size(MB)": round(write_io_amount / 1024 / 1024, 2),
+            "Memory Size(MB)": round(total_io_amount / 1024 / 1024, 2),
+            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
+            "Bandwidth Utilization(%)": bandwidth_utils,
+            "Avg latency(us)": round(latency, 2),
+            "QPS": qps,
+        }
     else:
-        dtype_size = torch.finfo(torch_type).bits // 8
-    mb = dtype_size * size / 1024 / 1024
-    algo_bw = dtype_size * size / latency / 1e3
-
-    bandwidth_utils = None
-    if bandwidth_limit is not None:
-        bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)
-
-    report = {
-        "Dtype": dtype,
-        "Tensor Shapes": input_shapes, 
-        "Memory Size(MB)": round(mb, 2),
-        "Kernel bandwidth(GB/s)": round(algo_bw, 2),
-        "Bandwidth Utilization(%)": bandwidth_utils,
-        "Avg latency(us)": round(latency, 2),
-    }
+        report = {
+            "Dtype": str(dtype),
+            "Tensor Shapes": input_shapes,
+            "Read IO Size(MB)": round(read_io_amount / 1024 / 1024, 2),
+            "Write IO Size(MB)": round(write_io_amount / 1024 / 1024, 2),
+            "Memory Size(MB)": round(total_io_amount / 1024 / 1024, 2),
+            "Kernel bandwidth(GB/s)": 0,
+            "Bandwidth Utilization(%)": None,
+            "Avg latency(us)": 0,
+            "QPS": 0,
+            "Error": error,
+        }
     return report
diff --git a/byte_micro_perf/compiled_cache.db b/byte_micro_perf/compiled_cache.db
new file mode 100644
index 0000000000000000000000000000000000000000..1894846cceaf75183b5eb2e4bb7f753badc9056f
GIT binary patch
literal 12288
zcmeI#&r8EF6bJC66_vq$+;+`TQ3UB9psVB{l#OW%dMeo#G1xY>%|H*H#6QSC$N$8m
z$zZ~uFerK%-v>!~Y5VB=q?g=IBV{Ce&{ZxB3&?;_N-h~Agit$tJJ#Xyy>()*F5ecE
zoLzi%``^Ut`K0IfKexSr`w)Nt1Rwwb2tWV=5P$##An<Pj?<e$N7zFg=&4^hh=Q=M{
zCKstLWhIQ#McuIEUET0e%vn4NBhDHgIMQYMBwty~?_xHd*pWmLOUBpJgr9a;%a|``
zs$gL>34d$0FZQTw^M~bZ!-dG@X2<Qn&XyC#RP|=PWA5z_?*mFyv5+tIb7oKPsaP7l
z%F~98NY5J<-MRkI{ZU;v4cvr)00bZa0SG_<0uX=z1Rwwb2tZ(01v;)Vod0)qd(jsJ
XAOHafKmY;|fB*y_009U<U?uPaNxg#=

literal 0
HcmV?d00001

diff --git a/byte_micro_perf/core/perf_engine.py b/byte_micro_perf/core/perf_engine.py
index ff1155f9f..e2774c821 100644
--- a/byte_micro_perf/core/perf_engine.py
+++ b/byte_micro_perf/core/perf_engine.py
@@ -20,9 +20,13 @@
 import os
 import subprocess
 import sys
+import pathlib
+import traceback
+import random
 from typing import Any, Dict, List
+import itertools
+
 
-import torch
 import torch.multiprocessing as mp
 import virtualenv
 
@@ -88,6 +92,99 @@ def load_workload(task: str) -> Dict[str, Any]:
             "Task name: [ {} ] was not found, please check your task name".format(task)
         )
 
+def parse_workload(workload):  # expands a workload config dict into a flat list of input-shape entries
+    shape_list = []
+    if "input_shape_list" in workload:
+        shape_list.extend(workload["input_shape_list"])
+    # gemm or batch_gemm
+    elif "M/K/N" in workload:
+        if "batch_size" in workload:
+            for batch_size in workload["batch_size"]:
+                for M, K, N in workload["M/K/N"]:
+                    shape_list.append([
+                        [batch_size, M, K],
+                        [batch_size, K, N]
+                    ])
+        else:
+            for M, K, N in workload["M/K/N"]:
+                shape_list.append([[M, K], [K, N]])
+    # group_gemm
+    elif "MKN_choices" in workload:
+        seed = workload["seed"]
+        MKN_list = workload["MKN_choices"]
+        problems_list = workload["problems"]
+
+        random.seed(seed)  # seeded so the sampled problem shapes are reproducible
+        for problems in problems_list:
+            cur_inputs = []
+            for _ in range(problems):
+                M, K, N = [random.choice(MKN_list) for _ in range(3)]
+                cur_shapes = [[M, K], [K, N]]
+                cur_inputs.append(cur_shapes)
+            shape_list.append(cur_inputs)  # one entry per problem count, not only the last one
+
+
+    if "input_shape_groups" in workload:
+        input_shape_groups = workload["input_shape_groups"] if isinstance(workload["input_shape_groups"], list) else [workload["input_shape_groups"]]
+
+        for input_shape_group in input_shape_groups:
+            if "inputs" in input_shape_group:
+                input_shape_list = []
+                for input_shapes in input_shape_group["inputs"]:
+                    input_shape_list.append([list(shape) for shape in itertools.product(*input_shapes)])
+                if len(input_shape_list) == 1:
+                    shape_list.extend(input_shape_list[0])
+                else:
+                    shape_list.extend([list(input_shape) for input_shape in zip(*input_shape_list)])
+
+            else:
+                gemm_keys = ["M", "K", "N", "MN", "MK", "KN"]
+                gemm_values = [input_shape_group.get(k, []) for k in gemm_keys]
+                if any(gemm_values):
+                    m, k, n, mn, mk, kn = gemm_values
+                    # batch gemm
+                    if "batch_size" in input_shape_group:
+                        bs = input_shape_group.get("batch_size", [])
+                        if m and n and k:
+                            for p in itertools.product(bs, m, k, n):
+                                shape_list.append([[p[0], p[1], p[2]], [p[0], p[2], p[3]]])
+                        if mn and k:
+                            for p in itertools.product(bs, mn, k):
+                                shape_list.append([[p[0], p[1][0], p[2]], [p[0], p[2], p[1][1]]])
+                        if mk and n:
+                            for p in itertools.product(bs, mk, n):
+                                shape_list.append([[p[0], p[1][0], p[1][1]], [p[0], p[1][1], p[2]]])
+                        if m and kn:
+                            for p in itertools.product(bs, m, kn):
+                                shape_list.append([[p[0], p[1], p[2][0]], [p[0], p[2][0], p[2][1]]])
+                    # group gemm
+                    elif "gemm_group" in input_shape_group:
+                        groups = input_shape_group.get("gemm_group", [])
+                        kn = list(input_shape_group.get("KN", []))  # copy: never mutate the workload config
+                        if k and n:
+                            kn.extend([list(shape) for shape in itertools.product(k, n)])  # add [K, N] pairs individually
+                        for group in groups:
+                            for _kn in kn:
+                                group_input_shape_list = []
+                                for m in group:
+                                    group_input_shape_list.append([[m, _kn[0]], [_kn[0], _kn[1]]])
+                                shape_list.append(group_input_shape_list)
+                    # gemm
+                    else:
+                        if m and n and k:
+                            for p in itertools.product(m, k, n):
+                                shape_list.append([[p[0], p[1]], [p[1], p[2]]])
+                        if mn and k:
+                            for p in itertools.product(mn, k):
+                                shape_list.append([[p[0][0], p[1]], [p[1], p[0][1]]])
+                        if mk and n:
+                            for p in itertools.product(mk, n):
+                                shape_list.append([[p[0][0], p[0][1]], [p[0][1], p[1]]])
+                        if m and kn:
+                            for p in itertools.product(m, kn):
+                                shape_list.append([[p[0], p[1][0]], [p[1][0], p[1][1]]])
+    return shape_list
+
 
 class PerfEngine:
     def __init__(self) -> None:
@@ -105,8 +202,14 @@ def init_process(self, rank: int, world_size: int):
 
         """
         initialize_func = getattr(self.backend, "initialize_ccl")
-        initialize_func(rank, world_size)
+
+        # world_size may exceed available device count
+        ret = initialize_func(rank, world_size)
+        if ret is not None and not ret:
+            return
+
         status = self.start_perf(self.workload)
+        return status
 
     def init_backend(self, hardware_type: str) -> Backend:
         """
@@ -125,33 +228,37 @@ def init_backend(self, hardware_type: str) -> Backend:
         return backend(self.workload, self.args.vendor_path)
 
     def start_engine(self) -> None:
-        status = self.activate_venv(self.backend_type)
-        if not status:
-            log.warning("Activate virtualenv Failed, Please Check...")
+        #status = self.activate_venv(self.backend_type)
+        #if not status:
+        #    log.warning("Activate virtualenv Failed, Please Check...")
 
         self.backend = self.init_backend(self.backend_type)
         output_dir = os.path.abspath("reports/" + self.backend_type)
         os.makedirs(output_dir, exist_ok=True)
 
-        if self.args.task in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast"]:
+        if self.args.task in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
             for group in self.workload["group"]:
-                mp.spawn(fn=self.init_process, args=(group,), nprocs=group)
+                try:
+                    mp.spawn(fn=self.init_process, args=(group,), nprocs=group)
+                except Exception as e:
+                    traceback.print_exc()
+                    log.error(f"Execute task: {self.args.task} failed, group: {group}, error msg: {e}")
         else:
             status = self.start_perf(self.workload)
 
         self.deactivate_venv()
 
     def start_perf(self, workload: Dict[str, Any]) -> bool:
-        log.info(
-            "******************************************* Start to test op: {}. *******************************************".format(
-                workload["operator"]
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        if local_rank == 0:
+            log.info(
+                "******************************************* Start to test op: [{}]. *******************************************".format(
+                    workload["operator"]
+                )
             )
-        )
 
         # Initalize Output Dir and Reports
-        output_dir = os.path.abspath(
-            "reports/" + self.backend_type + "/" + workload["operator"]
-        )
+        output_dir = pathlib.Path("reports").joinpath(self.backend_type).joinpath(workload["operator"])
         os.makedirs(output_dir, exist_ok=True)
 
         op_name = workload["operator"]
@@ -168,24 +275,32 @@ def start_perf(self, workload: Dict[str, Any]) -> bool:
         else:
             raise ValueError(f"Unknown operation: {op_name.lower()}")
 
-        perf_reports = []
-        if "input_shape_list" in self.workload:
-            shape_list = self.workload["input_shape_list"]
-        else:
-            shape_list = []
-            for M, N, K in self.workload["M/N/K"]:
-                shape_list.append([[M, K], [K, N]])
+        # get input shape info
+        shape_list = parse_workload(self.workload)
+
+        # dtype list
+        dtype_list = self.workload["dtype"]
 
-        for dtype in self.workload["dtype"]:
+        for dtype in dtype_list:
             perf_reports = []
             base_report["Performance"] = {}
+
             for input_shape in shape_list:
+                """
+                input_shape could be:
+                  List[int]: single shape. cos
+                  List[List[int]]: multiple inputs. add
+                  List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
+                """
+                if local_rank == 0:
+                    log.info(f"Execute op: [{op_name.lower()}], input_shape: {input_shape}, dtype: {dtype}")
                 if isinstance(input_shape[0], int):
                     input_shape = [input_shape]
                 try:
                     reports = self.backend.perf(input_shape, dtype)
                 except Exception as e:
-                    print(e)
+                    traceback.print_exc()
+                    log.error(f"Execute op: {op_name.lower()} failed, input_shape: {input_shape}, dtype: {dtype}, error msg: {e}")
                     reports = {}
                 perf_reports.append(reports)
             base_report["Performance"] = perf_reports
@@ -202,12 +317,16 @@ def start_perf(self, workload: Dict[str, Any]) -> bool:
                 + ".json"
             )
             output_report_path = os.path.join(output_dir, output_report_path)
-            local_rank = int(os.environ.get("LOCAL_RANK", 0))
             if local_rank == 0:
-                logging.info(base_report["Performance"])
+                # logging.info(base_report["Performance"])
                 with open(output_report_path, "w") as file:
                     json.dump(base_report, file, indent=4)
-
+        if local_rank == 0:
+            log.info(
+                "******************************************* Test op: [{}] SUCCESS. *******************************************".format(
+                    workload["operator"]
+                )
+            )
         return True
 
     def get_cpu_name(self):
@@ -269,7 +388,7 @@ def activate_venv(self, hardware_type: str) -> bool:
         return True
 
     def deactivate_venv(self):
-        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
+        sys.path[:0] = self.prev_sys_path   #will also revert the added site-packages
         sys.prefix = self.real_prefix
         os.environ["PATH"] = self.old_os_path
 
diff --git a/byte_micro_perf/launch.py b/byte_micro_perf/launch.py
index 76ead1a4d..f31829180 100644
--- a/byte_micro_perf/launch.py
+++ b/byte_micro_perf/launch.py
@@ -28,11 +28,25 @@
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("lanuch")
 
+
+def parse_task(task_dir):
+    """Return the sorted, unique base names of the ``*.json`` files found under ``task_dir`` ([] if not a dir)."""
+    tasks = set()
+    if os.path.isdir(task_dir):
+        # Sub-directories are searched too; only each file's base name (extension stripped) is kept.
+        for _, _, files in os.walk(task_dir):
+            tasks.update(name.rsplit('.', 1)[0] for name in files if name.endswith(".json"))
+    return sorted(tasks)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--task", default="", help="The task going to be evaluted, refs to workloads/"
     )
+    parser.add_argument(
+        "--task_dir", default="", help="The directory of tasks going to be evaluated, e.g., set to workloads"
+    )
     parser.add_argument(
         "--hardware_type",
         default="GPU",
@@ -67,7 +81,7 @@
         for file in os.listdir("backends"):
             if not file.endswith(".py") and not file.startswith("_"):
                 print(file)
-    if args.task:
+    if args.task or args.task_dir:
         log.info("******************* Pip Package Installing *******************")
         subprocess.call(
             ["python3", "-m", "pip", "install", "pip", "--upgrade", "--quiet"]
@@ -77,8 +91,18 @@
             ["python3", "-m", "pip", "install", "-r", "requirements.txt", "--quiet"]
         )
 
-        cmd = "python3 core/perf_engine.py --hardware_type {} --task {} --vendor_path {}".format(
-            args.hardware_type, args.task, args.vendor_path
-        )
-        exit_code = subprocess.call(cmd, shell=True)
+        if args.task:
+            if args.task_dir:
+                log.warning("task and task_dir are both set, task_dir will be ignored")
+            tasks = args.task.split(',')
+        else:  # only task_dir is set (guaranteed by the enclosing `if args.task or args.task_dir`)
+            tasks = parse_task(args.task_dir)
+        log.info(f"******************* Tasks: {tasks}")
+        exit_code = 0
+        for task in tasks:
+            cmd = "python3 core/perf_engine.py --hardware_type {} --task {} --vendor_path {}".format(
+                args.hardware_type, task, args.vendor_path
+            )
+            exit_code = subprocess.call(cmd, shell=True) or exit_code  # a later success must not mask a failure
+
         sys.exit(exit_code)
diff --git a/byte_micro_perf/requirements.txt b/byte_micro_perf/requirements.txt
index 5011e26e5..9adbfddb2 100644
--- a/byte_micro_perf/requirements.txt
+++ b/byte_micro_perf/requirements.txt
@@ -11,3 +11,4 @@ fpdf
 attrs
 decorator
 typing-extensions
+pydot
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/add.json b/byte_micro_perf/workloads/add.json
index 147141b19..5885cc87a 100644
--- a/byte_micro_perf/workloads/add.json
+++ b/byte_micro_perf/workloads/add.json
@@ -1,44 +1,18 @@
 {
   "operator": "add",
   "iterations": 100,
-  "input_shape_list": [
-    [
+  "input_shape_groups": {
+    "inputs": [
       [
-        4,
-        1024,
-        1024
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
       ],
       [
-        4,
-        1024,
-        1024
-      ]
-    ],
-    [
-      [
-        16,
-        1024,
-        1024
-      ],
-      [
-        16,
-        1024,
-        1024
-      ]
-    ],
-    [
-      [
-        64,
-        1024,
-        1024
-      ],
-      [
-        64,
-        1024,
-        1024
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
       ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/allgather.json b/byte_micro_perf/workloads/allgather.json
index a268047d7..a7d0b0a6e 100644
--- a/byte_micro_perf/workloads/allgather.json
+++ b/byte_micro_perf/workloads/allgather.json
@@ -1,42 +1,14 @@
 {
   "operator": "allgather",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1024,
-      1024
-    ],
-    [
-      8,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ], 
-    [
-      128,
-      1024,
-      1024
-    ], 
-    [
-      256,
-      1024,
-      1024
-    ], 
-    [
-      512,
-      1024,
-      1024
-    ], 
-    [
-      1024,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/allreduce.json b/byte_micro_perf/workloads/allreduce.json
index cbc616e3d..d81356ccb 100644
--- a/byte_micro_perf/workloads/allreduce.json
+++ b/byte_micro_perf/workloads/allreduce.json
@@ -1,42 +1,14 @@
 {
   "operator": "allreduce",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1024,
-      1024
-    ],
-    [
-      8,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ], 
-    [
-      128, 
-      1024, 
-      1024
-    ], 
-    [
-      256, 
-      1024, 
-      1024
-    ], 
-    [
-      512, 
-      1024, 
-      1024
-    ], 
-    [
-      1024, 
-      1024, 
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/alltoall.json b/byte_micro_perf/workloads/alltoall.json
index ed53a2291..7550fa719 100644
--- a/byte_micro_perf/workloads/alltoall.json
+++ b/byte_micro_perf/workloads/alltoall.json
@@ -1,90 +1,18 @@
 {
   "operator": "alltoall",
   "iterations": 100,
-  "input_shape_list": [
-    [
+  "input_shape_groups": {
+    "inputs": [
       [
-        1024,
-        1024
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
       ],
       [
-        1024,
-        1024
-      ]
-    ],
-    [
-      [
-        8,
-        1024,
-        1024
-      ],
-      [
-        8,
-        1024,
-        1024
-      ]
-    ],
-    [
-      [
-        64,
-        1024,
-        1024
-      ],
-      [
-        64,
-        1024,
-        1024
-      ]
-    ], 
-    [
-      [
-        128,
-        1024,
-        1024
-      ],
-      [
-        128,
-        1024,
-        1024
-      ]
-    ], 
-    [
-      [
-        256,
-        1024,
-        1024
-      ],
-      [
-        256,
-        1024,
-        1024
-      ]
-    ], 
-    [
-      [
-        512,
-        1024,
-        1024
-      ],
-      [
-        512,
-        1024,
-        1024
-      ]
-    ], 
-    [
-      [
-        1024,
-        1024,
-        1024
-      ],
-      [
-        1024,
-        1024,
-        1024
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
       ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/batch_gemm.json b/byte_micro_perf/workloads/batch_gemm.json
new file mode 100644
index 000000000..13c3773e7
--- /dev/null
+++ b/byte_micro_perf/workloads/batch_gemm.json
@@ -0,0 +1,32 @@
+{
+  "operator": "batch_gemm", 
+  "iterations": 100, 
+  "input_shape_groups": [
+    {
+      "batch_size": [4, 8, 16, 32, 64, 128, 256, 512, 1024],
+      "MN": [[1, 1], [1, 1024], [1, 2048], [1, 4096]],
+      "K": [128, 256, 512]
+    },
+    {
+      "batch_size": [4, 8, 16, 32, 64, 128, 256],
+      "MN": [[1, 8192],[1, 16384], [1, 32768], [1, 65536], [1, 131072]],
+      "K": [128, 256, 512]
+    },
+    {
+      "batch_size": [1, 2, 4, 8, 16, 32],
+      "MN": [[1, 1], [1024, 1024], [2048, 2048], [4096, 4096], [8192, 8192]],
+      "K": [128, 256, 512]
+    },
+    {
+      "batch_size": [1, 2, 4],
+      "MN": [[16384, 16384], [32768, 32768], [65536, 65536], [131072, 131072]],
+      "K": [128, 256, 512]
+    }
+  ],
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half", 
+    "int8"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/broadcast.json b/byte_micro_perf/workloads/broadcast.json
index a4cbe85aa..b815360a1 100644
--- a/byte_micro_perf/workloads/broadcast.json
+++ b/byte_micro_perf/workloads/broadcast.json
@@ -1,42 +1,14 @@
 {
-  "operator": "broadcast", 
+  "operator": "broadcast",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1024,
-      1024
-    ],
-    [
-      8,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ], 
-    [
-      128,
-      1024,
-      1024
-    ], 
-    [
-      256,
-      1024,
-      1024
-    ], 
-    [
-      512,
-      1024,
-      1024
-    ], 
-    [
-      1024,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/cast.json b/byte_micro_perf/workloads/cast.json
new file mode 100644
index 000000000..07ab85dd4
--- /dev/null
+++ b/byte_micro_perf/workloads/cast.json
@@ -0,0 +1,17 @@
+{
+  "operator": "cast",
+  "iterations": 100,
+  "input_shape_groups": {
+  "inputs": [
+    [
+      [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+      [8192]
+    ]
+  ]
+},
+"dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/cos.json b/byte_micro_perf/workloads/cos.json
index 7203d99a0..62725bcac 100644
--- a/byte_micro_perf/workloads/cos.json
+++ b/byte_micro_perf/workloads/cos.json
@@ -1,23 +1,14 @@
 {
   "operator": "cos",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/device2host.json b/byte_micro_perf/workloads/device2host.json
index 4b00e8e45..3bb34dab0 100644
--- a/byte_micro_perf/workloads/device2host.json
+++ b/byte_micro_perf/workloads/device2host.json
@@ -1,40 +1,17 @@
 {
   "operator": "device2host",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1,
-      1024,
-      1024
-    ],
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ],
-    [
-      128,
-      1024,
-      1024
-    ],
-    [
-      256,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
+    "bfloat16",
     "half"
   ]
 }
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/div.json b/byte_micro_perf/workloads/div.json
new file mode 100644
index 000000000..bb55608ba
--- /dev/null
+++ b/byte_micro_perf/workloads/div.json
@@ -0,0 +1,22 @@
+{
+  "operator": "div",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ],
+      
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/exp.json b/byte_micro_perf/workloads/exp.json
index 025d03035..17d00d0fd 100644
--- a/byte_micro_perf/workloads/exp.json
+++ b/byte_micro_perf/workloads/exp.json
@@ -1,23 +1,14 @@
 {
   "operator": "exp",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/exponential.json b/byte_micro_perf/workloads/exponential.json
index d5ce4832c..967a58e95 100644
--- a/byte_micro_perf/workloads/exponential.json
+++ b/byte_micro_perf/workloads/exponential.json
@@ -1,13 +1,14 @@
 {
   "operator": "exponential",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      8,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/gather.json b/byte_micro_perf/workloads/gather.json
new file mode 100644
index 000000000..6def4c141
--- /dev/null
+++ b/byte_micro_perf/workloads/gather.json
@@ -0,0 +1,17 @@
+{
+    "operator": "gather", 
+    "iterations": 100, 
+    "input_shape_groups": {
+      "inputs": [
+        [
+          [1024], 
+          [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+        ]
+      ]
+    }, 
+    "dtype": [
+      "float32", 
+      "float16", 
+      "bfloat16"
+    ]
+  }
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/gelu.json b/byte_micro_perf/workloads/gelu.json
index 2f0984d1d..655749557 100644
--- a/byte_micro_perf/workloads/gelu.json
+++ b/byte_micro_perf/workloads/gelu.json
@@ -1,23 +1,14 @@
 {
   "operator": "gelu",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/gemm.json b/byte_micro_perf/workloads/gemm.json
index 41fac5365..1fbd9d8ff 100644
--- a/byte_micro_perf/workloads/gemm.json
+++ b/byte_micro_perf/workloads/gemm.json
@@ -1,131 +1,14 @@
 {
   "operator": "gemm",
   "iterations": 100,
-  "M/N/K": [
-    [
-      64,
-      2048,
-      65536
-    ],
-    [
-      64,
-      65536,
-      2048
-    ],
-    [
-      2048,
-      64,
-      65536
-    ],
-    [
-      2048,
-      65536,
-      64
-    ],
-    [
-      65536,
-      2048,
-      64
-    ],
-    [
-      65536,
-      64,
-      2048
-    ],
-    [
-      64,
-      2048,
-      65280
-    ],
-    [
-      64,
-      65280,
-      2048
-    ],
-    [
-      2048,
-      64,
-      65280
-    ],
-    [
-      2048,
-      65280,
-      64
-    ],
-    [
-      65280,
-      2048,
-      64
-    ],
-    [
-      65280,
-      64,
-      2048
-    ],
-    [
-      800,
-      12288,
-      1536
-    ],
-    [
-      128,
-      12288,
-      1536
-    ],
-    [
-      800,
-      1536,
-      12288
-    ],
-    [
-      128,
-      1536,
-      12288
-    ],
-    [
-      64,
-      65536,
-      64
-    ],
-    [
-      64,
-      65536,
-      65536
-    ],
-    [
-      65536,
-      65536,
-      64
-    ],
-    [
-      64,
-      64,
-      64
-    ],
-    [
-      65536,
-      64,
-      64
-    ],
-    [
-      64,
-      64,
-      65536
-    ],
-    [
-      65536,
-      64,
-      65536
-    ],
-    [
-      65536,
-      65536,
-      65536
-    ]
-  ],
+  "input_shape_groups": {
+    "M": [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+    "KN": [[1024, 1024], [16384, 1024], [16384, 32], [1024, 16384], [4096, 4096], [8192, 8192], [12288, 12288]]
+  },
   "dtype": [
     "float32",
     "bfloat16",
-    "half"
+    "half", 
+    "int8"
   ]
 }
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/gemv.json b/byte_micro_perf/workloads/gemv.json
new file mode 100644
index 000000000..dcf6a1d0a
--- /dev/null
+++ b/byte_micro_perf/workloads/gemv.json
@@ -0,0 +1,22 @@
+{
+  "operator": "gemv",
+  "iterations": 100,
+  "input_shape_groups": [
+    {
+      "M": [1],
+      "K": [16, 32, 64, 128, 256, 512],
+      "N": [4096, 8192]
+    },
+    {
+      "M": [1],
+      "K": [4096, 8192],
+      "N": [16, 32, 64, 128, 256, 512]
+    }
+  ],
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half", 
+    "int8"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/group_gemm.json b/byte_micro_perf/workloads/group_gemm.json
new file mode 100644
index 000000000..745d66a6a
--- /dev/null
+++ b/byte_micro_perf/workloads/group_gemm.json
@@ -0,0 +1,14 @@
+{
+  "operator": "group_gemm", 
+  "iterations": 100, 
+  "input_shape_groups": {
+    "gemm_group": [[1, 16, 32, 64, 128, 256, 512, 1024]],
+    "KN": [[4096, 4096], [7168, 7168], [16384, 16384]]
+  },
+  "dtype": [ 
+    "float32", 
+    "bfloat16", 
+    "half", 
+    "int8"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/host2device.json b/byte_micro_perf/workloads/host2device.json
index 417cf8d66..8982c6780 100644
--- a/byte_micro_perf/workloads/host2device.json
+++ b/byte_micro_perf/workloads/host2device.json
@@ -1,38 +1,14 @@
 {
   "operator": "host2device",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1,
-      1024,
-      1024
-    ],
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ],
-    [
-      128,
-      1024,
-      1024
-    ],
-    [
-      256,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/index_add.json b/byte_micro_perf/workloads/index_add.json
new file mode 100644
index 000000000..64744d143
--- /dev/null
+++ b/byte_micro_perf/workloads/index_add.json
@@ -0,0 +1,21 @@
+{
+  "operator": "index_add",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ],
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "half",
+    "bfloat16"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/indexadd.json b/byte_micro_perf/workloads/indexadd.json
deleted file mode 100644
index 5617d6b21..000000000
--- a/byte_micro_perf/workloads/indexadd.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-  "operator": "indexadd",
-  "iterations": 100,
-  "input_shape_list": [
-    [
-      [
-        4,
-        7168
-      ],
-      [
-        20,
-        7168
-      ]
-    ],
-    [
-      [
-        2048,
-        7168
-      ],
-      [
-        10240,
-        7168
-      ]
-    ]
-  ],
-  "dtype": [
-    "float32",
-    "half",
-    "bfloat16"
-  ]
-}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/layernorm.json b/byte_micro_perf/workloads/layernorm.json
index 801464280..87711ee2a 100644
--- a/byte_micro_perf/workloads/layernorm.json
+++ b/byte_micro_perf/workloads/layernorm.json
@@ -1,40 +1,14 @@
 {
   "operator": "layernorm",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      131072,
-      32
-    ],
-    [
-      131072,
-      64
-    ],
-    [
-      131072,
-      128
-    ],
-    [
-      131072,
-      512
-    ],
-    [
-      131072,
-      1024
-    ],
-    [
-      131072,
-      4096
-    ],
-    [
-      131072,
-      16384
-    ],
-    [
-      131072,
-      32768
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/mul.json b/byte_micro_perf/workloads/mul.json
new file mode 100644
index 000000000..c7935637c
--- /dev/null
+++ b/byte_micro_perf/workloads/mul.json
@@ -0,0 +1,22 @@
+{
+  "operator": "mul",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ],
+      
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/p2p.json b/byte_micro_perf/workloads/p2p.json
new file mode 100644
index 000000000..7d0c5310a
--- /dev/null
+++ b/byte_micro_perf/workloads/p2p.json
@@ -0,0 +1,26 @@
+{
+  "operator": "p2p",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ],
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ],
+  "group": [
+    2,
+    4,
+    8
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/reduce_max.json b/byte_micro_perf/workloads/reduce_max.json
new file mode 100644
index 000000000..ae311a3e4
--- /dev/null
+++ b/byte_micro_perf/workloads/reduce_max.json
@@ -0,0 +1,17 @@
+{
+  "operator": "reduce_max",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/reduce_min.json b/byte_micro_perf/workloads/reduce_min.json
new file mode 100644
index 000000000..7b7edb040
--- /dev/null
+++ b/byte_micro_perf/workloads/reduce_min.json
@@ -0,0 +1,17 @@
+{
+  "operator": "reduce_min",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/reduce_sum.json b/byte_micro_perf/workloads/reduce_sum.json
new file mode 100644
index 000000000..56cf77d85
--- /dev/null
+++ b/byte_micro_perf/workloads/reduce_sum.json
@@ -0,0 +1,17 @@
+{
+  "operator": "reduce_sum",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/reducescatter.json b/byte_micro_perf/workloads/reducescatter.json
index 967d76062..228f1f6d0 100644
--- a/byte_micro_perf/workloads/reducescatter.json
+++ b/byte_micro_perf/workloads/reducescatter.json
@@ -1,42 +1,14 @@
 {
   "operator": "reducescatter",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      1024,
-      1024
-    ],
-    [
-      8,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
-    ], 
-    [
-      128,
-      1024,
-      1024
-    ], 
-    [
-      256,
-      1024,
-      1024
-    ], 
-    [
-      512,
-      1024,
-      1024
-    ], 
-    [
-      1024,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608],
+        [1024]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/scatter.json b/byte_micro_perf/workloads/scatter.json
new file mode 100644
index 000000000..63b86a831
--- /dev/null
+++ b/byte_micro_perf/workloads/scatter.json
@@ -0,0 +1,17 @@
+{
+    "operator": "scatter", 
+    "iterations": 100, 
+    "input_shape_groups": {
+      "inputs": [
+        [
+          [1024], 
+          [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+        ]
+      ]
+    }, 
+    "dtype": [
+      "float32", 
+      "float16", 
+      "bfloat16"
+    ]
+  }
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/silu.json b/byte_micro_perf/workloads/silu.json
new file mode 100644
index 000000000..3770218c5
--- /dev/null
+++ b/byte_micro_perf/workloads/silu.json
@@ -0,0 +1,17 @@
+{
+  "operator": "silu",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/sin.json b/byte_micro_perf/workloads/sin.json
index 00d37b94d..bf2bacda1 100644
--- a/byte_micro_perf/workloads/sin.json
+++ b/byte_micro_perf/workloads/sin.json
@@ -1,23 +1,14 @@
 {
   "operator": "sin",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      4,
-      1024,
-      1024
-    ],
-    [
-      16,
-      1024,
-      1024
-    ],
-    [
-      64,
-      1024,
-      1024
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/softmax.json b/byte_micro_perf/workloads/softmax.json
index 1cf923d84..a90f294db 100644
--- a/byte_micro_perf/workloads/softmax.json
+++ b/byte_micro_perf/workloads/softmax.json
@@ -1,40 +1,14 @@
 {
   "operator": "softmax",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      131072,
-      32
-    ],
-    [
-      131072,
-      64
-    ],
-    [
-      131072,
-      128
-    ],
-    [
-      131072,
-      512
-    ],
-    [
-      131072,
-      1024
-    ],
-    [
-      131072,
-      4096
-    ],
-    [
-      131072,
-      16384
-    ],
-    [
-      131072,
-      32768
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/sort.json b/byte_micro_perf/workloads/sort.json
index f19dd2b25..a30222a08 100644
--- a/byte_micro_perf/workloads/sort.json
+++ b/byte_micro_perf/workloads/sort.json
@@ -1,26 +1,14 @@
 {
   "operator": "sort",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      20
-    ],
-    [
-      128
-    ],
-    [
-      1024
-    ],
-    [
-      10240
-    ],
-    [
-      61440
-    ],
-    [
-      102400
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "bfloat16",
diff --git a/byte_micro_perf/workloads/sub.json b/byte_micro_perf/workloads/sub.json
new file mode 100644
index 000000000..0b6a46c69
--- /dev/null
+++ b/byte_micro_perf/workloads/sub.json
@@ -0,0 +1,22 @@
+{
+  "operator": "sub",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ],
+      
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/swiglu.json b/byte_micro_perf/workloads/swiglu.json
new file mode 100644
index 000000000..9982a2c9a
--- /dev/null
+++ b/byte_micro_perf/workloads/swiglu.json
@@ -0,0 +1,17 @@
+{
+  "operator": "swiglu",
+  "iterations": 100,
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
+        [8192]
+      ]
+    ]
+  },
+  "dtype": [
+    "float32",
+    "bfloat16",
+    "half"
+  ]
+}
\ No newline at end of file
diff --git a/byte_micro_perf/workloads/unique.json b/byte_micro_perf/workloads/unique.json
index 452e243b0..ba88ea4ef 100644
--- a/byte_micro_perf/workloads/unique.json
+++ b/byte_micro_perf/workloads/unique.json
@@ -1,26 +1,14 @@
 {
   "operator": "unique",
   "iterations": 100,
-  "input_shape_list": [
-    [
-      20
-    ],
-    [
-      128
-    ],
-    [
-      1024
-    ],
-    [
-      10240
-    ],
-    [
-      61440
-    ],
-    [
-      102400
+  "input_shape_groups": {
+    "inputs": [
+      [
+        [1024],
+        [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
+      ]
     ]
-  ],
+  },
   "dtype": [
     "float32",
     "half",

From 8fde9d158d185a8196d32e98e52ca7c6f15ae239 Mon Sep 17 00:00:00 2001
From: "zhiwei.shangguan" <zhiwei.shangguan@iluvatar.ai>
Date: Tue, 13 Aug 2024 02:12:12 +0000
Subject: [PATCH 25/28] gemm of int8 of ix

---
 .gitignore                                    |  3 +-
 .../backends/ILUVATAR/backend_iluvatar.py     | 10 +--
 .../backends/ILUVATAR/custom_ops.py           | 75 ++++++-----------
 .../ILUVATAR/ixgemmblaslt/__init__.py         | 33 ++++++++
 .../ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp    | 82 +++++++++++++++++++
 .../ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp    | 36 ++++++++
 .../ixgemmblaslt/ixgemmblaslt_kernel.cu       | 73 +++++++++++++++++
 .../backends/ILUVATAR/ixgemmblaslt/setup.py   | 25 ++++++
 .../backends/ILUVATAR/ixgemmblaslt_demo.py    | 80 ++++++++++++++++++
 byte_micro_perf/core/perf_engine.py           |  1 +
 10 files changed, 363 insertions(+), 55 deletions(-)
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/__init__.py
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/setup.py
 create mode 100644 byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py

diff --git a/.gitignore b/.gitignore
index 2e06b0742..e899a95f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,5 @@ init_env.sh
 
 byte_infer_perf/llm_perf/download
 byte_infer_perf/llm_perf/model_zoo/sota
-byte_infer_perf/llm_perf/reports
\ No newline at end of file
+byte_infer_perf/llm_perf/reports
+byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/build_tmp/
\ No newline at end of file
diff --git a/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py b/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
index 02807ac42..ebad1ac0a 100644
--- a/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
+++ b/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py
@@ -27,7 +27,7 @@
 from backends.module_store import *
 from backends.utils import get_dtype_bytes 
 
-from backends.module_store import GemmOp, GemvOp, BatchGemmOp, GroupGemmOp
+from .custom_ops import ILUVATARGemmOp, ILUVATARBatchGemmOp, ILUVATARGroupGemmOp
 
 
 logging.basicConfig(level=logging.INFO)
@@ -165,16 +165,16 @@ def gather(self):
 
     # gemm ops
     def gemm(self):
-        self.op = GemmOp()
+        self.op = ILUVATARGemmOp()
 
     def gemv(self):
-        self.op = GemvOp()
+        self.op = ILUVATARGemmOp()
 
     def batch_gemm(self):
-        self.op = BatchGemmOp()
+        self.op = ILUVATARBatchGemmOp()
 
     def group_gemm(self):
-        self.op = GroupGemmOp()
+        self.op = ILUVATARGroupGemmOp()
 
 
     # create input tensors
diff --git a/byte_micro_perf/backends/ILUVATAR/custom_ops.py b/byte_micro_perf/backends/ILUVATAR/custom_ops.py
index 0fcb1dfb6..78bc4ba17 100644
--- a/byte_micro_perf/backends/ILUVATAR/custom_ops.py
+++ b/byte_micro_perf/backends/ILUVATAR/custom_ops.py
@@ -1,7 +1,9 @@
 from typing import List
 
 import torch
-import cutlass
+
+#import cutlass
+from .ixgemmblaslt import gemm88
 
 from backends.module_store import GemmOp, BatchGemmOp, GroupGemmOp
 
@@ -13,27 +15,14 @@ def __init__(self):
         super().__init__()
 
         try:
-            import cutlass
-            dtype = torch.int8
-            accum_dtype=torch.int32
-            self.plan = cutlass.op.Gemm(
-                alpha=1, beta=0,
-                element_A=dtype,
-                element_B=dtype,
-                element_C=accum_dtype,
-                element_D=accum_dtype,
-                layout_A=cutlass.LayoutType.RowMajor,
-                layout_B=cutlass.LayoutType.RowMajor,
-                layout_C=cutlass.LayoutType.RowMajor
-            )
-            self.op = self.plan.construct()
-            self.gemm_op_int8 = cutlass.emit.pytorch(
-                self.op, name='gemm', cc=self.plan.cc, 
-                jit=True, sourcedir='out'
-            )
+            self.blasLtIns = gemm88.gemm_init()
         except:
-            self.gemm_op_int8 = None
-            raise Exception("ILUVATARGemmOp cutlass error")
+            self.blasLtIns = None
+            raise Exception("ILUVATARGemmOp ixgemmblaslt error")
+        
+    def __del__(self):
+        if getattr(self, "blasLtIns", None) is not None:  # attribute may be unset if __init__ failed early
+            gemm88.gemm_release(self.blasLtIns)
 
     def forward(
         self, 
@@ -42,7 +31,7 @@ def forward(
     ):
         compute_dtype = input_tensor_a.dtype
         if compute_dtype == torch.int8:
-            output_tensor = self.gemm_op_int8.run(input_tensor_a, input_tensor_b)
+            output_tensor = gemm88.gemm_run(self.blasLtIns, [input_tensor_a], [input_tensor_b])[0]
         else:
             output_tensor = torch.mm(input_tensor_a, input_tensor_b)
         return output_tensor
@@ -55,9 +44,14 @@ def __init__(self):
         super().__init__()
 
         try:
-            import cutlass
+            self.blasLtIns = gemm88.gemm_init()
         except:
-            raise Exception("ILUVATARBatchGemmOp import cutlass error")
+            self.blasLtIns = None
+            raise Exception("ILUVATARBatchGemmOp import ixgemmblaslt error")
+        
+    def __del__(self):
+        if getattr(self, "blasLtIns", None) is not None:  # attribute may be unset if __init__ failed early
+            gemm88.gemm_release(self.blasLtIns)
 
     def forward(
         self, 
@@ -68,11 +62,7 @@ def forward(
 
         output_tensor = None
         if compute_dtype == torch.int8:
-            bs, m, n = input_tensor_a.shape[0], input_tensor_a.shape[1], input_tensor_b.shape[2]
-            c_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
-            output_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda")
-            plan = cutlass.op.Gemm(A=input_tensor_a, B=input_tensor_b, C=c_tensor, D=output_tensor, element_accumulator=cutlass.DataType.s32)
-            plan.run(input_tensor_a, input_tensor_b, c_tensor, output_tensor, 1, 0)
+            output_tensor = gemm88.gemm_run(self.blasLtIns, [input_tensor_a], [input_tensor_b])[0]
         else:
             output_tensor = torch.bmm(input_tensor_a, input_tensor_b)
         return output_tensor
@@ -85,27 +75,14 @@ def __init__(self):
         super().__init__()
 
         try:
-            import cutlass
-            dtype = torch.int8
-            accum_dtype=torch.int32
-            self.plan = cutlass.op.GroupedGemm(
-                alpha=1, beta=0, 
-                element_A=dtype, 
-                element_B=dtype, 
-                element_C=accum_dtype, 
-                element_D=accum_dtype, 
-                layout_A=cutlass.LayoutType.RowMajor, 
-                layout_B=cutlass.LayoutType.RowMajor, 
-                layout_C=cutlass.LayoutType.RowMajor
-            )
-            self.op = self.plan.construct()
-            self.gemm_op_int8 = cutlass.emit.pytorch(
-                self.op, name='group_gemm', cc=self.plan.cc,
-                jit=True, sourcedir='out'
-            )
+            self.blasLtIns = gemm88.gemm_init()
         except:
-            self.gemm_op_int8 = None
+            self.blasLtIns = None
             raise Exception("ILUVATARGroupGemmOp cutlass error")
+        
+    def __del__(self):
+        if getattr(self, "blasLtIns", None) is not None:  # attribute may be unset if __init__ failed early
+            gemm88.gemm_release(self.blasLtIns)
 
     def forward(self, 
         a_list : List[torch.Tensor], 
@@ -113,7 +90,7 @@ def forward(self,
     ):
         compute_dtype = a_list[0].dtype
         if compute_dtype == torch.int8:
-            output_tensors = self.gemm_op_int8.run(a_list, b_list)
+            output_tensors = gemm88.gemm_run(self.blasLtIns, a_list, b_list)
         else:
             output_tensors = [a @ b for a, b in zip(a_list, b_list)]
         return output_tensors
\ No newline at end of file
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/__init__.py b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/__init__.py
new file mode 100644
index 000000000..0ee0f95c2
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/__init__.py
@@ -0,0 +1,33 @@
+import os
+from torch.utils.cpp_extension import load as load_cplusplus
+
+print("to build ixgemmblaslt module ...")
+
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+build_dir = os.path.join(cur_dir, 'build_tmp')
+# exist_ok=True already tolerates an existing directory, so no exists() pre-check is needed.
+os.makedirs(build_dir, exist_ok=True)
+
+gemm88 = load_cplusplus(
+    name='gemm88',
+    extra_cflags=['-std=c++17',
+                ],
+    extra_cuda_cflags=['-std=c++17',
+                        #'-DCAL_TFLOPS_TEST',
+                    ],
+    sources=[os.path.join(cur_dir, f) for f in [
+        'ixgemmblaslt_kernel.cu',
+        'ixgemmblaslt.cpp',
+        ]],
+    extra_ldflags=['-lcudart',
+                    '-lcublasLt',
+                ],
+    with_cuda = True,
+    verbose = True,
+    build_directory = build_dir,
+    )
+print("build ixgemmblaslt ok")
+
+
+__all__ = ['gemm88']
+
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
new file mode 100644
index 000000000..29800cf7e
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
@@ -0,0 +1,82 @@
+
+#include "ixgemmblaslt.hpp"
+
+
+gemm_kernel_param gemm_init()
+{
+  return gemm_kernel_init();
+}
+
+void free_device(void * d_data)
+{
+  cudaFree(d_data);
+}
+
+std::vector<at::Tensor> gemm_run(gemm_kernel_param pins, std::vector<at::Tensor> &alist, std::vector<at::Tensor> &blist)
+{
+  std::vector<at::Tensor> clist(alist.size());
+
+  for (size_t i = 0; i < alist.size(); i++)
+  {
+    //int dataSize = alist[i].numel();
+    c10::IntArrayRef shape_a = alist[i].sizes();
+    c10::IntArrayRef shape_b = blist[i].sizes();
+    if(shape_a.size() == 2 && shape_b.size() == 2)
+    {
+      // 二维矩阵
+      {
+        int *d_c;
+        cudaMalloc((void **)&d_c, sizeof(int) * shape_a[0] * shape_b[1]);
+        auto options = torch::TensorOptions().dtype(torch::kInt32);
+        options.device(at::kCUDA);
+        clist[i] = torch::from_blob(d_c, {shape_a[0], shape_b[1]}, std::bind(&free_device, d_c), options);
+        clist[i] = clist[i].cuda();
+      }
+      int M = shape_a[0];
+      int N = shape_b[1];
+      int K = shape_a[1];
+      gemm_kernel_run(pins, (char *)alist[i].data_ptr(), (char *)blist[i].data_ptr(), (int32_t *)clist[i].data_ptr(), M, N, K);
+    }
+    else if (shape_a.size() == 3 && shape_b.size() == 3)
+    {
+      // 三维矩阵
+      {
+        int *d_c;
+        cudaMalloc((void **)&d_c, sizeof(int) * shape_a[0] * shape_a[1] * shape_b[2]);
+        auto options = torch::TensorOptions().dtype(torch::kInt32);
+        options.device(at::kCUDA);
+        clist[i] = torch::from_blob(d_c, {shape_a[0], shape_a[1], shape_b[2]}, std::bind(&free_device, d_c), options);
+        clist[i] = clist[i].cuda();
+      }
+      for (size_t j = 0; j < shape_a[0]; j++)
+      {
+        int M = shape_a[1];
+        int N = shape_b[2];
+        int K = shape_a[2];
+        gemm_kernel_run(pins, (char *)alist[i][j].data_ptr(), (char *)blist[i][j].data_ptr(), (int32_t *)clist[i][j].data_ptr(), M, N, K);
+      }
+    }
+    else
+    {
+      std::cout << "tensor shapes are illegal" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  cudaDeviceSynchronize();
+  return clist;
+}
+
+void gemm_release(gemm_kernel_param ins)
+{
+  gemm_kernel_release(ins);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  py::class_<gemm_kernel_param>(m, "gemm_kernel_param")
+        .def_readwrite("lt_handle", &gemm_kernel_param::lt_handle)
+        .def_readwrite("op_desc", &gemm_kernel_param::op_desc);
+  m.def("gemm_init", &gemm_init, "");
+  m.def("gemm_run", &gemm_run, "");
+  m.def("gemm_release", &gemm_release, "");
+}
+
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
new file mode 100644
index 000000000..38d79d954
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
@@ -0,0 +1,36 @@
+#ifndef IXGEMMBLASLT_HPP
+#define IXGEMMBLASLT_HPP
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cublasLt.h>
+#include <torch/extension.h>
+#include <pybind11/stl.h>
+
+#define checkBlasStatus(status)                                               \
+  do                                                                          \
+  {                                                                           \
+    if (status != CUBLAS_STATUS_SUCCESS)                                      \
+    {                                                                         \
+      std::cout << "cublasLt API failed with status " << status << std::endl; \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  } while (0)
+
+struct gemm_kernel_param
+{
+    gemm_kernel_param(){
+
+    }
+  uintptr_t lt_handle;
+  uintptr_t op_desc;
+};
+
+
+gemm_kernel_param gemm_kernel_init();
+
+void gemm_kernel_run(gemm_kernel_param pins, char *d_A, char *d_B, int32_t *d_C, const int M, const int N, const int K);
+
+void gemm_kernel_release(gemm_kernel_param pins);
+
+#endif // !IXGEMMBLASLT_HPP
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
new file mode 100644
index 000000000..79201e997
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
@@ -0,0 +1,83 @@
+#include <iostream>
+#include "ixgemmblaslt.hpp"
+
+//#define CAL_TFLOPS_TEST
+
+gemm_kernel_param gemm_kernel_init()
+{
+  cublasLtHandle_t lt_handle = nullptr;
+  checkBlasStatus(cublasLtCreate(&(lt_handle)));
+
+  cublasLtMatmulDesc_t op_desc = nullptr;
+#ifdef __ILUVATAR__
+    cudaDataType compute_type = CUDA_R_32I;
+#else
+    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I;
+#endif
+  cudaDataType scale_type = CUDA_R_32I;
+  cublasOperation_t op_trans_a = CUBLAS_OP_N;
+  cublasOperation_t op_trans_b = CUBLAS_OP_N;
+#ifdef __ILUVATAR__
+    checkBlasStatus(cublasLtMatmulDescCreate(&op_desc, compute_type));
+#else
+    checkBlasStatus(cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type));
+#endif
+  checkBlasStatus(cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_type, sizeof(scale_type)));
+  checkBlasStatus(cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &op_trans_a, sizeof(op_trans_a)));
+  checkBlasStatus(cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &op_trans_b, sizeof(op_trans_b)));
+
+  gemm_kernel_param ins;
+  ins.lt_handle = reinterpret_cast<uintptr_t>(lt_handle);
+  ins.op_desc = reinterpret_cast<uintptr_t>(op_desc);
+
+  return ins;
+}
+
+void gemm_kernel_run(gemm_kernel_param ins, char *d_A, char *d_B, int32_t *d_C, const int M, const int N, const int K)
+{
+  int alpha_int8 = 1;
+  int beta_int8 = 0;
+  cudaDataType ab_type = CUDA_R_8I;
+  cudaDataType c_type = CUDA_R_32I;
+  cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr;
+
+  cublasLtHandle_t lt_handle = reinterpret_cast<cublasLtHandle_t>(reinterpret_cast<uintptr_t *>(ins.lt_handle));
+  cublasLtMatmulDesc_t op_desc = reinterpret_cast<cublasLtMatmulDesc_t>(reinterpret_cast<uintptr_t *>(ins.op_desc));
+
+  checkBlasStatus(cublasLtMatrixLayoutCreate(&a_desc, ab_type, K, M, K));
+  checkBlasStatus(cublasLtMatrixLayoutCreate(&b_desc, ab_type, N, K, N));
+  checkBlasStatus(cublasLtMatrixLayoutCreate(&c_desc, c_type, N, M, N));
+
+#ifdef CAL_TFLOPS_TEST
+  cudaDeviceSynchronize();
+  auto start = std::chrono::steady_clock::now();
+#endif
+
+  checkBlasStatus(cublasLtMatmul(lt_handle, op_desc, &alpha_int8, d_B, b_desc, d_A, a_desc, &beta_int8, d_C, c_desc, d_C, c_desc, nullptr, nullptr, 0, nullptr));
+
+#ifdef CAL_TFLOPS_TEST
+  cudaDeviceSynchronize();  // replaces the deprecated cudaThreadSynchronize(); matches the call before `start`
+  auto stop = std::chrono::steady_clock::now();
+
+  std::chrono::duration<double, std::milli> dur_ms = stop - start;
+  double elapse = dur_ms.count();
+  double tflops = 1e-9 * 2.0f * M * N * K;
+  printf("\n---------------elapse: %lf ms, TOPs: %lf\n\n", elapse, tflops / elapse);
+#endif
+
+  // The three layout descriptors are created on every call; destroy them so
+  // repeated GEMMs do not leak. The matmul has already been issued above.
+  checkBlasStatus(cublasLtMatrixLayoutDestroy(c_desc));
+  checkBlasStatus(cublasLtMatrixLayoutDestroy(b_desc));
+  checkBlasStatus(cublasLtMatrixLayoutDestroy(a_desc));
+}
+
+// Releases what gemm_kernel_init() created: the matmul descriptor and the cublasLt handle.
+void gemm_kernel_release(gemm_kernel_param ins)
+{
+  cublasLtHandle_t lt_handle = reinterpret_cast<cublasLtHandle_t>(reinterpret_cast<uintptr_t *>(ins.lt_handle));
+  cublasLtMatmulDesc_t op_desc = reinterpret_cast<cublasLtMatmulDesc_t>(reinterpret_cast<uintptr_t *>(ins.op_desc));
+  // Return codes are ignored on purpose: teardown stays best-effort, as before.
+  cublasLtMatmulDescDestroy(op_desc);
+  cublasLtDestroy(lt_handle);
+}
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/setup.py b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/setup.py
new file mode 100644
index 000000000..f5c6af3de
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/setup.py
@@ -0,0 +1,25 @@
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name='ixgemmblaslt',
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension('ixgemmblaslt', [
+            'ixgemmblaslt.cpp',
+            'ixgemmblaslt_kernel.cu',
+        ],
+        include_dirs=[],
+        #define_macros=[('CAL_TFLOPS_TEST', 1)],
+        extra_compile_args={
+            'cxx': ['-std=c++17'],
+            'clang++': ['-std=c++17', ],
+        },
+        libraries=['cudart', 'cublasLt']
+        ),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    })
+
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py
new file mode 100644
index 000000000..967550ef3
--- /dev/null
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py
@@ -0,0 +1,80 @@
+import torch
+import numpy as np
+import time
+
+from ixgemmblaslt import gemm88
+
+shape_0 = 10
+shape_m = 3072
+shape_n = 4096
+shape_k = 30176
+# shape_0 = 1
+# shape_m = 1
+# shape_n = 4
+# shape_k = 4
+
+np.random.seed(int(time.time()))
+
+for kk in range(0, 3):
+    alist = []
+    blist = []
+    clist = []
+
+    for ii in range(0,3):
+        begini = 1
+        endi = (2 * ii + 2) % 100 + 7
+        #arr1 = np.random.randint(begini, endi, (shape_0, shape_m, shape_k))
+        arr1 = np.random.randint(begini, endi, (shape_m, shape_k))
+        t1 = torch.from_numpy(arr1).to(torch.int8).to("cuda")
+        alist.append(t1)
+
+        #arr2 = np.random.randint(begini, endi, (shape_0, shape_k, shape_n))
+        arr2 = np.random.randint(begini, endi, (shape_k, shape_n))
+        t2 = torch.from_numpy(arr2).to(torch.int8).to("cuda")
+        blist.append(t2)
+
+    blasLtIns = gemm88.gemm_init()
+
+    begin_t = int(time.time() * 1000)
+    clist11 = gemm88.gemm_run(blasLtIns, alist, blist)
+    end_t = int(time.time() * 1000)
+
+    #print("clist11:", clist11)
+
+    gemm88.gemm_release(blasLtIns)
+
+    alist2 = []
+    blist2 = []
+    clist2 = []
+    for a, b in zip(alist, blist):
+        a2 = a.clone().to(torch.float32)
+        alist2.append(a2)
+        b2 = b.clone().to(torch.float32)
+        blist2.append(b2)
+        c_shape = (a2.shape[0], b2.shape[1])
+        zeros_tensor = torch.zeros(c_shape).to("cuda").to(torch.float32)
+        clist2.append(zeros_tensor)
+
+    begin_t2 = int(time.time() * 1000)
+    #clist2 = [a @ b for a, b in zip(alist2, blist2)]
+    clist2 = [torch.matmul(a,b) for a, b in zip(alist2, blist2)]
+    end_t2 = int(time.time() * 1000)
+    #print("clist2:", clist2)
+
+    print("cost time:", end_t - begin_t, "; ", end_t2 - begin_t2)
+
+
+    ball = True
+    
+    for c, c2 in zip(clist11, clist2):
+        c1 = c.to(torch.float32)
+        ball = torch.allclose(c1, c2, rtol=1e-4, atol=1e-6)
+        if not ball:
+            break
+
+    print("\n")
+    if ball:
+        print("***all is ok***")
+    else:
+        print("??? not all is ok !!!")
+
diff --git a/byte_micro_perf/core/perf_engine.py b/byte_micro_perf/core/perf_engine.py
index e2774c821..258416079 100644
--- a/byte_micro_perf/core/perf_engine.py
+++ b/byte_micro_perf/core/perf_engine.py
@@ -247,6 +247,7 @@ def start_engine(self) -> None:
             status = self.start_perf(self.workload)
 
         self.deactivate_venv()
+        del self.backend
 
     def start_perf(self, workload: Dict[str, Any]) -> bool:
         local_rank = int(os.environ.get("LOCAL_RANK", 0))

From 4664d8c0444468be62e4bb674e4b476ef5ff5b7d Mon Sep 17 00:00:00 2001
From: "zhiwei.shangguan" <zhiwei.shangguan@iluvatar.ai>
Date: Wed, 14 Aug 2024 03:15:28 +0000
Subject: [PATCH 26/28] gemm of int8 of ix  --2

---
 .../ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp    | 16 ++++++-------
 .../ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp    |  2 +-
 .../ixgemmblaslt/ixgemmblaslt_kernel.cu       | 12 ++++++----
 .../backends/ILUVATAR/ixgemmblaslt_demo.py    | 24 +++++++++----------
 4 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
index 29800cf7e..53d9d3d16 100644
--- a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.cpp
@@ -25,9 +25,9 @@ std::vector<at::Tensor> gemm_run(gemm_kernel_param pins, std::vector<at::Tensor>
     {
       // 二维矩阵
       {
-        int *d_c;
-        cudaMalloc((void **)&d_c, sizeof(int) * shape_a[0] * shape_b[1]);
-        auto options = torch::TensorOptions().dtype(torch::kInt32);
+        char *d_c;
+        cudaMalloc((void **)&d_c, sizeof(char) * shape_a[0] * shape_b[1]);
+        auto options = torch::TensorOptions().dtype(torch::kInt8);
         options.device(at::kCUDA);
         clist[i] = torch::from_blob(d_c, {shape_a[0], shape_b[1]}, std::bind(&free_device, d_c), options);
         clist[i] = clist[i].cuda();
@@ -35,15 +35,15 @@ std::vector<at::Tensor> gemm_run(gemm_kernel_param pins, std::vector<at::Tensor>
       int M = shape_a[0];
       int N = shape_b[1];
       int K = shape_a[1];
-      gemm_kernel_run(pins, (char *)alist[i].data_ptr(), (char *)blist[i].data_ptr(), (int32_t *)clist[i].data_ptr(), M, N, K);
+      gemm_kernel_run(pins, (char *)alist[i].data_ptr(), (char *)blist[i].data_ptr(), (char *)clist[i].data_ptr(), M, N, K);
     }
     else if (shape_a.size() == 3 && shape_b.size() == 3)
     {
       // 三维矩阵
       {
-        int *d_c;
-        cudaMalloc((void **)&d_c, sizeof(int) * shape_a[0] * shape_a[1] * shape_b[2]);
-        auto options = torch::TensorOptions().dtype(torch::kInt32);
+        char *d_c;
+        cudaMalloc((void **)&d_c, sizeof(char) * shape_a[0] * shape_a[1] * shape_b[2]);
+        auto options = torch::TensorOptions().dtype(torch::kInt8);
         options.device(at::kCUDA);
         clist[i] = torch::from_blob(d_c, {shape_a[0], shape_a[1], shape_b[2]}, std::bind(&free_device, d_c), options);
         clist[i] = clist[i].cuda();
@@ -53,7 +53,7 @@ std::vector<at::Tensor> gemm_run(gemm_kernel_param pins, std::vector<at::Tensor>
         int M = shape_a[1];
         int N = shape_b[2];
         int K = shape_a[2];
-        gemm_kernel_run(pins, (char *)alist[i][j].data_ptr(), (char *)blist[i][j].data_ptr(), (int32_t *)clist[i][j].data_ptr(), M, N, K);
+        gemm_kernel_run(pins, (char *)alist[i][j].data_ptr(), (char *)blist[i][j].data_ptr(), (char *)clist[i][j].data_ptr(), M, N, K);
       }
     }
     else
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
index 38d79d954..4c4da4b46 100644
--- a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt.hpp
@@ -29,7 +29,7 @@ struct gemm_kernel_param
 
 gemm_kernel_param gemm_kernel_init();
 
-void gemm_kernel_run(gemm_kernel_param pins, char *d_A, char *d_B, int32_t *d_C, const int M, const int N, const int K);
+void gemm_kernel_run(gemm_kernel_param pins, char *d_A, char *d_B, char *d_C, const int M, const int N, const int K);
 
 void gemm_kernel_release(gemm_kernel_param pins);
 
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
index 79201e997..075fa80b3 100644
--- a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt/ixgemmblaslt_kernel.cu
@@ -14,7 +14,8 @@ gemm_kernel_param gemm_kernel_init()
 #else
     cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I;
 #endif
-  cudaDataType scale_type = CUDA_R_32I;
+  //cudaDataType scale_type = CUDA_R_32I;
+  cudaDataType scale_type = CUDA_R_32F;
   cublasOperation_t op_trans_a = CUBLAS_OP_N;
   cublasOperation_t op_trans_b = CUBLAS_OP_N;
 #ifdef __ILUVATAR__
@@ -33,12 +34,13 @@ gemm_kernel_param gemm_kernel_init()
   return ins;
 }
 
-void gemm_kernel_run(gemm_kernel_param ins, char *d_A, char *d_B, int32_t *d_C, const int M, const int N, const int K)
+void gemm_kernel_run(gemm_kernel_param ins, char *d_A, char *d_B, char *d_C, const int M, const int N, const int K)
 {
-  int alpha_int8 = 1;
-  int beta_int8 = 0;
+  float alpha_int8 = 1.0;
+  float beta_int8 = 0.0;
   cudaDataType ab_type = CUDA_R_8I;
-  cudaDataType c_type = CUDA_R_32I;
+  //cudaDataType c_type = CUDA_R_32I;
+  cudaDataType c_type = CUDA_R_8I;
   cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr;
 
   cublasLtHandle_t lt_handle = reinterpret_cast<cublasLtHandle_t>(reinterpret_cast<uintptr_t *>(ins.lt_handle));
diff --git a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py
index 967550ef3..f95447961 100644
--- a/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py
+++ b/byte_micro_perf/backends/ILUVATAR/ixgemmblaslt_demo.py
@@ -4,25 +4,25 @@
 
 from ixgemmblaslt import gemm88
 
-shape_0 = 10
-shape_m = 3072
-shape_n = 4096
-shape_k = 30176
-# shape_0 = 1
-# shape_m = 1
-# shape_n = 4
-# shape_k = 4
+#shape_0 = 1
+# shape_m = 3072
+# shape_n = 4096
+# shape_k = 30176
+
+shape_0 = 1
+shape_m = 4
+shape_n = 8
+shape_k = 4
 
 np.random.seed(int(time.time()))
 
-for kk in range(0, 3):
+for kk in range(0, 2):
     alist = []
     blist = []
     clist = []
-
+    begini = -5
+    endi = 5
     for ii in range(0,3):
-        begini = 1
-        endi = (2 * ii + 2) % 100 + 7
         #arr1 = np.random.randint(begini, endi, (shape_0, shape_m, shape_k))
         arr1 = np.random.randint(begini, endi, (shape_m, shape_k))
         t1 = torch.from_numpy(arr1).to(torch.int8).to("cuda")

From 2bb14bdc0299cd2b49d550eb2caf447a2f224171 Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.com>
Date: Tue, 20 Aug 2024 10:33:04 +0000
Subject: [PATCH 27/28] update-0820

---
 .../backends/ILUVATAR/README.zh_CN.md         | 708 ++++++++++--------
 .../backends/ILUVATAR/onnx2torch/CITATION.cff |  24 +
 .../backends/ILUVATAR/onnx2torch/LICENSE      | 201 +++++
 .../backends/ILUVATAR/onnx2torch/MANIFEST.in  |   1 +
 .../backends/ILUVATAR/onnx2torch/README.md    | 290 +++++++
 .../assets/logo/onnx2torch_dark.png           | Bin 0 -> 44573 bytes
 .../assets/logo/onnx2torch_light.png          | Bin 0 -> 44759 bytes
 .../onnx2torch/onnx2torch/__init__.py         |   1 +
 .../onnx2torch/onnx2torch/converter.py        | 175 +++++
 .../onnx2torch/node_converters/__init__.py    |  63 ++
 .../onnx2torch/node_converters/activations.py | 243 ++++++
 .../onnx2torch/node_converters/argmax.py      |  44 ++
 .../node_converters/average_pool.py           |  61 ++
 .../node_converters/base_element_wise.py      |  36 +
 .../onnx2torch/node_converters/batch_norm.py  | 102 +++
 .../node_converters/binary_math_operations.py |  81 ++
 .../onnx2torch/node_converters/cast.py        |  54 ++
 .../onnx2torch/node_converters/clip.py        |  84 +++
 .../onnx2torch/node_converters/comparisons.py |  48 ++
 .../onnx2torch/node_converters/concat.py      |  37 +
 .../onnx2torch/node_converters/constant.py    |  63 ++
 .../node_converters/constant_of_shape.py      |  54 ++
 .../onnx2torch/node_converters/conv.py        |  97 +++
 .../onnx2torch/node_converters/cumsum.py      |  88 +++
 .../node_converters/depth_to_space.py         |  37 +
 .../onnx2torch/node_converters/dropout.py     |  54 ++
 .../onnx2torch/node_converters/einsum.py      |  32 +
 .../onnx2torch/node_converters/expand.py      |  38 +
 .../onnx2torch/node_converters/eye_like.py    |  67 ++
 .../onnx2torch/node_converters/flatten.py     |  39 +
 .../onnx2torch/node_converters/functions.py   |  60 ++
 .../onnx2torch/node_converters/gather.py      | 171 +++++
 .../onnx2torch/node_converters/gemm.py        |  98 +++
 .../node_converters/global_average_pool.py    |  62 ++
 .../onnx2torch/node_converters/identity.py    |  29 +
 .../node_converters/instance_norm.py          |  88 +++
 .../onnx2torch/node_converters/isinf.py       |  33 +
 .../onnx2torch/node_converters/isnan.py       |  33 +
 .../onnx2torch/node_converters/layer_norm.py  |  78 ++
 .../onnx2torch/node_converters/logical.py     |  79 ++
 .../onnx2torch/node_converters/lrn.py         |  23 +
 .../onnx2torch/node_converters/matmul.py      |  28 +
 .../onnx2torch/node_converters/max_pool.py    |  65 ++
 .../onnx2torch/node_converters/mean.py        |  36 +
 .../onnx2torch/node_converters/min_max.py     |  43 ++
 .../onnx2torch/node_converters/mod.py         |  36 +
 .../onnx2torch/node_converters/neg.py         |  28 +
 .../onnx2torch/node_converters/nms.py         | 121 +++
 .../onnx2torch/node_converters/nonzero.py     |  33 +
 .../onnx2torch/node_converters/pad.py         | 141 ++++
 .../onnx2torch/node_converters/pow.py         |  64 ++
 .../node_converters/random_normal_like.py     |  59 ++
 .../onnx2torch/node_converters/range.py       |  66 ++
 .../onnx2torch/node_converters/reciprocal.py  |  28 +
 .../onnx2torch/node_converters/reduce.py      | 276 +++++++
 .../onnx2torch/node_converters/registry.py    |  71 ++
 .../onnx2torch/node_converters/reshape.py     |  49 ++
 .../onnx2torch/node_converters/resize.py      | 171 +++++
 .../onnx2torch/node_converters/roialign.py    | 145 ++++
 .../onnx2torch/node_converters/roundings.py   |  40 +
 .../onnx2torch/node_converters/scatter_nd.py  |  97 +++
 .../onnx2torch/node_converters/shape.py       |  66 ++
 .../onnx2torch/node_converters/slice.py       | 125 ++++
 .../onnx2torch/node_converters/split.py       |  78 ++
 .../onnx2torch/node_converters/squeeze.py     | 100 +++
 .../onnx2torch/node_converters/sum.py         |  35 +
 .../onnx2torch/node_converters/tile.py        |  36 +
 .../onnx2torch/node_converters/topk.py        |  55 ++
 .../onnx2torch/node_converters/transpose.py   |  50 ++
 .../onnx2torch/node_converters/trilu.py       |  46 ++
 .../onnx2torch/node_converters/unsqueeze.py   |  86 +++
 .../onnx2torch/node_converters/where.py       |  32 +
 .../onnx2torch/onnx2torch/onnx_graph.py       | 117 +++
 .../onnx2torch/onnx2torch/onnx_node.py        |  77 ++
 .../onnx2torch/onnx2torch/onnx_tensor.py      |  33 +
 .../onnx2torch/onnx2torch/utils/__init__.py   |   0
 .../onnx2torch/onnx2torch/utils/common.py     |  85 +++
 .../onnx2torch/utils/custom_export_to_onnx.py | 100 +++
 .../onnx2torch/onnx2torch/utils/dtype.py      |  87 +++
 .../onnx2torch/onnx2torch/utils/indices.py    |  30 +
 .../onnx2torch/onnx2torch/utils/padding.py    |  35 +
 .../onnx2torch/utils/safe_shape_inference.py  |  46 ++
 .../backends/ILUVATAR/onnx2torch/operators.md | 178 +++++
 .../ILUVATAR/onnx2torch/pyproject.toml        | 109 +++
 .../backends/ILUVATAR/onnx2torch/setup.py     |  26 +
 .../ILUVATAR/onnx2torch/tests/__init__.py     |   9 +
 .../onnx2torch/tests/models/README.md         |  21 +
 .../onnx2torch/tests/models/__init__.py       |   0
 .../onnx2torch/tests/models/models_test.py    | 226 ++++++
 .../tests/models/test_clip_text_encoder.py    |  39 +
 .../models/test_clip_text_encoder_half.py     |  55 ++
 .../tests/node_converters/__init__.py         |   0
 .../tests/node_converters/activations_test.py | 121 +++
 .../average_pool_max_pool_test.py             |  99 +++
 .../tests/node_converters/batch_norm_test.py  |  65 ++
 .../node_converters/binary_operations_test.py |  60 ++
 .../tests/node_converters/clip_test.py        |  61 ++
 .../tests/node_converters/comparisons_test.py |  63 ++
 .../tests/node_converters/concat_test.py      |  51 ++
 .../node_converters/constant_of_shape_test.py |  44 ++
 .../tests/node_converters/constant_test.py    |  38 +
 .../tests/node_converters/conv_test.py        | 191 +++++
 .../tests/node_converters/cumsum_test.py      |  68 ++
 .../node_converters/depth_to_space_test.py    |  42 ++
 .../tests/node_converters/dropout_test.py     |  59 ++
 .../tests/node_converters/einsum_test.py      |  51 ++
 .../tests/node_converters/expand_test.py      |  52 ++
 .../tests/node_converters/eye_like_test.py    |  31 +
 .../tests/node_converters/flatten_test.py     |  30 +
 .../tests/node_converters/gather_test.py      | 104 +++
 .../tests/node_converters/gemm_test.py        | 129 ++++
 .../node_converters/global_avg_pool_test.py   |  34 +
 .../node_converters/instance_norm_test.py     |  47 ++
 .../tests/node_converters/layer_norm_test.py  |  76 ++
 .../tests/node_converters/logical_test.py     |  57 ++
 .../tests/node_converters/lrn_test.py         |  37 +
 .../tests/node_converters/matmul_test.py      |  40 +
 .../tests/node_converters/mean_test.py        |  47 ++
 .../tests/node_converters/min_max_test.py     |  56 ++
 .../tests/node_converters/mod_test.py         |  42 ++
 .../tests/node_converters/neg_test.py         |  26 +
 .../tests/node_converters/nms_test.py         | 177 +++++
 .../tests/node_converters/pad_test.py         |  71 ++
 .../tests/node_converters/pow_test.py         |  54 ++
 .../tests/node_converters/range_test.py       |  52 ++
 .../tests/node_converters/reciprocal_test.py  |  27 +
 .../tests/node_converters/reduce_test.py      | 175 +++++
 .../tests/node_converters/reshape_test.py     |  54 ++
 .../tests/node_converters/resize_test.py      | 140 ++++
 .../tests/node_converters/roialign_test.py    | 218 ++++++
 .../tests/node_converters/scatter_nd_test.py  |  77 ++
 .../tests/node_converters/shape_test.py       |  43 ++
 .../tests/node_converters/slice_test.py       |  79 ++
 .../tests/node_converters/split_test.py       |  89 +++
 .../tests/node_converters/squeeze_test.py     |  90 +++
 .../tests/node_converters/sum_test.py         |  47 ++
 .../tests/node_converters/test_functions.py   |  83 ++
 .../tests/node_converters/tile_test.py        |  51 ++
 .../tests/node_converters/topk_test.py        |  44 ++
 .../tests/node_converters/transpose_test.py   |  36 +
 .../tests/node_converters/unsqueeze_test.py   |  82 ++
 .../tests/node_converters/where_test.py       |  59 ++
 .../ILUVATAR/onnx2torch/tests/pytest.ini      |   8 +
 .../onnx2torch/tests/utils/__init__.py        |   0
 .../ILUVATAR/onnx2torch/tests/utils/common.py | 302 ++++++++
 .../onnx2torch/tests/utils/resources.py       |  71 ++
 .../backends/ILUVATAR/requirements.txt        |  36 +-
 .../ILUVATAR/runtime_backend_iluvatar.py      |   6 +-
 vendor_zoo/Iluvatar/BI-V150-PCIe.json         |  49 ++
 149 files changed, 10862 insertions(+), 334 deletions(-)
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/CITATION.cff
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/LICENSE
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/MANIFEST.in
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/README.md
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_dark.png
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_light.png
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/converter.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/activations.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/argmax.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/average_pool.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/base_element_wise.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/batch_norm.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/binary_math_operations.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cast.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/clip.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/comparisons.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/concat.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant_of_shape.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/conv.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cumsum.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/depth_to_space.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/dropout.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/einsum.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/expand.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/eye_like.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/flatten.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/functions.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gather.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gemm.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/global_average_pool.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/identity.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/instance_norm.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isinf.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isnan.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/layer_norm.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/logical.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/lrn.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/matmul.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/max_pool.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mean.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/min_max.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mod.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/neg.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nms.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nonzero.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pad.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pow.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/random_normal_like.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/range.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reciprocal.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reduce.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/registry.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reshape.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/resize.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roialign.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roundings.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/scatter_nd.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/shape.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/slice.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/split.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/squeeze.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/sum.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/tile.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/topk.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/transpose.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/trilu.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/unsqueeze.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/where.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_graph.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_node.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_tensor.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/common.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/custom_export_to_onnx.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/dtype.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/indices.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/padding.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/safe_shape_inference.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/operators.md
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/pyproject.toml
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/setup.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/README.md
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/models_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder_half.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/activations_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/average_pool_max_pool_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/batch_norm_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/binary_operations_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/clip_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/comparisons_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/concat_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_of_shape_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/conv_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/cumsum_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/depth_to_space_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/dropout_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/einsum_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/expand_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/eye_like_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/flatten_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gather_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gemm_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/global_avg_pool_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/instance_norm_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/layer_norm_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/logical_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/lrn_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/matmul_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mean_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/min_max_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mod_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/neg_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/nms_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pad_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pow_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/range_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reciprocal_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reduce_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reshape_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/resize_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/roialign_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/scatter_nd_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/shape_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/slice_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/split_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/squeeze_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/sum_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/test_functions.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/tile_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/topk_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/transpose_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/unsqueeze_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/where_test.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/pytest.ini
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/__init__.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/common.py
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/resources.py
 create mode 100644 vendor_zoo/Iluvatar/BI-V150-PCIe.json

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
index 402e2dcec..72ee47b9a 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md
@@ -1,319 +1,389 @@
-"""
-    ****************************************操作说明*********************************
-    如果不想跑CPU端的性能、精度、数值指标对比，可以直接执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）
-             如果模型提供了pt、pb格式的优先选择torch的配置进行测试；
-             如果执行整个pipeline，需要执行：python3 lauch.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）（跑cpu结果会很耗时）
-
-    功能实现：
-        1、pt、pb模型转换在compile模块预处理过程中实现；
-        2、在天数智芯BI-150显卡上，调用推理引擎tensorrt进行推理，一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化；
-    
-    环境准备：
-        1、sdk版本： 由天数智芯工程师提供
-        2、ixrt版本：由天数智芯工程师提供
-"""
-
-
-"""
-    ***************************11个小模型的测试与测试报告生成的操作方法****************************
-    整个代码运行过程中，主要是从workloads目录下加载对应的模型的配置，主要有test_perf、test_accuracy、test_numeric三项测试内容，用户可以根据自己的需要选择开启与否；
-    一般情况下采用字节默认的配置项即可；需要特别修改的配置下面会进行说明
-
-    输出性能文档里面涉及的字段说明：
-        1、QPS、AVG Latency、P99 Latency：这3个指标是走字节框架，采用天数智芯的推理引擎IxRT会计算H2D、D2H的时间，也就是数据在不同的设备（CPU、GPU）之间传输耗时；
-        2、predict QPS、predict AVG Latency、predict P99 Latency：这部分指标把上面一步计算H2D、D2H的耗时剔除出去了，因此可以看做纯推理耗时，这个耗时可以与利用
-           ixerexec命令跑出来的结果做一定的对比，但是不一定完全对齐，因为走整个框架代码肯定会导致一部分性能损失
-
-    数据集、模型准备：
-        cd ByteMLPerf/byte_infer_perf/general_perf
-
-        bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad
-        bash general_perf/prepare_model_and_dataset.sh resnet50-torch-fp32 open_imagenet
-        bash general_perf/prepare_model_and_dataset.sh widedeep-tf-fp32 open_criteo_kaggle
-        bash general_perf/prepare_model_and_dataset.sh albert-torch-fp32
-        bash general_perf/prepare_model_and_dataset.sh roformer-tf-fp32 open_cail2019
-        bash general_perf/prepare_model_and_dataset.sh videobert-onnx-fp32 open_cifar
-        bash general_perf/prepare_model_and_dataset.sh yolov5-onnx-fp32 
-        bash general_perf/prepare_model_and_dataset.sh conformer-encoder-onnx-fp32
-        bash general_perf/prepare_model_and_dataset.sh roberta-torch-fp32
-        bash general_perf/prepare_model_and_dataset.sh deberta-torch-fp32 
-        bash general_perf/prepare_model_and_dataset.sh swin-large-torch-fp32
-        bash general_perf/prepare_model_and_dataset.sh gpt2-torch-fp32 
-
-        上面的模型与数据集下载完毕后会生成在：general_perf/general_perf，需要把该目录在的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面
-        如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可；
-
-
-    测试开始：
-
-    cd ByteMLPerf/byte_infer_perf
-
-    备注：由于sftp机器崩溃，文件全部丢失，因此已有的获取数据方式可能不存在了
-
-    1、bert模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32/
-
-    2、albert模型：
-        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
-        
-        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果链接不上用ip替换：10.160.20.60）  密码：123..com
-                 get /upload/3-app/byteperf/madlag.tar
-                 tar -zxvf madlag.tar
-                 exit
-
-        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
-        AutoTokenizer.from_pretrained("madlag/albert-base-v2-squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/madlag/albert-base-v2-squad")  (注意绝对路径根据实际情况修改，需要在ByteMLPerf前面在加一个当前目录最上层的路径，下同)
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/albert-torch-fp32/
-
-    3、debert模型：
-        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
-
-        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果链接不上用ip替换：10.160.20.60）  密码：123..com
-                 get /upload/3-app/byteperf/Palak.tar
-                 tar -zxvf Palak.tar
-                 exit
-
-        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
-        AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/Palak/microsoft_deberta-base_squad")
-
-        给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-sim-drop-clip-drop-invaild-cast.onnx
-        将其放到：general_perf/model_zoo/popular/open_deberta/ 目录下；
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd yudefu/bytedance_perf ; get deberta-sim-drop-clip-drop-invaild-cast.onnx
-                 exit
-        
-        移动：mv deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/deberta-torch-fp32/
-
-    4、roberta模型：
-        测试过程中如果从huggingface网址不能下载文件，可以按照下面的操作进行下载
-
-        下载方式：sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn（如果链接不上用ip替换：10.160.20.60）  密码：123..com
-                 get /upload/3-app/byteperf/csarron.tar
-                 tar -zxvf csarron.tar
-                 exit
-
-        接着修改代码：ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
-        AutoTokenizer.from_pretrained("csarron/roberta-base-squad-v1") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/csarron/roberta-base-squad-v1")
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/roberta-torch-fp32/
-
-    5、videobert模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/videobert-onnx-fp32
-    
-    6、widedeep模型：
-        该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape_new.onnx；
-        将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd yudefu/bytedance_perf ; get widedeep_dynamicshape_new.onnx
-                 exit
-        
-        移动：mv widedeep_dynamicshape_new.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
-        
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
-
-    7、swin-transformer模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/swin-large-torch-fp32
-
-    8、resnet50模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
-
-    9、yolov5模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
-
-    10、conformer模型：
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
-
-    11、roformer模型：
-        该模型经过了特殊的处理，需要采用处理好的onnx模型：roformer_frozen.onnx；
-        将其放到：general_perf/model_zoo/popular/open_roformer/ 
-
-        下载方式：sftp -P 29889 user01@58.247.142.52  密码：5$gS%659
-                 cd yudefu/bytedance_perf ; get roformer_frozen.onnx
-                 exit
-        
-        移动：mv roformer_frozen.onnx general_perf/model_zoo/popular/open_roformer/ 
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/roformer-tf-fp32
-
-    12、gpt2模型：
-        在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/gpt2-torch-fp32
-"""
-
-"""
-    ***************************大模型操作流程********************
-    说明：
-        此部分侵入了字节代码框架，因此需要重新重构，暂时不需要进行测试
-
-    操作流程：
-        1. 进入ByteMLPerf目录
-        2. 执行
-            1）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chatglm2-torch-fp16-6b --hardware_type ILU, 
-               得到chatglm2-torch-fp16-6b的精度和性能数据
-
-            2）python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU,
-               得到 chinese-llama2-torch-fp16-13b的精度和性能数据
-
-        3. 在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件
-"""
-
-"""
-    ***************************Stable Diffusion模型操作流程********************
-    环境准备：官方的onnx2torch有bug存在，所以需要安装天数智芯适配版本的onnx2torch，采用pytorch推理框架
-
-    操作过程：
-        1、cd ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch
-        2、执行：python3 setup.py install
-        3、cd -
-
-        数据集、模型准备：
-        cd ByteMLPerf/byte_infer_perf/general_perf
-
-        bash general_perf/prepare_model_and_dataset.sh vae-encoder-onnx-fp32
-
-        上面的模型与数据集下载完毕后会生成在：general_perf/general_perf，需要把该目录在的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面
-        如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可；
-
-    测试开始：
-
-    cd ByteMLPerf/byte_infer_perf
-
-    1、vae-decoder模型:
-        注意事项：由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置
-            "batch_sizes":[4,8], "test_numeric": false, 
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-decoder-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/vae-decoder-onnx-fp32
-
-    2、vae-encoder模型：
-        注意事项：由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置
-            "batch_sizes":[4,8], "test_numeric": false, 
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-encoder-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/vae-encoder-onnx-fp32
-
-    2、clip模型：
-        注意事项：为了实现性能测试, 因此需要修改workloads下面的模型启动配置
-            "test_numeric": false, 
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task clip-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/clip-onnx-fp32
-"""
-
-
-"""
-    ***************************大模型操作流程-VLLM框架********************
-    说明：
-        此部分代码未侵入框架代码，由于vllm框架未实现精度测试，因此精度测试可以沿用GPU的backends；其次，vllm的tp定义目前与框架定义的tp含义不一样，
-        因此chatglm2、llama2模型的workloads配置里面的tp=2暂时不考虑，待后续商定好解决方案在继续
-
-    环境准备：
-        需要提前下载天数智芯适配的vllm安装包到测试环境下，为了方便看输出日志，省掉不必要的信息，安装完毕后，请注释掉：
-        /usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 内部函数async def add_request 下面的logger.info输出日志
-
-    测试开始：
-
-    cd ByteMLPerf/byte_infer_perf
-        
-    1、chatglm2模型：
-        执行：python3 llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type ILUVATAR 
-        生成的测试报告位置：llm_perf/reports/ILUVATAR/chatglm2-torch-fp16-6b
-    
-    2、llama2模型：
-        执行：python3 llm_perf/launch.py --task chinese-llama2-torch-fp16-13b --hardware_type ILUVATAR
-        生成的测试报告位置：llm_perf/reports/ILUVATAR/chinese-llama2-torch-fp16-13b
-"""
-
-
-"""
-    **************************部分小模型的int8精度推理测试************************
-    说明：
-        字节目前想验证部分小模型的int8精度推理的性能，因此需要基于ixrt（tensorrt）推理引擎进行适配支持
-        目前需要验证的小模型包括：resnet50、yolov5、widedeep、bert
-
-        注意如果在测试bert的int8推理时，报错，可能是sdk、ixrt版本问题导致；需要升级；
-        生成的报告，并没有更改里面的精度标识，这里只是给出一个测试case，因此并没有将这部分代码加到代码中
-    
-    环境准备：不需要特别准备，之前如果测试过小模型的性能，相关的环境已经存在了；
-
-    测试开始：
-
-    cd ByteMLPerf/byte_infer_perf
-
-    1、resnet50 模型：
-        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
-
-        下载方式：
-            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu/bytedance_perf  
-            get quantized_Resnet50.onnx  
-            exit
-            mv quantized_Resnet50.onnx general_perf/model_zoo/regular/open_resnet50
-
-        手动更改配置文件：general_perf/model_zoo/resnet50-torch-fp32.json 中的 model_precision 精度为 INT8
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/resnet50-torch-fp32
-
-    2、widedeep 模型：
-        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
-
-        下载方式：
-            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu/bytedance_perf  
-            get quantized_widedeep_staticshape.onnx 
-            exit
-            mv quantized_widedeep_staticshape.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/
-
-        手动更改配置文件：general_perf/model_zoo/widedeep-tf-fp32.json 中的 model_precision 精度为 INT8
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/widedeep-tf-fp32
-
-    3、yolov5 模型：
-        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型
-
-        下载方式：
-            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu/bytedance_perf  
-            get quantized_yolov5s.onnx 
-            exit
-            mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/
-
-        手动更改配置文件：general_perf/model_zoo/yolov5-onnx-fp32.json 中的 model_precision 精度为 INT8
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/yolov5-onnx-fp32
-
-    4、bert 模型：
-        模型准备：在进行int8精度推理时，需要提供经过量化后的onnx模型，这里直接给出量化好的模型；该模型直接拿生成好的engine进行推理
-
-        下载方式：
-            sftp -P 29889 user01@58.247.142.52  密码：5$gS%659（内网连接：sftp -P 29889 user01@10.160.20.61）
-            cd yudefu/bytedance_perf  
-            get bert_zijie_int8_b196.engine  
-            exit
-            mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/
-
-        手动更改配置文件：general_perf/model_zoo/bert-torch-fp32.json 中的 model_precision 精度为 INT8
-
-        执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
-        生成的测试报告位置：general_perf/reports/ILUVATAR/bert-torch-fp32
-"""
\ No newline at end of file
+# ByteMLPerf 操作说明
+# 1、基础信息描述
+
+完整的代码框架包括CPU端的性能、精度、数值指标等，是否跑CPU端数据通过workloads里面每一个模型的test_numeric参数控制，并且执行代码需要按照下面的指令发起：python3 launch.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例），会比较耗时。
+
+如果不想跑CPU端的性能、精度、数值指标对比，可以直接执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32（示例）
+
+如果模型提供了pt、pb格式的优先选择torch的配置进行测试；
+
+### 功能实现
+* pt、pb模型转换在compile模块预处理过程中实现；
+* 在天数智芯BI-150显卡上，调用推理引擎tensorrt进行推理，一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化；
+
+### 环境准备：
+* sdk版本： 由天数智芯工程师提供
+* ixrt版本：由天数智芯工程师提供
+
+# 2、11个常规小模型测试方法
+### 数据集、模型准备
+```
+cd ByteMLPerf/byte_infer_perf/general_perf
+bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad
+bash general_perf/prepare_model_and_dataset.sh resnet50-torch-fp32 open_imagenet
+bash general_perf/prepare_model_and_dataset.sh widedeep-tf-fp32 open_criteo_kaggle
+bash general_perf/prepare_model_and_dataset.sh albert-torch-fp32
+bash general_perf/prepare_model_and_dataset.sh roformer-tf-fp32 open_cail2019
+bash general_perf/prepare_model_and_dataset.sh videobert-onnx-fp32 open_cifar
+bash general_perf/prepare_model_and_dataset.sh yolov5-onnx-fp32 
+bash general_perf/prepare_model_and_dataset.sh conformer-encoder-onnx-fp32
+bash general_perf/prepare_model_and_dataset.sh roberta-torch-fp32
+bash general_perf/prepare_model_and_dataset.sh deberta-torch-fp32 
+bash general_perf/prepare_model_and_dataset.sh swin-large-torch-fp32
+bash general_perf/prepare_model_and_dataset.sh gpt2-torch-fp32 
+
+上面的模型下载完毕后会生成在：general_perf/general_perf，需要把该目录下的model_zoo下面的regular、popular、sota移到general_perf/model_zoo目录下。roberta、albert、deberta模型会从huggingface网址下载模型文件，可能遇见访问服务器失败的情况，此时需要从其他途径获取。
+
+数据集会生成在：byte_infer_perf/general_perf/datasets/ 目录下，如果依赖的模型数据集下载不完整，会导致推理时报错，各个数据集树形结构如下：
+.
+├── data_loader.py
+├── fake_dataset
+│   ├── data_loader.py
+│   └── test_accuracy.py
+├── open_cail2019
+│   ├── data_loader.py
+│   ├── pre_process_data.py
+│   └── test_accuracy.py
+├── open_cifar
+│   ├── data_loader.py
+│   └── test_accuracy.py
+├── open_criteo_kaggle
+│   ├── data_loader.py
+│   ├── preprocess_dataset.py
+│   └── test_accuracy.py
+├── open_imagenet
+│   ├── data_loader.py
+│   └── test_accuracy.py
+├── open_squad
+│   ├── bert
+│   │   ├── accuracy_squad.py
+│   │   └── evaluate.py
+│   ├── create_squad_data.py
+│   ├── data_loader.py
+│   └── test_accuracy.py
+└── test_accuracy.py
+
+以上的模型、数据集均可以联系天数智芯工程师获取。
+```
+### 性能指标说明
+```
+整个代码在运行过程中，主要是从workloads目录下加载对应的模型的配置：test_perf、test_accuracy、test_numeric三项测试内容，用户可以根据自己的需要选择开启与否；workloads下面的配置文件修改一般会与modelzoo下面的配置文件保持同步更改。
+
+一般情况下采用字节默认的配置项即可；需要特别修改的配置下面会进行说明。
+
+输出性能文档里面涉及的字段说明：
+* QPS、AVG Latency、P99 Latency：这3个指标是字节框架生成的，采用天数智芯的推理引擎IxRT会计算H2D、D2H的时间，也就是数据在不同的设备（CPU、GPU）之间传输耗时；
+
+* predict QPS、predict AVG Latency、predict P99 Latency：这部分指标把上面一步计算H2D、D2H的耗时剔除出去了，因此可以看做纯推理耗时，这个耗时可以与利用ixrtexec命令跑出来的结果做一定的对比，但是不一定完全对齐，因为走整个框架代码肯定会导致一部分性能损失。
+```
+
+## 支持的模型
+### nlp模型
+* bert
+* albert
+* deberta
+* videobert
+* roberta
+* swin-transformer
+
+### 分类与回归模型
+* wide&deep
+
+### 分类模型
+* resnet50
+
+### 检测模型
+* yolov5
+
+### 语音识别模型
+* conformer
+
+### 预训练语言模型
+* roformer
+
+## 测试说明
+cd ByteMLPerf/byte_infer_perf
+
+### FP16精度推理
+#### bert模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/bert-torch-fp32/
+
+注：目前粗略给出最大batch到322
+# 更改workloads配置
+byte_infer_perf/general_perf/workloads/bert-torch-fp32.json 里面的配置项更改为： "batch_sizes":[1,4,8,16,24,32,48,64,96,128,196,224,322]；
+# 更改model_zoo配置
+byte_infer_perf/general_perf/model_zoo/bert-torch-fp32.json 配置项更改为："max_batch_size": 322；
+# 注意事项
+max_batch_size最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+#### albert模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/albert-torch-fp32/
+```
+
+#### deberta模型
+```bash
+给定的pt模型转成onnx后输入只有2个，因此这里特殊处理了一下；加载处理好的onnx模型：deberta-sim-drop-clip-drop-invaild-cast.onnx，移动：mv deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/
+具体获取方式：请向天数智芯工程师获取
+
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/deberta-torch-fp32/
+```
+
+#### roberta模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/roberta-torch-fp32/
+```
+
+#### videobert模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/videobert-onnx-fp32
+```
+
+#### widedeep模型
+```bash
+该模型经过了特殊的处理，需要采用处理好的onnx模型：widedeep_dynamicshape_new.onnx；
+将其放到：general_perf/model_zoo/regular/open_wide_deep_saved_model/
+移动：mv widedeep_dynamicshape.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ 
+具体获取方式：请向天数智芯工程师获取。
+
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/widedeep-tf-fp32
+
+注：目前粗略测试到最大batch为2000000
+# 更改workloads配置
+byte_infer_perf/general_perf/workloads/widedeep-tf-fp32.json 配置项更改为：
+"batch_sizes":[1024,4096,6000,8000,10000,12000,14000,16384,18000,20000,32200,40000,50000,60000,100000,130000,160000,200000,220000,240000,300000,350000,400000,500000,800000,1000000,1500000,2000000]；
+# 更改model_zoo配置
+byte_infer_perf/general_perf/model_zoo/widedeep-tf-fp32.json 配置项更改为：
+"max_batch_size": 2000000；
+# 注意事项
+max_batch_size最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+#### swin-transformer模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/swin-large-torch-fp32
+```
+
+#### resnet50模型
+```bash
+# 修改：将general_perf/model_zoo/resnet50-torch-fp32.json 里面的inputs 和 input_shape 中的 "input_1.1" 改为 "input"
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/resnet50-torch-fp32
+
+注：目前粗略测试到最大batch为1300，
+# 更改workloads配置
+workloads/resnet50-torch-fp32.json配置项更改为："batch_sizes":[1,4,8,16,32,48, 64,82,128,512,1024,1200,1300]；
+# 更改model_zoo配置
+model_zoo/resnet50-torch-fp32.json 配置项更改为："max_batch_size": 1300；
+# 注意事项
+max_batch_size 最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+#### yolov5模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+```
+
+#### conformer模型
+```bash
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32
+```
+
+#### roformer模型
+```bash
+该模型经过了特殊的处理，需要采用处理好的onnx模型：roformer_frozen.onnx；
+将其放到：general_perf/model_zoo/popular/open_roformer/ 
+移动：mv roformer_frozen.onnx general_perf/model_zoo/popular/open_roformer/ 
+具体获取方式：请向天数智芯工程师获取
+
+# 修改：byte_infer_perf/general_perf/model_zoo/roformer-tf-fp32.json里面的inputs及其input_shape，
+将两个输入及其输入shape的：冒号去掉
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/roformer-tf-fp32
+```
+
+### 部分小模型INT8精度推理
+```bash
+* 目前ixrt推理引擎只实现了部分模型的int8精度推理，因此只提供了下面4个小模型的int8推理case；支持int8推理的模型：resnet50、yolov5、widedeep、bert；
+* 注意如果在测试bert的int8推理时，报错，可能是sdk、ixrt版本问题导致，需要升级；
+```
+
+#### resnet50模型
+```bash
+# 更改配置文件
+general_perf/model_zoo/resnet50-torch-fp32.json中的model_precision精度为INT8
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/resnet50-torch-fp32
+
+注：目前粗略测试到最大batch为2000
+# 更改workloads配置
+byte_infer_perf/general_perf/workloads/resnet50-torch-fp32.json配置项更改为："batch_sizes":[1,4,8,16,32,48,64,82,128,512,1024,1200,1300,1600,2000]；
+# 更改model_zoo配置
+byte_infer_perf/general_perf/model_zoo/resnet50-torch-fp32.json 配置项更改为："max_batch_size": 2000；
+# 注意事项
+max_batch_size最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+
+#### widedeep
+
+```bash
+# 更改配置文件
+general_perf/model_zoo/widedeep-tf-fp32.json 中的 model_precision 精度为 INT8
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/widedeep-tf-fp32
+
+注：目前粗略测试到最大batch为130000
+# 更改workloads配置
+byte_infer_perf/general_perf/workloads/widedeep-tf-fp32.json配置项更改为："batch_sizes":[1024,4096,6000,8000,10000,12000,13000]；
+# 更改model_zoo配置
+byte_infer_perf/general_perf/model_zoo/widedeep-tf-fp32.json 配置项更改为："max_batch_size": 130000；
+# 注意事项
+max_batch_size最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+
+#### yolov5
+
+```bash
+# 更改配置文件
+general_perf/model_zoo/yolov5-onnx-fp32.json 中的 model_precision 精度为 INT8
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+```
+
+
+#### bert
+
+```bash
+# 更改配置文件
+general_perf/model_zoo/bert-torch-fp32.json 中的 model_precision 精度为 INT8，"input_type": "INT32,INT32,INT32"
+# 执行
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
+# 测试报告位置
+general_perf/reports/ILUVATAR/bert-torch-fp32
+
+注：目前粗略给出最大batch到196
+# 更改workloads配置
+byte_infer_perf/general_perf/workloads/bert-torch-fp32.json，配置项更改为："batch_sizes":[1,4,8,16,24,32,48,64,96,128,196]
+# 更改model_zoo配置
+byte_infer_perf/general_perf/model_zoo/bert-torch-fp32.json 配置项更改为："max_batch_size": 196
+# 注意事项
+max_batch_size最好与batch_sizes的最大值保持一致，至少不能小于batch_sizes的最大值。
+```
+
+# 3、gpt2模型推理
+```bash
+# 采用的推理引擎：igie
+在进行测试时，请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false；
+执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32
+生成的测试报告位置：general_perf/reports/ILUVATAR/gpt2-torch-fp32
+```
+
+# 4、Stable Diffusion模型推理
+```bash
+# 采用的推理引擎：pytorch
+此模块涉及到general_perf下面的vae-decoder、vae-encoder、clip三个模型的推理；
+
+# 环境准备：官方的onnx2torch有bug存在，所以需要安装天数智芯适配版本的onnx2torch，采用pytorch推理框架
+cd ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch
+执行：pip3 install .
+
+# 数据集、模型准备：
+cd ByteMLPerf/byte_infer_perf/general_perf
+bash general_perf/prepare_model_and_dataset.sh vae-encoder-onnx-fp32
+上面的模型与数据集下载完毕后会生成在：general_perf/general_perf，需要把该目录下的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面。
+
+# 测试开始
+cd ByteMLPerf/byte_infer_perf 
+```
+
+#### vae-decoder模型
+```bash
+注意事项：由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置为："batch_sizes":[4,8], "test_numeric": false, 
+
+执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-decoder-onnx-fp32
+生成的测试报告位置：general_perf/reports/ILUVATAR/vae-decoder-onnx-fp32
+```
+
+#### vae-encoder模型
+```bash
+注意事项：为了实现性能测试, 因此需要修改workloads下面的模型启动配置为："batch_sizes":[4,8], "test_numeric": false, 
+
+执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-encoder-onnx-fp32
+生成的测试报告位置：general_perf/reports/ILUVATAR/vae-encoder-onnx-fp32
+```
+
+#### clip模型
+```bash
+注意事项：为了实现性能测试, 因此需要修改workloads下面的模型启动配置为："test_numeric": false, 
+
+执行：python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task clip-onnx-fp32
+生成的测试报告位置：general_perf/reports/ILUVATAR/clip-onnx-fp32
+```
+
+# 5、大模型推理
+```bash
+# 说明：
+此部分代码未侵入框架代码，由于vllm框架未实现精度测试，因此精度测试可以沿用GPU的backends；
+其次，vllm的TP定义目前与框架定义的tp含义不一样，因此chatglm2、llama2模型的workloads配置里面的TP=2 暂时不考虑，待后续商定好解决方案再继续。
+
+# 环境准备：
+需要提前下载天数智芯适配的vllm安装包到测试环境下，为了方便看输出日志，省掉不必要的信息，安装完毕后，
+请注释掉：/usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 
+内部函数async def add_request 下面的logger.info输出日志。
+
+# 测试开始：
+cd ByteMLPerf/byte_infer_perf
+```
+
+#### chatglm2模型
+```bash
+执行：python3 llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type ILUVATAR 
+生成的测试报告位置：llm_perf/reports/ILUVATAR/chatglm2-torch-fp16-6b
+```
+
+#### llama2模型
+```bash
+执行：python3 llm_perf/launch.py --task chinese-llama2-torch-fp16-13b --hardware_type ILUVATAR
+生成的测试报告位置：llm_perf/reports/ILUVATAR/chinese-llama2-torch-fp16-13b
+```
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/CITATION.cff b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/CITATION.cff
new file mode 100644
index 000000000..f4bcaac17
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/CITATION.cff
@@ -0,0 +1,24 @@
+cff-version: 1.2.0
+title: onnx2torch
+message: "Please use this information to cite onnx2torch in research or other publications."
+authors:
+  - affiliation: ENOT LLC
+    given-names: ENOT developers
+  - family-names: Kalgin
+    given-names: Igor
+  - family-names: Yanchenko
+    given-names: Arseny
+  - family-names: Ivanov
+    given-names: Pyoter
+  - family-names: Goncharenko
+    given-names: Alexander
+date-released: 2021-12-14
+url: "https://enot.ai"
+repository-code: "https://github.com/ENOT-AutoDL/onnx2torch"
+license: "Apache-2.0"
+keywords:
+  - onnx
+  - pytorch
+  - convert
+  - deep learning
+  - machine learning
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/LICENSE b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/LICENSE
new file mode 100644
index 000000000..0dd1688ff
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 ENOT LLC
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/MANIFEST.in b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/MANIFEST.in
new file mode 100644
index 000000000..aae95799a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/MANIFEST.in
@@ -0,0 +1 @@
+recursive-exclude tests *
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/README.md b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/README.md
new file mode 100644
index 000000000..dae844c4c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/README.md
@@ -0,0 +1,290 @@
+<div align="center">
+  <img src="assets/logo/onnx2torch_light.png#gh-light-mode-only">
+  <img src="assets/logo/onnx2torch_dark.png#gh-dark-mode-only">
+</div>
+
+<p align="center">
+    <a href="https://pypi.org/project/onnx2torch">
+        <img src="https://badgen.net/pypi/v/onnx2torch" />
+    </a>
+    <a href="https://anaconda.org/conda-forge/onnx2torch">
+        <img src="https://img.shields.io/conda/vn/conda-forge/onnx2torch" />
+    </a>
+    <a href="https://pypi.org/project/onnx2torch">
+        <img src="https://img.shields.io/github/license/ENOT-AutoDL/onnx2torch?color=blue" />
+    </a>
+    <a href="https://pypi.org/project/onnx2torch">
+        <img src="https://img.shields.io/pypi/dm/onnx2torch?color=blue" />
+    </a>
+    <a href="https://github.com/ENOT-AutoDL/onnx2torch/stargazers">
+        <img src="https://img.shields.io/github/stars/ENOT-AutoDL/onnx2torch.svg?style=social&label=Star&maxAge=2592000" />
+    </a>
+    <br>
+    <a href="https://github.com/psf/black">
+        <img src="https://img.shields.io/badge/code%20style-black-black?color=blue" />
+    </a>
+    <a href="https://github.com/pre-commit/pre-commit">
+        <img src="https://img.shields.io/badge/pre--commit-enabled-blue?logo=pre-commit" />
+    </a>
+    <a href="https://conventionalcommits.org">
+        <img src="https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white&color=blue" />
+    </a>
+</p>
+
+onnx2torch is an ONNX to PyTorch converter.
+Our converter:
+
+- Is easy to use – Convert the ONNX model with the function call `convert`;
+- Is easy to extend – Write your own custom layer in PyTorch and register it with `@add_converter`;
+- Convert back to ONNX – You can convert the model back to ONNX using the `torch.onnx.export` function.
+
+If you find an issue, please [let us know](https://github.com/ENOT-AutoDL/onnx2torch/issues)!
+And feel free to create merge requests.
+
+Please note that this converter covers only a limited number of PyTorch / ONNX models and operations.
+Let us know which models you use or want to convert from ONNX to PyTorch [here](https://github.com/ENOT-AutoDL/onnx2torch/discussions).
+
+## Installation
+
+```bash
+pip install onnx2torch
+```
+
+or
+
+```bash
+conda install -c conda-forge onnx2torch
+```
+
+## Usage
+
+Below you can find some examples of use.
+
+### Convert
+
+```python
+import onnx
+import torch
+from onnx2torch import convert
+
+# Path to ONNX model
+onnx_model_path = "/some/path/mobile_net_v2.onnx"
+# You can pass the path to the onnx model to convert it or...
+torch_model_1 = convert(onnx_model_path)
+
+# Or you can load a regular onnx model and pass it to the converter
+onnx_model = onnx.load(onnx_model_path)
+torch_model_2 = convert(onnx_model)
+```
+
+### Execute
+
+We can execute the returned `PyTorch model` in the same way as the original torch model.
+
+```python
+import onnxruntime as ort
+
+# Create example data
+x = torch.ones((1, 2, 224, 224)).cuda()
+
+out_torch = torch_model_1(x)
+
+ort_sess = ort.InferenceSession(onnx_model_path)
+outputs_ort = ort_sess.run(None, {"input": x.numpy()})
+
+# Check the Onnx output against PyTorch
+print(torch.max(torch.abs(outputs_ort - out_torch.detach().numpy())))
+print(np.allclose(outputs_ort, out_torch.detach().numpy(), atol=1.0e-7))
+```
+
+## Models
+
+We have tested the following models:
+
+Segmentation models:
+
+- [x] DeepLabV3+
+- [x] DeepLabV3 ResNet-50 (TorchVision)
+- [x] HRNet
+- [x] UNet (TorchVision)
+- [x] FCN ResNet-50 (TorchVision)
+- [x] LRASPP MobileNetV3 (TorchVision)
+
+Detection from MMdetection:
+
+- [x] [SSDLite with MobileNetV2 backbone](https://github.com/open-mmlab/mmdetection)
+- [x] [RetinaNet R50](https://github.com/open-mmlab/mmdetection)
+- [x] [SSD300 with VGG backbone](https://github.com/open-mmlab/mmdetection)
+- [x] [YOLOv3 d53](https://github.com/open-mmlab/mmdetection)
+- [x] [YOLOv5](https://github.com/ultralytics/yolov5)
+
+Classification from __TorchVision__:
+
+- [x] ResNet-18
+- [x] ResNet-50
+- [x] MobileNetV2
+- [x] MobileNetV3 Large
+- [x] EfficientNet-B{0, 1, 2, 3}
+- [x] WideResNet-50
+- [x] ResNext-50
+- [x] VGG-16
+- [x] GoogLeNet
+- [x] MnasNet
+- [x] RegNet
+
+Transformers:
+
+- [x] ViT
+- [x] Swin
+- [x] GPT-J
+
+#### :page_facing_up: List of currently supported operations can be found [here](operators.md).
+
+## How to add new operations to converter
+
+Here we show how to extend onnx2torch with a new ONNX operation that is supported by both PyTorch and ONNX
+
+<details>
+<summary>and has the same behaviour</summary>
+
+An example of such a module is [Relu](./onnx2torch/node_converters/activations.py)
+
+```python
+@add_converter(operation_type="Relu", version=6)
+@add_converter(operation_type="Relu", version=13)
+@add_converter(operation_type="Relu", version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    return OperationConverterResult(
+        torch_module=nn.ReLU(),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
+```
+
+Here we have registered an operation named `Relu` for opset versions 6, 13, 14.
+Note that the `torch_module` argument in `OperationConverterResult` must be a torch.nn.Module, not just a callable object!
+If Operation's behaviour differs from one opset version to another, you should implement it separately.
+
+</details>
+
+<details>
+<summary>but has different behaviour</summary>
+
+An example of such a module is [ScatterND](./onnx2torch/node_converters/scatter_nd.py)
+
+```python
+# It is recommended to use Enum for string ONNX attributes.
+class ReductionOnnxAttr(Enum):
+    NONE = "none"
+    ADD = "add"
+    MUL = "mul"
+
+
+class OnnxScatterND(nn.Module, OnnxToTorchModuleWithCustomExport):
+    def __init__(self, reduction: ReductionOnnxAttr):
+        super().__init__()
+        self._reduction = reduction
+
+    # The following method should return ONNX attributes with their values as a dictionary.
+    # The number of attributes, their names and values depend on opset version;
+    # method should return correct set of attributes.
+    # Note: add type-postfix for each key: reduction -> reduction_s, where s means "string".
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        onnx_attrs: Dict[str, Any] = {}
+
+        # Here we handle opset versions < 16 where there is no "reduction" attribute.
+        if opset_version < 16:
+            if self._reduction != ReductionOnnxAttr.NONE:
+                raise ValueError(
+                    "ScatterND from opset < 16 does not support"
+                    f"reduction attribute != {ReductionOnnxAttr.NONE.value},"
+                    f"got {self._reduction.value}"
+                )
+            return onnx_attrs
+
+        onnx_attrs["reduction_s"] = self._reduction.value
+        return onnx_attrs
+
+    def forward(
+        self,
+        data: torch.Tensor,
+        indices: torch.Tensor,
+        updates: torch.Tensor,
+    ) -> torch.Tensor:
+        def _forward():
+            # ScatterND forward implementation...
+            return output
+
+        if torch.onnx.is_in_onnx_export():
+            # Please follow our convention, args consists of:
+            # forward function, operation type, operation inputs, operation attributes.
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(
+                _forward, "ScatterND", data, indices, updates, onnx_attrs
+            )
+
+        return _forward()
+
+
+@add_converter(operation_type="ScatterND", version=11)
+@add_converter(operation_type="ScatterND", version=13)
+@add_converter(operation_type="ScatterND", version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    node_attributes = node.attributes
+    reduction = ReductionOnnxAttr(node_attributes.get("reduction", "none"))
+    return OperationConverterResult(
+        torch_module=OnnxScatterND(reduction=reduction),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
+```
+
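+Here we have used a trick to convert the model from torch back to ONNX: the module derives from `OnnxToTorchModuleWithCustomExport` and, during ONNX export, calls `DefaultExportToOnnx.export` with the forward function, operation type, inputs and attributes.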
+
+</details>
+
+## Opset version workaround
+
+In case you are using a model with an older opset, try the following workaround:
+
+[ONNX Version Conversion - Official Docs](https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md#converting-version-of-an-onnx-model-within-default-domain-aionnx)
+
+<details>
+<summary>Example</summary>
+
+```python
+import onnx
+from onnx import version_converter
+import torch
+from onnx2torch import convert
+
+# Load the ONNX model.
+model = onnx.load("model.onnx")
+# Convert the model to the target version.
+target_version = 13
+converted_model = version_converter.convert_version(model, target_version)
+# Convert to torch.
+torch_model = convert(converted_model)
+torch.save(torch_model, "model.pt")
+```
+
+</details>
+
+Note: use this only when the model does not convert to PyTorch using the existing opset version. Results might vary.
+
+## Citation
+
+To cite onnx2torch use the `Cite this repository` button, or:
+
+```
+@misc{onnx2torch,
+  title={onnx2torch},
+  author={ENOT developers and Kalgin, Igor and Yanchenko, Arseny and Ivanov, Pyoter and Goncharenko, Alexander},
+  year={2021},
+  howpublished={\url{https://enot.ai/}},
+  note={Version: x.y.z}
+}
+```
+
+## Acknowledgments
+
+Thanks to Dmitry Chudakov [@cakeofwar42](https://github.com/cakeofwar42) for his contributions.\
+Special thanks to Andrey Denisov [@denisovap2013](https://github.com/denisovap2013) for the logo design.
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_dark.png b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_dark.png
new file mode 100644
index 0000000000000000000000000000000000000000..7735f6f7bf109fda68b27f964222e10dc77ee087
GIT binary patch
literal 44573
zcmeFZg;Sed^e!6QTBNvBq_`C)Xp0mmPH@-a?oLYy?(R_B-Q9~7EAH+ZTu$hBe&^n~
zf5M&f&M=uIJMX*qTF+kcthEVNkdu6iN`wjk0NzS{5mN#H5G4Ts7%pT)=qEZ!Je1HE
z6uU1PjsO4))$1ROTb{5B^g}`?adjtUTT>@j0|yg;tE(%Exs9cxk%65Fi>-rM>WKgm
z06+nd5))N%OFLS2_eujngwMvCIAY88Gn|afWfGmLq)@G6Se&vI6pW)~6B7E~VyR45
zpdu?_E<J_(zA5m*75@f0D>*7et~W8~n5rH6MLlZVTv;!2!~HPgIDQ(*@*z(qVK+IM
zI`igmJ7#~R$?T&{#xCDY1{x;T|GoaN1pa?30p}!YAPnXITsh^)s$$n7QQN8l1N^H=
ze9n__Hhok%)0^@xuwwt`(xKdg!>@1-IfuA1;$h`|h&_S_|G%zx!F^c&b3^Fs0Z5(X
z{r~emM>p{Q^L68U;6L#yqM`+IX9T<v9OWkf_{!)BPa+j9ypfD%+_>%i&BC7FS5*zy
zo`7fM4g7Od!N5TD;8N{K`+8$(5L3i|+mvSBay^_J1*~gRmj1p!13rmF<Q^ua`~VpT
z6Rz`qy%Pq23@G$R%__!J?7YtgM4e=DlK2Cd{iZ|rSO9g!*E77Z^5YBy4<3^lOs{iN
zMAf(d@o#B#{wgzini`&MSYEIpup&XQ5qh|oFj+y(<d%v3(Qq%Zh~H8e5Mc@6k|OWd
zm&i-Q)B!+%jy1&#P*C{vj^(YytMsNvD|tWt6XRA@adGicbK~8+UTthmJzy7LN4O3{
z%Igy<oIWxIzQ$UP0`EID(K3EhbNlr<^58E5=TiTKL?ibuqC5EClstka?M6*to5%y~
z|D4!A(}-wiSMaHxLbbM0Y`ps!`Z%&KJ8DFH+5j#6p|F5f&!rGav7#VG$^Uev{j2zy
zYkhB-p^@;9Ka$N?2PM)hKk}$1A*u=lMyk`2-j@>N-&=V@W1+9(2W5A9UH?;K&NZ^C
z&F!fI?_^evD;Z0elUYc%;Zr0cBc+y14_6`$ko2Fn8|=1`|I;ge{7*|34z>=(8JOi<
zcU2^mpJ-dGU3SLGOSNf0%inRbLvsICRLI!q@Sn5`WB-;d3Vb9e_tv5CRowG4-i-wq
zJ#x2ktLt9&7}Nff;WY05Xx+Ix1rpuIt~V?pLQ4xw>{WuJ(!)>{ThVQ>xg0b$gGvG_
z3&Q_1TA{G|diL)DmEYBhYF|LCQ9cb(X}O^Ae9=D)Ipc&<{Zo<Hv5IOv7p=O+Lqf3+
zx}bw&H2OP~FK@=<5+rTqT^Rn2A?@1u-)jWM*{ldmqZ~zkg2Wl`#z^BZ7#_LGFk1K|
z9%=uR_d@9ZhJ6k;@+-`rs_6VyD)&7Kz=LnQLKdKM{{A|yP|(BHS3AdMZNw7qYXDbM
zGDE7~?IoHmEsz{4)tr@YCcFZg>!tWv^3{$~?m4qR7%UG%F9ueVj#Qc{aTeU9jXc^c
zcZa<Vt8NNpQ~$213cM=gfFDm1tTfWih@YW!>Y9Mcri|c25Q+$>!Dsyje>R{-?!E5_
zu!@<YejSUOU_1Yj!bnesfGMmtwb;i-R{Z#6KXB7l#)Tz4Z+|TRjWPUEh}l&234&dN
z*7?6#AI_<U(glvRpOg|Gz>N`8Cc)h)oPWgbEHGo$JlokCW*wK4@Oc$kTX1p8NE`Du
z;Qj+J@k9b)m5w=5H||K_pNt{`q2N^N4YS-~_3Qf#J7|AO$Lp@&e}WGaNBx20-}>nZ
zB3kJvb7N9|SiFg9W0-qitTi*Jy19pL`E`kQ4jq8*lx5HP+500bL%}DuPh~qH6ddN}
zpT)z3$WX0Gv{X0~jtz!De33g$%T;azImWx9&^av=Lsd@SZRX_*dvzNa-BauK7h;y!
zdVLm}w_+*|9WV_<VB~q^QQ4vO&<^|?N?CZ|SvAl3=x$@sl`z)Nu-xiiIFGMojF5=p
zL_`rR^dd9GNsFv67BG-U#$={3=BSW{<o8ee2k&0>Aad?}Iqo;J6d@q7asL_MC?Nd`
z|H4^KP`kkgeH+<nH&{-sK1YU&9zq0n(T0tLG?#@3fRDqOFkKngN4mEOElSZQ-2PL*
zde`qk){x7~_M}Soq5*z&f;)mnBr52pYx47qz|8Y=8gLa2BBxFgJWQ#v?Tevda(MUp
zr5NZ*DDzZf4@EP*S%kuqsnXcv`L;BsyH&;vC(N~s&4H69aMMRko6VpWoJ<=qsv9wr
zLf2mTk%TP0@VwB>0z~j?2pkYti{Iq|7`dOsw8BXO!4sKdD_0l3n+ph@+0}~E2_fS*
zBg4M8v!QEN46mv~*HXrMVhlsSa={7j*(eZKzHDDV=9N^76Skr8W6us5oK-m!1i-Kk
zD0doDZ1MqOTS<aESnwpEq8UVkZr;3~&gs5|$T)C-^nA*x=>;uQR?RHH^=BuvY4|ex
z3_a(CrQJ$nsinHGS7~}Aam7bRYLwu607F_c{PPhIl+lFFT+*-^8Acw4hG%AczuKOc
z1?o2r^vE*3oz+%VVq1R^APe#!`0t5~uTS(_m;Ii(kkBsko~6!~7qftz0s)hm%|eLB
za*8pe&+HWI+cc_biBYCTo7J66d6?uYRN&P6dRDlmlnOUyX#(9dZ!CU{C2^Qfa)Si<
za3We^qmy+bOwf<KyEl;Qh|yjx%@)loCOY+=FA@{xL-Bae&N%4(0Q{ID(TX@-Nvz|#
zwIT+U#0-iEs-Lg+XUW{no7DJTEmdCe<i*~x-zbE-QKWBUsxJ`R=Tz#E<sBB9zA77u
z28w=!+9uuB^yCF(H|}ft1#3lrRtA-s`B)8881(1SiBPXB4P&P3`*(N7{AKriGE%QP
zSAQ~22<UohRl3}&Xqgi|(y-#rI8`M#u)~iEA0q09ho<nRuMBE56aX#nl}aqs9!Oqd
zSzpcr6M^?>uRFLobbx%8^IW{+Erg;`q-?{HL#dW+Qqx{^Wl7(}_$|&<E79k#3*P}3
zts(M)pT{QC%wW<o|84$4Gb8&SbVGP7y1c(u`h8b2ihqLFQijV{_)NODNB{)vR$@C~
z11dJ#9TGr@y1+`U_rUc}7TmAsM{k6Yi)mo#tAutLYw*+aD63^=B-S}9?q=K_`luLn
zkM|sDpzxCxr@FN8g%kgYUqw1_cQ8^E>vJe>#fl?KC`^S@zKrV6Q}QiqQ?I<7Ovr5V
zm-R|-!AV5mYZxPq165B<sbcyzEDFh~P%<ImH5K-U)0eq`NQCO@k4WTCx&eN|QS63u
z;5V~qA1HG=^}&R1LSuNn^itct`J+~%1!xww;stpdLMDAH4ivOs_X4#t$oHG~m2~05
zutezst*Ko4mEWPiAT{N=^C7o<WO4%C*6l!HIth$9^NtOGzU~G!5Ah!^>7rlNmcti1
zrfVopOM18p=g+Bg0ckPx$z**i@)8lbX3_<Zg!-ZL4S4KK^x5)notc(m3tE&W$o;L)
z<8fV(9@DYg3KVr>!BwVD{ViE3J1*M(?OeHb8#FcpvOlKeg^x!kF9ypfZ^>CV(6*GI
z0gI%*!K+t(7v6`JL!IpUgn(@*x2ZEw@&zYTsBG-?65{M?e2V%Z#{w_}6GfH7vA%W)
zg-9j9MU)2CNH1{wrMINR_x)UkTMd4uvp><@FJ1!pXwpBo<a#>I8|3M3ldsc}G?{4<
z|1-hzYk=0T4#u;-L=?N!*ka`c4K(eeDvqXu!7@m%gP*SAaNlS#Yoy%J+x_ZzY2cui
zTk2gTPdjzaj@wQ0$8{=h_IZ*reOTUeH>-hio$%DvCF7FZ=&(^+8@b4VP0}_GeZZCo
zClW&qrfa)<mOdpACLC~IJkIZ{)nv7u@);w^b6G=;5AOvd3dYc0A0LN~qP@?IMYnRQ
z*ITXZaSH9l;cy-s8X2Cds1i2KDZX(KtW45N`1(7vnT)4v<L$k@9`{PXM&m$fdUn4L
zS!d!>gv>I=zl|Qp1hpwNT}OSF<giY?!6!UkbKnuNEk`Dy4q-N@!KltZ??1B{-M<0Y
z;C<!97h$2OA>Hh-zOF-K^7B@J09C6aAtDHfK%w2tnhN0IRK@f9YRBng4b^K{Fb3<|
zNUH4DCVsbRV-K|XIFgopV`FYIItAM_cRVN5CD34`bQ4>{v`a|$-&{yu=Q5X8f<IOT
zUj1{=jn(D%40m)Q*n%<sTSi7^_Ntb1_Ce1z-V7lYj^iCtmY_8=JS9VihTC>G{=b04
zb;BHLMif~Kia)3jCNGp5MWTbX%z{f`Vy*1*Pn&h#HWTwchwMfu>8GWh7e4Soos(Pj
zCB^Gxp^;DyeDmYQ@YKrXsRt2*^;oyC%ipGl2J2OpmYY(+Rzt(?Is0%x^}7sX+S&sB
z>izK1{fMb`+z$i4Q1!e!-3dBj8w{bYOz~9l+T)nZuhCP8y4rJRS_-3)h-0WfoVcNa
zlf+%tk3$DNy$b5K1RJvSLx=5)i2p;f5)2eID7DPqv#|<Sw%&_TY1(FEBYg8W&wD`Y
zlT=~$4us`jybhefbV&qzUuWNafKFPsPxB5otGu5R@8!^<Q{{bog-H6xhMsG7Zx+Cr
zW5cGBh0(uVqJ;2z?~jK+12e^z?!Fx^hOsx^Df9Fhoi|SNaMhb<b9rnP##t84fFv1{
zYI`+P>QCoZFv~S<b?LoL{ARB8ekpuGOkg@U8j5LD<9h|#ICYLP&~DCt!_6=Lw%v0%
zG>KpOVxau(PbyEq8#@l0r78rQ{@m$-e;Xq2o!UeHyNQX>7M~RWcBHV^*0GxJMyw~t
zB*$b!dqZ6|<k%2?-n%-snb0%m*WJ`dl9J>)t(+%*B?3O<l@MH}i&wD*wbuJQ5rBTR
z5Go@a=0n3syA4kld4}fhr$X-Wr=+-I_16d>N2Mn;L<|MfbwtL`xVGK;swh`0|7*%5
zIiQ$|c%e3d$a!+mv~wBzS_QiyLb;xM`t}Bw)r5zO%c&fb8eFDcl{2(G4K*x<K^)xH
z(TD5?WG+BchI3ILBmdClF#67+*JZ^+F>C&x>6?MtqOB^vP<*dZ*n)<3X3Q@GX86l?
zZrgwF-yS~HKHoe_-G%RE_eJ;j_s_{xJTA7^n>shwaX<7%@^UMB&z7~mKI{f_rsjS6
z%JostIfZ7!WqOb{T2es1Ea6KeSG*#q(6WOUmvBp!Nm_?**dZfmBbT<_EU2-?;ML<o
zq@kO}eXmqLldC$9P`DGTASzSGsYfVtd=&$rGIO%jRx)bFZabo`w^4nS@#%c!;gm<{
zY20j*yQ7I_O;ww5Lz4+={SO4ijAK&IRUpfEm%BH6Go_S<-`5w?0S?+>Lq<(xCu<>C
z4QNz;eLOmmQc%m-h7Ls(SxB8e0=zXUX<+Onu|$TKMaO61)1|f=bdip~H-x<3>{J7k
zuHuRpic^aF&2-(kEyvrv2dvY%4&oH6BGbKfb60m8I_`Ta9vXTLY188A@MuR{Tvm>7
zBTL<8+Df4TQ{iqbjE$TCeze1EKpvZ7Ywm`wJ-mgx(8708rZDX_5B;uLSM8Qf+VTOY
zH_g!vLdq2*OLjjFKvH2ZhFE)7-|xj|-WsR2Nm5Q|7C^?oBfSkFj<$K4x3aPbX;Zon
zsd0cX5}v!Yj#r(~N+@`FDE4%_n?2kcUrQlj0{nm-4w6$9<WFU-Eyv574S3iTHcycN
z-q9Iak)Zf^>(OAO>(R?#zEQ<3R4a1aeMSa2z^Dp{N`eS?4xt2MSliWkqx+`aRRLd*
z&(V}jD-)u?s-}i9?&M`wl1nwSM^Q*<HFJ%dWQFxBG;5`*UMj2zf9A<z+ZjzOdS_bg
zYU^#NXU=Ov&W|#}W&gF_?x(1%F!DX>pIOagv#=v&TuJd8aL)Yw*J%Rp<=H6GG;ZtO
z2;*IOr(d9w6*73vB<$51I2E;KB35w?qgBlKPYoL=z00%yN<Rw%qb*$?=OB$i0uff)
z{`}(`%EZMMe@j>0@-4jZ;Q3HvEmiIrt%L)6!zl7qU4o-!Bf^8=&hLG%5;g}zLS#2n
z;p#hi`4<$;{HkKiy<^})nT(=}R3Ad4+w7=Ky_+#q553tuakve=tLkW!^Hn}P@F@(G
zX*IYTLvw3{O=5(BeKYSEsz2=tLx|I?lyCp$U?IMRJv!GY3kjyTU4Ym$7F6idTu3*x
z17zZgLn=~zf7)NkNM2VzcSCq)LcgSXb2AjrHfYd%G}(|Ovg?(OIoWu^Sh01x4>W@#
zZm>dK)6d@g_VFVJB=>TxDAmV9F=0Fjqbx>BUVi*`(#)p+^RSPeS__}Y?G4oM?WsFD
za@4msK!mHrCC8f$)@k(~p3owjp1eF_6H7{h%y^$Ivpf<Q`T+#6U-Xb3ykY%8f^(#D
z6sROA){K;X8=W1&0ESA1TTG;TK-X_DXa-TF{xjsQQDy6ptuOF3aFg;Vo<^}OX^87J
zlFh_Vn!@Ep33&#0nfdor00Bv=vyCbk&!wH&!+Jsly!osF>tD39$wl&{d&{GCI&;Ov
zjaKFQ9dqkHWrr7hNoku$-GOnT?tN1nYDtt;3Gn+~QAOrvE)MT;1{6h?>*k3-An8Oo
zk}RzLTYNw!V)1^=qNIJm6ekSh!+H@G=??K9UO0AFBW~%b)ur&HlwMIBmIMx_r7(#i
z%70m**9fE&?`8WsBwDe9oGXh(;f4`__MX5G4><uAb2>C0#Ju3Jxng-^WA8Y!a<qML
zu)c49$3LA`@JIQt%HL}~I|c*vOyxa)sxY`0Js9vOkU?)JiZPWs);wA+Bj-s=?$(w4
z%#gQ)4n8UfcfrXqP{j1ph&jL^n>&}#o&sGR01L;(YgbAW<im;-*)`H-#KvbwhKSso
z|6Uyn2A)~rDDYcdAk(<YOC~DX!1RaneI-Vd3fr{P-s6Jz9{hgmuvek)P0?Ei@z{dK
z+Aws~_|H;?{V`Ahoml?<l$L*x;d`P9a85(JJiw2@Wo{uQ1%)$Fr3?~99%%9t{qVK(
z{u$l8MPr-K5$|Z6P@wP>>t?2*p?${Gxi=1lyeh+Nk23I}f0X#}<$5XB+vRxPd&xz=
z>AU0comr+DkNwh*c1*qt*You`)y1opBu<-8n{iU)(WftulGdoK?;?ef{raNO=@=P@
z9Ui@He+=UK;Zgkj`lR5Ag})F^7ba2-WBl2;mSLx0!0QIFC)xv<8yxT3OMIo)W*<0o
zFq^x|$t^S;u5SPK(MGjvZgv@rK#~MH9=5uT3DISu#NMn*;Ou<dC7pnjxjn%W!gL{b
zrseLBJwbV`#E^EYHP0+MU+u^g|F&ks9-4mrdg3Y-6OdHbj~7R(zy59(0qfzjaATD0
zJ8wV*tG+^aL1XPLc;uZFA~IR|y&@g&-bkioHOsAg7B=c1^-xYSKcB>xt=`f|8s)Zv
zt@U-&GBR&k@vCCHhF9a&^}H3=Z(Ew*_4GTX46AK9+lY!*G`cbPT>5L`FGe?E7MB|K
zd>UA5Etj;A!4cwdeXO^@M`YwlJG*Vj#ql?xxigw=CT*y3ARBf69or2?Dy#h|z`xbc
zuY5yswdNfCm$fk`G9&{#sxyDdq)B+yXY^WYnPky2TLF}9nq;{gLK|_XJTcwMmF{k(
zG5?o@w(@2G^HQ-^e=vLVi@(!`)n|aHZ=p7|5D>dFYHTa+D=uzoKTql{{>+8|7HJ;l
z%lhZTo&g;l;W;4!%C`a9B0EI-HZ{G8ukh)W$W~7{ZV=8CaQ~<#O*80U{^r3JyGt^g
zV3$`e3nFvey_yDqr!CnqF7&CzA{0hd6>|Zv(I8zY%PCOuzp`H1E<1I*0{Iq>TC3b{
zfAn{cn$W5vTWxi8biQj^Pq|opJNaW&pDj(pT$NPQY~_5}q52GD#IwBn;|LTJJBw}|
zlHJX`R0P^}b}~8XgGTC)PaEh-(m!q(zc~`FJZ)rm51){YR`pE!GM3hd)_#4F%UX4F
zW?xwBJ9;>wHFID_(sv7oUA-i5d>SRB@Kt%jJRjraO(4~2@ImamQ&%-x^ByXDPo7>9
zSJsSc)WR~PKieB$%*C&iDX)p6ex56U8|@AM`g|ALHVkazRo7@Cwj;Z)n_ivA)}$pk
z_a-f-vFUxM597yzFR2=28T$4J<5PUJ2IYopF6JZ*A!agtCCBH(+>ST#v(HpJw$BHp
z^vc{c4E|%45u<@g6>jf{8=H!YE$rkQMHDs3+6Jq}##=Qpvk=Ffxqf0K?H#n}<jvMr
z*_EEyL*kD|>L#ev&zE6Avx_`UPvLYTDASD>4O!Ou1%x8fIFhgMK@EUrYDc1sWl*5F
zs#~7g4xlj?-*2bo@%6YeEu2g7IG)8IKH`5~Usz@0N`cAx+!qZ^cq7B^3^DkseTAlz
zzK0SmZFb*C=KPC5J7imkJMTup7~R?PFTsL;g{T2sv4ADL-K2*ATrOLbk=!Y?{$(o4
zb7Si1saKL2dr$}cT0R=S-4Ua`&jIK5m%q<Ps5vwc_kD7qK3!ofk4oq@7LB`a3`-sq
zH`8^!zAL`2{@Xx^wdBwv%gp;bdOV+}q1OB$Dml8yUt}%mxKOF9z4<9TQV`-BzWUtt
zZuOG&nTin)c)zNWIUecGvGaHJVU(syJW(&*zLIScCdV1_Fr~Tr@!J{&Ie@ACN;;ZK
zGeKkKRDb9lo>$~X9prETp3yS%Z6MWK3y37(B5JK<Wx&x6bjHSb;Z9rm@?D?z9lGjl
zM&?m#Gd350)>I^j%aXyOgZz!;b?{seU`sUi@mYZsnM==p^8N%PepU!KT%yIKM3fb5
zk%(&1RlK*|6_Ud9rWPlvNhIc`QAt)g7ysT`--u27!Glg^dqEe`hLM59HxX914E;+<
z=$*C}RvZ)3b#^s|C$HFYD%U2<z6E9p|DSMUbdKiL88`Zo(=Tl$Fc{89xi;e^-QZ6h
z`z{8?e=-qARrb6|1s0kpUCmEMZFVNwzO{^R=S+O-&~G#qA0c0a)rXm`Ih3Q<BS8U8
zFJZos<i1a@4F}-C5yTy(Ehr$1Nfhs;Xa8RKsnB@6{|bVyE}DY=#rNOiqxIr`T3$m-
z1b~a^Ulwx)0VHsDe+OH?c+<76%bS>(khV&Ep5~~6CId0<mpdk%2SJ)Zo%?h7qv7Ab
z`49R^?VKA&`(A?Gv6lqbokC$xt(G-kYF?fK+g=J*hqcw3Sd=5#dm4RLeXb8=$`@Q;
zj&JrP$Auc?HEBZy?RR@Xi9#Cl=AHh6QX%<rww~`lQ9JJJsI@MpRS|1~u+oW2!%9)+
zuIN<8gT6@Pub{sJ+=2INy6i^lXVwSJV$u~FveFj|zR7O?B>o&<=UKYP1Jk&2kUNYc
z5w6?V(sjaAR(Ka*e0>jF_=z7xf)X#8kJJ+#4qhC19}_^^1}j3l=;Z5eE<VawqAD&Y
z?IIzGYd40_Y7UoAxm%7ti|2_2R18=je1ZG3Z+Lu82MA6r(Mj3gJLv<6SI|eyL6eBL
zbavX|l2Yp?2FANN>B3IH^6`7J{Sh%t+$aAa{Cj0Bpc=3OT~er~Wx@wAs9iz3I2hy!
zTM+ijCPJN8L&zg~XVP?Xf{QjxpzfcI&^D~%LLD_t9YL0P6lb-rv!B3#^#yN*C5OsB
zrx2`;ZteYFkVPG$h{s!fV(lE=P;vleVz2yp+b@yS1a&HTGD2Xk-2ywSrUZ|*m19)0
zuzkqz;&+$u;=1j4T<4qTEuqJaY@`P(Z{J1s4i1TQ2aDlE;fKCvpiSE-&ex2|)6X(M
z{=)34D%ns}EPpmC1(g5k1sFF&*IVk?YmGD`n&KukvcH6u8RWaZNA_ygF*qo(Y1jOw
z9pk6Vr?2%SiSPx#;pnvev_4q>XQA-$_@l?WlKRhq;nlM|V;DazSL-c&0#%qoNJL>e
zz)J+EvnR+P)p02oA*lqPjh4p&C|%qGt>x${=Dr6Q3H%=7f)x=neW*U$c?}c3x(G=!
zBy-YR;0Rw()&tmBYQM@PyaJk&G84kkUie(I7iK@+h&9~Vs6~Rccx2IvlmD0S#6ra0
zod%Ug+m$iwjl0F5Lft}BM^K0mpYQK|?y|ZiYLCYu!r?^Lab#7AzXWSM#iT&h{Zf_E
z@KOzO4H8tzYjD+SpkypU>j!{}Q*rPeRhQi{hj!PpPYDB$4(S_77uX#itj0;9dF5hr
zSTAOra&KRdxRXWo)O%^XSFgxsHW5G!Gh*ePI=%}{qrDH3V#QK4Q&O}HnXRAQr-!IG
zT7Oqjr4XeWjz1lqhP!&*EbffU3ufr7x?Lr_NWd5DX0F_pRV=NzO+hl{iP^0hEdzsf
zJYDLN2^UyfDy(&E&+i_e9e2}f3P!)fKl>*G$sHJjP9#9Guhq9_Avd%r!Xe$dLp+3^
zMXN$Onu^NN)ip!0U-cIt^U5G>h6-<n6W$lZE8~hwhx9Dr=>|%Ih{yNyMbzP0%Z!}?
zxX~5A1|Drg2ajxSyeZBI2YiHaB2*IMt({%-J@92&NqmXLRxUm?XQYLQXJ3OK495#4
zcky7Sd5%_Gs<h70ug6@&)%Bn|205%Gl^o3}G4-6Ts#(hb9&jY&shs#s&4AaG@Ri5V
z{-{3VUn)uDG`XnoRmNCuV?ugt%+31g9$K<U(X>1V!#>5PM-rodt(=NHwXk#f$zX8;
zg+!6n-NiN`)GVN8_9wod(JZJ7q8OlcIX(y(9FI*$Rs48$e~lxiQ)hSH{T}k}C6I)j
z@8u_B#WFrJd<5U{Mta#PXU@zU{mk?CR8&;vHE1{?81=TcZU>q21=b7Id;pRMSZxW3
z{0t5=pXIQ9jvBB2^tXADS1X>23y%U$gMSDjlGp`p|1S9q>*-|6GLOF8ZI@_sZ!Sb;
z*4{^(twJa(RB($+Gq8(Gy<wNn^-I&mes#`DuDpTLtrdlKk2*tRwDI6Qlu)X?!<^Vp
z>kIL0gZrk5LyLdl)b1*v+*4os-1C%Y_s?U}oEVp!7AC!R0V>C}GyL~SGjA8{$rE9f
zNqm=6vlV7ab8?1ohsJiJzQT`L$0abZE|d@qtZ0FtEj-KSj@OeAQ0TN_h}wC${>0<m
zrOwjT(lhqiu_JR@)p!dYV_(tU8daowOSk5-%Z8^S!*7j*OX^xrTnT7zk-N^_6dglB
zW8zny97$G&iej)Fr0>X*SMI!*ky|{%K46<m_y?Ds@_tcSaBoGxDKwrOPQNd`<1owU
z&T;vd%8<Km$WzfQnmL0By70fp*KvgAhcRMByH#WV$NM)Il8p`LSl>9@ep&goV_y(1
zcyciXnVD(-D;!D2*R1%<v2Cm6(~N~IXZ6cahw!rNczfG`H3vJV8Q5{XiQJaJaUrQ4
z_PiE%yyeu|`tpGgTE;*VT9WJ6_UNK%NqmCHm+`1_30Z3*>SrY0<;CcmY~iCWh1u|M
zwQ@`s1OUIU59SemOHq5xoT@pb5sC*+GxeZ>noihCP)400=Q!ehJ1(kqnx+cp^|3K+
zEE_b{QIlq*tPwvGXcld$b-^B;bXrZB!6g>rgHrsfG3n@lt=-6Sl=#_4?eL`o(c9W5
z_EkNYv_ny=*`i`$6^#c*j3762O`nxm8$_nD%v%u8CgcMw;jx84G!f6Hq9F;C4cc<i
z9!H}>LfW6)q!BM4q)j5YQ?NI-h9tzemEPfs#5m|k>BlrX7~dr@!&P7R=^{8exMqe;
z%H2I5{VanP`kciB-N_d$>v$MmeWCNwF1(H<eTI_CxVo~&gDEtH2r!?LIQrtkFBbeK
zmL&$0W@*H7tXbZ#^>2fTvFQO<qK<PnT<_aD;sxyyrZfSXY-U2kbM?pVNwtHHrykF(
z^#QWu2D8-_qi4}^Vvh3tdRCvd^%9=t17gpGx%L=8%d=CP#lJfMuihvFNx@s@q%`3y
zz14Qf$J8;t%LR_hwqX`7vD^uty!AmuI-7Hko*o>|DwKWOc#xHP(vg@_*EwnlwDM|%
zbWe8EI&X#grrunR8fRh7)6|~9Yv8^>L45RW|Dg2MukJyE4W9l-z5V;*ICC>4PUmhl
zoaUL>HA=KuRa^RV>iXeNxLlEIV8Pj?oDc~;tBymCsiI32`;?ONtQRK^GWQBCd<pb7
zDvNbNhd`UW$@D?22XN_|AW7hP0`3q)7y!WOju5QiZtRiiJ$gz;7j{Kf{6%d-7-`tD
z^`$hw5gLjGf|usJZ*By*5hQrcA9%_%Auf@}(xTIvku3_J;^@qDYxHB-9Uh_^eAk=D
zXNWg}JAI4*Fi_6x^tsLYgkz`HK7(AH!EgEDn)Bs6c)-xr+c)^7$@i&EUS@dW#_#6g
ziP!U5!}GM`IAgA$y#2k=nxFrPjjjhQ`O(744nTm`Y)}z{Uj<a&baX(aWrQ46KV7q>
zBtoYGYAoq$r|SFxZI%#-IU%@=goaPcLu1tQ@RGL9m~UoF{We0xc4NO#4o*82I6$Ga
z>@YPP@h&=-tfkCH@Lm?|{-tggw+mevT=gs*mrx;0J6}Yy(dk=`59~(ub1S9U<}s@%
zUHT{*b}C)5A=9Znd#JgaSNU3>Y}D`qW!cZxo9To~W*lAzVgd3q?@ZLlSQ#Q4<7HfP
z%2dx-SF24#K79O*U3Y%Zp1gI$S8b|u46-n;`}EGkCygJT?E*6SJB2nTUFsvroF4T%
z7-O4i$Md+HOxBic52)p*uVQ71PV_iI;fx+A#b8|nk1GrjibFm-24z&vZe1TQ9%{WX
zQ4&ns`XwT%-S;or{(ZVq6LxKXsUgn~(!)TyBH*yTzd5w6Xx}RKzFBsO%;0!AdVUtk
zT26gw9_-iDT)>G$ygu|D5s*k1aOjL$TH*$gfxaet&UD8$`R(LahrC7OTJAs;bZ~J)
zqVEOBK$)%z0YyPE@rAUs;^J3ld42o+`i0<+nVwAi!MT<k1YR;V9gm^nuE_rA#&ct&
zj*4Et7d`_cuoyKq9Stx7?TX?TYqOwtRx)?L`W(xqB=z0ey}#rpy||n%L*Rr}w>N)`
zkjr$l$)<gdLG$!P_v2Pxh5`YM9|p_)nz!Qo^ia^OZHK96#iOoL-@4#<^)<KJukQTK
zoZyP^(xR}HQXw70J$u^E*N8a|_Juk1tvn>Y<nPdff{fpioa6bO4~@~h$N9{G6rrAZ
z#0hTg-#}KR9ntl=rpn<hf>^^nU{vD!9_6^;@)q;he7n4?T*_t#KV%LxV!H+$esY%a
zc_h8t?&%ow@qO?d5C9Xq9}Fh2`d)`RyL#QF=IdX?`A`lWrV?73-~9=8kdU~y=Xu}$
z9AZfv**bckzS^9u#-I*r+P4@;>iF&{L27Q#M&iysFGW4>%e}7(@~y0{S6yG;87%tZ
zkCpxVTSZcrDhTA<hzw2Jd=4kt@#()lrmTCigH(rb@W0*$z@B<yX?ZnM6NQtxV$>MZ
zGe>%f!W#*qluyp_tZA=FSsid5Its0u3iz)hB-(UCIT=`mrP#sl!S@loq5L(jVD=qi
zOls%MP-84n0N_)6UB}PCLE~LGKuB*pp2;DPj}*u^yUj3j;97B_TfwB35)?7issaMV
zFN3e%Vs1e@Z^Yhp&FQ=K^gz3g24w`F(ZU9;9m{ojk&3N7GD4AUnx8LakC(xg-ZS^y
z;&JrYv@HuCbIP;J?}B$TEt)SMTpymHoZNhPIC(KK0N~XhlsqLvbn;vQOzp5+4mYRz
z`Br$nI8kg)O>H)xcY{MHtq?T7Ci?v$oQSV^bQ6cy`EQP+(586A5N2tJ?n%vgo`R1n
zc~5|Tql<HO>=#6kXCrTQe)TmMjP^2HLyN-RfSQ_BBlWzyTUisIBOBlc1En6^**<ka
zED6K+S(U#`GA=|xNjtahu=X8yO%@Ad9<<x0^sLaB<g=L15y%VDC&c4^vBh7pp}Y%@
zzFZD7gKx{pT3+53NV)t$8;G@*K^p4HWY1*-Jl3o&QK4{ksT8=lMN;!Udl8VPhoHb_
zUkp0Y38fyXpewRIpOAfq8g%i${1{h1LCvHEv9mvNEo0LI6QLb~yYR*ElZ-cO3Q32*
zbys<X=iM>g1M%m;{A(A`<fNwG9Z~Y=i_d!Z<;~6MY=yUVl?tQ{$|#v*aoImo<5w%U
z);L=}eeZZ8-vtWa*?8<P=>J^EC1*7?F*(G=*R4N&2A<U6r@I=cs+y`gJoJ$n?3R<B
z{}sj{;~X>}-z@JqO2H5|m>{dtSUL!Kn@C|w4!mfSl$?y23>icl8l#*DBbj@{p-$-j
z)gRjX6w3tLNqg~4Lqj9-1co$mE{f~Wq9R~T61vrrnMUG^9CB1uIGa9YcU)4|BP-cB
zUJJp=dA!L<*6mfDQIpC4dNT3w(^)e6PrtGXdv6;!q`BSy#hI1N{r&Dl8ZAZR+)S1f
zOBhM@Q_AsbozDYbV86Vb8keC@v(ZF^{0Hi)I2stpkg#$QuRed(eF4X(_=%?e`<G5!
zs)78RKxpI~V!XQ^uV8YlS(uS>BoHGGP#<_M<0-|=SqZka%0Zt?Xn$}=UZ`y#5He2U
zFuEUcIen_Md|A1tx_KZ-NnI?c;VUZkZp$w|c}DWd>TuiZ60)`Ryh&nY`nmJQ*{1cn
z#<SyTW^p?#VCsR?(rDG#^QNuL8eJd=gK%XjW55I|b3C74S)*T-{m}(37aULQI6B0j
z%$D0vfA_S&=xeMlzC2v64DDZEHBeU%>_)u>y)Plx3SSbi_YP;sFzLIx)2zMSY8x_t
zp#cV}=xbILplvW_04rDFY6?8d=aL6|U_q_JCL!B_>@Z_=eml%=ZbolDOLXlmX}Qlj
zxK-HqYERJSC$%eN3%c=J$`>H1{w;{;h-U<T%hsCKC<o%(dhEo+mt9=sUR>r<`KUAg
z3ZzL#oDpm8_S7Wk7>awsakFC!qB{wy(^ri~c2pN8fV-hFs(BoK3U!S|st+0ONP`6r
zgBQ$CoXvE$)pej<0Ga3O`NTr__@XU5#*4M|3#KkgE-v+uyGPPzCT^~V+E%4|X|KNG
zc#U(n$E4%RFys&6nbVKwf#i;e=mbfDICBE0Vk;~6E~!uAazxw$E~O7-`_*VHvL&U(
zKdeE1N_DOVLowurLowK+Z)Esh6RSkL2UPwuGDi(GzvJTjp9J)-BxbSahv=3c>+cRz
zUHT`dHen{ZR5j(W;1NHiPgE*V1XQ68A*>CJY+z$W6H5>0PQ(;#wkL+2W6R?DP~9dJ
zd<S!O9r&=%m30ag*4_)4`}$uiM#JO`PW~d-+XGE}yGpR%>_3*VAzQ($y6IW${_=*>
z8Ftya>;#`Z$dd!L76V>G)j11V9w2cQraRpqf(ER#p{SQ+&|e^_4<bl7aZNAJN>We-
ztiN@ITJy*9RC{22sB?x&WC%ne?zmPdE~L>TdP&_xrVQsVF)CG%)#GHpH=tbL>auV(
zlIX1et!<^<>|6CiZ~I?_`<7WL>k!_7Kd{j^ATHP$*?u$bVhmzx7H9iVZ+7#!SkgV7
zS`BM*>>4WZbQHr8_`nDxCmaftQ}p)%#Gi4aw~HLKcTibF15g2z$hi;Mb5sFs#SVp*
zKLHCEl=Fc59*azBU2NW@sFj@JhTty^E#;q_1fW&yx}Mg<-M4lmUf(|4L*wZ-Rzc?6
z&%}AEDrL5gQ5R6w8v<BpyD;88Ne>@yJG&79SvwO&I4UwQGrzqw9Vto$I@&a^p2`%}
zYtU9ho^BRM{Lo21mM$Z;!*mEAk0d+ZlzUl=A5tpxCu`|G_c3<LmeV9t?A!Ymsg=n@
zNB}_kjQV<UrVk>D6I&tWy;_9mkz=*{dP-FIB5eH84|(9Z2eo)xbw0;~UQjR*hxyi-
zZxgN@vs0>nt6}!bZsd%hu=PT%%V{_cs<l9$@dkE;GfS^Ox8Ct{gk#XHE7!s*U@w`O
zZK<8DYz}Qt6Le=O(|p;RCKX#>caseQf#yBa!_b}<9ANZ!H1yTG6KjykI{h^|q<C`m
zz1RpMm=G%GAZr^fK%#J=q>&swXvE_J7MxGrYxcO0Ly7djPHi>>-Th2PedEZQlfyo`
za;Vv8>R4mEC$Amq8dZ0l7xC54B1lp!?2u85qt@Vi>bZxq!PYl*f_<Sk(^DH?*jm>O
z0;$X6V??s&n4-+ADV~LCVn2v4o)<C?+y}jKM^@Zb6?gE73{GN(cqC{lXM2{1vx%cA
zA5D~D5sz$eC{O8}@`95G4sY5nE1+EjJn6RvNUw>3nMFShLnCG<A%K6`|Cb}R_vn2%
z2B49qYV)0YiCTjZg+;}D=y);UR~E#2Q!>)>?kb0vpW8^)-`4w-r8oV>D*HTHJad7=
z=4uXx*2CDzvG&<^TyWlOHe-3<X$hT-b57T8Y~bKSSmvBs^R}6`{e!n_@yyFr%~L;q
zd$aG^TAGWVw(~T<tG8|NxV@&lm1OH_XKk17OW$hC=+_JV#%oqT-Gr=lLx%gbvREzC
zu@TSNP#9?Gy_(ptom?Zy@_kB_;CuMOiqk|F2Zs6+Xu~TfnA*%_v}y<)d|9ZR=)F3U
zTyW&TXkotH&mophUTByxjGq!Ag*&EIw^>u(_(}5}`^q_dPhAO&Y>^-p{I~j$-8IXm
z0^{Ft($G#?XfJ_1>_bN+Twsw=xrI|I$a6oeJJ?rd+HD}W_=7s({7GMadhxBAl)=iU
zhRu3OPNL(6lCBc~ChvMW@C*DX9LH!>M)N{R3u9Sf5vI8($r4S7YY5t<oIP`fBs9iG
z)As5JP4a|xWG^493b1Yor9AN9KK)p<$MqN(69;<sS*+wLC3X3d33%^<fZv0OKB|*x
zz<!VO=^zayS^CyS`j9$}hrhM$9^JEWz)W-QHq6kV7C;sbfaKg4_C73gSucP|b;HgV
z?jc5Aato~D#xPNS-c&e?&^buB35_Yf7R{8bC>sH%8gr`{V;8=wqekga?{9_N&KRYE
z;;sV&SkTsHd#_Pa1Qv)^(jyQn+8couuChf^vAabm-p=YdTL-<|a&mHW_LCNj_We<o
zOmi3!W~y>qLMdmqz8$#0f9N-49%vinyqH#GPE2z(xU9_Xxrg2RW09a@SQf4^!%dsx
z{Qx;u!1_(w@4DW!d+M`sSMXc|ue!F{QE-O=&=y|5{PbGl@{<)Y5Zg37Un8Dqwe|4E
zy3;R-O9d?9XzM+QSKSLj-0DMNy{oV5;$V~3wt1H`+q3_=S`JOy%e!N>QTrZg0?Z_#
zSvv4w6XPN5l(K^73ftdy`gV{9u(S&uuF!_yqY@+h45Tz%M)x`U^~?bfs^KSTS_Cw}
zoIS$-Mp|tE{1USl6SFcG-=f5FpFa+_6u(LXZ4;Y*FcM!X_t-@NH|i3=04zuG)d{3z
z^JZo?F&(SeU`4zxICwUrfewk=&#E)q-Gg1YR6zWjlkP8L)oISf{4~(;2hEjf<y^a3
zSsDg{X{Dmyre5j@`z#%P_^@A9dK1Bf9v2^PGJ^#Bc`AypS3O-H!Hl4b8juVB?wqpe
zdn`!?Hh3Tq%lut5xm@PQYcke|Ax&Uoh|&55vlgQ9vGp?}yqP<Hq!L>zC&`_?l<11D
zJI|K}g_8R1gpDHQ=WX#bXfL-K%m_W>V#?<24Rc_0;0pc~bz3dveNjxeeVb*Z{J})?
zdZ|NBZ*q;>*tgJHx-4r`e&6#co>O7>UMW%=>`RYd9yhs>tb$%$LYT;Fz=q|^Y>bWe
z`zPk{jx!7u70@rzCU;LupG*vzDd}tGG*1^z4-b#X!WACQzo#9$12+wK%RVpV4|P3A
z1;rjWk*A-Pl{eQ1ionNHh7xUOyZ9?E^g8l!s%nw3Nkxdgo3Qn9rdj7X0RaIRt)#nq
zDl;AV)h<P_YMq-MlEoJ-G2Q2x*A8J3(t+_*s7FD;zS*xwQ-VnvoOGOG2p%Zmdjv_p
zY?W<1o>K{u=8`HIZG(AX1$YI0m(<#XU~J1uOPPjIjD;}<wOI$SI#cGJ+eyt|cIp&(
zFU%lWb*TWYP}B=K2jI#14~ASvP&%%z7YGfLfnku*MUbs2r#5G2mGa|i8MG~5ew;ir
ze(*GV>_C=L5()sJ!eX8lQZ=#mI%t$DwaQ3uj`QIrvT3_G#rMo?;QlyGNA38cwKH9s
zQ4+@w%Su)9Wg*SRcOm*ZeF&3j^-Pk$G2RT0E~~TS@%#y&_OQ|9AB)CxEwHxkLj+3<
zMv?<QT}`cAS%u|(iH@ggnNE>+=Q=-9|Fj*w&vx4a`yIM&D`)%X-l2>L(Kr#5!za0E
zQa<ET-fFgw-Uqcm$8aPLh!2P|yq=4=Q?uoXfK|_bOb|hR2|Nwcf@g=Bd^ayQH<Q2h
zScOdusDVJOipJaO<<+JqUC*xW%LhnYH!|R8p{1e;GQMw4NXi-p@B?eRPnfhUF8#6~
zdc2m``8whF?($`Ev-=B)kE6JK*Dc{IFXadBg-WzsWl9OzzkuUs-`?BT42pGNPZy{7
zR?GhRRy3F;3?MSJzgZj#HC!T)%nWp6$aMVr**E!tJw!1DV!KD95Y+zFls&YVseD;W
zI+i3hjF1Z073;KOzhI>MrG3!jSW<@YvxR@-%A`Cr>kdizA*2@Qy_8GjBGW2#n5zq)
zU#<l`0QAKutRzqt*S#5tBjFLdl-H!Ea_C^N)!FdW*4D=6VAuEMd3_OHXftX1iF|)m
zCPJ2zrPHXMj23@;H=I6$f1S8Umq0_SEl=zpk5tmFsOM!5^$y*}a(lc+lY5gqRGU9x
znkQ>Saq|1{AA-!zm7@TtbHX^=k>j#X(0W2h{!UqPhOiHERtZ#BqrKZ&^=XA%0*VdL
zqeBzyPzGb{J8bQ|Uzx3oUfb1UD3Ei?^K_-nS@>=nuFBK*V!8Cm`C-v(hbWb3G9@+D
zD9`e{^XlEX_w5W-$I!}k*TcZkfbW4+L9fl-)u`ubGX$N_==VNc_?O6sH?}WpPu#R4
zpB;jnnrEhZ?Xi*GlCb?T_beGpvEQCo1`<zMc3lj<6A@fE!UI&&>O94U(ndan1{pbh
z8H%HipKrgF0tjdjVE`1sTyFg}x65FFy(H{x9L|mo1LZlka5oTo=wJ*x`UPSIfcY48
z&OE+Rz%N*7QA{Vy$~sYV9igI~bfa9igRQHJg9~=85deycm7XWdv=$;44L0OXeEX#N
zEkRYH9jOnK)sVPK&%x1`%0X>}-1GM~9(ds>(s#bkZCbo(*g-h|PiB+mg9;$P6WT@)
z{jLNrW<%cqUb5Cyh>lqwcH2$B{=tDJ1*~4kWI0z?W~(T=60dt~wigKHqvdTHdIiM%
zJk(<|!-AzG9s6)6m7phEGCEsSgJ8cRmxb%Dfga;1Cv#w$vKsX_AVX>J6qNOi`rTha
z@{6O=JMH<lZUmWh*&a)QAWEK*b^`<UX`gI}HDy4m?X6cX*!-qzGJ&W5=y`?JC+Vpx
z0wbcCy2E9-zq7&DO@0l^m;5fbwGgm)LlwY*I#$?mS<=9>^jMS%Ex7U)`O4dXZ^k@N
zYw6L`1+3h5Z@Mt`C-T%zg)g}EL=v6(^MktA&oJ4~jy{z$teo{-XR4_4=Kn2W=eNAr
z8glMKaK{F?L*K#^5WVttYUeEo(aVG}+$N^aNx~n1@t;GW2gB%8Gp-rh8Svb2fqC>K
z{yq%x3q$c=NpIrnE;K6eqL8;#DsHOx3(d(1TeN3{)<yR^#}FUXFgEJ3pxhg{nD(h#
z++_LcF}UNJlG}Otl~oXl^e|{(d`>^8>GzC<=^$mBEcSlK!it7XwQ6Xvl@u8L`F@vk
z_s6?<CCZ|Wa0%$KD{Wq%*WylUOuD=z{4mul%*WrMbErp>HMWMYCZG+xiYtz=X^T74
z(#kJ1<tMZ!wWdPprk64@w;{EuY$~7qdNv6J0aa|Ejoj(>eUPq=<M*eQMv+Ol<+6D}
zH}}l38*6nuc+Ej)R@h`mWy3pk?-%_C7CE-uCZ{GLt#`Xe4`bFhy%mjTlS1APSJpRk
z%kzs`4fzev?!A>*Iw|u$7vNaLFamx7m)=oqGgcwSZPQ9lnf{sskgrN#egCo+ZN8H5
zb1pp#fiH~8%Q47{Ye%N7VKhdC>&VMh%;$M8#n{Z@S5DC;YrqC6t6}J`6$Hl$`dT5D
z_U@c+ln!|p7D@g}_gsMuaVvzcl^-2~@vB;Le(87PV%oPsv(0f`=-IFQMSCsFPdmzF
zb?#xwZr4p@y)^iR#q5S;4jlO+Dk<JPjfAnyA_kAi&mdTPYx(%^bJCXkD9}m_@`oLk
z;bT!^B1Q_g%3o}@S4#l_%tK@O9KD?-xH#eUWZ0o0{rN}jiQ<9SC(3&_0x0Dk78)j7
zhVq6500e1h(My;QS8sJAptTA1=n7iIcGvBtk^ZUcT;*3NXlm<m$Li4AZ~WQ8A-%?Q
z{P@r0t|XZjLppuf7W$a8kT(R<a{AQY!0vnf=;PBNJf9j&GmW&kYN<>@_|f=FqFD`d
zyWvET88f}l%uJ&Vs;W4vTA-w-;o!;7XY<(R;iIyRZWJ_%>c6Rb79XSb=NSU0X+Y3(
z3>V6Gf>Gv^V_2QvY4$AsmRGd>d1@?@%6e-qx3HywV5A2y3-kM*UI3_k2bAK3pAqR?
z+^`naYvw{5b>36jXF9XE`by+?((b%yqkE`VHeY**$UqzDdc8~*YOyj}kqQr=g*|Ub
z$wA2wn8!3AO&gDhBE1wQH45zefDm<8=3z|{QOuT>1ZN#jq}jR#wb^!Vc*V+6OMFbU
zPsq$DN-=(gSDcf>BTKujzH=Xwe;Ds%0BIcCGb4SCKVG&(qDbusZKrj|T+jmi>Jl0$
zAtr2cB!MMycl4QnF2LAV6*sfFOoHUZX7tJ%+H&w-PHmI9@(4+SFohqbY$hKI?L&%4
zWS?3|=rB2j-;a#UeNuQkgBD9Kx;Q+>LBU6E3eLRl02{;o^3I%igr<2(gk}|cS4>3>
ztLrJ4yGR5q5ix`PzY|MMpWp+SB9JG#ZqYu-B8oKVyiLjPipPKI3)D^HrB%AGC-1l^
zmhzBRs`i-`wy|wn|9!}$@wqw%;Qj3r*#@$o=HkMVOC=5M2f~$!gtR1_L@v(ZHXFhX
zKNJqR=IL+T#04G;rN(W^5^_YG;5Pco<O@mD9k^~G$H{$g&ZO)>N-U{YR3W7>EeGoy
z@!HPTn}wX%k9qdl=)B8eOKab)f9GGoF6;fG1s8%jm3>|q!uyk>EvJ)OdxVkx+zumB
zUa>MPJJ7l^VVSQ5%r-gt4zUoX#+v_K6);e+Kd)z!g<IKKLPp&<7M;PSOUoORNssWO
z<rr8|b{3s!8%fUY2?~r}RbPXA$0tj(>MneQcLwk?tKWaS4FGd~x(E8Q`Jt87TwF@x
z)R?6jYwP}f16X72`s(z>eE-_us0rrBkHdj&bPfnSYtTdw5-#k-tf?_oQ$W5~1*XK{
z1{kx0fZarzi9!3%-FZ>$xH&i^EM)J1YuWsp#f69KwW1MfilL!ltHt%oY@_zr^;l$|
zn8YGqb-~M>cH{u&G!@Cb1TL;^6Ef!H$l!;`=Xa>Uc>d2~xY?xx&t5$yuz61daZ426
zMTDOeGQ7nqPfQQOG!0o1IM=U`712FS&+HpTh_%?f+{CkxCW#SpKdvH)UF&Qi$91O<
zZ4XqB)WBo~0~2wUhL+RX3Tuy6MNSI7CG+N1rh6)>vRQoI{YbImDW-I5(Y4m~!P*UT
zh2ZSbPX(~X(nL2p>#YOLeken}Io2|7lO<$gHlK|=E*WGZdQeD|vJ@qK2IwxI-k^3L
zc1`bAG+gcM<pQFgYTZ*$yrO^{`k36|qoX9)dj?ktvJK;~*eSm$veHPfykCKM9{LM>
z&<EbD=e3`EAH*j?8odko(X*?^eD2{;F*<?gLMzU1X=Co<iTx9trNP8h7nPfnlheNw
zzQ2?vxSpgzJF-5=695=>&;4>WKMr~NtgO2d<v`s$A7=GwL|eJF$yWXkRz3HC<d$4|
zNQ4MRUPFH*xw;JeY6_-z6iTw_cj&((S>Wr^mM&pTq3?M3(`>VR6Lhgl#N*7s|0hp&
zw}V4m%R1B}k+Hn++o1^H7`41L!YOYxE;UulT%w+>ywYuW7<o|30;SMI+a{m>CF92p
z$wb6Bz|Uk8%P(&7KJV^cQx9K)e^4ep@wcx-m+==nqvyR53|K+*T;zR=XN5wd#8ht>
z&vNoGa^Xt6_~sa3BIgW`NBDcr+UX-Q(xZY4;QhAB9pZ^LCBaBKy%@zS&H#2-2(;QL
zg-tUv^@~<^^LO~35RMHy{sq0X8;8Rbh~-8rY0tt2gngwwzoMw9$lfw70~5c(Q5FB;
zlAKo3LbJfOC?_@lYjU?s>>hKCTsWqJqmRw?{!-I(LT2I<9ael&bQzV9t%MDm2!PTe
zlf+_s8UPUCdmbVU_G9;MqHp<m`-x6YE{yPh@$?lAQMJL|OD_Vl3ra|XfS`0N-6<s?
z(%s$N2)J~2N;d-1r648U-QC^ccX;o;-+yqPbLN?uU(6it3w4m))r}Yv_lWx&SDK`U
z(G)V!==MXOlZd|k?C1ySm^(C($wSME>$)VCZ-Ahx-8=g7<RS{4-i%VH78!jZ{(V!t
zI~^NnRVtDB&Rzd)nT$Dt(B8wuFamiL-O=yw3NMZMQOsx%)nb7MGt2JLpD4LBuJLQ^
z!6Ap3Ckj@Z+^PF4{ZNheW+kC|0kcZ~owjQVe&Ni;l6=qQf);YK{FNfr{;wLieZK}5
zMdV_NC5ddD?3EikxY=wjD5BLmSYGI7j+uDpBKA<oOp|x1LVnfN^l^Q>O8nvVX03PE
zk+<ifz){lDaPB}N^h;@%%L7Z?0!<h?vx9=sAzS+NDXRI6A7SI0-yGh?tRjwT!_0+>
zFw!rdHCSQ<VFW92D7g}hFE8GCDKcAZ7$94$4!kATYSrIoNl*EW2>pBRSS{k9dNs^@
zWXIrIQBa)yq*H&i%ti=S{}+KoPCiDNOZWa`y&M{A3sTd2#?YEfGm#3tA9fV8uZyw4
zEoMKadGig=g86&W>Ab2S2U1>XBJ2~SSl57#4I0en7ZcMuRy=g>bN*-imlfZjvmw1K
zI`iRQ1j_Mhq?n%qhC^_5d;2MjSAAG`OG&n<4BuhvmWaNI*Z1?)o={XL{;hkb4g&es
zEkd<dvwkOJNtWV40Z&qFCbte=3HogfHQHFEnnNYyDN-#I^jWCQH5sD19=p%-CaCP4
z@%*N`l$!Rr$zHVedo-lZaag_{M+5De7w#<ym<Nc-Z0j3U_eA_KBz1ix=&x%%lz=w|
z1&=j4Lcc$3!vwuWeV#^0LQhoKszWaB2Ya#JzAfn;9(>U_hG_$VRh!cD7ts!QZ%h!|
z<0A5dsp3&?X#;v=2Q&N3#`2`CUtFX-^v&2BOQ#HPTElxSD3a)C_da!M=5LFg@UWL2
z;3A*gO`Q9a`-QDpc{h6*gQcDGp3opU+|`hi!@q<HFgv55roR~P1<?X9{B054mCgb-
zvTaYo<<m#;IrY6Yt1rRiJb|zTEosF$_m&j@m?T(UI{b@+38hf)#qNn;1E-*^8l36H
z`t?FcYPG`J(5sGRBYUyH+FVmoUqOhmi`NHwp3s6Uj`AK>8HPcIa)s6l*?Tr7!L+ft
zxxLexS0uQijh&JnD*IV*y{BdjoJ8;D4<8hw59Vs-%b_~$K992s800xug@R9ghx;<1
z#GSe+)yB$6+<d1im?~IGzB+v{gUD>%fsIMj&*Vr8nYi9$@J(W0K(?RINA)M-9eo1H
z1L~YtW?viFkg(9nh}lepoPoO)SkACTeIz?ZZkAU4Z?CAObD28Fc%kvl=wW}I?R7C<
zZwc0BB}nMQc{B*ZxfwO!5y-FlL+w_(YtYY)&y~@?NfYl+fau-*E+NQHKoSp!u_9HG
zBZ#WBvZyh8%lVTX)$|#>%RFT`&gdob3?ww4ui1cNF>opFM#99z1j*M%Ps|*eWVZzs
z>Ii*pOfiiCL5N=xi$el(C4}SZFXESu?+Ym+H)$Ut;XgQ@7S<y{ztVUFeJ0D^r*cGf
zziL<hSUlWz(_%klcr9_b(8`u`zRe-Vb($<a=&J3?D<|lg`glf|#O|D6IQqeEAuuE&
zH<DmeQDc3a36GY+?Rt<#PoVqFfyMQ7lQ;kP*qH1s2?ytP@1Nw}clDJF&<u}*0mDa5
z_a@-_K~LUY>E&^JpNemUK@?*UuK*+Hh?A!*{1|jcg5#rRXS7uS+@N=x)h(S_ACYd^
zkkCud&#R9xFW!aU8QT{oaC}xow9J$)t0)-vzyC)vil@DNbUCc>+{3)h>{E8#7)uk9
z$3`xGy$_43f7dh|xMRKUfDNNky%FLlLHT9Btnj1O!X)8=&-MFVz@8+6K7u5kxs%%8
zmiX+0ejyO(g&J)?(u<H<VJ_%S)t70N^c4TZI5sm3$ic?I&u)t($F1A1lpfX(HwW7O
zq~G-Oe0xbloCsVCAWt-5RFqogse{$XJdc>N>2_L<8WtYY%{1tGGUDosi`dIwb-nJ5
zlk(9}vH1qvx3OHg<av(r2>xW%e8YT$Lxhii%n9{95k})ZYIUxA>RDF^;XNLp8;y%h
znV8`&bu%<S7-!;NCs%5!{1^8<P^!@>zu8+5$8denJU$XWKEzY$m}~J_&$O|4H^P5*
zvmP-Z2$mr9eO+m3U?nCi$r;->!v{o}-jzW}9!XH@=#h=~`}<1ofSg~8w+V#eD;v{_
zm7RX}w_ibEH`gS#gh-b8WEMjWc9L7#6i6h{JUW%ZnC#6eu>IPXk)x{@CTVGK@3M{O
z>uEJcs*~Z#$7qU;>}dOexhIQ$+h}<>(W_4u@EIH1s&m`DUC|QL&kAUMWI&oY6^<mf
z0pDmBU1{!=**ylwL70QNufm{I5>AaQtnm~O%RV`!yU}!a6H&h{`^H&{`t0!pvcJ{s
zD@#75aFL_8(U6S3H8@xX*Cc}(pGPj9nwq9qnKol#Gw`FfsQ0<(u)v<j=HN6A$;)p&
z8^TUT?Ak2*XFv^TS->M9++_KF?9`NGl;O~wQl4sZYi;m=-IMVOc39>}1X;}Wooi)V
z^DiW&EF9leaj*C^v1Mkv|DYx&me<oq%$Op|8W$U<#54If-(FJJ>5QwUVm1cVj?nX+
zf%Na*nEZoAVMJ(x$R(F2qu$w^+&o5u<6{H_@2Z%H74aw*Sy<Pi52LvoYxwTn$XC73
zzX^l=rZTf$6L*uQ)cU~_&kHCa=C+qtv!yaHU14PP7DEuo?<_zu#k-<XSr@8utcE3Z
zfDwXxG7}g>Rd2ujVwo#9gNZ!*)YFH=<fzTSRVvB;0?k;l2g1tEM?(;OC$UF}DQb`p
zx*MYT@N=;{)7<`|Plr#MC6N{yZTnGsoYm7u>4f|<(?t*|F9@Vun1cjTS74x`^u$)7
zc#wN8SKKCZ(zuFf^^22lD7Cy-bxHh=nrWODH#Kganz63`Eq`U<;NT#QV*rp)AFhH`
z(|+*0{t3l5XVvwzxjBsmhd5j%KAOjKiH+`mhi8^=%JHr{k)1EGraVcTho>gSCR}Ow
zn@2{}mX}TqOD8|JXST8Q4>%ThUj4A!bsPj-*+SCN(maqK$3I3GruG}zzRC&9Uug?%
zYz#R!5>MNk-Vx!rrwd^iBmbKB43XW*hj5XG^?i>}OB2O`RH#)^rLc|5(y=6tgJNzy
zLm$7CGo>w#*CaA%;uXgSd5;o;oviB9do$mwy-Kjs@dH0_bU=OSdk33h_*HLA2*qM0
zf@x=<rwJt(C3yCj#g(NQUz)#aIibiT59!KJ5jc<}q}9@o<gNr1RoR4#__caF$7wt#
zW9Z@LU(uZCR9V%$Ni+}4GWI1;31s;804^ouxsoNUNAfbpmfd~Jz!!e<n#&?GTsNTd
zeN6Vtc3Ka<YbH%M)Yfb&=-0jKWUa}lv2E#h@2KkOG1#UeBWd6vBSHYB4K5is79s3D
zuGzF+B~JCeyY?Y?Dare^e~M<{dbqe~N>qDXTITYeEHN{2NC07J*3ah=_oaZDo@e2=
zn#ff9>SC<&d6ql+r#0$Nygq-T^AC!JQK7HRdY7bLzWM>neceqig5{{wn@$7fHPWXO
z9g9hvAz-BqVr63^5-M!}fo5r^sW3R3t8~W%%6oStlPlqPcK+>XW$1PH>jazDa3S8|
zK(nMl<I9rtG-lPm74bd(nEwt-%X!-O3=$E7Lgv(KnW*%>uaXCr#iP*qaF3GO0PSZl
zLUwOv)~u)0h3-&GcCI2+^%a8H=g*&e13j`KPK-=8a75O^S4|L~w?LLHYe3Zy05JGs
zC(_-eYeYZMy(V@{$=+Vm{l8-h@8`0w)nwWn^{2tDX)CrgjfuxLk)gw;{?ci1tInie
z+9%A!A1MP4j|d5AI-k?jq6_J>!B_qGJC5mzx7h4)PryCT8xU)4Bu!raB^YrZ?RZsF
zAbYJASe5;ho-+M5r>22TF=m%mCjh09{YtwyFk0j0@!GqTR=ihqO5hbT@PxlVj;r@u
zAb|Y(NtIxW>k`K8sKjyRM^=iLhsQ40eXqM-=R)jQ&pG_Tbz9q*_q47R%r?f-w_J#h
zOY3Gu6#{3yzYj1nIwX<b;OLmKqoNO+63ET`=6YZwCpaPsVe|?$<m<=~uYXM7jU%4A
zZ(#cxf>N$)w>8OLN<753ib6Sc`&LZus7kCit_up~jO6zna{?t#oU(*<z3^>3d4Y&*
z$CYtd$(THU(OVNmJq4TaWdB=dhR3zOp-g1^c+X&W;)l>`v-CxEdBw%yEep%{9*=}C
zX@f57lhf<pzY*_^T5e=uFUvey&Bl6c31}eHTVBi8lb|87-reUzy}h(V4&eHoys+D$
zWafP_-2M-v%u8I^1kFCQmL`z*T&#|`&(%~6^vgpsi%AAg$tk088!S5Ltp5UwL1iK3
zZvYw}3p89~&Ds|kG(~C%XD)8uI>3u~!uaDI33YC+mgX)ezql-rd*lc(7Ajcy$3T+i
z$Yt57qc3}w)|Uhd_?~eJR#W4s*;B4y35xsMmvH}SwI=v0JAvcO`^7{XJOL#*t^F4C
zld`TLL$Ssx|B~2XxQ=O#t6RTCOf2xm&Wv(|0)s}AvSLcu``>4fWjP;(qc#uPXjgfN
zKWbwSM$WW2%Or8^MiFr49A>5+yO8o9?QnsPX>cqoEElI(#*{o%8DBta+lQJK1b9qk
zw?jN{3~uq!&+&#V5po1-B&zEzDmBRx!!LY%ny)7Mw$*r4B}6lx&}kkbwQ7qW*g;!o
z0^sHAVu^)TKlt$vQ;i$fPZ&t-gX^wYEY(~%Wm+$i@jE~xyHYDd?DKiTU-uvkszCh!
zsq>}QR5CKOqr+dWw&H<s7MKdr0){(+jH2w+J`TM5jtX)XOf=505UF4m8m}$Il-MEo
zR*;x-xL;!OSsMe??eQ9-ms@*^mkAA7h7flXeghG4od=9Eg$Y_YV!s(7ulTS^GUiuH
z!AtI8^#PA^qF2lxj4^7>pet0U0T+Eqbns)=>h9p%anQBkBeQ?}_VV4&_|K19mNr%?
z+?x+lrIT}p=;aj$rL)^tj#Esv*RAXjNP)b-`@5gGPb*bT|FmZdc4&1y=8EIb`q42Q
zDL6P`jo!XWamo*MNEBnFgo?}_>GvDNA_JrN#KS&(8r&B$4Qw18^+0XVP8+-=zy3?k
zviuJHYxH#l?_bZ%Z1F_St1e)fQ!hg#2r%tqdP5=84kkE4?Kxwi-0ePEii*%m_)NbJ
z^w-rxOW6<ZE`pCQ2N`v-z~(e!xb$4^d!LQ?Pv2r{pvDL>kLmqwPKbN2-me)<f*j;m
z&-PZ@oK3<q6qbbvE{kjYYv7*6#$*PqI~CtWYOD@)Wv8q=1~#-ppPLnThX?5k@wrj9
zNM6FxSka=Am=C77F+Ervb!2Vzw)d;|jA@dy)|_11;Wcj@Bscyb>HUoPeNZ~LT~`zv
z6LWNF&|S426|ui;E41Ek+k#WbLyR|DYg+&4PfIps+i=$8(!5A&?+4G`8UuzC?1+42
zKga9-^Bv;4o!C*?4;7-jy#M;5lJ}@#-Ng5HGs()998$|yQI-YGet!f7+C$falU2_z
z35hC*!aRm$-iuAg*a_Ohzx}~;-20D^tCX)U-jOy{Kh;!*htESzux76tJd`K;5|a~K
zK`Y}0;T4c#%_QsPr<><Uw9uH~1^-&$!%e8Dtx2SV@_k8&XF?=bwrI8t#Eu|IsGSS9
zT5bNhX#S7<Efo=BZ%%_kRZw9Gp^8G`fJ`Gcg7?waAUGN=lN6~MZ+V?UlJyzB_>JUI
zKx|!Ia*`YFS6ub<B+Xt<KPs6T37^3jsus!YzF<@e9`sovNeZ4cR^w1_1*1Vk&;u?=
z4ISjiPR^v@8#zR>=ykPgDxtw1k)LAetWrC_A>p8us$kp0iiRE0yjrrKn&tLXOz`de
zx|8-Y;xZBFP;{GjOWI*wJ=9L>cpqkJfByU_T-XIoTeP_}on#K{vf$rrh}#pmU8`B%
zzoVSdMk;3l{X)^29)+NuJlX0`CbJ-^rIiF>YBQk7rFFkCcLogL*!CtbC<0;6L|Svx
z&ov(Lz2)$#x}v+Q>(Gy|ML98A4|R&4l7SZ^&mCS>@Qcppo&`&2J2^&9jN0(LI)6cd
zS;?;IJDGie@jc^=Y9A-*=_?QcukczxGiZf1aWs%yq}Mdk3Ny*GDj<ex3&jId1gwur
z$3k%z_?*p}U^y-@P$r51<K1Qd&UVw|aebZq<M=EKo}3}?n}&9ucFiwJUM<66v2@bK
zEcK6%iB}CXYW2I$g*7C{o@d`@vRsZAjxAltCnqMG65hzbqg2KIv=kPa;`Wdc+1AW1
zw$;Jey)QOR^Bgvk;^Xsh5%xm}DBo=jU{K?Q)kX6vtL`Bm$x;NKOwFVZ6e4^NWW$9<
zQvLAy)yj-0CWyCTwEUNdK3t==%HV~CU6S;v^B2`|x47ycC<a9$`x`5A|8&mnO{;Y%
zz_|@-1K{GVsHogsyWObWJ4@1l^c|=O96}pLm7@T+vDT3T)3vp5(zCu`<o}mbu+R7}
zjYEJjvR08AN|$t`BmSP1{_iKcu%g}~xMpRm^rg(OU8A-Pl<#qjO}cdIb=9;DY`zU(
zu<+ipj@=VB`JcR|v@p$j+HQ)Bg8EaJ<Zog{dZEn?Z}jeh8Ez0<O^OKkS@(T^%@9C<
zZOsu=J9r{syqF0WR?tP0g8DGT&u{ay)i<%%2z7DXY(e(#W&ZZ~D>IN^B`||P4lxyS
z1Z*a<7`}DP^oub9NWH(VGxRhc-7g(pgyRMvBIX`XD}NK<j=#Kiol$BVBy>V6K86|M
zi@@9DfiAt5NbvkmKlp^%R`(m6+5F*8L9=axvhm9y_IzTM=G;X?C8`b!HZ9SG9CZfY
z_BGqYGKv(^OUmWPo%YjX-z#DM<M^sZm!qY1m(;E7@Tc%N;Uz+6F&zUhE356jYg$tN
z^BX#en%8P-3e*tLPK4q)H=8IrB|YK_!@Zt-rv~sfHEe8bs38l)JNCqIL^nPrTPgSK
z9txjp73n^5tO?ZxtZ%*&La9EAzhhF;Zfa1_D0%uv_WhqaIq*(-HS-STa|ti|F4?%d
zafV*Ntp9F2Ey9buJ5y)_AszpoJ7zwN(dAZex%Qu!-d7Q>+)s8LbXD-JM`Cu4+6K8Z
z&GM-%Rbv8javj!3k^9kg8n=T5O~s$At6jHD^@tGHFiquyx%&R{iO>JS5&Vf^^1PlJ
zt@l)ZpoWtj!k&8BWcQA?T+_^x!>VzGi`6Y<vfu2$VBCnG#`7)hPa(93HluRPa~@3P
z7#&Cv0k6i{yU#`;m8_)0X=c=;)4WBqzft(qTisv%c?Z|R_A8_2^A$%PmKnTVk&w#8
zY|)w|);jmK`U-8I^&UE60O=Jmp&(Rc31P!hMR`+s$JeU8dNb&NqqhHdR1v;``Drbl
z^|h;BI9PPE+{4gUv|LlJJU;KOs_XD2RtF(%7M}OfM)6>VV2$(rRFP!Xqo6h&zg{7;
zM`=j`YxQ~$p?R^-F>Gu!TNIVLtEb1EafDogLWl<xu8QYj+JVyJ3%ffT)sfA!{Lm`n
zlMlMK-wL7>u_kdXsFsXc9eO1K)~(gt6+_nZ`KJd72pv11Me!h5bH%`q<uCYM6$8Q<
z=#}gMR8D<g7Aj3<b<~&@!<1Fs_>Q+`hD1?zxl)hQz57t+D9aD4FNFI|<R_x|98RKI
zfH(I)n37CLC#g6zH1w*Gt_fCRxc~#_7v_1!|6a2*lGhnf!T*;pJ-*<Z_rCPK^*@gT
z9~n(e%~HbYO(CJC@m&WZH!rJzT?|#U08lhR*1~!Y4o)Q8MsDY^4@G22<2SOV`ngE-
zR!3zco3y%wqbZPUD<eq+ys#k)I6L-Efjs;v0Fxv09Gk8?Kh7u-3G_a4iG1C%NY?vU
zrx#cKep5^IQ3A5X`cl#<XKu1pLPtS}?t~8*+C+NQ0U6>$qz(%a0gs{qLIT(wzTzw@
zH5kt`ka_jx$XPQ`T1F0Ly}eC=MuOvc&H<~HkZ2(3@GF?YJIw4k3M(ltNB=8vczF0S
zYHME+@KS3N+c#-NW;FZksYt&-<U4p4WOlV#s}dW9atwxAKIJJsDjmrp@&)iJgDJv$
zhx>hAfdX>s(GU)Y$T$v~grY<rY*eIYamaf)=#~h-ml@_Gc%F<)r5&7yp_7l{-M(43
z6QgxbntB9pJbGJ7_QmvV+egq6Lq~q6yGs$oUrz%cq=gE=QX4S=mrHzt4Ak1w>|wfi
zl2<A!O&J!u0B=8`^Csl<R1qixc&17sIrGj=izwKF=V}j$p$sY}I?MPGauGJXHXrcb
zlUw-QJ~?f-Dm-m}+7OrYL{YuGh>wkpbpjzZ-!>W4mzfM=2)D~l{YZ=y&HJA2E&~3I
zFwa1X=)ZK%CMPDuqo#nqLlBtOvBtoJ^b5Tpkm_Anm%~RukIGg{#zm8mYD>Z#%l6rx
z$k1Qx8De6eu{ybwWQlnD!5qAs?hfz1>jhH;f?rm1QHo?9b@+mWD^06S(;YLHKgz01
zPc<+^=Dnp&S(2dwAmF0YB^Cf~zAnfEn~Gb`wibRese0jlh6$)}sMz50o72%DRF>*g
z>wA~MhH3rc>68nIPkzeqiV>Szea_TkWOw;GsY-75Pu7%q8=WFh|MphY^<AJZsuM(N
zSgx1kPFtvl+(#K8<k6r^th5<^@h-E;RKbjFw7SXJS;Li|fB4%I=Mafq6vs?k7Lg2b
zB-)7~horlQ>M@ySLPX%dqMyOXm$pVDeo)&?+IA$F=Ewq5YnCeRvH+D|!4^$@SM*H-
za{C3(D7#!U0Bt`sQb~Bkm)#ld()QW5w*Oe&hN3PVwR$eLNg%PlJR_(PghCpfj$vCz
zwbP_0KM~9|3WmS#byu*?_5mvZ79{7OxoYQukbpkL##zTf4ra~&^8!HhHOVU?K%RhT
z+sq1?#>@PY7_a>?Z`t^Z0K%efbkc3L`fUd?xb3=be8sUl25!?(R-qc+k_msg`9_9|
zl6-1x5N-kc|A!i-{g73^@q*l=t`PU_dQT**woFBmF~7M;*}Px}{?7Mm?ANZTTo~8x
z3~m~)uYtC?{R2j2`;0uY&%K8eV4TjdDYSgKx13cg7F!R2@WSejv6=%?k+$~ee{Tyr
z6M`-FK0Ey|3I1RpfF&NoFDMD7K+GFtMF%}<>x8}1kiS0X5QlxST5t(KS#dwI$Y`ew
zaK&)IZXb-d(+>rNvIsQ9ESUamdR-+~4^`6_U!6xz(DHFRggMBbOCJrv9Y&us)td?w
zBua81OJ5N7Z+qJt2jT`(tn`3B$!VvD&uG?8+u|M{T3eG&SI?FCaC481hTuj;rT9$!
zL>0Rx+OZb2it^gnm5lGtM{{B^m~LCNX}(|^L(&}W-F4tjI6KP>`c>=nd(0cnBXo?p
zySIU_ucP<aOwckt06fJ<5vUiRCFq3a{}e4)83Jd*+rZS`AGQpwc>UUXpr%19D^~j2
z9{lNZbdq-+;cN$=bFnjSM2n9Cut2Y(3QkIXBk(;Eg8tT#OF-1<aSkek`@bJXuq;PG
z{cdXP%vML&VeV`?sX#HFKQ4NK=1EZFA>_DcuytZ&Vr7b#s)JNoZwjxSbR&|0@Lzp@
zg~d;oJyG4(_Nlu{+M}Afx}SID*kMG7W@ctP6G_xv#~4!WC6-0DK0C|Up39h|`q~N#
zqe5Zl=ARa|W8eLdbN`AiOk_)Za~_xk5pgRgzdo7w+mUS7!u-xENW1@{T}y^|1kf?i
z=$f5T{ElgnU&8gaXVS~i3yd(7zhgLJwJabg3v({sgashk6zw5gr7hbM%&T=^(w#jC
z9D}0c0gYS)E+7OnM_ILNK?-%|e#b6eR=$fqmfoUYb(j8p2C@Q)pp=~S%N_Ix1VW@|
zCT(k~x9D9xE(b$-x!W=BwYbU`m9s1B9Tdt<a?0zr52IE7$;k!A{Ye~rCmW(HSP>CR
zk2-J3#d?hS1tPS{T5Z6>c#=NCZ)`<f#Nuc&fI9`9os>4q=98%;hwPDW{tWm9?9?<Y
zXZaKW-n!@#Sy+zMBMX67WM+K`h(6v-1OrC5P$bH{LkXQqc4x@6T1P|y#j%9=AY~ZU
z|JIzHaV>~5zY~4s<t%>Kqr>6fJ^Ff>uDQ86!g9=OB3vTpp4$3`*tpn58fC4fmr?xx
zd{2Sxaz;msc6x*545YkI<(z5drQ50bwdJ*kF{Wh7S&W**%Z-w<Owp)r<1oi|y2I0y
zqEmSjNbg1$(<gIMZ)Z%JzsnL7Vk7Sf1X@H8_-wW~q()0`bE{R;+_6U4D62V}KM?ix
zBHtj83pqAVau|$d3p<-^U<m&3J8nH>FES+#*+GCFGM{;%&?f*LL9m}1zGQsPug|&=
zpaguR@e|9cb2$3e3@$2LH{5U2#4p_cdl9l(viGrmar;r3HNc>-ebA(4(P{G1ZinJq
zQ-%=*WpcBb&)Jx5;@x4lo3D4qe1M{rY#E|&0x6XrN0;UPHlK&Bk&1Kev}752HVy~h
z`o6mi>F(fpOth!{+jmd=C@62Isx|ytVDbPmN(@*yAOhJHa__3~Gs5nb9IJz1Uy?q*
zg04`|bn#M-3T(}+c3vmPGd&0k=lD<mMXnV5UKPhv#GGRJZWXn432j<A)8uGZjv^4^
zLjeGf;qgz8rRzOn$oXj><(MdHD+|WT`95oCNQ|_&i-;Xo)-cvb3=wYHvt8UK$>1)#
zZ4c7y<b3C^_g3*-tcZz8TumA?d|5btdz=KAuI^Sckd<2Rm*4uazP*FdwG0(osxP;-
zv9G61CwfiHTjH?(YnvSS_Y?@Z)c{j;;6Tm&{-(3a2*rfqNvHf4(p1s7f>K1V(vmo9
z7fW+<f2~W7V|oaR_Byhblp3J5>+&bh{@w)@iH!r|ivpRdaTV&l1O!34>%ahRCY4{Z
zF4?`^X%Bj?#8yDcw^zWn|E5Q;{I|3ylLl`QJ0|*+b6CgNQU!F*iZunpW99OzI4<w!
zE1Py+lm@@CTyKg55XEMq_osgdFF|nTeGd>Qtse#(@Vt+3OFD8tp`@pm$fu@m^qFfN
zx7H7XxOnrr7PAWi-6!<(mn54M?CcaJ3v`eHHp!PKQc${yQrph&lCkPSzm?By)lkKl
z*ECUNJ~uDe&Vqa?^e}JRRP#a{Kdk-g6`3r$_rVVJlMe5M0>|pm`6M^>LLcv~Tj6>S
z;Au=ueXK3WAf*6VSlhaKeh}ahfJAhAvY!?i8df%}RNoCf?z$Y!K~GoWutl3Y^))Kv
z7d>5?83HkT3G_6I-z8P>>L%Z3O{p$>wDt$Zw8Xeyx;{Occ>9Wel^n(;+Sw*-74To<
ze=RzOzpnb{p_ZJQRxuFr+qUa&a2?^Nqg*RIKv^{@F@=?z>7~`C>uVU&?@7Q1MeeVZ
z6iJjhxsUsUvH==9oR$J%S@DK;hKSb_G@(r<*o>JA10hDq*SC=h#uywOC5@OBRKF`W
z|M!$3Unr>%2k(_?=X`VBy&{T6y<oR_DVO2-eVSuJ^T_+bM^n@=m#*jV`iqK$%L5lb
z$6)+%U*sb1zNz4Y!u+#otsw;<?VN50WJ(Gj-7Q2$?>G)4<gkuP#R8XoxUc&+C3^gL
zY&!+w<9yqNBMdx!67k*NJ!%2Rih#3+nYd>S9E}HoLRZRzFzDXF+%8mts5OU1<C~Vt
zybvEFA6oP)lM|Q_e@QJvPy~`TaF*CS&N^Rw8SfXp)^71-Ezj-;5=`I@(lh!As%7%}
z6agpV5vbEY?^avPPa9j$sjcl|z=!_MF&E}f$%suS_Uu`<HIE7jan=-b;F>#qq9qHT
zt#Xv)Kc{{MyVgmXN?w%vh_Ol^eZ5B?5`Eh*WCO-`8|fG=rA)@tiH^s<5p%ws%;&wt
z#`8T8=!?vDVu&?WtthBsWTyKh{CM0~3C>kt&zo(9Q1+J3eR_K1x4bvhLz=36DnG1%
zOC0*~Jq@LqEka7oNl<7eL6YZ<w~0|0XzW*SAou?LJA1uAqoVIT=0-)P(p*$;r^Z+R
z=2Y2?-~eGw0JNtaXdTuWWEwU4*13d9K!J4S<;KA$%9UN3@1ta8rC+PFv{fQBC$x#{
z{783m{eGCHOryCZabH8eFt_#G5D0s#X4?`=%QwgF2fA~vnw%aBC+h4N+FnaR_lzk%
zie+5}RfwRL^U?d4+Q&{SlB>R7h92$k`~5zhK50_1m*dM8Q7EzS3-)7xNom`GfKq#v
zXhRZ#ynTP3itOYuIv2|J;67;gS*wL1Ov?Kuz^vy&J6awTJ1f|zG?9~8e{co-x)=CJ
z48eJ$K`bm%cVG3RvlSuqle{)Mg%}w*04A)w&gWvIKnu3S0_sg%F>bJE!2IMa7ct*M
zC%s#c{)y%5q(QCLbyB;zx+p7uq(;mA>3ts=<5Tn|vb3R$vSv>dLn_LLH=^gRh?hf>
z#~l0*7rFN3<>jO@O5cdRq{jJS%bUGJ1AlKWtAY!<>zt|gb9gq{naDgYRDyfXF_7Qy
zuYq&oC+}Y8bZdALB8Y5ltAL?8a4Auxtbp0gpFF}K^3qhM&DsHFK(Qe_+|fV-6?jF2
z=-C6pqAXjr_{#TSXH*@=Fe)E0sgPiu%A$g8(~yc7139E5z;gG$hiu8#7&+~nnldRb
zX&UPt6icGFylQ;3x|ET@E|&NPh9}9kj|fosddX}Ldn-3kuI8gu*82Tc$p@AM)-T=`
z_t6Ero=r?nb3B}%-e;RFT0~I|6;a6Y?zfbbGB#08S=2r*fipomn#|KR=}D-t-v_f^
zwBTqrg~;BuUR$`EOn>zAvjlurG{1fC^PzB3eL;WrwJqAtocCNRhh<LHXd}WvS{@@q
zZ@<87o)r}N6<wA>B!uAo`P&xJP*g(q)-H?UjW;qg6cLvj-~e$vydX#p_A~PMvW7!x
ziwLSGp(6Y67}%<K*H+?flR{{2PBGI7T2&}aCw*ofNLyL|)+dS!UM>7<)FO2*@~~Cd
zB(2x3d}O<GkDuLRK+Jybk{14Lh@Ju&m$ZF)OnF>$d%?|BW(l7Db12Qa{_$tI{q~Z!
z4eb}m&bLj+B!TsPw<hA;uoA9wKhSj9EJ+c1b#?$K*k#3?97#?t2-5LZe80aNQAEh~
z3AczX^kdxvMYQ3LC0uIU+CCo?Z6YB+Y1Kd6_r_SF4ooU`(R$337)SLK_=ePC<MO4`
z%~R`3NBEy6gix@Nz9N|74x~;D)!&RO6fg(<awZ{rU->S|zA0TmVZb%NscFhu)z`ve
zXGotcB@W;5lG@jLg8f<-|GwnCHTL7Ze);0)(fG7YPo}mFh7=kWS8pM#G-5D<0bAhq
z$DPyc+RadwsMxcPR{G4#_kx=jbFabBk%#)_3ohfF4|D9g-45hCP6>9V^qz}$M+4D;
zzveKZ&})Y+bP8m&u@$PCRHtjw07SL7o3%Hu625q0fJ9}{C1<9!YVN^eMhw(c0SfU9
zdzQMoVQ`I(1!U1e!a#g5tA~#gjEP#QcHF|%?j`Mjo7ElLj@}jk93TkK_woQTCYM2`
zSz7_Wj8L+E^r?>n_nk^pEb@5*t7hzBx{k`0dHfxRr+2}qhgngJWQxn5mN%@oG65vn
zahHF6ULlK*P)4V&>ASp+Ep~i-afu3&3RiGixQ&{B6G`goL=Sc5PPuSiJ=@w!G^rnk
z{=#2wYswobt-9!#sp@QBADV~<+%X`(h09s!MuGBkkxz(!M3d!FUtvHF=l@V+pqSk=
zieE-gr;99v0+d9d)}>ecUeAs+enHK^ChhG*dNfd$3_$XDu&>+wL4ZUb(>h0cI|!CR
z5cj2h#6AN0`u`=_p?b$8Myg+rBxLu1KKYi}PIMFsEgqUIFfcHjm2th*H1P78IGkSG
zll>^`Fz>zN#Iy2LNE*7>wk}!eQ9m&?`5$9YaMAQl!qZdpex2zd%X0MJlJqC-p90)~
zC(08*3pARtPT@V~vAW#p^3$feUQ@e&5caGUN4KwBh?j)K@DWcgnGrzvW`^A7B8rNm
zQ~rM*>Sbf-|9eWRFRxU?MSg`L@kd6+OZd`jXxd9;_dx%?kl|cL1IYfkNbB+uPK$>d
z|JtpMuxzoF(f2><KXt}&eWPqjy!xm?>Qr&MkAlUq3o`&A9&8Ozb-W#V{?h(H3reZg
zI(3TlEtkagXm)Tcxpi^&NRgDjgLIVkufois%e<WGA_xtetJgX<>(y_^N`b5fgUisn
zrHyyu@=8aA+y;h*o|hbRN{U}#+cCc42jvUfRmnh!+HZPn(%e%3>`G4~oe@3rWG(As
z37+o>f-a7qYxZKA{Y}8gb=p*gnUtg*z$XqK@kH>Bmsk=tvH`bO74H)n)X5u>^%Dft
zTe2nN&ahlI`{UA9YOWk;v!yM`U@S1#X6ai>DT<uo<~8Ui`N*5EB#i854wQHPrsS*W
zuOEBU%vN4S0~AU2^C3UfiXXO<Ko?B9m;YTnN<|)Q6`0fqA%&SBh#jyATBZiiap3Ek
zbMxKXMX_QX``+nyc#^d*OKGQX?G%-_4zAZiIp+}+rW?}bw`(c|w%wlXLcB~CAkeGu
zp~|N9Ce7NLnVZ1OCo+qD3(b$5iA~<^=-KvN3;F+cc58#*%`KnDc|<`lIh+nKoc^3H
zq(HySd;F(si0hvwZiBx20>K>7Ao?wV0rP@fWbp6Wpy}>M-;Ver7Vry2-eSFtUM1Wy
z@o1_u@3kbqm#f5KJ8z&FW*e9AUY!D1`a7Ym@l~TFIYc0!@9qWy%OOi@_Z@S1lxmZw
zRaq`aa0f1u;C<HP%1XzyMg>_ZJ%;>OuORkQ6Dcu0B>YuX=Y=n(D?;ywACl%l+4Eaj
z&d&EFB!ZEV;WR=AE~zW`WcAa_mnVYOf(JK!>w~7B#qxYQRi;%l-S@J_K_2zs%ek~l
zNjJWtl~kmNls?8O_MBH!Ef6s=?lONcyTgXQ(ia&p)J|NS=p1r-CKR>q2d2TVSSH_e
z^DsXwI7E4)ffwLk&9f*hr1Vj)Z@BLvlvgtlRYb#PWi!#o_9Cs9lVFrujM##C%`wqG
zkZ#e$Scgl3AiZ{z3$V!s{*R5$Dh8$U_+jQurC&2*3s^X=YWz_~?)qKLBtALl!G4U+
zej9Lrww*V1PaV!vlDXg6J2Yx(Gh!Z=S2$?~_DT)!oLT0@RSbLYUrudrw<V7>Mqz-2
zmf$US&9*H_W3kE}i<p&Gmg?(ECEgiC4N1PdzPpy%ZGKRSiVZn2%hh14mN2C6sQO^b
z%m7p|cF!1DFog%6{v+}Y?I>3Ju>Chby!&~viTyVH)xCKvqTn6*o+Qx3Ek76uF=8S?
za+BnV(YFr#nz3~MwP+>UbiS4$G;>T>W|)F<7)zs7JiDYo)%!13-Veu#li#{k?1YEE
zg68@LVxBp2*=OI?-PI&@5a_7Zy<P3enUc_IpsjMwd#zR1LL*Cy_*P{I9)s@Ne6WWa
zw&hc)tD<S#czk*f@D}ByxGbJmNA~xNY}vZLlH1z{i}m&PSEAPzMEx*H=cn{V8`Wto
zhs_FeQ#m=I2@n20p}d`lYrO78L2#baU_8x1hIx_qFVd0&{2!*la1KkCLQ1_4mPNq;
zuMQ{_Xr*EGd@`U%HBt0;0cGxfzvf4h$bBY*tHsMwyaz&G^Tw7N=}Lmy6v9dU5Yzc-
zY-b#S-grdunNA}=d3?o(;7JBM<*goB4A#N^Ni=eyeB}LxvI&wM!hx&z;hB;G|9keU
zOvNiPIX%qmI;~*y`mj$Vd+Vy?YaWMRgkuh)5_%8#g+HCvYkN9c8X4uycPKe(zF)fk
z^Qbsk%VD!w-}cYk)2eX#-xUWX<>izwKBwSQN1Xjj$>yHOS^X6CT1T4}89}#SztP-z
zhm8B%t;{XY)k<%jx`@KUS_jX%W@1F!5x$1mW|#-|IBo^uydHI;@O)QHdG#z>iagJ<
zd}RBcj@KdM?k@V*Sv#Yz)e#T<>BDAiQyt)Ktxev-5`x%{QS0!~!#pFUXDR<=n`9Rv
z3M=iE7UvRvqgo*3(f2!`9qD}4q0nNl_w#cqkkLT@o%y9h-U8A@G0QzInHW)DS$Vnr
zdxn_UbPI|8!E9V2B7odkwsoearEWen17iJThb^kh`onyS?zHw$E%p8t@E?nD>un<?
z16B~|^A}Z)rSqNE1{+a(*=;Wjjh(e!{3o|jcYGSlRF_{{huZdB#1&n-zN6gJgh@t%
z!}A|>AGIpbu^8H%3;cosJqSl>&*|z@+7;BZ0O04oM-a>SCokvXPHb-*0lN>FeI{1l
zL-gn@1<1`(y<_61&z#I$+*uG`GSiYQlP0Ax<VXLpg^bJ$*I^wQy$RjPUM6~^!HNDp
zK4kVgNNXM4RO*L6n*!9x4rqA-PXwQvG3(JB@s~n3P3wl9SBH^i)|Qs195ofizYn+R
z={T46LmoaE*UVbMw4M2hIZq(<N(Q0eEk-Z?-MaguYKDj9dFc$@>#cj+uGZ6`fJDhG
zZ@!BB+DSu}-RQ5_#qHZuvxl>y)h{odtuU5!^WyiMa7cEm6f*DfeoxEllH*v;(ulz|
z)RhDx+1NhYkr_$~-JLcMbfrJ5eGv&=#tJx~doiB9xq-;X_}JXq8qlxYynHBf^e44y
zahhYEcORCjF+ne3RB50#!lX2gu5c@86Z_bc;X5J!bP$`rOk+4BkD`kDq^~+E?iPCv
z6ZbjfceTNK9WED897S#bWC8zC=yXVyL3V$ZW-K5_u=8;@Zy`)YOtL3%a19Is=16>7
zPnhah8gtV;?^A<=QOk|SQKN8q-%!d4?zfPrYCDMsOd5n-ILiVm218n%)lXNiaL_eV
z)_oU4o5|hB7>$mMVhXL{48jAzXFww37cB4YEKy`pC@((MCjy}F@($<MoeXbux-w)y
zLASYViyk`0CMKNZzpWRh#~{P;(m7e>^mJ)q@Sc>oNFoe}nkWuh&A{qbwksP%#AVnM
zxme+#Omnck)S5dSHDDdn7-IExc4dw5lbhquKOZ(295D6l8$0UI@*pA?D9<=}DHMi>
zkF<Ie8O5O6N(udSAMAQ~XiYjeZT-bkjUn>E;&0YaWQv;I&ta@%06wH`EqJ-g9KqB5
zL2aruROD{nqkK_8udzbfGd-Ea{MXTsG>*mSA82xVJ&;`UlrEfyhXKg#m>ncQMY(mo
zwGkRn*Qf1GBgk~GNO|Lb?Ey~hztm|+nx+RPYLGx|gS`@di-0eR6c|j}p@Imo0F6iN
zm`3<m&UhvYa##$eh)0Sg->S`XN9OmbKSdh3T8$wPLh+%@Rx9x4wZ#oP^l^VWO1tUh
z=CW;h{~JE`*|hZTZV6h@11HqBeij|lnX@gWD$jo!#qxgowSVX8qR9iBLPm?}!p_5!
znV#Z?YT92;*imUo=>81>j@8}PWxg%Z%WGjkS2x!0OVj{e>@83Elj}JEm5CdLN;1*G
z7?l^r0;i`vD$)uxT!s3ALX7Ek`SKYT0?9x|m{}tejJCeId3fK9yA!n}-fza|JEf48
zqInl02;3j5@N6*tF4f-O+Z}nN?;Jjv!ca)u<v&ApSVLkR=?DOSDH(4nXN}8?J(Tdy
z7Ya4MKfcK@-N-|P7N>r-B;6*87YbAnJCSVTBkoOjJplNIe8FO0)jPmC#7RjE|0;qY
z!ou>AU<A-IX66ZmydmT6W@Q|Ey~)$Z)nd<LQ9dp2E%mqSQ&X|<D7g_&pS5G9SY<7g
z+@7ir;nDvk?M7cR^xmUDHmq+%*>u+2o<%mE$^2|muM@4f(GajWYA=K~h~r#uw#$GU
zI*O5>KmRQT<zIJ37^wN&k@Z+$#F1v><>N~s->~~RWT&Cj<2w1ZAuN|t&V;Oj_^UEP
z3@YS_SLo$b65#}^=fzr3QkSe6PYlRs!agDVr5=t<yTr_tCkSN5)5qi*V$UWUz*)h9
z4hpJ(x#9LzTNsKjFGGZ!SRWQ7`%ipd=tZ}GV+w>U#&kg`Kp>N07k|Wh9Y_L4LS4aD
zfzI_H2vk&3;c%bwQ9%xSKf{|bsb9UOeXi7S<BuBhRZS&ceI4)IBAJ+n@2;jy|19Gg
zQm7w(0~^#{sUe<4c*LaUn?*wiKOg~ne1Dyug>>3RD=9RFFuXVD*t3P1CM~4Xy1aXP
z2WqNlCt~zX0;U&a>LVp+AMqQaI1oXg+Wk~ECRqKwvXF>V-ltZ}jmx}I{k}?S-2slw
zA3q@mujx9y^VsD)gAKXXT-PHBo@n;vfGJxU?ZH%hBRs~pyr3Tm81)t!gsO=U5dvyJ
ziFiOn=kp-=@VJFhDerSoTH>S$?v;|kn2)217?fj(;=6B<*jT)m*S_A%mad<hn<vrr
zzE=sq&)qSLC=txsL_`fI%DjYv+%5`Pdh1Hu_9F&TgtzzQ((aJP?2feZ&6Rk7r0W_)
zy@`z!L%7K$z()_WRI$y3!x_UWv`sn)aKbo0GanCO$%~6?qEYVl_XWmwfmbMh$|o~B
zY;Yb~jk=oW4I;2}-j<rr#W8FCwY#S;=Red285T-_`&86*`r3-=7btSVKt4v140=Nt
z^xuq<9z)o$jSW*Fl<@y1QEyD1#^jJslzVz!Oyf~Q6f22pRKKV;9u`O`46Yotk{s-O
z%iw*J<+W=*-5Rxs-`Z}|K0Sz5gKAvofNAu^Yn#3IlR3B7d3g?hs?KhMNa6y^^{vIk
zHohcf#?-dIo9iZyf1Ba>r(*Z_fhVPFYu4&-Mg(u-s>&N(ra#;2uw3<)p-8C>@Z(UX
zK2LVV@WPS-8}uL1K>DDU{M!xWeq&M@U!vKZ7+mXi=1WY!93!AuVg`}_$ulnK`_qE0
zw3Q!Dofj9rMZW6EOw{30laT^mRoRmAT)nG)pThPv7}bf>RL3yfnE$!qA^^}75E>)a
z0iWqmfE+Y^d9^v|JZaNh3f#$#9750L&Rw-SFZE6m?164mU3X5K=_74{V^&`KHS}Qo
z_^U5pIFoEk?KF!-{8C?^x1}4hr-g|eaz1_jsG;3-`{HR~<%h}O(Pt2V2JvGOWgImr
z0^4i90=IOvwIOK2IgUo|tVy7?G)NvzD04ln9T>mK`G=#J)v`@aX=z<ECnXeeunOJ~
z*_Yk@M(tJ5sT&~y87&Kh;Rf4b(hd;NTcU!5g#PAy@NW97OkO*A<O4zQXhpAvv14cr
zr{K%JTB*`f2T!n?k7Vl%R{<V@*}IL1P)OUI%Tp|^|1ay<i^pM)>EsM>c5VKoHOpJT
zA6Cw2dO}fDR&3$pJxX-w%I{YHIBp!ao*{2|-oM^MS}VvRK(RDZ1lBw1XjCN48nH`^
z(Xx@;G$}_6usG5ZpA!U}s6jTC9$n!fV%jn27%D)9YE7WlJVN;++USU`*Pq;?KYV*O
zce-BV`dvxkG%DmdqFeehh%f=+sSGvn>tOGfRG0mmbP9dNMA_uryN*4e=IAp^)HGTx
zJ6wzJ^^fT&c)mcu`lloGtMXd=hr%6ZiqSJsf_y?^N(05zsq#5@8ye92VIV#$3}*Iw
zA0;$Dlt3K_A>NRSgdDvh_g*;lzYFHk)D{&rRA8ELPLjk9=>;fKXOpin75fz&)Rz0l
z?(cxL_y6YwxDfETNa-<bn?KVQa~+O|h*0t6dx~#TjuOllU=djLYgX)ue4`D!#V({J
zB+=b180!X9mZSE0NLahOb)O{g<j}QJnm+~QjOEM=4=ghe4+ZJ+#@IAPq=I`VuMZl5
zR}|49m&VF5VzW6UE{4Ln_(|^MnRP}Ix(f1q%T-w9ggaj`uJ@zFt-mlZ-c*8<g3oRO
znu47g!_a|2|3D-vD*?>C3N-%eV|bjrh?AyDyk3P)!38|u@Tt86U11@-Sz}vB@iz%k
z{D;_G(k)g3X8-^l!Sij;Fk3k*`uBL7ly$APSDatlWoZA$p!On#CipTUqWswV%xC`)
zd%;-yqTLlIRWr5O`&&G>nc2SJlY{nS#`IzV2~6K$?GHLGF2D(y^w%coXp`xm^U1QE
zWcweQX8E|}jKZ{=l=cp>mz;TbMd2T*denmh<Zw)#hgJoKut3?b_*?8KE{lV>E!lna
zx{IOY9zoC9{^cM2TVlWVfmhI0C1$UNQSG&CO@?hk%k_g0zvy3crhhG8CnO4D!wa9C
zp_Q*TjB1!NLw=1*UpL`rDXOrQ3ZMkEVk#<+ZRKb5M~|D5w4AIM8LJ#SUUZB88G9zv
zwh5AUbcE%n`oSYZDItE1L0!g@jwYx$P<80fb2sHv<0XJmVlGfv$Oix3-Jfn>dI_2b
zWSI&l@0N;EB+gDtGYhACFDs{Tk)^g#A4y~j<=KKJw11CqpJm)@$=4O?uf<FD331aH
z0-o$1$o7s}-RsuJ2=OJ2^YW>ke1?GkWa3mLJBy2M_e2SLC}H%d?u|1ESUNh<ALb)4
z+--WkaRRFIC^9&!3XMvfBvD%+i618P1<zI>ELy^Tn^NdL>h-Lv-8RtoxPD2rYQ%@W
zTptqmyL(74>*ucSjv}aj<=3{->mVBcv*{<_M!(qec7U&NwYDizh#-A0kxLcbY=<R$
z;1LXp<v#boD0jn>d`U5w@I<vpJ~B5y9|j_irB?Wgq3-Ypfg*>}uii}*;ZtYyKh5jX
z94t*zN<W+T;y2li<V>F2hR<mfAU8#vgb|t&Sjj2h&3@ivDPC5Tv#y;l5&r0~hk&mf
zpsuOeY}hxg%f5X2OMA`|W1D*FsPfRIP!n)9Zs)TxR@`ngxOV2D_t4B8Tp?wu5sdf>
zwG3`Oy=or3K}egbo4T@nktEs74<9kPm2MxgYY6#Vc1S5^B*-rUQVt8{OjSDS%!2op
z_?!Gaq7-pcd{c!%@vvDM8!P-KhmIC8e$2%Fi9tHDDjgj}^WgKR_`8;l<>Xzpy5AiC
zR^EDvJ-2#cS{#Gd_gBYjU_?|S`#cMc$$3A$hgXRHiv{zcY+ELIY?!4A(kWSISQMup
zh@oC89OV}LGKo8iGA)l9S8)$wNMWJ%fe)x<9mnm%pfz=^N30IjvS+Io;KK;q=?I-p
zY5{}F0aDftP?hcccyK*SOBQ@Gf!#8q)?*2P5S+WZo0Mt4^msyizgC&}r$I}*@>q=>
zLzV#6W`A|*`cvmR$<jLIYjA-clcfO#2$O`bp?ux2_WqhkfQ!|2l`o?A@jjQX<-DJ%
z-D-b}TKK!($%=R0gaK|&45gJWL{Tcq1rWub(8=A>MOa@T%$-Y`X!m5i2b#B#j^3^<
zc8=~2VA}53kp`IJ;>-@NcQiqKP|^t#scD%CjhL4g!qTB3zcTy{|CE^(H@lsB6U>)~
zbk%>32c>q{a!2Ce!;b#0M=D%au}l3Ud^b>xESi()kOX<-?sl@)P3~{f?dY>T9;`>`
zT7j6}`kbACr2%;y4FU*4Q=Zbu+IQyY6an(Ri^<?sCaD$9(UF-2qvFN(_W5xt&+9{H
z%YsoR2L8A5kzCsH(<RR0a;}12On_av=2Y--X?OP32EKHN^c{+_$;9J+o##ZByZ}EV
zJrB>{BumnGmS5$9{cU?6Mccz%fC;gS$EbPUD9_>P^F%&_x6``smWG<v>qQ-u6r~A$
zXBh?6YxMV*AuD2R)y@I<5SF(-5<!sfl|?@dUP6B)<qCt6x(Gc{T^T__PrYT5pdGX)
zqvvvOEZ|$=-o`NN5!khP_a}Tw75wA=8-qI4pgS3P_=wQDcb1>VFFeVT0+n(!6s@qL
z$1#_DYj)Er*w4H{8P8b184n_hY<@-VMh7z0%pH>IU0?nY8Yx}!fG(V{_v)gLML&KX
z_*_+YgA;DKJKIeL0j~h6{=Z-exQ#gP6T-$5<L-KXmUFY_kEXggYX<>-j)=w#Ix<AR
zy}8uxTU#cx#&iGKDObTeTPsb?X=_1tv0iR3qy7k0;BPGb954+DOd_Do4EAv~$}4D=
z109rB9JmM<(25~D;tz%GJ~VV0r)*jlw7<qAZNc^%Q2!&u=ku)Kr^Q<5JdblZ<&&~e
z6IYd!!}iGln-YZ>?f#Uv?>lH!RtE?K>OkubC;VjnkrNcXm5#Z<1~`oMxM82)O!>5$
zEBn4LRspS?dnVXW9N9+JM@8dA{SJYrCL>|%51uPs*6q>x{W|+6CiHcDD?j1eIeP4d
zd*3h=V%qn8?|I^$c`HPR9`NTNLRD+faIrpoSUe;3tmu0&ttu-4y1JLto<81ww)H;y
zBgJ5C?yG71en0Zp4T(5y;p5s(HKp6rlgF<6{V~bO6V_RgLKKv>_Wj*+zWhGzPbcG1
z-3c~Jkj2JVla>Fkz3={K^ZWnScdOcJt4h%-+S*%Dsa@5Un5|KJ)n2j1inLW~s~JJ$
zO^w>MN0649iCr^7TSBcML=fYS@8@&h|HA#t_lG>P&UKw}o%1};*Ep}&176-0rc^z*
zCQ63<;LZ@74Uv^|v*%IbVgYO8yz=<CF~ho{?>+a}$w%Yklg*Or_t?VgfBDI18N&sH
zwG^3jN;a>*Z3(f}-OfN~CVNJWj&5|8XPIAqvUGIy{w!bd3!3it&v1@5_5kuC^z6v@
z;?S|x*|FM7Uq30r-riuIW7;E`yHAwIBzn7Y(h0XLJBP-6eeRW?oO<zEsr?X}lI$0|
z0d+0=ft?8t@B6<CEqy-7@;RFHPn>!9E%#r&CA?#xGX4bhUi`+pwCB1`7#3^y!UjJ@
zulU8cAE4G|H3Vy-Wo`LX)5e9mFW`3}Ceo<J8~+9$efh2LO*^-70rqeEedY&Se~E)h
z<y26chO794m)DD_SwKsj@*$R!**n>s7AKSghJ;2uxL6K6a<tf8)D04Jy{^0{)iVyW
zG*hKUDf*0#r2Y|qElSnv+usjroGA=QAgk!!5w<cprLIG(5j=^3E`|_p%eM|R5Hgj;
zB+!gubbs%t)aLu+{AEb;5Mu-BFX?yJqVIJSXs0D`!pUjCX`0Mb^u-^8h?YuIgwcI-
zi+h}^*eGi6SK~zYr1tVi-}5euCHN-{EmRLooTzwD$Bs$E|CwvU$U=|v=9?il4^1_G
zuLtHd&lE&|_6mY_7RT#cM~`LjrJ$EqzW5|zM#HBqjZdqXT}pNgP452^&Z*WaVRtVc
zul$8&or7m)qY!jD@`Su|v+Zu`@?Q3SEUcAPndS0UNVtJ?(w5Je^3?XsjqUC?(j?mS
z147_}iaE*5V0nz~`o~Utc3Lci_voYdfat#;xm4Goi|`%<MI{ir+gSFxTX;M`-o0%6
zN)Okg>VvbBc9!yMzSM8$7zFRtP&kiu{T<S9lTob4&`=7uv!fXidK9YcK5)+I;?gDa
z1N9Gfk9)5QU1!iY;|^q~J@mwz+fGYpiz=Gpe(og(q5>P%Oi+OzJwr|YDRgvn%~FfK
z@{f$|piV6I$*;?-OLW+SuM>X9Cv@rO-rY7GDnF;PIjL<kNHUv*RFJdG)`D}X?N!=#
z9M@Y`2%D$=HbW<=2GS_|6)SC<$1PXK>?d|(<{wcKk}rVGW>d9G-H^@vDf_}-SNoio
z97U<FK#iZ6C2yT8sPW?lp*QEhos^aV5P$NxzV<%2g<U_9DNeLudA>e*TlBZugB1Kn
zwhJbeAtbonbaa@fRi|5Ocd^n}QMitPq>v;J`u7$CZ<`olITbj`ZouXDHA_y;O29+}
zEJm<M@)R+%B!PwF<=5le-FH9yL9%<Lmr1AlShd$Xs%ftV=KYH9V)10;YXhZ6f3_zL
zqb;lqq4ujZi8(;`IDzd0B~v&SK}~qI=}-Um^Y_I|-W+*rlhl*h@p0yL7tl=Gru_!O
z^H?umOp0FZQ$~$BEhMBe#zeyF7=~k;VS)yRs@eN3s0q4uvnA<T`w<udk8uO1t6r$B
zl$>&G`TgmQn{i!#sHzh<oZ^=hgAFS#Wn-urZd0HI1+Gxq$Z=1SL;GKm=>+$(0u5io
zvRT9GOPx9+IxEtEXYJcI^@x0s#gWVhIR14M8!P^*i{ULK_%&%Lc2OWApeMxQqyNXn
zoRzVRUZd>QCJD0}Q;^OLMOR&XRH$W1+c^=}A$A|>prKZ{q3QkbtKK_l0K@AAwI&s%
z`kNAQky<~b-z;n0+3hHZkak2JoPVw=d8pR^<~naATBubZH{R=$zk$d?i&Ap5;Gn*v
z&fR9ST3Wj76^mCt<ba@)fz_6%)5HfS$-b2RteHbc&|y*9jDh)RE~1iquU&QA=iI&Y
zsPV{4Qg<Vj5y<iI0nJUDy(gzNFHvAJs;PH~6VVcv@Xp&c)P8};Gq=4^32pbZn~1=`
z5>}_78eP_GnS(;ApVuaaW@&cyHI-Z0mgX5=%sH31V&TGqq0i8iy86$aFU$Pdj!Dba
z$Kb6C??9s+kv;yBG4h7RAlXJ2j0)VUrlHbqjb*J78{?mI&a~Z8`y?yt8V0{RAmo(@
z-zBY+YV6Pmu5a6Rn-XonS3eSi_u*%qe8Eh8nX(b`(Lq=h{m{%VV59qaNn&UL&E(r|
z58BiVu!JpMQLzkmT7i@#Cb;Z6@@?t|=)m5aCXUXrdr{MI$dt6zrFkdN<Sj|zxvi+v
zaCM(p`{JIbU2!m0Nyd&}f=hwl{#1UxC6tM#_T!=20hV{0<q3`e4B0?-M3)%uX^fZA
zpPZOTjB$#8XR)(rBWZ5YSUpm}$jkZTcw1hkxcFFxiSBx2AM-D%*Yj;H&q*y1o-$g-
z$yCi<ax2NtP}>iU+U<($N|v3Nvyp~tmX|@^P|)E!q9>=}yBi}zDrmyLWkd}^!gWdL
z<SG8R-l9F_=p*JN{KuEs*522DnV#9%`JjRE85v9GHZy|4*LF5&QQke!jI61-Hr0QB
zFXY3XId&E3Yw!7|**Ezc<X1V9v$75gIRPLYZQg<Lda})(W|Z?sc$@r8veHN)(ax-a
z{@b?mP^mZ7_t|ewr-96)&(UvqMq7g&Ep!j7gx+KyP@~j?D^{}(In@t}A6(|X*ocL{
zI!x7k!+UdC7jc*Zd?nTbuPJis)B@`t4D5Y($(rBg@{7ywd1|H)8q$8Q9h(Rj_*qrF
zHQ15FsxLHG5e>SFI?_nk;e_6P?r|6Kp-Un$U!Gc~lsQvTUg7SBJTP~#zuS>Dklpio
zd_uMKo?i)OhxcwNuOx4<B@B1oJIvPi0tX%AdkYG*d{xrj#;X2+l(=y$0qRav7JhGD
zSmo;J?s?kfMRY@x=h3Wa>fC%h@h>q1kKl1Ca!oBF<Ad<d3ZUa%-S6eGG8zmSxxO}3
zPD9JlO=UMXSMI-tkddnJ<l57ShuYjGxBy0t8Kvgut<qBtMRDc|skAY?MmZr!X7@Z-
zoXf4ijX%0(Q&L*tb-f@`I<5hbq|MD&MKjjD&AYkycgSGXgd^MR@9jQtkpj`y=(Wya
z9hTuJQu(EXnbIYqu5)ljX)(Z>`0j)^wxUjeIwUWneX9I^I66}s_c_|#$=PJ`QC)cj
zZhu?o50lZt`)*2>8UA<D-MfBq+3OQDOa-t@N`z~aE@-&^)YnL7la#2uobr7`Iw9WS
z#^mRLo$RVOKDor1>02zwE4=1cx9-Gczt1jnTf>#Q-)Wa-orjK(XX}-ePH))`=S~T%
zv}$T=WX_5O_0+kJ&3wLlk<Q?LMVawi`>t5Uldw~H#<>}hmGN79+Av9sDvIO*n{5Di
zZh|Ag9<PA?p)Z<3P0=NJWH8fQcB9|q;H)~A;sZt85Hb{m3-%l2Vvg&P;VW#EKUzCG
zb%;(6k_3Qwh6@H(1B|&mH!6;;(J1avhizU_L%2;#dPYXW>&Xgew$e#&uJ!#_HBA`F
zjRdllxv>da<r6+xwIiJ7;5?I6$~-YKj#f@nv9_A@PWBLj@@y>o73o_@`!Ae(oLo`i
zzF#~H@nx;<lxnpf{8_^JIa-+`cOBI)=I2_<=-XmsM4pUN<P+EQT<o+kG=wIm{I(*w
ztX61;LySF-$t8)92hsG^LjzeV+bCD<dslf3Z&==Xwc9$}d0JZOx!*Y48Tx*1A0{Fy
zc>T-eqt%(H>4%ewR=<^C9)t0HUh<=ScVzAii4euZtw*PL6MSA)K82sWY;D>H0#03a
zj+W|$d3c}C1x<1al6T3o5@W%!Ul+gjHOLNng<*!FqjpkDgF>jh(uUv4DQMw8dA|6g
zxbc8%HeX6BY#d{LWtWJdo5BuA>*Yt)JaP2Z`b{@k_Gq3u#`_U91*gsg?5GAW&m}s8
zw|&gEZ^gE3f?Rk)3CqMeZd?NdHwcJVZvMBhuyB6f;1%uhb#D+?h~|4!>TP9Zlks{{
zed?glW7Xn#?bd~kMsPytLN*C2CSI6w(M`t(h_0>KPz-e7BFe`&*pS)_DU@uEhr;rs
zZ(zj==->a=gss|_xdXz)KXHBa6TTt<tSrepYFN@kSCm$UqxdWOBOKC0x2D!(fo0hB
zuPo4{3LB{JYyI^%0Hixx4sWp8U-eUnebGSrt<)tZauc1N&RK~X%7F>*Wsn=f%6jj&
z<{PHb9QYWlqGWn2TA0kwtQ&Gl+fBkwu_X(W1qyrLfBro3G)Us-PSL3S9KOjlVb`ch
zFl7#T<FC+@<mhnp7_Rf;k^BRoZ*V)@YQe0}y>x9gwBw~Dd>Z4_E|W&-t>|^2_Z@<z
z362T%c@B-<PHhh4G5*Ko3xH}eZHNj*0e~W!-?bG!xfb=6YG<T*1X4J&*klV^k(lH=
zem$8asY_^F>bkr>n<vkU)JFg(xeQDiDbioFjXA^T6^x?HfAG@CI$B#{XyJFT@gp)2
zp8F`$y;I6GdF3_*>C^2xbd9gKB>)%Oi5;XgePqWNxmFrlTetvZJzwA<gmrkTstjMm
zxhGeo<o~()lK7=OdnS-<T?XkNIX+3EI9gG@yVsX_g3*kt9V4F;S1<JQLw#^@6_nXi
zRI-K5oZQn<HEP6<1)l+xo$?08A&C1@FEei~AAxINO1;p<UkyzuZ!TQ<iqxPeZBgeO
zq(?T}yeT-Cgjoyk_`x-HKT!c_b;}<7a^<i7yu0AdXMLtjM^*+~5;7$%mDa}f<ThXJ
zrbxU1xnKOgFiz)op6D1>dVKjI{Y)t2_Hlnf#MF2)EZC{#B7^szKCLOFa=zbM=I*<9
zI(kg$`8iXL;#PA)6l~hXK(hxUI}BhR8KbI3lRgw*;a<S6riW|0F&G+%QDvX8SQ7iC
z$4XL=tCbMqIQ9Fc`$Y|C&rX-9wTG~^rP2WQj%9c+*Q!P*p?Cui9R2(5R3U?tN=n28
zJ~L=pf2nu8QgTpnyP`339g^1pAnHoEDf%7y3djw56<~)S=C_6&MXk2zPBsgA@r=IH
zeF$W+{C<Ja-w}D(jxK7j79eY_`UksQQ6Nabd#w6pnQCqY1&-q0n?PZZzg0i5KVO<z
z2cFzuVx-fc5dzz{C;M%x4Y{aa9fFqO6QW7V)Q(zS8&Ftq<HtEKV_a{tyoP@=9*4v2
z1d$h%nJU8kjcjZ-@0;`|2ulFlY0I!NT{lOT<_{2aIT=D&S|X!**Z;0w#bWU`=nqB~
zp%#>uEc7}6xVRw-Q#y+ITXUCNM6}4#*tj<Di4&{<z>->^19#5ySafIFkl&UIC7?&K
z<%+z_NLuLkZPX~@%3BuoGj2=Fx})n~mSwE60v`FHB2B}H3iq`;C-P|^?O^Z5d>Xq%
z0$UUa)i&VAfdRjUEkk}jaO+20$<6om)@97H7HO^ZcKdWGccxa6|7p=pKa_+AW0k12
z!BGxsK7H#4bhAAK?4bN7Hr#mk#ag{PA;bn%uYZa4>VR|zP`ds{O11Ei^^VwbQBP(+
zDMr=W&;**>Jletu*_3oID!{fe4qqYn6CVO=WU(LChsLKyYAQO)J*LE#=-$3N;{_k?
zG0$ej3&M!hsU+h5_BnPY_-*)k^l;z`p&J%BkJ>#cM|)1^h_C!hC=`=I_#DC!P|8G{
zhY@J*67Zu~?V}t=XJ@W0ZvLB)7*nzEU!lvRL*YI>?~<~iS_%&v3WstZ9&DDn0m@T6
zKrMc^mqPF%PP7GEaL!Yi;qB*or=6whxd|#+sNXm<t=C`@Wx)NIeozFtn^2l{Z4%>Q
z*&mU;R7Kl6PG)0)#==S0t{U7}Wn+rkT+|d%4K}ZI3$|O&uP)4sT(h!H>lpqhJ%|PM
ze~r(Mg|UKxrA)UZ;spOd=SFJn3sp)7)if+C*Q+ghauF~8{DZ4)a)}K|=XL!e!r0b5
zo{57g#>6DsPv($n=-u7qa@?Gj3lw2?m{$1-fZHlPyX7!~W+JowntAu!#or@}FbPHH
z?-mw(x7tajwA^p!il>s4+Vjar!OM&EYTm;n^FD`6QxoblY6R1<tG_dN`93%%evp%v
zZ{hoMPiGxuGUtWrWDW9zsF-`vUH`~LQe{7^t*@?Pa9$cQe4gPTKWZGVaig(oUarcr
z?np3MP$3>$nW<FO($GFwxUw?nXezy?8ztvFx4ROsqDTKNhSz1yVyQ-Ruq2g)6A=-K
z@?6ne({0DqE<l07ynZnHQJz`Iy0?g66pJANTT@SCwm8cW^%g(*=(Lk#Pjm=1F=ieS
zG(+sb*Gs5AO0Q<H{ZMfG-TT{iN=IQnt)}fis8~;!&VY14@3^obN@u|LR)4zam}X>u
zvQhby&-KP|AU~zNM1R38B-Imybt`u<AATlWVL!VbTeDNPb|*JQmLK1I6(BMHTK0il
zVcpTSij_fny2J~qyx(l;4@;ingTAP9Tb|gSjB3DEW5HH)(taV&f484{EfcLam{fKJ
z%PeeQEJWAxpqGv;S}4i3MMO+Weu^c~)l(DhCSa%ss%)&UZwpbIoSB$`!B94lQZUJr
z0V{c`GM@T=@s|3hrLTUk4+?IN2auVxUx-gaGJi4)l>5%zE?gT!FurB!DHWWADs)_4
z>u$c4ispE=p7y0Y=e42n<d%D%q&~!7va-^BzodYvKt)dKvD0BRDoU4S8`3u6As--z
z|5{>PdAUnFmM25FVmU_u^Qq*SL0@|t>!EH)lXdn3EwF2zNs}ArG9XuNhrAF0k0geN
z1Xy*Zru2^?hPHkW1=hF97S_Gz;-yQp`u>~g+}Enga;wun=Y<HYQF+8eyd)!!`N7s8
zF*lc3yoQ`UAa5RPkk;mpUw{V2>gB1#%&Ey}W?uhkepmyPyad$^`wN+L*hFy1OWa~-
zC2vq^k`(5gSAT42`UV2ALABXcHVWt=2|myTWlGn22X-w2vUf55%hDcAcRE|+FZ*S3
zoRj+C=I-9}<NXvMKYAl2#crP{^rm>mk5g+o`zJHdd+a@p*w^yLT)Wy~MsQ@(JhW)p
z2~u8SCJ#zQ(r|6gqh|B@&C|zAOR65`!?F=E$qRAt$aF%gQ<keI4G09d%fuwJp9qW7
z^(Y8vyj2VNvnQy)@5S+VTN`%T{7ZX&-*+$HMqFa5*?n0J<B<?LmVI*)`nRP)5<-m2
zGLZ)WxRZD&im2|i_zi$O$zTC<EI~Y^fsm@oZt;l)&@#pWysUavEu_=j@%eQ6J9^|-
zmV)K}?@gf_e%DgxWS8s48j2O!=>#)Cj*_YQu1lUp3`+w!H6pQ7C4pVHCO9v~Bt5@5
z>sYjP4!oR2?+WOUo|nJt6{ukqT<H_OaBXQ%&TlQo|KEkT7^;{P41h)-9QrT6cUgzu
z6D~`u6T&waB=ntc#5tLyA@o<Y>3Ysf$iC&ZWh-z9cqYa+Ii2o@L+?*pC{&@B4)>b?
zxhc--bNo^eJrM~Yk-D?DA?Zka!&6QK;8&?!4*;igt0<Gkv+m595enyNoGjOKlt_DE
zKXsNQs7pDwQ)wBhBFROHJhUE_qUFL2UZsyqttFBhp1&*-Q8?rH4}?>~^#w%BODj&1
zG)-2-D{GZ-b&d;-1&T?>&0?gukKObMOf6xpmglYS1?E4TuI1jjoM7_kL;>G?FQitq
zU*+DU^T&k`u0L8*tM{Kf4`l8}`V0O;SJFXKHeHx~M0Cluky20%n-8sWJ6>gHJ48t|
z%K{_?DNQalDyCgs{Kx$+o?bKCt5;?bsK9`66)V{I=H<$X9(L8E=KETkncho3x7~9q
zkh0xYm{9~~xEcX;CFaBek@{c(Iz6Se2~W}2(nqh2w$D#5I}K!w-!?w_VyZ7zV@zh7
zVkkqFtBS@>eY3?Xx9!TM;$cLkJsCdHG8)Nj`&svI30D2I1%-40d|(<}1f6ttA0#YT
z&^C|z_!Zl~*xXlp8y|W^_}sO?66&2M;KY8V;JLcXOofUpbejCM*~x25YVG6+__yYA
zp)vqM+Cc#DTl#8Ubt|96qQAV@dYEr(6pog*7e@uQn`?QD;NxodR8`n^l2$HA@{&U;
zn>s?4u|lY+&bc^#t|640((YDnKWX$KjTRagD&D3OERvdQ*KvB4KIZngtaX=4)8Cjo
z5`M7r9p6j&CBb+e`iW%Daa}}IpbhgB1;7WlI=M&+W>taO<$uS-_>Ny+`PMlW=uK-W
z9o;n$@R^zs7^!=^f@Shdj?J=qSn`Z5+@f;QOC$taOT0HcUMngvgt9%_w^|vbiL42X
zSAX^19f=v5fLX`RLpJe;8$876<KEJqEM%$8d@H%ILeatkhYO*GpsaG0!qOHrKzbl_
zwl;T53bz2K+@m_r(1XHoq`t(sYDiBf__}9QPX<l1BUvVfG<xIIh+5k}w34o%ICvk<
z-Zq?bcZ%^ZKL6`S-BRD-8b0g=-*>Cr+uf20e0w+GG)V;r(~t(Di(4nUc{G?gIm-Lt
zsS<t^<n<(ryx*TB8)hJ6qBsp*oJ_GQ{3Y&Rdh`L{&D1rF?oPIzTkgD$AIK%}AZ<uM
z>@{&T%6~aU#EH$^h0=X!9ygd}-S8NHK=;Hbks3Ah<XILULTO5{9^B9~gLLO<K4^Ak
z3jd@n-z*QIKcI*3(|1AmpLfNKe15@yg)vrK)ypW%f!G^g9`yS44MyqN<IK6>l=Sp~
zZ~ZtcRAAtI@Q@@02VWqz{hntsw(hBE)7@5S3BzP$zi8MKmpi~6o912ugx>!IT`woq
zF#Hd9#FZLLvYMB#;YiWMDDx(l7hNCh+jF6_$E6>pk@=6+9`3kL{PaBho<oY!e|CeL
zBTZgL0P3<~vj+Q^cVc3|hw8id0C9YUv&!wLl5LfNzmUHXMIBFh^AR#yJwRxwk}o?r
zZh5+~qIa<qb2brDWE}$rWHukwZo=2EtqiVcuCHwT6}`D}(jL6i-*5&T(RXNjaq^{I
z#>ke3DN_Og{{9aBv{!2rxl5fpD>WM?#DSPMF)*Dnv3!{$Bq6cgaTG=%B*rFTxs&0^
z$;r&7H+U+Q=fuCB2hE%Y_36Lx52V<<bz)Qcx(Z`bedG|s#jpnoZuh5UxIU1tz^nSx
zZ>FaOH!WXdlIXkhhCAH%byr&9*_`AP&cM|h5l6xjv+7^#A&>HC;%l{;Op#bcDSqpV
z!?MPWIWe7+SEHGiXfPHyv0KwsUtbSh-DZZydYDYx?Z2EHpP7Hke`kFE{;?K-b&oY%
z^y{w(1*buIhHkuiYQ^83c6VA1qEK3<#P3^j;O?|}t;3BhL4dWB+P-+Rz}3mMZ3vt-
zidd$6wG!diT|izLWT1<>LK_`sQ~8rw$(-8YF4g{Y<e-wm4DJuthUYeWW4qL!mpus_
z$*E!@y_=#OR#SkYf`Y2Y=x>ZyV#OsTB{LMI1$kw7c{~5XaIB^~Z)|XJrTFeaJeDgF
zRQ=4|djDIN^8RLAHhX$0mIUJW)huVwC9%DdZyq`B97F}*HKTv2GB<A{RTtvNa<#z<
zbX&1BNsGrvKbHoBQaaYP;N^LrJU^oelt#lI@g-~7iY`A*t~`}N<*#?rX!G57v}itD
zDNdEg+MULBzfOe>`#uTSb}pt7uhchwA$&IflEj|Q2I+fBjW0h9KB?{|#C(gO*3{Oh
z9{sr`$vp%)zVVKxY=cU!P?Ao`RV;sUx5fRe7Hg7M2G{7EDpDl-WB5V2a-Me-^XYQD
zX&gE6M@M7~78-_$pR+DYm0kU}r<u<%0;SBEsr58Cx%mag;{!(K<ElYXSsCx%8xxK7
zE>DYEGWL5C28-2VV`rt@Vq2tSu%`3HU199(c>k0W0b`HH4q~wV{n+TP_e}<0pL&k@
zWm}wpw@=x)_ntH&RcP76a|*Z$+W`1>-WA2N*KIF37X9%aB#P#p;yMU>Q-N)9Xp<ba
zcfY&Um4#bNpL@klt*HyIr)AvwZNsZ1_R-xRVpA!*4*j5K7eH;a??H3FNWhX<Cnxg%
z{0SzIgY-1CycEok=$!r_l7oMnteB~C<C}i3&(R18_IMt2SxeE$((qA+(4Xx0+H#=T
z$ECD@bYvI%=FkXiQs1bc;D-WbB$4&vYPpO6UU+5j>=|xL`IKS1uZ;|Isml(26En&;
z8#onPLB?gU;yr5gH5D38x^)!A>yUu9l9ip7C-@b;Wz8n~A?*^yq!bVE>KlaVV6Mus
zSIV3?uq^ikXJHS4AYs!U!WE3-;J%|*hU>U=LSa(T!=yC7$vA1%ePx;jtvM0CnfFd=
zM>kK@5qV93i`>ld3DoHTzwGUyurs3_r{NzsCw_2HbAoBWj^FK$DC0Suu8G*wY6|+^
zW#4ih$sA=@(ubKP3z%(P1o8{;Y#3Bp!d=A$jufP2q&ts}-nY{d+|AMG`7@xMPtCvo
zej{ltGssW#%GB1@NQ<#yzD9&~*9FuChHc;)2_MdX!GTDd6{bg9*My<WLF1)@NH}{^
zm&(6Vt=}SSEsY%Q_a>_F!j!Cg(Z_Po4{XJMy4mRHzIpEiZ4`~k`~;VUdgo=w8U{(v
z3sO9#p?tZ?$Dhv(bEz%EBSNjUTde%MUEkhg_!4x%Dpft$5T)Hm@M^3iD0IFdowl@l
z(2ueKN!MmVs{$r13T?Bnj0EQ0*x6d!+JRZQzFpIzo^UDq?&|r4t8@mArg(4qL}PZz
zrXU@SVM<c9gOt4MM2cO^yi(2I<TQj4dLz|~b9`n3FrJ#lEu^GdG{!p9C!G)=Y(or|
z#?;tN1Szt`eWd@a!5_=09m1%=&i5jTdjpr<(BpPuuIP4;g*S!Iq^F;siSQy@c&{FW
zZ%K|ZotRb=vriPwB-%ah6BF`{<mJa)T6_p|)MS(Cv&@whUI78&Ar5oS2B92>J5C|p
z$_;vOi0G0IqU9mJ>Ke_9;fv>NlaG<1bI|j9s2_`tpgW(3U-T(~40NRZgwvGL%tn=U
zv?f4BPE3%G1>*1O9$-R<^A_D}nU9td9}?`uT8&4WQ}hfJHi4PuVs(OIuJg0o-??_1
z)n_Wq(CxAry-kmIy`yIJQ>Xh<2+bYIPHZMFK5%DEwoKMmQnMXN1;%f+Dm#W*+6xmJ
z|Hu|9O@ph=GY$tQPF|$zyhTG+pG8{Q@S)kPYInEUn}*8VX_3GLnX%JkTFB@w0gTV2
zCj8CHV8BZq9-HRBc{~M*v|iEC3IDm$ee?#4QfI32Kd_WhG~=EQh}CwJYZM$<Z&XwW
z6dTp(x<(7ot>>+iNGJFO0NonxQ;x~)Q!Dq1<+m@^3XlCq(TwyU^+hBpna#Xg>LAUu
z-FI8v3kbe>cKdvpC5nA_6xVMHF}rkWUB81M_!UBbhOmT*PSKO9S&US(C6NeBB?DIj
zc3!8p-3ZyQ`(qC1c%jrn?ypE=rnz=AaBw#&pSx<|CyBJ+NLtpT#CvDW>IXI&4y?2G
z_4FDG)hfZ9HOxja1IX~Ok<Ma`TRdkK(<XNeO$A0LyuHNKE^m{k$J9AhSK#sV##9hx
ziw}ly7<K9M!;4{N8I)t*jIQ2_<H=%wKKGw1X;gXJUy$zC{#Nf0_cXsNVGep-;4x*D
zaWjAdzADm04}F|9``@CrHp%JV&%K>wu6(?a14e~TK0*E>T8XlWQ$x~#=l)aZaWze6
zYU?6ynR)5Y;riRefM)oKngNHW!T+s}ullUIA3I?K8379--RwX)s%=kGu#IgXp}&ji
z6eQ$#{y#-&i4J`)JTX_YX(S{qIO&-RPI>?jQglL3#+QhcD7=+d_|afbuBKPUfA36-
z$_fuWAT+NG{<kUr@Av=x@V_JQ|9=E7Zl0e1t7y}~dwx^ytTn*rde17K+P?okG{;e{

literal 0
HcmV?d00001

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_light.png b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/assets/logo/onnx2torch_light.png
new file mode 100644
index 0000000000000000000000000000000000000000..4210bb0564d5124f7469bdead918c305d52eb24e
GIT binary patch
literal 44759
zcmeFZg;!Kz+djGh0VO3Qq;yD?lJ3x<q+=wcySsA~=@L;o6p$E7K{^!)r4$$%2Bo{C
z>-XUMz27<O{0V3M_R{4N_ufz4aoyK_ZKE{Q6o?6K6F?9|tfUCnf*?Ev2*MJ)jt72X
zlqw7Zf820Y)c1fOZsx1sSiT>my}=J@J>~Q~wOy<|eazggARiweZaZfO4+}F_D{dEe
zn~Y6~+YodIQi99s_-3w8`vp?&ALDNi@Hb{$yIzZYNB-(T=1i`8PWC=(2cocMb8C*S
zHgJ%%uSM4!Y~M$ZD0)*DeST|FW>8Xw?~Bpnk&$_bg)6(j`{m{OrbMgsb%Cgx?!D{I
zuLn-|e<<^(`L(E%SBmciwcyLZ|L^sGci{iGJ1}hddi7f5|6T@P1^Slc<~3^n6x*8#
z+z#q5^M$_&J}Xv3{_ibCtnhqkBo=r0`m#AE?8#$!6vZ9b|8qTIqf+?adm|%@jLh6#
z{=fb{aqIu{&%eXh|FgMvoHRt<ab0tg6j5+|NI%@nGDZ=S(d}|0f)>Kfz9Fio+%oG1
zmP)oy&vU4$skao}R{uR{7t=xZZ}?ow2lwx<CMmMSYoi}RBsi|Ak@L_pQ%pw#gfu&I
z`O}KRhT!@AOW|fR2(p+CQ~eP&hz@eRwq1-Wr%dz-l){MbEe&8Yo3`SwCd_gEQRLCT
z5#GSxYjksPaY-7(*-)!HFn2>(;xg%3#n|uHYLL5w)Ip~j3puZ<2L-ZLZeAY|#CJO`
zc-gxB-?J<6|1&#S!otFWm8Ff1I+m-No!v?rY>?WunP$*da3hCg)|0G$sIrs~x}-{s
zI6d6DRN<g>*LqDu1}@Fl|DDHt;)@qA9)_D4vci_ImI?C9@3=#v5H|v?-1Fnd568*)
zXvKVRaiXKHo+I;n{@<ag(VQ7x4UvY#ruL{kyKs8Mc`@qZTV*Lbeb)^xy^p4TsHA^8
z=$_;@tNowDDtf3g)cfGUgJ)c@g!f)l^7i~trWs9VMvc>D-n)0EDkzHoJV9s^fAimH
z)f%$0Uubd$d`OYsi~Bopa#1#?fi3)8JqDIEor0{atk1NwD6j6+RCZmw{_hb@{<MPe
zzRJ>5u=SI+8DWVNM34u4m%ds0Vtuh%_|-&;SoP@d{u{|2Z?Yx<cUuVAgLR~psm8~^
z2@9!pI=wq2(D*@&_Ue}0MbrN=wW27;Np1}^HQdq^<)L&7(MORe91|TKUMNQW@5Tc2
z|K6DR;)|(5Pvu1ciV*Tt^pdhO{WvGWGg%|akB<7cE<xNF=6?qXk8|y~rEA$`{)rvR
z%vm)Yj6!+qqC?UO?Ed@K&i}r383Jz|1xIoTrM>}ozA=F>K1=qpLs9%&iak*XaGBh5
z@p~cGF3i6zZ92Bm{L0zmiK~HEOP<Knud?<^=rF&;ay10g_rRW&nyh)y{(JkY4*L-K
z7t@apa&ln~p*O0ksvbng#8^0YFkh`c*~0B5N1w_-#!NMzas}BPCPku`q(!?}b9;r&
z4}Lb97=HG|f>gb-uU511+%8=ow#)O8cTg`J749_2O&ZaDsp!SmcPpl2roXpWVyeRz
zgSB1vZ0Fx8EYy`!Y2HrL;N3q$aT>9<aY1+^r@LeXQL(MQ6OWnqcKNgtnXb-OoEcig
z5qXG?`phSVFcPup&rB><`{)0*AtD|xIaD)!cjmvKb_I~!#MvAo6#*k7UQoIE6Z3zw
zdSj(8jm0Y-c8>#@AUDqp(b-E0;f#^o!7{~%MB<a2Xyoyb#t?}%8Vd@?rimMt+_Rz!
z-#Ikp6{)GKOYLcYT@o%SzFvtQdU|;e?FY_JbFK5wb<OncFNXp2MZCMxqwzec!CT6N
zIr>{xYX63OYnSc^JyLRhir80e=Vip-_D3Ml88Os6Gb2-`x`u9Y-d0u%J18-nHe8ri
zh;`*?c*ooh2h!v96nC{*dkV>A;-d<zwm(+}U40#xirwtD7sU}WZciw<ccSv-#+uKl
zu57)&=;KG;F1Jklj{*8+mU>~+49%C3@_BSq1pb(yLj<H}u;&(PrEh^{h&vm|vZAkn
z7^vY~%T;9AO|kr};Pgz@Y<_f1GBo<iCFJUsu4~@j7Vk{7{nK6>9q4k_F~rVH`3Lh5
z43wz+=>4nTB5K!UMp*<7Gf~-$QE+*ASRzINMm9>od`cb#H%@@Vs^lzFBRRirS~b4@
zU4CV6>RQi+<<a+JuZ+I_1;0Y%;0wHTH&vhO$#IuQ9Tkbv8%|Kq`v&@O&JerbRSt<w
z5k^-=CzSX3^Iv<iv_EeAYlwaPU$%eE5+=tBT}7HmWS(2;Yp!R1;`=toc4}#DJwr09
zl`#2UC_3yaT<>CEZI;aMz`++$hDMMI9xp8ni4AP^?c;sPNY02K?)$9CtrD(-_g_AW
zcTl1L)2e=0V7D0E&C}%g7`_?PVJ>a(+4h$j&!k<fB1`3P&4vZUse`VPbS^J?_ilD3
zjF7rz$xVE?^CRz76gLKGh=cg=wAWAkjLMp1oaJ?`?*5y!_@?ymyYbI=){Lj#9}guz
z{zQ+LEUspLToMwz$(%BA<yiLn@Od>zpE0aL2d&JU*bPhgBdjb!&yDX|?lf6jyc)+S
zWOa0YK?fPmo6-I6aDR+(;dZ9qU$fqTO#CMzb4ws?IoB+Xi&YBD#Ln?hO&m|^N$=5_
zT`Qx{&fh)tbbhZ_g`!uR|KZ+bjSaR4I&WK?s6CZFPJRf*s}&R8{yPIYn;MeW*;5(%
z>wkmCo;jS0!JC2bDmcxv_S4^ef<v*WLR(0JCv<SO*Dtaptq@B;blOwVGSRS@RNarN
zcJOyT@@#+glEP)RH_2He4bL<^@)7-)qqCJs$fzYiiQA?g>+%^7r~hgaTPN$)!UcDC
zcLP!W5emOspQ5#&Jb5y+&C%ijOYBzX>RG)uYmkt>jaA1elc=y$97j{*Ip&3ZtcuzL
zU?8_9u7fO>G9yj8YDhReQee@L%*{GScy0VwJ+nF<GC1h;y$L2bHE@sx8oL$KHFNnV
zhNtFpscG&smP*8`r_Ah~=U01#tw*x#PNu)fPwpZ~9aR3=&MgYq|Dd+FcMY!T8~pbc
zDF3Gbqx6N2Wp{u7eypjlKkf<@`jMHyxhtJc@dbPQ$bty}?a&)e&iI=j#)Q-BaAZPz
z2raF=$0Y2Jwgqier1LByNc8y+JPfScm}oXszg&}qdLhg3`#05(c<+Wx=xXSb{94e^
z&vwnMHP&zXTpIOn5?c2u324_wD_YeMMhRu8QMtTdONOND?=?8)Qwv0&!Dp`Vj)k&`
ziN$*eAw7-EW3?jxS^Yn7q-FcHV@O^1*X%`0Hd*(WUyqmcvB2yeG0x9IT2<etyqfRW
z;WpT*H#BLa!0FhTVC$8?05jlzQEO~))O%t*uL{om4W*TpRUsD5s<$E8u>%Wveb+|a
z_nUF@wUaeY#jAVUra4dAd948gKt=p#WFdgNv@BnE+4IEd4&dK86<<2aO9&92-OHMH
zA2ROe6}{f}Jy(QQLk4WUb}M8ldZ*`ZzkB;q$XZypHr;Is$FJ*l%=Aobk$Kbb*s^#w
zW8dR}o~Oe5bZl3q(sn2rS<lLt5Hb*2r$j*#VN%KIz$kHaT(tW4?CfkNhKg(AkGG?J
zmL3gAfw?*$QFF)%JPne*hA-^dzx<ndPU(q_32u@eC)K6zT1{ETNZF-Gi~Gjv=Gqf!
zQ3=Kb!B3{JW+)ST`JTpe3U$-Ki_csdoDs|Jn);hV-<}Of9!(Vnl&9+Po^vQ<HBzRs
z5Xf932#dV3qHH1Z&bK_A!Tx0#($LjZLkI{76-49t`!?!OC!rur1TT0`S(cQSx3DrZ
zTTmg5xg!n$aw&!`R^1Kl!~SQ+SK$M<(?kOOVm1>eYqUbWb>@4wjWqUiWQa!2Of&xe
z$}}B2V*qT{;LEk<yCSz<X(NzzN4!!Y6V_14oo;uXF6_!yT0wsLoW0JMFN)b)>s(jp
zxK2obMHd-)(A$vQPVJaf;gQe#)xq7R5{1tcxa}}m6H<)w@w*u;p+|3b=$_Meeix#W
z2QI*Ig^He3Mji9xiz!YSou)><dp63QQR{^H<YST;8yg!P8%x9^^u_T84<TC+^jCK7
z;L%3Gs}KMVr2eF_4y@ani(h3w=nm?{Xne~zc2N`k%#b+MJWI^$#6nK4V_&A7KRd(h
zMbA#r+wVa_hY>f=g#MFIPdh+DB?nOUSjcP*8#1zIl6G4C&f_vs+0p8BIvEhdT>m({
z49nHC(!()a{+5aq_xb!be29?kv~XE>T($z+wZ!4!)uN|AI-SxW7?$0~tU%TfO|d8#
zZt~mnPAfP@MxK4^tAp{`kCf_9Q}Y-%_4U;ai~T2zuOLBSVVQVyj3&h@=6_PIUNK`u
zp_a+JI3gZQRuUAbvRPD-l#x~PT8ozqz4zr`UKSkibH=Ak^mFb~!H=Jt+S)sIzBcRe
zurx6(O%$(fkGnJfDe^x)LJgt>>vyHaSms36VB^zNGn%Le8$A#HL>-l9y~PlfomW4{
zA@OlZmh&Ie{gQv5sv_8>C&k3s?=bW*kf-I~4~qiYHc*i^UZY!vW8U|zz*W*xMV&CD
zJa#@fJx^$)=jY-ytlQ@AsWiss{l4%sBD5pXJM2GWv#Tz@WXWAjcU0@=Baf1U`0SZl
zmoo{CzxQ_2*RSlU(m60uKmL;?=1HGC%%aWQv(-bBtSBNm8d&KKA0hxuO4`|!M<4^p
zSh;NXI62p`OzV2gE)5M0>BYD0%90`_{>YBWGJeY!==gdCRzA>O<%}14UynY>$fKo%
zK3=vVEaYRQu%M^q#`-o<FESsWRFEZ-eeZoZ{wL~Nr|{)zXLtxtz+xl{^&>Bxw%~95
zQg%#Z`c;Wi?H2|AbIr|<-5U`!+~>Q}>6RH1({02ZAvv|+`=-D>Wx<%x%IpdS4ibW_
z&zO-a?S;ZQ4&JbME@N%=tHT=~!h(E57Vys<Le3*K6`Zktvc)hX5O0|z_r4CUreFfS
z>gGhw4X9L2JlG#x`LH^C9-cDi)bstjAG$Afh3$4+d5u#*2Fly#p<{f^Ck|C&6QMJI
zMDz5*NBTQ*z|y$@4or+-%SdYMpdgoPW?hfwuE78|pM%=m{_gSAx|Hpiri#02WG&QA
zAXim0q)m>;LN5G%HdLzPHQv&>o95gB#?3u=3|G@Z2tSshAPAfAw<$xd(m`t853Q-G
zEi`ZcFiH+6!w*YzA)@gxj8_7qZrr?D?h}^45Xe<;!$KiW!u%Ndm{i<uVCh+{%lY-T
zFVzDpBh5=~^Onn8UB7sAON}*8j<gd<ColF#6@)yU`^`kx)G~r63XmA;)1eoC@=zru
zjf@;Ekd2BEJE<@CiDB_SY^0e{h+;M(Id16L8AM08eiF$=WlcrPtrS|{E5p>&AJE}+
zradSu4;)#j1JfEN`6TjaKG6+#h+Cs$eWI&U+<U_NV%+Z>Yu#whe?Ogc&2h?4<1q9G
z(PM*sa-6q~-eo~MXzKeWzWW`TpM1~WooBI!i&!LxU1TK{_I59iy`SS4E)@?NLJK$P
zHf=wT9gxDL1|SRgEZi_<(5{h6j{Ob1P=*o|3h(s%Vg_o!?#BM9glHT-YsGr^1YRQ^
zuYKd)58hZOR6}(`!%5y%@LQuSP-m=kYUD?n#m~`Sdsh|dS++92$^uJ6oe+Cp1{7WU
zNFkFJ>9~CyNH@6~-r=NaY-}t)T5g0d8t)M~Co)akzI^Rn(wxXRY)~K3Bgb+VW`a}X
zO~1t7in%N{qp|Af=)mNQh2)+9kRt{y&snP=&MPu@2H!@$X%L;0hl7xtXW(FFIscfM
z|7m7x?nKGaUoP{>0~rKEF3O0geVODPKe8WnTN_;$>4?9bhDx@wv<w_G-eeB_GAp~|
zdv+6Q)~Qa4w8UO6K<T3<zVOC!$Ik^4-?65V51SKt8*Zi(*=-XAHxClHeJ;^~!4?d)
zOP>anutt}ge~hBS`bE~3U*|5^n|rXgFs%=$h4{HrVIqqlHMRZ%Z{E<i-4q~N`TZ+5
zBLB3UbX9!hHgTfqyiuLd+_-u!M!LkT5Icq(slRX!>`Ct1k~kQilN<@&-FX$R*85}i
z<KFN+{u#-G<vI7W@U3N+AW43YUPMYD&(lhgvy=1$>D!G<N566K%8OA`R*q9mZd-$f
zuvD+tqI2=Ox$3QRo3%qNV>Q#FiHh_*ur6#G7%UWvD89oy;wVsxzK9u)_+(=;DY;~A
zY1vj;$^!)w%`!s{a`UPXPgwhJchXh{cYRq|*^pL@bh}$-9#MNY0&Rx!=YWQ127FB}
zO-?cKsIPAHU6rN#(9Bcx0<2Qv=JlqhhK5bdN*EHsJO81Iv=RH)pmw-r9f~nqk>#vy
z6SJ0$S;@w@onxva?<6XMUst2c%^N{3qVZLRaN`&_jPCVi7TLM_m8r0h-_RAdz*IZv
zHgd#^#8=HM1g-1S+l7YnFkCPr&Z~ROS~byIrI|JE#Tl~r(ho)WU)|!+`L*{>MCdtu
zY3f5g2GYAR%k*4kHepNMHzXs&sm7+p$p=^&C0QDYDG@ReH9=bxknl<2JfKI9^Qm3>
zVg_tiQGzkV^;fRkv)5^@)PO0#=y7^CMbdvjwSNlO`X&2Guk`W8p8cic!C9$2CUirz
zX6cfv2ZgPHH)}!WH{_Nn%*V$!!@<pc!RLoz5VY*i6b%KH0?Di`)DF#KFVBgPK!Iyp
zW+nqu;hzF<4=XDxUs~x~!72fNwtzZGo)f9y0+o}^>lTLGr&rwBVbnk({+(sb)7igK
z+9n(0c?wy5(unH7^G@Q&4or0Xjyf<}r3Sh8CKX19zfci5zLwoJPLuIY%TvaZ0ZyV$
zC-)o_h`ux6CMMk$qn3f<hiQ*_1Al(xkx!s^nz}f}jLg)2<0bFkfYj_3YkzX%hJKNe
zU}PelwM0njBNXVufA763HWp-JhXP#wkSAL;6VKuzf~2ToBr%LOsQ^vm$OsFYAD>@P
zFAn6ij>qY>ZEB*88a&fhMfz1BeysVC;AQ)}IBgJw1D`j9B0J0<PS%(}+ak!WufMk&
zl7(KYsSMG_O_PdqEsFH7{;Lb+OK+R&kL*BdREcSy^c*?ur<oeIPo)$Qg;=RIOJ@B7
zeTPKtzJ>@UU@tC^tJfTUg<i6t5+Wdg^5Wc9;N-o+r=<S590UURu8lJGx4E6?+&<eb
zMoqqMjmr`UGz0hLx<TNUriKRZFmEjLlwtv+E}<|=47`{GU#iNEzRS<*_t&tTdWPD1
z_moBBJl9rU^EQ#8jjxKE{PpZ#+e|3awYRd0n2{D(^mH*D!6Jd<+daCf`uoDdlLrR8
zH0#fdj3j5ue1lsr(Z_pBArrlZVx4n4G4x%c3W&le#ixW?D|cWNSSo){&jE8c@<Q((
z84(_7bcZ>=j#ojXN?eEeIhO8b{c6wGO_D?4c?PO(H<GP_N>S=D-ORUc-Reo^)c=~M
zr9@7IC*EoPX$Vrm=fx3xU{m)Ef7TTtX$aqiUkf)|uXGAXn-jUDObl^HlK*PC$}5#O
zaEDq8QH3K?BUJ$daYO8CbcoBT-K<+B=wD*EVWFq=Qv+-C(>G%}gqn62OH3rrzYhNT
zq_;l?vI>kQOMLfy$U5`sbhATk9wg9EWjRNkbBH=Wejo97bknFYN5t{xPoJkDO$7&x
zZ0zd2Ib!=^a>_<QruGhi_}KZ*o}==RXznX;DH3G%y(b;`vm#JeRChHkEo~!b&j~4#
z6?p6KgG!==$g|sRn$F*%yXI`LZ8dG~Lg0j{WXO;bU2Y_GTM04P0b)O~rtI`bJWwZH
z08=YWd>y<IZr1x0y*3@Wuy=d`nK~=a=_PxmMpBB!YeS}YX1_l^mWt{Is1|2dezIm%
zB;KQSU@0c~%^OG6C)j<X`f$cwj)T>vrlyBZ??cUj<1%AOaKW&6{={@A{M;RF#Ny7^
zde*q&c$2JF;a$y<BxdU{L;*2v-16NEwrj!j=j8mt1M93kW=!>Mqrl;9$fu5eB<aIv
zP#I+1_+kO^k-FOZAkKTXt=dTq1g+4`&Wp1j;4rt(8C&}CbB08-UkVH@(yfKbj*U5~
z8VO{0QV2^l5M5{_<9z@a6BSun9B4Xmfj{s;Ig`=}Y&0yeP{=SjE33I3aK$?NJUDnn
zN4gx8_JYzwBl&;@jvZKPWTA>NB1*yQ64WX{Wy4^FP^!)3fi`rZXKZ{?!K$4686{f}
zHwM&bw;aV@A*s)*>Rn-YCV<r~HxpfMNlNLP7}(goCt(nC+&euFZR)SW{kyiyuq9dl
z_ukpzxlg<Xy8XZ>cVMX^oA)5!r$8-pgrEN=&2((dj|caEtbN-W&kLY}dZi8@U8;xp
zl%Q5;#GfcEki2}<g%tM0XzA&VJw^}$di+JW8IjcXEv1wjvnr4cuxP(ztMl=ty=mc)
zi!(&AT50vo#PwuD=gEU{ls^NKg4DO>@LC%P<61^W&EEz3FkYKe$1{F=iyKe8%<XdJ
znA4mlrE&{2k_u55lv1fUkd9YT<kWAGey3zBIh^NhWdL7fXr&Ja!(oYCPLsy@DC2bx
z!BUT<{!yrII&|ziOSst-VO!GI>&Vfbo}Mj7yPITg)kh_$zT2wPw~qr`i~^ERU6mSX
zZvJBYT=2EKyHMdELHW!DTU5i$l2jAjs35`rr}^!WZp*SYcbQ_IO+as!xG;EqmT=K3
zDdf3x|1O!*X<A`eSXh^MyazxW)RBr$EAes#WOX6xsBSqxAFMe}whXvszP1%;kVHpS
zdtY@q9z1A-+7Lzha(6q35R$lU_r}M^*SLBbByV2tbs@z06=@I+QbvkOw#0Z2k{i;n
zX*jF5$=rKV7@u_@`r)|w?H0OTpZ+0>4>9t*|EuDAl)A7};N|7T@!?}(r<vi|N?apx
zpNrkUD}@zO=JU(HZ;H0m6SI-;>_hS(U=&*vGD5G?-U_Y)aw245f1%>i#g6++;@n7c
z>LK4xR6$d~_nM7hlz-fuND8*C(vlya7tVGwu<<Y}{YxG6WZti3hhjC<E!(FN^;Y`q
zSVvi?g2$Tk&*1qyNb&pk@0atb?y&<i_WjtLVC&#~x3s8e`6qb%nQB+dZ+8RdWoUL6
zBx=wbGD4%#t*TuM2k?1&L>AVpX{|-cYG1OI<XC!&1h49zIg}(p&`OBh7*y<v=l(XP
z44)V}*~l`A@2dVei5$_P!9u>|aVwEY7xSOoqUq1lpeMCs=#gzG)$AY}5>PKHD&qLE
z8N}a|Tyyxc#%VwYO~=r+vnu(EFrs923qEgCU?(*Ttcj4C`|LhisaihSD$DTc(}M?f
zpPwM{PjKpZ7!ypK^hZ<^73a5XS$n3)|FwHkzMY%~vqjI@Gp&W#UV3#g_^n?MT>B)*
zR6+anv~8IzZ^-^{<{V3lFDO|@&-aSE6t+sw7VE<M8C>=H)2q~_92_t$_)IvahRT!W
zMWv-4%xti2+Lo38De1L0L!`lbtu9Z+<C8ue(sxwIFD{&0$kFK~=d)T5WKRS3WUdS4
zFSvSZ&43K2O&{i-Y^9%oRd<&>s#L{GYvl<dLx;Y2{A|k`!gOq=-Cp5VfKw$WCsToJ
zlLi`Zmn`TTAD(E3i~<Dp6wSm+9_29W9`R8@469C*nYmp+btqmwPrXp={RBq&LEwM2
z0DzUONiXoLkM#ZYQk5O@!S3F>Q?q2BtbSakyCcU!NfI&b&*U1w6ZJ%R?gM|9TSFtg
zdtT2D$9IYQu{!<;8rphCOYcV%)B=0E779(gf4bl59h>MT*)!bcjDPv)ZV2EMH=y!x
z!hjosS`%K26Ez@3&J?34w6c^5J;@VG&Vw)Mdn$v|va%!-`0MBZV9k@87Z&WnEtEts
z5ST_eO<v+@(2ztNXRB;ziN({i*n6=VOYE~~4(qGWEFEqUEK;*Vmg{TjM(D;79o??g
zSg2oAplbua-@3T8)Zp~nka~;9P(`r+PZOpf+x7k*E(hc9ts_Mg5=2+0Gs;^orhEF7
zBMvqKwr{xF-HUUYtQnzI8e?tTOMz9^=3#7(NTscIV|7=Jgg$~^i}I9h>s#{48bSae
zN3iR#%4&JFFlo?C73W&^c$H0nkYMXu2S<gR)B?XaGnxlbX2ggl&A2k0jg36YhpXrQ
z6EE(2!^>($lQo4!tM?({JTFRldvP83g+=-z6M(V?3yg8{Ok|0z{*)IILjpFwl#>p%
zSj%&o>$N9`TU(3S<j$8FD0flg339U9EyS}8m3zD8yf9ccr99N8X+uuZG|(Mv_vqQ0
zgJ(?9YH$oQum$c14+7}hsuhjRt|{iFF@u06$=O3tvbqKtc$iu8AT~&5YXKisqB11G
ziayS-0u*emllZmS{ShsiVpW8?Ci+lysEHR{fAa%2XjWGL>^)1%YDt7s8XU_1Y%uN5
zz-kT2>9NCVE2NV?&plX@YOgH;obco3A5~;1VVCjUi#&D7k-0hb5n{XLZzTE*_Y2h1
zPiCUJGI`sU_5|yCpLztf_-SV)$}|TY&JfMMo9JT86tEkv;*r*EaTqN(P?lI$@9LNG
z`n5(XmFC^^$+x(u$oAcw2<hj}j8F2rSatV-=QI}B8H2rh)^zOqY%ujh^6uVV06z9}
zTtSDv8tW`92@sLSWbU}i1w3P~eb~B|{ww1yq8Y0!;7)90_gheHzBJ`M%JQ6u`L=a9
z0KlC#)?aV&n#f)Ur;MtBH2x(Tvijb<ZFIPq9-yc<pUNPDMtF%vN3^LEm!_l91a-}c
z>;5Tony-mW#Ln;u_Valw?SG(b4C5P9^3*Pq*K0rgyx=YvN?)%M(W;8wYK7?2<^z!e
zX`<$!RSP&ZLek_Z6?_=q%{Gc;tLbN&nmc({sXt@<b=*VH1C#~HCPdzU{lOr)Bz&Hy
z#))aPHRN(_;#8}1xCL~#haZWvy47;5Qc}nh(;a_#)sCO_xzz!hZALs;>*b(L<u?^X
zK4i=vyMs8lM^BJS2d-lh;-+dWx*if|O*ZOo5!+<Bcb>@N;4!ei!fS0!K5-oC6k~26
zSs8LsIQJm>BD<7zG5vv5s>yftGX6;glE<4E`6}i314U0I_Y)TKkZ+sB`x6b1;qwoB
zDx>FijsyW;{l{RTQBWk~o%Qq_E@~o~aU$CMlb#wcP45Fxd5x?+e&+Sgsu~m)8c(0r
z(8w5UG7r(C{8c!q0{R5{_;Y_PpBDGyL!dp|DxkU-0NZ_q|2x9U#>RwjR+sUcw^3hs
zUdT(Iyf7E8d-smp`jEeDJ#QdcojV;BNPI+R6u7(cSc>k&POZ4>4G=O58jQT3kvgQq
zI(W$3{O;YmR#Z#m^eJ`zqbK4nszbfxiQ-s>**_#+!=XSPaq$3@U3wMBgW|<%91N)i
zX*{sQ_xtNxeljS3RS?bEOnH6Xz-7R7sy6?2L?Zna$3mx(#DWTlNHeStO4|F!myE`6
z@Q+Rns{Q4%B6^q=7!!y4`ptLr#tAE$j~2OFZ+IDP*UHV+Y>a)vBqAT<6Z>6u>7Lsj
zSEm>SEze?wcHQ{4$EqgeJfp*&6C8B1;o*4%GEb-X%9Ni+Rw!yzO^ChrgW7CRy10*X
zfzxCMM3ol=;s_ytx1)pv1X`k#_nicW%>WjAi#lPMShTI71)x-)m&Q^3^5XDx8=<G?
z+Z5RpC8R@B{Gm0n>8j0)7jyKTGEP3FXcZ$#5TXsZg%X;qrr`6FJ4n1kR)^=8c-YTP
z0w^=mWf4Z_bSf<58*W?SPs`p?%bQ02%Kb0}|6s(8s~e(UcGV0$JPuTcd4-WhD*hlq
z$jQk?M0fYtG68*pd}Q~70=w*tRk&7P&Bm)pPB`8D*kN}yPO4LNz&}0HFUTm81+<Ld
zkkT=KKnxA(uypWorm{3Ikx6TKc^o_W2k{@R<wKgHmOp1Ln|4Gdy&X+<QbHvD?x;JH
zE}X5cuqTw1$8UJxrD3)z(xuE+_iE3NY?!F~E9HH*CN^kTTtcv?&lae?f~D{oh8=SJ
zqTda5EcF>6wp_g>em9O+tq;mv6^ELdpHvW&dyEg5gQPHYV8+Q;Gv<vAqEc|2Bq?5H
z`dNe7N&wYhLVCgYto(bYu*mg2AE)$1qAs^=x#k7@$<*?2GYPYF>riEBwj|6}D>;%n
zlunE`{I~Z~-Wq4mD(-9w=!jC0l{9l@rSxAl3ZW@Owd&RFc2biSbBSF&2cB36dAD=V
z_KTEkIE{LJs+vRPp;V7RUukb|uL4%AqM4Ki`b{rA;Ho9=Cn`4K%`WSF&cPFl><0O%
zLd^?Se3KyTtiY~c7oAfeMx78h(BB`-K}q1&ao2ut!rkUZV&S!@Qw^809f|~o)6Or=
z0SS^ntDm);<@QH1ll|h(Y9#R%?En<qpQQZ{E>m8YO<c8?7sAY#<Jl}()G4FwP_^<X
zk9C&E@1EM3?W!NNNrmbJ*RVsIemI2a?D%m$ovuCj<ZRTi?gN+ae=j<B@^|yxt-1Y6
zRF!qUDt>wf0RcfcC?j-#3MoVgBXr69h6VZ#3|U^FMvhOfhdm0nOAoQd7v2a^Ay4dh
zuTj*88C`cse&sZ|6w^Ito54J~x<))J4a6K<SOY3AKnd^i6kXC3$pynyI|UR_f11uk
zwHA+mM8}M^5=FFc`IBX&vjg9zA^8|<p<T2Z;JnHDc*dSEj11CyXeEb>Y!zHH|B~^5
zt-`jVLK$>nVC3?*9mTSv)0hkF)LiM8{GbSy#ti|BqE#ahp{E%)TMxSj2DWqnq%4Z0
zED%d!5xr{VKf`$uHIeSU2a}&KxN$thLpIx3o#3@LxZlICE?K3id{$~$RoNeBm}49y
zB?LEQ+MrWskqUA;UUNBHkDojXG0+G+TY4t6;iI3Gc_GSVY~+b{5DvaPaaM@E&EJpo
z9UT8f^T~H#emEd#^;H_W<he4o$L_9lMTiQJnw7Jsm?v8?Z0sF__Q)M{eLh)hB;~VY
zbb$>aL)%P0Dt-Tw8f3d-6%O3~d!SHZLo2b!${QQc%t)3vP6{FRc_6;qbp@(qiO&Ie
z*JN=KvIqr{*c?jmqYR#zs`ub04K&ytuK_x)MPJH`=`3-dnrKPg2?~UlSCxJ{Z*o0t
zLetPw0_K1t@|I_m4b4$09SkC)pXVeMDxtQT4Qz<|g9tVBJx3IfM=WH1*fL9&H5rR7
zCSVis^S?``r2s$RI}5&Kh9=M`T$uSp&9rf}?64Z3MTX%Ndp0CG%M8^CQIW=WCYshV
z2aPTYPFodc@A@mB<wYb+EL+Fl5`XYi@~lE(ruF#X8Od=MMFOLVzJC3{f^}f{fauB?
zMRG*2?_TtSMZ@&T+O03_(Dk}BWsf%@Mw&}z9X3bi;RB<E?;ZRPmi-22pNFnF4wtkY
z&snSc?8{2Mb)1yKZqtTt->0m$sRcMT0F|T|Hc@rsiL$Z_zN;I-kkW@qkk7JKS(-+Q
zwF~1R06BCJ%}g8H!D?`8gJds#)ZQ5L&FhAErX`bZ2rpX@7$<I>1J<S0`nwd5*yBI(
zbClfrmH88)W5ayT6%TX%s=3)W{2m}_oe2Hf&Ha4-LoP2i^uW4QkLNx$G=}b0)`6EB
z^IP>P8!x_Htc;G1Cc$;}K~WU5+I1wROAcH_9XwR7E#|J}Tr1aM|NTLO!xG5nc&_nE
z0@HYbn&AgwKfi+G(HqW<O|$eO)~g^C-%0&`#x{}PCi}oXr9X?VUiL$9^>Iv}myiY1
zL(4k4EW$>q$-12~>E71Ep=VoX3)`cZV2jUU4Fh#qV%9FLv!zYcA#QU6ts-l~{0S>R
z-_I#hPx$nuOtzHny>r(X$fm<OB5_UYz1R$kI_WZ9<}p;|mIJ$1h8n<LP6Jx6G?cMR
z`eIyhKl3=8g>J0*7|8BYRs*#-wBfm5OnDdNjClZ+x&HO**E~q#g|~o(B~x$)H>v#<
zT>$wazSW6N3)%2+aWxZcZc1uSZV*y&>TPN|+83Dejv?Sv>y#)@`a($`fm(w09ERZP
z=!WT)dgVJ9BS9~8t6}^k3zORE_C1kQ?00{mLT3b;@Tsc4mkaselV`BOhlz@uf9>ye
zEGmvv8UPIDOw-xgM$1fm+~PF(9&GSiz!V2ShObs7w2G3<X!d@QSYOvX{RaBI6plY8
zym8>2XsMBk!!LAdV!>0IadDr{*BBfS)%_Ub9I0kfCug<zJd^Sx*9C{xy3+%rR-WVb
zPjo^-H5k1h|NVQ70++9;o$C)eWAnP`1!dv4GvgBP(@A6WmQ-t9wpvVt78~Lj#2EeN
zmzhl}f>e;j6uX5v>(;^FpK55hD)v!2Q_0_!g*)@B4yk!DDvbaZfFH4+t;7BrFTT9I
zTu3|XWeR;4E#P0~W*_^rxAP8K*XXg3>qE0jq>p_59tYa-!>Se~@6Br-kpdcEV(N9O
zlPt*EB#>~5Z-CJ4IH3S-$6e!R+_CFBzyFe&#OZc;AodBaCu4C`9BZ*6YoRI1>(}rq
zvTOX6Rx4HLC?FLjFY@Y@3!z`0n!&-r?1&&r83QTdLJFX*v`arO?r(R3?LLMu%3h3i
zrj=Iu7|Crrcv*rp`o_&_Dhk2^riG%5{DX%%zhg|SjSNU;KX!KNpYABUt(G3x99|*J
z`Azq8I1q*MF7sR2Px+D3ynB|o;~UtjtUwE+5%oNZ9#WUeLj{aFE8uk7*+2VUV>TBq
zmJxIyS+~{SX&f*^yW|<N73xC~zdDn~MsQC2=zSp>0jrmhfiP&==p!z1-;LvbrqX~)
z5ME)Cf!9sLP|jLt_f|}6`_kTqr}x?$MV1E8t5b`D=O&oZ)I;(`_S{HK#fyNr$nNtR
zr;ytu5jjuXCyNIQtuawJeIM=1?I8!K6n|S3=>)6vp)f*haKnsr_BGanB~@0|Vk4&{
zd^V@T_7?jb*4=5#qEX!8J{?-PA4%H$$&PgpwNc$Ohg@B5m2&y~$l$T_27&9vM)5nK
zC}O0IlgB8Tkpuur%oLV*1Qfw3k8>RKN4^8%Lz=tGiLVtCl7jNrOpS?&>{$3b_%yp`
zC%yKr5UX4O2RO`Mc1#=C+g<6TM0!GZN<5$3BlN)}dj~0eAM;=Vl$I`9)3~;ZlmuA~
z&ERflM>;C9kmulHKaY!_TgTQwfNSOOR2&py02ukBR__89vD6PwoUF~2#ZTsQ<P`Ar
z(-n7<MRJ~)7l^U*rt^Eq0XfTzNES#2fh!{nO~dDJBKl<QPp*Mn{pfwf<Fbz?VMTZC
z4)GM;#DjKp)t?(Sckg&5^P~mMl#cT8^K}Wcj)*^)DI0n(Xg^kz@bgHLC9`a@#eZw3
zFlV{ad>@4;95T?|JtnyDP3baA#QBj+>p;%a9&3hfkaiD?&iybmLATEr>9+C1)x_)u
zV|t8(ogbC8Td<V|xQ)cL<rpdRDe3HE`PO=VZ+TO)str&cSYb|){{4b+>%Mff<bwwe
z;i&BJ>5NfUhgQl~6gM{jXa!fYVkM`7L8-%A_RrlR^^(=9__?MxZ{Jp<>gq&B#$rN&
z;uVnYEZzxx62xsg+vToRJm=uZ^{>4v<9`0z+KlBriU)`_iI7&%3~@yJ0)B&S0${>p
z*VorIp-yZO+%^Fy>vqcz1PmsP<NB0*a;oC0v0<-2LW(6Z>+MK(d=4^rSAOL(;Fnbp
zUX4kC2^b)YtCfgW<<ij8OPv$xFXj#}rA$<qCRjW?UFw?K38}l~LW&3?QKf;<*Ha?9
zwW#N?AoEXS*1~hug<rc{fc(|px4AUgX)+Au|31ZOcux=+QncF2&&QX2kDJ@MclE}k
z{l%%~IZtZAIC0xOn7foWCeO>22YtSJ2NuEucBVW(|J#1^*aUI|$|g^=LF|9X%*+%7
zMRssoiG6RnU`c?r^`a)kh9z>V?V+MGmZK%meuqI+?4ML&16R1<`p(NOAnn8su6U0V
zQO+ppcod~dgwStj`K+dCF}zKNzdo;l-mT6F{Ji^{osqT@Fg(n3Vzo92jL-U0-A*Qc
zKlV_#o-gPu-{LTNu;C>S|GPR<rR1<WkjZD!Mp)7MR~5#%RkxIySGPZ0U&!s}K9f{t
z%~Efd?vdD3tvVXj-DK409R@n?Un-^v8hm&DI6;iq+Zw9utH&_+YLt+BLN?XPib(%u
zT7l?uWgmfPY~LpcWpBWSAH8N(zEA4g5ljix_|SG>UzZauF78zWm{8{CrnkUG3d%Sq
zCFQUXPyxf3m`LwuxU=f06AjQaYpR#Emp^Vb+S};@dF>%%LQVxL#mwgm3vCNvl9oY-
zE%^0>g9Z!vy-%ELfe`;xYzgWypvfz6x0QwGghsBg91#0o%xWIJ1q8M99*0o%t2vR;
zwhmfz*sia3^hbQHz3i&?GT#8bWc0EuR@`4d5qje#cNJaYXF}_zsB&s~)H@Le^k>9|
zmC#o0J<DOjJd$(ML3}!`tS1j{QYX4Y8t8Js(v0?`F@sf|BlguQ{*1E0dJy}YGIzFJ
z-Q0GBojc;>^Cq#*Znc>Z5)uZmkVo~vExD1Q)Akn9B)VvFXvud<&EYrXOD`iLE`EyP
zqMg^;b8P;+I9FI;M*^9~bhB<>91=h`6X}3bsrn%(92we<UaSng7X!6NtW2k{2X|8)
zPPz!W|B^7KG@hN@E1K^7l)?MwMSnuq9G!8%=|1tL@yTD%KJyCAQo%VP)${7VXX7MU
zr=If(>J;>J76a0<=Dhw_6J26nD}!bC9ug(C_ow@v41zYU=W4bEHqnv)X?YLU)P@!+
z8))`8kr>wZ#rrUKBR{`mO!FqZd<A~akCdjSqEhpuUm}6J{Kwgl7IK!1NM#A<LP2dP
z#EpI_<qD<zmczLstBKm2Ko{cN2|RBFbLU5%w;a^Ze^kp7>w~sk5+?1OW^~ZN1cJyL
zLk`PPenkn`k$}IwjWhv`v)ajG1^BlYK6JUlZ*PaYMa_w9wJ!vj!RHGAUc%RY$m|UI
z8`9Xk^GJ3fBW&dK)08g`8=w&L$L8?lw}vIwVfuXM<EO0Z*(i;BX+$fUspWzAXof=J
z?CaeD(zMUsgA<_DbFaf3njuRZg#sbdvPj39iSF&ZKh~)60e|@}Hy0+(&(Hr1L5jz#
z2*<;raij^?l~&$44rGNId?`Z`NOkr*fht=>qK>_!%$ll;2XKo+ys>LK`(2P2<8s5I
z7P^Y7M~sI&>MTds=`;2XL>%^!m%weKH94GLwx226xW1TfANWcHBl@<`8y}3;=s<^Y
z47~B=xjL&o1JL!^w;k!vH9i)^e%FupJB7+{w;Z$OvRX>^#+jpxzgmF#<`EnJgSBTp
z7H_vWP0o*o)K?psnlegIrwmpbhsWmZ^x_9`X%vM%eZ4=PbQlzsmz1K;u=mAG_|BfQ
zCA^>s=-p9}sQ+8M@fPtmfbao!y`BHe-)`{zfm@L0`wV+A7$f%2d|G~0N{U2ri%(-^
zfC{t!o6YTf@7_JXyOacRTmYM@z=NMpuNC`$=&mf)g(WI}YZo#o(WURIxBv12w<qbB
zp6b$)<ni%w$c<~*hb61y#x?9PHa+@r3ufe%eE9M95msOVaB?^=O`sTK0wy0N{B^Qf
z$r?ZO%UVX|Sw;$A^?z{oG%&UWaYt+}i}V{o{a&KZp`e5@M7V6jYEUx(791n93V>eb
zt*cLQ69QPEYmnVN`!t}+sCvgC+ia6fg~E0{Koys0wdiX77zPY;FD_*A^-yQCa_R<?
z+VMw$>L}J}H8Q_3w@d}l?Qgip-(^2DT8j>JgtiNGl{U>L4?dv8rrX(2&=l46Yr#qD
zr^)H#e`~2IDLH}chz!yu^;GpJUfoh13M{b3Q*MO81^`dK6(G1>+LE68X9E;SA>hj(
zvpR`@dMSh;*7i)?L~OouKYV!gh;Qq0vuZ`W^zPUt<HXjh0xEe=n}z-bQt6%L<q3QR
zBE}0Wd$Be|wY$+cdj-bpa!a~(qu+!$VCme)9dS4v@-}PQZ|&G-fva)9OZswEoPU63
z6jZLz$Bng$wJA;;YilR!M8;F`kjwm~Fq@@jqpew4gnnH)?h0$zz|tuN@`6!F^FFK#
zsBxGIb9pNjzz0X5Rt`dm(YGlx3j6qIYHFf{KPt1og<5iwlKAiH1Nw^!2uE1yNphHB
z!fliBCskK~VY>aOOimA>0KPG3y{i<KD0%fs%5PT@78tw5p;(Xc3PVhX+8%Fkk{m2i
z8q2po^yd=jE=1$I9rn0oV%jsRf9P&a)$bKDeq*H`F4dJj1isxDq`13wQ_%6Ks+}GC
zdi!+CbEBK{K_dI@2gWq@bY7R?Bw>HP|Kf>vraojtokXFG^^J_CdFV-~;d#FzrVX_Q
z2nBoW&PUqWghb-`rGP8K-r|T+aU0gP<mQ@N_q8}>bE>n!f{5FX;;26j>D5P4f^DGv
z4yZgK&<$1xBLDAD@Pd^%%>BJxy7QKw(i=Vyn>V^ZY_5Wa$B|{q-Ul@v10sb=StFnh
z_ucS0oUYznYCY|Ec<a`Jmr&k}{=Vv=5gNhN+;CPs8WP;)R2T9a$3hyjdmL<|G*3Tq
znO85T+p?X+H1+d$>O%$9heS$08eV_@yl_>jvaAJZY+1Wp&lYDn{I)sWfNnGn;oba7
zd!9}=<N*2~g*YMu?QGek*RZdS&qvZRPYE2|66m81LA{3`)3~GHUpL66$`G{39e|P#
zL0`A3I-QQ0o&;me-`HLv0Sdj!LG!##%3^F&1z6%&orDR(i1r>(K#{(Cm=Pm1dd%4A
zOyWgfcpbSWx18QL&OaUB$OC9ZE?8l}%?T{iTFVgT-DEQw>4-+<JI=Qfak)G5sBSkn
z+LlKHlA&)j$^Lr(sq2r3ghrYwuVWLlc~nrKpfHg1@Et{cQSu{JPvpTe6f$i?z6aW0
zDd9u@zv;jzZy_BMzP8><=t@xSXv+`Mue4Nx(UU|#z1SKXa&#yITm6yiJm!UTWcqV^
z$DIKCk>`8gL;}O3!wST-3pJpL0%7(PAcRgtM2+;ezw>w5uz<aIu>N2qN^t<xSGo<(
zD;pt4eQqX!`yKCsf`T^KnTYEbMx~j~x+xOk&I>BtbC~&v$(B-?<Ae4V9b5uW3DzpV
zL>x$3!kTEC)2M`;&p!88K8X0cGWnXMnS?q=I6$@8f`N55e}MxHbna@P6%sJZpP_(Y
zBD#jn4?LlHJ<~B6;2#~Z&B~Cv%|T#S&4%Zigldfy6c5Bj`!<LVDgc=jp|a0sKm*ua
z|E1>vkS8V$+KJYulX==eu1;0I{0rKrS<MU_L3Cv*Syg3x$*2WdERRY5j3nmdV3gpS
zI~-?{;G}OQm}?UPoMwRF@ey4&T9LAEk`#3xPn;$aytvWUr%!~q?@vX@e99HO{_t1M
zLQYEQo7YMRrm1R&(VvPe?janM(M(6tf5G>6ZV>k_$JvADGNvcFaxpc&l&#|+aAS1#
zIoGrGH-&K8xEQ~w3MYUPac*fKCr2oN0Jt{>r>?J^ZDNg|s``oqM^h8fM)|^`{R8gw
zW=ia#fqI8wXp9%pcpt-c;ql!P;WSvV5nyR;xhdhb+Dq-y_<PA}DaUWqElb>*X*yG+
ztS>w+2R$yc{wny~z%$9)7W26CaBXNS&p(2Yy*E4An_2HHIOMuaJoP8J`9v%nywsIS
z#>)#JxWGh3+6ezd`KyZ?TdyRDii+|Bm!>XO%Wtt2@Gp$y+=10Zsyt5jN)1p&U_dDE
z0ur63#8t!9lJTZrC5evBpbH_-bo6=tHm_-$gm0DSY#eaA>?WIaGLQ$bU6TC_>8@XM
zKDfhO{}^t`XgHOl1+M{BqtH0*)ct$;1PC)I)F{?~SaxRhUdiN^u}R#6*mbw>A3wA^
zR%k!Tb%A=L<@2tT?Y_@YNp>V;^*A$A{4Lr>MnU$VGc&On0J7U<+;rIcm-xT;z~>5+
zy?nF##j6`&B{m=CwuH%}rm$!2627YMad2x6f+|91pf16r?#Zf?%E;0?;~Z65>7B@e
zZ~EJ{_;QGhGvSdEn+o*}eUIexGlkl3><JkXUzYZW9qnUB9QRV5`Hx;S0UbS8P}c6P
zOzDzfn~&<m&DCLUMplytWKj75$C*}_8@D}IUM<kmmBHK#6@HWU|M;=%eoeLE!~%SJ
z!k2YPT(tTYYNAo~aWymhtw{Q(prjTnQvw9jzG5}Y5C9RPB7Fz_fb?}D&KwlNYOq9(
zm#55p;;7YW?yjj4&=9RdU9^S%j7L$8I5Ix5>lWxEgjC%53s5sz@eB$Z2#~ay9=pZI
z+!eLRw*#y<Qs#o!eY~65yoaszzghsv-uwDO;_>_sLBwZ8M+u|=wjD~Hm0hj`ol^q`
zJp)eiC<%huC!~+);^#i2vB+A%F$m}T7yrSt1rx%a?PzWNeO1Ev-b~?G0dWLnHI;df
z8CT0@?|wUPW=~m<&-Hu@i@BmXM?>8jXBF*<&GF`?&KyIJp+@PZKltb2#J7cTVMe5Z
zh>Sgi6~Zb@wJN}K>(_G5>4oRk$lSL}o2{0)4AiJJQhMO~azUw9IMH|{K{R${M8H}#
z$*b4-dic4<hbMLn)bzR<n!`_$G_E!2RNz>3XW#Gn&QWP2>{+YS?-v}RSD`d9$hH)a
zp6@qPF?!C-&YrXP*7(Qw<KWhTp+Xy;3Pw8Lp6#d<QbYz90hucP^9KUc^qwO0wPPCx
z_lYY7EvgNe@FCR}Q;J`2?r9gi9Lmot1D$HpN3Vq2s>dCV1w%+*YExRpQq{{Y3+pFW
z5P60q%HPpXX2K`-@4NGd=2sdFe9*V~_}e8#mNH!U8E3Ksd<h%+QF$f>(Fk@24>;Gl
z4Btt)@hUxif(m-ct#Kz;8{i28E`L+0)!GJ32ekdVO%ea*RKK`<U<OSw8EaD0awgZ(
zOZV^d6p<ogdMCC=H=>qJZqGl(YAah=SqVP;>1Z#MvB3=c@$=3JRn>Ur&uzQ+p^-D?
zD;s3PeEj?gus@t!mHYnZG+geq4)zkFU-jZHl$7K(@{ggLE5T<6bS{(Rw`f<iZxjqh
z56-n1UslRKKIrQFMug<O2v~6!RfmuY1BpeEMI*4vt)^qOE&TTDiWM>4Pam+dTJb`L
zC9<C>AhwrZpL`H|1$bPXHYMW9xJpjK5JlS9n>#~S)aeUf!K4U4LVLmn{}AS@i(fv{
zsZlFzgz@uN@&3fIi%q{^&au16@L76oaB#2|YZm`kJ)EUUw6ViH4T3D>tVcv<-Roam
z$UYMB5X)k^YmT3o7I`$4DeCb9Zq0W|WWdLf{)+0w=y?qR$(=f<t)ph;YYSnFm+bNv
zgTgiU4+Ik*+dr~_W}lRx(;1eZ7n$*;mE$9Wb3LM#-<z)Pjp_lB#1J&=^W~E~ES&{N
znWZWxHxlstZ^nJ#_%XS?gwg#~FB^dAOXc}`aF^3!#z2m$<t0R2&$KpDKgmRG#OoTF
zpt<X__Vm=Sq2P!9t%DFI<({7CDHG=0($O{X=<fLV=-a+4NLft4VtWzU_qht{#ES4-
z+o3y*WF|$$TTHL*Z+EF<6F*JCY0ICVpP#uskC$XZDfiIE%F0etT#rIuU;iz{s~fuf
zZtd}1JyB@tcJGJJtB>GYb~h)-E%~Y)k3JWqynV~M<|#%9=@7<m>Vv+^%c76N!^6I?
zliDH)VK`hHXoOA5WrZl(EcpQW`}E5v@kZ|rF?CnAJ#9ag;}Ynnf`;MSM?d<0OFbNP
zQEk<qz5Lc~Qn0U&R*1P_$5ju_G6`ik`Mkr=rGiNn$gXQOPG5Y&t4{|CLev|Ud$`v^
z@E?C$9JH{}=2-vXqgz#SM)9luHH+ZkmL8|d%ky@<P6(1`F_FTCBpEqrh{IO{Hfw0w
zBHkj<YF(QN(%;9bHwrh}0~y-mb3(+ZFPgp+J(DYpkyuUWs<m|^fOJyOCp~?m({DJG
zUN7Z5{1uy@&yrQ-aP*|ah#CI5S~-Q=C9>ZMc!Co#WV-_)Duz}B(%uk|e*oVD+Aj>-
z_u}uE2%Qvu_#ikLzz7e#L&>BDTwWD&)o|r}@|yCqsx&B|8}Yv-R~`3Mu_YB}MBYj1
z*(ZQLQztHvkV$`>+*pGz@jc|EQucvUPPfZzQZBsL7;he%+BX{Vxmgqvd6{A#_U74P
z9Hv%9!&q^(s$jt5<=>A;4)_h}_4?0+KaG&Ll}qZ?N$uE@R<v$TS=@l0Sn8|Y${f!J
zilZ9xU-&O9PSvv|{9euW79@k&KPpHm6jyq^7}L)J%ceK$dsG#q(WYlXTuYRIldA%4
z`^q@Cc<hoi8)M2I(q7Z~?6;QOXIIfWY;*Y@M{57}anH)1p{#?=ftlF>n~a#GdeP;J
zhM-3Q@B7mB#exoZ@;aRlB%+s=;8b{yF%_IWJhnGWmOUHn`WDPyelhjMf<Eh!g%oy}
zuawI=^VW4a*Itq>%O2%fPf6d1kCXIV*6`Ti2f?x|?K$Jioa<y&FC*}dsjJJ48e48a
zlek^-QNACYgfe*8a`E9r-95j%flO1N+X|b7m31Z3RtoxdBR-qoWu~c*IHyyfmc|p>
zCi4x#>?EnclfDL-UjzRU4mFNU%e%Un6OQwt7nK5Gkr`Lkn)jPOG>58DKGN17kcT&Z
z)vNZ|3KU?;6nCpAz5cwMG@?TZdy*~gQ$D~t94g!Qa`f((Pgz=8S})(o+Hk(|b$=Nq
z=_%KY12K~JaW9Wv9)c3WWF#`7F+NkIVfTZuiDrxOPM>So|KaH?!=mcKb~hm*HG@bg
zHMA1a(lvl6A=0f3B`w_`pbjMp2m(?=_X|jusC0{TH%NEGS$y9)=Qmt1d-i(jj%Pg?
zq9Q%dNk9cm9iDp&(WGzu?pUC|H0fRIS6=O*hJIAfEu2wD;zdQ%<DUFW`+F~B^ZDz-
zMD0X_1KLt6b91{PMiIt8gM;B%vos_aRzxC3)*)@oPIw{!Taw^QND1=qnrlyK#JGQO
z^CRy5;OFuhu|pVvP>$Sy;n%NU^B~HgKzXMFRkoOp?Ep<1cikQ=J~H*G&lnEP>J{V;
z=$(PC*LV7GGLZSEbX-jIT=Z9|ni4?-`lYHeQI=7Clv1U1VMiv79t=#zM39+(6I^1-
z(XFXiuI&2)Bj)e&e+0Ijv7DjM4%z1fq$@B?s2%Xs2R>0(=au<A7KAFH_%>n=+qJKo
zYLw_-c&>cn!KEJ;eAbv$+B(|-ryN$Cmb|iL<ga505BvBa!D&527z#Y}C59ltANhOp
zqI$1HhsjEE3-fK(Fa1?iR5}9pduP_WJkbPH7ant~6-mRzZ>5u7fB3Q>$}Xux_g#I1
zv0gWb<<<GUlE#5LGNqi={LVZ~$q?*p|Juju@;ZT`(^bs4+}7Q;d~Ac!KT8M-WEB$9
z*3$g*^7)2yjhxxg+!!Nf=bndxMu;#qDIW5`+S}V(;g=O2Z^l414X!#l+JUX;7D!n4
zpo?d&JcvdVe*)s3RC#5FQj{7NWO`mGq<98AmLcjve0(ZwgLjdc3JD!PUUFQI4@@Vd
zDW9u|O3Qgm`4}D5A<7cNH%iAQLRZ;fgX#~WKV=iO4-mus?uTVbtRuSi=%6&Jgi!RL
zRbL8sZI#{+*uVtik(B-F@6bDj1l_g|Y5@Ok5UXHs^Ro%198Ga>JQX1n!mgw@O6xpw
zf|PrDEG6Z(=jX$ccZL5jOfE;KP=ELGnPY?4&S=MiU&Bm1?A2uMn+cPXmGWM~>#zOC
zDRr9vjG9YgTzmO`DcaYp2E7%ppD}E7U#eCx^{q`thirc%Fe?V&UDWdW=6L16V)|hg
zm5Brg+k^3pQ*pS2fZ0#9j)74LkC3053N12#+D0|uUMP$5(?Ld&fjOtiLRkkH#`%Xp
zg-T`ckq7oo&8lc7+~(a+92ZmAfQNgX<YLn7u*w#JU%MQVX}4UMnpgmuwQ4SvWL4g)
z5%saLu>t%l2HC0c;n!{^UX4m%Y;0^Xx*GO-n=;X=d`yR+*WCZblT&sGebVrl*T=4M
zM|o<Q(Z~0Q7U7M2K56VXd{l#c9%nt|-EdVQLncRoNaT3Drw1xxZk$+AOpj%k((e*n
zMcN3(1&+5~szK-!<NsdDDyOxj$FV4b?^vkqmNnPK&LRPku66$}BS^(b`myEcsy(6o
zoqvWx`ZdQ<4eO@9`&pE>!X-__|MYfe8rEy@O*}77HwqL@{W%#cv7l8ihp1+5gWo<9
z=<Uf;m!pS$^|3xZCj}p-y!JP7T8SCXgFS}(ny$Ua@GHqae!_e^xRw@hg>kl7C>))5
z{jRdk>d7bneIo8yGr0biq5|6(8B<gp6&hM|8|sDMH&jMC67~|)0w2aRxJLMgWu~b|
z1FG(?U+1kBZDVN&G&ASF7lA^0arPCRndYT&BndRo*75E2&l3oqsjGWDjW)+>7FP&e
zO#uDs$zN34nrkB|gt1k;gP0z?PtPW(#1_LDjh4a>G^k6}?)=QMrqP_xtsWdl9LB~7
zR~<e6{8_FxB4fC$%_@KZ3dcn<aqN5xygvuyW7az7_7q4^X@I@?uU!Rk_UPfmXRaOH
zftnO85BSO8yyOWZxRoC3!#(t?vgqdq2DJ=OKGO1ir=JMQ>Q@Xi7Z-^-ed=v!q>jOM
zh+pdX-y7Y-{Pha~))vE=uL1%iN3WNyKm39(fE3}XvfI#+x4x~~m}97AuEnr@Oh=kC
zaCs>0P%Wy?t{fmqJz^@b6~^Q-<=w`H>^qHPX8)LaJ?`UJ`%NuJeW^xH!@^CwXo?ix
zibfU{kJKsky(%)SJZh{-74G!4uqrBEYe9&?;49;sNB2Q17Xr;%kT|!ov$J~)Fjgru
z&39WKy;E6r2(olvxN>vhe61mW5b&)1q7blM{@8&dpqzD-NfeZ_C-p490^XJ#BoOir
z*;M$mi^UV}W5Eg6WamiX4BhSR95yE^!xWuno!NnBg#SIb6+2O`Wd33BqkGe1^R`PU
zt#*zNG5t3{`+6UjSeXSME|g@<3_X%{O2L6b)DXDg)z#G%jQJ&`5NzNSQ<jJgZy31m
zy#Yby78V`QCvN`o1a2ABtFW7VLBlt<xAkNv`<##Pb9Q3RGD7az@j@Abq4VtDex$&(
zmmionWkt5Rl%rSk2XrrXE>L|OQ+4k5NZevs88Kz~`C)p;DEs*d&v?h5Z!hs9JXcd*
z8y<)yx843D(9&XTk|NsRr>~IUks4u{_5J&Alttcj8m~NV51SIi$bb5=j?9}b{DqNo
z;`<R?C{Q_ut%Y&kEeJ&jPx2$NE*PrCg^q|mDm0h`1kZNMaF889xZ-S&dDG`K<rG~C
z7I@FB=#pY%DOewC3IGLc3G`z-2<7-xS^4$OOv3l?f|3_wA;zxP1q+H9yL`<FDzNe~
zUui_5Ao?n5$mY^0*l76!pKaSJ{O>kD@;Cl*<iv3}nor0BzvQtV*IQSr+!;7JEE9KV
z`#D1FRrwh5+K28Dn&UfM9O7~;4*v@@`Nomx0$fEC#zzI5f(6@pNEX*)Dui&(O|QBO
za>YYbYs5fs3MC}-?R+<>q~Y|udUTZN+FKP`YT7#UyCf>favQx`|M=O>;&5}!!*o1`
zm;Szp&Bk-WOm83UYNKKUaRjj<D*0ix7+WR7QTH$P?XZZjy{{ZjkMj+$Y=>>BlG{Z&
z>)-19*WS~2d_q1@3|L#nt5=a9NH>9sH)lcAr=C=uUU==Ps&4t};aMp(tNUy6jB!2@
ztS=ad!D~iCvnUeh62}vY;ol)gy|TZpKYOX%SB?%jRt9RJ+X<;2Iy!N|Xf9}$JfVQ&
zX>iY26usmbBSZ}IG7gmPx|^MoLlxd@@lT&Ct_zyAC&7^28StZB<d;qk()562M85D9
z97RmUFui>I3UypPB~3n{QXz^fn0MWI#ne<TN}ZsReFQO{otwnX)~CrK@8Px{CJNgd
zg#3_LzBdr#$>h@4qvv<iZ{4ja-_mEK{^wtaI{*0Ly^-%)ix6*K+J7%MBRovR8)ln`
zl<LfDhUMSIaT|O~5MFAwU#^SxQJ^N4q^-5I&~BkQ9uZU4jnuooy*&E$>)Sr*3s?W7
zxFWS023?AlldZPx$f=|K%B7tZoO#P7WG`pK`MMgNn4p3(p0V>E<hH@87~$fYfe8h?
zWPnlFIJi4)(Ya=0XJy#!720|OhfD%Cv6E1k$B602Uq({!ixd8g>{|L`mX*~{Xk9e6
zd1Q~~sI}4}Ll#!i)PiRC36htvkq!2F%BF(nPIA{@fux?~y9%fGp$L+cjeAs(%J+#)
zD_V&dArfrlPj;>B#}r!CeQ1V*=qqjLaXNo1<FhifKVLTy!i3a}YOaPD?-%)7uV(9<
zA6$N~UvtKr-G&W>N5eihp-MlsfedOFQSUR^g1b|H&X;VmUEAkcLJWOY%W<i)*I2F;
zS?RuSX!PPmfkvg;pBjhif3@7|(g(3u-$?0r-ziQ%Pv+q)b`6*+v*<F)&A5mY4{q*X
z0NXQ0+&2FV)zP!~S1yJZ=oT7Q5aLSTC;fhhUENdS>(<{!O$)!3Fp3$3|C8&+6|b3%
zsYeoC7kWj-%E*!1AGx@=xi4QhS@tJN2l&$?{8=9hV=$QcryxY>1}b{w|Gn~hJg|QF
zvnzy#`2DAIOg6_;TF7(TQfqzfEU|o~(9VJB#G4&U#*K!7Ev3aQ6ep0C_R!S3!?N6d
zfHD3wZ>H8?US<jS9ug}YHySNxj-M0NwQfDY!LEqWga|Zii#A)2dFPFJ9y|KD%y7Sc
zMD0(R@BwWQ*Jo+PejrZCD}(C6QvV1RN^u;%x0W*^fbfnNI;v(aGX~Ne%Pl72NUo0+
z4YnK3k4cNy_o)sfuSeoj{pUu}&A&b{Vblw{z$!F}L=V>QuMJ1P_A!~&owMh-H=56~
zPBhFImcQv~KMj{Yr9&t7sE+2<Yi>@OR@siY9zXM#jHP&srmuVRX;!E|t#AY&M`v%(
z$nmEeUt302-$NpQ+9ule<G;_-WaW==oD5FvPX#S3R~OUqe&xibHu@_zRQ6+(LqE>;
z%kwflE`S2iVVB@Q5Ym-lo1Q>QSy@>OaYiA?k&XwggK^u^`csz{Bc@JE@qWqJ<&TT+
z9+T7RSTgpUn3E@A6b-pySP@j;s&t%8`tuSl;5uV((c?nCl!NHibBB~47EDo~dg!6^
z3q5h5vucL*Hx5w^cn!_Fe+;i*G#Xs>aJ+c3eOcpfPp&3;*&idmt(NGNZeo?|vScCe
zyze5xW<Fk}YUe7?iyNU<J2xC&Oc!u?JiUFH(r`ZSA+WsoAZ+x#0oDrx9h2o!!yNmr
zVO5<v{-4wX0qto_(*do)N-R-z`Ue$8Jn=hIgtQwN0&jFR1(biQjg;zJn-(J?UD|Yd
zkFFl3ak7Y3%OUDWtpW+39zFxV?~^fRcV%jt^?EQ{0XE<hRDuQVgibj)ZdoYEy|A;}
zn1Iq+aE1}dY92Saubogbd)Tc1?RbZJ(1=Im6sxbUsh%vO^5+!Clizr-+~id=hv=9X
z9~_yY2XGshMtSP}yf-RZpIDS7M9CR3O{iUL!QuOVy{WtP1Ok6;(@)%EHMU}`p^&9j
zg~&WpU1$5>O5u~zts3eE4!q25+<YHkN7sM-vl&XuzmQRrHHkQ1ci$vWK1?<wn_7L9
z!L$Ciap9PT$X@;~=6cieG1$<!Ju>Ylxe|&`KV|=jwog-_S0>tGL<3sg_UB?_%XZWJ
zIM-TGy#AlticSuOBS75}uEHQ-*&Xe+K1^sE9gdDicjjy4jjZbu^b&jP6jb~*=Px3!
zpU9}{Ft_O_NC|#l()942^RV-KE+_PpgFad`hk3wsR#sMDTWjky1U2BcHrn$i=ztuT
z4<Nd>W?N8GM59?GxgN(5zT{8H#`+O#V9q=rTjR1K{6Rsi=jn{FUmH$fBKPP!#rOd>
z8;Lo)<E1*0BLII0<mv|oJNX@dnOYz-dGp_x7BK-!gNgK~C)Vd`9|o)^x%eOHafU@>
z2G<Ni+nsBS&XJ3cng|-U6@PzfSh3vT1(;-FB*1f~^4c+Z&<-`^iiLna^$yq6_eLuR
z;~AB0R<1k#V^OaCjA+$<{=C2S(k*G&{$jjz2-v8*=zV5WCuWpW{nCG}U6?ZW%>Ri#
z;_drJfByTgQh+sl#B6npKS*0&p~fSR@6Y<7>UgFq`&w^<clX`(xMF?2<?W17-K|0q
zoF8r2yl$mGxS&8cfUDjO5|9sKQJ#ST#uFaPvPKT+u|A67YxRx(@<sbKf5Hpc8#?sF
z(i9GqlE`oIG@Gld?{?<}1EN-%nOR;B7&jFGQSTOp>HfoUWn3_H_D*q3g-%<X1h$iG
zT5?t~AU-uV2JgQ|knirLM0?npg!M8dm+zhEz|t*`kS>qX8#^t*0W~#Bg~&)JT!<+6
z1%S6gA2(1y(^!E#SLWXxN3PoUqa^kRH$|qs^H|ozZqp_@*PeWD>nzgYp1&;a7_D@h
zc$VU~m~i=~^bP&P2|6{UjROzUrp5M96jM!BRWA5l5)0pON`UNXI#R(ENf?IutRH#%
z(y(NHm(3F);M^5@{U?(eR&NlG77pzuVnuUYG$o^X?k0CoqZ$E1dJn$qcLT)6a&;v@
zjJ0-42ryx1&8lQax0unSadaa3Ek`E3gPV7%!Q7X16@GpXYmV|R+#*_@{H%_B(juzI
z1RJW;eg~W6v=mg)pcsf@Zo1uzir^dm4LS!QB8@Lg40obU8}1{RGahKNB?erqG?2Lg
z_@nO0Vl_GzJO0_xX!_l|G<Qqb1Rn73)mzc~V+2c$=iyPdoSCVp%a<Eu()JU$PL(7A
z&Fm{$#ciGw>@3T-%<dC>^_)1Yx>|knDGlYS0Cq&~0eBpOU3`+u8GN_7_(V~$l?OIT
z4TB&!et0+dKmnr3y#PN>`_aw;Q8PUvgBbf>K9XR09Fu1*oFsl{Km>y2$X;l~ce(j^
zd)Mwbd#Xs*MEVj!C)dWov*pIR+w;aR5lk?cZ!W)uN?(>Fb4UbxHxs761CG2<HK#{K
zlxE+Q`$dvx^Z_)G+RmX1`pLrks8goR-Rp<jQw<}q{<LnRieDPDe%;oBEV(^H%4fY%
zljqNUXXqP_yqk_&L`)-2Y+9ltq33x=Vya6gc`@`N=XB#9TqJ+>cs2r>XRAl`qEhcM
zNl4Cbl_DH672^Nq^Kf#Gu{{=ug+T;GyU&m~3d?-0Uz=`V;*V%e#%d}F6;*Zh61S3H
z*j7J72#cVXGz1QhjWS+n#2Nx-li`yBS0NmxSa9}uPEJGqRe%ePQ7U#|BK`Z%iW#4&
zc5-9CeVfF?y>?jafVWZ;`GFvpOUe9UZYmu!%CqY^zHoiqroB*nKa9Y>f|?4F(f6up
zllN~hF(nO@qm6tP$lY_Nd*vfFf4$CTAGcu>gOEqM8!!CbZ=CQL(Kf@{rmfM&4Cd!3
z+25me4h>!BY$Bx3$HnHicBLbq&|mlLEbd;@3hnL@`S!43u1HdsVvGS~z$8GJeG5q;
zDf(AbNN9-hF^K02vkHR|@&To{^h(AszAnphKg!iN|22XBwjFF`DuHAUF2Q<k*g%7k
zi_~i5i`}t$q_NqPxK$GObEVBb{J<d_e)CvuRbM^`^jGq(AP<0e<!hC@589Ca51yeV
z49dx0qp4_U#tg&fs>S`X+ZUJuCUlBi(o+<7{KKl}I@fs`7g{rmrX4Tdyu4O$^xJVS
zEv!byE%swC-Cg!M@O(W|S+rv;(YQKrI5`K*cZ;5M!q`7$7^_JMQ?yymycRRNq^V=Y
z+T-!pk`+GQ`8&+(K##npS*1`IFstB3MVu-`XR>3>J_%+QF7ibr(yq$>*94Z%uEE|g
zK@opf$+rOxeoAX0fYn+^mw_Mn>dK#<kgDijom$Tpp1UwXxR#%hoX{OpVhEiZGh46e
zUU#YT%ZeYoQp=mXjb$^oBbtK9KqO_3dA!m{Zv$u`VUAEWv5CYirkTuO>smINrR6g6
z;^zhZb>gXkk%ik*AFYut0YYrexke&!xC$LPf>EsGjVPKo6xMRoN2Hx&9xQA|k|_M<
z`S*S)6n%v5CQ>lCpy3Gnqw2~Ao~O&EA`Uz7R#0v|U?Qx`$M$d}Nsp?)=&TRr)4oaV
zVFM{A1f<(fnoc&1(D$9={o9<Tg!f7J$q7H3Vg+t^K+XTjPWTzbw$!6dpm*rDn}bhH
zM0|s?H!GV;GhzvMEVk#>e|vH~LP!{T*TA^IC8t+D)q5w@l$n`XvXkqJt`18*zl&l?
z(4fM-irPwA(wuz?&&}-6j}dlMX>$-1IJh+FvGKc9DpLF&d&F14t@nzGM8-w9P=$AI
z$UX4+@F4>3Uy8^n<W0;O+^`mKeH_V;wATPq4+jv>kx2R2g#=u6kmHIhxY>zF<Rux_
z;&m|`%H@gTxt1N()8m$ox;!O@a`peC)YN~$SsU=GdTn%vQ)>5A6c;3v*&77r04T_t
zgwj>(7NXVI&suZUYEDn-3#!Jep8a(#@a{S}dD|zo*Bz>}yGw7VV;=u-8Q9r`?6w(v
z;$M6D1;ob1dD<xquN`W0C~x8j(L;f}@w}o%ur=CiLf_nU0XEe{3LHr8tmgupFdW3b
zoJP;DL{t!}j_@Xl4*^6D1;JLe^WTXjZ@q=0-q!FXs8B+l`WYv_3L3@A>gr^n^zL(N
z*m)&QsjQhrZ6D~!)}`E4%^MQQ9_F503RptXEQH3n@6qfx-_rlSAf_mz+9<5F@6C5?
zC;cXjRgYe#|Hm<d!bc|Q6#072Zlw|gr?13lraHN8^><`QiNe?HQY}r&{kz_4LomU^
z5TF^5(79y)ZsS6ZeO}jbx?dtqa<D7F+!{gSGAzZ37cA?^K%^X9e~t+$&<xs7G~bH_
zHUD!;iiryH7|)p)eX;`-wk|lLcsFmzR}$fmYj!?%nS~8Gd=+st8Rh*=a5v=d;7s3B
z#yoH9KH3j~T9=|3K@G@ARu#gAHeHH$>`K(ALYf;KeNQ-`c8>)nMG%?W;y0ERZ*M!p
z^TzrPF%jq?zt`7)Ie$x8%8f~y{~4|J^jqSdOcK^5@8r_u^q-+FUek-E0DoKtw%G5k
z4^}eUw$nduic=1IIA}7a@yBy1ot%`mcaqi$TZv=A@AMrt9=*10_<vdehQ;}LGn0-}
z_JJ7W+byqO)F?Ml%%5Q~3dqD?kAuOMpuZx4DyBn_0=mM1tc<_SvC}|oqjr@HLhY6Y
zOgrbzHHGblNYZyhs@wgJFYYqRI?BouK0{J4^Q%X*m~j1|M)g1-`&qskWG~<keMv63
z^CRH*>?=8+JKrL^SXba)y(35^h4V>?X;U<{_}sAgxe?#LJ^ShH&6CaeIR*=VUnCQO
zr@+w7NIIG6zS^%*-pCdCHjo=Y%BR=tXSw<19}VLy((j;<v4Hhl3eap@R#;smfD0+B
zBU-8VKLGxU07~?(_(!83SMyEh(-URowNNw&eVuRzdcfjOeiIKlFd7%YhQ!X#z5b#`
zk4tTD*A-7|<389jRNf8nfZ;GA24>o@&l<jf@TC5q{<J5x&?L>HV+uaY3F(m%v`nvj
z>y{icuW&8M&sXzd7+iD3c#sJq15Q`p#H&;<_4c4s?oyG`i@IJJZub>B)h-6cJ$szu
zlV5bcPD;3WZxSn$HK(h>RD&XeH<c;yS5i^Zz46dHd1iVQjAk+(qquiSuDCo`2vA(s
zSpciS`SCZqo(&=OEg_*}Nj^vh;Oh7a8amiAb$IGWIPiwo<zv%Bh@{u}sQ<*F6T`-l
z&&f%CT8o}~mjr8MP0|tlL{DP)y<dS`oIcZrMxP%%YSk0?5yM=luU}<t+}wIvFf%F`
za41wJJ3|MPaFB{GHX0^M%$-qut*=~jzK%w>3!tgdABrE(UJ!kd7PR<x?A_42(9|&N
zDm${~sP3q(uYaluwcdiB>%TDADD5m#T^E*fH1K<qo$0y0zTL34zeP{DTdw$x&D+iD
z%WVB~wu3%|On^EKs&K$Y=aAPX%oaXSrBfjEJgfd|Qs^D_N++@88EhF5?makPfnk+{
zy6xwl?^!W!##+7?jFY7G_iFNB8=AXoqx3U}Phi4vkK~aWh<(`S{~un%c7Ca)7y@Y8
zUE00&#C2qTKH9({`}=dJ+I|!QW4|u!rTS*-%++pYw=9Rj+eX*i+%@5RH>|UBxTH66
znwMAF5v3WW{P_!Nnl)nH3}rCn(WryYwcE^gOEU2}j%)^_l$76`OrTuyqIW_1K8rHB
zw*aYx=)BrN>aF-8vxr-g`f)i}fy0Nr;h?+pqcsf{l6&zYdca!H9}l8>?HXDUhG7AG
zG$-wn00~@!s7ZRG;%mffBEsW>(|W!?)Vwxaub!+P^C?GzW|*%v5$mQrC29)$V?*>Y
zSY0DUcF|?3UZ=O{4JYo5S4s|2j7TO`e;vi@lit%(Th}R)q|dH47j<4%+Kw2luo}tF
zzOSyJtx%N0?w#C-iusscws)Ms(CBA*I=-#wd5nG?!v8!|e0Xd`DBJzN)iQ%2kk1qF
z49DLI+nPu4aRX5cB{rA{O@h(%z#O{uhP$Mh7OW^?1Ma*YoWO^IwZGkw^&JjX`#;0`
zmyUmuvJiWc{rnHn%qhg6BxCY>K|1v*y9hFFiEZ7g6HGPYH!)SgK*jYCi!rL~j`U@v
z%YR%CKU6fj`)d@pb;gnshRW`~8+`fbz}NX-#%RSe-6-v6$dH6o?rPt|_d@H?k1-0A
zsNeGUw%^(+Dt52##0ln)U+sU4<n8l4tgjEcaPhq;Bqo@78W=3XqgPY=ITdca5XY`$
zgHGW<Ac~i|sfh@IYDNIER(Y)yPzdOK;mEQCGExR9u$6o&PvgP_svfqf5dJG3c}w7+
z$&?oB3&2qByqB4p5n|tV(ve-2ltSQr*9H9uAK%Ti5<u>^comI4XxhBe9&%r9ZRc)x
z+`JM?w>RtmyQdI4J|!($v5-c@dp367usvKtTfVceARd~FTz|8;yyAV4@Lm0hT{kK5
z(0#QwhHVR9CxmUs3bS&$;w3#xNR5mj7D1aBj9kQ`xY7UDg|?4xbz%QZ)nc1){Bqbe
z7d7e?vG$ii(YiPQ6<ruJ{phybk3pZH6?fUe&^&HgzT5j2mg6h}pfN$#7so5A<Zm1$
zTESaj^h?A4@pV|Z$FL5yn7KPaF#D(JqhGo6>!!C<#D#c)1VzP$T3TO$a=?CFNA~cT
z1S`X_iE}N;op94%uhP5a`SG#;*yY5hp8`3@64s3+6s&nMI;h7fE|Vx*mmE^M@QPX;
zl93cYpT_IO@^|vaWp+FM0uLUD@upi83@WK7uP*q1gQRA@)tu2swt){6ar)a%cm^NR
zwV{w^5f(n;gFNLke1{=WX{n&@evU?&oIztg;GJToT`JGc%4XrXd7_{f+UHbS8{}pY
ze64!+^|TT1@G-8(@mWh2$+y+1Y=*xkqo3yI7f5yjE9rNFR_^{<K$Vg;c>IyP$5LY7
z+@?;yErL2q68rG0RJR~JRl=<^BmHID5uH%M#J}jJ%e4fTfU0Y&tM*zpMZ6Sgr(f5S
z7p`jwWznQ2j@mS?dkfg_8NX}MAn=CF4Fd@;tBB#oi2a+v9ryx_v1Z*OdH16`%>Nba
z3hB|^aLdq3eEY_dFUrxbQ=pab8^h`fqQ*K6Fnyw`sTO~!oHLlW{zYk=dkdVb)}Qbj
zH1j!}t{eub*D)AYdMgy+FJc}W7fWoTz<$}HbukfY*KPN$Ty}a-*8*UmSOuiai^;1m
z`h$I(oDYZ05u75tRR9$KElgN@)~o&YW3oi)E>JqGInA~=znylZC4N8e!<JoGy&gBA
zWOwQ1<FEWWPuGjw*EZR2>J`=kY|7<*6ZMeHZ|mmIcY|)8OHhkqi5cqAg()F0lAN5%
z;Q^`ok^oBMMgHc3O-fY1$AKKWfDbQUR+tEe)N<J-k07c}<ff|myRnCb{eVg+&WYI@
zD;C31uNV1$^V+LdN@vlxk$SWK8`rVqOI7yu=mU-mOYf&%eCEV($Da)dm~ZCzMC
zC17z4hmY@#!d4p{snQ1E5Sms;cTx9nP5G-j=T=w!85<Sq`#(s86@L5L<k{S}ZBZ}%
zXU_HS8R8FM1sP2J_ul<Xnes7WtutbeD>U-X8{w=`Yhld%MgY$<WYtOP$J<ViKx1g7
zy4XR6q9(GT3}2#$s`Ep6L~agbs;X<ze^#Y>wd=06cOlr!Y)7Le>`io!rC~aFS{5PI
zMpkXsny*n>M<1|e`K|pmDHIOOgs!B(+5VrN=)bqPF3^t#*VyluBXo&>^?4p}?pE)H
zMMieuKm%k>@4_P@{#{T0Qje`rw+{GwS&iX1JGNtpID6GbR)8e~f5V+K=tT}m=wK8T
z6j{;jcn}BIgNv&r{Xvv}-?ZE3V;{hknAN!KiE!TGihmMaEDrrxneo@>=;G=7QW~Ou
zRsAyIQ|kee<?OBa_WjeO)nsN2dl`7AUxDgO4VmdC%I#Q^*t5K!c#`;vT}Y^c8_s<z
z@Ca}l;&Q*wi*(k_NXqnQPcMHM{rKg*j3ehpGDL`GSGov6%WdznD&IcaJ_d)Dw)KX(
zu>0C%Lk8KoOIO+*dQb)775oklyDh;68?SuILE%SIwCZ}sLpCE)qyQic8=zM*TN_zu
z(m&l)ggr;(e=j_UXmevt*8HQK&w$5)dn+|ye`#1`I$*u5vokDidfP)ZXMb2X#d}u|
z>{}@Qkl2wBUG`_j{i1fTZgF^6RZ__p<1P^Z_Y)vB!k!C2#t;3OwQ8Y8Vbaucx}ZRV
z>&Dky*wC`G7x$*b&JUbYz^QOT2Lp&ybV`*$(Hco=hg@`6GBYx>g@;v#tRhB3LKG9$
z@8e72ULpV{5eU@cSFPoIJ{A@Vb(29xbC-h7M9z|D2`p?Zlp`JvkD?*NME0+JbbRKc
zq}#{BSO0l^cN?fTmdUjF$=>u|KPo88KB=<8J^JLI%5Fl>pp|NT)zx>m|Bih)UJZp9
zZW{;>`CdcGCNnde++Nc;3Dc&x31JMu-vycK%pr(E0e<;IQp3rl{RM@>YU(?@I$!V#
z34rtyTUA%BYr+C3rK50H$9oKHZ1U5~%A7$OjX$PXlB}DOGS0B0#3j%^OkaIq$5<%)
zWK>@}5Q7OZ_$rS~x2U-3HzS0|z{&sl)Tj&H0>y6oZzO$r_HJaXr=#ys5B|5&I7t3L
zC{gA&iLOb&E}aP5kj-d9gS1b7H`xql^xdUTO>?4N>;so>ZSI>y7d;3}v|G}M{kPq%
zd%u7r(ngZQainsJmsj$1z8Fk1)p+%t8g!4Hoz{0X*=wsA<OAO&@$1^}c}Td)BAJ$I
zCci&XRlRGh_JBn>`ixF#>juaU<7?GQ;<xA&g?6@OKE6epl<ILMZrBl^R$axU@d9lH
zJNv$m*^#}D?FkAYb**_IHT`T<MlYBkHz%>LuKh0mUBaNq)mzWQFgwK$Td8gFK=FR}
z#iBf38|P*Ad8yy|HT9yis7H9c+N)O@_l`&G4Xf_b%*5`@V^NW$*wh^3D@RMXYb;VO
z9(wy7tq*8_v}wB9_?RL#xBIe5{#?y}sxVF`cGLH=At0XHaDVE__AzY0>+<eMvFFB>
zk~fk%BDj8r9W(zfm0N6P-HDY+YQ0adx7AvZo+_EsyA7H9f+8!UT>=of*M;Dva_h$p
z$|kW*HKGv+@%RyPaK}zJ6xc!A5-<py2^2dUHEoGZ3K~ZRC!RlV-Q%@pwR=>^)S&Gv
zd_8*?Up^lp`)SvQ&%~HT`5y~^6sV3bZ-g-7VMvC+m3ZWZ|4L;HHm|HgOO7OMcvYCA
z1;y8y#ah>YVh4HnsV71|VRIU^`PG?%yQTtGuG~DFR|6y6nxdjbQ>GqI>{e_E3pzKO
zHuR+ZmLuNg^nEKnbgH*7D%hZ+_uFdwEU#pA>+oP@bmv!ggT>t?dQ~{#tJdspZKZWF
z)5wo>;>*~9n&#PqYZPLfuzZ-tK`$J{RKzWC$&MNJraR?{jkStO_pi6~NS7Nj0zRNg
z+v3OtHk(LtbN6IvND(1?cxMa=AcZUc={e^E*yH=|?l$iKE|}~nUJ=0GO)uO5mGl43
zI9wZc302DwR-v2p(bV{tc-ZGpXGLLh@5;KeJftb_-MObbbDEB^@V)r4!r-&)SrE5N
z|Ip#Sno@Vfn4^*N$>`jn)Nj%7UW$|j|1|scO88;&Mp13P_ef#XbWg|G+5W)o&&S(*
zUX$&mrjq;ZcL)at2AKcR1$KE^w<K=Y+xjq+ghlullrHQjjHg~cL#q9J%c5+flOjZv
zmzTE-ed?bLm6Acu#ejOh%p2bF2M$@{dEeiHy{R3ulj7}PAh573KV?IH%hI@T1mZCf
zpqB{U@@Nf8nj!1eXbwk`h71+Wy!QvfGu2@BVf7x;^#5`=ItiOU-xRR_Ejv5Nlh5bR
zK4jD`g?@{fP(X?s4V_N6MYIy6lR4tFSU)=Q-h0cw$V1n+yDB7idrj)UoT%wC`z}1l
z+6f<x>F0Zqn{{6Lb@Xei-HON7C2IRDe%iajL(QbOX4<i}rIpU>=yQ&+snBv*5|rjf
zmwKJr7k5!*q+=d6^EPDX<l_OWf$)M98-E1)4DbjF;2~CUAYo4o8BAlfFBHVHcPcul
zXi*gx>R#G2xZ+Fd$-tV`g!<?EwjsE2jnmxEL0a?j+2!M47u@@-&Wwfh{JCc?Y=~)5
z3^w9+@_kw4&Dkln47_1B^TVVrZ>6m~xGIU~h40O#V+y-b_+e^)w4ad+hj+jsNSU2g
ztzvTKtX^v6^55mOZdVh{VauNqf84-wwMSaox<fX!H0dHK6@t-SOXJt+m%nf-)diHg
z!t8duliriox}Lg3NzlO=sAgv4KmHU@kiNV)=bdg)thDPjYN$@{%V#-A7e~CmTqHo;
zHmxb-xcxuF;4OfsgrXft4<7$}qi<)&C31H3xx>5`f*NrHpA3v4U2nrnfObF>Q?5@S
zbM5gF_J{f{^Q%K9>Z0X^XqMeY^lc2g5;|9am;7Z!Y5xsbz_s~gW{w4_J1=E!iz0P>
zoZOVx1KJ7q>L!OFgcSaFeH!Y==-n8{rn>FXxs1kR|Md@a(k4f{7ZEtOhh}!RRCZrS
z5oWIDel?RREa@n8-q}ef;jKS)UXgG6cytKbbj04t9Wtq-P41CtXE$aa^~VCz$=g%y
zMj8~JxPlIPVdxLEQU$x{+$JDJ3qeoE7}<OowG&S7p`}4}Gk~K_2ix70iORq#NmOAI
zI^6&D)Y;jY3TcD~ao>{|Q$l;vLd9hUMfeI<<*@W0YIjqma*p$-r`|=r8ww57QvKM@
zc#W{(ntKvFHKzdzn&hn;%-PoLrZ!Px`{P1o=cTIBc7@vHUAnj4-J)mpA7^7ZnYz!{
zPE$hfao8VPrKT6Psy}3XJ%>Y8FJxu;?pfUWH{u()jWucF*Z$&Oll%OfZ7Y{nSB?xl
z*PgK(&O`r{`FAd`fE;k%%WfjS=P;7)U>av^!e!5;;a<W-6_;K^Q7O$80+WhdW}2^I
zi&3@~*n<h`nN!0riD}iNL4HL&PVOa3e@p(^|3u%(L=a&x_~*}wxB?E4ARB;|aVbqT
z^6a*LoFkYw^Qlh%>$iABD=f?0IWepho!NDZ^n=%0Q)Pe;^t^ms++*ge-x#L#&POqi
zEngVasbEPyaO-BUh!B#zLj3wUe3MM!PVdf3g_1;!p33Cg2(B!y(A?zaCtR8_REs-5
zuU4Y`Pg3wj)>)DVFC&96K|F(2J!syN+}w(vYv#SmQDu5`6CPYgQta#^9xmhec&jEG
z9Qz>9#No+L2Xh`@9~T>LQHOE}T`}Qk%wev1j(Q&|{E?nnCB6&O=m~2)<6pNIo0-hY
z&Q`?v(Si_QRVM%AgXU}f`zWc$1LsF%5Q>Nd@7G=ALDB3aTq%kk1h>^^p!x!?3f;7=
zW^2E1$XLM}5jco$*xp3@j2hsV3CqNh;0!DyMmEzh>huT)Xz`YWO(p<Jpq;u|z5j`H
z*i8Twa<cdcT6&1lz4;JDr+Em8moW17Ub@N2VG0^ulEO(t>E284x5w0B5X_P~^Rc=*
z-&1ybZ5wW@QQ5zd_Fr91|Co}heqqYVkLbKS2*0*V{c_do1O$oeentKbXNBIUc~sBD
zJT`nEo}7r=^<=x@=jT7jkR&C)v|V6-sCF8?z*{inO#p>#u^=#W04-5=b33$vUfS&-
zX9!Y%fML}1YtJ$+^fgcR00ju;TVlx$574Hf=e#^6+Esw<HU^pncA!}>!;|~11*Fzm
z!FH61!nr3KU#AOvT!LRmGR_*?NU7h>I~^-)3Z0hih+&T4qny9#S1LXZ**D!PVH-2$
zKCi{e<tWh9)0-XeZBShwezkkd#lz+>3zAgKV6C=ol;yfw{KK=N;~Cmoai4A4_zkh@
zh4^G!Fb8f8Jj_t-5{95px}Tbsma0sIt`5$5<2_fNMQfQSyfiScb=k8<cC3sSljbb+
zcajwF9*X%g7HcClhh5lVzVaW@`8HG^_><K{Ubtr8=qA9t^gRk<Om8mtl|kOK6E<Xn
zWB`{6hdfoyr7rYM9tL;W4pH8_Cyoz+9F>HouQVuf=PuK?Mra*E#(&bc++PL1y4D=*
zvp^>j1JeWiE!zL?ef@Qxhk(%N|7rSCPY=jnqTw%#&r3CH*!zHg|DZ}Fq`bQDcZQT_
zSM>=;6F#*WF`j%BdyZ7tRIRJ%tiKUc^fGtv{o~04s#@Nwv+_67sfGtjIhU0FyK6^P
zQeTA%F=b_6j?az7Jf?R#2YHiM2Q!Y1p_e=UO?*<9h^};wuoWHmO!l}Q6P-=<l-n4F
zMlwO5`nqNH9v4w7>twDE_FYIdCji#k8aO-DplSuDLLB%ue$_jM($Z2#>>no0BQ#zo
zJ9Dprq9C!u+#3$PsAy-y=x(^5+kuB)brv<cC)0x9#ZCn3z(PEP8g!Sn0!riX(X3o`
ze*P*HjTEWa{uUkmQ+136#T5eNiKxtLYDnVDqHyYaqYha_>n97*REoXo-N?|7uhY$3
znea7iJWTieHt~%gTPvZYHD4Gc?sRQ84Nr3iz3BLDxUDBK<a!NjP+K{UTLQcsU0s>S
z;Tvg1$X3{*`1I$D%3Wc?UU{fm8R)!jI_(<9>KOvKeFymJ6)gEdl2c6-%%elx4j5HM
z3fK!zI2@zOt@?`b0`t=5nQV+<Z^Wco&SF4y!w2h}!>)fgR5hh(2ot@+FBbcx^u%UM
z2>;M<$}#poEI#V*exI<OFR^eok9@)lOJjOh0mO^R&~sLPO^sN-|4nw>AmU#+-h>=h
zv&dFkrQMW72(;(dlkcuY<;@_ldMNh{6@raK*IZ@?5K{}8|JDf*7-wa<+S$|O-Dvpl
zI&cDuQLe@5eu93g=XT*I){ubw_JgFZC4*ZK#-ku)Y7vd$Fo@-tOWceo4FuY?Z5$cH
z=29gKegBGL5#ZUNMHq0Lcn<(22hs7K0cgd%k>4^jE~r2oRxkfyNd2uszSPG~P0ahB
z(1WKB*_hMDji>Ydvo{=XOk3z`eRDQ=Bs`%FkUp>1a3rDDAS{GGQs(+0YA$wZ^%DQs
zac!t!WvOuyux)WQO~SM!7J;LRe;><7T-81-UfciGH}QaOXh`Y{W!>TUy(Vf2>(ws=
zunc!t)Am#aGq3bD|1pZ-WVf|j8?)Xc+<9@j-_>KeuL$j|T^o{61BK7{1Q0#HO^}U5
zJKRam<LKlcxH+rT0M~Hm_6HX<{sp^)MkaiXOGc!#IXWwlS_PR|w|@a3RN^l5vurVD
z%#I!p3YP_o@*jeGJGT~%KW^AP!Emwo;$Ltnga?kKR9)4FyGKoLIwrh2iWmHZUM2j(
zdAgZ(WP5bGB!s$=LP0+iy`ZqfG_R+j-`s6hDj|!MbN_&t&(SGxD?)R%Qfode)OF)K
zijS03EW`~A-f9CfG*w+_rjnN`h10pF$$@T$&5mAYgpf~)hpjh<+``_VN>swUzuyq4
z62TD`-E>mW$FnEqtGMEPnL2JK1g1oJPaH(zEi1etfNKz8z092*y-Y|*m~;%`8(!?T
zbOE~R9N7aZ001YICqDvsq7lMd?cg2!P3vMuj1Rh(#9cF6Wep!P#R;UmT=sXWF}iN=
z>~yyN)RXwx4|RMO*8fz9Cah56Mg>*a*DiGp1y0-W!*<HuP$M<8QZYi5@FMd2V*_TT
z#ISEPuIN>%lk%vzf7d_+HTrkKDx&lZ%wZ@9dj0w~EUS*8lec?aTgRkwD*o%mPv58}
z%}_$Z90g)tts3)@W(<&|_?;4SX5MkT{Iae?m*=jf<h_W~@ZDjc7F%Ov=YvVfh}nmq
zu}+*QAh6oHlN^e!w+-Q=<k-dvTqXZz0qr=U)HcQ|)|HTp9yHzFzn?_M!g;QFupkzo
ze~-5CqJgYSQJ(QE$|K44R4^3%b2pmox?V$<X&4iVQ1g=NSOhWu(8k3)Rv})R*k>ih
zXG<M=UU0vX7MNr@X3{`G&%}W$mgXjtVP65nk{{ATMcGP=Nt?!l=nHmr9Q8Ie9nq8Z
z5>a1*Hp$5!-Xi9v(0Hvs)*BG;=wVoUl}Sk%0`<#Es4zd@W~P>Vy&<XIN>|}-6EDIv
z5@^r1umW)arOY7dt!189oMv9CHxIqUtQsk3#G!l5BTC=vam58dKQ=KgaK0YPayr}Y
z8sMRN+LKwn`a(4$>b~e+;B0qVY+@oUukxQ4&}zQy1swQxJAqdJ4`Y&$F5by)=&2UV
z6Wjxy%-wzLfDZemHmtO$&ad+4!`l-GGl>mH@b_-RxvB0yyqMX0Nq+b45W@x6kS-NM
zjPn6ddi<SFqHDFWOSm?shb~4gN6EyiNZ5ijBYWB(b~~<JiSeZy>kWmlVgCGT`{;kY
zHF+FWFJ^nzlB1rwHEXN5?czg!moM(twQ)$;$Zz7ed#fK0=+xk+Hcayp0|`h9QW8N;
zth9Ht-6-HuwZ#QJ@TflMw<=i?+-w^G>wmJk#2bAX!Io_H6Bqn&>k^>n-U9B?$QzL)
z=tX(qyK*G+`{w$MzzH0qJIbGG9@WmH@FVK4Ijn!yKjZwJ>X^=F@8V4_Qb?je4E>TV
z^HgU!K`3#RPSlT^vWuWOK~_Fzkc5hQ1&O@!h~%RbykVm}7Ef8_G-A<`V|4+J8*M}Y
z*t4;!bQsdn3(bA0v+1;DT;(;@N@WtDeD>pN&VpSnd3D)w_%4b^_wp7ayR^qdBTm4<
zo>;x}{zb*JA<J+%B*<q^J&aVDM=d#pgoF%aG`kF71D!qgG$_>`U}zx%h8Bky4977b
z%ubC`r{|pNAqb=E+zQjcg9yg%(%J#za3*jVpF)omgHOnz>i6fb_yRb2mh7RO*HrZ)
zh{S;auj}EU_Y}ehVg8i4+o^glU1{tQt-oy6hl?gWpik_Ng|(eb9>Mj~XGQ9!OjsWQ
zyhLjbXqP7K&di6rq49P9jjp9<-dHlhbtLGtdax&CfvsA>&aQBl2`}c~WKg|*f<Dnv
z<IY*QUwqN2L*}ow=ZCDbWzRkohdRPbY_xB~wR_I{E?gQ&#aVwS4w*PU0QrX+k#7^H
zAGIRXPJCM}XCDi8&8Q)ZyqfQzwtWFov?5e}<5vS$S2#{v21wPtYz`tvUgWIuVf~;k
zACu4yo(vRsUIgZxr+*m7Wsy~5c9qGXnSS)y&*DfK*C>sjO;)%wL8iwgNR!A<_qc%w
z=Ab#5?(*dSX#wa$cBV^oFFXC-3%)2vzbplRsaAlMRUv#Y3`^+E{BKIgDe^}<!!1id
zgWp)kzp^8ss43=_td^<XSV~9R)!?%E&O<41QNhsD#o3^@Ty-BEuDw>?>rN+{NbLt#
z@lI8E=+2z=Mn;BqG`UY|uNTt<9KJuH+!eLzYH*!RI}|bXTX1?_Q+p6U)pV)lTAN$c
zCL1DpZ@FYVgH4y&MX0^L)to=_FQZ7BjYDU}sL-DAO+z_Q6TzpG3Ua$JGBbZtRr0nG
z&xn8xb|C<$Aw4E*i7iW&1pXGEIk_k0dR*%A%t_PdPuMWx8}4<KL>ziwHDtVSWZd+#
z*UyUA&ePDlf*yXlhnpk>2>&Ksjx^kQbu$+wqzyY+0}X|OmzI;|4{%Q&3zm!)P*9P>
z?Ww+iEQh#B;3m)AyK$f>#eXwJ`eQG3cf-Ou1x!9X$r-G;BU%Z)c=|Lv;M8m^zDeSE
zJbcBR`_xWLUqQdYaKk5gVn4oZ%g21-TJp?ja&<1FErLea=5INf^w7~9Cz(UUBiaY`
zecONX_O~V+PTDLWSi0q>vcg5*#>J-KdQz&uf-_d38Us#lZe#5EDi<6?uiDe6Hae_R
zz)<iEV!YS}p~4V;lOD@~--`~i<`tuYr?z)@b7>_pEB^@wYRyT7Lawf!?E1JCGi5f+
z8ZGh;JPX*Pc!9>F(RSx*h6+8pIMYzxW?^+_U?e{nHZ9?J=|x3I{i58hH_inE%?Utg
zO}h6lr$pRM-=1ldmX}U!N@EAE-g3>maZk^72P!{%7K2{pd;1v|nq2?PN2zs_Ie3#O
z9mFCRYg*h_9HmL(ES7f0q;y~CZ1AO%-=i}||FiAEi<SG-B<^!uJ7RS$zGCXwBsKeC
z(-8lYoZHp(BiaZH-N*);%ZKV&o5{{q7BY1q?#x(>e>N%n+DPe?ro}lcbVAX)%x%k1
zpvI<We2L*3!2WnHGuBtXp35GzDX;hh1S5=?1X?5+8W`|&K>$5UP@!VPP(GwQj*X8G
z$dctjMGSSAl}@73cZAT>))fXTp#-2@j7FgiBu*G?`4hHAfN2Z}1Pel~d8{7|1f$i*
zHr)$Wjh02UFU9_)U8UDZ6PVy0`fVX7^iN~Yqy7d%r~)%-D1-<%^Gi&OB|-Zu5@rZ?
zY0O0qH>LVU^XLN^!xcVsaEH2P%AtyZ@+)W-^KaUGw@<8noBc@h6N}@L^S+~mAdFlT
z_qzSW_pT8$JJs~sd@-M))}aX#&&_uyUg6R-ny%ToD8H3T$M};rNe{zDpVKv}<ORhD
zdV4|Ib?)+8P(Qdurdcs~0CkKu0bw6_Lk{${Zi^Uiyn)S$Txy6K+gHY6GBN-KJNNJx
zZ1)RTAV7+7Nir}nNW_IeLtzqpqFW6YM-}F&e0`Mu1)MS)*e2iu{bALel5A|tBv3o$
zmt!iVuVieOE%=I_!TE?FV%pzLuPO{?z}|~f_0^kxdl#`!`|x~k!<4X{hcUm(Oe@Z~
zdj97plZ&i(jkf2Fx1pGKA@B?F^UjSJxiY<A^y(cbSDW52YeJQ6a41M1Y4XOn6OO*7
zynmiB3z%%pNFe(t_!ZYZG=uYlcdIQ9s8^_Z_gDe<IkPZlFE>%yG9ICJ52@vT@OxKv
z+p50l`kjdBzjEiPf6tP6bd&2uU88&XeA{=pd8Ljn`mXDb=S(E~HoaJ}g)5uFN|lbc
z=<lfJsQc)$?E9U7L4rC|B9$ne>(XVa){UoP&LC01N{t55m?i6Kn=^Q-+_NaxE(X%F
z0Q&LM;8jVmMe;HV?<QsC07+>TT;n8D2+{iuq;#Io2{)6I4uwKE+rLt<WUy>q&KA(d
zlJMPo5PFDl+NOLgWtcHOkJ;538J$Uq*!P_eqJq-2?1`~R+^OlpZzP*7m@l9HTB)Xf
zfM)e&->|zys~!BZcBgc&Yq))7gl45cWRD!E5bJ{-SU}B6_4ae-;942(jqDJ{blDH2
z9ZczMTc7{z{c!=a*FXF2l$g}@(y)AaIn1i}RH9Qyue|iu<eB_<)Ag-w!<C4~-h0Tt
zvtIk5{GrfAu)yc|Upa4T&2H!Vdj%yWvV)bxWODJ?nx<aPZtV)}ePnM(5q{MDe{*gp
zU6DjO#X2z)S_o$585|P#C}5)p_G_seS%(;a7iD0{#Q5_e51heK8^_B6BMQgOtOY$8
zW;_FjH9$Yu2ng(^Z6Li|*@MSG5ReLXPb2{4JqE+kB^x}g3+*U?iR%HumSb}W%$rvo
z8EO24#pvkE82jg)TTHKTMHU~_2&eTYEm9QCjQe{aX1O%HLJoF{OY1s8#~#YkwTBPd
zTHfrZ$&rev_9J1z1QikH`CkZuQ0q0o)02gE104(;<gik2j@C}~AGYs?fDmupm{t4v
z*Ad`I0JlpApFS(j_%C<o{8lRW_~}K&cZciCqpoyq?YnNpA+fNvpZ-)EK+b}ik-nV0
z6MA|6O+-;n>(GgjFTJ77;bZ-UM3e2KI~o?2!$Bq{ALydFB-b1*HIr`67W{SD(+IQ)
z_)NOipNm*;Sb{5GjvNKl=}ev5dS&(8ARvn*V><qlL*F@aZXpG^B_)jyaOjRN+8~+3
zqh7Ik_yZDKZpvrhFdQ@3fyW49_%QhZF3&FYH<*TXs1b65o4-%jM|jJN3i7A;tnkF+
z*mz=uvRC}cS<#kAD4Jh(QsB$d7^=Kx@j3LrTA=$R7{@9-t|;qI1$iAY?8iz-arDem
zM;(S5B^!DZj%}9f%P%0kMZoDdG?B$woJNVM<p{J`kx!C+w;3}C_x=lbpRHoaI-Q2S
zqllpYA$LuWFWmXyLtcY-?%vgR?x}t#(s=gw89w{8`$!9z`f7+lTD6|auV};B1{xpX
z!{muibkbl~d3VBKN)Hp<%3iO!tT<J(|7J(RJ?O{=RwVzMmkq4dFbcL)y@BHp287Vz
zQLP4`U=lm)6sb|F)|8OzAI9T{0D(gWPhNvECJJa@(%*W@(Ls&%I)Flz)rjpwTYp-$
zLr#M2eGbK(Yd?cDKej(Y)K<@x3&!Gwk-8mY1<F64U%cOWs(vHy2is>N8Vk;)RG;k2
z7+t56vk>Rex%H?5fY24<pa6@3t}FGpM8bN9o~jadd;!&1>5n`3|7uP^PDI!YaK2;`
z&gq9wGLxGwU8{#*<Gv%!8O)GY3LmBP-+m8M;Vj~qs4;I5nUNO1ydF3Aubka0`}4<}
zLCj<Qk9}V-r>01@<W-P=_YNPk8(zcCm*`z?17uwj4-dBk8Kh?FVy@qNYSS_@O~$o{
zc=+ml;_j`V8)M}KShG_7*@cP`!1bb*BB6isi*A5T<!k?Ks1YY{U$7Rb2wE?^$T9%$
z2e+8j;K5aCk?~z+7oYHIDqPoxH|3$+s>0kRv~@lBsjh3W3?O8S(4JX4FMs<XU3-FV
z5j{m(3A_R9wU{Wl9$SS?)3iB3YD~y|q>tbv0GruyNa<hvF_2|u`aBe*#X2$^+$NFv
ziiOSY0e2ct+4cE|?nE^jRy)nThWaroSs&4qS(%yE1M(WVH^)zx!XlFLasd4#hCD5T
zj}05Jf9{k?Qu^ma-jc8r_n|$kS?!p=YfgXvQN0qRzMLju!qwQFLn#f{pEef;moy4p
z&Z`<$83Ds$vbgktx8LE{R~eO!7k|Ugw^F~2m$<a4fj*<Q*hK*yhThAqQMtW#_nos}
z&#8Ay^!*QmYc*y>y9wI*1!3Bxe2f4!@`nDirC<PSTyzpaqaf6&%gxG~^?%y?@_#75
zzyImoLfNxa2-(Y4wz88fW6i#kFeQdGGPa@gMk#!Ztua|bWZ%~bNfKr(gAijX*_pA+
zjNyB_@6R9c{o%fTK9B3V&g-1>TAt^e=hN@jt<U_B^|DNJvETL%4)JC!DqkG$o0^*^
zO63R3)pBjT0|Fog&fXEGrl#QJNQ*G!-^?h{{Oe%%pfzEDx6@m(jU9sY<3tI9&7w*F
zo2IJQHaIuZPStQLf8FlQVT(9qgRGo1Od2*?zP{@;Qk?t|pEvTi>_Lt1GSE+QI0YH|
zIYAMMU@JxEJZl^WMEB+Lae!>es$dDT%3t|D`Ku<UiZ2~a;kMRqtUNlJ&81_+-O%;E
zBZ>d;aow+HmKLhMd0Ebfi~YUPN{ltE=uPR^8KIpYpDab0(sM<HDs<H1LSR|>?}PNV
zpJd1%OOUN)hQqy_oTcmEz4OaZ?bY0)Cf9#YlZUEXO8U-Jtt{KYLgx=>ev0<uw-_^f
zZMMV>u75iemQ5TWH_rMeLE^bg)=t%3S5ZltXx?uK5s=itE>C42)O~b*_R%oJD2nbE
z5a8}af4HXaab)51a}ASgHuE8DM31RFSy1~C{8zz<Q1iYF`F<qzWagW(s`nmE+c$v{
zZ{VN6I$W4Tao=kJeu2Z?37cqNBsb|@ab`|vz&!Ivf0@53T>_dieKlzB5^Oi#TXElK
zr`lc;C=x$9A7?4un#cvE9pu<}LO5II=jW+Vb@`Z~en7T+s1ei5qyjcHqzJMJQLFDf
z&+j(xI`?{Ssw?&Hu-z`H@agXc48U{$hARA^$RI9`;I;QRlA|>{?D*2puzZS1HV#*b
zx3W~4<o3#Z#Rz5h+C5(^t3FrmZ?$_G+7gglFZoRnn=<=)F~795ln<G_EMiwvs3V@*
z>&w+mMFVsWsQN~l6LZDW4XQr}AP1N3AR%8>RE?f=F(XwD&1yrg+fv;^J#eK7fGWFj
zg{09Oynbq~k_m3K`n}nA<XO{TS@`_X0lBYfrYW@ZUlKu%sr;b&<0oX!ZqmK>5Yo=z
zK~Hfd%BH)Qm)l(Rx9+%MH~th4$N7nX2}Tl)Mzi>rxhy*DUQGm@-L~dqm5oBKM9n>s
zALW2xyoKJx{o*sV$Q-*{#5}(ApHR&i9YAmUzVZ6+E>YLzoM3g-_)mr7PsB2f#pPA&
zyidH<V(qeYu(NyI$~%|x9q5O#^{o*`+ll~{v=9Pu6?F77spFj9;wfn6yOE_115N=h
z=9oLNkI0nZTUJ)XNQ7YBNQR|TyH!!L=>vl2#90ImvHGh9F>t`Zq}b&l%S=xGHL~-a
z&Mx%Ko7qM4YcElEx+P>9f8UT#(v?hYnZi?&qz@p-+;ho2fgQ`GWeH-=1zo>3{`3Mq
zI5+Vn*xdcb#beWd5&bdQ+Yl57R_vpvruCL_s>}xjdBMJ-GQz>nZ$-x<wXJ<_Cq(#7
z9LL3sEzbZZViw$O>$8%j_QR2i>(^&?b;vq_1XDi?or#8LCQKcwI|&@x>lF&*`VBEE
zHk_76>@1c^mQsmlmkV^bSOceLuye#m`jSkeowhc=hrn-`(Yl{)@d@+Ri4&K$6~swG
z8;Y6WoXB74vogR#+t)8b0Hn$#7&)an6N9}tSIXfp3JEuB)y?z-wWh}F{*F1Sr#0dM
zSU4OD>H$WpnB)y%uGpgh%llOg#dE!ybB9XSbr^<=dUlj-PY~Bz+baz=);wz~FfbHY
z85x-^np5Ohp<P43VYh3K%H($3uedp{bIB-=N-tz_LM-pB%s^S_^mKl6&}x9TS9c+A
zG$F|Kw=U<2T<a5ElY5GFmb^SX2XcU<b1uSr9*>cDaqE`=tkI0P&n3T;Nu!0s5+REl
z6N1OoMT&leH0Y?olC^dwbSx|@$@MiP!d!}&VMx)L{5qi3P6^$vBp)0M3$&wVUuQ)`
zY1O<f30izAudI});WNFg5J}rgj$UhBiEm%oR+2hZ<2b-n{<!=3=Fh2J6rlf^XD8iy
zv`5=l(TP}zvb^4XcT!<NwnZqfu<-X5Y}fm9KdW+G;J5MdDig_6kLNwJR%zBvHcn0#
zfXH!wYVYrxN^}n>6Br;d+%)RST0JB<6R2#Fu3S#&`#|tK?=Gx2&2Cye((!QbB-g+4
zY+dW@a~ng3iWp5dVxYtv?W3-=uk)4kgkK#3qVp8XTRb$szcY<Lxku_2Plx`p<QPM@
zNZ5drMv%lsfT_%48TI2ggWR-tpO;0ZPlnIoK=)~eiMkTn!1el=c`5MW=R}2pJ?n-^
zIszMtf#_hQXiBIu5XY<x_`|pk7%n=2Yw9;QH&-yBwLUtc2Y5^oRto_9c2OI08nS<O
z66*Hj{^pN^See1L<(PPhY}{VU$p#j3(16pzYMO0p(8w1es!CYJ-}^-1KasPgV~@)y
zha*(W@2!tla2${4J{~fV7LkKt(b*4oTME2Ro|Ib9&J??T_s{bplyDUoyl5l$y+xa3
z1+rB~0co5{MEWT;_&s{<K10aSir7;bcU4kh?UEvCEjQkiYNl7fb`x)jZrC7??0`_f
zGf+6qWS5=%wGkSNajOYg`+fnoo566&H&|>1BWcjqdjf(k^j1{sCRM!<ZNzzW6z|+^
zN7)nW(CXU}IBt0}$<(79lXs#Q?nX=hRSCSxN>!?phUwKC)9itohW&{edW^Sv_Q2fw
zy2a_3J8yB(SM%3r39FsAM6FgRTYxDb%FWKMp-oHyNT&7R=oj=j-~6Ox09wOHP?i_j
zjoYO;yb&4c2vPac`Fb_RQFV8FLc%Lhf%QzFi+0d{e~Y`A!xcXJw0>pNq7`wrqz4y)
zhnoAHPJVtx`;k~`?|~-pakq)_qVCK7N;s($<MI?+xutV0{vChGkE#K|G=d;ILnJEN
zSvS>xP^!C7V!Iqmab$rc@nFStnJ`OIl%r|E$%cr%p&fy71vS|3YpL$!+IOV#e*sSV
zCi`Hb0k?#q3p>^_QaUshwFO{OLd)N;^(2RF)Lzc46h=&M+%xW5f!23jP>Z}3Lhh+*
zYh0=pzUl1Z;&I4`IYNy`S$0^LPaq#<xpMrKG6YwKyj|f^PPhVMl_QZ=32WuyLFP>E
z@sp5Kt>dM*XZCu3D?Q%U1e*DtkK;vO!TWzG+%r+X)c2X=Gk(6<x+FJvuN6Wo0{n!_
z6f3rK9>BK}xjwJen@QIF<wCD5mbh!tLPr*ZAkwqEye#c};o{J%3rcYf_4Nl<*0$*l
z4DsyqR;vX$n7PLx2CCRL9cIyJmXLN#HknAYPB#``o@OkGFn^2nY*t~8A$gu>XN$>m
zC9Yp(HOiVSLC0lbrrjuqd4-itmHXATJJFR#4|^%-qjW#(%J%oDZKb9a=emmA3%JnL
z!7j%?Yh&CF3y~8jheQ{}09#dj)RkrP6fM3JRZJsbKi22jeSDg-JF$jT{q6LFviU#Z
zqA8A2zA1%0cz>t!as5gde4~DQc=81$7^^=Vt_8v!5;Li$(6&u9fRQvjY>5OgpqZo(
zl91pFe+M+3Rlanx1-=JIvyti3r05UDt0NB4wLbKLwKY|vI98#^TN0&|b}tgmA-9D-
zgEOjdIqK6}cy4U%si(I!N!48Gy}V#9q<rV&$Dlq&^+#jY3!2TOk2HddDjNtr5scj#
zoX22eqfiGGL1~TZ%x2+#l<i@eV)Bjx1<Eh!+p_`gtrK|p)L_i$w|wBPl}*AY@yjGs
zS>Y)qzYN=DN;ld>JTZ807#6njs&@Q1$bMcQ>wqLhd7h8z!RNom4xc^IS)5Mq9CEXF
zb`Cey*Y8TFM42S%bDW9!!4A!Qk8Pg;=~NYFaso`_wn5;Kb1cQrmPtG|k_a&)PT^PK
zl*%@&aKur@_nG9iL3~o^9*h+6O(oGyeHZo3pGEv)?M((VPu6NFSZo_j@?k5vH)D0=
zZg*1c&78S>G=3dd_v}5*php~;O1NQmR>4_Np<M%`%EtC+27MeAdJ!HM{vuet?J~y9
z;_;p{)PU5y9fYpT2;P~=JqF}{^oadv)zS*lAS|W@cdld&F~y0DwlOvBf4G#ZBhy;r
z99?Cro_!dOR5$}MiXm|2J)>YzWXkE0B0S<~;m0Qxg$h*9mvL3i-*21Bi02($iSHU&
z2?T<SaMZzIm4dr7J!w8;Uf(|I)p^ysBp)03u*0-eOEtOMSKDP)=4_xrz8mey0$Ke1
z5@Tg+r|1H!@5wc*FYe{27m!kmjD&MK7Vo7<OzH!CX9`54JIJGu9GeT$zN_tlwOi|P
zII#>Ioerj5Wu(+Ud4Us3U+JkJ%pU@5Bh!cnX9`G>+B7g`QHDfDOA;J*u6uR^V4e7j
zElDU5Jx-pTA+pb++B~#XjSLmMX<_2BJWw$e_KP9qPE`0t<&f9eTd!KI{YWAMEkjs3
zyxh@sLjzHES^CO*H&st3Z>3*f<S)ALD>E&$(Bps8p5#}Z-Dk>FSRN-*Q7e*Fe(DEf
zi%{#5QL$d{M;W*U9Dkdmb$5;q`g=6{%B7AT%0)nZAvN}l9Rblbs#g0aIHtilz2|`C
zd$G@#wXKB@@S~M8EN3`ePSzy5t$4T`Ws((|kzcnIw4;|VjdRFqZq+O0wt5rOVu)zH
z;<7bcXqECE-6KYSM1{%}%~`Mfhb~rZG7L!|M>{t6cdd*X%|yPlE^#^g;<E_UqZcW!
z<(iH2m>4D^AUCXmxSFi5zyBMr;WT^&91dR#$Gu?&tz$vnA`cbre+=BUN{aEpwQP&M
ziTx9c8bDU(1E?t28tg*0Z>iBIw?4%x`0K1$7P}E$M-N<D<neaX`sbyWyaEnuz#)L@
zyFuNGh_c69OuP@mf}A#9TDGtBW(=ItitgP%iLGDy*FF%1ZOJayT_|y)28YSgYR1=E
z4rbyG*@R26u03Cj71B-Km4aX2KnHvUzn+d$kiK#y%h}0kGK=a8$*cRDO8xB2C3VoI
z%j6x!efwq>viJr1_c|y~b#%6yUd&1<Gd;Z+yd{byRH8PkEZago)?%8jtEvjtns`54
z${Sb63jVo?%nAR}t0&udgimz8%*XSJ70*LK^!bcel3Yv@SZ0PY>D>(qn(YByGIYD1
ziEPsag)rI8TM_>WR%eWbYHEMQYo}6O!$p<oxWeA0;BftR#IYfyva0ILU04cW0IM4$
z<mz<vGy=*Y*@K&4k-h6<oL<#KQ*KY;OSOZT|Ir!;{J-CT7IuU<N`%}Zc*`O_Uo&E(
zj!lXbL7Y3uV6jG5;(b=Tn8mXx^TnpE_D@Hv3*I&W<MJ!HtWj3P*Mh_J@8~_l1*$k<
z`d`jn$tbaf*Ilxh`5`b<-l*yhcQFd1UA`ND{NkQ$UYu@E&9o@)$T1c-G|q=p1sFv@
z60#2nLsJKkqL3k8P^@Ox-@8gr(|6nI>5kTn)?162PyRsFe>ret=T$Fl;YH<Y<JweC
zGjcdL#wR*r7eB=KPg`$K`&`=GQTVV=qF5YTbbO&cyB9S!A>pNI<7zosEttwZg$(eM
zy&NzOH>I?r4y&RLcNy+3xee8p_P4ziF-~B$)ZxglfgxW@0gzrEs}p_bw{LMmm&}#i
zGkCoY!xh^BrUQxFYW2<g(o*MKWAT9hC~L&5pOZTKVhNbt;zZ3;i=e%#tjW3Mr@6T#
zt%U~8K{wiTK#R6jP4nFbGY<#$G8^?0T_ThcMVyf`J9^XkD*~ytsl7~j64m@@vv1{m
zd(`MU7}b(nqQkOFb^8`YO#73suX$A#-94cjm@#4_sp~N`5j{l}uh5x(?0KG#v@_-5
zUcLC%paD?2IydGA%%NSPwn=Q{!=K23C|I}B(b-nvb$Z~(C&v;UTqE}j$h2j1zf^}W
zmKU%2|8cTzVm-YY(iO)(RZpEIxHNk2b-<PiII8^bmy@f;GRT|8JHH(A36(vYwpZ+y
zD7h)0u|ukR?B}oEIV&IrX&te1aI^D=A$SS!mN+f_iHHxNa-EXU0<gTxUcKV6%`2(B
z<>>8^$b}%__il|)`hSB}V&@BUFj=nQ_1C&xfQ`O8_S<f}_S&P-4bX2jXw$q#iqJ<1
zQkq4vPUjcqCo@UF`KRWThu0cx>I84oPBy%hB>f>_XWSY6`>Blqr0~_wN&%2y`N1T$
z84Cg(q0a`?{0?wlH8J}3>4I#Atim;3<Q@Tc7cxcJ-PEr?i_UknRF29;_?m|}jKE1L
z9+nxs-chTRR0%j9DId1m1^OTpbucSDzM#PcL7D35b;_Y<hR!G&1pp~w%J=7Y7i>YI
zuL^2vXn5DLc)|0sm*3@t%L}3inbVAFrovVio^&y~vzqVRpj%W?A+J>>A}zF+Aob0>
zbR|W=_v>lP;t>z@HWjU|#jzk=(}mZ5ZBZP^Q=xE7@Fegjt)I%To%6hV@?^!r8<oQv
z0qvu)V$e6#r`NMd(aF81=0!sSua-uJ;@S%ZRQ>%OE?FZ6s|0F28%cwgQ;!zJRFba_
z-L{rY4U@(aawL}CMlKLZc>gjCN;=!Z!^OsiyPHi7a^wBdNO7*z<<b!6exNGVA#Am3
zjK^N5LZMAYJW(c^LZD_6SgZyS97-HRe~MhF&Hbt~GHza3wk0Y|-E6U{8hc3^djr?5
z2(WguO%Gnw0|m3760TV);I&~dDVHtg2j2F|&Qv5IY;XBA&pdRJ@;&;LVR0Ip{8aE9
zlQwPrdR5hQP<MA`uA8&<=78(mH!Q1wYB>p-nuxuA=+(^l2A?{l;S=ulG(<e5cLA4o
zXm|bHxQwlXLx-)0hfXi4L$aibHNth3QVh~vkN+8~L6E_oiu1hTF#I7o1exn(XE(ND
zQNYK?H^Vt*E~<oE%kryKlTVn?y$2RNov^C2mEF^RAW{*IsGv-@W<O3{sk~OlVf;OK
z;@tMdslEE)*3DQJkBJ<r_YevMRCG*C4APip(~ej=Re^4oTUrH&YP2({-Vl+@3BkT2
zay|sYE)Bh<XM3#mV_{S9++#0Gb)UJNTsqOlV}Ck2BR6W<cfpa9RqLQE$z8uct`YkV
zB%R;9Mt8rIHMGJGpEKk+o{7AcFhP>@ii#w$z25C}l;W(({Ky6Fw{S`n3Ah4zv*Hnj
z1&s?3)ZcWeNjy7vf(0j;4WJVrl{=urAxiX{w|+7KKG^#ERk1=M=_aT_QUNWxZ3JKL
z(xG5);T(np>Isfe9oxaO{3Z8F9iGo;W-J6M+WiLJs0oZLZA(xNJ)u<#8oAG9mFhN@
zf@x(KGnhkr>ai!TUWgNmS@LWbBj&~&`lG`3s<u~j4tC#u*U@=+l+fBC7wW^1c?r&d
z?Jvw{kosK@fess$`0Hu+JG%Dn5KEnB^H1tvRn);4+lo;W9a%K`Y>XwBGQ%2?`L`@d
zH*vcUb1+rUQsr=#U=%hqJocAN-T$lVPCgW27YRi7{%%3T40sNAfKJznSOQ57%sDqS
zpCOpAqAAafz=TaRlSE)~mYJj60X!k}do&;J47i(0*jNKv5_eEVYK!7ne`A(e)p}=a
zu3t)CCu;^#@KO<DW-iMoa1Vx*&br{bUp-0c078GEuq)~LZh}t=>^pxd9D4}qAB|m3
zp?#rwEqj#e%y(zHYmWLPN^~FhkQdMlo?3r9rxBmYL3UHUzfQTN-}kJR*vUA>$01E|
zm<W7-Qq|dd>Fqx|!3GA5zP)ImI9{<rrL7GPPEeOFR>-x!#S)j-G}BG&fS901*5o}2
zloj!7q|}(f+d_{PdLppC9k4#&rOsA={G{;xci&!oWmReEcR_<yN*V#C!~L>5Cw){Q
zX1&QAs|wcxTO%@DpHHbvr1lm9o7GVcS)fw$*U~>+40Q#zI9@u6@fs8mxIP&T<ws4N
z+X--yAPG(h(uWf`supF|{D<`r^Y}?=UqelyTsSNpJySIkCeq?ggoFH=Okc8$ZWSZ<
zLUc#}`l^p+I*-TcVrO)!+uv?{ro|anL_8+Ft(>xRb=}+?x}@(etm=KgtNKzGe>|JO
zJecy1zCpf6zn&g91vzHCBj>9$Ws!W>WMkRgFO7GR(`pNZM04}CI3jkEkW6ZL=(Xqb
zjlOm{tX)MY`RFVDEV?pXS#{9&v26j4%p3O;Sz1<h1Xd?R7yVaXIAqTz|2Z7S0sMh2
zPYuom7=jG4cR5Ow*NLC~cWWHe3;RW%vjqcWDEwDxS*3bKUfvJvgR(3AJ&8jHjo30<
z$Kuw}W{)JCk|1By=$wZgHEeLw3j3QD<Z1PjBg5RWQH9)_cdy@Dv0oguzFbx%bnZlF
zra0=B^^s=r7?*$q!50o^`~(L_J^GP~kRi9fCi?M~Tk2qbdO^4}Jvd5eWRW~{J33Nd
zpRu7(rQxlD?0<6XxW`5HLjRGW&bb}2xa9j+6l7TX<Js$oXkz}aW5-RwT~8(#P5~e5
zAf$u>RAU*bEwkiC*KT?|wd+aOR-Ht+WW^oiX>6%ytxk%!xDk&4hV}vtJlP)fN^c-6
z&>Lpi8u5ih#d|AGZDw;oR@2QS0m#z86&rZEYkfpL0rnu)GxR0K?efxZ=1(5ripgw=
z?G<i4|GW`EU_};LY6Y=kkfm!K2F(mvpuxV0Srk!A3g6qmqm;kC9fZM<er(olW{-0@
z&(0sYS$jO{NzK%md<~o?4uq7z*up>COCv{OtX*XisGCHMtvBBUUQc#W4v&^sR-)Z(
zZ1fFyo@mE|FfVgOrYq*pzVLm*mnem$X?@og7h<-&mhe}@=EyXhA=bCBaCTG+%M}ZD
zc?vvy9|$t~%BoCcIKtf;J*<XlZ+*I&(l{<pX-7`1yUpqMyY$7gpEh}E%3^vdBZF^8
zhM$Keh6{a4*w=cn0_8b8y^oL*qNpPo<7Z|H=VI<DVw|B3HpLbf0sV4#3C=^_H@}d3
zGtDNo&`4a4H6{;6nFbQsmy<s{-0Q*1-g_oMLZK?}?wp{s591%H;qorW*Qj@sa%}<@
zu9koGEdMaog1F}gNCYevY=Kg->la|aSy-cZ<~0D*o&@Ql6Hm<+SfZ;Y01-fMr%V6B
z(R~2M-0HWxI*CJYmRSrFpq*T2x~j+Pv#=oX7(vTV)nU|4iA)l_25ptNy>eAV`WXk`
zF{%Wk{jOEFfg;KpE@0U9^Y@3*Z{Hg1D*;!Il}bbE1Dvv-B6&BG`g`>r76bokSKj`&
zD=y+F58@s;IvIt=MSmqJq-+a8W5bLu%lMWQk};E15bs#RzFtzK99CoIRuMk3d@Cv)
z@#>%O?m>I5?bNq@%x&&brVpRGQ}7ouIRu9$)+I0~3u_!7pi|9j5NX~`!N)usFlk;a
zeVHI27f~M1f8lk~<8fn*p=rpg-v#1{s=jHafv*WEM(&P|+u#3FTPxSq*Vn`ba+>ky
zo162so%+uJnPLq3Sa-L9L99=Wqh1R;8j0!}=;_#JxD2q8yy9K(g~?jV(Zhp?L%Scs
zMV7hRUr17|^nF-J5-#^Wv+5whzpSuWT#GZ#6Ap0B*_P&-lMu+zh%17iA_n;HJ1~>H
z6m4h=ri<Go+?~h`Ykc3(uyY9Tu*^rC*R7>yqJRB}s)?c^_DU4p$`7VFwcm8~+7;+x
z>{9J3dXb7ZXoNRvb>0A3<`&OhqfqT#?0VOkGrmhKEom{;3CYRIx&|xbZybN(>WMh<
z_36vX*7wcY_qC)l#;`X}YMfzl7Oo<UR`%K*><xqmc4SYv7{+aSdA&7kU2@!N?C{}V
znrVEI-`F{&m0$0U1{onXo|S()JZyg*KKCqMnVsKuz6w6$_-EWF8iib(8RjrZR><yH
zTs7u4&;T{Qw0}T=_9f<+)M<v?1HYGw&eoX1rw=st9VC78FgsI-o48&)KM94yT|cQ;
zEy3Eg@Ju9hfRqb(+Xn3%92Uw={((U9nqI5;*>?b%l6HM!hcUk&%uD>DLW!+)4TiTR
zqTqY!fB>m&7_Vv){@BuIvA@RAKMaA`O3;FT2(Jh-{gQZwH|i>D*JYTh-oD0(Z;P8o
z-p+TO%A8bucoCHUAV-SKWjJDfz+oZzn6_S0vdz>bydsoA@PC4%G8UfF52hy?7pMeS
zqao25>Nm~k27%uPs3psV(a?~*9*~XPNHEJ*S804pPU5oeza}(sADPHHYM=6}$m_W8
zzfBEW{3QX>TZ0$sPMemUlP4hgA>euR*@u_I{!OX9d783S(FQsx@qR-ES}1cnuU}Yt
z!gW1U3t<g`=n8)UX{(XB@dspRql$*L-pZngz$^48FP@2OXkoD?FK>Q{@tOx^JYQu}
z10wGq=3_UrnZVnhEWUD4to(djM-8Ok{<W#U)t!oF#o1D9S2ypbroUekZ-ainRMP34
zC{LCwnVA##ha)F-i7G@}zRFj5NH6RdF=U27J{-e}LD+o{MmLV%jqUDvuUjzplk@To
z!=m9wI)=2K$h6^v+IN}zYoJ$yuf)N%qrcA+ZK;Lh$YX+TLM35^P$Emjq2TEL#unX_
zpSST;K^@<#2Le}xlO9e6J@x;$KiSR&b5K1ne<xt7HBIw>UjPw=fX_b4SGB>J60@|T
z*}q7dVK)tsM-~hDQ^(ybfGWlRJiFZHEP{|5SECm{BRUp{BNwBdZg1;vos)mAc=^9g
zP%Pk0tqwju=TH2Uh00xQ4tL>P=0<D&D*tbT?MpS0;Kmj;Yzd>HkgUe|JcgD}wGI9F
z(yWXSyrhqmcb=)}J@&tiZ!FiS{Uf5_9=_W>C-vqqgZ$M_bjwRZCS%l}I7>Oqe&{gr
zDxMm?o*uby`oFJ-aJ~q#Ddw94*Q5UL`9BN%|7L;34@Vra3OkT$Iz9I9zd;QwZdKmA
H6Z8K7?Uv~`

literal 0
HcmV?d00001

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/__init__.py
new file mode 100644
index 000000000..a7f6aa13e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/__init__.py
@@ -0,0 +1 @@
+from onnx2torch.converter import convert
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/converter.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/converter.py
new file mode 100644
index 000000000..891fd7932
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/converter.py
@@ -0,0 +1,175 @@
+import inspect
+from collections import OrderedDict
+from operator import getitem
+from pathlib import Path
+from typing import Union
+
+import torch
+from onnx.onnx_ml_pb2 import ModelProto
+from torch import fx
+from torch import nn
+
+from onnx2torch.node_converters import get_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_graph import ValueType
+from onnx2torch.utils.safe_shape_inference import safe_shape_inference
+
+
+def _remove_initializers_from_input(model: ModelProto) -> ModelProto:
+    """Drop, in place, every graph input that shares its name with an initializer; return ``model``."""
+    declared_inputs = model.graph.input
+    inputs_by_name = {graph_input.name: graph_input for graph_input in declared_inputs}
+    for weight in model.graph.initializer:
+        shadowed_input = inputs_by_name.get(weight.name)
+        if shadowed_input is not None:
+            declared_inputs.remove(shadowed_input)
+    return model
+
+
+class InitializersContainer(nn.Module):
+    """Module for storing initializers in torch fx graph (each one is kept as a named buffer)."""
+
+    def add_initializer(self, name: str, initializer: torch.Tensor) -> None:  # pylint: disable=missing-docstring
+        self.register_buffer(name, initializer)  # the tensor becomes a buffer of this module called ``name``
+
+    def forward(self, *args, **kwargs):  # pylint: disable=missing-function-docstring
+        raise RuntimeError('Got unexpected "forward" on constant container')  # storage only, never callable
+
+
+def convert(  # pylint: disable=too-many-locals, too-many-branches, too-many-statements
+    onnx_model_or_path: Union[str, Path, ModelProto],
+    save_input_names: bool = False,
+    attach_onnx_mapping: bool = False,
+) -> fx.GraphModule:
+    """Convert model from onnx to PyTorch.
+
+    This function builds a torch.fx GraphModule from an onnx ModelProto using operations from the converter registry.
+    The registered operations can be found in onnx2torch/node_converters.
+
+    Usage example:
+
+        from onnx2torch import convert
+        torch_module = convert('path/to/onnx_model.onnx')
+
+    Parameters
+    ----------
+    onnx_model_or_path : Union[str, Path, ModelProto]
+        Onnx ModelProto or model path to convert.
+    save_input_names : bool
+        Whether to use original onnx inputs names as fx graph placeholders names or to use generated names (input_n).
+        False by default.
+    attach_onnx_mapping : bool
+        Whether to attach info about mapping to original onnx tensors names.
+
+    Returns
+    -------
+    fx.GraphModule
+        PyTorch GraphModule
+
+    Raises
+    ------
+    NotImplementedError
+        If the onnx IR version of the model is lower than 3.
+    ValueError
+        If ``save_input_names`` is set and an input name is not a valid Python identifier.
+    RuntimeError
+        If a node input has a value type that is not handled here.
+    """
+
+    onnx_model = safe_shape_inference(onnx_model_or_path)
+
+    if onnx_model.ir_version < 3:
+        raise NotImplementedError('Onnx IR is too old (minimal supported version is 3).')
+
+    onnx_model = _remove_initializers_from_input(onnx_model)
+    opset_import = {opsetid_proto.domain: opsetid_proto.version for opsetid_proto in onnx_model.opset_import}
+
+    onnx_graph = OnnxGraph(onnx_model.graph)  # pylint: disable=no-member
+    torch_graph = fx.Graph()
+
+    torch_initializers = InitializersContainer()
+    torch_modules = nn.Module()
+    torch_modules.add_module('initializers', torch_initializers)
+    torch_nodes = {}  # graph input name / onnx node unique name -> fx node
+    initializer_nodes = {}  # onnx initializer name -> fx get_attr node (one buffer per initializer)
+
+    # create input nodes
+    for input_index, name in enumerate(onnx_graph.input_values, 1):
+        if save_input_names:
+            if not name.isidentifier():
+                raise ValueError(f'Input name "{name}" cannot be used as name of placeholder in fx.GraphModule.')
+            placeholder_name = name
+        else:
+            placeholder_name = f'input_{input_index}'
+        torch_nodes[name] = torch_graph.placeholder(name=placeholder_name)
+
+    # create intermediate nodes
+    # IMPORTANT: nodes already topologically sorted
+    for name, onnx_node in onnx_graph.nodes.items():
+        version = opset_import[onnx_node.domain]
+        converter = get_converter(
+            domain=onnx_node.domain,
+            operation_type=onnx_node.operation_type,
+            version=version,
+        )
+
+        torch_module, onnx_mapping = converter(onnx_node, onnx_graph)
+        if attach_onnx_mapping:
+            setattr(torch_module, 'onnx_mapping', onnx_mapping)
+
+        torch_modules.add_module(name, torch_module)
+
+        args = []
+        for value_name in onnx_mapping.inputs:
+            value_type = onnx_graph.value_type(value_name)
+            if value_type == ValueType.GRAPH_INPUT:
+                args.append(torch_nodes[value_name])
+            elif value_type == ValueType.NODE_OUTPUT:
+                onnx_input_node, _ = onnx_graph.value_as_node_output(value_name)
+                torch_input_node = torch_nodes[onnx_input_node.unique_name]
+
+                # Get only one needed output of torch_input_node by index
+                if len(onnx_input_node.output_values) > 1:
+                    index = onnx_input_node.output_values.index(value_name)
+                    torch_input_node = torch_graph.call_function(getitem, args=(torch_input_node, index))
+                    torch_nodes[name + '_split_output'] = torch_input_node
+                args.append(torch_input_node)
+            elif value_type == ValueType.GRAPH_INITIALIZER:
+                # An initializer may feed several nodes: register it once, keyed by its onnx name.
+                if value_name not in initializer_nodes:
+                    # The name of pytorch buffer must not contain '.'(dot)
+                    torch_buffer_name = f'onnx_initializer_{len(initializer_nodes)}'
+                    torch_initializers.add_initializer(
+                        torch_buffer_name,
+                        onnx_graph.initializers[value_name].to_torch(),
+                    )
+                    initializer_nodes[value_name] = torch_graph.get_attr(f'initializers.{torch_buffer_name}')
+                args.append(initializer_nodes[value_name])
+            elif value_type == ValueType.EMPTY:
+                args.append(None)
+            else:
+                raise RuntimeError(f'Got unexpected input value type ({value_type})')
+
+        # Collect kwargs if there are some skipped args
+        kwargs = {}
+        if None in args:
+            first_skipped_arg = args.index(None)
+            forward_args = tuple(inspect.signature(torch_module.forward).parameters.keys())
+            forward_args = forward_args[first_skipped_arg : len(args)]
+            args, kwargs_values = args[:first_skipped_arg], args[first_skipped_arg:]
+            kwargs.update({arg: value for arg, value in zip(forward_args, kwargs_values) if value is not None})
+
+        torch_nodes[name] = torch_graph.call_module(module_name=name, args=tuple(args), kwargs=kwargs)
+
+    # Create output nodes
+    onnx_output_nodes = [onnx_graph.value_as_node_output(value_name)[0] for value_name in onnx_graph.output_values]
+    # Delete duplicates and save order
+    onnx_output_nodes = list(OrderedDict.fromkeys(onnx_output_nodes))
+
+    torch_output_nodes = [torch_nodes[onnx_node.unique_name] for onnx_node in onnx_output_nodes]
+    if len(torch_output_nodes) == 1:
+        torch_output_nodes = torch_output_nodes[0]
+    torch_graph.output(torch_output_nodes)
+
+    torch_graph.lint()
+    return fx.GraphModule(root=torch_modules, graph=torch_graph)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/__init__.py
new file mode 100644
index 000000000..df168a21f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/__init__.py
@@ -0,0 +1,63 @@
+from onnx2torch.node_converters.activations import *
+from onnx2torch.node_converters.argmax import *
+from onnx2torch.node_converters.average_pool import *
+from onnx2torch.node_converters.batch_norm import *
+from onnx2torch.node_converters.binary_math_operations import *
+from onnx2torch.node_converters.cast import *
+from onnx2torch.node_converters.clip import *
+from onnx2torch.node_converters.comparisons import *
+from onnx2torch.node_converters.concat import *
+from onnx2torch.node_converters.constant import *
+from onnx2torch.node_converters.constant_of_shape import *
+from onnx2torch.node_converters.conv import *
+from onnx2torch.node_converters.cumsum import *
+from onnx2torch.node_converters.depth_to_space import *
+from onnx2torch.node_converters.dropout import *
+from onnx2torch.node_converters.einsum import *
+from onnx2torch.node_converters.expand import *
+from onnx2torch.node_converters.eye_like import *
+from onnx2torch.node_converters.flatten import *
+from onnx2torch.node_converters.functions import *
+from onnx2torch.node_converters.gather import *
+from onnx2torch.node_converters.gemm import *
+from onnx2torch.node_converters.global_average_pool import *
+from onnx2torch.node_converters.identity import *
+from onnx2torch.node_converters.instance_norm import *
+from onnx2torch.node_converters.isinf import *
+from onnx2torch.node_converters.isnan import *
+from onnx2torch.node_converters.layer_norm import *
+from onnx2torch.node_converters.logical import *
+from onnx2torch.node_converters.lrn import *
+from onnx2torch.node_converters.matmul import *
+from onnx2torch.node_converters.max_pool import *
+from onnx2torch.node_converters.mean import *
+from onnx2torch.node_converters.min_max import *
+from onnx2torch.node_converters.mod import *
+from onnx2torch.node_converters.neg import *
+from onnx2torch.node_converters.nms import *
+from onnx2torch.node_converters.nonzero import *
+from onnx2torch.node_converters.pad import *
+from onnx2torch.node_converters.pow import *
+from onnx2torch.node_converters.random_normal_like import *
+from onnx2torch.node_converters.range import *
+from onnx2torch.node_converters.reciprocal import *
+from onnx2torch.node_converters.reduce import *
+from onnx2torch.node_converters.registry import OperationDescription
+from onnx2torch.node_converters.registry import TConverter
+from onnx2torch.node_converters.registry import get_converter
+from onnx2torch.node_converters.reshape import *
+from onnx2torch.node_converters.resize import *
+from onnx2torch.node_converters.roialign import *
+from onnx2torch.node_converters.roundings import *
+from onnx2torch.node_converters.scatter_nd import *
+from onnx2torch.node_converters.shape import *
+from onnx2torch.node_converters.slice import *
+from onnx2torch.node_converters.split import *
+from onnx2torch.node_converters.squeeze import *
+from onnx2torch.node_converters.sum import *
+from onnx2torch.node_converters.tile import *
+from onnx2torch.node_converters.topk import *
+from onnx2torch.node_converters.transpose import *
+from onnx2torch.node_converters.trilu import *
+from onnx2torch.node_converters.unsqueeze import *
+from onnx2torch.node_converters.where import *
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/activations.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/activations.py
new file mode 100644
index 000000000..3f6f52346
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/activations.py
@@ -0,0 +1,243 @@
+__all__ = [
+    'OnnxErf',
+    'OnnxHardSigmoid',
+    'OnnxSoftmaxV1V11',
+    'OnnxPReLU',
+]
+
+import numpy as np
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxErf(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return input_tensor.erf()  # element-wise error function, method form of torch.erf
+
+
+class OnnxHardSigmoid(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, alpha: float = 0.2, beta: float = 0.5):
+        super().__init__()
+        self.alpha, self.beta = alpha, beta  # slope and offset of the linear segment
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        linear = input_tensor * self.alpha + self.beta
+        return torch.clip(linear, min=0.0, max=1.0)  # clamp alpha * x + beta into [0, 1]
+
+
+class OnnxSoftmaxV1V11(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, axis: int = 1, is_log: bool = False):
+        super().__init__()
+        self.axis, self.is_log = axis, is_log
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        # Collapse every dimension from ``axis`` onwards into one, normalise over
+        # that collapsed dimension, then restore the caller's original shape.
+        normalize = torch.log_softmax if self.is_log else torch.softmax
+        collapsed = torch.flatten(input_tensor, start_dim=self.axis)
+        normalized = normalize(collapsed, -1)
+        return torch.reshape(normalized, input_tensor.shape)
+
+
+class OnnxPReLU(nn.Module, OnnxToTorchModuleWithCustomExport):  # pylint: disable=missing-docstring
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        slope: torch.Tensor,
+    ) -> torch.Tensor:
+        def _forward():
+            fits_native_prelu = slope.nelement() == 1 or (
+                slope.shape[0] == input_tensor.shape[1] and all(s == 1 for s in slope.shape[1:])
+            )
+            if fits_native_prelu:
+                return nn.functional.prelu(input_tensor, weight=slope.view(-1))  # pylint: disable=not-callable
+
+            # The product is already a fresh tensor, so the input needs no clone first.
+            output = input_tensor * slope
+            mask = input_tensor >= 0
+            output[mask] = input_tensor[mask]  # non-negative entries pass through unscaled
+            return output
+
+        if torch.onnx.is_in_onnx_export():
+            return DefaultExportToOnnx.export(_forward, 'PRelu', input_tensor, slope, {})
+        return _forward()
+
+
+@add_converter(operation_type='Erf', version=9)
+@add_converter(operation_type='Erf', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Erf`` node into an :class:`OnnxErf` module."""
+    module = OnnxErf()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='HardSigmoid', version=1)
+@add_converter(operation_type='HardSigmoid', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``HardSigmoid``; ``alpha``/``beta`` fall back to 0.2/0.5 when absent."""
+    attrs = node.attributes
+    module = OnnxHardSigmoid(
+        alpha=attrs.get('alpha', 0.2),
+        beta=attrs.get('beta', 0.5),
+    )
+    return OperationConverterResult(torch_module=module, onnx_mapping=onnx_mapping_from_node(node=node))
+
+
+@add_converter(operation_type='HardSwish', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``HardSwish`` node into ``nn.Hardswish``."""
+    module = nn.Hardswish()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='LeakyRelu', version=1)
+@add_converter(operation_type='LeakyRelu', version=6)
+@add_converter(operation_type='LeakyRelu', version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``LeakyRelu``; a missing ``alpha`` falls back to 0.01."""
+    negative_slope = node.attributes.get('alpha', 0.01)
+    module = nn.LeakyReLU(negative_slope=negative_slope)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='LogSoftmax', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``LogSoftmax`` (version 13) to ``nn.LogSoftmax`` over ``axis`` (default -1)."""
+    axis = node.attributes.get('axis', -1)
+    module = nn.LogSoftmax(dim=axis)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='LogSoftmax', version=1)
+@add_converter(operation_type='LogSoftmax', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``LogSoftmax`` (versions 1 and 11) via :class:`OnnxSoftmaxV1V11` in log mode."""
+    first_flattened_dim = node.attributes.get('axis', 1)
+    module = OnnxSoftmaxV1V11(axis=first_flattened_dim, is_log=True)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Relu', version=6)
+@add_converter(operation_type='Relu', version=13)
+@add_converter(operation_type='Relu', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Relu`` node into ``nn.ReLU``."""
+    module = nn.ReLU()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='PRelu', version=7)
+@add_converter(operation_type='PRelu', version=9)
+@add_converter(operation_type='PRelu', version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``PRelu`` node into :class:`OnnxPReLU`; the slope stays a runtime input."""
+    module = OnnxPReLU()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Elu', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Elu`` to ``nn.ELU``; a missing ``alpha`` falls back to 1.0."""
+    coefficient = node.attributes.get('alpha', 1.0)
+    module = nn.ELU(alpha=coefficient)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Celu', version=12)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Celu`` to ``nn.CELU``; a missing ``alpha`` falls back to 1.0."""
+    coefficient = node.attributes.get('alpha', 1.0)
+    module = nn.CELU(alpha=coefficient)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Selu', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Selu``; only the default ``alpha``/``gamma`` constants are accepted."""
+    # nn.SELU is instantiated without coefficients, so non-default values are rejected.
+    expected = (
+        ('alpha', 1.67326319217681884765625),
+        ('gamma', 1.05070102214813232421875),
+    )
+    for name, default in expected:
+        value = node.attributes.get(name, default)
+        if not np.isclose(value, default):
+            raise ValueError(f'{name} parameter must be {default}, not {value}')
+
+    return OperationConverterResult(
+        torch_module=nn.SELU(),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
+
+
+@add_converter(operation_type='Sigmoid', version=1)
+@add_converter(operation_type='Sigmoid', version=6)
+@add_converter(operation_type='Sigmoid', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Sigmoid`` node into ``nn.Sigmoid``."""
+    module = nn.Sigmoid()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Softmax', version=1)
+@add_converter(operation_type='Softmax', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Softmax`` (versions 1 and 11) via :class:`OnnxSoftmaxV1V11`."""
+    first_flattened_dim = node.attributes.get('axis', 1)
+    module = OnnxSoftmaxV1V11(axis=first_flattened_dim)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Softmax', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Softmax`` (version 13) to ``torch.nn.Softmax`` over ``axis`` (default -1)."""
+    axis = node.attributes.get('axis', -1)
+    module = torch.nn.Softmax(dim=axis)
+    mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Softsign', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Softsign`` node into ``torch.nn.Softsign``."""
+    module = torch.nn.Softsign()
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
+
+
+@add_converter(operation_type='Softplus', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Softplus``; ``beta``/``threshold`` fall back to 1.0/20.0 when absent."""
+    attrs = node.attributes
+    module = torch.nn.Softplus(
+        beta=attrs.get('beta', 1.0),
+        threshold=attrs.get('threshold', 20.0),
+    )
+    return OperationConverterResult(torch_module=module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/argmax.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/argmax.py
new file mode 100644
index 000000000..5a3707169
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/argmax.py
@@ -0,0 +1,44 @@
+__all__ = [
+    'OnnxArgMax',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxArgMax(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, axis: int, keepdims: int, **kwargs):
+        super().__init__()
+        self.axis, self.keepdims = axis, keepdims == 1  # keepdims arrives as an int flag, stored as bool
+
+    def forward(self, *input_tensors) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        count = len(input_tensors)
+        if count != 1:
+            raise RuntimeError(f"Invalid input tensors, expect one tensor, but got {count}.")
+        return torch.argmax(input_tensors[0], dim=self.axis, keepdim=self.keepdims)
+
+
+@add_converter(operation_type='ArgMax', version=4)
+@add_converter(operation_type='ArgMax', version=11)
+@add_converter(operation_type='ArgMax', version=13)
+@add_converter(operation_type='ArgMax', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``ArgMax``; ``axis`` falls back to 0 and ``keepdims`` to 1."""
+    # NOTE(review): the ``select_last_index`` attribute is not read here, and ONNX
+    # revises ArgMax at opsets 1/11/12/13 -- confirm that versions 4 and 14 are the
+    # ones intended and how the converter registry resolves them.
+    attrs = node.attributes
+    axis = attrs.get('axis', 0)
+    torch_module = OnnxArgMax(axis=axis, keepdims=attrs.get("keepdims", 1))
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/average_pool.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/average_pool.py
new file mode 100644
index 000000000..3f07f963b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/average_pool.py
@@ -0,0 +1,61 @@
+__all__ = []
+
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.padding import onnx_auto_pad_to_torch_padding
+
+_AVGPOOL_CLASS_FROM_SPATIAL_RANK = {  # spatial rank (input rank - 2) -> torch average-pooling class
+    1: nn.AvgPool1d,
+    2: nn.AvgPool2d,
+    3: nn.AvgPool3d,
+}
+
+
+@add_converter(operation_type='AveragePool', version=7)
+@add_converter(operation_type='AveragePool', version=10)
+@add_converter(operation_type='AveragePool', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Build the ``nn.AvgPool{1,2,3}d`` module matching an ONNX ``AveragePool`` node."""
+    value_info = graph.value_info[node.input_values[0]]
+    input_shape = get_shape_from_value_info(value_info)
+
+    # Spatial rank = input rank minus the two leading dimensions.
+    spatial_rank = len(input_shape) - 2
+    try:
+        avgpool_class = _AVGPOOL_CLASS_FROM_SPATIAL_RANK[spatial_rank]
+    except KeyError as exc:
+        message = f'Average pool operation with spatial rank == {spatial_rank} is not implemented'
+        raise NotImplementedError(message) from exc
+
+    attrs = node.attributes
+    pool_kwargs = {
+        'kernel_size': attrs['kernel_shape'],  # the only attribute read without a fallback
+        'ceil_mode': attrs.get('ceil_mode', 0) == 1,
+        'stride': attrs.get('strides', 1),
+        'count_include_pad': attrs.get('count_include_pad', 0) == 1,
+    }
+
+    default_pads = [0] * spatial_rank * 2
+    padding, padding_module = onnx_auto_pad_to_torch_padding(
+        onnx_padding=attrs.get('pads', default_pads),
+        auto_pad=attrs.get('auto_pad', 'NOTSET'),
+    )
+    if padding_module is not None:
+        # presumably returned only for asymmetric pads (see the error text) -- confirm in utils.padding
+        raise NotImplementedError('AvgPool with non symmetrical padding is not implemented.')
+
+    torch_module = avgpool_class(
+        padding=padding,
+        **pool_kwargs,
+    )
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/base_element_wise.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/base_element_wise.py
new file mode 100644
index 000000000..762e513ab
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/base_element_wise.py
@@ -0,0 +1,36 @@
+# pylint: disable=missing-docstring
+import torch
+from torch import nn
+
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxBaseElementWise(nn.Module, OnnxToTorchModuleWithCustomExport):
+    def __init__(self, op_type: str):
+        super().__init__()
+        self._op_type = op_type  # op name handed to DefaultExportToOnnx.export in forward()
+
+    @staticmethod
+    def _broadcast_shape(*tensors: torch.Tensor):
+        # Common shape that all given tensors broadcast to.
+        return torch.broadcast_shapes(*(t.shape for t in tensors))
+
+    def apply_reduction(self, *tensors: torch.Tensor) -> torch.Tensor:
+        # Not implemented on the base class; forward() calls this with all inputs.
+        del tensors
+        raise NotImplementedError
+
+    def forward(self, *input_tensors: torch.Tensor) -> torch.Tensor:
+        # A lone input passes through untouched, so no ONNX node is built for it.
+        if len(input_tensors) == 1:
+            return input_tensors[0]
+
+        def _forward() -> torch.Tensor:
+            return self.apply_reduction(*input_tensors)
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        # Export path: delegate to the custom exporter with this module's op type.
+        return DefaultExportToOnnx.export(_forward, self._op_type, *input_tensors, {})
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/batch_norm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/batch_norm.py
new file mode 100644
index 000000000..b79a68da5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/batch_norm.py
@@ -0,0 +1,102 @@
+__all__ = [
+    'OnnxBatchNorm',
+]
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_BN_CLASS_FROM_SPATIAL_RANK = {  # spatial rank (input rank - 2) -> torch class; ranks 0 and 1 share BatchNorm1d
+    0: nn.BatchNorm1d,
+    1: nn.BatchNorm1d,
+    2: nn.BatchNorm2d,
+    3: nn.BatchNorm3d,
+}
+
+
+class OnnxBatchNorm(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, momentum: float, epsilon: float):
+        super().__init__()
+        self.momentum, self.epsilon = momentum, epsilon
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_data: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        running_mean: torch.Tensor,
+        running_var: torch.Tensor,
+    ) -> torch.Tensor:
+        # Statistics and affine parameters arrive as runtime inputs rather than
+        # module state; the train/eval switch follows ``self.training``.
+        options = {
+            'weight': weight,
+            'bias': bias,
+            'training': self.training,
+            'momentum': self.momentum,
+            'eps': self.epsilon,
+        }
+
+        return F.batch_norm(input_data, running_mean, running_var, **options)
+
+
+@add_converter(operation_type='BatchNormalization', version=15)
+@add_converter(operation_type='BatchNormalization', version=14)
+@add_converter(operation_type='BatchNormalization', version=9)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert ONNX ``BatchNormalization``; constant parameters are folded into ``nn.BatchNorm*d``."""
+    attrs = node.attributes
+    epsilon = attrs.get('epsilon', 1e-5)
+    # PyTorch expects the complement of the ONNX momentum (see PyTorch batch norm docs).
+    momentum = 1 - attrs.get('momentum', 0.9)
+
+    single_output = len(node.output_values) == 1
+    if all(name in graph.initializers for name in node.input_values[1:]) and single_output:
+        input_shape = get_shape_from_value_info(graph.value_info[node.input_values[0]])
+        spatial_rank = len(input_shape) - 2
+        try:
+            bn_class = _BN_CLASS_FROM_SPATIAL_RANK[spatial_rank]
+        except KeyError as exc:
+            message = f'BatchNorm operation with spatial rank == {spatial_rank} is not implemented'
+            raise NotImplementedError(message) from exc
+
+        # Parameter inputs by position: 1 = scale, 2 = bias, 3 = mean, 4 = variance.
+        scale_name, bias_name, mean_name, var_name = (node.input_values[index] for index in range(1, 5))
+        initializers = graph.initializers
+        scale = initializers[scale_name].to_torch()
+        torch_module = bn_class(
+            num_features=scale.size()[0],
+            eps=epsilon,
+            momentum=momentum,
+        )
+        with torch.no_grad():
+            torch_module.running_mean.data = initializers[mean_name].to_torch()
+            torch_module.running_var.data = initializers[var_name].to_torch()
+            torch_module.weight.data = scale
+            torch_module.bias.data = initializers[bias_name].to_torch()
+
+        # The parameters now live inside the module, so only the data input stays mapped.
+        onnx_mapping = OnnxMapping(
+            inputs=(node.input_values[0],),
+            outputs=node.output_values,
+        )
+    else:
+        if not single_output:
+            raise NotImplementedError('BatchNorm operation with mean/var output is not implemented')
+
+        torch_module = OnnxBatchNorm(momentum=momentum, epsilon=epsilon)
+        onnx_mapping = onnx_mapping_from_node(node)
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/binary_math_operations.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/binary_math_operations.py
new file mode 100644
index 000000000..ccf589c1b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/binary_math_operations.py
@@ -0,0 +1,81 @@
+__all__ = [
+    'OnnxBinaryMathOperation',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import old_style_broadcast
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+def _onnx_div(first: torch.Tensor, second: torch.Tensor) -> torch.Tensor:
+    """Divide with true division if either operand is floating point, else truncate toward zero."""
+    any_float = first.is_floating_point() or second.is_floating_point()
+    # ``rounding_mode=None`` is torch.div's default (true division).
+    return torch.div(first, second, rounding_mode=None if any_float else 'trunc')
+
+
+_TORCH_FUNCTION_FROM_ONNX_TYPE = {  # ONNX op type -> callable applied to the two operands
+    'Add': torch.add,
+    'Sub': torch.sub,
+    'Mul': torch.mul,
+    'Div': _onnx_div,
+}
+
+
+class OnnxBinaryMathOperation(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, operation_type: str, broadcast: Optional[int] = None, axis: Optional[int] = None):
+        super().__init__()
+        # Both attributes must be set for the old-style broadcast applied in forward().
+        self.broadcast, self.axis = broadcast, axis
+        self.math_op_function = _TORCH_FUNCTION_FROM_ONNX_TYPE[operation_type]
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        first: torch.Tensor,
+        second: torch.Tensor,
+    ) -> torch.Tensor:
+        legacy_broadcast = self.broadcast == 1 and self.axis is not None
+        if legacy_broadcast:
+            second = old_style_broadcast(first, second, self.axis)
+
+        return self.math_op_function(first, second)
+
+
+@add_converter(operation_type='Add', version=1)
+@add_converter(operation_type='Add', version=6)
+@add_converter(operation_type='Add', version=7)
+@add_converter(operation_type='Add', version=13)
+@add_converter(operation_type='Add', version=14)
+@add_converter(operation_type='Sub', version=1)
+@add_converter(operation_type='Sub', version=6)
+@add_converter(operation_type='Sub', version=7)
+@add_converter(operation_type='Sub', version=13)
+@add_converter(operation_type='Sub', version=14)
+@add_converter(operation_type='Mul', version=1)
+@add_converter(operation_type='Mul', version=6)
+@add_converter(operation_type='Mul', version=7)
+@add_converter(operation_type='Mul', version=13)
+@add_converter(operation_type='Mul', version=14)
+@add_converter(operation_type='Div', version=1)
+@add_converter(operation_type='Div', version=6)
+@add_converter(operation_type='Div', version=7)
+@add_converter(operation_type='Div', version=13)
+@add_converter(operation_type='Div', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Add``/``Sub``/``Mul``/``Div``, forwarding any ``broadcast``/``axis`` attributes."""
+    attrs = node.attributes
+    module = OnnxBinaryMathOperation(
+        operation_type=node.operation_type,
+        broadcast=attrs.get('broadcast', None),
+        axis=attrs.get('axis', None),
+    )
+    return OperationConverterResult(torch_module=module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cast.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cast.py
new file mode 100644
index 000000000..b9b60cf53
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cast.py
@@ -0,0 +1,54 @@
+__all__ = [
+    'OnnxCast',
+]
+
+import torch
+from onnx import TensorProto  # pylint: disable=no-name-in-module
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+# pylint: disable=no-member
+TENSOR_TYPE_TO_TORCH_TYPE = {  # ONNX TensorProto dtype code -> torch dtype; other codes are rejected by OnnxCast
+    int(TensorProto.FLOAT): torch.float32,
+    int(TensorProto.UINT8): torch.uint8,
+    int(TensorProto.INT8): torch.int8,
+    int(TensorProto.INT16): torch.int16,
+    int(TensorProto.INT32): torch.int32,
+    int(TensorProto.INT64): torch.int64,
+    int(TensorProto.BOOL): torch.bool,
+    int(TensorProto.FLOAT16): torch.float16,
+    int(TensorProto.DOUBLE): torch.float64,
+    int(TensorProto.COMPLEX64): torch.complex64,
+    int(TensorProto.COMPLEX128): torch.complex128,
+}
+# pylint: enable=no-member
+
+
+class OnnxCast(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, onnx_dtype: int):
+        super().__init__()
+        try:
+            self.torch_dtype = TENSOR_TYPE_TO_TORCH_TYPE[onnx_dtype]  # target dtype applied in forward()
+        except KeyError as exc:
+            raise NotImplementedError(f'Conversion to "{onnx_dtype}" is not implemented') from exc
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return input_tensor.to(dtype=self.torch_dtype)
+
+
+@add_converter(operation_type='Cast', version=9)
+@add_converter(operation_type='Cast', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Cast``; the target dtype code is read from the ``to`` attribute."""
+    target_dtype_code = node.attributes.get('to', None)
+    module = OnnxCast(target_dtype_code)
+    return OperationConverterResult(
+        torch_module=module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/clip.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/clip.py
new file mode 100644
index 000000000..8d0dd0051
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/clip.py
@@ -0,0 +1,84 @@
+__all__ = [
+    'OnnxClip',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+from torch.types import Number
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_const_value
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxClip(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(
+        self,
+        min_val: Optional[Number] = None,
+        max_val: Optional[Number] = None,
+    ):
+        super().__init__()
+        self.min_val, self.max_val = min_val, max_val  # ``None`` leaves that side unbounded in torch.clamp
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        lower, upper = self.min_val, self.max_val
+        return torch.clamp(input_tensor, lower, upper)
+
+
+def _create_torch_module(min_val: Optional[torch.Tensor], max_val: Optional[torch.Tensor]) -> nn.Module:
+    """Pick the simplest torch module equivalent to clipping to ``[min_val, max_val]``."""
+    if min_val is None and max_val is None:
+        return nn.Identity()
+    if min_val == 0 and max_val is None:
+        return nn.ReLU()
+    if min_val == 0 and max_val == 6:
+        return nn.ReLU6()
+
+    # No dedicated module applies: clamp with the given bounds.
+    return OnnxClip(min_val=min_val, max_val=max_val)
+
+
+@add_converter(operation_type='Clip', version=11)
+@add_converter(operation_type='Clip', version=12)
+@add_converter(operation_type='Clip', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert ONNX ``Clip`` (opset 11+), whose bounds arrive as optional constant inputs."""
+    # An omitted optional input is either missing from the end of the list or present
+    # with an empty name; ``or None`` folds both spellings into ``None``.
+    min_name = (node.input_values[1] or None) if len(node.input_values) > 1 else None
+    max_name = (node.input_values[2] or None) if len(node.input_values) > 2 else None
+    try:
+        min_val = float(get_const_value(min_name, graph)) if min_name is not None else None
+        max_val = float(get_const_value(max_name, graph)) if max_name is not None else None
+    except KeyError as exc:
+        raise NotImplementedError('Dynamic value of min/max is not implemented') from exc
+
+    # The bounds are baked into the module, so only the data input stays mapped.
+    return OperationConverterResult(
+        torch_module=_create_torch_module(min_val=min_val, max_val=max_val),
+        onnx_mapping=OnnxMapping(
+            inputs=(node.input_values[0],),
+            outputs=node.output_values,
+        ),
+    )
+
+
+@add_converter(operation_type='Clip', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ONNX ``Clip`` (version 6), whose bounds are the ``min``/``max`` attributes."""
+    attrs = node.attributes
+    torch_module = _create_torch_module(
+        min_val=attrs.get('min', None),
+        max_val=attrs.get('max', None),
+    )
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/comparisons.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/comparisons.py
new file mode 100644
index 000000000..08de7d57f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/comparisons.py
@@ -0,0 +1,48 @@
+__all__ = [
+    'OnnxCompare',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_TORCH_FUNCTION_FROM_ONNX_TYPE = {  # ONNX op type -> torch comparison function
+    'Equal': torch.eq,
+    'Less': torch.less,
+    'LessOrEqual': torch.less_equal,
+    'Greater': torch.greater,
+    'GreaterOrEqual': torch.greater_equal,
+}
+
+
+class OnnxCompare(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, operation_type: str):
+        super().__init__()
+        self.compare_function = _TORCH_FUNCTION_FROM_ONNX_TYPE[operation_type]  # KeyError for unlisted op types
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return self.compare_function(x, y)  # applies the function selected in __init__ to (x, y)
+
+
+@add_converter(operation_type='Equal', version=7)
+@add_converter(operation_type='Equal', version=11)
+@add_converter(operation_type='Equal', version=13)
+@add_converter(operation_type='Less', version=7)
+@add_converter(operation_type='Less', version=9)
+@add_converter(operation_type='Less', version=13)
+@add_converter(operation_type='Greater', version=7)
+@add_converter(operation_type='Greater', version=9)
+@add_converter(operation_type='Greater', version=13)
+@add_converter(operation_type='LessOrEqual', version=12)
+@add_converter(operation_type='GreaterOrEqual', version=12)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX comparison node into :class:`OnnxCompare` keyed by the node's op type."""
+    module = OnnxCompare(operation_type=node.operation_type)
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=module, onnx_mapping=mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/concat.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/concat.py
new file mode 100644
index 000000000..dca2b4ac0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/concat.py
@@ -0,0 +1,37 @@
+__all__ = [
+    'OnnxConcat',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxConcat(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, axis: int):
+        super().__init__()
+        self.axis = axis  # dimension handed to torch.cat
+
+    def forward(self, *input_tensors) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return torch.cat(input_tensors, self.axis)  # input_tensors is the tuple of all positional inputs
+
+
+@add_converter(operation_type='Concat', version=4)
+@add_converter(operation_type='Concat', version=11)
+@add_converter(operation_type='Concat', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The concatenation axis is a node attribute and falls back to 0 when it is not set.
+    concat_axis = node.attributes.get('axis', 0)
+    concat_module = OnnxConcat(axis=concat_axis)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=concat_module,
+        onnx_mapping=node_mapping,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant.py
new file mode 100644
index 000000000..0b5830826
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant.py
@@ -0,0 +1,63 @@
+__all__ = [
+    'OnnxConstant',
+]
+
+from typing import Any
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_CONSTANT_PARSING_MAPPING = {  # Constant attribute name -> callable that turns the raw attribute into the output value
+    'value': lambda x: x.to_torch(),
+    'value_float': torch.tensor,
+    'value_floats': torch.tensor,
+    'value_int': torch.tensor,
+    'value_ints': torch.tensor,
+    'value_string': lambda x: x,  # string payloads are returned unchanged
+    'value_strings': lambda x: x,
+}
+
+
+class OnnxConstant(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, value: Any):
+        super().__init__()
+        # Tensors are registered as buffers so the constant is moved together with the module (e.g. to CUDA).
+        if isinstance(value, torch.Tensor):
+            self.register_buffer('value', value)
+        else:
+            self.value = value  # non-tensor values are kept as a plain attribute
+
+    def forward(self) -> Any:  # pylint: disable=missing-function-docstring
+        return self.value  # takes no inputs; always returns the stored constant
+
+
+def _prepare_output_value(value: Any, attr_name: str) -> Any:
+    parser = _CONSTANT_PARSING_MAPPING.get(attr_name)  # None for attribute names without a parser
+    if parser is None:
+        raise NotImplementedError(f'value type "{attr_name}" not supported yet.')
+    return parser(value)
+
+
+@add_converter(operation_type='Constant', version=9)
+@add_converter(operation_type='Constant', version=11)
+@add_converter(operation_type='Constant', version=12)
+@add_converter(operation_type='Constant', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # Only the first attribute of the node is consulted to obtain the constant.
+    first_attribute = list(node.attributes.items())[0]
+    attr_name, raw_value = first_attribute
+    parsed_value = _prepare_output_value(raw_value, attr_name)
+
+    constant_module = OnnxConstant(value=parsed_value)
+    node_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(
+        torch_module=constant_module,
+        onnx_mapping=node_mapping,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant_of_shape.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant_of_shape.py
new file mode 100644
index 000000000..e81a6137f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/constant_of_shape.py
@@ -0,0 +1,54 @@
+__all__ = [
+    'OnnxConstantOfShape',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxConstantOfShape(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, value: Optional[torch.Tensor] = None):
+        super().__init__()
+        # The fill value falls back to a float32 zero when no tensor is supplied.
+        fill = torch.tensor(0.0, dtype=torch.float32) if value is None else value
+        if fill.numel() != 1:
+            raise ValueError('parameter "value" must be scalar')
+
+        self.value: torch.Tensor
+        # Kept as a buffer; forward() reads its dtype and device for the output.
+        self.register_buffer('value', fill)
+
+    def forward(self, shape: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        scalar = self.value.item()
+        if isinstance(scalar, bool):
+            # Boolean fill values are handed to torch.full as 0/1 integers.
+            scalar = int(scalar)
+
+        buffer = self.value
+        return torch.full(
+            size=torch.Size(shape), fill_value=scalar, dtype=buffer.dtype, device=buffer.device
+        )
+
+
+@add_converter(operation_type='ConstantOfShape', version=9)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # A missing "value" attribute leaves the module on its own default fill value.
+    attributes = node.attributes
+    fill_tensor = attributes['value'].to_torch() if 'value' in attributes else None
+
+    fill_module = OnnxConstantOfShape(value=fill_tensor)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=fill_module,
+        onnx_mapping=node_mapping,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/conv.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/conv.py
new file mode 100644
index 000000000..8e783e570
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/conv.py
@@ -0,0 +1,97 @@
+__all__ = []
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.padding import onnx_auto_pad_to_torch_padding
+
+_CONV_CLASS_FROM_SPATIAL_RANK = {  # (ONNX operation type, number of spatial dims) -> torch convolution class
+    ('Conv', 1): nn.Conv1d,
+    ('Conv', 2): nn.Conv2d,
+    ('Conv', 3): nn.Conv3d,
+    ('ConvTranspose', 1): nn.ConvTranspose1d,
+    ('ConvTranspose', 2): nn.ConvTranspose2d,
+    ('ConvTranspose', 3): nn.ConvTranspose3d,
+}
+
+
+@add_converter(operation_type='Conv', version=1)
+@add_converter(operation_type='Conv', version=11)
+@add_converter(operation_type='ConvTranspose', version=1)
+@add_converter(operation_type='ConvTranspose', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    weights_value_name = node.input_values[1]
+    weights = graph.initializers[weights_value_name]  # weights are read from the graph initializers only
+    weights = weights.to_torch()
+    if len(node.input_values) == 3:  # a third input, when present, is the bias
+        bias_value_name = node.input_values[2]
+        bias = graph.initializers[bias_value_name]
+        bias = bias.to_torch()
+    else:
+        bias = None
+
+    op_type = node.operation_type
+    spatial_rank = len(weights.shape) - 2  # weight rank minus its two leading dimensions
+    try:
+        conv_class = _CONV_CLASS_FROM_SPATIAL_RANK[op_type, spatial_rank]
+    except KeyError as exc:
+        raise NotImplementedError(
+            f'Convolution operation with spatial rank == {spatial_rank} is not implemented'
+        ) from exc
+
+    node_attributes = node.attributes
+    padding, input_padding_module = onnx_auto_pad_to_torch_padding(  # 2nd result: optional module run before the conv
+        onnx_padding=node_attributes.get('pads', [0] * spatial_rank * 2),
+        auto_pad=node_attributes.get('auto_pad', 'NOTSET'),
+    )
+    common_kwargs = {
+        'kernel_size': node_attributes.get('kernel_shape', weights.shape[2:]),  # default: taken from the weight shape
+        'stride': node_attributes.get('strides', 1),
+        'dilation': node_attributes.get('dilations', 1),
+        'groups': node_attributes.get('group', 1),
+        'padding': padding,
+        'bias': bias is not None,  # a bias parameter is only created when the node supplies one
+    }
+
+    if op_type == 'Conv':
+        special_kwargs = {
+            'out_channels': weights.shape[0],
+            'in_channels': weights.shape[1] * common_kwargs['groups'],
+        }
+    elif op_type == 'ConvTranspose':
+        if input_padding_module is not None:
+            raise NotImplementedError('ConvTranspose with non symmetrical padding is not implemented.')
+
+        output_padding = node_attributes.get('output_padding', [0] * spatial_rank)
+        special_kwargs = {
+            'out_channels': weights.shape[1] * common_kwargs['groups'],  # channel roles are swapped relative to Conv
+            'in_channels': weights.shape[0],
+            'output_padding': output_padding,
+        }
+    else:
+        raise ValueError(f'Got unknown op_type "{op_type}"')
+
+    torch_module = conv_class(
+        **common_kwargs,
+        **special_kwargs,
+    )
+    with torch.no_grad():  # overwrite the freshly initialised parameters with the ONNX tensors
+        torch_module.weight.data = weights
+        if bias is not None:
+            torch_module.bias.data = bias
+
+    if input_padding_module is not None:
+        torch_module = nn.Sequential(input_padding_module, torch_module)  # pad first, then convolve
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=OnnxMapping(
+            inputs=(node.input_values[0],),  # weights and bias live inside the module, so only input 0 is mapped
+            outputs=node.output_values,
+        ),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cumsum.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cumsum.py
new file mode 100644
index 000000000..16a79b02a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/cumsum.py
@@ -0,0 +1,88 @@
+__all__ = [
+    'OnnxCumSum',
+]
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+def _arbitrary_dim_shift_and_insert_zero(
+    input_tensor: torch.Tensor,
+    insert_dim: int,
+) -> torch.Tensor:
+    # Shift every entry one position forward along insert_dim and put 0 in front.
+    rank = len(input_tensor.shape)
+    dim_size = input_tensor.shape[insert_dim]
+    device = input_tensor.device
+
+    # Everything except the last entry along insert_dim.
+    all_but_last = [slice(None)] * rank
+    all_but_last[insert_dim] = slice(0, -1)
+    shifted_values = input_tensor[tuple(all_but_last)]
+
+    # Destination positions 1..dim_size-1, broadcast to the shape of shifted_values.
+    view_shape = [1] * rank
+    view_shape[insert_dim] = dim_size - 1
+    positions = torch.arange(start=1, end=dim_size, dtype=torch.int64, device=device)
+    positions = torch.reshape(positions, view_shape)
+    positions = positions + torch.zeros_like(shifted_values, dtype=torch.int64, device=device)
+
+    # torch.scatter is out-of-place, so the caller's tensor is left untouched.
+    result = torch.scatter(
+        input=input_tensor, dim=insert_dim, index=positions, src=shifted_values
+    )
+
+    first_entry = [slice(None)] * rank
+    first_entry[insert_dim] = slice(0, 1)
+    result[tuple(first_entry)] = 0
+
+    return result
+
+
+class OnnxCumSum(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    # Cumulative sum along a runtime-provided axis, optionally exclusive and/or reversed.
+    def __init__(self, exclusive: bool = False, reverse: bool = False):
+        super().__init__()
+        self.exclusive = exclusive
+        self.reverse = reverse
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        axis: torch.Tensor,
+    ) -> torch.Tensor:
+        dim = axis.item()
+        flip_dims = (dim,)
+
+        # Reverse mode: flip, accumulate forwards, then flip the result back.
+        values = torch.flip(input_tensor, dims=flip_dims) if self.reverse else input_tensor
+
+        # Exclusive mode: shift by one along dim so each entry is left out of its own sum.
+        if self.exclusive:
+            values = _arbitrary_dim_shift_and_insert_zero(values, insert_dim=dim)
+
+        summed = torch.cumsum(values, dim=dim)
+
+        if not self.reverse:
+            return summed
+
+        return torch.flip(summed, dims=flip_dims)
+
+
+@add_converter(operation_type='CumSum', version=11)
+@add_converter(operation_type='CumSum', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # ONNX defines both attributes as optional ints that default to 0 (inclusive, forward sum).
+    node_attributes = node.attributes
+    exclusive = bool(node_attributes.get('exclusive', 0))
+    reverse = bool(node_attributes.get('reverse', 0))
+    return OperationConverterResult(
+        torch_module=OnnxCumSum(exclusive, reverse),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/depth_to_space.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/depth_to_space.py
new file mode 100644
index 000000000..10ec05b25
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/depth_to_space.py
@@ -0,0 +1,37 @@
+__all__ = ['OnnxDepthToSpace']
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxDepthToSpace(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, blocksize: int):
+        super().__init__()
+        self._upscale_factor = blocksize  # the ONNX blocksize is used directly as the pixel_shuffle factor
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return torch.pixel_shuffle(input_tensor, upscale_factor=self._upscale_factor)
+
+
+@add_converter(operation_type='DepthToSpace', version=11)
+@add_converter(operation_type='DepthToSpace', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    del graph
+    attributes = node.attributes
+    block_size: int = attributes['blocksize']  # mandatory attribute: plain indexing, no default
+
+    # Only CRD is handled; an absent "mode" counts as DCR and is therefore rejected too.
+    if attributes.get('mode', 'DCR') != 'CRD':
+        raise NotImplementedError('DepthToSpace for mode other than CRD is not implemented')
+
+    return OperationConverterResult(
+        torch_module=OnnxDepthToSpace(blocksize=block_size),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/dropout.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/dropout.py
new file mode 100644
index 000000000..0631f7434
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/dropout.py
@@ -0,0 +1,54 @@
+__all__ = [
+    'OnnxDropoutDynamic',
+]
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxDropoutDynamic(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def forward(  # pylint: disable=missing-function-docstring, unused-argument
+        self,
+        input_tensor: torch.Tensor,
+        ratio: float = 0.5,  # drop probability, forwarded to F.dropout as p
+        training_mode: Optional[torch.Tensor] = None,  # accepted but never read
+    ) -> torch.Tensor:
+        # The ONNX training_mode input is ignored; the module's own PyTorch training flag is used instead.
+        return F.dropout(input_tensor, p=ratio, training=self.training)
+
+
+@add_converter(operation_type='Dropout', version=10)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The ratio is read from the node attributes here, so a stock nn.Dropout is sufficient.
+    drop_probability = node.attributes.get('ratio', 0.5)
+    dropout_module = nn.Dropout(p=drop_probability)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=dropout_module,
+        onnx_mapping=node_mapping,
+    )
+
+
+@add_converter(operation_type='Dropout', version=12)
+@add_converter(operation_type='Dropout', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # Nodes that carry a seed attribute are rejected up front.
+    if node.attributes.get('seed') is not None:
+        raise NotImplementedError('Dropout nodes with seeds are not supported.')
+
+    dropout_module = OnnxDropoutDynamic()
+    return OperationConverterResult(
+        torch_module=dropout_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/einsum.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/einsum.py
new file mode 100644
index 000000000..4c136e424
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/einsum.py
@@ -0,0 +1,32 @@
+__all__ = [
+    'OnnxEinsum',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxEinsum(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, equation: str):
+        super().__init__()
+        self.equation = equation  # passed to torch.einsum unchanged
+
+    def forward(self, *args):  # pylint: disable=missing-function-docstring
+        return torch.einsum(self.equation, *args)  # args are the operands, in positional order
+
+
+@add_converter(operation_type='Einsum', version=12)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # "equation" is indexed without a default, so it must be present on the node.
+    einsum_module = OnnxEinsum(equation=node.attributes['equation'])
+
+    return OperationConverterResult(
+        torch_module=einsum_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/expand.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/expand.py
new file mode 100644
index 000000000..3860bca89
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/expand.py
@@ -0,0 +1,38 @@
+__all__ = [
+    'OnnxExpand',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxExpand(nn.Module, OnnxToTorchModuleWithCustomExport):  # pylint: disable=missing-docstring
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        shape: torch.Tensor,  # requested shape, converted with torch.Size(shape) below
+    ) -> torch.Tensor:
+        def _forward():  # broadcasts by multiplying with a ones tensor of the requested shape
+            return input_tensor * torch.ones(torch.Size(shape), dtype=input_tensor.dtype, device=input_tensor.device)
+
+        if torch.onnx.is_in_onnx_export():  # export path: delegated to DefaultExportToOnnx under the op name 'Expand'
+            return DefaultExportToOnnx.export(_forward, 'Expand', input_tensor, shape, {})
+
+        return _forward()
+
+
+@add_converter(operation_type='Expand', version=8)
+@add_converter(operation_type='Expand', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # OnnxExpand takes no configuration; the target shape arrives as a runtime input.
+    expand_module = OnnxExpand()
+    node_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=expand_module, onnx_mapping=node_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/eye_like.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/eye_like.py
new file mode 100644
index 000000000..dc13c54fb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/eye_like.py
@@ -0,0 +1,67 @@
+__all__ = [
+    'OnnxEyeLike',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.dtype import onnx_dtype_to_torch_dtype
+
+
+class OnnxEyeLike(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, dtype: Optional[int] = None, k: int = 0):  # pylint: disable=invalid-name
+        super().__init__()
+        self.dtype = dtype
+        self.k = k  # pylint: disable=invalid-name
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        # Returns a 2-D tensor shaped like x with ones on the k-th diagonal and zeros elsewhere.
+        # It is created on x's device; its dtype is x's unless an ONNX dtype was configured.
+        if len(x.shape) != 2:
+            raise ValueError(f'EyeLike only supports 2D tensors, got {len(x.shape)}')
+
+        # self.dtype is the raw ONNX dtype attribute; it is translated on every call.
+        dtype = x.dtype if self.dtype is None else onnx_dtype_to_torch_dtype(self.dtype)
+        if not isinstance(dtype, torch.dtype):
+            raise ValueError(f'Expected type of dtype is torch.dtype, got {type(dtype)}')
+
+        rows, cols = x.size()
+        # A positive k shifts the diagonal right (bounded by cols), a negative k shifts it down (bounded by rows).
+        if self.k > cols or -self.k > rows:
+            raise ValueError(
+                f'EyeLike attribute k must select a diagonal inside the {rows}x{cols} input, got {self.k}'
+            )
+
+        factory_kwargs = {'dtype': dtype, 'device': x.device}
+        if self.k == 0:
+            return torch.eye(n=rows, m=cols, **factory_kwargs)
+        if self.k > 0:
+            # Upper diagonal: k zero columns followed by a narrower identity block.
+            return torch.concat(
+                [torch.zeros(rows, self.k, **factory_kwargs), torch.eye(n=rows, m=cols - self.k, **factory_kwargs)],
+                dim=1,
+            )
+        # Lower diagonal (k < 0): -k zero rows followed by a shorter identity block.
+        return torch.concat(
+            [torch.zeros(-self.k, cols, **factory_kwargs), torch.eye(n=rows + self.k, m=cols, **factory_kwargs)],
+            dim=0,
+        )
+
+
+@add_converter(operation_type='EyeLike', version=9)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # Defaults when the attributes are absent: main diagonal (k=0) and no explicit dtype.
+    attributes = node.attributes
+    diagonal_offset = attributes.get('k', 0)
+    onnx_dtype = attributes.get('dtype', None)
+    eye_module = OnnxEyeLike(dtype=onnx_dtype, k=diagonal_offset)
+    node_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=eye_module, onnx_mapping=node_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/flatten.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/flatten.py
new file mode 100644
index 000000000..07d530407
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/flatten.py
@@ -0,0 +1,39 @@
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxFlatten(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    # Reshapes the input to 2-D: dims before `axis` are merged into dim 0, the remaining dims into dim 1.
+    def __init__(self, axis: int = 1):
+        super().__init__()
+        self.axis = axis
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        if self.axis == 0:  # ONNX wants shape (1, N) here; end_dim=-1 below would leave a 1-D tensor and fail
+            return torch.flatten(x).unsqueeze(0)
+        x = torch.flatten(x, end_dim=self.axis - 1)
+        return torch.flatten(x, start_dim=1)
+
+    @classmethod
+    def maybe_create_simple_flatten(cls, axis: int = 1) -> nn.Module:  # pylint: disable=missing-docstring
+        return nn.Flatten(start_dim=axis) if axis == 1 else cls(axis=axis)  # stock nn.Flatten covers axis == 1
+
+
+@add_converter(operation_type='Flatten', version=13)
+@add_converter(operation_type='Flatten', version=11)
+@add_converter(operation_type='Flatten', version=9)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The factory chooses between nn.Flatten and OnnxFlatten based on the axis (default 1).
+    flatten_axis = node.attributes.get('axis', 1)
+    flatten_module = OnnxFlatten.maybe_create_simple_flatten(axis=flatten_axis)
+    return OperationConverterResult(
+        torch_module=flatten_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/functions.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/functions.py
new file mode 100644
index 000000000..aee6875ac
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/functions.py
@@ -0,0 +1,60 @@
+__all__ = [
+    'OnnxFunction',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+# atanh, asinh, acosh, cosh and sinh are not listed: exporting them from PyTorch to ONNX is not supported.
+_TORCH_FUNCTION_FROM_ONNX_TYPE = {
+    'Abs': torch.abs,
+    'Acos': torch.acos,
+    'Asin': torch.asin,
+    'Atan': torch.atan,
+    'Cos': torch.cos,
+    'Exp': torch.exp,
+    'Log': torch.log,
+    'Sign': torch.sign,
+    'Sin': torch.sin,
+    'Tan': torch.tan,
+    'Tanh': torch.tanh,
+}
+
+
+class OnnxFunction(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, function_type: str):
+        super().__init__()
+        self.function = _TORCH_FUNCTION_FROM_ONNX_TYPE[function_type]  # looked up once, at construction
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return self.function(input_tensor)  # applies the selected torch function to the single input
+
+
+@add_converter(operation_type='Abs', version=13)
+@add_converter(operation_type='Abs', version=6)
+@add_converter(operation_type='Acos', version=7)
+@add_converter(operation_type='Asin', version=7)
+@add_converter(operation_type='Atan', version=7)
+@add_converter(operation_type='Cos', version=7)
+@add_converter(operation_type='Exp', version=6)
+@add_converter(operation_type='Exp', version=13)
+@add_converter(operation_type='Log', version=13)
+@add_converter(operation_type='Log', version=6)
+@add_converter(operation_type='Sign', version=13)
+@add_converter(operation_type='Sign', version=9)
+@add_converter(operation_type='Sin', version=7)
+@add_converter(operation_type='Tan', version=7)
+@add_converter(operation_type='Tanh', version=13)
+@add_converter(operation_type='Tanh', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The node's own operation type picks the torch function wrapped by the module.
+    function_module = OnnxFunction(node.operation_type)
+    node_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=function_module, onnx_mapping=node_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gather.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gather.py
new file mode 100644
index 000000000..f89c1b181
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gather.py
@@ -0,0 +1,171 @@
+__all__ = [
+    'OnnxGather',
+    'OnnxGatherElements',
+    'OnnxGatherND',
+]
+
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+from onnx2torch.utils.indices import upcast_indices
+
+
+class OnnxGatherElements(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, axis: int = 0):
+        super().__init__()
+        self._axis = axis  # dimension passed to torch.gather
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        indices: torch.Tensor,  # passed through upcast_indices before being used as the gather index
+    ) -> torch.Tensor:
+        return torch.gather(input_tensor, dim=self._axis, index=upcast_indices(indices))
+
+
+class OnnxGather(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """ONNX Gather implementation (equivalent to numpy.take along one axis)."""
+
+    def __init__(self, axis: int = 0):
+        super().__init__()
+        self._axis = axis
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:  # opset_version is not used here
+        return {'axis_i': self._axis}
+
+    @staticmethod
+    def slice_from_axis(  # pylint: disable=missing-docstring
+        input_tensor: torch.Tensor,
+        axis: int,
+        indices: torch.Tensor,
+    ) -> Tuple[Union[slice, torch.Tensor], ...]:
+        axis = input_tensor.dim() + axis if axis < 0 else axis  # normalise a negative axis
+        skip_axis: List[Union[slice, torch.Tensor]] = [slice(None)] * axis  # full slices for the dims before axis
+        skip_axis.append(upcast_indices(indices))
+        return tuple(skip_axis)
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self, input_tensor: torch.Tensor, indices: torch.Tensor
+    ) -> torch.Tensor:
+        def _forward():
+            # ONNX Gather behaves like numpy.take, which differs from torch.gather, and torch.take
+            # cannot select along an arbitrary axis, so the operation is emulated with indexing:
+            # numpy.take(data, indices, axis) equals data[:, ..., indices] with `axis` leading full slices.
+            slice_for_take = self.slice_from_axis(input_tensor, self._axis, indices)
+            return input_tensor[slice_for_take]
+
+        if torch.onnx.is_in_onnx_export():  # export path: delegated to DefaultExportToOnnx under the op name 'Gather'
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(_forward, 'Gather', input_tensor, indices, onnx_attrs)
+
+        return _forward()
+
+
+class OnnxGatherND(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """ONNX GatherND implementation (only batch_dims == 0 is computed)."""
+
+    def __init__(self, batch_dims: int = 0):
+        super().__init__()
+        self._batch_dims: int = batch_dims
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        onnx_attrs: Dict[str, Any] = {}
+
+        if opset_version == 11:  # for opset 11 no attribute is emitted and batch_dims must be 0
+            if self._batch_dims != 0:
+                raise ValueError(f'GatherND from opset 11 does not support batch_dims != 0, got {self._batch_dims}')
+            return onnx_attrs
+
+        onnx_attrs['batch_dims_i'] = self._batch_dims
+        return onnx_attrs
+
+    def forward(self, input_tensor: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:  # pylint: disable=C0116
+        def _forward():
+            return self._gather_nd(data=input_tensor, indices=indices, batch_dims=self._batch_dims)
+
+        if torch.onnx.is_in_onnx_export():  # export path: delegated to DefaultExportToOnnx under the op name 'GatherND'
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(_forward, 'GatherND', input_tensor, indices, onnx_attrs)
+
+        return _forward()
+
+    @staticmethod
+    def _gather_nd(data: torch.Tensor, indices: torch.Tensor, batch_dims: int) -> torch.Tensor:
+        if batch_dims != 0:
+            raise NotImplementedError('GatherND for batch_dims != 0 is not implemented')
+
+        r, m = len(data.shape), indices.shape[-1]  # pylint: disable=C0103
+        if m > r or m < 1:  # r: rank of data, m: length of each index tuple
+            raise ValueError(
+                f'The last dimension of indices should have a value between 1 (inclusive) and data rank (inclusive), '
+                f'got {m} and {r} respectively'
+            )
+
+        total_samples = indices.shape[:-1].numel()  # number of index tuples
+        output_shape = indices.shape[:-1] + data.shape[m:]  # one slice of data per index tuple
+        indices_ = torch.split(
+            tensor=indices.reshape(total_samples, m).transpose(0, 1),  # -> (m, total_samples): one row per indexed dim
+            split_size_or_sections=1,
+            dim=0,
+        )
+
+        return data[indices_].reshape(output_shape).contiguous()  # tuple of m index tensors indexes the first m dims
+
+
+@add_converter(operation_type='Gather', version=1)
+@add_converter(operation_type='Gather', version=11)
+@add_converter(operation_type='Gather', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The gather axis is a node attribute and falls back to 0 when it is not set.
+    gather_axis = node.attributes.get('axis', 0)
+    gather_module = OnnxGather(axis=gather_axis)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=gather_module,
+        onnx_mapping=node_mapping,
+    )
+
+
+@add_converter(operation_type='GatherElements', version=11)
+@add_converter(operation_type='GatherElements', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # The gather axis is a node attribute and falls back to 0 when it is not set.
+    elements_axis = node.attributes.get('axis', 0)
+    elements_module = OnnxGatherElements(axis=elements_axis)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=elements_module,
+        onnx_mapping=node_mapping,
+    )
+
+
+@add_converter(operation_type='GatherND', version=11)
+@add_converter(operation_type='GatherND', version=12)
+@add_converter(operation_type='GatherND', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    # batch_dims is a node attribute and falls back to 0 when it is not set.
+    leading_batch_dims = node.attributes.get('batch_dims', 0)
+    gather_nd_module = OnnxGatherND(batch_dims=leading_batch_dims)
+    node_mapping = onnx_mapping_from_node(node=node)
+
+    return OperationConverterResult(
+        torch_module=gather_nd_module,
+        onnx_mapping=node_mapping,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gemm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gemm.py
new file mode 100644
index 000000000..b49f734b5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/gemm.py
@@ -0,0 +1,98 @@
+__all__ = [
+    'OnnxGemm',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxGemm(nn.Module, OnnxToTorchModule):
+    """Functional form of ONNX ``Gemm``: ``alpha * (A' @ B') + beta * C``.
+
+    ``A'`` / ``B'`` are the inputs with dimensions 0 and 1 swapped when the
+    matching ``trans_a`` / ``trans_b`` flag is set.
+    """
+
+    def __init__(self, alpha: float, beta: float, trans_a: bool, trans_b: bool):
+        super().__init__()
+
+        self.alpha, self.beta = alpha, beta
+        self.trans_a, self.trans_b = trans_a, trans_b
+
+    def forward(
+        self,
+        input_a: torch.Tensor,
+        input_b: torch.Tensor,
+        input_c: Optional[torch.Tensor] = None,
+    ):
+        """Return the scaled product of the two operands, plus ``beta * input_c`` when it is given."""
+        lhs = input_a.transpose(0, 1) if self.trans_a else input_a
+        rhs = input_b.transpose(0, 1) if self.trans_b else input_b
+
+        output = torch.matmul(lhs, rhs) * self.alpha
+        if input_c is None:
+            return output
+
+        # The bias term is accumulated in place into the freshly computed product.
+        output += input_c * self.beta
+        return output
+
+
+@add_converter(operation_type='Gemm', version=9)
+@add_converter(operation_type='Gemm', version=11)
+@add_converter(operation_type='Gemm', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``Gemm`` node into ``nn.Linear`` when possible, else ``OnnxGemm``.
+
+    The ``nn.Linear`` form is used when A is not transposed, B is a graph
+    initializer, and C is either absent or a one-dimensional graph initializer.
+    ``alpha`` is then folded into the weight and ``beta`` into the bias, and only
+    A remains a runtime input. Every other case maps all node inputs onto an
+    ``OnnxGemm`` module.
+    """
+    operand_names = node.input_values
+    a_name = operand_names[0]
+    b_name = operand_names[1]
+    c_name = operand_names[2] if len(operand_names) > 2 else None
+
+    attrs = node.attributes
+    alpha = attrs.get('alpha', 1.0)
+    beta = attrs.get('beta', 1.0)
+    trans_a = attrs.get('transA', 0) != 0
+    trans_b = attrs.get('transB', 0) != 0
+
+    foldable = (
+        not trans_a
+        and b_name in graph.initializers
+        and (c_name is None or c_name in graph.initializers)
+    )
+    bias = None
+    if foldable and c_name is not None:
+        bias = graph.initializers[c_name].to_torch()
+
+    if foldable and (bias is None or bias.dim() == 1):
+        # nn.Linear keeps its weight as (out_features, in_features), so an
+        # untransposed B has to be flipped first.
+        weight = graph.initializers[b_name].to_torch()
+        if not trans_b:
+            weight = weight.T
+
+        linear = nn.Linear(
+            in_features=weight.shape[1],
+            out_features=weight.shape[0],
+            bias=bias is not None,
+        )
+        with torch.no_grad():
+            linear.weight.data = weight * alpha
+            if bias is not None:
+                linear.bias.data = bias * beta
+
+        return OperationConverterResult(
+            torch_module=linear,
+            onnx_mapping=OnnxMapping(inputs=(a_name,), outputs=node.output_values),
+        )
+
+    generic_gemm = OnnxGemm(alpha=alpha, beta=beta, trans_a=trans_a, trans_b=trans_b)
+    return OperationConverterResult(torch_module=generic_gemm, onnx_mapping=onnx_mapping_from_node(node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/global_average_pool.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/global_average_pool.py
new file mode 100644
index 000000000..4e27287ab
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/global_average_pool.py
@@ -0,0 +1,62 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxGlobalAveragePool',
+    'OnnxGlobalAveragePoolWithKnownInputShape',
+]
+
+from typing import List
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxGlobalAveragePool(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Global average pooling that reads the input rank from the tensor at call time."""
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Average ``input_tensor`` over every dimension from index 2 on, keeping them as size 1.
+
+        While an ONNX export is in progress the computation is routed through
+        ``DefaultExportToOnnx.export`` under the op name ``GlobalAveragePool``.
+        """
+
+        def _forward() -> torch.Tensor:
+            pooled_dims = list(range(2, input_tensor.dim()))
+            return input_tensor.mean(dim=pooled_dims, keepdim=True)
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        return DefaultExportToOnnx.export(_forward, 'GlobalAveragePool', input_tensor, {})
+
+
+class OnnxGlobalAveragePoolWithKnownInputShape(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Global average pooling whose reduced dimensions are fixed at construction time."""
+
+    def __init__(self, input_shape: List[int]):
+        super().__init__()
+        # Every dimension from index 2 up to the rank of ``input_shape`` is averaged.
+        self._x_dims = [*range(2, len(input_shape))]
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Average ``input_tensor`` over the precomputed dimensions, keeping them as size 1.
+
+        While an ONNX export is in progress the computation is routed through
+        ``DefaultExportToOnnx.export`` under the op name ``GlobalAveragePool``.
+        """
+
+        def _forward() -> torch.Tensor:
+            return input_tensor.mean(dim=self._x_dims, keepdim=True)
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        return DefaultExportToOnnx.export(_forward, 'GlobalAveragePool', input_tensor, {})
+
+
+@add_converter(operation_type='GlobalAveragePool', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``GlobalAveragePool`` node.
+
+    Uses the known-shape module when ``get_shape_from_value_info`` yields a shape
+    for the node's first input, and the shape-agnostic module when it yields
+    ``None``.
+    """
+    input_shape = get_shape_from_value_info(graph.value_info[node.input_values[0]])
+
+    if input_shape is None:
+        pooling_module = OnnxGlobalAveragePool()
+    else:
+        pooling_module = OnnxGlobalAveragePoolWithKnownInputShape(input_shape=input_shape)
+
+    return OperationConverterResult(torch_module=pooling_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/identity.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/identity.py
new file mode 100644
index 000000000..d304f929b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/identity.py
@@ -0,0 +1,29 @@
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxCopyIdentity(nn.Module, OnnxToTorchModule):
+    """Identity that hands back a copy of its input rather than the input object itself."""
+
+    def forward(self, x: torch.Tensor):
+        """Return a clone of ``x``."""
+        return torch.clone(x)
+
+
+@add_converter(operation_type='Identity', version=16)
+@add_converter(operation_type='Identity', version=14)
+@add_converter(operation_type='Identity', version=13)
+@add_converter(operation_type='Identity', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Identity`` node into a copying module.
+
+    A plain pass-through is not used because ONNX Identity creates a new
+    tensor, whereas a PyTorch identity returns the very same tensor. That
+    would ruin quantization logic, which marks quantized tensors: for example,
+    an input quantization node is suppressed when its input tensor is already
+    quantized.
+    """
+    copy_module = OnnxCopyIdentity()
+    return OperationConverterResult(torch_module=copy_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/instance_norm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/instance_norm.py
new file mode 100644
index 000000000..e0acb9967
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/instance_norm.py
@@ -0,0 +1,88 @@
+__all__ = [
+    'OnnxInstanceNorm',
+]
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+# Maps the number of spatial dimensions (input rank minus 2, as computed by the
+# InstanceNormalization converter in this module) to the torch InstanceNorm
+# module class used for it. Ranks 0 and 1 both use InstanceNorm1d.
+_IN_CLASS_FROM_SPATIAL_RANK = {
+    0: nn.InstanceNorm1d,
+    1: nn.InstanceNorm1d,
+    2: nn.InstanceNorm2d,
+    3: nn.InstanceNorm3d,
+}
+
+
+class OnnxInstanceNorm(nn.Module, OnnxToTorchModule):
+    """Instance normalization that receives its scale and bias as forward inputs."""
+
+    def __init__(self, momentum: float, epsilon: float):
+        super().__init__()
+        self.momentum, self.epsilon = momentum, epsilon
+
+    def forward(
+        self,
+        input_data: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+    ) -> torch.Tensor:
+        """Normalize ``input_data`` with statistics taken from the input itself.
+
+        No running mean/variance is supplied; ``weight`` and ``bias`` are passed
+        straight to ``F.instance_norm``.
+        """
+        norm_kwargs = {
+            'running_mean': None,
+            'running_var': None,
+            'weight': weight,
+            'bias': bias,
+            'use_input_stats': True,
+            'momentum': self.momentum,
+            'eps': self.epsilon,
+        }
+        return F.instance_norm(input=input_data, **norm_kwargs)
+
+
+@add_converter(operation_type='InstanceNormalization', version=1)
+@add_converter(operation_type='InstanceNormalization', version=6)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``InstanceNormalization`` node.
+
+    When every input after the first is a graph initializer and the shape of
+    the first input is known, the node becomes a torch ``InstanceNorm{1,2,3}d``
+    module with scale and bias copied into its parameters, leaving the data
+    tensor as the only runtime input. Otherwise it becomes an
+    ``OnnxInstanceNorm`` that receives all node inputs at call time.
+
+    ``epsilon`` defaults to 1e-5; ``momentum`` is fixed at 0.1.
+
+    Raises:
+        NotImplementedError: if the spatial rank (input rank minus 2) has no
+            entry in ``_IN_CLASS_FROM_SPATIAL_RANK``.
+    """
+    epsilon = node.attributes.get('epsilon', 1e-5)
+    momentum = 0.1
+
+    input_shape = None
+    if all(value_name in graph.initializers for value_name in node.input_values[1:]):
+        input_value_info = graph.value_info[node.input_values[0]]
+        input_shape = get_shape_from_value_info(input_value_info)
+
+    if input_shape is None:
+        # Either scale/bias arrive at run time, or the input shape is not
+        # available and the module class cannot be chosen by rank. The
+        # functional module needs neither, so use it instead of failing on
+        # ``len(None)``.
+        return OperationConverterResult(
+            torch_module=OnnxInstanceNorm(momentum=momentum, epsilon=epsilon),
+            onnx_mapping=onnx_mapping_from_node(node),
+        )
+
+    spatial_rank = len(input_shape) - 2
+    try:
+        in_class = _IN_CLASS_FROM_SPATIAL_RANK[spatial_rank]
+    except KeyError as exc:
+        raise NotImplementedError(
+            f'InstanceNorm operation with spatial rank == {spatial_rank} is not implemented'
+        ) from exc
+
+    scale = graph.initializers[node.input_values[1]].to_torch()
+    bias = graph.initializers[node.input_values[2]].to_torch()
+
+    torch_module = in_class(
+        num_features=scale.size()[0],
+        eps=epsilon,
+        momentum=momentum,
+        affine=True,
+        track_running_stats=False,
+    )
+    with torch.no_grad():
+        torch_module.weight.data = scale
+        torch_module.bias.data = bias
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=OnnxMapping(inputs=(node.input_values[0],), outputs=node.output_values),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isinf.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isinf.py
new file mode 100644
index 000000000..00130d952
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isinf.py
@@ -0,0 +1,33 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxIsInf',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+
+
+class OnnxIsInf(nn.Module, OnnxToTorchModule):
+    """Element-wise infinity test for ONNX ``IsInf``.
+
+    Args:
+        detect_negative: whether negative infinity maps to ``True``.
+        detect_positive: whether positive infinity maps to ``True``.
+
+    Both flags default to ``True``, in which case the result is exactly
+    ``torch.isinf``.
+    """
+
+    def __init__(self, detect_negative: bool = True, detect_positive: bool = True):
+        super().__init__()
+        self.detect_negative = detect_negative
+        self.detect_positive = detect_positive
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Return a boolean tensor marking the infinities selected by the two flags."""
+        is_inf = torch.isinf(input_tensor)
+        if self.detect_negative and self.detect_positive:
+            return is_inf
+        if self.detect_positive:
+            return torch.logical_and(is_inf, input_tensor > 0)
+        if self.detect_negative:
+            return torch.logical_and(is_inf, input_tensor < 0)
+
+        # Neither sign is requested, so no element counts as infinite.
+        return torch.zeros_like(is_inf)
+
+
+@add_converter(operation_type='IsInf', version=10)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``IsInf`` node, honouring ``detect_negative`` / ``detect_positive``.
+
+    Both attributes default to 1 when absent. Only the node's first input is
+    mapped; ``graph`` is not used.
+    """
+    del graph
+    torch_module = OnnxIsInf(
+        detect_negative=node.attributes.get('detect_negative', 1) != 0,
+        detect_positive=node.attributes.get('detect_positive', 1) != 0,
+    )
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=OnnxMapping(
+            inputs=(node.input_values[0],),
+            outputs=node.output_values,
+        ),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isnan.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isnan.py
new file mode 100644
index 000000000..d491e5a2d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/isnan.py
@@ -0,0 +1,33 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxIsNaN',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+
+
+class OnnxIsNaN(nn.Module, OnnxToTorchModule):
+    """Element-wise NaN test."""
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Return the result of ``isnan`` applied to ``input_tensor``."""
+        return input_tensor.isnan()
+
+
+@add_converter(operation_type='IsNaN', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``IsNaN`` node; only its first input is mapped and ``graph`` is unused."""
+    del graph
+
+    single_input_mapping = OnnxMapping(inputs=(node.input_values[0],), outputs=node.output_values)
+    return OperationConverterResult(torch_module=OnnxIsNaN(), onnx_mapping=single_input_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/layer_norm.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/layer_norm.py
new file mode 100644
index 000000000..c31c7a2e7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/layer_norm.py
@@ -0,0 +1,78 @@
+__all__ = [
+    'OnnxLayerNorm',
+]
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+# Fallbacks used by the LayerNormalization converter in this module when the
+# node omits the corresponding ``axis`` / ``epsilon`` attribute.
+AXIS_DEFAULT_VALUE = -1
+EPSILON_DEFAULT_VALUE = 1e-5
+
+
+class OnnxLayerNorm(nn.Module, OnnxToTorchModule):
+    """Layer normalization that receives its scale (and optional bias) as forward inputs."""
+
+    def __init__(self, axis: int, epsilon: float):
+        super().__init__()
+        self.axis, self.epsilon = axis, epsilon
+
+    def forward(
+        self,
+        inputs: torch.Tensor,
+        scale: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Normalize ``inputs`` over the dimensions from ``self.axis`` to the last one."""
+        # The normalized shape is taken from the runtime tensor, so no static shape is needed.
+        trailing_shape = inputs.shape[self.axis :]
+        return F.layer_norm(inputs, trailing_shape, weight=scale, bias=bias, eps=self.epsilon)
+
+
+@add_converter(operation_type='LayerNormalization', version=17)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``LayerNormalization`` node.
+
+    When every input after the first is a graph initializer and the shape of
+    the first input is known, the node becomes an ``nn.LayerNorm`` with the
+    initializers copied into its parameters, leaving the data tensor as the
+    only runtime input. Otherwise it becomes an ``OnnxLayerNorm`` that receives
+    all node inputs at call time.
+
+    ``axis`` and ``epsilon`` fall back to ``AXIS_DEFAULT_VALUE`` and
+    ``EPSILON_DEFAULT_VALUE``.
+    """
+    node_attributes = node.attributes
+    axis = node_attributes.get('axis', AXIS_DEFAULT_VALUE)
+    epsilon = node_attributes.get('epsilon', EPSILON_DEFAULT_VALUE)
+
+    input_shape = None
+    if all(value_name in graph.initializers for value_name in node.input_values[1:]):
+        input_value_info = graph.value_info[node.input_values[0]]
+        input_shape = get_shape_from_value_info(input_value_info)
+
+    if input_shape is None:
+        # OnnxLayerNorm derives the normalized shape from the runtime tensor, so
+        # it needs no static shape. No value-info lookup is done for runtime
+        # scale/bias because its result would be unused, and an unavailable
+        # shape lands here instead of failing on ``None[axis:]``.
+        return OperationConverterResult(
+            torch_module=OnnxLayerNorm(axis=axis, epsilon=epsilon),
+            onnx_mapping=onnx_mapping_from_node(node),
+        )
+
+    torch_module = nn.LayerNorm(
+        normalized_shape=input_shape[axis:],
+        eps=epsilon,
+        elementwise_affine=True,
+    )
+
+    scale_value_name = node.input_values[1]
+    bias_value_name = node.input_values[2] if len(node.input_values) > 2 else None
+
+    with torch.no_grad():
+        torch_module.weight.data = graph.initializers[scale_value_name].to_torch()
+        # Without a bias input the module keeps the bias it was constructed with.
+        if bias_value_name is not None:
+            torch_module.bias.data = graph.initializers[bias_value_name].to_torch()
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=OnnxMapping(inputs=(node.input_values[0],), outputs=node.output_values),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/logical.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/logical.py
new file mode 100644
index 000000000..ac5b46811
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/logical.py
@@ -0,0 +1,79 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxNot',
+    'OnnxLogical',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import old_style_broadcast
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+# ONNX logical operation type -> torch function that OnnxLogical applies to its
+# two operands.
+_TORCH_FUNCTION_FROM_ONNX_TYPE = {
+    'Or': torch.logical_or,
+    'And': torch.logical_and,
+    'Xor': torch.logical_xor,
+}
+
+
+class OnnxNot(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Element-wise logical negation."""
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Return ``logical_not`` of ``input_tensor``.
+
+        While an ONNX export is in progress the computation is routed through
+        ``DefaultExportToOnnx.export`` under the op name ``Not``.
+        """
+
+        def _forward() -> torch.Tensor:
+            return input_tensor.logical_not()
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        return DefaultExportToOnnx.export(_forward, 'Not', input_tensor, {})
+
+
+class OnnxLogical(nn.Module, OnnxToTorchModule):
+    """Binary logical operation (``Or``, ``And`` or ``Xor``) selected by ``operation_type``.
+
+    When ``broadcast == 1`` and ``axis`` is given, the second operand is first
+    passed through ``old_style_broadcast`` against the first operand.
+    """
+
+    def __init__(self, operation_type: str, broadcast: Optional[int] = None, axis: Optional[int] = None):
+        super().__init__()
+        self.broadcast, self.axis = broadcast, axis
+
+        # An operation type missing from the table fails here with KeyError.
+        self.logic_op_function = _TORCH_FUNCTION_FROM_ONNX_TYPE[operation_type]
+
+    def forward(self, first_tensor: torch.Tensor, second_tensor: torch.Tensor):
+        """Apply the selected logical function to the two operands."""
+        legacy_broadcast = self.broadcast == 1 and self.axis is not None
+        rhs = old_style_broadcast(first_tensor, second_tensor, self.axis) if legacy_broadcast else second_tensor
+        return self.logic_op_function(first_tensor, rhs)
+
+
+@add_converter(operation_type='Xor', version=1)
+@add_converter(operation_type='Xor', version=7)
+@add_converter(operation_type='And', version=1)
+@add_converter(operation_type='And', version=7)
+@add_converter(operation_type='Or', version=1)
+@add_converter(operation_type='Or', version=7)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``Xor`` / ``And`` / ``Or`` node into an ``OnnxLogical`` module.
+
+    The node's operation type picks the logical function; the optional
+    ``broadcast`` and ``axis`` attributes are forwarded (``None`` when absent).
+    ``graph`` is unused.
+    """
+    del graph
+
+    attrs = node.attributes
+    logical_module = OnnxLogical(
+        operation_type=node.operation_type,
+        broadcast=attrs.get('broadcast', None),
+        axis=attrs.get('axis', None),
+    )
+    return OperationConverterResult(torch_module=logical_module, onnx_mapping=onnx_mapping_from_node(node=node))
+
+
+@add_converter(operation_type='Not', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``Not`` node into an ``OnnxNot`` module; ``graph`` is unused."""
+    del graph
+
+    not_module = OnnxNot()
+    return OperationConverterResult(torch_module=not_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/lrn.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/lrn.py
new file mode 100644
index 000000000..f79928c91
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/lrn.py
@@ -0,0 +1,23 @@
+__all__ = []
+
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+@add_converter(operation_type='LRN', version=13)
+@add_converter(operation_type='LRN', version=1)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``LRN`` node into ``nn.LocalResponseNorm``.
+
+    ``size`` is mandatory; ``alpha``, ``beta`` and ``bias`` fall back to 0.0001,
+    0.75 and 1. The ONNX ``bias`` attribute is passed as torch's ``k``.
+    ``graph`` is not consulted.
+
+    Raises:
+        KeyError: if the node carries no ``size`` attribute.
+    """
+    attrs = node.attributes
+    # Look ``size`` up strictly so a node without it is rejected here instead
+    # of handing ``size=None`` to nn.LocalResponseNorm.
+    size = attrs['size']
+    alpha = attrs.get('alpha', 0.0001)
+    beta = attrs.get('beta', 0.75)
+    k = attrs.get('bias', 1)  # pylint: disable=invalid-name
+
+    return OperationConverterResult(
+        torch_module=nn.LocalResponseNorm(size=size, alpha=alpha, beta=beta, k=k),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/matmul.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/matmul.py
new file mode 100644
index 000000000..32d5aa567
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/matmul.py
@@ -0,0 +1,28 @@
+__all__ = [
+    'OnnxMatMul',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxMatMul(nn.Module, OnnxToTorchModule):
+    """Matrix product of two tensors."""
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """Return ``x @ y``."""
+        return x @ y
+
+
+@add_converter(operation_type='MatMul', version=1)
+@add_converter(operation_type='MatMul', version=9)
+@add_converter(operation_type='MatMul', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``MatMul`` node into an ``OnnxMatMul`` module; ``graph`` is not consulted."""
+    matmul_module = OnnxMatMul()
+    return OperationConverterResult(torch_module=matmul_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/max_pool.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/max_pool.py
new file mode 100644
index 000000000..a4fb11577
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/max_pool.py
@@ -0,0 +1,65 @@
+__all__ = []
+
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_shape_from_value_info
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.padding import onnx_auto_pad_to_torch_padding
+
+# Maps the number of spatial dimensions (input rank minus 2, as computed by the
+# MaxPool converter in this module) to the torch max-pooling module class.
+_MAXPOOL_CLASS_FROM_SPATIAL_RANK = {
+    1: nn.MaxPool1d,
+    2: nn.MaxPool2d,
+    3: nn.MaxPool3d,
+}
+
+
+@add_converter(operation_type='MaxPool', version=12)
+@add_converter(operation_type='MaxPool', version=11)
+@add_converter(operation_type='MaxPool', version=10)
+@add_converter(operation_type='MaxPool', version=8)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert an ONNX ``MaxPool`` node into a torch ``MaxPool{1,2,3}d`` module.
+
+    The module class is chosen by the spatial rank (input rank minus 2).
+    ``kernel_shape`` is required; ``ceil_mode``, ``dilations``, ``strides``,
+    ``pads`` and ``auto_pad`` are optional. If the padding helper returns a
+    separate padding module, it is placed in front of the pooling module and
+    pads with ``-inf``.
+
+    Raises:
+        NotImplementedError: if the spatial rank has no entry in
+            ``_MAXPOOL_CLASS_FROM_SPATIAL_RANK`` or ``storage_order`` is not 0.
+    """
+    input_shape = get_shape_from_value_info(graph.value_info[node.input_values[0]])
+
+    spatial_rank = len(input_shape) - 2
+    try:
+        maxpool_class = _MAXPOOL_CLASS_FROM_SPATIAL_RANK[spatial_rank]
+    except KeyError as exc:
+        raise NotImplementedError(f'Max pool operation with spatial rank == {spatial_rank} is not implemented') from exc
+
+    attrs = node.attributes
+    kernel_shape = attrs['kernel_shape']  # the only required attribute
+    ceil_mode = attrs.get('ceil_mode', 0)
+    dilation = attrs.get('dilations', 1)
+    strides = attrs.get('strides', 1)
+    if attrs.get('storage_order', 0) != 0:
+        raise NotImplementedError('Only row major (0) order is supported.')
+
+    padding, padding_module = onnx_auto_pad_to_torch_padding(
+        onnx_padding=attrs.get('pads', [0] * spatial_rank * 2),
+        auto_pad=attrs.get('auto_pad', 'NOTSET'),
+    )
+
+    pooling_module = maxpool_class(
+        kernel_size=kernel_shape,
+        stride=strides,
+        padding=padding,
+        dilation=dilation,
+        ceil_mode=ceil_mode == 1,
+    )
+    if padding_module is None:
+        torch_module = pooling_module
+    else:
+        # MaxPool must ignore padded values, so the explicit padding uses -inf.
+        padding_module.constant_value = float('-inf')
+        torch_module = nn.Sequential(padding_module, pooling_module)
+
+    return OperationConverterResult(torch_module=torch_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mean.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mean.py
new file mode 100644
index 000000000..19ebef0a7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mean.py
@@ -0,0 +1,36 @@
+__all__ = [
+    'OnnxMean',
+]
+
+import torch
+
+from onnx2torch.node_converters.base_element_wise import OnnxBaseElementWise
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxMean(OnnxBaseElementWise):
+    """Element-wise mean over any number of input tensors."""
+
+    def __init__(self):
+        super().__init__(op_type='Mean')
+
+    def apply_reduction(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """Sum all ``tensors`` into one accumulator and divide by their count.
+
+        The accumulator has the shape returned by ``self._broadcast_shape`` and
+        the dtype and device of the first tensor.
+        """
+        first = tensors[0]
+        total = torch.zeros(self._broadcast_shape(*tensors), dtype=first.dtype, device=first.device)
+        for tensor in tensors:
+            total += tensor  # in-place accumulation
+
+        return total / len(tensors)
+
+
+@add_converter(operation_type='Mean', version=8)
+@add_converter(operation_type='Mean', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Mean`` node into an ``OnnxMean`` module; ``graph`` is not consulted."""
+    mean_module = OnnxMean()
+    return OperationConverterResult(torch_module=mean_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/min_max.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/min_max.py
new file mode 100644
index 000000000..03212f37a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/min_max.py
@@ -0,0 +1,43 @@
+__all__ = [
+    'OnnxMinMax',
+]
+
+import torch
+
+from onnx2torch.node_converters.base_element_wise import OnnxBaseElementWise
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxMinMax(OnnxBaseElementWise):
+    """Element-wise ``Min`` / ``Max`` over any number of input tensors."""
+
+    # Operation type -> torch reduction applied along the stacking dimension.
+    _OPERATORS = {'Min': torch.amin, 'Max': torch.amax}
+
+    def __init__(self, op_type: str):
+        super().__init__(op_type=op_type)
+        self._operator = self._OPERATORS[op_type]
+
+    def apply_reduction(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """Reduce ``tensors`` element-wise with ``amin`` or ``amax``.
+
+        Every tensor is expanded to the shape returned by
+        ``self._broadcast_shape``, the results are stacked along a new leading
+        dimension, and that dimension is reduced.
+        """
+        target_shape = self._broadcast_shape(*tensors)
+        expanded = [torch.broadcast_to(tensor, target_shape) for tensor in tensors]
+        return self._operator(torch.stack(expanded), dim=0)
+
+
+@add_converter(operation_type='Min', version=8)
+@add_converter(operation_type='Min', version=12)
+@add_converter(operation_type='Min', version=13)
+@add_converter(operation_type='Max', version=8)
+@add_converter(operation_type='Max', version=12)
+@add_converter(operation_type='Max', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Min`` or ``Max`` node into an ``OnnxMinMax`` module.
+
+    The node's operation type selects the reduction; ``graph`` is not consulted.
+    """
+    min_max_module = OnnxMinMax(node.operation_type)
+    return OperationConverterResult(torch_module=min_max_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mod.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mod.py
new file mode 100644
index 000000000..46dcfd05e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/mod.py
@@ -0,0 +1,36 @@
+__all__ = [
+    'OnnxMod',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxMod(nn.Module, OnnxToTorchModule):
+    """Element-wise modulus for ONNX ``Mod``.
+
+    A truthy ``fmod`` (1) computes ``torch.fmod``; ``fmod == 0`` computes
+    ``torch.remainder``.
+
+    Raises:
+        ValueError: if ``fmod`` is neither 0 nor 1.
+    """
+
+    def __init__(self, fmod: int):
+        super().__init__()
+        self.fmod = fmod
+
+        if self.fmod not in [0, 1]:
+            raise ValueError(f'OnnxMod fmod must be 0 or 1, but got {self.fmod}')
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """Return ``torch.fmod(x, y)`` or ``torch.remainder(x, y)``, depending on ``self.fmod``."""
+        return torch.fmod(x, y) if self.fmod else torch.remainder(x, y)
+
+
+@add_converter(operation_type='Mod', version=10)
+@add_converter(operation_type='Mod', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ONNX ``Mod`` node into an ``OnnxMod`` module.
+
+    The ``fmod`` attribute is forwarded to the module (0 when absent);
+    ``graph`` is not consulted.
+    """
+    mod_module = OnnxMod(fmod=node.attributes.get('fmod', 0))
+    return OperationConverterResult(torch_module=mod_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/neg.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/neg.py
new file mode 100644
index 000000000..5ede4c104
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/neg.py
@@ -0,0 +1,28 @@
+__all__ = [
+    'OnnxNeg',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxNeg(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def forward(self, input_tensor: torch.Tensor):  # pylint: disable=missing-function-docstring
+        return -input_tensor
+
+
+@add_converter(operation_type='Neg', version=1)
+@add_converter(operation_type='Neg', version=6)
+@add_converter(operation_type='Neg', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return OperationConverterResult(
+        torch_module=OnnxNeg(),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nms.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nms.py
new file mode 100644
index 000000000..c6b873171
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nms.py
@@ -0,0 +1,121 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxNonMaxSuppression',
+]
+
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+import torch
+import torchvision
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxNonMaxSuppression(nn.Module, OnnxToTorchModuleWithCustomExport):
+    def __init__(self, center_point_box: int = 0):
+        super().__init__()
+        self._center_point_box = center_point_box
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        del opset_version
+        return {'center_point_box_i': self._center_point_box}
+
+    def forward(
+        self,
+        boxes: torch.Tensor,
+        scores: torch.Tensor,
+        max_output_boxes_per_class: Optional[torch.Tensor] = None,
+        iou_threshold: Optional[torch.Tensor] = None,
+        score_threshold: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        def _forward() -> torch.Tensor:
+            return self._nms(boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold)
+
+        if torch.onnx.is_in_onnx_export():
+            if max_output_boxes_per_class is None:
+                max_output_boxes_per_class = torch.tensor([0], dtype=torch.int64)
+            if iou_threshold is None:
+                iou_threshold = torch.tensor([0.0], dtype=torch.float32)
+            if score_threshold is None:
+                score_threshold = torch.tensor([0.0], dtype=torch.float32)
+
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(
+                _forward,
+                'NonMaxSuppression',
+                boxes,
+                scores,
+                max_output_boxes_per_class,
+                iou_threshold,
+                score_threshold,
+                onnx_attrs,
+            )
+
+        return _forward()
+
+    def _nms(
+        self,
+        boxes: torch.Tensor,
+        scores: torch.Tensor,
+        max_output_boxes_per_class: Optional[torch.Tensor],
+        iou_threshold: Optional[torch.Tensor],
+        score_threshold: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        if max_output_boxes_per_class is None:
+            return torch.empty([0, 3], dtype=torch.int64, device=boxes.device)
+
+        max_output_boxes_per_class = max_output_boxes_per_class.item()
+        iou_threshold = 0.0 if iou_threshold is None else iou_threshold.item()
+        score_threshold = 0.0 if score_threshold is None else score_threshold.item()
+
+        out = []
+        # boxes - [bs, num_boxes, 4], scores - [bs, n_classes, num_boxes]
+        for batch_index, (batch_boxes, batch_scores) in enumerate(zip(boxes, scores)):
+            # bbox - [num_boxes, 4], score - [n_classes, num_boxes]
+            for class_index, class_scores in enumerate(batch_scores):
+                confidence_mask = class_scores > score_threshold
+                confidence_indexes = confidence_mask.nonzero(as_tuple=False).squeeze(1)
+
+                filtered_batch_boxes = batch_boxes[confidence_indexes]
+                if self._center_point_box:
+                    filtered_batch_boxes = torchvision.ops.box_convert(
+                        filtered_batch_boxes,
+                        in_fmt='cxcywh',
+                        out_fmt='xyxy',
+                    )
+
+                nms_indexes = torchvision.ops.nms(
+                    boxes=filtered_batch_boxes,
+                    scores=class_scores[confidence_indexes],
+                    iou_threshold=iou_threshold,
+                )
+                num_boxes = min(max_output_boxes_per_class, nms_indexes.size(0))
+                nms_indexes = nms_indexes[:num_boxes]
+                indexes = confidence_indexes[nms_indexes]
+
+                out.extend([batch_index, class_index, box_index] for box_index in indexes)
+        if len(out) == 0:
+            return torch.empty([0, 3], dtype=torch.int64, device=boxes.device)
+
+        return torch.tensor(out, dtype=torch.int64, device=boxes.device)
+
+
+@add_converter(operation_type='NonMaxSuppression', version=10)
+@add_converter(operation_type='NonMaxSuppression', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    del graph
+    center_point_box = node.attributes.get('center_point_box', 0)
+    return OperationConverterResult(
+        torch_module=OnnxNonMaxSuppression(center_point_box=center_point_box),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nonzero.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nonzero.py
new file mode 100644
index 000000000..4f8722834
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/nonzero.py
@@ -0,0 +1,33 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxNonZero',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+
+
+class OnnxNonZero(nn.Module, OnnxToTorchModule):
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        return torch.nonzero(input_tensor).transpose(0, 1)  # ONNX layout is [rank, count]; torch gives [count, rank]
+
+
+@add_converter(operation_type='NonZero', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    del graph
+    torch_module = OnnxNonZero()
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=OnnxMapping(
+            inputs=(node.input_values[0],),
+            outputs=node.output_values,
+        ),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pad.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pad.py
new file mode 100644
index 000000000..cb0dd1eb5
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pad.py
@@ -0,0 +1,141 @@
+__all__ = [
+    'OnnxPadStatic',
+    'OnnxPadDynamic',
+]
+
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_ONNX_TO_TORCH_MODE = {
+    'constant': 'constant',
+    'reflect': 'reflect',
+    'edge': 'replicate',
+}
+
+
+def _onnx_to_torch_mode(mode: str) -> str:
+    try:
+        return _ONNX_TO_TORCH_MODE[mode]
+    except KeyError as exc:
+        raise NotImplementedError(f'{mode} mode is not implemented') from exc
+
+
+def _torch_padding_to_mode_format(pads: List[int], mode: str) -> List[int]:
+    if mode in ('replicate', 'reflect'):
+        batch_channel_pads = pads[-4:]  # callers pass torch-ordered pads, so these belong to the first two dims
+        if set(batch_channel_pads) == {0}:
+            return pads[:-4]
+        # Some of those four trailing entries are non-zero, which these modes do not support.
+        raise RuntimeError(
+            f'{mode} padding is implemented for padding the last 3 dimensions of 5D input tensor, '
+            f'or the last 2 dimensions of 4D input tensor, or the last dimension of 3D input tensor.'
+        )
+
+    return pads
+
+
+def _onnx_padding_to_torch(pads: List[int]) -> List[int]:
+    # Convert padding from onnx format to torch format
+    # onnx format: [x1_begin, x2_begin, ... , x1_end, x2_end, ...]
+    # torch format [xn_begin, xn_end, ... , x2_begin, x2_end, x1_begin, x1_end]
+    middle = len(pads) // 2
+    onnx_pad_begin, onnx_pad_end = pads[:middle], pads[middle:]
+    onnx_pad_begin, onnx_pad_end = onnx_pad_begin[::-1], onnx_pad_end[::-1]
+    torch_pads = []
+    for begin, end in zip(onnx_pad_begin, onnx_pad_end):
+        torch_pads.extend([begin, end])
+
+    return torch_pads
+
+
+class OnnxPadStatic(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def __init__(
+        self,
+        pads: Union[Tuple[int, ...], List[int]],
+        mode: str = 'constant',
+        constant_value: float = 0.0,
+    ):
+        super().__init__()
+        self.mode = mode
+        self.pads = pads
+        self.constant_value = constant_value
+
+    @classmethod
+    def create_from_onnx_params(  # pylint: disable=missing-function-docstring
+        cls,
+        onnx_pads: Union[Tuple[int, ...], List[int]],
+        onnx_mode: str = 'constant',
+        constant_value: float = 0.0,
+    ) -> 'OnnxPadStatic':
+        torch_mode = _onnx_to_torch_mode(onnx_mode)
+        torch_padding = _onnx_padding_to_torch(onnx_pads)
+        torch_padding = _torch_padding_to_mode_format(torch_padding, torch_mode)
+        return cls(pads=torch_padding, mode=torch_mode, constant_value=constant_value)
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return F.pad(  # pylint: disable=not-callable
+            input_tensor,
+            mode=self.mode,
+            pad=self.pads,
+            value=self.constant_value,
+        )
+
+
+class OnnxPadDynamic(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def __init__(self, mode: str = 'constant'):
+        super().__init__()
+        self.mode = mode
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        pads: torch.Tensor,
+        constant_value: Optional[float] = 0.0,
+    ) -> torch.Tensor:
+        torch_pads = _onnx_padding_to_torch(pads.tolist())
+        torch_pads = _torch_padding_to_mode_format(torch_pads, self.mode)
+
+        return F.pad(input_tensor, mode=self.mode, pad=torch_pads, value=constant_value)  # pylint: disable=not-callable
+
+
+@add_converter(operation_type='Pad', version=11)
+@add_converter(operation_type='Pad', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    mode = node.attributes.get('mode', 'constant')
+    mode = _onnx_to_torch_mode(mode)
+
+    return OperationConverterResult(
+        torch_module=OnnxPadDynamic(mode=mode),
+        onnx_mapping=OnnxMapping(
+            inputs=node.input_values,
+            outputs=node.output_values,
+        ),
+    )
+
+
+@add_converter(operation_type='Pad', version=2)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    torch_module = OnnxPadStatic.create_from_onnx_params(
+        onnx_mode=node.attributes.get('mode', 'constant'),
+        onnx_pads=node.attributes.get('pads'),
+        constant_value=node.attributes.get('value', 0.0),  # the Pad-2 fill attribute is named `value`
+    )
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pow.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pow.py
new file mode 100644
index 000000000..8088cfe0e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/pow.py
@@ -0,0 +1,64 @@
+__all__ = [
+    'OnnxPow',
+    'OnnxSqrt',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import old_style_broadcast
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxPow(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def __init__(self, broadcast: Optional[int] = None, axis: Optional[int] = None):
+        super().__init__()
+        self.axis = axis
+        self.broadcast = broadcast
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        exponent: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.broadcast == 1 and self.axis is not None:
+            exponent = old_style_broadcast(input_tensor, exponent, self.axis)
+
+        return torch.pow(input_tensor, exponent)
+
+
+class OnnxSqrt(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return torch.sqrt(input_tensor)
+
+
+@add_converter(operation_type='Pow', version=1)
+@add_converter(operation_type='Pow', version=7)
+@add_converter(operation_type='Pow', version=12)
+@add_converter(operation_type='Pow', version=13)
+@add_converter(operation_type='Pow', version=15)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return OperationConverterResult(
+        torch_module=OnnxPow(
+            broadcast=node.attributes.get('broadcast', None),
+            axis=node.attributes.get('axis', None),
+        ),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
+
+
+@add_converter(operation_type='Sqrt', version=1)
+@add_converter(operation_type='Sqrt', version=6)
+@add_converter(operation_type='Sqrt', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return OperationConverterResult(
+        torch_module=OnnxSqrt(),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/random_normal_like.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/random_normal_like.py
new file mode 100644
index 000000000..ecd843008
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/random_normal_like.py
@@ -0,0 +1,59 @@
+__all__ = [
+    'OnnxRandomNormalLike',
+]
+
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+import onnx2torch.utils.dtype as dtype_utils
+
+
+class OnnxRandomNormalLike(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-docstring
+    def __init__(self, dtype: Optional[int], mean: Optional[float], scale: Optional[float], seed: Optional[int]):
+        super().__init__()
+        if dtype is not None:
+            dtype = dtype_utils.onnx_dtype_to_torch_dtype(dtype)
+
+        self.dtype = dtype
+        self.mean = mean
+        self.scale = scale
+        self.seed = seed
+
+    def forward(self, *input_tensors) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        if self.seed is not None:
+            raise RuntimeError("The argument `seed` is not supported now.")
+
+        dtype = input_tensors[0].dtype if self.dtype is None else self.dtype
+        return torch.normal(self.mean, self.scale, input_tensors[0].shape, dtype=dtype, device=input_tensors[0].device)
+
+
+@add_converter(operation_type='RandomNormalLike', version=1)
+@add_converter(operation_type='RandomNormalLike', version=4)
+@add_converter(operation_type='RandomNormalLike', version=11)
+@add_converter(operation_type='RandomNormalLike', version=13)
+@add_converter(operation_type='RandomNormalLike', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    dtype = node.attributes.get('dtype', None)
+    mean = node.attributes.get("mean", 0.0)
+    scale = node.attributes.get("scale", 1.0)
+    seed = node.attributes.get("seed", None)
+
+    torch_module = OnnxRandomNormalLike(
+        dtype=dtype,
+        mean=mean,
+        scale=scale,
+        seed=seed
+    )
+
+    return OperationConverterResult(
+        torch_module=torch_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/range.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/range.py
new file mode 100644
index 000000000..392e32e8b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/range.py
@@ -0,0 +1,66 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxRange',
+]
+
+from typing import Union
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxRange(nn.Module, OnnxToTorchModuleWithCustomExport):
+    def __init__(self):
+        super().__init__()
+        self.register_buffer('dummy_buffer', torch.Tensor(), persistent=False)
+
+    @staticmethod
+    def _get_scalar(value) -> Union[float, int]:
+        if isinstance(value, torch.Tensor):
+            return value.item()
+
+        return value
+
+    def _arange(
+        self,
+        start: Union[torch.Tensor, float, int],
+        limit: Union[torch.Tensor, float, int],
+        delta: Union[torch.Tensor, float, int],
+    ) -> torch.Tensor:
+        return torch.arange(
+            start=self._get_scalar(start),
+            end=self._get_scalar(limit),
+            step=self._get_scalar(delta),
+            device=self.dummy_buffer.device,
+        )
+
+    def forward(
+        self,
+        start: Union[torch.Tensor, float, int],
+        limit: Union[torch.Tensor, float, int],
+        delta: Union[torch.Tensor, float, int],
+    ) -> torch.Tensor:
+        def _forward() -> torch.Tensor:
+            return self._arange(start, limit, delta)
+
+        if torch.onnx.is_in_onnx_export():
+            return DefaultExportToOnnx.export(_forward, 'Range', start, limit, delta, {})
+
+        return _forward()
+
+
+@add_converter(operation_type='Range', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    del graph
+    return OperationConverterResult(
+        torch_module=OnnxRange(),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reciprocal.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reciprocal.py
new file mode 100644
index 000000000..3507bb6ec
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reciprocal.py
@@ -0,0 +1,28 @@
+__all__ = [
+    'OnnxReciprocal',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxReciprocal(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def forward(self, x):  # pylint: disable=missing-function-docstring
+        return torch.reciprocal(x)
+
+
+@add_converter(operation_type='Reciprocal', version=1)
+@add_converter(operation_type='Reciprocal', version=6)
+@add_converter(operation_type='Reciprocal', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return OperationConverterResult(
+        torch_module=OnnxReciprocal(),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reduce.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reduce.py
new file mode 100644
index 000000000..fb73c32e3
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reduce.py
@@ -0,0 +1,276 @@
+# pylint: disable=missing-class-docstring
+__all__ = [
+    'OnnxReduceSumDynamicAxes',
+    'OnnxReduceSumStaticAxes',
+    'OnnxReduceStaticAxes',
+]
+
+from functools import partial
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+from typing import cast
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_const_value
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+@torch.fx.wrap
+def _get_element(x: Any, index: int = 0) -> Any:
+    if isinstance(x, (tuple, list)):
+        return x[index]
+
+    return x
+
+
+def _initialize_none_dim(
+    dim: Optional[Union[int, Tuple[int, ...]]],
+    input_dim: int,
+) -> Union[List[int], Tuple[int, ...], int]:
+    if dim is None:
+        return list(range(input_dim))
+
+    return dim
+
+
+def _log_sum(
+    input_tensor: torch.Tensor,
+    dim: Optional[Union[int, Tuple[int, ...]]] = None,
+    keepdim: bool = False,
+) -> torch.Tensor:
+    dim_ = _initialize_none_dim(dim, input_tensor.dim())
+    return torch.log(torch.sum(input_tensor, dim=dim_, keepdim=keepdim))
+
+
+def _log_sum_exp(
+    input_tensor: torch.Tensor,
+    dim: Optional[Union[int, Tuple[int, ...]]] = None,
+    keepdim: bool = False,
+) -> torch.Tensor:
+    dim_ = _initialize_none_dim(dim, input_tensor.dim())
+    return torch.logsumexp(input_tensor, dim=dim_, keepdim=keepdim)
+
+
+def _sum_square(
+    input_tensor: torch.Tensor,
+    dim: Optional[Union[int, Tuple[int, ...]]] = None,
+    keepdim: bool = False,
+) -> torch.Tensor:
+    dim_ = _initialize_none_dim(dim, input_tensor.dim())
+    return torch.sum(torch.square(input_tensor), dim=dim_, keepdim=keepdim)
+
+
+_TORCH_FUNCTION_FROM_ONNX_TYPE = {
+    'ReduceL1': partial(torch.norm, p=1),
+    'ReduceL2': partial(torch.norm, p=2),
+    'ReduceLogSum': _log_sum,
+    'ReduceLogSumExp': _log_sum_exp,
+    'ReduceMax': torch.max,
+    'ReduceMean': torch.mean,
+    'ReduceMin': torch.min,
+    'ReduceProd': torch.prod,
+    'ReduceSum': torch.sum,
+    'ReduceSumSquare': _sum_square,
+}
+
+
+class OnnxReduceSumDynamicAxes(nn.Module, OnnxToTorchModuleWithCustomExport):
+    def __init__(self, keepdims: int = 1, noop_with_empty_axes: int = 0):
+        super().__init__()
+
+        self._keepdims = keepdims
+        self._noop_with_empty_axes = noop_with_empty_axes
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        del opset_version
+        return {
+            'noop_with_empty_axes_i': self._noop_with_empty_axes,
+            'keepdims_i': self._keepdims,
+        }
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        input_tensor: torch.Tensor,
+        axes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        def _forward() -> torch.Tensor:
+            if axes is None or axes.nelement() == 0:
+                if self._noop_with_empty_axes:
+                    return input_tensor
+
+                if not self._keepdims:
+                    return torch.sum(input_tensor)
+
+                fixed_axes = list(range(input_tensor.dim()))
+            else:
+                fixed_axes = torch.sort(axes).values.tolist()
+
+            return torch.sum(input_tensor, dim=fixed_axes, keepdim=bool(self._keepdims))
+
+        if torch.onnx.is_in_onnx_export():
+            args = [input_tensor]
+            if axes is not None:
+                args.append(axes)
+
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(_forward, 'ReduceSum', *args, onnx_attrs)
+
+        return _forward()
+
+
+class OnnxReduceSumStaticAxes(nn.Module, OnnxToTorchModule):
+    def __init__(
+        self,
+        axes: List[int],
+        keepdims: int = 1,
+        noop_with_empty_axes: int = 0,
+    ):
+        super().__init__()
+        if axes is not None:
+            axes = sorted(axes)
+
+        self._keepdims = keepdims
+        self._noop_with_empty_axes = noop_with_empty_axes
+        self._axes = axes
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        axes = self._axes  # local copy: never cache axes derived from one input's rank on the module
+        if axes is None or len(axes) == 0:
+            if self._noop_with_empty_axes:
+                return input_tensor
+
+            if not self._keepdims:
+                return torch.sum(input_tensor)  # full reduction to a scalar (this class has no math_op_function)
+            axes = list(range(input_tensor.dim()))
+
+        return torch.sum(input_tensor, dim=axes, keepdim=bool(self._keepdims))
+
+
+class OnnxReduceStaticAxes(nn.Module, OnnxToTorchModule):
+    def __init__(
+        self,
+        operation_type: str,
+        axes: Optional[List[int]],
+        keepdims: int = 1,
+    ):
+        super().__init__()
+        self.operation_type = operation_type
+        self.math_op_function = _TORCH_FUNCTION_FROM_ONNX_TYPE[operation_type]
+
+        if axes is not None:
+            axes = sorted(axes)
+
+        self.keepdims = keepdims == 1
+        self.axes = axes
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        axes = self.axes  # local copy: never cache axes derived from one input's rank on the module
+        if axes is None or len(axes) == 0:
+            if not self.keepdims:
+                return self.math_op_function(input_tensor)
+            axes = list(range(input_tensor.dim()))
+
+        if self.operation_type not in ['ReduceMax', 'ReduceMin', 'ReduceProd']:
+            return self.math_op_function(input_tensor, dim=axes, keepdim=self.keepdims)
+
+        if any(axis < 0 for axis in axes):
+            # The `axis - passed_dims` shift below is only valid for non-negative axes in ascending order.
+            axes = sorted(axis + input_tensor.dim() if axis < 0 else axis for axis in axes)
+
+        result = input_tensor
+        for passed_dims, axis in enumerate(axes):
+            dim = axis if self.keepdims else axis - passed_dims  # earlier reductions already removed passed_dims dims
+            result = _get_element(self.math_op_function(result, dim=dim, keepdim=self.keepdims), 0)
+
+        return result
+
+
+@add_converter(operation_type='ReduceL1', version=1)
+@add_converter(operation_type='ReduceL1', version=11)
+@add_converter(operation_type='ReduceL1', version=13)
+@add_converter(operation_type='ReduceL2', version=1)
+@add_converter(operation_type='ReduceL2', version=11)
+@add_converter(operation_type='ReduceL2', version=13)
+@add_converter(operation_type='ReduceLogSum', version=1)
+@add_converter(operation_type='ReduceLogSum', version=11)
+@add_converter(operation_type='ReduceLogSum', version=13)
+@add_converter(operation_type='ReduceLogSumExp', version=1)
+@add_converter(operation_type='ReduceLogSumExp', version=11)
+@add_converter(operation_type='ReduceLogSumExp', version=13)
+@add_converter(operation_type='ReduceMax', version=1)
+@add_converter(operation_type='ReduceMax', version=11)
+@add_converter(operation_type='ReduceMax', version=12)
+@add_converter(operation_type='ReduceMax', version=13)
+@add_converter(operation_type='ReduceMean', version=1)
+@add_converter(operation_type='ReduceMean', version=11)
+@add_converter(operation_type='ReduceMean', version=13)
+@add_converter(operation_type='ReduceMin', version=1)
+@add_converter(operation_type='ReduceMin', version=11)
+@add_converter(operation_type='ReduceMin', version=12)
+@add_converter(operation_type='ReduceMin', version=13)
+@add_converter(operation_type='ReduceProd', version=1)
+@add_converter(operation_type='ReduceProd', version=11)
+@add_converter(operation_type='ReduceProd', version=13)
+@add_converter(operation_type='ReduceSum', version=1)
+@add_converter(operation_type='ReduceSum', version=11)
+@add_converter(operation_type='ReduceSumSquare', version=1)
+@add_converter(operation_type='ReduceSumSquare', version=11)
+@add_converter(operation_type='ReduceSumSquare', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    del graph
+    node_attributes = node.attributes
+    axes: Optional[List[int]] = node_attributes.get('axes', None)
+    keepdims: int = node_attributes.get('keepdims', 1)
+
+    return OperationConverterResult(
+        torch_module=OnnxReduceStaticAxes(
+            operation_type=node.operation_type,
+            axes=axes,
+            keepdims=keepdims,
+        ),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
+
+
+@add_converter(operation_type='ReduceSum', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    keepdims: int = node.attributes.get('keepdims', 1)
+    noop_with_empty_axes: int = node.attributes.get('noop_with_empty_axes', 0)
+
+    if len(node.input_values) == 2:
+        try:
+            axes = cast(torch.Tensor, get_const_value(node.input_values[1], graph))
+            axes = axes.tolist()
+            return OperationConverterResult(
+                torch_module=OnnxReduceSumStaticAxes(
+                    axes=axes,
+                    keepdims=keepdims,
+                    noop_with_empty_axes=noop_with_empty_axes,
+                ),
+                onnx_mapping=OnnxMapping(
+                    inputs=(node.input_values[0],),
+                    outputs=node.output_values,
+                ),
+            )
+        except KeyError:
+            pass
+
+    return OperationConverterResult(
+        torch_module=OnnxReduceSumDynamicAxes(keepdims=keepdims, noop_with_empty_axes=noop_with_empty_axes),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/registry.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/registry.py
new file mode 100644
index 000000000..b4ddd1745
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/registry.py
@@ -0,0 +1,71 @@
+import logging
+from typing import Callable
+from typing import NamedTuple
+
+from onnx import defs
+
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+
+_LOGGER = logging.getLogger(__name__)
+_CONVERTER_REGISTRY = {}
+
+
+class OperationDescription(NamedTuple):  # pylint: disable=missing-class-docstring
+    domain: str  # operator-set domain; add_converter/get_converter default it to defs.ONNX_DOMAIN
+    operation_type: str  # operator name as passed to add_converter/get_converter
+    version: int  # opset version component of the registry key
+
+
+TConverter = Callable[[OnnxNode, OnnxGraph], OperationConverterResult]
+
+
+def add_converter(
+    operation_type: str,
+    version: int,
+    domain: str = defs.ONNX_DOMAIN,
+):
+    """Return a decorator that registers a converter under (domain, operation_type, version).
+
+    Registering the same triple twice raises ValueError; the decorator returns the converter unchanged.
+    """
+    key = OperationDescription(domain=domain, operation_type=operation_type, version=version)
+
+    def deco(converter: TConverter):
+        if key in _CONVERTER_REGISTRY:
+            raise ValueError(f'Operation "{key}" already registered')
+
+        _CONVERTER_REGISTRY[key] = converter
+        _LOGGER.debug(f'Operation converter registered {key}')
+
+        return converter
+
+    return deco
+
+
+def get_converter(
+    operation_type: str,
+    version: int,
+    domain: str = defs.ONNX_DOMAIN,
+) -> TConverter:
+    """Return the converter registered for an operation.
+
+    ``version`` is first replaced by the ``since_version`` of the schema that ``defs.get_schema`` finds
+    for ``max_inclusive_version=version``; if that lookup raises, ``version`` is used as given.
+
+    Raises:
+        NotImplementedError: nothing is registered for the resulting (domain, operation_type, version).
+    """
+    try:
+        schema = defs.get_schema(operation_type, domain=domain, max_inclusive_version=version)
+        version = schema.since_version
+    except (RuntimeError, defs.SchemaError):
+        pass
+
+    description = OperationDescription(domain=domain, operation_type=operation_type, version=version)
+    converter = _CONVERTER_REGISTRY.get(description)
+    if converter is not None:
+        return converter
+
+    raise NotImplementedError(f'Converter is not implemented ({description})')
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reshape.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reshape.py
new file mode 100644
index 000000000..e8ec3aa75
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/reshape.py
@@ -0,0 +1,49 @@
+__all__ = [
+    'OnnxReshape',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxReshape(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Reshape to a runtime ``shape`` tensor; a 0 entry keeps the input's size at that position."""
+
+    @staticmethod
+    def _do_reshape(input_tensor: torch.Tensor, shape: torch.Tensor) -> torch.Tensor:
+        if torch.any(shape == 0):
+            source_shape = input_tensor.shape
+            shape = [source_shape[idx] if dim == 0 else dim for idx, dim in enumerate(shape)]
+
+        return torch.reshape(input_tensor, torch.Size(shape))
+
+    def forward(self, input_tensor: torch.Tensor, shape: torch.Tensor) -> torch.Tensor:
+        """Reshape directly, or via ``DefaultExportToOnnx`` as a 'Reshape' op during ONNX export."""
+
+        def _forward() -> torch.Tensor:
+            return self._do_reshape(input_tensor, shape)
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+        return DefaultExportToOnnx.export(_forward, 'Reshape', input_tensor, shape, {})
+
+
+@add_converter(operation_type='Reshape', version=5)
+@add_converter(operation_type='Reshape', version=13)
+@add_converter(operation_type='Reshape', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Reshape (opsets 5/13/14); ``allowzero=1`` is rejected with NotImplementedError."""
+    allowzero = node.attributes.get('allowzero', 0)
+    if allowzero == 1:
+        raise NotImplementedError('"allowzero=1" is not implemented')
+
+    reshape_module = OnnxReshape()
+    return OperationConverterResult(torch_module=reshape_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/resize.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/resize.py
new file mode 100644
index 000000000..8a4ca9aca
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/resize.py
@@ -0,0 +1,171 @@
+__all__ = [
+    'OnnxResize',
+]
+
+import warnings
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_MODES_MAPPING = {
+    ('nearest', 1): 'nearest',
+    ('nearest', 2): 'nearest',
+    ('nearest', 3): 'nearest',
+    ('linear', 1): 'linear',
+    ('linear', 2): 'bilinear',
+    ('linear', 3): 'trilinear',
+    ('cubic', 2): 'bicubic',
+}
+
+
+def _get_torch_align_corners(mode: str, coordinate_transformation_mode: str) -> Optional[bool]:
+    # 'nearest' yields None; any other mode yields True only for the 'align_corners' transformation.
+    if mode != 'nearest':
+        return coordinate_transformation_mode == 'align_corners'
+    return None
+
+
+def _onnx_mode_to_torch_mode(onnx_mode: str, dim_size: int) -> str:
+    """Look up the torch interpolate mode for an ONNX mode and a number of spatial dimensions."""
+    lookup_key = (onnx_mode, dim_size)
+    if lookup_key not in _MODES_MAPPING:
+        raise NotImplementedError(f'{dim_size}D input is not implemented for "{onnx_mode}" mode.')
+    return _MODES_MAPPING[lookup_key]
+
+
+class OnnxResize(nn.Module, OnnxToTorchModule):
+    """Resize implemented with ``torch.nn.functional.interpolate``.
+
+    The first two entries of ``sizes``/``scales`` are validated and dropped; only the rest is resized.
+    """
+
+    def __init__(
+        self,
+        mode: str = 'nearest',
+        align_corners: Optional[bool] = None,
+        ignore_roi: bool = False,
+        ignore_bs_ch_size: bool = False,
+    ):
+        super().__init__()
+        self.onnx_mode = mode
+        self.align_corners = align_corners
+        self.ignore_roi = ignore_roi
+        self.ignore_bs_ch_size = ignore_bs_ch_size
+
+    def forward(
+        self,
+        input_tensor: torch.Tensor,
+        roi: Optional[torch.Tensor] = None,
+        scales: Optional[torch.Tensor] = None,
+        sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Interpolate ``input_tensor``; empty ``sizes``/``scales`` tensors are treated as absent."""
+        torch_mode = _onnx_mode_to_torch_mode(self.onnx_mode, input_tensor.dim() - 2)
+        if not (self.ignore_roi or roi is None or roi.nelement() == 0):
+            raise NotImplementedError('roi logic is not implemented.')
+
+        # ONNX lists sizes/scales for every axis ([n, c, d, h, w]); torch wants the spatial ones ([d, h, w]).
+        target_size = None
+        if sizes is not None and sizes.nelement() != 0:
+            full_size = sizes.tolist()
+            if not self.ignore_bs_ch_size and list(input_tensor.shape)[:2] != full_size[:2]:
+                raise NotImplementedError('Pytorch\'s interpolate cannot resize channel or batch dimensions.')
+            target_size = full_size[2:]
+
+        scale_factor = None
+        if scales is not None and scales.nelement() != 0:
+            full_scales = scales.tolist()
+            if full_scales[:2] != [1, 1]:
+                raise NotImplementedError('Pytorch\'s interpolate cannot scale channel or batch dimensions.')
+            scale_factor = full_scales[2:]
+
+        return torch.nn.functional.interpolate(
+            input_tensor,
+            size=target_size,
+            scale_factor=scale_factor,
+            mode=torch_mode,
+            align_corners=self.align_corners,
+        )
+
+
+class OnnxResizeV10(nn.Module, OnnxToTorchModule):
+    """Opset-10 Resize, whose inputs are the data and ``scales`` only; delegates to ``OnnxResize``."""
+
+    def __init__(self, mode: str = 'nearest'):
+        super().__init__()
+        self._resize = OnnxResize(mode=mode)
+
+    def forward(self, input_tensor: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
+        """Pass ``scales`` by keyword, since the second positional parameter of OnnxResize is ``roi``."""
+        resized = self._resize(input_tensor, scales=scales)
+        return resized
+
+
+@add_converter(operation_type='Resize', version=10)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Resize-10; ``mode`` (default 'nearest') is the only attribute read."""
+    attrs = node.attributes
+    resize_mode = attrs.get('mode', 'nearest')
+
+    return OperationConverterResult(
+        torch_module=OnnxResizeV10(mode=resize_mode),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
+
+
+@add_converter(operation_type='Resize', version=11)
+@add_converter(operation_type='Resize', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Resize-11/13 to OnnxResize, warning about attribute values whose results may differ in torch."""
+    node_attributes = node.attributes
+    coordinate_transformation_mode = node_attributes.get('coordinate_transformation_mode', 'half_pixel')
+    cubic_coeff_a = node_attributes.get('cubic_coeff_a', -0.75)
+    exclude_outside = node_attributes.get('exclude_outside', 0)
+    extrapolation_value = node_attributes.get('extrapolation_value', 0.0)
+    mode = node_attributes.get('mode', 'nearest')
+    nearest_mode = node_attributes.get('nearest_mode', 'round_prefer_floor')
+
+    if mode == 'nearest':
+        if nearest_mode != 'floor':
+            warnings.warn(
+                'Pytorch\'s nearest neighbor interpolate uses the "floor" nearest_mode. '
+                'For others modes, the results might differ significantly!'
+            )
+
+        if coordinate_transformation_mode != 'asymmetric':
+            warnings.warn(
+                'Pytorch\'s nearest neighbor interpolation uses "asymmetric" coordinate_transformation_mode. '
+                'For others modes, the results might differ significantly!'
+            )
+    elif coordinate_transformation_mode not in ['pytorch_half_pixel', 'half_pixel']:
+        warnings.warn(
+            'For linear and cubic interpolation in "asymmetric" and "align_corners" coordinate_transformation_mode '
+            'results might differ significantly!'
+        )
+
+    if cubic_coeff_a != -0.75:
+        warnings.warn('With a cubic coefficient value other than -0.75, the results might differ significantly!')
+
+    if exclude_outside != 0:
+        warnings.warn('With a exclude outside value other than 0, the results might differ significantly!')
+
+    if extrapolation_value != 0.0:
+        warnings.warn('With a extrapolation value other than 0.0, the results might differ significantly!')
+
+    # OnnxResize is told to ignore the roi input unless the transformation mode is 'tf_crop_and_resize'.
+    return OperationConverterResult(
+        torch_module=OnnxResize(
+            mode=mode,
+            align_corners=_get_torch_align_corners(mode, coordinate_transformation_mode),
+            ignore_roi=coordinate_transformation_mode != 'tf_crop_and_resize',
+        ),
+        onnx_mapping=onnx_mapping_from_node(node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roialign.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roialign.py
new file mode 100644
index 000000000..f03c35593
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roialign.py
@@ -0,0 +1,145 @@
+__all__ = [
+    'OnnxRoiAlign',
+]
+
+from enum import Enum
+from typing import Any
+from typing import Dict
+
+import torch
+from torch import nn
+from torchvision.ops import roi_align
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class CoordinateTransformationModeOnnxAttr(Enum):
+    """
+    Values of the `coordinate_transformation_mode` attribute, which is new in opset 16.
+
+    Allowed values are `half_pixel` and `output_half_pixel`.
+    Use the value `half_pixel` to shift the input coordinates by -0.5 pixel (the recommended behavior).
+    Use the value `output_half_pixel` to omit the pixel shift for the inputs
+    (use this for backward-compatible behavior).
+    """
+
+    HALF_PIXEL = 'half_pixel'  # the RoiAlign-16 converter's default
+    OUTPUT_HALF_PIXEL = 'output_half_pixel'  # the RoiAlign-10 converter's default
+
+
+CTMOnnxAttr = CoordinateTransformationModeOnnxAttr  # Type alias.
+
+
+class OnnxRoiAlign(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """RoiAlign backed by ``torchvision.ops.roi_align``; only the 'avg' pooling mode is supported."""
+
+    def __init__(
+        self,
+        coordinate_transformation_mode: CTMOnnxAttr = CTMOnnxAttr.HALF_PIXEL,
+        mode: str = 'avg',
+        output_height: int = 1,
+        output_width: int = 1,
+        sampling_ratio: int = 0,
+        spatial_scale: float = 1.0,
+    ):
+        super().__init__()
+        if mode != 'avg':
+            raise NotImplementedError(f'"{mode}" roi align mode is not implemented.')
+
+        self._coordinate_transformation_mode = coordinate_transformation_mode
+        self._mode = mode
+        self._output_height = output_height
+        self._output_width = output_width
+        self._sampling_ratio = sampling_ratio
+        self._spatial_scale = spatial_scale
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        """Build the attribute dict (keys carry _s/_i/_f type suffixes) handed to the ONNX export."""
+        ctm = self._coordinate_transformation_mode
+        if opset_version < 16 and ctm != CTMOnnxAttr.OUTPUT_HALF_PIXEL:
+            raise ValueError(
+                'RoiAlign from opset 10 does not support coordinate_transform_mode != "output_half_pixel"'
+                f', got {self._coordinate_transformation_mode.value}'
+            )
+
+        onnx_attrs: Dict[str, Any] = {
+            'mode_s': self._mode,
+            'output_height_i': self._output_height,
+            'output_width_i': self._output_width,
+            'sampling_ratio_i': self._sampling_ratio,
+            'spatial_scale_f': self._spatial_scale,
+        }
+
+        # The transformation-mode attribute is only written for opset 16 and newer.
+        if opset_version >= 16:
+            onnx_attrs['coordinate_transformation_mode_s'] = ctm.value
+        return onnx_attrs
+
+    def forward(
+        self,
+        input_tensor: torch.Tensor,
+        rois: torch.Tensor,
+        batch_indices: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run ``roi_align`` directly, or via ``DefaultExportToOnnx`` as 'RoiAlign' during ONNX export."""
+
+        def _forward():
+            # The batch indices are cast to the rois' dtype and prepended as the first column of the boxes.
+            index_column = batch_indices.unsqueeze(1).to(rois.dtype)
+            return roi_align(
+                input=input_tensor,
+                boxes=torch.cat([index_column, rois], dim=1),
+                output_size=(self._output_height, self._output_width),
+                spatial_scale=self._spatial_scale,
+                sampling_ratio=self._sampling_ratio,
+                aligned=self._coordinate_transformation_mode == CTMOnnxAttr.HALF_PIXEL,
+            )
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        onnx_attrs = self._onnx_attrs(get_onnx_version())
+        return DefaultExportToOnnx.export(_forward, 'RoiAlign', input_tensor, rois, batch_indices, onnx_attrs)
+
+
+def converter_schema(  # pylint: disable=unused-argument
+    node: OnnxNode,
+    graph: OnnxGraph,
+    default_ctm: str,
+) -> OperationConverterResult:
+    """Build the OnnxRoiAlign conversion shared by the registered RoiAlign converters.
+
+    ``default_ctm`` is the ``coordinate_transformation_mode`` used when the node does not set that
+    attribute; the other attributes fall back to the defaults written below.
+
+    Raises:
+        ValueError: the coordinate_transformation_mode value is not a ``CTMOnnxAttr`` member.
+        NotImplementedError: ``mode`` is anything other than 'avg' (raised by OnnxRoiAlign).
+    """
+    attrs = node.attributes
+    roi_align_module = OnnxRoiAlign(
+        coordinate_transformation_mode=CTMOnnxAttr(attrs.get('coordinate_transformation_mode', default_ctm)),
+        mode=attrs.get('mode', 'avg'),
+        output_height=attrs.get('output_height', 1),
+        output_width=attrs.get('output_width', 1),
+        sampling_ratio=attrs.get('sampling_ratio', 0),
+        spatial_scale=attrs.get('spatial_scale', 1.0),
+    )
+    return OperationConverterResult(torch_module=roi_align_module, onnx_mapping=onnx_mapping_from_node(node))
+
+
+@add_converter(operation_type='RoiAlign', version=10)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return converter_schema(node=node, graph=graph, default_ctm='output_half_pixel')  # default: no pixel shift
+
+
+@add_converter(operation_type='RoiAlign', version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    return converter_schema(node=node, graph=graph, default_ctm='half_pixel')  # default: shift inputs by -0.5
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roundings.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roundings.py
new file mode 100644
index 000000000..046179571
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/roundings.py
@@ -0,0 +1,40 @@
+__all__ = [
+    'OnnxRound',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+_TORCH_ROUND_FROM_ONNX_TYPE = {
+    'Ceil': torch.ceil,
+    'Floor': torch.floor,
+    'Round': torch.round,
+}
+
+
+class OnnxRound(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def __init__(self, round_type: str):
+        super().__init__()
+        self.round_function = _TORCH_ROUND_FROM_ONNX_TYPE[round_type]  # KeyError if round_type is unknown
+
+    def forward(self, input_tensor: torch.Tensor):  # pylint: disable=missing-function-docstring
+        return self.round_function(input_tensor)
+
+
+@add_converter(operation_type='Ceil', version=13)
+@add_converter(operation_type='Ceil', version=6)
+@add_converter(operation_type='Floor', version=13)
+@add_converter(operation_type='Floor', version=6)
+@add_converter(operation_type='Round', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Ceil/Floor/Round; the node's own ``operation_type`` selects the torch function."""
+    round_module = OnnxRound(node.operation_type)
+    mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=round_module, onnx_mapping=mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/scatter_nd.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/scatter_nd.py
new file mode 100644
index 000000000..620de9cb3
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/scatter_nd.py
@@ -0,0 +1,97 @@
+__all__ = [
+    'OnnxScatterND',
+]
+
+from enum import Enum
+from typing import Any
+from typing import Dict
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class ReductionOnnxAttr(Enum):
+    """
+    Values of the `reduction` attribute, which is new in opset 16.
+
+    Type of reduction to apply: none (default), add, mul.
+    - `none`: no reduction applied.
+    - `add`: reduction using the addition operation.
+    - `mul`: reduction using the multiplication operation.
+    """
+
+    NONE = 'none'  # the only value OnnxScatterND accepts
+    ADD = 'add'  # rejected by OnnxScatterND with NotImplementedError
+    MUL = 'mul'  # rejected by OnnxScatterND with NotImplementedError
+
+
+class OnnxScatterND(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """ScatterND without reduction: writes ``updates`` into a copy of ``data`` at ``indices``."""
+
+    def __init__(self, reduction: ReductionOnnxAttr):
+        super().__init__()
+        if reduction != ReductionOnnxAttr.NONE:
+            raise NotImplementedError(f'ScatterND with reduction attribute "{reduction.value}" is not implemented')
+        self._reduction = reduction
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        """Return the export attributes; ``reduction_s`` is only emitted for opset >= 16."""
+        onnx_attrs: Dict[str, Any] = {}
+        if opset_version < 16:
+            if self._reduction != ReductionOnnxAttr.NONE:
+                raise ValueError(
+                    'ScatterND from opset < 16 does not support '
+                    f'reduction attribute != {ReductionOnnxAttr.NONE.value}, '
+                    f'got {self._reduction.value}'
+                )
+            return onnx_attrs
+
+        onnx_attrs['reduction_s'] = self._reduction.value
+        return onnx_attrs
+
+    def forward(  # pylint: disable=missing-function-docstring
+        self,
+        data: torch.Tensor,
+        indices: torch.Tensor,
+        updates: torch.Tensor,
+    ) -> torch.Tensor:
+        def _forward():
+            # There is no scatter nd for torch, use following formula:
+            # https://github.com/onnx/onnx/blob/master/docs/Operators.md#ScatterND
+            output = data.clone()
+
+            ind_dim = indices.dim()
+            # last dimension is a partial-index into data
+            output_indices = indices.reshape((-1, indices.shape[-1])).T.tolist()
+            # update.shape = indices.shape[0:ind_dim-1] ++ data.shape[indices.shape[-1]:data.dim()-1]
+            output_updates = updates.reshape((-1, *updates.shape[ind_dim - 1 :]))
+            output[output_indices] = output_updates
+
+            return output
+
+        if torch.onnx.is_in_onnx_export():
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(_forward, 'ScatterND', data, indices, updates, onnx_attrs)
+
+        return _forward()
+
+
+@add_converter(operation_type='ScatterND', version=11)
+@add_converter(operation_type='ScatterND', version=13)
+@add_converter(operation_type='ScatterND', version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert ScatterND (opsets 11/13/16); a missing ``reduction`` attribute means 'none'."""
+    reduction_attr = ReductionOnnxAttr(node.attributes.get('reduction', 'none'))
+    return OperationConverterResult(
+        torch_module=OnnxScatterND(reduction=reduction_attr),
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/shape.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/shape.py
new file mode 100644
index 000000000..1b306d068
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/shape.py
@@ -0,0 +1,66 @@
+__all__ = [
+    'OnnxShape',
+]
+
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxShape(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Shape op: returns ``input_tensor.shape[start:end]`` as a tensor on the input's device."""
+
+    def __init__(self, start: int = 0, end: Optional[int] = None):
+        super().__init__()
+        self._start = start
+        self._end = end
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:
+        """Return the export attributes; raises ValueError if start/end are used with opset < 15."""
+        if opset_version < 15:
+            if self._start != 0:
+                raise ValueError(f'Shape from opset < 15 does not support start != 0, got {self._start}')
+            if self._end is not None:
+                raise ValueError(f'Shape from opset < 15 does not support end != None, got {self._end}')
+            return {}
+
+        onnx_attrs: Dict[str, Any] = {'start_i': self._start}
+        # Compare against None rather than truthiness: end=0 is a legal value and must be exported too.
+        if self._end is not None:
+            onnx_attrs['end_i'] = self._end
+        return onnx_attrs
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        def _forward():
+            return torch.tensor(input_tensor.shape[self._start : self._end], device=input_tensor.device)
+
+        if torch.onnx.is_in_onnx_export():
+            onnx_attrs = self._onnx_attrs(opset_version=get_onnx_version())
+            return DefaultExportToOnnx.export(_forward, 'Shape', input_tensor, onnx_attrs)
+
+        return _forward()
+
+
+@add_converter(operation_type='Shape', version=1)
+@add_converter(operation_type='Shape', version=13)
+@add_converter(operation_type='Shape', version=15)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Shape (opsets 1/13/15); missing ``start``/``end`` attributes default to 0/None."""
+    attrs = node.attributes
+    shape_module = OnnxShape(start=attrs.get('start', 0), end=attrs.get('end', None))
+    return OperationConverterResult(
+        torch_module=shape_module,
+        onnx_mapping=onnx_mapping_from_node(node=node),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/slice.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/slice.py
new file mode 100644
index 000000000..21cc2a0ba
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/slice.py
@@ -0,0 +1,125 @@
+__all__ = [
+    'OnnxSlice',
+]
+
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+def _get_slices(
+    starts: Union[torch.Tensor, np.ndarray],
+    ends: Union[torch.Tensor, np.ndarray],
+    axes: Optional[Union[torch.Tensor, np.ndarray]],
+    steps: Optional[Union[torch.Tensor, np.ndarray]],
+) -> Tuple[List, List, List]:
+    """Translate ONNX Slice parameters into ``(flip_dims, pos_axes_slices, neg_axes_slices)`` for ``_do_slice``.
+
+    Negative steps become an axis flip plus a positive-step slice. Non-tensor ``axes``/``steps`` are used as given.
+    """
+    if axes is None:
+        axes = list(range(len(starts)))
+    elif isinstance(axes, torch.Tensor):
+        axes = axes.detach().cpu().numpy()
+    if steps is None:
+        steps = [1] * len(starts)
+    elif isinstance(steps, torch.Tensor):
+        steps = steps.detach().cpu().numpy()
+
+    slices = {}
+    flip_dims = []
+    for start, end, axis, step in zip(starts, ends, axes, steps):
+        if step < 0:
+            flip_dims.append(axis)
+            start, end, step = -start - 1, -end - 1, -step
+        slices[axis] = slice(start, end, step)
+
+    pos_axes_slices = [slices.get(a, slice(None, None)) for a in range(max(axes) + 1)]
+    neg_axes_slices = [slices.get(a, slice(None, None)) for a in range(min(axes), 0)]
+    if neg_axes_slices:
+        neg_axes_slices = [Ellipsis] + neg_axes_slices
+    return flip_dims, pos_axes_slices, neg_axes_slices
+
+
+def _do_slice(x: torch.Tensor, flip_dims: List, pos_axes_slices: List, neg_axes_slices: List):
+    """Apply the triple produced by ``_get_slices``: flip first, then index with each non-empty slice list."""
+
+    result = torch.flip(x, dims=flip_dims) if flip_dims else x
+
+    # The positive-axis list is applied before the negative-axis list, which starts with Ellipsis.
+    for index in (pos_axes_slices, neg_axes_slices):
+        if index:
+            result = result[index]
+
+    return result
+
+
+class OnnxSliceV9(nn.Module, OnnxToTorchModule):  # pylint: disable=missing-class-docstring
+    def __init__(self, starts: np.ndarray, ends: np.ndarray, axes: Optional[np.ndarray] = None):
+        super().__init__()
+        self._flip_dims, self._pos_axes_slices, self._neg_axes_slices = _get_slices(starts, ends, axes, None)
+    # The slices are fixed at construction, so forward only applies the attributes stored by __init__.
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return _do_slice(input_tensor, self._flip_dims, self._pos_axes_slices, self._neg_axes_slices)
+
+
+class OnnxSlice(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Slice for opsets 10/11/13, where starts/ends/axes/steps arrive as runtime tensors."""
+
+    def forward(
+        self,
+        input_tensor: torch.Tensor,
+        starts: torch.Tensor,
+        ends: torch.Tensor,
+        axes: Optional[torch.Tensor] = None,
+        steps: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Slice ``input_tensor``; optional inputs that are None are left out of the export arguments."""
+
+        def _forward():
+            return _do_slice(input_tensor, *_get_slices(starts, ends, axes, steps))
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+
+        # NOTE(review): with axes=None and steps given, steps becomes the 4th positional input - confirm
+        # that the exported Slice node does not then read it as axes.
+        optional_inputs = [tensor for tensor in (axes, steps) if tensor is not None]
+        return DefaultExportToOnnx.export(_forward, 'Slice', input_tensor, starts, ends, *optional_inputs, {})
+
+
+@add_converter(operation_type='Slice', version=9)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Slice-9, reading ``starts``/``ends`` (required) and ``axes`` (optional) from the attributes."""
+    attrs = node.attributes
+    slice_module = OnnxSliceV9(
+        starts=attrs['starts'],
+        ends=attrs['ends'],
+        axes=attrs.get('axes', None),
+    )
+
+    return OperationConverterResult(torch_module=slice_module, onnx_mapping=onnx_mapping_from_node(node))
+
+
+@add_converter(operation_type='Slice', version=10)
+@add_converter(operation_type='Slice', version=11)
+@add_converter(operation_type='Slice', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert Slice (opsets 10/11/13); no attributes are read, the module takes everything as inputs."""
+    slice_module = OnnxSlice()
+    mapping = onnx_mapping_from_node(node)
+    return OperationConverterResult(torch_module=slice_module, onnx_mapping=mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/split.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/split.py
new file mode 100644
index 000000000..749371f76
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/split.py
@@ -0,0 +1,78 @@
+__all__ = [
+    'OnnxSplit',
+    'OnnxSplit13',
+]
+
+from typing import List
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxSplit13(nn.Module, OnnxToTorchModule):
+    """Split module whose optional section sizes arrive as a runtime tensor input."""
+
+    def __init__(self, num_splits: int, axis: int = 0):
+        super().__init__()
+        self.num_splits = num_splits
+        self.axis = axis
+
+    def forward(self, input_tensor: torch.Tensor, split: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Cut ``input_tensor`` along ``self.axis`` and return the result of ``torch.split``.
+
+        With an explicit ``split`` tensor its values become the section sizes;
+        otherwise a single chunk size ``axis_length // num_splits`` is used.
+        """
+        if split is not None:
+            sections = split.tolist()
+        else:
+            sections = input_tensor.shape[self.axis] // self.num_splits
+        return torch.split(input_tensor, sections, dim=self.axis)
+
+
+class OnnxSplit(nn.Module, OnnxToTorchModule):
+    """Split module whose optional section sizes are fixed at construction time."""
+
+    def __init__(self, num_splits: int, axis: int = 0, split: Optional[List[int]] = None):
+        super().__init__()
+        self.num_splits = num_splits
+        self.axis = axis
+        self.split = split
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Split along ``self.axis`` by ``self.split``, or by ``axis_length // num_splits`` when it is None."""
+        if self.split is not None:
+            sections = self.split
+        else:
+            sections = input_tensor.shape[self.axis] // self.num_splits
+        return torch.split(input_tensor, sections, dim=self.axis)
+
+
+@add_converter(operation_type='Split', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an opset-13 ``Split`` node; the node's output count is passed on as ``num_splits``."""
+    split_module = OnnxSplit13(
+        num_splits=len(node.output_values),
+        axis=node.attributes.get('axis', 0),
+    )
+    return OperationConverterResult(torch_module=split_module, onnx_mapping=onnx_mapping_from_node(node=node))
+
+
+@add_converter(operation_type='Split', version=11)
+@add_converter(operation_type='Split', version=2)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Split`` node of opset 2 or 11, reading ``axis`` and ``split`` from its attributes."""
+    split_module = OnnxSplit(
+        num_splits=len(node.output_values),
+        axis=node.attributes.get('axis', 0),
+        split=node.attributes.get('split'),
+    )
+    return OperationConverterResult(torch_module=split_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/squeeze.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/squeeze.py
new file mode 100644
index 000000000..824b29130
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/squeeze.py
@@ -0,0 +1,100 @@
+__all__ = [
+    'OnnxSqueezeStaticAxes',
+    'OnnxSqueezeDynamicAxes',
+]
+
+from typing import List
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_onnx_version
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxSqueezeStaticAxes(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Squeeze module with axes fixed at conversion time; negative axes count from the input's last dim."""
+
+    def __init__(self, axes: Optional[List[int]] = None):
+        super().__init__()
+        if axes is not None:
+            axes = sorted(axes, reverse=True)
+        self.axes = axes
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Drop the configured axes, or every size-1 dimension when no axes are configured."""
+        def _forward():
+            if not self.axes:
+                return torch.squeeze(input_tensor)
+            rank = input_tensor.dim()
+            result = input_tensor
+            # Resolve negatives first, then go back to front so pending indices never shift.
+            for axes_id in sorted((a + rank if a < 0 else a for a in self.axes), reverse=True):
+                result = torch.squeeze(result, dim=axes_id)
+            return result
+
+        if torch.onnx.is_in_onnx_export() and get_onnx_version() >= 13:
+            args = [input_tensor]
+            if self.axes:
+                args.append(torch.tensor(self.axes, device=input_tensor.device, dtype=torch.int64))
+            return DefaultExportToOnnx.export(_forward, 'Squeeze', *args, {})
+
+        return _forward()
+
+
+class OnnxSqueezeDynamicAxes(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Squeeze module that receives its axes as an optional runtime tensor input."""
+
+    @staticmethod
+    def is_empty_axes(axes: torch.Tensor) -> bool:
+        """Return True when ``axes`` is None or holds no elements."""
+        return axes is None or axes.nelement() == 0
+
+    def forward(self, input_tensor: torch.Tensor, axes: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Remove the listed axes, or every size-1 dimension when ``axes`` is empty.
+
+        Negative axes are shifted by the input rank before sorting, so lists that mix
+        positive and negative indices are squeezed in a consistent back-to-front order.
+        """
+
+        def _forward():
+            if self.is_empty_axes(axes):
+                return torch.squeeze(input_tensor)
+            normalized = torch.where(axes < 0, axes + input_tensor.dim(), axes)
+            result = input_tensor
+            for axes_id in torch.sort(normalized, descending=True).values:
+                result = torch.squeeze(result, dim=axes_id)
+            return result
+
+        if torch.onnx.is_in_onnx_export():
+            args = [input_tensor]
+            if not self.is_empty_axes(axes):
+                args.append(axes)
+            return DefaultExportToOnnx.export(_forward, 'Squeeze', *args, {})
+
+        return _forward()
+
+
+@add_converter(operation_type='Squeeze', version=1)
+@add_converter(operation_type='Squeeze', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Squeeze`` node of opset 1 or 11, taking the optional ``axes`` from its attributes."""
+    # A missing ``axes`` attribute is passed on as None.
+    squeeze_module = OnnxSqueezeStaticAxes(axes=node.attributes.get('axes'))
+    io_mapping = onnx_mapping_from_node(node)
+    return OperationConverterResult(torch_module=squeeze_module, onnx_mapping=io_mapping)
+
+
+@add_converter(operation_type='Squeeze', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an opset-13 ``Squeeze`` node into an ``OnnxSqueezeDynamicAxes`` module."""
+    # No node attributes are read; only the input/output mapping is built.
+    io_mapping = onnx_mapping_from_node(node)
+    return OperationConverterResult(torch_module=OnnxSqueezeDynamicAxes(), onnx_mapping=io_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/sum.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/sum.py
new file mode 100644
index 000000000..f7ba93b8c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/sum.py
@@ -0,0 +1,35 @@
+__all__ = [
+    'OnnxSum',
+]
+
+import torch
+
+from onnx2torch.node_converters.base_element_wise import OnnxBaseElementWise
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxSum(OnnxBaseElementWise):
+    """Element-wise ``Sum`` over any number of input tensors."""
+
+    def __init__(self):
+        super().__init__(op_type='Sum')
+
+    def apply_reduction(self, *tensors: torch.Tensor) -> torch.Tensor:
+        """Accumulate all ``tensors`` in place into a zero tensor of the broadcast shape."""
+        total = torch.zeros(self._broadcast_shape(*tensors), dtype=tensors[0].dtype, device=tensors[0].device)
+        for term in tensors:
+            total.add_(term)
+        return total
+
+
+@add_converter(operation_type='Sum', version=8)
+@add_converter(operation_type='Sum', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Sum`` node of opset 8 or 13 into an ``OnnxSum`` module."""
+    # No node attributes are read; only the input/output mapping is built.
+    io_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=OnnxSum(), onnx_mapping=io_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/tile.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/tile.py
new file mode 100644
index 000000000..0508a8774
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/tile.py
@@ -0,0 +1,36 @@
+# pylint: disable=missing-docstring
+__all__ = [
+    'OnnxTile',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxTile(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """``Tile`` module: repeats ``input_tensor`` according to the ``repeats`` tensor."""
+    def forward(self, input_tensor: torch.Tensor, repeats: torch.Tensor) -> torch.Tensor:
+        def _forward() -> torch.Tensor:
+            return input_tensor.repeat(torch.Size(repeats))
+
+        if not torch.onnx.is_in_onnx_export():
+            return _forward()
+        return DefaultExportToOnnx.export(_forward, 'Tile', input_tensor, repeats, {})
+
+
+@add_converter(operation_type='Tile', version=6)
+@add_converter(operation_type='Tile', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:
+    """Convert a ``Tile`` node of opset 6 or 13 into an ``OnnxTile`` module."""
+    del graph  # the graph is not needed for this conversion
+    # No node attributes are read; only the input/output mapping is built.
+    io_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=OnnxTile(), onnx_mapping=io_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/topk.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/topk.py
new file mode 100644
index 000000000..6ef9fa912
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/topk.py
@@ -0,0 +1,55 @@
+__all__ = [
+    'OnnxTopK',
+]
+
+from typing import Tuple
+from typing import Union
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxTopK(nn.Module, OnnxToTorchModule):
+    """``TopK`` module; ``k`` comes from the second input or, for opset 1, from the node attribute."""
+
+    def __init__(self, dim: int = -1, largest: int = 1, sorted_: int = 1, k: Union[int, None] = None):
+        super().__init__()
+        self.dim = dim
+        self.largest = largest == 1
+        self.sorted = sorted_ == 1
+        self.k = k  # static fallback used when forward() receives no ``k``
+
+    def forward(  # pylint: disable=invalid-name
+        self,
+        input_tensor: torch.Tensor,
+        k: Union[torch.Tensor, int, None] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return the ``(values, indices)`` of the top ``k`` entries along ``self.dim``."""
+        if k is None:
+            k = self.k
+        k = k[0] if isinstance(k, torch.Tensor) else k
+        top_k = torch.topk(input_tensor, k=k, dim=self.dim, largest=self.largest, sorted=self.sorted)
+        return top_k.values, top_k.indices
+
+
+@add_converter(operation_type='TopK', version=1)
+@add_converter(operation_type='TopK', version=10)
+@add_converter(operation_type='TopK', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``TopK`` node; opset 1 stores ``k`` as an attribute, later opsets pass it as an input."""
+    node_attributes = node.attributes
+    torch_module = OnnxTopK(
+        dim=node_attributes.get('axis', -1),
+        largest=node_attributes.get('largest', 1),
+        sorted_=node_attributes.get('sorted', 1),
+        # ``k`` only exists as an attribute on opset-1 nodes; otherwise this stays None.
+        k=node_attributes.get('k', None),
+    )
+    return OperationConverterResult(torch_module=torch_module, onnx_mapping=onnx_mapping_from_node(node=node))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/transpose.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/transpose.py
new file mode 100644
index 000000000..2276bb3b9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/transpose.py
@@ -0,0 +1,50 @@
+__all__ = [
+    'OnnxTranspose',
+]
+
+from typing import List
+from typing import Optional
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+
+
+class OnnxTranspose(nn.Module, OnnxToTorchModule):
+    """``Transpose`` module; without an explicit ``perm`` the dimension order is reversed."""
+    def __init__(self, perm: Optional[List[int]] = None):
+        super().__init__()
+        self.perm = perm
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Permute the dimensions; the default order is derived per call and never stored on ``self``."""
+        perm = self.perm if self.perm is not None else list(range(input_tensor.dim()))[::-1]
+        return input_tensor.permute(perm)
+
+
+@add_converter(operation_type='Transpose', version=1)
+@add_converter(operation_type='Transpose', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Transpose`` node of opset 1 or 13 into an ``OnnxTranspose`` module.
+
+    The permutation is taken from a graph initializer when the node lists a second
+    input, and from the ``perm`` attribute otherwise. Only the first node input is
+    mapped onto the module.
+    """
+    data_input = node.input_values[0]
+    if len(node.input_values) > 1:
+        # Second input present: it must name an initializer (a KeyError is raised otherwise).
+        perm = graph.initializers[node.input_values[1]].to_torch().tolist()
+    else:
+        perm = node.attributes.get('perm', None)
+        if perm is not None:
+            # Round-trip through an int64 tensor to obtain a plain list of Python ints.
+            perm = torch.tensor(perm, dtype=torch.long).tolist()
+    mapping = OnnxMapping(inputs=(data_input,), outputs=node.output_values)
+    return OperationConverterResult(torch_module=OnnxTranspose(perm=perm), onnx_mapping=mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/trilu.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/trilu.py
new file mode 100644
index 000000000..f40c70f89
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/trilu.py
@@ -0,0 +1,46 @@
+__all__ = [
+    'OnnxTrilu',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxTrilu(nn.Module, OnnxToTorchModule):
+    """``Trilu`` module: keeps the upper or the lower triangle of its first input."""
+
+    def __init__(self, upper: int):
+        super().__init__()
+        self.upper = upper
+
+    def forward(self, *input_tensors) -> torch.Tensor:
+        """Apply ``torch.triu`` (truthy ``upper``) or ``torch.tril``; a second input sets the diagonal."""
+        # The diagonal offset defaults to 0 when only the data tensor is supplied.
+        # NOTE(review): a supplied diagonal is forwarded exactly as received - confirm torch accepts it.
+        diagonal = input_tensors[1] if len(input_tensors) > 1 else 0
+        triangle_fn = torch.triu if self.upper else torch.tril
+        return triangle_fn(input_tensors[0], diagonal)
+
+
+@add_converter(operation_type='Trilu', version=4)
+@add_converter(operation_type='Trilu', version=11)
+@add_converter(operation_type='Trilu', version=13)
+@add_converter(operation_type='Trilu', version=14)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Trilu`` node, selecting the triangle from its ``upper`` attribute.
+
+    ``k`` is an optional *input* of the ONNX operator (it reaches the module as its
+    second argument), whereas the triangle is chosen by the ``upper`` attribute.
+    """
+    # The ONNX default for ``upper`` is 1, i.e. keep the upper triangle.
+    upper = node.attributes.get('upper', 1)
+    torch_module = OnnxTrilu(upper=upper)
+    io_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=torch_module, onnx_mapping=io_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/unsqueeze.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/unsqueeze.py
new file mode 100644
index 000000000..b4d8a6b4e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/unsqueeze.py
@@ -0,0 +1,86 @@
+__all__ = [
+    'OnnxUnsqueezeStaticAxes',
+    'OnnxUnsqueezeDynamicAxes',
+]
+
+from typing import List
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxMapping
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import get_const_value
+from onnx2torch.utils.common import onnx_mapping_from_node
+from onnx2torch.utils.custom_export_to_onnx import DefaultExportToOnnx
+from onnx2torch.utils.custom_export_to_onnx import OnnxToTorchModuleWithCustomExport
+
+
+class OnnxUnsqueezeStaticAxes(nn.Module, OnnxToTorchModule):
+    """Unsqueeze module with axes fixed at conversion time; negative axes index the expanded output."""
+    def __init__(self, axes: List[int]):
+        super().__init__()
+        self._axes = sorted(axes)
+
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        out_rank = input_tensor.dim() + len(self._axes)  # negative axes are relative to this rank
+        for axes_id in sorted(a + out_rank if a < 0 else a for a in self._axes):
+            input_tensor = torch.unsqueeze(input_tensor, dim=axes_id)
+        return input_tensor
+
+
+class OnnxUnsqueezeDynamicAxes(nn.Module, OnnxToTorchModuleWithCustomExport):
+    """Unsqueeze module that receives its axes as a runtime tensor input."""
+
+    def forward(self, input_tensor: torch.Tensor, axes: torch.Tensor) -> torch.Tensor:
+        """Insert a size-1 dimension at every position listed in ``axes``.
+
+        Negative entries are first resolved against the output rank (input rank plus number of axes).
+        """
+        def _forward():
+            out_rank = input_tensor.dim() + axes.numel()
+            normalized = torch.where(axes < 0, axes + out_rank, axes)
+            result = input_tensor
+            for axes_id in torch.sort(normalized).values:
+                result = torch.unsqueeze(result, dim=axes_id)
+            return result
+
+        if torch.onnx.is_in_onnx_export():
+            return DefaultExportToOnnx.export(_forward, 'Unsqueeze', input_tensor, axes, {})
+
+        return _forward()
+
+
+@add_converter(operation_type='Unsqueeze', version=1)
+@add_converter(operation_type='Unsqueeze', version=11)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an ``Unsqueeze`` node of opset 1 or 11, taking ``axes`` from its attributes."""
+    # ``axes`` is looked up unconditionally, so a node without it raises KeyError here.
+    unsqueeze_module = OnnxUnsqueezeStaticAxes(axes=node.attributes['axes'])
+    io_mapping = onnx_mapping_from_node(node)
+    return OperationConverterResult(torch_module=unsqueeze_module, onnx_mapping=io_mapping)
+
+
+@add_converter(operation_type='Unsqueeze', version=13)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert an opset-13 ``Unsqueeze`` node.
+
+    Constant ``axes`` select the static module (only the data input is mapped); if
+    a KeyError is raised while resolving them, the dynamic module receives both node inputs.
+    """
+    try:
+        const_axes = get_const_value(node.input_values[1], graph).tolist()
+        static_mapping = OnnxMapping(inputs=(node.input_values[0],), outputs=node.output_values)
+        return OperationConverterResult(
+            torch_module=OnnxUnsqueezeStaticAxes(axes=const_axes),
+            onnx_mapping=static_mapping,
+        )
+    except KeyError:
+        # Axes are only known at run time: fall through to the dynamic variant.
+        pass
+    dynamic_mapping = onnx_mapping_from_node(node)
+    return OperationConverterResult(torch_module=OnnxUnsqueezeDynamicAxes(), onnx_mapping=dynamic_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/where.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/where.py
new file mode 100644
index 000000000..e3fd40a6a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/node_converters/where.py
@@ -0,0 +1,32 @@
+__all__ = [
+    'OnnxWhere',
+]
+
+import torch
+from torch import nn
+
+from onnx2torch.node_converters.registry import add_converter
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.utils.common import OnnxToTorchModule
+from onnx2torch.utils.common import OperationConverterResult
+from onnx2torch.utils.common import onnx_mapping_from_node
+
+
+class OnnxWhere(nn.Module, OnnxToTorchModule):
+    """``Where`` module: element-wise selection between two tensors."""
+
+    def forward(self, condition: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """Return ``torch.where(condition, x, y)``."""
+        # Elements come from ``x`` where ``condition`` holds and from ``y`` elsewhere.
+        selected = torch.where(condition, x, y)
+        return selected
+
+
+@add_converter(operation_type='Where', version=9)
+@add_converter(operation_type='Where', version=16)
+def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult:  # pylint: disable=unused-argument
+    """Convert a ``Where`` node of opset 9 or 16 into an ``OnnxWhere`` module."""
+    # No node attributes are read; only the input/output mapping is built.
+    io_mapping = onnx_mapping_from_node(node=node)
+    return OperationConverterResult(torch_module=OnnxWhere(), onnx_mapping=io_mapping)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_graph.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_graph.py
new file mode 100644
index 000000000..91a15a054
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_graph.py
@@ -0,0 +1,117 @@
+from collections import OrderedDict
+from enum import Enum
+from types import MappingProxyType
+from typing import Mapping
+from typing import Tuple
+
+from onnx.onnx_ml_pb2 import GraphProto
+from onnx.onnx_ml_pb2 import NodeProto
+from onnx.onnx_ml_pb2 import ValueInfoProto
+
+from onnx2torch.onnx_node import OnnxNode
+from onnx2torch.onnx_tensor import OnnxTensor
+
+
+class ValueType(Enum):  # pylint: disable=missing-class-docstring
+    GRAPH_INPUT = 0  # name is listed among the graph's inputs
+    NODE_OUTPUT = 1  # name is produced as an output of one of the graph's nodes
+    GRAPH_INITIALIZER = 2  # name belongs to a graph initializer
+    UNKNOWN = 3  # name matches none of the other categories
+    EMPTY = 4  # name is the empty string
+
+
+class OnnxGraph:
+    """Wrapper around an ONNX ``GraphProto`` that indexes its nodes, initializers and values by name."""
+
+    def __init__(self, onnx_graph_proto: GraphProto):
+        self._proto = onnx_graph_proto
+        self._input_values = tuple(value_info.name for value_info in self._proto.input)
+        self._output_values = tuple(value_info.name for value_info in self._proto.output)
+        # Repeated base names get ``_1``, ``_2``, ... appended; the first occurrence stays bare.
+        seen = {}
+        self._nodes = OrderedDict()
+        for node_proto in onnx_graph_proto.node:
+            base_name = OnnxGraph.generate_node_name(node_proto)
+            occurrence = seen.get(base_name, 0)
+            seen[base_name] = occurrence + 1
+            unique_name = base_name + (f'_{occurrence}' if occurrence > 0 else '')
+            self._nodes[unique_name] = OnnxNode(node_proto, unique_name=unique_name)
+        self._initializers = {tensor.name: OnnxTensor(tensor) for tensor in onnx_graph_proto.initializer}
+        # Every node output name maps to ``(producing node, output index)``.
+        self._node_output_values = {}
+        for graph_node in self._nodes.values():
+            for index, output_name in enumerate(graph_node.output_values):
+                self._node_output_values[output_name] = (graph_node, index)
+        # Graph inputs and outputs are applied last and replace same-named ``value_info`` entries.
+        self._value_info = {info.name: info for info in onnx_graph_proto.value_info}
+        self._value_info.update((info.name, info) for info in onnx_graph_proto.input)
+        self._value_info.update((info.name, info) for info in onnx_graph_proto.output)
+
+    @property
+    def proto(self) -> GraphProto:
+        """The wrapped ``GraphProto``."""
+        return self._proto
+
+    @property
+    def value_info(self) -> Mapping[str, ValueInfoProto]:
+        """``ValueInfoProto`` per value name, including the graph's inputs and outputs."""
+        return self._value_info
+
+    @property
+    def name(self) -> str:
+        """Name stored in the graph proto."""
+        return self._proto.name
+
+    @property
+    def input_values(self) -> Tuple[str, ...]:
+        """Names of the graph inputs, in proto order."""
+        return self._input_values
+
+    @property
+    def output_values(self) -> Tuple[str, ...]:
+        """Names of the graph outputs, in proto order."""
+        return self._output_values
+
+    @property
+    def nodes(self) -> Mapping[str, OnnxNode]:
+        """Nodes keyed by their unique name, in proto order."""
+        return self._nodes
+
+    @property
+    def initializers(self) -> Mapping[str, OnnxTensor]:
+        """Read-only mapping from initializer name to ``OnnxTensor``."""
+        return MappingProxyType(self._initializers)
+
+    def value_type(self, value_name: str) -> ValueType:
+        """Classify ``value_name``; graph inputs win over node outputs, then initializers."""
+        if value_name in self._input_values:
+            return ValueType.GRAPH_INPUT
+        if value_name in self._node_output_values:
+            return ValueType.NODE_OUTPUT
+        if value_name in self._initializers:
+            return ValueType.GRAPH_INITIALIZER
+        return ValueType.EMPTY if value_name == '' else ValueType.UNKNOWN
+
+    def value_as_node_output(self, value_name: str) -> Tuple[OnnxNode, int]:
+        """Return ``(node, output index)`` for a node-produced value; KeyError if there is none."""
+        return self._node_output_values[value_name]
+
+    @staticmethod
+    def generate_node_name(node: NodeProto) -> str:
+        """Generate a torch module name for the given ONNX node.
+
+        Uses the ONNX node's name, falling back to its op_type when the name is empty, and
+        prepends the node's domain. Dots (.) are not allowed within names in torch, so they
+        are replaced with a slash (/); leading slashes are stripped from the result.
+
+        Parameters
+        ----------
+        node
+            The ONNX node to create a name from.
+
+        Returns
+        -------
+        A torch-compatible module name based on the given node's properties.
+        """
+        local_name = node.name.replace('.', '/') or node.op_type
+        return (f'{node.domain}/' + local_name).lstrip('/')
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_node.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_node.py
new file mode 100644
index 000000000..becaec68a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_node.py
@@ -0,0 +1,77 @@
+from types import MappingProxyType
+from typing import Any
+from typing import Mapping
+from typing import Tuple
+
+from onnx.onnx_ml_pb2 import AttributeProto
+from onnx.onnx_ml_pb2 import NodeProto
+
+from onnx2torch.onnx_tensor import OnnxTensor
+
+
+class OnnxNode:
+    """Wrapper around an ONNX ``NodeProto`` that exposes its attributes as plain Python values."""
+
+    def __init__(self, onnx_node_proto: NodeProto, unique_name: str):
+        self._proto = onnx_node_proto
+        self._unique_name = unique_name
+        self._input_values = tuple(onnx_node_proto.input)
+        self._output_values = tuple(onnx_node_proto.output)
+        self._inputs = None
+        self._proto_attributes = {}
+        for attribute in self._proto.attribute:
+            self._proto_attributes[attribute.name] = OnnxNode._parse_attribute_value(attribute)
+
+    @staticmethod
+    def _parse_attribute_value(attribute: AttributeProto) -> Any:
+        """Unwrap ``attribute``: a set scalar field wins, then a non-empty list, else the proto itself."""
+        if attribute.HasField('i'):
+            return attribute.i
+        if attribute.HasField('f'):
+            return attribute.f
+        if attribute.HasField('s'):
+            return str(attribute.s, 'utf-8')
+        if attribute.HasField('t'):
+            return OnnxTensor(attribute.t)
+        # Repeated fields are tested by truthiness, so an empty list falls through to the end.
+        if attribute.ints:
+            return list(attribute.ints)
+        if attribute.floats:
+            return list(attribute.floats)
+        if attribute.strings:
+            return [str(raw, 'utf-8') for raw in attribute.strings]
+        if attribute.tensors:
+            return [OnnxTensor(tensor) for tensor in attribute.tensors]
+        return attribute
+
+    @property
+    def proto(self) -> NodeProto:  # pylint: disable=missing-function-docstring
+        return self._proto
+
+    @property
+    def name(self) -> str:  # pylint: disable=missing-function-docstring
+        return self._proto.name
+
+    @property
+    def unique_name(self) -> str:  # pylint: disable=missing-function-docstring
+        return self._unique_name
+
+    @property
+    def domain(self) -> str:  # pylint: disable=missing-function-docstring
+        return self._proto.domain
+
+    @property
+    def operation_type(self) -> str:  # pylint: disable=missing-function-docstring
+        return self._proto.op_type
+
+    @property
+    def input_values(self) -> Tuple[str, ...]:  # pylint: disable=missing-function-docstring
+        return self._input_values
+
+    @property
+    def output_values(self) -> Tuple[str, ...]:  # pylint: disable=missing-function-docstring
+        return self._output_values
+
+    @property
+    def attributes(self) -> Mapping[str, Any]:  # pylint: disable=missing-function-docstring
+        return MappingProxyType(self._proto_attributes)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_tensor.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_tensor.py
new file mode 100644
index 000000000..e29ea53d6
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/onnx_tensor.py
@@ -0,0 +1,33 @@
+import numpy as np
+import torch
+from onnx import numpy_helper
+from onnx.onnx_ml_pb2 import TensorProto
+
+
+class OnnxTensor:  # pylint: disable=missing-class-docstring
+    def __init__(self, onnx_tensor_proto: TensorProto):
+        self._proto = onnx_tensor_proto
+
+    @classmethod
+    def from_numpy(cls, array: np.ndarray, name: str = None):
+        """Wrap ``array`` by converting it with ``numpy_helper.from_array``."""
+        return cls(numpy_helper.from_array(array, name=name))
+
+    @classmethod
+    def from_torch(cls, tensor: torch.Tensor, name: str = None):
+        """Wrap ``tensor`` via its detached CPU NumPy array (see ``from_numpy``)."""
+        return cls.from_numpy(tensor.detach().cpu().numpy(), name=name)
+
+    @property
+    def proto(self) -> TensorProto:  # pylint: disable=missing-function-docstring
+        return self._proto
+
+    @property
+    def name(self) -> str:  # pylint: disable=missing-function-docstring
+        return self._proto.name
+
+    def to_numpy(self) -> np.ndarray:  # pylint: disable=missing-function-docstring
+        return numpy_helper.to_array(self._proto).copy()  # a fresh copy of the converted array
+
+    def to_torch(self) -> torch.Tensor:  # pylint: disable=missing-function-docstring
+        return torch.from_numpy(self.to_numpy())
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/common.py
new file mode 100644
index 000000000..517f7c197
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/common.py
@@ -0,0 +1,85 @@
+from typing import List
+from typing import NamedTuple
+from typing import Tuple
+from typing import Union
+
+import torch
+from onnx import ValueInfoProto  # pylint: disable=no-name-in-module
+from torch import nn
+from torch.onnx import symbolic_helper
+
+from onnx2torch.onnx_graph import OnnxGraph
+from onnx2torch.onnx_node import OnnxNode
+
+
+class OnnxToTorchModule:
+    """
+    Marker base class for onnx2torch modules.
+
+    It declares no attributes or methods of its own.
+    """
+
+
+class OnnxMapping(NamedTuple):  # pylint: disable=missing-class-docstring
+    inputs: Tuple[str, ...]  # onnx_mapping_from_node fills this from OnnxNode.input_values
+    outputs: Tuple[str, ...]  # onnx_mapping_from_node fills this from OnnxNode.output_values
+
+
+class OperationConverterResult(NamedTuple):  # pylint: disable=missing-class-docstring
+    torch_module: nn.Module  # torch module produced by a converter (presumably for one ONNX node; confirm)
+    onnx_mapping: OnnxMapping  # input/output value names that accompany torch_module
+
+
+def onnx_mapping_from_node(node: OnnxNode) -> OnnxMapping:
+    """Build the ``OnnxMapping`` of a node from its input and output values."""
+    input_values = node.input_values
+    output_values = node.output_values
+    return OnnxMapping(inputs=input_values, outputs=output_values)
+
+
+def get_onnx_version() -> int:
+    """Return the ONNX opset version torch's exporter is currently configured with."""
+    if not hasattr(symbolic_helper, 'GLOBALS'):
+        # Fallback for torch builds whose symbolic_helper exposes no GLOBALS object.
+        return symbolic_helper._export_onnx_opset_version  # pylint: disable=no-member, protected-access
+    return symbolic_helper.GLOBALS.export_onnx_opset_version
+
+
+def get_shape_from_value_info(value_info: ValueInfoProto) -> List[int]:  # pylint: disable=missing-function-docstring
+    return [axis.dim_value for axis in value_info.type.tensor_type.shape.dim]  # one ``dim_value`` per listed dim
+
+
+def get_const_value(
+    name: str,
+    graph: OnnxGraph,
+) -> Union[torch.Tensor, float, int, str, List]:
+    """
+    Look up the constant called ``name``: first among the graph initializers, then as a ``Constant`` node output.
+
+    Raises ``KeyError`` if ``name`` is neither an initializer nor produced by a ``Constant`` node.
+    """
+    initializers = graph.initializers
+    if name in initializers:
+        return initializers[name].to_torch()
+    try:
+        producer, _ = graph.value_as_node_output(name)
+    except KeyError as exc:
+        raise KeyError(f'Tensor "{name}" is not found in constant values') from exc
+    if producer.operation_type != 'Constant':
+        raise KeyError(f'Tensor "{name}" is not found in constant values')
+    attr_name, attr_value = next(iter(producer.attributes.items()))
+    return attr_value.to_torch() if attr_name == 'value' else attr_value
+
+
+def old_style_broadcast(
+    first: torch.Tensor,
+    second: torch.Tensor,
+    axis: int,
+) -> torch.Tensor:
+    """View ``second`` with size-1 dims added so its dims start at ``axis`` of ``first`` and ranks match."""
+    target_rank = first.dim()
+    if axis < 0:
+        axis += target_rank
+    leading = [1] * axis
+    trailing = [1] * (target_rank - len(leading) - second.dim())
+    return second.view(leading + list(second.shape) + trailing)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/custom_export_to_onnx.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/custom_export_to_onnx.py
new file mode 100644
index 000000000..78e802974
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/custom_export_to_onnx.py
@@ -0,0 +1,100 @@
+__all__ = [  # names exported by a star-import of this module
+    'CustomExportToOnnx',
+    'DefaultExportToOnnx',
+    'OnnxToTorchModuleWithCustomExport',
+]
+
+from typing import Any
+from typing import Callable
+from typing import Dict
+from typing import Optional
+
+import torch
+from torch import _C as torch_C
+
+from onnx2torch.utils.common import OnnxToTorchModule
+
+
+class OnnxToTorchModuleWithCustomExport(OnnxToTorchModule):
+    """
+    Marker class for onnx2torch modules that customize their export to ONNX.
+    """
+
+    def _onnx_attrs(self, opset_version: int) -> Dict[str, Any]:  # pylint: disable=unused-argument
+        """
+        Describe the ONNX attributes of the exported operation (none in this base implementation).
+
+        Parameters
+        ----------
+        opset_version : int
+            ONNX opset version.
+            Attribute names and values may depend on it; return the set matching this version.
+
+        Returns
+        -------
+        Dict[str, Any]
+            ONNX attribute names mapped to their values.
+
+        """
+        no_attrs: Dict[str, Any] = {}
+        return no_attrs
+
+
+class CustomExportToOnnx(torch.autograd.Function):
+    """Autograd function that runs a caller-supplied forward and lets subclasses define the ONNX symbolic."""
+
+    _NEXT_FORWARD_FUNCTION: Optional[Callable] = None
+
+    @classmethod
+    def export(cls, forward_function: Callable, *args) -> Any:
+        """
+        Register ``forward_function`` as the next forward and apply this function to ``args``.
+
+        Old name: `set_forward_and_apply`.
+        """
+        CustomExportToOnnx._NEXT_FORWARD_FUNCTION = forward_function
+        return cls.apply(*args)
+
+    @staticmethod
+    def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:  # pylint: disable=unused-argument
+        """Call the registered forward function (with no arguments) and clear the registration."""
+        pending = CustomExportToOnnx._NEXT_FORWARD_FUNCTION
+        if pending is None:
+            raise RuntimeError('Forward function is not set')
+        try:
+            return pending()
+        finally:
+            CustomExportToOnnx._NEXT_FORWARD_FUNCTION = None
+
+    @staticmethod
+    def backward(ctx: Any, *grad_outputs: Any) -> Any:  # pylint: disable=unused-argument
+        """Always fails: this function is only meant to run while converting to ONNX."""
+        raise RuntimeError('Backward called while converting to ONNX')
+
+    @staticmethod
+    def symbolic(graph: torch_C.Graph, *args) -> torch_C.Value:  # pylint: disable=unused-argument
+        """Export hook: subclasses return the ONNX operation built from ``graph``; not implemented here."""
+        raise NotImplementedError
+
+
+class DefaultExportToOnnx(CustomExportToOnnx):  # pylint: disable=abstract-method
+    """
+    CustomExportToOnnx with a default ``symbolic`` that emits one single-output ONNX operation.
+
+    Convention for the arguments passed after the forward function (DO NOT REORDER!):
+        - op_type
+        - operation inputs
+        - operation attributes (a mapping, always last)
+
+    Note: the number of operation outputs can be added later.
+
+    This class should be used in most cases:
+    >>> return DefaultExportToOnnx.export(_forward, op_type, *inputs, onnx_attrs)
+    """
+
+    @staticmethod
+    def symbolic(graph: torch_C.Graph, *args) -> torch_C.Value:
+        """Emit ``op_type`` with the given inputs and attributes as a node with one output."""
+        op_type, *operands = args
+        *inputs, onnx_attrs = operands
+        return graph.op(op_type, *inputs, **onnx_attrs, outputs=1)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/dtype.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/dtype.py
new file mode 100644
index 000000000..5c3c6e269
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/dtype.py
@@ -0,0 +1,87 @@
+from typing import Any
+from typing import Dict
+from typing import Type
+from typing import Union
+
+import numpy as np
+import torch
+
+
+_ONNX_TO_TORCH_DTYPES: Dict[int, Union[torch.dtype, Type[str], Type[bool]]] = {  # onnx-ml.proto DataType codes
+    1: torch.float32,
+    2: torch.uint8,
+    3: torch.int8,
+    # 4: UINT16 is not supported: https://github.com/pytorch/pytorch/issues/58734.
+    5: torch.int16,
+    6: torch.int32,
+    7: torch.int64,
+    8: str,
+    9: bool,
+    10: torch.float16,
+    11: torch.float64,
+    # 12: UINT32 is not supported: https://github.com/pytorch/pytorch/issues/58734.
+    # 13: UINT64 is not supported: https://github.com/pytorch/pytorch/issues/58734.
+    14: torch.complex64,
+    15: torch.complex128,
+    16: torch.bfloat16,
+}
+
+
+def onnx_dtype_to_torch_dtype(dtype: int) -> Union[torch.dtype, Type[str], Type[bool]]:
+    """
+    Convert ONNX dtype to PyTorch dtype; codes without a table entry raise ``ValueError``.
+
+    Parameters
+    ----------
+    dtype : int
+        ONNX data type code (see https://github.com/onnx/onnx/blob/main/onnx/onnx-ml.proto#L485).
+
+    Returns
+    -------
+    Union[torch.dtype, Type[str], Type[bool]]
+        Corresponding PyTorch dtype (the builtin ``str`` / ``bool`` types for codes 8 and 9).
+    """
+    try:
+        return _ONNX_TO_TORCH_DTYPES[dtype]  # table is built once at import, not on every call
+    except KeyError as exc:
+        raise ValueError(f'dtype={dtype} is not supported') from exc
+
+
+_ONNX_TO_NUMPY_DTYPES: Dict[int, Any] = {  # https://numpy.org/doc/stable/reference/arrays.dtypes.html
+    1: np.float32,
+    2: np.uint8,
+    3: np.int8,
+    4: np.uint16,
+    5: np.int16,
+    6: np.int32,
+    7: np.int64,
+    8: str,
+    9: bool,
+    10: np.float16,
+    11: np.float64,
+    12: np.uint32,
+    13: np.uint64,
+    14: np.complex64,
+    15: np.complex128,
+    # 16: bfloat16 is not supported.
+}
+
+
+def onnx_dtype_to_numpy_dtype(dtype: int) -> Union[np.dtype, Type[str], Type[bool]]:
+    """
+    Convert ONNX dtype to Numpy dtype; codes without a table entry raise ``ValueError``.
+
+    Parameters
+    ----------
+    dtype : int
+        ONNX data type code.
+
+    Returns
+    -------
+    Union[np.dtype, Type[str], Type[bool]]
+        Corresponding Numpy scalar type such as ``np.float32`` (builtin ``str`` / ``bool`` for codes 8 and 9).
+    """
+    try:
+        return _ONNX_TO_NUMPY_DTYPES[dtype]  # table is built once at import, not on every call
+    except KeyError as exc:
+        raise ValueError(f'dtype={dtype} is not supported') from exc
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/indices.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/indices.py
new file mode 100644
index 000000000..9a58f8dda
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/indices.py
@@ -0,0 +1,30 @@
+import torch
+
+__all__ = ['upcast_indices']
+
+# Dtypes accepted by ``upcast_indices``.
+_INT_DTYPES = (
+    torch.int8,
+    torch.int16,
+    torch.int32,
+    torch.int64,
+)
+
+
+def upcast_indices(indices: torch.Tensor) -> torch.Tensor:
+    """
+    Upcast an indices tensor to torch.int64 (long); dtypes outside ``_INT_DTYPES`` raise ``ValueError``.
+
+    Parameters
+    ----------
+    indices : torch.Tensor
+        Indices for upcasting to torch.int64.
+
+    Returns
+    -------
+    torch.Tensor
+        The indices converted to torch.int64.
+    """
+    if indices.dtype not in _INT_DTYPES:
+        raise ValueError(f'Expected types of indices: {_INT_DTYPES}, got {indices.dtype} instead')
+    return indices.to(dtype=torch.int64)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/padding.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/padding.py
new file mode 100644
index 000000000..e95ea82c6
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/padding.py
@@ -0,0 +1,35 @@
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from torch import nn
+
+from onnx2torch.node_converters.pad import OnnxPadStatic
+
+
+def is_symmetric_onnx_padding(padding: Tuple[int, ...]) -> bool:
+    """Tell whether the first half of ``padding`` equals its second half."""
+    return padding[: len(padding) // 2] == padding[len(padding) // 2 :]
+
+
+def onnx_auto_pad_to_torch_padding(
+    auto_pad: str,
+    onnx_padding: Tuple[int, ...],
+) -> Tuple[Union[int, Tuple[int, ...]], Optional[nn.Module]]:
+    """
+    Translate ONNX ``auto_pad`` / ``pads`` into a ``(padding, pad_module)`` pair.
+
+    ``pad_module`` comes from ``OnnxPadStatic.create_from_onnx_params`` for asymmetric pads, otherwise it is None.
+    Raises ``NotImplementedError`` for SAME_UPPER / SAME_LOWER and ``ValueError`` for unknown ``auto_pad``.
+    """
+    if auto_pad in ('SAME_UPPER', 'SAME_LOWER'):
+        raise NotImplementedError(f'"{auto_pad}" auto_pad is not implemented')
+    if auto_pad not in ('NOTSET', 'VALID'):
+        raise ValueError(f'Got unexpected auto_pad value "{auto_pad}"')
+    if auto_pad == 'VALID' or onnx_padding is None:
+        return 0, None
+    if not is_symmetric_onnx_padding(onnx_padding):
+        # Asymmetric pads: hand them to an explicit OnnxPadStatic module and report zero padding.
+        return 0, OnnxPadStatic.create_from_onnx_params(onnx_pads=onnx_padding)
+    # Symmetric pads: the first half alone is returned as the padding.
+    return onnx_padding[: len(onnx_padding) // 2], None
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/safe_shape_inference.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/safe_shape_inference.py
new file mode 100644
index 000000000..7b93ddc4a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/onnx2torch/utils/safe_shape_inference.py
@@ -0,0 +1,46 @@
+import tempfile
+from pathlib import Path
+from typing import Union
+
+import onnx
+from onnx.onnx_ml_pb2 import ModelProto
+from onnx.shape_inference import infer_shapes
+from onnx.shape_inference import infer_shapes_path
+
+
+def _is_big_model(model: ModelProto) -> bool:
+    return model.ByteSize() > 2.0 * 1024 * 1024 * 1024  # True only when ByteSize() exceeds 2 GiB
+
+
+def _shape_inference_by_model_path(
+    model_path: Union[Path, str],
+    output_path: Union[Path, str],
+    **kwargs,
+) -> ModelProto:
+    """Run ``infer_shapes_path`` from ``model_path`` into ``output_path`` and load the written model."""
+    resolved_input = str(Path(model_path).resolve())
+    resolved_output = str(Path(output_path).resolve())
+    infer_shapes_path(resolved_input, output_path=resolved_output, **kwargs)
+    return onnx.load(resolved_output)
+
+
+def safe_shape_inference(
+    onnx_model_or_path: Union[ModelProto, Path, str],
+    **kwargs,
+) -> ModelProto:
+    """
+    Run ONNX shape inference on an in-memory model or a model file and return the inferred model.
+
+    Big in-memory models (see ``_is_big_model``) and file inputs are processed through files on disk.
+    """
+    if not isinstance(onnx_model_or_path, ModelProto):
+        # NOTE(review): the temp file is reopened by path while still open; confirm this works on the target OS.
+        with tempfile.NamedTemporaryFile(dir=Path(onnx_model_or_path).parent) as tmp_model_file:
+            return _shape_inference_by_model_path(onnx_model_or_path, output_path=tmp_model_file.name, **kwargs)
+    if not _is_big_model(onnx_model_or_path):
+        return infer_shapes(onnx_model_or_path, **kwargs)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_model_path = Path(tmp_dir, 'model.onnx')
+        save_options = {'save_as_external_data': True, 'all_tensors_to_one_file': True}
+        onnx.save_model(proto=onnx_model_or_path, f=str(tmp_model_path), **save_options)
+        return _shape_inference_by_model_path(tmp_model_path, output_path=tmp_model_path, **kwargs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/operators.md b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/operators.md
new file mode 100644
index 000000000..949a1c6eb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/operators.md
@@ -0,0 +1,178 @@
+# Currently supported operations
+
+Minimum tested opset version: 9; maximum tested opset version: 16; recommended opset version: 13.
+
+| Operation type            | Supported | Restrictions                                                                                                                                                  |
+| ------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Abs                       | Y         |                                                                                                                                                               |
+| Acos                      | Y         |                                                                                                                                                               |
+| Acosh                     | N         |                                                                                                                                                               |
+| Add                       | Y         |                                                                                                                                                               |
+| And                       | Y         |                                                                                                                                                               |
+| ArgMax                    | N         |                                                                                                                                                               |
+| ArgMin                    | N         |                                                                                                                                                               |
+| Asin                      | Y         |                                                                                                                                                               |
+| Asinh                     | N         |                                                                                                                                                               |
+| Atan                      | Y         |                                                                                                                                                               |
+| Atanh                     | N         |                                                                                                                                                               |
+| AveragePool               | Y         | Average pool operation with spatial rank > 3 is not implemented                                                                                               |
+| BatchNormalization        | Y         | BatchNorm operation with spatial rank > 3 is not implemented. BatchNorm nodes in training mode are not supported                                              |
+| BitShift                  | N         |                                                                                                                                                               |
+| Cast                      | Y         |                                                                                                                                                               |
+| Ceil                      | Y         |                                                                                                                                                               |
+| Clip                      | Y         | Dynamic value of min/max is not implemented                                                                                                                   |
+| Compress                  | N         |                                                                                                                                                               |
+| Concat                    | Y         |                                                                                                                                                               |
+| ConcatFromSequence        | N         |                                                                                                                                                               |
+| Constant                  | Y         |                                                                                                                                                               |
+| ConstantOfShape           | Y         | Parameter "value" must be scalar                                                                                                                              |
+| Conv                      | Y         | Convolution operation with spatial rank > 3 is not implemented                                                                                                |
+| ConvInteger               | N         |                                                                                                                                                               |
+| ConvTranspose             | Y         | Convolution operation with spatial rank > 3 is not implemented                                                                                                |
+| Cos                       | Y         |                                                                                                                                                               |
+| Cosh                      | N         |                                                                                                                                                               |
+| CumSum                    | Y         |                                                                                                                                                               |
+| DepthToSpace              | Y         | DCR mode is not implemented                                                                                                                                   |
+| DequantizeLinear          | N         |                                                                                                                                                               |
+| Det                       | N         |                                                                                                                                                               |
+| Div                       | Y         |                                                                                                                                                               |
+| Dropout                   | Y         |                                                                                                                                                               |
+| Einsum                    | Y         |                                                                                                                                                               |
+| Elu                       | Y         |                                                                                                                                                               |
+| Equal                     | Y         |                                                                                                                                                               |
+| Erf                       | Y         |                                                                                                                                                               |
+| Exp                       | Y         |                                                                                                                                                               |
+| Expand                    | Y         |                                                                                                                                                               |
+| EyeLike                   | Y         |                                                                                                                                                               |
+| Flatten                   | Y         |                                                                                                                                                               |
+| Floor                     | Y         |                                                                                                                                                               |
+| GRU                       | N         |                                                                                                                                                               |
+| Gather                    | Y         |                                                                                                                                                               |
+| GatherElements            | Y         |                                                                                                                                                               |
+| GatherND                  | Y         | GatherND operation with parameter "batch_dims" > 0 is not implemented                                                                                         |
+| Gemm                      | Y         |                                                                                                                                                               |
+| GlobalAveragePool         | Y         |                                                                                                                                                               |
+| GlobalLpPool              | N         |                                                                                                                                                               |
+| GlobalMaxPool             | N         |                                                                                                                                                               |
+| Greater                   | Y         |                                                                                                                                                               |
+| GridSample                | N         |                                                                                                                                                               |
+| HardSigmoid               | Y         |                                                                                                                                                               |
+| Hardmax                   | N         |                                                                                                                                                               |
+| Identity                  | Y         |                                                                                                                                                               |
+| If                        | N         |                                                                                                                                                               |
+| InstanceNormalization     | Y         |                                                                                                                                                               |
+| IsInf                     | Y         |                                                                                                                                                               |
+| IsNaN                     | Y         |                                                                                                                                                               |
+| LayerNormalization        | Y         | LayerNormalization outputs "Mean" and "InvStdDev" are not implemented                                                                                         |
+| LRN                       | Y         |                                                                                                                                                               |
+| LSTM                      | N         |                                                                                                                                                               |
+| LeakyRelu                 | Y         |                                                                                                                                                               |
+| Less                      | Y         |                                                                                                                                                               |
+| Log                       | Y         |                                                                                                                                                               |
+| Loop                      | N         |                                                                                                                                                               |
+| LpNormalization           | N         |                                                                                                                                                               |
+| LpPool                    | N         |                                                                                                                                                               |
+| MatMul                    | Y         |                                                                                                                                                               |
+| MatMulInteger             | N         |                                                                                                                                                               |
+| Max                       | Y         |                                                                                                                                                               |
+| MaxPool                   | Y         | Max pool operation with spatial rank > 3 is not implemented                                                                                                   |
+| MaxRoiPool                | N         |                                                                                                                                                               |
+| MaxUnpool                 | N         |                                                                                                                                                               |
+| Mean                      | Y         |                                                                                                                                                               |
+| Min                       | Y         |                                                                                                                                                               |
+| Mod                       | Y         |                                                                                                                                                               |
+| Mul                       | Y         |                                                                                                                                                               |
+| Multinomial               | N         |                                                                                                                                                               |
+| Neg                       | Y         |                                                                                                                                                               |
+| NonMaxSuppression         | Y         |                                                                                                                                                               |
+| NonZero                   | Y         |                                                                                                                                                               |
+| Not                       | Y         |                                                                                                                                                               |
+| OneHot                    | N         |                                                                                                                                                               |
+| Optional                  | N         |                                                                                                                                                               |
+| OptionalGetElement        | N         |                                                                                                                                                               |
+| OptionalHasElement        | N         |                                                                                                                                                               |
+| Or                        | Y         |                                                                                                                                                               |
+| PRelu                     | Y         |                                                                                                                                                               |
+| Pad                       | Y         | Padding is implemented to pad the last 3 dimensions of 5D input tensor, or the last 2 dimensions of 4D input tensor, or the last dimension of 3D input tensor |
+| Pow                       | Y         |                                                                                                                                                               |
+| QLinearConv               | N         |                                                                                                                                                               |
+| QLinearMatMul             | N         |                                                                                                                                                               |
+| QuantizeLinear            | N         |                                                                                                                                                               |
+| RNN                       | N         |                                                                                                                                                               |
+| RandomNormal              | N         |                                                                                                                                                               |
+| RandomNormalLike          | N         |                                                                                                                                                               |
+| RandomUniform             | N         |                                                                                                                                                               |
+| RandomUniformLike         | N         |                                                                                                                                                               |
+| Reciprocal                | Y         |                                                                                                                                                               |
+| ReduceL1                  | Y         |                                                                                                                                                               |
+| ReduceL2                  | Y         |                                                                                                                                                               |
+| ReduceLogSum              | Y         |                                                                                                                                                               |
+| ReduceLogSumExp           | Y         |                                                                                                                                                               |
+| ReduceMax                 | Y         |                                                                                                                                                               |
+| ReduceMean                | Y         |                                                                                                                                                               |
+| ReduceMin                 | Y         |                                                                                                                                                               |
+| ReduceProd                | Y         |                                                                                                                                                               |
+| ReduceSum                 | Y         |                                                                                                                                                               |
+| ReduceSumSquare           | Y         |                                                                                                                                                               |
+| Relu                      | Y         |                                                                                                                                                               |
+| Reshape                   | Y         | Parameter "allowzero" = 1 is not implemented                                                                                                                  |
+| Resize                    | Y         | Roi logic is not implemented (pytorch's interpolate cannot resize channel or batch dimensions)                                                                |
+| ReverseSequence           | N         |                                                                                                                                                               |
+| RoiAlign                  | Y         | Only "avg" mode is supported                                                                                                                                  |
+| Round                     | Y         |                                                                                                                                                               |
+| Scan                      | N         |                                                                                                                                                               |
+| Scatter(deprecated)       | N         |                                                                                                                                                               |
+| ScatterElements           | N         |                                                                                                                                                               |
+| ScatterND                 | Y         | Only "none" reduction is supported                                                                                                                            |
+| Selu                      | Y         | Parameters "alpha" and "gamma" must be default                                                                                                                |
+| SequenceAt                | N         |                                                                                                                                                               |
+| SequenceConstruct         | N         |                                                                                                                                                               |
+| SequenceEmpty             | N         |                                                                                                                                                               |
+| SequenceErase             | N         |                                                                                                                                                               |
+| SequenceInsert            | N         |                                                                                                                                                               |
+| SequenceLength            | N         |                                                                                                                                                               |
+| Shape                     | Y         |                                                                                                                                                               |
+| Shrink                    | N         |                                                                                                                                                               |
+| Sigmoid                   | Y         |                                                                                                                                                               |
+| Sign                      | Y         |                                                                                                                                                               |
+| Sin                       | Y         |                                                                                                                                                               |
+| Sinh                      | N         |                                                                                                                                                               |
+| Size                      | N         |                                                                                                                                                               |
+| Slice                     | Y         |                                                                                                                                                               |
+| Softplus                  | Y         |                                                                                                                                                               |
+| Softsign                  | Y         |                                                                                                                                                               |
+| SpaceToDepth              | N         |                                                                                                                                                               |
+| Split                     | Y         |                                                                                                                                                               |
+| SplitToSequence           | N         |                                                                                                                                                               |
+| Sqrt                      | Y         |                                                                                                                                                               |
+| Squeeze                   | Y         |                                                                                                                                                               |
+| StringNormalizer          | N         |                                                                                                                                                               |
+| Sub                       | Y         |                                                                                                                                                               |
+| Sum                       | Y         |                                                                                                                                                               |
+| Tan                       | Y         |                                                                                                                                                               |
+| Tanh                      | Y         |                                                                                                                                                               |
+| TfIdfVectorizer           | N         |                                                                                                                                                               |
+| ThresholdedRelu           | N         |                                                                                                                                                               |
+| Tile                      | Y         |                                                                                                                                                               |
+| TopK                      | Y         |                                                                                                                                                               |
+| Transpose                 | Y         |                                                                                                                                                               |
+| Trilu                     | N         |                                                                                                                                                               |
+| Unique                    | N         |                                                                                                                                                               |
+| Unsqueeze                 | Y         |                                                                                                                                                               |
+| Upsample(deprecated)      | N         |                                                                                                                                                               |
+| Where                     | Y         |                                                                                                                                                               |
+| Xor                       | Y         |                                                                                                                                                               |
+| Function                  | N         |                                                                                                                                                               |
+| Bernoulli                 | N         |                                                                                                                                                               |
+| CastLike                  | N         |                                                                                                                                                               |
+| Celu                      | Y         |                                                                                                                                                               |
+| DynamicQuantizeLinear     | N         |                                                                                                                                                               |
+| GreaterOrEqual            | Y         |                                                                                                                                                               |
+| HardSwish                 | Y         |                                                                                                                                                               |
+| LessOrEqual               | Y         |                                                                                                                                                               |
+| LogSoftmax                | Y         |                                                                                                                                                               |
+| MeanVarianceNormalization | N         |                                                                                                                                                               |
+| NegativeLogLikelihoodLoss | N         |                                                                                                                                                               |
+| Range                     | Y         |                                                                                                                                                               |
+| SequenceMap               | N         |                                                                                                                                                               |
+| Softmax                   | Y         |                                                                                                                                                               |
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/pyproject.toml b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/pyproject.toml
new file mode 100644
index 000000000..797e804a4
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/pyproject.toml
@@ -0,0 +1,109 @@
+[project]
+name = 'onnx2torch'
+version = '1.5.14'
+license = {file = 'LICENSE'}
+description = 'ONNX to PyTorch converter'
+readme = 'README.md'
+keywords = ['AI', 'onnx', 'torch', 'onnx2torch', 'converters']
+authors = [{name = 'ENOT LLC', email = 'enot@enot.ai'}]
+classifiers = [
+    'Development Status :: 5 - Production/Stable',
+    'License :: OSI Approved :: Apache Software License',
+    'Programming Language :: Python',
+    'Programming Language :: Python :: 3 :: Only',
+]
+requires-python = '>=3.6'
+dependencies = [
+    'numpy>=1.16.4',
+    'onnx>=1.9.0',
+    'torch>=1.8.0',
+    'torchvision>=0.9.0',
+]
+
+[project.optional-dependencies]
+dev = [
+    'pytest',
+    'black',
+    'isort',
+    'pylint',
+    'pre-commit',
+    'onnxruntime',
+    'Pillow',
+    'requests',
+    'googledrivedownloader',
+]
+
+[project.urls]
+homepage = 'https://enot.ai'
+repository = 'https://github.com/ENOT-AutoDL/onnx2torch'
+
+[tool.setuptools.packages.find]
+include = ['onnx2torch*']
+
+[tool.commitizen]
+name = 'cz_conventional_commits'
+tag_format = '$version'
+version_scheme = 'pep440'
+version_provider = 'pep621'
+update_changelog_on_bump = true
+major_version_zero = true
+
+[tool.docformatter]
+recursive = true
+wrap-summaries = 0
+wrap-descriptions = 0
+blank = true
+black = true
+pre-summary-newline = true
+
+[tool.yamlfix]
+line_length = 120
+explicit_start = false
+sequence_style = 'keep_style'
+whitelines = 1
+section_whitelines = 1
+
+[tool.black]
+line-length = 120
+target-version = ['py36', 'py37', 'py38', 'py39']
+include = '\.pyi?$'
+skip-string-normalization = true
+
+[tool.isort]
+profile = 'black'
+line_length = 120
+ensure_newline_before_comments = true
+force_single_line = true
+
+[tool.pylint.master]
+load-plugins = ['pylint.extensions.docparams']
+
+[tool.pylint.format]
+max-line-length = 120
+
+[tool.pylint.design]
+max-args = 12
+max-locals = 30
+max-attributes = 20
+min-public-methods = 0
+
+[tool.pylint.typecheck]
+generated-members = ['torch.*']
+
+[tool.pylint.messages_control]
+disable = [
+    'logging-fstring-interpolation',
+    'cyclic-import',
+    'duplicate-code',
+    'missing-module-docstring',
+    'unnecessary-pass',
+    'no-name-in-module',
+]
+
+[tool.pylint.BASIC]
+good-names = ['bs', 'bn']
+
+[tool.pyright]
+reportMissingImports = false
+reportMissingTypeStubs = false
+reportWildcardImportFromLibrary = false
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/setup.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/setup.py
new file mode 100644
index 000000000..81368830f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/setup.py
@@ -0,0 +1,26 @@
+import setuptools
+
+
+import setuptools  # noqa
+
+setuptools.setup(
+    name="onnx2torch",
+    author="ENOT LLC",
+    version="1.15.4",
+    author_email="enot@enot.ai",
+    packages=setuptools.find_packages(where="onnx2torch"),
+    python_requires=">=3.6",
+    install_requires=[
+        'numpy>=1.16.4',
+        'onnx>=1.9.0',
+        'torch>=1.8.0',
+        'torchvision>=0.9.0',
+    ],
+    entry_points={
+        'console_scripts': [
+            'apap = package.main:main',
+        ]
+
+    }
+)
+
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/__init__.py
new file mode 100644
index 000000000..3e3d8e059
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/__init__.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+TMP_DIR = Path(__file__).parent / '.tmp'  # `.tmp` directory located next to this package
+MODELS_DIR = TMP_DIR / 'models'
+DATASETS_DIR = TMP_DIR / 'datasets'
+
+TMP_DIR.mkdir(exist_ok=True)  # runs at import time; parent first, since mkdir() is called without parents=True
+MODELS_DIR.mkdir(exist_ok=True)
+DATASETS_DIR.mkdir(exist_ok=True)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/README.md b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/README.md
new file mode 100644
index 000000000..9877da81b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/README.md
@@ -0,0 +1,21 @@
+## 环境安装
+```
+
+git clone -b bytemlperf ssh://git@bitbucket.iluvatar.ai:7999/swapp/onnx2torch.git 
+cd onnx2torch/
+# 模型路径
+# ln -s /home/data/bytemlperf/stable_diffusion .
+pip3 install onnx onnxconverter onnxconverter_common onnxmltools onnx-simplifier
+# 修改为你的路径/path/to/onnx2torch
+export PYTHONPATH=${PYTHONPATH}:/path/to/onnx2torch
+
+
+```
+## float32推理
+```
+python3 tests/models/test_clip_text_encoder.py 
+```
+## float16推理
+```
+python3 tests/models/test_clip_text_encoder_half.py 
+```
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/models_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/models_test.py
new file mode 100644
index 000000000..e3855823c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/models_test.py
@@ -0,0 +1,226 @@
+from typing import Tuple
+
+import numpy as np
+import pytest
+import torchvision
+from onnx import version_converter
+from PIL import Image
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import check_torch_model
+from tests.utils.resources import get_minimal_dataset_path
+from tests.utils.resources import get_model
+
+_COCO_MEAN = np.array([0.406, 0.485, 0.456], dtype=np.float32)
+_COCO_STD = np.array([0.225, 0.224, 0.229], dtype=np.float32)
+
+
+def create_test_batch(  # pylint: disable=missing-function-docstring
+    batch_size: int = 32,
+    target_size: Tuple[int, int] = (224, 224),
+) -> np.ndarray:
+    minimal_dataset_path = get_minimal_dataset_path()
+
+    batch = []
+    for index, image_path in enumerate(minimal_dataset_path.glob('*.jpg')):
+        if index >= batch_size:
+            break
+
+        image = Image.open(image_path).convert('RGB')
+        image = image.resize(size=target_size)
+        image = (np.array(image, dtype=np.float32) / 255.0 - _COCO_MEAN) / _COCO_STD
+        image = image.transpose([2, 0, 1])
+
+        batch.append(image)
+    else:
+        raise ValueError('Batch size ({n}) is too large.')
+
+    return np.array(batch)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_resnet50():  # pylint: disable=missing-function-docstring
+    model = get_model('resnet50')
+    model = version_converter.convert_version(model, 11)
+
+    input_name = model.graph.input[0].name
+    test_inputs = {input_name: np.random.randn(1, 3, 224, 224).astype(dtype=np.float32)}
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-5,
+        atol_torch_cpu_cuda=10**-5,
+    )
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'model_name,resolution',
+    (
+        ('retinanet', (604, 604)),
+        ('ssd300_vgg', (604, 604)),
+        ('ssdlite', (224, 224)),
+        ('yolov3_d53', (604, 604)),
+        ('yolov5_ultralitics', (672, 256)),
+        ('deeplabv3_mnv3_large', (320, 320)),
+        ('deeplabv3_plus_resnet101', (486, 500)),
+        ('hrnet', (321, 321)),
+        ('unet', (320, 320)),
+    ),
+)
+def test_onnx_models(  # pylint: disable=missing-function-docstring
+    model_name: str, resolution: Tuple[int, int]
+) -> None:
+    model = get_model(model_name)
+    input_name = model.graph.input[0].name
+    test_inputs = {
+        input_name: create_test_batch(batch_size=1, target_size=resolution),
+    }
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-3,
+        atol_torch_cpu_cuda=10**-3,
+        atol_onnx_torch2onnx=10**-3,
+    )
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'model_name',
+    (
+        'resnet18',
+        'resnet50',
+        'mobilenet_v2',
+        'mobilenet_v3_large',
+        'efficientnet_b0',
+        'efficientnet_b1',
+        'efficientnet_b2',
+        'efficientnet_b3',
+        'wide_resnet50_2',
+        'resnext50_32x4d',
+        'vgg16',
+        'googlenet',
+        'mnasnet1_0',
+        'regnet_y_400mf',
+        'regnet_y_16gf',
+    ),
+)
+def test_torchvision_classification(model_name: str) -> None:  # pylint: disable=missing-function-docstring
+    torch_model = getattr(torchvision.models, model_name)(pretrained=True)
+    test_inputs = {
+        'inputs': create_test_batch(batch_size=32),
+    }
+
+    check_torch_model(
+        torch_model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+        atol_onnx_torch2onnx=10**-4,
+    )
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'model_name',
+    (
+        'fcn_resnet50',
+        'deeplabv3_resnet50',
+        'lraspp_mobilenet_v3_large',
+    ),
+)
+def test_torchvision_segmentation(model_name: str) -> None:  # pylint: disable=missing-function-docstring
+    torch_model = getattr(torchvision.models.segmentation, model_name)(pretrained=True)
+    test_inputs = {
+        'inputs': create_test_batch(batch_size=8),
+    }
+
+    check_torch_model(
+        torch_model,
+        test_inputs,
+        atol_onnx_torch=10**-3,
+        atol_torch_cpu_cuda=10**-3,
+        atol_onnx_torch2onnx=10**-3,
+    )
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'model_name',
+    (
+        'vit',
+        'swin',
+    ),
+)
+def test_transformer_models(model_name: str) -> None:  # pylint: disable=missing-function-docstring
+    model = get_model(model_name)
+    input_name = model.graph.input[0].name
+    test_inputs = {
+        input_name: create_test_batch(batch_size=8, target_size=(224, 224)),
+    }
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+        atol_onnx_torch2onnx=10**-4,
+    )
+
+
+def test_3d_gan() -> None:  # pylint: disable=missing-function-docstring
+    model = get_model('3d_gan')
+    input_name = model.graph.input[0].name
+    test_inputs = {input_name: np.random.randn(32, 200).astype(dtype=np.float32)}
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+    )
+
+
+def test_shelfnet() -> None:  # pylint: disable=missing-function-docstring
+    model = get_model('shelfnet')
+    input_name = model.graph.input[0].name
+    test_inputs = {input_name: np.random.randn(8, 3, 384, 288).astype(dtype=np.float32)}
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+    )
+
+
+def test_model_with_pad_node() -> None:  # pylint: disable=missing-function-docstring
+    model = get_model('point_arch')
+    input_name = model.graph.input[0].name
+    test_inputs = {input_name: np.random.randn(1, 49, 40, 1).astype(dtype=np.float32)}
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+    )
+
+
+def test_gptj() -> None:  # pylint: disable=missing-function-docstring
+    model = get_model('gptj_2_random_blocks')
+    input_name = model.graph.input[0].name
+    test_inputs = {
+        input_name: np.random.randint(low=1, high=1024, size=[4, 256], dtype=np.int64),
+    }
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-5,
+        atol_torch_cpu_cuda=10**-5,
+        atol_onnx_torch2onnx=10**-7,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder.py
new file mode 100644
index 000000000..23951fb45
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder.py
@@ -0,0 +1,39 @@
+import os.path
+
+import onnx
+import torch
+from onnx2torch import convert
+
+
+models = ["clip-text-encoder.onnx", "vae-decoder.onnx", "vae-encoder.onnx"]
+model_paths = [os.path.abspath(os.path.join(__file__, "../../../stable_diffusion", p)) for p in models]
+
+device = "cpu"
+if torch.cuda.is_available():
+    device = "cuda"
+
+converted_models = [convert(model).to(device) for model in model_paths]
+model_inputs = [
+    (torch.randint(0, 10, (2, 16), device=device),),
+    (torch.randn([2, 4, 32, 32], device=device),),
+    (torch.randn([2, 3, 256, 256], device=device),)
+]
+
+import time
+#warmup
+for name, model, inputs in zip(models, converted_models, model_inputs):  
+    model = model.eval()
+    out = model(*inputs)     
+    
+for name, model, inputs in zip(models, converted_models, model_inputs):
+    model = model.eval()
+    torch.cuda.synchronize()
+    time_start = time.time()
+    out = model(*inputs)
+    torch.cuda.synchronize()        
+    time_each = time.time() - time_start
+    print(f"{name} time is {time_each}")
+    if torch.is_tensor(out):
+        print(name, out.shape)
+    else:
+        print(name, [t.shape for t in out])
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder_half.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder_half.py
new file mode 100644
index 000000000..79b148c6d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/models/test_clip_text_encoder_half.py
@@ -0,0 +1,55 @@
+import os.path
+
+import onnx
+from onnxsim import simplify
+import torch
+from onnx2torch import convert
+from onnx import load_model, save_model
+from onnxmltools.utils import float16_converter
+models = ["clip-text-encoder.onnx", "vae-decoder.onnx", "vae-encoder.onnx"]
+# models = ["clip-text-encoder.onnx"]
+
+model_paths = [os.path.abspath(os.path.join(__file__, "../../../stable_diffusion", p)) for p in models]
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# torch.cuda.synchronize() raises on CPU-only hosts, so fall back to a no-op there.
+sync = torch.cuda.synchronize if device == "cuda" else (lambda: None)
+# Convert the ONNX models to fp16 (keep_io_types=False, so graph I/O tensors are converted too).
+onnx_model = [load_model(model) for model in model_paths]
+trans_model = [float16_converter.convert_float_to_float16(model,keep_io_types=False) for model in onnx_model]
+
+# Simplify each fp16 graph with onnxsim.
+for i,model in enumerate(trans_model):
+    model_simply, check = simplify(model)
+    # onnx.save(model_simply, "new.onnx")
+    trans_model[i]=model_simply
+    assert check, "Simplified ONNX model could not be validated"
+# Convert the ONNX graphs to PyTorch modules.
+converted_models = [convert(model).to(device) for model in trans_model]
+dtype=torch.float16
+model_inputs = [
+    (torch.randint(0, 10, (2, 16), device=device),),
+    (torch.randn([2, 4, 32, 32], device=device,dtype=dtype),),
+    (torch.randn([2, 3, 256, 256], device=device,dtype=dtype),)
+]
+import time
+# Warm-up pass: keeps one-time initialisation cost out of the timed run below.
+for name, model, inputs in zip(models, converted_models, model_inputs):
+    model = model.eval()
+    out = model(*inputs)
+
+for name, model, inputs in zip(models, converted_models, model_inputs):
+    model = model.eval()
+    sync()
+    time_start = time.time()
+    # torch.cuda.profiler.start()
+    out = model(*inputs)
+    # torch.cuda.profiler.stop()
+
+    sync()
+    time_each = time.time() - time_start
+    print(f"{name} time is {time_each}")
+    if torch.is_tensor(out):
+        print(name, out.shape)
+    else:
+        print(name, [t.shape for t in out])
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/activations_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/activations_test.py
new file mode 100644
index 000000000..c92fdb10e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/activations_test.py
@@ -0,0 +1,121 @@
+from typing import List
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_activation(  # pylint: disable=missing-function-docstring
+    activation: str, data: np.ndarray, opset_version, **kwargs
+) -> None:
+    test_inputs = {'input_tensor': data}
+
+    node = onnx.helper.make_node(op_type=activation, inputs=['input_tensor'], outputs=['y'], **kwargs)
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-6,
+        atol_torch_cpu_cuda=10**-6,
+    )
+
+
+@pytest.mark.parametrize(
+    'activation,input_shape,opset_version',
+    (
+        ('Erf', [8, 3, 32, 32], 11),
+        ('HardSigmoid', [8, 3, 32, 32], 11),
+        ('HardSwish', [8, 3, 32, 32], 14),
+        ('LeakyRelu', [8, 3, 32, 32], 11),
+        ('LeakyRelu', [8, 3, 32, 32], 16),
+        ('LogSoftmax', [8, 3, 32, 32], 11),
+        ('Softsign', [8, 3, 32, 32], 1),
+        ('Softplus', [8, 3, 32, 32], 1),
+        ('Relu', [8, 3, 32, 32], 11),
+        ('Elu', [8, 3, 32, 32], 6),
+        ('Celu', [8, 3, 32, 32], 12),
+        ('Selu', [8, 3, 32, 32], 6),
+        ('Sigmoid', [8, 3, 32, 32], 11),
+    ),
+)
+def test_common_activations(  # pylint: disable=missing-function-docstring
+    activation: str,
+    input_shape: List[int],
+    opset_version: int,
+) -> None:
+    data = np.random.randn(*input_shape).astype(np.float32)
+    _test_activation(activation, data=data, opset_version=opset_version)
+
+
+@pytest.mark.parametrize(
+    'input_shape,axis,opset_version',
+    (
+        ([8, 3, 32, 32], None, 9),
+        ([8, 3, 32, 32], None, 11),
+        ([8, 3, 32, 32], None, 13),
+        ([8, 3, 32, 32], 0, 9),
+        ([8, 3, 32, 32], 0, 11),
+        ([8, 3, 32, 32], 0, 13),
+        ([8, 3, 32, 32], 1, 9),
+        ([8, 3, 32, 32], 1, 11),
+        ([8, 3, 32, 32], 1, 13),
+        ([8, 3, 32, 32], -1, 9),
+        ([8, 3, 32, 32], -1, 11),
+        ([8, 3, 32, 32], -1, 13),
+    ),
+)
+@pytest.mark.parametrize('activation', ('Softmax', 'LogSoftmax'))
+def test_softmax(  # pylint: disable=missing-function-docstring
+    activation: str,
+    input_shape: List[int],
+    axis: Optional[int],
+    opset_version: int,
+) -> None:
+    data = np.random.randn(*input_shape).astype(np.float32)
+    if axis is None:
+        _test_activation(activation, data=data, opset_version=opset_version)
+    else:
+        _test_activation(activation, data=data, opset_version=opset_version, axis=axis)
+
+
+@pytest.mark.parametrize(
+    'opset_version',
+    (7, 9, 11),
+)
+@pytest.mark.parametrize(
+    'input_shape,slope_shape',
+    (
+        ([8, 3, 32, 32], [1, 1, 32]),
+        ([8, 3, 32, 32], [1, 32, 32]),
+        ([8, 3, 32, 32], [3, 1, 1]),
+        ([8, 3, 32, 32], [1]),
+    ),
+)
+def test_prelu(  # pylint: disable=missing-function-docstring
+    input_shape: List[int],
+    slope_shape: List[int],
+    opset_version: int,
+) -> None:
+    data = np.random.randn(*input_shape).astype(np.float32)
+    slope = np.random.randn(*slope_shape).astype(np.float32)
+    test_inputs = {'input_tensor': data, 'slope': slope}
+
+    node = onnx.helper.make_node(op_type='PRelu', inputs=['input_tensor', 'slope'], outputs=['y'])
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/average_pool_max_pool_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/average_pool_max_pool_test.py
new file mode 100644
index 000000000..247ae5227
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/average_pool_max_pool_test.py
@@ -0,0 +1,99 @@
+from typing import Dict
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_pool_op(
+    op_type,
+    input_shape: List[int],
+    atol_onnx_torch: float = 0.0,
+    **kwargs,
+) -> None:
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(
+        op_type,
+        inputs=['x'],
+        outputs=['y'],
+        **kwargs,
+    )
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs)
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=atol_onnx_torch,
+    )
+
+
+@pytest.mark.parametrize(
+    'op',
+    (
+        'MaxPool',
+        'AveragePool',
+    ),
+)
+@pytest.mark.parametrize(
+    'input_shape,kernel_shape,optional_attrs',
+    (
+        # 1d
+        ([2, 3, 16], [2], {}),
+        ([2, 3, 16], [1], {}),
+        ([2, 3, 16], [3], {}),
+        ([2, 3, 16], [2], {'strides': [3]}),
+        ([2, 3, 16], [2], {'ceil_mode': 1}),
+        # 2d
+        ([2, 3, 16, 16], [2, 2], {}),
+        ([2, 3, 16, 16], [1, 2], {}),
+        ([2, 3, 16, 16], [3, 2], {}),
+        ([2, 3, 16, 16], [2, 2], {'strides': [2, 3]}),
+        ([2, 3, 16, 16], [2, 2], {'ceil_mode': 1}),
+        # 3d
+        ([2, 3, 16, 16, 16], [2, 2, 2], {}),
+        ([2, 3, 16, 16, 16], [1, 2, 3], {}),
+        ([2, 3, 16, 16, 16], [3, 2, 1], {}),
+        ([2, 3, 16, 16, 16], [2, 2, 2], {'strides': [1, 2, 3]}),
+        ([2, 3, 16, 16, 16], [2, 2, 2], {'ceil_mode': 1}),
+    ),
+)
+def test_max_pool_average_pool(  # pylint: disable=missing-function-docstring
+    op: str,  # pylint: disable=invalid-name
+    input_shape: List[int],
+    kernel_shape: List[int],
+    optional_attrs: Dict,
+) -> None:
+    # The parametrized dict is shared between the MaxPool and AveragePool cases; copy, never mutate it.
+    attrs = {**optional_attrs, 'atol_onnx_torch': 10**-7} if op == 'AveragePool' else optional_attrs
+
+    _test_pool_op(op, input_shape=input_shape, kernel_shape=kernel_shape, **attrs)
+
+
+@pytest.mark.parametrize(
+    'input_shape,kernel_shape,optional_attrs',
+    (
+        # 1d
+        ([2, 3, 16], [2], {'pads': [1] * 2}),
+        ([2, 3, 16], [3], {'pads': [0, 1]}),
+        ([2, 3, 16], [3], {'pads': [2, 0]}),
+        # 2d
+        ([2, 3, 16, 16], [2, 2], {'pads': [1] * 4}),
+        ([2, 3, 16, 16], [2, 2], {'pads': [0] * 2 + [1] * 2}),
+        ([2, 3, 16, 16], [3, 3], {'pads': [0, 1, 1, 0]}),
+        # 3d
+        ([2, 3, 16, 16, 16], [2, 2, 2], {'pads': [1] * 6}),
+        ([2, 3, 16, 16, 16], [2, 2, 2], {'pads': [0] * 3 + [1] * 3}),
+        ([2, 3, 16, 16, 16], [3, 3, 3], {'pads': [0, 1, 2, 2, 1, 0]}),
+    ),
+)
+def test_max_pool_padding(  # pylint: disable=missing-function-docstring
+    input_shape: List[int],
+    kernel_shape: List[int],
+    optional_attrs: Dict,
+) -> None:
+    _test_pool_op('MaxPool', input_shape=input_shape, kernel_shape=kernel_shape, **optional_attrs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/batch_norm_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/batch_norm_test.py
new file mode 100644
index 000000000..7db854df0
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/batch_norm_test.py
@@ -0,0 +1,65 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'parameters_as_inputs',
+    (True, False),
+)
+@pytest.mark.parametrize(
+    'input_shape',
+    (
+        # 1d
+        [2, 3, 16],
+        [2, 1, 7],
+        # 2d
+        [2, 3, 16, 16],
+        [2, 1, 7, 16],
+        # 3d
+        [2, 3, 16, 16, 16],
+        [2, 1, 16, 7, 16],
+    ),
+)
+def test_batch_norm(  # pylint: disable=missing-function-docstring
+    input_shape: List[int],
+    parameters_as_inputs: bool,
+) -> None:
+    num_features = input_shape[1]
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)
+    scale = np.random.uniform(low=0.0, high=1.0, size=num_features).astype(np.float32)
+    bias = np.random.uniform(low=-1.0, high=1.0, size=num_features).astype(np.float32)
+    mean = np.random.uniform(low=-1.0, high=1.0, size=num_features).astype(np.float32)
+    var = np.random.uniform(low=0.001, high=0.5, size=num_features).astype(np.float32)
+    # scale/bias/mean/var go either into the example inputs or into the initializers, per the flag.
+    test_inputs = {'x': x}
+    initializers = {}
+    parameters = {
+        'scale': scale,
+        'bias': bias,
+        'mean': mean,
+        'var': var,
+    }
+    if parameters_as_inputs:
+        test_inputs.update(parameters)
+    else:
+        initializers.update(parameters)
+
+    node = onnx.helper.make_node(
+        op_type='BatchNormalization',
+        inputs=['x', 'scale', 'bias', 'mean', 'var'],
+        outputs=['y'],
+    )
+
+    model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=test_inputs)
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-6,
+        atol_torch_cpu_cuda=10**-6,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/binary_operations_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/binary_operations_test.py
new file mode 100644
index 000000000..73bec3bbf
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/binary_operations_test.py
@@ -0,0 +1,60 @@
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'op_type',
+    ('Add', 'Sub', 'Mul', 'Div'),
+)
+def test_math_binary_operation(op_type: str) -> None:  # pylint: disable=missing-function-docstring
+    input_shape = [10, 3, 128, 128]
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)
+    y_variants = [
+        np.random.uniform(low=-1.0, high=1.0, size=1).astype(np.float32),
+        np.random.uniform(low=-1.0, high=1.0, size=[1] * len(input_shape)).astype(np.float32),
+        np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32),
+        np.array([0.0], dtype=np.float32),
+    ]
+    for y in y_variants:
+        test_inputs = {'x': x, 'y': y}
+        initializers = {}
+        node = onnx.helper.make_node(
+            op_type=op_type,
+            inputs=['x', 'y'],
+            outputs=['z'],
+        )
+
+        model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=test_inputs)
+        check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'x, y',
+    [
+        (1, 2),
+        (1, 5),
+        (5, 30),
+        (-1, 2),
+        (-1, 5),
+        (5, -30),
+        (5, 2),
+        (-5, 2),
+    ],
+)
+def test_div_operation(x: int, y: int) -> None:  # pylint: disable=missing-function-docstring
+    x_ = np.array(x, dtype=np.int64)  # pylint: disable=invalid-name
+    y_ = np.array(y, dtype=np.int64)  # pylint: disable=invalid-name
+    test_inputs = {'x': x_, 'y': y_}
+
+    node = onnx.helper.make_node(
+        op_type='Div',
+        inputs=['x', 'y'],
+        outputs=['z'],
+    )
+
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs)
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/clip_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/clip_test.py
new file mode 100644
index 000000000..22c1607ec
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/clip_test.py
@@ -0,0 +1,61 @@
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_clip(
+    input_shape: Tuple[int, int, int, int],
+    min_value: Optional[float] = None,
+    max_value: Optional[float] = None,
+    **kwargs,
+) -> None:
+    x_range = 2 * max_value if max_value is not None else 5
+    x = np.random.uniform(low=-x_range, high=x_range, size=input_shape).astype(np.float32)
+    test_inputs = {'x': x}
+
+    initializers = {}
+    if min_value is not None:
+        initializers['min'] = np.array(min_value, dtype=np.float32)
+
+    if max_value is not None:
+        initializers['max'] = np.array(max_value, dtype=np.float32)
+
+    node = onnx.helper.make_node(
+        op_type='Clip',
+        inputs=list(test_inputs) + list(initializers),
+        outputs=['y'],
+        **kwargs,
+    )
+    model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=test_inputs)
+    check_onnx_model(model, test_inputs)
+
+
+def _test_clip_opset9(
+    input_shape: Tuple[int, int, int, int],
+    **kwargs,
+) -> None:
+    x = np.random.uniform(low=-10.0, high=10.0, size=input_shape).astype(np.float32)
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(
+        op_type='Clip',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs, opset_version=9)
+    check_onnx_model(model, test_inputs)
+
+
+def test_clip() -> None:  # pylint: disable=missing-function-docstring
+    _test_clip(input_shape=(2, 3, 16, 16), min_value=0.0, max_value=6.0)
+    _test_clip(input_shape=(2, 3, 16, 16), min_value=0.0)
+    _test_clip(input_shape=(2, 3, 16, 16), min_value=-1.5, max_value=2.5)
+    _test_clip_opset9(input_shape=(2, 3, 16, 16), min=0.0, max=6.0)
+    _test_clip_opset9(input_shape=(2, 3, 16, 16), min=0.0)
+    _test_clip_opset9(input_shape=(2, 3, 16, 16), min=-1.7, max=2.8)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/comparisons_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/comparisons_test.py
new file mode 100644
index 000000000..ac633215d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/comparisons_test.py
@@ -0,0 +1,63 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_comparison(op_type: str, x: np.ndarray, y: np.ndarray, opset_version: int = 13) -> None:
+    test_inputs = {'x': x, 'y': y}
+
+    node = onnx.helper.make_node(
+        op_type=op_type,
+        inputs=list(test_inputs),
+        outputs=['out'],
+    )
+    outputs_info = [
+        make_tensor_value_info(
+            name='out',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[np.dtype('bool')],
+            shape=x.shape,
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'op_type,x_shape,y_shape',
+    (
+        ('Equal', [3, 4, 5], [5]),
+        ('Equal', [3, 4, 5], [3, 4, 5]),
+        ('Less', [3, 4, 5], [5]),
+        ('Less', [3, 4, 5], [3, 4, 5]),
+        ('Greater', [3, 4, 5], [5]),
+        ('Greater', [3, 4, 5], [3, 4, 5]),
+        ('LessOrEqual', [3, 4, 5], [5]),
+        ('LessOrEqual', [3, 4, 5], [3, 4, 5]),
+        ('GreaterOrEqual', [3, 4, 5], [5]),
+        ('GreaterOrEqual', [3, 4, 5], [3, 4, 5]),
+    ),
+)
+def test_comparison(  # pylint: disable=missing-function-docstring
+    op_type: str,
+    x_shape: List[int],
+    y_shape: List[int],
+) -> None:
+    _test_comparison(
+        op_type=op_type,
+        x=np.random.randn(*x_shape),
+        y=np.random.randn(*y_shape),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/concat_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/concat_test.py
new file mode 100644
index 000000000..2695e140a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/concat_test.py
@@ -0,0 +1,51 @@
+from itertools import product
+from typing import List
+
+import numpy as np
+import onnx
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_concat(
+    input_arrays_shapes: List[List[int]],
+    opset_version: int,
+    **kwargs,
+) -> None:
+    test_inputs = {}
+    for index, input_array_shape in enumerate(input_arrays_shapes):
+        x = np.random.uniform(low=-1.0, high=1.0, size=input_array_shape).astype(np.float32)
+        node_name = f'x_{index}'
+        test_inputs[node_name] = x
+
+    node = onnx.helper.make_node(
+        'Concat',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+
+    onnx_type = NP_TYPE_TO_TENSOR_TYPE[np.dtype('float32')]
+    outputs_info = [make_tensor_value_info(name='y', elem_type=onnx_type, shape=None)]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+def test_concat() -> None:  # pylint: disable=missing-function-docstring
+    opset_variants = (9, 13)
+    axis_variants = (0, 1)
+    for opset_version, axis in product(opset_variants, axis_variants):
+        _test_concat(
+            input_arrays_shapes=[[1, 3, 16, 16], [1, 3, 16, 16], [1, 3, 16, 16]],
+            axis=axis,
+            opset_version=opset_version,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_of_shape_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_of_shape_test.py
new file mode 100644
index 000000000..8277bb274
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_of_shape_test.py
@@ -0,0 +1,44 @@
+import random
+
+import numpy as np
+import onnx
+import pytest
+from onnx import numpy_helper
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_constant_of_shape(shape: np.ndarray, value: np.ndarray) -> None:
+    test_inputs = {'shape': shape}
+    onnx_type = NP_TYPE_TO_TENSOR_TYPE[value.dtype]
+
+    node = onnx.helper.make_node(
+        'ConstantOfShape',
+        inputs=list(test_inputs),
+        outputs=['output'],
+        value=numpy_helper.from_array(value, name='value'),
+    )
+
+    outputs_info = [make_tensor_value_info(name='output', elem_type=onnx_type, shape=shape.tolist())]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_constant_of_shape() -> None:  # pylint: disable=missing-function-docstring
+    for _ in range(10):
+        size = random.randint(1, 6)
+        shape = np.random.randint(low=1, high=2, size=(size,))
+        value = np.random.uniform(low=-10000, high=10000, size=(1,))
+        _test_constant_of_shape(shape, value)
+
+    _test_constant_of_shape(np.asarray([3, 3]), np.asarray([True]))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_test.py
new file mode 100644
index 000000000..1936da8a9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/constant_test.py
@@ -0,0 +1,38 @@
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx import numpy_helper
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_constant_as_tensor(shape: Tuple[int, ...], dtype: np.dtype) -> None:
+    values = np.random.randn(*shape).astype(dtype)
+    onnx_type = NP_TYPE_TO_TENSOR_TYPE[values.dtype]
+    node = onnx.helper.make_node(
+        'Constant',
+        inputs=[],
+        outputs=['values'],
+        value=numpy_helper.from_array(values, name='const_tensor'),
+    )
+
+    outputs_info = [make_tensor_value_info(name='values', elem_type=onnx_type, shape=values.shape)]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example={},
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, onnx_inputs={})
+
+
+@pytest.mark.filterwarnings('ignore:No input args')
+def test_constant() -> None:  # pylint: disable=missing-function-docstring
+    _test_constant_as_tensor((16, 16, 16), np.dtype('int32'))
+    _test_constant_as_tensor((16, 16, 16), np.dtype('int64'))  # one case per dtype, no duplicated runs
+    _test_constant_as_tensor((16, 16, 16), np.dtype('float32'))
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/conv_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/conv_test.py
new file mode 100644
index 000000000..72dcf3ffa
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/conv_test.py
@@ -0,0 +1,191 @@
+from itertools import chain
+from itertools import product
+from typing import Tuple
+
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_conv(
+    op_type: str,
+    in_channels: int,
+    out_channels: int,
+    kernel_shape: Tuple[int, int],
+    input_hw: Tuple[int, int],
+    **kwargs,
+) -> None:
+    group = kwargs.get('group', 1)
+
+    x_shape = (2, in_channels) + input_hw
+    x = np.random.uniform(low=-1.0, high=1.0, size=x_shape).astype(np.float32)
+    if op_type == 'Conv':
+        weights_shape = (out_channels, in_channels // group) + kernel_shape
+    elif op_type == 'ConvTranspose':
+        weights_shape = (in_channels, out_channels // group) + kernel_shape
+    weights = np.random.uniform(low=-1.0, high=1.0, size=weights_shape).astype(np.float32)
+
+    test_inputs = {'x': x}
+    initializers = {'weights': weights}
+    node = onnx.helper.make_node(
+        op_type=op_type,
+        inputs=['x', 'weights'],
+        outputs=['y'],
+        kernel_shape=kernel_shape,
+        **kwargs,
+    )
+
+    model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=test_inputs)
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-4,
+        atol_torch_cpu_cuda=10**-4,
+    )
+
+
+def test_conv2d_base_params() -> None:  # pylint: disable=missing-function-docstring
+    op_type_variants = ('ConvTranspose', 'Conv')
+    in_channels_variants = (1, 2, 3, 4, 16)
+    out_channels_variants = (1, 2, 3, 4, 16)
+    input_hw_variants = ((32, 32), (32, 31), (31, 32), (31, 31))
+    kernel_shape_variants = tuple(
+        chain(
+            ((i, i) for i in range(1, 6)),
+            ((1, 2), (1, 3), (1, 5)),
+            ((2, 2), (2, 3), (2, 5)),
+        )
+    )
+    all_variants = product(
+        op_type_variants, in_channels_variants, out_channels_variants, input_hw_variants, kernel_shape_variants
+    )
+    for op_type, in_channels, out_channels, input_hw, kernel_shape in all_variants:
+        _test_conv(
+            op_type=op_type,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            input_hw=input_hw,
+            kernel_shape=kernel_shape,
+        )
+
+    in_out_channels_variants = (2, 3, 4, 16)
+    all_variants = product(op_type_variants, in_out_channels_variants, input_hw_variants, kernel_shape_variants)
+    for op_type, in_out_channels, input_hw, kernel_shape in all_variants:
+        _test_conv(
+            op_type=op_type,
+            in_channels=in_out_channels,
+            out_channels=in_out_channels,
+            input_hw=input_hw,
+            kernel_shape=kernel_shape,
+            group=in_out_channels,
+        )
+
+
+def test_conv_stride_dilations_pads() -> None:  # pylint: disable=missing-function-docstring
+    input_hw_variants = ((32, 32), (32, 27), (27, 32), (27, 27))
+    kernel_shape_variants = tuple(
+        chain(
+            ((i, i) for i in range(1, 4)),
+            ((1, 2), (1, 3), (2, 3)),
+        )
+    )
+    stride_variants = (
+        (1, 1),
+        (2, 2),
+        (3, 3),
+        (1, 2),
+        (2, 1),
+        (1, 3),
+        (3, 1),
+    )
+    dilations_variants = (
+        (1, 1),
+        (2, 2),
+        (1, 2),
+        (2, 1),
+    )
+    pads = (
+        [1, 1, 1, 1],
+        [1, 0, 0, 1],
+        [0, 2, 7, 0],
+        [3, 0, 1, 2],
+    )
+
+    all_variants = product(
+        input_hw_variants,
+        kernel_shape_variants,
+        stride_variants,
+        dilations_variants,
+        pads,
+    )
+    for input_hw, kernel_shape, strides, dilations, pads in all_variants:
+        _test_conv(
+            op_type='Conv',
+            in_channels=16,
+            out_channels=16,
+            input_hw=input_hw,
+            kernel_shape=kernel_shape,
+            strides=strides,
+            dilations=dilations,
+            pads=pads,
+        )
+
+    pads = (
+        [1, 1, 1, 1],
+        [1, 2, 1, 2],
+        [2, 2, 2, 2],
+    )
+
+    all_variants = product(
+        input_hw_variants,
+        kernel_shape_variants,
+        stride_variants,
+        dilations_variants,
+        pads,
+    )
+    for input_hw, kernel_shape, strides, dilations, pads in all_variants:
+        _test_conv(
+            op_type='ConvTranspose',
+            in_channels=16,
+            out_channels=16,
+            input_hw=input_hw,
+            kernel_shape=kernel_shape,
+            strides=strides,
+            dilations=dilations,
+            pads=pads,
+        )
+
+
+def test_conv_transpose_output_pads() -> None:  # pylint: disable=missing-function-docstring
+    input_hw_variants = ((5, 5), (6, 6), (7, 7))
+    stride_variants = (
+        (4, 4),
+        (3, 4),
+        (4, 3),
+        (3, 3),
+    )
+    dilations_variants = (
+        (3, 3),
+        (2, 3),
+        (3, 2),
+    )
+    output_pads_variants = (
+        (1, 1),
+        (2, 2),
+        (1, 2),
+    )
+
+    all_variants = product(input_hw_variants, stride_variants, dilations_variants, output_pads_variants)
+    for input_hw, strides, dilations, output_pads in all_variants:
+        _test_conv(
+            op_type='ConvTranspose',
+            in_channels=16,
+            out_channels=32,
+            input_hw=input_hw,
+            kernel_shape=(3, 3),
+            strides=strides,
+            dilations=dilations,
+            output_padding=output_pads,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/cumsum_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/cumsum_test.py
new file mode 100644
index 000000000..cfc173ab7
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/cumsum_test.py
@@ -0,0 +1,68 @@
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_cumsum(
+    input_tensor: np.ndarray,
+    axis: int,
+    exclusive: int,
+    reverse: int,
+) -> None:
+    test_inputs = {'x': input_tensor, 'axis': np.array(axis)}
+    node = onnx.helper.make_node(
+        op_type='CumSum',
+        inputs=list(test_inputs.keys()),
+        outputs=['y'],
+        exclusive=exclusive,
+        reverse=reverse,
+    )
+
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[input_tensor.dtype],
+            shape=input_tensor.shape,
+        ),
+    ]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'tensor_size',
+    (
+        (10,),
+        (10, 10),
+        (10, 10, 5),
+        (10, 10, 5, 6),
+    ),
+)
+@pytest.mark.parametrize(
+    'exclusive,reverse',
+    (
+        (0, 0),
+        (0, 1),
+        (1, 0),
+        (1, 1),
+    ),
+)
+def test_cumsum(tensor_size, exclusive, reverse) -> None:  # pylint: disable=missing-function-docstring
+    input_tensor = np.random.randint(low=-10, high=10, size=tensor_size)
+    for axis in range(-len(tensor_size), len(tensor_size)):  # every valid axis: [-rank, rank - 1]
+        _test_cumsum(
+            input_tensor=input_tensor,
+            axis=axis,
+            exclusive=exclusive,
+            reverse=reverse,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/depth_to_space_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/depth_to_space_test.py
new file mode 100644
index 000000000..c6e860abf
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/depth_to_space_test.py
@@ -0,0 +1,42 @@
+# pylint: disable=missing-function-docstring
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_depth_to_space(  # Run one DepthToSpace node on random float32 data through check_onnx_model.
+    input_shape: List[int],
+    blocksize: int,
+    mode: str,
+    opset: int,
+) -> None:
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)  # values in [-1, 1)
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(  # type: ignore
+        op_type='DepthToSpace',
+        inputs=['x'],
+        outputs=['y'],
+        blocksize=blocksize,  # blocksize and mode are node attributes
+        mode=mode,
+    )
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs, opset_version=opset)
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shape, blocksize',
+    [
+        ([1, 12, 3, 3], 2),  # in each case dim 1 is a multiple of blocksize**2: 12 = 3 * 4
+        ([5, 75, 3, 3], 5),  # 75 = 3 * 25
+        ([7, 588, 3, 4], 7),  # 588 = 12 * 49
+    ],
+)
+@pytest.mark.parametrize('opset', [11, 13])
+def test_depth_to_space(input_shape: List[int], blocksize: int, opset: int) -> None:  # CRD mode only
+    _test_depth_to_space(input_shape=input_shape, blocksize=blocksize, mode='CRD', opset=opset)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/dropout_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/dropout_test.py
new file mode 100644
index 000000000..6705de652
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/dropout_test.py
@@ -0,0 +1,59 @@
+from typing import List
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_dropout(data: np.ndarray, opset_version: int, **kwargs) -> None:  # Build and check one Dropout node.
+    test_inputs = {'input_tensor': data}
+
+    if opset_version >= 12:  # from opset 12 on, ratio/training_mode are moved out of kwargs into graph inputs
+        if 'ratio' in kwargs:
+            test_inputs['ratio'] = np.array(kwargs.pop('ratio'), dtype=np.float16)
+        if 'training_mode' in kwargs:
+            test_inputs['training_mode'] = np.array(kwargs.pop('training_mode'), dtype=bool)
+
+    node = onnx.helper.make_node(op_type='Dropout', inputs=list(test_inputs), outputs=['y'], **kwargs)  # rest: attrs
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shape,ratio,training_mode,opset_version',
+    (
+        ([3, 32, 32], None, None, 10),
+        ([3, 32, 32], None, None, 12),
+        ([3, 32, 32], None, None, 13),
+        ([3, 32, 32], 0.8, None, 10),
+        ([3, 32, 32], 0.8, None, 12),
+        ([3, 32, 32], 0.8, None, 13),
+        ([3, 32, 32], 0.8, False, 13),
+        ([3, 32, 32], 0.8, False, 13),  # NOTE(review): identical to the previous case; was opset 12 intended?
+        ([8, 3, 32, 32], None, None, 10),
+        ([8, 3, 32, 32, 32], None, None, 10),
+    ),
+)
+def test_dropout(  # pylint: disable=missing-function-docstring
+    input_shape: List[int],
+    ratio: Optional[float],
+    training_mode: Optional[bool],
+    opset_version: int,
+) -> None:
+    data = np.random.randn(*input_shape).astype(np.float32)
+    kwargs = {}  # only options that are not None are forwarded to _test_dropout
+    if ratio is not None:
+        kwargs['ratio'] = ratio
+    if training_mode is not None:
+        kwargs['training_mode'] = training_mode
+    _test_dropout(data=data, opset_version=opset_version, **kwargs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/einsum_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/einsum_test.py
new file mode 100644
index 000000000..4a93ec549
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/einsum_test.py
@@ -0,0 +1,51 @@
+from typing import List
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'equation,input_shapes,output_shape',
+    (
+        ('...ii ->...i', [(3, 5, 5)], (3, 5)),
+        ('i,i', [(5,), (5,)], None),  # output_shape None is forwarded as-is to make_tensor_value_info
+        ('ij->i', [(3, 4)], (3,)),
+        ('ij->ji', [(3, 4)], (4, 3)),
+    ),
+)
+def test_einsum(  # pylint: disable=missing-function-docstring
+    equation: str,
+    input_shapes: List[Tuple[int, ...]],
+    output_shape: Tuple[int, ...],
+) -> None:
+    test_inputs = {f'input_{index}': np.random.randn(*shape) for index, shape in enumerate(input_shapes)}  # one/shape
+
+    node = onnx.helper.make_node(
+        op_type='Einsum',
+        inputs=list(test_inputs),
+        outputs=['out'],
+        equation=equation,  # the equation string is a node attribute
+    )
+    outputs_info = [
+        make_tensor_value_info(
+            name='out',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[np.dtype('float')],  # inputs are not cast, so randn's dtype is kept
+            shape=output_shape,
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=13,
+    )
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/expand_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/expand_test.py
new file mode 100644
index 000000000..64d4c403b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/expand_test.py
@@ -0,0 +1,52 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_expand(  # Build a one-node Expand model and hand it to check_onnx_model.
+    data: np.ndarray,
+    shape: List[int],
+) -> None:
+    test_inputs = {
+        'x': data,
+        'shape': np.array(shape, dtype=np.int64),  # target shape is fed as an int64 tensor input
+    }
+
+    node = onnx.helper.make_node(op_type='Expand', inputs=list(test_inputs), outputs=['y'])
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[data.dtype],
+            shape=[None] * len(shape),  # declared rank = len(shape), every dim left unspecified
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'src_shape,dst_shape',
+    (
+        ([3, 1], [2, 1, 6]),  # target has higher rank than the source
+        ([3, 1], [3, 4]),  # same rank, trailing dim 1 -> 4
+    ),
+)
+def test_expand(src_shape: List[int], dst_shape: List[int]) -> None:  # pylint: disable=missing-function-docstring
+    data = np.reshape(np.arange(1, np.prod(src_shape) + 1, dtype=np.float32), src_shape)  # 1..N laid out as src_shape
+    _test_expand(
+        data=data,
+        shape=dst_shape,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/eye_like_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/eye_like_test.py
new file mode 100644
index 000000000..bf44e6f5e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/eye_like_test.py
@@ -0,0 +1,31 @@
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize('dtype', [None, 1, 6, 7, 11])  # ONNX TensorProto codes: 1=FLOAT, 6=INT32, 7=INT64, 11=DOUBLE
+@pytest.mark.parametrize('k', [-2, -1, 0, 1, 2])
+@pytest.mark.parametrize('shape', [[2, 3], [3, 4], [3, 3]])
+def test_eye_like(  # pylint: disable=missing-function-docstring
+    shape: Tuple[int],
+    dtype: Optional[int],
+    k: int,  # pylint: disable=invalid-name
+) -> None:
+    input_values = np.random.randn(*shape).astype(np.float32)
+    test_inputs = {'x': input_values}
+
+    node = onnx.helper.make_node(op_type='EyeLike', inputs=['x'], outputs=['z'], dtype=dtype, k=k)
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=[make_tensor_value_info(name='z', elem_type=dtype, shape=shape)] if dtype else None,  # else None
+    )
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/flatten_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/flatten_test.py
new file mode 100644
index 000000000..c52d4b513
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/flatten_test.py
@@ -0,0 +1,30 @@
+from typing import List
+
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_flatten(  # Run one Flatten node on random float32 data through check_onnx_model.
+    input_shape: List[int],
+    **kwargs,
+) -> None:
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)  # values in [-1, 1)
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(
+        op_type='Flatten',
+        inputs=['x'],
+        outputs=['y'],
+        **kwargs,  # forwarded verbatim as node attributes (callers pass `axis`)
+    )
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs)
+    check_onnx_model(model, test_inputs)
+
+
+def test_flatten() -> None:  # pylint: disable=missing-function-docstring
+    _test_flatten(input_shape=[2, 3, 16, 16, 16])  # no axis attribute given
+    _test_flatten(input_shape=[2, 3, 16, 16], axis=2)  # explicit non-negative axis
+    _test_flatten(input_shape=[2, 3, 16], axis=-1)  # negative axis
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gather_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gather_test.py
new file mode 100644
index 000000000..da1d97617
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gather_test.py
@@ -0,0 +1,104 @@
+from typing import List
+from typing import cast
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_gather(  # Shared driver: builds one node of the given gather-family op_type and checks it.
+    op_type: str,
+    input_array: np.ndarray,
+    indices: np.ndarray,
+    opset_version: int,
+    **kwargs,
+) -> None:
+    test_inputs = {
+        'x': input_array,
+        'indices': indices,
+    }
+
+    node = onnx.helper.make_node(
+        op_type,
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,  # forwarded as node attributes (callers pass `axis` or `batch_dims`)
+    )
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'op_type,axis,opset_version',
+    (
+        ('Gather', 0, 13),
+        ('Gather', 0, 11),
+        ('Gather', 0, 9),
+        ('Gather', 1, 13),
+        ('Gather', 1, 11),
+        ('Gather', 1, 9),
+        ('GatherElements', 0, 13),  # GatherElements is only parametrized for opsets 11 and 13
+        ('GatherElements', 0, 11),
+        ('GatherElements', 1, 13),
+        ('GatherElements', 1, 11),
+    ),
+)
+def test_gather(op_type: str, axis: int, opset_version: int) -> None:  # pylint: disable=missing-function-docstring
+    input_tensor = np.asarray(  # fixed 3x3 float32 matrix
+        [
+            [1.0, 1.2, 1.9],
+            [2.3, 3.4, 3.9],
+            [4.5, 5.7, 5.9],
+        ],
+        dtype=np.float32,
+    )
+    indices = np.asarray(  # fixed 1x2 int64 index tensor, used for both op types
+        [
+            [1, 0],
+        ],
+        dtype=np.int64,
+    )
+    _test_gather(op_type=op_type, input_array=input_tensor, indices=indices, axis=axis, opset_version=opset_version)
+
+
+@pytest.mark.parametrize('opset_version', (11, 12, 13))
+@pytest.mark.parametrize(
+    'data_shape, indices_shape, batch_dims',
+    (
+        # Examples from ONNX opset doc: https://github.com/onnx/onnx/blob/main/docs/Changelog.md#GatherND-13.
+        ([2, 2], [2, 2], 0),
+        ([2, 2], [2, 1], 0),
+        ([2, 2, 2], [2, 2], 0),
+        ([2, 2, 2], [2, 1, 2], 0),
+        pytest.param([2, 2, 2], [2, 1], 1, marks=pytest.mark.xfail(reason='implemented for batch_dims = 0 only')),
+        # Our tests.
+        ([8, 3, 16, 16], [16, 3], 0),
+        ([16, 3, 224, 224], [32, 1, 3], 0),
+    ),
+)
+def test_gather_nd(  # pylint: disable=missing-function-docstring
+    data_shape: List[int],
+    indices_shape: List[int],
+    batch_dims: int,
+    opset_version: int,
+) -> None:
+    input_tensor = cast(np.ndarray, np.random.rand(*data_shape))
+    indices_high = data_shape[: indices_shape[-1]]  # exclusive bounds: the first indices_shape[-1] dims of data
+    indices = np.random.randint(low=0, high=indices_high, size=indices_shape, dtype=np.int64)  # per-component bound
+
+    _test_gather(
+        op_type='GatherND',
+        input_array=input_tensor,
+        indices=indices,
+        batch_dims=batch_dims if opset_version > 11 else None,  # None is passed instead for opset 11
+        opset_version=opset_version,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gemm_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gemm_test.py
new file mode 100644
index 000000000..2701b35ab
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/gemm_test.py
@@ -0,0 +1,129 @@
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_gemm(  # Build one Gemm node with a/b/c supplied as inputs or initializers, then check it.
+    input_a_shape: Tuple[int, int],
+    input_b_shape: Tuple[int, int],
+    has_input_c: bool,
+    abc_as_initializers: Tuple[bool, bool, bool],
+    **kwargs,
+) -> None:
+    input_a = np.random.uniform(low=-1.0, high=1.0, size=input_a_shape).astype(np.float32)
+    input_b = np.random.uniform(low=-1.0, high=1.0, size=input_b_shape).astype(np.float32)
+    input_c_shape = input_b_shape[1] if kwargs.get('transB', 0) == 0 else input_b_shape[0]  # same as output_shape[1]
+    input_c = np.random.uniform(low=-1.0, high=1.0, size=(input_c_shape,)).astype(np.float32) if has_input_c else None
+
+    output_shape = [None] * 2  # filled in below from a/b shapes and the transA/transB flags
+    output_shape[0] = input_a_shape[0 if kwargs.get('transA', 0) == 0 else 1]
+    output_shape[1] = input_b_shape[1 if kwargs.get('transB', 0) == 0 else 0]
+
+    test_inputs = {}
+    initializers = {}
+    gemm_inputs = ['a', 'b']  # 'c' is appended only when has_input_c is set
+
+    if abc_as_initializers[0]:  # each tensor goes either to the initializers or to the runtime inputs
+        initializers['a'] = input_a
+    else:
+        test_inputs['a'] = input_a
+
+    if abc_as_initializers[1]:
+        initializers['b'] = input_b
+    else:
+        test_inputs['b'] = input_b
+
+    if has_input_c:  # abc_as_initializers[2] is only consulted when c exists
+        gemm_inputs.append('c')
+        if abc_as_initializers[2]:
+            initializers['c'] = input_c
+        else:
+            test_inputs['c'] = input_c
+
+    outputs_info = [
+        make_tensor_value_info(
+            name='output',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[np.dtype(np.float32)],
+            shape=output_shape,
+        ),
+    ]
+    node = onnx.helper.make_node(
+        op_type='Gemm',
+        inputs=gemm_inputs,
+        outputs=['output'],
+        **kwargs,  # transA / transB / alpha / beta, whichever the caller supplied
+    )
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers=initializers,
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-5,  # tolerances forwarded to the helper; presumably absolute tolerances (confirm)
+        atol_torch_cpu_cuda=10**-5,
+    )
+
+
+@pytest.mark.parametrize(
+    'abc_as_initializers',
+    (
+        (False, False, False),  # all eight combinations of (a, b, c) as initializer
+        (True, False, False),
+        (False, True, False),
+        (False, False, True),
+        (False, True, True),
+        (True, False, True),
+        (True, True, False),
+        (True, True, True),
+    ),
+)
+@pytest.mark.parametrize(
+    'has_input_c',
+    (False, True),
+)
+@pytest.mark.parametrize(
+    'input_a_shape,input_b_shape,trans_a,trans_b,alpha,beta',
+    (
+        ([3, 4], [4, 3], False, False, None, None),
+        ([3, 4], [4, 3], False, False, None, None),  # NOTE(review): identical to the previous case; confirm intent
+        ([4, 3], [4, 3], True, False, None, None),
+        ([3, 4], [3, 4], False, True, None, None),
+        ([3, 4], [4, 3], True, True, None, None),
+        ([3, 4], [4, 3], False, False, 3.1415926, 2.71828),
+    ),
+)
+def test_gemm(  # pylint: disable=missing-function-docstring
+    input_a_shape: Tuple[int, int],
+    input_b_shape: Tuple[int, int],
+    has_input_c: bool,
+    abc_as_initializers: Tuple[bool, bool, bool],
+    trans_a: Optional[bool],
+    trans_b: Optional[bool],
+    alpha: Optional[float],
+    beta: Optional[float],
+) -> None:
+    kwargs = {  # keys are the Gemm attribute names
+        'transA': trans_a,
+        'transB': trans_b,
+        'alpha': alpha,
+        'beta': beta,
+    }
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}  # None-valued options are dropped before forwarding
+    _test_gemm(
+        input_a_shape=input_a_shape,
+        input_b_shape=input_b_shape,
+        has_input_c=has_input_c,
+        abc_as_initializers=abc_as_initializers,
+        **kwargs,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/global_avg_pool_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/global_avg_pool_test.py
new file mode 100644
index 000000000..9dd5287fe
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/global_avg_pool_test.py
@@ -0,0 +1,34 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'input_shape',
+    (
+        [2, 3, 16, 16, 16],  # rank-5, rank-4 and rank-3 inputs
+        [2, 3, 16, 16],
+        [2, 3, 16],
+    ),
+)
+def test_global_avg_pool(input_shape: List[int]) -> None:  # pylint: disable=missing-function-docstring
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)  # values in [-1, 1)
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(
+        op_type='GlobalAveragePool',
+        inputs=['x'],
+        outputs=['y'],
+    )
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs)
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-7,  # tolerances forwarded to the helper; presumably absolute tolerances (confirm)
+        atol_torch_cpu_cuda=10**-7,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/instance_norm_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/instance_norm_test.py
new file mode 100644
index 000000000..71bd6fc4f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/instance_norm_test.py
@@ -0,0 +1,47 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize('parameters_as_inputs', (True, False))  # scale/bias as graph inputs vs. initializers
+@pytest.mark.parametrize(
+    'input_shape',
+    (
+        # 1d
+        [2, 3, 16],
+        [2, 1, 7],
+        # 2d
+        [2, 3, 16, 16],
+        [2, 1, 7, 16],
+        # 3d
+        [2, 3, 16, 16, 16],
+        [2, 1, 16, 7, 16],
+    ),
+)
+def test_instance_norm(  # pylint: disable=missing-function-docstring
+    input_shape: List[int],
+    parameters_as_inputs: bool,
+) -> None:
+    num_features = input_shape[1]  # scale and bias are 1-D with length input_shape[1]
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)
+    scale = np.random.uniform(low=0.0, high=1.0, size=num_features).astype(np.float32)
+    bias = np.random.uniform(low=-1.0, high=1.0, size=num_features).astype(np.float32)
+
+    inputs = {'input': x}
+    parameters = {'scale': scale, 'bias': bias}
+    initializers = {}
+
+    if parameters_as_inputs:  # route scale/bias to exactly one of the two dicts
+        inputs.update(parameters)
+    else:
+        initializers.update(parameters)
+
+    node = onnx.helper.make_node(op_type='InstanceNormalization', inputs=['input', 'scale', 'bias'], outputs=['y'])
+
+    model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=inputs)
+    check_onnx_model(onnx_model=model, onnx_inputs=inputs, atol_onnx_torch=1e-6, atol_torch_cpu_cuda=1e-6)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/layer_norm_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/layer_norm_test.py
new file mode 100644
index 000000000..8341f7a66
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/layer_norm_test.py
@@ -0,0 +1,76 @@
+# pylint: disable=missing-function-docstring
+from typing import List
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_layer_norm(  # Build one LayerNormalization node (opset 17), bias optional, and check it.
+    x: np.ndarray,
+    scale: np.ndarray,
+    bias: Optional[np.ndarray],
+    axis: int,
+    parameters_as_inputs: bool,
+) -> None:
+    inputs = {'input': x}
+    parameters = {'scale': scale}
+    if bias is not None:  # bias is only registered when the caller supplies it
+        parameters['bias'] = bias
+
+    initializers = {}
+
+    if parameters_as_inputs:  # route scale/bias to graph inputs, otherwise to initializers
+        inputs.update(parameters)
+    else:
+        initializers.update(parameters)
+
+    node = onnx.helper.make_node(
+        op_type='LayerNormalization',
+        inputs=['input', 'scale', 'bias'] if bias is not None else ['input', 'scale'],  # omit 'bias' when absent
+        outputs=['y'],
+        axis=axis,
+    )
+    model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=inputs, opset_version=17)
+    check_onnx_model(
+        onnx_model=model,
+        onnx_inputs=inputs,
+        atol_onnx_torch=1e-5,  # tolerances forwarded to the helper; presumably absolute tolerances (confirm)
+        atol_torch_cpu_cuda=1e-5,
+        atol_onnx_torch2onnx=1e-5,
+    )
+
+
+@pytest.mark.parametrize('parameters_as_inputs', (True, False))  # scale/bias as graph inputs vs. initializers
+@pytest.mark.parametrize(
+    'input_shape',
+    (
+        [2, 3, 16],
+        [3, 1, 224],
+        [4, 3, 16, 16],
+        [5, 1, 32, 32],
+        [6, 3, 16, 16, 8],
+        [7, 1, 7, 7, 16],
+    ),
+)
+def test_layer_norm(input_shape: List[int], parameters_as_inputs: bool) -> None:
+    x = np.random.randn(*input_shape).astype(np.float32)  # one input tensor reused for every axis
+
+    for axis in [*range(len(input_shape))] + [-1]:  # every non-negative axis, plus -1
+        normalized_shape = input_shape[axis:]  # scale/bias span the dims from `axis` to the end
+
+        scale = np.random.randn(*normalized_shape).astype(np.float32)
+        bias = np.random.randn(*normalized_shape).astype(np.float32)
+
+        for bias_ in [bias, None]:  # run once with the bias and once without it
+            _test_layer_norm(
+                x=x,
+                scale=scale,
+                bias=bias_,
+                axis=axis,
+                parameters_as_inputs=parameters_as_inputs,
+            )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/logical_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/logical_test.py
new file mode 100644
index 000000000..fe536cf9c
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/logical_test.py
@@ -0,0 +1,57 @@
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'op_type',
+    ('Or', 'And', 'Xor'),
+)
+def test_logical_operation(op_type: str) -> None:  # pylint: disable=missing-function-docstring
+    x = np.random.randn(10, 1, 64, 128) > 0  # random boolean tensor (sign of a normal sample)
+    y_variants = (  # right-hand operands of differing rank/shape, each paired with the same x
+        (np.random.randn(128) > 0),
+        (np.random.randn(64, 128) > 0),
+        (np.random.randn(1, 64, 128) > 0),
+        (np.random.randn(1, 3, 1, 128) > 0),  # differs from x in dim 1 (3 vs 1) and dim 2 (1 vs 64)
+        (np.random.randn(10, 1, 64, 128) > 0),  # same shape as x
+    )
+    for y in y_variants:
+        test_inputs = {'x': x, 'y': y}
+        initializers = {}
+        node = onnx.helper.make_node(
+            op_type=op_type,
+            inputs=['x', 'y'],
+            outputs=['z'],
+        )
+
+        model = make_model_from_nodes(
+            nodes=node,
+            initializers=initializers,
+            inputs_example=test_inputs,
+        )
+        check_onnx_model(model, test_inputs)
+
+
+def test_not() -> None:  # pylint: disable=missing-function-docstring
+    x_variants = (  # random boolean tensors of rank 1 through 4
+        (np.random.randn(128) > 0),
+        (np.random.randn(64, 128) > 0),
+        (np.random.randn(1, 64, 128) > 0),
+        (np.random.randn(10, 1, 64, 128) > 0),
+    )
+
+    for x in x_variants:  # a fresh one-node model is built and checked per variant
+        test_inputs = {'x': x}
+        initializers = {}
+        node = onnx.helper.make_node(
+            op_type='Not',
+            inputs=['x'],
+            outputs=['y'],
+        )
+
+        model = make_model_from_nodes(nodes=node, initializers=initializers, inputs_example=test_inputs)
+        check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/lrn_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/lrn_test.py
new file mode 100644
index 000000000..18623aa7b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/lrn_test.py
@@ -0,0 +1,37 @@
+from random import randrange
+
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_lrn(data: np.ndarray, alpha: float, beta: float, bias: float, size: int) -> None:  # one LRN node, checked
+    test_inputs = {'input_tensor': data}
+    node = onnx.helper.make_node(
+        op_type='LRN',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        alpha=alpha,  # ONNX attributes are passed as regular keyword arguments.
+        beta=beta,
+        bias=bias,
+        size=size,
+    )
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+def test_lrn() -> None:  # pylint: disable=missing-function-docstring
+    shape = (1, 3, 227, 227)
+    data = np.random.random_sample(shape).astype(np.float32)  # values in [0, 1)
+    alpha = np.random.uniform(low=0.0, high=1.0)
+    beta = np.random.uniform(low=0.0, high=1.0)
+    bias = np.random.uniform(low=1.0, high=5.0)
+    size = randrange(1, 10, 2)  # diameter of channels, not radius, must be odd; positional per the stdlib docs
+    _test_lrn(data=data, alpha=alpha, beta=beta, bias=bias, size=size)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/matmul_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/matmul_test.py
new file mode 100644
index 000000000..4bbdf1ebf
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/matmul_test.py
@@ -0,0 +1,40 @@
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def test_matmul() -> None:  # pylint: disable=missing-function-docstring
+    x_variants = [  # left operands, rank 2 to 4, trailing dims (3, 4)
+        np.random.randn(3, 4).astype(np.float32),
+        np.random.randn(2, 3, 4).astype(np.float32),
+        np.random.randn(1, 2, 3, 4).astype(np.float32),
+    ]
+
+    y_variants = [  # right operands of matching rank, trailing dims (4, 3)
+        np.random.randn(4, 3).astype(np.float32),
+        np.random.randn(2, 4, 3).astype(np.float32),
+        np.random.randn(1, 2, 4, 3).astype(np.float32),
+    ]
+
+    for x, y in zip(x_variants, y_variants):  # operands are paired positionally, so ranks always match
+        test_inputs = {'x': x, 'y': y}
+        initializers = {}
+        node = onnx.helper.make_node(
+            op_type='MatMul',
+            inputs=['x', 'y'],
+            outputs=['z'],
+        )
+
+        model = make_model_from_nodes(
+            nodes=node,
+            initializers=initializers,
+            inputs_example=test_inputs,
+        )
+        check_onnx_model(
+            model,
+            test_inputs,
+            atol_onnx_torch=10**-6,  # tolerances forwarded to the helper; presumably absolute tolerances (confirm)
+            atol_torch_cpu_cuda=10**-6,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mean_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mean_test.py
new file mode 100644
index 000000000..96a05af9d
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mean_test.py
@@ -0,0 +1,47 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_mean(  # Build one variadic Mean node over all tensors in data_list and check it.
+    data_list: List[np.ndarray],
+) -> None:
+    test_inputs = {f'data_{i}': data for i, data in enumerate(data_list)}  # one named graph input per tensor
+
+    node = onnx.helper.make_node(op_type='Mean', inputs=list(test_inputs), outputs=['y'])
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[data_list[0].dtype],  # output dtype taken from the first tensor
+            shape=None,  # output shape is left undeclared
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shapes',
+    (
+        ([],),  # a single 0-d tensor
+        ([2, 3, 4],),  # a single rank-3 tensor
+        ([3, 1], [2, 1, 6]),  # two tensors of different rank
+        ([3, 1], [3, 4]),  # two tensors of equal rank, differing in the last dim
+    ),
+)
+def test_mean(input_shapes: List[List[int]]) -> None:  # pylint: disable=missing-function-docstring
+    input_tensors = [np.random.normal(size=i_shape).astype(np.float32) for i_shape in input_shapes]
+    _test_mean(data_list=input_tensors)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/min_max_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/min_max_test.py
new file mode 100644
index 000000000..ec1013436
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/min_max_test.py
@@ -0,0 +1,56 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_min_max(  # Build one variadic node of operation_type over data_list and check it.
+    data_list: List[np.ndarray],
+    operation_type: str,
+) -> None:
+    test_inputs = {f'data_{i}': data for i, data in enumerate(data_list)}  # one named graph input per tensor
+
+    node = onnx.helper.make_node(op_type=operation_type, inputs=list(test_inputs), outputs=['y'])
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[data_list[0].dtype],  # output dtype taken from the first tensor
+            shape=None,  # output shape is left undeclared
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shapes',
+    (
+        ([],),  # a single 0-d tensor
+        ([2, 3, 4],),  # a single rank-3 tensor
+        ([3, 1], [2, 1, 6]),  # two tensors of different rank
+        ([3, 1], [3, 4]),  # two tensors of equal rank, differing in the last dim
+    ),
+)
+@pytest.mark.parametrize('operation_type', ['Min', 'Max'])  # NOTE(review): name below looks like a typo of test_min_max
+def test_min_amx(  # pylint: disable=missing-function-docstring
+    input_shapes: List[List[int]],
+    operation_type: str,
+) -> None:
+    input_tensors = [np.random.normal(size=i_shape).astype(np.float32) for i_shape in input_shapes]
+
+    _test_min_max(
+        data_list=input_tensors,
+        operation_type=operation_type,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mod_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mod_test.py
new file mode 100644
index 000000000..b274ff816
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/mod_test.py
@@ -0,0 +1,42 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+@pytest.mark.parametrize(
+    'dividend',
+    [
+        [-4, 7, 5, 4, -7, 8],
+        [-4.3, 7.2, 5.0, 4.3, -7.2, 8.0],
+    ],
+)
+@pytest.mark.parametrize(
+    'divisor',
+    [
+        [2, -3, 8, -2, 3, 5],
+        [2.1, -3.4, 8.0, -2.1, 3.4, 5.0],
+    ],
+)
+@pytest.mark.parametrize('fmod', [0, 1])
+def test_mod(
+    dividend: List[float],
+    divisor: List[float],
+    fmod: int,
+) -> None:
+    """Check Mod on integer (``fmod=0``, int32) and float (``fmod=1``, float32) operands."""
+    if fmod:
+        operand_dtype = np.float32
+    else:
+        operand_dtype = np.int32
+    test_inputs = {
+        'x': np.array(dividend).astype(operand_dtype),
+        'y': np.array(divisor).astype(operand_dtype),
+    }
+    mod_node = onnx.helper.make_node(op_type='Mod', inputs=['x', 'y'], outputs=['z'], fmod=fmod)
+    model = make_model_from_nodes(nodes=mod_node, initializers={}, inputs_example=test_inputs)
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/neg_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/neg_test.py
new file mode 100644
index 000000000..6612c100b
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/neg_test.py
@@ -0,0 +1,26 @@
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def test_neg() -> None:
+    """Check Neg on standard-normal inputs of rank 1 through 4."""
+    shapes = (
+        (128,),
+        (64, 128),
+        (1, 64, 128),
+        (10, 1, 64, 128),
+    )
+    # Draw every sample up front so the random stream is consumed before any model runs.
+    samples = [np.random.randn(*shape) for shape in shapes]
+    for sample in samples:
+        test_inputs = {'x': sample}
+        neg_node = onnx.helper.make_node(op_type='Neg', inputs=['x'], outputs=['y'])
+        model = make_model_from_nodes(
+            nodes=neg_node,
+            initializers={},
+            inputs_example=test_inputs,
+        )
+        check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/nms_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/nms_test.py
new file mode 100644
index 000000000..1ffd45ed8
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/nms_test.py
@@ -0,0 +1,177 @@
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_nms(
+    boxes: np.ndarray,
+    scores: np.ndarray,
+    max_output_boxes_per_class: Optional[int] = None,
+    iou_threshold: Optional[float] = None,
+    score_threshold: Optional[float] = None,
+    center_point_box: Optional[bool] = None,
+) -> None:
+    """Build a single NonMaxSuppression node and verify the converted model.
+
+    ``boxes`` and ``scores`` are always fed to the graph. Each optional scalar
+    (``max_output_boxes_per_class`` as int64, the two thresholds as float32)
+    is wrapped with ``np.array`` and fed under its own name when it is not
+    ``None``; when it is ``None`` an empty string is placed in the node's input
+    list instead, so the inputs that follow keep their position.
+
+    ``center_point_box`` is forwarded to ``onnx.helper.make_node`` unchanged,
+    and output ``y`` is declared as int64 without a static shape.
+    """
+    test_inputs = {'boxes': boxes, 'scores': scores}
+    node_inputs = ['boxes', 'scores']
+
+    # (input name, value, dtype) in the positional order used for the node inputs.
+    optional_scalars = (
+        ('max_output_boxes_per_class', max_output_boxes_per_class, np.int64),
+        ('iou_threshold', iou_threshold, np.float32),
+        ('score_threshold', score_threshold, np.float32),
+    )
+    for input_name, value, dtype in optional_scalars:
+        if value is None:
+            node_inputs.append('')
+            continue
+        test_inputs[input_name] = np.array(value, dtype=dtype)
+        node_inputs.append(input_name)
+
+    nms_node = onnx.helper.make_node(
+        op_type='NonMaxSuppression',
+        inputs=node_inputs,
+        outputs=['y'],
+        center_point_box=center_point_box,
+    )
+    y_info = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')],
+        shape=None,
+    )
+
+    model = make_model_from_nodes(
+        nodes=nms_node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=[y_info],
+    )
+    check_onnx_model(model, test_inputs)
+
+
+_BOXES = np.array(  # shared fixture: one batch of six boxes, four float32 coordinates each
+    [
+        [
+            [0.0, 0.0, 1.0, 1.0],
+            [0.0, 0.1, 1.0, 1.1],
+            [0.0, -0.1, 1.0, 0.9],
+            [0.0, 10.0, 1.0, 11.0],
+            [0.0, 10.1, 1.0, 11.1],
+            [0.0, 100.0, 1.0, 101.0],
+        ]
+    ],
+    dtype=np.float32,
+)
+_SCORES = np.array([[[0.9, 0.75, 0.6, 0.95, 0.5, 0.3]]], dtype=np.float32)
+# Two boxes with two scores, used by the test_nms case that sets center_point_box=1.
+_BOXES_CXCYWH_FORMAT_TEST = np.array([[[1.0, 1.0, 1.1, 1.1], [1.5, 1.5, 1.6, 1.6]]], dtype=np.float32)
+_SCORES_CXCYWH_FORMAT_TEST = np.array([[[0.9, 0.75]]], dtype=np.float32)
+# _BOXES with some corner pairs swapped; the only test_nms case that uses it is commented out.
+_BOXES_FLIPPED_COORDINATES_TEST = np.array(
+    [
+        [
+            [1.0, 1.0, 0.0, 0.0],
+            [0.0, 0.1, 1.0, 1.1],
+            [0.0, 0.9, 1.0, -0.1],
+            [0.0, 10.0, 1.0, 11.0],
+            [1.0, 10.1, 0.0, 11.1],
+            [1.0, 101.0, 0.0, 100.0],
+        ]
+    ],
+    dtype=np.float32,
+)
+_SCORES_FLIPPED_COORDINATES_TEST = np.array([[[0.9, 0.75, 0.6, 0.95, 0.5, 0.3]]], dtype=np.float32)
+# Ten copies of one box; the last score (0.91) is the highest, the other nine are 0.9.
+_BOXES_IDENTICAL_BOXES_TEST = np.array([[[0.0, 0.0, 1.0, 1.0]] * 10], dtype=np.float32)
+_SCORES_IDENTICAL_BOXES_TEST = np.array([[[0.9] * 9 + [0.91]]], dtype=np.float32)
+# Aliases of the shared fixture; the "limit output size" case in test_nms passes _BOXES/_SCORES directly.
+_BOXES_LIMIT_OUT_TEST = _BOXES
+_SCORES_LIMIT_OUT_TEST = _SCORES
+# A single box with a single score.
+_BOXES_1_BOX_TEST = np.array([[[0.0, 0.0, 1.0, 1.0]]], dtype=np.float32)
+_SCORES_1_BOX_TEST = np.array([[[0.9]]], dtype=np.float32)
+# Aliases of the shared fixture for the "suppress by IOU" case.
+_BOXES_SCORE_TEST = _BOXES
+_SCORES_SCORE_TEST = _SCORES
+# Aliases of the shared fixture for the "suppress by IOU and score" case.
+_BOXES_IOU_SCORE_TEST = _BOXES
+_SCORES_IOU_SCORE_TEST = _SCORES
+# The shared boxes and scores, each stacked twice along the first axis.
+_BOXES_2_BATCHES_TEST = np.asarray([_BOXES[0], _BOXES[0]])
+_SCORES_2_BATCHES_TEST = np.asarray([_SCORES[0], _SCORES[0]])
+# Two batches of boxes; each batch pairs the shared scores with their reverse, in opposite order.
+_BOXES_2_BATCHES_2_CLASSES_TEST = np.asarray([_BOXES[0], _BOXES[0]])
+_SCORES_2_BATCHES_2_CLASSES_TEST = np.asarray(
+    [
+        [_SCORES[0, 0], _SCORES[0, 0][::-1]],  # 1 batch
+        [_SCORES[0, 0][::-1], _SCORES[0, 0]],  # 2 batch
+    ]
+)
+# One batch of boxes scored by two identical score rows.
+_BOXES_2_CLASSES_TEST = _BOXES
+_SCORES_2_CLASSES_TEST = np.asarray([[_SCORES[0, 0], _SCORES[0, 0]]])
+# Aliases of the shared fixture for the cases that leave some optional inputs as None.
+_BOXES_NONE_TEST = _BOXES
+_SCORES_NONE_TEST = _SCORES
+
+
+@pytest.mark.parametrize(
+    'boxes,scores,max_output_boxes_per_class,iou_threshold,score_threshold,center_point_box',
+    (
+        (_BOXES_CXCYWH_FORMAT_TEST, _SCORES_CXCYWH_FORMAT_TEST, 3, 0.1, 0.0, 1),  # center point box format
+        # flipped coordinates
+        # (_BOXES_FLIPPED_COORDINATES_TEST, _SCORES_FLIPPED_COORDINATES_TEST, 3, 0.5, 0.0, None),
+        (_BOXES_IDENTICAL_BOXES_TEST, _SCORES_IDENTICAL_BOXES_TEST, 3, 0.5, 0.0, None),  # identical boxes
+        (_BOXES, _SCORES, 2, 0.5, 0.0, None),  # limit output size
+        (_BOXES_1_BOX_TEST, _SCORES_1_BOX_TEST, 3, 0.5, 0.0, None),  # single box
+        (_BOXES_SCORE_TEST, _SCORES_SCORE_TEST, 3, 0.5, 0.0, None),  # suppress by IOU
+        (_BOXES_IOU_SCORE_TEST, _SCORES_IOU_SCORE_TEST, 3, 0.5, 0.4, None),  # suppress by IOU and score
+        (_BOXES_2_BATCHES_TEST, _SCORES_2_BATCHES_TEST, 2, 0.5, 0.0, None),  # two batches
+        (_BOXES_2_CLASSES_TEST, _SCORES_2_CLASSES_TEST, 2, 0.5, 0.0, None),  # two classes
+        (
+            _BOXES_2_BATCHES_2_CLASSES_TEST,
+            _SCORES_2_BATCHES_2_CLASSES_TEST,
+            2,
+            0.5,
+            0.8,
+            None,
+        ),  # two batches two classes
+        (_BOXES_NONE_TEST, _SCORES_NONE_TEST, 3, None, 0.4, None),  # test None params
+        (_BOXES_NONE_TEST, _SCORES_NONE_TEST, 3, 0.5, None, None),  # test None params
+        (_BOXES_NONE_TEST, _SCORES_NONE_TEST, None, 0.5, 0.4, None),  # test None params
+        (_BOXES_NONE_TEST, _SCORES_NONE_TEST, 3, None, None, None),  # test None params
+    ),
+)
+def test_nms(
+    boxes: np.ndarray,
+    scores: np.ndarray,
+    max_output_boxes_per_class: Optional[int],
+    iou_threshold: Optional[float],
+    score_threshold: Optional[float],
+    center_point_box: Optional[bool],
+) -> None:
+    """Run the NonMaxSuppression check for one parametrized combination of inputs."""
+    optional_inputs = {
+        'max_output_boxes_per_class': max_output_boxes_per_class,
+        'iou_threshold': iou_threshold,
+        'score_threshold': score_threshold,
+        'center_point_box': center_point_box,
+    }
+    _test_nms(boxes, scores, **optional_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pad_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pad_test.py
new file mode 100644
index 000000000..51c4f7971
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pad_test.py
@@ -0,0 +1,71 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_pad(
+    input_array: np.ndarray,
+    opset_version: int,
+    **kwargs,
+) -> None:
+    """Build a single Pad node for ``opset_version`` and verify the converted model.
+
+    For opset 2 every keyword argument, ``pads`` included, is passed to
+    ``onnx.helper.make_node``. For any other opset ``pads`` is removed from the
+    keyword arguments and fed as an int64 graph input instead.
+    """
+    test_inputs = {'x': input_array}
+    pads_is_attribute = opset_version == 2
+    if not pads_is_attribute:
+        pads_values = kwargs.pop('pads')
+        test_inputs['pads'] = np.array(pads_values, dtype=np.int64)
+
+    input_names = list(test_inputs)
+    pad_node = onnx.helper.make_node('Pad', inputs=input_names, outputs=['y'], **kwargs)
+    model = make_model_from_nodes(
+        nodes=pad_node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shape,pads,mode',
+    (
+        ([1, 1, 1, 3, 3], [0, 1, 1, 1, 1, 0, 0, 0, 1, 1], 'constant'),
+        ([1, 1, 1, 3, 3], [0, 0, 5, 3, 7, 0, 0, 2, 3, 11], 'edge'),
+        ([1, 1, 3, 3, 3], [0, 0, 1, 2, 1, 0, 0, 1, 2, 1], 'reflect'),
+        ([1, 1, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0], 'constant'),
+        ([1, 1, 3, 3], [0, 1, 1, 1, 1, 0, 0, 0], 'constant'),
+        ([1, 1, 3, 3], [0, 2, 0, 2, 0, 2, 0, 2], 'constant'),
+        ([1, 1, 3, 3], [1, 2, 4, 2, 5, 4, 4, 2], 'constant'),
+        ([1, 1, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0], 'edge'),
+        ([1, 1, 3, 3], [0, 0, 2, 3, 0, 0, 2, 3], 'edge'),
+        ([1, 1, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0], 'reflect'),
+        ([1, 1, 3, 3], [0, 0, 2, 1, 0, 0, 2, 1], 'reflect'),
+        ([1, 3, 3], [0, 4, 0, 1, 0, 1], 'constant'),
+        ([1, 3, 3], [0, 0, 3, 0, 0, 3], 'edge'),
+        ([1, 3, 3], [0, 0, 1, 0, 0, 1], 'reflect'),
+        # negative padding
+        ([3, 3, 3, 3, 3], [0, -1, 1, -1, 1, 0, 0, 0, 1, 1], 'constant'),
+        ([3, 3, 3, 3], [0, -1, -1, -1, -1, 0, 0, 0], 'constant'),
+        ([5, 7, 6], [0, -4, 0, -1, 0, 1], 'constant'),
+    ),
+)
+@pytest.mark.parametrize('opset_version', (2, 11, 13))
+def test_pad(
+    input_shape: List[int],
+    pads: List[int],
+    mode: str,
+    opset_version: int,
+) -> None:
+    """Check Pad on a random float32 tensor for each shape/pads/mode case and opset."""
+    input_array = np.random.random(size=input_shape).astype(np.float32)
+    _test_pad(input_array=input_array, mode=mode, opset_version=opset_version, pads=pads)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pow_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pow_test.py
new file mode 100644
index 000000000..c8cc59488
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/pow_test.py
@@ -0,0 +1,54 @@
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def test_pow() -> None:
+    """Check Pow on four (base, exponent) pairs drawn with different ranges and shapes."""
+    shape = [10, 3, 128, 128]
+
+    def _uniform(low: float, high: float) -> np.ndarray:
+        return np.random.uniform(low=low, high=high, size=shape).astype(np.float32)
+
+    # Bases are drawn before exponents so the random stream is consumed in a fixed order;
+    # pair i is (bases[i], exponents[i]).
+    bases = [
+        _uniform(0.0, 4.0),
+        _uniform(-4.0, 4.0),
+        _uniform(-4.0, 0.001),
+        _uniform(-4.0, 4.0),
+    ]
+    exponents = [
+        np.random.uniform(low=-3.0, high=3.0, size=1).astype(np.float32),
+        np.random.randint(low=0, high=4, size=[1] * len(shape)).astype(np.float32),
+        np.random.randint(low=-4, high=0, size=shape).astype(np.float32),
+        np.array([0.0], dtype=np.float32),
+    ]
+
+    for base, exponent in zip(bases, exponents):
+        test_inputs = {'x': base, 'y': exponent}
+        pow_node = onnx.helper.make_node(op_type='Pow', inputs=['x', 'y'], outputs=['z'])
+        model = make_model_from_nodes(
+            nodes=pow_node,
+            initializers={},
+            inputs_example=test_inputs,
+        )
+        check_onnx_model(model, test_inputs)
+
+
+def test_sqrt() -> None:
+    """Check Sqrt on a non-negative float32 tensor drawn uniformly from [0, 10)."""
+    tensor_shape = [10, 3, 128, 128]
+    sample = np.random.uniform(low=0.0, high=10.0, size=tensor_shape)
+    test_inputs = {'x': sample.astype(np.float32)}
+
+    sqrt_node = onnx.helper.make_node(op_type='Sqrt', inputs=['x'], outputs=['z'])
+
+    model = make_model_from_nodes(
+        nodes=sqrt_node,
+        initializers={},
+        inputs_example=test_inputs,
+    )
+    check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/range_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/range_test.py
new file mode 100644
index 000000000..3cafb56e9
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/range_test.py
@@ -0,0 +1,52 @@
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_range(
+    start: np.ndarray,
+    limit: np.ndarray,
+    delta: np.ndarray,
+) -> None:
+    """Build a single Range node and verify the converted model.
+
+    Output ``y`` is declared with the dtype of ``delta`` and a static length of
+    ``max(ceil((limit - start) / delta), 0)``.
+    """
+    test_inputs = {'start': start, 'limit': limit, 'delta': delta}
+
+    steps = np.ceil((limit - start) / delta)
+    num_elements = int(max(steps, 0))
+    y_info = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[delta.dtype],
+        shape=[num_elements],
+    )
+
+    range_node = onnx.helper.make_node(op_type='Range', inputs=['start', 'limit', 'delta'], outputs=['y'])
+    model = make_model_from_nodes(nodes=range_node, initializers={}, inputs_example=test_inputs, outputs_info=[y_info])
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_range() -> None:
+    """Check Range for an int32, a descending float32 and an int64 sequence."""
+    # (start, limit, delta, dtype) for each sequence; all three scalars of a case
+    # share one dtype.
+    cases = (
+        (1, 5, 2, np.int32),
+        (10.0, 6.0, -2.3, np.float32),
+        (1, 60, 7, np.int64),
+    )
+
+    for start, limit, delta, dtype in cases:
+        _test_range(
+            start=np.array(start, dtype=dtype),
+            limit=np.array(limit, dtype=dtype),
+            delta=np.array(delta, dtype=dtype),
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reciprocal_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reciprocal_test.py
new file mode 100644
index 000000000..caedcf228
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reciprocal_test.py
@@ -0,0 +1,27 @@
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def test_reciprocal() -> None:
+    """Check Reciprocal on random inputs of rank 1 through 4 and on an all-zero tensor."""
+    random_shapes = (
+        (128,),
+        (64, 128),
+        (3, 64, 128),
+        (10, 2, 64, 128),
+    )
+    samples = [np.random.randn(*shape) for shape in random_shapes]
+    samples.append(np.zeros([3, 3, 5]))
+
+    for sample in samples:
+        test_inputs = {'x': sample}
+        reciprocal_node = onnx.helper.make_node(op_type='Reciprocal', inputs=['x'], outputs=['y'])
+        model = make_model_from_nodes(
+            nodes=reciprocal_node,
+            initializers={},
+            inputs_example=test_inputs,
+        )
+        check_onnx_model(model, test_inputs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reduce_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reduce_test.py
new file mode 100644
index 000000000..e761010f4
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reduce_test.py
@@ -0,0 +1,175 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_reduce(input_tensor: np.ndarray, op_type: str, tol: float, **kwargs) -> None:
+    """Build a single ``op_type`` reduction node and verify the converted model.
+
+    Extra keyword arguments are forwarded to ``onnx.helper.make_node``. ``tol``
+    is passed as both ``atol_onnx_torch`` and ``atol_torch_cpu_cuda``.
+    """
+    test_inputs = {'input_tensor': input_tensor}
+    reduce_node = onnx.helper.make_node(op_type=op_type, inputs=['input_tensor'], outputs=['y'], **kwargs)
+    model = make_model_from_nodes(
+        nodes=reduce_node,
+        initializers={},
+        inputs_example=test_inputs,
+    )
+
+    tolerances = {
+        'atol_onnx_torch': tol,
+        'atol_torch_cpu_cuda': tol,
+        'atol_onnx_torch2onnx': 0.0,
+    }
+    check_onnx_model(model, test_inputs, **tolerances)
+
+
+def _test_reduce_sum(
+    input_tensor: np.ndarray,
+    axes: Optional[List[int]],
+    keepdims: Optional[int] = 1,
+    noop_with_empty_axes: Optional[int] = 0,
+) -> None:
+    """Build an opset-13 ReduceSum node (``axes`` given as an input) and verify it.
+
+    ``keepdims`` and ``noop_with_empty_axes`` are set as node attributes only
+    when they are not ``None``; for computing the expected output shape a
+    ``None`` is treated as ``1`` and ``0`` respectively. ``None`` or empty
+    ``axes`` are fed as an empty int64 tensor.
+    """
+    attributes = {}
+    if keepdims is None:
+        keepdims = 1
+    else:
+        attributes['keepdims'] = keepdims
+    if noop_with_empty_axes is None:
+        noop_with_empty_axes = 0
+    else:
+        attributes['noop_with_empty_axes'] = noop_with_empty_axes == 1
+
+    keep = bool(keepdims)
+    if axes is None or len(axes) == 0:
+        axes_input = np.array([], dtype=np.int64)
+        if noop_with_empty_axes == 0:
+            # Empty axes without the no-op flag reduce over every dimension.
+            output_shape = np.sum(input_tensor, keepdims=keep).shape
+        else:
+            output_shape = input_tensor.shape
+    else:
+        axes_input = np.array(axes, dtype=np.int64)
+        output_shape = np.sum(input_tensor, axis=tuple(axes), keepdims=keep).shape
+    test_inputs = {'input_tensor': input_tensor, 'axes': axes_input}
+
+    y_info = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[input_tensor.dtype],
+        shape=output_shape,
+    )
+    reduce_sum_node = onnx.helper.make_node(
+        op_type='ReduceSum',
+        inputs=['input_tensor', 'axes'],
+        outputs=['y'],
+        **attributes,
+    )
+    model = make_model_from_nodes(
+        nodes=reduce_sum_node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=13,
+        outputs_info=(y_info,),
+    )
+    check_onnx_model(model, test_inputs, atol_onnx_torch=10**-5, atol_torch_cpu_cuda=10**-5, atol_onnx_torch2onnx=0.0)
+
+
+@pytest.mark.parametrize(
+    'op_type,tol',
+    (
+        ('ReduceL1', 10**-5),
+        ('ReduceL2', 10**-5),
+        ('ReduceLogSum', 10**-5),
+        ('ReduceLogSumExp', 10**-5),
+        ('ReduceMax', 0),
+        ('ReduceMin', 0),
+        ('ReduceMean', 10**-5),
+        ('ReduceSum', 10**-5),
+        ('ReduceProd', 10**-5),
+        ('ReduceSumSquare', 10**-5),
+    ),
+)
+@pytest.mark.parametrize(
+    'shape,axes,keepdims',
+    (
+        ((1, 3, 8, 8), None, None),
+        ((1, 3, 8, 8), None, 0),
+        ((1, 3, 8, 8), None, 1),
+        ((1, 3, 8, 8), [1], 0),
+        ((1, 3, 8, 8), [1], 1),
+        ((1, 3, 8, 8), [1], None),
+        ((1, 3, 8, 8), [-2], 1),
+        ((2, 3, 8, 8), [-2, -4], 1),
+        ((2, 3, 8, 8), [1, 3], 1),
+    ),
+)
+def test_reduce(
+    op_type: str,
+    tol: float,
+    shape: Tuple[int, ...],
+    axes: Optional[List[int]],
+    keepdims: Optional[int],
+) -> None:
+    """Check one reduction operator on a random float32 tensor of ``shape``."""
+    # ReduceLogSum takes the log of the sum, so its samples are kept strictly positive.
+    left_boundary = 10**-5 if op_type == 'ReduceLogSum' else -10
+
+    test_kwargs = {
+        'input_tensor': np.random.uniform(left_boundary, 10, shape).astype(np.float32),
+        'op_type': op_type,
+        'tol': tol,
+    }
+    # Optional attributes are only set when the case provides them.
+    if axes is not None:
+        test_kwargs['axes'] = axes
+    if keepdims is not None:
+        test_kwargs['keepdims'] = keepdims
+
+    _test_reduce(**test_kwargs)
+
+
+@pytest.mark.parametrize(
+    'shape,axes,keepdims,noop_with_empty_axes',
+    (
+        ((1, 3, 8, 8), None, None, None),
+        ((1, 3, 8, 8), None, 0, 0),
+        ((1, 3, 8, 8), None, 1, 0),
+        ((1, 3, 8, 8), None, 1, 1),
+        ((1, 3, 8, 8), [1], 0, 0),
+        ((1, 3, 8, 8), [1], 1, 0),
+        ((1, 3, 8, 8), [1], None, 0),
+        ((1, 3, 8, 8), [-2], 1, 0),
+        ((2, 3, 8, 8), [-2, -4], 1, 0),
+        ((2, 3, 8, 8), [1, 3], 1, 0),
+    ),
+)
+def test_reduce_sum(
+    shape: Tuple[int, ...],
+    axes: Optional[List[int]],
+    keepdims: Optional[int],
+    noop_with_empty_axes: Optional[int],
+) -> None:
+    """Check opset-13 ReduceSum on a float32 tensor drawn uniformly from [-10, 10)."""
+    _test_reduce_sum(
+        input_tensor=np.random.uniform(-10, 10, shape).astype(np.float32),
+        axes=axes,
+        keepdims=keepdims,
+        noop_with_empty_axes=noop_with_empty_axes,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reshape_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reshape_test.py
new file mode 100644
index 000000000..e3d952bba
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/reshape_test.py
@@ -0,0 +1,54 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_reshape(
+    input_shape: List[int],
+    output_shape: List[int],
+    opset_version: int,
+    **kwargs,
+) -> None:
+    """Build a single Reshape node and verify the converted model.
+
+    The data is a random float32 tensor of ``input_shape``; the target shape is
+    stored as an int64 initializer rather than fed as a graph input.
+    """
+    data = np.random.uniform(low=-1.0, high=1.0, size=input_shape)
+    test_inputs = {'x': data.astype(np.float32)}
+    target_shape = np.asarray(output_shape, dtype=np.int64)
+    reshape_node = onnx.helper.make_node(op_type='Reshape', inputs=['x', 'output_shape'], outputs=['y'], **kwargs)
+    model = make_model_from_nodes(
+        nodes=reshape_node,
+        initializers={'output_shape': target_shape},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'input_shape,output_shape,opset_version',
+    (
+        ([2, 3, 16, 16], [2, -1, 3], 9),
+        ([2, 3, 16, 16], [2, 0, -1], 9),
+        ([2, 3, 16, 16], [2, 0, 1, 1, 1, 1, 1, 1, -1], 9),
+        ([2, 3, 16, 16], [-1, 1, 1, 2, 1, 1, 1, 2, 1, 1], 14),
+    ),
+)
+def test_reshape(
+    input_shape: List[int],
+    output_shape: List[int],
+    opset_version: int,
+) -> None:
+    """Check Reshape for one (input shape, target shape, opset) combination.
+
+    The target shapes in the cases contain ``0`` and ``-1`` entries.
+    """
+    _test_reshape(input_shape, output_shape, opset_version)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/resize_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/resize_test.py
new file mode 100644
index 000000000..20a908d6f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/resize_test.py
@@ -0,0 +1,140 @@
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_resize(
+    x: np.ndarray,
+    scales: Optional[np.ndarray] = None,
+    sizes: Optional[np.ndarray] = None,
+    align_corners: bool = False,
+    **kwargs,
+) -> None:
+    """Build an opset-13 Resize node and verify the converted model.
+
+    The node always receives four input slots: ``x``, an empty second slot
+    (``roi`` in the Resize operator definition), then ``scales`` and ``sizes``.
+    A slot whose argument is ``None`` is left as an empty string.
+    ``align_corners=True`` overrides any ``coordinate_transformation_mode``
+    keyword with ``'align_corners'``.
+    """
+    if align_corners:
+        kwargs['coordinate_transformation_mode'] = 'align_corners'
+
+    test_inputs = {'x': x}
+    node_inputs = ['x', '']
+    # Optional inputs keep their positional slot even when absent.
+    for input_name, value in (('scales', scales), ('sizes', sizes)):
+        if value is None:
+            node_inputs.append('')
+        else:
+            test_inputs[input_name] = value
+            node_inputs.append(input_name)
+
+    resize_node = onnx.helper.make_node(op_type='Resize', inputs=node_inputs, outputs=['y'], **kwargs)
+    y_info = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[x.dtype],
+        shape=None,
+    )
+
+    model = make_model_from_nodes(
+        nodes=resize_node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=[y_info],
+        opset_version=13,
+    )
+    check_onnx_model(
+        model,
+        test_inputs,
+        atol_onnx_torch=10**-6,
+        atol_torch_cpu_cuda=10**-6,
+    )
+
+
+def _test_resize_v10(
+    x: np.ndarray,
+    scales: np.ndarray = None,
+    mode: str = 'nearest',
+) -> None:
+    """Build an opset-10 Resize node fed with ``x`` and ``scales`` and verify it.
+
+    ``scales`` is always placed in the feed dictionary, including when it is
+    left at its ``None`` default.
+    """
+    test_inputs = {'x': x, 'scales': scales}
+    y_info = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[x.dtype],
+        shape=None,
+    )
+    resize_node = onnx.helper.make_node(op_type='Resize', inputs=['x', 'scales'], outputs=['y'], mode=mode)
+
+    model = make_model_from_nodes(
+        nodes=resize_node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=[y_info],
+        opset_version=10,
+    )
+    tolerances = {
+        'atol_onnx_torch': 10**-7,
+        'atol_torch_cpu_cuda': 10**-7,
+        'atol_onnx_torch2onnx': 0.0,
+    }
+    check_onnx_model(model, test_inputs, **tolerances)
+
+
+_UPSAMPLE_SIZES = np.array([1, 1, 500, 500]).astype(np.int64)  # twice the last two dims of _DATA
+_UPSAMPLE_SCALES = np.array([1.0, 1.0, 2.0, 2.0]).astype(np.float32)
+# Downsampling counterparts: half of the last two dimensions of _DATA.
+_DOWNSAMPLE_SIZES = np.array([1, 1, 125, 125]).astype(np.int64)
+_DOWNSAMPLE_SCALES = np.array([1.0, 1.0, 0.5, 0.5]).astype(np.float32)
+# Shared input tensor, sampled once at import time from a normal distribution (scale=3.0).
+_DATA = np.random.normal(scale=3.0, size=[1, 1, 250, 250]).astype(np.float32)
+
+
+@pytest.mark.parametrize(
+    'sizes,scales,mode,coordinate_transformation_mode',
+    (
+        (_UPSAMPLE_SIZES, None, 'linear', 'half_pixel'),
+        (None, _UPSAMPLE_SCALES, 'linear', 'half_pixel'),
+        (_DOWNSAMPLE_SIZES, None, 'linear', 'half_pixel'),
+        (None, _DOWNSAMPLE_SCALES, 'linear', 'half_pixel'),
+        (_UPSAMPLE_SIZES, None, 'nearest', 'asymmetric'),
+        (None, _UPSAMPLE_SCALES, 'nearest', 'asymmetric'),
+        (_DOWNSAMPLE_SIZES, None, 'nearest', 'asymmetric'),
+        (None, _DOWNSAMPLE_SCALES, 'nearest', 'asymmetric'),
+        (_UPSAMPLE_SIZES, None, 'cubic', 'half_pixel'),
+        (None, _UPSAMPLE_SCALES, 'cubic', 'half_pixel'),
+        (_DOWNSAMPLE_SIZES, None, 'cubic', 'half_pixel'),
+        (None, _DOWNSAMPLE_SCALES, 'cubic', 'half_pixel'),
+    ),
+)
+def test_resize(
+    sizes: np.ndarray,
+    scales: np.ndarray,
+    mode: str,
+    coordinate_transformation_mode: str,
+) -> None:
+    """Check Resize on ``_DATA`` for one sizes/scales, mode and coordinate-mode combination."""
+    # nearest_mode is fixed to 'floor' for every case, whatever the interpolation mode.
+    resize_attributes = {
+        'mode': mode,
+        'nearest_mode': 'floor',
+        'coordinate_transformation_mode': coordinate_transformation_mode,
+    }
+    _test_resize(x=_DATA, sizes=sizes, scales=scales, **resize_attributes)
+
+
+@pytest.mark.parametrize('mode', ('nearest',))
+def test_resize_v10(mode: str) -> None:  # pylint: disable=missing-function-docstring
+    for scale_factors in (_UPSAMPLE_SCALES, _DOWNSAMPLE_SCALES):  # upsample first, then downsample
+        _test_resize_v10(x=_DATA, scales=scale_factors, mode=mode)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/roialign_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/roialign_test.py
new file mode 100644
index 000000000..5919eb019
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/roialign_test.py
@@ -0,0 +1,218 @@
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def get_roi_align_input_values():  # type: ignore pylint: disable=missing-function-docstring
+    x = np.array(  # fixed float32 feature map literal of shape (1, 1, 10, 10)
+        [
+            [
+                [
+                    [
+                        0.2764,
+                        0.7150,
+                        0.1958,
+                        0.3416,
+                        0.4638,
+                        0.0259,
+                        0.2963,
+                        0.6518,
+                        0.4856,
+                        0.7250,
+                    ],
+                    [
+                        0.9637,
+                        0.0895,
+                        0.2919,
+                        0.6753,
+                        0.0234,
+                        0.6132,
+                        0.8085,
+                        0.5324,
+                        0.8992,
+                        0.4467,
+                    ],
+                    [
+                        0.3265,
+                        0.8479,
+                        0.9698,
+                        0.2471,
+                        0.9336,
+                        0.1878,
+                        0.4766,
+                        0.4308,
+                        0.3400,
+                        0.2162,
+                    ],
+                    [
+                        0.0206,
+                        0.1720,
+                        0.2155,
+                        0.4394,
+                        0.0653,
+                        0.3406,
+                        0.7724,
+                        0.3921,
+                        0.2541,
+                        0.5799,
+                    ],
+                    [
+                        0.4062,
+                        0.2194,
+                        0.4473,
+                        0.4687,
+                        0.7109,
+                        0.9327,
+                        0.9815,
+                        0.6320,
+                        0.1728,
+                        0.6119,
+                    ],
+                    [
+                        0.3097,
+                        0.1283,
+                        0.4984,
+                        0.5068,
+                        0.4279,
+                        0.0173,
+                        0.4388,
+                        0.0430,
+                        0.4671,
+                        0.7119,
+                    ],
+                    [
+                        0.1011,
+                        0.8477,
+                        0.4726,
+                        0.1777,
+                        0.9923,
+                        0.4042,
+                        0.1869,
+                        0.7795,
+                        0.9946,
+                        0.9689,
+                    ],
+                    [
+                        0.1366,
+                        0.3671,
+                        0.7011,
+                        0.6234,
+                        0.9867,
+                        0.5585,
+                        0.6985,
+                        0.5609,
+                        0.8788,
+                        0.9928,
+                    ],
+                    [
+                        0.5697,
+                        0.8511,
+                        0.6711,
+                        0.9406,
+                        0.8751,
+                        0.7496,
+                        0.1650,
+                        0.1049,
+                        0.1559,
+                        0.2514,
+                    ],
+                    [
+                        0.7012,
+                        0.4056,
+                        0.7879,
+                        0.3461,
+                        0.0415,
+                        0.2998,
+                        0.5094,
+                        0.3727,
+                        0.5482,
+                        0.0502,
+                    ],
+                ]
+            ]
+        ],
+        dtype=np.float32,
+    )
+    batch_indices = np.array([0, 0, 0], dtype=np.int64)  # one entry per ROI; all three point at batch element 0
+    rois = np.array([[0, 0, 9, 9], [0, 5, 4, 9], [5, 5, 9, 9]], dtype=np.float32)  # three boxes; presumably (x1, y1, x2, y2) — confirm
+    return x, batch_indices, rois  # note the order: feature map, batch indices, then ROIs
+
+
+def _test_roi(
+    input_tensor: np.ndarray,
+    rois: np.ndarray,
+    batch_indices: np.ndarray,
+    opset_version: int,
+    **kwargs,  # forwarded unchanged to make_node for the RoiAlign node
+) -> None:
+    test_inputs = {'X': input_tensor, 'rois': rois, 'batch_indices': batch_indices}  # key order becomes the node input order
+
+    node = onnx.helper.make_node(  # single RoiAlign node with one output named 'y'
+        op_type='RoiAlign',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+    onnx_type = NP_TYPE_TO_TENSOR_TYPE[np.dtype('float32')]
+    outputs_info = [make_tensor_value_info(name='y', elem_type=onnx_type, shape=None)]  # float32 output, shape left unspecified
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=opset_version,
+    )
+    check_onnx_model(  # the same opset_version is used to build and to check the model
+        onnx_model=model,
+        onnx_inputs=test_inputs,
+        opset_version=opset_version,
+    )
+
+
+@pytest.mark.parametrize('opset_version', (10, 13, 16))
+@pytest.mark.parametrize('coordinate_transformation_mode', ('half_pixel', 'output_half_pixel'))
+@pytest.mark.parametrize('mode', ('avg',))
+@pytest.mark.parametrize(
+    'spatial_scale,sampling_ratio,output_height,output_width',
+    (
+        (1.0, 2, 5, 5),
+        (0.25, 0, 7, 7),
+        (0.125, 0, 7, 7),
+        (0.6, 0, 1, 1),
+        (None, None, None, None),  # none of the four optional attributes is set in this case
+    ),
+)
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_roi(  # pylint: disable=missing-function-docstring
+    coordinate_transformation_mode: str,
+    mode: str,
+    spatial_scale: float,
+    sampling_ratio: int,
+    output_height: int,
+    output_width: int,
+    opset_version: int,
+) -> None:
+    x, batch_indices, rois = get_roi_align_input_values()
+    kwargs = {}  # collects only the attributes whose parametrized value is not None
+    if spatial_scale is not None:
+        kwargs['spatial_scale'] = spatial_scale
+    if sampling_ratio is not None:
+        kwargs['sampling_ratio'] = sampling_ratio
+    if output_height is not None:
+        kwargs['output_height'] = output_height
+    if output_width is not None:
+        kwargs['output_width'] = output_width
+    _test_roi(  # coordinate_transformation_mode is passed only for opset_version >= 16, otherwise None
+        input_tensor=x,
+        rois=rois,
+        batch_indices=batch_indices,
+        opset_version=opset_version,
+        mode=mode,
+        coordinate_transformation_mode=coordinate_transformation_mode if opset_version >= 16 else None,
+        **kwargs,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/scatter_nd_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/scatter_nd_test.py
new file mode 100644
index 000000000..0032e653f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/scatter_nd_test.py
@@ -0,0 +1,77 @@
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_scatter_nd(
+    data: np.ndarray,
+    indices: np.ndarray,
+    updates: np.ndarray,
+    opset_version: int,
+    **kwargs,  # forwarded unchanged to make_node for the ScatterND node
+) -> None:
+    test_inputs = {'data': data, 'indices': indices, 'updates': updates}  # all three are fed as runtime inputs
+
+    node = onnx.helper.make_node(  # single ScatterND node with one output named 'y'
+        op_type='ScatterND',
+        inputs=['data', 'indices', 'updates'],
+        outputs=['y'],
+        **kwargs,
+    )
+
+    model = make_model_from_nodes(nodes=node, initializers={}, inputs_example=test_inputs, opset_version=opset_version)
+    check_onnx_model(model, test_inputs, opset_version=opset_version)
+
+
+@pytest.mark.parametrize('opset_version', (11, 13, 14, 16))
+@pytest.mark.parametrize('reduction', ('none',))  # 'none' is the only reduction exercised
+@pytest.mark.parametrize(
+    'data',
+    (
+        np.array(  # single 4x4x4 float32 tensor shared by all cases
+            [
+                [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
+                [[1, 2, 3, 4], [5, 6, 7, 8], [8, 7, 6, 5], [4, 3, 2, 1]],
+                [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]],
+                [[8, 7, 6, 5], [4, 3, 2, 1], [1, 2, 3, 4], [5, 6, 7, 8]],
+            ],
+            dtype=np.float32,
+        ),
+    ),
+)
+@pytest.mark.parametrize(
+    'indices, updates',
+    (
+        (  # 3-component indices: each update is a single value
+            np.array([[0, 1, 2], [1, 2, 3]], dtype=np.int64),
+            np.array([1232, 5463], dtype=np.float32),
+        ),
+        (  # 2-component indices: each update is a length-4 row
+            np.array([[0, 1], [1, 2]], dtype=np.int64),
+            np.array([[8, 7, 6, 5], [4, 3, 2, 1]], dtype=np.float32),
+        ),
+        (  # 1-component indices: each update is a 4x4 slice
+            np.array([[0], [2]], dtype=np.int64),
+            np.array(
+                [
+                    [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+                    [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]],
+                ],
+                dtype=np.float32,
+            ),
+        ),
+    ),
+)
+def test_scatter_nd(  # pylint: disable=missing-function-docstring
+    data: np.ndarray, indices: np.ndarray, updates: np.ndarray, opset_version: int, reduction: str
+) -> None:
+    _test_scatter_nd(
+        data=data,
+        indices=indices,
+        updates=updates,
+        opset_version=opset_version,
+        reduction=reduction if opset_version >= 16 else None,  # 'reduction' is passed only for opset_version >= 16
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/shape_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/shape_test.py
new file mode 100644
index 000000000..3dc4a2a3a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/shape_test.py
@@ -0,0 +1,43 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_shape(
+    input_shape: List[int],
+    opset_version: int,
+    **kwargs,  # forwarded unchanged to make_node for the Shape node
+) -> None:
+    x = np.random.uniform(low=-1.0, high=1.0, size=input_shape).astype(np.float32)  # random float32 data of the requested shape
+    test_inputs = {'x': x}
+
+    node = onnx.helper.make_node(  # single Shape node with one output named 'y'
+        op_type='Shape',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+    onnx_type = NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')]  # output 'y' is declared as int64
+    outputs_info = [make_tensor_value_info(name='y', elem_type=onnx_type, shape=None)]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)  # NOTE(review): opset_version is not forwarded here — confirm this is intended
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_shape() -> None:  # pylint: disable=missing-function-docstring
+    _test_shape(input_shape=[2, 3, 16, 16, 16], opset_version=9)  # 5-D input
+    _test_shape(input_shape=[2, 3, 16, 16], opset_version=9)  # 4-D input
+    _test_shape(input_shape=[2, 3, 16], opset_version=9)  # 3-D input
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/slice_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/slice_test.py
new file mode 100644
index 000000000..e6d94d6b6
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/slice_test.py
@@ -0,0 +1,79 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_slice(
+    input_tensor: np.ndarray,
+    starts: np.ndarray,
+    ends: np.ndarray,
+    axes: Optional[np.ndarray] = None,
+    steps: Optional[np.ndarray] = None,
+) -> None:
+    test_inputs = {'input_tensor': input_tensor}  # only the data tensor is a runtime input
+
+    initializers = {'starts': starts, 'ends': ends}  # slice parameters are supplied as initializers
+    if axes is not None:  # 'axes' and 'steps' are added only when provided
+        initializers['axes'] = axes
+    if steps is not None:
+        initializers['steps'] = steps
+    # NOTE(review): with steps but no axes, 'steps' would occupy the fourth (axes) input slot; no current case does this.
+    node = onnx.helper.make_node(
+        op_type='Slice',
+        inputs=list(test_inputs.keys()) + list(initializers.keys()),  # data first, then the initializer names in insertion order
+        outputs=['y'],
+    )
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[input_tensor.dtype],  # output dtype follows the input dtype
+            shape=None,
+        ),
+    ]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers=initializers,
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    # onnx checker in torch 1.12 has problems with negative steps in Slice, so we disable it
+    ignore_export_checker = steps is not None and np.any(steps < 0)
+    check_onnx_model(model, test_inputs, ignore_export_checker=ignore_export_checker)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'input_shape,starts,ends,axes,steps',
+    (
+        ((20, 10, 15), [0, 0], [3, 10], [0, 1], [1, 1]),  # explicit axes and unit steps
+        ((20, 10, 15), [0, 0, 3], [20, 10, 4], None, None),  # axes and steps omitted
+        ((20, 10, 15), [1], [1000], [1], [1]),  # end larger than the axis size
+        ((20, 10, 15), [0], [-1], [1], [1]),  # negative end
+        ((20, 10, 15), [20, 10, 4], [0, 0, 1], [0, 1, 2], [-1, -3, -2]),  # negative steps
+        ((20, 10, 15), [0, 0, 3], [20, 10, 4], [0, -2, -1], None),  # negative axes, steps omitted
+    ),
+)
+def test_slice(  # pylint: disable=missing-function-docstring
+    input_shape: Tuple[int, ...],
+    starts: List[int],
+    ends: List[int],
+    axes: Optional[List[int]],
+    steps: Optional[List[int]],
+) -> None:
+    x = np.random.randn(*input_shape).astype(np.float32)  # random float32 data of the requested shape
+    _test_slice(  # list parameters are converted to int64 arrays; None is passed through as None
+        input_tensor=x,
+        starts=np.array(starts, dtype=np.int64),
+        ends=np.array(ends, dtype=np.int64),
+        axes=np.array(axes, dtype=np.int64) if axes is not None else None,
+        steps=np.array(steps, dtype=np.int64) if steps is not None else None,
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/split_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/split_test.py
new file mode 100644
index 000000000..975c8dd92
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/split_test.py
@@ -0,0 +1,89 @@
+from typing import List
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_split(
+    x: np.ndarray,
+    expected_output: List[np.ndarray],  # used here only for the output count, dtypes and shapes
+    opset_version: int,
+    **kwargs,  # remaining entries are forwarded to make_node for the Split node
+) -> None:
+    inputs = [
+        'x',
+    ]
+    test_inputs = {'x': x}
+
+    if opset_version >= 13 and kwargs.get('split') is not None:  # for opset >= 13 'split' is moved from kwargs to the inputs
+        split = kwargs.pop('split')
+        test_inputs['split'] = split
+        inputs.append('split')
+
+    node = onnx.helper.make_node(
+        op_type='Split',
+        inputs=inputs,
+        outputs=[f'output_{i}' for i, _ in enumerate(expected_output)],  # one output name per expected array
+        **kwargs,
+    )
+
+    outputs_info = [
+        make_tensor_value_info(
+            name=f'output_{i}',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[out.dtype],
+            shape=out.shape,
+        )
+        for i, out in enumerate(expected_output)
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+        opset_version=opset_version,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+INPUT_1D = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).astype(np.float32)  # 6-element vector
+INPUT_2D = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 9.0, 10.0, 11.0, 12.0]]).astype(np.float32)  # shape (2, 6)
+
+EMPTY_INPUT = np.array([]).astype(np.float32)  # zero-length vector; the expected result below is three empty arrays
+EXPECTED_EMPTY_OUT = [np.array([]).astype(np.float32), np.array([]).astype(np.float32), np.array([]).astype(np.float32)]
+
+
+@pytest.mark.parametrize(
+    'input_array,expected_out,axis,split',
+    (
+        (INPUT_1D, np.split(INPUT_1D, 3), None, None),  # three equal parts, no axis and no split given
+        (INPUT_1D, np.split(INPUT_1D, 3), 0, None),
+        (INPUT_1D, np.split(INPUT_1D, [2]), None, np.array([2, 4]).astype(np.int64)),  # cut at index 2 == part sizes [2, 4]
+        (INPUT_1D, np.split(INPUT_1D, [2]), 0, np.array([2, 4]).astype(np.int64)),
+        (INPUT_2D, np.split(INPUT_2D, 2, axis=1), 1, None),
+        (INPUT_2D, np.split(INPUT_2D, [2], axis=1), 1, np.array([2, 4]).astype(np.int64)),
+        (EMPTY_INPUT, EXPECTED_EMPTY_OUT, None, np.array([0, 0, 0]).astype(np.int64)),  # empty input, three zero-size parts
+    ),
+)
+@pytest.mark.parametrize('opset_version', (13, 11, 2))
+def test_split(  # pylint: disable=missing-function-docstring
+    input_array: np.ndarray,
+    expected_out: List[np.ndarray],
+    axis: Optional[int],
+    split: Optional[np.ndarray],
+    opset_version: int,
+) -> None:
+    kwargs = {}  # 'axis' and 'split' are forwarded only when they are not None
+    if axis is not None:
+        kwargs['axis'] = axis
+    if split is not None:
+        kwargs['split'] = split
+
+    _test_split(input_array, expected_out, opset_version=opset_version, **kwargs)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/squeeze_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/squeeze_test.py
new file mode 100644
index 000000000..dd04cfeef
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/squeeze_test.py
@@ -0,0 +1,90 @@
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_squeeze(
+    input_tensor: np.ndarray,
+    axes: Optional[List[int]],
+    opset_version: int,
+    **kwargs,  # forwarded to make_node for the Squeeze node (may gain 'axes' below)
+) -> None:
+    test_inputs: Dict[str, Any] = {'input_tensor': input_tensor}
+
+    if axes is not None and len(axes) > 0:
+        if opset_version >= 13:  # opset >= 13: axes go in as an int64 input; older opsets: as a node keyword argument
+            test_inputs['axes'] = np.array(axes, dtype=np.int64)
+        else:
+            kwargs['axes'] = axes
+        # declared output shape: squeeze only the listed axes whose extent is 1
+        output_shape = np.squeeze(input_tensor, axis=tuple(a for a in axes if input_tensor.shape[a] == 1)).shape
+    else:
+        output_shape = np.squeeze(input_tensor).shape  # no axes given: numpy drops every size-1 axis
+
+    node = onnx.helper.make_node(
+        op_type='Squeeze',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+        outputs_info=(
+            make_tensor_value_info(
+                name='y',
+                elem_type=NP_TYPE_TO_TENSOR_TYPE[input_tensor.dtype],
+                shape=output_shape,
+            ),
+        ),
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'shape,axes,opset_version',
+    (
+        ([1, 3, 4, 5], [0], 11),
+        ([1, 3, 1, 5], [-2], 11),  # negative axis
+        ([1, 3, 1, 5], [0, 2], 11),
+        ([1, 3, 1, 5], [2, 0], 11),  # axes listed in descending order
+        ([1, 3, 1, 1, 1, 5, 1], [2, 0, 6], 11),
+        ([1, 3, 1, 5], [0, -2], 11),
+        ([1, 3, 1, 5], [-2, 0], 11),
+        ([1, 3, 1, 5], None, 11),  # axes omitted
+        ([1, 1, 1, 1], None, 11),
+        ([1], None, 11),
+        ([3, 3, 3], None, 11),  # no size-1 axis present
+        ([1, 3, 4, 5], [0], 13),
+        ([1, 3, 1, 5], [-2], 13),
+        ([1, 3, 1, 5], [0, 2], 13),
+        ([1, 3, 1, 5], [2, 0], 13),
+        ([1, 3, 1, 5], [0, -2], 13),
+        ([1, 3, 1, 5], [-2, 0], 13),
+        ([1, 3, 1, 5], None, 13),
+        ([1, 1, 1, 1], None, 13),
+        ([1], None, 13),
+        ([3, 3, 3], None, 13),
+    ),
+)
+def test_squeeze(  # pylint: disable=missing-function-docstring
+    shape: List[int],
+    axes: List[int],  # NOTE(review): None is also passed by the cases above; Optional[List[int]] would be more accurate
+    opset_version: int,
+) -> None:
+    x = np.random.randn(*shape).astype(np.float32)  # random float32 data of the requested shape
+    _test_squeeze(input_tensor=x, axes=axes, opset_version=opset_version)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/sum_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/sum_test.py
new file mode 100644
index 000000000..074c05986
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/sum_test.py
@@ -0,0 +1,47 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_sum(
+    data_list: List[np.ndarray],
+) -> None:
+    test_inputs = {f'data_{i}': data for i, data in enumerate(data_list)}  # inputs are named data_0, data_1, ...
+
+    node = onnx.helper.make_node(op_type='Sum', inputs=list(test_inputs), outputs=['y'])
+    outputs_info = [
+        make_tensor_value_info(
+            name='y',
+            elem_type=NP_TYPE_TO_TENSOR_TYPE[data_list[0].dtype],  # output dtype is taken from the first input
+            shape=None,
+        ),
+    ]
+
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'input_shapes',
+    (
+        ([],),  # single input with an empty shape
+        ([2, 3, 4],),  # single 3-D input
+        ([3, 1], [2, 1, 6]),  # two inputs with different ranks
+        ([3, 1], [3, 4]),  # two inputs with the same rank but different shapes
+    ),
+)
+def test_sum(input_shapes: List[List[int]]) -> None:  # pylint: disable=missing-function-docstring
+    input_tensors = [np.random.normal(size=i_shape).astype(np.float32) for i_shape in input_shapes]  # one tensor per shape
+    _test_sum(data_list=input_tensors)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/test_functions.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/test_functions.py
new file mode 100644
index 000000000..aa1f60482
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/test_functions.py
@@ -0,0 +1,83 @@
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_functions(function: str, data: np.ndarray, opset_version, **kwargs) -> None:
+    test_inputs = {'input_tensor': data}
+    # 'function' is used directly as the ONNX op_type; extra kwargs are forwarded to make_node
+    node = onnx.helper.make_node(op_type=function, inputs=['input_tensor'], outputs=['y'], **kwargs)
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        opset_version=opset_version,
+    )
+
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.parametrize(
+    'function,input_shape',
+    (
+        ('Ceil', [8, 3, 32, 32]),
+        ('Floor', [8, 3, 32, 32]),
+        ('Round', [8, 3, 32, 32]),
+    ),
+)
+def test_roundings(function: str, input_shape: List[int]) -> None:  # pylint: disable=missing-function-docstring
+    data = np.random.randn(*input_shape).astype(np.float32)  # standard-normal float32 input
+    _test_functions(function, data=data, opset_version=11)
+
+
+@pytest.mark.parametrize(
+    'function,input_shape',
+    (
+        ('Abs', [8, 3, 32, 32]),
+        ('Cos', [8, 3, 32, 32]),
+        ('Exp', [8, 3, 32, 32]),
+        ('Log', [8, 3, 32, 32]),
+        ('Sign', [8, 3, 32, 32]),
+        ('Sin', [8, 3, 32, 32]),
+        ('Tan', [8, 3, 32, 32]),
+    ),
+)
+def test_common_functions(function: str, input_shape: List[int]) -> None:  # pylint: disable=missing-function-docstring
+    data = np.random.randn(*input_shape).astype(np.float32)  # standard-normal float32 input
+    if function == 'Log':
+        data[data <= 0] = 10**-4  # replace non-positive values so Log only receives positive input
+    _test_functions(function, data=data, opset_version=11)
+
+
+@pytest.mark.parametrize(
+    'function,input_shape',
+    (
+        ('Acos', [8, 3, 32, 32]),
+        ('Asin', [8, 3, 32, 32]),
+        ('Atan', [8, 3, 32, 32]),
+    ),
+)
+def test_arc_functions(function: str, input_shape: List[int]) -> None:  # pylint: disable=missing-function-docstring
+    if function in ['Acos', 'Asin']:  # Acos/Asin inputs are sampled uniformly between -1 and 1
+        data = np.random.uniform(-1, 1, input_shape).astype(np.float32)
+    else:  # Atan uses unbounded standard-normal input
+        data = np.random.randn(*input_shape).astype(np.float32)
+
+    _test_functions(function, data=data, opset_version=11)
+
+
+@pytest.mark.parametrize(
+    'function,input_shape',
+    (('Tanh', [8, 3, 32, 32]),),  # Tanh is the only function covered by this test
+)
+def test_hyperbolic_functions(  # pylint: disable=missing-function-docstring
+    function: str,
+    input_shape: List[int],
+) -> None:
+    data = np.random.randn(*input_shape).astype(np.float32)  # standard-normal float32 input
+    _test_functions(function, data=data, opset_version=11)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/tile_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/tile_test.py
new file mode 100644
index 000000000..4bb4f663f
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/tile_test.py
@@ -0,0 +1,51 @@
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_tile(
+    data: np.ndarray,
+    repeats: np.ndarray,
+    desire_out: np.ndarray,  # only its shape is used here, for the declared output shape
+) -> None:
+    test_inputs = {'input_tensor': data, 'repeats': repeats}  # both are fed as runtime inputs
+    node = onnx.helper.make_node(
+        op_type='Tile',
+        inputs=list(test_inputs),
+        outputs=['y'],
+    )
+    outputs_info = [
+        make_tensor_value_info(name='y', elem_type=NP_TYPE_TO_TENSOR_TYPE[data.dtype], shape=desire_out.shape),
+    ]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+def test_tile() -> None:  # pylint: disable=missing-function-docstring
+    data = np.random.rand(2, 3, 4, 5).astype(np.float32)
+    repeats = np.random.randint(low=1, high=10, size=(np.ndim(data),)).astype(np.int64)  # one random count in [1, 10) per axis
+    _test_tile(
+        data=data,
+        repeats=repeats,
+        desire_out=np.tile(data, repeats),  # reference result computed with numpy
+    )
+
+    data = np.array([[0, 1], [2, 3]], dtype=np.float32)  # second case: fixed 2x2 input
+
+    repeats = np.array([2, 2], dtype=np.int64)
+    _test_tile(
+        data=data,
+        repeats=repeats,
+        desire_out=np.array([[0, 1, 0, 1], [2, 3, 2, 3], [0, 1, 0, 1], [2, 3, 2, 3]], dtype=np.float32),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/topk_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/topk_test.py
new file mode 100644
index 000000000..13d4c2c4e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/topk_test.py
@@ -0,0 +1,44 @@
+import numpy as np
+import onnx
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_topk(data: np.ndarray, k: np.ndarray, **kwargs) -> None:  # pylint: disable=invalid-name
+    test_inputs = {'input_tensor': data, 'k': k}  # 'k' is fed as a runtime input; kwargs go to make_node
+
+    node = onnx.helper.make_node(
+        op_type='TopK',
+        inputs=list(test_inputs),
+        outputs=['y_0', 'y_1'],  # declared below as y_0: input dtype, y_1: int64
+        **kwargs,
+    )
+    outputs_info = [
+        make_tensor_value_info(name='y_0', elem_type=NP_TYPE_TO_TENSOR_TYPE[data.dtype], shape=None),
+        make_tensor_value_info(name='y_1', elem_type=NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')], shape=None),
+    ]
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+        outputs_info=outputs_info,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+def test_topk() -> None:  # pylint: disable=missing-function-docstring
+    x = np.array(  # 3x4 float32 matrix holding 0..11 in row-major order
+        [
+            [0, 1, 2, 3],
+            [4, 5, 6, 7],
+            [8, 9, 10, 11],
+        ],
+        dtype=np.float32,
+    )
+
+    _test_topk(data=x, k=np.array([3], dtype=np.int64), axis=1, largest=1)
+    _test_topk(data=x, k=np.array([3], dtype=np.int64), axis=-1, largest=1)  # same call with a negative axis
+    _test_topk(data=x, k=np.array([3], dtype=np.int64), axis=1, largest=1, sorted=1)  # 'sorted' set explicitly
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/transpose_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/transpose_test.py
new file mode 100644
index 000000000..11ada6f9a
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/transpose_test.py
@@ -0,0 +1,36 @@
+import itertools
+
+import numpy as np
+import onnx
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_transpose(data: np.ndarray, **kwargs) -> None:
+    test_inputs = {'input_tensor': data}
+    node = onnx.helper.make_node(  # single Transpose node; kwargs (e.g. perm) are forwarded to make_node
+        op_type='Transpose',
+        inputs=list(test_inputs),
+        outputs=['y'],
+        **kwargs,
+    )
+    model = make_model_from_nodes(
+        nodes=node,
+        initializers={},
+        inputs_example=test_inputs,
+    )
+    check_onnx_model(model, test_inputs)
+
+
+def test_transpose() -> None:  # pylint: disable=missing-function-docstring
+    shape = (2, 3, 4)
+    data = np.random.random_sample(shape).astype(np.float32)
+    permutations = list(itertools.permutations(np.arange(len(shape))))  # every ordering of the three axes
+    for permutation in permutations:
+        _test_transpose(
+            data=data,
+            perm=np.array(permutation, dtype=np.int64),
+        )
+
+    _test_transpose(data=data)  # final call passes no 'perm' at all
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/unsqueeze_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/unsqueeze_test.py
new file mode 100644
index 000000000..aa89626e3
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/unsqueeze_test.py
@@ -0,0 +1,82 @@
+from typing import Any
+from typing import Dict
+from typing import List
+
+import numpy as np
+import onnx
+import pytest
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def _test_unsqueeze(
+    input_tensor: np.ndarray,
+    axes: List[int],
+    opset_version: int,
+    **kwargs,
+) -> None:
+    """Build a single Unsqueeze node for ``opset_version`` and validate it with ``check_onnx_model``.
+
+    For opset >= 13 ``axes`` is fed as a second int64 input tensor; for older opsets it is
+    set as a node attribute instead.
+    """
+    feeds: Dict[str, Any] = {'input_tensor': input_tensor}
+
+    if opset_version >= 13:
+        feeds['axes'] = np.array(axes, dtype=np.int64)
+    else:
+        kwargs['axes'] = axes
+
+    unsqueeze_node = onnx.helper.make_node(op_type='Unsqueeze', inputs=list(feeds), outputs=['y'], **kwargs)
+
+    # The expected output shape is computed with numpy so the graph output is fully specified.
+    expected_output = make_tensor_value_info(
+        name='y',
+        elem_type=NP_TYPE_TO_TENSOR_TYPE[input_tensor.dtype],
+        shape=np.expand_dims(input_tensor, axis=axes).shape,
+    )
+    onnx_model = make_model_from_nodes(
+        nodes=unsqueeze_node,
+        initializers={},
+        inputs_example=feeds,
+        opset_version=opset_version,
+        outputs_info=(expected_output,),
+    )
+    check_onnx_model(onnx_model, feeds)
+
+
+# Known warning. Shape Inference do not work properly in opset_version=9 and negative indices.
+# [W:onnxruntime:, execution_frame.cc:721 VerifyOutputSizes]
+# Expected shape from model of {2,3,16,16} does not match actual shape of {2,1,3,16,1,16} for output y
+@pytest.mark.filterwarnings('ignore::torch.jit._trace.TracerWarning')
+@pytest.mark.parametrize(
+    'shape,axes,opset_version',
+    (
+        ([2, 3, 16, 16], [0], 11),
+        ([2, 3, 16, 16], [2], 11),
+        ([2, 3, 16, 16], [-1], 11),
+        ([2, 3, 16, 16], [-3], 11),
+        ([2, 3, 16, 16], [0, 1], 11),
+        ([2, 3, 16, 16], [1, 2, 3, 4, 5], 11),
+        ([2, 3, 16, 16], [1, -2], 11),
+        ([2, 3, 16, 16], [-2, 1], 11),
+        ([2, 3, 16, 16], [0], 13),
+        ([2, 3, 16, 16], [2], 13),
+        ([2, 3, 16, 16], [-1], 13),
+        ([2, 3, 16, 16], [-3], 13),
+        ([2, 3, 16, 16], [0, 1], 13),
+        ([2, 3, 16, 16], [1, 2, 3, 4, 5], 13),
+        ([2, 3, 16, 16], [1, -2], 13),
+        ([2, 3, 16, 16], [-2, 1], 13),
+    ),
+)
+def test_unsqueeze(shape: List[int], axes: List[int], opset_version: int) -> None:
+    """Run the Unsqueeze check on a random float32 tensor of ``shape``.
+
+    The parametrization covers single, multiple, negative and mixed-sign axes for opsets 11 and 13.
+    """
+    random_tensor = np.random.randn(*shape).astype(np.float32)
+    _test_unsqueeze(input_tensor=random_tensor, axes=axes, opset_version=opset_version)
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/where_test.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/where_test.py
new file mode 100644
index 000000000..09f83f7f2
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/node_converters/where_test.py
@@ -0,0 +1,59 @@
+import numpy as np
+import onnx
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+
+from tests.utils.common import check_onnx_model
+from tests.utils.common import make_model_from_nodes
+
+
+def where_test(
+    condition: np.ndarray,
+    x: np.ndarray,
+    y: np.ndarray,
+) -> None:
+    """Build a single Where node over (condition, x, y) and validate it with ``check_onnx_model``.
+
+    The output ``z`` is declared with the element type of ``x`` and no fixed shape.
+    """
+    feeds = {'condition': condition, 'x': x, 'y': y}
+    where_node = onnx.helper.make_node(op_type='Where', inputs=list(feeds), outputs=['z'])
+    # Output type follows ``x``; the shape is deliberately left unspecified.
+    z_info = make_tensor_value_info(name='z', elem_type=NP_TYPE_TO_TENSOR_TYPE[x.dtype], shape=None)
+    onnx_model = make_model_from_nodes(
+        nodes=where_node,
+        initializers={},
+        inputs_example=feeds,
+        outputs_info=[z_info],
+    )
+    check_onnx_model(onnx_model, feeds)
+
+
+def test_where() -> None:
+    """Check Where on three operand combinations.
+
+    Covers int64 operands, float32 operands and a (2, 1) ``x`` that has to be broadcast.
+    """
+    condition_values = [[1, 0], [1, 1]]
+    y_values = [[9, 8], [7, 6]]
+
+    # Same-shape int64 operands.
+    where_test(
+        condition=np.array(condition_values, dtype=bool),
+        x=np.array([[1, 2], [3, 4]], dtype=np.int64),
+        y=np.array(y_values, dtype=np.int64),
+    )
+
+    # Same-shape float32 operands.
+    where_test(
+        condition=np.array(condition_values, dtype=bool),
+        x=np.array([[1, 2], [3, 4]], dtype=np.float32),
+        y=np.array(y_values, dtype=np.float32),
+    )
+
+    # ``x`` has shape (2, 1) while the condition and ``y`` are (2, 2).
+    where_test(
+        condition=np.array(condition_values, dtype=bool),
+        x=np.array([[1], [3]], dtype=np.float32),
+        y=np.array(y_values, dtype=np.float32),
+    )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/pytest.ini b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/pytest.ini
new file mode 100644
index 000000000..d2abac1a1
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/pytest.ini
@@ -0,0 +1,8 @@
+[pytest]
+
+log_level=ERROR
+log_cli=True
+log_cli_level=INFO
+
+filterwarnings =
+    ignore::DeprecationWarning
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/__init__.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/common.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/common.py
new file mode 100644
index 000000000..e7ce145fd
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/common.py
@@ -0,0 +1,302 @@
+import io
+from typing import Any
+from typing import Callable
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Type
+from typing import Union
+
+import numpy as np
+import onnx
+import onnxruntime as ort
+import torch
+from onnx import defs
+from onnx import numpy_helper
+from onnx.helper import make_graph
+from onnx.helper import make_model
+from onnx.helper import make_operatorsetid
+from onnx.helper import make_tensor_value_info
+from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+from onnx.onnx_ml_pb2 import ModelProto
+from onnx.onnx_ml_pb2 import NodeProto
+from onnx.onnx_ml_pb2 import ValueInfoProto
+from onnx.shape_inference import infer_shapes
+
+from onnx2torch.converter import convert
+
+try:
+    from torch.onnx import CheckerError
+except ImportError:
+
+    class CheckerError(Exception):
+        """Fake CheckerError for torch < 1.12."""
+
+
+def make_model_from_nodes(
+    nodes: Union[NodeProto, Sequence[NodeProto]],
+    initializers: Dict[str, np.ndarray],
+    inputs_example: Optional[Dict[str, np.ndarray]] = None,
+    inputs_info: Optional[Sequence[ValueInfoProto]] = None,
+    outputs_info: Optional[Sequence[ValueInfoProto]] = None,
+    opset_version: Optional[int] = 11,
+) -> ModelProto:
+    """Build, shape-infer and check an ONNX model made of ``nodes`` (one node or a sequence).
+
+    Inputs come from ``inputs_info`` or are derived from ``inputs_example`` (ValueError if neither).
+    By default every node output that no node consumes is a graph output typed like the first input.
+    """
+    if inputs_info is None and inputs_example is None:
+        raise ValueError('inputs_example or inputs_info must be set')
+
+    # The annotation allows a sequence, so normalise instead of assuming a single NodeProto.
+    node_list = [nodes] if isinstance(nodes, NodeProto) else list(nodes)
+    if inputs_info is None:
+        inputs_info = [
+            make_tensor_value_info(name=name, elem_type=NP_TYPE_TO_TENSOR_TYPE[data.dtype], shape=data.shape)
+            for name, data in inputs_example.items()
+        ]
+
+    if outputs_info is None:
+        elem_type = inputs_info[0].type.tensor_type.elem_type
+        consumed = {name for node in node_list for name in node.input if name}
+        produced = [name for node in node_list for name in node.output if name not in consumed]
+        outputs_info = [make_tensor_value_info(name=name, elem_type=elem_type, shape=None) for name in produced]
+
+    graph_proto = make_graph(
+        nodes=node_list,
+        name='test_graph',
+        inputs=inputs_info,
+        outputs=outputs_info,
+        initializer=[numpy_helper.from_array(data, name=name) for name, data in initializers.items()],
+    )
+
+    opset_imports = None
+    if opset_version is not None:
+        opset_imports = [make_operatorsetid(domain=defs.ONNX_DOMAIN, version=opset_version)]
+
+    model = make_model(graph_proto, opset_imports=opset_imports)
+    model = infer_shapes(model, check_type=False)
+    onnx.checker.check_model(model, False)
+    return model
+
+
+def _convert_data(data: Any, from_type: Type, convert_function: Callable) -> Any:
+    """Recursively apply ``convert_function`` to every ``from_type`` leaf in nested dicts/lists/tuples."""
+    def recurse(item: Any) -> Any:
+        return _convert_data(item, from_type, convert_function)
+
+    if isinstance(data, Dict):
+        return {key: recurse(value) for key, value in data.items()}
+    if isinstance(data, (Tuple, List)):
+        # Rebuild the same container type (list or tuple) around the converted items.
+        return type(data)(recurse(value) for value in data)
+    return convert_function(data) if isinstance(data, from_type) else data
+
+
+def convert_data_onnx2torch(data: Any, device: str = 'cpu') -> Any:
+    """Convert every numpy array nested in ``data`` into a torch tensor placed on ``device``."""
+    def to_tensor(array):
+        return torch.from_numpy(array).to(device=device)
+    return _convert_data(data, from_type=np.ndarray, convert_function=to_tensor)
+
+
+def convert_data_torch2onnx(data: Any) -> Any:
+    """Convert every torch tensor nested in ``data`` into a detached CPU numpy array."""
+    def to_numpy(tensor):
+        return tensor.detach().cpu().numpy()
+    return _convert_data(data, from_type=torch.Tensor, convert_function=to_numpy)
+
+
+def convert_onnx_inputs_to_torch_inputs(
+    onnx_model: ModelProto,
+    onnx_inputs: Dict[str, Any],
+    device: str = 'cpu',
+) -> List[Any]:
+    """Return the torch versions of ``onnx_inputs`` ordered like the graph inputs of ``onnx_model``."""
+    # Graph inputs without a provided value are skipped rather than reported.
+    graph_input_names = (graph_input.name for graph_input in onnx_model.graph.input)
+    present = [name for name in graph_input_names if name in onnx_inputs]
+    return [convert_data_onnx2torch(onnx_inputs[name], device=device) for name in present]
+
+
+def calc_ort_outputs(
+    model: ModelProto,
+    inputs: Dict[str, Any],
+    skip_unused_inputs: bool = False,
+) -> List[Any]:
+    """Run ``model`` with onnxruntime on the CPU provider and return its outputs.
+
+    With ``skip_unused_inputs`` the feed is first restricted to names that are graph inputs.
+    """
+    session = ort.InferenceSession(
+        model.SerializeToString(),
+        providers=['CPUExecutionProvider'],
+    )
+
+    if skip_unused_inputs:
+        declared = [graph_input.name for graph_input in model.graph.input]
+        inputs = {name: value for name, value in inputs.items() if name in declared}
+
+    # Passing output_names=None requests all model outputs (onnxruntime convention).
+    return session.run(output_names=None, input_feed=inputs)
+
+
+def calc_torch_outputs(
+    model: ModelProto,
+    inputs: Dict[str, Any],
+    device: str = 'cpu',
+) -> Any:
+    """Convert ``model`` with onnx2torch, run it in eval mode on ``device`` and return numpy outputs."""
+    torch_inputs = convert_onnx_inputs_to_torch_inputs(onnx_model=model, onnx_inputs=inputs, device=device)
+    torch_module = convert(model).eval().to(device=device)
+    torch_outputs = torch_module(*torch_inputs)  # pylint: disable=not-callable
+
+    # Bring the results back as CPU numpy arrays, keeping any dict/list/tuple nesting.
+    return convert_data_torch2onnx(torch_outputs)
+
+
+def calc_torch_and_ort_outputs(
+    model: ModelProto,
+    test_inputs: Dict[str, np.ndarray],
+):
+    """Return ``(torch_outputs, ort_outputs)`` for ``model`` evaluated on ``test_inputs``."""
+    # The torch path is evaluated before the onnxruntime one.
+    from_torch = calc_torch_outputs(model=model, inputs=test_inputs)
+    return from_torch, calc_ort_outputs(model=model, inputs=test_inputs)
+
+
+def convert_onnx2torch2onnx(
+    model: ModelProto,
+    inputs: Dict[str, np.ndarray],
+    opset_version: int = 13,
+    ignore_export_checker: bool = False,
+    **export_kwargs,
+) -> ModelProto:
+    """Convert ``model`` to torch, export it back to ONNX in memory and return the reloaded model."""
+    torch_model = convert(model)
+    example_args = tuple(torch.tensor(value) for value in inputs.values())
+
+    with io.BytesIO() as buffer:
+        try:
+            torch.onnx.export(
+                model=torch_model,
+                args=example_args,
+                f=buffer,
+                input_names=list(inputs.keys()),
+                opset_version=opset_version,
+                **export_kwargs,
+            )
+        except CheckerError:
+            # Checker failures are tolerated only on request; the buffer is still read afterwards.
+            if not ignore_export_checker:
+                raise
+
+        return onnx.load_from_string(buffer.getvalue())
+
+
+def _check_onnx_model(
+    onnx_model: ModelProto,
+    onnx_inputs: Dict[str, Any],
+    onnx_torch_check_function: Callable,
+    torch_cpu_cuda_check_function: Optional[Callable] = None,
+    onnx_torch2onnx_check_function: Optional[Callable] = None,
+    ignore_export_checker: bool = False,
+    opset_version: int = 13,
+) -> None:
+    """Compare onnxruntime, torch (CPU, optionally CUDA) and optionally re-exported ONNX results."""
+    reference_outputs = calc_ort_outputs(onnx_model, onnx_inputs)
+    cpu_outputs = calc_torch_outputs(onnx_model, onnx_inputs, device='cpu')
+    onnx_torch_check_function(reference_outputs, cpu_outputs)
+
+    if torch_cpu_cuda_check_function is not None:
+        # This comparison executes the converted model on 'cuda',
+        # so it needs an available CUDA device.
+        cuda_outputs = calc_torch_outputs(onnx_model, onnx_inputs, device='cuda')
+        torch_cpu_cuda_check_function(cpu_outputs, cuda_outputs)
+
+    if onnx_torch2onnx_check_function is None:
+        return
+
+    export_options = {'ignore_export_checker': ignore_export_checker, 'opset_version': opset_version}
+    reexported_model = convert_onnx2torch2onnx(onnx_model, inputs=onnx_inputs, **export_options)
+    reexported_outputs = calc_ort_outputs(reexported_model, onnx_inputs, skip_unused_inputs=True)
+    onnx_torch2onnx_check_function(reference_outputs, reexported_outputs)
+
+
+def check_onnx_model(
+    onnx_model: ModelProto,
+    onnx_inputs: Dict[str, Any],
+    atol_onnx_torch: float = 0.0,
+    atol_torch_cpu_cuda: float = 0.0,
+    atol_onnx_torch2onnx: float = 0.0,
+    ignore_export_checker: bool = False,
+    opset_version: int = 13,
+) -> None:
+    """Assert that onnxruntime, torch (CPU and CUDA) and re-exported ONNX outputs agree.
+
+    Each ``atol_*`` value is the absolute tolerance given to ``np.isclose`` for one of the
+    three comparisons; the remaining options are forwarded to ``_check_onnx_model``.
+    """
+
+    def assert_pairs_close(left, right, atol, label):
+        """Compare outputs pairwise; ``zip`` stops at the shorter sequence."""
+        for x, y in zip(left, right):
+            assert np.all(np.isclose(x, y, atol=atol)), f'{label} outputs have significant difference'
+
+    def onnx_torch_check_function(onnx_output, torch_output):
+        """Wrap the torch result in a list when onnxruntime produced exactly one output."""
+        if len(onnx_output) == 1:
+            torch_output = [torch_output]
+        assert_pairs_close(onnx_output, torch_output, atol_onnx_torch, 'ort and torch')
+
+    def torch_cpu_cuda_check_function(torch_cpu_output, torch_cuda_output):
+        """Compare CPU and CUDA torch results, wrapping bare (non list/tuple) outputs first."""
+        if not isinstance(torch_cpu_output, (List, Tuple)):
+            torch_cpu_output = [torch_cpu_output]
+            torch_cuda_output = [torch_cuda_output]
+        assert_pairs_close(torch_cpu_output, torch_cuda_output, atol_torch_cpu_cuda, 'torch cpu and torch cuda')
+        return True
+
+    def onnx_torch2onnx_check_function(onnx_output, torch2onnx_output):
+        """Compare original onnxruntime outputs with those of the re-exported model."""
+        assert_pairs_close(onnx_output, torch2onnx_output, atol_onnx_torch2onnx, 'ort and ort+torch2onnx')
+        return True
+
+    _check_onnx_model(
+        onnx_model=onnx_model,
+        onnx_inputs=onnx_inputs,
+        onnx_torch_check_function=onnx_torch_check_function,
+        torch_cpu_cuda_check_function=torch_cpu_cuda_check_function,
+        onnx_torch2onnx_check_function=onnx_torch2onnx_check_function,
+        ignore_export_checker=ignore_export_checker,
+        opset_version=opset_version,
+    )
+
+
+def check_torch_model(
+    torch_model: torch.nn.Module,
+    onnx_inputs: Dict[str, Any],
+    atol_onnx_torch: float = 0.0,
+    atol_torch_cpu_cuda: float = 0.0,
+    atol_onnx_torch2onnx: float = 0.0,
+    opset_version: int = 13,
+) -> None:
+    """Export ``torch_model`` to ONNX in memory and run ``check_onnx_model`` on the result."""
+    export_kwargs = {'input_names': list(onnx_inputs.keys()), 'opset_version': opset_version}
+    example_args = tuple(torch.tensor(value) for value in onnx_inputs.values())
+
+    with io.BytesIO() as buffer:
+        torch.onnx.export(model=torch_model, args=example_args, f=buffer, **export_kwargs)
+        exported_model = onnx.load_from_string(buffer.getvalue())
+
+        check_onnx_model(
+            onnx_model=exported_model,
+            onnx_inputs=onnx_inputs,
+            atol_onnx_torch=atol_onnx_torch,
+            atol_torch_cpu_cuda=atol_torch_cpu_cuda,
+            atol_onnx_torch2onnx=atol_onnx_torch2onnx,
+            opset_version=opset_version,
+        )
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/resources.py b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/resources.py
new file mode 100644
index 000000000..89c7589fe
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch/tests/utils/resources.py
@@ -0,0 +1,71 @@
+import tarfile
+import urllib.request
+from pathlib import Path
+
+import onnx
+from google_drive_downloader import GoogleDriveDownloader
+from onnx import ModelProto  # pylint: disable=no-name-in-module
+
+from tests import DATASETS_DIR
+from tests import MODELS_DIR
+
+_BASE_URL = 'https://gitlab.expasoft.com/p.ivanov/onnx2torch_data/-/raw/main/models_for_tests'
+
+_CHKP_DETECTION_URL = f'{_BASE_URL}/detection'
+_CHKP_SEGMENTATION_URL = f'{_BASE_URL}/segmentation'
+_CHKP_TRANSFORMERS_URL = f'{_BASE_URL}/transformers'
+_CHKP_KEYPOINTS_URL = f'{_BASE_URL}/keypoints'
+_CHKP_OTHER_URL = f'{_BASE_URL}/other'
+
+_ONNX_MODELS_IDS = {
+    'deeplabv3_mnv3_large': f'{_CHKP_SEGMENTATION_URL}/deeplabv3_mobilenet_v3_large.onnx',
+    'deeplabv3_plus_resnet101': f'{_CHKP_SEGMENTATION_URL}/deeplabv3_resnet101_dimans.onnx',
+    'hrnet': f'{_CHKP_SEGMENTATION_URL}/hrnet.onnx',
+    'unet': f'{_CHKP_SEGMENTATION_URL}/unet_resnet34.onnx',
+    'retinanet': f'{_CHKP_DETECTION_URL}/retinanet_r50_fpn.onnx',
+    'ssd300_vgg': f'{_CHKP_DETECTION_URL}/ssd300.onnx',
+    'ssdlite': f'{_CHKP_DETECTION_URL}/ssdlite.onnx',
+    'yolov3_d53': f'{_CHKP_DETECTION_URL}/yolov3_d53_tuned_shape.onnx',
+    'yolov5_ultralitics': f'{_CHKP_DETECTION_URL}/yolov5_ultralitics.onnx',
+    'swin': f'{_CHKP_TRANSFORMERS_URL}/swin.onnx',
+    'vit': f'{_CHKP_TRANSFORMERS_URL}/vit.onnx',
+    'gptj_2_random_blocks': f'{_CHKP_TRANSFORMERS_URL}/gptj_2_random_blocks.onnx',
+    'resnet50': 'https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx',
+    '3d_gan': f'{_CHKP_OTHER_URL}/3d_gan.onnx',
+    'shelfnet': f'{_CHKP_KEYPOINTS_URL}/shelfnet.onnx',
+    'point_arch': f'{_CHKP_OTHER_URL}/point_arch_nq.onnx',
+}
+
+_MINIMAL_DATASETS_ID = '1Vd7qfQotrRADPLFxViA2tRpz7tBymR31'
+
+
+def get_model_path(name: str) -> Path:
+    """Return the local path of model ``name``, downloading it if absent (RuntimeError if unknown)."""
+    model_path = MODELS_DIR / f'{name}.onnx'
+    if not model_path.exists():
+        if name not in _ONNX_MODELS_IDS:
+            raise RuntimeError('Cannot find model path.')
+        partial_path = model_path.with_name(f'{model_path.name}.part')  # avoid leaving a truncated .onnx
+        urllib.request.urlretrieve(url=_ONNX_MODELS_IDS[name], filename=partial_path)
+        partial_path.replace(model_path)
+    return model_path
+
+
+def get_model(name: str) -> ModelProto:
+    """Load the ONNX model called ``name``, resolving its file through ``get_model_path``."""
+    return onnx.load_model(str(get_model_path(name)))
+
+
+def get_minimal_dataset_path():
+    """Return the minimal dataset directory, downloading and unpacking its archive when missing."""
+    dataset_path = DATASETS_DIR / 'minimal_dataset'
+    if not dataset_path.exists():
+        arch_path = dataset_path.with_suffix('.tar.gz')
+        GoogleDriveDownloader.download_file_from_google_drive(
+            file_id=_MINIMAL_DATASETS_ID, dest_path=arch_path, overwrite=True
+        )
+        # Use the 'data' extraction filter where this Python provides it, so members cannot escape dataset_path.
+        extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
+        with tarfile.open(arch_path, 'r:gz') as arch_file:
+            arch_file.extractall(path=dataset_path, **extract_kwargs)
+    return dataset_path
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
index 396998600..36983c06e 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
@@ -1,19 +1,31 @@
-onnx==1.15.0
-onnxsim==0.4.36
-tf2onnx==1.16.1
+tqdm
+fpdf
+pandas==1.5.1
+onnx
+onnxsim
 onnxruntime
-onnxoptimizer==0.3.13
-bert-tensorflow==1.0.1
-
-pandas==2.1.1
-numpy==1.23.0
+tf2onnx==1.16.1
 matplotlib
+onnxoptimizer
 scikit-learn
-opencv-python==4.6.0.66
-opencv-python-headless
+prompt_toolkit
+pycocotools
+transformers==4.33.2
+
 tokenization==1.0.7
-tokenizers==0.13.3
+opencv-python==4.6.0.66
+# typing-extensions==3.7.4.3  # disabled: conflicts with the typing_extensions==4.10.0 pin below
+protobuf==3.20.3
 sentencepiece==0.1.96
+py-libnuma==1.2
+tokenizers==0.13.3
+virtualenv==16.7.9
+bert-tensorflow==1.0.1
 typing_extensions==4.10.0
+numpy==1.23.0
 
-py-libnuma==1.2
+
+# System packages needed for NUMA support; install them with apt (pip cannot parse these lines):
+#   apt-get install -y libnuma-dev
+#   apt-get update
+#   apt-get upgrade -y libnuma-dev
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
index 543591a31..379b3964f 100755
--- a/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py
@@ -495,9 +495,9 @@ def benchmark_interact(self, dataloader):
             'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
             format(self.batch_size, qps, avg_latency, tail_latency))
         
-        # log.info(
-        #     'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'.
-        #     format(self.batch_size, fps, predict_avg_latency, tail_latency))
+        log.info(
+            'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'.
+            format(self.batch_size, fps, predict_avg_latency, tail_latency))
 
 
         report['QPS'] = qps
diff --git a/vendor_zoo/Iluvatar/BI-V150-PCIe.json b/vendor_zoo/Iluvatar/BI-V150-PCIe.json
new file mode 100644
index 000000000..c774b98fd
--- /dev/null
+++ b/vendor_zoo/Iluvatar/BI-V150-PCIe.json
@@ -0,0 +1,49 @@
+{
+    "SKU参数": {
+        "厂商": "Iluvatar",
+        "型号": "BI-V150",
+        "用途": "Training/Inference",
+        "照片": "vendor_zoo/Iluvatar/image_3.png"
+    },  
+    "硬件参数": {
+        "制程(NM)": 7,
+        "尺寸": "FHFL, Dual Slot Card",
+        "接口": "PCIe 4.0x16",
+        "功耗(W/TDP)": 350 
+    },  
+    "内存参数": {
+        "内存层次架构图": null,
+        "内存": {
+            "内存类型": "HBM2e",
+            "内存容量(GB)": 64, 
+            "内存带宽(GB/s)": 1126.4
+        }   
+    },  
+    "算力参数": {
+        "PE层次架构图": null,
+        "PE参数": {
+            "算力架构": "同构众核",
+            "并行方式": "SIMT",
+            "通信带宽(GB/s)": null
+        },
+        "标量参数": {
+            "标量精度": null,
+            "INT8标量算力(TFLOPS)": null,
+            "FP16标量算力(TFLOPS)": null,
+            "FP32标量算力(TFLOPS)": null
+        },
+        "张量参数": {
+            "张量精度": "FP32, FP16, BF16, INT8",
+            "INT8张量算力(TFLOPS)": 590.0,
+            "FP16张量算力(TFLOPS)": 196.0,
+            "FP32张量算力(TFLOPS)": 48.0
+        }
+    },
+    "卡间通信参数": {
+        "通信方式": "PCIe",
+        "端口数量": null,
+        "RDMA协议": null,
+        "下行带宽(GB/s)": null,
+        "上行带宽(GB/s)": null
+    }
+}

From c0e8567bd47c64b08b933a761c7968d8fea373ec Mon Sep 17 00:00:00 2001
From: "defu.yu" <defu.yu@iluvatar.com>
Date: Tue, 27 Aug 2024 10:47:51 +0000
Subject: [PATCH 28/28] update general micro

---
 .../backends/ILUVATAR/batch_exec.sh           |  51 ++++
 byte_micro_perf/README.zh_CN.md               | 234 ++++++++++++++++++
 byte_micro_perf/batch_exec.sh                 |  41 +++
 byte_micro_perf/init_environment.sh           |  19 ++
 4 files changed, 345 insertions(+)
 create mode 100644 byte_infer_perf/general_perf/backends/ILUVATAR/batch_exec.sh
 create mode 100644 byte_micro_perf/README.zh_CN.md
 create mode 100644 byte_micro_perf/batch_exec.sh
 create mode 100644 byte_micro_perf/init_environment.sh

diff --git a/byte_infer_perf/general_perf/backends/ILUVATAR/batch_exec.sh b/byte_infer_perf/general_perf/backends/ILUVATAR/batch_exec.sh
new file mode 100644
index 000000000..539d7661e
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/ILUVATAR/batch_exec.sh
@@ -0,0 +1,51 @@
+## Before running this batch, build the environment with `bash init_environment.sh`; the runs below simply use the configuration files under the workloads directory.
+## To change the configuration parameters see: byte_infer_perf/general_perf/backends/ILUVATAR/README.md
+
+
+cd ByteMLPerf/byte_infer_perf || exit 1
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/bert-torch-fp32/
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/albert-torch-fp32/
+
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/deberta-torch-fp32/
+
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/roberta-torch-fp32/
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
+# Report location
+# general_perf/reports/ILUVATAR/videobert-onnx-fp32
+
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+# Report location
+# general_perf/reports/ILUVATAR/widedeep-tf-fp32
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/swin-large-torch-fp32
+
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32
+# Report location
+# general_perf/reports/ILUVATAR/resnet50-torch-fp32
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32
+# Report location
+# general_perf/reports/ILUVATAR/yolov5-onnx-fp32
+
+# Run
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
+# Report location
+# general_perf/reports/ILUVATAR/roformer-tf-fp32
diff --git a/byte_micro_perf/README.zh_CN.md b/byte_micro_perf/README.zh_CN.md
new file mode 100644
index 000000000..bc391afc3
--- /dev/null
+++ b/byte_micro_perf/README.zh_CN.md
@@ -0,0 +1,234 @@
+# ByteMicroPerf 操作说明
+# 1、基础信息描述
+ByteMicroPerf 是 ByteMLPerf 的一部分，主要用于评估主流深度学习模型在新兴异构硬件上的频繁计算和通信操作的性能。其主要特点如下：
+* 方便快捷地支持各种异构硬件
+* 评估过程贴合实际业务场景
+* 涵盖多个类别的频繁操作符
+* 推理精度包括：float32、bfloat16、half、int8
+
+
+### 功能实现
+* 由于天数智芯对cutlass库的维护已经废弃，目前关于矩阵算力Gemm算子相关的int8精度推理主要采用cublas相关的库来实现的，目前通过即时编译的方式实现矩阵算力相关的算子int8精度推理。
+
+### 环境准备：
+* sdk版本： 由天数智芯工程师提供
+* ixrt版本：由天数智芯工程师提供
+
+# 2、支持的算子列表
+### 访存密集型算子
+访存密集型算子主要指的是那些在进行计算时需要频繁访问内存的算子，这类算子的执行往往受到内存访问速度的限制，而不是计算能力的限制。访存密集型算子的执行时间很大程度上取决于数据传输的速度和效率，而不是单纯的计算速度。
+
+* layernorm
+* softmax
+* reduce_sum
+* reduce_min
+* reduce_max
+
+### 张量算子
+这些算子在张量计算中扮演着重要的角色，它们分别执行不同的操作，以满足各种数据处理和分析的需求。
+
+* index_add
+* sort
+* unique
+* scatter
+* gather
+
+### 矩阵运算算子
+gemm：它表示一般的矩阵乘法操作，其中矩阵的维度可以是任意的；
+gemv：矩阵与向量相乘的操作，它是GEMM的一个特例，其中一个是向量而不是矩阵。GEMV操作在深度学习中的线性层计算中非常常见，因为它能够高效地处理大量的数据。
+group_gemm：是一种特殊的矩阵乘法操作，其中多个矩阵被分组并进行并行计算。这种操作在提升计算效率方面非常有效，特别是在处理大规模矩阵运算时，如多专家模型（MoE）的训练中，
+通过Group GEMM算子将细碎的多专家计算操作与通信进行合并，从而提升性能。
+batch_gemm：扩展了标准的GEMM操作，允许同时对多个矩阵进行乘法运算。这种操作在需要并行处理多个矩阵乘积的场景中非常有用，例如在深度学习模型的批量处理中。
+
+* gemm
+* gemv
+* group_gemm
+* batch_gemm
+
+### 数据传输算子
+在GPU编程中，数据传输涉及到将数据从一个地方（Host，即CPU）移动到另一个地方（Device，即GPU）。
+
+* host2device
+* device2host
+
+### 数学运算算子
+* sin
+* cos
+* exp
+* exponential
+* silu
+* gelu
+* swiglu
+* cast
+
+### 通信算子
+这些算子主要用于分布式计算环境中，特别是在大规模并行处理（MPP）和分布式计算领域，它们用于在不同的计算节点之间传输和同步数据。
+
+* allreduce
+* allgather
+* reducescatter
+* alltoall
+* broadcast
+* p2p
+
+### 二元算子
+这些操作涉及两个输入值进行计算。
+
+* add
+* mul
+* sub
+* div
+
+### 注意事项
+* 通信类的算子根据workloads下面的配置文件json描述可以实现2、4、8卡直接的通信测试；如果要测试这类算子至少需要两张显卡资源。
+
+# 3、用例测试
+### 访存密集型算子
+#### layernorm 算子
+```bash
+python3 launch.py --task layernorm --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### softmax 算子
+```bash
+python3 launch.py --task softmax --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### reduce_sum 算子
+```bash
+python3 launch.py --task reduce_sum --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### reduce_min 算子
+```bash
+python3 launch.py --task reduce_min --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### reduce_max 算子
+```bash
+python3 launch.py --task reduce_max --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 张量算子
+#### index_add 算子
+```bash
+python3 launch.py --task index_add --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### sort 算子
+```bash
+python3 launch.py --task sort --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### unique 算子
+```bash
+python3 launch.py --task unique --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### scatter 算子
+```bash
+python3 launch.py --task scatter --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### gather 算子
+```bash
+python3 launch.py --task gather --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 矩阵运算算子
+#### gemm 算子
+```bash
+python3 launch.py --task gemm --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### gemv 算子
+```bash
+python3 launch.py --task gemv --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### group_gemm 算子
+```bash
+python3 launch.py --task group_gemm --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### batch_gemm 算子
+```bash
+python3 launch.py --task batch_gemm --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 数据传输算子
+####  host2device 算子
+```bash
+python3 launch.py --task host2device --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### device2host 算子
+```bash
+python3 launch.py --task device2host --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 数学运算算子
+#### sin 算子
+```bash
+python3 launch.py --task sin --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### cos 算子
+```bash
+python3 launch.py --task cos --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### exp 算子
+```bash
+python3 launch.py --task exp --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### exponential 算子
+```bash
+python3 launch.py --task exponential --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### silu 算子
+```bash
+python3 launch.py --task silu --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### gelu 算子
+```bash
+python3 launch.py --task gelu --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### swiglu 算子
+```bash
+python3 launch.py --task swiglu --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### cast 算子
+```bash
+python3 launch.py --task cast --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 通信算子
+#### allreduce 算子
+```bash
+python3 launch.py --task allreduce --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### allgather 算子
+```bash
+python3 launch.py --task allgather --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### reducescatter 算子
+```bash
+python3 launch.py --task reducescatter --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### alltoall 算子
+```bash
+python3 launch.py --task alltoall --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### broadcast 算子
+```bash
+python3 launch.py --task broadcast --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### p2p 算子
+```bash
+python3 launch.py --task p2p --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+
+### 二元算子
+#### add 算子
+```bash
+python3 launch.py --task add --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### mul 算子
+```bash
+python3 launch.py --task mul --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### sub 算子
+```bash
+python3 launch.py --task sub --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
+#### div 算子
+```bash
+python3 launch.py --task div --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json
+```
diff --git a/byte_micro_perf/batch_exec.sh b/byte_micro_perf/batch_exec.sh
new file mode 100644
index 000000000..bdad07c40
--- /dev/null
+++ b/byte_micro_perf/batch_exec.sh
@@ -0,0 +1,41 @@
+## Memory-bound (memory-access-intensive) operators
+for task in 'layernorm' 'softmax' 'reduce_sum' 'reduce_min' 'reduce_max'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Tensor operators
+for task in 'index_add' 'sort' 'unique' 'scatter' 'gather'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Matrix operators
+for task in 'gemm' 'gemv' 'group_gemm' 'batch_gemm'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Data transfer operators
+for task in 'host2device' 'device2host'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Math operators
+for task in 'sin' 'cos' 'exp' 'exponential' 'silu' 'gelu' 'swiglu' 'cast'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Communication operators
+for task in 'allreduce' 'allgather' 'reducescatter' 'alltoall' 'broadcast' 'p2p'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
+
+## Binary (two-operand) operators
+for task in 'add' 'mul' 'sub' 'div'
+do
+    python3 launch.py --task ${task} --hardware_type ILUVATAR --vendor_path ../vendor_zoo/Iluvatar/BI-V150-PCIe.json;
+done
diff --git a/byte_micro_perf/init_environment.sh b/byte_micro_perf/init_environment.sh
new file mode 100644
index 000000000..7c231a51b
--- /dev/null
+++ b/byte_micro_perf/init_environment.sh
@@ -0,0 +1,19 @@
+# The base image already ships a numpy build that breaks numpy calls at runtime, so uninstall it first.
+pip3 uninstall numpy -y
+pip3 uninstall numpy -y
+
+pip3 install matplotlib
+pip3 install pandas
+pip3 install scikit-learn
+pip3 install prompt_toolkit
+pip3 install tqdm
+pip3 install opencv-python
+pip3 install transformers
+pip3 install tokenization
+pip3 install fpdf
+pip3 install attrs
+pip3 install decorator
+pip3 install typing-extensions
+pip3 install pydot
+
+apt-get install -y ninja-build