diff --git a/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh b/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh
old mode 100644
new mode 100755
index f786dfa0b..f7960afdf
--- a/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh
+++ b/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh
@@ -1,12 +1,16 @@
 #！bin/bash
-if [ ! -d "general_perf/backends/CPU/venv" ];then
-    virtualenv -p python3 general_perf/backends/CPU/venv
-    source general_perf/backends/CPU/venv/bin/activate
-    general_perf/backends/CPU/venv/bin/python3 -m pip install --upgrade pip  -q
-    general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
+if [ "$3" != 'TPU' ]; then
+    if [ ! -d "general_perf/backends/CPU/venv" ];then
+        virtualenv -p python3 general_perf/backends/CPU/venv
+        source general_perf/backends/CPU/venv/bin/activate
+        general_perf/backends/CPU/venv/bin/python3 -m pip install --upgrade pip  -q
+        general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
+    else
+        source general_perf/backends/CPU/venv/bin/activate
+        general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
+    fi
 else
-    source general_perf/backends/CPU/venv/bin/activate
-    general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
+    echo "Hardware Type is TPU, Skip venv..."
 fi
 
 python3 general_perf/backends/CPU/calculate_cpu_diff.py --task $1 --batch_size $2
diff --git a/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py b/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py
old mode 100644
new mode 100755
index 3d88a1114..76fa45675
--- a/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py
+++ b/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py
@@ -2,8 +2,8 @@
 import json
 import logging
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-import tensorflow as tf
 import torch
+import tensorflow as tf
 import onnxruntime
 import time
 import numpy as np
diff --git a/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py b/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py
old mode 100644
new mode 100755
index eec8c98b2..78fa11efc
--- a/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py
+++ b/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py
@@ -2,8 +2,8 @@
 import json
 import logging
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-import tensorflow as tf
 import torch
+import tensorflow as tf
 import onnxruntime
 import time
 import numpy as np
diff --git a/byte_infer_perf/general_perf/backends/TPU/README.md b/byte_infer_perf/general_perf/backends/TPU/README.md
new file mode 100644
index 000000000..9313a42ed
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/TPU/README.md
@@ -0,0 +1,32 @@
+
+
+# How to run
+
+## 1. Create docker container
+
+```bash
+docker pull sophgo/tpuc_dev:latest
+docker run --privileged --name TPUPerf -td -v /dev/:/dev/ -v /opt/:/opt/ -v <your path>:/workspace/ --entrypoint bash sophgo/tpuc_dev:latest
+docker exec -it TPUPerf bash
+```
+
+## 2. Environment Initialization
+
+```bash
+pip3 install tpu_mlir
+apt install unzip
+pip3 install dfss
+python3 -m dfss --url=open@sophgo.com:/sophon-demo/Stable_diffusion_3/BM1690/sophon-sail2.zip
+unzip sophon-sail2.zip
+# Follow the README in the sail2 directory to build the wheel (.whl) in the current environment and install it
+```
+
+## 3. Run ByteMLPerf for TPU backend
+
+```bash
+python3  launch.py --task yolov5-onnx-fp32 --hardware_type TPU
+python3  launch.py --task resnet50-torch-fp32 --hardware_type TPU
+```
+
+# Notes
+> FP32 and INT8 quantization are currently supported for resnet50-torch-fp32.
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/TPU/compile_backend_tpu.py b/byte_infer_perf/general_perf/backends/TPU/compile_backend_tpu.py
new file mode 100755
index 000000000..261203068
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/TPU/compile_backend_tpu.py
@@ -0,0 +1,178 @@
+# Copyright 2023 Graphcore Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import json
+import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import Any, Dict
+import tpu_mlir
+import shutil
+
+from general_perf.backends import compile_backend
+
+log = logging.getLogger("CompileBackendTPU")
+
+class CompileBackendTPU(compile_backend.CompileBackend):
+    """Compile backend that builds a ``.bmodel`` with the model_transform / model_deploy command-line tools."""
+
+    def __init__(self):
+        super().__init__()
+        self.hardware_type = "TPU"
+        self.need_reload = False
+        self.need_quant = False
+        self.current_dir = os.path.split(os.path.abspath(__file__))[0]
+        self.model_config = None
+        self.precision = "fp32"
+        # Defaults used by compile() when interact_info does not override them.
+        self.model_precision = "F32"
+        self.mean = "0.0,0.0,0.0"
+        self.scale = "1.0,1.0,1.0"
+        self.pixel_format = "rgb"
+        self.input_num = 200
+
+    def version(self) -> str:
+        """
+        Return compile backend version details
+        """
+        return tpu_mlir.distribution
+
+    def pre_optimize(self, configs: Dict[str, Any]):
+        """Model pre-optimization interface.
+
+        Requirements: Model pre-optimization
+        cannot change the model format. Torch model export to ONNX is allowed.
+        """
+        return configs
+
+    @staticmethod
+    def _run_tool(command: str, log_path: str) -> bool:
+        """Run one shell command, write its output to ``log_path`` and return True if it exited with 0."""
+        with open(log_path, 'w') as logfile:
+            return_code = subprocess.call(command, stdout=logfile, stderr=subprocess.STDOUT, shell=True)
+        if return_code != 0:
+            log.error("Command exited with code %s (see %s): %s", return_code, log_path, command)
+        return return_code == 0
+
+    def compile(self, configs: Dict[str, Any], dataloader=None) -> Dict[str, Any]:
+        """Run model_transform, the optional INT8 calibration and model_deploy for one model.
+
+        Args:
+            configs: workload configuration; ``model_info`` and ``interact_info`` are read.
+            dataloader: not used by this backend.
+
+        Returns:
+            The compile report. ``compile_status`` is ``"success"`` only when every
+            tool exited with code 0; otherwise it is ``"failed"``.
+        """
+        if not self.model_config:
+            self.model_config = configs
+
+        self.model_info = configs["model_info"]
+        self.interact_info = configs["interact_info"]
+        self.model_path = self.model_info["model_path"]
+        self.input_shapes = self.model_info["input_shape"][self.model_info["inputs"]]
+        self.input_shapes_str = ','.join(str(num) for num in self.input_shapes)
+        self.model_name = self.model_info["model"]
+        if "model_precision" in self.interact_info:
+            self.model_precision = self.interact_info["model_precision"]
+            # Optional keys fall back to the current defaults instead of raising KeyError.
+            self.mean = self.interact_info.get("mean", self.mean)
+            self.scale = self.interact_info.get("scale", self.scale)
+            self.pixel_format = self.interact_info.get("pixel_format", self.pixel_format)
+            self.input_num = self.interact_info.get("input_num", self.input_num)
+
+        self.precision = self.model_precision.upper()
+        gen_mlir_commands = f'model_transform \
+            --model_name {self.model_name} \
+            --model_def ../../{self.model_path} \
+            --mean {self.mean} \
+            --scale {self.scale} \
+            --pixel_format {self.pixel_format}  \
+            --input_shapes [[{self.input_shapes_str}]] \
+            --mlir {self.model_name}.mlir'
+        deploy_commands = f'model_deploy \
+            --mlir {self.model_name}.mlir \
+            --quantize {self.model_precision} \
+            --chip bm1690 \
+            --model {self.model_name}.bmodel'
+
+        current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+        origin_dir = os.getcwd()
+        self.compile_dir_name = current_dir + '/compiled_models/'
+        if os.path.exists(self.compile_dir_name):
+            shutil.rmtree(self.compile_dir_name)
+        os.mkdir(self.compile_dir_name)
+        os.chdir(self.compile_dir_name)
+        try:
+            compiled_ok = self._run_tool(gen_mlir_commands, './model_transform.log')
+            if self.precision == "INT8":
+                self.dataset_path = current_dir+"/datasets/"+self.model_info["dataset_name"]+"/"+self.interact_info["dataset_path"]
+                run_calibration_commands = f'run_calibration {self.model_name}.mlir \
+                    --dataset {self.dataset_path} \
+                    --input_num {self.input_num}  \
+                    -o {self.model_name}_cali_table'
+                # Once a step has failed the later ones are skipped: their input file would be missing.
+                compiled_ok = compiled_ok and self._run_tool(run_calibration_commands, './run_calibration.log')
+                deploy_commands += f' --calibration_table {self.model_name}_cali_table'
+            compiled_ok = compiled_ok and self._run_tool(deploy_commands, './model_deploy.log')
+        finally:
+            os.chdir(origin_dir)
+
+        return {
+            "model": self.model_name,
+            "framework": configs["model_info"]["framework"],
+            "compile_precision": self.precision,
+            "input_type": configs["model_info"]["input_type"].split(","),
+            "max_batch_size": configs["model_info"]["max_batch_size"],
+            # NOTE(review): "failed" is assumed to be the status string the engine expects - confirm.
+            "compile_status": "success" if compiled_ok else "failed",
+            "optimizations": {},
+            "instance_count": 1,
+            "device_count": 1,
+            "sg_percent": 100,
+            "segments": [
+                {
+                    "sg_idx": 0,
+                    "is_fallback": False,
+                    "input_tensor_map": configs["model_info"]["input_shape"],
+                    "output_tensor_map": configs["model_info"]["outputs"],
+                    "compiled_model": [
+                        {
+                            "compiled_bs": 1,
+                            "compiled_obj": configs["model_info"]["model_path"],
+                        },
+                    ],
+                },
+            ],
+            "interact_info": self.model_config,
+        }
+
+    def get_interact_profile(self, config: Dict[str, Any]):
+        """Collect information for core engine to let user interactively fill in configurations."""
+        # load the interact_info by model name
+        interact_info_file = os.path.join(
+            self.current_dir, "interact_infos", config["model_info"]["model"] + ".json"
+        )
+        # A model without an interact_infos JSON gets an empty profile instead of FileNotFoundError.
+        if not os.path.exists(interact_info_file):
+            return []
+        with open(interact_info_file, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    def get_best_batch_size(self) -> compile_backend.List[int] | None:
+        return None
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/TPU/interact_infos/resnet50-torch-fp32.json b/byte_infer_perf/general_perf/backends/TPU/interact_infos/resnet50-torch-fp32.json
new file mode 100644
index 000000000..90dec8edb
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/TPU/interact_infos/resnet50-torch-fp32.json
@@ -0,0 +1,52 @@
+[
+  
+  {
+    "name": "model_precision",
+    "default": "F32",
+    "depends": null,
+    "note" : "选择数据格式",
+    "type": "string",
+    "options":["F32", "INT8", "F16"],
+    "dialog_type": "Radiolist Dialog"
+  },
+  {
+    "name": "dataset_path",
+    "default": "ILSVRC2012_img_val",
+    "depends": null,
+    "note" : "量化数据集路径",
+    "type": "string",
+    "dialog_type": "Input Dialog"
+  },
+  {
+    "name": "mean",
+    "default": "103.53,116.28,123.67",
+    "depends": null,
+    "note" : "前处理均值",
+    "type": "string",
+    "dialog_type": "Input Dialog"
+  },
+  {
+    "name": "scale",
+    "default": "0.01742919,0.017507,0.01712475",
+    "depends": null,
+    "note" : "前处理比例",
+    "type": "string",
+    "dialog_type": "Input Dialog"
+  },
+  {
+    "name": "pixel_format",
+    "default": "rgb",
+    "depends": null,
+    "note" : "前处理像素格式",
+    "type": "string",
+    "dialog_type": "Input Dialog"
+  },
+  {
+    "name": "input_num",
+    "default": "200",
+    "depends": null,
+    "note" : "量化数据集数量",
+    "type": "int",
+    "dialog_type": "Input Dialog"
+  }
+]
diff --git a/byte_infer_perf/general_perf/backends/TPU/runtime_backend_tpu.py b/byte_infer_perf/general_perf/backends/TPU/runtime_backend_tpu.py
new file mode 100755
index 000000000..0b19c76c6
--- /dev/null
+++ b/byte_infer_perf/general_perf/backends/TPU/runtime_backend_tpu.py
@@ -0,0 +1,143 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import time
+
+import numpy as np
+import sophon.sail as sail
+from general_perf.backends import runtime_backend
+import multiprocessing
+log = logging.getLogger("RuntimeBackendTPU")
+
+class RuntimeBackendTPU(runtime_backend.RuntimeBackend):
+    """Runtime backend that runs a compiled ``.bmodel`` through the ``sophon.sail`` bindings."""
+
+    def __init__(self):
+        super().__init__()
+        self.hardware_type = "TPU"
+        self.need_reload = False
+        self.model_runtimes = []
+        self.configs = None
+        self.pack_config = None
+        self.batch_size = -1
+        self.pack_bs = -1
+        self.packrunner = False
+        self.engine = None
+        self.runner_name = "SAIL"
+        self.compiled_dir = (
+            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + "/compiled_models/"
+        )
+        self.precision = "fp32"
+        # Written by the benchmark worker processes: latest end time / earliest start time.
+        self.max_time=multiprocessing.Value('d', -float('inf'))
+        self.min_time=multiprocessing.Value('d', float('inf'))
+        self.lock = multiprocessing.Lock()
+
+    def version(self) -> str:
+        return sail.__version__
+
+    def load(self, batch_size) -> None:
+        """Load the compiled bmodel; the batch size is read from the model, not from ``batch_size``."""
+        log.warning("TPU Backend only support static batch_size now.")
+        self.bmodel_path = self.compiled_dir + self.configs["model"] + ".bmodel"
+        self.dev_id = 1
+        self.net = sail.nn.Engine(self.bmodel_path, self.dev_id)
+        # NOTE(review): was a second Engine; a Stream matches single_chip_test - confirm against the sail API.
+        self.stream = sail.nn.Stream(self.dev_id)
+        self.net_name = self.net.get_net_names()[0]
+        self.input_name = self.net.get_input_names(self.net_name)[0]
+        self.output_names = self.net.get_output_names(self.net_name)
+        self.input_shape = self.net.get_input_shapes(self.net_name, 0)[0]
+        self.output_shapes = self.net.get_output_shapes(self.net_name, 0)
+        self.batch_size = self.input_shape[0]
+        self.net_h = self.input_shape[2]
+        self.net_w = self.input_shape[3]
+
+    def get_loaded_batch_size(self) -> int:
+        return self.batch_size
+
+    def predict(self, data):
+        """Run one inference and return ``{output index: float32 ndarray}``; a dict input contributes only its first value."""
+        if isinstance(data, dict):
+            input_data = {0: next(iter(data.values()))}
+        else:
+            input_data = {0: data}
+
+        outputs = {idx: np.ndarray(shape=shape, dtype=np.float32) for idx, shape in enumerate(self.output_shapes)}
+        self.net.process(input_data, outputs, self.stream, self.net_name)
+        return outputs
+
+    def single_chip_test(self, dev_id, iter, thread_id):
+        """Worker body: queue ``iter`` async inferences on device ``dev_id`` and record the start/end times."""
+        net = sail.nn.Engine(self.bmodel_path, dev_id)
+        stream = sail.nn.Stream(dev_id)
+        net_name = net.get_net_names()[0]
+        input_shape = net.get_input_shapes(net_name, 0)[0]
+        output_shapes = net.get_output_shapes(net_name, 0)
+
+        random_input = np.random.rand(*input_shape).astype(np.float32)
+        input_data = {0: sail.nn.Tensor(random_input, sail.DataType.TPU_FLOAT32, dev_id)}
+        outputs = {idx: sail.nn.Tensor(shape, sail.DataType.TPU_FLOAT32, dev_id) for idx, shape in enumerate(output_shapes)}
+
+        start_time=time.time()
+        for _ in range(iter):
+            net.process_async(input_data, outputs, stream, net_name)
+        stream.sync()
+        end_time=time.time()
+
+        with self.lock:
+            self.min_time.value=min(self.min_time.value,start_time)
+            self.max_time.value=max(self.max_time.value,end_time)
+
+    def _run_benchmark(self, bs, iter):
+        """Run ``iter`` inferences in each worker process and return ``(qps, avg_latency, tail_latency)``."""
+        chip_num, core_num, start_chip =2, 1, 0
+        # Reset the shared window so timestamps left by an earlier run cannot widen this one.
+        self.min_time.value = float('inf')
+        self.max_time.value = -float('inf')
+        worker_args = [(chip_id+start_chip, iter, chip_id*core_num+core_id) for chip_id in range(chip_num) for core_id in range(core_num)]
+        thread_list = [multiprocessing.Process(target=self.single_chip_test, args=args) for args in worker_args]
+
+        log.info("Predict running...")
+        for thread in thread_list:
+            thread.start()
+        for thread in thread_list:
+            thread.join()
+        log.info("Predict finished")
+
+        total_time = self.max_time.value - self.min_time.value
+        frame_num = chip_num * core_num * iter
+        if total_time > 0 and frame_num > 0:
+            qps = frame_num / total_time
+            avg_latency = total_time / frame_num
+        else:
+            # No usable window, e.g. the workers exited before recording their timestamps.
+            log.error("Benchmark workers reported no valid timing window")
+            qps, avg_latency = 0.0, -1
+        tail_latency = -1
+        print(f'chip_num = {chip_num}, core_num = {core_num}, frame_num = {frame_num}, qps = {qps}')  
+        return qps, avg_latency, tail_latency
+
+    def benchmark(self, dataloader):
+        """Benchmark with random inputs (``dataloader`` is not read) and return the report dict."""
+        iterations = self.workload["iterations"]
+        qps, avg_latency, tail_latency = self._run_benchmark(self.batch_size, iterations*100)
+        return {
+            "BS": self.batch_size,
+            "QPS": int(qps),
+            "AVG Latency": avg_latency,
+            "P99 Latency": tail_latency,
+        }
\ No newline at end of file
diff --git a/byte_infer_perf/general_perf/backends/TPU/tpu.json b/byte_infer_perf/general_perf/backends/TPU/tpu.json
new file mode 100755
index 000000000..e69de29bb
diff --git a/byte_infer_perf/general_perf/backends/runtime_backend.py b/byte_infer_perf/general_perf/backends/runtime_backend.py
old mode 100644
new mode 100755
index db856b0bd..8bdd4e6db
--- a/byte_infer_perf/general_perf/backends/runtime_backend.py
+++ b/byte_infer_perf/general_perf/backends/runtime_backend.py
@@ -29,7 +29,7 @@ def version(self) -> str:
 
     def load(self, batch_size) -> str:
         """
-        Return runtime backend version details
+        Load the model for the current backend (must be overridden by each backend)
         """
         raise NotImplementedError("RuntimeBackend:load")
 
diff --git a/byte_infer_perf/general_perf/core/perf_engine.py b/byte_infer_perf/general_perf/core/perf_engine.py
old mode 100644
new mode 100755
index 231211e2c..fb1d761ef
--- a/byte_infer_perf/general_perf/core/perf_engine.py
+++ b/byte_infer_perf/general_perf/core/perf_engine.py
@@ -285,9 +285,14 @@ def get_model_info(self, model_name: str) -> Dict[str, Any]:
         return model_info
 
     def get_cpu_name(self):
-        command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
-        cpu_name = subprocess.check_output(command, shell=True)
-        return cpu_name.decode().strip()
+        """Return the CPU model name reported by ``lscpu``, or None if it cannot be determined."""
+        try:
+            lscpu_output = subprocess.check_output(["lscpu"], text=True)
+        except (OSError, subprocess.CalledProcessError) as e:  # OSError: lscpu is not installed
+            print(f"Command failed: {e}")
+            return None
+        model_lines = (line for line in lscpu_output.splitlines() if 'Model name' in line)
+        return next((line.partition(':')[2].strip() for line in model_lines), None)
 
     def check_interact_info(
             self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
diff --git a/byte_infer_perf/general_perf/launch.py b/byte_infer_perf/general_perf/launch.py
old mode 100644
new mode 100755
index 51cf30609..6b2afd6d4
--- a/byte_infer_perf/general_perf/launch.py
+++ b/byte_infer_perf/general_perf/launch.py
@@ -75,12 +75,13 @@ def main():
             subprocess.call([
                 'bash', 'general_perf/backends/CPU/calculate_cpu_diff.sh',
                 workload['model'],
-                str(workload['batch_sizes'][0])
+                str(workload['batch_sizes'][0]),
+                str(parsed_args.hardware_type)
             ])
 
         cmd = f'python3 general_perf/core/perf_engine.py --hardware_type {parsed_args.hardware_type} --task {parsed_args.task}'
         if parsed_args.compile_only:
-            cmd += '--compile_only'
+            cmd += ' --compile_only'
         exit_code = subprocess.call(cmd, shell=True)
         sys.exit(exit_code)
 
diff --git a/byte_infer_perf/llm_perf/backends/TPU/model_impl/__init__.py b/byte_infer_perf/llm_perf/backends/TPU/model_impl/__init__.py
new file mode 100644
index 000000000..d0f1edaca
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/model_impl/__init__.py
@@ -0,0 +1,21 @@
+## __all__ is a dict:
+##   key is the model_name used in `model_zoo/chatglm-xx.json`
+##   value is the vendor-specific model implementation
+# __all__ = {
+#     "chatglm" : ChatGLMForConditionalGeneration,
+#     "chatglm2" : ChatGLM2ForConditionalGeneration
+# }
+
+from typing import Dict, Tuple, Any
+
+import torch
+import torch.nn as nn
+import torch_tpu
+
+from .tpu_llama import TPULlama
+
+from llm_perf.utils.logger import logger
+
+__all__ = {
+    "llama3.1": TPULlama,
+}
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/model_impl/modeling_llama3.py b/byte_infer_perf/llm_perf/backends/TPU/model_impl/modeling_llama3.py
new file mode 100755
index 000000000..e77f8a659
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/model_impl/modeling_llama3.py
@@ -0,0 +1,1445 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch_tpu
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.utils import (
+    LossKwargs,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.models.llama.configuration_llama import LlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf"
+_CONFIG_FOR_DOC = "LlamaConfig"
+
+
+class LlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """RMS layer norm with a learned per-channel scale (equivalent to T5LayerNorm)."""
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # The statistics are computed in float32; the result is cast back to the
+        # input dtype before the learned scale is applied.
+        original_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(original_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
+
+
+class LlamaRotaryEmbedding(nn.Module):
+    """Produces the rotary-embedding cos/sin tensors for a batch of position ids."""
+
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[LlamaConfig] = None,
+    ):
+        super().__init__()
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.46"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            max_positions = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            scaling = config.rope_scaling
+            self.rope_type = "default" if scaling is None else scaling.get("rope_type", scaling.get("type"))
+            max_positions = config.max_position_embeddings
+        self.max_seq_len_cached = max_positions
+        self.original_max_seq_len = max_positions
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        initial_inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", initial_inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        Refresh `inv_freq` for dynamic RoPE variants:
+        1 - recompute it when the sequence grows beyond the cached length (allow scaling)
+        2 - restore the original one when the sequence is back within the original length
+        """
+        needed_len = torch.max(position_ids) + 1
+        if needed_len > self.max_seq_len_cached:  # growth
+            grown_inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=needed_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", grown_inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = needed_len
+
+        if needed_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`."""
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        freq_column = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_row = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        autocast_device = x.device.type
+        if not isinstance(autocast_device, str) or autocast_device == "mps":
+            autocast_device = "cpu"
+        with torch.autocast(device_type=autocast_device, enabled=False):
+            angles = (freq_column.float() @ position_row.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos(), table.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Return ``cat(-x2, x1)`` where ``x1``/``x2`` are the first/second half of the last dim."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary cos/sin tensors.
+
+    Args:
+        q (`torch.Tensor`): Query tensor.
+        k (`torch.Tensor`): Key tensor.
+        cos (`torch.Tensor`): Cosine part of the rotary embedding.
+        sin (`torch.Tensor`): Sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated; not read by this function.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`.
+            With `cos`/`sin` shaped [batch_size, seq_len, head_dim], use 1 when `q`/`k` are
+            [batch_size, heads, seq_len, head_dim] and 2 when they are
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query tensor and the rotated key tensor.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    def _rotate(tensor):
+        return (tensor * cos_b) + (rotate_half(tensor) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+class LlamaMLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = hidden = config.hidden_size
+        self.intermediate_size = inner = config.intermediate_size
+        self.gate_proj = nn.Linear(hidden, inner, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(hidden, inner, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(inner, hidden, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Repeat each KV head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    device = hidden_states.device
+    expanded = hidden_states[:, :, None, :, :]
+    if device.type == 'tpu':
+        # 'tpu' tensors are expanded on the CPU, then sent back to their source device (index included).
+        expanded = expanded.to('cpu')
+    expanded = expanded.expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return expanded.to(device).reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+class LlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with rotary embeddings and repeated KV heads"""
+
+    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout       # e.g. 0.0
+        self.hidden_size = config.hidden_size                   # e.g. 4096
+        self.num_heads = config.num_attention_heads             # e.g. 32
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)  # e.g. 128
+        self.num_key_value_heads = config.num_key_value_heads   # e.g. 8
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads per KV head, e.g. 4
+        self.max_position_embeddings = config.max_position_embeddings # e.g. 131072
+        self.rope_theta = config.rope_theta                     # e.g. 5e5
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)           # hidden_size -> num_heads * head_dim
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) # hidden_size -> num_key_value_heads * head_dim
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) # hidden_size -> num_key_value_heads * head_dim
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)           # num_heads * head_dim -> hidden_size
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)        # [bsz, q_len, num_heads * head_dim]
+        key_states = self.k_proj(hidden_states)          # [bsz, q_len, num_key_value_heads * head_dim]
+        value_states = self.v_proj(hidden_states)        # [bsz, q_len, num_key_value_heads * head_dim]
+
+        # Multi-Head Attention
+        # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)   # [bsz, num_heads, q_len, head_dim]
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)       # [bsz, num_key_value_heads, q_len, head_dim]
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)   # [bsz, num_key_value_heads, q_len, head_dim]
+
+        cos, sin = position_embeddings  # unpacking fails with TypeError when position_embeddings is None
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            # KV cache: the newly computed k/v are concatenated with this layer's earlier k/v along the q_len dimension
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups) # repeat along dim 1, num_key_value_groups times (e.g. 4)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # q: [bsz, num_heads, q_len, head_dim], k.transpose: [bsz, num_heads, head_dim, kv_len]
+        # output: [bsz, num_heads, q_len, kv_len], where kv_len = key_states.shape[-2]
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        # expected shape: [bsz, num_heads, q_len, head_dim]
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # [bsz, q_len, num_heads, head_dim]
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        # [bsz, q_len, num_heads * head_dim]
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        # [bsz, q_len, hidden_size]
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class LlamaFlashAttention2(LlamaAttention):
+    """
+    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
+    untouched. The only required change is in the forward pass, which needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        output_attentions = False  # the caller's value is overridden: this path never returns attention weights
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention expects the layout [batch_size, seq_length, num_heads, head_dim]. The tensors are first
+        # put in [batch_size, num_heads, seq_length, head_dim] for the rotary embedding and the cache update,
+        # and are transposed back to the flash-attention layout further down.
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings  # unpacking fails with TypeError when position_embeddings is None
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability reasons,
+        # so the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32. (LlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:  # always taken: output_attentions was forced to False above
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class LlamaSdpaAttention(LlamaAttention):
+    """
+    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from LlamaAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Attend via `scaled_dot_product_attention`; defers to `LlamaAttention.forward` if `output_attentions` is set."""
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+
+        # The kernel is evaluated on the CPU and the result is moved back to the source device.
+        # NOTE(review): this round-trip is unconditional, whereas `repeat_kv` only leaves the device for
+        # 'tpu' tensors -- confirm whether other device types should stay on-device here.
+        cur_device = query_states.device
+        # `causal_mask` is None when no attention mask was supplied; pass None through instead of calling `.to`.
+        cpu_mask = causal_mask.to('cpu') if causal_mask is not None else None
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states.to('cpu'),
+            key_states.to('cpu'),
+            value_states.to('cpu'),
+            attn_mask=cpu_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        # [bsz, num_heads, q_len, head_dim] -> [bsz, q_len, num_heads, head_dim], back on the source device
+        attn_output = attn_output.transpose(1, 2).contiguous().to(cur_device)
+        # merge the head dimensions: [bsz, q_len, num_heads * head_dim]
+        attn_output = attn_output.view(bsz, q_len, -1)
+
+        attn_output = self.o_proj(attn_output)
+
+        # attention weights are not produced on this path
+        return attn_output, None, past_key_value
+
+
+LLAMA_ATTENTION_CLASSES = {  # indexed with `config._attn_implementation` by LlamaDecoderLayer
+    "eager": LlamaAttention,
+    "flash_attention_2": LlamaFlashAttention2,
+    "sdpa": LlamaSdpaAttention,
+}
+
+
+class LlamaDecoderLayer(nn.Module):
+    """One decoder block: pre-norm self-attention and pre-norm MLP, each added back onto its input."""
+
+    def __init__(self, config: LlamaConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        # the attention variant is picked by the configured implementation name
+        attention_cls = LLAMA_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
+        self.mlp = LlamaMLP(config)
+
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Apply self-attention and then the MLP to `hidden_states`, each behind a residual connection.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or
+                `(batch_size, 1, query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor`, *optional*): passed through to the attention module.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                If `True`, the attention weights are appended to the returned tuple.
+            use_cache (`bool`, *optional*):
+                If `True`, the cache returned by the attention module is appended to the returned tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape
+                `(batch_size, seq_len, head_dim)`, `head_dim` being the per-head embedding dimension.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments; they are forwarded to the attention module.
+
+        Returns:
+            `(hidden_states,)`, followed by the attention weights and/or the cache when requested.
+        """
+        # Self-attention sub-layer: normalise, attend, add the result onto the layer input.
+        attn_out, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-layer: normalise, run the MLP, add the result onto its input.
+        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + mlp_out
+
+        # Optional extras follow the hidden states in a fixed order: attention weights, then cache.
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs = outputs + (self_attn_weights,)
+        if use_cache:
+            outputs = outputs + (present_key_value,)
+        return outputs
+
+
+LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`LlamaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+    config_class = LlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+
+    def _init_weights(self, module):
+        """Draw Linear/Embedding weights from N(0, initializer_range); zero biases and the padding row."""
+        std = self.config.initializer_range
+        is_linear = isinstance(module, nn.Linear)
+        if is_linear or isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+        if is_linear and module.bias is not None:
+            module.bias.data.zero_()
+        elif not is_linear and isinstance(module, nn.Embedding) and module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance, see our
+            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+    Args:
+        config: LlamaConfig
+    """
+
+    def __init__(self, config: LlamaConfig):
+        """Build the token embedding, the decoder stack, the final norm and the rotary embedding from `config`."""
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
+
+        self.gradient_checkpointing = False
+        if getattr(config, "pretraining_tp", 1) != 1:
+            logger.warning("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):  # returns the token-embedding module stored in `self.embed_tokens`
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):  # replaces the token-embedding module with `value`
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+            
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        hidden_states = inputs_embeds
+        
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                    **flash_attn_kwargs,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):  # returns the mask passed to the decoder layers, or None when no explicit mask is needed
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask  # keep the mask only when it actually contains a masked (0) entry
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward,
+        # so the explicit mask may only be skipped when attentions are not requested.
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        # Otherwise build an explicit mask sized to the key/value length.
+        dtype, device = input_tensor.dtype, input_tensor.device
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+        
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"  # workaround is applied for CUDA tensors only
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            # Start from a fully masked (sequence_length, target_length) grid filled with the dtype minimum.
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            
+            # Positions are compared as int32. NOTE(review): presumably a TPU backend limitation with int64; confirm.
+            causal_mask *= torch.arange(target_length, device=device).to(torch.int32) > cache_position.reshape(-1, 1).to(torch.int32)
+            # The in-place multiply zeroes (unmasks) every key at or before each query's cache position.
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0  # True where causally visible (0) but padded out (attention_mask 0)
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...  # type of the extra **kwargs taken by LlamaForCausalLM.forward
+
+
+class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.vocab_size = config.vocab_size  # e.g. 128256
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # e.g. 4096 -> 128256
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+        **kwargs: Unpack[KwargsForCausalLM],
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        
+        
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,      # e.g. [[0, 1, 2, 3, 4]]
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,    # e.g. None
+            use_cache=use_cache,            # e.g. True
+            output_attentions=output_attentions, # e.g. False
+            output_hidden_states=output_hidden_states, # e.g. False
+            return_dict=return_dict, # e.g. True
+            cache_position=cache_position, # e.g. [0, 1, 2, 3, 4]
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])  # `-0:` equals `0:`, so 0 keeps every position
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The LLaMa Model transformer with a sequence classification head on top (linear layer).
+
+    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    LLAMA_START_DOCSTRING,
+)
+class LlamaForSequenceClassification(LlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # scores for every position; one position per row is selected below
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1  # no padding token configured: use the last position
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1  # index before the first pad
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]  # maps -1 (no pad found) to the last index
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1  # padding cannot be located from inputs_embeds: use the last position
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+The Llama Model transformer with a span classification head on top for extractive question-answering tasks like
+SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    LLAMA_START_DOCSTRING,
+)
+class LlamaForQuestionAnswering(LlamaPreTrainedModel):
+    base_model_prefix = "transformer"  # the backbone is stored as `self.transformer` in this class
+
+    # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = LlamaModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)  # two outputs per token: start logit and end logit
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.transformer.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.transformer.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # split the size-2 last dimension into two tensors
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        loss = None
+        if start_positions is not None and end_positions is not None:
+            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
+
+        if not return_dict:
+            # NOTE(review): `[2:]` assumes index 1 holds the KV cache (caching enabled); confirm for use_cache=False.
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    LLAMA_START_DOCSTRING,
+)
+class LlamaForTokenClassification(LlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(config)
+        if getattr(config, "classifier_dropout", None) is not None:
+            classifier_dropout = config.classifier_dropout
+        elif getattr(config, "hidden_dropout", None) is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1  # fallback when the config sets neither dropout option
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. NOTE(review): the shape is inferred from the per-token `logits` handed to
+            `loss_function`; confirm against the loss implementation.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        sequence_output = self.dropout(sequence_output)
+        logits = self.score(sequence_output)  # scores are kept for every position (no pooling)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.config)
+
+        if not return_dict:
+            # NOTE(review): `[2:]` assumes index 1 holds the KV cache (caching enabled); confirm for use_cache=False.
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/byte_infer_perf/llm_perf/backends/TPU/model_impl/tpu_llama.py b/byte_infer_perf/llm_perf/backends/TPU/model_impl/tpu_llama.py
new file mode 100755
index 000000000..806e36601
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/model_impl/tpu_llama.py
@@ -0,0 +1,168 @@
+import os
+import pathlib
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from typing import Dict, Any
+from llm_perf.utils.logger import logger
+from llm_perf.utils.ps_utils import check_memory_usage
+from llm_perf.utils.dist_utils import check_dist
+
+from accelerate import init_empty_weights
+
+from llm_perf.backends.TPU.tpu_ckpt_loader import TpuCkptLoader
+from llm_perf.core.ckpt_loader import Llama_ModelLoader
+from transformers import LlamaConfig
+from transformers.cache_utils import DynamicCache, StaticCache
+from .modeling_llama3 import LlamaForCausalLM
+
+
+class TPULlamaLoader(TpuCkptLoader):  # loads (optionally TP-split) Llama weights into a LlamaForCausalLM
+    def __init__(
+        self,
+        model : LlamaForCausalLM,
+        model_config : LlamaConfig,
+        ckpt_path : str = ""
+    ):
+        mp_size = int(os.environ.get("WORLD_SIZE", "1"))
+        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        # Parallel size and rank come from the environment; defaults describe a single process.
+        super().__init__("", model, mp_size, local_rank, ckpt_path)
+        self.model_config = model_config
+
+    def parallel_loader(self):  # fills self.state_dict; raises FileNotFoundError if the directory is missing
+        self.state_dict = {}
+
+        model_dir = pathlib.Path(self.ckpt_path).absolute()
+        if not model_dir.is_dir():
+            # Fail fast: returning with an empty state_dict only defers the failure
+            # to an opaque KeyError in infusion_to_model().
+            raise FileNotFoundError(f"{model_dir} not exists or is not a directory")
+
+        split_model_dir = model_dir.joinpath(f"TP{self.mp_size}")
+        if not split_model_dir.exists() and self.mp_size == 1:
+            # Unsplit checkpoint: a single process reads the root directory directly.
+            real_model_dir = model_dir
+        elif not split_model_dir.is_dir():
+            # Any other layout needs a `TP<mp_size>` directory produced by the split step.
+            msg = f"{split_model_dir} not exists or is not a directory, please split model first."
+            raise FileNotFoundError(msg)
+        else:
+            real_model_dir = split_model_dir / f"device_{self.mp_rank}"
+
+        model_loader = Llama_ModelLoader(real_model_dir)
+        self.state_dict = model_loader.load_weight()
+
+    def infusion_to_model(self):  # assign each loaded tensor (via to_parameter) to the matching model weight
+        self.model.model.embed_tokens.weight = self.to_parameter(self.state_dict["model.embed_tokens.weight"])
+        for i in range(self.model_config.num_hidden_layers):
+            self.model.model.layers[i].input_layernorm.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.input_layernorm.weight"])
+            # attention projections
+            self.model.model.layers[i].self_attn.q_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.self_attn.q_proj.weight"])
+            self.model.model.layers[i].self_attn.k_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.self_attn.k_proj.weight"])
+            self.model.model.layers[i].self_attn.v_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.self_attn.v_proj.weight"])
+            self.model.model.layers[i].self_attn.o_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.self_attn.o_proj.weight"])
+
+            self.model.model.layers[i].post_attention_layernorm.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.post_attention_layernorm.weight"])
+            # MLP projections
+            self.model.model.layers[i].mlp.gate_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.mlp.gate_proj.weight"])
+            self.model.model.layers[i].mlp.up_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.mlp.up_proj.weight"])
+            self.model.model.layers[i].mlp.down_proj.weight = self.to_parameter(self.state_dict[f"model.layers.{i}.mlp.down_proj.weight"])
+        # final norm and output head
+        self.model.model.norm.weight = self.to_parameter(self.state_dict["model.norm.weight"])
+        self.model.lm_head.weight = self.to_parameter(self.state_dict["lm_head.weight"])
+
+
+class TPULlama(nn.Module):  # wraps LlamaForCausalLM together with its weight loading and KV cache for the TPU backend
+    def __init__(self, xpu_cfg: Dict[str, Any]) -> None:
+        super().__init__()
+
+        self.xpu_cfg = xpu_cfg
+        self.model_config = xpu_cfg["model_config"]
+
+        self.model_name = self.model_config["model_name"]
+        self.model_path = self.model_config["model_path"]
+        self.model_network = self.model_config["network"]
+
+        self.llama_config : LlamaConfig = LlamaConfig(**self.model_network)
+        # The `network` section is expanded directly into LlamaConfig keyword arguments.
+
+        # Parallel size and rank come from the environment; defaults describe a single process.
+        self.mp_size = int(os.environ.get("WORLD_SIZE", "1"))
+        self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+
+        self.transformer_model : LlamaForCausalLM = None  # built later by init_inference()
+
+    
+    def init_inference(self):
+        torch.tpu.set_device(self.local_rank)
+
+        if self.mp_size > 1:
+            logger.info(f"RANK: {self.local_rank} {self.mp_size} init_process_group...")
+            dist.init_process_group(
+                backend="sccl",
+                world_size=self.mp_size,
+                rank=self.local_rank  # NOTE(review): LOCAL_RANK is used as the group rank; presumably single-node only
+            )
+            check_dist()
+
+        # check_memory_usage("Begin")
+        # Build the module under init_empty_weights(); the real tensors are attached by load_weight() below.
+        with init_empty_weights():
+            self.transformer_model = LlamaForCausalLM(self.llama_config).to(self.llama_config.torch_dtype).eval()
+
+        # check_memory_usage("After build model")
+
+        self.load_weight(self.model_path)
+        
+        # check_memory_usage("After load_weight")
+
+        self.transformer_model.tpu()
+
+        # check_memory_usage("After model to device")
+        # The KV cache is created once here and handed to the model on every forward() call.
+        self.kv_cache = self.init_kvcache(self.llama_config.torch_dtype)
+
+        if self.mp_size > 1:
+            dist.barrier()
+
+    def finalize_inference(self):
+        if self.mp_size > 1 and dist.is_initialized():
+            dist.destroy_process_group()
+
+    def load_weight(self, ckpt_path):
+        p_loader = TPULlamaLoader(self.transformer_model, self.llama_config, ckpt_path)
+        p_loader.parallel_loader()
+        p_loader.infusion_to_model()
+
+    def init_kvcache(self, dtype):
+        max_batch_size = self.xpu_cfg["max_batch_size"]
+        cur_device = self.transformer_model.device
+        # NOTE(review): 4096 below is hard-coded (presumably the maximum cache length); confirm it matches the
+        # served context length. max_batch_size is passed twice, presumably for both StaticCache batch arguments.
+        cache = StaticCache(self.llama_config,
+                            max_batch_size,
+                            4096,
+                            torch.device('cpu'), # created on CPU: torch.zeros does not support bf16 on TPU yet
+                            dtype,
+                            max_batch_size).to(cur_device)
+        return cache
+    
+
+    def forward(self, inputs : Dict[str, torch.Tensor]):
+        # inputs = inputs.to(torch.int32)
+        model_outputs = self.transformer_model.forward(
+            **inputs, 
+            past_key_values=self.kv_cache
+            # past_key_values=None
+        )
+        # context: [1, seq_len] --> [1, seq_len, vocab_size] or [1, 1, vocab_size]
+        # decode: [max_batch_size, 1]
+        logits = model_outputs.logits
+
+        output_dict = {
+            "logits": logits
+        }
+        return output_dict
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/setup.py b/byte_infer_perf/llm_perf/backends/TPU/setup.py
new file mode 100644
index 000000000..84aa5154c
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/setup.py
@@ -0,0 +1,61 @@
+import torch
+import importlib
+from typing import Any, Dict
+
+from llm_perf.core.scheduler import CoreScheduler
+from llm_perf.backends.TPU.tpu_inferencer import TpuInferencer
+from llm_perf.backends.TPU.tpu_sampler import TpuSampler
+from llm_perf.backends.TPU.tpu_scheduler import TpuScheduler
+from llm_perf.backends.TPU.tpu_mp_engine import TpuMpEngine
+from llm_perf.utils.logger import logger
+
+
+
+def get_engine(xpu_cfg) -> TpuMpEngine:
+    """Create a ``TpuMpEngine`` for the model named in ``xpu_cfg``.
+
+    The model implementation is obtained by importing
+    ``llm_perf.backends.<hardware_type>.model_impl`` and indexing its
+    ``__all__`` with ``model_config["model_name"]``; world size is ``tp_size``.
+    """
+    hardware_type = xpu_cfg["hardware_type"]
+    model_name = xpu_cfg["model_config"]["model_name"]
+
+    # Importing "." relative to a package yields the package module itself.
+    impl_package = f"llm_perf/backends/{hardware_type}/model_impl".replace("/", ".")
+    impl_module = importlib.import_module(".", package=impl_package)
+    model_cls = impl_module.__all__[model_name]
+
+    tp_size = xpu_cfg["tp_size"]
+    return TpuMpEngine(
+        world_size=tp_size, model_impl=model_cls, xpu_cfg=xpu_cfg
+    )
+
+
+
+def setup_scheduler(xpu_cfg) -> CoreScheduler:
+    """Assemble the TPU scheduler (inferencer + sampler) described by ``xpu_cfg``.
+
+    Reads ``hardware_type`` and ``model_config["model_name"]`` from ``xpu_cfg``,
+    imports ``llm_perf.backends.<hardware_type>.model_impl`` and indexes its
+    ``__all__`` with the model name to obtain the model implementation.
+
+    Returns:
+        A ``TpuScheduler`` wired to a fresh ``TpuInferencer`` and ``TpuSampler``.
+    """
+    hardware_type = xpu_cfg["hardware_type"]
+    model_name = xpu_cfg["model_config"]["model_name"]
+
+    # Importing "." relative to a package yields the package module itself.
+    package_name = f"llm_perf/backends/{hardware_type}/model_impl".replace("/", ".")
+    model_impl = importlib.import_module(".", package=package_name).__all__[model_name]
+
+    # Build the pipeline pieces in the same order as before: inferencer, then sampler.
+    tpu_inferencer = TpuInferencer(model_impl, xpu_cfg)
+    tpu_sampler = TpuSampler()
+
+    return TpuScheduler(
+        inferencer=tpu_inferencer,
+        sampler=tpu_sampler,
+        xpu_cfg=xpu_cfg,
+    )
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/tpu_ckpt_loader.py b/byte_infer_perf/llm_perf/backends/TPU/tpu_ckpt_loader.py
new file mode 100644
index 000000000..dbdf8a8bc
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/tpu_ckpt_loader.py
@@ -0,0 +1,50 @@
+import torch
+import torch_tpu
+import torch.distributed as dist
+
+from llm_perf.core.ckpt_loader import CoreCkptLoader
+
+class TpuCkptLoader(CoreCkptLoader):
+    """Checkpoint loader that stages weights on TPU and shares them across ranks."""
+
+    def __init__(self, prefix, model, mp_size=1, mp_rank=0, ckpt_path: str=""):
+        super().__init__(prefix, model, mp_size, mp_rank, ckpt_path)
+
+    def weight_to_device(self, weight: torch.Tensor, non_blocking=False):
+        """Copy ``weight`` to TPU on rank 0; other ranks get an empty TPU buffer."""
+        if self.mp_rank == 0:
+            return weight.tpu(non_blocking=non_blocking)
+        # Bug fix: this used to call ``torch.emtpy_like``, which does not exist.
+        cur_device = torch.tpu.current_device()
+        return torch.empty_like(weight, device=f"tpu:{cur_device}")
+
+    def broadcast_weight(self, key, device='cpu', non_blocking=False):
+        """Broadcast ``self.state_dict[key]`` from rank 0 (``device`` is unused)."""
+        entry = self.state_dict[key]
+        if self.mp_rank != 0:
+            # Non-zero ranks only hold shape/dtype metadata for the key.
+            tensor = torch.empty(entry["shape"], dtype=entry["dtype"])
+        else:
+            tensor = entry.cpu()
+        tensor_tpu = self.weight_to_device(tensor, non_blocking=non_blocking)
+        dist.broadcast(tensor_tpu, src=0)
+        self.state_dict[key] = tensor_tpu
+
+    def scatter_weight(self, key, dim, split_mode='default', outter=1, device='cpu', non_blocking=False):
+        """Broadcast the full weight for ``key``, then keep only this rank's shard.
+
+        ``split_mode`` is 'default', 'with_outter' or 'split_outter'; any other
+        value raises AssertionError (now also when asserts are disabled by -O).
+        """
+        self.broadcast_weight(key, non_blocking=non_blocking)
+        weight = self.state_dict[key]
+        if split_mode == 'default':
+            weight_split = self.split(weight, dim)
+        elif split_mode == 'with_outter':
+            weight_split = self.with_outter_split(weight, dim, outter)
+        elif split_mode == 'split_outter':
+            weight_split = self.split(weight, dim, outter)
+        else:
+            raise AssertionError(f"unknown split mode {split_mode}")
+        weight_split = [x.contiguous() for x in weight_split]
+        self.state_dict[key] = weight_split[self.mp_rank]
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/tpu_inferencer.py b/byte_infer_perf/llm_perf/backends/TPU/tpu_inferencer.py
new file mode 100644
index 000000000..7b30a71f4
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/tpu_inferencer.py
@@ -0,0 +1,129 @@
+from dataclasses import dataclass
+from typing import Dict, Iterable, List
+
+from llm_perf.core.generation import GenerateRequest
+from llm_perf.core.inferencer import CoreInferencer
+from llm_perf.backends.TPU.tpu_mp_engine import TpuMpEngine
+from llm_perf.utils.logger import logger
+
+class TpuInferencer(CoreInferencer):
+    """Turns scheduler tasks into model inputs and runs them on the TPU engine."""
+
+    def __init__(self, model_impl, xpu_cfg) -> None:
+        """Read the parallel/batch settings from ``xpu_cfg`` and create the engine.
+
+        ``xpu_cfg`` must provide ``tp_size``, ``pad_token_id`` and
+        ``max_batch_size``; ``model_impl`` is handed to ``TpuMpEngine`` unchanged.
+        """
+        super().__init__()
+
+        self.tp_size = xpu_cfg["tp_size"]
+        self.pad_token_id = xpu_cfg["pad_token_id"]
+        self.max_batch_size = xpu_cfg["max_batch_size"]
+        self.mp_engine = TpuMpEngine(self.tp_size, model_impl, xpu_cfg)
+
+    def prepare_inputs(
+        self,
+        tasks: List[CoreInferencer.Task],
+        **kwargs
+    ):
+        """Build the plain-Python input dict passed to ``self.mp_engine.mp_forward``.
+
+        Keyword Args:
+            is_context: truthy for the context phase, in which only ``tasks[0]``
+                is used; defaults to ``False`` (decode phase).
+            valid_slot_ids: slot indices occupied by ``tasks``; defaults to
+                ``[0, ..., self.max_batch_size - 1]``.
+
+        Context phase: one row made of the whole prompt of ``tasks[0]``, with
+        positions ``0..q_len-1`` and an all-ones attention mask.
+
+        Decode phase: one row per entry of ``tasks`` with ``q_len == 1``. A real
+        task contributes its last generated token; a ``None`` entry is treated
+        as an empty slot and becomes a padding row (pad token, position 0,
+        mask 0, kv_len 1).
+
+        Returns:
+            dict with the keys ``input_ids``, ``position_ids``,
+            ``attention_mask``, ``all_q_len``, ``all_kv_len``, ``is_context``,
+            ``valid_slot_ids`` and ``get_input_logits``. The first five hold
+            nested Python lists / lists of ints, not tensors.
+        """
+        is_context = kwargs.get("is_context", False)
+        valid_slot_ids = kwargs.get(
+            "valid_slot_ids", list(range(self.max_batch_size))
+        )
+
+        # Logits for the input are requested as soon as one task asks for them.
+        # ``None`` entries (empty decode slots) carry no request and are skipped;
+        # previously they raised AttributeError here although the decode branch
+        # below explicitly supports them.
+        get_input_logits = any(
+            task is not None and task.request.generate_config.get_input_logits
+            for task in tasks
+        )
+
+        input_dict = {
+            "input_ids": None,
+            "position_ids": None,
+            "attention_mask": None,
+            "all_q_len": None,
+            "all_kv_len": None,
+            "is_context": is_context,
+            "valid_slot_ids": valid_slot_ids,
+            "get_input_logits": get_input_logits,
+        }
+
+        if is_context:
+            prompt_ids = tasks[0].request.input_ids
+            q_len = len(prompt_ids)
+
+            input_dict["input_ids"] = [prompt_ids]
+            input_dict["position_ids"] = [list(range(q_len))]
+            input_dict["attention_mask"] = [[1] * q_len]
+            input_dict["all_q_len"] = [q_len]
+            # In the context phase q_len and kv_len are both the prompt length.
+            input_dict["all_kv_len"] = [q_len]
+            return input_dict
+
+        all_input_ids = []
+        all_position_ids = []
+        all_attention_mask = []
+        all_kv_len = []
+        for task in tasks:
+            if task is None:
+                # Empty slot: emit a padding row with attention mask 0.
+                token, position, mask, kv_len = self.pad_token_id, 0, 0, 1
+            else:
+                # kv_len counts the prompt plus all generated tokens except the
+                # newest one, which is the query token of this step.
+                kv_len = len(task.request.input_ids) + len(task.generate_ids) - 1
+                token, position, mask = task.generate_ids[-1], kv_len, 1
+            all_input_ids.append([token])
+            all_position_ids.append([position])
+            all_attention_mask.append([mask])
+            all_kv_len.append(kv_len)
+
+        input_dict["input_ids"] = all_input_ids
+        input_dict["position_ids"] = all_position_ids
+        input_dict["attention_mask"] = all_attention_mask
+        input_dict["all_q_len"] = [1] * len(all_kv_len)  # decode feeds one token per row
+        input_dict["all_kv_len"] = all_kv_len
+        return input_dict
+
+    def infer(
+        self,
+        tasks: List[CoreInferencer.Task],
+        **kwargs
+    ):
+        """Run one forward step for ``tasks`` and return its logits.
+
+        Returns:
+            dict with ``logits`` (as returned by the engine) and ``last_logits``,
+            the contiguous slice ``logits[:, -1, :]`` for the final position.
+        """
+        input_dict = self.prepare_inputs(tasks, **kwargs)
+        output_dict = self.mp_engine.mp_forward(input_dict)
+
+        logits = output_dict["logits"]
+        return {"logits": logits, "last_logits": logits[:, -1, :].contiguous()}
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/tpu_mp_engine.py b/byte_infer_perf/llm_perf/backends/TPU/tpu_mp_engine.py
new file mode 100644
index 000000000..55c0ca52f
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/tpu_mp_engine.py
@@ -0,0 +1,187 @@
+import os
+import sys
+import time
+import signal
+import pathlib
+from multiprocessing import Queue
+from typing import List
+from abc import ABC
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+import torch_tpu
+
+from llm_perf.core.mp_engine import CoreMpEngine
+from llm_perf.utils.logger import logger
+
+
+# context: 
+#   input_ids: [1, s_q]
+#   attention_mask = [1, s_q]
+#   full_attention_mask = [1, 1, s_q, s_kv] (sq == s_kv)
+def get_context_masks(
+    input_ids : torch.Tensor,
+    padding_mask : torch.Tensor
+):
+    """Build the boolean attention mask for the context phase.
+
+    Both ``input_ids`` and ``padding_mask`` are expected as [1, q_len]. The
+    result is a [1, 1, q_len, q_len] bool tensor in which entry (i, j) is True
+    exactly when ``padding_mask[j] + 1 - padding_mask[i] < 0.5``; for a 0/1
+    mask that is: query i is a real token and key j is padding. No causal
+    (lower-triangular) masking is applied here.
+    """
+    _, q_len = input_ids.shape
+    # Column j carries padding_mask[j]; shape [1, q_len, q_len].
+    scores = torch.ones(1, q_len, q_len, device=input_ids.device)
+    scores = scores * padding_mask.unsqueeze(1)
+    # Add 1 to every row whose query token is padding (padding_mask[i] == 0).
+    scores -= padding_mask.unsqueeze(-1) - 1
+    return (scores < 0.5).bool().unsqueeze(1)
+
+# decode
+#   input_ids: [bs, 1]
+#   attention_mask = [bs, 1]
+#   full_attention_mask = [bs, 1, 1, s_kv]
+def get_decode_masks(
+    input_ids : torch.Tensor,
+    all_kv_len: List[int]
+):
+    """Build the boolean attention mask for the decode phase.
+
+    ``input_ids`` is expected as [batch_size, q_len] (q_len is 1 when decoding)
+    and ``all_kv_len[i]`` is the KV length of row i. Each row is valid for its
+    first ``q_len + all_kv_len[i]`` positions and padded up to the longest row.
+
+    Returns a [batch_size, 1, 1, q_len + max(all_kv_len)] bool tensor that is
+    True on the padded tail positions and False on the valid ones.
+    """
+    batch_size, q_len = input_ids.shape
+    max_qkv_len = q_len + max(all_kv_len)
+
+    valid_lens = [q_len + all_kv_len[row] for row in range(batch_size)]
+    # 1 marks a valid position, 0 the right-padding up to max_qkv_len.
+    rows = [[1] * n + [0] * (max_qkv_len - n) for n in valid_lens]
+
+    padding_mask = torch.tensor(rows, device=input_ids.device)
+    return (padding_mask < 0.5).bool().unsqueeze(1).unsqueeze(1)
+
+
+class TpuMpEngine(CoreMpEngine):
+    """Multi-process engine whose per-rank workers run the model on TPU."""
+
+    def __init__(self, world_size: int, model_impl: nn.Module, xpu_cfg) -> None:
+        super().__init__(world_size, model_impl, xpu_cfg)
+
+    def build_inputs(self, forward_inputs):
+        """Convert the list-based inputs to int32 TPU tensors and add the mask.
+
+        ``forward_inputs`` is modified in place and also returned. A
+        ``full_attention_mask`` entry is added, built by ``get_context_masks``
+        when ``is_context`` is truthy and by ``get_decode_masks`` otherwise.
+        """
+        # list --> torch.Tensor --> tpu
+        for field in ("input_ids", "position_ids", "attention_mask"):
+            as_tensor = torch.tensor(forward_inputs[field])
+            forward_inputs[field] = as_tensor.to(torch.int32).tpu()
+
+        token_ids = forward_inputs["input_ids"]
+        if forward_inputs["is_context"]:
+            mask = get_context_masks(token_ids, forward_inputs["attention_mask"])
+        else:
+            mask = get_decode_masks(token_ids, forward_inputs["all_kv_len"])
+        forward_inputs["full_attention_mask"] = mask
+        return forward_inputs
+
+    @torch.no_grad()
+    def mp_loop_worker(
+        self,
+        local_rank: int,
+        world_size: int,
+        input_queue: Queue,
+        output_queue: Queue,
+        model_impl,
+        xpu_config
+    ):
+        """Worker loop of one rank: build the model, then serve forward requests.
+
+        After exporting the rank variables and constructing ``model_impl``, the
+        worker puts "ready" on ``output_queue`` and then blocks on
+        ``input_queue`` forever. Each request is a 1-tuple holding the input
+        dict; its optional ``log`` / ``workspace`` entries enable a per-rank
+        ``run.log`` file that is passed to the model as ``log_file``.
+
+        Every rank runs the forward pass, but only rank 0 puts the output dict
+        (with ``duration_ms`` added and ``logits`` moved to CPU) on
+        ``output_queue``. Any exception ends the loop and is reported by putting
+        a ``RuntimeError`` on ``output_queue``.
+        """
+        try:
+            torch.manual_seed(1)
+            # set rank and world_size
+            rank_str, size_str = str(local_rank), str(world_size)
+            os.environ["RANK"] = os.environ["LOCAL_RANK"] = rank_str
+            os.environ["WORLD_SIZE"] = os.environ["LOCAL_WORLD_SIZE"] = size_str
+
+            # create and init model based on model_impl and xpu_config
+            model = model_impl(xpu_config)
+            if hasattr(model, 'init_inference'):
+                model.init_inference()
+
+            def signal_handler(signum, frame):
+                logger.info(f"rank {local_rank} received signal {signum}, exiting...")
+                if hasattr(model, 'finalize_inference'):
+                    model.finalize_inference()
+                os._exit(0)
+
+            for handled_signal in (signal.SIGINT, signal.SIGTERM):
+                signal.signal(handled_signal, signal_handler)
+
+            # current rank is ready
+            output_queue.put("ready", block=True)
+            logger.info(f"{local_rank}/{world_size} rank is ready")
+
+            # model process loop
+            while True:
+                (forward_inputs,) = input_queue.get(block=True)
+                log = forward_inputs.get("log", False)
+                workspace = forward_inputs.get("workspace", None)
+                log_to_file = log and workspace is not None
+                forward_inputs["log_file"] = None
+                if log_to_file:
+                    rank_dir = workspace / f"rank_{local_rank}"
+                    rank_dir.mkdir(exist_ok=True, parents=True)
+                    forward_inputs["log_file"] = open(rank_dir / "run.log", "w")
+
+                inputs_dict = self.build_inputs(forward_inputs)
+                # Time the forward pass only; synchronize before stopping the clock.
+                start_time = time.perf_counter_ns()
+                output_dict = model.forward(inputs_dict)
+                torch.tpu.synchronize()
+                elapsed_ns = time.perf_counter_ns() - start_time
+                output_dict["duration_ms"] = round(elapsed_ns / 1e6, 3)
+
+                output_dict["logits"] = output_dict["logits"].to('cpu')
+                # TP realization: rank0 send result back to main process
+                if local_rank == 0:
+                    output_queue.put(output_dict)
+                if log_to_file:
+                    forward_inputs["log_file"].close()
+
+        except Exception as e:
+            logger.exception(f"[BUG] engine _load_and_listen failed, no more requests will be handled. {e}")
+            output_queue.put(RuntimeError("[BUG] fatal exception in model subprocess"))
+
+    def mp_forward(self, *args):
+        """Send ``args`` to every worker and return the first result put back.
+
+        extra args (from the original notes):
+          workspace: pathlib.Path, where to save files for each rank
+          log: bool, whether to save logs to file
+          override_hidden_states: bool, whether to override hidden_states
+          random_seed: int, random seed for torch.manual_seed
+        """
+        for _ in range(self.world_size):
+            self._input_queues.put(args, block=True)
+        return self._output_queues.get(block=True)
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/backends/TPU/tpu_sampler.py b/byte_infer_perf/llm_perf/backends/TPU/tpu_sampler.py
new file mode 100644
index 000000000..9502664bf
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/tpu_sampler.py
@@ -0,0 +1,155 @@
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+import torch_tpu
+
+from llm_perf.core.generation import GenerateResult
+from llm_perf.core.inferencer import CoreInferencer
+from llm_perf.core.sampler import CoreSampler
+
+from llm_perf.utils.logger import logger
+
+
+class TpuSampler(CoreSampler):
+    """Sampler for the TPU backend; only greedy decoding (top_k == 1) is implemented."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def sample(
+        self,
+        tasks: List[CoreInferencer.Task],
+        logits: torch.FloatTensor
+    ) -> Tuple[List[int], torch.FloatTensor]:
+        """Pick the next token for every task from ``logits``.
+
+        A sampling setting (top_p / top_k / temperature) is passed as ``None``
+        when every task uses its neutral value (1.0 / 0 / 1.0 respectively).
+
+        Returns:
+            ``(next_tokens, softmax_out)``: the sampled token ids as a Python
+            list, and the softmax of the logits (the "aux data"). The return
+            annotation previously claimed a bare ``List[int]``.
+        """
+        configs = [task.request.generate_config for task in tasks]
+
+        top_p = [cfg.top_p for cfg in configs]
+        if all(p == 1.0 for p in top_p):
+            top_p = None
+
+        top_k = [cfg.top_k for cfg in configs]
+        if all(k == 0 for k in top_k):
+            top_k = None
+
+        temperature = [cfg.temperature for cfg in configs]
+        if all(t == 1.0 for t in temperature):
+            temperature = None
+
+        # A falsy eos_token_id (e.g. None or 0) is passed on as -1.
+        eos_token_id = [cfg.eos_token_id or -1 for cfg in configs]
+
+        # Sequence inputs, repetition penalty and EOS masking are not used here.
+        next_tokens, softmax_out = self._sample(
+            logits.float(),
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            input_ids=None,
+            cu_seqlens=None,
+            max_seqlens=0,
+            repetition_penalty=None,
+            mask_eos_token=None,
+            min_tokens_to_keep=1,
+            eos_token_id=eos_token_id,
+        )
+
+        # The aux_data is softmax_out here
+        return next_tokens.tolist(), softmax_out
+
+    def _sample(
+        self,
+        logits: torch.FloatTensor,
+        temperature: Union[List[float], torch.FloatTensor] = None,
+        top_k: Union[List[int], torch.IntTensor] = None,
+        top_p: Union[List[float], torch.FloatTensor] = None,
+        input_ids: Union[List[int], torch.IntTensor] = None,
+        cu_seqlens: Union[List[int], torch.IntTensor] = None,
+        repetition_penalty: Union[List[float], torch.FloatTensor] = None,
+        mask_eos_token: Union[List[int], torch.IntTensor] = None,
+        min_tokens_to_keep: int = 1,
+        eos_token_id: int = 0,
+        max_seqlens: int = 0,
+    ) -> Tuple[List[int], torch.FloatTensor]:
+        """Greedy sampling; every other strategy is rejected.
+
+        Only ``logits``, ``top_k`` and ``top_p`` are inspected; the other
+        parameters are accepted but ignored.
+
+        Returns the argmax and the softmax of ``logits`` over the last dim.
+
+        Raises:
+            AssertionError: ``top_k`` is given but differs within the batch.
+            RuntimeError: neither ``top_k`` nor ``top_p`` is given.
+            NotImplementedError: the request is not greedy (top_k != 1, or
+                only top_p is given).
+        """
+        if top_k:
+            assert all(
+                k == top_k[0] for k in top_k
+            ), f"expect the same batch top_k, but got {top_k}"
+            is_greedy = all(k == 1 for k in top_k)
+        elif top_p:
+            # Random (top-p) sampling is recognised but not implemented yet.
+            is_greedy = False
+        else:
+            raise RuntimeError(
+                f"Unsupported sample strategy, parameter top_k: {top_k} top_p: {top_p}"
+            )
+
+        if not is_greedy:
+            raise NotImplementedError
+        return torch.argmax(logits, dim=-1), torch.nn.functional.softmax(logits, dim=-1)
+
+    def postprocess(
+        self,
+        tasks: List[CoreInferencer.Task],
+        infer_outputs: Dict[str, torch.FloatTensor],
+        next_tokens: List[int],
+    ) -> List[GenerateResult]:
+        """Wrap each sampled token in a ``GenerateResult`` with its finish reason.
+
+        ``finish_reason`` is "stop" when the token equals the task's
+        ``eos_token_id`` and at least ``min_new_tokens`` tokens exist (counting
+        this one), "max_length" when a non-EOS token reaches
+        ``max_new_tokens``, and "" otherwise. Logits are attached only for
+        tasks whose config sets ``get_input_logits``.
+        """
+        generate_result = []
+        for i, task in enumerate(tasks):
+            token_id = next_tokens[i]
+            cfg = task.request.generate_config
+
+            # take current generated token into account
+            generate_tokens_len = len(task.generate_ids) + 1
+
+            if token_id == cfg.eos_token_id:
+                # EOS before min_new_tokens does not finish the task.
+                finish_reason = "" if generate_tokens_len < cfg.min_new_tokens else "stop"
+            elif generate_tokens_len >= cfg.max_new_tokens:
+                finish_reason = "max_length"
+            else:
+                finish_reason = ""
+
+            want_logits = cfg.get_input_logits
+            gen_res = GenerateResult(
+                token_id=token_id,
+                finish_reason=finish_reason,
+                wait_time=task.wait_time[-1],
+                model_time=task.model_time[-1],
+                post_process_time=task.post_process_time[-1],
+                logits=infer_outputs["logits"][i].float().cpu() if want_logits else None,
+                last_logits=infer_outputs["last_logits"][i].float().cpu() if want_logits else None,
+            )
+            generate_result.append(gen_res)
+
+        return generate_result
diff --git a/byte_infer_perf/llm_perf/backends/TPU/tpu_scheduler.py b/byte_infer_perf/llm_perf/backends/TPU/tpu_scheduler.py
new file mode 100644
index 000000000..2bc97e9e3
--- /dev/null
+++ b/byte_infer_perf/llm_perf/backends/TPU/tpu_scheduler.py
@@ -0,0 +1,139 @@
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Set
+
+import torch
+import torch_tpu
+
+from llm_perf.core.scheduler import CoreScheduler
+from llm_perf.core.inferencer import CoreInferencer
+from llm_perf.core.sampler import CoreSampler
+from llm_perf.backends.TPU.tpu_inferencer import TpuInferencer
+from llm_perf.utils.logger import logger
+
+class TpuScheduler(CoreScheduler):
+    """Slot-based scheduler for the TPU backend.
+
+    Up to ``max_batch_size`` tasks are held in numbered slots. New tasks are
+    run through the context phase one at a time; once no context work is
+    pending, all occupied slots are decoded together.
+    """
+
+    def __init__(
+        self,
+        inferencer: CoreInferencer,
+        sampler: CoreSampler,
+        xpu_cfg
+    ) -> None:
+        """Pass ``inferencer`` and ``sampler`` to the base class and remember
+        ``xpu_cfg["max_batch_size"]``, which is the number of task slots."""
+        super().__init__(inferencer, sampler)
+        self.max_batch_size = xpu_cfg["max_batch_size"]
+
+
+    @torch.inference_mode()
+    def scheduler_loop(self):
+        """Admit tasks into slots and advance them one model step per iteration.
+
+        Runs while ``self.started`` is truthy. Each iteration:
+
+        1. Moves queued tasks from ``self.task_queue`` into free slots (lowest
+           free index first) and queues those slots for the context phase.
+        2. If every slot is free, waits up to 0.1 s on the queue's
+           ``not_empty`` condition and starts the next iteration.
+        3. Picks the batch: the oldest slot awaiting context, or - when none
+           is waiting - every occupied slot (decode phase).
+        4. Runs inference, sampling and post-processing for that batch, stores
+           one result per task and finishes tasks whose result has a
+           non-empty ``finish_reason``.
+        5. Returns the slots of finished tasks to the free list.
+
+        Task timestamps are recorded via ``update_st`` with the marks
+        "model_start", "model_end" and "process_end".
+
+        No exception is handled here: errors raised by the inferencer or the
+        sampler propagate to the caller.
+        """
+        # task_slots[i] is the task held in slot i, or None while the slot is free.
+        task_slots: List[CoreInferencer.Task] = [None] * self.max_batch_size
+        # Descending order, so that pop() hands out the lowest free index.
+        avail_slots: List[int] = list(range(self.max_batch_size - 1, -1, -1))
+        # Slots that still need the context phase, in admission order.
+        context_slots: List[int] = []
+
+        while self.started:
+            # 1. admit waiting tasks while free slots remain; each newly filled
+            #    slot is queued for the context phase
+            while not self.task_queue.empty() and avail_slots:
+                slot = avail_slots.pop()
+                task_slots[slot] = self.task_queue.get()
+                context_slots.append(slot)
+
+            # 2. every slot is free, i.e. nothing is running: wait briefly for new
+            #    work instead of spinning
+            if len(avail_slots) == self.max_batch_size:
+                with self.task_queue.not_empty:
+                    self.task_queue.not_empty.wait(0.1)
+                continue
+
+            # 3. choose the batch for this step; pending context work always goes
+            #    before decoding
+            is_context = len(context_slots) != 0
+            if is_context:
+                # context phase: a single task per step, taken from the front
+                # of the pending list (oldest first)
+                select_slots = [context_slots.pop(0)]
+            else:
+                # decode phase: all occupied slots are processed in one batch,
+                # in ascending slot order
+                select_slots = [
+                    i for i, task in enumerate(task_slots) if task is not None
+                ]
+            batch_tasks = [task_slots[i] for i in select_slots]
+
+            # 4a. do inference --> logits (both phases share the code below; only
+            #     the batch and the is_context flag differ)
+            for task in batch_tasks:
+                task.update_st("model_start")
+
+            outputs = self.inferencer.infer(
+                batch_tasks,
+                is_context=is_context,
+                valid_slot_ids=select_slots
+            )
+
+            for task in batch_tasks:
+                task.update_st("model_end")
+
+            # 4b. sample logits --> tokens
+            next_tokens, _ = self.sampler.sample(
+                tasks=batch_tasks,
+                logits=outputs["last_logits"]
+            )
+
+            for task in batch_tasks:
+                task.update_st("process_end")
+
+            # 4c. postprocess -> gen result
+            generation_results = self.sampler.postprocess(
+                tasks=batch_tasks,
+                infer_outputs=outputs,
+                next_tokens=next_tokens,
+            )
+
+            # 4d. add result to task (results are paired with tasks by position)
+            for task, gen_res in zip(batch_tasks, generation_results):
+                task.add_result(gen_res)
+                if gen_res.finish_reason:
+                    task.finish()
+
+            # 5. release the slots of finished tasks so that they can be reused
+            #    in the next iteration
+            for i, task in enumerate(task_slots):
+                if task is not None and task.is_finished():
+                    avail_slots.append(i)
+                    task_slots[i] = None
+
+            # Restore descending order so the next pop() yields the lowest index again.
+            avail_slots.sort(reverse=True)
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/model_zoo/llama3.1-8b-torch-bf16.json b/byte_infer_perf/llm_perf/model_zoo/llama3.1-8b-torch-bf16.json
new file mode 100755
index 000000000..e0e4327cb
--- /dev/null
+++ b/byte_infer_perf/llm_perf/model_zoo/llama3.1-8b-torch-bf16.json
@@ -0,0 +1,48 @@
+{
+    "model_name": "llama3.1",
+    "model_path": "llm_perf/model_zoo/sota/llama3.1-8b",
+    "model_interface": "LlamaForCausalLM",
+    "tokenizer": {
+        "path": "llm_perf/model_zoo/sota/llama3.1-8b",
+        "support_chn": true
+    },
+    "network": {
+        "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+        "architectures": [
+            "LlamaForCausalLM"
+        ],
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "bos_token_id": 128000,
+        "eos_token_id": [
+            128001,
+            128008,
+            128009
+        ],
+        "hidden_act": "silu",
+        "hidden_size": 4096,
+        "initializer_range": 0.02,
+        "intermediate_size": 14336,
+        "max_position_embeddings": 131072,
+        "mlp_bias": false,
+        "model_type": "llama",
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 8,
+        "pretraining_tp": 1,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": {
+            "factor": 8.0,
+            "low_freq_factor": 1.0,
+            "high_freq_factor": 4.0,
+            "original_max_position_embeddings": 8192,
+            "rope_type": "llama3"
+        },
+        "rope_theta": 500000.0,
+        "tie_word_embeddings": false,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.42.3",
+        "use_cache": true,
+        "vocab_size": 128256
+    }
+}
\ No newline at end of file
diff --git a/byte_infer_perf/llm_perf/prepare_model.py b/byte_infer_perf/llm_perf/prepare_model.py
index 431c6e05b..fde3df514 100644
--- a/byte_infer_perf/llm_perf/prepare_model.py
+++ b/byte_infer_perf/llm_perf/prepare_model.py
@@ -13,6 +13,7 @@
     "llama3-torch-bf16-70b": ("llama3-70b", "shenzhi-wang/Llama3-70B-Chinese-Chat"), 
     "falcon-torch-bf16-180b": ("falcon-180b", "tiiuae/falcon-180B"), 
     "mixtral-torch-bf16-8x22b": ("mixtral-8x22b-instruct", "mistralai/Mixtral-8x22B-Instruct-v0.1"),
+    "llama3.1-8b-torch-bf16": ("llama3.1-8b", "meta-llama/Llama-3.1-8B-Instruct")
 }
 
 if __name__ == "__main__":
diff --git a/byte_infer_perf/llm_perf/workloads/llama3.1-8b-torch-bf16.json b/byte_infer_perf/llm_perf/workloads/llama3.1-8b-torch-bf16.json
new file mode 100644
index 000000000..39f61087d
--- /dev/null
+++ b/byte_infer_perf/llm_perf/workloads/llama3.1-8b-torch-bf16.json
@@ -0,0 +1,18 @@
+{
+    "model": "llama3.1-8b",
+    "test_accuracy": false,
+    "min_tp_size": 1, 
+    "accuracy_config": {
+        "dataset": "llm_perf/datasets/merged_52_test.csv", 
+        "min_new_tokens": 1, 
+        "max_new_tokens": 512
+    }, 
+    "test_perf": true,
+    "perf_config": {
+        "tp_sizes": [1], 
+        "batch_sizes": [1, 4, 8],
+        "input_tokens": [1024], 
+        "output_tokens": 200, 
+        "perf_time": 100
+    }
+}
\ No newline at end of file