def setup_seed(seed):
    """Seed every random number generator touched by this module.

    Seeds PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``,
    and sets ``torch.backends.cudnn.deterministic``.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


def load_ixrt_plugin(logger=None, namespace="", dynamic_path=""):
    """Load the IxRT plugin shared library and register its plugins.

    Args:
        logger: Logger handed to ``tensorrt.init_libnvinfer_plugins``. When
            ``None`` (the default) an INFO-level ``tensorrt.Logger`` is
            created for this call, instead of one logger object being built
            once at definition time and shared by every caller.
        namespace: Plugin namespace handed to ``init_libnvinfer_plugins``.
        dynamic_path: Path of ``libixrt_plugin.so``. When empty, the library
            is looked up in the ``lib`` directory next to the imported
            ``tensorrt`` package.

    Raises:
        FileNotFoundError: If the plugin library does not exist.
    """
    if not dynamic_path:
        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")

    if not exists(dynamic_path):
        raise FileNotFoundError(
            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")

    if logger is None:
        logger = tensorrt.Logger(tensorrt.Logger.INFO)

    # RTLD_GLOBAL so the symbols of the plugin library are visible to the
    # runtime when the plugins are registered below.
    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
    tensorrt.init_libnvinfer_plugins(logger, namespace)
    print(f"Loaded plugin from {dynamic_path}")


def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize):
    """Build a dynamic-shape FP16 engine from an ONNX model and save it.

    Args:
        model_name: One of ``'resnet50'``, ``'yolov5'``, ``'bert'`` or
            ``'widedeep'``. Any other name gets an empty optimization
            profile and untouched network inputs.
        onnx_model_path: Path of the ONNX file to parse.
        engine_path: Destination file for the serialized engine.
        MaxBatchSize: Upper batch bound for ``'bert'``; the fixed batch
            size for ``'widedeep'``. ``'resnet50'`` and ``'yolov5'`` use a
            hard-coded 1/32/64 profile and ignore this value.

    Raises:
        RuntimeError: If the builder does not return a serialized engine.
    """
    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
    builder = tensorrt.Builder(IXRT_LOGGER)
    EXPLICIT_BATCH = 1 << int(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)
    build_config = builder.create_builder_config()

    # Optimization profile: (min, opt, max) shape for every named input.
    profile = builder.create_optimization_profile()

    if model_name == 'resnet50':
        profile.set_shape(
            "input", Dims([1, 3, 224, 224]), Dims([32, 3, 224, 224]), Dims([64, 3, 224, 224]))

    elif model_name == 'yolov5':
        profile.set_shape(
            "images", Dims([1, 3, 640, 640]), Dims([32, 3, 640, 640]), Dims([64, 3, 640, 640]))

    elif model_name == 'bert':
        for input_name in ("input_ids.1", "attention_mask.1", "token_type_ids.1"):
            profile.set_shape(
                input_name, Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384]))

    elif model_name == 'widedeep':
        # widedeep is built once per batch size, so min == opt == max.
        numeric = Dims([MaxBatchSize, 13])
        categorical = Dims([MaxBatchSize * 26, 2])
        zeros_like = Dims([MaxBatchSize, 1])
        profile.set_shape("new_numeric_placeholder:0", numeric, numeric, numeric)
        profile.set_shape("new_categorical_placeholder:0", categorical, categorical, categorical)
        profile.set_shape("import/head/predictions/zeros_like:0", zeros_like, zeros_like, zeros_like)

    build_config.add_optimization_profile(profile)

    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
    parser.parse_from_file(onnx_model_path)
    build_config.set_flag(tensorrt.BuilderFlag.FP16)

    # Mark the batch dimension of every network input as dynamic.
    for i in range(network.num_inputs):
        input_tensor = network.get_input(i)

        if model_name == 'resnet50':
            input_tensor.shape = Dims([-1, 3, 224, 224])

        elif model_name == 'yolov5':
            input_tensor.shape = Dims([-1, 3, 640, 640])
            input_tensor.dtype = tensorrt.float16

        elif model_name == 'bert':
            input_tensor.shape = Dims([-1, 384])

        elif model_name == 'widedeep':
            if i == 0:
                # NOTE(review): -26 is kept from the original; presumably it
                # marks a dynamic dim that is a multiple of 26 -- confirm
                # against the IxRT documentation.
                input_tensor.shape = Dims([-26, 2])
            elif i == 1:
                input_tensor.shape = Dims([-1, 13])
            else:
                input_tensor.shape = Dims([-1, 1])

    plan = builder.build_serialized_network(network, build_config)
    if plan is None:
        # Fail before opening the file so that a failed build does not leave
        # an empty engine file behind (and does not surface as a TypeError
        # from ``f.write(None)``).
        raise RuntimeError(
            f"Failed to build engine for {model_name} from {onnx_model_path}")

    with open(engine_path, "wb") as f:
        f.write(plan)

    print("Build dynamic shape engine done!")


def init_by_tensorrt(engine_path):
    """Deserialize an engine file and create an execution context.

    Args:
        engine_path: Path of a serialized engine.

    Returns:
        Tuple ``(engine, context)``.
    """
    with open(engine_path, "rb") as f:
        serialized_engine = f.read()

    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
    # A single runtime, deliberately not used as a context manager: the
    # original created a second Runtime inside the ``with`` block, so the
    # runtime that deserialized the engine was never exited.
    runtime = tensorrt.Runtime(logger)
    assert runtime
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    assert engine
    context = engine.create_execution_context()
    assert context

    return engine, context


def setup_io_bindings(engine, context):
    """Allocate one device buffer per engine binding.

    The buffer size is the element size times every dimension of the shape
    the context currently reports for the binding, so dynamic input shapes
    have to be set on the context before calling this.

    Args:
        engine: Deserialized engine.
        context: Execution context created from ``engine``.

    Returns:
        Tuple ``(inputs, outputs, allocations)``. ``inputs`` and ``outputs``
        are lists of dicts with the keys ``index``, ``name``, ``dtype``,
        ``shape``, ``allocation`` and ``nbytes``; ``allocations`` holds the
        device pointers of all bindings in binding order. The caller owns
        the buffers and has to release them with ``cudart.cudaFree``.
    """
    inputs = []
    outputs = []
    allocations = []

    for i in range(engine.num_bindings):
        name = engine.get_binding_name(i)
        dtype = np.dtype(tensorrt.nptype(engine.get_binding_dtype(i)))
        shape = context.get_binding_shape(i)

        size = dtype.itemsize
        for s in shape:
            size *= s

        err, allocation = cudart.cudaMalloc(size)
        assert err == cudart.cudaError_t.cudaSuccess

        binding = {
            "index": i,
            "name": name,
            "dtype": dtype,
            "shape": list(shape),
            "allocation": allocation,
            "nbytes": size
        }
        allocations.append(allocation)

        if engine.binding_is_input(i):
            inputs.append(binding)
        else:
            outputs.append(binding)

    return inputs, outputs, allocations


def tensorrt_infer_dynamic(engine, context, input_ids, token_type_ids):
    """Run one timed inference with ``input_ids`` / ``token_type_ids`` inputs.

    Args:
        engine: Deserialized engine with the bindings ``"input_ids"`` and
            ``"token_type_ids"``.
        context: Execution context created from ``engine``.
        input_ids: Host array for the ``"input_ids"`` binding.
        token_type_ids: Host array for the ``"token_type_ids"`` binding.

    Returns:
        Tuple ``(output, time_each)``: the first output binding as a NumPy
        array and the wall-clock seconds spent in ``execute_v2``.
    """
    host_inputs = [np.ascontiguousarray(input_ids), np.ascontiguousarray(token_type_ids)]

    # set dynamic shape
    for input_name, host_array in zip(("input_ids", "token_type_ids"), host_inputs):
        input_idx = engine.get_binding_index(input_name)
        context.set_binding_shape(input_idx, Dims(host_array.shape))

    # Setup I/O bindings
    inputs, outputs, allocations = setup_io_bindings(engine, context)

    try:
        # Prepare the output data
        output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])

        # The buffers come from cudart.cudaMalloc and ``cuda`` in this module
        # is the cuda-python module (it shadows the pycuda import), so the
        # copies use cudart rather than pycuda's memcpy_htod / memcpy_dtoh.
        for binding, host_array in zip(inputs, host_inputs):
            (err,) = cudart.cudaMemcpy(
                binding["allocation"],
                host_array,
                host_array.nbytes,
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
            )
            assert err == cudart.cudaError_t.cudaSuccess

        torch.cuda.synchronize()
        time_start = time.time()
        context.execute_v2(allocations)
        torch.cuda.synchronize()
        time_each = time.time() - time_start

        (err,) = cudart.cudaMemcpy(
            output,
            outputs[0]["allocation"],
            output.nbytes,
            cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )
        assert err == cudart.cudaError_t.cudaSuccess
    finally:
        # The buffers are private to this call; release them so repeated
        # calls do not leak device memory.
        for allocation in allocations:
            cudart.cudaFree(allocation)

    return output, time_each
log = logging.getLogger("CompileBackendILUVATAR")


class CompileBackendILUVATAR(compile_backend.CompileBackend):
    """Compile backend that builds IxRT engines from pre-converted ONNX models."""

    def __init__(self):
        super().__init__()
        self.hardware_type = "ILUVATAR"
        self.need_reload = False
        self.model_runtimes = []
        self.model_config = None

    def version(self) -> str:
        """Return compile backend version details."""
        return tensorrt.__version__

    def compile(self, configs, dataloader=None):
        """Build the engine(s) for the configured model and describe the result.

        For ``widedeep`` one engine is built per workload batch size; every
        other model gets a single engine at ``engine_path``.

        Args:
            configs: Dict with the ``model_info`` and ``workload`` sections.
            dataloader: Unused.

        Returns:
            The compile result dict, which is also stored on ``self.configs``.
        """
        model_info = configs["model_info"]
        model_name = model_info["model"].split("-")[0]
        max_batch_size = model_info['max_batch_size']
        onnx_model_path = model_info['onnx_model_path']
        engine_path = model_info['engine_path']

        # build engine
        if model_name != 'widedeep':
            build_engine(
                model_name=model_name,
                onnx_model_path=onnx_model_path,
                engine_path=engine_path,
                MaxBatchSize=max_batch_size,
            )
        else:
            for bs in configs['workload']['batch_sizes']:
                per_batch_engine = "general_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(bs) + ".engine"
                build_engine(
                    model_name=model_name,
                    onnx_model_path=onnx_model_path,
                    engine_path=per_batch_engine,
                    MaxBatchSize=bs,
                )

        segment = {
            "sg_idx": 0,
            "is_fallback": False,
            "input_tensor_map": model_info["input_shape"],
            "output_tensor_map": model_info["outputs"],
            "compiled_model": [
                {
                    "compiled_bs": 1,
                    "compiled_obj": model_info['model_path'],
                },
            ],
        }
        result = {
            "model": model_info["model"],
            "engine_path": engine_path,
            "model_name": model_name,
            "framework": model_info["framework"],
            "framework_iluvatar": model_info["framework_iluvatar"],
            "compile_precision": model_info['model_precision'],
            "input_type": model_info["input_type"].split(","),
            "max_batch_size": model_info["max_batch_size"],
            "compile_status": "success",
            "sg_percent": 100,
            "segments": [segment],
        }

        self.configs = result
        self.workload = configs['workload']
        self.model_info = model_info

        for key, value in result.items():
            print('{key}: {value}'.format(key=key, value=value))

        return result

    def get_interact_profile(self, configs):
        """
        Collect information for core engine to let user interactively fill in configurations.
        """
        return []

    def get_best_batch_size(self):
        """Return the best batch size for the model; this backend reports none."""
        return None

    # The two functions below still need work: for now every ONNX model is
    # converted ahead of time.
    # TODO: implement the conversion path.
    def get_onnx(self, model_path, onnx_path):
        """Convert the model at ``model_path`` to ONNX at ``onnx_path``."""
        # NOTE(review): ``torch_to_onnx`` is imported with
        # ``from general_perf.tools import torch_to_onnx``; confirm that this
        # name is a callable and not a module.
        torch_to_onnx(model_path, onnx_path)

    def pre_optimize(self, configs):
        """Point ``model_path`` at the pre-converted ONNX file for known models."""
        # TODO: convert the .pt model to ONNX here.
        onnx_paths = {
            "resnet50": "general_perf/general_perf/model_zoo/regular/open_resnet50/resnet50.onnx",
            "yolov5": 'general_perf/general_perf/model_zoo/popular/open_yolov5/yolov5s_sim.onnx',
        }
        model_name = configs["model_info"]["model"].split("-")[0]

        if model_name in onnx_paths:
            configs["model_info"]["model_path"] = onnx_paths[model_name]

        return configs
log = logging.getLogger("RuntimeBackendILUVATAR")

# Config dtype name -> torch dtype used to build the tensors fed to predict().
pt_dtype_map = {
    "FLOAT32": torch.float32,
    "FLOAT16": torch.float16,
    "INT8": torch.int8,
    "LONG": torch.long,
    "INT64": torch.int64
}

# Config dtype name -> NumPy dtype used for the fake benchmark samples.
# ``np.long`` and ``np.bool`` were removed in NumPy 1.24, so the explicit
# scalar types are used instead (``np.int64`` is what ``np.long`` resolved to
# on 64-bit Linux).
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT32": np.float32,
    "LONG": np.int64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_
}


class RuntimeBackendILUVATAR(runtime_backend.RuntimeBackend):
    """Runtime backend that executes serialized IxRT engines."""

    # Engine input binding names per model, in the order of the tensors
    # handed to predict().
    _INPUT_NAMES = {
        'resnet50': ("input",),
        'yolov5': ("images",),
        'bert': ("input_ids.1", "attention_mask.1", "token_type_ids.1"),
        'widedeep': (
            "new_categorical_placeholder:0",
            "new_numeric_placeholder:0",
            "import/head/predictions/zeros_like:0",
        ),
    }

    # Keys of the dict returned by predict(), in engine output order.
    _OUTPUT_NAMES = {
        'resnet50': ('softmax_tensor',),
        'yolov5': ('output', '345', '403', '461'),
        'bert': ('start_logits', 'end_logits'),
        'widedeep': ('import/head/predictions/probabilities:0',),
    }

    def __init__(self):
        super(RuntimeBackendILUVATAR, self).__init__()
        self.hardware_type = "ILUVATAR"
        self.need_reload = False
        self.model_runtimes = []
        self.configs = None
        self.engine = None
        self.context = None
        self.batch_size = -1

    def predict(self, feeds):
        """Run one inference on the loaded engine.

        Args:
            feeds: Dict of input name -> array-like. The values are consumed
                in iteration order and converted with the dtypes listed in
                ``self.input_type``.

        Returns:
            Dict of output name -> NumPy array for the known models, an
            empty dict for any other model name.
        """
        input_tensors = [
            torch.tensor(value, dtype=pt_dtype_map[self.input_type[i]])
            for i, value in enumerate(feeds.values())
        ]

        # ixrt inference
        engine = self.engine
        assert engine
        context = self.context
        assert context

        model_name = self.configs["model"].split("-")[0]

        if model_name == 'widedeep':
            # The widedeep engine has a third input that is not part of the
            # feeds; it is filled with zeros.
            input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32))

        # set dynamic shape
        for tensor_idx, input_name in enumerate(self._INPUT_NAMES.get(model_name, ())):
            input_idx = engine.get_binding_index(input_name)
            context.set_binding_shape(input_idx, Dims(input_tensors[tensor_idx].shape))

        # Setup I/O bindings
        inputs, outputs, allocations = setup_io_bindings(engine, context)

        success = cudart.cudaError_t.cudaSuccess
        try:
            # Prepare the output data
            outputs_list = [
                np.zeros(binding["shape"], binding["dtype"]) for binding in outputs
            ]
            data_batch_list = [np.ascontiguousarray(tensor) for tensor in input_tensors]

            # H2D: host to device
            for i, binding in enumerate(inputs):
                (err,) = cudart.cudaMemcpy(
                    binding["allocation"],
                    data_batch_list[i],
                    binding["nbytes"],
                    cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
                )
                assert err == success, f"H2D copy failed for {binding['name']}: {err}"

            context.execute_v2(allocations)

            # D2H: device to host
            for host_array, binding in zip(outputs_list, outputs):
                (err,) = cudart.cudaMemcpy(
                    host_array,
                    binding["allocation"],
                    binding["nbytes"],
                    cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
                )
                assert err == success, f"D2H copy failed for {binding['name']}: {err}"
        finally:
            # Free the device buffers even when a copy or the execution
            # raised, so a failing predict() does not leak device memory.
            free_errors = [cudart.cudaFree(allocation)[0] for allocation in allocations]

        assert all(err == success for err in free_errors)

        return {
            name: outputs_list[i]
            for i, name in enumerate(self._OUTPUT_NAMES.get(model_name, ()))
        }

    def benchmark(self, dataloader):
        """Measure latency and throughput on random inputs.

        Runs 30 warm-up predictions followed by ``workload['iterations']``
        timed predictions.

        Args:
            dataloader: Unused; random samples are generated instead.

        Returns:
            Dict with the keys ``BS``, ``QPS``, ``AVG Latency`` and
            ``P99 Latency`` (latencies in milliseconds).

        Raises:
            ValueError: If the workload asks for no timed iterations.
        """
        batch_size = self.get_loaded_batch_size()
        iterations = self.workload['iterations']
        if iterations <= 0:
            raise ValueError("workload 'iterations' must be a positive integer")

        report = {"BS": batch_size}

        test_data = self._get_fake_samples(
            batch_size=batch_size,
            shape=self.configs['segments'][0]['input_tensor_map'],
            input_type=self.configs['input_type'])

        for _ in range(30):
            self.predict(test_data)

        times_range = []
        for _ in range(iterations):
            start_time = time.perf_counter()
            self.predict(test_data)
            times_range.append(time.perf_counter() - start_time)

        times_range.sort()
        tail_latency = round(
            times_range[int(len(times_range) * 0.99)] * 1000, 2)
        avg_latency = round(sum(times_range) / iterations * 1000, 2)
        # avg_latency is rounded to 0.01 ms and can therefore be exactly 0.
        qps = int(1000.0 * self.batch_size / avg_latency) if avg_latency > 0 else 0

        log.info(
            'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
            format(self.batch_size, qps, avg_latency, tail_latency))

        report['QPS'] = qps
        report['AVG Latency'] = avg_latency
        report['P99 Latency'] = tail_latency

        return report

    def get_loaded_batch_size(self):
        """Return the batch size passed to the last ``load`` call."""
        return self.batch_size

    def load(self, batch_size) -> None:
        """Deserialize the engine for ``batch_size`` and keep its context.

        ``widedeep`` uses one engine file per batch size; every other model
        loads ``configs['engine_path']``.
        """
        # load engine
        engine_path = self.configs['engine_path']

        model_name = self.configs["model"].split("-")[0]

        if model_name == 'widedeep':
            engine_path = "general_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim_" + str(batch_size) + ".engine"

        engine, context = init_by_tensorrt(engine_path)

        self.framework_iluvatar = self.configs['framework_iluvatar']
        self.input_type = self.configs['input_type']

        self.batch_size = batch_size
        self.model_runtimes = []
        self.engine = engine
        self.context = context

    def _get_fake_samples(self, batch_size, shape, input_type):
        """Generate random inputs, scaling the first dim by ``batch_size``.

        Args:
            batch_size: Multiplier applied to the first dimension of every
                input except the one named ``"text"``.
            shape: Dict of input name -> shape list.
            input_type: Dtype names (keys of ``INPUT_TYPE``), one per input
                in the iteration order of ``shape``.

        Returns:
            Dict of input name -> random NumPy array.

        Raises:
            ValueError: If ``input_type`` is empty.
        """
        if not input_type:
            raise ValueError("Please provide input type")

        data = {}
        for i, (key, val) in enumerate(shape.items()):
            if key != "text":
                val = [val[0] * batch_size] + val[1:]
            data[key] = np.random.random(size=val).astype(
                INPUT_TYPE[input_type[i]])
        return data
@@ -0,0 +1,19 @@ +{ + "model": "bert-ixrt-fp16", + "model_path": "general_perf/general_perf/model_zoo/regular/open_bert/bert-base-uncased-squad-v1.pt", + "onnx_model_path": "general_perf/general_perf/model_zoo/regular/open_bert/bert-base-uncased-squad-v1_end.onnx", + "engine_path": "general_perf/general_perf/model_zoo/regular/open_bert/bert-base-uncased-squad-v1.engine", + "framework": "Pytorch", + "framework_version": "1.9.1", + "framework_iluvatar": "IXRT", + "framework_iluvatar_version": "tensorrt.__version__", + "model_format": "pt", + "model_precision": "FP16", + "inputs":"input_ids.1,attention_mask.1,token_type_ids.1", + "outputs":"start_logits,end_logits", + "input_shape": {"input_ids.1": [1,384], "attention_mask.1": [1,384], "token_type_ids.1": [1,384]}, + "input_type": "LONG,LONG,LONG", + "dataset_name": "open_squad", + "max_batch_size": 64, + "is_quantized": false +} diff --git a/byte_infer_perf/general_perf/model_zoo/resnet50-ixrt-fp16.json b/byte_infer_perf/general_perf/model_zoo/resnet50-ixrt-fp16.json new file mode 100644 index 000000000..a9007b714 --- /dev/null +++ b/byte_infer_perf/general_perf/model_zoo/resnet50-ixrt-fp16.json @@ -0,0 +1,19 @@ +{ + "model": "resnet50-ixrt-fp16", + "model_path": "general_perf/general_perf/model_zoo/regular/open_resnet50/resnet50.pt", + "onnx_model_path": "general_perf/general_perf/model_zoo/regular/open_resnet50/resnet50.onnx", + "engine_path": "general_perf/general_perf/model_zoo/regular/open_resnet50/resnet50.engine", + "framework": "Pytorch", + "framework_version": "1.8.1", + "framework_iluvatar": "IXRT", + "framework_iluvatar_version": "tensorrt.__version__", + "model_format": "pt", + "model_precision": "FP32", + "inputs": "input", + "outputs": "softmax_tensor", + "input_shape": {"input_1.1": [1, 3, 224, 224]}, + "input_type": "FLOAT32", + "dataset_name": "open_imagenet", + "max_batch_size": 64, + "layout": "NCHW" +} diff --git a/byte_infer_perf/general_perf/model_zoo/widedeep-ixrt-fp16.json 
b/byte_infer_perf/general_perf/model_zoo/widedeep-ixrt-fp16.json new file mode 100644 index 000000000..f933048ae --- /dev/null +++ b/byte_infer_perf/general_perf/model_zoo/widedeep-ixrt-fp16.json @@ -0,0 +1,18 @@ +{ + "model": "widedeep-tf-fp32", + "model_path": "general_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model", + "onnx_model_path": "general_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim.onnx", + "engine_path": "general_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape_sim.engine", + "framework": "Tensorflow", + "framework_version": "2.4.0", + "framework_iluvatar": "IXRT", + "framework_iluvatar_version": "tensorrt.__version__", + "model_format": "saved_model", + "model_precision": "FP32", + "inputs": "new_categorical_placeholder:0,new_numeric_placeholder:0", + "outputs": "import/head/predictions/probabilities:0", + "input_shape": {"new_categorical_placeholder:0": [26, 2], "new_numeric_placeholder:0": [1, 13]}, + "input_type": "INT64,FLOAT32", + "dataset_name": "open_criteo_kaggle", + "max_batch_size": 16384 +} \ No newline at end of file diff --git a/byte_infer_perf/general_perf/model_zoo/yolov5-ixrt-fp16.json b/byte_infer_perf/general_perf/model_zoo/yolov5-ixrt-fp16.json new file mode 100644 index 000000000..8f22d9ac8 --- /dev/null +++ b/byte_infer_perf/general_perf/model_zoo/yolov5-ixrt-fp16.json @@ -0,0 +1,18 @@ +{ + "model": "yolov5-ixrt-fp16", + "model_path": "general_perf/general_perf/model_zoo/popular/open_yolov5/yolov5s.onnx", + "onnx_model_path": "general_perf/general_perf/model_zoo/popular/open_yolov5/yolov5s_sim.onnx", + "engine_path": "general_perf/general_perf/model_zoo/popular/open_yolov5/yolov5s.engine", + "framework": "Onnx", + "framework_version": "1.10.2", + "framework_iluvatar": "IXRT", + "framework_iluvatar_version": "tensorrt.__version__", + "model_format": "onnx", + "model_precision": "FP32", + "inputs":"images", + 
"outputs":"output,345,403,461", + "input_shape": {"images": [1,3,640,640]}, + "input_type": "FLOAT32", + "dataset_name": null, + "max_batch_size": 64 +} \ No newline at end of file diff --git a/byte_infer_perf/general_perf/workloads/bert-ixrt-fp16.json b/byte_infer_perf/general_perf/workloads/bert-ixrt-fp16.json new file mode 100644 index 000000000..ba542ef59 --- /dev/null +++ b/byte_infer_perf/general_perf/workloads/bert-ixrt-fp16.json @@ -0,0 +1,12 @@ +{ + "model": "bert-ixrt-fp16", + "test_perf": true, + "test_accuracy": true, + "test_numeric": true, + "clients": 3, + "iterations": 100, + "batch_sizes":[1,4,8,16,24,32,64], + "data_percent": 100, + "compile_only": false, + "multi_cores": true +} diff --git a/byte_infer_perf/general_perf/workloads/resnet50-ixrt-fp16.json b/byte_infer_perf/general_perf/workloads/resnet50-ixrt-fp16.json new file mode 100644 index 000000000..0c2fbe5ce --- /dev/null +++ b/byte_infer_perf/general_perf/workloads/resnet50-ixrt-fp16.json @@ -0,0 +1,14 @@ +{ + "model": "resnet50-ixrt-fp16", + "test_perf": true, + "test_accuracy": true, + "test_numeric": true, + "clients": 1, + "iterations": 100, + "batch_sizes":[1,4,8,16,24,32,64], + "data_percent": 100, + "compile_only": false, + "fake_data": false, + "multi_cores": true + +} \ No newline at end of file diff --git a/byte_infer_perf/general_perf/workloads/widedeep-ixrt-fp16.json b/byte_infer_perf/general_perf/workloads/widedeep-ixrt-fp16.json new file mode 100644 index 000000000..47c6c0429 --- /dev/null +++ b/byte_infer_perf/general_perf/workloads/widedeep-ixrt-fp16.json @@ -0,0 +1,12 @@ +{ + "model": "widedeep-ixrt-fp16", + "test_perf": true, + "test_accuracy": true, + "test_numeric": true, + "clients": 3, + "iterations": 100, + "batch_sizes":[1024,4096,16384], + "data_percent": 100, + "compile_only": false, + "multi_cores": true +} \ No newline at end of file diff --git a/byte_infer_perf/general_perf/workloads/yolov5-ixrt-fp16.json 
'''
{
    "Model": "chinese-llama2-torch-fp16-13b",
    "Backend": "GPU",
    "Host Info": "Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz",
    "Min New Tokens": 128,
    "Max New Tokens": 256,
    "Accuracy": {
        "PPL": [],
        "Token Diff": {},
        "Logits Diff": {}
    },
    "Performance": [
        {
            "TP Size": 1,
            "Batch Size": 1,
            "Input Tokens": 256,
            "First Token Latency(AVG)": 0.2663203875223796,
            "Per Token Latency(AVG)": 0.2794939676920573,
            "First Token Latency(P90)": 0.27227325439453126,
            "Per Token Latency(P90)": 0.2796707717361153,
            "Token Throughput": 3.577788481980987,
            "Request Number": 3,
            "QPS": 0.013921355961015514
        }
    ]
}

'''


class ILUbenchmark():
    """vLLM benchmark for first-token latency and generation throughput."""

    def __init__(self, batch, workload, input_tokens, result_queue, max_tokens) -> None:
        """Store the run parameters and construct the vLLM engine.

        Args:
            batch: Number of prompts per generate call.
            workload: Dict whose ``"model"`` entry names the model directory.
            input_tokens: Prompt length in tokens.
            result_queue: Stored on the instance; not used by this class.
            max_tokens: Stored on the instance; the number of generated
                tokens is taken from ``config_vllm`` instead.
        """
        self.batch = batch
        self.workload = workload
        self.input_tokens = input_tokens
        self.result_queue = result_queue
        self.max_tokens = max_tokens
        self.config_vllm()

        self.ftl = 0
        self.tps = 0
        self.qps = 0

        self.model_path = self.getmodelPath()
        self.init_inference()

    def getmodelPath(self):
        """Return the model directory for ``workload["model"]``."""
        return "llm_perf/model_zoo/sota/" + self.workload["model"]

    def getresult(self):
        """Return ``(input_tokens, total_output_tokens, ftl, ftl)``."""
        # NOTE(review): the last two entries are both ``self.ftl``; confirm
        # with the caller whether the fourth entry was meant to be another
        # metric.
        return self.input_tokens, self.batch * self.output_token, self.ftl, self.ftl

    def config_vllm(self):
        """Set the fixed vLLM settings used by this benchmark."""
        self.gpu_memory_utilization = 0.9
        self.tensor_parallel_size = 2
        self.output_token = 256
        self.target = None
        self.quantization = None
        self.max_num_seqs = 8

    def init_inference(self):
        """Create the vLLM engine and the two sampling configurations."""
        self.max_num_seqs = self.max_num_seqs if self.max_num_seqs is not None else self.batch

        self.llm = LLM(model=self.model_path,
                       gpu_memory_utilization=self.gpu_memory_utilization,
                       max_num_batched_tokens=self.input_tokens * self.max_num_seqs,
                       tensor_parallel_size=self.tensor_parallel_size,
                       max_num_seqs=self.max_num_seqs,
                       trust_remote_code=True,
                       quantization=self.quantization,
                       )

        self.sampling_params = SamplingParams(temperature=0.0, ignore_eos=True, max_tokens=self.output_token,)

        self.first_token_sampling_params = SamplingParams(temperature=0.0, ignore_eos=True, max_tokens=1,)

    def _timed_generate(self, sampling_params, prompt_token_ids):
        """Run one generate call and return ``(outputs, elapsed_seconds)``."""
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        outputs = self.llm.generate(sampling_params=sampling_params, prompt_token_ids=prompt_token_ids)
        torch.cuda.synchronize()
        return outputs, time.perf_counter() - start_time

    def _summary_line(self, model_path, batch, num_tokens, duration_time):
        """Format the result line printed when no target is configured."""
        # ``self.ftl`` is measured in seconds; it is converted here so the
        # value matches the "ms" label.
        return (f"model: {model_path}, tp: {self.tensor_parallel_size}, batch size: {batch}, input tokens: {self.input_tokens}, "
                f"output tokens: {self.output_token}, totol output tokens: {batch * self.output_token}, "
                f"ftl: {self.ftl * 1000:.3f} ms, tps: {num_tokens / duration_time :.3f}, "
                f"requests :{batch / duration_time :.3f} /s, duration_time:{duration_time*1000}ms")

    def benchmark_vllm_ftl(self, batch=1, input_tokens=1024):
        """Measure the time to generate one token per prompt.

        The elapsed time in seconds is stored in ``self.ftl``.
        """
        print(f'======================= benchmark_vllm_ftl ===============')
        prompt_token_ids = [[0] * input_tokens] * batch

        outputs, duration_time = self._timed_generate(
            self.first_token_sampling_params, prompt_token_ids)

        num_tokens = 0
        for output in outputs:
            assert len(output.outputs[0].token_ids) == 1
            num_tokens += 1
        assert num_tokens == batch
        self.ftl = duration_time

    def benchmark_vllm(self, batch=1, input_tokens=1024):
        """Measure throughput for ``self.output_token`` tokens per prompt.

        Stores ``num_tokens / duration`` in ``self.qps`` and
        ``batch / duration`` in ``self.tps``. When ``self.target`` is set,
        the measured tokens per second are compared against it and the
        process exits with status 1 on a miss.
        """
        print(f'@@@@@@@@@@@@@@@@@@ benchmark_vllm @@@@@@@@@@@@@@@@@@@@')
        prompt_token_ids = [[0] * input_tokens] * batch

        outputs, duration_time = self._timed_generate(self.sampling_params, prompt_token_ids)

        num_tokens = 0
        for output in outputs:
            assert len(output.outputs[0].token_ids) == self.output_token
            num_tokens += self.output_token
        assert num_tokens == batch * self.output_token

        # Display form of the path: strip whitespace first, then trailing slashes.
        model_path = self.model_path.strip().rstrip('/')
        if self.target is not None:
            val_qps = num_tokens / duration_time
            target_qps = self.target
            if val_qps < target_qps and ((target_qps - val_qps) / target_qps > 0.1 or target_qps - val_qps > 1.5):
                print(f"target qps: {target_qps:.3f}, val qps: {val_qps:.3f}, fail")
                # Same effect as exit(1) without relying on the site module.
                raise SystemExit(1)
            else:
                print(f"target qps: {target_qps:.3f}, val qps: {val_qps:.3f}, pass")
        else:
            print(self._summary_line(model_path, batch, num_tokens, duration_time))

        # NOTE(review): the summary line labels tokens/s as "tps" while the
        # attributes below store tokens/s in ``qps`` and requests/s in
        # ``tps``; kept as in the original -- confirm which one consumers
        # expect.
        self.qps = num_tokens / duration_time
        self.tps = batch / duration_time
= None + + def _is_finished(self) -> bool: + return self.is_finished() + + +class GPUMultiProcessMsgr(common.MultiProcessMsgr, managers.BaseManager): + def __init__(self, local_rank: int, world_size: int, name: str): + self.rank = local_rank + self.world_size = world_size + + def make_message_queue(rank): + if rank != 0: + return None + new_queue = queue.Queue() + return lambda: new_queue + + for i in range(1, world_size): + self.register(f"message_queue_{i}", callable=make_message_queue(local_rank)) + if local_rank == 0: + super().__init__(authkey=name.encode("utf-8")) + self.start() + addr = [self.address] + torch.distributed.broadcast_object_list(addr, device=f"cuda:{local_rank}") + self.msg_queue_list = [ + getattr(self, f"message_queue_{rank}")() + for rank in range(1, world_size) + ] + else: + addr = [None] + torch.distributed.broadcast_object_list(addr, device=f"cuda:{local_rank}") + super().__init__(address=addr[0], authkey=name.encode("utf-8")) + self.connect() + self.msg_queue = getattr(self, f"message_queue_{local_rank}")() + + def broadcast(self, obj): + assert ( + self.rank == 0 + ), f"InterProcessMessager broadcast_message only allow rank0 to call!" + for rank in range(1, self.world_size): + self.msg_queue_list[rank - 1].put(obj) + + def receive(self): + assert ( + self.rank > 0 + ), f"InterProcessMessager receive_message don't allow rank0 to call!" 
+ return self.msg_queue.get() diff --git a/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chatglm.py b/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chatglm.py new file mode 100644 index 000000000..dd1b32818 --- /dev/null +++ b/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chatglm.py @@ -0,0 +1,118 @@ +import json +import os +import time +from typing import Any, Dict, List + +import torch +from torch import distributed as dist +from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer + +from llm_perf.backends.GPU.common import GPUMultiProcessMsgr +from llm_perf.core.common import Packet +from llm_perf.core.engine import CoreEngine +from llm_perf.utils.logger import logger + + +class GpuEngineChatGLM(CoreEngine): + def __init__( + self, + modelcls, + model_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + max_batch_size: int, + **kwarg, + ) -> None: + super().__init__() + if dist.is_initialized(): + self.local_rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.local_rank = 0 + self.world_size = 1 + + if self.world_size > 1: + self.mlp_manager = GPUMultiProcessMsgr( + self.local_rank, self.world_size, "MultiProcessMsgr" + ) + + self.tokenizer = tokenizer + self.pad_token_id = tokenizer.pad_token_id + self.max_bs = max_batch_size + + self.init_inference(modelcls, model_config) + + def init_inference(self, modelcls, model_config: Dict[str, Any]): + torch.cuda.set_device(self.local_rank) + from llm_perf.model_zoo.chatglm import ChatGLMConfig + + self.model = modelcls.from_pretrained( + model_config["model_path"], config=ChatGLMConfig(**model_config["network"]) + ) + self.model.eval() + self.model.half().cuda() + + logger.info(f"cuda model {model_config['model_path']} loaded {self.model}") + + def broadcast_inputs(self, *args): + if self.world_size <= 1: + return args + + if self.local_rank == 0: + self.mlp_manager.broadcast(args) + return args + else: + inputs = self.mlp_manager.receive() + return inputs + + 
def prepare_inputs(self, batch: List[Packet]):
    """Build padded model inputs for a batch of packets.

    Each packet contributes ``request.input_ids`` followed by the tokens it
    has generated so far (``generate_ids``). Shorter sequences are padded on
    the right with ``self.pad_token_id`` up to the longest sequence in the
    batch, and every row receives position ids ``0 .. max_seq_len - 1``.

    Args:
        batch: Packets to run in the next forward pass.

    Returns:
        A dict of keyword arguments for the model. ``input_ids`` and
        ``position_ids`` are plain nested lists (the caller converts them to
        tensors); ``past_key_values``, ``attention_mask`` and ``use_cache``
        are always ``None``.
    """
    # TODO: chatglm is left padding, get wrong input when batching
    # Concatenate prompt and generated tokens once per packet instead of
    # re-measuring both lists in every loop.
    sequences = [pkt.request.input_ids + pkt.generate_ids for pkt in batch]

    # -1 is the historical sentinel for an empty batch; it is never used for
    # padding in that case because there are no sequences to pad.
    max_seq_len = max((len(seq) for seq in sequences), default=-1)

    all_input_ids = [
        seq + [self.pad_token_id] * (max_seq_len - len(seq)) for seq in sequences
    ]
    all_position_ids = [list(range(max_seq_len)) for _ in sequences]

    # Key order is kept as before: the three None placeholders first.
    return {
        "past_key_values": None,
        "attention_mask": None,
        "use_cache": None,
        "input_ids": all_input_ids,
        "position_ids": all_position_ids,
    }
torch +from torch import distributed as dist +from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer + +from llm_perf.backends.GPU.common import GPUMultiProcessMsgr +from llm_perf.core.common import Packet +from llm_perf.core.engine import CoreEngine +from llm_perf.utils.logger import logger + + +class GpuEngineChatGLM2(CoreEngine): + def __init__( + self, + modelcls, + model_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + max_batch_size: int, + **kwarg, + ) -> None: + super().__init__() + if dist.is_initialized(): + self.local_rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.local_rank = 0 + self.world_size = 1 + + if self.world_size > 1: + self.mlp_manager = GPUMultiProcessMsgr( + self.local_rank, self.world_size, "MultiProcessMsgr" + ) + + self.tokenizer = tokenizer + self.pad_token_id = tokenizer.pad_token_id + self.max_bs = max_batch_size + + self.init_inference(modelcls, model_config) + + def init_inference(self, modelcls, model_config: Dict[str, Any]): + torch.cuda.set_device(self.local_rank) + from llm_perf.model_zoo.chatglm2 import ChatGLMConfig + + self.model = modelcls.from_pretrained( + model_config["model_path"], config=ChatGLMConfig(**model_config["network"]) + ) + self.model.eval() + self.model.half().cuda() + + logger.info(f"cuda model {model_config['model_path']} loaded {self.model}") + + def broadcast_inputs(self, *args): + if self.world_size <= 1: + return args + + if self.local_rank == 0: + self.mlp_manager.broadcast(args) + return args + else: + inputs = self.mlp_manager.receive() + return inputs + + def prepare_inputs(self, batch: List[Packet]) -> Dict: + all_input_ids = [] + all_position_ids = [] + max_seq_len = -1 + for packet in batch: + if len(packet.request.input_ids) + len(packet.generate_ids) > max_seq_len: + max_seq_len = len(packet.request.input_ids) + len(packet.generate_ids) + for packet in batch: + pad_len = max_seq_len - ( + len(packet.request.input_ids) + 
len(packet.generate_ids) + ) + input_ids = ( + packet.request.input_ids + + packet.generate_ids + + [self.pad_token_id] * pad_len + ) + all_input_ids.append(input_ids) + all_position_ids.append([i for i in range(max_seq_len)]) + + model_inputs = { + "past_key_values": None, + "attention_mask": None, + "use_cache": None, + } + + model_inputs["input_ids"] = all_input_ids + model_inputs["position_ids"] = all_position_ids + model_inputs["return_last_logit"] = False + return model_inputs + + def do_inference(self, packets: List[Packet]): + torch.cuda.set_device(self.local_rank) + + model_inputs = self.prepare_inputs(packets) if self.local_rank == 0 else None + model_inputs = self.broadcast_inputs(model_inputs)[0] + + model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"]) + model_inputs["position_ids"] = torch.tensor(model_inputs["position_ids"]) + + for k, v in model_inputs.items(): + if isinstance(v, torch.Tensor): + model_inputs[k] = v.cuda() + + outputs = self.model(**model_inputs) + + if self.local_rank == 0: + next_tokens_logits = outputs.logits[:, -1, :].contiguous() + input_logits = outputs.logits[..., :-1, :].contiguous() + + return { + "input_logits": input_logits, + "last_logits": next_tokens_logits, + } diff --git a/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chinese_llama2.py b/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chinese_llama2.py new file mode 100644 index 000000000..6206c2d29 --- /dev/null +++ b/byte_infer_perf/llm_perf/backends/ILU/gpu_engine_chinese_llama2.py @@ -0,0 +1,119 @@ +import json +import os +import time +from typing import Any, Dict, List + +import torch +from torch import distributed as dist +from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer + +from llm_perf.backends.GPU.common import GPUMultiProcessMsgr +from llm_perf.core.common import Packet +from llm_perf.core.engine import CoreEngine +from llm_perf.utils.logger import logger + + +class GpuEngineChineseLlama2(CoreEngine): + def __init__( 
+ self, + modelcls, + model_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + max_batch_size: int, + **kwarg, + ) -> None: + super().__init__() + if dist.is_initialized(): + self.local_rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.local_rank = 0 + self.world_size = 1 + + if self.world_size > 1: + self.mlp_manager = GPUMultiProcessMsgr( + self.local_rank, self.world_size, "MultiProcessMsgr" + ) + + self.tokenizer = tokenizer + self.pad_token_id = tokenizer.pad_token_id + self.max_bs = max_batch_size + + self.init_inference(modelcls, model_config) + + def init_inference(self, modelcls, model_config: Dict[str, Any]): + torch.cuda.set_device(self.local_rank) + from llm_perf.model_zoo.llama2 import LlamaConfig + + self.model = modelcls.from_pretrained( + model_config["model_path"], config=LlamaConfig(**model_config["network"]) + ) + self.model.eval() + self.model.half().cuda() + + logger.info(f"cuda model {model_config['model_path']} loaded {self.model}") + + def broadcast_inputs(self, *args): + if self.world_size <= 1: + return args + + if self.local_rank == 0: + self.mlp_manager.broadcast(args) + return args + else: + inputs = self.mlp_manager.receive() + return inputs + + def prepare_inputs(self, batch: List[Packet]) -> Dict: + all_input_ids = [] + all_position_ids = [] + max_seq_len = -1 + for packet in batch: + if len(packet.request.input_ids) + len(packet.generate_ids) > max_seq_len: + max_seq_len = len(packet.request.input_ids) + len(packet.generate_ids) + for packet in batch: + pad_len = max_seq_len - ( + len(packet.request.input_ids) + len(packet.generate_ids) + ) + input_ids = ( + packet.request.input_ids + + packet.generate_ids + + [self.pad_token_id] * pad_len + ) + all_input_ids.append(input_ids) + all_position_ids.append([i for i in range(max_seq_len)]) + + model_inputs = { + "past_key_values": None, + "attention_mask": None, + "use_cache": None, + } + model_inputs["input_ids"] = all_input_ids + 
model_inputs["position_ids"] = all_position_ids + return model_inputs + + def do_inference(self, packets: List[Packet]): + torch.cuda.set_device(self.local_rank) + + model_inputs = self.prepare_inputs(packets) if self.local_rank == 0 else None + model_inputs = self.broadcast_inputs(model_inputs)[0] + + model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"]) + model_inputs["position_ids"] = torch.tensor(model_inputs["position_ids"]) + + logger.info(f"================= do_inference 0000000000 ===============") + for k, v in model_inputs.items(): + if isinstance(v, torch.Tensor): + model_inputs[k] = v.cuda() + + outputs = self.model(**model_inputs) + + logger.info(f"================= do_inference ===============") + if self.local_rank == 0: + next_tokens_logits = outputs.logits[:, -1, :].contiguous() + input_logits = outputs.logits[..., :-1, :].contiguous() + + return { + "input_logits": input_logits, + "last_logits": next_tokens_logits, + } diff --git a/byte_infer_perf/llm_perf/backends/ILU/gpu_sampler.py b/byte_infer_perf/llm_perf/backends/ILU/gpu_sampler.py new file mode 100644 index 000000000..3075047b6 --- /dev/null +++ b/byte_infer_perf/llm_perf/backends/ILU/gpu_sampler.py @@ -0,0 +1,131 @@ +from typing import Any, Dict, List, Tuple, Union + +import torch + +from llm_perf.core.common import GenerateResult, Packet +from llm_perf.core.sampler import CoreSampler +from llm_perf.utils.logger import logger + + +class GpuSampler(CoreSampler): + def __init__(self) -> None: + super().__init__() + + def sample(self, packets: List[Packet], logits: torch.FloatTensor) -> List[int]: + top_p = [p.request.generate_config.top_p for p in packets] + if all(p == 1.0 for p in top_p): + top_p = None + + top_k = [p.request.generate_config.top_k for p in packets] + if all(k == 0 for k in top_k): + top_k = None + + temperature = [p.request.generate_config.temperature for p in packets] + if all(t == 1.0 for t in temperature): + temperature = None + + ( + sp_input_ids, + 
sp_cu_seqlens, + sp_max_seqlens, + repetition_penalty, + mask_eos_token, + ) = (None, None, 0, None, None) + eos_token_id = [p.request.generate_config.eos_token_id or -1 for p in packets] + + next_tokens, softmax_out = self._sample( + logits.float(), + temperature=temperature, + top_k=top_k, + top_p=top_p, + input_ids=sp_input_ids, + cu_seqlens=sp_cu_seqlens, + max_seqlens=sp_max_seqlens, + repetition_penalty=repetition_penalty, + mask_eos_token=mask_eos_token, + min_tokens_to_keep=1, + eos_token_id=eos_token_id, + ) + + next_tokens = next_tokens.tolist() + + # The aux_data is softmax_out here + return next_tokens, softmax_out + + def _sample( + self, + logits: torch.FloatTensor, + temperature: Union[List[float], torch.FloatTensor] = None, + top_k: Union[List[int], torch.IntTensor] = None, + top_p: Union[List[float], torch.FloatTensor] = None, + input_ids: Union[List[int], torch.IntTensor] = None, + cu_seqlens: Union[List[int], torch.IntTensor] = None, + repetition_penalty: Union[List[float], torch.FloatTensor] = None, + mask_eos_token: Union[List[int], torch.IntTensor] = None, + min_tokens_to_keep: int = 1, + eos_token_id: int = 0, + max_seqlens: int = 0, + ) -> Tuple[List[int], torch.FloatTensor]: + _is_greedy = False + _is_random = False + _is_fastpath = False + + if top_k: + assert all( + k == top_k[0] for k in top_k + ), f"expect the same batch top_k, but got {top_k}" + if all(k == 1 for k in top_k): + _is_greedy = True + elif top_p: + _is_random = True + if all(p == top_p[0] for p in top_p): + _is_fastpath = True + _top_p = top_p[0] + else: + raise RuntimeError( + f"Unsupported sample strategy, parameter top_k: {top_k} top_p: {top_p}" + ) + + if _is_greedy: + return torch.argmax(logits, dim=-1), torch.nn.functional.softmax( + logits, dim=-1 + ) + else: + raise NotImplementedError + + def postprocess( + self, + packets: List[Packet], + infer_outputs: Dict[str, torch.FloatTensor], + next_tokens: List[int], + ) -> List[GenerateResult]: + generate_result = [] + 
for i in range(len(packets)): + token_id = next_tokens[i] + packet = packets[i] + + if token_id == packet.request.generate_config.eos_token_id: + finish_reason = "stop" + elif ( + len(packet.generate_ids) + >= packet.request.generate_config.max_new_tokens + ): + finish_reason = "max_length" + else: + finish_reason = "" + + if packet.request.generate_config.get_input_logits: + last_logits = infer_outputs["last_logits"] + input_logits = infer_outputs["input_logits"] + gen_res = GenerateResult( + token_id=token_id, + finish_reason=finish_reason, + last_logits=last_logits.view(-1).tolist(), + input_logits=input_logits.view(-1).tolist(), + ) + else: + gen_res = GenerateResult(token_id=token_id, finish_reason=finish_reason) + + generate_result.append(gen_res) + + return generate_result diff --git a/byte_infer_perf/llm_perf/backends/ILU/gpu_scheduler.py b/byte_infer_perf/llm_perf/backends/ILU/gpu_scheduler.py new file mode 100644 index 000000000..8d2e73475 --- /dev/null +++ b/byte_infer_perf/llm_perf/backends/ILU/gpu_scheduler.py @@ -0,0 +1,85 @@ +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List + +import torch +from transformers import PreTrainedTokenizer + +import llm_perf.backends.GPU.common as comm +from llm_perf.core.common import Packet +from llm_perf.core.engine import CoreEngine +from llm_perf.core.sampler import CoreSampler +from llm_perf.core.scheduler import CoreScheduler +from llm_perf.utils.logger import logger + + +class GpuScheduler(CoreScheduler): + def __init__( + self, + engine: CoreEngine, + sampler: CoreSampler, + tokenizer: PreTrainedTokenizer, + **kwargs, + ) -> None: + super().__init__( + engine=engine, sampler=sampler, tokenizer=tokenizer, comm=comm, **kwargs + ) + self.max_batch_size = kwargs.get("max_batch_size") + + @torch.inference_mode() + def scheduler_loop(self): + batch: List[Packet] = [] + while True: + # 1. 
get new task + batch = self.select_batch(batch) + if not batch: + with self.packet_queue.not_empty: + self.packet_queue.not_empty.wait(0.1) + continue + + logger.info(f"get batch size: {len(batch)}") + + # 2. do inference -> logits + outputs = self.engine.do_inference(batch) + + logger.info(f"@@@@@@@@@@@@@@ 2. do inference -> logits") + # 3. sample logits -> tokens + next_tokens, softmax_out = self.sampler.sample( + packets=batch, logits=outputs["last_logits"] + ) + + logger.info(f"@@@@@@@@@@@@@@ 4.postprocess -> gen result ") + # 4.postprocess -> gen result + generation_results = self.sampler.postprocess( + packets=batch, + infer_outputs=outputs, + next_tokens=next_tokens, + ) + + logger.info(f"@@@@@@@@@@@@@@ 5. add result to packet ") + # 5. add result to packet + for i, gen_res in enumerate(generation_results): + batch[i].add_result(gen_res) + if gen_res.finish_reason: + batch[i].finish() + + # 6. is not finished -> remain + remained: List[Packet] = [] + for packet in batch: + if not packet.is_finished(): + remained.append(packet) + logger.info(f"@@@@@@@@@@@@@@ 6. 
def setup_scheduler(
    modelcls, model_config: Dict[str, Any], max_batch_size: int, **kwargs
) -> CoreScheduler:
    """Create the scheduler (engine + sampler + tokenizer) for one model.

    Reads ``WORLD_SIZE`` and ``LOCAL_RANK`` from the environment and, when
    more than one process participates, initialises the NCCL process group
    before building the engine.

    Args:
        modelcls: Model class handed to the engine, which loads it via
            ``from_pretrained``.
        model_config: Model description; this function reads ``model_name``
            and ``tokenizer`` (``path``, ``add_sep_token``) and passes the
            whole dict on to the engine.
        max_batch_size: Upper bound on the batch size given to the engine
            and the scheduler.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A ready-to-run ``GpuScheduler``.

    Raises:
        NotImplementedError: ``model_name`` is ``"gpt2"``, which has no
            engine in this backend.
        ValueError: ``model_name`` is not a known model.
    """
    model_name = model_config["model_name"]

    engine_classes = {
        "chatglm": GpuEngineChatGLM,
        "chatglm2": GpuEngineChatGLM2,
        "llama2": GpuEngineChineseLlama2,
    }
    # Validate the model name before any expensive or stateful work (process
    # group init, tokenizer load). The "gpt2" branch used to fall through
    # with no engine bound and crashed later with UnboundLocalError.
    if model_name == "gpt2":
        raise NotImplementedError(
            f"No engine is implemented for model name: {model_name}"
        )
    if model_name not in engine_classes:
        raise ValueError(f"Unknown model name: {model_name}")
    engine_cls = engine_classes[model_name]

    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if world_size > 1:
        torch.distributed.init_process_group(
            backend="nccl", world_size=world_size, rank=local_rank
        )

    tokenizer_path = model_config["tokenizer"]["path"]
    add_sep_token = model_config["tokenizer"]["add_sep_token"]

    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=tokenizer_path,
        local_files_only=True,
        trust_remote_code=True,
    )

    engine = engine_cls(modelcls, model_config, tokenizer, max_batch_size)
    sampler = GpuSampler()

    return GpuScheduler(
        engine=engine,
        sampler=sampler,
        tokenizer=tokenizer,
        add_sep_token=add_sep_token,
        max_batch_size=max_batch_size,
    )
completion_tokens = 0 + per_token_latency = 0 + + ilu_benchmark.benchmark_vllm_ftl(batch_size, input_tokens) + ilu_benchmark.benchmark_vllm(batch_size, input_tokens) + prompt_tokens, completion_tokens, first_token_latency, per_token_latency = ilu_benchmark.getresult() + + result = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "first_token_latency": first_token_latency, + "per_token_latency": per_token_latency, + } + logger.info(f"prompt response: {result}") + result_queue.put(result) + + time.sleep(1) + result_queue.put(None) + + + def benchmark( index: int, workload: Dict[str, Any], @@ -141,8 +170,11 @@ def benchmark( input_tokens: int, result_queue: mp.Queue, args, + batch_size: int, + backend_type: int, + ilu_benchmark: ILUbenchmark, ): - logger.debug(f"{report_type.name} bench_{index} start") + logger.info(f"{report_type.name} bench_{index} start") with grpc.insecure_channel(f"{args.host}:{args.port}") as channel: stub = server_pb2_grpc.InferenceStub(channel) @@ -152,7 +184,11 @@ def benchmark( if report_type == ReportType.ACCURACY: bench_accuracy(stub, workload, result_queue) elif report_type == ReportType.PERFORMANCE: - bench_performance(stub, index, workload, input_tokens, result_queue) + logger.info(f" ========= report_type == ReportType.PERFORMANCE backend_type: {backend_type}") + #lzh add for 0328 + if backend_type != 'ILU': + logger.info(f" ============ backend_type != 'ILU'") + bench_performance(stub, index, workload, input_tokens, result_queue) except Exception as e: logger.error(f"{report_type.name} bench_{index} error: {e}") raise e diff --git a/byte_infer_perf/llm_perf/core/perf_engine.py b/byte_infer_perf/llm_perf/core/perf_engine.py index de86647b4..6a2c1d5dc 100644 --- a/byte_infer_perf/llm_perf/core/perf_engine.py +++ b/byte_infer_perf/llm_perf/core/perf_engine.py @@ -28,10 +28,11 @@ sys.path.insert(0, BYTE_MLPERF_ROOT) -from llm_perf.benchmark.bench import benchmark +from llm_perf.benchmark.bench import 
benchmark, bench_performance_ILU from llm_perf.utils.logger import logger, setup_logger from llm_perf.utils.reporter import Reporter, ReportType +from llm_perf.backends.ILU.benchmark import ILUbenchmark def get_args(): """Parse commandline.""" @@ -98,6 +99,11 @@ def __init__(self) -> None: self.result_queue = mp.Queue() self.jobs: List[mp.Process] = [] self.server_process = None + + #lzh modify for 0328 + self.batch_size = 1 + self.ilu_benchmark = None + def __del__(self): self.stop_server() @@ -112,7 +118,7 @@ def start_server(self, tp_size: int, batch_size: int): "torchrun", "--master_port", "19999", - "--nproc-per-node", + "--nproc_per_node", str(tp_size), "llm_perf/launch.py", "--task", @@ -151,6 +157,7 @@ def stop_server(self): # logger already exit print(f"server process already exit with {self.server_process.poll()}") + def start_benchmark( self, workload: Dict[str, Any], @@ -158,21 +165,44 @@ def start_benchmark( input_tokens: int, report_type: ReportType, ): - clients = 1 if report_type == ReportType.ACCURACY else batch_size - for i in range(clients): - p = mp.Process( - target=benchmark, - args=( - i, - workload, - report_type, - input_tokens, - self.result_queue, - self.args, - ), - ) - self.jobs.append(p) - p.start() + + #lzh add for 0328 + if not self.isILU(self.backend_type): + clients = 1 if report_type == ReportType.ACCURACY else batch_size + else: + self.batch_size = batch_size + clients = 1 + max_tokens = workload['max_new_tokens'] + if report_type == ReportType.PERFORMANCE and self.ilu_benchmark == None: + self.ilu_benchmark = ILUbenchmark(self.batch_size, workload, input_tokens, self.result_queue, max_tokens) + + if self.isILU(self.backend_type) and report_type == ReportType.PERFORMANCE: + pass + else: + for i in range(clients): + p = mp.Process( + target=benchmark, + args=( + i, + workload, + report_type, + input_tokens, + self.result_queue, + self.args, + self.batch_size, + self.backend_type + ,self.ilu_benchmark + ), + ) + self.jobs.append(p) 
+ p.start() + + + def isILU(self, backend_type): + return backend_type == 'ILU' + + def isAccuracyTest(self, report_type): + return report_type == ReportType.ACCURACY def run_perf( self, @@ -182,14 +212,25 @@ def run_perf( input_tokens: int, report_type: ReportType, ) -> None: - # 1. Start server - self.start_server(tp_size, batch_size) - + + if self.isILU(self.backend_type) and not self.isAccuracyTest(report_type): + pass + else: + # 1. Start server + self.start_server(tp_size, batch_size) # 2. Benchmark clients self.start_benchmark(workload, batch_size, input_tokens, report_type) # 3. Get result - alive_clients = batch_size if report_type == ReportType.PERFORMANCE else 1 + #lzh add for 0328 start + if self.backend_type != 'ILU': + alive_clients = batch_size if report_type == ReportType.PERFORMANCE else 1 + else: + alive_clients = 1 + if report_type == ReportType.PERFORMANCE: + self.result_queue.put("@start") + #lzh add for 0328 stop + started: bool = False while alive_clients: result = self.result_queue.get() @@ -197,19 +238,28 @@ def run_perf( if not started: # Reset reporter mate information self.reporter.update_meta(tp_size, batch_size, input_tokens) + if self.backend_type == 'ILU' and report_type == ReportType.PERFORMANCE: + bench_performance_ILU(self.ilu_benchmark, batch_size, input_tokens, self.result_queue) started = True continue elif result is None: alive_clients = alive_clients - 1 continue + + self.reporter.submit(result, report_type) + logger.info(f"run_perf batch_size:{batch_size}") + # 4. Join benchmark client process for p in self.jobs: p.join() - # 5. Kill server process - self.stop_server() + if self.isILU(self.backend_type) and not self.isAccuracyTest(report_type): + pass + else: + # 5. 
Kill server process + self.stop_server() def start_engine(self) -> None: """ @@ -265,6 +315,7 @@ def start_engine(self) -> None: for tp_size in workload["tp_sizes"]: for batch_size in workload["batch_sizes"]: for input_tokens in workload["input_tokens"]: + logger.info(f"***** start_engine batch_size:{batch_size} input_tokens:{input_tokens} *******") self.run_perf( workload, tp_size, diff --git a/byte_infer_perf/llm_perf/datasets/test_mini.csv b/byte_infer_perf/llm_perf/datasets/test_mini.csv index 7d38bff59..abc075246 100644 --- a/byte_infer_perf/llm_perf/datasets/test_mini.csv +++ b/byte_infer_perf/llm_perf/datasets/test_mini.csv @@ -1,2 +1,6 @@ id,question,A,B,C,D 0,最早向中国介绍西方进化论的是____,严复,梁启超,康有为,谭嗣同 +0,“大将筹边尚未还,湖湘子弟满天山。新栽杨柳三千里,引得春风度玉关。”这首诗颂扬了一位清代名将率军收复新疆、治理边疆的业绩。这位名将是____。,林则徐,左宗棠,邓世昌,丁汝昌 +0,巴黎公社为国际社会主义运动提供的最主要的经验是:____,同强大的敌人勇于斗争,联合农民建立工农联盟,用暴力手段推翻资产阶级统治,建立无产阶级专政,清除败类,纯洁革命队伍 +0,“五证合一、一照一码”是指企业进行登记后,由市场监督管理部门核发一个加载法人和其他组织____营业执照的登记制度。,组织机构代码,统一社会信用代码,纳税人识别号,社会保险代码 +0,下列不属于动作要素的内容是____,动作轨迹,动作时间,动作速度,动作方向 diff --git a/byte_infer_perf/llm_perf/model_zoo/chatglm.py b/byte_infer_perf/llm_perf/model_zoo/chatglm.py index 43b197a22..62b39bd83 100644 --- a/byte_infer_perf/llm_perf/model_zoo/chatglm.py +++ b/byte_infer_perf/llm_perf/model_zoo/chatglm.py @@ -902,7 +902,7 @@ def __init__(self, config: ChatGLMConfig, empty_init=True): self.layernorm_epsilon = config.layernorm_epsilon self.inner_hidden_size = config.inner_hidden_size self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads - self.position_encoding_2d = config.position_encoding_2d + self.position_encoding_2d = False#config.position_encoding_2d self.pre_seq_len = config.pre_seq_len self.prefix_projection = config.prefix_projection @@ -1520,4 +1520,4 @@ def quantize(self, bits: int, empty_init=False, **kwargs): self.config.quantization_bit = bits self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs) - return self \ No newline at end of file + return self 
diff --git a/byte_infer_perf/llm_perf/utils/logger.py b/byte_infer_perf/llm_perf/utils/logger.py index 406bce7eb..ebb9838ce 100644 --- a/byte_infer_perf/llm_perf/utils/logger.py +++ b/byte_infer_perf/llm_perf/utils/logger.py @@ -6,7 +6,7 @@ def setup_logger(loglevel: str): fmt = logging.Formatter( - fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s", + fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s] [%(process)d]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) handler = logging.StreamHandler(stream=sys.stdout) diff --git a/byte_infer_perf/llm_perf/utils/reporter.py b/byte_infer_perf/llm_perf/utils/reporter.py index 38e5ece6d..d963dc814 100644 --- a/byte_infer_perf/llm_perf/utils/reporter.py +++ b/byte_infer_perf/llm_perf/utils/reporter.py @@ -180,6 +180,11 @@ def _calc_performance(self): performance["Token Throughput"] = completion_tokens / ( self.last_submit_time - self.start_time ) + + logger.info( + f"completion_tokens={completion_tokens}, use time={self.last_submit_time - self.start_time}" + ) + performance["Request Number"] = self.request performance["QPS"] = self.request / (self.last_submit_time - self.start_time) diff --git a/byte_infer_perf/llm_perf/workloads/chatglm-torch-fp16-6b.json b/byte_infer_perf/llm_perf/workloads/chatglm-torch-fp16-6b.json index c0b33c0e3..6decbe41e 100644 --- a/byte_infer_perf/llm_perf/workloads/chatglm-torch-fp16-6b.json +++ b/byte_infer_perf/llm_perf/workloads/chatglm-torch-fp16-6b.json @@ -1,12 +1,12 @@ { "model": "chatglm-torch-fp16-6b", - "test_accuracy": true, + "test_accuracy": false, "test_perf": true, "min_new_tokens": 128, "max_new_tokens": 256, - "tp_sizes": [1, 2], - "batch_sizes":[1, 2, 4, 8], - "input_tokens": [1024, 2048], + "tp_sizes": [1], + "batch_sizes":[64], + "input_tokens": [256], "dataset": "llm_perf/datasets/merged_52_test.csv", "perf_time": 180 -} \ No newline at end of file +} diff --git a/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json 
b/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json index 4739126b8..0bdc94a2a 100644 --- a/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json +++ b/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json @@ -1,12 +1,12 @@ { "model": "chatglm2-torch-fp16-6b", - "test_accuracy": false, + "test_accuracy": true, "test_perf": true, "min_new_tokens": 128, "max_new_tokens": 256, - "tp_sizes": [1, 2], - "batch_sizes":[1, 2, 4, 8], - "input_tokens": [1024, 2048], - "dataset": "llm_perf/datasets/merged_52_test.csv", + "tp_sizes": [1], + "batch_sizes":[1,2,4,8], + "input_tokens": [256,1024], + "dataset": "llm_perf/datasets/test_mini.csv", "perf_time": 180 -} \ No newline at end of file +} diff --git a/byte_infer_perf/llm_perf/workloads/chinese-llama2-torch-fp16-13b.json b/byte_infer_perf/llm_perf/workloads/chinese-llama2-torch-fp16-13b.json index 5113c543b..bf34a1876 100644 --- a/byte_infer_perf/llm_perf/workloads/chinese-llama2-torch-fp16-13b.json +++ b/byte_infer_perf/llm_perf/workloads/chinese-llama2-torch-fp16-13b.json @@ -1,12 +1,12 @@ { "model": "chinese-llama2-torch-fp16-13b", - "test_accuracy": false, + "test_accuracy": true, "test_perf": true, "min_new_tokens": 128, "max_new_tokens": 256, - "tp_sizes": [1, 2], - "batch_sizes":[1, 2, 4, 8], - "input_tokens": [1024, 2048], - "dataset": "llm_perf/datasets/merged_52_test.csv", + "tp_sizes": [1], + "batch_sizes":[1,2,4,8], + "input_tokens": [256,1024], + "dataset": "llm_perf/datasets/test_mini.csv", "perf_time": 180 -} \ No newline at end of file +} diff --git a/byte_micro_perf/backends/iluvatar/backend_iluvatar.py b/byte_micro_perf/backends/iluvatar/backend_iluvatar.py new file mode 100644 index 000000000..d9d6b0767 --- /dev/null +++ b/byte_micro_perf/backends/iluvatar/backend_iluvatar.py @@ -0,0 +1,180 @@ +# Copyright 2023 ByteDance and/or its affiliates. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import math
import os
from datetime import timedelta
from typing import Any, Dict, List

import torch
import torch.distributed as dist
import torch.distributed.distributed_c10d as dist_c10d
from backends.backend import Backend
from backends.module_store import *

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("PerfEngine")


class Backendiluvatar(Backend):
    """Micro-benchmark backend that runs ops through ``torch.cuda`` and NCCL.

    Each op-selector method (``gemm``, ``add``, ...) stores the operation to
    benchmark in ``self.op``. The collective selectors additionally (re)build
    the process group via ``setup_2d_group`` and pass it to the op.

    Attributes such as ``vendor_path`` and ``iterations`` are read but not
    assigned in this class -- presumably provided by ``Backend``; verify
    against the base class.
    """

    def get_device_name(self):
        """Return the name torch reports for CUDA device 0."""
        return torch.cuda.get_device_name(0)

    def get_backend_properties(self):
        """Record device memory size and, if available, vendor bandwidth.

        Sets ``self.memory_limit`` to the total memory of device 0 in whole
        GiB (truncated). When ``self.vendor_path`` is an existing ``.json``
        file it is loaded into ``self.hw_info_dict`` and
        ``self.bandwidth_limit`` is taken from it. Otherwise only a warning
        is logged and neither attribute is assigned by this method.

        Raises:
            KeyError: if the vendor JSON lacks the expected bandwidth entry.
        """
        self.memory_limit = int(
            torch.cuda.get_device_properties(0).total_memory / (1024**3)
        )

        if os.path.exists(self.vendor_path) and self.vendor_path.endswith(".json"):
            # The vendor file uses non-ASCII keys, so decode it as UTF-8
            # explicitly instead of relying on the process locale.
            with open(self.vendor_path, "r", encoding="utf-8") as f:
                self.hw_info_dict = json.load(f)
                # if the vendor path does not exist, please set this param manually
                self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"]
        else:
            log.warning(
                "Vendor_path: [ {} ] was not found or not a full path points to json, please check your path!!! Otherwise, please set the hardware info manaually.".format(
                    self.vendor_path
                )
            )

    # ---- op selectors: each one stores the op under test in ``self.op`` ----

    def gemm(self):
        self.op = GemmOp()

    def add(self):
        self.op = AddOp()

    def sin(self):
        self.op = SinOp()

    def cos(self):
        self.op = CosOp()

    def exp(self):
        self.op = ExpOp()

    def exponential(self):
        self.op = ExponentialOp()

    def gelu(self):
        self.op = GeluOp()

    def sort(self):
        self.op = SortOp()

    def unique(self):
        self.op = UniqueOp()

    def indexadd(self):
        self.op = IndexAddOp()

    def softmax(self):
        self.op = SoftmaxOp()

    def layernorm(self):
        self.op = LayerNormOp()

    # ---- collective selectors: rebuild the group, then hand it to the op ----

    def allreduce(self):
        self.setup_2d_group()
        self.op = AllReduceOp(self.group)

    def allgather(self):
        self.setup_2d_group()
        self.op = AllGatherOp(self.group)

    def reducescatter(self):
        self.setup_2d_group()
        self.op = ReduceScatterOp(self.group)

    def alltoall(self):
        self.setup_2d_group()
        self.op = AllToAllOp(self.group)

    # ---- host/device transfer selectors ----

    def host2device(self):
        self.op = Host2DeviceOp(torch.device("cuda"))

    def device2host(self):
        self.op = Device2HostOp()

    def build_tensor(self, input_shapes, dtype):
        """Create the input tensor sets used to benchmark ``self.op``.

        Args:
            input_shapes: iterable of shapes; one tensor per shape is created
                for every input set.
            dtype: name of a ``torch`` dtype attribute, e.g. ``"float16"``.

        Returns:
            ``(input_tensors_list, data_cnt)`` where ``input_tensors_list``
            holds ``data_cnt`` input sets (each passed through
            ``self.op.process_inputs`` when the op defines it) and
            ``data_cnt`` is at least 1 and at most ``self.iterations``
            (unless ``self.iterations`` is below 1).
        """
        torch_type = getattr(torch, dtype)
        # element_size() works for every dtype; finfo/iinfo each cover only
        # floating-point or integer types respectively.
        dtype_size = torch.empty((), dtype=torch_type).element_size()
        size = sum(math.prod(shape) for shape in input_shapes)
        # NOTE(review): the factor 2 presumably reserves room for outputs -- confirm.
        data_amount = size * 2 * dtype_size

        if data_amount > 0:
            # Keep 4 GiB of the reported device memory out of the budget.
            data_cnt = (self.memory_limit - 4) * 1024**3 // data_amount
        else:
            # Zero-sized inputs need no memory; only the iteration count limits them.
            data_cnt = self.iterations
        # Always build at least one set so the returned count matches the list.
        data_cnt = max(1, min(data_cnt, self.iterations))

        device = torch.device("cuda")
        input_tensors_list = [
            [torch.randn(shape).type(torch_type).to(device) for shape in input_shapes]
            for _ in range(data_cnt)
        ]

        if hasattr(self.op, "process_inputs"):
            input_tensors_list = [
                self.op.process_inputs(*input_tensors)
                for input_tensors in input_tensors_list
            ]

        return input_tensors_list, data_cnt

    def _run_operation(self, operation, inputs):
        """Call ``operation`` with ``inputs`` unpacked and return its result."""
        result = operation(*inputs)
        return result

    def device_synchronize(self):
        """Block until queued CUDA work has finished; always returns True."""
        torch.cuda.synchronize()
        return True

    def initialize_ccl(self, rank, world_size):
        """
        initialize distributed process groups and relevant ENVs

        Overwrites MASTER_ADDR, MASTER_PORT, LOCAL_RANK, RANK and WORLD_SIZE
        in ``os.environ``, binds this process to CUDA device ``rank``, and
        initialises the NCCL process group. The init timeout is read from
        ``MEGATRON_NCCL_TIMEOUT_SECOND`` (seconds, default 30).
        """
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = "49373"
        os.environ["LOCAL_RANK"] = str(rank)
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

        torch.cuda.set_device(rank)
        # Call the init process
        timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30))
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=world_size,
            rank=rank,
            store=None,
            timeout=timedelta(seconds=timeout_seconds),
        )
        self.setup_2d_group()
        log.warning("DIST: rank {}, world_size {}".format(rank, world_size))

    def setup_2d_group(self):
        """Create a process group spanning all ranks and store it in ``self.group``.

        Also records ``self.rank`` / ``self.world_size`` and binds the process
        to CUDA device ``self.rank``. Requires an initialised default process
        group.
        """
        self.rank = dist.get_rank()
        torch.cuda.set_device(self.rank)
        self.world_size = dist.get_world_size()
        ranks = list(range(self.world_size))

        # Temporarily replace torch's private store-based barrier with a
        # no-op while the group is created (presumably so new_group does not
        # block on it -- confirm). Restore it even if new_group raises, so the
        # patch cannot leak into later torch.distributed calls.
        origin_store_based_barrier = dist_c10d._store_based_barrier
        dist_c10d._store_based_barrier = lambda *a, **kw: None
        try:
            group = dist.new_group(ranks)
        finally:
            dist_c10d._store_based_barrier = origin_store_based_barrier

        if self.rank in ranks:
            self.group = group
        # wait for all ranks finish group initializing
        torch.distributed.barrier()
"HBM2e", + "内存容量(GB)": 64, + "内存带宽(GB/s)": 1126.4 + } + }, + "算力参数": { + "PE层次架构图": null, + "PE参数": { + "算力架构": "同构众核", + "并行方式": "SIMT", + "通信带宽(GB/s)": null + }, + "标量参数": { + "标量精度": null, + "INT8标量算力(TFLOPS)": null, + "FP16标量算力(TFLOPS)": null, + "FP32标量算力(TFLOPS)": null + }, + "张量参数": { + "张量精度": "FP32, FP16, BF16, INT8", + "INT8张量算力(TFLOPS)": 590.0, + "FP16张量算力(TFLOPS)": 196.0, + "FP32张量算力(TFLOPS)": 48.0 + } + }, + "卡间通信参数": { + "通信方式": "PCIe", + "端口数量": null, + "RDMA协议": null, + "下行带宽(GB/s)": null, + "上行带宽(GB/s)": null + } +}