diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..63e9669 Binary files /dev/null and b/.DS_Store differ diff --git a/ai-compiler/.DS_Store b/ai-compiler/.DS_Store new file mode 100644 index 0000000..a843f83 Binary files /dev/null and b/ai-compiler/.DS_Store differ diff --git a/ai-compiler/Treebeard/.DS_Store b/ai-compiler/Treebeard/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/ai-compiler/Treebeard/.DS_Store differ diff --git a/ai-framework/.DS_Store b/ai-framework/.DS_Store new file mode 100644 index 0000000..3512a2d Binary files /dev/null and b/ai-framework/.DS_Store differ diff --git a/ai-framework/huggingface-transformers/FSDP.md b/ai-framework/huggingface-transformers/FSDP.md index 690c6a2..e54a24f 100644 --- a/ai-framework/huggingface-transformers/FSDP.md +++ b/ai-framework/huggingface-transformers/FSDP.md @@ -3,8 +3,6 @@ - https://pytorch.org/docs/stable/fsdp.html - - - https://huggingface.co/docs/accelerate/usage_guides/fsdp diff --git a/ai-framework/pai-megatron-patch/.DS_Store b/ai-framework/pai-megatron-patch/.DS_Store new file mode 100644 index 0000000..cbf9ce2 Binary files /dev/null and b/ai-framework/pai-megatron-patch/.DS_Store differ diff --git a/ai-framework/vllm/.DS_Store b/ai-framework/vllm/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/ai-framework/vllm/.DS_Store differ diff --git a/ai-infra/.DS_Store b/ai-infra/.DS_Store new file mode 100644 index 0000000..0ae4851 Binary files /dev/null and b/ai-infra/.DS_Store differ diff --git a/ai-infra/ai-hardware/.DS_Store b/ai-infra/ai-hardware/.DS_Store new file mode 100644 index 0000000..49320e7 Binary files /dev/null and b/ai-infra/ai-hardware/.DS_Store differ diff --git "a/ai-infra/\347\275\221\347\273\234/.DS_Store" "b/ai-infra/\347\275\221\347\273\234/.DS_Store" new file mode 100644 index 0000000..bc1f5a3 Binary files /dev/null and "b/ai-infra/\347\275\221\347\273\234/.DS_Store" differ diff --git "a/ai-infra/\347\275\221\347\273\234/pic/.DS_Store" "b/ai-infra/\347\275\221\347\273\234/pic/.DS_Store" new file mode 100644 index 0000000..5008ddf Binary files /dev/null and "b/ai-infra/\347\275\221\347\273\234/pic/.DS_Store" differ diff --git a/blog/.DS_Store b/blog/.DS_Store new file mode 100644 index 0000000..5759da0 Binary files /dev/null and b/blog/.DS_Store differ diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000..04a503d Binary files /dev/null and b/docs/.DS_Store differ diff --git a/docs/llm-base/.DS_Store b/docs/llm-base/.DS_Store new file mode 100644 index 0000000..997257e Binary files /dev/null and b/docs/llm-base/.DS_Store differ diff --git a/docs/llm-base/distribution-parallelism/.DS_Store b/docs/llm-base/distribution-parallelism/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docs/llm-base/distribution-parallelism/.DS_Store differ diff --git a/docs/llm-base/distribution-training/.DS_Store b/docs/llm-base/distribution-training/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docs/llm-base/distribution-training/.DS_Store differ diff --git a/llm-algo/.DS_Store b/llm-algo/.DS_Store new file mode 100644 index 0000000..e7b3749 Binary files /dev/null and b/llm-algo/.DS_Store differ diff --git a/llm-inference/.DS_Store b/llm-inference/.DS_Store new file mode 100644 index 0000000..a0535d7 Binary files /dev/null and b/llm-inference/.DS_Store differ diff --git a/llm-inference/ascend/.DS_Store b/llm-inference/ascend/.DS_Store new file mode 
100644 index 0000000..8779145 Binary files /dev/null and b/llm-inference/ascend/.DS_Store differ diff --git a/llm-inference/ascend/mindformers/.DS_Store b/llm-inference/ascend/mindformers/.DS_Store new file mode 100644 index 0000000..0ff48be Binary files /dev/null and b/llm-inference/ascend/mindformers/.DS_Store differ diff --git a/llm-localization/.DS_Store b/llm-localization/.DS_Store new file mode 100644 index 0000000..177bdb5 Binary files /dev/null and b/llm-localization/.DS_Store differ diff --git a/llm-localization/ascend/.DS_Store b/llm-localization/ascend/.DS_Store new file mode 100644 index 0000000..8441325 Binary files /dev/null and b/llm-localization/ascend/.DS_Store differ diff --git a/llm-localization/ascend/mindformers/.DS_Store b/llm-localization/ascend/mindformers/.DS_Store new file mode 100644 index 0000000..f76bd0f Binary files /dev/null and b/llm-localization/ascend/mindformers/.DS_Store differ diff --git a/llm-localization/ascend/mindie/.DS_Store b/llm-localization/ascend/mindie/.DS_Store new file mode 100644 index 0000000..88aafe1 Binary files /dev/null and b/llm-localization/ascend/mindie/.DS_Store differ diff --git a/llm-localization/ascend/mindie/config/.DS_Store b/llm-localization/ascend/mindie/config/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/llm-localization/ascend/mindie/config/.DS_Store differ diff --git a/llm-localization/ascend/mindie/mindie-1.0-baichuan2-13b.md b/llm-localization/ascend/mindie/mindie-1.0-baichuan2-13b.md deleted file mode 100644 index d651733..0000000 --- a/llm-localization/ascend/mindie/mindie-1.0-baichuan2-13b.md +++ /dev/null @@ -1,461 +0,0 @@ -[TOC] - -# BaiChuan2-13B模型-推理指导 - -# 概述 - -BaiChuan 2 是百川智能推出的新一代开源大语言模型,采用 2.6 万亿 Tokens 的高质量语料训练,在权威的中文和英文 -benchmark上均取得同尺寸最好的效果。本次发布包含有 7B、13B 的 Base 和 Chat 版本,并提供了 Chat 版本的 4bits -量化,所有版本不仅对学术研究完全开放,开发者也仅需邮件申请并获得官方商用许可后,即可以免费商用。 - -- 参考实现: - - ``` - https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat - ``` - -# 快速上手 - -## 路径变量解释 - -| 变量名 | 含义 | -|---------------------|----------------------------------------------------------------------| -| model_download_path | 开源权重放置目录 | -| llm_path | 加速库及模型库下载后放置目录 | -| model_path | 工作时模型所在的目录,可以和model_download_path相同,但一般模型是公共的,为了避免影响其他用户,单独建一个模型工作目录 | -| script_path | 工作脚本所在路径,本文为${llm_path}/pytorch/examples/baichuan2/13b | -| ceval_work_dir | ceval数据集、及结果保存所在目录,不必和模型脚本在相同目录 | - -## 获取源码及依赖 - -### 1.python requirements - -| 包名 | 推荐版本 | -|---------------|--------| -| transformers | 4.30.2 | -| decorator | 5.1.1 | -| sympy | 1.11.1 | -| scipy | 1.11.3 | -| attrs | 23.1.0 | -| psutil | 5.9.6 | -| sentencepiece | 0.1.99 | - -### 下载模型权重 - -下载模型权重,放置到自定义`${model_download_path}` (请下载链接中'Files and versions'页签下的所有文件) - -``` -https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/tree/main -``` - -### 拷贝文件 - -### 准备 - -#### 1. 将开源模型拷贝到模型工作目录,bin文件使用软链接即可,同时将modeling文件拷贝到模型,并修改开源的config.json, - -```shell -cd ${model_path} -cp ${model_download_path}/*.py ${model_path}/ -cp ${model_download_path}/*.json ${model_path}/ -cp ${model_download_path}/*.model ${model_path}/ -cp -s ${model_download_path}/*.bin ${model_path}/ -``` - -#### 2. 安装 atb_speed_sdk - -```shell -cd ${llm_path}/pytorch/examples/atb_speed_sdk -pip install . -``` - -#### 3. 
张量并行模型切分(仅在模型需要多卡并行时使用) - -```shell -cp ${script_path}/modeling_baichuan_cut.py ${model_path} -``` - -修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_baichuan_cut.BaichuanForCausalLM"` - -```text -修改`${script_path}/cut_model_and_run.sh` -将 `input_dir` 修改为模型所在路径 `${model_path}` -将 `output_dir` 修改为切分后的模型所存储的路径,比如仍为原目录 `${model_path}`。模型切分成功后,会自动生成新目录part_model(用户无需新建该文件夹),即:${model_path/part_model} -将 `world_size_` 修改成希望切成的卡的数量 -``` - -目录结构示例建议 - -``` ---model_path - *.py(模型源文件) - *.json(模型源文件) - *.model(模型源文件) - *.bin(模型源文件,软链接) - modeling_baichuan_cut.py(权重切分脚本) - --part_model(权重切分成功后文件夹) - --0 - --1 - ......(其他) ---script_path - cut_model_and_run.sh - cut_model_util.py - main.py - config.ini - ......(其他) -``` - -执行 - -```shell -cd ${script_path} -bash cut_model_and_run.sh -``` - -切分所需时间较长,切分完成后,将会打印 'Tensor parallelism weights have been successfully saved.'。 - -#### 4.修改config.json配置 - -- 单卡运行时**必须**修改 -- 多卡运行时,会在切分阶段会自动修改,没有定制的情况下,可以不操作 - -##### 单卡 -拷贝修改后的modeling -```shell -cp ${script_path}/modeling_baichuan_ascend.py ${model_path} -``` -修改${model_path}/config.json中的kv对,改成 - -``` -AutoModelForCausalLM": "modeling_baichuan_ascend.BaichuanForCausalLM -``` - -##### 多卡 - -修改 -${model_path}/part_model/{rank_id}/config.json中的kv对,改成 - -``` -AutoModelForCausalLM": "modeling_baichuan_ascend.BaichuanForCausalLM -``` - -# CPU高性能模式 - -可开启CPU Performance模式以提高模型推理性能。 - -``` -cpupower frequency-set -g performance -``` - -### 执行推理 - -#### 修改 ${script_path}/config.ini - -[config文件配置参考](../../atb_speed_sdk/README.md) -提示:多卡并行推理时,config.json中model_path路径为part_model父文件夹。例如: - -``` -# 正确示例: -model_path=../model -# 错误示例: -model_path=../model/part_model -``` - -#### main.py - -提供了demo推理,精度测试,性能测试三种下游任务。 -task_name可选inference、precision、performance。 -is_quant代表是否量化(0代表浮点,1代表量化),本节为浮点推理,设置为0即可。 - -- 单芯 - 修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_baichuan_ascend.BaichuanForCausalLM"` - -```shell -python main.py --task ${task_name} --is_quant ${is_quant} -``` - -- 多芯 - -```shell -bash cut_model_and_run.sh ${task_name} ${is_quant} -``` - -#### FAQ - -1. **可以使用 MAX_SEQ_LEN 环境变量来设置model支持的最大长度以优化显存占用, - 一般设置为最大输入输出token之和,默认使用config里面的max_model_length** - 如 - -```shell -MAX_SEQ_LEN=2048 python main.py --task ${task_name} --is_quant ${is_quant} -``` - -或 -修改cut_model_and_run.sh 中的 max_seq_length - -```shell -bash cut_model_and_run.sh ${task_name} ${is_quant} -``` - -2. 
ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block - -如果遇到 - -```text -Traceback (most recent call last): - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/__init__.py", line 31, in - import torch_npu.npu - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/__init__.py", line 46, in - from .utils import (is_initialized, _lazy_call, _lazy_init, init, set_dump, - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/utils.py", line 27, in - import torch_npu._C -ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block -Segmentation fault (core dumped) -``` - -则在命令行前加上`LD_PRELOAD=上面的error路径`。如 - -```shell -LD_PRELOAD=/root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1 MAX_SEQ_LEN=2048 python main.py --task ${task_name} --is_quant ${is_quant} -``` - - -## 量化推理 - -# 量化工具使用 - -量化权重的获取需要使用大模型量化工具(集成至CANN包中),详细操作手册可见[大模型权重量化工具-ModelSlim](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/devtools/auxiliarydevtool/modelslim_0001.html) -。针对Baichuan2-13B的权重量化可参考如下步骤,运行时需将下述三个步骤的代码整合为一个python文件 - -**特别注意1**:本章节依赖**pytorch >= 2.0.0 CANN >= 7.0.0.B060** -环境,大模型量化工具依赖指定pytorch版本(不依赖torch_npu,只依赖原生torch)。该环境的pytorch版本与后续步骤可能不同,后续将优化pytorch版本依赖的限制 - -**特别注意2**:本章节依赖 hugging face 的标准 transformers 包。若环境中的 transformers 包被改动过,可能引起相关报错,此时建议重新安装 -transformers 包 - -**特别注意3**:本章节执行完毕后,在`QUANT_WEIGHT_PATH`路径下生成如下权重文件,请检查是否缺失: - -``` -deq_scale.npy fp_bias.npy -input_offset.npy input_scale.npy -quant_bias.npy quant_weight.npy -weight_offset.npy weight_scale.npy -``` - -## 校准数据准备 - -```python -calib_list = ["中国的首都在哪里?", - "请做一首诗歌:", - "我想要学习python,该怎么学习?", - "请帮我写一篇关于大模型推理优化的任职报告:", - "中国最值得去的几个景点"] - - -# 获取校准数据函数定义 -def get_calib_dataset(tokenizer, calib_list): - calib_dataset = [] - for calib_data in calib_list: - inputs = tokenizer([calib_data], return_tensors='pt').to('cpu') - print(inputs) - calib_dataset.append([inputs.data['input_ids'], inputs.data['position_ids'], inputs.data['attention_mask']]) - return calib_dataset - - -dataset_calib = get_calib_dataset(tokenizer, calib_list) # 校准数据获取 -``` - -## 量化参数配置与运行 - -```python -from modelslim.pytorch.llm_ptq.llm_ptq_tools import Calibrator, QuantConfig - -quant_config = QuantConfig(w_bit=8, disable_names=['transformer.output_layer'], dev_type='cpu', act_method=3, pr=0.5, - mm_tensor=False, w_hessian=False) -calibrator = Calibrator(model, quant_config, calib_data=dataset_calib, disable_level='L1') -calibrator.run() # 执行PTQ量化校准 -calibrator.save('QUANT_WEIGHT_PATH') # 保存量化参数 -``` - -- 建议直接使用量化权重生成脚本,生成量化权重 - ``` - python quant.py - ``` - -> 注:要使用torch2.0.0导出量化权重,否则会有精度偏差 quant.py脚本需要修改calibrator.save('QUANT_WEIGHT_PATH') 最终量化全的指定路径 - -2. 量化权重切分 - -- 修改代码 - 1. 修改`cut_quant_model_util.py`中`--input_path`为实际存放量化权重的路径 - 2. 修改`cut_quant_model_util.py`中`--output_dir`为自定义路径,用于存放切分后的模型量化权重 -- 执行切分 - ``` - python cut_quant_model_util.py - # 切分好的模型权重会存放在自定义的output_dir - ``` - -3. 适配量化推理代码 - -- 进入modeling_baichuan_quant_parallel.py,适配量化权重路径和回退层 - ``` - # 修改以下全局变量 - self.quant_weight_path = '/code/models/baichuan2/quanted_weight_cut_1123_' 量化切分权重路径 及上一步的output_dir - self.cut_float_weight = '' 浮点切分权重路径 - self.roll_back_layer = [0,1,2,3,4,7,9,10,17,18,19,20,22,23,24,26,36,37,38,39] - ``` - **特别注意**:此处的self.roll_back_layer必须与quant.py里面的disable_idx_lst 保持一致 - -4. 
执行量化模型推理 - - ``` - 单独推理 - bash cut_model_and_run_baichuan.sh inference 1 - 精度 - bash cut_model_and_run_baichuan.sh precision 1 - 性能 - bash cut_model_and_run_baichuan.sh performance 1 - - 具体参考atb_speed_sdk 使用README.md - ``` - -**特别注意 ** - -# 竞品对比 - -# 800T A2 - -## 精度 - -| 精度 | NPU | GPU | 对比 | -|----------------|-------------|-------------|----| -| STEM | 0.472093023 | 0.472093023 | 1 | -| Social Science | 0.661818182 | 0.661818182 | 1 | -| Humanities | 0.630350195 | 0.630350195 | 1 | -| Other | 0.567708333 | 0.567708333 | 1 | -| Avg acc | 0.568350669 | 0.568350669 | 1 | - -## 性能 - -| 芯片型号 | 首token推理速度(token/s) | 比例 | 增量推理速度(token/s) | 对比 | -|-------------------------------|---------------------|-------------|-------------------|-------------| -| Baichuan-13B NPU | 14.260809086490132 | | 31.69616807901823 | | -| Baichuan-13B A100(80G) NVlink | 15.642417690338782 | 0.911675508 | 36.41638939692089 | 0.870381952 | - -# 300I DUO - -## 性能 - -浮点 - -| 硬件形态 | 批大小 | 输入长度 | 输出长度 | 首次推理(ms/token) | 非首次推理(ms/token) | -|-------|-----|----------|----------|----------------|-----------------| -| Duo双芯 | 1 | 2^5~2^10 | 2^5~2^10 | 327 | 103 | - -量化 - -| 硬件形态 | 批大小 | 输入长度 | 输出长度 | 首次推理(ms/token) | 非首次推理(ms/token) | -|-------|-----|----------|----------|----------------|-----------------| -| Duo双芯 | 1 | 2^5~2^10 | 2^5~2^10 | \ | 75 | - -## 精度 - -| 精度 | NPU | GPU | 对比 | -|----------------|-------------|-------------|-------------| -| STEM | 0.472093023 | 0.472093023 | 1 | -| Social Science | 0.658181818 | 0.661818182 | 0.994505494 | -| Humanities | 0.630350195 | 0.630350195 | 1 | -| Other | 0.572916667 | 0.567708333 | 1.009174313 | -| Avg acc | 0.569093611 | 0.568350669 | 1.001307189 | - -# 附录 - -# 精度测试指南 - -## 配置说明 - -参考 [SDK精度测试指南CEVAL章节](../../atb_speed_sdk/README.md) - -## 运行脚本 - -- 单芯 - -```shell -cd ${script_path} -python main.py --task precision -``` - -- 多芯 - -```shell -cd ${script_path} -bash cut_model_and_run.sh precision -``` - -结束后在${ceval_work_dir}/test_result目录下查看测试结果。[双芯结果每个两份,只需看其中一份即可]。 - -| 文件 | 用途 | -|---------------------------|----------------------| -| device0.log | 运行过程日志 | -| cache0.csv | 结果详情,C列为预期答案,D列为测试答案 | -| result_0_classes_acc.json | 测试数据下按不同维度统计准确率 | -| result_0_subject_acc.json | 测试数据下按不同学科统计准确率 | - -**注意:后续重新运行, 需要删除当前目录下生成的test_result文件夹,否则只会读取当前的目录下的测试结果** - -# 性能测试 - -在功能运行正常的基础下,执行以下步骤进行性能测试 - -## 按照推理指导,下载模型及配置路径,并安装atb_speed_sdk - -## 1. 准备 - -参考 [SDK性能测试指南精确打点法章节](../../atb_speed_sdk/README.md) 进行准备 - -## 2. 修改配置文件 - -- 配置config.ini中[performance]属性, 如下: - ``` - model_name=baichuan2_13b - perf_mode=detail - ``` - -## 3. 
执行测试脚本 - -- 单芯 - -```shell -cd ${script_path} -TIMEIT=1 python main.py --task performance -``` - -- 多芯 - -```shell -cd ${script_path} -TIMEIT=1 bash cut_model_and_run.sh performance 0 -``` - -将`TIMEIT`设置成1来返回具体的性能测试的值,默认是0 -上述多芯场景参数 - -* performance表示性能测试。 -* 0 表示浮点,1表示量化 - -### 性能测试结果 - -得到性能测试结果csv `performance_test_npu_${model_name}_xxx.csv` - -### 结果分析 - -| 列名 | 含义 | -|-------------------------------|------------| -| batch_size | batch大小 | -| input_seq_len(Encoding) | 输入长度 | -| output_seq_len(Decoding) | 输出长度 | -| ResponseTime(s) | 总响应时间 | -| forward_first_token_time(ms) | 首token推理时长 | -| forward_next_token_time(ms) | 增量推理时长 | -| pre_next_token_time(ms) | 前处理时长 | -| post_next_token_time_post(ms) | 后处理时长 | \ No newline at end of file diff --git a/llm-localization/ascend/mindie/mindie-1.0-chatglm2.md b/llm-localization/ascend/mindie/mindie-1.0-chatglm2.md deleted file mode 100644 index 26a27b9..0000000 --- a/llm-localization/ascend/mindie/mindie-1.0-chatglm2.md +++ /dev/null @@ -1,357 +0,0 @@ -# ChatGLM2-6B 模型推理指导 - -- [概述](#概述) -- [输入输出数据](#输入输出数据) -- [推理前准备](#推理前准备) -- [量化工具使用](#量化工具使用) -- [快速上手](#快速上手) - - [获取源码及依赖](#获取源码及依赖) - - [模型推理](#模型推理) -- [模型参考精度和性能结果](#模型参考精度和性能结果) - -# 概述 - -[ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B/) 是开源中英双语对话模型 [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) 的第二代版本,在保留了初代模型对话流畅、部署门槛较低等众多优秀特性的基础之上,ChatGLM2-6B有更强大的性能、更长的上下文、更高效的推理和更开放的协议。 - -# 输入输出数据 - -- 输入数据 - - | 输入数据 | 大小 | 数据类型 | 数据排布格式 | 是否必选 | - | -------------- | -------------------- | -------- | ------------ | -------- | - | input_ids | BATCH_SIZE x SEQ_LEN | INT64 | ND | 是 | - | attention_mask | BATCH_SIZE x SEQ_LEN | FLOAT32 | ND | 否 | - -- 输出数据 - - | 输出数据 | 大小 | 数据类型 | 数据排布格式 | - | ---------- | --------------------------- | -------- | ------------ | - | output_ids | BATCH_SIZE x OUTPUT_SEQ_LEN | INT64 | ND | - -# 推理前准备 - -1. 参见 [推理环境准备](../../../../docs/推理环境准备.md) 安装 固件与驱动,CANN,PyTorchAdapter等基础软件。 - ```shell - # 使能cann环境变量(根据实际安装路径修改) - source ${path-to-ascend-toolkit}/set_env.sh - # 使能加速库环境变量(根据实际安装路径修改) - source ${path-to-ascendTB}/set_env.sh - # 使能inference库环境变量 - source ${path-to-atb_models}/set_env.sh - # 稀疏工具在线编译(可选) - cd ${path-to-ascend-toolkit}/tools/modelslim/pytorch/weight_compression/compress_graph/ - bash build.sh ${path-to-ascend-toolkit}/ascend-toolkit/latest/ - ``` - -2. 
下载模型实现文件和权重文件,并存储到任意路径下 `CHECKPOINT={path-to-weights}` - - - 推荐下载方式 - - ```shell - # 请自行确认已安装 git-lfs - git lfs install - git clone https://huggingface.co/THUDM/chatglm2-6b - cd chatglm2-6b - git reset --hard 4e38bef4c028beafc8fb1837462f74c02e68fcc2 - ``` - - - 其他下载方式 - - 如果你的网络环境较差,下载模型参数可能会花费较长时间甚至失败。此时可以先将模型下载到本地,然后从本地加载。 - - 分开下载模型实现文件和权重文件 - ```shell - # 只下载模型实现文件 - GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/THUDM/chatglm2-6b - cd chatglm2-6b - git reset --hard 4e38bef4c028beafc8fb1837462f74c02e68fcc2 - ``` - 从 [这里](https://cloud.tsinghua.edu.cn/d/674208019e314311ab5c/) 手动下载模型参数文件,并将下载的文件替换到本地的 `chatglm2-6b` 目录下。 - - - 手动从 [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) 下载所有文件 - - - 下载后检查`${CHECKPOINT}`目录如下所示 - - ``` - |-- config.json - |-- configuration_chatglm.py - |-- modeling_chatglm.py - |-- pytorch_model-00001-of-00007.bin - |-- pytorch_model-00002-of-00007.bin - |-- pytorch_model-00003-of-00007.bin - |-- pytorch_model-00004-of-00007.bin - |-- pytorch_model-00005-of-00007.bin - |-- pytorch_model-00006-of-00007.bin - |-- pytorch_model-00007-of-00007.bin - |-- pytorch_model.bin.index.json - |-- quantization.py - |-- tokenization_chatglm.py - |-- tokenizer_config.json - |-- tokenizer.model - ``` - - - 在config.json中添加如下配置: - - ``` - { - ...... - "world_size": 1, - "float_layers_id": [0] - } - ``` - -3. 获取量化权重 - - - 直接下载量化权重 - - - [A300I DUO 量化权重下载](https://model-weight.obs.cn-north-4.myhuaweicloud.com/chatglm2_6B_310p.tar.gz) - - [A800I A2 量化权重下载](https://model-weight.obs.cn-north-4.myhuaweicloud.com/chatglm2_6B_910b.tar.gz) - - 请使用wget下载,下载完成后请将文件解压到任意路径`QUANT_WEIGHT_PATH=${path-to-quant-weight}` - - - 手动生成量化权重 - - 详见章节[量化工具使用](#量化工具使用) - -4. 下载 `C-Eval` 数据集 - - 从 [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/e84444333b6d434ea7b0) 下载处理好的 `C-Eval` 数据集,解压到任意目录下 `DATASET={path-to-dataset}` 。 - -# 量化工具使用 - -量化权重的获取需要使用大模型量化工具(集成至CANN包中),详细操作手册可见[大模型权重量化工具-ModelSlim](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/devtools/auxiliarydevtool/modelslim_0001.html)。 - -导出 ChatGLM2-6B 的量化权重或者是稀疏量化权重: - -```shell -# 量化权重导出 -python export_quant_weight.py --float_weight ${CHECKPOINT} --data_path ${DATASET}/val/Social_Science/teacher_qualification.jsonl --quant_weight ${QUANT_WEIGHT_PATH} -# 稀疏量化权重导出 -python export_quant_weight.py --float_weight ${CHECKPOINT} --data_path ${DATASET}/val/Other/civil_servant.jsonl --quant_weight ${QUANT_WEIGHT_PATH} --sparse -``` - -参数说明: - -- float_weight:浮点权重路径。 -- data_path:用于校准的数据文件路径。 -- quant_weight:导出的量化权重或者是稀疏量化权重路径。 -- sparse:默认为false,指量化,True指稀疏量化。 - -**特别注意1**:本章节依赖**pytorch 2.0.0**环境,大模型量化工具依赖指定pytorch版本(不依赖torch_npu,只依赖原生torch)。该环境的pytorch版本与后续步骤可能不同,后续将优化pytorch版本依赖的限制 - -**特别注意2**:本章节依赖 hugging face 的标准 transformers 包。若环境中的 transformers 包被改动过,可能引起相关报错,此时建议重新安装 transformers 包 - -**特别注意3**:稀疏量化权重的获取详见[大模型稀疏权重工具使用文档](https://codehub-y.huawei.com/mindstudio/MindStudio-Backend/automl/files?ref=master&filePath=modelslim%2Fpytorch%2Fllm_sparsequant%2FREADME.md&isFile=true) - -**特别注意4**:本章节执行完毕后,在`QUANT_WEIGHT_PATH`路径下生成如下权重文件,请检查是否缺失: - -``` -deq_scale.npy fp_bias.npy -input_offset.npy input_scale.npy -quant_bias.npy quant_weight.npy -weight_offset.npy weight_scale.npy -``` - -# 快速上手 - -## 获取源码及依赖 - -1. 获取源码 - - ```shell - cd ${path-to-atb_models}/pytorch/examples/chatglm2/6b - ``` -2. 
安装第三方依赖 - - ```shell - pip install -r requirements.txt - ``` - -## 模型推理 - -- 可开启CPU Performance模式以提高模型推理性能 - - ``` - cpupower frequency-set -g performance - ``` - -- 推理前开启如下环境变量 - - ```shell - export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE - export TASK_QUEUE_ENABLE=1 - export ATB_OPERATION_EXECUTE_ASYNC=1 - export ATB_LAYER_INTERNAL_TENSOR_REUSE=1 - - # 仅300 Ipro和300 IDuo上开启 - export HCCL_BUFFSIZE=110 - export ATB_USE_TILING_COPY_STREAM=1 - ``` - -- `C-Eval` 数据集推理 - - ```shell - # 浮点 - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 多芯场景请先执行权重生成(浮点单芯跳过) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行浮点推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode precision_dataset --model_path ${CHECKPOINT} --ceval_dataset ${DATASET} --batch 8 --tp_size ${TP_SIZE} - - # 量化 - # 添加量化环境变量 - export ENABLE_QUANT=1 - export QUANT_WEIGHT_PATH=${QUANT_WEIGHT_PATH} - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 执行权重生成(单芯/多芯都要执行) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行量化推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode precision_dataset --model_path ${CHECKPOINT} --ceval_dataset ${DATASET} --batch 8 --tp_size ${TP_SIZE} - - # 稀疏量化(当前仅支持300I DUO) - # 添加稀疏量化环境变量 - export ENABLE_SPARSE=1 - export QUANT_WEIGHT_PATH=${QUANT_WEIGHT_PATH} - export COMPRESS_WEIGHT_PATH=${COMPRESS_WEIGHT_PATH} - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 执行权重生成(单芯/多芯都要执行) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - python3 generate_compress_weight.py --weight_path=${QUANT_WEIGHT_PATH} --save_path=${COMPRESS_WEIGHT_PATH} - # 执行稀疏量化推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode precision_dataset --model_path ${CHECKPOINT} --ceval_dataset ${DATASET} --batch 8 --tp_size ${TP_SIZE} - ``` - -- 模型性能数据测试 - - **性能测试请先配置环境变量`export TIMEIT=1`,测试结束后删除该环境变量`unset TIMEIT`。** - - ```shell - # 浮点 - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 多芯场景请先执行权重生成(浮点单芯跳过) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行浮点推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode performance --model_path ${CHECKPOINT} --batch ${batch_size} --tp_size ${TP_SIZE} - - # 量化 - # 添加量化环境变量 - export ENABLE_QUANT=1 - export QUANT_WEIGHT_PATH=${QUANT_WEIGHT_PATH} - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 执行权重生成(单芯/多芯都要执行) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行量化推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode performance --model_path ${CHECKPOINT} --batch ${batch_size} --tp_size ${TP_SIZE} - - # 稀疏量化(当前仅支持300I DUO) - # 添加稀疏量化环境变量 - export ENABLE_SPARSE=1 - export QUANT_WEIGHT_PATH=${QUANT_WEIGHT_PATH} - export COMPRESS_WEIGHT_PATH=${COMPRESS_WEIGHT_PATH} - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 执行权重生成(单芯/多芯都要执行) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - python3 generate_compress_weight.py --weight_path=${QUANT_WEIGHT_PATH} --save_path=${COMPRESS_WEIGHT_PATH} - # 执行稀疏量化推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode performance --model_path ${CHECKPOINT} --batch ${batch_size} --tp_size ${TP_SIZE} - ``` - - 备注: - - 1. 
可通过配置`--seqlen_in_pair`和`--seqlen_out_pair`指定输入输出序列长度,例如以下命令测试的输入输出组合为[256,256],[512,512],[1024,1024] - - ```shell - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode performance --model_path ${CHECKPOINT} --device 0 --seqlen_in_pair 256,512,1024 --seqlen_out_pair 256,512,1024 --batch 1 --tp_size ${TP_SIZE} --performance_output_file performance_bs1.csv - ``` - - 2. 环境变量 `MAX_SEQ_LEN` (默认值2048)必须大于等于 `seqlen_in + seqlen_out`,例如: - - ```shell - # 若 seqlen_in = 3584 seqlen_out = 512 - export MAX_SEQ_LEN=4096 - ``` - -- UI 交互 - - - 命令行交互 - - ```shell - # 浮点 - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 多芯场景请先执行权重生成(浮点单芯跳过) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行浮点推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode cli_demo --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - - # 量化 - # 添加量化环境变量 - export ENABLE_QUANT=1 - export QUANT_WEIGHT_PATH=${QUANT_WEIGHT_PATH} - # 将TP_SIZE设为对应的并行数,例如单芯场景TP_SIZE=1,双芯场景TP_SIZE=2 - # 执行权重生成(单芯/多芯都要执行) - python process_weights.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - # 执行量化推理 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 main.py --mode cli_demo --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - ``` - - - Web 交互 - - ```shell - # 安装依赖 - pip install -r web_requirements.txt - - # 下载 GitHub 仓库 - git clone https://github.com/THUDM/ChatGLM2-6B.git - cd ChatGLM2-6B - git reset --hard 921d7e9adc69020a19169d1ba4f76c2675a2dd29 - - # 应用适配代码 - git apply ../web_demo.patch - cd .. - - # 将 TP_SIZE 设为对应的并行数,例如单芯场景 TP_SIZE=1,双芯场景 TP_SIZE=2 - - # Gradio 框架 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 ChatGLM2-6B/web_demo.py --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - - # Streamlit 框架 - # ATB OpsRunner 的全局缓存暂不支持多线程,需要降低缓存级别,否则会报错 - # 0 不开启缓存,1 开启本地缓存,2 开启全局缓存,3 同时开启本地和全局缓存,默认为 3 - export ATB_OPSRUNNER_KERNEL_CACHE_TYPE=1 - torchrun --nproc_per_node ${TP_SIZE} --master_port 2000 -m streamlit run ChatGLM2-6B/web_demo2.py -- --model_path ${CHECKPOINT} --tp_size ${TP_SIZE} - ``` - -- `main.py` 参数说明: - - ```shell - --mode: 推理模式,可选单数据推理,数据集推理,性能测试以及命令行交互 - --model_path:模型权重路径 - --model:模型名称,当前仅支持chatglm2和chatglm3,默认为chatglm2 - --tp_size:张量并行数,等于使用的芯片数量 - --device:NPU设备id(可通过npu-smi info查看),多芯场景则为NPU设备起始id,例:--device=0 --tp_size=4,则使用device:0,1,2,3 - --batch:batch大小 - --model_file:推理使用的modeling文件 - --ceval_dataset:CEval数据集路径 - --seqlen_in_pair:性能测试时需要测试的输入长度,默认为[256, 512, 1024] - --seqlen_out_pair:性能测试时需要测试的输出长度,默认为[256, 512, 1024] - --performance_output_file:性能测试数据保存文件,默认为performance.csv - --print_response:是否打印性能测试的推理回答 - ``` - -# 模型参考精度和性能结果 - -- 参考精度 - - > 因为 `C-Eval` 数据集test子集需要上传官网得到结果,所以这里使用val子集进行精度对比 - - | ChatGLM2 | 类别 | Average Accuracy | - | ---------- | ---- | ---------------- | - | GPU (浮点bs8) | val | 53.56% | - | NPU (浮点bs8) | val | 53.12% | - -- 推理性能 - - > 这里性能结果仅作为参考,并非版本极致性能优化结果。 - - | 硬件形态 | 批大小 | 输入长度 | 输出长度 | 解码速度 | - | -------- | ------ | -------- | -------- | -------- | - | 300I Duo | 1 | 8192 | 1024 | 162ms | \ No newline at end of file diff --git a/llm-localization/ascend/mindie/mindie-1.0-qwen-72b.md b/llm-localization/ascend/mindie/mindie-1.0-qwen-72b.md deleted file mode 100644 index f94e801..0000000 --- a/llm-localization/ascend/mindie/mindie-1.0-qwen-72b.md +++ /dev/null @@ -1,302 +0,0 @@ -[TOC] - -# Qwen-72B模型-推理指导 - -注意,QWen-72b与14b版本模型结构一致,因此加速库及modeling等文件可复用,此处不再重复归档 - -# 快速上手 -### 路径变量解释 - -| 变量名 | 含义 | -|---------------------|----------------------------------------------------------------------| -| 
model_download_path | 开源权重放置目录 | -| llm_path | 加速库及模型库下载后放置目录 | -| model_path | 工作时模型所在的目录,可以和model_download_path相同,但一般模型是公共的,为了避免影响其他用户,单独建一个模型工作目录 | -| script_path | 工作脚本所在路径,本文为${llm_path}/pytorch/examples/qwen/72b | -| ceval_work_dir | ceval数据集、及结果保存所在目录,不必和模型脚本在相同目录 | - - -## 获取源码及依赖 -#### python requirements - -| 包名 | 推荐版本 | -|-------------------------------|--------| -| transformers | 4.30.2 | -| decorator | 5.1.1 | -| sympy | 1.11.1 | -| scipy | 1.11.3 | -| attrs | 23.1.0 | -| psutil | 5.9.6 | -| sentencepiece | 0.1.99 | -| tiktoken | 0.5.2 | -| transformers-stream-generator | 0.0.4 | -| einops | 0.7.0 | -| pandas | 0.8.2 | - -### 下载模型权重 - -下载模型权重,放置到自定义`${model_download_path}` (请下载链接中'Files and versions'页签下的所有文件) -``` -https://huggingface.co/Qwen/Qwen-72B -``` -注意:实际使用的模型可以是base版或chat版,应根据实际需求确定。例子中给出的是base版。 - -### 拷贝文件 - -### 准备 - -#### 1. 将开源模型拷贝到模型工作目录,权重文件使用软链接即可,同时将modeling文件拷贝到模型,并修改开源的config.json, - -```shell -cd ${model_path} -cp ${model_download_path}/*.py ./ -cp ${model_download_path}/*.json ./ -cp ${model_download_path}/*.tiktoken ./ -cp -s ${model_download_path}/*.safetensors ./ -``` - -#### 2. 安装 atb_speed_sdk - -```shell -cd ${llm_path}/pytorch/examples/atb_speed_sdk -pip install . -``` - -#### 3. 张量并行模型切分(仅在模型需要多卡并行时使用) - -```shell -cp ${script_path}/modeling_qwen_cut.py ${model_path} -cp ${script_path}/modeling_qwen_ascend.py ${model_path} -``` - -修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_qwen_cut.QWenLMHeadModel"` - -```text -修改`${script_path}/cut_model_and_run.sh` -将 `input_dir` 修改为模型所在路径 `${model_path}` -将 `output_dir` 修改为切分后的模型所存储的路径,如: `${model_path/part_model}`。模型切分成功后,会自动生成新目录part_model(用户无需新建该文件夹) -将 `rank_size` 修改为期望切分的份数,例如rank_size=8表示模型切分为8份。实际切分份数应视显存大小而定。 - -``` - -目录结构示例建议 - -``` ---model_path - *.py(模型源文件) - *.json(模型源文件) - *.tiktoken(模型源文件) - *.bin(模型源文件,软链接,部分模型权重为其它格式,如*.safetensors等) - modeling_qwen_cut.py(权重切分脚本) - --part_model(以双卡为例,权重切分成功后文件夹) - --0 - --1 - ......(其他) ---script_path - cut_model_and_run.sh - cut_model_util.py - main.py - config.ini - ......(其他) -``` - -执行 - -```shell -cd ${script_path} -bash cut_model_and_run.sh -``` - -切分所需时间较长,切分完成后,将会打印 'Tensor parallelism weights have been successfully saved.'。 - -#### 4.修改config.json配置 - -- 单卡运行时**必须**修改 -- 多卡运行时,会在切分阶段会自动修改,没有定制的情况下,可以不操作 - -##### 单卡 -修改${model_path}/config.json中的kv对,改成 - -``` -"AutoModelForCausalLM": "modeling_qwen_ascend.QWenLMHeadModel" -``` - -##### 多卡 - -修改 -${model_path}/part_model/{rank_id}/config.json中的kv对,改成 - -``` -"AutoModelForCausalLM": "modeling_qwen_ascend.QWenLMHeadModel" -``` - -# CPU高性能模式 - -可开启CPU Performance模式以提高模型推理性能。 - -``` - -cpupower frequency-set -g performance - -``` - -### 执行推理 - -#### 修改 ${script_path}/config.ini - -[config文件配置参考](../../atb_speed_sdk/README.md) -提示:多卡并行推理时,config.ini中model_path路径为part_model父文件夹。例如: - -``` -# 正确示例: - -model_path=../model - -# 错误示例: - -model_path=../model/part_model -``` - -#### main.py - -提供了demo推理,精度测试,性能测试三种下游任务。 -task_name可选inference、precision、performance。 - -- 单卡 - 修改 ${model_path}里的config.json中的kv对,改成`"AutoModelForCausalLM": "modeling_qwen_ascend.QWenLMHeadModel"` - -```shell -python main.py --task ${task_name} -``` - -注意,由于本模型体量较大,受硬件限制,单卡很可能无法跑起。 - -- 多卡 -```shell -bash cut_model_and_run.sh ${task_name} -``` - -**注意** -1.docker环境与conda环境有所不同,docker环境中启动模型时需要修改环境变量"ATB_OPERATION_EXECUTE_ASYNC=0"、"TASK_QUEUE_ENABLE=0",否则可能出现算子下发同步失败。 - -**可以使用 MAX_SEQ_LEN 环境变量来设置model支持的最大长度以优化显存占用, 默认使用config里面的max_model_length** -如 - -```shell -MAX_SEQ_LEN=2048 python main.py --task 
${task_name} -``` - -或 - -```shell -MAX_SEQ_LEN=2048 bash cut_model_and_run.sh ${task_name} -``` - -如果遇到 - -```text -Traceback (most recent call last): - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/__init__.py", line 31, in - import torch_npu.npu - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/__init__.py", line 46, in - from .utils import (is_initialized, _lazy_call, _lazy_init, init, set_dump, - File "/root/miniconda3/envs/wqh39/lib/python3.9/site-packages/torch_npu/npu/utils.py", line 27, in - import torch_npu._C -ImportError: /root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1: cannot allocate memory in static TLS block -Segmentation fault (core dumped) -``` - -则在命令行前加上`LD_PRELOAD=上面的error路径`。如 - -```shell -LD_PRELOAD=/root/miniconda3/envs/wqh39/bin/../lib/libgomp.so.1 MAX_SEQ_LEN=2048 python main.py --task ${task_name} --is_quant ${is_quant} -``` -# 竞品对比 - -待补充 - -# 附录: - -# 精度测试指南 - -## 配置说明 - -参考 [SDK精度测试指南CEVAL章节](../../atb_speed_sdk/README.md) - -## 运行脚本 - -- 单芯 - -```shell -cd ${script_path} -python main.py --task precision -``` - -- 多芯 -```shell -cd ${script_path} -bash cut_model_and_run.sh precision -``` - -结束后在${ceval_work_dir}/test_result目录下查看测试结果。[双芯结果每个两份,只需看其中一份即可]。 - -| 文件 | 用途 | -|---------------------------|----------------------| -| device0.log | 运行过程日志 | -| cache0.csv | 结果详情,C列为预期答案,D列为测试答案 | -| result_0_classes_acc.json | 测试数据下按不同维度统计准确率 | -| result_0_subject_acc.json | 测试数据下按不同学科统计准确率 | - -**注意:后续重新运行, 需要删除当前目录下生成的test_result文件夹,否则只会读取当前的目录下的测试结果** - -# 性能测试 - -在功能运行正常的基础下,执行以下步骤进行性能测试 - -## 按照推理指导,下载模型及配置路径,并安装atb_speed_sdk - -## 1. 准备 - -参考 [SDK性能测试指南精确打点法章节](../../atb_speed_sdk/README.md) 进行准备 - -## 2. 修改配置文件 - -- 配置config.ini中[performance]属性, 如下: - ``` - model_name=qwen_72b - perf_mode=detail - ``` - -## 3. 执行测试脚本 - -- 单芯 - -```shell -cd ${script_path} -TIMEIT=1 python main.py --task performance -``` - -- 多芯 -```shell -cd ${script_path} -TIMEIT=1 bash cut_model_and_run.sh performance -``` - -为了不影响正常使用,将`TIMEIT`设置成1来返回具体的性能测试的值,默认是0 - -### 性能测试结果 - -得到性能测试结果csv `performance_test_npu_${model_name}_xxx.csv` - -### 结果分析 - -| 列名 | 含义 | -|-------------------------------|------------| -| batch_size | batch大小 | -| input_seq_len(Encoding) | 输入长度 | -| output_seq_len(Decoding) | 输出长度 | -| ResponseTime(s) | 总响应时间 | -| forward_first_token_time(ms) | 首token推理时长 | -| forward_next_token_time(ms) | 增量推理时长 | -| pre_next_token_time(ms) | 前处理时长 | -| post_next_token_time_post(ms) | 后处理时长 | \ No newline at end of file diff --git a/llm-localization/ascend/mindie/mindie-1.0.RC2.md b/llm-localization/ascend/mindie/mindie-1.0.RC2.md new file mode 100644 index 0000000..1fca7d8 --- /dev/null +++ b/llm-localization/ascend/mindie/mindie-1.0.RC2.md @@ -0,0 +1,132 @@ + + + +文档: +- https://www.hiascend.com/document/detail/zh/mindie/10RC2/whatismindie/mindie_what_0001.html + +docker: +- https://www.hiascend.com/developer/ascendhub/detail/af85b724a7e5469ebd7ea13c3439d48f + + + +rsync -P --rsh=ssh -r root@192.168.16.211:/root/mindie-1.0.rc2.tar . 
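
The tarball pulled by the `rsync` command above is assumed here to be a saved copy of the MindIE container image referenced in the next lines; if it is instead the plain MindIE software package, skip this step and install it inside the container. A minimal sketch under that assumption:

```shell
# Assumption: mindie-1.0.rc2.tar was produced with `docker save`; adjust if it is a software package.
docker load -i ./mindie-1.0.rc2.tar
# Confirm the 1.0.RC2 image is available locally before starting the container.
docker images | grep -i mindie
```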
+ + + +swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:1.0.RC2-800I-A2-aarch64 + + +``` +docker run -it -d --name mindie-rc2-45 --net=host \ +-e ASCEND_VISIBLE_DEVICES=4,5 \ +-p 1925:1025 \ +--shm-size=32g \ +-w /workspace \ +-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /data/model_from_hf:/workspace/model \ +swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:1.0.RC2-800I-A2-aarch64 \ +/bin/bash + + +docker exec -it mindie-rc2-45 bash + + + +cd /opt/package +# 安装CANN包 +source ./install_and_enable_cann.sh + + + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +source /usr/local/Ascend/mindie/set_env.sh +source /usr/local/Ascend/llm_model/set_env.sh + + + +vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json + +/workspace/model/Qwen1.5-7B-Chat/ + + +export MIES_PYTHON_LOG_TO_FILE=1 +export MIES_PYTHON_LOG_TO_STDOUT=1 +export PYTHONPATH=/usr/local/Ascend/llm_model:$PYTHONPATH +cd /usr/local/Ascend/mindie/latest/mindie-service/bin +./mindieservice_daemon + +``` + + +## 新镜像 + +``` +docker commit -a "guodong" -m "mindie-1.0.RC2" 365815a95f16 harbor/ascend/mindie-base:1.0.RC2 + +# -p 192.168.16.xx:1025:1025 + +docker run -it --rm \ +-e ASCEND_VISIBLE_DEVICES=2,3 \ +-p 1025:1025 \ +--shm-size=32g \ +-w /workspace \ +-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /data/model_from_hf:/workspace/model \ +harbor/ascend/mindie-base:1.0.RC2 \ +/bin/bash + +``` + + +``` +llm-server3.sh + + + +docker run -it --rm \ +-e ASCEND_VISIBLE_DEVICES=6,7 \ +-p 1825:1025 \ +--env AIE_LLM_CONTINUOUS_BATCHING=1 \ +--shm-size=32g \ +-w /workspace \ +-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /data/model_from_hf/Qwen1.5-7B-Chat:/workspace/model \ +-v /home/workspace/llm-server3.sh:/workspace/llm-server.sh \ +-v /home/workspace/mindservice.log:/usr/local/Ascend/mindie/latest/mindie-service/logs/mindservice.log \ +harbor/ascend/mindie-base:1.0.RC2 \ +/bin/bash + + + + + +docker run -it --rm \ +-e ASCEND_VISIBLE_DEVICES=4,5 \ +-p 1525:1025 \ +--env AIE_LLM_CONTINUOUS_BATCHING=1 \ +--shm-size=32g \ +-w /workspace \ +-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /data/model_from_hf/Qwen1.5-7B-Chat:/workspace/model \ +-v /home/workspace/llm-server3.sh:/workspace/llm-server.sh \ +-v /home/workspace/mindservice.log:/usr/local/Ascend/mindie/latest/mindie-service/logs/mindservice.log \ +harbor/ascend/mindie-base:1.0.RC2 \ +/workspace/llm-server.sh \ +--model_name=qwen-chat \ +--model_weight_path=/workspace/model \ +--world_size=2 \ +--npu_mem_size=15 + + + + + +``` + + + diff --git a/llm-localization/ascend/mindie/mindie-1.0.rc2-config.json b/llm-localization/ascend/mindie/mindie-1.0.rc2-config.json new file mode 100644 index 0000000..60f14c2 --- /dev/null +++ b/llm-localization/ascend/mindie/mindie-1.0.rc2-config.json @@ -0,0 +1,88 @@ +{ + "OtherParam" : + { + "ResourceParam" : + { + "cacheBlockSize" : 128 + }, + "LogParam" : + { + "logLevel" : "Info", + "logPath" : "logs/mindservice.log" + }, + "ServeParam" : + { + "ipAddress" : "172.17.0.2", + "managementIpAddress" : "127.0.0.2", + "port" : 1025, + "managementPort" : 1026, + "maxLinkNum" : 1000, + "httpsEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : 
"security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "tlsCrl" : "security/certs/server_crl.pem", + "managementTlsCaFile" : ["management_ca.pem"], + "managementTlsCert" : "security/certs/management_server.pem", + "managementTlsPk" : "security/keys/management_server.key.pem", + "managementTlsPkPwd" : "security/pass/management_mindie_server_key_pwd.txt", + "managementTlsCrl" : "security/certs/management_server_crl.pem", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "multiNodesInferPort" : 1120, + "interNodeTLSEnabled" : true, + "interNodeTlsCaFile" : "security/ca/ca.pem", + "interNodeTlsCert" : "security/certs/server.pem", + "interNodeTlsPk" : "security/keys/server.key.pem", + "interNodeTlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", + "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb" + } + }, + "WorkFlowParam" : + { + "TemplateParam" : + { + "templateType" : "Standard", + "templateName" : "Standard_llama" + } + }, + "ModelDeployParam" : + { + "engineName" : "mindieservice_llm_engine", + "modelInstanceNumber" : 1, + "tokenizerProcessNumber" : 8, + "maxSeqLen" : 2560, + "npuDeviceIds" : [[$npuids]], + "multiNodesInferEnabled" : false, + "ModelParam" : [ + { + "modelName" : "$model_name", + "modelWeightPath" : "$model_weight_path", + "worldSize" : $world_size, + "cpuMemSize" : 5, + "npuMemSize" : $npu_mem_size, + "backendType": "atb", + "pluginParams" : "" + } + ] + }, + "ScheduleParam" : + { + "maxPrefillBatchSize" : 50, + "maxPrefillTokens" : 8192, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + + "maxBatchSize" : 200, + "maxIterTimes" : 512, + "maxPreemptCount" : 0, + "supportSelectBatch" : true, + "maxQueueDelayMicroseconds" : 5000 + } +} \ No newline at end of file diff --git a/llm-localization/ascend/mindie/mindie-1.0.rc2-llm-server.sh b/llm-localization/ascend/mindie/mindie-1.0.rc2-llm-server.sh new file mode 100644 index 0000000..df28164 --- /dev/null +++ b/llm-localization/ascend/mindie/mindie-1.0.rc2-llm-server.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +echo "入参:" $@ + +for a in "$@"; do + #echo $a + if [[ `echo $a | grep "^--model_name="` ]]; then + model_name=`echo $a | grep "^--model_name=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--model_weight_path="` ]]; then + model_weight_path=`echo $a | grep "^--model_weight_path=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--world_size="` ]]; then + world_size=`echo $a | grep "^--world_size=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--npu_mem_size="` ]]; then + npu_mem_size=`echo $a | grep "^--npu_mem_size=" | awk -F '=' '{print $2}'` + fi +done + +if [ -z "$model_name" ]; then + model_name="default" +fi + +if [ -z "$model_weight_path" ]; then + model_weight_path="/workspace/model" +fi + +if [ -z "$world_size" ]; then + world_size=4 +fi + +if [ -z "$npu_mem_size" ]; then + npu_mem_size=8 +fi + +echo "平台入参: model_name: $model_name, model_weight_path: $model_weight_path , world_size: $world_size , npu_mem_size: $npu_mem_size" + + +npuids="" +card_num=$(($world_size - 1)) +for i in `seq 0 $card_num` + do + if [[ $i == $card_num ]] ; + then + npuids=$npuids$i + else + npuids=$npuids$i"," + fi + done + + +echo $npuids + + +ip=`hostname -I` + +echo "docker ip: [$ip]" +ip=$(echo "$ip" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') +echo "docker handle ip: [$ip]" + +# 
DEPLOYMENT_CONF_PATH="/home/guodong.li/workspace/config.json" + +DEPLOYMENT_CONF_PATH="/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json" + +cat < $DEPLOYMENT_CONF_PATH +{ + "OtherParam" : + { + "ResourceParam" : + { + "cacheBlockSize" : 128 + }, + "LogParam" : + { + "logLevel" : "Info", + "logPath" : "logs/mindservice.log" + }, + "ServeParam" : + { + "ipAddress" : "$ip", + "managementIpAddress" : "127.0.0.2", + "port" : 1025, + "managementPort" : 1026, + "maxLinkNum" : 1000, + "httpsEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : "security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "tlsCrl" : "security/certs/server_crl.pem", + "managementTlsCaFile" : ["management_ca.pem"], + "managementTlsCert" : "security/certs/management_server.pem", + "managementTlsPk" : "security/keys/management_server.key.pem", + "managementTlsPkPwd" : "security/pass/management_mindie_server_key_pwd.txt", + "managementTlsCrl" : "security/certs/management_server_crl.pem", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "multiNodesInferPort" : 1120, + "interNodeTLSEnabled" : true, + "interNodeTlsCaFile" : "security/ca/ca.pem", + "interNodeTlsCert" : "security/certs/server.pem", + "interNodeTlsPk" : "security/keys/server.key.pem", + "interNodeTlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", + "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb" + } + }, + "WorkFlowParam" : + { + "TemplateParam" : + { + "templateType" : "Standard", + "templateName" : "Standard_llama" + } + }, + "ModelDeployParam" : + { + "engineName" : "mindieservice_llm_engine", + "modelInstanceNumber" : 1, + "tokenizerProcessNumber" : 8, + "maxSeqLen" : 2560, + "npuDeviceIds" : [[$npuids]], + "multiNodesInferEnabled" : false, + "ModelParam" : [ + { + "modelName" : "$model_name", + "modelWeightPath" : "$model_weight_path", + "worldSize" : $world_size, + "cpuMemSize" : 5, + "npuMemSize" : $npu_mem_size, + "backendType": "atb", + "pluginParams" : "" + } + ] + }, + "ScheduleParam" : + { + "maxPrefillBatchSize" : 50, + "maxPrefillTokens" : 8192, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + + "maxBatchSize" : 200, + "maxIterTimes" : 512, + "maxPreemptCount" : 0, + "supportSelectBatch" : true, + "maxQueueDelayMicroseconds" : 5000 + } +} +EOF + +echo "部署参数,$DEPLOYMENT_CONF_PATH" +cat $DEPLOYMENT_CONF_PATH + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +source /usr/local/Ascend/mindie/set_env.sh +source /usr/local/Ascend/llm_model/set_env.sh + +export MIES_PYTHON_LOG_TO_FILE=1 +export MIES_PYTHON_LOG_TO_STDOUT=1 +export PYTHONPATH=/usr/local/Ascend/llm_model:$PYTHONPATH +cd /usr/local/Ascend/mindie/latest/mindie-service/bin + +./mindieservice_daemon diff --git a/llm-localization/ascend/mindie/mindie-api.md b/llm-localization/ascend/mindie/mindie-api.md index 4c0137e..ee940ac 100644 --- a/llm-localization/ascend/mindie/mindie-api.md +++ b/llm-localization/ascend/mindie/mindie-api.md @@ -73,8 +73,11 @@ curl "http://127.0.0.1:1025/v1/chat/completions" \ }' +# http://127.0.0.1:1025/v1/chat/completions +# +# http://192.168.16.xxx:1725/v1/chat/completions -curl "http://127.0.0.1:1025/v1/chat/completions" \ +curl "http://172.17.0.2:1025/v1/chat/completions" \ -H "Content-Type: application/json" \ -d '{ 
"model": "qwen1.5-14b", diff --git "a/llm-localization/ascend/mindie/\346\227\245\345\277\227\345\210\206\346\236\220.txt" "b/llm-localization/ascend/mindie/\346\227\245\345\277\227\345\210\206\346\236\220.txt" new file mode 100644 index 0000000..c3fd59e --- /dev/null +++ "b/llm-localization/ascend/mindie/\346\227\245\345\277\227\345\210\206\346\236\220.txt" @@ -0,0 +1,33 @@ + + +tail -100f mindservice.log | grep "COMPLETED REQ ID" + +2024-07-24 16:25:04.777655 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1236, 3 , 272 , 16 , 256 , 20 , 30 , 1 +2024-07-24 16:25:05.234118 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1239, 3 , 271 , 15 , 256 , 20 , 29 , 2 +2024-07-24 16:25:05.360007 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1262, 1 , 99 , 22 , 77 , 20 , 26 , 1 +2024-07-24 16:25:05.571847 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1244, 2 , 234 , 22 , 212 , 20 , 26 , 1 +2024-07-24 16:25:05.705152 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1241, 3 , 281 , 25 , 256 , 20 , 25 , 1 +2024-07-24 16:25:06.538975 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1258, 2 , 145 , 16 , 129 , 20 , 27 , 2 +2024-07-24 16:25:06.901611 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1274, 1 , 41 , 15 , 26 , 20 , 27 , 1 +2024-07-24 16:25:07.724699 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1253, 2 , 195 , 13 , 182 , 20 , 29 , 1 +2024-07-24 16:25:07.940994 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1277, 1 , 45 , 17 , 28 , 20 , 29 , 1 +2024-07-24 16:25:08.764214 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1257, 2 , 201 , 17 , 184 , 20 , 31 , 1 +2024-07-24 16:25:08.973185 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1280, 1 , 40 , 19 , 21 , 20 , 31 , 1 +2024-07-24 16:25:10.494941 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1282, 1 , 56 , 25 , 31 , 20 , 33 , 1 +2024-07-24 16:25:10.541398 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1250, 3 , 269 , 13 , 256 , 19 , 32 , 1 +2024-07-24 16:25:13.150968 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1284, 1 , 82 , 28 , 54 , 20 , 40 , 1 +2024-07-24 16:25:13.282448 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1259, 3 , 273 , 17 , 256 , 20 , 41 , 2 +2024-07-24 16:25:13.913430 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1261, 3 , 273 , 17 , 256 , 20 , 38 , 1 +2024-07-24 16:25:14.495745 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1263, 3 , 279 , 23 , 256 , 20 , 40 , 1 +2024-07-24 16:25:14.717521 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1264, 3 , 268 , 12 , 256 , 20 , 38 , 2 +2024-07-24 16:25:15.027415 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1266, 3 , 268 , 12 , 256 , 20 , 35 , 1 +2024-07-24 16:25:15.521481 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1287, 1 , 61 , 16 , 45 , 20 , 34 , 1 +2024-07-24 16:25:15.567090 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1267, 3 , 273 , 17 , 256 , 19 , 33 , 1 +2024-07-24 16:25:16.039858 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1268, 3 , 272 , 16 , 256 , 20 , 33 , 1 +2024-07-24 16:25:16.432710 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1269, 3 , 329 , 73 , 256 , 20 , 31 , 1 +2024-07-24 16:25:17.082790 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1270, 3 , 263 , 16 , 247 , 20 , 30 , 1 +2024-07-24 16:25:17.339481 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1289, 1 , 72 , 15 , 57 , 20 , 30 , 1 +2024-07-24 16:25:17.993777 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1271, 3 , 270 , 14 , 256 , 20 , 31 , 1 +2024-07-24 16:25:18.121696 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1273, 3 , 271 , 15 , 256 , 20 , 29 , 1 
+2024-07-24 16:25:18.248203 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1286, 1 , 116 , 17 , 99 , 20 , 27 , 1 +2024-07-24 16:25:18.458886 1360 info ibis_request.cc:240] COMPLETED REQ ID: 1275, 3 , 280 , 24 , 256 , 20 , 27 , 1 diff --git a/llm-localization/ascend/pytorch/.DS_Store b/llm-localization/ascend/pytorch/.DS_Store new file mode 100644 index 0000000..8a52378 Binary files /dev/null and b/llm-localization/ascend/pytorch/.DS_Store differ diff --git a/llm-performance/.DS_Store b/llm-performance/.DS_Store new file mode 100644 index 0000000..32e64ca Binary files /dev/null and b/llm-performance/.DS_Store differ diff --git a/llm-performance/mindie/.DS_Store b/llm-performance/mindie/.DS_Store new file mode 100644 index 0000000..f74692a Binary files /dev/null and b/llm-performance/mindie/.DS_Store differ diff --git a/llm-performance/mindie/locust-lantency-throughput/.DS_Store b/llm-performance/mindie/locust-lantency-throughput/.DS_Store new file mode 100644 index 0000000..47d2c46 Binary files /dev/null and b/llm-performance/mindie/locust-lantency-throughput/.DS_Store differ diff --git a/llm-train/.DS_Store b/llm-train/.DS_Store new file mode 100644 index 0000000..b0a154d Binary files /dev/null and b/llm-train/.DS_Store differ diff --git a/llm-train/ascend/.DS_Store b/llm-train/ascend/.DS_Store new file mode 100644 index 0000000..0ed62d2 Binary files /dev/null and b/llm-train/ascend/.DS_Store differ diff --git a/llm-train/peft/.DS_Store b/llm-train/peft/.DS_Store new file mode 100644 index 0000000..a599417 Binary files /dev/null and b/llm-train/peft/.DS_Store differ diff --git a/paper/inference/llm-in-a-flash.md b/paper/inference/llm-in-a-flash.md new file mode 100644 index 0000000..fb545e2 --- /dev/null +++ b/paper/inference/llm-in-a-flash.md @@ -0,0 +1,10 @@ + + +LLM in a Flash: Efficient Inference Techniques With Limited Memory + +https://arxiv.org/abs/2312.11514 + + +https://medium.com/@marketing_novita.ai/llm-in-a-flash-efficient-inference-techniques-with-limited-memory-5f0a404794b0 + + diff --git a/pic/.DS_Store b/pic/.DS_Store new file mode 100644 index 0000000..4734df3 Binary files /dev/null and b/pic/.DS_Store differ diff --git a/pic/llm/.DS_Store b/pic/llm/.DS_Store new file mode 100644 index 0000000..80eed90 Binary files /dev/null and b/pic/llm/.DS_Store differ diff --git a/pic/llm/train/.DS_Store b/pic/llm/train/.DS_Store new file mode 100644 index 0000000..f4e842a Binary files /dev/null and b/pic/llm/train/.DS_Store differ
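
A closing note on the `COMPLETED REQ ID` records collected in `日志分析.txt` above: the field meanings are not documented in that snippet, but in every sample line the second numeric value equals the sum of the third and fourth, which suggests per-request total, prompt, and generated token counts. Under that assumption (the remaining columns are left uninterpreted), a rough sketch for aggregating them from `mindservice.log`:

```shell
# Assumption: after "COMPLETED REQ ID: <id>," the 2nd/3rd/4th numbers are total/prompt/generated tokens.
grep "COMPLETED REQ ID" mindservice.log \
  | awk -F 'REQ ID: ' '{
      split($2, f, ",");            # f[1]=request id, f[2..]=numeric counters
      total += f[3]; prompt += f[4]; gen += f[5]; n++
    }
    END {
      if (n) printf "requests=%d avg_total=%.1f avg_prompt=%.1f avg_generated=%.1f\n",
                    n, total/n, prompt/n, gen/n
    }'
```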