From 31c75e949dd2b227cbf7c59b2888268fb133fc2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=9B=BD=E5=86=AC?= Date: Mon, 29 Jul 2024 00:12:47 +0800 Subject: [PATCH] update-2024-07-29_00:12:47 --- README.md | 6 +- ai-infra/ai-hardware/CUDA.md | 7 + .../cuda\351\225\234\345\203\217.md" | 1 - .../\345\255\230\345\202\250/README.md" | 0 .../\345\255\230\345\202\250/REF.md" | 0 .../\345\255\230\345\202\250/nvme-ssd.md" | 0 ...72\346\200\201\347\241\254\347\233\230.md" | 0 .../\345\255\230\345\202\250.md" | 0 .../AI\350\212\257\347\211\207.md" | 0 ...45\344\275\234\345\216\237\347\220\206.md" | 6 + .../NVIDIA-GPU\345\236\213\345\217\267.md" | 2 - ...50\347\220\206\350\212\257\347\211\207.md" | 0 .../\346\230\207\350\205\276NPU.md" | 0 docs/llm-base/scenes/README.md | 5 + ...24\347\224\250\345\234\272\346\231\257.md" | 41 +++ llm-localization/ascend/FAQ.md | 25 ++ .../ascend/ascend-docker-runtime.md | 17 ++ llm-localization/ascend/mindie/README.md | 1 + .../ascend/mindie/config-1.0.RC1.json | 69 +++++ llm-localization/ascend/mindie/llm-server.sh | 142 +++++++++ llm-localization/ascend/mindie/mindie-api.md | 4 +- ...47\350\203\275\350\260\203\344\274\230.md" | 35 +++ llm-optimizer/FlashAttention.md | 12 + .../baichuan2-13b-8tp.html | 277 ------------------ .../baichuan2-14b-2tp.html | 277 ------------------ .../baichuan2-14b-4tp.html | 277 ------------------ .../chatglm-4tp.html | 277 ------------------ .../chatglm3-6b-2tp.html | 277 ------------------ .../locust-lantency-throughput/qwen-14B.html | 277 ------------------ .../qwen-72b-8tp.html | 277 ------------------ .../qwen1.5-14b-2tp.html | 277 ------------------ .../qwen1.5-14b-8p.html | 277 ------------------ .../qwen1.5-7b-2tp.html | 277 ------------------ ...long.py => vllm-locust-qwen1.5-7b-long.py} | 0 34 files changed, 365 insertions(+), 2778 deletions(-) rename "ai-infra/ai-hardware/\345\255\230\345\202\250/README.md" => "ai-infra/\345\255\230\345\202\250/README.md" (100%) rename 
"ai-infra/ai-hardware/\345\255\230\345\202\250/REF.md" => "ai-infra/\345\255\230\345\202\250/REF.md" (100%) rename ai-infra/ai-hardware/nvme-ssd.md => "ai-infra/\345\255\230\345\202\250/nvme-ssd.md" (100%) rename "ai-infra/ai-hardware/\345\233\272\346\200\201\347\241\254\347\233\230.md" => "ai-infra/\345\255\230\345\202\250/\345\233\272\346\200\201\347\241\254\347\233\230.md" (100%) rename "ai-infra/ai-hardware/\345\255\230\345\202\250.md" => "ai-infra/\345\255\230\345\202\250/\345\255\230\345\202\250.md" (100%) rename "ai-infra/ai-hardware/AI\350\212\257\347\211\207.md" => "ai-infra/\347\256\227\345\212\233/AI\350\212\257\347\211\207.md" (100%) create mode 100644 "ai-infra/\347\256\227\345\212\233/GPU\345\267\245\344\275\234\345\216\237\347\220\206.md" rename "ai-infra/ai-hardware/NVIDIA-GPU\345\236\213\345\217\267.md" => "ai-infra/\347\256\227\345\212\233/NVIDIA-GPU\345\236\213\345\217\267.md" (99%) rename "ai-infra/ai-hardware/\346\216\250\347\220\206\350\212\257\347\211\207.md" => "ai-infra/\347\256\227\345\212\233/\346\216\250\347\220\206\350\212\257\347\211\207.md" (100%) rename "ai-infra/ai-hardware/\346\230\207\350\205\276NPU.md" => "ai-infra/\347\256\227\345\212\233/\346\230\207\350\205\276NPU.md" (100%) create mode 100644 "llm-application/\345\272\224\347\224\250\345\234\272\346\231\257.md" create mode 100644 llm-localization/ascend/FAQ.md create mode 100644 llm-localization/ascend/ascend-docker-runtime.md create mode 100644 llm-localization/ascend/mindie/config-1.0.RC1.json create mode 100644 llm-localization/ascend/mindie/llm-server.sh create mode 100644 "llm-localization/ascend/mindie/\346\200\247\350\203\275\350\260\203\344\274\230.md" create mode 100644 llm-optimizer/FlashAttention.md delete mode 100644 llm-performance/mindie/locust-lantency-throughput/baichuan2-13b-8tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-2tp.html delete mode 100644 
llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-4tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/chatglm-4tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/chatglm3-6b-2tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/qwen-14B.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/qwen-72b-8tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-2tp.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-8p.html delete mode 100644 llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-2tp.html rename llm-performance/vllm/{locust-qwen1.5-7b-long.py => vllm-locust-qwen1.5-7b-long.py} (100%) diff --git a/README.md b/README.md index 73a674a..f1488d6 100644 --- a/README.md +++ b/README.md @@ -196,10 +196,10 @@ ### LLM推理优化技术 - [LLM推理优化技术概述]() +- [大模型推理优化技术-KV Cache](https://www.zhihu.com/question/653658936/answer/3569365986) +- Continuous Batching - FlashAttention - PagedAttention -- Continuous Batching -- [大模型推理优化技术-KV Cache](https://www.zhihu.com/question/653658936/answer/3569365986) - Flash Decoding - FlashDecoding++ @@ -228,7 +228,7 @@ - [大模型量化技术原理:AWQ、AutoAWQ](https://zhuanlan.zhihu.com/p/681578090) - [大模型量化技术原理:SpQR](https://zhuanlan.zhihu.com/p/682871823) - [大模型量化技术原理:ZeroQuant系列](https://zhuanlan.zhihu.com/p/683813769) - - [大模型量化技术原理:FP8]() + - [大模型量化技术原理:FP8](https://juejin.cn/post/7392071348480917515) - [大模型量化技术原理:FP6]() - [大模型量化技术原理:FP4]() - [大模型量化技术原理:总结]() diff --git a/ai-infra/ai-hardware/CUDA.md b/ai-infra/ai-hardware/CUDA.md index 0fa2aed..76527c0 100644 --- a/ai-infra/ai-hardware/CUDA.md +++ b/ai-infra/ai-hardware/CUDA.md @@ -20,3 +20,10 @@ CUDA CURAND库:这是CUDA的随机数库,用于生成各种分布的随机 - https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html - https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id4 CUDA Toolkit and Corresponding Driver Versions + + + + 
+- CUDA 编程手册: https://github.com/HeKun-NVIDIA/CUDA-Programming-Guide-in-Chinese + + diff --git "a/ai-infra/ai-hardware/cuda\351\225\234\345\203\217.md" "b/ai-infra/ai-hardware/cuda\351\225\234\345\203\217.md" index 794b10d..a3e0ec7 100644 --- "a/ai-infra/ai-hardware/cuda\351\225\234\345\203\217.md" +++ "b/ai-infra/ai-hardware/cuda\351\225\234\345\203\217.md" @@ -8,7 +8,6 @@ https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.1.1/centos nvcr.io/nvidia/cuda:12.1.0-cudnn8-runtime-centos7 - ``` diff --git "a/ai-infra/ai-hardware/\345\255\230\345\202\250/README.md" "b/ai-infra/\345\255\230\345\202\250/README.md" similarity index 100% rename from "ai-infra/ai-hardware/\345\255\230\345\202\250/README.md" rename to "ai-infra/\345\255\230\345\202\250/README.md" diff --git "a/ai-infra/ai-hardware/\345\255\230\345\202\250/REF.md" "b/ai-infra/\345\255\230\345\202\250/REF.md" similarity index 100% rename from "ai-infra/ai-hardware/\345\255\230\345\202\250/REF.md" rename to "ai-infra/\345\255\230\345\202\250/REF.md" diff --git a/ai-infra/ai-hardware/nvme-ssd.md "b/ai-infra/\345\255\230\345\202\250/nvme-ssd.md" similarity index 100% rename from ai-infra/ai-hardware/nvme-ssd.md rename to "ai-infra/\345\255\230\345\202\250/nvme-ssd.md" diff --git "a/ai-infra/ai-hardware/\345\233\272\346\200\201\347\241\254\347\233\230.md" "b/ai-infra/\345\255\230\345\202\250/\345\233\272\346\200\201\347\241\254\347\233\230.md" similarity index 100% rename from "ai-infra/ai-hardware/\345\233\272\346\200\201\347\241\254\347\233\230.md" rename to "ai-infra/\345\255\230\345\202\250/\345\233\272\346\200\201\347\241\254\347\233\230.md" diff --git "a/ai-infra/ai-hardware/\345\255\230\345\202\250.md" "b/ai-infra/\345\255\230\345\202\250/\345\255\230\345\202\250.md" similarity index 100% rename from "ai-infra/ai-hardware/\345\255\230\345\202\250.md" rename to "ai-infra/\345\255\230\345\202\250/\345\255\230\345\202\250.md" diff --git "a/ai-infra/ai-hardware/AI\350\212\257\347\211\207.md" 
"b/ai-infra/\347\256\227\345\212\233/AI\350\212\257\347\211\207.md" similarity index 100% rename from "ai-infra/ai-hardware/AI\350\212\257\347\211\207.md" rename to "ai-infra/\347\256\227\345\212\233/AI\350\212\257\347\211\207.md" diff --git "a/ai-infra/\347\256\227\345\212\233/GPU\345\267\245\344\275\234\345\216\237\347\220\206.md" "b/ai-infra/\347\256\227\345\212\233/GPU\345\267\245\344\275\234\345\216\237\347\220\206.md" new file mode 100644 index 0000000..1460d3b --- /dev/null +++ "b/ai-infra/\347\256\227\345\212\233/GPU\345\267\245\344\275\234\345\216\237\347\220\206.md" @@ -0,0 +1,6 @@ + + + +- [GPU 工作原理解析](https://zhuanlan.zhihu.com/p/697694330) +- [GPU 架构与 CUDA 关系](https://zhuanlan.zhihu.com/p/697746975) + diff --git "a/ai-infra/ai-hardware/NVIDIA-GPU\345\236\213\345\217\267.md" "b/ai-infra/\347\256\227\345\212\233/NVIDIA-GPU\345\236\213\345\217\267.md" similarity index 99% rename from "ai-infra/ai-hardware/NVIDIA-GPU\345\236\213\345\217\267.md" rename to "ai-infra/\347\256\227\345\212\233/NVIDIA-GPU\345\236\213\345\217\267.md" index c64ad83..b743d92 100644 --- "a/ai-infra/ai-hardware/NVIDIA-GPU\345\236\213\345\217\267.md" +++ "b/ai-infra/\347\256\227\345\212\233/NVIDIA-GPU\345\236\213\345\217\267.md" @@ -1,8 +1,6 @@ - - Nvidia下游市场分为四类:游戏、专业可视化、数据中心、汽车,各市场重点产品如下: 游戏:GeForce RTX/GTX系列GPU(PCs)、GeForce NOW(云游戏)、SHIELD(游戏主机); diff --git "a/ai-infra/ai-hardware/\346\216\250\347\220\206\350\212\257\347\211\207.md" "b/ai-infra/\347\256\227\345\212\233/\346\216\250\347\220\206\350\212\257\347\211\207.md" similarity index 100% rename from "ai-infra/ai-hardware/\346\216\250\347\220\206\350\212\257\347\211\207.md" rename to "ai-infra/\347\256\227\345\212\233/\346\216\250\347\220\206\350\212\257\347\211\207.md" diff --git "a/ai-infra/ai-hardware/\346\230\207\350\205\276NPU.md" "b/ai-infra/\347\256\227\345\212\233/\346\230\207\350\205\276NPU.md" similarity index 100% rename from "ai-infra/ai-hardware/\346\230\207\350\205\276NPU.md" rename to 
"ai-infra/\347\256\227\345\212\233/\346\230\207\350\205\276NPU.md" diff --git a/docs/llm-base/scenes/README.md b/docs/llm-base/scenes/README.md index 1f270e6..b54c2ba 100644 --- a/docs/llm-base/scenes/README.md +++ b/docs/llm-base/scenes/README.md @@ -12,9 +12,14 @@ 关系抽取(Relation Extraction):从文本中抽取出实体之间的关系或联系。 信息抽取(Information Extraction):从非结构化文本中提取结构化的信息,如实体、关系和属性等。 句子相似度(Sentence Similarity):衡量两个句子之间的语义相似度或相关性。 + 文本翻译(Translation):将一种语言的文本转换为另一种语言的过程。 自然语言推理(NLI:Natural Language Inference):判断给定的前提和假设之间的逻辑关系,包括蕴含、矛盾和中立等。 + + 情感分类(Sentiment Classification):将文本分为积极、消极或中性等情感类别。 + + 人像抠图(Portrait Matting):从图像中准确地分离人物主体与背景。 通用抠图(Universal Matting):从图像中准确地分离目标物体与背景,不限于人像。 人体检测(Human Detection):检测图像或视频中的人体位置。 diff --git "a/llm-application/\345\272\224\347\224\250\345\234\272\346\231\257.md" "b/llm-application/\345\272\224\347\224\250\345\234\272\346\231\257.md" new file mode 100644 index 0000000..5034adf --- /dev/null +++ "b/llm-application/\345\272\224\347\224\250\345\234\272\346\231\257.md" @@ -0,0 +1,41 @@ + + + + +文生图: + - Stable Diffusion + - 文心一格:https://yige.baidu.com/creation?mode=0 + +图生文: + - Blip2 + + + + + + +数字人 +- 百度智能云曦灵数字人:https://xiling.cloud.baidu.com/main/plaza/portrait + + + +AI教研平台 + + +音乐生成模型: Suno V3 Alpha + +弊端就是Suno最多只能生成2分钟的音乐,所以可以听到最后,会戛然而止直接截断,但是已经比V2好很多了。 + +但是这个音质、咬字、节奏编排啥的,也都好太多太多了。 + +https://app.suno.ai/ + +要生成音乐的话,第一步肯定是写prompt,第二步(纯音乐没有)就是写歌词。 + + + + + + + + diff --git a/llm-localization/ascend/FAQ.md b/llm-localization/ascend/FAQ.md new file mode 100644 index 0000000..a257881 --- /dev/null +++ b/llm-localization/ascend/FAQ.md @@ -0,0 +1,25 @@ + + + + + +docker: Error response from daemon: failed to create shim task: OCI runtime create failed: unable to retrieve OCI runtime error (open /var/run/docker/containerd/daemon/io.containerd.runtime.v2.task/moby/579418211a825ef5c7fcf5becdbe90804f0ed7862d9c59663995f9dd463937b4/log.json: no such file or directory): /usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime did not terminate 
successfully: exit status 1: 2024/07/24 09:59:29 owner not right /usr/bin/runc 1000 + + + + +错误信息表明 /usr/bin/runc 这个文件的属主不正确:它当前属于 UID 为 1000 的用户,而不是 root 用户。Docker在创建并运行容器时需要runc这个二进制文件,如果属主设置不当,Ascend Docker Runtime 会校验失败并退出,容器将无法启动。 + + +解决办法: + + +查看属主和权限 + +ls -lah /usr/bin/runc + + +将属主修改为 root + +sudo chown root:root /usr/bin/runc + diff --git a/llm-localization/ascend/ascend-docker-runtime.md b/llm-localization/ascend/ascend-docker-runtime.md new file mode 100644 index 0000000..6f3ddbf --- /dev/null +++ b/llm-localization/ascend/ascend-docker-runtime.md @@ -0,0 +1,17 @@ + + + +昇腾docker runtime仓库,在docker容器场景下,使用昇腾NPU,提供更简单的设备和依赖路径挂载方法。 + + +https://gitee.com/ascend/ascend-docker-runtime + + + +安装:https://www.hiascend.com/document/detail/zh/mindx-dl/300/dluserguide/clusterscheduling/dlug_installation_02_000025.html + + +Ascend Docker Runtime组件参考信息说明: + +https://www.hiascend.com/document/detail/zh/mindx-dl/300/dluserguide/clusterscheduling/dlug_installation_02_000036.html + diff --git a/llm-localization/ascend/mindie/README.md b/llm-localization/ascend/mindie/README.md index fbeae83..336d853 100644 --- a/llm-localization/ascend/mindie/README.md +++ b/llm-localization/ascend/mindie/README.md @@ -112,6 +112,7 @@ docker save -o mindie-1.0.tar ascendhub.huawei.com/public-ascendhub/mindie:1.0.R scp root@192.xxx.16.211:/root/mindie-1.0.tar . + # 断点续传 rsync -P --rsh=ssh -r root@192.xxx.16.211:/root/mindie-1.0.tar . 
``` diff --git a/llm-localization/ascend/mindie/config-1.0.RC1.json b/llm-localization/ascend/mindie/config-1.0.RC1.json new file mode 100644 index 0000000..f90c857 --- /dev/null +++ b/llm-localization/ascend/mindie/config-1.0.RC1.json @@ -0,0 +1,69 @@ +{ + "OtherParam": + { + "ResourceParam" : + { + "cacheBlockSize" : 128, + "preAllocBlocks" : 8 + }, + "LogParam" : + { + "logLevel" : "Info", + "logPath" : "/logs/mindservice.log" + }, + "ServeParam" : + { + "ipAddress" : "0.0.0.0", + "port" : 1025, + "maxLinkNum" : 300, + "httpsEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : "security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "tlsCrl" : "security/certs/server_crl.pem" + } + }, + "WorkFlowParam": + { + "TemplateParam" : + { + "templateType": "Standard", + "templateName" : "Standard_llama", + "pipelineNumber" : 1 + } + }, + "ModelDeployParam": + { + "maxSeqLen" : 2560, + "npuDeviceIds" : [[$npuids]], + "ModelParam" : [ + { + "modelInstanceType": "Standard", + "modelName" : "$model_name", + "modelWeightPath" : "$model_weight_path", + "worldSize" : $world_size, + "cpuMemSize" : 5, + "npuMemSize" : $npu_mem_size, + "backendType": "atb" + } + ] + }, + "ScheduleParam": + { + "maxPrefillBatchSize" : 192, + "maxPrefillTokens" : 12000, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + "maxBatchSize" : 256, + "maxIterTimes" : 1024, + "maxPreemptCount" : 200, + "supportSelectBatch" : true, + "maxQueueDelayMicroseconds" : 5000 + } +} \ No newline at end of file diff --git a/llm-localization/ascend/mindie/llm-server.sh b/llm-localization/ascend/mindie/llm-server.sh new file mode 100644 index 0000000..e3b2ef9 --- /dev/null +++ b/llm-localization/ascend/mindie/llm-server.sh @@ -0,0 +1,142 @@ +#!/bin/bash + 
+echo "入参:" $@ + +for a in "$@"; do + #echo $a + if [[ `echo $a | grep "^--model_name="` ]]; then + model_name=`echo $a | grep "^--model_name=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--model_weight_path="` ]]; then + model_weight_path=`echo $a | grep "^--model_weight_path=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--world_size="` ]]; then + world_size=`echo $a | grep "^--world_size=" | awk -F '=' '{print $2}'` + fi + if [[ `echo $a | grep "^--npu_mem_size="` ]]; then + npu_mem_size=`echo $a | grep "^--npu_mem_size=" | awk -F '=' '{print $2}'` + fi +done + +if [ -z "$model_name" ]; then + model_name="default" +fi + +if [ -z "$model_weight_path" ]; then + model_weight_path="/workspace/model" +fi + +if [ -z "$world_size" ]; then + world_size=4 +fi + +if [ -z "$npu_mem_size" ]; then + npu_mem_size=8 +fi + +echo "平台入参: model_name: $model_name, model_weight_path: $model_weight_path , world_size: $world_size , npu_mem_size: $npu_mem_size" + + +npuids="" +card_num=$(($world_size - 1)) +for i in `seq 0 $card_num` + do + if [[ $i == $card_num ]] ; + then + npuids=$npuids$i + else + npuids=$npuids$i"," + fi + done + + +echo $npuids + + +# DEPLOYMENT_CONF_PATH="/home/guodong.li/workspace/config.json" + +DEPLOYMENT_CONF_PATH="/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json" + +cat <<EOF > $DEPLOYMENT_CONF_PATH +{ + "OtherParam": + { + "ResourceParam" : + { + "cacheBlockSize" : 128, + "preAllocBlocks" : 8 + }, + "LogParam" : + { + "logLevel" : "Info", + "logPath" : "/logs/mindservice.log" + }, + "ServeParam" : + { + "ipAddress" : "0.0.0.0", + "port" : 1025, + "maxLinkNum" : 300, + "httpsEnabled" : false, + "tlsCaPath" : "security/ca/", + "tlsCaFile" : ["ca.pem"], + "tlsCert" : "security/certs/server.pem", + "tlsPk" : "security/keys/server.key.pem", + "tlsPkPwd" : "security/pass/mindie_server_key_pwd.txt", + "kmcKsfMaster" : "tools/pmt/master/ksfa", + "kmcKsfStandby" : "tools/pmt/standby/ksfb", + "tlsCrl" : 
"security/certs/server_crl.pem" + } + }, + "WorkFlowParam": + { + "TemplateParam" : + { + "templateType": "Standard", + "templateName" : "Standard_llama", + "pipelineNumber" : 1 + } + }, + "ModelDeployParam": + { + "maxSeqLen" : 2560, + "npuDeviceIds" : [[$npuids]], + "ModelParam" : [ + { + "modelInstanceType": "Standard", + "modelName" : "$model_name", + "modelWeightPath" : "$model_weight_path", + "worldSize" : $world_size, + "cpuMemSize" : 5, + "npuMemSize" : $npu_mem_size, + "backendType": "atb" + } + ] + }, + "ScheduleParam": + { + "maxPrefillBatchSize" : 256, + "maxPrefillTokens" : 8192, + "prefillTimeMsPerReq" : 150, + "prefillPolicyType" : 0, + "decodeTimeMsPerReq" : 50, + "decodePolicyType" : 0, + "maxBatchSize" : 256, + "maxIterTimes" : 1024, + "maxPreemptCount" : 200, + "supportSelectBatch" : true, + "maxQueueDelayMicroseconds" : 50000 + } +} +EOF + +echo "部署参数,$DEPLOYMENT_CONF_PATH" +cat $DEPLOYMENT_CONF_PATH + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/mindie/set_env.sh +source /usr/local/Ascend/llm_model/set_env.sh + +export PYTHONPATH=/usr/local/Ascend/llm_model:$PYTHONPATH +cd /usr/local/Ascend/mindie/latest/mindie-service/bin + +./mindieservice_daemon \ No newline at end of file diff --git a/llm-localization/ascend/mindie/mindie-api.md b/llm-localization/ascend/mindie/mindie-api.md index 294830a..4c0137e 100644 --- a/llm-localization/ascend/mindie/mindie-api.md +++ b/llm-localization/ascend/mindie/mindie-api.md @@ -19,7 +19,7 @@ curl -H "Accept: application/json" -H "Content-type: application/json" -X POST - "content": "如何养生?" 
} ] - }' http://127.0.0.1:1025/v1/chat/completions + }' http://127.0.0.1:1125/v1/chat/completions @@ -130,7 +130,7 @@ curl "http://127.0.0.1:1025/v1/chat/completions" \ ---- -curl "http://127.0.0.1:1025/v1/chat/completions" \ +curl "http://127.0.0.1:1125/v1/chat/completions" \ -H "Content-Type: application/json" \ -d '{ "model": "qwen1.5-14b", diff --git "a/llm-localization/ascend/mindie/\346\200\247\350\203\275\350\260\203\344\274\230.md" "b/llm-localization/ascend/mindie/\346\200\247\350\203\275\350\260\203\344\274\230.md" new file mode 100644 index 0000000..0b0cba9 --- /dev/null +++ "b/llm-localization/ascend/mindie/\346\200\247\350\203\275\350\260\203\344\274\230.md" @@ -0,0 +1,35 @@ + + + + +910B4 llama-7b 10g KV CACHE + + +Total Block Num = 160 + + + + +Block Num = Ceil(输入Token数/Block Size)+Ceil(最大输出Token数/Block Size) + + +Ceil(560/128) + Ceil(512/128) = 5 + 4 = 9 + +batch_size: 20 + + + + +910B3 llama-7b 30g KV CACHE + +Total Block Num = 480 + + + +Ceil(560/128) + Ceil(512/128) = 5 + 4 = 9 + + + +batch_size: 50 + + diff --git a/llm-optimizer/FlashAttention.md b/llm-optimizer/FlashAttention.md new file mode 100644 index 0000000..1901ba9 --- /dev/null +++ b/llm-optimizer/FlashAttention.md @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/llm-performance/mindie/locust-lantency-throughput/baichuan2-13b-8tp.html b/llm-performance/mindie/locust-lantency-throughput/baichuan2-13b-8tp.html deleted file mode 100644 index 8f3c826..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/baichuan2-13b-8tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-2tp.html b/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-2tp.html deleted file mode 100644 index f8480b5..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-2tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-4tp.html b/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-4tp.html deleted file mode 100644 index 13559fd..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/baichuan2-14b-4tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/chatglm-4tp.html b/llm-performance/mindie/locust-lantency-throughput/chatglm-4tp.html deleted file mode 100644 index 2eb34e0..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/chatglm-4tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/chatglm3-6b-2tp.html b/llm-performance/mindie/locust-lantency-throughput/chatglm3-6b-2tp.html deleted file mode 100644 index e83a2f3..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/chatglm3-6b-2tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/qwen-14B.html b/llm-performance/mindie/locust-lantency-throughput/qwen-14B.html deleted file mode 100644 index 14a653c..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/qwen-14B.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/qwen-72b-8tp.html b/llm-performance/mindie/locust-lantency-throughput/qwen-72b-8tp.html deleted file mode 100644 index ed2d104..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/qwen-72b-8tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-2tp.html b/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-2tp.html deleted file mode 100644 index 32a6b4f..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-2tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-8p.html b/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-8p.html deleted file mode 100644 index 6b82b01..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-14b-8p.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-2tp.html b/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-2tp.html deleted file mode 100644 index 94fadc5..0000000 --- a/llm-performance/mindie/locust-lantency-throughput/qwen1.5-7b-2tp.html +++ /dev/null @@ -1,277 +0,0 @@ - - - - - - - - - Locust - - -
- - - - - - \ No newline at end of file diff --git a/llm-performance/vllm/locust-qwen1.5-7b-long.py b/llm-performance/vllm/vllm-locust-qwen1.5-7b-long.py similarity index 100% rename from llm-performance/vllm/locust-qwen1.5-7b-long.py rename to llm-performance/vllm/vllm-locust-qwen1.5-7b-long.py