forked from huggingface/text-generation-inference
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add nightly load testing (huggingface#358)
- Loading branch information
1 parent
0a64947
commit 5f67923
Showing
4 changed files
with
173 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
# Nightly load test: starts an EC2 GPU runner, launches text-generation-inference
# with bigcode/starcoder on it, runs the k6 script in load_tests/, and then
# stops the runner regardless of the outcome.
name: Nightly load test

on:
  schedule:
    # Midnight, Monday through Friday (GitHub evaluates cron in UTC).
    - cron: '0 0 * * 1-5'

  # Also run when a pull request targeting main modifies this workflow file.
  pull_request:
    paths:
      - ".github/workflows/load_test.yaml"
    branches:
      - 'main'

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
      EC2_AMI_ID: ami-03cfed9ea28f4b002
      EC2_INSTANCE_TYPE: g5.12xlarge
      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
      EC2_SECURITY_GROUP: sg-04d472c808f365022
    # Consumed by load-tests (runs-on) and by stop-runner (label + instance id).
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): this action is referenced by branch (@main), not by a
        # tag or commit SHA — consider pinning it.
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ env.EC2_AMI_ID }}
          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
          subnet-id: ${{ env.EC2_SUBNET_ID }}
          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
          aws-resource-tags: > # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

  load-tests:
    # One in-flight run per PR branch (or per run id for scheduled runs);
    # a newer run cancels the older one.
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    needs: start-runner # required to start the main job when the runner is ready
    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    env:
      DOCKER_VOLUME: /cache
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Prepare disks
        # Formats /dev/nvme1n1 and mounts it at DOCKER_VOLUME.
        # -p keeps mkdir from failing when the mount point already exists.
        run: |
          sudo mkfs -t ext4 /dev/nvme1n1
          sudo mkdir -p ${{ env.DOCKER_VOLUME }}
          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}

      - name: Install k6
        # --fail makes curl exit non-zero on an HTTP error instead of piping
        # an error page into tar.
        run: |
          curl --fail -L https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz | tar xvz --strip-components 1

      - name: Start starcoder
        # The token is handed to the step through the environment and forwarded
        # to the container with `-e NAME` (no value), so the secret is never
        # interpolated into the shell command text.
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # NOTE(review): the wget manual documents --retry-on-http-error as
        # taking a comma-separated list of status codes; confirm that the bare
        # form below behaves as intended before relying on it.
        run: |
          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
          sleep 10
          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health

      - name: Run k6
        run: |
          ./k6 run load_tests/starcoder_load.js

      - name: Stop starcoder
        if: ${{ always() }}
        # Best effort: `|| true` keeps this step green when the container is
        # already gone.
        run: |
          docker stop tgi-starcoder || true

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - load-tests
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Stop EC2 runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import {check} from 'k6';
import http from 'k6/http';
import {Trend} from 'k6/metrics';

// Server under test as host:port; the HOST environment variable overrides it.
const host = __ENV.HOST || '127.0.0.1:3000';

// Response header -> custom k6 trend that records its value.
// The second Trend argument is k6's `isTime` flag.
const timingTrends = [
    ['X-Total-Time', new Trend('total_time', true)],
    ['X-Validation-Time', new Trend('validation_time', true)],
    ['X-Queue-Time', new Trend('queue_time', true)],
    ['X-Inference-Time', new Trend('inference_time', true)],
    ['X-Time-Per-Token', new Trend('time_per_token', true)],
];

// Number of tokens requested from the server and expected back in the details.
const expectedTokens = 60;

// Request body sent on every iteration, serialized once up front.
// NOTE(review): the two prompt fragments are joined without a newline, so the
// comment and `def fibonacci` end up on one line — confirm this is intended.
const example = {
    payload: JSON.stringify({
        inputs: '# This is a fibonacci function written in the Python programming language.' +
            'def fibonacci',
        parameters: {
            details: true,
            max_new_tokens: expectedTokens,
            temperature: 0.2,
            top_p: 0.95,
            seed: 0,
        },
    }),
    generated_tokens: expectedTokens,
};

// Pass/fail criteria: no failed requests, plus p95 bounds on two custom trends.
const thresholds = {
    http_req_failed: ['rate==0'],
    time_per_token: ['p(95)<90'],
    queue_time: ['p(95)<1500'],
};

// Constant arrival rate: 10 iterations per second for 60 seconds.
const loadTestScenario = {
    executor: 'constant-arrival-rate',
    duration: '60s',
    preAllocatedVUs: 100,
    rate: 10,
    timeUnit: '1s',
};

export const options = {
    thresholds,
    scenarios: {
        load_test: loadTestScenario,
    },
};

/**
 * k6 iteration body: POSTs the example payload to `/generate`, checks the
 * status code and the reported number of generated tokens, and on success
 * feeds the timing response headers into the custom trends.
 */
export default function () {
    const params = {
        headers: {'Content-Type': 'application/json'},
    };
    const res = http.post(`http://${host}/generate`, example.payload, params);

    const isOk = (r) => r.status === 200;

    check(res, {
        'Post status is 200': isOk,
        // Only parse the body once the status is known to be 200.
        'Post response generated tokens': (r) =>
            isOk(r) && r.json().details.generated_tokens === example.generated_tokens,
    });

    if (!isOk(res)) {
        return;
    }

    // NOTE(review): header values are passed through unchanged, as in the
    // original; presumably Trend.add coerces them to numbers — confirm.
    for (const [header, trend] of timingTrends) {
        trend.add(res.headers[header]);
    }
}