Skip to content

Commit fb437aa

Browse files
authored
fix: add more gpu models, helm chart in CN issue, optimize helm chart (#194)
* fix: correct gpu info for 50 series, add cn specific images * fix: ha config for production use
1 parent cfe79f0 commit fb437aa

8 files changed

+76
-10
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
[![Issues][issues-shield]][issues-url]
2222
[![MIT License][license-shield]][license-url]
2323
[![LinkedIn][linkedin-shield]][linkedin-url]
24+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/NexusGPU/tensor-fusion)
2425

2526
Tensor Fusion is a state-of-the-art **GPU virtualization and pooling solution** designed to optimize GPU cluster utilization to its fullest potential.
2627

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.20
18+
version: 1.2.21
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/controller-deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ spec:
7474
{{- toYaml .Values.agent.resources | nindent 12 }}
7575
{{- end }}
7676
- name: vector
77-
image: docker.io/timberio/vector:latest-alpine
77+
image: {{ .Values.controller.vectorAgentImage }}
7878
env:
7979
- name: NODE_NAME
8080
valueFrom:

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -392,14 +392,14 @@ data:
392392
- model: RTX_5060Ti
393393
fullModelName: "NVIDIA GeForce RTX 5060 Ti"
394394
vendor: NVIDIA
395-
costPerHour: 0.25
396-
fp16TFlops: 91
395+
costPerHour: 0.2
396+
fp16TFlops: 51
397397
398398
- model: RTX_5060
399399
fullModelName: "NVIDIA GeForce RTX 5060"
400400
vendor: NVIDIA
401401
costPerHour: 0.18
402-
fp16TFlops: 75
402+
fp16TFlops: 42
403403
404404
# NVIDIA Quadro RTX Ampere Series
405405
- model: RTXA2000

charts/tensor-fusion/values-cn.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
greptime:
2+
image:
3+
repository: greptime-registry.cn-hangzhou.cr.aliyuncs.com/greptime/greptimedb
4+
5+
controller:
6+
image:
7+
repository: registry.cn-hangzhou.aliyuncs.com/tensorfusion/tensor-fusion-operator
8+
vectorAgentImage: docker.m.daocloud.io/timberio/vector:latest-alpine
9+
10+
admissionWebhooks:
11+
patch:
12+
image: k8s.m.daocloud.io/ingress-nginx/kube-webhook-certgen:v1.5.0
13+
14+
agent:
15+
image:
16+
repository: registry.cn-hangzhou.aliyuncs.com/tensorfusion/tensor-fusion-agent
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
controller:
2+
replicaCount: 2
3+
resources:
4+
requests:
5+
memory: 1Gi
6+
cpu: 1000m
7+
limits:
8+
memory: 4Gi
9+
cpu: 4000m
10+
11+
# In production, bring your own Greptime for HA; the standalone install is disabled below
12+
greptime:
13+
installStandalone: false
14+
15+
agent:
16+
resources:
17+
requests:
18+
cpu: 500m
19+
memory: 256Mi
20+
limits:
21+
cpu: 4000m
22+
memory: 2Gi

charts/tensor-fusion/values.schema.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@
8484
},
8585
"required": ["repository"]
8686
},
87+
"vectorAgentImage": {
88+
"type": "string",
89+
"description": "Full image reference (repository and tag) for the vector agent, which collects metrics to the TSDB",
90+
"default": "docker.io/timberio/vector:latest-alpine"
91+
},
8792
"podAnnotations": {
8893
"type": "object",
8994
"description": "Annotations to add to the controller pods",

charts/tensor-fusion/values.yaml

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,41 @@ controller:
3232
tag: "latest"
3333
# podAnnotations sets Kubernetes annotations on the Pod.
3434
# For more information, check out: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
35+
36+
vectorAgentImage: docker.io/timberio/vector:latest-alpine
37+
3538
podAnnotations: {}
3639
tolerations: []
3740
affinity: {}
38-
# livenessProbe: {}
39-
# readinessProbe: {}
40-
# resources: {}
41-
41+
livenessProbe:
42+
httpGet:
43+
path: /healthz
44+
port: 8081
45+
initialDelaySeconds: 15
46+
periodSeconds: 20
47+
timeoutSeconds: 5
48+
failureThreshold: 5
49+
readinessProbe:
50+
httpGet:
51+
path: /readyz
52+
port: 8081
53+
initialDelaySeconds: 5
54+
periodSeconds: 15
55+
timeoutSeconds: 5
56+
failureThreshold: 2
57+
resources:
58+
requests:
59+
memory: 256Mi
60+
cpu: 50m
61+
limits:
62+
memory: 2Gi
63+
cpu: 2000m
64+
4265
admissionWebhooks:
4366
failurePolicy: Fail
4467
secretName: tensor-fusion-webhook-secret
4568
patch:
4669
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.0
47-
4870
greptime:
4971
isCloud: false
5072
host: greptimedb-standalone.greptimedb.svc.cluster.local

0 commit comments

Comments
 (0)