fix: add more GPU models, fix Helm chart image issue in CN, optimize Helm chart (#194)

Code2Life · web-flow · commit fb437aa86014 · 2025-05-21T17:53:47.000+08:00
* fix: correct gpu info for 50 series, add cn specific images

* fix: ha config for production use
diff --git a/README.md b/README.md
@@ -21,6 +21,7 @@
 [![Issues][issues-shield]][issues-url]
 [![MIT License][license-shield]][license-url]
 [![LinkedIn][linkedin-shield]][linkedin-url]
+[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/NexusGPU/tensor-fusion)
 
 Tensor Fusion is a state-of-the-art **GPU virtualization and pooling solution** designed to optimize GPU cluster utilization to its fullest potential.
 
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.2.20
+version: 1.2.21
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml
@@ -74,7 +74,7 @@ spec:
             {{- toYaml .Values.agent.resources | nindent 12 }}
         {{- end }}
         - name: vector
-          image: docker.io/timberio/vector:latest-alpine
+          image: {{ .Values.controller.vectorAgentImage }}
           env:
             - name: NODE_NAME
               valueFrom:
diff --git a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml
@@ -392,14 +392,14 @@ data:
     - model: RTX_5060Ti
       fullModelName: "NVIDIA GeForce RTX 5060 Ti"
       vendor: NVIDIA
-      costPerHour: 0.25
-      fp16TFlops: 91
+      costPerHour: 0.2
+      fp16TFlops: 51
 
     - model: RTX_5060
       fullModelName: "NVIDIA GeForce RTX 5060"
       vendor: NVIDIA
       costPerHour: 0.18
-      fp16TFlops: 75
+      fp16TFlops: 42
 
     # NVIDIA Quadro RTX Ampere Series
     - model: RTXA2000
diff --git a/charts/tensor-fusion/values-cn.yaml b/charts/tensor-fusion/values-cn.yaml
@@ -0,0 +1,18 @@
+# CN-specific image overrides (Aliyun / DaoCloud registries) for the chart defaults.
+greptime:
+  image:
+    repository: greptime-registry.cn-hangzhou.cr.aliyuncs.com/greptime/greptimedb
+
+controller:
+  image:
+    repository: registry.cn-hangzhou.aliyuncs.com/tensorfusion/tensor-fusion-operator
+  vectorAgentImage: docker.m.daocloud.io/timberio/vector:latest-alpine
+  # Must stay nested under `controller`: values.yaml defines the default at
+  # controller.admissionWebhooks.patch.image, so a top-level key would not override it.
+  admissionWebhooks:
+    patch:
+      image: k8s.m.daocloud.io/ingress-nginx/kube-webhook-certgen:v1.5.0
+
+agent:
+  image:
+    repository: registry.cn-hangzhou.aliyuncs.com/tensorfusion/tensor-fusion-agent
diff --git a/charts/tensor-fusion/values-production.yaml b/charts/tensor-fusion/values-production.yaml
@@ -0,0 +1,25 @@
+# Production overrides for the tensor-fusion chart.
+
+# Two controller replicas, with higher requests/limits than the values.yaml defaults.
+controller:
+  replicaCount: 2
+  resources:
+    limits:
+      cpu: 4000m
+      memory: 4Gi
+    requests:
+      cpu: 1000m
+      memory: 1Gi
+
+# Skip the standalone Greptime install; bring your own Greptime in production for HA.
+greptime:
+  installStandalone: false
+
+agent:
+  resources:
+    limits:
+      cpu: 4000m
+      memory: 2Gi
+    requests:
+      cpu: 500m
+      memory: 256Mi
diff --git a/charts/tensor-fusion/values.schema.json b/charts/tensor-fusion/values.schema.json
@@ -84,6 +84,11 @@
           },
           "required": ["repository"]
         },
+        "vectorAgentImage": {
+          "type": "string",
+          "description": "Container image (repository and tag) of the Vector agent sidecar that collects metrics and sends them to the TSDB",
+          "default": "docker.io/timberio/vector:latest-alpine"
+        },
         "podAnnotations": {
           "type": "object",
           "description": "Annotations to add to the controller pods",
diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
@@ -32,19 +32,41 @@ controller:
     tag: "latest"
   # This is for setting Kubernetes Annotations to a Pod.
   # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ 
+  
+  vectorAgentImage: docker.io/timberio/vector:latest-alpine
+
   podAnnotations: {}
   tolerations: []
   affinity: {}
-  # livenessProbe: {}
-  # readinessProbe: {}
-  # resources: {}
-  
+  livenessProbe:
+    httpGet:
+      path: /healthz
+      port: 8081
+    initialDelaySeconds: 15
+    periodSeconds: 20
+    timeoutSeconds: 5
+    failureThreshold: 5
+  readinessProbe:
+    httpGet:
+      path: /readyz
+      port: 8081
+    initialDelaySeconds: 5
+    periodSeconds: 15
+    timeoutSeconds: 5
+    failureThreshold: 2
+  resources:
+    requests:
+      memory: 256Mi
+      cpu: 50m
+    limits:
+      memory: 2Gi
+      cpu: 2000m
+
   admissionWebhooks:
     failurePolicy: Fail
     secretName: tensor-fusion-webhook-secret
     patch:
       image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.0
-
 greptime:
   isCloud: false
   host: greptimedb-standalone.greptimedb.svc.cluster.local