# egs-installer-config.yaml
########################### MANDATORY PARAMETERS ####################################################################
# Global image pull secret settings.
# NOTE(review): avoid committing real registry credentials to version control; fill these in locally.
global_image_pull_secret:
  repository: "https://index.docker.io/v1/"  # Docker registry URL
  username: ""  # Global Docker registry username
  password: ""  # Global Docker registry password

# Kubeconfig settings
global_kubeconfig: ""  # Relative path to the global kubeconfig file (must be in the script directory) - Mandatory
global_kubecontext: ""  # Global kubecontext to use - Mandatory
use_global_context: true  # If true, use the global kubecontext for all operations by default

# Enable or disable specific stages of the installation
enable_install_controller: true  # Enable the installation of the Kubeslice controller
enable_install_ui: true  # Enable the installation of the Kubeslice UI
enable_install_worker: true  # Enable the installation of Kubeslice workers

# Enable or disable the installation of additional applications (prometheus, gpu-operator, postgresql)
enable_install_additional_apps: false  # Set to true to enable additional apps installation
#########################################################################################################################
########################### OPTIONAL CONFIGURATION PARAMETERS ###########################################################

# ---- Project and cluster registration ----
# Enable project creation in Kubeslice.
enable_project_creation: true
# Enable cluster registration in Kubeslice.
enable_cluster_registration: true
# Prepare the worker values file for the Helm charts.
enable_prepare_worker_values_file: true

# ---- Custom applications ----
# Set to true to enable custom apps.
enable_custom_apps: false

# ---- Command execution ----
# Enable the execution of commands defined in this YAML.
run_commands: false

# ---- Global monitoring endpoints ----
# Enable automatic fetching of monitoring endpoints globally.
global_auto_fetch_endpoint: false
# Namespace where Grafana is globally deployed.
global_grafana_namespace: egs-monitoring
# Service type for Grafana (ClusterIP: accessible only within the cluster).
global_grafana_service_type: ClusterIP
# Service name for accessing Grafana globally.
global_grafana_service_name: prometheus-grafana
# Namespace where Prometheus is globally deployed.
global_prometheus_namespace: egs-monitoring
# Service name for accessing Prometheus globally.
global_prometheus_service_name: prometheus-kube-prometheus-prometheus
# Service type for Prometheus (ClusterIP: accessible only within the cluster).
global_prometheus_service_type: ClusterIP

# ---- Prechecks ----
# Run general prechecks before starting the installation.
precheck: true
# Run specific prechecks for Kubeslice components.
kubeslice_precheck: true

# ---- Global installation verification ----
# Enable verification of installations globally.
verify_install: false
# Timeout for global installation verification (in seconds).
verify_install_timeout: 600
# If true, skip steps where verification fails; otherwise exit on failure.
skip_on_verify_fail: true

# ---- Base path ----
# If left empty, the script uses the relative path to the script as the base path.
base_path: ""

# ---- Helm repository ----
# Use local Helm charts instead of fetching them from a repository.
use_local_charts: true
# Path to the directory containing local Helm charts.
local_charts_path: "charts"
# URL for the global Helm repository (if not using local charts).
global_helm_repo_url: ""
# Username for accessing the global Helm repository.
global_helm_username: ""
# Password for accessing the global Helm repository.
global_helm_password: ""
# Re-add Helm repositories even if they are already present.
readd_helm_repos: true
#### Kubeslice Controller Installation Settings ####
kubeslice_controller_egs:
  skip_installation: false  # Do not skip the installation of the controller
  use_global_kubeconfig: true  # Use global kubeconfig for the controller installation
  specific_use_local_charts: true  # Override to use local charts for the controller
  kubeconfig: ""  # Path to the kubeconfig file specific to the controller
  kubecontext: ""  # Kubecontext specific to the controller; if empty, uses the global context
  namespace: "kubeslice-controller"  # Kubernetes namespace where the controller will be installed
  release: "egs-controller"  # Helm release name for the controller
  chart: "kubeslice-controller-egs"  # Helm chart name for the controller
  #### Inline Helm Values for the Controller Chart ####
  # NOTE(review): nesting below (kubeTally under global) was reconstructed; confirm against the chart's values.yaml.
  inline_values:
    global:
      imageRegistry: docker.io/aveshasystems  # Docker registry for the images
      kubeTally:
        enabled: true  # Enable KubeTally in the controller
        postgresSecretName: kubetally-db-credentials  # Secret name for PostgreSQL credentials
        postgresAddr: "kt-postgresql.kt-postgresql.svc.cluster.local"  # Address of the PostgreSQL service
        postgresPort: 5432  # Port for the PostgreSQL service
        # NOTE(review): default credentials; presumably must match the `postgresql`
        # entry under additional_apps. Change both for non-demo installs.
        postgresUser: "postgres"  # PostgreSQL username
        postgresPassword: "postgres"  # PostgreSQL password
        postgresDB: "postgres"  # PostgreSQL database name
        postgresSslmode: disable  # SSL mode for PostgreSQL connection
        # Prometheus URL for monitoring
        prometheusUrl: http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090
    kubeslice:
      controller:
        endpoint: ""  # Endpoint of the controller API server; auto-fetched if left empty
  #### Helm Flags and Verification Settings ####
  helm_flags: "--wait --timeout 5m --debug"  # Additional Helm flags for the installation
  verify_install: false  # Verify the installation of the controller
  verify_install_timeout: 30  # Timeout for the controller installation verification (in seconds)
  skip_on_verify_fail: true  # If true, skip the step when verification fails; otherwise exit on failure
  #### Troubleshooting Settings ####
  enable_troubleshoot: false  # Enable troubleshooting mode for additional logs and checks
#### Kubeslice UI Installation Settings ####
kubeslice_ui_egs:
  skip_installation: false  # Do not skip the installation of the UI
  use_global_kubeconfig: true  # Use global kubeconfig for the UI installation
  kubeconfig: ""  # Path to the kubeconfig file specific to the UI
  kubecontext: ""  # Kubecontext specific to the UI; if empty, uses the global context
  namespace: "kubeslice-controller"  # Kubernetes namespace where the UI will be installed
  release: "egs-ui"  # Helm release name for the UI
  chart: "kubeslice-ui-egs"  # Helm chart name for the UI
  #### Inline Helm Values for the UI Chart ####
  # NOTE(review): nesting below (egsCoreApis under kubeslice) was reconstructed; confirm against the chart's values.yaml.
  inline_values:
    global:
      imageRegistry: docker.io/aveshasystems  # Docker registry for the UI images
    kubeslice:
      prometheus:
        # Prometheus URL for monitoring
        url: http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090
      uiproxy:
        service:
          type: LoadBalancer  # Service type for the UI proxy
      egsCoreApis:
        enabled: true  # Enable EGS core APIs for the UI
        service:
          type: LoadBalancer  # Service type for the EGS core APIs
  #### Helm Flags and Verification Settings ####
  helm_flags: "--wait --timeout 5m --debug"  # Additional Helm flags for the UI installation
  verify_install: false  # Verify the installation of the UI
  verify_install_timeout: 50  # Timeout for the UI installation verification (in seconds)
  skip_on_verify_fail: true  # If true, skip the step when UI verification fails; otherwise exit on failure
  #### Chart Source Settings ####
  specific_use_local_charts: true  # Override to use local charts for the UI
#### Kubeslice Worker Installation Settings ####
# One list entry per worker cluster.
kubeslice_worker_egs:
  - name: "worker-1"  # Worker name
    use_global_kubeconfig: true  # Use global kubeconfig for this worker
    kubeconfig: ""  # Path to the kubeconfig file specific to the worker
    kubecontext: ""  # Kubecontext specific to the worker; if empty, uses the global context
    skip_installation: false  # Do not skip the installation of the worker
    specific_use_local_charts: true  # Override to use local charts for this worker
    namespace: "kubeslice-system"  # Kubernetes namespace for this worker
    release: "egs-worker"  # Helm release name for the worker
    chart: "kubeslice-worker-egs"  # Helm chart name for the worker
    #### Inline Helm Values for the Worker Chart ####
    inline_values:
      global:
        imageRegistry: docker.io/aveshasystems  # Docker registry for worker images
      egs:
        # Prometheus endpoint
        prometheusEndpoint: "http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090"
        # Grafana dashboard base URL.
        # NOTE(review): "<grafana-lb>" looks like a placeholder to be replaced with the real Grafana address.
        grafanaDashboardBaseUrl: "http://<grafana-lb>/d/Oxed_c6Wz"
      metrics:
        insecure: true  # Allow insecure connections for metrics
      kserve:
        enabled: true  # Enable KServe for the worker
        kserve:  # KServe chart options
          controller:
            gateway:
              domain: kubeslice.com
              ingressGateway:
                className: "nginx"  # Ingress class name for the KServe gateway
    #### Helm Flags and Verification Settings ####
    helm_flags: "--wait --timeout 5m --debug"  # Additional Helm flags for the worker installation
    verify_install: true  # Verify the installation of the worker
    verify_install_timeout: 60  # Timeout for the worker installation verification (in seconds)
    skip_on_verify_fail: false  # Do not skip if worker verification fails
    #### Troubleshooting Settings ####
    enable_troubleshoot: false  # Enable troubleshooting mode for additional logs and checks
    #### Local Monitoring Endpoint Settings (Optional) ####
    # local_auto_fetch_endpoint: true  # Enable automatic fetching of monitoring endpoints
    # local_grafana_namespace: egs-monitoring  # Namespace where Grafana is deployed
    # local_grafana_service_name: prometheus-grafana  # Service name for accessing Grafana
    # local_grafana_service_type: ClusterIP  # Service type for Grafana (accessible only within the cluster)
    # local_prometheus_namespace: egs-monitoring  # Namespace where Prometheus is deployed
    # local_prometheus_service_name: prometheus-kube-prometheus-prometheus  # Service name for accessing Prometheus
    # local_prometheus_service_type: ClusterIP  # Service type for Prometheus (accessible only within the cluster)
#### Define Projects ####
# List of Kubeslice projects; each entry carries the project name and its user.
projects:
  - name: "avesha"  # Name of the Kubeslice project
    username: "admin"  # Username for accessing the Kubeslice project
#### Define Cluster Registration ####
# One entry per cluster to register. Here cluster_name matches the worker
# name "worker-1" and project_name matches the project "avesha" defined in this file.
cluster_registration:
  - cluster_name: "worker-1"  # Name of the cluster to be registered
    project_name: "avesha"  # Name of the project to associate with the cluster
    #### Telemetry Settings ####
    telemetry:
      enabled: true  # Enable telemetry for this cluster
      # Telemetry endpoint
      endpoint: "http://prometheus-kube-prometheus-prometheus.egs-monitoring.svc.cluster.local:9090"
      telemetryProvider: "prometheus"  # Telemetry provider (Prometheus in this case)
    #### Geo-Location Settings ####
    geoLocation:
      cloudProvider: "GCP"  # Cloud provider for this cluster (e.g., GCP)
      cloudRegion: "us-central1"  # Cloud region for this cluster (e.g., us-central1)
#### Define Additional Applications to Install ####
# Each entry describes one Helm release; `inline_values` holds the Helm values
# passed to that chart.
additional_apps:
  - name: "gpu-operator"  # Name of the application
    skip_installation: false  # Do not skip the installation of the GPU operator
    use_global_kubeconfig: true  # Use global kubeconfig for this application
    kubeconfig: ""  # Path to the kubeconfig file specific to this application
    kubecontext: ""  # Kubecontext specific to this application; uses global context if empty
    namespace: "egs-gpu-operator"  # Namespace where the GPU operator will be installed
    release: "gpu-operator"  # Helm release name for the GPU operator
    chart: "gpu-operator"  # Helm chart name for the GPU operator
    repo_url: "https://helm.ngc.nvidia.com/nvidia"  # Helm repository URL for the GPU operator
    version: "v24.6.0"  # Version of the GPU operator to install
    specific_use_local_charts: true  # Use local charts for this application
    #### Inline Helm Values for GPU Operator ####
    inline_values:
      hostPaths:
        driverInstallDir: "/home/kubernetes/bin/nvidia"
      toolkit:
        installDir: "/home/kubernetes/bin/nvidia"
      cdi:
        enabled: true
        default: true
      driver:
        enabled: false
    helm_flags: "--debug"  # Additional Helm flags for this application's installation
    verify_install: false  # Verify the installation of the GPU operator
    verify_install_timeout: 600  # Timeout for verification (in seconds)
    skip_on_verify_fail: true  # Skip the step if verification fails
    enable_troubleshoot: false  # Enable troubleshooting mode for additional logs and checks

  - name: "prometheus"  # Name of the application
    skip_installation: false  # Do not skip the installation of Prometheus
    use_global_kubeconfig: true  # Use global kubeconfig for Prometheus
    kubeconfig: ""  # Path to the kubeconfig file specific to this application
    kubecontext: ""  # Kubecontext specific to this application; uses global context if empty
    namespace: "egs-monitoring"  # Namespace where Prometheus will be installed
    release: "prometheus"  # Helm release name for Prometheus
    chart: "kube-prometheus-stack"  # Helm chart name for Prometheus
    repo_url: "https://prometheus-community.github.io/helm-charts"  # Helm repository URL for Prometheus
    version: "v45.0.0"  # Version of the Prometheus stack to install
    specific_use_local_charts: true  # Use local charts for this application
    values_file: ""  # Path to an external values file, if any
    #### Inline Helm Values for Prometheus ####
    inline_values:
      prometheus:
        service:
          type: ClusterIP  # Service type for Prometheus
        prometheusSpec:
          storageSpec: {}  # Placeholder for storage configuration
          additionalScrapeConfigs:
            - job_name: tgi
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - source_labels: [__meta_kubernetes_pod_name]
                  target_label: pod_name
                - source_labels: [__meta_kubernetes_pod_container_name]
                  target_label: container_name
            - job_name: gpu-metrics
              scrape_interval: 1s
              metrics_path: /metrics
              scheme: http
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - egs-gpu-operator
              relabel_configs:
                - source_labels: [__meta_kubernetes_endpoints_name]
                  action: drop
                  regex: .*-node-feature-discovery-master
                - source_labels: [__meta_kubernetes_pod_node_name]
                  action: replace
                  target_label: kubernetes_node
      grafana:
        enabled: true  # Enable Grafana
        grafana.ini:
          auth:
            disable_login_form: true
            disable_signout_menu: true
          auth.anonymous:
            enabled: true
            org_role: Viewer
        service:
          type: LoadBalancer  # Service type for Grafana
        persistence:
          enabled: false  # Disable persistence
          size: 1Gi  # Default persistence size
    helm_flags: "--debug"  # Additional Helm flags for this application's installation
    verify_install: false  # Verify the installation of Prometheus
    verify_install_timeout: 600  # Timeout for verification (in seconds)
    skip_on_verify_fail: true  # Skip the step if verification fails
    enable_troubleshoot: false  # Enable troubleshooting mode for additional logs and checks

  - name: "postgresql"  # Name of the application
    skip_installation: false  # Do not skip the installation of PostgreSQL
    use_global_kubeconfig: true  # Use global kubeconfig for PostgreSQL
    kubeconfig: ""  # Path to the kubeconfig file specific to this application
    kubecontext: ""  # Kubecontext specific to this application; uses global context if empty
    namespace: "kt-postgresql"  # Namespace where PostgreSQL will be installed
    release: "kt-postgresql"  # Helm release name for PostgreSQL
    chart: "postgresql"  # Helm chart name for PostgreSQL
    repo_url: "oci://registry-1.docker.io/bitnamicharts/postgresql"  # Helm repository URL for PostgreSQL
    version: "16.2.1"  # Version of the PostgreSQL chart to install
    specific_use_local_charts: true  # Use local charts for this application
    values_file: ""  # Path to an external values file, if any
    #### Inline Helm Values for PostgreSQL ####
    # NOTE(review): default credentials; the controller's kubeTally settings in this
    # file use the same user/password/database, so presumably they must stay in sync.
    inline_values:
      auth:
        postgresPassword: "postgres"  # Explicit password (use if not relying on `existingSecret`)
        username: "postgres"  # Explicit username (fallback if `existingSecret` is not used)
        password: "postgres"  # Password for PostgreSQL (optional)
        database: "postgres"  # Default database to create
      primary:
        persistence:
          enabled: false  # Disable persistent storage for PostgreSQL
          size: 10Gi  # Size of the Persistent Volume Claim
    helm_flags: "--wait --debug"  # Additional Helm flags for this application's installation
    verify_install: true  # Verify the installation of PostgreSQL
    verify_install_timeout: 600  # Timeout for verification (in seconds)
    skip_on_verify_fail: false  # Do not skip if verification fails
#### Define Custom Applications and Associated Manifests ####
# Each entry is applied either from `manifest` (URL or path) or, when that is
# empty, from the literal YAML in `inline_yaml`.
manifests:
  - appname: gpu-operator-quota  # Name of the custom application
    manifest: ""  # URL or path to the manifest file; if empty, inline YAML is used
    overrides_yaml: ""  # Path to an external YAML file with overrides, if any
    inline_yaml: |  # Inline YAML content for this custom application
      apiVersion: v1
      kind: ResourceQuota
      metadata:
        name: gpu-operator-quota
      spec:
        hard:
          pods: 100 # Maximum number of pods
        scopeSelector:
          matchExpressions:
            - operator: In
              scopeName: PriorityClass # Define scope for PriorityClass
              values:
                - system-node-critical
                - system-cluster-critical
    use_global_kubeconfig: true  # Use global kubeconfig for this application
    skip_installation: false  # Do not skip the installation of this application
    verify_install: false  # Verify the installation of this application
    verify_install_timeout: 30  # Timeout for verification (in seconds)
    skip_on_verify_fail: true  # Skip if verification fails
    namespace: egs-gpu-operator  # Namespace for this application
    kubeconfig: ""  # Path to the kubeconfig file specific to this application
    kubecontext: ""  # Kubecontext specific to this application; uses global context if empty

  - appname: nvidia-driver-installer  # Name of the custom application
    # URL to the manifest file
    manifest: "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml"
    overrides_yaml: ""  # Path to an external YAML file with overrides, if any
    inline_yaml: null  # Inline YAML content for this application
    use_global_kubeconfig: true  # Use global kubeconfig for this application
    kubeconfig: ""  # Path to the kubeconfig file specific to this application
    kubecontext: ""  # Kubecontext specific to this application; uses global context if empty
    skip_installation: false  # Do not skip the installation of this application
    verify_install: false  # Verify the installation of this application
    verify_install_timeout: 200  # Timeout for verification (in seconds)
    skip_on_verify_fail: true  # Skip if verification fails
    namespace: kube-system  # Namespace for this application
#### Define Commands to Execute ####
# Each entry carries a `command_stream` script plus the kube context to run it in.
commands:
  - use_global_kubeconfig: true  # Use global kubeconfig for these commands
    kubeconfig: ""  # Path to the kubeconfig file specific to these commands
    kubecontext: ""  # Kubecontext specific to these commands; uses global context if empty
    skip_installation: true  # Skip the execution of these commands (set to false to run them)
    verify_install: false  # Verify the execution of these commands
    verify_install_timeout: 200  # Timeout for verification (in seconds)
    skip_on_verify_fail: true  # Skip if command verification fails
    namespace: kube-system  # Namespace context for these commands
    command_stream: |  # Commands to execute
      kubectl create namespace egs-gpu-operator --dry-run=client -o yaml | kubectl apply -f -
      kubectl get nodes
      kubectl get nodes -o json | jq -r '.items[] | select(.status.capacity["nvidia.com/gpu"] != null) | .metadata.name' | xargs -I {} kubectl label nodes {} gke-no-default-nvidia-gpu-device-plugin=true --overwrite
#### Troubleshooting Mode Settings ####
# NOTE(review): nesting reconstructed (resource_types, api_groups and upload_logs
# placed under enable_troubleshoot); confirm against the installer's expected schema.
enable_troubleshoot:
  enabled: false  # Global enable troubleshooting mode for additional logs and checks
  #### Resource Types to Troubleshoot ####
  resource_types:
    - pods
    - deployments
    - daemonsets
    - statefulsets
    - replicasets
    - jobs
    - configmaps
    - secrets
    - services
    - serviceaccounts
    - roles
    - rolebindings
    - crds
  #### API Groups to Troubleshoot ####
  api_groups:
    - controller.kubeslice.io
    - worker.kubeslice.io
    - inventory.kubeslice.io
    - aiops.kubeslice.io
    - networking.kubeslice.io
    - monitoring.coreos.com
  #### Upload Log Settings ####
  upload_logs:
    enabled: false  # Enable log upload functionality
    # Command to execute for log upload. The original used an empty `|` block
    # scalar, which parses to an empty string; written explicitly here.
    command: ""
#### List of Required Binaries ####
required_binaries:
  # YAML processor
  - yq
  # Helm package manager
  - helm
  # JSON processor
  - jq
  # Kubernetes command-line tool
  - kubectl

#### Node Labeling Settings ####
# Enable node labeling during installation.
add_node_label: false

# Version of the input configuration file (quoted so it stays a string).
version: "1.0.0"