# cluster.yaml
# Title: Task YAML — SkyPilot documentation
# URL Source: https://docs.skypilot.co/en/latest/reference/yaml-spec.html
# Markdown Content:
# SkyPilot provides an intuitive YAML interface to specify a task (resource requirements, setup commands, run commands, file mounts, storage mounts, and so on).
# Task YAMLs can be used with the [CLI](https://docs.skypilot.co/en/latest/reference/cli.html#cli), or the programmatic API ([`sky.Task.from_yaml()`](https://docs.skypilot.co/en/latest/reference/api.html#sky.Task.from_yaml "sky.Task.from_yaml")).
# Available fields:
# \# Task name (optional), used for display purposes.
# name: my-task
# \# Working directory (optional), synced to ~/sky\_workdir on the remote cluster
# \# each time launch or exec is run with the yaml file.
# #
# \# Commands in "setup" and "run" will be executed under it.
# #
# \# If a relative path is used, it's evaluated relative to the location from
# \# which \`sky\` is called.
# #
# \# To exclude files from syncing, see
# \# https://docs.skypilot.co/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files
# workdir: ~/my-task-code
# \# Number of nodes (optional; defaults to 1) to launch including the head node.
# #
# \# A task can set this to a smaller value than the size of a cluster.
# num\_nodes: 4
# \# Per-node resource requirements (optional).
# resources:
# cloud: aws \# The cloud to use (optional).
# \# The region to use (optional). Auto-failover will be disabled
# \# if this is specified.
# region: us-east-1
# \# The zone to use (optional). Auto-failover will be disabled
# \# if this is specified.
# zone: us-east-1a
# \# Accelerator name and count per node (optional).
# #
# \# Use \`sky show-gpus\` to view available accelerator configurations.
# #
# \# The following three ways are valid for specifying accelerators for a cluster:
# #
# \# To specify a single type of accelerator:
# \# Format: <name\>:<count\> (or simply <name\>, short for a count of 1).
# \# accelerators: H100:4
# #
# \# To specify an ordered list of accelerators (try the accelerators in
# \# the specified order):
# \# Format: \[<name\>:<count\>, ...\]
# \# accelerators: \['L4:1', 'H100:1', 'A100:1'\]
# #
# \# To specify an unordered set of accelerators (optimize all specified
# \# accelerators together, and try accelerator with lowest cost first):
# \# Format: {<name\>:<count\>, ...}
# \# accelerators: {'L4:1', 'H100:1', 'A100:1'}
# accelerators: H100:8
# \# Number of vCPUs per node (optional).
# #
# \# Format:
# \# <count\>: exactly <count\> vCPUs
# \# <count\>+: at least <count\> vCPUs
# #
# \# E.g., 4+ means first try to find an instance type with >= 4 vCPUs. If
# \# not found, use the next cheapest instance with more than 4 vCPUs.
# cpus: 4+
# \# Memory in GiB per node (optional).
# #
# \# Format:
# \# <num\>: exactly <num\> GiB
# \# <num\>+: at least <num\> GiB
# #
# \# E.g., 32+ means first try to find an instance type with >= 32 GiB. If
# \# not found, use the next cheapest instance with more than 32 GiB.
# memory: 32+
# \# Instance type to use (optional). If 'accelerators' is specified,
# \# the corresponding instance type is automatically inferred.
# instance\_type: p3.8xlarge
# \# Whether the cluster should use spot instances (optional).
# \# If unspecified, defaults to False (on-demand instances).
# use\_spot: False
# \# The recovery strategy for managed jobs (optional).
# #
# \# In effect for managed jobs. Possible values are \`FAILOVER\` and \`EAGER\_NEXT\_REGION\`.
# #
# \# If \`FAILOVER\` is specified, the job will be restarted in the same region
# \# if the node fails, and go to the next region if no available resources
# \# are found in the same region.
# #
# \# If \`EAGER\_NEXT\_REGION\` is specified, the job will go to the next region
# \# directly if the node fails. This is useful for spot instances, as in
# \# practice, preemptions in a region usually indicate a shortage of resources
# \# in that region.
# #
# \# default: EAGER\_NEXT\_REGION
# job\_recovery: none
# \# Or, to allow up to 3 restarts (default: 0) on user code errors:
# \# job\_recovery:
# \# strategy: EAGER\_NEXT\_REGION
# \# max\_restarts\_on\_errors: 3
# \# Disk size in GB to allocate for OS (mounted at /). Increase this if you
# \# have a large working directory or tasks that write out large outputs.
# disk\_size: 256
# \# Disk tier to use for OS (optional).
# \# Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium').
# \# if 'best' is specified, use the best disk tier enabled.
# \# Rough performance estimate:
# \# low: 1000 IOPS; read 90 MB/s; write 90 MB/s
# \# medium: 3000 IOPS; read 220 MB/s; write 220 MB/s
# \# high: 6000 IOPS; read 400 MB/s; write 400 MB/s
# \# ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s
# \# Measured by examples/perf/storage\_rawperf.yaml
# disk\_tier: medium
# \# Ports to expose (optional).
# #
# \# All ports specified here will be exposed to the public Internet. Under
# \# the hood, a firewall rule / inbound rule is automatically added to allow
# \# inbound traffic to these ports. Applies to all VMs of a cluster created
# \# with this field set.
# #
# \# Currently only TCP protocol is supported.
# #
# \# Ports Lifecycle:
# \# A cluster's ports will be updated whenever \`sky launch\` is executed.
# \# When launching an existing cluster, any new ports specified will be
# \# opened for the cluster, and the firewall rules for old ports will never
# \# be removed until the cluster is terminated.
# #
# \# Could be an integer, a range, or a list of integers and ranges:
# \# To specify a single port:
# \# ports: 8081
# \# To specify a port range:
# \# ports: 10052-10100
# \# To specify multiple ports / port ranges:
# \# ports:
# \# - 8080
# \# - 10022-10040
# ports: 8081
# \# Additional accelerator metadata (optional); only used for TPU node
# \# and TPU VM.
# \# Example usage:
# #
# \# To request a TPU VM:
# \# accelerator\_args:
# \# tpu\_vm: True (optional, default: True)
# #
# \# To request a TPU node:
# \# accelerator\_args:
# \# tpu\_name: ...
# \# tpu\_vm: False
# #
# \# By default, the value for "runtime\_version" is decided based on which is
# \# requested and should work for either case. If passing in an incompatible
# \# version, GCP will throw an error during provisioning.
# accelerator\_args:
# \# Default is "tpu-vm-base" for TPU VM and "2.12.0" for TPU node.
# runtime\_version: tpu-vm-base
# \# tpu\_name: mytpu
# \# tpu\_vm: True # True to use TPU VM (the default); False to use TPU node.
# \# Custom image id (optional, advanced). The image id used to boot the
# \# instances. Only supported for AWS, GCP, OCI and IBM (for non-docker image).
# \# If not specified, SkyPilot will use the default debian-based image
# \# suitable for machine learning tasks.
# #
# \# Docker support
# \# You can specify docker image to use by setting the image\_id to
# \# \`docker:<image name\>\` for Azure, AWS and GCP. For example,
# \# image\_id: docker:ubuntu:latest
# \# Currently, only debian and ubuntu images are supported.
# \# If you want to use a docker image in a private registry, you can specify your
# \# username, password, and registry server as task environment variable. For
# \# details, please refer to the \`envs\` section below.
# #
# \# AWS
# \# To find AWS AMI ids: https://leaherb.com/how-to-find-an-aws-marketplace-ami-image-id
# \# You can also change the default OS version by choosing from the
# \# following image tags provided by SkyPilot:
# \# image\_id: skypilot:gpu-ubuntu-2004
# \# image\_id: skypilot:k80-ubuntu-2004
# \# image\_id: skypilot:gpu-ubuntu-1804
# \# image\_id: skypilot:k80-ubuntu-1804
# #
# \# It is also possible to specify a per-region image id (failover will only
# \# go through the regions specified as keys; useful when you have the
# \# custom images in multiple regions):
# \# image\_id:
# \# us-east-1: ami-0729d913a335efca7
# \# us-west-2: ami-050814f384259894c
# #
# \# GCP
# \# To find GCP images: https://cloud.google.com/compute/docs/images
# \# image\_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310
# \# Or machine image: https://cloud.google.com/compute/docs/machine-images
# \# image\_id: projects/my-project/global/machineImages/my-machine-image
# #
# \# Azure
# \# To find Azure images: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
# \# image\_id: microsoft-dsvm:ubuntu-2004:2004:21.11.04
# #
# \# OCI
# \# To find OCI images: https://docs.oracle.com/en-us/iaas/images
# \# You can choose the image with OS version from the following image tags
# \# provided by SkyPilot:
# \# image\_id: skypilot:gpu-ubuntu-2204
# \# image\_id: skypilot:gpu-ubuntu-2004
# \# image\_id: skypilot:gpu-oraclelinux9
# \# image\_id: skypilot:gpu-oraclelinux8
# \# image\_id: skypilot:cpu-ubuntu-2204
# \# image\_id: skypilot:cpu-ubuntu-2004
# \# image\_id: skypilot:cpu-oraclelinux9
# \# image\_id: skypilot:cpu-oraclelinux8
# #
# \# It is also possible to specify your custom image's OCID with OS type,
# \# for example:
# \# image\_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaaywwfvy67wwe7f24juvjwhyjn3u7g7s3wzkhduxcbewzaeki2nt5q:oraclelinux
# \# image\_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaa5tnuiqevhoyfnaa5pqeiwjv6w5vf6w4q2hpj3atyvu3yd6rhlhyq:ubuntu
# #
# \# IBM
# \# Create a private VPC image and paste its ID in the following format:
# \# image\_id: <unique\_image\_id\>
# \# To create an image manually:
# \# https://cloud.ibm.com/docs/vpc?topic=vpc-creating-and-using-an-image-from-volume.
# \# To use an official VPC image creation tool:
# \# https://www.ibm.com/cloud/blog/use-ibm-packer-plugin-to-create-custom-images-on-ibm-cloud-vpc-infrastructure
# \# To use a more limited but easier to manage tool:
# \# https://github.com/IBM/vpc-img-inst
# image\_id: ami-0868a20f5a3bf9702
# \# Labels to apply to the instances (optional).
# #
# \# If specified, these labels will be applied to the VMs or pods created
# \# by SkyPilot. These are useful for assigning metadata that may be
# \# used by external tools. Implementation depends on the chosen cloud -
# \# On AWS, labels map to instance tags. On GCP, labels map to instance
# \# labels. On Kubernetes, labels map to pod labels. On other clouds,
# \# labels are not supported and will be ignored.
# #
# \# Note: Labels are applied only on the first launch of the cluster. They
# \# are not updated on subsequent launches.
# labels:
# my-label: my-value
# \# Candidate resources (optional). If specified, SkyPilot will only use
# \# these candidate resources to launch the cluster. The fields specified
# \# outside of \`any\_of\`, \`ordered\` will be used as the default values for
# \# all candidate resources, and any duplicate fields specified inside
# \# \`any\_of\`, \`ordered\` will override the default values.
# \# \`any\_of:\` means that SkyPilot will try to find a resource that matches
# \# any of the candidate resources, i.e. the failover order will be decided
# \# by the optimizer.
# \# \`ordered:\` means that SkyPilot will failover through the candidate
# \# resources with the specified order.
# \# Note: accelerators under \`any\_of\` and \`ordered\` cannot be a list or set.
# any\_of:
# \- cloud: aws
# region: us-west-2
# accelerators: H100
# \- cloud: gcp
# accelerators: H100
# \# Environment variables (optional). These values can be accessed in the
# \# \`file\_mounts\`, \`setup\`, and \`run\` sections below.
# #
# \# Values set here can be overridden by a CLI flag:
# \# \`sky launch/exec --env ENV=val\` (if ENV is present).
# #
# \# If you want to use a docker image as runtime environment in a private
# \# registry, you can specify your username, password, and registry server as
# \# task environment variable. For example:
# \# envs:
# \# SKYPILOT\_DOCKER\_USERNAME: <username\>
# \# SKYPILOT\_DOCKER\_PASSWORD: <password\>
# \# SKYPILOT\_DOCKER\_SERVER: <registry server\>
# #
# \# SkyPilot will execute \`docker login --username <username\> --password
# \# <password\> <registry server\>\` before pulling the docker image. For \`docker
# \# login\`, see https://docs.docker.com/engine/reference/commandline/login/
# #
# \# You could also specify any of them through the CLI flag if you don't want
# \# to store them in your yaml file or if you want to generate them for
# \# constantly changing password. For example:
# \# sky launch --env SKYPILOT\_DOCKER\_PASSWORD=$(aws ecr get-login-password --region us-east-1).
# #
# \# For more information about docker support in SkyPilot, please refer to the \`image\_id\` section above.
# envs:
# MY\_BUCKET: skypilot-temp-gcs-test
# MY\_LOCAL\_PATH: tmp-workdir
# MODEL\_SIZE: 13b
# file\_mounts:
# \# Uses rsync to sync local files/directories to all nodes of the cluster.
# #
# \# If a relative path is used, it's evaluated relative to the location from
# \# which \`sky\` is called.
# #
# \# If symlinks are present, they are copied as symlinks, and their targets
# \# must also be synced using file\_mounts to ensure correctness.
# /remote/dir1/file: /local/dir1/file
# /remote/dir2: /local/dir2
# \# Create a S3 bucket named sky-dataset, uploads the contents of
# \# /local/path/datasets to the bucket, and marks the bucket as persistent
# \# (it will not be deleted after the completion of this task).
# \# Symlinks and their contents are NOT copied.
# #
# \# Mounts the bucket at /datasets-storage on every node of the cluster.
# /datasets-storage:
# name: sky-dataset \# Name of storage, optional when source is bucket URI
# source: /local/path/datasets \# Source path, can be local or bucket URI. Optional, do not specify to create an empty bucket.
# store: s3 \# Could be either 's3', 'gcs', 'azure', 'r2', 'oci', or 'ibm'; default: None. Optional.
# persistent: True \# Defaults to True; can be set to false to delete bucket after cluster is downed. Optional.
# mode: MOUNT \# Either MOUNT or COPY. Defaults to MOUNT. Optional.
# \# Copies a cloud object store URI to the cluster. Can be private buckets.
# /datasets-s3: s3://my-awesome-dataset
# \# Demoing env var usage.
# /checkpoint/${MODEL\_SIZE}: ~/${MY\_LOCAL\_PATH}
# /mydir:
# name: ${MY\_BUCKET} \# Name of the bucket.
# mode: MOUNT
# \# Setup script (optional) to execute on every \`sky launch\`.
# \# This is executed before the 'run' commands.
# #
# \# The '|' separator indicates a multiline string. To specify a single command:
# \# setup: pip install -r requirements.txt
# setup: |
# echo "Begin setup."
# pip install -r requirements.txt
# echo "Setup complete."
# \# Main program (optional, but recommended) to run on every node of the cluster.
# run: |
# echo "Beginning task."
# python train.py
# \# Demoing env var usage.
# echo Env var MODEL\_SIZE has value: ${MODEL\_SIZE}
# Experimental Configurations
# ---------------------------
# Note
# Experimental features and APIs may be changed or removed without any notice.
# In addition to the above fields, SkyPilot also supports the following experimental fields in the task YAML:
# experimental:
# \# Override the configs in ~/.sky/config.yaml from a task level.
# #
# \# The following fields can be overridden. Please refer to docs of Advanced
# \# Configuration for more details of those fields:
# \# https://docs.skypilot.co/en/latest/reference/config.html
# config\_overrides:
# docker:
# run\_options: ...
# kubernetes:
# pod\_config: ...
# provision\_timeout: ...
# gcp:
# managed\_instance\_group: ...
# nvidia\_gpus:
# disable\_ecc: ...
# SkyPilot task definition: requests one GPU per node and prepares the synced
# repository (git identity, clean checkout, uv-managed dependencies).

# Sync the directory `sky` is invoked from to the remote cluster; `setup`
# commands execute inside the synced copy.
workdir: .

resources:
  # Ordered candidate list: H100 is tried first, then A100-80GB.
  accelerators: ["H100:1", "A100-80GB:1"]

envs:
  # Placeholder identity values; replace them or override at launch with
  # `sky launch --env GIT_USER_NAME=... --env GIT_USER_EMAIL=...`.
  GIT_USER_NAME: "Your Name"
  GIT_USER_EMAIL: "[email protected]"
  # Deliberately has no value here so the token is never committed; pass it
  # at launch time with `sky launch --env GITHUB_TOKEN=...`.
  GITHUB_TOKEN: null

setup: |
  # Configure git user name and email
  git config --global user.name "${GIT_USER_NAME}"
  git config --global user.email "${GIT_USER_EMAIL}"

  # Reset any uncommitted changes to the last commit
  git reset --hard HEAD

  # Remove all untracked files and directories
  git clean -fd

  # Install astral-uv
  sudo snap install --classic astral-uv

  # Sync the dependencies
  uv sync

  # Unfortunate workaround because of a cli tool naming conflict
  uv remove torchtune
  uv add torchtune

  # Move ./.gcloud.json to ~/.gcloud.json
  # NOTE(review): `git clean -fd` above deletes untracked, non-ignored files;
  # confirm .gcloud.json is tracked or git-ignored, otherwise it is already
  # gone by the time this runs.
  mv ./.gcloud.json ~/.gcloud.json