
Commit 1ce2dc1

[Core] Support Multiple Resources (skypilot-org#2498)
* merge master into multi-acc
* debug and bash format.sh
* fix bug initialize resources_pref_list
* fix bug
* format
* improve printing
* format
* tmp
* fix spot launch issue
* fuzzing target
* bug fix
* fix bugs refactor
* add in new_task_resources
* fix list copy
* len(self.resources
* spacing
* is_resources_ordered
* Refactor a bit
* remove print
* fix bug
* updated optimization table
* comment out logger info
* bug fix unordered list
* format
* debug and added launched accelerators in recovery strategy
* bug fix
* sort rows by resources_pref_list
* UI tweak
* multi_resources
* using use_spot to sort the tables
* added mixed spot and demand
* address weilin comments
* format.sh
* fix bug
* added default spot recovery to make
* added test smoke, removed mixed spot, addressed comments
* format.sh
* added smoke test and resources_ordered
* make the output log use 1 line vs. 2.
* region lists, fix bug with managed spot dashboard
* conditions for len(region) <= 1
* multi_resources.yaml
* merge master
* change yaml example
* Refactor backend_utils and pytest
* address comments
* addressed comments
* fix pytest
* fix test
* format
* disable sky.exec on multiple resources
* minor formatting changes
* refactor
* format.sh
* style nit: avoid abbreviation
* added check_resources_fit_cluster to exec
* combine get_resources
* added usage.lib collection
* update comment
* update comment
* combine multi resources checks
* fix multiple resources, same accelerator type
* format.sh and move exec position
* update list and set for resources
* debugging
* address comment
* fix format and test
* fix bug
* update test
* support exec
* using dict to represent res
* support {'A100-40GB:1', 'K80:1', 'V100:1', 'T4:1'}
* address Zhanghao's code review
* tmp
* fix bugs
* _get_resource_group_hash
* added _execute
* added tpu back
* did the tests pass
* added monkeypatch
* enabled all clouds
* comment out unordered
* comment out unordered
* fix get_task_resources_str
* overwrite resources with valid_resource
* removed regions from this PR
* used v[chosen_resources
* fix key hash issue
* move check to cloud.py
* assert
* remove assert
* refactor optimizer _optimize_objective
* added in assert for optimize by cost
* addressed most of the code review
* fix resources import
* optimize_dag_with_user
* fix bug
* update yaml config
* fix node_to_cost_map out of sync with best_plan
* check
* fix bug
* fix basic changes
* refactor _optimize_dag
* fix compare_optimization_results
* format
* fix bug
* fix test
* fix
* addressed some pr comments
* fix check schemas
* added single resource schema
* fix bug
* added test_multiple_resources_unordered
* address code reviews
* fix ports issue
* address code review
* fix smoke test
* smoke
1 parent aecb2de commit 1ce2dc1

22 files changed (+851 / -242 lines)
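The headline change lets a task carry several candidate resources instead of exactly one. A minimal sketch of how the new example tasks below might be driven from the Python API, assuming `sky.Task.from_yaml` and `sky.launch` behave as in the public SkyPilot API (the cluster name is illustrative only):

import sky

# Load the new example task that lists several candidate accelerators.
task = sky.Task.from_yaml('examples/multi_accelerators.yaml')

# The optimizer considers every candidate together and provisions the
# cheapest one that is available; a later `sky exec` reuses whichever
# candidate fits the existing cluster.
sky.launch(task, cluster_name='multi-acc-demo')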

docs/source/reference/yaml-spec.rst

+10 / -2
@@ -43,8 +43,16 @@ Available fields:
   # Accelerator name and count per node (optional).
   #
   # Use `sky show-gpus` to view available accelerator configurations.
-  #
-  # Format: <name>:<count> (or simply <name>, short for a count of 1).
+  # The following three ways are valid for specifying accelerators for a cluster:
+  # To specify a single accelerator:
+  #   Format: <name>:<count> (or simply <name>, short for a count of 1).
+  #   accelerators: V100:4
+  # To specify an ordered list of accelerators: Try the accelerators in the specified order.
+  #   Format: [<name>:<count>, ...]
+  #   accelerators: ['K80:1', 'V100:1', 'T4:1']
+  # To specify an unordered set of accelerators: Optimize all specified accelerators together, and try the accelerator with the lowest cost first.
+  #   Format: {<name>:<count>, ...}
+  #   accelerators: {'K80:1', 'V100:1', 'T4:1'}
   accelerators: V100:4
 
   # Number of vCPUs per node (optional).
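For readers using the Python API rather than YAML, the same three forms can be sketched roughly as follows, assuming `sky.Resources(accelerators=...)` and that `Task.set_resources` accepts a single `Resources`, a list, or a set, as the backend changes later in this diff suggest:

import sky

task = sky.Task(run='nvidia-smi')

# Single accelerator.
task.set_resources(sky.Resources(accelerators='V100:4'))

# Ordered list: candidates are tried in the order given.
task.set_resources([
    sky.Resources(accelerators='K80:1'),
    sky.Resources(accelerators='V100:1'),
    sky.Resources(accelerators='T4:1'),
])

# Unordered set: candidates are optimized together, cheapest first.
task.set_resources({
    sky.Resources(accelerators='K80:1'),
    sky.Resources(accelerators='V100:1'),
    sky.Resources(accelerators='T4:1'),
})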

examples/multi_accelerators.yaml

+12
@@ -0,0 +1,12 @@
+name: multi-accelerators
+
+resources:
+
+  # Ordered list of accelerators: Try the accelerators in the specified order.
+  # accelerators: ['A100-40GB:1', 'V100:1', 'K80:1', 'T4:1']
+
+  # Unordered set of accelerators: Optimize all specified accelerators together, and try accelerator with lowest cost first.
+  accelerators: {'A100-40GB:1', 'K80:1', 'V100:1', 'T4:1', 'T4:4'}
+
+run: |
+  nvidia-smi

examples/multi_resources.yaml

+18
@@ -0,0 +1,18 @@
+name: multi-resources
+
+resources:
+  ordered:
+    - cloud: AWS
+      accelerators: A10g
+    - cloud: GCP
+      accelerators: L4
+
+# resources:
+#   any_of:
+#     - cloud: AWS
+#       accelerators: A10g
+#     - cloud: GCP
+#       accelerators: L4
+
+run: |
+  nvidia-smi
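The `ordered:` and `any_of:` blocks in this example map naturally onto a list versus a set of `Resources` objects. A hypothetical Python equivalent (cloud classes such as `sky.AWS()` and `sky.GCP()` are assumed to be exposed as in the public API):

import sky

task = sky.Task(run='nvidia-smi')

# ordered: try AWS A10G first, then fall back to GCP L4.
task.set_resources([
    sky.Resources(cloud=sky.AWS(), accelerators='A10g'),
    sky.Resources(cloud=sky.GCP(), accelerators='L4'),
])

# any_of: let the optimizer pick whichever candidate is cheapest/available.
# task.set_resources({
#     sky.Resources(cloud=sky.AWS(), accelerators='A10g'),
#     sky.Resources(cloud=sky.GCP(), accelerators='L4'),
# })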

sky/backends/backend_utils.py

+80 / -36
@@ -1967,43 +1967,66 @@ def check_can_clone_disk_and_override_task(
                 'disk is only supported when creating a new cluster. To fix: specify '
                 'a new target cluster name.')
 
-    assert len(task.resources) == 1, task.resources
-    task_resources = list(task.resources)[0]
-    if handle.launched_resources.disk_size > task_resources.disk_size:
-        # The target cluster's disk should be at least as large as the source.
-        with ux_utils.print_exception_no_traceback():
-            target_cluster_name_str = f' {target_cluster_name!r}'
-            if target_cluster_name is None:
-                target_cluster_name_str = ''
-            raise exceptions.NotSupportedError(
-                f'The target cluster{target_cluster_name_str} should have a disk size '
-                f'of at least {handle.launched_resources.disk_size} GB to clone the '
-                f'disk from {cluster_name!r}.')
-    override_param = {}
+    new_task_resources = []
     original_cloud = handle.launched_resources.cloud
-    assert original_cloud is not None, handle.launched_resources
-    if task_resources.cloud is None:
-        override_param['cloud'] = original_cloud
-    else:
-        if not original_cloud.is_same_cloud(task_resources.cloud):
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    f'Cannot clone disk across cloud from {original_cloud} to '
-                    f'{task_resources.cloud}.')
     original_cloud.check_features_are_supported(
         {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})
 
-    if task_resources.region is None:
-        override_param['region'] = handle.launched_resources.region
-
-    if override_param:
-        logger.info(
-            f'No cloud/region specified for the task. Using the same region '
-            f'as source cluster {cluster_name!r}: '
-            f'{handle.launched_resources.cloud}'
-            f'({handle.launched_resources.region}).')
+    assert original_cloud is not None, handle.launched_resources
+    has_override = False
+    has_disk_size_met = False
+    has_cloud_met = False
+    for task_resources in task.resources:
+        if handle.launched_resources.disk_size > task_resources.disk_size:
+            # The target cluster's disk should be at least as large as the source.
+            continue
+        has_disk_size_met = True
+        if task_resources.cloud is not None and not original_cloud.is_same_cloud(
+                task_resources.cloud):
+            continue
+        has_cloud_met = True
+
+        override_param = {}
+        if task_resources.cloud is None:
+            override_param['cloud'] = original_cloud
+        if task_resources.region is None:
+            override_param['region'] = handle.launched_resources.region
+
+        if override_param:
+            logger.info(
+                f'No cloud/region specified for the task {task_resources}. Using the same region '
+                f'as source cluster {cluster_name!r}: '
+                f'{handle.launched_resources.cloud}'
+                f'({handle.launched_resources.region}).')
+            has_override = True
         task_resources = task_resources.copy(**override_param)
-    task.set_resources({task_resources})
+        new_task_resources.append(task_resources)
+
+    if not new_task_resources:
+        if not has_disk_size_met:
+            with ux_utils.print_exception_no_traceback():
+                target_cluster_name_str = f' {target_cluster_name!r}'
+                if target_cluster_name is None:
+                    target_cluster_name_str = ''
+                raise exceptions.NotSupportedError(
+                    f'The target cluster{target_cluster_name_str} should have a disk size '
+                    f'of at least {handle.launched_resources.disk_size} GB to clone the '
+                    f'disk from {cluster_name!r}.')
+        if not has_cloud_met:
+            task_resources_cloud_str = '[' + ','.join(
+                [f'{res.cloud}' for res in task.resources]) + ']'
+            task_resources_str = '[' + ','.join(
+                [f'{res}' for res in task.resources]) + ']'
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Cannot clone disk across cloud from {original_cloud} to '
+                    f'{task_resources_cloud_str} for resources {task_resources_str}.'
+                )
+        assert False, 'Should not reach here.'
+    # Set the new_task_resources to be the same type (list or set) as the
+    # original task.resources.
+    if has_override:
+        task.set_resources(type(task.resources)(new_task_resources))
     # Reset the best_resources to trigger re-optimization
     # later, so that the new task_resources will be used.
     task.best_resources = None
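The rewritten function now keeps every candidate resource that satisfies both the disk-size and the cloud constraint and only raises when none survive. A simplified, standalone sketch of that filtering rule (using hypothetical plain-data stand-ins rather than the real `Resources` class):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CandidateResource:
    cloud: Optional[str]
    disk_size: int


def filter_clone_candidates(candidates: List[CandidateResource],
                            source_cloud: str,
                            source_disk_size: int) -> List[CandidateResource]:
    """Keep candidates whose disk is large enough and whose cloud matches."""
    kept = []
    for res in candidates:
        if source_disk_size > res.disk_size:
            # The target disk must be at least as large as the source disk.
            continue
        if res.cloud is not None and res.cloud != source_cloud:
            # Disks cannot be cloned across clouds.
            continue
        kept.append(res)
    if not kept:
        raise ValueError('No candidate resource can clone the source disk.')
    return kept


# Example: only the 256 GB AWS candidate survives for a 200 GB AWS source disk.
survivors = filter_clone_candidates(
    [CandidateResource('GCP', 256), CandidateResource('AWS', 256),
     CandidateResource(None, 128)],
    source_cloud='AWS', source_disk_size=200)
print(survivors)  # [CandidateResource(cloud='AWS', disk_size=256)]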
@@ -2724,11 +2747,32 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Optional[Dict[str, float]]:
 
 
 def get_task_resources_str(task: 'task_lib.Task') -> str:
-    resources_dict = get_task_demands_dict(task)
-    if resources_dict is None:
-        resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}'
+    if task.best_resources is not None:
+        accelerator_dict = task.best_resources.accelerators
+        if accelerator_dict is None:
+            resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}'
+        else:
+            resources_str = ', '.join(
+                f'{k}:{v}' for k, v in accelerator_dict.items())
+    elif len(task.resources) == 1:
+        resources_dict = list(task.resources)[0].accelerators
+        if resources_dict is None:
+            resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}'
+        else:
+            resources_str = ', '.join(
+                f'{k}:{v}' for k, v in resources_dict.items())
     else:
-        resources_str = ', '.join(f'{k}:{v}' for k, v in resources_dict.items())
+        resource_accelerators = []
+        for resource in task.resources:
+            if resource.accelerators is None:
+                continue
+            for k, v in resource.accelerators.items():
+                resource_accelerators.append(f'{k}:{v}')
+
+        if resource_accelerators:
+            resources_str = ', '.join(set(resource_accelerators))
+        else:
+            resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}'
     resources_str = f'{task.num_nodes}x [{resources_str}]'
     return resources_str
 
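With several candidate resources and no chosen `best_resources`, the accelerators from all candidates are merged into one de-duplicated string. A quick sketch of the resulting format, using hypothetical minimal stand-ins (the real default CPU demand constant lives in backend_utils.py):

from dataclasses import dataclass, field
from typing import Dict, List

DEFAULT_TASK_CPU_DEMAND = 0.5  # assumed default, mirroring backend_utils


@dataclass
class FakeResources:
    accelerators: Dict[str, int] = field(default_factory=dict)


@dataclass
class FakeTask:
    resources: List[FakeResources]
    num_nodes: int = 1


def task_resources_str(task: FakeTask) -> str:
    """Mirrors the multi-resource branch of get_task_resources_str()."""
    accelerators = []
    for res in task.resources:
        accelerators.extend(f'{k}:{v}' for k, v in res.accelerators.items())
    body = (', '.join(sorted(set(accelerators)))
            if accelerators else f'CPU:{DEFAULT_TASK_CPU_DEMAND}')
    return f'{task.num_nodes}x [{body}]'


task = FakeTask([FakeResources({'K80': 1}), FakeResources({'V100': 1})],
                num_nodes=2)
print(task_resources_str(task))  # -> 2x [K80:1, V100:1]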

sky/backends/cloud_vm_ray_backend.py

+73 / -47
@@ -2199,7 +2199,7 @@ def provision_with_retries(
             config_dict = self._retry_zones(
                 to_provision,
                 num_nodes,
-                requested_resources=task.resources,
+                requested_resources=set(task.resources),
                 dryrun=dryrun,
                 stream_logs=stream_logs,
                 cluster_name=cluster_name,
@@ -2706,21 +2706,23 @@ def check_resources_fit_cluster(
         handle: CloudVmRayResourceHandle,
         task: task_lib.Task,
         check_ports: bool = False,
-    ):
+    ) -> resources_lib.Resources:
         """Check if resources requested by the task fit the cluster.
 
         The resources requested by the task should be smaller than the existing
         cluster.
+        If multiple resources are specified, this check passes when at least
+        one of the resources fits the cluster.
 
         Raises:
             exceptions.ResourcesMismatchError: If the resources in the task
                 does not match the existing cluster.
         """
-        assert len(task.resources) == 1, task.resources
 
         launched_resources = handle.launched_resources
-        task_resources = list(task.resources)[0]
         cluster_name = handle.cluster_name
+
+        # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
         record = global_user_state.get_cluster_from_name(cluster_name)
@@ -2739,40 +2741,55 @@ def check_resources_fit_cluster(
                     launched_resources)
             mismatch_str = ('To fix: use accelerators/number of nodes that can '
                             'be satisfied by the local cluster')
-        # Requested_resources <= actual_resources.
-        # Special handling for local cloud case, which assumes a cluster can
-        # be heterogeneous. Here, launched_resources is a list of custom
-        # accelerators per node, and Resources.less_demanding_than determines
-        # how many nodes satisfy task resource requirements.
-        if not (task.num_nodes <= handle.launched_nodes and
-                task_resources.less_demanding_than(
-                    launched_resources,
-                    requested_num_nodes=task.num_nodes,
-                    check_ports=check_ports)):
-            if (task_resources.region is not None and
-                    task_resources.region != launched_resources.region):
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesMismatchError(
-                        'Task requested resources in region '
-                        f'{task_resources.region!r}, but the existing cluster '
-                        f'is in region {launched_resources.region!r}.')
-            if (task_resources.zone is not None and
-                    task_resources.zone != launched_resources.zone):
-                zone_str = (f'is in zone {launched_resources.zone!r}.'
-                            if launched_resources.zone is not None else
-                            'does not have zone specified.')
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesMismatchError(
-                        'Task requested resources in zone '
-                        f'{task_resources.zone!r}, but the existing cluster '
-                        f'{zone_str}')
+
+        valid_resource = None
+        requested_resource_list = []
+        for resource in task.resources:
+            if (task.num_nodes <= handle.launched_nodes and
+                    resource.less_demanding_than(
+                        launched_resources,
+                        requested_num_nodes=task.num_nodes,
+                        check_ports=check_ports)):
+                valid_resource = resource
+                break
+            else:
+                requested_resource_list.append(f'{task.num_nodes}x {resource}')
+
+        if valid_resource is None:
+            for example_resource in task.resources:
+                if (example_resource.region is not None and
+                        example_resource.region != launched_resources.region):
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.ResourcesMismatchError(
+                            f'Task requested resources {example_resource} in region '  # pylint: disable=line-too-long
+                            f'{example_resource.region!r}'
+                            ', but the existing cluster '
+                            f'is in region {launched_resources.region!r}.')
+                if (example_resource.zone is not None and
+                        example_resource.zone != launched_resources.zone):
+                    zone_str = (f'is in zone {launched_resources.zone!r}.'
+                                if launched_resources.zone is not None else
+                                'does not have zone specified.')
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.ResourcesMismatchError(
+                            f'Task requested resources {example_resource} in zone '  # pylint: disable=line-too-long
+                            f'{example_resource.zone!r},'
+                            ' but the existing cluster '
+                            f'{zone_str}')
+            requested_resource_str = ', '.join(requested_resource_list)
+            if isinstance(task.resources, list):
+                requested_resource_str = f'[{requested_resource_str}]'
+            elif isinstance(task.resources, set):
+                requested_resource_str = f'{{{requested_resource_str}}}'
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.ResourcesMismatchError(
-                    'Requested resources do not match the existing cluster.\n'
-                    f'  Requested:\t{task.num_nodes}x {task_resources} \n'
+                    'Requested resources do not match the existing '
+                    'cluster.\n'
+                    f'  Requested:\t{requested_resource_str}\n'
                     f'  Existing:\t{handle.launched_nodes}x '
                     f'{handle.launched_resources}\n'
                     f'{mismatch_str}')
+        return valid_resource
 
     def _provision(
             self,
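The fit check now walks the candidates in order and returns the first one that is no more demanding than what the cluster already runs; only when nothing fits does it raise a mismatch error. A simplified standalone sketch of that selection rule (a hypothetical `fits` predicate stands in for `Resources.less_demanding_than`):

from typing import Callable, Iterable, Optional, TypeVar

R = TypeVar('R')


def pick_valid_resource(candidates: Iterable[R],
                        fits: Callable[[R], bool]) -> Optional[R]:
    """Return the first candidate that fits the existing cluster, else None."""
    rejected = []
    for resource in candidates:
        if fits(resource):
            return resource
        rejected.append(resource)
    # The real backend raises ResourcesMismatchError listing `rejected` here.
    return None


# Example: the cluster has 4 V100s; the A100 request is rejected, V100 fits.
cluster_gpus = {'V100': 4}
candidates = [{'A100': 1}, {'V100': 1}]
chosen = pick_valid_resource(
    candidates,
    fits=lambda acc: all(cluster_gpus.get(g, 0) >= n for g, n in acc.items()))
print(chosen)  # -> {'V100': 1}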
@@ -3092,7 +3109,7 @@ def _update_after_cluster_provisioned(
         global_user_state.add_or_update_cluster(
             handle.cluster_name,
             handle,
-            task.resources,
+            set(task.resources),
             ready=True,
         )
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3499,23 +3516,31 @@ def _execute(
         # Check the task resources vs the cluster resources. Since `sky exec`
         # will not run the provision and _check_existing_cluster
         # We need to check ports here since sky.exec shouldn't change resources
-        self.check_resources_fit_cluster(handle, task, check_ports=True)
-
-        resources_str = backend_utils.get_task_resources_str(task)
+        valid_resource = self.check_resources_fit_cluster(handle,
+                                                          task,
+                                                          check_ports=True)
+        task_copy = copy.copy(task)
+        # Handle multiple resources exec case.
+        task_copy.set_resources(valid_resource)
+        if len(task.resources) > 1:
+            logger.info('Multiple resources are specified '
+                        f'for the task, using: {valid_resource}')
+        task_copy.best_resources = None
+        resources_str = backend_utils.get_task_resources_str(task_copy)
 
         if dryrun:
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task.name, resources_str)
+        job_id = self._add_job(handle, task_copy.name, resources_str)
 
         is_tpu_vm_pod = tpu_utils.is_tpu_vm_pod(handle.launched_resources)
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
-        if task.num_nodes > 1 or is_tpu_vm_pod:
-            self._execute_task_n_nodes(handle, task, job_id, detach_run)
+        if task_copy.num_nodes > 1 or is_tpu_vm_pod:
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task, job_id, detach_run)
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run)
 
         return job_id
 
@@ -4343,7 +4368,9 @@ def _check_existing_cluster(
             self.check_resources_fit_cluster(handle, task)
             # Use the existing cluster.
             assert handle.launched_resources is not None, (cluster_name, handle)
-            assert len(task.resources) == 1
+            # Assume resources share the same ports.
+            for resource in task.resources:
+                assert resource.ports == list(task.resources)[0].ports
             all_ports = resources_utils.port_set_to_ranges(
                 resources_utils.port_ranges_to_set(
                     handle.launched_resources.ports) |
@@ -4359,13 +4386,12 @@ def _check_existing_cluster(
                 prev_cluster_status=prev_cluster_status,
                 prev_handle=handle)
         usage_lib.messages.usage.set_new_cluster()
-        assert len(task.resources) == 1, task.resources
         # Use the task_cloud, because the cloud in `to_provision` can be changed
         # later during the retry.
-        resources = list(task.resources)[0]
-        task_cloud = (resources.cloud
-                      if resources.cloud is not None else clouds.Cloud)
-        task_cloud.check_cluster_name_is_valid(cluster_name)
+        for resources in task.resources:
+            task_cloud = (resources.cloud
+                          if resources.cloud is not None else clouds.Cloud)
+            task_cloud.check_cluster_name_is_valid(cluster_name)
 
         if to_provision is None:
             # The cluster is recently terminated either by autostop or manually
