@@ -2199,7 +2199,7 @@ def provision_with_retries(
                 config_dict = self._retry_zones(
                     to_provision,
                     num_nodes,
-                    requested_resources=task.resources,
+                    requested_resources=set(task.resources),
                     dryrun=dryrun,
                     stream_logs=stream_logs,
                     cluster_name=cluster_name,
@@ -2706,21 +2706,23 @@ def check_resources_fit_cluster(
         handle: CloudVmRayResourceHandle,
         task: task_lib.Task,
         check_ports: bool = False,
-    ):
+    ) -> resources_lib.Resources:
         """Check if resources requested by the task fit the cluster.

         The resources requested by the task should be smaller than the existing
         cluster.
+        If multiple resources are specified, this checking will pass when
+        at least one resource fits the cluster.

         Raises:
             exceptions.ResourcesMismatchError: If the resources in the task
                 does not match the existing cluster.
         """
-        assert len(task.resources) == 1, task.resources

         launched_resources = handle.launched_resources
-        task_resources = list(task.resources)[0]
         cluster_name = handle.cluster_name
+
+        # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
         record = global_user_state.get_cluster_from_name(cluster_name)
@@ -2739,40 +2741,55 @@ def check_resources_fit_cluster(
                 launched_resources)
             mismatch_str = ('To fix: use accelerators/number of nodes that can '
                             'be satisfied by the local cluster')
-        # Requested_resources <= actual_resources.
-        # Special handling for local cloud case, which assumes a cluster can
-        # be heterogeneous. Here, launched_resources is a list of custom
-        # accelerators per node, and Resources.less_demanding_than determines
-        # how many nodes satisfy task resource requirements.
-        if not (task.num_nodes <= handle.launched_nodes and
-                task_resources.less_demanding_than(
-                    launched_resources,
-                    requested_num_nodes=task.num_nodes,
-                    check_ports=check_ports)):
-            if (task_resources.region is not None and
-                    task_resources.region != launched_resources.region):
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesMismatchError(
-                        'Task requested resources in region '
-                        f'{task_resources.region!r}, but the existing cluster '
-                        f'is in region {launched_resources.region!r}.')
-            if (task_resources.zone is not None and
-                    task_resources.zone != launched_resources.zone):
-                zone_str = (f'is in zone {launched_resources.zone!r}.'
-                            if launched_resources.zone is not None else
-                            'does not have zone specified.')
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesMismatchError(
-                        'Task requested resources in zone '
-                        f'{task_resources.zone!r}, but the existing cluster '
-                        f'{zone_str}')
+
+        valid_resource = None
+        requested_resource_list = []
+        for resource in task.resources:
+            if (task.num_nodes <= handle.launched_nodes and
+                    resource.less_demanding_than(
+                        launched_resources,
+                        requested_num_nodes=task.num_nodes,
+                        check_ports=check_ports)):
+                valid_resource = resource
+                break
+            else:
+                requested_resource_list.append(f'{task.num_nodes}x {resource}')
+
+        if valid_resource is None:
+            for example_resource in task.resources:
+                if (example_resource.region is not None and
+                        example_resource.region != launched_resources.region):
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.ResourcesMismatchError(
+                            f'Task requested resources {example_resource} in region '  # pylint: disable=line-too-long
+                            f'{example_resource.region!r}'
+                            ', but the existing cluster '
+                            f'is in region {launched_resources.region!r}.')
+                if (example_resource.zone is not None and
+                        example_resource.zone != launched_resources.zone):
+                    zone_str = (f'is in zone {launched_resources.zone!r}.'
+                                if launched_resources.zone is not None else
+                                'does not have zone specified.')
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.ResourcesMismatchError(
+                            f'Task requested resources {example_resource} in zone '  # pylint: disable=line-too-long
+                            f'{example_resource.zone!r},'
+                            'but the existing cluster '
+                            f'{zone_str}')
+            requested_resource_str = ', '.join(requested_resource_list)
+            if isinstance(task.resources, list):
+                requested_resource_str = f'[{requested_resource_str}]'
+            elif isinstance(task.resources, set):
+                requested_resource_str = f'{{{requested_resource_str}}}'
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.ResourcesMismatchError(
-                    'Requested resources do not match the existing cluster.\n'
-                    f'  Requested:\t{task.num_nodes}x {task_resources}\n'
+                    'Requested resources do not match the existing '
+                    'cluster.\n'
+                    f'  Requested:\t{requested_resource_str}\n'
                     f'  Existing:\t{handle.launched_nodes}x '
                     f'{handle.launched_resources}\n'
                     f'{mismatch_str}')
+        return valid_resource

     def _provision(
         self,
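The hunk above replaces the single-resource assertion with a first-fit scan: each candidate in `task.resources` is checked against the launched cluster, the first one that fits is returned, and `ResourcesMismatchError` is raised only when every candidate is rejected. Below is a minimal, self-contained sketch of that selection logic; `Candidate` and `fits` are hypothetical stand-ins for SkyPilot's `Resources` and `Resources.less_demanding_than`, not the real classes.

```python
from dataclasses import dataclass
from typing import Iterable, Optional


@dataclass(frozen=True)
class Candidate:
    """Hypothetical stand-in for a Resources spec: CPUs plus an optional GPU."""
    cpus: int
    accelerator: Optional[str] = None


def fits(requested: Candidate, launched: Candidate) -> bool:
    # Stand-in for Resources.less_demanding_than(): the request must not ask
    # for more than what the cluster was launched with.
    if requested.cpus > launched.cpus:
        return False
    if (requested.accelerator is not None and
            requested.accelerator != launched.accelerator):
        return False
    return True


def pick_first_fit(candidates: Iterable[Candidate],
                   launched: Candidate) -> Optional[Candidate]:
    # Mirrors the new check: scan candidates in order and keep the first one
    # that fits. None means nothing fits, and the caller raises an error that
    # lists every rejected candidate.
    rejected = []
    for candidate in candidates:
        if fits(candidate, launched):
            return candidate
        rejected.append(candidate)
    print(f'No candidate fits; rejected: {rejected}')
    return None


if __name__ == '__main__':
    launched = Candidate(cpus=8, accelerator='V100')
    requested = [Candidate(cpus=16), Candidate(cpus=4, accelerator='V100')]
    # The 16-CPU candidate is rejected; the 4-CPU V100 candidate is chosen.
    print(pick_first_fit(requested, launched))
```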
@@ -3092,7 +3109,7 @@ def _update_after_cluster_provisioned(
         global_user_state.add_or_update_cluster(
             handle.cluster_name,
             handle,
-            task.resources,
+            set(task.resources),
             ready=True,
         )
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3499,23 +3516,31 @@ def _execute(
         # Check the task resources vs the cluster resources. Since `sky exec`
         # will not run the provision and _check_existing_cluster
         # We need to check ports here since sky.exec shouldn't change resources
-        self.check_resources_fit_cluster(handle, task, check_ports=True)
-
-        resources_str = backend_utils.get_task_resources_str(task)
+        valid_resource = self.check_resources_fit_cluster(handle,
+                                                          task,
+                                                          check_ports=True)
+        task_copy = copy.copy(task)
+        # Handle multiple resources exec case.
+        task_copy.set_resources(valid_resource)
+        if len(task.resources) > 1:
+            logger.info('Multiple resources are specified'
+                        f'for the task, using: {valid_resource}')
+        task_copy.best_resources = None
+        resources_str = backend_utils.get_task_resources_str(task_copy)

         if dryrun:
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None

-        job_id = self._add_job(handle, task.name, resources_str)
+        job_id = self._add_job(handle, task_copy.name, resources_str)

         is_tpu_vm_pod = tpu_utils.is_tpu_vm_pod(handle.launched_resources)
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
-        if task.num_nodes > 1 or is_tpu_vm_pod:
-            self._execute_task_n_nodes(handle, task, job_id, detach_run)
+        if task_copy.num_nodes > 1 or is_tpu_vm_pod:
+            self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task, job_id, detach_run)
+            self._execute_task_one_node(handle, task_copy, job_id, detach_run)

         return job_id

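The `_execute` change above runs a job on an existing cluster by first asking `check_resources_fit_cluster` which candidate fits, then executing a shallow copy of the task pinned to that single resource, so the caller's multi-resource task is left untouched. A small sketch of that copy-and-pin pattern, using a hypothetical `Task` stand-in rather than the real `sky.Task`:

```python
import copy
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Task:
    """Hypothetical stand-in for a task object, only the fields used here."""
    name: str
    resources: List[str] = field(default_factory=list)
    best_resources: Optional[str] = None

    def set_resources(self, resources) -> None:
        # Accept a single candidate or an iterable of candidates.
        if isinstance(resources, str):
            resources = [resources]
        self.resources = list(resources)


def pin_to_valid_resource(task: Task, valid_resource: str) -> Task:
    # Shallow-copy so the caller's task keeps its full candidate list, then
    # narrow the copy to the one resource that fits the existing cluster.
    task_copy = copy.copy(task)
    task_copy.set_resources(valid_resource)
    # A previously optimized choice may point at a different candidate, so it
    # is cleared on the copy (mirroring `task_copy.best_resources = None`).
    task_copy.best_resources = None
    return task_copy


if __name__ == '__main__':
    task = Task(name='train',
                resources=['A100:1', 'V100:1'],
                best_resources='A100:1')
    pinned = pin_to_valid_resource(task, 'V100:1')
    print(pinned.resources)  # ['V100:1']
    print(task.resources)    # ['A100:1', 'V100:1'] -- the original is untouched
```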
@@ -4343,7 +4368,9 @@ def _check_existing_cluster(
             self.check_resources_fit_cluster(handle, task)
             # Use the existing cluster.
             assert handle.launched_resources is not None, (cluster_name, handle)
-            assert len(task.resources) == 1
+            # Assume resources share the same ports.
+            for resource in task.resources:
+                assert resource.ports == list(task.resources)[0].ports
             all_ports = resources_utils.port_set_to_ranges(
                 resources_utils.port_ranges_to_set(
                     handle.launched_resources.ports) |
@@ -4359,13 +4386,12 @@ def _check_existing_cluster(
                 prev_cluster_status=prev_cluster_status,
                 prev_handle=handle)
         usage_lib.messages.usage.set_new_cluster()
-        assert len(task.resources) == 1, task.resources
         # Use the task_cloud, because the cloud in `to_provision` can be changed
         # later during the retry.
-        resources = list(task.resources)[0]
-        task_cloud = (resources.cloud
-                      if resources.cloud is not None else clouds.Cloud)
-        task_cloud.check_cluster_name_is_valid(cluster_name)
+        for resources in task.resources:
+            task_cloud = (resources.cloud
+                          if resources.cloud is not None else clouds.Cloud)
+            task_cloud.check_cluster_name_is_valid(cluster_name)

         if to_provision is None:
             # The cluster is recently terminated either by autostop or manually
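The final hunk drops the single-resource assumption when naming a new cluster: instead of validating the cluster name against one cloud, it loops over every candidate's cloud, falling back to the generic `clouds.Cloud` when a candidate has no cloud set. A self-contained sketch of that per-candidate validation; the naming rules below are invented for illustration and do not reflect any real cloud's limits.

```python
import re
from typing import Iterable, Optional


def check_cluster_name_is_valid(cluster_name: str, cloud: Optional[str]) -> None:
    # Hypothetical per-cloud naming rules, purely for illustration.
    if not cluster_name:
        raise ValueError('Cluster name must be non-empty.')
    if cloud == 'gcp' and not re.fullmatch(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                                           cluster_name):
        raise ValueError(f'{cluster_name!r} is not a valid GCP-style name.')
    if cloud == 'aws' and len(cluster_name) > 63:
        raise ValueError(f'{cluster_name!r} is too long for an AWS-style name.')


def validate_for_all_candidates(cluster_name: str,
                                candidate_clouds: Iterable[Optional[str]]) -> None:
    # Mirrors the new loop: any candidate cloud can reject the name before
    # provisioning starts, since the retry logic may later pick any of them.
    for cloud in candidate_clouds:
        check_cluster_name_is_valid(cluster_name, cloud)


if __name__ == '__main__':
    validate_for_all_candidates('my-cluster', ['aws', 'gcp', None])  # passes
    try:
        validate_for_all_candidates('My_Cluster', ['aws', 'gcp'])
    except ValueError as e:
        print(f'Rejected as expected: {e}')
```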