From deb33fd9f072f055183b595ed86a39926b6d3ac8 Mon Sep 17 00:00:00 2001
From: A Vertex SDK engineer
Date: Fri, 6 Dec 2024 15:34:24 -0800
Subject: [PATCH] feat: Add multihost_gpu_node_count to Vertex SDK.

PiperOrigin-RevId: 703635901
---
 google/cloud/aiplatform/models.py    | 42 ++++++++++++++++++++++
 tests/unit/aiplatform/test_models.py | 52 ++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/google/cloud/aiplatform/models.py b/google/cloud/aiplatform/models.py
index 8b13f95502..ed068fd428 100644
--- a/google/cloud/aiplatform/models.py
+++ b/google/cloud/aiplatform/models.py
@@ -252,6 +252,7 @@ def create(
         reservation_affinity_values: Optional[List[str]] = None,
         spot: bool = False,
         required_replica_count: Optional[int] = 0,
+        multihost_gpu_node_count: Optional[int] = None,
     ) -> "DeploymentResourcePool":
         """Creates a new DeploymentResourcePool.
@@ -332,6 +333,9 @@ def create(
                 set, the model deploy/mutate operation will succeed once
                 available_replica_count reaches required_replica_count, and the rest
                 of the replicas will be retried.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.

         Returns:
             DeploymentResourcePool
@@ -363,6 +367,7 @@ def create(
             sync=sync,
             create_request_timeout=create_request_timeout,
             required_replica_count=required_replica_count,
+            multihost_gpu_node_count=multihost_gpu_node_count,
         )

     @classmethod
@@ -389,6 +394,7 @@ def _create(
         sync=True,
         create_request_timeout: Optional[float] = None,
         required_replica_count: Optional[int] = 0,
+        multihost_gpu_node_count: Optional[int] = None,
     ) -> "DeploymentResourcePool":
         """Creates a new DeploymentResourcePool.
@@ -472,6 +478,9 @@ def _create(
                 set, the model deploy/mutate operation will succeed once
                 available_replica_count reaches required_replica_count, and the rest
                 of the replicas will be retried.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.

         Returns:
             DeploymentResourcePool
@@ -505,6 +514,7 @@ def _create(
                 [autoscaling_metric_spec]
             )

+        # TODO(joelletiangco): accelerator_type present here
         if accelerator_type and accelerator_count:
             utils.validate_accelerator_type(accelerator_type)
             machine_spec.accelerator_type = accelerator_type
@@ -1327,6 +1337,7 @@ def deploy(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         service_account: Optional[str] = None,
         explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
         explanation_parameters: Optional[
@@ -1399,6 +1410,9 @@ def deploy(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             service_account (str):
                 The service account that the DeployedModel's container runs as. Specify
                 the email address of the service account. If this service account is not
@@ -1500,6 +1514,7 @@ def deploy(
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
             tpu_topology=tpu_topology,
+            multihost_gpu_node_count=multihost_gpu_node_count,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
             reservation_affinity_values=reservation_affinity_values,
@@ -1532,6 +1547,7 @@ def _deploy(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
         reservation_affinity_values: Optional[List[str]] = None,
@@ -1601,6 +1617,9 @@ def _deploy(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             reservation_affinity_type (str):
                 Optional. The type of reservation affinity. One of NO_RESERVATION,
                 ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1686,6 +1705,7 @@ def _deploy(
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
             tpu_topology=tpu_topology,
+            multihost_gpu_node_count=multihost_gpu_node_count,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
             reservation_affinity_values=reservation_affinity_values,
@@ -1725,6 +1745,7 @@ def _deploy_call(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
         reservation_affinity_values: Optional[List[str]] = None,
@@ -1803,6 +1824,9 @@ def _deploy_call(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             reservation_affinity_type (str):
                 Optional. The type of reservation affinity. One of NO_RESERVATION,
                 ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -2030,6 +2054,9 @@ def _deploy_call(
         if tpu_topology is not None:
             machine_spec.tpu_topology = tpu_topology

+        if multihost_gpu_node_count is not None:
+            machine_spec.multihost_gpu_node_count = multihost_gpu_node_count
+
         dedicated_resources.machine_spec = machine_spec
         deployed_model.dedicated_resources = dedicated_resources
         if fast_tryout_enabled:
@@ -4012,6 +4039,7 @@ def deploy(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         service_account: Optional[str] = None,
         explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
         explanation_parameters: Optional[
@@ -4089,6 +4117,9 @@ def deploy(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             service_account (str):
                 The service account that the DeployedModel's container runs as. Specify
                 the email address of the service account. If this service account is not
@@ -4190,6 +4221,7 @@ def deploy(
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
             tpu_topology=tpu_topology,
+            multihost_gpu_node_count=multihost_gpu_node_count,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
             reservation_affinity_values=reservation_affinity_values,
@@ -5241,6 +5273,7 @@ def deploy(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         service_account: Optional[str] = None,
         explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
         explanation_parameters: Optional[
@@ -5318,6 +5351,9 @@ def deploy(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             service_account (str):
                 The service account that the DeployedModel's container runs as. Specify
                 the email address of the service account. If this service account is not
@@ -5462,6 +5498,7 @@ def deploy(
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
             tpu_topology=tpu_topology,
+            multihost_gpu_node_count=multihost_gpu_node_count,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
             reservation_affinity_values=reservation_affinity_values,
@@ -5505,6 +5542,7 @@ def _deploy(
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
         tpu_topology: Optional[str] = None,
+        multihost_gpu_node_count: Optional[int] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
         reservation_affinity_values: Optional[List[str]] = None,
@@ -5579,6 +5617,9 @@ def _deploy(
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
+            multihost_gpu_node_count (int):
+                Optional. The number of nodes per replica for multihost GPU DeployedModel.
+                Required for multihost GPU deployments.
             reservation_affinity_type (str):
                 Optional. The type of reservation affinity. One of NO_RESERVATION,
                 ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -5713,6 +5754,7 @@ def _deploy(
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
             tpu_topology=tpu_topology,
+            multihost_gpu_node_count=multihost_gpu_node_count,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
             reservation_affinity_values=reservation_affinity_values,
diff --git a/tests/unit/aiplatform/test_models.py b/tests/unit/aiplatform/test_models.py
index 2a78321a3d..66e3f3c014 100644
--- a/tests/unit/aiplatform/test_models.py
+++ b/tests/unit/aiplatform/test_models.py
@@ -146,6 +146,11 @@
 _TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
 _TEST_TPU_TOPOLOGY = "2x2"

+_TEST_GPU_MACHINE_TYPE = "a3-highgpu-8g"
+_TEST_GPU_ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
+_TEST_GPU_ACCELERATOR_COUNT = 8
+_TEST_MULTIHOST_GPU_NODE_COUNT = 2
+
 _TEST_BATCH_SIZE = 16

 _TEST_PIPELINE_RESOURCE_NAME = (
@@ -2239,6 +2244,53 @@ def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
             timeout=None,
         )

+    @pytest.mark.usefixtures(
+        "get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
+    )
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_deploy_no_endpoint_with_multihost_gpu_node_count(
+        self, deploy_model_mock, sync
+    ):
+        test_model = models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+        test_endpoint = test_model.deploy(
+            machine_type=_TEST_GPU_MACHINE_TYPE,
+            accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
+            multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_machine_spec = gca_machine_resources.MachineSpec(
+            machine_type=_TEST_GPU_MACHINE_TYPE,
+            accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
+            multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
+        )
+        expected_dedicated_resources = gca_machine_resources.DedicatedResources(
+            machine_spec=expected_machine_spec,
+            min_replica_count=1,
+            max_replica_count=1,
+            spot=False,
+        )
+        expected_deployed_model = gca_endpoint.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+        )
+        deploy_model_mock.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
     @pytest.mark.usefixtures(
         "get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
     )
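
Usage sketch (illustrative, not part of the patch): with this change, a caller can
request a multihost GPU deployment by passing multihost_gpu_node_count to
Model.deploy. The project, location, and model ID below are placeholders; the
machine shape and counts mirror the unit-test constants above.

    from google.cloud import aiplatform

    aiplatform.init(project="my-project", location="us-central1")  # placeholders

    model = aiplatform.Model("my-model-id")  # placeholder model resource
    endpoint = model.deploy(
        machine_type="a3-highgpu-8g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=8,         # GPUs per node
        multihost_gpu_node_count=2,  # nodes per replica, i.e. 16 GPUs per replica
    )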
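
The patch threads the same flag through DeploymentResourcePool.create, which
shares the MachineSpec plumbing with deploy. A minimal sketch, assuming a
placeholder pool ID:

    pool = aiplatform.DeploymentResourcePool.create(
        deployment_resource_pool_id="my-pool",  # placeholder
        machine_type="a3-highgpu-8g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=8,
        multihost_gpu_node_count=2,
    )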
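
Internally, _deploy_call sets the field on MachineSpec only when a value is
provided, so existing single-host deployments are untouched. The resulting
message matches expected_machine_spec in the new unit test; the compat import
path below follows the test module's convention and is an assumption here.

    from google.cloud.aiplatform.compat.types import (
        machine_resources as gca_machine_resources,
    )

    machine_spec = gca_machine_resources.MachineSpec(
        machine_type="a3-highgpu-8g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=8,
        multihost_gpu_node_count=2,
    )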