Skip to content

Commit f4809bd

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add multihost_gpu_node_count to Vertex SDK.
PiperOrigin-RevId: 703635901
1 parent 4620e6f commit f4809bd

File tree

2 files changed

+84
-0
lines changed

2 files changed

+84
-0
lines changed

Diff for: google/cloud/aiplatform/models.py

+32
Original file line numberDiff line numberDiff line change
@@ -1327,6 +1327,7 @@ def deploy(
13271327
accelerator_type: Optional[str] = None,
13281328
accelerator_count: Optional[int] = None,
13291329
tpu_topology: Optional[str] = None,
1330+
multihost_gpu_node_count: Optional[int] = None,
13301331
service_account: Optional[str] = None,
13311332
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
13321333
explanation_parameters: Optional[
@@ -1399,6 +1400,9 @@ def deploy(
13991400
tpu_topology (str):
14001401
Optional. The TPU topology to use for the DeployedModel.
14011402
Required for CloudTPU multihost deployments.
1403+
multihost_gpu_node_count (int):
1404+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1405+
Required for multihost GPU deployments.
14021406
service_account (str):
14031407
The service account that the DeployedModel's container runs as. Specify the
14041408
email address of the service account. If this service account is not
@@ -1500,6 +1504,7 @@ def deploy(
15001504
accelerator_type=accelerator_type,
15011505
accelerator_count=accelerator_count,
15021506
tpu_topology=tpu_topology,
1507+
multihost_gpu_node_count=multihost_gpu_node_count,
15031508
reservation_affinity_type=reservation_affinity_type,
15041509
reservation_affinity_key=reservation_affinity_key,
15051510
reservation_affinity_values=reservation_affinity_values,
@@ -1532,6 +1537,7 @@ def _deploy(
15321537
accelerator_type: Optional[str] = None,
15331538
accelerator_count: Optional[int] = None,
15341539
tpu_topology: Optional[str] = None,
1540+
multihost_gpu_node_count: Optional[int] = None,
15351541
reservation_affinity_type: Optional[str] = None,
15361542
reservation_affinity_key: Optional[str] = None,
15371543
reservation_affinity_values: Optional[List[str]] = None,
@@ -1601,6 +1607,9 @@ def _deploy(
16011607
tpu_topology (str):
16021608
Optional. The TPU topology to use for the DeployedModel.
16031609
Required for CloudTPU multihost deployments.
1610+
multihost_gpu_node_count (int):
1611+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1612+
Required for multihost GPU deployments.
16041613
reservation_affinity_type (str):
16051614
Optional. The type of reservation affinity.
16061615
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1686,6 +1695,7 @@ def _deploy(
16861695
accelerator_type=accelerator_type,
16871696
accelerator_count=accelerator_count,
16881697
tpu_topology=tpu_topology,
1698+
multihost_gpu_node_count=multihost_gpu_node_count,
16891699
reservation_affinity_type=reservation_affinity_type,
16901700
reservation_affinity_key=reservation_affinity_key,
16911701
reservation_affinity_values=reservation_affinity_values,
@@ -1725,6 +1735,7 @@ def _deploy_call(
17251735
accelerator_type: Optional[str] = None,
17261736
accelerator_count: Optional[int] = None,
17271737
tpu_topology: Optional[str] = None,
1738+
multihost_gpu_node_count: Optional[int] = None,
17281739
reservation_affinity_type: Optional[str] = None,
17291740
reservation_affinity_key: Optional[str] = None,
17301741
reservation_affinity_values: Optional[List[str]] = None,
@@ -1803,6 +1814,9 @@ def _deploy_call(
18031814
tpu_topology (str):
18041815
Optional. The TPU topology to use for the DeployedModel.
18051816
Required for CloudTPU multihost deployments.
1817+
multihost_gpu_node_count (int):
1818+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1819+
Required for multihost GPU deployments.
18061820
reservation_affinity_type (str):
18071821
Optional. The type of reservation affinity.
18081822
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -2030,6 +2044,9 @@ def _deploy_call(
20302044
if tpu_topology is not None:
20312045
machine_spec.tpu_topology = tpu_topology
20322046

2047+
if multihost_gpu_node_count is not None:
2048+
machine_spec.multihost_gpu_node_count = multihost_gpu_node_count
2049+
20332050
dedicated_resources.machine_spec = machine_spec
20342051
deployed_model.dedicated_resources = dedicated_resources
20352052
if fast_tryout_enabled:
@@ -4012,6 +4029,7 @@ def deploy(
40124029
accelerator_type: Optional[str] = None,
40134030
accelerator_count: Optional[int] = None,
40144031
tpu_topology: Optional[str] = None,
4032+
multihost_gpu_node_count: Optional[int] = None,
40154033
service_account: Optional[str] = None,
40164034
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
40174035
explanation_parameters: Optional[
@@ -4089,6 +4107,9 @@ def deploy(
40894107
tpu_topology (str):
40904108
Optional. The TPU topology to use for the DeployedModel.
40914109
Required for CloudTPU multihost deployments.
4110+
multihost_gpu_node_count (int):
4111+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
4112+
Required for multihost GPU deployments.
40924113
service_account (str):
40934114
The service account that the DeployedModel's container runs as. Specify the
40944115
email address of the service account. If this service account is not
@@ -4190,6 +4211,7 @@ def deploy(
41904211
accelerator_type=accelerator_type,
41914212
accelerator_count=accelerator_count,
41924213
tpu_topology=tpu_topology,
4214+
multihost_gpu_node_count=multihost_gpu_node_count,
41934215
reservation_affinity_type=reservation_affinity_type,
41944216
reservation_affinity_key=reservation_affinity_key,
41954217
reservation_affinity_values=reservation_affinity_values,
@@ -5241,6 +5263,7 @@ def deploy(
52415263
accelerator_type: Optional[str] = None,
52425264
accelerator_count: Optional[int] = None,
52435265
tpu_topology: Optional[str] = None,
5266+
multihost_gpu_node_count: Optional[int] = None,
52445267
service_account: Optional[str] = None,
52455268
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
52465269
explanation_parameters: Optional[
@@ -5318,6 +5341,9 @@ def deploy(
53185341
tpu_topology (str):
53195342
Optional. The TPU topology to use for the DeployedModel.
53205343
Required for CloudTPU multihost deployments.
5344+
multihost_gpu_node_count (int):
5345+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
5346+
Required for multihost GPU deployments.
53215347
service_account (str):
53225348
The service account that the DeployedModel's container runs as. Specify the
53235349
email address of the service account. If this service account is not
@@ -5462,6 +5488,7 @@ def deploy(
54625488
accelerator_type=accelerator_type,
54635489
accelerator_count=accelerator_count,
54645490
tpu_topology=tpu_topology,
5491+
multihost_gpu_node_count=multihost_gpu_node_count,
54655492
reservation_affinity_type=reservation_affinity_type,
54665493
reservation_affinity_key=reservation_affinity_key,
54675494
reservation_affinity_values=reservation_affinity_values,
@@ -5505,6 +5532,7 @@ def _deploy(
55055532
accelerator_type: Optional[str] = None,
55065533
accelerator_count: Optional[int] = None,
55075534
tpu_topology: Optional[str] = None,
5535+
multihost_gpu_node_count: Optional[int] = None,
55085536
reservation_affinity_type: Optional[str] = None,
55095537
reservation_affinity_key: Optional[str] = None,
55105538
reservation_affinity_values: Optional[List[str]] = None,
@@ -5579,6 +5607,9 @@ def _deploy(
55795607
tpu_topology (str):
55805608
Optional. The TPU topology to use for the DeployedModel.
55815609
Required for CloudTPU multihost deployments.
5610+
multihost_gpu_node_count (int):
5611+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
5612+
Required for multihost GPU deployments.
55825613
reservation_affinity_type (str):
55835614
Optional. The type of reservation affinity.
55845615
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -5713,6 +5744,7 @@ def _deploy(
57135744
accelerator_type=accelerator_type,
57145745
accelerator_count=accelerator_count,
57155746
tpu_topology=tpu_topology,
5747+
multihost_gpu_node_count=multihost_gpu_node_count,
57165748
reservation_affinity_type=reservation_affinity_type,
57175749
reservation_affinity_key=reservation_affinity_key,
57185750
reservation_affinity_values=reservation_affinity_values,

Diff for: tests/unit/aiplatform/test_models.py

+52
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@
146146
_TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
147147
_TEST_TPU_TOPOLOGY = "2x2"
148148

149+
_TEST_GPU_MACHINE_TYPE = "a3-highgpu-8g"
150+
_TEST_GPU_ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
151+
_TEST_GPU_ACCELERATOR_COUNT = 8
152+
_TEST_MULTIHOST_GPU_NODE_COUNT = 2
153+
149154
_TEST_BATCH_SIZE = 16
150155

151156
_TEST_PIPELINE_RESOURCE_NAME = (
@@ -2239,6 +2244,53 @@ def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
22392244
timeout=None,
22402245
)
22412246

2247+
@pytest.mark.usefixtures(
2248+
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
2249+
)
2250+
@pytest.mark.parametrize("sync", [True, False])
2251+
def test_deploy_no_endpoint_with_multihost_gpu_node_count(self, deploy_model_mock, sync):
2252+
test_model = models.Model(_TEST_ID)
2253+
test_model._gca_resource.supported_deployment_resources_types.append(
2254+
aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
2255+
)
2256+
test_endpoint = test_model.deploy(
2257+
machine_type=_TEST_GPU_MACHINE_TYPE,
2258+
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
2259+
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
2260+
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
2261+
sync=sync,
2262+
deploy_request_timeout=None,
2263+
)
2264+
2265+
if not sync:
2266+
test_endpoint.wait()
2267+
2268+
expected_machine_spec = gca_machine_resources.MachineSpec(
2269+
machine_type=_TEST_GPU_MACHINE_TYPE,
2270+
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
2271+
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
2272+
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
2273+
)
2274+
expected_dedicated_resources = gca_machine_resources.DedicatedResources(
2275+
machine_spec=expected_machine_spec,
2276+
min_replica_count=1,
2277+
max_replica_count=1,
2278+
spot=False,
2279+
)
2280+
expected_deployed_model = gca_endpoint.DeployedModel(
2281+
dedicated_resources=expected_dedicated_resources,
2282+
model=test_model.resource_name,
2283+
display_name=None,
2284+
)
2285+
deploy_model_mock.assert_called_once_with(
2286+
endpoint=test_endpoint.resource_name,
2287+
deployed_model=expected_deployed_model,
2288+
traffic_split={"0": 100},
2289+
metadata=(),
2290+
timeout=None,
2291+
)
2292+
2293+
22422294
@pytest.mark.usefixtures(
22432295
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
22442296
)

0 commit comments

Comments
 (0)