Skip to content

Commit 0d35a6d

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add multihost_gpu_node_count to Vertex SDK.
PiperOrigin-RevId: 703635901
1 parent 1b2457f commit 0d35a6d

File tree

2 files changed

+84
-0
lines changed

2 files changed

+84
-0
lines changed

Diff for: google/cloud/aiplatform/models.py

+32
Original file line numberDiff line numberDiff line change
@@ -1294,6 +1294,7 @@ def deploy(
12941294
accelerator_type: Optional[str] = None,
12951295
accelerator_count: Optional[int] = None,
12961296
tpu_topology: Optional[str] = None,
1297+
multihost_gpu_node_count: Optional[int] = None,
12971298
service_account: Optional[str] = None,
12981299
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
12991300
explanation_parameters: Optional[
@@ -1365,6 +1366,9 @@ def deploy(
13651366
tpu_topology (str):
13661367
Optional. The TPU topology to use for the DeployedModel.
13671368
Required for CloudTPU multihost deployments.
1369+
multihost_gpu_node_count (int):
1370+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1371+
Required for multihost GPU deployments.
13681372
service_account (str):
13691373
The service account that the DeployedModel's container runs as. Specify the
13701374
email address of the service account. If this service account is not
@@ -1457,6 +1461,7 @@ def deploy(
14571461
accelerator_type=accelerator_type,
14581462
accelerator_count=accelerator_count,
14591463
tpu_topology=tpu_topology,
1464+
multihost_gpu_node_count=multihost_gpu_node_count,
14601465
reservation_affinity_type=reservation_affinity_type,
14611466
reservation_affinity_key=reservation_affinity_key,
14621467
reservation_affinity_values=reservation_affinity_values,
@@ -1488,6 +1493,7 @@ def _deploy(
14881493
accelerator_type: Optional[str] = None,
14891494
accelerator_count: Optional[int] = None,
14901495
tpu_topology: Optional[str] = None,
1496+
multihost_gpu_node_count: Optional[int] = None,
14911497
reservation_affinity_type: Optional[str] = None,
14921498
reservation_affinity_key: Optional[str] = None,
14931499
reservation_affinity_values: Optional[List[str]] = None,
@@ -1556,6 +1562,9 @@ def _deploy(
15561562
tpu_topology (str):
15571563
Optional. The TPU topology to use for the DeployedModel.
15581564
Required for CloudTPU multihost deployments.
1565+
multihost_gpu_node_count (int):
1566+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1567+
Required for multihost GPU deployments.
15591568
reservation_affinity_type (str):
15601569
Optional. The type of reservation affinity.
15611570
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1633,6 +1642,7 @@ def _deploy(
16331642
accelerator_type=accelerator_type,
16341643
accelerator_count=accelerator_count,
16351644
tpu_topology=tpu_topology,
1645+
multihost_gpu_node_count=multihost_gpu_node_count,
16361646
reservation_affinity_type=reservation_affinity_type,
16371647
reservation_affinity_key=reservation_affinity_key,
16381648
reservation_affinity_values=reservation_affinity_values,
@@ -1671,6 +1681,7 @@ def _deploy_call(
16711681
accelerator_type: Optional[str] = None,
16721682
accelerator_count: Optional[int] = None,
16731683
tpu_topology: Optional[str] = None,
1684+
multihost_gpu_node_count: Optional[int] = None,
16741685
reservation_affinity_type: Optional[str] = None,
16751686
reservation_affinity_key: Optional[str] = None,
16761687
reservation_affinity_values: Optional[List[str]] = None,
@@ -1748,6 +1759,9 @@ def _deploy_call(
17481759
tpu_topology (str):
17491760
Optional. The TPU topology to use for the DeployedModel.
17501761
Required for CloudTPU multihost deployments.
1762+
multihost_gpu_node_count (int):
1763+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
1764+
Required for multihost GPU deployments.
17511765
reservation_affinity_type (str):
17521766
Optional. The type of reservation affinity.
17531767
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1966,6 +1980,9 @@ def _deploy_call(
19661980
if tpu_topology is not None:
19671981
machine_spec.tpu_topology = tpu_topology
19681982

1983+
if multihost_gpu_node_count is not None:
1984+
machine_spec.multihost_gpu_node_count = multihost_gpu_node_count
1985+
19691986
dedicated_resources.machine_spec = machine_spec
19701987
deployed_model.dedicated_resources = dedicated_resources
19711988
if fast_tryout_enabled:
@@ -3948,6 +3965,7 @@ def deploy(
39483965
accelerator_type: Optional[str] = None,
39493966
accelerator_count: Optional[int] = None,
39503967
tpu_topology: Optional[str] = None,
3968+
multihost_gpu_node_count: Optional[int] = None,
39513969
service_account: Optional[str] = None,
39523970
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
39533971
explanation_parameters: Optional[
@@ -4024,6 +4042,9 @@ def deploy(
40244042
tpu_topology (str):
40254043
Optional. The TPU topology to use for the DeployedModel.
40264044
Required for CloudTPU multihost deployments.
4045+
multihost_gpu_node_count (int):
4046+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
4047+
Required for multihost GPU deployments.
40274048
service_account (str):
40284049
The service account that the DeployedModel's container runs as. Specify the
40294050
email address of the service account. If this service account is not
@@ -4116,6 +4137,7 @@ def deploy(
41164137
accelerator_type=accelerator_type,
41174138
accelerator_count=accelerator_count,
41184139
tpu_topology=tpu_topology,
4140+
multihost_gpu_node_count=multihost_gpu_node_count,
41194141
reservation_affinity_type=reservation_affinity_type,
41204142
reservation_affinity_key=reservation_affinity_key,
41214143
reservation_affinity_values=reservation_affinity_values,
@@ -5166,6 +5188,7 @@ def deploy(
51665188
accelerator_type: Optional[str] = None,
51675189
accelerator_count: Optional[int] = None,
51685190
tpu_topology: Optional[str] = None,
5191+
multihost_gpu_node_count: Optional[int] = None,
51695192
service_account: Optional[str] = None,
51705193
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
51715194
explanation_parameters: Optional[
@@ -5242,6 +5265,9 @@ def deploy(
52425265
tpu_topology (str):
52435266
Optional. The TPU topology to use for the DeployedModel.
52445267
Required for CloudTPU multihost deployments.
5268+
multihost_gpu_node_count (int):
5269+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
5270+
Required for multihost GPU deployments.
52455271
service_account (str):
52465272
The service account that the DeployedModel's container runs as. Specify the
52475273
email address of the service account. If this service account is not
@@ -5377,6 +5403,7 @@ def deploy(
53775403
accelerator_type=accelerator_type,
53785404
accelerator_count=accelerator_count,
53795405
tpu_topology=tpu_topology,
5406+
multihost_gpu_node_count=multihost_gpu_node_count,
53805407
reservation_affinity_type=reservation_affinity_type,
53815408
reservation_affinity_key=reservation_affinity_key,
53825409
reservation_affinity_values=reservation_affinity_values,
@@ -5419,6 +5446,7 @@ def _deploy(
54195446
accelerator_type: Optional[str] = None,
54205447
accelerator_count: Optional[int] = None,
54215448
tpu_topology: Optional[str] = None,
5449+
multihost_gpu_node_count: Optional[int] = None,
54225450
reservation_affinity_type: Optional[str] = None,
54235451
reservation_affinity_key: Optional[str] = None,
54245452
reservation_affinity_values: Optional[List[str]] = None,
@@ -5492,6 +5520,9 @@ def _deploy(
54925520
tpu_topology (str):
54935521
Optional. The TPU topology to use for the DeployedModel.
54945522
Required for CloudTPU multihost deployments.
5523+
multihost_gpu_node_count (int):
5524+
Optional. The number of nodes per replica for multihost GPU DeployedModel.
5525+
Required for multihost GPU deployments.
54955526
reservation_affinity_type (str):
54965527
Optional. The type of reservation affinity.
54975528
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -5618,6 +5649,7 @@ def _deploy(
56185649
accelerator_type=accelerator_type,
56195650
accelerator_count=accelerator_count,
56205651
tpu_topology=tpu_topology,
5652+
multihost_gpu_node_count=multihost_gpu_node_count,
56215653
reservation_affinity_type=reservation_affinity_type,
56225654
reservation_affinity_key=reservation_affinity_key,
56235655
reservation_affinity_values=reservation_affinity_values,

Diff for: tests/unit/aiplatform/test_models.py

+52
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,11 @@
145145
_TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
146146
_TEST_TPU_TOPOLOGY = "2x2"
147147

148+
_TEST_GPU_MACHINE_TYPE = "a3-highgpu-8g"
149+
_TEST_GPU_ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
150+
_TEST_GPU_ACCELERATOR_COUNT = 8
151+
_TEST_MULTIHOST_GPU_NODE_COUNT = 2
152+
148153
_TEST_BATCH_SIZE = 16
149154

150155
_TEST_PIPELINE_RESOURCE_NAME = (
@@ -2234,6 +2239,53 @@ def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
22342239
timeout=None,
22352240
)
22362241

2242+
@pytest.mark.usefixtures(
2243+
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
2244+
)
2245+
@pytest.mark.parametrize("sync", [True, False])
2246+
def test_deploy_no_endpoint_with_multihost_gpu_node_count(self, deploy_model_mock, sync):
2247+
test_model = models.Model(_TEST_ID)
2248+
test_model._gca_resource.supported_deployment_resources_types.append(
2249+
aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
2250+
)
2251+
test_endpoint = test_model.deploy(
2252+
machine_type=_TEST_GPU_MACHINE_TYPE,
2253+
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
2254+
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
2255+
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
2256+
sync=sync,
2257+
deploy_request_timeout=None,
2258+
)
2259+
2260+
if not sync:
2261+
test_endpoint.wait()
2262+
2263+
expected_machine_spec = gca_machine_resources.MachineSpec(
2264+
machine_type=_TEST_GPU_MACHINE_TYPE,
2265+
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
2266+
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
2267+
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
2268+
)
2269+
expected_dedicated_resources = gca_machine_resources.DedicatedResources(
2270+
machine_spec=expected_machine_spec,
2271+
min_replica_count=1,
2272+
max_replica_count=1,
2273+
spot=False,
2274+
)
2275+
expected_deployed_model = gca_endpoint.DeployedModel(
2276+
dedicated_resources=expected_dedicated_resources,
2277+
model=test_model.resource_name,
2278+
display_name=None,
2279+
)
2280+
deploy_model_mock.assert_called_once_with(
2281+
endpoint=test_endpoint.resource_name,
2282+
deployed_model=expected_deployed_model,
2283+
traffic_split={"0": 100},
2284+
metadata=(),
2285+
timeout=None,
2286+
)
2287+
2288+
22372289
@pytest.mark.usefixtures(
22382290
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
22392291
)

0 commit comments

Comments
 (0)