Skip to content

Commit 10bfd83

Browse files
authored
Add support for autoscaling deployments (#305)
* Add support for autoscaling deployments * Change deployment autoscaling request schema * Add autoscaling options to output of the deployments details command * Add missing help for new options * Add autoscaling options to deployments update command
1 parent 0e994d7 commit 10bfd83

File tree

11 files changed

+388
-24
lines changed

11 files changed

+388
-24
lines changed

gradient/api_sdk/clients/deployment_client.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def create(
5555
workspace_username=None,
5656
workspace_password=None,
5757
project_id=None,
58+
autoscaling=None,
5859
):
5960
"""
6061
Method to create a Deployment instance.
@@ -104,6 +105,7 @@ def create(
104105
:param str workspace_username: Project git repository username
105106
:param str workspace_password: Project git repository password
106107
:param str project_id: Project ID
108+
:param models.AutoscalingDefinition autoscaling: Deployment autoscaling definition
107109
108110
:returns: Created deployment id
109111
:rtype: str
@@ -135,6 +137,7 @@ def create(
135137
workspace_username=workspace_username,
136138
workspace_password=workspace_password,
137139
project_id=project_id,
140+
autoscaling=autoscaling,
138141
)
139142

140143
repository = self.build_repository(repositories.CreateDeployment)
@@ -233,6 +236,7 @@ def update(
233236
workspace_password=None,
234237
project_id=None,
235238
command=None,
239+
autoscaling=None,
236240
):
237241
deployment = models.Deployment(
238242
deployment_type=deployment_type,
@@ -261,6 +265,7 @@ def update(
261265
workspace_password=workspace_password,
262266
project_id=project_id,
263267
command=command,
268+
autoscaling=autoscaling,
264269
)
265270

266271
repository = self.build_repository(repositories.UpdateDeployment)

gradient/api_sdk/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .artifact import Artifact
22
from .cluster import Cluster
33
from .dataset import Dataset, VolumeOptions
4-
from .deployment import Deployment
4+
from .deployment import Deployment, AutoscalingDefinition, AutoscalingMetric
55
from .experiment import BaseExperiment, MultiNodeExperiment, SingleNodeExperiment, MpiMultiNodeExperiment
66
from .hyperparameter import Hyperparameter
77
from .job import Job

gradient/api_sdk/models/deployment.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,22 @@
33
import attr
44

55

6+
# Single autoscaling metric/resource entry of a deployment.
# All four attributes are declared without defaults, so each one has to be
# given when an instance is created.
@attr.s
7+
class AutoscalingMetric(object):
8+
# kind of entry; the CLI validators in this commit pass "Metric" or "Resource"
type = attr.ib(type=str)
9+
# metric or resource name, e.g. "cpu"
name = attr.ib(type=str)
10+
# how ``value`` is interpreted, e.g. "targetAverage"
value_type = attr.ib(type=str)
11+
# numeric value for this metric
value = attr.ib(type=float)
12+
13+
14+
# Autoscaling settings of a deployment.
# Every attribute is optional: the three scalar attributes default to None and
# ``metrics`` defaults to a new empty list for each instance (factory=list).
@attr.s
15+
class AutoscalingDefinition(object):
16+
min_instance_count = attr.ib(type=int, default=None)
17+
max_instance_count = attr.ib(type=int, default=None)
18+
scale_cooldown_period = attr.ib(type=int, default=None)
19+
metrics = attr.ib(type=list, factory=list) # instances of AutoscalingMetric
20+
21+
622
@attr.s
723
class Deployment(object):
824
"""
@@ -82,6 +98,7 @@ class Deployment(object):
8298
workspace_username = attr.ib(type=str, default=None)
8399
workspace_password = attr.ib(type=str, default=None)
84100
metrics_url = attr.ib(type=str, default=None)
101+
autoscaling = attr.ib(type=AutoscalingDefinition, default=None)
85102

86103
dt_created = attr.ib(type=datetime.datetime, default=None)
87104
dt_modified = attr.ib(type=datetime.datetime, default=None)

gradient/api_sdk/serializers/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ class BaseSchema(marshmallow.Schema):
55
MODEL = None
66

77
@marshmallow.post_dump
8-
def remove_none_values(self, data):
8+
def remove_none_or_empty_values(self, data):
99
return {
1010
key: value for key, value in data.items()
11-
if value not in (None, {})
11+
if value not in (None, {}, [])
1212
}
1313

1414
def get_instance(self, obj_dict, many=False):

gradient/api_sdk/serializers/deployment.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,24 @@
44
from .. import models
55

66

7+
# Schema for models.AutoscalingMetric (see MODEL below).
class AutoscalingMetricSchema(BaseSchema):
8+
MODEL = models.AutoscalingMetric
9+
10+
type = ma.fields.Str()
11+
name = ma.fields.Str()
12+
# snake_case attribute is written to / read from the camelCase key "valueType"
value_type = ma.fields.Str(dump_to="valueType", load_from="valueType")
13+
value = ma.fields.Float()
14+
15+
16+
# Schema for models.AutoscalingDefinition (see MODEL below).
# The integer fields are written to / read from camelCase keys.
class AutoscalingDefinitionSchema(BaseSchema):
17+
MODEL = models.AutoscalingDefinition
18+
19+
min_instance_count = ma.fields.Int(dump_to="minInstanceCount", load_from="minInstanceCount")
20+
max_instance_count = ma.fields.Int(dump_to="maxInstanceCount", load_from="maxInstanceCount")
21+
scale_cooldown_period = ma.fields.Int(dump_to="scaleCooldownPeriod", load_from="scaleCooldownPeriod")
22+
# list of nested AutoscalingMetricSchema items (many=True)
metrics = ma.fields.Nested(AutoscalingMetricSchema, many=True, default=None)
23+
24+
725
class DeploymentSchema(BaseSchema):
826
MODEL = models.Deployment
927

@@ -46,6 +64,7 @@ class DeploymentSchema(BaseSchema):
4664
dt_started = ma.fields.DateTime(dump_to="dtStarted", load_from="dtStarted")
4765
dt_stopped = ma.fields.DateTime(dump_to="dtStopped", load_from="dtStopped")
4866
dt_deleted = ma.fields.DateTime(dump_to="dtDeleted", load_from="dtDeleted")
67+
autoscaling = ma.fields.Nested(AutoscalingDefinitionSchema)
4968

5069

5170
class DeploymentCreateSchema(DeploymentSchema):

gradient/cli/deployments.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,52 @@ def get_workspace_handler(api_key):
2525
return workspace_handler
2626

2727

28+
def validate_autoscaling_metric_or_resource(ctx, param, value, metric_type):
    """
    Parse autoscaling metric/resource option values passed to the CLI.

    Every item of ``value`` has to follow the ``name/value_type:value`` format.

    value in = ("cpu/targetAverage:10",)
    value out = ({"type": metric_type,
                  "name": "cpu",
                  "value_type": "targetAverage",
                  "value": 10.0},)

    :param ctx: click context; not used, present to match the click callback signature
    :param param: click parameter; not used, present to match the click callback signature
    :param value: iterable of option strings or None
    :param str metric_type: value stored under the "type" key of every parsed item

    :returns: tuple with one dictionary per parsed item or None if ``value`` is None
    :rtype: tuple[dict]|None
    :raises click.BadParameter: if an item does not match the expected format
    """
    if value is None:
        return None

    new_values = []

    for old_value in value:
        try:
            # maxsplit=1: only the first "/" and the first ":" after it act as separators
            name, rest = old_value.split("/", 1)
            value_type, raw_number = rest.split(":", 1)
            number = float(raw_number)
        except (AttributeError, TypeError, ValueError) as e:
            # AttributeError/TypeError: the item is not a string
            # ValueError: a separator is missing or the number cannot be parsed
            debug_msg = "Error occurred while validating autoscaling {} with value {}: {}" \
                .format(metric_type, old_value, e)
            clilogger.CliLogger().debug(debug_msg)

            # include the rejected item so the user can tell which option value was wrong
            msg = "value {} needs to be in format resource_name/value_type:value, " \
                  "for example cpu/targetAverage:60".format(old_value)
            raise click.BadParameter(msg)

        new_values.append({
            "type": metric_type,
            "name": name,
            "value_type": value_type,
            "value": number,
        })

    return tuple(new_values)
64+
65+
66+
def validate_autoscaling_metric(ctx, param, value):
    """
    Click callback for the metric option.

    Delegates to validate_autoscaling_metric_or_resource with "Metric" as the metric type.
    """
    metric_type = "Metric"
    return validate_autoscaling_metric_or_resource(ctx, param, value, metric_type)
68+
69+
70+
def validate_autoscaling_resource(ctx, param, value):
    """
    Click callback for the resource option.

    Delegates to validate_autoscaling_metric_or_resource with "Resource" as the metric type.
    """
    metric_type = "Resource"
    return validate_autoscaling_metric_or_resource(ctx, param, value, metric_type)
72+
73+
2874
@cli.group("deployments", help="Manage deployments", cls=ClickGroup)
2975
def deployments_group():
3076
pass
@@ -220,6 +266,40 @@ def deployments_metrics():
220266
help="Workspace password",
221267
cls=common.GradientOption,
222268
)
269+
@click.option(
270+
"--minInstanceCount",
271+
"min_instance_count",
272+
help="Minimal instance count",
273+
cls=common.GradientOption,
274+
)
275+
@click.option(
276+
"--maxInstanceCount",
277+
"max_instance_count",
278+
help="Maximal instance count",
279+
cls=common.GradientOption,
280+
)
281+
@click.option(
282+
"--scaleCooldownPeriod",
283+
"scale_cooldown_period",
284+
help="Scale cooldown period",
285+
cls=common.GradientOption,
286+
)
287+
@click.option(
288+
"--metric",
289+
"metrics",
290+
multiple=True,
291+
callback=validate_autoscaling_metric,
292+
help="Autoscaling metrics. Example: my_metric/targetAverage:21.37",
293+
cls=common.GradientOption,
294+
)
295+
@click.option(
296+
"--resource",
297+
"resources",
298+
multiple=True,
299+
callback=validate_autoscaling_resource,
300+
help="Autoscaling resources. Example: cpu/target:60",
301+
cls=common.GradientOption,
302+
)
223303
@api_key_option
224304
@common.options_file
225305
def create_deployment(api_key, options_file, **kwargs):
@@ -490,6 +570,40 @@ def delete_deployment(id_, options_file, api_key):
490570
help="Workspace password",
491571
cls=common.GradientOption,
492572
)
573+
@click.option(
574+
"--minInstanceCount",
575+
"min_instance_count",
576+
help="Minimal instance count",
577+
cls=common.GradientOption,
578+
)
579+
@click.option(
580+
"--maxInstanceCount",
581+
"max_instance_count",
582+
help="Maximal instance count",
583+
cls=common.GradientOption,
584+
)
585+
@click.option(
586+
"--scaleCooldownPeriod",
587+
"scale_cooldown_period",
588+
help="Scale cooldown period",
589+
cls=common.GradientOption,
590+
)
591+
@click.option(
592+
"--metric",
593+
"metrics",
594+
multiple=True,
595+
callback=validate_autoscaling_metric,
596+
help="Autoscaling metrics. Example: my_metric/targetAverage:21.37",
597+
cls=common.GradientOption,
598+
)
599+
@click.option(
600+
"--resource",
601+
"resources",
602+
multiple=True,
603+
callback=validate_autoscaling_resource,
604+
help="Autoscaling resources. Example: cpu/target:60",
605+
cls=common.GradientOption,
606+
)
493607
@api_key_option
494608
@common.options_file
495609
def update_deployment(deployment_id, api_key, options_file, **kwargs):

gradient/commands/deployments.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import abc
2+
import itertools
23
import json
34
import pydoc
45

@@ -7,8 +8,8 @@
78
from click import style
89
from halo import halo
910

10-
from gradient import exceptions, DeploymentsClient
11-
from gradient.api_sdk import sdk_exceptions, utils, models
11+
from gradient import exceptions, DeploymentsClient, AutoscalingMetric, AutoscalingDefinition
12+
from gradient.api_sdk import sdk_exceptions, utils
1213
from gradient.api_sdk.config import config
1314
from gradient.api_sdk.utils import concatenate_urls
1415
from gradient.cli_constants import CLI_PS_CLIENT_NAME
@@ -27,6 +28,29 @@ def _get_client(self, api_key, logger):
2728
return client
2829

2930

31+
class HandleAutoscalingOptions(object):
    """Mixin that folds the flat autoscaling options into a single "autoscaling" entry."""

    def _handle_autoscaling_options(self, kwargs):
        """
        Replace the autoscaling related keys of ``kwargs`` with an AutoscalingDefinition.

        The keys "metrics", "resources", "min_instance_count", "max_instance_count" and
        "scale_cooldown_period" are removed from ``kwargs`` (missing keys are treated as None)
        and the resulting definition is stored under the "autoscaling" key.

        :param dict kwargs: command options; modified in place
        """
        metrics = kwargs.pop("metrics", None) or []
        resources = kwargs.pop("resources", None) or []

        # resources come first, metrics second
        autoscaling_metrics_and_resources = [
            AutoscalingMetric(
                type=entry["type"],
                name=entry["name"],
                value_type=entry["value_type"],
                value=entry["value"],
            )
            for entry in itertools.chain(resources, metrics)
        ]

        kwargs["autoscaling"] = AutoscalingDefinition(
            min_instance_count=kwargs.pop("min_instance_count", None),
            max_instance_count=kwargs.pop("max_instance_count", None),
            scale_cooldown_period=kwargs.pop("scale_cooldown_period", None),
            metrics=autoscaling_metrics_and_resources,
        )
52+
53+
3054
class HandleWorkspaceMixin(object):
3155
def _handle_workspace(self, instance_dict):
3256
handler = self.workspace_handler.handle(instance_dict)
@@ -37,14 +61,15 @@ def _handle_workspace(self, instance_dict):
3761
instance_dict["workspace_url"] = handler
3862

3963

40-
class CreateDeploymentCommand(BaseDeploymentCommand, HandleWorkspaceMixin):
64+
class CreateDeploymentCommand(HandleAutoscalingOptions, BaseDeploymentCommand, HandleWorkspaceMixin):
4165
def __init__(self, workspace_handler, *args, **kwargs):
4266
super(CreateDeploymentCommand, self).__init__(*args, **kwargs)
4367
self.workspace_handler = workspace_handler
4468

4569
def execute(self, **kwargs):
4670
self._handle_auth(kwargs)
4771
self._handle_workspace(kwargs)
72+
self._handle_autoscaling_options(kwargs)
4873
with halo.Halo(text="Creating new deployment", spinner="dots"):
4974
deployment_id = self.client.create(**kwargs)
5075

@@ -131,13 +156,14 @@ def execute(self, **kwargs):
131156
self.logger.log("Deployment deleted")
132157

133158

134-
class UpdateDeploymentCommand(BaseDeploymentCommand, HandleWorkspaceMixin):
159+
class UpdateDeploymentCommand(HandleAutoscalingOptions, BaseDeploymentCommand, HandleWorkspaceMixin):
135160
def __init__(self, workspace_handler, *args, **kwargs):
136161
super(UpdateDeploymentCommand, self).__init__(*args, **kwargs)
137162
self.workspace_handler = workspace_handler
138163

139164
def execute(self, deployment_id, **kwargs):
140165
self._handle_workspace(kwargs)
166+
self._handle_autoscaling_options(kwargs)
141167

142168
with halo.Halo(text="Updating deployment data", spinner="dots"):
143169
self.client.update(deployment_id, **kwargs)
@@ -151,6 +177,7 @@ def _get_table_data(self, instance):
151177
:param models.Deployment instance:
152178
"""
153179
tags_string = ", ".join(instance.tags)
180+
autoscaling_metrics_string = self.get_autoscaling_metrics_string(instance)
154181

155182
data = (
156183
("ID", instance.id),
@@ -166,9 +193,21 @@ def _get_table_data(self, instance):
166193
("API type", instance.api_type),
167194
("Cluster ID", instance.cluster_id),
168195
("Tags", tags_string),
196+
("Min Instance Count", getattr(instance.autoscaling, "min_instance_count", "")),
197+
("Max Instance Count", getattr(instance.autoscaling, "max_instance_count", "")),
198+
("Scale Cooldown Period", getattr(instance.autoscaling, "scale_cooldown_period", "")),
199+
("Autoscaling Metrics", autoscaling_metrics_string),
169200
)
170201
return data
171202

203+
def get_autoscaling_metrics_string(self, instance):
    """
    Build the text shown for the autoscaling metrics of a deployment.

    :param instance: object with an ``autoscaling`` attribute

    :returns: one "name/value_type:value" line per metric joined with newlines,
              or an empty string if there is no autoscaling definition or it has no metrics
    :rtype: str
    """
    autoscaling = instance.autoscaling
    if autoscaling and autoscaling.metrics:
        lines = [
            "{}/{}:{}".format(metric.name, metric.value_type, metric.value)
            for metric in autoscaling.metrics
        ]
        return "\n".join(lines)

    return ""
210+
172211

173212
class DeploymentAddTagsCommand(BaseDeploymentCommand):
174213
def execute(self, deployment_id, *args, **kwargs):

tests/config_files/deployments_update.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,19 @@ imageUrl: https://www.latlmes.com/breaking/paperspace-now-has-a-100-bilion-valua
2020
imageUsername: some_image_username
2121
instanceCount: 666
2222
machineType: G1
23+
maxInstanceCount: 64
2324
method: some_method
25+
metric:
26+
- loss/target:2.0
27+
- keton/target:21.37
28+
minInstanceCount: "4"
2429
modelId: some_model_id
2530
name: some_name
2631
ports: '5000'
2732
projectId: some_project_id
33+
resource:
34+
- cpu/targetAverage:10
35+
scaleCooldownPeriod: 123
2836
workspace: s3://some-workspace
2937
workspaceRef: some_branch_name
3038
workspaceUsername: username

0 commit comments

Comments
 (0)