
Commit d161578

Merge pull request #63 from truefoundry/md_add_wf_example: added workflow example to deploy a model

2 parents 49e619e + db84285

File tree

10 files changed: +333 -0 lines changed
@@ -0,0 +1,33 @@
## There are a few prerequisites we have to fulfill before deploying the workflow

### Creating the ML repo and giving the workspace access to that ML repo

- First, create an ML repo where you want to log the models. To learn how to create an ML repo, click [here](https://docs.truefoundry.com/docs/creating-a-ml-repo#/).
- Give the workspace where you will be deploying your workflow and the model access to the ML repo. To learn how to grant access, click [here](https://docs.truefoundry.com/docs/key-concepts#/grant-access-of-ml-repo-to-workspace).

### Setting the values of the default variables

- Set the values of the environment variables `TFY_API_KEY` and `TFY_HOST` in `task_config` in the `train-deploy-workflow.py` file (see the snippet below).
- You can use a virtual account token as `TFY_API_KEY`; click [here](https://docs.truefoundry.com/docs/generating-truefoundry-api-keys#virtual-accounts) to learn how to create a virtual account.
- Set the `host` value in the `Port` field in the `deploy.py` file.
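For reference, these placeholders appear in the code added by this commit; a minimal excerpt is shown below (only the `<...>` values need to be replaced):

```python
# Excerpt from task_config in train-deploy-workflow.py -- replace the placeholders with your values.
env={
    "TFY_API_KEY": "<your-api-key>",  # e.g. a virtual account token
    "TFY_HOST": "<tfy-host-value>",   # your TrueFoundry host URL
},

# Excerpt from deploy.py -- set the host on which the deployed service will be exposed.
ports=[Port(port=8000, host=f"{service_name}-<your-host>")],
```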
## Running the workflow locally

To run the workflow locally, run the following command:

```bash
python train-deploy-workflow.py
```
## Deploying the workflow

You can deploy the workflow using the following command. Make sure your TrueFoundry CLI version is greater than `4.0.0`.

```bash
tfy deploy workflow --name <wf-name> --file train-deploy-workflow.py --workspace-fqn <workspace-fqn>
```

**Make sure you have the workflow Helm chart installed in the workspace in which you are deploying the workflow.**
## Executing the workflow

The workflow takes the following arguments as input when it is executed (see the invocation sketch below):

- `ml_repo`: The name of the ML repo where the model will be logged. The workspace should have access to this ML repo.
- `workspace_fqn`: The FQN of the workspace where you want to deploy the model.
- `epochs`: An array of integers defining the number of epochs to train the model for; each entry runs with the learning rate at the same position in the `learning_rate` argument. The lengths of `epochs` and `learning_rate` should be the same.
- `learning_rate`: An array of floats where each value is a learning rate to train the model with, corresponding to the epochs defined at the same position.
- `accuracy_threshold`: The threshold value; the workflow deploys the model only if its validation accuracy is greater than this threshold.
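For illustration, here is a minimal sketch of how these arguments map onto the workflow function defined in `train-deploy-workflow.py`. The `__main__` block and the argument values are hypothetical examples, not part of this commit; if appended to `train-deploy-workflow.py`, it would run the workflow locally with those inputs.

```python
# Hypothetical __main__ block for train-deploy-workflow.py; all argument values are examples only.
if __name__ == "__main__":
    result = model_training_workflow(
        ml_repo="my-ml-repo",                   # ML repo the workspace has access to
        workspace_fqn="<cluster>:<workspace>",  # FQN of the target workspace
        epochs=[2, 3, 5],                       # one training run per entry
        learning_rate=[0.1, 0.001, 0.001],      # must be the same length as epochs
        accuracy_threshold=0.15,                # deploy only if val_accuracy exceeds this
    )
    print(result)
```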

workflows/train_and_deploy_workflow/deploy_model/__init__.py

Whitespace-only changes.
deploy_model/deploy.py

@@ -0,0 +1,59 @@
import argparse
import logging

from truefoundry.deploy import (
    ArtifactsDownload,
    Build,
    LocalSource,
    Port,
    PythonBuild,
    Resources,
    Service,
    TruefoundryArtifactSource,
)
from truefoundry.deploy.v2.lib.deploy import ServiceFoundryServiceClient

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)-8s %(message)s")


def str_or_none(value):
    return None if not value or value == "None" else value

def deploy_service(model_version_fqn: str, workspace_fqn: str):
    service_name = "mnist-classification"
    service = Service(
        name=service_name,
        image=Build(
            build_source=LocalSource(local_build=False),
            build_spec=PythonBuild(
                python_version="3.11",
                command="python deploy_model/gradio_demo.py",
                # for deploying fastapi
                # command="uvicorn fastapi_service:app --port 8000 --host 0.0.0.0",
                requirements_path="deploy_model/requirements.txt",
            ),
        ),
        ports=[Port(port=8000, host=f"{service_name}-<your-host>")],
        resources=Resources(
            memory_limit=500,
            memory_request=500,
            ephemeral_storage_limit=600,
            ephemeral_storage_request=600,
            cpu_limit=0.3,
            cpu_request=0.3,
        ),
        artifacts_download=ArtifactsDownload(
            artifacts=[
                TruefoundryArtifactSource(
                    artifact_version_fqn=model_version_fqn,
                    download_path_env_variable="MODEL_DOWNLOAD_PATH",
                )
            ]
        ),
        labels={"tfy_openapi_path": "openapi.json"},
    )
    deployment = service.deploy(workspace_fqn=workspace_fqn, wait=False)
    client = ServiceFoundryServiceClient()

    url = f"{client.base_url.strip('/')}/applications/{deployment.applicationId}?tab=deployments"
    return url
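As a usage sketch (not part of this commit), `deploy_service` can also be called directly once a model version has been logged; both FQNs below are placeholders, and the call assumes you are authenticated with TrueFoundry:

```python
# Hypothetical direct use of deploy_service from deploy_model/deploy.py.
# Assumes TrueFoundry credentials are configured; both FQNs are placeholders.
from deploy_model.deploy import deploy_service

dashboard_url = deploy_service(
    model_version_fqn="<model-version-fqn>",  # FQN of a model version logged in the ML repo
    workspace_fqn="<workspace-fqn>",          # workspace to deploy the service into
)
print(f"Track the deployment at: {dashboard_url}")
```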
deploy_model/gradio_demo.py

@@ -0,0 +1,25 @@
import os

import gradio as gr
from predict import load_model, predict_fn

model_path = os.path.join(os.environ.get("MODEL_DOWNLOAD_PATH", "."), "mnist_model.h5")
model = load_model(model_path)


def get_inference(img_arr):
    return predict_fn(model, img_arr)


interface = gr.Interface(
    fn=get_inference,
    inputs="image",
    outputs="label",
    examples=[["deploy_model/sample_images/3.jpg"], ["deploy_model/sample_images/5.png"], ["deploy_model/sample_images/7.png"]],
)

interface.launch(
    server_name="0.0.0.0",
    server_port=8000,
    root_path=os.environ.get("TFY_SERVICE_ROOT_PATH"),
)
deploy_model/predict.py

@@ -0,0 +1,22 @@
import numpy as np
import tensorflow as tf


def load_model(model_path: str) -> tf.keras.Model:
    # Load the trained model
    model = tf.keras.models.load_model(model_path)
    return model


def predict_fn(model, img_arr: np.ndarray) -> str:
    # Preprocess the image before passing it to the model
    img_arr = tf.expand_dims(img_arr, 0)

    # Resize to (1, 28, 28, 1)
    resized_image = tf.image.resize(img_arr, [28, 28])
    img_arr = resized_image[:, :, :, 0]  # Keep only the first channel (grayscale)

    predictions = model.predict(img_arr)
    predicted_label = tf.argmax(predictions[0]).numpy()

    return str(predicted_label)
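As a quick illustration (not part of this commit), the two helpers above can be exercised on a random image array; this sketch assumes a trained `mnist_model.h5` is present and is run from the `deploy_model/` directory so that `predict.py` is importable:

```python
# Minimal local check of load_model / predict_fn from predict.py.
# Assumes a trained mnist_model.h5 exists in the current directory.
import numpy as np

from predict import load_model, predict_fn

model = load_model("mnist_model.h5")

# A fake 28x28 3-channel image; predict_fn resizes it and keeps a single channel internally.
fake_image = np.random.randint(0, 255, size=(28, 28, 3)).astype(np.float32)
print(predict_fn(model, fake_image))  # prints a digit between 0 and 9 as a string
```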
deploy_model/requirements.txt

@@ -0,0 +1,2 @@
tensorflow==2.15.0
gradio==4.43.0
(Three additional files in this commit are not rendered in this view.)
train-deploy-workflow.py

@@ -0,0 +1,192 @@
from typing import Any, Dict, List, Tuple, Union
from truefoundry.workflow import (
    task,
    workflow,
    PythonTaskConfig,
    TaskPythonBuild,
    map_task,
    conditional,
)
from truefoundry.deploy import Resources
from functools import partial
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import numpy as np


task_config = PythonTaskConfig(
    image=TaskPythonBuild(
        python_version="3.9",
        pip_packages=[
            "truefoundry[workflow]==0.5.2",
            "tensorflow==2.15.0",
            "s3fs>=2024.10.0",
        ],
    ),
    resources=Resources(
        cpu_request=1.2,
        cpu_limit=1.2,
        memory_limit=3000,
        memory_request=3000,
        ephemeral_storage_limit=2000,
        ephemeral_storage_request=2000,
    ),
    service_account="default",
    env={
        "TF_CPP_MIN_LOG_LEVEL": "3",  # suppress tensorflow warnings
        "FLYTE_SDK_LOGGING_LEVEL": "40",
        "TFY_API_KEY": "<your-api-key>",
        "TFY_HOST": "<tfy-host-value>",
    },
)


@task(task_config=task_config)
def fetch_data() -> Dict[str, np.ndarray]:
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    return {"x_train": x_train, "y_train": y_train, "x_test": x_test, "y_test": y_test}


@task(task_config=task_config)
def train_model(
    epochs: int, learning_rate: float, data: Dict[str, np.ndarray], ml_repo: str
) -> str:
    from truefoundry.ml import get_client

    x_train, y_train, x_test, y_test = (
        data["x_train"],
        data["y_train"],
        data["x_test"],
        data["y_test"],
    )
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    client = get_client()
    run = client.create_run(ml_repo=ml_repo)

    model = tf.keras.Sequential(
        [
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10, activation="softmax"),
        ]
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # Compile the model
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    print(f"Started training the model for {epochs} epochs")
    history = model.fit(
        x_train, y_train, epochs=epochs, validation_data=(x_test, y_test)
    )

    # Evaluate the model
    loss, accuracy = model.evaluate(x_test, y_test)
    print(f"Test loss: {loss}")
    print(f"Test accuracy: {accuracy}")

    history_dict = history.history
    train_accuracy = history_dict["accuracy"]
    val_accuracy = history_dict["val_accuracy"]
    train_loss = history_dict["loss"]  # per-epoch training loss, kept separate from the evaluated test loss

    # Log per-epoch metrics to the ML repo run
    for epoch in range(epochs):
        run.log_metrics(
            {
                "train_accuracy": train_accuracy[epoch],
                "val_accuracy": val_accuracy[epoch],
                "loss": train_loss[epoch],
            },
            step=epoch + 5,
        )

    model.save("mnist_model.h5")

    run.log_model(
        name="handwritten-digits-recognition",
        model_file_or_folder="mnist_model.h5",
        framework="tensorflow",
        description="sample model to recognize the handwritten digits",
        metadata={"accuracy": accuracy, "loss": loss},
        step=1,
    )

    run_fqn = run.fqn
    run.end()
    return run_fqn


@task(task_config=task_config)
def get_best_model(fqns: List[str], threshold: float) -> Tuple[str, bool]:
    from truefoundry.ml import get_client

    client = get_client()
    curr_accuracy = 0
    best_fqn = None
    print(f"Finding the best model from {len(fqns)} models")
    for index, fqn in enumerate(fqns, start=1):
        print(f"Comparing accuracy for model {index}")
        run = client.get_run_by_fqn(fqn)
        accuracy_metric = run.get_metrics().get("val_accuracy", 0)
        accuracy = accuracy_metric[-1].value
        if accuracy > curr_accuracy and accuracy > threshold:
            curr_accuracy = accuracy
            best_fqn = fqn
    if best_fqn:
        print("The fqn of the best model is: ", best_fqn)
        return best_fqn, True
    print("No model found with accuracy greater than threshold")
    return "", False


@task(task_config=task_config)
def deploy_model(run_fqn: str, workspace_fqn: str) -> str:
    from truefoundry.ml import get_client
    from deploy_model.deploy import deploy_service

    client = get_client()
    run = client.get_run_by_fqn(run_fqn)
    models = run.list_model_versions()
    model = next(models)  # the model version logged by this run
    print(f"Deploying model {model.fqn}")
    url = deploy_service(model_version_fqn=model.fqn, workspace_fqn=workspace_fqn)
    return f"Model deployed at {url}"


@task(task_config=task_config)
def do_nothing(threshold: float) -> str:
    return f"No model with validation accuracy greater than the threshold {threshold} was found"


@workflow
def model_training_workflow(
    ml_repo: str,
    workspace_fqn: str,
    epochs: List[int] = [2, 3, 5],
    learning_rate: List[float] = [0.1, 0.001, 0.001],
    accuracy_threshold: float = 0.15,
) -> Union[str, None]:
    data = fetch_data()
    train_model_function = partial(train_model, data=data, ml_repo=ml_repo)
    # Fan out train_model over the paired (epochs, learning_rate) inputs, at most 2 runs in parallel
    fqns = map_task(train_model_function, concurrency=2)(
        epochs=epochs, learning_rate=learning_rate
    )
    model_version_fqn, does_model_pass_threshold_accuracy = get_best_model(
        fqns=fqns, threshold=accuracy_threshold
    )
    message = (
        conditional("Deploy model")
        .if_(does_model_pass_threshold_accuracy == True)
        .then(deploy_model(run_fqn=model_version_fqn, workspace_fqn=workspace_fqn))
        .else_()
        .then(do_nothing(threshold=accuracy_threshold))
    )

    return message
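A side note on the fan-out in `model_training_workflow`: `map_task` pairs the `epochs` and `learning_rate` lists element-wise and launches one `train_model` run per pair, with `concurrency=2` capping how many run in parallel. A plain-Python analogy of that pairing (illustration only, not workflow code):

```python
# Plain-Python analogy of the element-wise pairing performed by map_task (illustration only).
epochs = [2, 3, 5]
learning_rate = [0.1, 0.001, 0.001]

for n_epochs, lr in zip(epochs, learning_rate):
    print(f"train_model runs with epochs={n_epochs}, learning_rate={lr}")
```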
