truefoundry
diff --git a/‎workflows/train_and_deploy_workflow/README.md
+33 b/‎workflows/train_and_deploy_workflow/README.md
+33
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/__init__.py b/‎workflows/train_and_deploy_workflow/deploy_model/__init__.py
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/deploy.py
+59 b/‎workflows/train_and_deploy_workflow/deploy_model/deploy.py
+59
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/gradio_demo.py
+25 b/‎workflows/train_and_deploy_workflow/deploy_model/gradio_demo.py
+25
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/predict.py
+22 b/‎workflows/train_and_deploy_workflow/deploy_model/predict.py
+22
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/requirements.txt
+2 b/‎workflows/train_and_deploy_workflow/deploy_model/requirements.txt
+2
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/3.jpg
2.91 KB b/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/3.jpg
2.91 KB
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/5.png
6.03 KB b/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/5.png
6.03 KB
diff --git a/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/7.png
24.6 KB b/‎workflows/train_and_deploy_workflow/deploy_model/sample_images/7.png
24.6 KB
diff --git a/‎workflows/train_and_deploy_workflow/train-deploy-workflow.py
+192 b/‎workflows/train_and_deploy_workflow/train-deploy-workflow.py
+192
@@ -0,0 +1,33 @@
+## Prerequisites to fulfill before deploying the workflow
+
+### Creating the ML repo and giving the workspace access to it
+- First, create an ML repo where you want to log the models. To learn how to create an ML repo, click [here](https://docs.truefoundry.com/docs/creating-a-ml-repo#/).
+- Give the workspace where you will be deploying your workflow and the model access to the ML repo. To learn how to grant access, click [here](https://docs.truefoundry.com/docs/key-concepts#/grant-access-of-ml-repo-to-workspace).
+
+### Setting the value of default variables
+
+- Set the values of the env variables `TFY_API_KEY` and `TFY_HOST` in `task_config` in the `train-deploy-workflow.py` file.
+- You can use a virtual account token as `TFY_API_KEY`. Click [here](https://docs.truefoundry.com/docs/generating-truefoundry-api-keys#virtual-accounts) to learn how to create a virtual account.
+- Set the `host` value in the `Port` field in the `deploy.py` file.
+
+## Running the workflow locally
+To run the workflow locally, run the following command
+```bash
+python train-deploy-workflow.py
+```
+## Deploying the workflow
+
+You can deploy the workflow using the following command. Make sure your truefoundry CLI version is greater than `4.0.0`.
+
+```bash
+tfy deploy workflow --name <wf-name> --file train-deploy-workflow.py --workspace-fqn <workspace-fqn>
+```
+**Make sure you have the workflow helm chart installed in the workspace in which you are deploying the workflow.**
+
+## Executing the workflow
+The workflow takes the following arguments as input when it is executed:
+- `ml_repo`: The name of the ML repo where you want to deploy the model. The workspace should have access to this ML repo.
+- `workspace_fqn`: The FQN of the workspace where you want to deploy the model.
+- `epochs`: An array of integers, each defining the number of epochs to train a model for. Each entry is paired with the learning rate at the same position in the `learning_rate` argument, so the lengths of `epochs` and `learning_rate` should be the same.
+- `learning_rate`: An array of floats where each number is the learning rate you want your model to train with, corresponding to the entry in `epochs` at the same position.
+- `accuracy_threshold`: The threshold value; the workflow will deploy the model only if its validation accuracy is greater than this threshold.
@@ -0,0 +1,59 @@
+import argparse
+import logging
+
+from truefoundry.deploy import (
+    ArtifactsDownload,
+    Build,
+    LocalSource,
+    Port,
+    PythonBuild,
+    Resources,
+    Service,
+    TruefoundryArtifactSource,
+)
+from truefoundry.deploy.v2.lib.deploy import ServiceFoundryServiceClient
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)-8s %(message)s")
+
+
+def str_or_none(value):
+    return None if not value or value == "None" else value
+
+def deploy_service(model_version_fqn: str, workspace_fqn: str):
+    service_name = "mnist-classification" 
+    service = Service(
+        name=service_name,
+        image=Build(
+            build_source=LocalSource(local_build=False),
+            build_spec=PythonBuild(
+                python_version="3.11",
+                command="python deploy_model/gradio_demo.py",
+                # for deploying fastapi
+                # command="uvicorn fastapi_service:app --port 8000 --host 0.0.0.0",
+                requirements_path="deploy_model/requirements.txt",
+            ),
+        ),
+        ports=[Port(port=8000, host=f"{service_name}-<your-host>")],
+        resources=Resources(
+            memory_limit=500,
+            memory_request=500,
+            ephemeral_storage_limit=600,
+            ephemeral_storage_request=600,
+            cpu_limit=0.3,
+            cpu_request=0.3,
+        ),
+        artifacts_download=ArtifactsDownload(
+            artifacts=[
+                TruefoundryArtifactSource(
+                    artifact_version_fqn=model_version_fqn,
+                    download_path_env_variable="MODEL_DOWNLOAD_PATH",
+                )
+            ]
+        ),
+        labels={"tfy_openapi_path": "openapi.json"},
+    )
+    deployment = service.deploy(workspace_fqn=workspace_fqn, wait=False)
+    client = ServiceFoundryServiceClient()
+
+    url = f"{client.base_url.strip('/')}/applications/{deployment.applicationId}?tab=deployments"
+    return url
@@ -0,0 +1,25 @@
+import os
+
+import gradio as gr
+from predict import load_model, predict_fn
+
+# Locate the model file: MODEL_DOWNLOAD_PATH is the env variable named in
+# deploy.py's artifact download config; fall back to the current directory.
+model_path = os.path.join(os.environ.get("MODEL_DOWNLOAD_PATH", "."), "mnist_model.h5")
+# Load the model once at import time so every request reuses it.
+model = load_model(model_path)
+
+
+# Gradio callback: forwards the uploaded image to predict_fn together with
+# the module-level model and returns the predicted label.
+def get_inference(img_arr):
+    return predict_fn(model, img_arr)
+
+
+# Image-in / label-out interface. The example paths are relative to the
+# working directory, so they assume the app is started from the directory
+# that contains deploy_model/.
+interface = gr.Interface(
+    fn=get_inference,
+    inputs="image",
+    outputs="label",
+    examples=[["deploy_model/sample_images/3.jpg"], ["deploy_model/sample_images/5.png"], ["deploy_model/sample_images/7.png"]],
+)
+
+# Listen on all interfaces on port 8000 (the port declared in deploy.py).
+# root_path comes from TFY_SERVICE_ROOT_PATH and is None when it is unset.
+interface.launch(
+    server_name="0.0.0.0",
+    server_port=8000,
+    root_path=os.environ.get("TFY_SERVICE_ROOT_PATH"),
+)
@@ -0,0 +1,22 @@
+import numpy as np
+import tensorflow as tf
+
+
+def load_model(model_path: str) -> tf.keras.Model:
+    # Load the trained model
+    model = tf.keras.models.load_model(model_path)
+    return model
+
+
+def predict_fn(model, img_arr: np.ndarray) -> str:
+    # Preprocess the image before passing it to the model
+    img_arr = tf.expand_dims(img_arr, 0)
+
+    # Resize to (1, 28, 28, 1)
+    resized_image = tf.image.resize(img_arr, [28, 28])
+    img_arr = resized_image[:, :, :, 0]  # Keep only the first channel (grayscale)
+
+    predictions = model.predict(img_arr)
+    predicted_label = tf.argmax(predictions[0]).numpy()
+
+    return str(predicted_label)
@@ -0,0 +1,2 @@
+tensorflow==2.15.0
+gradio==4.43.0
@@ -0,0 +1,192 @@
+from typing import Any, Dict, List, Tuple, Union
+from truefoundry.workflow import (
+    task,
+    workflow,
+    PythonTaskConfig,
+    TaskPythonBuild,
+    map_task,
+    conditional,
+)
+from truefoundry.deploy import Resources
+from functools import partial
+import tensorflow as tf
+from tensorflow.keras.datasets import mnist
+import numpy as np
+
+
+# Shared configuration passed to every @task in this file: the Python image
+# with its pip packages, the resource requests/limits, the service account
+# and the environment variables.
+# NOTE: TFY_API_KEY and TFY_HOST below are placeholders and must be replaced
+# before running or deploying the workflow (see the README). Avoid committing
+# a real API key to version control.
+task_config = PythonTaskConfig(
+    image=TaskPythonBuild(
+        python_version="3.9",
+        pip_packages=[
+            "truefoundry[workflow]==0.5.2",
+            "tensorflow==2.15.0",
+            "s3fs>=2024.10.0",
+        ],
+    ),
+    resources=Resources(
+        cpu_request=1.2,
+        cpu_limit=1.2,
+        memory_limit=3000,
+        memory_request=3000,
+        ephemeral_storage_limit=2000,
+        ephemeral_storage_request=2000,
+    ),
+    service_account="default",
+    env={
+        "TF_CPP_MIN_LOG_LEVEL": "3",  # suppress tensorflow warnings
+        "FLYTE_SDK_LOGGING_LEVEL": "40",
+        "TFY_API_KEY": "<your-api-key>",
+        "TFY_HOST": "<tfy-host-value>",
+    },
+)
+
+
+@task(task_config=task_config)
+def fetch_data() -> Dict[str, np.array]:
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+    return {"x_train": x_train, "y_train": y_train, "x_test": x_test, "y_test": y_test}
+
+
+@task(task_config=task_config)
+def train_model(
+    epochs: int, learning_rate: float, data: Dict[str, np.array], ml_repo: str
+) -> str:
+    from truefoundry.ml import get_client
+
+    x_train, y_train, x_test, y_test = (
+        data["x_train"],
+        data["y_train"],
+        data["x_test"],
+        data["y_test"],
+    )
+    x_train = x_train / 255.0
+    x_test = x_test / 255.0
+
+    client = get_client()
+    run = client.create_run(ml_repo=ml_repo)
+
+    model = tf.keras.Sequential(
+        [
+            tf.keras.layers.Flatten(),
+            tf.keras.layers.Dense(128, activation="relu"),
+            tf.keras.layers.Dense(10, activation="softmax"),
+        ]
+    )
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
+    # Compile the model
+    model.compile(
+        optimizer=optimizer,
+        loss="sparse_categorical_crossentropy",
+        metrics=["accuracy"],
+    )
+
+    epochs = epochs
+    print(f"Started training the model for {epochs} epochs")
+    history = model.fit(
+        x_train, y_train, epochs=epochs, validation_data=(x_test, y_test)
+    )
+
+    # Evaluate the model
+    loss, accuracy = model.evaluate(x_test, y_test)
+    print(f"Test loss: {loss}")
+    print(f"Test accuracy: {accuracy}")
+
+    history_dict = history.history
+    train_accuracy = history_dict["accuracy"]
+    val_accuracy = history_dict["val_accuracy"]
+    loss = history_dict["loss"]
+
+    for epoch in range(epochs):
+        run.log_metrics(
+            {
+                "train_accuracy": train_accuracy[epoch],
+                "val_accuracy": val_accuracy[epoch],
+                "loss": loss[epoch],
+            },
+            step=epoch + 5,
+        )
+
+    model.save("mnist_model.h5")
+
+    run.log_model(
+        name="handwritten-digits-recognition",
+        model_file_or_folder="mnist_model.h5",
+        framework="tensorflow",
+        description="sample model to recognize the handwritten digits",
+        metadata={"accuracy": accuracy, "loss": loss},
+        step=1,
+    )
+
+    run_fqn = run.fqn
+    run.end()
+    return run_fqn
+
+
+@task(task_config=task_config)
+def get_best_model(fqns: List[str], threshold: float) -> Tuple[str, bool]:
+    from truefoundry.ml import get_client
+
+    client = get_client()
+    curr_accuracy = 0
+    best_fqn = None
+    print(f"Finding the best models from {len(fqns)} models")
+    for fqn_no in range(len(fqns)):
+        print(f"Comparing accuracy for model {fqn_no+1}")
+        run = client.get_run_by_fqn(fqns[fqn_no])
+        accuracy_metric = run.get_metrics().get("val_accuracy", 0)
+        accuracy = accuracy_metric[-1].value
+        if accuracy > curr_accuracy and accuracy > threshold:
+            curr_accuracy = accuracy
+            best_fqn = fqns[fqn_no]
+    if best_fqn:
+        print("The fqn of the best model is: ", best_fqn)
+        return best_fqn, True
+    print("No model found with accuracy greater than threshold")
+    return "", False
+
+
+@task(task_config=task_config)
+def deploy_model(run_fqn: str, workspace_fqn: str) -> str:
+    from truefoundry.ml import get_client
+    from deploy_model.deploy import deploy_service
+
+    client = get_client()
+    run = client.get_run_by_fqn(run_fqn)
+    models = run.list_model_versions()
+    model = models.__next__()
+    print(f"Deploying model {model.fqn}")
+    url = deploy_service(model_version_fqn=model.fqn, workspace_fqn=workspace_fqn)
+    return f"Model deployed at {url}"
+
+
+# Fallback task used as the else-branch of the workflow's conditional: it
+# deploys nothing and only returns a message that includes the threshold.
+@task(task_config=task_config)
+def do_nothing(threshold: float) -> str:
+    return f"Model with threshold greater than {threshold} not found"
+
+
+# Workflow: fetch the data, train one model per (epochs, learning_rate) pair,
+# pick the best run above the accuracy threshold and deploy it; otherwise
+# return a "not found" message.
+#   ml_repo:            ML repo in which the training runs are created.
+#   workspace_fqn:      workspace the model service is deployed to.
+#   epochs:             epoch counts, one per training run.
+#   learning_rate:      learning rates, paired with `epochs` by position.
+#   accuracy_threshold: accuracy a run must exceed to be deployed.
+# NOTE(review): the list defaults are mutable default arguments; they are
+# presumably only read by the workflow framework as input defaults -- confirm
+# before changing them.
+@workflow
+def model_training_workflow(
+    ml_repo: str,
+    workspace_fqn: str,
+    epochs: List[int] = [2, 3, 5],
+    learning_rate: List[float] = [0.1, 0.001, 0.001],
+    accuracy_threshold: float = 0.15,
+) -> Union[str, None]:
+    data = fetch_data()
+    # Bind the inputs shared by every training run; only epochs and
+    # learning_rate are left to be mapped over.
+    train_model_function = partial(train_model, data=data, ml_repo=ml_repo)
+    # One train_model call per (epochs, learning_rate) pair; concurrency=2
+    # presumably caps how many run in parallel (confirm against map_task docs).
+    fqns = map_task(train_model_function, concurrency=2)(
+        epochs=epochs, learning_rate=learning_rate
+    )
+    # get_best_model returns a *run* FQN (plus a pass/fail flag), despite the
+    # local name below; it is passed on as run_fqn.
+    model_version_fqn, does_model_pass_threshold_accuracy = get_best_model(
+        fqns=fqns, threshold=accuracy_threshold
+    )
+    # The explicit `== True` is kept on purpose: the flag is a task output
+    # inside a workflow, presumably a deferred value rather than a plain bool,
+    # so the conditional needs a comparison expression (confirm against the
+    # workflow SDK docs before simplifying).
+    message = (
+        conditional("Deploy model")
+        .if_(does_model_pass_threshold_accuracy == True)
+        .then(deploy_model(run_fqn=model_version_fqn, workspace_fqn=workspace_fqn))
+        .else_()
+        .then(do_nothing(threshold=accuracy_threshold))
+    )
+
+    return message
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+tensorflow==2.15.0`
	`2`	`+gradio==4.43.0`