NVIDIA · nvkevlu · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025 · Jan 31, 2025
diff --git a/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/data/download.py b/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/data/download.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This Dirichlet sampling strategy for creating a heterogeneous partition is adopted
+# from FedMA (https://github.com/IBM/FedMA).
+
+# MIT License
+
+# Copyright (c) 2020 International Business Machines
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import argparse
+
+import torchvision.datasets as datasets
+
+# Default root directory handed to torchvision's CIFAR10 dataset; used when
+# --dataset_path is not supplied on the command line.
+CIFAR10_ROOT = "/tmp/nvflare/data/cifar10"
+
+
+def define_parser():
+    """Parse the command-line options of the CIFAR-10 download script.
+
+    Returns:
+        argparse.Namespace with a single attribute, ``dataset_path``: the root
+        directory to download into. It equals ``CIFAR10_ROOT`` both when the
+        flag is omitted and when it is given without a value.
+    """
+    parser = argparse.ArgumentParser(description="Download the CIFAR-10 training and test splits.")
+    parser.add_argument(
+        "--dataset_path",
+        type=str,
+        nargs="?",
+        default=CIFAR10_ROOT,
+        # With nargs="?" a bare ``--dataset_path`` yields ``const``; argparse's
+        # default const is None, which is not a usable dataset root.
+        const=CIFAR10_ROOT,
+        help="root directory for the dataset (default: %(default)s)",
+    )
+    return parser.parse_args()
+
+
+def main(args):
+    """Download both CIFAR-10 splits into ``args.dataset_path``.
+
+    Args:
+        args: object exposing ``dataset_path``, the root directory passed to
+            torchvision's ``CIFAR10`` dataset.
+    """
+    # Training split first, then the test split; both share the same root.
+    for is_train in (True, False):
+        datasets.CIFAR10(root=args.dataset_path, train=is_train, download=True)
+
+
+# Script entry point: parse the command line, then download the dataset.
+if __name__ == "__main__":
+    main(define_parser())
diff --git a/...Chapter-1_running_federated_learning_applications/01.5_experiment_tracking/code/fl_job.py b/...Chapter-1_running_federated_learning_applications/01.5_experiment_tracking/code/fl_job.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from src.network import SimpleNetwork
+
+from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob
+from nvflare.job_config.script_runner import ScriptRunner
+
+if __name__ == "__main__":
+    # Job settings: number of participating client sites and number of rounds.
+    num_rounds = 2
+    n_clients = 5
+    train_script = "src/client.py"
+
+    # FedAvg job whose initial model is a freshly constructed SimpleNetwork.
+    job = FedAvgJob(
+        name="fedavg",
+        n_clients=n_clients,
+        num_rounds=num_rounds,
+        initial_model=SimpleNetwork(),
+    )
+
+    # Every site runs the same training script with no extra command-line
+    # arguments; sites are named site-1 .. site-<n_clients>.
+    for client_idx in range(n_clients):
+        runner = ScriptRunner(script=train_script, script_args="")
+        job.to(runner, f"site-{client_idx + 1}")
+
+    # Run the job via simulator_run; the logging configuration is read from the
+    # current working directory.
+    job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", log_config="./log_config.json")
diff --git a/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/fl_job_mlflow.py b/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/fl_job_mlflow.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nvflare.app_opt.tracking.mlflow.mlflow_receiver import MLflowReceiver
+from src.network import SimpleNetwork
+
+from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob
+from nvflare.job_config.script_runner import ScriptRunner
+
+if __name__ == "__main__":
+    # Job settings: number of participating client sites and number of rounds.
+    num_rounds = 2
+    n_clients = 5
+    train_script = "src/client.py"
+
+    # FedAvg job whose initial model is a freshly constructed SimpleNetwork.
+    job = FedAvgJob(
+        name="fedavg",
+        n_clients=n_clients,
+        num_rounds=num_rounds,
+        initial_model=SimpleNetwork(),
+    )
+
+    # Options forwarded to the MLflow receiver: experiment/run names plus
+    # markdown notes stored under the "mlflow.note.content" tag.
+    mlflow_kw_args = {
+        "experiment_name": "nvflare-fedavg-experiment",
+        "run_name": "nvflare-fedavg-with-mlflow",
+        "experiment_tags": {"mlflow.note.content": "## **NVFlare FedAvg experiment with MLflow**"},
+        "run_tags": {"mlflow.note.content": "## Federated Experiment tracking with MLflow.\n"},
+    }
+    # The tracking URI is a file store located beneath the same workspace
+    # directory that is passed to simulator_run below.
+    receiver = MLflowReceiver(
+        tracking_uri="file:///tmp/nvflare/jobs/workdir/server/simulate_job/mlruns",
+        kw_args=mlflow_kw_args,
+    )
+    job.to_server(receiver)
+
+    # Every site runs the same training script with no extra command-line
+    # arguments; sites are named site-1 .. site-<n_clients>.
+    for client_idx in range(n_clients):
+        runner = ScriptRunner(script=train_script, script_args="")
+        job.to(runner, f"site-{client_idx + 1}")
+
+    job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", log_config="./log_config.json")
diff --git a/...g_federated_learning_applications/01.5_experiment_tracking/code/img/cifar10.png b/...g_federated_learning_applications/01.5_experiment_tracking/code/img/cifar10.png
diff --git a/...r-1_running_federated_learning_applications/01.5_experiment_tracking/code/log_config.json b/...r-1_running_federated_learning_applications/01.5_experiment_tracking/code/log_config.json
@@ -0,0 +1,87 @@
+{
+    "version": 1,
+    "disable_existing_loggers": false,
+    "formatters": {
+        "baseFormatter": {
+            "()": "nvflare.fuel.utils.log_utils.BaseFormatter",
+            "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s"
+        },
+        "colorFormatter": {
+            "()": "nvflare.fuel.utils.log_utils.ColorFormatter",
+            "fmt": "%(asctime)s - %(levelname)s - %(message)s",
+            "datefmt": "%Y-%m-%d %H:%M:%S"
+        },
+        "jsonFormatter": {
+            "()": "nvflare.fuel.utils.log_utils.JsonFormatter",
+            "fmt": "%(asctime)s - %(identity)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s"
+        }
+    },
+    "filters": {
+        "FLFilter": {
+            "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter",
+            "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"]
+        }
+    },
+    "handlers": {
+        "consoleHandler": {
+            "class": "logging.StreamHandler",
+            "level": "INFO",
+            "formatter": "colorFormatter",
+            "filters": ["FLFilter"],
+            "stream": "ext://sys.stdout"
+        },
+        "logFileHandler": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "level": "DEBUG",
+            "formatter": "baseFormatter",
+            "filename": "log.txt",
+            "mode": "a",
+            "maxBytes": 20971520,
+            "backupCount": 10
+        },
+        "errorFileHandler": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "level": "ERROR",
+            "formatter": "baseFormatter",
+            "filename": "log_error.txt",
+            "mode": "a",
+            "maxBytes": 20971520,
+            "backupCount": 10
+        },
+        "jsonFileHandler": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "level": "DEBUG",
+            "formatter": "jsonFormatter",
+            "filename": "log.json",
+            "mode": "a",
+            "maxBytes": 20971520,
+            "backupCount": 10
+        },
+        "FLFileHandler": {
+            "class": "logging.handlers.RotatingFileHandler",
+            "level": "DEBUG",
+            "formatter": "baseFormatter",
+            "filters": ["FLFilter"],
+            "filename": "log_fl.txt",
+            "mode": "a",
+            "maxBytes": 20971520,
+            "backupCount": 10,
+            "delay": true
+        }
+    },
+    "loggers": {
+        "root": {
+            "level": "INFO",
+            "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"]
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
diff --git a/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/requirements.txt b/...-1_running_federated_learning_applications/01.5_experiment_tracking/code/requirements.txt
@@ -0,0 +1,3 @@
+torch
+torchvision
+tensorboard
diff --git a/...ter-1_running_federated_learning_applications/01.5_experiment_tracking/code/src/client.py b/...ter-1_running_federated_learning_applications/01.5_experiment_tracking/code/src/client.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+from network import SimpleNetwork
+from torch import nn
+from torch.optim import SGD
+from torch.utils.data.dataloader import DataLoader
+from torchvision.datasets import CIFAR10
+from torchvision.transforms import Compose, Normalize, ToTensor
+
+import nvflare.client as flare
+from nvflare.client.tracking import SummaryWriter
+
+# Parent directory for the dataset; main() appends the site name to it so each
+# site reads (and, if needed, downloads) CIFAR-10 under its own sub-directory.
+DATASET_PATH = "/tmp/nvflare/data"
+
+
+def main():
+    """Run the client-side training loop for one federated site.
+
+    Initialises the NVFlare client API, loads (downloading if necessary) the
+    CIFAR-10 training split under ``DATASET_PATH/<site_name>`` and then, for as
+    long as ``flare.is_running()`` is true: receives the global model, trains it
+    locally for ``epochs`` epochs, streams the training loss through
+    ``SummaryWriter`` and sends the updated weights back. A summary of the
+    hyper-parameters and of the last epoch's mean loss is printed at the end.
+    """
+    batch_size = 4
+    epochs = 1
+    lr = 0.01
+    model = SimpleNetwork()
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    loss = nn.CrossEntropyLoss()
+    optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
+    transforms = Compose(
+        [
+            ToTensor(),
+            Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+
+    flare.init()
+    summary_writer = SummaryWriter()
+    sys_info = flare.system_info()
+    site_name = sys_info["site_name"]
+
+    # Each site keeps its own copy of the dataset below DATASET_PATH.
+    data_path = os.path.join(DATASET_PATH, site_name)
+
+    train_dataset = CIFAR10(root=data_path, transform=transforms, download=True, train=True)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    n_loaders = len(train_loader)
+
+    print("number of loaders = ", n_loaders)
+
+    # Named current_round (not ``round``) so the builtin is not shadowed.
+    current_round = 0
+    last_loss = 0
+    while flare.is_running():
+        input_model = flare.receive()
+        current_round = input_model.current_round
+
+        print(f"\n\nsite_name={site_name}, current_round={current_round + 1}\n ")
+
+        model.load_state_dict(input_model.params)
+        model.to(device)
+
+        # Number of optimizer steps performed by this site in one round.
+        steps = epochs * n_loaders
+        for epoch in range(epochs):
+            # window_*: batches since the last console report (reset every 3000 batches).
+            # epoch_*: batches since the start of the epoch (never reset inside it).
+            window_loss = 0.0
+            window_batches = 0
+            epoch_loss = 0.0
+            epoch_batches = 0
+
+            for i, batch in enumerate(train_loader):
+                images, labels = batch[0].to(device), batch[1].to(device)
+
+                optimizer.zero_grad()
+
+                predictions = model(images)
+                cost = loss(predictions, labels)
+                cost.backward()
+                optimizer.step()
+
+                # Loss of this batch scaled by 1/batch_size.
+                # NOTE(review): CrossEntropyLoss already averages over the batch by
+                # default, so this looks like a second normalisation; confirm it is intended.
+                batch_loss = cost.item() / batch_size
+                window_loss += batch_loss
+                window_batches += 1
+                epoch_loss += batch_loss
+                epoch_batches += 1
+
+                if i % 100 == 0:
+                    # Offset by the round so that steps of successive rounds do not overlap.
+                    global_step = current_round * steps + epoch * n_loaders + i
+                    # Log the mean over the current window rather than the growing sum.
+                    summary_writer.add_scalar(
+                        tag="training_loss",
+                        scalar=window_loss / window_batches,
+                        global_step=global_step,
+                    )
+
+                if i % 3000 == 0:
+                    # Divide by the batches actually accumulated; at i == 0 that is one, not 3000.
+                    print(
+                        f"Round: {current_round + 1}, Epoch: {epoch + 1}/{epochs}, batch: {i + 1}, "
+                        f"Loss: {window_loss / window_batches}"
+                    )
+                    window_loss = 0.0
+                    window_batches = 0
+
+            # Mean loss over the whole epoch, stored as a plain float. An empty
+            # loader leaves the previous value in place.
+            if epoch_batches:
+                last_loss = epoch_loss / epoch_batches
+            print(
+                f"site: {site_name}, round: {current_round + 1}, Epoch: {epoch + 1}/{epochs}, "
+                f"batch: {epoch_batches}, Loss: {last_loss}"
+            )
+
+        print("Finished Training")
+
+        PATH = "./cifar_net.pth"
+        torch.save(model.state_dict(), PATH)
+
+        # Send the locally trained weights back, together with the number of
+        # steps taken this round.
+        output_model = flare.FLModel(
+            params=model.cpu().state_dict(),
+            meta={"NUM_STEPS_CURRENT_ROUND": steps},
+        )
+
+        flare.send(output_model)
+
+    print(
+        f"\n"
+        f"Result Summary\n"
+        "    Training parameters:\n"
+        "       number of clients = 5\n"
+        f"       round = {current_round + 1},\n"
+        f"       batch_size = {batch_size},\n"
+        f"       epochs = {epochs},\n"
+        f"       lr = {lr},\n"
+        f"       total data batches = {n_loaders},\n"
+        f"    Metrics: last_loss = {last_loss}\n"
+    )
+
+
+# Run the training loop only when this file is executed as a script.
+if __name__ == "__main__":
+    main()