Skip to content

Commit 16e0c27

Browse files
ScriptExecutor improvements (#2820)
* script executor improvements * move ScriptExecutor to job_config * rename ScriptExecutor to ScriptRunner, add TF versions of in process and ex process executors * fix dead links --------- Co-authored-by: Chester Chen <[email protected]>
1 parent bf836c5 commit 16e0c27

38 files changed

+394
-294
lines changed

examples/advanced/job_api/pt/README.md

+8-8
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,27 @@ python "script_name.py"
1919
```
2020

2121
```commandline
22-
python fedavg_script_executor_lightning_cifar10.py
22+
python fedavg_script_runner_lightning_cifar10.py
2323
```
24-
### 1. [Federated averaging using the script executor](./fedavg_script_executor_cifar10.py)
24+
### 1. [Federated averaging using the script executor](./fedavg_script_runner_cifar10.py)
2525
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html).
2626

2727

28-
### 2. [Federated averaging using script executor and differential privacy filter](./fedavg_script_executor_dp_filter_cifar10.py)
28+
### 2. [Federated averaging using script executor and differential privacy filter](./fedavg_script_runner_dp_filter_cifar10.py)
2929
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
3030
with additional [differential privacy filters](https://arxiv.org/abs/1910.00962) on the client side.
3131
```commandline
32-
python fedavg_script_executor_dp_filter_cifar10.py
32+
python fedavg_script_runner_dp_filter_cifar10.py
3333
```
34-
### 3. [Swarm learning using script executor](./swarm_script_executor_cifar10.py)
34+
### 3. [Swarm learning using script executor](./swarm_script_runner_cifar10.py)
3535
Implementation of [swarm learning](https://www.nature.com/articles/s41586-021-03583-3) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
3636
```commandline
37-
python swarm_script_executor_cifar10.py
37+
python swarm_script_runner_cifar10.py
3838
```
39-
### 4. [Cyclic weight transfer using script executor](./cyclic_cc_script_executor_cifar10.py)
39+
### 4. [Cyclic weight transfer using script executor](./cyclic_cc_script_runner_cifar10.py)
4040
Implementation of [cyclic weight transfer](https://arxiv.org/abs/1709.05929) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
4141
```commandline
42-
python cyclic_cc_script_executor_cifar10.py
42+
python cyclic_cc_script_runner_cifar10.py
4343
```
4444
### 5. [Federated averaging using model learning](./fedavg_model_learner_xsite_val_cifar10.py)
4545
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [model learner class](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/model_learner.html),

examples/advanced/job_api/pt/cyclic_cc_script_executor_cifar10.py examples/advanced/job_api/pt/cyclic_cc_script_runner_cifar10.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
from nvflare.app_common.ccwf.ccwf_job import CCWFJob, CyclicClientConfig, CyclicServerConfig
1818
from nvflare.app_common.ccwf.comps.simple_model_shareable_generator import SimpleModelShareableGenerator
19-
from nvflare.app_common.executors.script_executor import ScriptExecutor
2019
from nvflare.app_opt.pt.file_model_persistor import PTFileModelPersistor
20+
from nvflare.job_config.script_runner import ScriptRunner
2121

2222
if __name__ == "__main__":
2323
n_clients = 2
@@ -29,7 +29,7 @@
2929
job.add_cyclic(
3030
server_config=CyclicServerConfig(num_rounds=num_rounds, max_status_report_interval=300),
3131
client_config=CyclicClientConfig(
32-
executor=ScriptExecutor(task_script_path=train_script),
32+
executor=ScriptRunner(script=train_script),
3333
persistor=PTFileModelPersistor(model=Net()),
3434
shareable_generator=SimpleModelShareableGenerator(),
3535
),

examples/advanced/job_api/pt/fedavg_script_executor_cifar10.py examples/advanced/job_api/pt/fedavg_script_runner_cifar10.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414

1515
from src.net import Net
1616

17-
from nvflare.app_common.executors.script_executor import ScriptExecutor
1817
from nvflare.app_common.workflows.fedavg import FedAvg
1918
from nvflare.app_opt.pt.job_config.model import PTModel
2019

2120
# from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob
2221
from nvflare.job_config.api import FedJob
22+
from nvflare.job_config.script_runner import ScriptRunner
2323

2424
if __name__ == "__main__":
2525
n_clients = 2
@@ -44,8 +44,8 @@
4444

4545
# Add clients
4646
for i in range(n_clients):
47-
executor = ScriptExecutor(
48-
task_script_path=train_script, task_script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}"
47+
executor = ScriptRunner(
48+
script=train_script, script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}"
4949
)
5050
job.to(executor, target=f"site-{i}")
5151
# job.to_clients(executor)

examples/advanced/job_api/pt/fedavg_script_executor_dp_filter_cifar10.py examples/advanced/job_api/pt/fedavg_script_runner_dp_filter_cifar10.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
from src.net import Net
1616

1717
from nvflare import FilterType
18-
from nvflare.app_common.executors.script_executor import ScriptExecutor
1918
from nvflare.app_common.filters.percentile_privacy import PercentilePrivacy
2019
from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob
20+
from nvflare.job_config.script_runner import ScriptRunner
2121

2222
if __name__ == "__main__":
2323
n_clients = 2
@@ -27,7 +27,7 @@
2727
job = FedAvgJob(name="cifar10_fedavg_privacy", num_rounds=num_rounds, n_clients=n_clients, initial_model=Net())
2828

2929
for i in range(n_clients):
30-
executor = ScriptExecutor(task_script_path=train_script, task_script_args="")
30+
executor = ScriptRunner(script=train_script, script_args="")
3131
job.to(executor, f"site-{i}", tasks=["train"])
3232

3333
# add privacy filter.

examples/advanced/job_api/pt/fedavg_script_executor_lightning_cifar10.py examples/advanced/job_api/pt/fedavg_script_runner_lightning_cifar10.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414

1515
from src.lit_net import LitNet
1616

17-
from nvflare.app_common.executors.script_executor import ScriptExecutor
1817
from nvflare.app_opt.pt.job_config.fed_avg import FedAvgJob
18+
from nvflare.job_config.script_runner import ScriptRunner
1919

2020
if __name__ == "__main__":
2121
n_clients = 2
@@ -26,8 +26,8 @@
2626

2727
# Add clients
2828
for i in range(n_clients):
29-
executor = ScriptExecutor(
30-
task_script_path=train_script, task_script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}"
29+
executor = ScriptRunner(
30+
script=train_script, script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}"
3131
)
3232
job.to(executor, f"site-{i}")
3333

examples/advanced/job_api/pt/swarm_script_executor_cifar10.py examples/advanced/job_api/pt/swarm_script_runner_cifar10.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
from nvflare.app_common.aggregators.intime_accumulate_model_aggregator import InTimeAccumulateWeightedAggregator
1919
from nvflare.app_common.ccwf.ccwf_job import CCWFJob, CrossSiteEvalConfig, SwarmClientConfig, SwarmServerConfig
2020
from nvflare.app_common.ccwf.comps.simple_model_shareable_generator import SimpleModelShareableGenerator
21-
from nvflare.app_common.executors.script_executor import ScriptExecutor
2221
from nvflare.app_opt.pt.file_model_persistor import PTFileModelPersistor
22+
from nvflare.job_config.script_runner import ScriptRunner
2323

2424
if __name__ == "__main__":
2525
n_clients = 2
@@ -31,7 +31,7 @@
3131
job.add_swarm(
3232
server_config=SwarmServerConfig(num_rounds=num_rounds),
3333
client_config=SwarmClientConfig(
34-
executor=ScriptExecutor(task_script_path=train_script, evaluate_task_name="validate"),
34+
executor=ScriptRunner(script=train_script, evaluate_task_name="validate"),
3535
aggregator=aggregator,
3636
persistor=PTFileModelPersistor(model=Net()),
3737
shareable_generator=SimpleModelShareableGenerator(),

examples/advanced/job_api/sklearn/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ You can also run any of the below scripts directly using
1515
```commandline
1616
python "script_name.py"
1717
```
18-
### 1. [Federated K-Means Clustering](./kmeans_script_executor_higgs.py)
18+
### 1. [Federated K-Means Clustering](./kmeans_script_runner_higgs.py)
1919
Implementation of [K-Means](https://arxiv.org/abs/1602.05629). For more details see this [example](../../../advanced/sklearn-kmeans/README.md)
2020
```commandline
21-
python kmeans_script_executor_higgs.py
21+
python kmeans_script_runner_higgs.py
2222
```
2323

2424
> [!NOTE]

examples/advanced/job_api/sklearn/kmeans_script_executor_higgs.py examples/advanced/job_api/sklearn/kmeans_script_runner_higgs.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@
2020

2121
from nvflare import FedJob
2222
from nvflare.app_common.aggregators.collect_and_assemble_aggregator import CollectAndAssembleAggregator
23-
from nvflare.app_common.executors.script_executor import ScriptExecutor
2423
from nvflare.app_common.shareablegenerators.full_model_shareable_generator import FullModelShareableGenerator
2524
from nvflare.app_common.workflows.scatter_and_gather import ScatterAndGather
2625
from nvflare.app_opt.sklearn.joblib_model_param_persistor import JoblibModelParamPersistor
2726
from nvflare.client.config import ExchangeFormat
27+
from nvflare.job_config.script_runner import ScriptRunner
2828

2929
preprocess = True # if False, assume data is already preprocessed and split
3030

@@ -137,9 +137,9 @@ def split_higgs(input_data_path, input_header_path, output_dir, site_num, sample
137137

138138
# Add clients
139139
for i in range(n_clients):
140-
executor = ScriptExecutor(
141-
task_script_path=train_script,
142-
task_script_args=f"--data_root_dir {data_output_dir}",
140+
executor = ScriptRunner(
141+
script=train_script,
142+
script_args=f"--data_root_dir {data_output_dir}",
143143
params_exchange_format=ExchangeFormat.RAW, # kmeans requires raw values only rather than PyTorch Tensors (the default)
144144
)
145145
job.to(executor, f"site-{i+1}") # HIGGs data splitter assumes site names start from 1

examples/advanced/job_api/tf/README.md

+9-9
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ In this example, the latest Client APIs were used to implement
1919
client-side training logic (details in file
2020
[`cifar10_tf_fl_alpha_split.py`](src/cifar10_tf_fl_alpha_split.py)),
2121
and the new
22-
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/fed_job.py#L106)
22+
[`FedJob`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/job_config/api.py)
2323
APIs were used to programmatically set up an
2424
`nvflare` job to be exported or run by the simulator (details in file
25-
[`tf_fl_script_executor_cifar10.py`](tf_fl_script_executor_cifar10.py)),
25+
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)),
2626
removing the need to write job config files and simplifying the
2727
development process.
2828

@@ -41,7 +41,7 @@ pip install -r ./requirements.txt
4141
## 2. Run experiments
4242

4343
This example uses the simulator to run all experiments. The script
44-
[`tf_fl_script_executor_cifar10.py`](tf_fl_script_executor_cifar10.py)
44+
[`tf_fl_script_runner_cifar10.py`](tf_fl_script_runner_cifar10.py)
4545
is the main script to be used to launch different experiments with
4646
different arguments (see sections below for details). A script
4747
[`run_jobs.sh`](run_jobs.sh) is also provided to run all experiments
@@ -55,7 +55,7 @@ any experiment, and you can use `Tensorboard` to visualize the
5555
training and validation process as the experiment runs. Data split
5656
files, summary logs and results will be saved in a workspace
5757
directory, which defaults to `/tmp` and can be configured by setting
58-
`--workspace` argument of the `tf_fl_script_executor_cifar10.py`
58+
`--workspace` argument of the `tf_fl_script_runner_cifar10.py`
5959
script.
6060

6161
> [!WARNING]
@@ -82,7 +82,7 @@ To simulate a centralized training baseline, we run FedAvg algorithm
8282
with 1 client for 25 rounds, where each round consists of one single epoch.
8383

8484
```
85-
python ./tf_fl_script_executor_cifar10.py \
85+
python ./tf_fl_script_runner_cifar10.py \
8686
--algo centralized \
8787
--n_clients 1 \
8888
--num_rounds 25 \
@@ -101,7 +101,7 @@ in the centralized baseline above (50*4 divided by 8 clients is 25):
101101
```
102102
for alpha in 1.0 0.5 0.3 0.1; do
103103
104-
python ./tf_fl_script_executor_cifar10.py \
104+
python ./tf_fl_script_runner_cifar10.py \
105105
--algo fedavg \
106106
--n_clients 8 \
107107
--num_rounds 50 \
@@ -120,7 +120,7 @@ Next, let's try some different FL algorithms on a more heterogeneous split:
120120
side to update the global model from client-side gradients. Here we
121121
use SGD with momentum and cosine learning rate decay:
122122
```
123-
python ./tf_fl_script_executor_cifar10.py \
123+
python ./tf_fl_script_runner_cifar10.py \
124124
--algo fedopt \
125125
--n_clients 8 \
126126
--num_rounds 50 \
@@ -130,7 +130,7 @@ python ./tf_fl_script_executor_cifar10.py \
130130
```
131131
[FedProx](https://arxiv.org/abs/1812.06127) adds a regularizer to the loss:
132132
```
133-
python ./tf_fl_script_executor_cifar10.py \
133+
python ./tf_fl_script_runner_cifar10.py \
134134
--algo fedprox \
135135
--n_clients 8 \
136136
--num_rounds 50 \
@@ -145,7 +145,7 @@ during local training following the
145145
described in [Li et al.](https://arxiv.org/abs/2102.02079)
146146

147147
```
148-
python ./tf_fl_script_executor_cifar10.py \
148+
python ./tf_fl_script_runner_cifar10.py \
149149
--algo scaffold \
150150
--n_clients 8 \
151151
--num_rounds 50 \

examples/advanced/job_api/tf/tf_fl_script_executor_cifar10.py examples/advanced/job_api/tf/tf_fl_script_runner_cifar10.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
from src.tf_net import ModerateTFNet
2222

2323
from nvflare import FedJob
24-
from nvflare.app_common.executors.script_executor import ScriptExecutor
2524
from nvflare.app_opt.tf.job_config.model import TFModel
25+
from nvflare.job_config.script_runner import ScriptRunner
2626

2727
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
2828
for device in gpu_devices:
@@ -156,7 +156,7 @@
156156
# Add clients
157157
for i, train_idx_path in enumerate(train_idx_paths):
158158
curr_task_script_args = task_script_args + f" --train_idx_path {train_idx_path}"
159-
executor = ScriptExecutor(task_script_path=train_script, task_script_args=curr_task_script_args)
159+
executor = ScriptRunner(script=train_script, script_args=curr_task_script_args)
160160
job.to(executor, f"site-{i+1}")
161161

162162
# Can export current job to folder.

examples/getting_started/pt/README.md

-62
Original file line numberDiff line numberDiff line change
@@ -4,65 +4,3 @@
44
We provide several examples to quickly get you started using NVFlare's Job API.
55
All examples in this folder are based on using [PyTorch](https://pytorch.org/) as the model training framework.
66
Furthermore, we support [PyTorch Lightning](https://lightning.ai).
7-
8-
## Setup environment
9-
First, install nvflare and dependencies:
10-
```commandline
11-
pip install -r requirements.txt
12-
```
13-
14-
## Tutorials
15-
A good starting point for understanding the Job API scripts and NVFlare components are the following tutorials.
16-
### 1. [Federated averaging using script executor](./nvflare_pt_getting_started.ipynb)
17-
Tutorial on [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html).
18-
19-
### 2. [Federated averaging using script executor with Lightning API](./nvflare_lightning_getting_started.ipynb)
20-
Tutorial on [FedAvg](https://arxiv.org/abs/1602.05629) using the [Lightning Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html#id4)
21-
22-
## Examples
23-
You can also run any of the below scripts directly using
24-
```commandline
25-
python "script_name.py"
26-
```
27-
### 1. [Federated averaging using script executor](./fedavg_script_executor_cifar10.py)
28-
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html).
29-
```commandline
30-
python fedavg_script_executor_cifar10.py
31-
```
32-
### 2. [Federated averaging using script executor with Lightning API](./fedavg_script_executor_lightning_cifar10.py)
33-
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Lightning Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html#id4)
34-
```commandline
35-
python fedavg_script_executor_lightning_cifar10.py
36-
```
37-
### 3. [Federated averaging using the script executor for all clients](./fedavg_script_executor_cifar10_all.py)
38-
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html).
39-
Here, we deploy the same configuration to all clients.
40-
```commandline
41-
python fedavg_script_executor_cifar10_all.py
42-
```
43-
### 4. [Federated averaging using script executor and differential privacy filter](./fedavg_script_executor_dp_filter_cifar10.py)
44-
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
45-
with additional [differential privacy filters](https://arxiv.org/abs/1910.00962) on the client side.
46-
```commandline
47-
python fedavg_script_executor_dp_filter_cifar10.py
48-
```
49-
### 5. [Swarm learning using script executor](./swarm_script_executor_cifar10.py)
50-
Implementation of [swarm learning](https://www.nature.com/articles/s41586-021-03583-3) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
51-
```commandline
52-
python swarm_script_executor_cifar10.py
53-
```
54-
### 6. [Cyclic weight transfer using script executor](./cyclic_cc_script_executor_cifar10.py)
55-
Implementation of [cyclic weight transfer](https://arxiv.org/abs/1709.05929) using the [Client API](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html)
56-
```commandline
57-
python cyclic_cc_script_executor_cifar10.py
58-
```
59-
### 7. [Federated averaging using model learning](./fedavg_model_learner_xsite_val_cifar10.py)
60-
Implementation of [FedAvg](https://arxiv.org/abs/1602.05629) using the [model learner class](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/model_learner.html),
61-
followed by [cross site validation](https://nvflare.readthedocs.io/en/main/programming_guide/controllers/cross_site_model_evaluation.html)
62-
for federated model evaluation.
63-
```commandline
64-
python fedavg_model_learner_xsite_val_cifar10.py
65-
```
66-
67-
> [!NOTE]
68-
> More examples can be found at https://nvidia.github.io/NVFlare.

examples/getting_started/pt/nvflare_lightning_getting_started.ipynb

+3-3
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@
330330
"outputs": [],
331331
"source": [
332332
"from nvflare import FedJob\n",
333-
"from nvflare.app_common.executors.script_executor import ScriptExecutor\n",
333+
"from nvflare.job_config.script_runner import ScriptRunner\n",
334334
"from nvflare.app_common.workflows.fedavg import FedAvg\n",
335335
"\n",
336336
"job = FedJob(name=\"cifar10_fedavg_lightning\")"
@@ -411,8 +411,8 @@
411411
"outputs": [],
412412
"source": [
413413
"for i in range(n_clients):\n",
414-
" executor = ScriptExecutor(\n",
415-
" task_script_path=\"src/cifar10_lightning_fl.py\", task_script_args=\"\" # f\"--batch_size 32 --data_path /tmp/data/site-{i}\"\n",
414+
" executor = ScriptRunner(\n",
415+
" script=\"src/cifar10_lightning_fl.py\", script_args=\"\" # f\"--batch_size 32 --data_path /tmp/data/site-{i}\"\n",
416416
" )\n",
417417
" job.to(executor, f\"site-{i+1}\")"
418418
]

0 commit comments

Comments
 (0)