train_batch.py — forked from wpeebles/G.pt (45 lines, 1.32 KB)
import argparse
import atexit
import subprocess
import time
import os
import copy
import torch
if __name__ == "__main__":
    """Supervisor script: keeps `runs_per_gpu` copies of a training command
    running on every visible CUDA GPU, relaunching any worker that exits."""
    parser = argparse.ArgumentParser(description="Batch training")
    parser.add_argument("--runs-per-gpu", type=int, default=1)
    parser.add_argument("--cmd", type=str, default='train_mnist.py')
    args = parser.parse_args()

    num_gpus = torch.cuda.device_count()
    print(f'Found {num_gpus} GPUs')

    # One slot per (gpu, run); None means no worker has been launched yet.
    processes = [[None for _ in range(args.runs_per_gpu)] for _ in range(num_gpus)]

    # Pre-build one environment per GPU so each worker only sees its own device.
    env = os.environ.copy()
    envs = []
    for gpu_ix in range(num_gpus):
        env_copy = copy.deepcopy(env)
        env_copy["CUDA_VISIBLE_DEVICES"] = str(gpu_ix)
        envs.append(env_copy)

    def terminate_processes():
        # Best-effort cleanup: terminate every live child when the supervisor exits.
        for gpu in range(num_gpus):
            for process in processes[gpu]:
                if process is not None:
                    process.terminate()

    atexit.register(terminate_processes)

    # Supervisor loop: (re)launch a worker for any slot that is empty or whose
    # process has exited (poll() returns the exit code once the child is done).
    while True:
        for gpu in range(num_gpus):
            for run in range(args.runs_per_gpu):
                process = processes[gpu][run]
                if process is None or process.poll() is not None:
                    process = subprocess.Popen(["python", args.cmd], env=envs[gpu])
                    processes[gpu][run] = process
                    # NOTE(review): original indentation was lost in extraction;
                    # this stagger between launches is the most plausible placement.
                    time.sleep(5)
        time.sleep(60)