-
Notifications
You must be signed in to change notification settings - Fork 237
/
Copy pathmultinode_training_demo.sh
145 lines (127 loc) · 4.3 KB
/
multinode_training_demo.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/bin/bash
RUN_NAME=multinode_training # assume there is a ${RUN_NAME}_args.yaml file in the current directory
declare -A node2ip_map
node2ip_map=(
["node1"]="192.168.1.101"
["node2"]="192.168.1.102"
["node3"]="192.168.1.103"
["node4"]="192.168.1.104"
)
# Default nodes if no arguments provided
DEFAULT_NODES=("node1" "node2")
# Local codebase path in file system
LOCAL_CODEBASE_PATH="/path/to/your/codebase"
# Use provided nodes or default nodes
if [ "$#" -ge 1 ]; then
NODES=("$@")
else
NODES=("${DEFAULT_NODES[@]}")
echo "Using default nodes: ${NODES[*]}"
fi
# Add this debug line
echo "All nodes in order: ${NODES[@]}"
TOTAL_NODES=${#NODES[@]}
MASTER_NODE=${NODES[0]}
MASTER_PORT=12345
# Get project root directory (using the directory where this script is located)
PROJECT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo "Project root directory: $PROJECT_ROOT"
# Get master node IP address
echo "MASTER_NODE: $MASTER_NODE"
MASTER_IP="${node2ip_map[$MASTER_NODE]}"
echo "Master node IP: $MASTER_IP"
# Create log directory for each node
LOG_DIR="path/to/your/log/dir"
mkdir -p $LOG_DIR
# Generate docker-compose.yml
echo "Generating docker-compose.yml..."
cat > docker-compose.yml << EOL
version: '3.8'
services:
trainer:
image: your/training-image:tag
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
shm_size: '8gb'
volumes:
- /path/to/data:/data
- $LOCAL_CODEBASE_PATH/src:/workspace/src
environment:
- MASTER_ADDR=\${MASTER_ADDR:-$MASTER_IP}
- MASTER_PORT=\${MASTER_PORT:-12345}
- NODE_RANK=\${NODE_RANK:-0}
- WORLD_SIZE=\${WORLD_SIZE:-4}
- DEBUG_MODE=true
- LOG_PATH=${LOG_DIR}/debug_log.txt
- WANDB_API_KEY=your_wandb_api_key # Optional: for logging with weights & biases
- WANDB_PROJECT=your_project_name
- WANDB_RUN_NAME=${RUN_NAME}-$(date +%Y-%m-%d-%H-%M-%S)
- PYTHONPATH=/workspace/src
network_mode: "host"
command: /bin/bash
working_dir: /workspace
EOL
# Function to build training arguments from yaml
build_train_args() {
args=""
while IFS=": " read -r key value; do
[[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
value=$(echo "$value" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/^"//' -e 's/"$//')
if [[ "$value" == "true" ]]; then
args="$args --$key"
elif [[ "$value" == "false" ]]; then
continue
else
args="$args --$key $value"
fi
done < ${RUN_NAME}_args.yaml
echo "$args"
}
# Get training arguments
TRAIN_ARGS=$(build_train_args)
echo "TRAIN_ARGS: $TRAIN_ARGS"
# Launch containers on each node
NODE_RANK=0
for host in "${NODES[@]}"; do
LOG_FILE="$LOG_DIR/${host}_rank${NODE_RANK}.log"
if [ "$host" = "$MASTER_NODE" ]; then
echo "Launching on master $host with rank $NODE_RANK, logging to $LOG_FILE"
ssh $host "cd $PROJECT_ROOT && \
MASTER_ADDR=$MASTER_IP \
NODE_RANK=$NODE_RANK \
WORLD_SIZE=$TOTAL_NODES \
sudo -E docker-compose -f docker-compose.yml run --rm trainer \
torchrun --nproc_per_node=8 \
--nnodes=$TOTAL_NODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_IP \
--master_port=$MASTER_PORT \
src/train.py \
$TRAIN_ARGS" > "$LOG_FILE" 2>&1 &
else
echo "Launching on $host with rank $NODE_RANK, logging to $LOG_FILE"
ssh $host "cd $PROJECT_ROOT && \
MASTER_ADDR=$MASTER_IP \
NODE_RANK=$NODE_RANK \
WORLD_SIZE=$TOTAL_NODES \
sudo -E docker-compose -f docker-compose.yml run --rm trainer \
torchrun --nproc_per_node=8 \
--nnodes=$TOTAL_NODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_IP \
--master_port=$MASTER_PORT \
src/train.py \
$TRAIN_ARGS" > "$LOG_FILE" 2>&1 &
fi
NODE_RANK=$((NODE_RANK + 1))
done
echo "Jobs launched. To monitor the logs, you can:"
echo "1. Use 'tail -f $LOG_DIR/*.log' to watch all logs"
echo "2. Use 'tail -f $LOG_DIR/<node_name>_rank<N>.log' to watch a specific node"
# Wait for all background processes to complete
wait