File size: 1,186 Bytes
bbfa6f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/bash

# Run everything from the LLaVA checkout. Abort if the directory is not
# reachable instead of launching from whatever directory we started in
# (cd already prints its own diagnostic to stderr).
cd /mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA || exit 1

# Log the working directory for the job output.
echo "$PWD"

# METIS_WORKER_0_PORT is read as a comma-separated list; the first entry is
# used as the master port. Split with `read` rather than an unquoted
# `echo | tr` substitution so the value is never glob-expanded and no
# subprocesses are spawned. Spaces are accepted as separators too, matching
# the word-splitting the previous implementation relied on.
IFS=', ' read -r -a ports <<< "${METIS_WORKER_0_PORT}"
port=${ports[0]}

# Log the job topology as seen by this worker.
echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

#export OMP_NUM_THREADS=8
#export NCCL_IB_DISABLE=0
#export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
#export NCCL_SOCKET_IFNAME=eth0
# export NCCL_DEBUG=INFO

# Positional arguments:
#   $1 - launcher to use: "torchrun" or "deepspeed"
#   $2 - the command line to hand to the launcher, passed as ONE string
env="$1"
cmd="$2"
# Log the arguments verbatim (quoted, so no globbing or whitespace collapsing).
printf '%s\n' "$env"
printf '%s\n' "$cmd"

# Dispatch on the requested launcher. `port` is the master port computed
# earlier in this script. In both branches $cmd is left unquoted on purpose:
# it carries a whole command line in a single string and must be word-split
# into separate arguments for the launcher.
case "$env" in
  torchrun)
    # shellcheck disable=SC2086
    torchrun \
      --nnodes "$ARNOLD_WORKER_NUM" \
      --node_rank "$ARNOLD_ID" \
      --nproc_per_node "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  deepspeed)
    # shellcheck disable=SC2086
    deepspeed \
      --num_nodes "$ARNOLD_WORKER_NUM" \
      --num_gpus "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  *)
    # Previously an empty or unknown launcher fell through silently (after a
    # "unary operator expected" error from `[` when $1 was empty) and the
    # script exited 0 without running anything. Fail loudly instead.
    printf 'unsupported launcher "%s": expected "torchrun" or "deepspeed"\n' "$env" >&2
    printf 'usage: %s <torchrun|deepspeed> "<command line>"\n' "${0##*/}" >&2
    exit 2
    ;;
esac

#torchrun \
#--nnodes $ARNOLD_WORKER_NUM \
#--node_rank $ARNOLD_ID \
#--nproc_per_node $ARNOLD_WORKER_GPU \
#--master_addr $METIS_WORKER_0_HOST \
#--master_port $port \
#$cmd


#for i in "$*"; do
#    echo $i
#    $i
#done