#!/usr/bin/env bash
#
# Multi-node launcher: changes into the LLaVA repository and starts a
# distributed job through either torchrun or deepspeed.
#
# Usage:
#   <this script> <launcher> "<command>"
#     launcher  "torchrun" or "deepspeed"
#     command   entry point plus its arguments, passed as ONE string; it is
#               split on whitespace before being handed to the launcher
#
# Environment read by this script:
#   ARNOLD_WORKER_NUM    number of workers (nodes)
#   ARNOLD_ID            id of the current worker (used as torchrun node rank)
#   ARNOLD_WORKER_GPU    GPUs per worker
#   METIS_WORKER_0_HOST  master address
#   METIS_WORKER_0_PORT  port list separated by commas; the first entry is used
#   LLAVA_REPO_DIR       optional; overrides the default repository directory
#
# Exit status: that of the launcher, or 1 on a usage/environment error.

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Abort unless every named environment variable is set and non-empty.
require_env() {
  local name
  for name in "$@"; do
    [[ -n "${!name:-}" ]] || die "required environment variable ${name} is not set"
  done
}

repo_dir="${LLAVA_REPO_DIR:-/mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA}"
# Never launch from the wrong directory: stop if the repository is missing.
cd "$repo_dir" || die "cannot cd to ${repo_dir}"
echo "$PWD"

# Only the first port of the list is used as the master port.
IFS=', ' read -r -a ports <<< "${METIS_WORKER_0_PORT:-}"
port="${ports[0]:-}"

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

# Optional NCCL / OpenMP tuning knobs (disabled by default):
#export OMP_NUM_THREADS=8
#export NCCL_IB_DISABLE=0
#export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
#export NCCL_SOCKET_IFNAME=eth0
# export NCCL_DEBUG=INFO

launcher="${1:-}"
cmd="${2:-}"
printf '%s\n' "$launcher"
printf '%s\n' "$cmd"

[[ -n "$cmd" ]] || die "usage: ${0##*/} <torchrun|deepspeed> \"<command>\""

case "$launcher" in
  torchrun)
    require_env ARNOLD_WORKER_NUM ARNOLD_ID ARNOLD_WORKER_GPU \
      METIS_WORKER_0_HOST METIS_WORKER_0_PORT
    # $cmd is intentionally left unquoted: the caller passes the whole
    # command line as a single string and it must be split into words here.
    # shellcheck disable=SC2086
    torchrun \
      --nnodes "$ARNOLD_WORKER_NUM" \
      --node_rank "$ARNOLD_ID" \
      --nproc_per_node "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  deepspeed)
    require_env ARNOLD_WORKER_NUM ARNOLD_WORKER_GPU \
      METIS_WORKER_0_HOST METIS_WORKER_0_PORT
    # NOTE(review): unlike the torchrun branch, no node rank is passed here;
    # confirm the deepspeed launcher obtains it some other way.
    # $cmd is intentionally left unquoted (see the torchrun branch).
    # shellcheck disable=SC2086
    deepspeed \
      --num_nodes "$ARNOLD_WORKER_NUM" \
      --num_gpus "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  *)
    # Previously an unknown launcher was a silent no-op; fail loudly instead.
    die "unsupported launcher '${launcher}' (expected 'torchrun' or 'deepspeed')"
    ;;
esac
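# --- Disabled legacy variants, kept for reference only ---
# Unconditional torchrun launch (superseded by the launcher selection above):
#torchrun \
#--nnodes $ARNOLD_WORKER_NUM \
#--node_rank $ARNOLD_ID \
#--nproc_per_node $ARNOLD_WORKER_GPU \
#--master_addr $METIS_WORKER_0_HOST \
#--master_port $port \
#$cmd
# Echo, then execute, the joined positional arguments:
#for i in "$*"; do
# echo $i
# $i
#done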