model1 / run.sh
multitensor's picture
Upload folder using huggingface_hub
bbfa6f6 verified
#!/bin/bash
# Switch into the LLaVA checkout before doing anything else, so that any
# relative path used later in this script resolves against it.
repo_dir=/mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA
# Abort instead of continuing from the caller's directory when the checkout
# is unavailable (e.g. the network mount is missing on this worker).
cd "$repo_dir" || {
  echo "run.sh: cannot change directory to $repo_dir" >&2
  exit 1
}
echo "$PWD"
# METIS_WORKER_0_PORT is treated as a comma-separated list; its first entry
# becomes the master port. Splitting with the `read` builtin avoids the
# subshell pipeline and keeps the value safe from pathname expansion.
IFS=',' read -r -a ports <<< "${METIS_WORKER_0_PORT:-}"
port=${ports[0]:-}

# Log the cluster layout taken from the environment for easier debugging.
printf '%s\n' \
  "total workers: ${ARNOLD_WORKER_NUM}" \
  "cur worker id: ${ARNOLD_ID}" \
  "gpus per worker: ${ARNOLD_WORKER_GPU}" \
  "master ip: ${METIS_WORKER_0_HOST}" \
  "master port: ${port}"
#export OMP_NUM_THREADS=8
#export NCCL_IB_DISABLE=0
#export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
#export NCCL_SOCKET_IFNAME=eth0
# export NCCL_DEBUG=INFO
# Positional arguments:
#   $1 - launcher to use: "torchrun" or "deepspeed"
#   $2 - command line (script plus its arguments) handed to the launcher
env="$1"
cmd="$2"
printf '%s\n' "$env"
printf '%s\n' "$cmd"

# Dispatch on the requested launcher. Cluster values are quoted so that an
# empty variable reaches the launcher as an (invalid) empty value instead of
# vanishing and letting the preceding flag swallow the next token.
# $cmd is deliberately left unquoted: it is a single string that must be
# word-split into the script path and its individual arguments.
case "$env" in
  torchrun)
    # shellcheck disable=SC2086
    torchrun \
      --nnodes "$ARNOLD_WORKER_NUM" \
      --node_rank "$ARNOLD_ID" \
      --nproc_per_node "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  deepspeed)
    # shellcheck disable=SC2086
    deepspeed \
      --num_nodes "$ARNOLD_WORKER_NUM" \
      --num_gpus "$ARNOLD_WORKER_GPU" \
      --master_addr "$METIS_WORKER_0_HOST" \
      --master_port "$port" \
      $cmd
    ;;
  *)
    # A missing or misspelled launcher used to fall through silently with
    # exit status 0; report it so the job is marked as failed.
    echo "run.sh: unknown launcher '$env' (expected 'torchrun' or 'deepspeed')" >&2
    echo "usage: run.sh <torchrun|deepspeed> \"<command>\"" >&2
    exit 2
    ;;
esac
#torchrun \
#--nnodes $ARNOLD_WORKER_NUM \
#--node_rank $ARNOLD_ID \
#--nproc_per_node $ARNOLD_WORKER_GPU \
#--master_addr $METIS_WORKER_0_HOST \
#--master_port $port \
#$cmd
#for i in "$*"; do
# echo $i
# $i
#done