Introduction to Using Deep Learning Frameworks

The Python runtime environment for each deep learning framework is loaded as follows:

//Load the TensorFlow runtime environment

source /public/software/apps/DeepLearning/TensorFlow/tensorflow-env.sh

//Load the PyTorch runtime environment

source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh

//Load the MXNet runtime environment

source /public/software/apps/DeepLearning/MxNet/mxnet-env.sh
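After sourcing a script, you can confirm the environment loaded correctly with a quick import check (a sketch; assumes each script puts a framework-enabled python3 on PATH):

python3 -c "import tensorflow as tf; print(tf.__version__)"
python3 -c "import torch; print(torch.__version__)"
python3 -c "import mxnet as mx; print(mx.__version__)"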

Single-node and distributed run examples for each of the three frameworks are given below:

1. TensorFlow:

//Download the test program:
https://github.com/horovod/horovod/tree/master/examples/tensorflow/tensorflow_synthetic_benchmark.py
or
https://github.com/horovod/horovod/tree/master/examples/tensorflow2/tensorflow_synthetic_benchmark.py

//Single node, single card
python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500

//Single node, multiple cards
cat single_process.sh

#!/bin/bash

lrank=$OMPI_COMM_WORLD_LOCAL_RANK

APP="python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac

mpirun -np 4 ./single_process.sh
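mpirun launches the script directly, so it must be made executable first:

chmod +x single_process.sh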

//Multiple nodes, multiple cards, used with the job scheduler
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 16
#SBATCH -J xuan-tf
#SBATCH -n 512
#SBATCH --gres=dcu:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
    echo ${i} slots=4 >> ./hostfile-dl-$SLURM_JOB_ID
    ((num_node=${num_node}+1))
done
echo "resnet50 node is " ${num_node}
((num_DCU=${num_node}*4))

mpirun -np ${num_DCU} -hostfile ./hostfile-dl-$SLURM_JOB_ID ./single_process.sh
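To run the batch script above, save it and submit it through Slurm; a minimal sketch, assuming it is saved as tf-multi.sh:

sbatch tf-multi.sh
squeue -u $USER    //monitor the job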

2. PyTorch:

//Download the test program
https://github.com/pytorch/examples/tree/master/imagenet/main.py
/*usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
               [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e]
               [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
               [--dist-url DIST_URL] [--dist-backend DIST_BACKEND]
               [--seed SEED] [--gpu GPU] [--multiprocessing-distributed]
               DIR
*/
//Single node, single card
python3 main.py \
    --batch-size=32 \
    --arch=resnet50 \
    --workers 6 \
    --epochs=1 \
    --gpu=0 \
    /imagenet/

//Single node, multiple cards (method 1)
python3 main.py \
    --batch-size=128 \
    --arch=resnet50 \
    --workers 24 \
    --epochs=1 \
    /imagenet/
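With --gpu omitted and no distributed flags, this main.py wraps the model in torch.nn.DataParallel and uses every visible device; to restrict which cards it sees, limit visibility first (a sketch, reusing the HIP_VISIBLE_DEVICES variable from the scripts above):

export HIP_VISIBLE_DEVICES=0,1
python3 main.py --batch-size=128 --arch=resnet50 --workers 24 --epochs=1 /imagenet/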

//Single node, multiple cards (method 2)
cat single_process.sh

#!/bin/bash

export GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3
export MIOPEN_USER_DB_PATH=/tmp/pytorch-miopen-2.8
export HSA_USERPTR_FOR_PAGED_MEM=0


lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

APP="python3 main.py --batch-size=32 --arch=resnet50 -j 6 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend gloo --world-size=${comm_size} --rank=${comm_rank} /imagenet/"
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=1
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=2
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac
//Single node, four cards
mpirun -np 4 `pwd`/single_process.sh $dist_url
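Here $dist_url must be an address the Gloo rendezvous can reach; for a single-node run the node's own address suffices. A minimal sketch, assuming hostname -i returns the node's primary IP:

dist_url=$(hostname -i | awk '{print $1}')
mpirun -np 4 `pwd`/single_process.sh $dist_url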

//Multiple nodes, multiple cards
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 2
#SBATCH -J xuan-pytorch
#SBATCH -n 64
which mpirun

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile|sort|uniq |wc -l)

num_DCU=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`

rm -f `pwd`/hostfile-xuan
cat $hostfile|sort|uniq >`pwd`/tmp

for i in `cat ./tmp`
do
    echo ${i} slots=4 >> `pwd`/hostfile-xuan
done

mpirun -np ${num_DCU} --allow-run-as-root -hostfile `pwd`/hostfile-xuan `pwd`/single_process.sh $dist_url

3. MXNet:

//Download the test program
https://github.com/apache/incubator-mxnet/tree/v1.4.x/example/image-classification

//Single node, single card
python3 train_imagenet.py \
     --benchmark 1 \
     --gpus 0 \
     --network inception-v3 \
     --batch-size 64 \
     --image-shape 3,299,299 \
     --num-epochs 10 \
     --kv-store device

//Single node, multiple cards
python3 train_imagenet.py \
     --benchmark 1 \
     --gpus 0,1,2,3 \
     --network inception-v3 \
     --batch-size 64 \
     --image-shape 3,299,299 \
     --num-epochs 10 \
     --kv-store device
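--kv-store device aggregates gradients directly on the cards; for comparison, host-side aggregation uses the local kv-store (same script, only the flag changes):

python3 train_imagenet.py --benchmark 1 --gpus 0,1,2,3 --network inception-v3 --batch-size 64 --image-shape 3,299,299 --num-epochs 10 --kv-store local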

//Multiple nodes, multiple cards, ps-worker mode
#!/bin/bash
#SBATCH -J mxnet
#SBATCH -p dl
#SBATCH -N 2
#SBATCH -n 64
#SBATCH --gres=dcu:4

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
#    echo ${i} slots=4 >> ./mxnet-$SLURM_JOB_ID
    gethostip ${i} | awk '{print $2}' >>./mxnet-$SLURM_JOB_ID
    ((num_node=${num_node}+1))
done

source /public/home/yangxuan1/mxnet-env.sh
which python3

python3 ../tools/launch.py \
     -n ${num_node} -s 2 -H mxnet-$SLURM_JOB_ID \
     --sync-dst-dir ../example/distributed_training/ \
     --launcher ssh \
     "source /public/home/yangxuan1/mxnet-env.sh; python3 cifar10_dist.py \
     --network resnet \
     --num-layers 110 \
     --batch-size 128 \
     --kv-store dist_device_sync"

/* Example contents of the generated host list (cat ./mxnet-$SLURM_JOB_ID):
10.11.7.51
10.11.7.53 */
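launch.py with --launcher ssh requires passwordless SSH from the launch node to every host in the list; a quick pre-flight check (a sketch):

for h in `cat ./mxnet-$SLURM_JOB_ID`; do ssh -o BatchMode=yes $h hostname; done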
