Introduction to Using the Deep Learning Frameworks
The Python runtime environment for each deep learning framework is loaded as follows:
//Load the TensorFlow runtime environment
source /public/software/apps/DeepLearning/TensorFlow/tensorflow-env.sh
//Load the PyTorch runtime environment
source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
//Load the MxNet runtime environment
source /public/software/apps/DeepLearning/MxNet/mxnet-env.sh
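After sourcing one of these scripts you can confirm the framework is importable before submitting any jobs, for example:
python3 -c "import tensorflow as tf; print(tf.__version__)"
python3 -c "import torch; print(torch.__version__)"
python3 -c "import mxnet as mx; print(mx.__version__)"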
Single-node and distributed examples for each of the three frameworks are given below:
1. TensorFlow:
//Download the benchmark script:
https://github.com/horovod/horovod/tree/master/examples/tensorflow/tensorflow_synthetic_benchmark.py
or
https://github.com/horovod/horovod/tree/master/examples/tensorflow2/tensorflow_synthetic_benchmark.py
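These links point at the GitHub tree view; the raw script itself can be fetched with wget (a sketch, assuming the file still lives at this path on master):
wget https://raw.githubusercontent.com/horovod/horovod/master/examples/tensorflow/tensorflow_synthetic_benchmark.py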
//Single node, single card
python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500
//Single node, multiple cards
cat single_process.sh
#!/bin/bash
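# one MPI process per DCU: all four cards stay visible to every rank because
# the Horovod benchmark selects its device by hvd.local_rank(); the OpenMPI
# local rank below also chooses the IB NIC and NUMA binding for each process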
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
APP="python3 tensorflow_synthetic_benchmark.py --model=resnet50 --batch-size=128 --num-iters=500"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
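//Make the script executable, then launch one process per card:
chmod +x single_process.sh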
mpirun -np 4 ./single_process.sh
//Multi-node, multi-card, used together with the Slurm scheduler
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 16
#SBATCH -J xuan-tf
#SBATCH -n 512
#SBATCH --gres=dcu:4
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
# one line per node, with 4 MPI slots (one per DCU)
echo ${i} slots=4 >> ./hostfile-dl-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
echo "resnet50 node is " ${num_node}
((num_DCU=${num_node}*4))
mpirun -np ${num_DCU} -hostfile ./hostfile-dl-$SLURM_JOB_ID ./single_process.sh
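Saved as a batch script, the block above is submitted through Slurm in the usual way (the script file name here is hypothetical):
sbatch tf_resnet50.sh
squeue -u $USER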
2. PyTorch:
//Download the benchmark script
https://github.com/pytorch/examples/tree/master/imagenet/main.py
/*usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
[--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e]
[--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
[--dist-url DIST_URL] [--dist-backend DIST_BACKEND]
[--seed SEED] [--gpu GPU] [--multiprocessing-distributed]
DIR
*/
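The positional DIR argument must point at an ImageNet directory laid out for torchvision's ImageFolder: one subdirectory per class under train/ and val/. A sketch of the expected layout (the synset directory name is only an example):
/imagenet/
  train/n01440764/*.JPEG
  val/n01440764/*.JPEG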
//Single node, single card
python3 main.py \
--batch-size=32 \
--arch=resnet50 \
--workers 6 \
--epochs=1 \
--gpu=0 \
/imagenet/
//Single node, multiple cards (option 1): with no --gpu flag, main.py wraps the model in torch.nn.DataParallel across all visible cards
python3 main.py \
--batch-size=128 \
--arch=resnet50 \
--workers 24 \
--epochs=1 \
/imagenet/
//Single node, multiple cards (option 2)
cat single_process.sh
#!/bin/bash
export GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3
export MIOPEN_USER_DB_PATH=/tmp/pytorch-miopen-2.8
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
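# ${1} is the master node address (dist_url) passed in by the launch commands shown below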
APP="python3 main.py --batch-size=32 --a=resnet50 -j 6 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend gloo --world-size=${comm_size} --rank=${comm_rank} /imagenet/"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
GLOO_SOCKET_IFNAME=ib0,ib1,ib2,ib3 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
//Single node, four cards
mpirun -np 4 `pwd`/single_process.sh $dist_url
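$dist_url is not defined anywhere in this snippet; it must hold an address of the current node that every rank can reach. A minimal sketch, assuming `hostname -i` returns the node's primary IP:
dist_url=$(hostname -i | awk '{print $1}')
mpirun -np 4 `pwd`/single_process.sh $dist_url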
//Multi-node, multi-card
#!/bin/bash
#SBATCH -p debug
#SBATCH -N 2
#SBATCH -J xuan-pytorch
#SBATCH -n 64
which mpirun
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_DCU=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
rm `pwd`/hostfile-xuan -f
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-xuan
done
mpirun -np ${num_DCU} --allow-run-as-root -hostfile `pwd`/hostfile-xuan `pwd`/single_process.sh $dist_url
3. MxNet:
//Download the benchmark scripts
https://github.com/apache/incubator-mxnet/tree/v1.4.x/example/image-classification
//Single node, single card
python3 train_imagenet.py \
--benchmark 1 \
--gpus 0 \
--network inception-v3 \
--batch-size 64 \
--image-shape 3,299,299 \
--num-epochs 10 \
--kv-store device
//Single node, multiple cards
python3 train_imagenet.py \
--benchmark 1 \
--gpus 0,1,2,3 \
--network inception-v3 \
--batch-size 64 \
--image-shape 3,299,299 \
--num-epochs 10 \
--kv-store device
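While a run is active, card utilization can be watched from another shell on the same node (a sketch; rocm-smi availability depends on the local ROCm/DTK installation):
watch -n 1 rocm-smi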
//Multi-node, multi-card, parameter-server (ps/worker) mode
#!/bin/bash
#SBATCH -J mxnet
#SBATCH -p dl
#SBATCH -N 2
#SBATCH -n 64
#SBATCH --gres=dcu:4
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat ./${hostfile}`
do
# echo ${i} slots=4 >> ./mxnet-$SLURM_JOB_ID
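# gethostip (from the syslinux package) resolves each hostname to its IP address;
# the resulting file has one host per line, as launch.py expects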
gethostip ${i} | awk '{print $2}' >>./mxnet-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
source /public/home/yangxuan1/mxnet-env.sh
which python3
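# launch.py arguments: -n = number of workers (one per node here),
# -s = number of parameter servers, -H = the host file generated above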
python3 ../tools/launch.py \
-n ${num_node} -s 2 -H mxnet-$SLURM_JOB_ID \
--sync-dst-dir ../example/distributed_training/ \
--launcher ssh \
"source /public/home/yangxuan1/mxnet-env.sh; python3 cifar10_dist.py \
--network resnet \
--num-layers 110 \
--batch-size 128 \
--kv-store dist_device_sync"
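The ssh launcher requires passwordless ssh between the allocated nodes. On a cluster with a shared home directory, a one-time key setup along these lines is usually sufficient (a sketch):
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys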
/* Example contents of the generated host file (cat ./mxnet-$SLURM_JOB_ID):
10.11.7.51
10.11.7.53 */