diff --git a/MLPerf/training/image_segmentation/pytorch/README.md b/MLPerf/training/image_segmentation/pytorch/README.md
index 7a862bdca79d87d017a74a2914e4b16578cb45ad..e76f0e92ff411849fc25eb13e795b0d36054122f 100644
--- a/MLPerf/training/image_segmentation/pytorch/README.md
+++ b/MLPerf/training/image_segmentation/pytorch/README.md
@@ -5,7 +5,7 @@ The U-Net3D from MLPerf has no version control.
 
 ```
-MODEL_NAME=nnunet_for_pytorch
+MODEL_NAME=U-Net3D
 MODEL_BASE=/proj/nsc_testing/xuan/containers/pytorch_1.7.1-cuda11.0-cudnn8-runtime.sif
 CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}.sif
 DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch//${MODEL_NAME}.def
@@ -44,3 +44,9 @@ apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data
 ```
 bash submit_benchmark_jobs.sh
 ```
+
+### Known issues
+
+#### Issue 1
+
+Line 23 in `main.py` tries to create a file under `/workspace/unet3d` inside the container, which causes a write-permission error. We comment out this line.
\ No newline at end of file
diff --git a/MLPerf/training/image_segmentation/pytorch/generate_benchmark_jobs.sh b/MLPerf/training/image_segmentation/pytorch/generate_benchmark_jobs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cc53456770379b53f6e4991af4176c374006dbec
--- /dev/null
+++ b/MLPerf/training/image_segmentation/pytorch/generate_benchmark_jobs.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+SBATCH_DIR=$WORK_DIR/sbatch_scripts/benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}.sbatch
+SBATCH_OUT_DIR=$WORK_DIR/sbatch_out/benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}.out
+LOG_DIR=benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}_amp.log
+
+cat <<EOT > $SBATCH_DIR
+#!/bin/bash
+
+#SBATCH -A nsc
+#SBATCH --nodes=${2}
+#SBATCH --gpus=${3}
+#SBATCH --time=0-0:20:00
+#SBATCH --output=$SBATCH_OUT_DIR
+
+EOT
+
+if [ "${6}" = "thin" ]; then
+    cat <<EOT >> $SBATCH_DIR
+#SBATCH -C "thin"
+#SBATCH --reservation=$GPU_RESERVATION
+EOT
+else
+    cat <<EOT >> $SBATCH_DIR
+#SBATCH -C "fat"
+EOT
+fi
+
+cat <<EOT >> $SBATCH_DIR
+
+rm -f $WORK_DIR/results/$LOG_DIR
+apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results $CONTAINER_DIR bash -c "cd /workspace/unet3d && bash run_and_time.sh 1"
+
+mv ${WORK_DIR}/results/unet3d.log ${WORK_DIR}/results/$LOG_DIR
+
+EOT
diff --git a/MLPerf/training/image_segmentation/pytorch/submit_benchmark_jobs.sh b/MLPerf/training/image_segmentation/pytorch/submit_benchmark_jobs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..12c3c5a1193235b7be0744f1bb9b5d0559e22edc
--- /dev/null
+++ b/MLPerf/training/image_segmentation/pytorch/submit_benchmark_jobs.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+set -e
+
+export MODEL_NAME=nnunet_for_pytorch
+export WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch
+export CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}.sif
+export GPU_RESERVATION=nodeimage
+mkdir -p $WORK_DIR/sbatch_out $WORK_DIR/sbatch_scripts $WORK_DIR/results
+
+benchmark_modes=("train" "predict")
+# node_types=("thin" "fat")
+node_types=("thin")
+
+dim=2
+for nodes in {1..1}; do
+    for gpus in {1..8}; do
+        for benchmark_mode in "${benchmark_modes[@]}"; do
+            for node_type in "${node_types[@]}"; do
+
+
+                if [ "${node_type}" = "thin" ]; then
+                    batch_size=512
+                else
+                    batch_size=1024
+                fi
+
+                echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, benchmark_mode ${benchmark_mode}, node_type ${node_type}
+
+                # For single node
+                bash $WORK_DIR/generate_benchmark_jobs.sh ${dim} ${nodes} ${gpus} ${batch_size} ${benchmark_mode} ${node_type}
+                SBATCH_DIR=$WORK_DIR/sbatch_scripts/benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}.sbatch
+                # sbatch $SBATCH_DIR
+                # sleep 1
+            done
+        done
+    done
+done
diff --git a/NVIDIA/DeepLearningExamples/PyTorch/README.md b/NVIDIA/DeepLearningExamples/PyTorch/README.md
index 7b2cba50eb4358131416d4b8187cca14575c5c77..06f65902c4d5e8d60cdfe574e7763c40fd3369b9 100644
--- a/NVIDIA/DeepLearningExamples/PyTorch/README.md
+++ b/NVIDIA/DeepLearningExamples/PyTorch/README.md
@@ -39,3 +39,43 @@ apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --
 
 bash submit_benchmark_jobs.sh
 ```
+
+### Known issues
+
+#### Issue 1
+https://github.com/NVIDIA/DeepLearningExamples/issues/1113
+
+When running the container, an error occurred:
+```
+ImportError: cannot import name 'get_num_classes' from 'torchmetrics.utilities.data' (/opt/conda/lib/python3.8/site-packages/torchmetrics/utilities/data.py)
+```
+
+
+Solution 1 (not working): `pip install pytorch-lightning==1.5.10`.
+
+Another error was raised when benchmarking predict:
+```
+Traceback (most recent call last):
+  File "main.py", line 110, in <module>
+    trainer.current_epoch = 1
+AttributeError: can't set attribute
+```
+
+Solution 2: `pip install torchmetrics==0.6.0`.
+
+Another error was raised:
+```
+  File "main.py", line 34, in <module>
+    set_affinity(int(os.getenv("LOCAL_RANK", "0")), args.gpus, mode=args.affinity)
+  File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 376, in set_affinity
+    set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced)
+  File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 263, in set_socket_unique_affinity
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
+OSError: [Errno 22] Invalid argument
+```
+
+We need to comment out lines 32-33 in `main.py` to fix it.
+
+#### Issue 2
+
+Multi-node jobs are not yet supported in 21.11.0; they are only supported in the most recent code on GitHub.
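
Both known-issues notes above apply their fix by commenting out lines of `main.py` by hand. As a minimal sketch only, the same workaround could be scripted from outside the image; the in-container path `/workspace/nnunet_pyt/main.py` is inferred from the traceback above, and the container path and line numbers are assumptions that should be verified against the 21.11.0 code:

```
#!/bin/bash
# Sketch of the "comment out the offending lines" workaround, applied without
# rebuilding the image. Paths and line numbers are assumed, not verified.
CONTAINER_DIR=/proj/nsc_testing/xuan/containers/nnunet_for_pytorch.sif   # adjust to the image being benchmarked

# Copy main.py out of the read-only container image to the host.
apptainer exec $CONTAINER_DIR cat /workspace/nnunet_pyt/main.py > main_patched.py

# Comment out the set_affinity lines (32-33 per the note above).
sed -i '32,33 s/^/# /' main_patched.py

# Add a bind mount such as
#   -B $(pwd)/main_patched.py:/workspace/nnunet_pyt/main.py
# to the existing apptainer exec command so the patched copy shadows the original.
```

The same pattern would cover the MLPerf case, with line 23 of `/workspace/unet3d/main.py` commented out instead.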