diff --git a/NVIDIA/DeepLearningExamples/PyTorch/README.md b/NVIDIA/DeepLearningExamples/PyTorch/README.md deleted file mode 100644 index 798bc662205958dde14d42851b47b70040ae331d..0000000000000000000000000000000000000000 --- a/NVIDIA/DeepLearningExamples/PyTorch/README.md +++ /dev/null @@ -1,88 +0,0 @@ - -### Setting paths - -``` -MODEL_NAME=nnunet_for_pytorch -MODEL_VERSION=21.11.0 -MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif -CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif -DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODEL_NAME}_${MODEL_VERSION}.def -WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet -``` -### Building the container - -``` -apptainer build $MODEL_BASE docker://nvcr.io/nvidia/pytorch:21.11-py3 -apptainer build $CONTAINER_DIR $DEF_DIR -``` - - -### Downloading and preprocessing the data - -``` -apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python download.py --task 01 -apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python /workspace/nnunet_pyt/preprocess.py --task 01 --dim 2 -``` - - - -### Running benchmarking - -``` -apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode train --gpus 1 --dim 2 --batch_size 256 --amp -apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode predict --gpus 1 --dim 2 --batch_size 256 --amp -``` - -### Running benchmarking using batch jobs - -``` -bash submit_benchmark_jobs.sh -``` - - -### Known issues - -#### Isssue 1 (21.11.0) -https://github.com/NVIDIA/DeepLearningExamples/issues/1113 - -When running the container, an error occurred: -``` -ImportError: cannot import name 'get_num_classes' from 'torchmetrics.utilities.data' (/opt/conda/lib/python3.8/site-packages/torchmetrics/utilities/data.py) -``` - - -Solution 1 (not working): `pip install pytorch-lightning==1.5.10`. - -Another error raised when benchmarking predict: -``` -Traceback (most recent call last): - File "main.py", line 110, in <module> - trainer.current_epoch = 1 -AttributeError: can't set attribute -``` - -Solution 2: `pip install torchmetrics==0.6.0`. - -Another error raised: - File "main.py", line 34, in <module> - set_affinity(int(os.getenv("LOCAL_RANK", "0")), args.gpus, mode=args.affinity) - File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 376, in set_affinity - set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced) - File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 263, in set_socket_unique_affinity - os.sched_setaffinity(0, ungrouped_affinities[gpu_id]) -OSError: [Errno 22] Invalid argument - -We need to comment out the L32-33 in the `main.py` to fix it. - -#### Issue 2 (21.11.0) - -Muiti-node jobs is not supported yet in 21.11.0 but only in the most recent code on GitHub. - - -#### Issue 3 (latest) - -The last line of the `Dockerfile` has to change to: - -``` -cp utils/instance_norm.py /opt/conda/lib/python3.8/site-packages/apex/normalization -``` \ No newline at end of file diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/generate_benchmark_jobs.sh b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/generate_benchmark_jobs.sh index b7ad2773692d11c7ace525b991cfd424f5dfc44b..9f0bb49145d6398e1feeb0769a54ab51427f7f7e 100644 --- a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/generate_benchmark_jobs.sh +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/generate_benchmark_jobs.sh @@ -1,21 +1,28 @@ #!/bin/bash -SBATCH_DIR=$WORK_DIR/sbatch_scripts/benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}.sbatch -SBATCH_OUT_DIR=$WORK_DIR/sbatch_out/benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}.out -LOG_DIR=benchmark_${6}_${5}_dim${1}_nodes${2}_gpus${3}_batchsize_${4}_amp.json +dim=$1 +nodes=$2 +gpus=$3 +batch_size=$4 +benchmark_mode=$5 +node_type=$6 + +SBATCH_DIR=$WORK_DIR/sbatch_scripts/benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}.sbatch +SBATCH_OUT_DIR=$WORK_DIR/sbatch_out/benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}.out +LOG_DIR=benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}_amp.json cat <<EOT > $SBATCH_DIR #!/bin/bash #SBATCH -A nsc -#SBATCH --nodes=${2} -#SBATCH --gpus=${3} +#SBATCH --nodes=${nodes} +#SBATCH --gpus=${gpus} #SBATCH --time=$TIME_RESERVATION #SBATCH --output=$SBATCH_OUT_DIR EOT -if [ "${6}" = "thin" ]; then +if [ "${node_type}" = "thin" ]; then cat <<EOT >> $SBATCH_DIR #SBATCH -C "thin" #SBATCH --reservation=$GPU_RESERVATION @@ -23,12 +30,13 @@ EOT else cat <<EOT >> $SBATCH_DIR #SBATCH -C "fat" +#SBATCH --reservation=$GPU_RESERVATION EOT fi cat <<EOT >> $SBATCH_DIR rm -f $WORK_DIR/results/$LOG_DIR -apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode ${5} --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='$LOG_DIR' +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode ${benchmark_mode} --gpus ${gpus} --dim ${dim} --batch_size ${batch_size} --amp --logname='$LOG_DIR' EOT diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/nnunet_for_pytorch_latest.def b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/nnunet_for_pytorch_latest.def index 5545cd4f24621eac1ee09f2eb4d419ed2bf941b9..65af3a2211e0a75a67236df542e3ab40b40c2287 100644 --- a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/nnunet_for_pytorch_latest.def +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/nnunet_for_pytorch_latest.def @@ -1,5 +1,5 @@ Bootstrap: localimage -From: /proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif +From: /proj/nsc_testing/xuan/containers/nvidia_pytorch_22.11-py3.sif %environment @@ -20,10 +20,11 @@ rm -rf DeepLearningExamples pip install --disable-pip-version-check -r requirements.txt pip install monai==1.0.0 --no-dependencies pip install numpy --upgrade +pip install torchmetrics==0.11.4 curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" unzip -qq awscliv2.zip ./aws/install rm -rf awscliv2.zip aws -cp utils/instance_norm.py /opt/conda/lib/python3.8/site-packages/apex/normalization +cp utils/instance_norm.py /usr/local/lib/python3.8/dist-packages/apex/normalization diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/submit_benchmark_jobs.sh b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/submit_benchmark_jobs.sh index a3faf4a88d999f577014b3547a8112b52ab27fde..d5650922339bb974f4a6c5d29bb0d47383adddfd 100644 --- a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/submit_benchmark_jobs.sh +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/submit_benchmark_jobs.sh @@ -5,16 +5,16 @@ export MODEL_NAME=nnunet_for_pytorch export MODLE_VERSION=latest export WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet export CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODLE_VERSION}.sif -export GPU_RESERVATION=nodeimage -export TIME_RESERVATIOn=0-0:20:00 +export GPU_RESERVATION= +export TIME_RESERVATION=00:20:00 mkdir -p $WORK_DIR/sbatch_out $WORK_DIR/sbatch_scripts $WORK_DIR/results benchmark_modes=("train" "predict") # node_types=("thin" "fat") -node_types=("thin") +node_types=("fat") dim=2 -for nodes in {1..1}; do +for nodes in {1..2}; do for gpus in {1..8}; do for benchmark_mode in "${benchmark_modes[@]}"; do for node_type in "${node_types[@]}"; do @@ -31,8 +31,8 @@ for nodes in {1..1}; do # For single node bash $WORK_DIR/generate_benchmark_jobs.sh ${dim} ${nodes} ${gpus} ${batch_size} ${benchmark_mode} ${node_type} SBATCH_DIR=$WORK_DIR/sbatch_scripts/benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}.sbatch - sbatch $SBATCH_DIR - sleep 180 + # sbatch $SBATCH_DIR + # sleep 180 done done done