diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/README.md b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..907ee108fc434bafa267e3fc3c6804f0d9b38590 --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/README.md @@ -0,0 +1,88 @@ + +### Setting paths + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_VERSION=21.11.0 +MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODEL_NAME}_${MODEL_VERSION}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://nvcr.io/nvidia/pytorch:21.11-py3 +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python download.py --task 01 +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python /workspace/nnunet_pyt/preprocess.py --task 01 --dim 2 +``` + + + +### Running benchmarking + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode train --gpus 1 --dim 2 --batch_size 256 --amp +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode predict --gpus 1 --dim 2 --batch_size 256 --amp +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` + + +### Known issues + +#### Issue 1 
(21.11.0) +https://github.com/NVIDIA/DeepLearningExamples/issues/1113 + +When running the container, an error occurred: +``` +ImportError: cannot import name 'get_num_classes' from 'torchmetrics.utilities.data' (/opt/conda/lib/python3.8/site-packages/torchmetrics/utilities/data.py) +``` + + +Solution 1 (not working): `pip install pytorch-lightning==1.5.10`. + +Another error was raised when benchmarking predict: +``` +Traceback (most recent call last): + File "main.py", line 110, in <module> + trainer.current_epoch = 1 +AttributeError: can't set attribute +``` + +Solution 2: `pip install torchmetrics==0.6.0`. + +Another error was raised: + File "main.py", line 34, in <module> + set_affinity(int(os.getenv("LOCAL_RANK", "0")), args.gpus, mode=args.affinity) + File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 376, in set_affinity + set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced) + File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 263, in set_socket_unique_affinity + os.sched_setaffinity(0, ungrouped_affinities[gpu_id]) +OSError: [Errno 22] Invalid argument + +To fix it, comment out lines 32-33 (the `set_affinity` call) in `main.py`. + +#### Issue 2 (21.11.0) + +Multi-node jobs are not supported in 21.11.0; they are supported only in the latest (nightly) version. + + +#### Issue 3 (latest) + +``` +ImportError: cannot import name '_compare_version' from 'torchmetrics.utilities.imports' +``` + +Solution: `pip install torchmetrics==0.11.4`. 
\ No newline at end of file diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/build.txt b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2b105102f28b165a7bf5b43e563100d1e53436a --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/build.txt @@ -0,0 +1,23 @@ +https://catalog.ngc.nvidia.com/orgs/nvidia/resources/nnunet_for_pytorch/quick-start-guide + +apptainer build containers/nvidia_pytorch_21.11-py3.sif docker://nvcr.io/nvidia/pytorch:21.11-py3 + +MODULE_NAME=nnunet_for_pytorch +MODULE_VERSION=21.11.0 +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODULE_NAME}_${MODULE_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/PyTorch/Segmentation/nnUNet/${MODULE_NAME}_${MODULE_VERSION}.def + +apptainer build $CONTAINER_DIR $DEF_DIR + + + +MODULE_NAME=nnunet_for_pytorch +MODULE_VERSION=latest +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODULE_NAME}_${MODULE_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODULE_NAME}_${MODULE_VERSION}.def + +apptainer build $CONTAINER_DIR $DEF_DIR + + +# Known issue +https://github.com/NVIDIA/DeepLearningExamples/issues/1113 diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/maskrcnn_for_pytorch_latest.def b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/maskrcnn_for_pytorch_latest.def new file mode 100644 index 0000000000000000000000000000000000000000..65af3a2211e0a75a67236df542e3ab40b40c2287 --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/MaskRCNN/maskrcnn_for_pytorch_latest.def @@ -0,0 +1,30 @@ +Bootstrap: localimage +From: /proj/nsc_testing/xuan/containers/nvidia_pytorch_22.11-py3.sif + + +%environment + +export PYTHONNOUSERSITE=True +export OMP_NUM_THREADS=2 + +%post + +VERSION=latest + +mkdir /workspace/nnunet_pyt +cd /workspace/nnunet_pyt +git 
clone https://github.com/NVIDIA/DeepLearningExamples.git +mv DeepLearningExamples/PyTorch/Segmentation/nnUNet/* ./ +rm -rf DeepLearningExamples + +pip install --disable-pip-version-check -r requirements.txt +pip install monai==1.0.0 --no-dependencies +pip install numpy --upgrade +pip install torchmetrics==0.11.4 + +curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +unzip -qq awscliv2.zip +./aws/install +rm -rf awscliv2.zip aws + +cp utils/instance_norm.py /usr/local/lib/python3.8/dist-packages/apex/normalization diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/README.md b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..907ee108fc434bafa267e3fc3c6804f0d9b38590 --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/README.md @@ -0,0 +1,88 @@ + +### Setting paths + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_VERSION=21.11.0 +MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODEL_NAME}_${MODEL_VERSION}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://nvcr.io/nvidia/pytorch:21.11-py3 +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python download.py --task 01 +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python /workspace/nnunet_pyt/preprocess.py --task 01 --dim 2 +``` + + + +### Running benchmarking + +``` 
+apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode train --gpus 1 --dim 2 --batch_size 256 --amp +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode predict --gpus 1 --dim 2 --batch_size 256 --amp +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` + + +### Known issues + +#### Issue 1 (21.11.0) +https://github.com/NVIDIA/DeepLearningExamples/issues/1113 + +When running the container, an error occurred: +``` +ImportError: cannot import name 'get_num_classes' from 'torchmetrics.utilities.data' (/opt/conda/lib/python3.8/site-packages/torchmetrics/utilities/data.py) +``` + + +Solution 1 (not working): `pip install pytorch-lightning==1.5.10`. + +Another error was raised when benchmarking predict: +``` +Traceback (most recent call last): + File "main.py", line 110, in <module> + trainer.current_epoch = 1 +AttributeError: can't set attribute +``` + +Solution 2: `pip install torchmetrics==0.6.0`. + +Another error was raised: + File "main.py", line 34, in <module> + set_affinity(int(os.getenv("LOCAL_RANK", "0")), args.gpus, mode=args.affinity) + File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 376, in set_affinity + set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced) + File "/workspace/nnunet_pyt/utils/gpu_affinity.py", line 263, in set_socket_unique_affinity + os.sched_setaffinity(0, ungrouped_affinities[gpu_id]) +OSError: [Errno 22] Invalid argument + +To fix it, comment out lines 32-33 (the `set_affinity` call) in `main.py`. + +#### Issue 2 (21.11.0) + +Multi-node jobs are not supported in 21.11.0; they are supported only in the latest (nightly) version. 
+ + +#### Issue 3 (latest) + +``` +ImportError: cannot import name '_compare_version' from 'torchmetrics.utilities.imports' +``` + +Solution: `pip install torchmetrics==0.11.4`. \ No newline at end of file