diff --git a/MLPerf/training/image_segmentation/pytorch/README.md b/MLPerf/training/image_segmentation/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..01a0a915d502e2d64417ae8f70fe27f3fd653bf9 --- /dev/null +++ b/MLPerf/training/image_segmentation/pytorch/README.md @@ -0,0 +1,46 @@ + +### Setting paths + +The U-Net3D from MLPerf has no version control. + + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_BASE=/proj/nsc_testing/xuan/containers/pytorch_1.7.1-cuda11.0-cudnn8-runtime.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch//${MODEL_NAME}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +cd $WORK_DIR +git clone https://github.com/neheller/kits19 +apptainer exec $CONTAINER_DIR bash -c "cd kits19 && python3 -m starter_code.get_imaging" +mv kits19/data/* raw-data/ +rm -rf kits19 + +apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results $CONTAINER_DIR bash -c "cd /workspace/unet3d && python3 preprocess_dataset.py --data_dir /raw-data --results_dir /data" +``` + + + +### Running benchmarking + +``` +apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results $CONTAINER_DIR bash -c "cd /workspace/unet3d && bash run_and_time.sh 1" +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` diff --git a/MLPerf/training/image_segmentation/pytorch/build.txt b/MLPerf/training/image_segmentation/pytorch/build.txt index 59ad0a5c1e4cd7fa613054ba736eaa6205cf6ce4..fd77634d28b8eb9e2b9bede17302a09144147e58 100644 --- 
a/MLPerf/training/image_segmentation/pytorch/build.txt +++ b/MLPerf/training/image_segmentation/pytorch/build.txt @@ -1,5 +1,5 @@ -apptainer build containers/pytorch_1.7.1-cuda11.0-cudnn8-devel.sif docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel +apptainer build containers/pytorch_1.7.1-cuda11.0-cudnn8-runtime.sif docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime MODULE_NAME=U-Net3D diff --git a/NVIDIA/DeepLearningExamples/PyTorch/README.md b/NVIDIA/DeepLearningExamples/PyTorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f5a2771786c4cfbbebe6ca80c41635e49f31fef --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/README.md @@ -0,0 +1,41 @@ + +### Setting paths + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_VERSION=21.11.0 +MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODEL_NAME}_${MODEL_VERSION}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://nvcr.io/nvidia/pytorch:21.11-py3 +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python download.py --task 01 +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python /workspace/nnunet_pyt/preprocess.py --task 01 --dim 2 +``` + + + +### Running benchmarking + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode train --gpus 1 --dim 2 --batch_size 256 --amp +apptainer exec 
--nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode predict --gpus 1 --dim 2 --batch_size 256 --amp +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` + diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt index 1a679dabd10f3fb4cb20e1b5054bd66370d207b9..2bcba44942d06aded307addf68739bd5a716e829 100644 --- a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt @@ -2,8 +2,6 @@ https://catalog.ngc.nvidia.com/orgs/nvidia/resources/nnunet_for_pytorch/quick-st apptainer build containers/nvidia_pytorch_21.11-py3.sif docker://nvcr.io/nvidia/pytorch:21.11-py3 -export APPTAINER_BINDPATH= - MODULE_NAME=nnunet_for_pytorch MODULE_VERSION=21.11.0 CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODULE_NAME}_${MODULE_VERSION}.sif