From 78a63ee3082be4f6a9e1a9a28f41330638b9ed50 Mon Sep 17 00:00:00 2001 From: Xuan Gu <xuan.gu@liu.se> Date: Thu, 12 Oct 2023 22:18:35 +0000 Subject: [PATCH] Update 4 files - /NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt - /NVIDIA/DeepLearningExamples/PyTorch/README.md - /MLPerf/training/image_segmentation/pytorch/build.txt - /MLPerf/training/image_segmentation/pytorch/README.md --- .../image_segmentation/pytorch/README.md | 46 +++++++++++++++++++ .../image_segmentation/pytorch/build.txt | 2 +- NVIDIA/DeepLearningExamples/PyTorch/README.md | 41 +++++++++++++++++ .../PyTorch/Segmentation/nnUNet/build.txt | 2 - 4 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 MLPerf/training/image_segmentation/pytorch/README.md create mode 100644 NVIDIA/DeepLearningExamples/PyTorch/README.md diff --git a/MLPerf/training/image_segmentation/pytorch/README.md b/MLPerf/training/image_segmentation/pytorch/README.md new file mode 100644 index 0000000..01a0a91 --- /dev/null +++ b/MLPerf/training/image_segmentation/pytorch/README.md @@ -0,0 +1,46 @@ + +### Setting paths + +The U-Net3D from MLPerf has no version control. 
+ + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_BASE=/proj/nsc_testing/xuan/containers/pytorch_1.7.1-cuda11.0-cudnn8-runtime.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch//${MODEL_NAME}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/MLPerf/training/image_segmentation/pytorch +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +cd $WORK_DIR +git clone https://github.com/neheller/kits19 +apptainer exec $CONTAINER_DIR bash -c "cd kits19 && python3 -m starter_code.get_imaging" +mv kits19/data/* raw-data/ +rm -rf kits19 + +apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results $CONTAINER_DIR bash -c "cd /workspace/unet3d && python3 preprocess_dataset.py --data_dir /raw-data --results_dir /data" +``` + + + +### Running benchmarking + +``` +apptainer exec --nv -B ${WORK_DIR}/raw-data:/raw-data -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results $CONTAINER_DIR bash -c "cd /workspace/unet3d && bash run_and_time.sh 1" +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` diff --git a/MLPerf/training/image_segmentation/pytorch/build.txt b/MLPerf/training/image_segmentation/pytorch/build.txt index 59ad0a5..fd77634 100644 --- a/MLPerf/training/image_segmentation/pytorch/build.txt +++ b/MLPerf/training/image_segmentation/pytorch/build.txt @@ -1,5 +1,5 @@ -apptainer build containers/pytorch_1.7.1-cuda11.0-cudnn8-devel.sif docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel +apptainer build containers/pytorch_1.7.1-cuda11.0-cudnn8-runtime.sif docker://pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime MODULE_NAME=U-Net3D diff --git a/NVIDIA/DeepLearningExamples/PyTorch/README.md 
b/NVIDIA/DeepLearningExamples/PyTorch/README.md new file mode 100644 index 0000000..4f5a277 --- /dev/null +++ b/NVIDIA/DeepLearningExamples/PyTorch/README.md @@ -0,0 +1,41 @@ + +### Setting paths + +``` +MODEL_NAME=nnunet_for_pytorch +MODEL_VERSION=21.11.0 +MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_21.11-py3.sif +CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif +DEF_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/${MODEL_NAME}_${MODEL_VERSION}.def +WORK_DIR=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet +``` +### Building the container + +``` +apptainer build $MODEL_BASE docker://nvcr.io/nvidia/pytorch:21.11-py3 +apptainer build $CONTAINER_DIR $DEF_DIR +``` + + +### Downloading and preprocessing the data + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python download.py --task 01 +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python /workspace/nnunet_pyt/preprocess.py --task 01 --dim 2 +``` + + + +### Running benchmarking + +``` +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode train --gpus 1 --dim 2 --batch_size 256 --amp +apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd /workspace/nnunet_pyt $CONTAINER_DIR python scripts/benchmark.py --mode predict --gpus 1 --dim 2 --batch_size 256 --amp +``` + +### Running benchmarking using batch jobs + +``` +bash submit_benchmark_jobs.sh +``` + diff --git a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt index 1a679da..2bcba44 100644 --- 
a/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt +++ b/NVIDIA/DeepLearningExamples/PyTorch/Segmentation/nnUNet/build.txt @@ -2,8 +2,6 @@ https://catalog.ngc.nvidia.com/orgs/nvidia/resources/nnunet_for_pytorch/quick-st apptainer build containers/nvidia_pytorch_21.11-py3.sif docker://nvcr.io/nvidia/pytorch:21.11-py3 -export APPTAINER_BINDPATH= - MODULE_NAME=nnunet_for_pytorch MODULE_VERSION=21.11.0 CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODULE_NAME}_${MODULE_VERSION}.sif -- GitLab