diff --git a/README.md b/README.md
index 142dad83fb9a07fb04c720930424363bf37f1f38..92841f2620fde2f3c501bae62c2e53e0eb61a749 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-# Benchmark of nnU-Net for PyTorch on Berzelius
-
+# Berzelius nnU-Net Benchmark
 
 The benchmarking is based on [Nvidia NGC nnU-net for Pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/nnunet_for_pytorch) v21.11.0.
 
@@ -27,8 +26,9 @@ docker push xuagu37/nvidia_nnu-net_for_pytorch:21.11.0
 - Create directories
 ```
-mkdir -p /proj/nsc_testing/xuan/nnUnet_benchmark
-cd /proj/nsc_testing/xuan/nnUnet_benchmark
+cd /proj/nsc_testing/xuan
+git clone https://gitlab.liu.se/xuagu37/Berzelius-nnU-Net-Benchmark.git
+cd Berzelius-nnU-Net-Benchmark
 mkdir data results
 ```
 <!-- - Clone the repository
@@ -77,14 +77,13 @@ Exit the image.
 - For benchmarking purposes, we use 1000 copies of a single image
 ```
-bash copy_data_for_benchmark.sh
+bash scripts/copy_data_for_benchmark.sh
 ```
 
 - Run the script. You need to modify it for your setup, e.g. the name of your reservation, the number of nodes, the batch_size, etc. Also, choose either Singularity or Enroot.
 
 ```
-cd /proj/nsc/xuan/ngc/DeepLearningExamples/PyTorch/Segmentation/nnUNet
 mkdir sbatch_out
-bash benchmark_sbatch_submit.sh
+bash scripts/benchmark_sbatch_submit.sh
 ```
 
 ### Results
diff --git a/scripts/benchmark_multi_node.sbatch b/scripts/benchmark_multi_node.sbatch
index 6ce9c388fdfdc9058c99749817f3b6a6a235573f..5eb92f2f79c003ec5c55c49066131e21dcf8e506 100644
--- a/scripts/benchmark_multi_node.sbatch
+++ b/scripts/benchmark_multi_node.sbatch
@@ -14,7 +14,7 @@ srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidi
 
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
 #srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json""
-############## Running srun enroot ... stopped working at 20220220
+############## Running srun enroot ... stopped working at 20230220
 # For enroot
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
 #srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json""
diff --git a/scripts/benchmark_sbatch_submit.sh b/scripts/benchmark_sbatch_submit.sh
index 7bdcdc764745b95108b975771db0c319044842a2..9107d84cc038eb23715f0de7154a5bf2fbf35fcc 100644
--- a/scripts/benchmark_sbatch_submit.sh
+++ b/scripts/benchmark_sbatch_submit.sh
@@ -1,16 +1,28 @@
 cd /proj/nsc/users/xuan/ngc/DeepLearningExamples/PyTorch/Segmentation/nnUNet
 
 dim=2
+
 nodes=1
-for gpus in 8; do
-   for batch_size in 256; do
-      for iteration in {1..10}; do
-
-         echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, iteration ${iteration}
-         # For single node
-         sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_single_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
+for gpus in {1..8}; do
+   for batch_size in 128; do
+      for iteration in {1..100}; do
+         echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, iteration ${iteration}
+         # For single node
+         sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_single_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
+         sleep 1 # pause to be kind to the scheduler
+      done
+   done
+done
+
+for nodes in {2..8}; do
+   for gpus in 8; do
+      for batch_size in 128; do
+         for iteration in {1..100}; do
+            echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, iteration ${iteration}
+            # For multi node
+            sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_multi_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
@@ -20,7 +32,6 @@ for gpus in 8; do
-         sleep 1 # pause to be kind to the scheduler
-
-      done
-   done
-done
+            sleep 1 # pause to be kind to the scheduler
+         done
+      done
+   done
+done
\ No newline at end of file
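
Note on the submit-script fix above: bash only brace-expands ranges of the form {m..n}; a bare {8} is not a valid expansion and is passed through as the literal string "{8}", and a for loop additionally needs its own do/done pair, which is why the multi-node block gains "; do" and matching "done" lines. A minimal standalone sanity check (hypothetical snippet, not a file in the repo):

```bash
#!/usr/bin/env bash
# {2..8} is a valid brace expansion; {8} is not and stays literal.
echo {2..8}   # prints: 2 3 4 5 6 7 8
echo {8}      # prints: {8}

# A bare "8" is what the loop variable should receive:
for gpus in 8; do
    echo "gpus=${gpus}"   # prints: gpus=8
done
```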