#!/bin/bash
#
# scripts/benchmark_nnunet_pytorch_berzelius_multi_node.sh
#
# Slurm batch job that repeatedly runs scripts/benchmark.py in training mode
# inside the nvidia_nnu-net_for_pytorch.sif Singularity image on 2 nodes.
# For every (gpus, batch_size, iteration) combination two runs are launched:
#   * "tf32": benchmark.py invoked without --amp
#   * "amp" : benchmark.py invoked with --amp
# Each run passes --logname=benchmark_dim<D>_nodes<N>_gpus<G>_batchsize<B>_<label>_iteration<I>.json
# and the matching file under ./results is removed beforehand so that a stale
# log from an earlier job is never mistaken for a fresh result.
#
# Usage: sbatch benchmark_nnunet_pytorch_berzelius_multi_node.sh
#
# NOTE: all #SBATCH directives must stay above the first executable command.

#SBATCH -A nsc
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --time=0-21:00:00
#SBATCH --reservation=nsc-testing
#SBATCH -o benchmark_nodes2.out

# Directory holding scripts/benchmark.py, the .sif image, data/ and results/.
readonly workdir=/proj/nsc/users/xuan/ngc/DeepLearningExamples/PyTorch/Segmentation/nnUNet

# Abort early: every relative path below (results/, data/, the image, the
# benchmark script) is resolved against this directory.
cd "$workdir" || {
  echo "error: cannot cd to $workdir" >&2
  exit 1
}

# Keep these in sync with the #SBATCH --nodes directive above.
readonly dim=2
readonly nodes=2

#######################################
# Launch one benchmark run and report (but do not abort on) failure.
# Globals:   dim, nodes (read); PWD (read, for the bind mounts)
# Arguments: $1 - label used in the log file name and progress line
#                 (tf32 or amp)
#            $2 - value passed to --gpus
#            $3 - value passed to --batch_size
#            $4 - iteration number (only used in the label / file name)
#            $5.. - extra flags forwarded to benchmark.py (e.g. --amp)
# Outputs:   progress line on stdout; warning on stderr if srun fails
# Returns:   0 always, so the remaining runs still execute
#######################################
run_benchmark() {
  local label=$1
  local gpus=$2
  local batch_size=$3
  local iteration=$4
  shift 4

  local logname="benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_${label}_iteration${iteration}.json"

  echo "dim $dim, nodes $nodes, gpus $gpus, batch_size $batch_size, $label, iteration $iteration"

  # Remove any previous log with the same name before this run starts.
  rm -f -- "results/${logname}"

  # "$@" expands to nothing when no extra flags were given.
  srun singularity exec --nv \
    -B "${PWD}/data:/data" \
    -B "${PWD}/results:/results" \
    nvidia_nnu-net_for_pytorch.sif \
    python scripts/benchmark.py \
    --mode train \
    --gpus "$gpus" \
    --dim "$dim" \
    --batch_size "$batch_size" \
    --nodes "$nodes" \
    "$@" \
    --logname="$logname" \
    || echo "warning: run ${logname} failed (srun exit status $?)" >&2

  return 0
}

for gpus in 8; do
  for batch_size in 128; do
    for iteration in {1..100}; do
      run_benchmark tf32 "$gpus" "$batch_size" "$iteration"
      run_benchmark amp "$gpus" "$batch_size" "$iteration" --amp
    done
  done
done