diff --git a/scripts/benchmark_multi_node.sbatch b/scripts/benchmark_multi_node.sbatch index 5eb92f2f79c003ec5c55c49066131e21dcf8e506..43eab69783cce54a264480ecd56f306376152fe5 100644 --- a/scripts/benchmark_multi_node.sbatch +++ b/scripts/benchmark_multi_node.sbatch @@ -1,15 +1,18 @@ +#!/bin/bash +sbatch <<EOT + #!/bin/bash #SBATCH -A nsc -#SBATCH --nodes=8 +#SBATCH --nodes=${2} #SBATCH --gres=gpu:8 #SBATCH --ntasks-per-node=8 #SBATCH --time=0-00:10:00 #####SBATCH --reservation=bt-xuan_2nodes # For singularity -rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json"" +#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json +# srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json"" #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json #srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json"" @@ -24,9 +27,10 @@ srun singularity exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidi ############## Running srun with pyxis works # For enroot -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json"" +rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json"" -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json"" +rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json"" +EOT \ No newline at end of file diff --git a/scripts/benchmark_sbatch_submit.sh b/scripts/benchmark_sbatch_submit.sh index 9107d84cc038eb23715f0de7154a5bf2fbf35fcc..f51c9960ce624e34af7bfd5ab8778b303126ae37 100644 --- a/scripts/benchmark_sbatch_submit.sh +++ b/scripts/benchmark_sbatch_submit.sh @@ -17,21 +17,17 @@ for gpus in {1..8}; do done done -for nodes in {2..8} -for gpus in {8}; do - for batch_size in 128; do - for iteration in {1..100}; do +for nodes in {2..8}; do + for gpus in {8}; do + for batch_size in 128; do + for iteration in {1..100}; do - echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, iteration ${iteration} - # For single node - sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_single_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration} - - # For multi node - #sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_multi_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration} - - sleep 1 # pause to be kind to the scheduler + # For multi node + sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out benchmark_multi_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration} + + sleep 1 # pause to be kind to the scheduler + done done done done -done \ No newline at end of file