diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
index 4a55f1ab5d28202d8d51318121f2c00534b938d7..f45cac51556c5a3319afe1f76cf0168e3c188c04 100644
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@@ -12,15 +12,13 @@
   for nodes in {1..1}; do
     # For single node
     if [ $nodes -eq 1 ]; then
      bash scripts/run_benchmark_single_node.sh ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
-
-      sleep 1 # pause to be kind to the scheduler
+      sleep 1
     else
       # For multi node
       gpus=8
       bash scripts/run_benchmark_multi_node.sh ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
-
-      sleep 1 # pause to be kind to the scheduler
+      sleep 1
     fi
   done
 done
@@ -38,15 +36,13 @@
   for nodes in {1..1}; do
     # For single node
     if [ $nodes -eq 1 ]; then
      bash scripts/run_benchmark_single_node.sh ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
-
-      sleep 1 # pause to be kind to the scheduler
+      sleep 1
     else
       # For multi node
       gpus=8
       bash scripts/run_benchmark_multi_node.sh ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
-
-      sleep 1 # pause to be kind to the scheduler
+      sleep 1
     fi
   done
 done
diff --git a/scripts/run_benchmark_single_node.sh b/scripts/run_benchmark_single_node.sh
index a52d11011d0a7f2fb709b0d57680705a2bade4d8..c6e1fc5669da4491bff9689461a47bdaee993257 100644
--- a/scripts/run_benchmark_single_node.sh
+++ b/scripts/run_benchmark_single_node.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
-cat <<EOT > scripts/benchmark_single_node.sbatch
+cat <<EOT > scripts/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.sbatch
 #!/bin/bash
 
 #SBATCH -A nsc
 #SBATCH --nodes=1
 #SBATCH --gres=gpu:${3}
 #SBATCH --time=0-0:10:00
-#SBATCH -o sbatch_out/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.out
+#SBATCH --output=sbatch_out/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.out
 #SBATCH --reservation=devel
 
 ###################21.11.0
@@ -28,13 +28,6 @@ apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-n
 
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
 #apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
 
-# For enroot
-#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
-#enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
-
-#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
-#enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
-
 EOT
-sbatch scripts/benchmark_single_node.sbatch
\ No newline at end of file
+#sbatch scripts/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.sbatch
\ No newline at end of file
diff --git a/scripts/run_bencmark_multi_node.sh b/scripts/run_bencmark_multi_node.sh
index a94f0ac85ed96aaa3adf4ffffdd20c71401176a9..9ff4ae421343d8b0e600d1610d999c913499858c 100644
--- a/scripts/run_bencmark_multi_node.sh
+++ b/scripts/run_bencmark_multi_node.sh
@@ -8,7 +8,7 @@ cat <<EOT > scripts/benchmark_multi_node.sbatch
 #SBATCH --gres=gpu:8
 #SBATCH --ntasks-per-node=8
 #SBATCH --time=0-00:10:00
-#SBATCH -o sbatch_out/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.out
+#SBATCH --output=sbatch_out/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration${5}.out
 #SBATCH --reservation=devel
 
 # For apptainer
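
For reference, the two generator scripts patched above (run_benchmark_single_node.sh and run_bencmark_multi_node.sh) take the same five positional arguments, in the order used by the loop in run_benchmark.sh: dim ($1), nodes ($2), gpus ($3), batch_size ($4), iteration ($5). A minimal invocation sketch; the concrete values are illustrative and not taken from the diff:

    # Writes scripts/benchmark_dim2_nodes1_gpus8_batchsize2_iteration1.sbatch.
    # With the trailing sbatch call now commented out, the generated file is
    # no longer submitted automatically; submit it manually with sbatch.
    bash scripts/run_benchmark_single_node.sh 2 1 8 2 1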