diff --git a/scripts/benchmark_multi_node.sbatch b/scripts/benchmark_multi_node.sbatch
index 0afae23ca8cb4cbe9d5bcb61432254ef69348d14..de2e57ff41556ba7b6020c61ae6d1707406f83be 100644
--- a/scripts/benchmark_multi_node.sbatch
+++ b/scripts/benchmark_multi_node.sbatch
@@ -8,29 +8,29 @@ sbatch <<EOT
 #SBATCH --gres=gpu:8
 #SBATCH --ntasks-per-node=8
 #SBATCH --time=0-00:10:00
-#####SBATCH --reservation=bt-xuan_2nodes
+#SBATCH --reservation=bt-xuan_2nodes
 
 # For apptainer
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
-# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json""
+# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
 
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
-#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json""
+#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
 
 ############## Running srun enroot ... stopped working at 20230220
 # For enroot
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
-#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json""
+#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
 
 #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
-#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json""
+#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
 
 ############## Running srun with pyxis works
 # For enroot
 rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
-srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json""
+srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
 
 rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
-srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname="benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json""
+srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
 EOT
\ No newline at end of file
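Aside on the quoting change in scripts/benchmark_multi_node.sbatch above (an illustrative sketch, not part of the patch; the echo command and filename are made up): double quotes do not nest in shell, so in the old lines the quote before benchmark_dim${1}... closes the bash -c string, and both quote characters are consumed by the outer shell before the inner shell ever sees them. Single quotes inside a double-quoted string are passed through literally. With the space-free filenames used here the old form happens to work by string concatenation, so the change is a robustness fix:

    # Old style: the inner " ends the outer string, so a value containing a
    # space truncates the bash -c command string mid-argument, and the rest
    # ("b.json") is bound to $0 of the inner shell.
    bash -c "echo --logname="a b.json""    # inner shell runs: echo --logname=a
    # New style: the single quotes reach the inner shell intact, which then
    # strips them around the whole value.
    bash -c "echo --logname='a b.json'"    # inner shell runs: echo --logname='a b.json'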
diff --git a/scripts/benchmark_sbatch_submit.sh b/scripts/benchmark_sbatch_submit.sh
index db4c0fd1a76d91847ef30bbdfe86e5c70f10d733..cb652f8f7ddeb7ae273e3f6710acf4212557c6d1 100644
--- a/scripts/benchmark_sbatch_submit.sh
+++ b/scripts/benchmark_sbatch_submit.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 
-NUM_NODES=$2
+NUM_NODES=1
 NUM_GPUS=$3
 NUM_ITERATIONS=$4
 dim=$1
 batch_size=$5
 
 if [ $NUM_NODES -eq 1 ]; then
-    for ((nodes = 1; nodes <= $NUM_NODES; nodes++)); do
-        for ((gpus = 1; gpus <= $NUM_GPUS; gpus++)); do
-            for ((iteration = 1; iteration <= $NUM_ITERATIONS; iteration++)); do
+    for nodes in {1..1}; do
+        for gpus in {1,8}; do
+            for iteration in {1..1}; do
                 echo dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, iteration ${iteration}
 
                 # For single node
@@ -21,9 +21,9 @@ if [ $NUM_NODES -eq 1 ]; then
             done
         done
     done
 else
-    for ((nodes = 2; nodes <= $NUM_NODES; nodes++)); do
-        for ((gpus = 8; gpus <= $NUM_GPUS; gpus++)); do
-            for ((iteration = 1; iteration <= $NUM_ITERATIONS; iteration++)); do
+    for nodes in $(seq 2 $NUM_NODES); do
+        for gpus in {8..8}; do
+            for iteration in {1..1}; do
                 batch_size=$BATCH_SIZE
                 # For multi node
                 sbatch -o sbatch_out/benchmark_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize${batch_size}_iteration${iteration}.out scripts/benchmark_multi_node.sbatch ${dim} ${nodes} ${gpus} ${batch_size} ${iteration}
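Aside on the rewritten multi-node loop in scripts/benchmark_sbatch_submit.sh above (a sketch; NUM_NODES=4 is an arbitrary example value): bash performs brace expansion before parameter expansion, so a brace range with a variable bound such as {2..$NUM_NODES} never expands to numbers, and the loop body runs once with the literal string. That is why the multi-node branch uses seq, which does evaluate its variable arguments (the original C-style for loop also handled this correctly):

    NUM_NODES=4
    for nodes in {2..$NUM_NODES}; do echo "$nodes"; done        # one pass: prints {2..4}
    for nodes in $(seq 2 "$NUM_NODES"); do echo "$nodes"; done  # prints 2, 3, 4

Note also that the multi-node branch assigns batch_size=$BATCH_SIZE, but the script only ever sets the lowercase batch_size (from $5); unless BATCH_SIZE is exported in the calling environment, multi-node jobs are submitted with an empty batch size.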