diff --git a/scripts/benchmark_multi_node.sh b/scripts/benchmark_multi_node.sh deleted file mode 100644 index d9d700c256dee121a5a93ac693969d8ef65f551e..0000000000000000000000000000000000000000 --- a/scripts/benchmark_multi_node.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -sbatch <<EOT - -#!/bin/bash - -#SBATCH -A nsc -#SBATCH --nodes=${2} -#SBATCH --gres=gpu:8 -#SBATCH --ntasks-per-node=8 -#SBATCH --time=0-00:10:00 -#SBATCH -o "sbatch_out/benchmark_dim"${1}"_nodes"${2}"_gpus"${3}"_batchsize"${4}"_iteration"${5}".out" -#SBATCH --reservation=bt-xuan_2nodes - -# For apptainer -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - -############## Running srun enroot ... 
stopped working at 20230220 -# For enroot -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - -############## Running srun with pyxis works -# For enroot -rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - -EOT \ No newline at end of file diff --git 
a/scripts/benchmark_single_node.sh b/scripts/benchmark_single_node.sh deleted file mode 100644 index 9d7208028307d3b7ca68c4459b27f100a7abeed2..0000000000000000000000000000000000000000 --- a/scripts/benchmark_single_node.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -sbatch <<EOT - -#!/bin/bash - -#SBATCH -A nsc -#SBATCH --nodes=1 -#SBATCH --gres=gpu:${3} -#SBATCH --time=0-0:10:00 -#SBATCH -o "sbatch_out/benchmark_dim"${1}"_nodes"${2}"_gpus"${3}"_batchsize"${4}"_iteration"${5}".out" -#SBATCH --reservation=devel - -###################21.11.0 -# This version does not run on multi-node -# For apptainer -rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - - -###################22.11.0 -# For apptainer -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#apptainer exec --nv 
-B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - -# For enroot -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" - -#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" - -EOT \ No newline at end of file diff --git a/scripts/generate_single_node_job.sh b/scripts/run_benchmark_single_node_job.sh similarity index 98% rename from scripts/generate_single_node_job.sh rename to scripts/run_benchmark_single_node_job.sh index d8e5c30eefe0041d4a8a359e61ca904500a953a5..a52d11011d0a7f2fb709b0d57680705a2bade4d8 100644 --- a/scripts/generate_single_node_job.sh +++ b/scripts/run_benchmark_single_node_job.sh @@ -35,4 +35,6 @@ apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-n #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json #enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp 
--logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" -EOT \ No newline at end of file +EOT + +sbatch scripts/benchmark_single_node.sbatch \ No newline at end of file diff --git a/scripts/generate_multi_node_job.sh b/scripts/run_benchmark_multi_node_job.sh similarity index 97% rename from scripts/generate_multi_node_job.sh rename to scripts/run_benchmark_multi_node_job.sh index 6ed7508fee143bb65bfcf98fcb67f34b7ed6b0bb..a94f0ac85ed96aaa3adf4ffffdd20c71401176a9 100644 --- a/scripts/generate_multi_node_job.sh +++ b/scripts/run_benchmark_multi_node_job.sh @@ -1,6 +1,6 @@ #!/bin/bash -cat <<EOT > scripts/benchmark_single_node.sbatch +cat <<EOT > scripts/benchmark_multi_node.sbatch #!/bin/bash #SBATCH -A nsc @@ -34,4 +34,6 @@ srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_ rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" -EOT \ No newline at end of file +EOT + +sbatch scripts/benchmark_multi_node.sbatch \ No newline at end of file