diff --git a/scripts/run_benchmark_single_node.sh b/scripts/run_benchmark_single_node.sh index 1648fb14da6b33fa7910c1fa60916ea9366096c4..26a7d81429e495b4eaab20ff53d664a07b5b0f19 100644 --- a/scripts/run_benchmark_single_node.sh +++ b/scripts/run_benchmark_single_node.sh @@ -14,19 +14,20 @@ cat <<EOT > scripts/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration # This version does not run on multi-node # For apptainer +# --pwd sets the working directory inside the container, so python starts there without a bash -c wrapper. rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" +apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" +apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' ###################22.11.0 # For apptainer #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json 
-#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" +#apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" +#apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' EOT diff --git a/scripts/run_bencmark_multi_node.sh b/scripts/run_bencmark_multi_node.sh index 9ff4ae421343d8b0e600d1610d999c913499858c..86c8d9fad190bb1e2cff75079b42fb8a0c77f5aa 100644 --- a/scripts/run_bencmark_multi_node.sh +++ b/scripts/run_bencmark_multi_node.sh @@ -13,26 +13,27 @@ cat <<EOT > scripts/benchmark_multi_node.sbatch # For apptainer #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus 
${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" +# srun apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" +#srun apptainer exec --nv --pwd /workspace/nnunet_pyt -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' ############## Running srun enroot ... 
stopped working at 20230220 # For enroot #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json #srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json #srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" ############## Running srun with pyxis works # For enroot +# --container-workdir sets the working directory inside the container, so python starts there without a bash -c wrapper. rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} 
--logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-workdir=/workspace/nnunet_pyt --container-writable python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-workdir=/workspace/nnunet_pyt --container-writable python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' EOT