diff --git a/README.md b/README.md index a7e9f6137db329a9f7453a8dd6aa538b7b8f6845..ee550dbeb77cb3ce531120ba6652f1e62f383575 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,8 @@ Docker is not available on Berzelius. We use Apptainer or Enroot. With Apptainer ``` apptainer pull nvidia_nnu-net_for_pytorch_$VERSION.sif docker://berzeliushub/nvidia_nnu-net_for_pytorch:$VERSION -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results --nv nvidia_nnu-net_for_pytorch_$VERSION.sif cd /workspace/nnunet_pyt && python download.py --task 01 -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results --nv nvidia_nnu-net_for_pytorch_$VERSION.sif cd /workspace/nnunet_pyt && python preprocess.py --task 01 --dim 2 +apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_$VERSION.sif bash -c "cd /workspace/nnunet_pyt && python download.py --task 01" +apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_$VERSION.sif bash -c "cd /workspace/nnunet_pyt && python preprocess.py --task 01 --dim 2" ``` <!-- With Enroot diff --git a/scripts/run_benchmark_single_node.sh b/scripts/run_benchmark_single_node.sh index 26a7d81429e495b4eaab20ff53d664a07b5b0f19..1b52ca43c1ba432b2819dfef6574b2bc7b03a475 100644 --- a/scripts/run_benchmark_single_node.sh +++ b/scripts/run_benchmark_single_node.sh @@ -14,19 +14,19 @@ cat <<EOT > scripts/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration # This version does not run on multi-node # For apptainer rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' +apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' +apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" ###################22.11.0 # For apptainer #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' +#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' +#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" EOT diff --git a/scripts/run_bencmark_multi_node.sh b/scripts/run_bencmark_multi_node.sh index 86c8d9fad190bb1e2cff75079b42fb8a0c77f5aa..9ff4ae421343d8b0e600d1610d999c913499858c 100644 --- a/scripts/run_bencmark_multi_node.sh +++ b/scripts/run_bencmark_multi_node.sh @@ -13,26 +13,26 @@ cat <<EOT > scripts/benchmark_multi_node.sbatch # For apptainer #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' +# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' +#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" ############## Running srun enroot ... stopped working at 20230220 # For enroot #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' +#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" #rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' +#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" ############## Running srun with pyxis works # For enroot rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json' +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'" rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json -srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json' +srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'" EOT