Skip to content
Snippets Groups Projects
Commit ae618fe7 authored by Xuan Gu's avatar Xuan Gu
Browse files

Update 3 files

- /README.md
- /scripts/run_benchmark_single_node.sh
- /scripts/run_bencmark_multi_node.sh
parent cba717eb
No related branches found
No related tags found
No related merge requests found
......@@ -44,8 +44,8 @@ Docker is not available on Berzelius. We use Apptainer or Enroot.
With Apptainer
```
apptainer pull nvidia_nnu-net_for_pytorch_$VERSION.sif docker://berzeliushub/nvidia_nnu-net_for_pytorch:$VERSION
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results --nv nvidia_nnu-net_for_pytorch_$VERSION.sif cd /workspace/nnunet_pyt && python download.py --task 01
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results --nv nvidia_nnu-net_for_pytorch_$VERSION.sif cd /workspace/nnunet_pyt && python preprocess.py --task 01 --dim 2
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_$VERSION.sif bash -c "cd /workspace/nnunet_pyt && python download.py --task 01"
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_$VERSION.sif bash -c "cd /workspace/nnunet_pyt && python preprocess.py --task 01 --dim 2"
```
<!-- With Enroot
......
......@@ -14,19 +14,19 @@ cat <<EOT > scripts/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_iteration
# This version does not run on multi-node
# For apptainer
rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'
apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch_21.11.0.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
###################22.11.0
# For apptainer
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'
#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'
#apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
EOT
......
......@@ -13,26 +13,26 @@ cat <<EOT > scripts/benchmark_multi_node.sbatch
# For apptainer
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'
# srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'
#srun apptainer exec --nv -B ${PWD}/data:/data -B ${PWD}/results:/results nvidia_nnu-net_for_pytorch.sif bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
############## Running srun enroot ... stopped working at 20230220
# For enroot
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'
#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
#rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'
#srun enroot start --rw --mount ${PWD}/data:/data --mount ${PWD}/results:/results nnunet bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
############## Running srun with pyxis works
# For enroot
rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json
srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'
srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_tf32_iteration${5}.json'"
rm -f results/benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json
srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'
srun --container-image=/proj/nsc_testing/xuan/enroot/xuagu37+nvidia_nnu-net_for_pytorch+21.11.0.sqsh --container-name=nnunet --container-mounts=${PWD}/data:/data,${PWD}/results:/results --container-writable bash -c "cd /workspace/nnunet_pyt && python scripts/benchmark.py --mode train --gpus ${3} --dim ${1} --batch_size ${4} --nodes ${2} --amp --logname='benchmark_dim${1}_nodes${2}_gpus${3}_batchsize${4}_amp_iteration${5}.json'"
EOT
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment