update

8512b062 · Xuan Gu · 4a66b719 · 8512b062 · 8512b062 · 8512b062
Commit 8512b062 authored 1 year ago by Xuan Gu
--- a/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/README.md
+++ b/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/README.md
+
+### Setting paths
+
+```
+export BENCHMARK_CATEGORY=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Classification
+export MODEL_NAME=efficientnet_for_pytorch
+export MODEL_BASE_VERSION=21.03
+
+export MODEL_VERSION=latest
+export MODEL_DIR=${BENCHMARK_CATEGORY}/${MODEL_NAME}
+export WORK_DIR=$MODEL_DIR/workspace
+export DEF_DIR=$MODEL_DIR/${MODEL_NAME}_${MODEL_VERSION}.def
+export MODEL_BASE=/proj/nsc_testing/xuan/containers/nvidia_pytorch_${MODEL_BASE_VERSION}-py3.sif
+export CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif
+
+mkdir -p $WORK_DIR/results
+```
+### Building the container
+
+```
+apptainer build $MODEL_BASE  docker://nvcr.io/nvidia/pytorch:${MODEL_BASE_VERSION}-py3
+apptainer build $CONTAINER_DIR $DEF_DIR
+```
+
+### Make a copy of the code
+
+```
+apptainer exec $CONTAINER_DIR bash -c "cp -a /workspace/* ${WORK_DIR}/"
+```
+
+### Downloading and preprocessing the data
+
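+The dataset is shared with the ResNet50 benchmark, so instead of downloading and preprocessing it again, create a symlink to the ResNet50 data directory.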
+
+```
+ln -s $BENCHMARK_CATEGORY/resnet50_v1.5_for_pytorch/workspace/data $WORK_DIR/data
+```
+
+
+### Running the benchmark
+
+```
+apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd ${WORK_DIR} $CONTAINER_DIR python ./multiproc.py --nnodes 1 --nproc_per_node 1 ./launch.py --model efficientnet-b0 --precision AMP --mode benchmark_training --platform DGXA100 /data --raport-file benchmark.json --epochs 1 --prof 100 --batch-size 512 
+apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd ${WORK_DIR} $CONTAINER_DIR python ./multiproc.py --nnodes 1 --nproc_per_node 1 ./launch.py --model efficientnet-b0 --precision AMP --mode benchmark_inference --platform DGXA100 /data --raport-file benchmark.json --epochs 1 --prof 100 --batch-size 512
+```
+
+### Running the benchmark using batch jobs
+
+```
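+bash $MODEL_DIR/submit_benchmark_jobs.sh
+# Once the jobs have finished, run from $MODEL_DIR (the script reads ./workspace/results/):
+cd $MODEL_DIR && python cal_mean_performance.py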
+```
+
+
+### Known issues
+
+#### Issue 1 (latest)
+https://github.com/NVIDIA/DeepLearningExamples/issues/1113
+
+```
+Traceback (most recent call last):
+  File "./launch.py", line 53, in <module>
+    main(args, model_args, model_arch)
+  File "/workspace/rn50/main.py", line 623, in main
+    ) = prepare_for_training(args, model_args, model_arch)
+  File "/workspace/rn50/main.py", line 376, in prepare_for_training
+    affinity = set_affinity(args.gpu, mode=args.gpu_affinity)
+  File "/workspace/rn50/image_classification/gpu_affinity.py", line 410, in set_affinity
+    set_socket_unique_affinity(
+  File "/workspace/rn50/image_classification/gpu_affinity.py", line 277, in set_socket_unique_affinity
+    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
+OSError: [Errno 22] Invalid argument
+```
+
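+Solution: comment out lines 376-377 in `main.py` (the `set_affinity` call and the affinity `print` that follows it). The container definition file already applies this change with `sed` during the build.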
\ No newline at end of file
--- a/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/cal_mean_performance.py
+++ b/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/cal_mean_performance.py
+import re
+import os
+
+# Report the mean benchmark throughput for every result file.
+#
+# For each file below `folder_path`, all throughput values ("total_ips") are
+# extracted, the lower half is discarded, and the mean of the upper half is
+# printed.
+
+# Folder holding the benchmark report files. The path is relative, so it is
+# resolved against the current working directory.
+folder_path = "workspace/results/"  # Replace with the actual path to your folder
+
+# Throughput patterns, compiled once instead of once per file.
+# Training reports: the value between '"train.total_ips": ' and ', "train.lr"'.
+TRAIN_PATTERN = re.compile(r'"train\.total_ips": (.*?), "train\.lr"')
+# All other reports: the value between '"val.total_ips": ' and
+# ', "val.compute_latency"'.
+VAL_PATTERN = re.compile(r'"val\.total_ips": (.*?), "val\.compute_latency"')
+
+if os.path.isdir(folder_path):
+    for root, dirs, files in os.walk(folder_path):
+        # Process the files in alphabetical order so the report is stable.
+        for file_name in sorted(files):
+            print(file_name)
+            file_path = os.path.join(root, file_name)
+            with open(file_path, 'r') as file:
+                content = file.read()
+
+            # Files with "train" in their name use the training pattern.
+            pattern = TRAIN_PATTERN if "train" in file_name else VAL_PATTERN
+            throughputs = sorted(float(value) for value in pattern.findall(content))
+
+            # A file without any matching entry (e.g. a failed or unrelated
+            # run) would otherwise cause a ZeroDivisionError below and abort
+            # the whole report.
+            if not throughputs:
+                print("No throughput entries found; skipping.")
+                print()
+                continue
+
+            # Drop the smaller 50% of the values and average the rest.
+            upper_half = throughputs[len(throughputs) // 2:]
+            mean = sum(upper_half) / len(upper_half)
+            print("Mean throughput:", int(mean))
+            print()
+else:
+    print(f"The folder '{folder_path}' does not exist or is not a directory.")
+
+
+
+    
--- a/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/efficientnet_for_pytorch_latest.def
+++ b/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/efficientnet_for_pytorch_latest.def
+Bootstrap: localimage
+From: /proj/nsc_testing/xuan/containers/nvidia_pytorch_21.03-py3.sif
+
+
+%environment
+
+# Keep Python from adding the user site-packages directory to sys.path.
+export PYTHONNOUSERSITE=True
+
+
+%post
+
+# Populate /workspace with the ConvNets example code and install its
+# requirements. A shallow clone is enough because only the current tree is
+# copied and the clone is deleted afterwards.
+cd /workspace
+git clone --depth 1 https://github.com/NVIDIA/DeepLearningExamples.git
+mv DeepLearningExamples/PyTorch/Classification/ConvNets/* ./
+pip install --no-cache-dir -r requirements.txt
+rm -rf DeepLearningExamples
+
+# Fix for https://github.com/NVIDIA/DeepLearningExamples/issues/1113
+# Comment out the set_affinity call and the print that uses its result,
+# both in a single in-place pass over main.py.
+sed -i \
+    -e 's/affinity = set_affinity(args.gpu/#affinity = set_affinity(args.gpu/g' \
+    -e 's/print(f"Training process {args.local_rank} affinity/#print(f"Training process {args.local_rank} affinity/g' \
+    main.py
\ No newline at end of file
--- a/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/generate_benchmark_jobs.sh
+++ b/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/generate_benchmark_jobs.sh
+#!/bin/bash
+# Generate one Slurm batch script for a single benchmark configuration.
+#
+# Usage:
+#   generate_benchmark_jobs.sh DIM NODES GPUS BATCH_SIZE BENCHMARK_MODE NODE_TYPE
+#
+#   BENCHMARK_MODE  "train" selects --mode benchmark_training; any other value
+#                   selects --mode benchmark_inference.
+#   NODE_TYPE       "thin" requests thin nodes; any other value requests fat
+#                   nodes.
+#
+# Required environment: WORK_DIR, CONTAINER_DIR, MASTER_PORT, TIME_RESERVATION.
+# Optional environment: GPU_RESERVATION (the --reservation directive is only
+# written when it is non-empty).
+#
+# The batch script is written to $WORK_DIR/sbatch_scripts/; it is not submitted
+# by this script.
+set -euo pipefail
+
+if [ "$#" -lt 6 ]; then
+    echo "Usage: $0 dim nodes gpus batch_size benchmark_mode node_type" >&2
+    exit 1
+fi
+
+dim=$1
+nodes=$2
+gpus=$3
+batch_size=$4
+benchmark_mode=$5
+node_type=$6
+
+# Fail early instead of silently generating paths such as /sbatch_scripts/...
+: "${WORK_DIR:?WORK_DIR must be set}"
+: "${CONTAINER_DIR:?CONTAINER_DIR must be set}"
+: "${MASTER_PORT:?MASTER_PORT must be set}"
+: "${TIME_RESERVATION:?TIME_RESERVATION must be set}"
+
+# Common stem shared by the batch script, its Slurm output and the report file.
+job_name=benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}
+sbatch_file=$WORK_DIR/sbatch_scripts/${job_name}.sbatch
+sbatch_out_file=$WORK_DIR/sbatch_out/${job_name}.out
+report_file=$WORK_DIR/results/${job_name}_amp.json
+
+if [ "${node_type}" = "thin" ]; then
+    constraint=thin
+else
+    constraint=fat
+fi
+
+if [ "${benchmark_mode}" = "train" ]; then
+    launch_mode=benchmark_training
+else
+    launch_mode=benchmark_inference
+fi
+
+mkdir -p "$WORK_DIR/sbatch_scripts" "$WORK_DIR/sbatch_out"
+
+# The heredoc delimiters are unquoted on purpose: all variables are expanded
+# now, so the generated script is self-contained.
+{
+    cat <<EOT
+#!/bin/bash
+
+#SBATCH -A nsc
+#SBATCH --nodes=${nodes}
+#SBATCH --gres=gpu:${gpus}
+#SBATCH --ntasks-per-node=${gpus}
+#SBATCH --time=$TIME_RESERVATION
+#SBATCH --output=$sbatch_out_file
+#SBATCH -C "${constraint}"
+EOT
+
+    if [ -n "${GPU_RESERVATION:-}" ]; then
+        echo "#SBATCH --reservation=$GPU_RESERVATION"
+    fi
+
+    cat <<EOT
+
+rm -f $report_file
+apptainer exec --nv -B ${WORK_DIR}/data:/data -B ${WORK_DIR}/results:/results --pwd ${WORK_DIR} $CONTAINER_DIR python ./multiproc.py --nnodes ${nodes} --nproc_per_node ${gpus} --master_port ${MASTER_PORT} ./launch.py --model efficientnet-b0 --precision AMP --mode ${launch_mode} --platform DGXA100 /data --raport-file $report_file --epochs 1 --prof 100 --batch-size ${batch_size}
+EOT
+} > "$sbatch_file"
+
--- a/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/submit_benchmark_jobs.sh
+++ b/NVIDIA/DeepLearningExamples/PyTorch/Classification/efficientnet_for_pytorch/submit_benchmark_jobs.sh
+#!/bin/bash
+# Generate and submit one Slurm job per benchmark configuration
+# (node count x GPU count x benchmark mode x node type).
+#
+# For each configuration, generate_benchmark_jobs.sh writes a batch script to
+# $WORK_DIR/sbatch_scripts/, which is then submitted with sbatch.
+set -e
+
+export BENCHMARK_CATEGORY=/proj/nsc_testing/xuan/berzelius-benchmarks/NVIDIA/DeepLearningExamples/PyTorch/Classification
+export MODEL_NAME=efficientnet_for_pytorch
+export MODEL_VERSION=latest
+export MODEL_DIR=${BENCHMARK_CATEGORY}/${MODEL_NAME}
+export WORK_DIR=${MODEL_DIR}/workspace
+export CONTAINER_DIR=/proj/nsc_testing/xuan/containers/${MODEL_NAME}_${MODEL_VERSION}.sif
+export GPU_RESERVATION=
+export TIME_RESERVATION=00:20:00
+
+# Nodes the jobs are pinned to; set NODELIST in the environment to override.
+NODELIST=${NODELIST:-node088,node089}
+
+mkdir -p "${WORK_DIR}/sbatch_out" "${WORK_DIR}/sbatch_scripts" "${WORK_DIR}/results"
+
+# Alternatives: benchmark_modes=("train"), node_types=("thin" "fat")
+benchmark_modes=("train" "predict")
+node_types=("fat")
+
+dim=2
+for nodes in {1..1}; do
+    for gpus in {1..8}; do
+        # Use all GPUs for multi node jobs. The check does not depend on the
+        # inner loop variables, so it is done once per (nodes, gpus) pair.
+        if [ "$nodes" -gt 1 ] && [ "$gpus" -ne 8 ]; then
+            continue
+        fi
+
+        for benchmark_mode in "${benchmark_modes[@]}"; do
+            for node_type in "${node_types[@]}"; do
+
+                # Pick a random rendezvous port. 65535 is the highest valid
+                # TCP port. Assign and export separately so a shuf failure
+                # is caught by `set -e`.
+                MASTER_PORT=$(shuf -i 10000-65535 -n 1)
+                export MASTER_PORT
+                echo "Master port on $MASTER_PORT"
+
+                if [ "${node_type}" = "thin" ]; then
+                    batch_size=512
+                else
+                    batch_size=1024
+                fi
+
+                echo "dim ${dim}, nodes ${nodes}, gpus ${gpus}, batch_size ${batch_size}, benchmark_mode ${benchmark_mode}, node_type ${node_type}"
+
+                bash "${MODEL_DIR}/generate_benchmark_jobs.sh" "${dim}" "${nodes}" "${gpus}" "${batch_size}" "${benchmark_mode}" "${node_type}"
+                sbatch_file=${WORK_DIR}/sbatch_scripts/benchmark_${node_type}_${benchmark_mode}_dim${dim}_nodes${nodes}_gpus${gpus}_batchsize_${batch_size}.sbatch
+
+                # To run the jobs one at a time, add --wait to this command.
+                sbatch --nodelist="${NODELIST}" "${sbatch_file}"
+                echo ""
+            done
+        done
+    done
+done