Commit ae274cb1 authored by Rasmus Ringdahl

refactor: move python scripts to code folder

parent b28f9db2
1 merge request: !4 Dev
@@ -35,7 +35,7 @@ Python needs to be loaded into the environment in order to be accessible, this is
 The single job step is allocated and performed with the __srun__ command.
 #### The python script
-The python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards print to an output file.
+The python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards print to an output file. The python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder.
 - The environment variable __SLURM_JOB_ID__ can be used to create temporary files and folders (as sketched below).
 - The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of CPUs when running in parallel.
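The two environment variables in the list above come straight from Slurm. A minimal sketch of how a script might use them; the `/tmp` base path and the placeholder workload are assumptions for illustration, not part of the example repository:

```python
import os
from multiprocessing import Pool

# SLURM_JOB_ID can be used to build a per-job temporary folder
# (the /tmp base path is a hypothetical choice).
scratch = os.path.join('/tmp', 'job_{}'.format(os.environ.get('SLURM_JOB_ID', 'local')))
os.makedirs(scratch, exist_ok=True)

# SLURM_CPUS_PER_TASK restricts the worker pool to the allocated CPUs;
# fall back to 1 when running outside of Slurm.
cpus = int(os.environ.get('SLURM_CPUS_PER_TASK', '1'))
with Pool(processes=cpus) as pool:
    results = pool.map(abs, [-3, -2, -1])  # placeholder workload
```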
@@ -10,4 +10,4 @@
 module load python/anaconda3-2024.02-3.11.7
 # Start job stage
-srun python parallel_task.py ../data/data_1.txt output_paralell.csv
\ No newline at end of file
+srun python ../code/parallel_task.py ../data/data_1.txt output_paralell.csv
\ No newline at end of file
@@ -10,4 +10,4 @@
 module load python/anaconda3-2024.02-3.11.7
 # Start job stage
-srun python sequential_task.py ../data/data_1.txt output_output_sequential.csv
\ No newline at end of file
+srun python ../code/sequential_task.py ../data/data_1.txt output_output_sequential.csv
\ No newline at end of file
@@ -39,7 +39,7 @@ The job steps are allocated and performed with the __srun__ commands.
 _In this example only the computational step needs multiple CPUs; therefore the srun commands for all job steps except step 3 are set to use 1 CPU per task._
 #### The python script
-The python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards print to an output file.
+The python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards print to an output file. The python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder.
 - The environment variable __SLURM_JOB_ID__ can be used to create temporary files and folders.
 - The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of CPUs when running in parallel.
@@ -21,7 +21,7 @@ srun --cpus-per-task=1 mkdir -v -p ${working_folder}
 srun --cpus-per-task=1 cp -v ${PWD}/../data/${file} ${working_folder}
 # Step 3 - Start job stage
-srun python parallel_task.py ${working_folder}/${file} ${working_folder}/output.csv
+srun python ../code/parallel_task.py ${working_folder}/${file} ${working_folder}/output.csv
 # Step 4 - Compress all csv files.
 srun --cpus-per-task=1 tar -czvf ${working_folder}/output.tar.gz -C ${working_folder} $(cd ${working_folder} && ls *.csv)
from datetime import datetime
from multiprocessing import Pool
import json
import logging
import os
import sys
import time

logger = logging.getLogger(__name__)

def sleep(task) -> int:
    # Simulate a calculation by sleeping for the requested number of seconds.
    task_id, duration = task
    time.sleep(duration)
    logger.info('Task %d done.', task_id)
    return duration

def main(input_file: str, output_file: str):
    # Read environment variables.
    JOB_NAME = os.environ.get('SLURM_JOB_NAME', 'Unknown')
    JOB_ID = os.environ.get('SLURM_JOB_ID', 'Unknown')
    NUMBER_OF_CPUS = os.environ.get('SLURM_CPUS_PER_TASK', 'Unknown')
    if NUMBER_OF_CPUS == 'Unknown':
        logger.error('Unknown number of CPUs, exiting.')
        return
    NUMBER_OF_CPUS = int(NUMBER_OF_CPUS)

    logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID)
    logger.info('Running program with %d CPUs.', NUMBER_OF_CPUS)

    # Read the configuration file and create a list of tasks.
    # This represents the reading of parameters for the calculations.
    logger.info('Reading configuration from %s.', input_file)
    with open(input_file, 'r') as file:
        data = json.load(file)

    tasks = []
    total_time = 0
    for i, duration in enumerate(data['sleep']):
        tasks.append((i, duration))
        total_time = total_time + duration

    # Create a multiprocessing pool restricted to the allocated number of
    # CPUs and submit the tasks to the worker pool.
    tic = datetime.now()
    logger.info('Submitting tasks to pool.')
    with Pool(processes=NUMBER_OF_CPUS) as pool:
        results = pool.map(sleep, tasks)
    toc = datetime.now()
    logger.info('All tasks are done, took %d seconds, compared to %d seconds with a single thread.',
                (toc - tic).seconds, total_time)

    # Write one line per task result to the output file.
    logger.info('Writing result to %s', output_file)
    with open(output_file, 'w') as file:
        file.write('time\n')
        for result in results:
            file.write('{}\n'.format(result))

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    main(input_file, output_file)
    sys.exit(0)
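The script above parses its input with `json.load` and reads a `sleep` list, so the input file must be JSON despite the `.txt` extension used in the job scripts. The repository's `data_1.txt` is not shown in this commit; a minimal input consistent with the parser would look like:

```json
{
    "sleep": [5, 3, 10, 1]
}
```

Each entry becomes one task in the pool, so with enough allocated CPUs the whole run takes roughly as long as the largest entry instead of the sum of all entries.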
@@ -37,7 +37,7 @@ The _config.txt_ is a text file containing a simple table, the first column cont
 For simpler applications the data files could be ignored and the _config.txt_ contains all relevant data.
 #### The python script
-The python script represents the task to be done. In this case the task is to wait a time based on the input data file and print when the waiting is done.
+The python script represents the task to be done. In this case the task is to wait a time based on the input data file and print when the waiting is done. The python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder.
 The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of cores.
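The _config.txt_ itself is not part of this diff; a hypothetical example matching the two-column layout described above (task id, then data file) would be:

```
1 ../data/data_1.txt
2 ../data/data_2.txt
3 ../data/data_3.txt
```

With this file and `SLURM_ARRAY_TASK_ID=2`, the awk lookup in the job script below would set `file` to `../data/data_2.txt`.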
@@ -17,4 +17,4 @@ config=config.txt
 file=$(awk -v task=$SLURM_ARRAY_TASK_ID '$1==task {print $2}' $config)
 # Start job stage
-srun python job_array_task.py ${file} output_${SLURM_ARRAY_TASK_ID}.csv
\ No newline at end of file
+srun python ../code/parallel_task.py ${file} output_${SLURM_ARRAY_TASK_ID}.csv
\ No newline at end of file
from datetime import datetime
from multiprocessing import Pool
import json
import logging
import os
import sys
import time

logger = logging.getLogger(__name__)

def sleep(task) -> int:
    # Simulate a calculation by sleeping for the requested number of seconds.
    task_id, duration = task
    time.sleep(duration)
    logger.info('Task %d done.', task_id)
    return duration

def main(input_file: str, output_file: str):
    # Read environment variables.
    JOB_NAME = os.environ.get('SLURM_JOB_NAME', 'Unknown')
    JOB_ID = os.environ.get('SLURM_JOB_ID', 'Unknown')
    NUMBER_OF_CPUS = os.environ.get('SLURM_CPUS_PER_TASK', 'Unknown')
    if NUMBER_OF_CPUS == 'Unknown':
        logger.error('Unknown number of CPUs, exiting.')
        return
    NUMBER_OF_CPUS = int(NUMBER_OF_CPUS)

    logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID)
    logger.info('Running program with %d CPUs.', NUMBER_OF_CPUS)

    # Read the configuration file and create a list of tasks.
    # This represents the reading of parameters for the calculations.
    logger.info('Reading configuration from %s.', input_file)
    with open(input_file, 'r') as file:
        data = json.load(file)

    tasks = []
    total_time = 0
    for i, duration in enumerate(data['sleep']):
        tasks.append((i, duration))
        total_time = total_time + duration

    # Create a multiprocessing pool restricted to the allocated number of
    # CPUs and submit the tasks to the worker pool.
    tic = datetime.now()
    logger.info('Submitting tasks to pool.')
    with Pool(processes=NUMBER_OF_CPUS) as pool:
        results = pool.map(sleep, tasks)
    toc = datetime.now()
    logger.info('All tasks are done, took %d seconds, compared to %d seconds with a single thread.',
                (toc - tic).seconds, total_time)

    # Write one line per task result to the output file.
    logger.info('Writing result to %s', output_file)
    with open(output_file, 'w') as file:
        file.write('time\n')
        for result in results:
            file.write('{}\n'.format(result))

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    main(input_file, output_file)
    sys.exit(0)
File moved
File moved