diff --git a/1_single_core_job/README.md b/1_single_core_job/README.md deleted file mode 100644 index acda6518f9b946b3e8590f4880c3bd48e91af5d2..0000000000000000000000000000000000000000 --- a/1_single_core_job/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Single core jobs -A single core job is a job with only a single thread. This type of job is used when it is hard or impossible to make use of multiple cores/threads. A simple example could be a data parser that reads a file and transforms it into a more suitable format. - -## How to run -To run the example do the following steps: -1. Log in to Lundgren -2. Change directory to the example code -3. Run `sbatch single_core_job.sh` -4. Check queue status by running `squeue` -5. When the job is completed check the file _single_core_job.log_ - -## Detailed description of the example -The batch script is the main file for the job allocation and preparation. Inside the python script a few environmental variables are fetched and printed out. - -### The batch script -The batch script, _single_core_job.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into environment so it is accessible and lastly the a job step is performed. - -The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readablility the -- method is used. - -- __job-name:__ The name of the job is set to _demo_single_core_ -- __time:__ The requeted time is set to 1 minute, _00:01:00_ -- __ntasks:__ The number of tasks to be performed in this job is set to _1_. -- __cpus-per-task:__ The requested number of cores per task is set to _1_ -- __mem:__ The requested memory is set to _50 MB_ -- __output:__ The standard output should be sent to the file _single_core_job.log_ - -Python needs to be loaded into the environment in order to be accessible this is done in the next step with the __module__ command. - -The job step with the single task is allocated and performed with the __srun__ command. - -#### The python script -The python script represents the taskt to be done. In this case the task is to print out some environment variables that are set by Slurm. - -The environment variable __JOB_ID__ can be used to create temporary files and folders. In this example it creates a file named _<JOB_ID>_.txt and writes the job name into it. \ No newline at end of file diff --git a/1_single_core_job/single_core_job.sh b/1_single_core_job/single_core_job.sh deleted file mode 100644 index e0bd2984d4ab484180eef6b7040eff35da5571c1..0000000000000000000000000000000000000000 --- a/1_single_core_job/single_core_job.sh +++ /dev/null @@ -1,12 +0,0 @@ -#! /bin/bash -#SBATCH --job-name=demo_single_core -#SBATCH --time=00:01:00 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 -#SBATCH --mem=50MB -#SBATCH --output=single_core_job.log - -# Loading Python into the environment -module load python/anaconda3-2024.02-3.11.7 - -srun python single_core_task.py \ No newline at end of file diff --git a/1_single_core_job/single_core_task.py b/1_single_core_job/single_core_task.py deleted file mode 100644 index 9e8e26d1f9b1a1ce66ac181168e6b900bd7d2f75..0000000000000000000000000000000000000000 --- a/1_single_core_job/single_core_task.py +++ /dev/null @@ -1,37 +0,0 @@ -from datetime import datetime - -import logging -import os -import time - -logger = logging.getLogger(__name__) - -def main(): - # Read environment variables. 
- JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown') - JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown') - NUMBER_OF_CORES = os.environ.get('SLURM_CPUS_PER_TASK','Unknown') - MAXIMUM_MEMORY = os.environ.get('SLURM_MEM_PER_NODE','Unknown') - - # Sleeping until next minute. - # This represents the calculations - current_time = datetime.now() - sleep_time = 60 - current_time.second - logger.info('%s - Sleeping for %d seconds.',current_time.strftime('%Y-%m-%d %H:%M:%S'), sleep_time) - time.sleep(sleep_time) - - # Printing some things to standard output. - logger.info('\nJob ID:\t\t\t%s\nJob name:\t\t%s\nAllocated cores:\t%s\nAllocated memory:\t%s', - JOB_ID, JOB_NAME, NUMBER_OF_CORES,MAXIMUM_MEMORY) - - # Writing some output to a file based on the Slurm job id. - output_file = '{}.txt'.format(JOB_ID) - with open(output_file,'w') as file: - file.write('This file was created by the job {} with id {}\n'.format - (JOB_NAME, JOB_ID)) - - logger.info('Job completed.') - -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) - main() diff --git a/1_single_job_step/README.md b/1_single_job_step/README.md new file mode 100644 index 0000000000000000000000000000000000000000..117447e295974afb0dee83e52d6ee1adafcd92b2 --- /dev/null +++ b/1_single_job_step/README.md @@ -0,0 +1,49 @@ +# Single job step +In SLURM, Job Steps are a way to launch distinct parallel (most commonly) and/or sequential tasks from within a single job script. Job Steps are executed using the SLURM command "srun". + +This is an example where Slurm is instructed to run a single job step. A single job step is fast to write and simple to use. + +This folder contains one batch script that runs a sequential program and one batch script that runs a parallel program. The batch scripts themselves, which define the job steps, are similar with only slight modifications in the settings. + +## How to run +To run the example do the following steps: +1. Log in to Lundgren +2. Change directory to the example code +3. Run `sbatch single_job_step_sequential.sh` or `sbatch single_job_step_parallel.sh` +4. Check queue status by running `squeue` +5. When the job is completed check the file _sequential_single_job_step.log_ or _parallel_single_job_step.log_ for the program log. + +_If you run the parallel example, try changing the batch file to run on 1 or 4 cores._ + +## Detailed description of the example +The batch script is the main file for the job allocation and preparation. Inside the Python script a few environment variables are fetched and printed out. + +### The batch script +The batch script, _single_job_step_sequential.sh_ / _single_job_step_parallel.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into the environment so it is accessible, and lastly a job step is performed. + +The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readability the -- method is used. + +- __job-name:__ The name of the job +- __time:__ The requested time +- __ntasks:__ The number of tasks to be performed in this job +- __cpus-per-task:__ The requested number of CPUs per task +- __mem-per-cpu:__ The requested memory per allocated CPU +- __output:__ File name for standard output + +Python needs to be loaded into the environment in order to be accessible; this is done in the next step with the __module__ command. + +The single job step is allocated and performed with the __srun__ command.
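For orientation, here is a minimal, hypothetical Python sketch (not the repository's own script, which is described below) of how a task launched with srun can read the Slurm environment variables it inherits:

```python
import os

# Slurm exports these variables to every job step started with srun.
# The 'Unknown' fallback covers runs outside of Slurm.
job_id = os.environ.get('SLURM_JOB_ID', 'Unknown')
job_name = os.environ.get('SLURM_JOB_NAME', 'Unknown')
cpus_per_task = os.environ.get('SLURM_CPUS_PER_TASK', 'Unknown')

print('Job {} ({}) was allocated {} CPU(s) per task.'.format(job_name, job_id, cpus_per_task))
```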
+ +#### The python script +The Python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards write to an output file. The Python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder. + +- The environment variable __SLURM_JOB_ID__ can be used to create temporary files and folders. +- The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of CPUs when running in parallel. + +### How to set the number of cores in different programming languages and software +Most programming languages and software packages try to make use of all the cores that are available. This can lead to oversubscription of the resources. On a shared resource one must match the maximum used resources with the allocated ones. This section gives a reference for how to do this in commonly used software. + +- __CPLEX:__ Use the parameter _global thread count_. Read more in the [documentation](https://www.ibm.com/docs/en/icos/22.1.2?topic=parameters-global-thread-count) +- __Gurobi:__ Use the configuration parameter _ThreadLimit_. Read more in the [documentation](https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#threadlimit) +- __MATLAB:__ Create an instance of the parpool object with the _poolsize_ set to the number of cores and use the pool when running in parallel. Read more in the [documentation](https://se.mathworks.com/help/parallel-computing/parpool.html) +- __Python:__ If the multiprocessing package is used, create an instance of the Pool class with _processes_ set to the number of cores and use the pool when running in parallel. Read more in the [documentation](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool) \ No newline at end of file diff --git a/1_single_job_step/single_job_step_parallel.sh b/1_single_job_step/single_job_step_parallel.sh new file mode 100644 index 0000000000000000000000000000000000000000..1ebf18993af067f462e7a34fba769ce08f01142c --- /dev/null +++ b/1_single_job_step/single_job_step_parallel.sh @@ -0,0 +1,13 @@ +#! /bin/bash +#SBATCH --job-name=parallel_single_job_step +#SBATCH --time=00:02:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=50MB +#SBATCH --output=parallel_single_job_step.log + +# Loading Python into the environment +module load python/anaconda3-2024.02-3.11.7 + +# Start job stage +srun python ../code/parallel_task.py ../data/data_1.txt output_parallel.csv \ No newline at end of file diff --git a/1_single_job_step/single_job_step_sequential.sh b/1_single_job_step/single_job_step_sequential.sh new file mode 100644 index 0000000000000000000000000000000000000000..376cf5299ff6f2183dcc84b13ee2791dff3b23bd --- /dev/null +++ b/1_single_job_step/single_job_step_sequential.sh @@ -0,0 +1,13 @@ +#! 
/bin/bash +#SBATCH --job-name=sequential_single_job_step +#SBATCH --time=00:02:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=50MB +#SBATCH --output=sequential_single_job_step.log + +# Loading Python into the environment +module load python/anaconda3-2024.02-3.11.7 + +# Start job stage +srun python ../code/sequential_task.py ../data/data_1.txt output_sequential.csv \ No newline at end of file diff --git a/2_multi_core_job/README.md b/2_multi_core_job/README.md deleted file mode 100644 index e89274c2340ea2224b51de8c3d9950e62bafe3ab..0000000000000000000000000000000000000000 --- a/2_multi_core_job/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Multi core jobs -A multi core job is a job that splits the computation to multiple cores. This type of job is the most suitable and most common ones to run on Lundgren. This includes optimization problems and heavy computations. - -## How to run -To run the example do the following steps: -1. Log in to Lundgren -2. Change directory to the example code -3. Run `sbatch multi_core_job.sh` -4. Check queue status by running `squeue` -5. When the job is completed check the file _multi_core_job.log_ - -Try changing the number of cpus in _multi_core_job.sh_ and see the changes in processing time. - -## Detailed description of the example -The batch script is the main file for the job allocation and preparation. Inside the python script a few environmental variables are fetched and printed out. - -### The batch script -The batch script, multi_core_job.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into environment so it is accessible and lastly the a job step is performed. - -The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readablility the -- method is used. - -- __job-name:__ The name of the job is set to demo_multi_core -- __time:__ The requeted time is set to 5 minutes, _00:05:00_ -- __ntasks:__ The number of tasks to be performed in this job is set to _1_. -- __cpus-per-task:__ The requested number of cores per task is set to _2_ -- __mem:__ The requested memory is set to _50 MB_ -- __output:__ The standard output should be sent to the file multi_core_job.log_ - -Python needs to be loaded into the environment in order to be accessible this is done in the next step with the __module__ command. - -The job step with the single task is allocated and performed with the __srun__ command. - -#### The python script -The python script represents the taskt to be done. In this case the task is to wait a random time and print the waiting is done. - -The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of cores. - -### How to set the number of cores in different programing languages and softwares -Most programming languages and softwares tries to make use of all cores that are available. This can lead to an oversubscription on the resources. On a shared resource one must match the maximum used resources with the allocated ones. This section gives a reference in how to do it in commonly used softwares. - -- __CPLEX:__ Use the parameter _global thread count_. Read more in the [documentation](https://www.ibm.com/docs/en/icos/22.1.2?topic=parameters-global-thread-count) -- __Gurobi:__ Use the configuration parameter _ThreadLimit_. 
Read more in the [documentation](https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#threadlimit) -- __MATLAB:__ Create a instance of the parpool object with the _poolsize_ set to the number of cores and use the pool when running in parallell. Read more in the [documentation](https://se.mathworks.com/help/parallel-computing/parpool.html) -- __Python:__ If the multiprocessing package is used, create an instance of the pool class with the _processes_ set to the number of cores and use the pool when running in parallell. Read more in the [documentation](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool) diff --git a/2_multi_core_job/multi_core_job.sh b/2_multi_core_job/multi_core_job.sh deleted file mode 100644 index 139ff3abd50baf9a7b31f73ab4013f76f9c50d70..0000000000000000000000000000000000000000 --- a/2_multi_core_job/multi_core_job.sh +++ /dev/null @@ -1,13 +0,0 @@ -#! /bin/bash -#SBATCH --job-name=demo_multi_core -#SBATCH --time=00:05:00 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=2 -#SBATCH --mem-per-cpu=50MB -#SBATCH --output=multi_core_job.log - -# Loading Python into the environment -module load python/anaconda3-2024.02-3.11.7 - -# Start job stage -srun python multi_core_task.py \ No newline at end of file diff --git a/2_multi_core_job/multi_core_task.py b/2_multi_core_job/multi_core_task.py deleted file mode 100644 index 1879628bc4ca0073beab543afb3b47a110b719ac..0000000000000000000000000000000000000000 --- a/2_multi_core_job/multi_core_task.py +++ /dev/null @@ -1,49 +0,0 @@ -from datetime import datetime -from multiprocessing import Pool -import logging -import os -import random -import time - -logger = logging.getLogger(__name__) - -def sleep(input): - time.sleep(input[1]) - logger.info('Task %d done.',input[0]) - -def main(): - # Read environment variables. - NUMBER_OF_CORES = os.environ.get('SLURM_CPUS_PER_TASK','Unknown') - if NUMBER_OF_CORES in 'Unknown': - logger.error('Unkown number of cores, exiting.') - return - - NUMBER_OF_CORES = int(NUMBER_OF_CORES) - logger.info('Running program with %d cores.',NUMBER_OF_CORES) - - # Creating a list of tasks to be performed - # This represents the calculations - random.seed(1) - tasks = [] - total_time = 0 - for i in range(10): - time = random.randrange(1,29) - tasks.append((i, time)) - total_time = total_time + time - - # Creating a multiprocessing pool to perform the tasks - pool = Pool(processes=NUMBER_OF_CORES) - - # Running submitting the tasks to the worker pool - tic = datetime.now() - logger.info('Submitting tasks to pool.') - pool.map(sleep, tasks) - toc = datetime.now() - - logger.info('All tasks are done, took %d seconds, compared to %d seconds with single thread.', - (toc-tic).seconds, total_time) - - -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) - main() diff --git a/2_multiple_job_steps/README.md b/2_multiple_job_steps/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8fcb279422c14479ca3018077adcd5956d9b51f --- /dev/null +++ b/2_multiple_job_steps/README.md @@ -0,0 +1,53 @@ +# Multiple job steps +In SLURM, Job Steps are a way to launch distinct parallel (most commonly) and/or sequential tasks from within a single job script. Job Steps are executed using the SLURM command "srun". + +Multiple job steps are good to use when a chain of tasks is needed. The chain of tasks could be pre-processing -> calculations -> post-processing. In this example Slurm is instructed to run multiple job steps. 
In the example, input data is first copied to local storage, then some calculations are done, and lastly the output is compressed and sent back to the home folder. + +## How to run +To run the example do the following steps: +1. Log in to Lundgren +2. Change directory to the example code +3. Run `sbatch multiple_job_steps.sh` +4. Check queue status by running `squeue` +5. When the job is completed check the file _multiple_job_step.log_. + +## Detailed description of the example +The batch script is the main file for the job allocation and preparation. Inside the Python script a few environment variables are fetched and printed out. + +### The batch script +The batch script, _multiple_job_steps.sh_, contains three sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into the environment so it is accessible, and lastly all the job steps are performed. + +The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readability the -- method is used. + +- __job-name:__ The name of the job +- __time:__ The requested time +- __ntasks:__ The number of tasks to be performed in this job +- __cpus-per-task:__ The requested number of CPUs per task +- __mem-per-cpu:__ The requested memory per allocated CPU +- __output:__ File name for standard output + +Python needs to be loaded into the environment in order to be accessible; this is done in the next step with the __module__ command. + +The job steps are allocated and performed with the __srun__ commands. +1. A folder is created with the same name as the Job ID on the local hard drive in the data folder of Lundgren, _/local/data1/<LiU-ID>_. +2. Input data files are copied to the newly created folder. +3. The third step is the computational step of the job. +4. The output files are compressed. +5. The compressed output files are moved to the home folder. +6. The folder with the data is removed and, if the <LiU-ID> folder in the data folder is empty, it is removed as well. + +_In this example only the computational step needs multiple CPUs; therefore the srun commands for all job steps except step 3 are set to use 1 CPU per task._ + +#### The python script +The Python script represents the task to be done. In this case the task is to read an input file, wait to simulate a calculation, and afterwards write to an output file. The Python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder. + +- The environment variable __SLURM_JOB_ID__ can be used to create temporary files and folders. +- The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of CPUs when running in parallel. + +### How to set the number of cores in different programming languages and software +Most programming languages and software packages try to make use of all the cores that are available. This can lead to oversubscription of the resources. On a shared resource one must match the maximum used resources with the allocated ones. This section gives a reference for how to do this in commonly used software. + +- __CPLEX:__ Use the parameter _global thread count_. Read more in the [documentation](https://www.ibm.com/docs/en/icos/22.1.2?topic=parameters-global-thread-count) +- __Gurobi:__ Use the configuration parameter _ThreadLimit_. 
Read more in the [documentation](https://docs.gurobi.com/projects/optimizer/en/current/reference/parameters.html#threadlimit) +- __MATLAB:__ Create an instance of the parpool object with the _poolsize_ set to the number of cores and use the pool when running in parallel. Read more in the [documentation](https://se.mathworks.com/help/parallel-computing/parpool.html) +- __Python:__ If the multiprocessing package is used, create an instance of the Pool class with _processes_ set to the number of cores and use the pool when running in parallel. Read more in the [documentation](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool) \ No newline at end of file diff --git a/2_multiple_job_steps/multiple_job_steps.sh b/2_multiple_job_steps/multiple_job_steps.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa10b1c1a78c65acd4ae3b69c51a8534409540bc --- /dev/null +++ b/2_multiple_job_steps/multiple_job_steps.sh @@ -0,0 +1,36 @@ +#! /bin/bash +#SBATCH --job-name=multiple_job_step +#SBATCH --time=00:02:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=50MB +#SBATCH --output=multiple_job_step.log + +# Loading Python into the environment +module load python/anaconda3-2024.02-3.11.7 + +# Specify input file +file=data_4.txt +temporary_folder=/local/data1/${USER} +working_folder=${temporary_folder}/${SLURM_JOB_ID} + +# Step 1 - Create a temporary folder to store data in. +srun --cpus-per-task=1 mkdir -v -p ${working_folder} + +# Step 2 - Copy input data to the temporary folder. +srun --cpus-per-task=1 cp -v ${PWD}/../data/${file} ${working_folder} + +# Step 3 - Start job stage +srun python ../code/parallel_task.py ${working_folder}/${file} ${working_folder}/output.csv + +# Step 4 - Compress all csv files. +srun --cpus-per-task=1 tar -czvf ${working_folder}/output.tar.gz -C ${working_folder} $(cd ${working_folder} && ls *.csv) + +# Step 5 - Move output data to home folder +srun --cpus-per-task=1 mv -v ${working_folder}/output.tar.gz ${PWD} + +# Step 6a - Remove temporary files. +srun --cpus-per-task=1 rm -rfv ${working_folder} + +# Step 6b - Remove the temporary folder if it is empty +srun --cpus-per-task=1 test -n "$(ls -A "$temporary_folder")" || rmdir -v "$temporary_folder" \ No newline at end of file diff --git a/3_job_array/README.md b/3_job_array/README.md index 62c3a276d2f8b7dc1f7cec82897f421272647220..4ebc688dfb76b5bdc584699f1981018f9ccfa719 100644 --- a/3_job_array/README.md +++ b/3_job_array/README.md @@ -7,25 +7,23 @@ To run the example do the following steps: 2. Change directory to the example code 3. Run `sbatch job_array.sh` 4. Check queue status by running `squeue` -5. When the job is completed check the file _job_array_XXX_YY.log_ - -Try to extend the job array with one more run by adding a new file in the _data_ folder and update the _config.txt_ file. +5. When the job is completed check the file _job_array_<JOB_ID>_<ARRAY_TASK_ID>.log_ ## Detailed description of the example -The batch script is the main file for the job allocation and preparation. Inside the python script a few environmental variables are fetched and printed out. Furthermore there are a folder, _data_ that contains some figurative input data and a _config.txt_ file that maps the array to the input data. +The batch script is the main file for the job allocation and preparation. Inside the Python script a few environment variables are fetched and printed out. Furthermore there is a _config.txt_ file that maps the array to the input data.
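The batch script described below performs this lookup with awk; purely as an illustrative sketch (this helper is hypothetical and not part of the repository), the same array-index-to-file mapping could be done in Python like this:

```python
import os

# Slurm sets SLURM_ARRAY_TASK_ID for each job in the array.
task_id = os.environ['SLURM_ARRAY_TASK_ID']

# config.txt: a header line, then one "<task> <file>" pair per line.
with open('config.txt') as config:
    next(config)  # skip the "task file" header
    mapping = dict(line.split() for line in config if line.strip())

print('Array task {} uses input file {}'.format(task_id, mapping[task_id]))
```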
### The batch script The batch script, _job_array.sh_, contains four sections. The first section contains input arguments to the Slurm scheduler. The second section loads Python into the environment so it is accessible. In the third section the _config.txt_ file is read and the filename corresponding to the array index is stored. Lastly the job step is performed with the relevant filename as input argument. The input arguments are defined with a comment beginning with SBATCH followed by the argument key and value. For easier readability the -- method is used. -- __job-name:__ The name of the job is set to _demo_job_array_ -- __time:__ The requeted time is set to 5 minutes, _00:05:00_ -- __ntasks:__ The number of tasks to be performed in this job is set to _1_. -- __cpus-per-task:__ The requested number of cores per task is set to _2_ -- __mem:__ The requested memory is set to _50 MB_ -- __output:__ The standard output should be sent to the file _job_array_%A_%a.log__, the %A will expand to the job number and %a will expand to the array index. -- __array:__ The array is set t _1-3_. This represents a list of array ids that should be created. Each id will be a separate job. The array can be of any numbering that suites the user. +- __job-name:__ The name of the job +- __time:__ The requested time +- __ntasks:__ The number of tasks to be performed in this job +- __cpus-per-task:__ The requested number of CPUs per task +- __mem-per-cpu:__ The requested memory per allocated CPU +- __output:__ File name for standard output +- __array:__ The IDs of the array. _Note: Multiple similar jobs will be run and output files need to be handled so that they are not overwritten._ @@ -36,12 +34,10 @@ The job step with the single task is allocated and performed with the __srun__ c #### The configuration file and data files The _config.txt_ is a text file containing a simple table: the first column contains the array index and the second column contains the filepath to the data file to be loaded into the job. It is important that the indices in the file match the _--array_ argument. -The data files in this example is a simple json object but could be a CSV-file or other file formats. - For simpler applications the data files could be ignored and the _config.txt_ contains all relevant data. #### The python script -The python script represents the task to be done. In this case the task is to wait a time based on the input data file and print the waiting is done. +The Python script represents the task to be done. In this case the task is to wait a time based on the input data file and print when the waiting is done. The Python script can be found in the _[code](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/code)_ folder. The environment variable __SLURM_CPUS_PER_TASK__ is used to restrict the worker pool to the allocated number of cores.
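As a minimal sketch of that pattern (the repository's full version is in _code/parallel_task.py_; the `work` function here is just a hypothetical placeholder), the pool size can be taken straight from the environment variable:

```python
import os
from multiprocessing import Pool

def work(item):
    # Placeholder for the real per-task computation.
    return item * item

if __name__ == '__main__':
    # Fall back to 1 CPU when the variable is not set, e.g. outside Slurm.
    cpus = int(os.environ.get('SLURM_CPUS_PER_TASK', '1'))

    # Restrict the worker pool to the CPUs Slurm actually allocated.
    with Pool(processes=cpus) as pool:
        results = pool.map(work, range(10))

    print(results)
```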
diff --git a/3_job_array/config.txt b/3_job_array/config.txt index b0170826ca4cd98eb37b180a1c74658328004eca..30738612f796242a5cddfd091991ab7cbd90d173 100644 --- a/3_job_array/config.txt +++ b/3_job_array/config.txt @@ -1,5 +1,6 @@ task file -1 data/parameters_first_job.txt -2 data/parameters_second_job.txt -3 data/parameters_third_job.txt +1 ../data/data_1.txt +2 ../data/data_2.txt +3 ../data/data_3.txt +4 ../data/data_4.txt diff --git a/3_job_array/data/parameters_first_job.txt b/3_job_array/data/parameters_first_job.txt deleted file mode 100644 index 157fbdd75311373eefecd941ca470da3495cdf1f..0000000000000000000000000000000000000000 --- a/3_job_array/data/parameters_first_job.txt +++ /dev/null @@ -1 +0,0 @@ -{"name": "First file", "sleep": [2,12,7,4,4]} \ No newline at end of file diff --git a/3_job_array/data/parameters_second_job.txt b/3_job_array/data/parameters_second_job.txt deleted file mode 100644 index b87fdb1475706cfa574605a3e894a47be42860b3..0000000000000000000000000000000000000000 --- a/3_job_array/data/parameters_second_job.txt +++ /dev/null @@ -1 +0,0 @@ -{"name": "Second file", "sleep": [3,10,4,11,2]} \ No newline at end of file diff --git a/3_job_array/data/parameters_third_job.txt b/3_job_array/data/parameters_third_job.txt deleted file mode 100644 index 35894cee98d17d75a541af51933f486c5877980e..0000000000000000000000000000000000000000 --- a/3_job_array/data/parameters_third_job.txt +++ /dev/null @@ -1 +0,0 @@ -{"name": "Third file", "sleep": [12,3,14,10,20]} \ No newline at end of file diff --git a/3_job_array/job_array.sh b/3_job_array/job_array.sh index f448025d2b58cb83ec31af3974e78c477c76bf05..e8732bc8fb4be466a62d0b99ef145ae5958cea2d 100644 --- a/3_job_array/job_array.sh +++ b/3_job_array/job_array.sh @@ -5,7 +5,7 @@ #SBATCH --cpus-per-task=2 #SBATCH --mem-per-cpu=50MB #SBATCH --output=job_array_%A_%a.log -#SBATCH --array=1-3 +#SBATCH --array=1-4 # Loading Python into the environment module load python/anaconda3-2024.02-3.11.7 @@ -17,4 +17,4 @@ config=config.txt file=$(awk -v task=$SLURM_ARRAY_TASK_ID '$1==task {print $2}' $config) # Start job stage -srun python job_array_task.py ${file} \ No newline at end of file +srun python ../code/parallel_task.py ${file} output_${SLURM_ARRAY_TASK_ID}.csv \ No newline at end of file diff --git a/3_job_array/job_array_task.py b/3_job_array/job_array_task copy.py similarity index 100% rename from 3_job_array/job_array_task.py rename to 3_job_array/job_array_task copy.py diff --git a/README.md b/README.md index b211785b977a7d4ef79a8ee772870493793cda6f..a615788eebe9d3cf145afc1d17214c4b3978b0c1 100644 --- a/README.md +++ b/README.md @@ -8,26 +8,24 @@ Each example is located in a separate folder with different topics and detailed The best way to use the examples is to try them. Download the repository by cloning it to your hard drive. If you are on Lundgren you can clone it to a suitable folder under your home folder. If you are on your personal computer you can clone it into a suitable folder under the personal LiU-cloud folder, _P:_ (_\\ad.liu.se\home\<LiU-ID>_) -All the examples will use the method of creating a job and submit it to the Slurm scheduler as a background process. Each example will use a bash file, _.sh_ as a base for instructing Slurm on the job steps to perform. Each job step uses the command ___srun___ or ___sbatch___ to perform one or several tasks. +All the examples will use the method of creating a job and submitting it to the Slurm scheduler as a background process.
Each example will use a bash file, _.sh_, as a base for instructing Slurm on the job steps to perform. Each job step uses the command ___srun___ to perform one or several tasks. ## Examples This section describes the different examples briefly so you can find an example that fits your needs. __Note:__ _Since our Slurm cluster only has one node there can be a queue for running the examples depending on the workload of Lundgren._ -#### Example 1 - Single core job -A single core job is a job with only a single thread. This type of job is used when it is hard or impossible to make use of multiple cores/threads. +#### Example 1 - Single job step +This example creates a job with a single job step. This type of job is easy and fast to set up. There are two different batch files, one with a sequential program and one with a parallel program. -A simple example could be a data parser that reads a file and transforms it into a more suitable format. +Learn more about the [example](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/1_single_job_step). -Learn more about the [example](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/1_single_core_job). +#### Example 2 - Multiple job steps +This example creates a job with multiple steps. This type of job is good to use if there is a need for pre- or post-processing. -#### Example 2 - Mutli core job -A multi core job is a job that splits the computation to multiple cores. This type of job is the most suitable and most common ones to run on Lundgren. This includes optimization problems and heavy computations. - -Learn more about the [example](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/2_multi_core_job). +Learn more about the [example](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/2_multiple_job_steps). #### Example 3 - Job arrays -A job array is a method to queue multiple jobs with similar resource needs. Job arrays are suitable when the same type of computation is needed to be run multiple times with different input data. +A job array is a method to queue multiple jobs with similar resource needs. Job arrays are suitable when the same type of computation needs to be run multiple times with different input data. A job array can use multiple job steps, and they are performed in every job in the array. Learn more about the [example](https://gitlab.liu.se/rasri17/lundgren-examples/-/tree/main/3_job_array). \ No newline at end of file diff --git a/code/parallel_task.py b/code/parallel_task.py new file mode 100644 index 0000000000000000000000000000000000000000..e579a6fe4f724c14b8ac6aef0464b1b9a47f1875 --- /dev/null +++ b/code/parallel_task.py @@ -0,0 +1,69 @@ +from datetime import datetime +from multiprocessing import Pool + +import json +import logging +import os +import sys +import time + +logger = logging.getLogger(__name__) + +def sleep(input) -> int: + time.sleep(input[1]) + logger.info('Task %d done.',input[0]) + + return input[1] + +def main(input_file: str, output_file: str): + # Read environment variables.
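+    # SLURM_JOB_NAME and SLURM_JOB_ID identify the job, while SLURM_CPUS_PER_TASK is used further down to size the worker pool; the 'Unknown' fallback covers runs outside of Slurm.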
+ JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown') + JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown') + NUMBER_OF_CPUS = os.environ.get('SLURM_CPUS_PER_TASK','Unknown') + if NUMBER_OF_CPUS == 'Unknown': + logger.error('Unknown number of CPUs, exiting.') + return + + NUMBER_OF_CPUS = int(NUMBER_OF_CPUS) + logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID) + logger.info('Running program with %d CPUs.',NUMBER_OF_CPUS) + + # Reading the configuration file and creating a list of tasks + # This represents the reading of parameters and calculations + logger.info('Reading configuration from %s.',input_file) + with open(input_file, 'r') as file: + data = json.load(file) + + tasks = [] + total_time = 0 + for i in range(len(data['sleep'])): + time = data['sleep'][i] + tasks.append((i, time)) + total_time = total_time + time + + # Creating a multiprocessing pool to perform the tasks + pool = Pool(processes=NUMBER_OF_CPUS) + + # Submitting the tasks to the worker pool + tic = datetime.now() + logger.info('Submitting tasks to pool.') + results = pool.map(sleep, tasks) + toc = datetime.now() + + logger.info('All tasks are done, took %d seconds, compared to %d seconds with single thread.', + (toc-tic).seconds, total_time) + + logger.info('Writing result to %s', output_file) + with open(output_file, 'w') as file: + file.write('time\n') + for result in results: + file.write('{}\n'.format(result)) + + logger.info('Done.') + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + input_file = sys.argv[1] + output_file = sys.argv[2] + main(input_file, output_file) + sys.exit(0) diff --git a/code/sequential_task.py b/code/sequential_task.py new file mode 100644 index 0000000000000000000000000000000000000000..0862c077978d01286f96b097bd965fc6bd3b9dcb --- /dev/null +++ b/code/sequential_task.py @@ -0,0 +1,63 @@ +from datetime import datetime +from multiprocessing import Pool + +import json +import logging +import os +import sys +import time + +logger = logging.getLogger(__name__) + +def sleep(input) -> int: + time.sleep(input[1]) + logger.info('Task %d done.',input[0]) + + return input[1] + +def main(input_file: str, output_file: str): + # Read environment variables. + JOB_NAME = os.environ.get('SLURM_JOB_NAME','Unknown') + JOB_ID = os.environ.get('SLURM_JOB_ID','Unknown') + + logger.info('**** Output for job %s (%s) ****', JOB_NAME, JOB_ID) + logger.info('Running program sequentially.') + + # Reading the configuration file and creating a list of tasks + # This represents the reading of parameters and calculations + logger.info('Reading configuration from %s.',input_file) + with open(input_file, 'r') as file: + data = json.load(file) + + tasks = [] + results = [] + total_time = 0 + for i in range(len(data['sleep'])): + time = data['sleep'][i] + tasks.append((i, time)) + total_time = total_time + time + + # Running the tasks in sequence.
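+    # Timing the sequential loop makes the log output directly comparable with the parallel version of the task.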
+ tic = datetime.now() + logger.info('Running tasks sequentially.') + for task in tasks: + results.append(sleep(task)) + toc = datetime.now() + + logger.info('All tasks are done, took %d seconds; the total requested sleep time was %d seconds.', + (toc-tic).seconds, total_time) + + logger.info('Writing result to %s', output_file) + with open(output_file, 'w') as file: + file.write('time\n') + for result in results: + file.write('{}\n'.format(result)) + + logger.info('Done.') + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + input_file = sys.argv[1] + output_file = sys.argv[2] + main(input_file, output_file) + sys.exit(0) diff --git a/data/data_1.txt b/data/data_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..689136b83c2ca6cf3d28734667f898d0c2748cdb --- /dev/null +++ b/data/data_1.txt @@ -0,0 +1 @@ +{"name": "First data file", "sleep": [8,24,14,8,5]} \ No newline at end of file diff --git a/data/data_2.txt b/data/data_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..f053d92dd1b96a65cc76772cea3c271e5c164737 --- /dev/null +++ b/data/data_2.txt @@ -0,0 +1 @@ +{"name": "Second data file", "sleep": [2,20,8,19,10]} \ No newline at end of file diff --git a/data/data_3.txt b/data/data_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..db11e7d40341c849bb2c88f815a58b974dd9521b --- /dev/null +++ b/data/data_3.txt @@ -0,0 +1 @@ +{"name": "Third data file", "sleep": [15,6,18,15,5]} \ No newline at end of file diff --git a/data/data_4.txt b/data/data_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..00493762730f0133be5793b5216e1a16ca357521 --- /dev/null +++ b/data/data_4.txt @@ -0,0 +1 @@ +{"name": "Fourth data file", "sleep": [6,2,6,1,6,2,2,6,2,1,6,1,1,1,2,2,1,1,2,2,1,2,1,1,1]} \ No newline at end of file