diff --git a/.gitignore b/.gitignore index 259d9fe..5c07472 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ *.Rproj .Rhistory .RData +*__pycache__/ *.snakemake diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..931bcc2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +repos: +- repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update + rev: v0.6.0post1 # Insert the latest tag here + hooks: + - id: pre-commit-update + args: [--exclude, black, --keep, isort] + # Formats import order +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + args: ["--profile", "black", "--filter-files"] + + #Code formatter for both python files and jupyter notebooks +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black-jupyter + - id: black + language_version: python3.10 + +- repo: https://github.com/nbQA-dev/nbQA + rev: 1.9.0 + hooks: + - id: nbqa-isort + additional_dependencies: [isort==5.6.4] + args: [--profile=black] + + + # remove unused imports +- repo: https://github.com/hadialqattan/pycln.git + rev: v2.4.0 + hooks: + - id: pycln + + # additional hooks found with in the pre-commit lib +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace # removes trailing white spaces + - id: mixed-line-ending # removes mixed end of line + args: + - --fix=lf + - id: pretty-format-json # JSON Formatter + args: + - --autofix + - --indent=4 + - --no-sort-keys diff --git a/lectures/10.hpc_and_parallel_compute/README.md b/lectures/10.hpc_and_parallel_compute/README.md new file mode 100644 index 0000000..65eb3ca --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/README.md @@ -0,0 +1,26 @@ +# Lecture 10: High performance computing and parallel computing + +This lecture will cover parallel computing and high performance computing. +We have the following learning objectives: +1. 
Familiarize with the concept of parallel computing +2. Understand how to leverage parallel computing +3. Learn about high performance computing +4. Understand how to leverage high performance computing +5. Learn how to use HPC resources and best practices + +We will be using some pre-written scripts to explore parallel computing and high performance computing. +The following scripts are available in the [scripts](./scripts) directory: +* [analyze_sequences](scripts/analyze_sequences.py) + * This script contains the core sequence analysis function that we use to analyze sequences. + Note that this script can be run for a single sequence in a serial fashion, but we will also call it in parallel. +* [multiprocessing_run](scripts/multiprocessing_run.sh) + * This script runs itself in parallel using the `multiprocessing` module in Python. + This shell script calls the `multiprocessing_sequence_analysis.py` script below. +* [multiprocessing_sequence_analysis](scripts/multiprocessing_sequence_analysis.py) + * The script is called by the `multiprocessing_run.sh` script. +* [plot_parallel_compute_analysis](scripts/plot_parallel_compute_analysis.py) + * This script plots the results of the parallel computing analysis. +* [serial_run](scripts/serial_run.sh) + * This script runs the `analyze_sequences.py` script in serial. +* [submit_jobs_HPC](scripts/submit_jobs_HPC.sh) + * This script submits jobs to the HPC cluster in an array job. diff --git a/lectures/10.hpc_and_parallel_compute/SLURM_cheatsheet.md b/lectures/10.hpc_and_parallel_compute/SLURM_cheatsheet.md new file mode 100644 index 0000000..e21c554 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/SLURM_cheatsheet.md @@ -0,0 +1,79 @@ +# Slurm Guide + +For bash scripts, this line should be the first line of code in every script +``` +#!/bin/bash # Shebang slash bin slash bash +``` + +Next are the SBATCH directives that tell the Slurm scheduler how to handle your job. 
+These directives should be at the top of your script, but under the shebang line. + +### Frequent SLURM directives +``` +#SBATCH --job-name=parallel_job # job name +#SBATCH -t 1-23:59:59 # D-HH:MM:SS +#SBATCH -t 59 # MM +#SBATCH -t 59:59 # MM:SS +#SBATCH -t 59:59:59 # HH:MM:SS +#SBATCH -t 1-23 # D-HH +#SBATCH -t 1-23:59 # D-HH:MM +#SBATCH --mem=16G # 16 Gigabytes +#SBATCH --output=out_%j.log +#SBATCH --ntasks # number of tasks +#SBATCH --mail-type=NONE, BEGIN, END, FAIL, ALL # email events +#SBATCH --mail-user=email@ufl.edu +``` +### Slurm Commands +#### Environment modules +``` +module purge # removes all modules +module avail # lists all modules available for loading +module list # list all currently loaded modules +module load # loads module (hint: use the tab key to autocomplete) +``` +#### Submitting a job +``` +sbatch script.sh # submit script.sh +``` +#### Checking job status +``` +squeue -u {User} # check submitted jobs in queue +``` +#### Canceling a job or all jobs +``` +scancel {jobid} # Cancel job +scancel -u {User} # Cancel all jobs for user +``` +#### Check job details +``` +jobstats $USER {days} # Check job stats for user for the last {days} +``` +#### Check job efficiency +``` +seff {jobid} +``` +#### Check fairshare +``` +module load slurmtools +levelfs $USER +``` +#### Check user and institution account billings +``` +suuser $USER +suacct amc-general +``` + +#### Example SBATCH +``` +#!/bin/bash +#SBATCH --job-name=Slurm_job # job name "slurm_job" +#SBATCH -t 1-23 # Time 1 day, 23 hours +#SBATCH --mem=16G # 16 Gigabytes of RAM +#SBATCH --output=out_%j.log # std output/error file + +#SBATCH --mail-type=END,FAIL # send email on job end/fail +#SBATCH --mail-user=email@ufl.edu # send email to this address + +module load python/3.9.6 +module list +``` diff --git a/lectures/10.hpc_and_parallel_compute/data/parallel_compute_analysis.csv b/lectures/10.hpc_and_parallel_compute/data/parallel_compute_analysis.csv new file mode 100644 index 0000000..087e6ad 
--- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/data/parallel_compute_analysis.csv @@ -0,0 +1,111 @@ +sequences,time_per_sequence(s),core_count +10,0.5,1 +100,0.5,1 +1000,0.5,1 +10000,0.5,1 +100000,0.5,1 +1000000,0.5,1 +10000000,0.5,1 +100000000,0.5,1 +1000000000,0.5,1 +10000000000,0.5,1 +10,0.5,2 +100,0.5,2 +1000,0.5,2 +10000,0.5,2 +100000,0.5,2 +1000000,0.5,2 +10000000,0.5,2 +100000000,0.5,2 +1000000000,0.5,2 +10000000000,0.5,2 +10,0.5,4 +100,0.5,4 +1000,0.5,4 +10000,0.5,4 +100000,0.5,4 +1000000,0.5,4 +10000000,0.5,4 +100000000,0.5,4 +1000000000,0.5,4 +10000000000,0.5,4 +10,0.5,8 +100,0.5,8 +1000,0.5,8 +10000,0.5,8 +100000,0.5,8 +1000000,0.5,8 +10000000,0.5,8 +100000000,0.5,8 +1000000000,0.5,8 +10000000000,0.5,8 +10,0.5,16 +100,0.5,16 +1000,0.5,16 +10000,0.5,16 +100000,0.5,16 +1000000,0.5,16 +10000000,0.5,16 +100000000,0.5,16 +1000000000,0.5,16 +10000000000,0.5,16 +10,0.5,32 +100,0.5,32 +1000,0.5,32 +10000,0.5,32 +100000,0.5,32 +1000000,0.5,32 +10000000,0.5,32 +100000000,0.5,32 +1000000000,0.5,32 +10000000000,0.5,32 +10,0.5,64 +100,0.5,64 +1000,0.5,64 +10000,0.5,64 +100000,0.5,64 +1000000,0.5,64 +10000000,0.5,64 +100000000,0.5,64 +1000000000,0.5,64 +10000000000,0.5,64 +10,0.5,128 +100,0.5,128 +1000,0.5,128 +10000,0.5,128 +100000,0.5,128 +1000000,0.5,128 +10000000,0.5,128 +100000000,0.5,128 +1000000000,0.5,128 +10000000000,0.5,128 +10,0.5,256 +100,0.5,256 +1000,0.5,256 +10000,0.5,256 +100000,0.5,256 +1000000,0.5,256 +10000000,0.5,256 +100000000,0.5,256 +1000000000,0.5,256 +10000000000,0.5,256 +10,0.5,512 +100,0.5,512 +1000,0.5,512 +10000,0.5,512 +100000,0.5,512 +1000000,0.5,512 +10000000,0.5,512 +100000000,0.5,512 +1000000000,0.5,512 +10000000000,0.5,512 +10,0.5,1024 +100,0.5,1024 +1000,0.5,1024 +10000,0.5,1024 +100000,0.5,1024 +1000000,0.5,1024 +10000000,0.5,1024 +100000000,0.5,1024 +1000000000,0.5,1024 +10000000000,0.5,1024 diff --git a/lectures/10.hpc_and_parallel_compute/data/sequences_to_analyze.txt 
b/lectures/10.hpc_and_parallel_compute/data/sequences_to_analyze.txt new file mode 100644 index 0000000..3953de6 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/data/sequences_to_analyze.txt @@ -0,0 +1,10 @@ +GCXCCXAGGGTTGCAGTCAAATGTCCA +CGGCCAATGAGGGXCGCXTAGGTCAT +TAGGTGGATACCXCTXATATATGATT +CCXATATTAAGACATATAATTGGAGG +TATTACACGCCCAAATAATTTGGCXA +TCAGCXGCXGGGAAGCGGGCGCXATACT +CGGATGATCATCXGGGATGATGTCTA +GCGCCXGGAAGACGAATCTTAATTA +TTAGGAACXTXXCAATATGTTTCGGT +ACTTCTATGTCTXTGGATTACAAACA diff --git a/lectures/10.hpc_and_parallel_compute/environments/README.md b/lectures/10.hpc_and_parallel_compute/environments/README.md new file mode 100644 index 0000000..88b509d --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/environments/README.md @@ -0,0 +1,10 @@ +# Environment creation +We need to create the environment needed for this lecture and hands on activity. +To do so run the following command from this directory: +```bash +conda env create -f parallel_and_hpc_compute_env.yaml +``` +OR +```bash +mamba env create -f parallel_and_hpc_compute_env.yaml +``` diff --git a/lectures/10.hpc_and_parallel_compute/environments/parallel_and_hpc_compute_env.yaml b/lectures/10.hpc_and_parallel_compute/environments/parallel_and_hpc_compute_env.yaml new file mode 100644 index 0000000..a97fa31 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/environments/parallel_and_hpc_compute_env.yaml @@ -0,0 +1,16 @@ +name: parallel_and_hpc_compute_env +channels: + - conda-forge + - defaults +dependencies: + - python=3.11 + - conda-forge::pandas + - conda-forge::jupyter + - conda-forge::ipykernel + - conda-forge::nbconvert + - conda-forge::pip + - conda-forge::matplotlib + - conda-forge::seaborn + - pip: + - argparse + diff --git a/lectures/10.hpc_and_parallel_compute/hands_on_activity/5mc_sequence_analysis_activity.md b/lectures/10.hpc_and_parallel_compute/hands_on_activity/5mc_sequence_analysis_activity.md new file mode 100644 index 0000000..b71d9b5 --- /dev/null +++ 
b/lectures/10.hpc_and_parallel_compute/hands_on_activity/5mc_sequence_analysis_activity.md @@ -0,0 +1,24 @@ +# Hands on: 5mC sequence analysis activity + +You want to identify the 5mC content in each of 10 sequences. +Where X is 5mC and C is cytosine. +The goal is to identify the number of 5mC in each sequence by using multiple compute approaches. +For the sequences below, identify the number of 5mC in each sequence by using the following approaches: +* Serial approach +* Parallel approach + * Python multiprocessing approach + * GNU parallel approach +* HPC approach + +Sequences: +0. GCXCCXAGGGTTGCAGTCAAATGTCC +1. ACTTCTATGTCTXTGGATTACAAACA +2. CGGCCAATGAGGGXCGCXTAGGTCAT +3. TAGGTGGATACCXCTXATATATGATT +4. CCXATATTAAGACATATAATTGGAGG +5. TATTACACGCCCAAATAATTTGGCXA +6. TCAGCXGCXGGGAAGCGGGCGCXATA +7. CGGATGATCATCXGGGATGATGTCTA +8. GCGCCXGGAAGACGAATCTTAATTAX +9. TTAGGAACXTXXCAATATGTTTCGGT + diff --git a/lectures/10.hpc_and_parallel_compute/notebooks/plot_parallel_compute_analysis.ipynb b/lectures/10.hpc_and_parallel_compute/notebooks/plot_parallel_compute_analysis.ipynb new file mode 100644 index 0000000..050ed8d --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/notebooks/plot_parallel_compute_analysis.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook generates a hypothetical situation calculating parallel computing time for a given number of tasks and processors. \n", + "The goal is to show how the time to complete a task decreases as the number of processors increases." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencestime_per_sequence(s)core_count
0100.51
11000.51
210000.51
3100000.51
41000000.51
\n", + "
" + ], + "text/plain": [ + " sequences time_per_sequence(s) core_count\n", + "0 10 0.5 1\n", + "1 100 0.5 1\n", + "2 1000 0.5 1\n", + "3 10000 0.5 1\n", + "4 100000 0.5 1" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_path = pathlib.Path(\"../data/parallel_compute_analysis.csv\").resolve()\n", + "df = pd.read_csv(data_path)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"time(s)\"] = df[\"sequences\"] / df[\"core_count\"] * df[\"time_per_sequence(s)\"]\n", + "# log 10 transform the data\n", + "df[\"sequences\"] = df[\"sequences\"].apply(np.log10)\n", + "\n", + "# convert to hours\n", + "df[\"time(h)\"] = df[\"time(s)\"] / 3600\n", + "df[\"time(d)\"] = df[\"time(h)\"] / 24" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"core_count\"].unique()\n", + "# make each core count a different color\n", + "colors = sns.color_palette(\"husl\", len(df[\"core_count\"].unique()))\n", + "color_dict = dict(zip(df[\"core_count\"].unique(), colors))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the results\n", + "sns.set(style=\"whitegrid\")\n", + "plt.figure(figsize=(12, 6))\n", + "# custom palette\n", + "custom_palette = color_dict\n", + "\n", + "plt.subplot(1, 2, 1)\n", + "sns.lineplot(\n", + " x=\"sequences\", y=\"time(d)\", hue=\"core_count\", data=df, palette=custom_palette\n", + ")\n", + "plt.xlabel(\"Number of Sequences (log10)\")\n", + "plt.ylabel(\"Time (Days)\")\n", + "plt.title(\"Comparison of parallel computing methods\")\n", + "plt.xlim(1, max(df[\"sequences\"]))\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "sns.lineplot(\n", + " x=\"sequences\", y=\"time(h)\", hue=\"core_count\", data=df, palette=custom_palette\n", + ")\n", + "plt.xlabel(\"Number of Sequences (log10)\")\n", + "plt.ylabel(\"Time (Hours)\")\n", + "plt.title(\"Comparison of parallel computing methods\")\n", + "plt.xlim(1, 4)\n", + "plt.ylim(0, 1)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lecture10_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lectures/10.hpc_and_parallel_compute/results/5mc_HPC.csv b/lectures/10.hpc_and_parallel_compute/results/5mc_HPC.csv new file mode 100644 index 0000000..41c38a1 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/results/5mc_HPC.csv @@ -0,0 +1,11 @@ +raw_count,c_count,count_5mc,sequence +26,1,0,CGGATGATCATCXGGGATGATGTCTA +25,1,0,GCGCCXGGAAGACGAATCTTAATTA +26,3,0,TTAGGAACXTXXCAATATGTTTCGGT +26,1,0,ACTTCTATGTCTXTGGATTACAAACA +26,2,0,CGGCCAATGAGGGXCGCXTAGGTCAT +26,1,0,TATTACACGCCCAAATAATTTGGCXA +28,3,0,TCAGCXGCXGGGAAGCGGGCGCXATACT 
+26,2,0,TAGGTGGATACCXCTXATATATGATT +26,1,0,CCXATATTAAGACATATAATTGGAGG +27,2,0,GCXCCXAGGGTTGCAGTCAAATGTCCA diff --git a/lectures/10.hpc_and_parallel_compute/results/5mc_multiprocessing.csv b/lectures/10.hpc_and_parallel_compute/results/5mc_multiprocessing.csv new file mode 100644 index 0000000..cbe9981 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/results/5mc_multiprocessing.csv @@ -0,0 +1,11 @@ +raw_count,c_count,count_5mc,sequence +2,27,0,GCXCCXAGGGTTGCAGTCAAATGTCCA +2,26,0,CGGCCAATGAGGGXCGCXTAGGTCAT +2,26,0,TAGGTGGATACCXCTXATATATGATT +1,26,0,CCXATATTAAGACATATAATTGGAGG +1,26,0,TATTACACGCCCAAATAATTTGGCXA +3,28,0,TCAGCXGCXGGGAAGCGGGCGCXATACT +1,26,0,CGGATGATCATCXGGGATGATGTCTA +1,25,0,GCGCCXGGAAGACGAATCTTAATTA +3,26,0,TTAGGAACXTXXCAATATGTTTCGGT +1,26,0,ACTTCTATGTCTXTGGATTACAAACA diff --git a/lectures/10.hpc_and_parallel_compute/results/5mc_parallel.csv b/lectures/10.hpc_and_parallel_compute/results/5mc_parallel.csv new file mode 100644 index 0000000..5dc179b --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/results/5mc_parallel.csv @@ -0,0 +1,11 @@ +raw_count,c_count,count_5mc,sequence +26,2,0,CGGCCAATGAGGGXCGCXTAGGTCAT +28,3,0,TCAGCXGCXGGGAAGCGGGCGCXATACT +26,1,0,CCXATATTAAGACATATAATTGGAGG +26,1,0,CGGATGATCATCXGGGATGATGTCTA +25,1,0,GCGCCXGGAAGACGAATCTTAATTA +26,2,0,TAGGTGGATACCXCTXATATATGATT +26,1,0,TATTACACGCCCAAATAATTTGGCXA +26,1,0,ACTTCTATGTCTXTGGATTACAAACA +26,3,0,TTAGGAACXTXXCAATATGTTTCGGT +27,2,0,GCXCCXAGGGTTGCAGTCAAATGTCCA diff --git a/lectures/10.hpc_and_parallel_compute/results/5mc_serial.csv b/lectures/10.hpc_and_parallel_compute/results/5mc_serial.csv new file mode 100644 index 0000000..f47a484 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/results/5mc_serial.csv @@ -0,0 +1,11 @@ +raw_count,c_count,count_5mc,sequence +27,2,0,GCXCCXAGGGTTGCAGTCAAATGTCCA +26,2,0,CGGCCAATGAGGGXCGCXTAGGTCAT +26,2,0,TAGGTGGATACCXCTXATATATGATT +26,1,0,CCXATATTAAGACATATAATTGGAGG +26,1,0,TATTACACGCCCAAATAATTTGGCXA +28,3,0,TCAGCXGCXGGGAAGCGGGCGCXATACT 
+26,1,0,CGGATGATCATCXGGGATGATGTCTA +25,1,0,GCGCCXGGAAGACGAATCTTAATTA +26,3,0,TTAGGAACXTXXCAATATGTTTCGGT +26,1,0,ACTTCTATGTCTXTGGATTACAAACA diff --git a/lectures/10.hpc_and_parallel_compute/scripts/5mc_analysis.py b/lectures/10.hpc_and_parallel_compute/scripts/5mc_analysis.py new file mode 100644 index 0000000..190c28d --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/5mc_analysis.py @@ -0,0 +1,36 @@ +import argparse +import pathlib +import sys + +import pandas as pd + +sys.path.append("../utils/") +from sequence_analysis import count_5mc_sequences + + +def main(): + # set up parser for command line arguments + parser = argparse.ArgumentParser( + description="Analyze DNA sequences for CpG content" + ) + parser.add_argument("--sequence", help="A DNA sequence to analyze") + parser.add_argument("--output_file_name", help="The file to write the results to") + args = parser.parse_args() + sequence = args.sequence + output_file_name = args.output_file_name + c_count, raw_count, count_5mc, sequence = count_5mc_sequences(sequence) + # write the results to a file + output_file = pathlib.Path("../results", output_file_name) + # make the results directory if it doesn't exist + output_file.parent.mkdir(parents=True, exist_ok=True) + if not output_file.exists(): + with open(output_file, "w") as f: + f.write("raw_count,c_count,count_5mc,sequence\n") + f.write(f"{raw_count},{c_count},{count_5mc},{sequence}\n") + else: + with open(output_file, "a") as f: + f.write(f"{raw_count},{c_count},{count_5mc},{sequence}\n") + + +if __name__ == "__main__": + main() diff --git a/lectures/10.hpc_and_parallel_compute/scripts/gnu_parallel.sh b/lectures/10.hpc_and_parallel_compute/scripts/gnu_parallel.sh new file mode 100644 index 0000000..7e344cd --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/gnu_parallel.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# this script will run the 5mc_analysis.py script in parallel using GNU parallel + +conda activate 
parallel_and_hpc_compute_env + +sequences_path="../data/sequences_to_analyze.txt" +output_file_name="../results/5mc_parallel.csv" +# remove the output file if it already exists +if [ -f "$output_file_name" ]; then + rm "$output_file_name" +fi + +# read in the sequences +readarray -t sequences < $sequences_path + +# get the start time +start_time=$(date +%s) + +# use gnu parallel to run the 5mc_analysis.py script in parallel +parallel -j 10 python 5mc_analysis.py --sequence {} --output_file_name "$output_file_name" ::: "${sequences[@]}" + +# get the end time +end_time=$(date +%s) +echo "Parallel run time: $((end_time - start_time)) seconds" + diff --git a/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_run.sh b/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_run.sh new file mode 100644 index 0000000..b82ad66 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +conda activate parallel_and_hpc_compute_env + +# profile the serial run +sequences_path="../data/sequences_to_analyze.txt" +output_file_name="../results/5mc_multiprocessing.csv" +# remove the output file if it exists +if [ -f "$output_file_name" ]; then + rm $output_file_name +fi + +python multiprocessing_sequence_analysis.py + +conda deactivate diff --git a/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_sequence_analysis.py b/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_sequence_analysis.py new file mode 100644 index 0000000..5984b5a --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/multiprocessing_sequence_analysis.py @@ -0,0 +1,42 @@ +import multiprocessing +import pathlib +import sys +import time + +import pandas as pd + +sys.path.append("../utils/") +from sequence_analysis import count_5mc_sequences + +if __name__ == "__main__": + # set the path for the input file + input_file_path = pathlib.Path("../data/sequences_to_analyze.txt").resolve( + strict=True + ) + # import 
the data + data = pd.read_csv(input_file_path, sep="\t", header=None) + data.rename(columns={0: "sequence"}, inplace=True) + + core_count = multiprocessing.cpu_count() - 2 + print(f"Running on {core_count} cores") + # Create a pool of worker processes + pool = multiprocessing.Pool(processes=core_count) + # start time profiling + pool_start_time = time.time() + # Use the pool to map the analyze_sequences function to the sequences in the data + results = pool.map(count_5mc_sequences, data["sequence"]) + + # Close the pool and wait for the work to finish + pool.close() + pool.join() + # end time profiling + pool_end_time = time.time() + + # Convert the results to a DataFrame for easier analysis + results_df = pd.DataFrame( + results, columns=["raw_count", "c_count", "count_5mc", "sequence"] + ) + # write the results to a file + pathlib.Path("../results").mkdir(parents=True, exist_ok=True) + results_df.to_csv("../results/5mc_multiprocessing.csv", sep=",", index=False) + print(f"{pool_end_time - pool_start_time} seconds to analyze.") diff --git a/lectures/10.hpc_and_parallel_compute/scripts/plot_parallel_compute_analysis.py b/lectures/10.hpc_and_parallel_compute/scripts/plot_parallel_compute_analysis.py new file mode 100644 index 0000000..51361d1 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/plot_parallel_compute_analysis.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# coding: utf-8 + +# This notebook generates a hypothetical situation calculating parallel computing time for a given number of tasks and processors. +# The goal is to show how the time to complete a task decreases as the number of processors increases. 
+ +# In[1]: + + +import pathlib + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +# In[2]: + + +data_path = pathlib.Path("../data/parallel_compute_analysis.csv").resolve() +df = pd.read_csv(data_path) +df.head() + + +# In[3]: + + +df["time(s)"] = df["sequences"] / df["core_count"] * df["time_per_sequence(s)"] +# log 10 transform the data +df["sequences"] = df["sequences"].apply(np.log10) + +# convert to hours +df["time(h)"] = df["time(s)"] / 3600 +df["time(d)"] = df["time(h)"] / 24 + + +# In[4]: + + +df["core_count"].unique() +# make each core count a different color +colors = sns.color_palette("husl", len(df["core_count"].unique())) +color_dict = dict(zip(df["core_count"].unique(), colors)) + + +# In[5]: + + +# plot the results +sns.set(style="whitegrid") +plt.figure(figsize=(12, 6)) +# custom palette +custom_palette = color_dict + +plt.subplot(1, 2, 1) +sns.lineplot( + x="sequences", y="time(d)", hue="core_count", data=df, palette=custom_palette +) +plt.xlabel("Number of Sequences (log10)") +plt.ylabel("Time (Days)") +plt.title("Comparison of parallel computing methods") +plt.xlim(1, max(df["sequences"])) + +plt.subplot(1, 2, 2) +sns.lineplot( + x="sequences", y="time(h)", hue="core_count", data=df, palette=custom_palette +) +plt.xlabel("Number of Sequences (log10)") +plt.ylabel("Time (Hours)") +plt.title("Comparison of parallel computing methods") +plt.xlim(1, 4) +plt.ylim(0, 1) +plt.tight_layout() +plt.show() diff --git a/lectures/10.hpc_and_parallel_compute/scripts/serial_run.sh b/lectures/10.hpc_and_parallel_compute/scripts/serial_run.sh new file mode 100644 index 0000000..62f20cb --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/serial_run.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +conda activate parallel_and_hpc_compute_env + +# profile the serial run +sequences_path="../data/sequences_to_analyze.txt" +output_file_name="../results/5mc_serial.csv" +# remove the output file if it exists +if [ -f 
"$output_file_name" ]; then + rm $output_file_name +fi + +# read in the sequences +# if bash version is 4.0 or higher, use readarray +# uncomment line below and comment while loop +# readarray -t sequences < $sequences_path + +# if bash version is lower than 4.0, use while loop +sequences=() +while IFS= read -r line +do + sequences+=("$line") +done < $sequences_path + +echo "Number of sequences to analyze: ${#sequences[@]}" + +# start time profiling +start_time=$(date +%s) +for sequence in "${sequences[@]}" +do + echo sequence: $sequence + python 5mc_analysis.py --sequence "$sequence" --output_file_name "$output_file_name" +done + +end_time=$(date +%s) +echo "Serial run time: $((end_time - start_time)) seconds" + +conda deactivate diff --git a/lectures/10.hpc_and_parallel_compute/scripts/submit_jobs_HPC.sh b/lectures/10.hpc_and_parallel_compute/scripts/submit_jobs_HPC.sh new file mode 100644 index 0000000..a497764 --- /dev/null +++ b/lectures/10.hpc_and_parallel_compute/scripts/submit_jobs_HPC.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --partition=amilan +#SBATCH --qos=normal +#SBATCH --account=amc-general +#SBATCH --time=00:00:30 +#SBATCH --output=lecture10_output_%j.out +#SBATCH --array=0-9 + +module load anaconda + +conda activate parallel_and_hpc_compute_env + +sequences_path="../data/sequences_to_analyze.txt" +output_file_name="../results/5mc_HPC.csv" +# read in the sequences +readarray -t sequences < $sequences_path + +# set the number of sequences to analyze +num_sequences=${#sequences[@]} +SLURM_ID=$SLURM_ARRAY_TASK_ID +# get start time +start_time=$(date +%s) +python 5mc_analysis.py --sequence "${sequences[$SLURM_ID]}" --output_file_name "$output_file_name" +end_time=$(date +%s) +echo "HPC parallel run time: $((end_time - start_time)) seconds" +conda deactivate + + diff --git a/lectures/10.hpc_and_parallel_compute/utils/sequence_analysis.py b/lectures/10.hpc_and_parallel_compute/utils/sequence_analysis.py new file 
def count_5mc_sequences(sequence: str) -> tuple:
    """
    Count the 5-methylcytosine (5mC) sites in a DNA sequence.

    5mC sites are encoded as the character "X". The sequence is upper-cased
    before counting, so a lowercase "x" is counted as well.

    Parameters
    ----------
    sequence : string
        a string of DNA sequence in which "X" marks a 5mC site

    Returns
    -------
    c_count : int
        the number of "X" (5mC) characters in the sequence
    raw_count : int
        the total number of characters in the sequence
    count_5mc : int
        the number of "X" (5mC) characters in the sequence
    sequence : string
        the upper-cased sequence that was analyzed
    """

    # cast the sequence to uppercase so that "x" and "X" are treated alike
    sequence = sequence.upper()

    # previously this value was initialised to 0 and never updated, so the
    # count_5mc column was always 0 even when the sequence contained "X"
    count_5mc = sequence.count("X")

    # c_count and raw_count keep the values callers already receive
    # (number of "X" characters and sequence length, respectively)
    c_count = count_5mc
    raw_count = len(sequence)

    return c_count, raw_count, count_5mc, sequence