Skip to content

Commit

Permalink
Merge pull request #8 from ncezid-biome/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jinfinance authored Aug 26, 2024
2 parents d811976 + 8165511 commit 374d7cf
Show file tree
Hide file tree
Showing 22 changed files with 7,618 additions and 63 deletions.
2 changes: 1 addition & 1 deletion HMAS2_1.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified HMAS_MultiQC_REPORT.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7,042 changes: 7,042 additions & 0 deletions M3235-24-004-multiqc_07192024/multiqc_report.html

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ By default, the pipeline runs the following [workflow](#workflow):
## USAGE

1. **Test with the default test_data**:
Run the following: `nextflow run hmas2.nf -profile test`
- Run the following: `nextflow run hmas2.nf -profile test`
Depending on your hardware, the test run should complete in a few minutes, and the output will be in the `test_output` folder
- Alternatively, change directory to the **test_data** folder, and run the following: `./test_pipeline.sh`
The script will automatically run the pipeline with the default test data and compare the output to the expected result and print out **'PASSED ! CSV files match'** if the results match, or **WARNING messages** otherwise.

2. **Test with your own data** - Make sure to provide path for the 3 required parameters in **nextflow.config** file.

Expand Down
4 changes: 1 addition & 3 deletions bin/combine_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
import re

'''
This script reads all report.csv (for each sample) and concantenate them into one single report.csv. It will
also add a 'Mean read depth across entire run' row at the end.
This script reads all report.csv (for each sample) and concatenates them into one single report.csv.
This script requires all report.csv file names be passed in a concatenated string as a command line argument.
'''

Expand Down
211 changes: 204 additions & 7 deletions bin/combine_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,209 @@
import os
import re
import yaml
import utilities

'''
This script reads all report.csv (for each sample) and concantenate them into one single report.csv. It will
also add a 'Mean read depth across entire run' row at the end.
def make_primer_stats_yaml(output_file, primer_stats, oligos_file):
    '''
    this method generates a custom content yaml file specific for the multiqc report
    this yaml file is for the primer pair performance report
    Parameters
    ----------
    output_file: String, output file (yaml) name
    primer_stats: String, concatenated (whitespace separated) names of each
                  primer_stats csv file (per sample)
    oligos_file: String, oligo file name (which contains the primer information)
    Returns: None
    ----------
    '''
    def create_df(primer_stats, oligos_file):
        # Build a per-primer summary DataFrame: rows = primers, columns =
        # mean / min / max read count across all samples.
        oligo_primers = utilities.Primers(oligos_file)
        # our primer panel as a dictionary, its value is a list of 3 items
        # running total: int
        # (min read count, sample name), tuple
        # (max read count, sample name), tuple
        primer_dict = {primer: [] for primer in oligo_primers.pnames}

        # read each report csv file as a df
        df_list = [pd.read_csv(report, index_col=[0], sep='\t') for report in primer_stats.split()]
        # each df has the following format (it has only 1 row)
        #                          primer1 primer2 primer3
        # sample(abundance count)       10      20      30
        for df in df_list:
            df_dict = df.to_dict(orient='list')
            sample = df.index.to_list()[0]
            # go through each primer in our original primer panel
            # check if it exists in the current primer_stats
            for primer in primer_dict:
                if primer in df_dict:
                    read_count = df_dict[primer][0]
                    if not primer_dict[primer]:  # first time this primer is seen
                        primer_dict[primer].append(read_count)  # running total
                        primer_dict[primer].append((read_count, sample))  # min
                        primer_dict[primer].append((read_count, sample))  # max
                    else:
                        primer_dict[primer][0] = primer_dict[primer][0] + read_count
                        if read_count < primer_dict[primer][1][0]:
                            primer_dict[primer][1] = (read_count, sample)
                        elif read_count > primer_dict[primer][2][0]:
                            primer_dict[primer][2] = (read_count, sample)

        # go through our primer panel again
        # if one primer has no associated reads at all, mark as 'n/a'
        # otherwise, calculate the mean reads count, and convert the min/max tuples to String
        # NOTE(review): the mean divides by the total number of sample reports,
        # even for primers absent from some samples (a missing primer counts as
        # 0 toward its average) -- presumably intentional, TODO confirm.
        for primer in primer_dict:
            if not primer_dict[primer]:
                primer_dict[primer].extend(['n/a'] * 3)
            else:
                primer_dict[primer][0] = primer_dict[primer][0] / len(df_list)
                primer_dict[primer][1] = f'{primer_dict[primer][1][0]} / {primer_dict[primer][1][1]}'
                primer_dict[primer][2] = f'{primer_dict[primer][2][0]} / {primer_dict[primer][2][1]}'

        df = pd.DataFrame(primer_dict).transpose()
        df.columns = ['p_col1', 'p_col2', 'p_col3']

        return df

    # Create headers dictionary (column titles/formatting for the multiqc table)
    headers = {
        'p_col1': {
            'title': 'average read count',
            'description': 'mean reads count per primer pair across all samples',
            'format': '{:,.1f}',
        },
        'p_col2': {
            'title': 'minimum read count',
            'description': 'minimum reads count for this primer pair across all samples (if there is a tie, only show the first one)',
            # 'format': '{:,.3f}',
            "scale": False
        },
        'p_col3': {
            # fixed typo: title previously read 'maxmium read count'
            'title': 'maximum read count',
            'description': 'maximum reads count for this primer pair across all samples (if there is a tie, only show the first one)',
        },
    }

    # Convert the DataFrame to the required format
    data_yaml = create_df(primer_stats, oligos_file).to_dict(orient='index')

    # Create the full YAML dictionary
    yaml_dict = {
        'id': 'primer_report',
        'section_name': 'Primer performance report',
        'description': 'reads count report per primer pair across all samples in this run',
        'plot_type': 'table',
        'pconfig': {
            'id': 'primer_report',
            'sort_rows': False,
            'col1_header': 'Primer Name',
            "no_violin": True,
        },
        'headers': headers,
        'data': data_yaml
    }

    # Write to a YAML file
    with open(output_file, 'w') as file:
        yaml.dump(yaml_dict, file, sort_keys=False)


def make_read_length_yaml(output, read_length, noshow_samples):
    '''
    Build the custom-content yaml file that feeds the multiqc
    "Sample read length report" table.
    Parameters
    ----------
    output: String, output file (yaml) name
    read_length: String, concatenated names of each read_length csv file
    noshow_samples: List, sample names which did not generate any valid sequences
    Returns: None
    ----------
    '''
    # Stack the per-sample read_length tsv files into one combined table.
    frames = []
    for csv_name in read_length.split():
        frames.append(pd.read_csv(csv_name, index_col=[0], sep='\t'))
    combined = pd.concat(frames)
    combined.columns = ['l_col0', 'l_col1', 'l_col2', 'l_col3', 'l_col4']
    combined = combined.fillna('n/a')

    # Samples that produced no valid sequences still get a (zeroed) row.
    for sample in noshow_samples:
        combined.loc[f'{sample}'] = [0, 0, 'n/a', 'n/a', 'n/a']

    # Column titles/formatting for the multiqc table
    headers = {
        'l_col0': {
            'title': 'total reads(non-unique) count',
            'description': 'total high quality reads count per sample across all primer pairs',
            'format': '{:,.0f}',
        },
        'l_col1': {
            'title': 'total reads(unique) count',
            'description': 'total high quality unique reads count per sample across all primer pairs',
            'format': '{:,.0f}',
        },
        'l_col2': {
            'title': 'average read length',
            'description': 'mean reads length per sample across all primer pairs',
            'format': '{:,.1f}',
            'scale': False,
        },
        'l_col3': {
            'title': 'minimum read length',
            'description': 'minimum reads length per sample across all primer pairs',
            'scale': False,
            'format': '{:,.0f}'
        },
        'l_col4': {
            'title': 'maximum read length',
            'description': 'maximum reads length per sample across all primer pairs',
            'scale': False,
            'format': '{:,.0f}'
        },
    }

    # multiqc custom-content payload: table config, headers, then the data rows
    yaml_dict = {
        'id': 'read_length_report',
        'section_name': 'Sample read length report',
        'description': 'reads length report per sample across all primer pairs in this run',
        'plot_type': 'table',
        'pconfig': {
            'id': 'read_length_report',
            'sort_rows': False,
        },
        'headers': headers,
        'data': combined.to_dict(orient='index')
    }

    with open(output, 'w') as out_fh:
        yaml.dump(yaml_dict, out_fh, sort_keys=False)

This script requires all report.csv file names be passed in a concatenated string as a command line argument.
'''

def make_report_yaml(output_file, data_df):
'''
this method generates a custom content yaml file specific for the multiqc report
this yaml file is for the final combined hmas summary report
Parameters
----------
output_file: output file (yaml) name
data_df: data part of the yaml file in the format of dataframe
Returns: None
----------
'''
# Create headers dictionary
headers = {
'col1': {
'title': 'Mean read depth',
'description': 'we include only reads with at least 2 sequence count',
'format': '{:,.1f}',
"scale": False
},
'col2': {
'title': '% of successful primer-pairs',
Expand All @@ -41,7 +228,9 @@ def make_report_yaml(output_file, data_df):
yaml_dict = {
'id': 'hmas_run_report',
'section_name': 'HMAS run report',
'description': 'combined summary report for all samples in this run',
'description': "Combined summary statistics for all the samples in the run, "
"showing the mean read depth and the number (and percentage) of successful"
"primer pairs (out of total 2461 in the Salmonella HMAS primer panel).",
'plot_type': 'table',
'pconfig': {
'id': 'hmas_run_report',
Expand Down Expand Up @@ -95,6 +284,13 @@ def parse_argument():
parser.add_argument('-p', '--reports', metavar = '', required = True, help = 'Specify reports')
parser.add_argument('-i', '--folder_path', metavar = '', required = True, help = 'Specify folder path for fasta.gz files')

parser.add_argument('-z', '--pyaml', metavar = '', required = True, help = 'Specify output primer_stats mqc report file')
parser.add_argument('-q', '--primer_stats', metavar = '', required = True, help = 'Specify input primer_stats')
parser.add_argument('-l', '--primers', metavar = '', required = True, help = 'Specify primers')

parser.add_argument('-x', '--lyaml', metavar = '', required = True, help = 'Specify output read_length mqc report file')
parser.add_argument('-r', '--read_length', metavar = '', required = True, help = 'Specify input read_length file')

return parser.parse_args()

if __name__ == "__main__":
Expand Down Expand Up @@ -122,4 +318,5 @@ def parse_argument():
#update empty cell to n/a
report_df.fillna('n/a', inplace=True)
make_report_yaml(args.yaml, report_df)

make_primer_stats_yaml(args.pyaml, args.primer_stats, args.primers)
make_read_length_yaml(args.lyaml, args.read_length, noshow_samples)
91 changes: 88 additions & 3 deletions bin/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import utilities
import argparse
from Bio import SeqIO


def parse_argument():
Expand All @@ -12,9 +13,92 @@ def parse_argument():
parser.add_argument('-c', '--count_table', metavar = '', required = True, help = 'Specify count table file')
parser.add_argument('-p', '--primers', metavar = '', required = True, help = 'Specify oligos/(primer) file')
parser.add_argument('-o', '--output', metavar = '', required = True, help = 'Specify output file')
parser.add_argument('-q', '--primer_stats', metavar = '', required = True, help = 'Specify primer_stats file')
parser.add_argument('-f', '--fasta', metavar = '', required = True, help = 'Specify fasta file')
parser.add_argument('-l', '--read_length', metavar = '', required = True, help = 'Specify output read_length file')
return parser.parse_args()


def generate_primer_stats(sample, count_file, primer_stats):
    '''
    Compute per-primer read counts for the given sample and write them
    out as a one-row tsv report of the form:
    #                          primer1 primer2 primer3
    # sample(abundance count)       10      20      30
    Parameters
    ----------
    sample: sample name
    count_file: original count_table file
    primer_stats: the output file
    Returns: DataFrame
    ----------
    '''
    counts = pd.read_csv(count_file, sep='\t')
    # the raw sequence column is not needed for per-primer totals
    counts = counts.drop(columns=['seq'])

    # Sum every remaining column, then pivot into a single row indexed by
    # the sample name.
    totals = counts.sum().to_frame(name=sample).T

    # Column names look like '2014K_0979.OG0000348primerGroup3'; keep only
    # the primer part after the first '.' (names without a '.' are unchanged).
    totals.columns = [name.split('.', 1)[-1] for name in totals.columns]

    totals.to_csv(f'{primer_stats}', sep='\t')

    return totals


def generate_read_length(sample, fasta_file, output):
    '''
    this method calculates read length stats for the given sample
    and generates a report (tsv file) of the form:
    #        num_total_seqs  num_seqs  avg_len  min_len  max_len
    # sample            100        10    175.0      150      200
    Parameters
    ----------
    sample: sample name
    fasta_file: the fasta file name
    output: the output read_length file name
    Returns: DataFrame
    ----------
    '''
    # Read the sequences from the FASTA file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))

    # Guard: an empty fasta would otherwise crash below (min()/max() of an
    # empty list, division by zero).  Emit a row consistent with the
    # 'noshow' row format used by the multiqc read-length report
    # ([0, 0, 'n/a', 'n/a', 'n/a']).
    if not sequences:
        stats = {
            "num_total_seqs": [0],
            "num_seqs": [0],
            "avg_len": ['n/a'],
            "min_len": ['n/a'],
            "max_len": ['n/a']
        }
        df = pd.DataFrame(stats, index=[sample])
        df.to_csv(f'{output}', sep='\t')
        return df

    # Total count of non-unique reads: dereplicated fasta headers carry the
    # cluster size as 'size=N' (case-insensitive).
    # NOTE(review): ids of the common 'seq;size=N;' form end with ';', which
    # the previous `.split("size=")[1].isdigit()` test silently rejected; we
    # now drop anything after a ';' before testing.  Assumes 'size=' is
    # immediately followed by the bare count -- TODO confirm against the
    # upstream dereplication step.
    total_count = 0
    for record in sequences:
        rid = record.id.lower()
        if "size=" in rid:
            token = rid.split("size=", 1)[1].split(";", 1)[0]
            if token.isdigit():
                total_count += int(token)

    # Calculate the statistics over the unique sequences
    num_seqs = len(sequences)
    lengths = [len(seq.seq) for seq in sequences]

    stats = {
        "num_total_seqs": [total_count],
        "num_seqs": [num_seqs],
        "avg_len": [round(sum(lengths) / num_seqs, 1)],
        "min_len": [min(lengths)],
        "max_len": [max(lengths)]
    }

    # Convert to DataFrame
    df = pd.DataFrame(stats, index=[sample])
    df.to_csv(f'{output}', sep='\t')

    return df


def report(sample, count_file, oligos_file, report_file):
'''
this method calculates metrics like: Mean read depth, # of failed primer pairs and
Expand All @@ -30,7 +114,6 @@ def report(sample, count_file, oligos_file, report_file):
Returns: DataFrame
----------
None
'''
raw_df = pd.read_csv(count_file, sep='\t')
raw_df.drop(['seq'], axis=1, inplace=True)
Expand Down Expand Up @@ -63,12 +146,14 @@ def report(sample, count_file, oligos_file, report_file):
f'# of primer pairs with more than 2 amplicons mapping\nover total primer-pairs']
report_df.columns = report_columns

report_df.to_csv(f'{report_file}.csv')
report_df.to_csv(f'{report_file}')

return report_df


if __name__ == "__main__":

    args = parse_argument()
    # per-sample summary report (mean read depth, primer-pair success).
    # NOTE(review): the diff rendering showed this call twice (old line +
    # new line without trailing newline); it must run exactly once.
    report(args.sample, args.count_table, args.primers, args.output)
    # per-primer read-count table consumed by the multiqc primer report
    generate_primer_stats(args.sample, args.count_table, args.primer_stats)
    # per-sample read-length table consumed by the multiqc read-length report
    generate_read_length(args.sample, args.fasta, args.read_length)
Loading

0 comments on commit 374d7cf

Please sign in to comment.