Skip to content

Commit

Permalink
Merge pull request #8 from ncezid-biome/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jinfinance authored Aug 26, 2024
2 parents d811976 + 8165511 commit 374d7cf
Show file tree
Hide file tree
Showing 22 changed files with 7,618 additions and 63 deletions.
2 changes: 1 addition & 1 deletion HMAS2_1.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified HMAS_MultiQC_REPORT.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7,042 changes: 7,042 additions & 0 deletions M3235-24-004-multiqc_07192024/multiqc_report.html

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ By default, the pipeline runs the following [workflow](#workflow):
## USAGE

1. **Test with the default test_data**:
Run the following: `nextflow run hmas2.nf -profile test`
- Run the following: `nextflow run hmas2.nf -profile test`
Depending on your hardware, the test run should complete in a few minutes, and the output will be in the `test_output` folder
- Alternatively, change directory to the **test_data** folder, and run the following: `./test_pipeline.sh`
The script will automatically run the pipeline with the default test data and compare the output to the expected result and print out **'PASSED ! CSV files match'** if the results match, or **WARNING messages** otherwise.

2. **Test with your own data** - Make sure to provide path for the 3 required parameters in **nextflow.config** file.

Expand Down
4 changes: 1 addition & 3 deletions bin/combine_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
import re

'''
This script reads all report.csv (for each sample) and concantenate them into one single report.csv. It will
also add a 'Mean read depth across entire run' row at the end.
This script reads all report.csv (for each sample) and concatenates them into one single report.csv.
This script requires all report.csv file names be passed in a concatenated string as a command line argument.
'''

Expand Down
211 changes: 204 additions & 7 deletions bin/combine_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,209 @@
import os
import re
import yaml
import utilities

'''
This script reads all report.csv (for each sample) and concantenate them into one single report.csv. It will
also add a 'Mean read depth across entire run' row at the end.
def make_primer_stats_yaml(output_file, primer_stats, oligos_file):
    '''
    this method generates a custom content yaml file specific for the multiqc report
    this yaml file is for the primer pair performance report
    Parameters
    ----------
    output_file: String, output file (yaml) name
    primer_stats: String, concatenated (whitespace separated) names of each
                  primer_stats csv file (per sample)
    oligos_file: String, oligo file name (which contains the primer information)
    Returns: None
    ----------
    '''
    def create_df(primer_stats, oligos_file):
        # Build a per-primer summary DataFrame: rows = primers, columns =
        # mean / min / max read count across all samples.
        oligo_primers = utilities.Primers(oligos_file)
        # our primer panel as a dictionary, its value is a list of 3 items
        # running total: int
        # (min read count, sample name), tuple
        # (max read count, sample name), tuple
        primer_dict = {primer: [] for primer in oligo_primers.pnames}

        # read each report csv file as a df
        df_list = [pd.read_csv(report, index_col=[0], sep='\t') for report in primer_stats.split()]
        # each df has the following format (it has only 1 row)
        #                          primer1 primer2 primer3
        # sample(abundance count)       10      20      30
        for df in df_list:
            df_dict = df.to_dict(orient='list')
            sample = df.index.to_list()[0]
            # go through each primer in our original primer panel
            # check if it exists in the current primer_stats
            for primer in primer_dict:
                if primer in df_dict:
                    read_count = df_dict[primer][0]
                    if not primer_dict[primer]:  # first time this primer is seen
                        primer_dict[primer].append(read_count)  # running total
                        primer_dict[primer].append((read_count, sample))  # min
                        primer_dict[primer].append((read_count, sample))  # max
                    else:
                        primer_dict[primer][0] = primer_dict[primer][0] + read_count
                        if read_count < primer_dict[primer][1][0]:
                            primer_dict[primer][1] = (read_count, sample)
                        elif read_count > primer_dict[primer][2][0]:
                            primer_dict[primer][2] = (read_count, sample)

        # go through our primer panel again
        # if one primer has no associated reads at all, mark as 'n/a'
        # otherwise, calculate the mean reads count, and convert the min/max tuples to String
        # NOTE(review): the mean divides by the total number of sample reports,
        # even for primers absent from some samples (a missing primer counts as
        # 0 toward its average) -- presumably intentional, TODO confirm.
        for primer in primer_dict:
            if not primer_dict[primer]:
                primer_dict[primer].extend(['n/a'] * 3)
            else:
                primer_dict[primer][0] = primer_dict[primer][0] / len(df_list)
                primer_dict[primer][1] = f'{primer_dict[primer][1][0]} / {primer_dict[primer][1][1]}'
                primer_dict[primer][2] = f'{primer_dict[primer][2][0]} / {primer_dict[primer][2][1]}'

        df = pd.DataFrame(primer_dict).transpose()
        df.columns = ['p_col1', 'p_col2', 'p_col3']

        return df

    # Create headers dictionary (column titles/formatting for the multiqc table)
    headers = {
        'p_col1': {
            'title': 'average read count',
            'description': 'mean reads count per primer pair across all samples',
            'format': '{:,.1f}',
        },
        'p_col2': {
            'title': 'minimum read count',
            'description': 'minimum reads count for this primer pair across all samples (if there is a tie, only show the first one)',
            # 'format': '{:,.3f}',
            "scale": False
        },
        'p_col3': {
            # fixed typo: title previously read 'maxmium read count'
            'title': 'maximum read count',
            'description': 'maximum reads count for this primer pair across all samples (if there is a tie, only show the first one)',
        },
    }

    # Convert the DataFrame to the required format
    data_yaml = create_df(primer_stats, oligos_file).to_dict(orient='index')

    # Create the full YAML dictionary
    yaml_dict = {
        'id': 'primer_report',
        'section_name': 'Primer performance report',
        'description': 'reads count report per primer pair across all samples in this run',
        'plot_type': 'table',
        'pconfig': {
            'id': 'primer_report',
            'sort_rows': False,
            'col1_header': 'Primer Name',
            "no_violin": True,
        },
        'headers': headers,
        'data': data_yaml
    }

    # Write to a YAML file
    with open(output_file, 'w') as file:
        yaml.dump(yaml_dict, file, sort_keys=False)


def make_read_length_yaml(output, read_length, noshow_samples):
    '''
    Build the custom-content yaml file that feeds the multiqc
    "Sample read length report" table.
    Parameters
    ----------
    output: String, output file (yaml) name
    read_length: String, concatenated names of each read_length csv file
    noshow_samples: List, sample names which did not generate any valid sequences
    Returns: None
    ----------
    '''
    # Stack the per-sample read_length tsv files into one combined table.
    frames = []
    for csv_name in read_length.split():
        frames.append(pd.read_csv(csv_name, index_col=[0], sep='\t'))
    combined = pd.concat(frames)
    combined.columns = ['l_col0', 'l_col1', 'l_col2', 'l_col3', 'l_col4']
    combined = combined.fillna('n/a')

    # Samples that produced no valid sequences still get a (zeroed) row.
    for sample in noshow_samples:
        combined.loc[f'{sample}'] = [0, 0, 'n/a', 'n/a', 'n/a']

    # Column titles/formatting for the multiqc table
    headers = {
        'l_col0': {
            'title': 'total reads(non-unique) count',
            'description': 'total high quality reads count per sample across all primer pairs',
            'format': '{:,.0f}',
        },
        'l_col1': {
            'title': 'total reads(unique) count',
            'description': 'total high quality unique reads count per sample across all primer pairs',
            'format': '{:,.0f}',
        },
        'l_col2': {
            'title': 'average read length',
            'description': 'mean reads length per sample across all primer pairs',
            'format': '{:,.1f}',
            'scale': False,
        },
        'l_col3': {
            'title': 'minimum read length',
            'description': 'minimum reads length per sample across all primer pairs',
            'scale': False,
            'format': '{:,.0f}'
        },
        'l_col4': {
            'title': 'maximum read length',
            'description': 'maximum reads length per sample across all primer pairs',
            'scale': False,
            'format': '{:,.0f}'
        },
    }

    # multiqc custom-content payload: table config, headers, then the data rows
    yaml_dict = {
        'id': 'read_length_report',
        'section_name': 'Sample read length report',
        'description': 'reads length report per sample across all primer pairs in this run',
        'plot_type': 'table',
        'pconfig': {
            'id': 'read_length_report',
            'sort_rows': False,
        },
        'headers': headers,
        'data': combined.to_dict(orient='index')
    }

    with open(output, 'w') as out_fh:
        yaml.dump(yaml_dict, out_fh, sort_keys=False)

This script requires all report.csv file names be passed in a concatenated string as a command line argument.
'''

def make_report_yaml(output_file, data_df):
'''
this method generates a custom content yaml file specific for the multiqc report
this yaml file is for the final combined hmas summary report
Parameters
----------
output_file: output file (yaml) name
data_df: data part of the yaml file in the format of dataframe
Returns: None
----------
'''
# Create headers dictionary
headers = {
'col1': {
'title': 'Mean read depth',
'description': 'we include only reads with at least 2 sequence count',
'format': '{:,.1f}',
"scale": False
},
'col2': {
'title': '% of successful primer-pairs',
Expand All @@ -41,7 +228,9 @@ def make_report_yaml(output_file, data_df):
yaml_dict = {
'id': 'hmas_run_report',
'section_name': 'HMAS run report',
'description': 'combined summary report for all samples in this run',
'description': "Combined summary statistics for all the samples in the run, "
"showing the mean read depth and the number (and percentage) of successful"
"primer pairs (out of total 2461 in the Salmonella HMAS primer panel).",
'plot_type': 'table',
'pconfig': {
'id': 'hmas_run_report',
Expand Down Expand Up @@ -95,6 +284,13 @@ def parse_argument():
parser.add_argument('-p', '--reports', metavar = '', required = True, help = 'Specify reports')
parser.add_argument('-i', '--folder_path', metavar = '', required = True, help = 'Specify folder path for fasta.gz files')

parser.add_argument('-z', '--pyaml', metavar = '', required = True, help = 'Specify output primer_stats mqc report file')
parser.add_argument('-q', '--primer_stats', metavar = '', required = True, help = 'Specify input primer_stats')
parser.add_argument('-l', '--primers', metavar = '', required = True, help = 'Specify primers')

parser.add_argument('-x', '--lyaml', metavar = '', required = True, help = 'Specify output read_length mqc report file')
parser.add_argument('-r', '--read_length', metavar = '', required = True, help = 'Specify input read_length file')

return parser.parse_args()

if __name__ == "__main__":
Expand Down Expand Up @@ -122,4 +318,5 @@ def parse_argument():
#update empty cell to n/a
report_df.fillna('n/a', inplace=True)
make_report_yaml(args.yaml, report_df)

make_primer_stats_yaml(args.pyaml, args.primer_stats, args.primers)
make_read_length_yaml(args.lyaml, args.read_length, noshow_samples)
91 changes: 88 additions & 3 deletions bin/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import utilities
import argparse
from Bio import SeqIO


def parse_argument():
Expand All @@ -12,9 +13,92 @@ def parse_argument():
parser.add_argument('-c', '--count_table', metavar = '', required = True, help = 'Specify count table file')
parser.add_argument('-p', '--primers', metavar = '', required = True, help = 'Specify oligos/(primer) file')
parser.add_argument('-o', '--output', metavar = '', required = True, help = 'Specify output file')
parser.add_argument('-q', '--primer_stats', metavar = '', required = True, help = 'Specify primer_stats file')
parser.add_argument('-f', '--fasta', metavar = '', required = True, help = 'Specify fasta file')
parser.add_argument('-l', '--read_length', metavar = '', required = True, help = 'Specify output read_length file')
return parser.parse_args()


def generate_primer_stats(sample, count_file, primer_stats):
    '''
    Compute per-primer read counts for the given sample and write them
    out as a one-row tsv report of the form:
    #                          primer1 primer2 primer3
    # sample(abundance count)       10      20      30
    Parameters
    ----------
    sample: sample name
    count_file: original count_table file
    primer_stats: the output file
    Returns: DataFrame
    ----------
    '''
    counts = pd.read_csv(count_file, sep='\t')
    # the raw sequence column is not needed for per-primer totals
    counts = counts.drop(columns=['seq'])

    # Sum every remaining column, then pivot into a single row indexed by
    # the sample name.
    totals = counts.sum().to_frame(name=sample).T

    # Column names look like '2014K_0979.OG0000348primerGroup3'; keep only
    # the primer part after the first '.' (names without a '.' are unchanged).
    totals.columns = [name.split('.', 1)[-1] for name in totals.columns]

    totals.to_csv(f'{primer_stats}', sep='\t')

    return totals


def generate_read_length(sample, fasta_file, output):
    '''
    this method calculates read length stats for the given sample
    and generates a report (tsv file) of the form:
    #        num_total_seqs  num_seqs  avg_len  min_len  max_len
    # sample            100        10    175.0      150      200
    Parameters
    ----------
    sample: sample name
    fasta_file: the fasta file name
    output: the output read_length file name
    Returns: DataFrame
    ----------
    '''
    # Read the sequences from the FASTA file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))

    # Guard: an empty fasta would otherwise crash below (min()/max() of an
    # empty list, division by zero).  Emit a row consistent with the
    # 'noshow' row format used by the multiqc read-length report
    # ([0, 0, 'n/a', 'n/a', 'n/a']).
    if not sequences:
        stats = {
            "num_total_seqs": [0],
            "num_seqs": [0],
            "avg_len": ['n/a'],
            "min_len": ['n/a'],
            "max_len": ['n/a']
        }
        df = pd.DataFrame(stats, index=[sample])
        df.to_csv(f'{output}', sep='\t')
        return df

    # Total count of non-unique reads: dereplicated fasta headers carry the
    # cluster size as 'size=N' (case-insensitive).
    # NOTE(review): ids of the common 'seq;size=N;' form end with ';', which
    # the previous `.split("size=")[1].isdigit()` test silently rejected; we
    # now drop anything after a ';' before testing.  Assumes 'size=' is
    # immediately followed by the bare count -- TODO confirm against the
    # upstream dereplication step.
    total_count = 0
    for record in sequences:
        rid = record.id.lower()
        if "size=" in rid:
            token = rid.split("size=", 1)[1].split(";", 1)[0]
            if token.isdigit():
                total_count += int(token)

    # Calculate the statistics over the unique sequences
    num_seqs = len(sequences)
    lengths = [len(seq.seq) for seq in sequences]

    stats = {
        "num_total_seqs": [total_count],
        "num_seqs": [num_seqs],
        "avg_len": [round(sum(lengths) / num_seqs, 1)],
        "min_len": [min(lengths)],
        "max_len": [max(lengths)]
    }

    # Convert to DataFrame
    df = pd.DataFrame(stats, index=[sample])
    df.to_csv(f'{output}', sep='\t')

    return df


def report(sample, count_file, oligos_file, report_file):
'''
this method calculates metrics like: Mean read depth, # of failed primer pairs and
Expand All @@ -30,7 +114,6 @@ def report(sample, count_file, oligos_file, report_file):
Returns: DataFrame
----------
None
'''
raw_df = pd.read_csv(count_file, sep='\t')
raw_df.drop(['seq'], axis=1, inplace=True)
Expand Down Expand Up @@ -63,12 +146,14 @@ def report(sample, count_file, oligos_file, report_file):
f'# of primer pairs with more than 2 amplicons mapping\nover total primer-pairs']
report_df.columns = report_columns

report_df.to_csv(f'{report_file}.csv')
report_df.to_csv(f'{report_file}')

return report_df


if __name__ == "__main__":

    args = parse_argument()
    # per-sample summary report (mean read depth, primer-pair success).
    # NOTE(review): the diff rendering showed this call twice (old line +
    # new line without trailing newline); it must run exactly once.
    report(args.sample, args.count_table, args.primers, args.output)
    # per-primer read-count table consumed by the multiqc primer report
    generate_primer_stats(args.sample, args.count_table, args.primer_stats)
    # per-sample read-length table consumed by the multiqc read-length report
    generate_read_length(args.sample, args.fasta, args.read_length)
Loading

0 comments on commit 374d7cf

Please sign in to comment.