-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add file input argparse feature for different file formats
- Loading branch information
1 parent
8fa4974
commit fab239f
Showing
1 changed file
with
102 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
# ===================================================== DEPENDENCIES ============================================================== | ||
import argparse | ||
from Bio import SeqIO | ||
from Bio import Align | ||
from Bio.SeqUtils import gc_fraction | ||
from matplotlib import pyplot as plt | ||
plt.style.use('seaborn-v0_8') | ||
|
||
# ===================================================== ARGPARSE SETUP ============================================================= | ||
# Create parser instance | ||
parser = argparse.ArgumentParser(
    description='Calculate GC content of DNA/RNA sequences from files.'
)

# Shared converter: argparse calls it on every supplied path and hands back
# a handle opened for reading as UTF-8 text.
_utf8_reader = argparse.FileType('r', encoding='UTF-8')

# ----------------------------- Single sequence file formats as optional arguments ------------------------------
# Each of these options accepts one or more files (nargs='+').
# FASTA nucleic acid files
parser.add_argument('-fna', nargs='+', type=_utf8_reader, help='Provide .fna file')
# FASTA files
parser.add_argument('-fasta', nargs='+', type=_utf8_reader, help='Provide .fasta file')
# GenBank files
parser.add_argument('-gb', nargs='+', type=_utf8_reader, help='Provide .gb file')

# ------------------------------ Sequence alignment file formats as optional arguments ------------------------------
# Each of these options accepts exactly one file.
# Aligned FASTA files
parser.add_argument('-fa', type=_utf8_reader, help='Provide .fa file')
# ClustalW files
parser.add_argument('-aln', type=_utf8_reader, help='Provide .aln file')

# Parse the command line; every option becomes an attribute of `args`
# (None when the option was not supplied).
args = parser.parse_args()
|
||
def _accession_from_id(record_id):
    """Return the fourth '|'-separated field of a record id.

    Presumably the ids follow the NCBI ``gi|<gi>|ref|<accession>|`` layout,
    where the fourth field is the accession number (verify against the
    input files). Ids with fewer than four fields are returned unchanged
    instead of raising IndexError.
    """
    fields = record_id.split('|')
    return fields[3] if len(fields) > 3 else record_id


# Load the records for the first input option that was supplied. Options are
# checked in the order -fna, -fasta, -gb, -fa, -aln; later ones are ignored
# once an earlier one matches.
if args.fna is not None:
    # SeqIO.read expects exactly one record per file.
    record_list = [SeqIO.read(file, "fasta") for file in args.fna]
    sequence_list = [record.seq for record in record_list]
    accession_num_list = [_accession_from_id(record.id) for record in record_list]

elif args.fasta is not None:
    record_list = [SeqIO.read(file, "fasta") for file in args.fasta]
    sequence_list = [record.seq for record in record_list]
    accession_num_list = [record.id for record in record_list]

elif args.gb is not None:
    record_list = [SeqIO.read(file, "genbank") for file in args.gb]
    sequence_list = [record.seq for record in record_list]
    accession_num_list = [record.id for record in record_list]

elif args.fa is not None:
    # Alignment input: one file holding several aligned sequences.
    alignment = Align.read(args.fa, 'fasta')
    record_list = alignment.sequences
    sequence_list = [record.seq for record in record_list]
    accession_num_list = [record.id for record in record_list]

elif args.aln is not None:
    alignment = Align.read(args.aln, 'clustal')
    record_list = alignment.sequences
    sequence_list = [record.seq for record in record_list]
    accession_num_list = [_accession_from_id(record.id) for record in record_list]

else:
    # Without this branch the script would fail later with a NameError on
    # `sequence_list`; report the problem through argparse instead
    # (prints usage and exits with status 2).
    parser.error('No input file provided: use one of -fna, -fasta, -gb, -fa or -aln.')
|
||
# ===================================================== FUNCTIONS ================================================================== | ||
def calc_gc(sequence_list):
    """Return the GC content of each sequence, rounded to two decimals.

    Args:
        sequence_list: Iterable of sequences, each passed to
            ``gc_fraction``.

    Returns:
        A list holding one rounded ``gc_fraction`` value per input
        sequence, in input order. An empty input yields an empty list.
    """
    return [round(gc_fraction(sequence), 2) for sequence in sequence_list]
|
||
def create_GC_barplot(accession_num_list, gc_list):
    """Draw and show a bar chart of GC content, one bar per sequence.

    The plotted values are the directly calculated GC values (not the ones
    obtained with SWAN).

    Args:
        accession_num_list: Labels used as x positions, one per bar.
        gc_list: Bar heights, in the same order as ``accession_num_list``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _figure, axes = plt.subplots()

    # Bars, with each bar annotated by its value further below
    bars = axes.bar(accession_num_list, gc_list, width=0.3, color='#3937b3')

    # Title and axis captions
    axes.set_title("Sequence GC Content Analysis")
    axes.set_xlabel("Sequences")
    axes.set_ylabel("GC content")

    axes.bar_label(bars)

    # Fix the x-range when only one sequence is plotted (presumably so the
    # lone bar is not stretched across the whole axis).
    only_one_sequence = len(accession_num_list) == 1
    if only_one_sequence:
        axes.set_xlim(-0.5, 0.5)

    # Grey out all four borders of the plot
    for side in ('top', 'right', 'bottom', 'left'):
        axes.spines[side].set_color('0.5')

    plt.show()
|
||
|
||
# Compute the GC content of every loaded sequence, then plot one bar per
# sequence labelled with its accession number.
gc_content_list = calc_gc(sequence_list=sequence_list)
create_GC_barplot(accession_num_list=accession_num_list, gc_list=gc_content_list)