-
Notifications
You must be signed in to change notification settings - Fork 6
/
gc_calculator.py
100 lines (87 loc) · 3.46 KB
/
gc_calculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import Bio
from Bio import SeqIO
import csv
import sys
import math
def _count_gc(sequence):
    """Return ``(c_count, g_count, gc_percentage)`` for a single sequence.

    Bases are matched case-insensitively so that soft-masked (lowercase)
    FASTA records are counted as well. The percentage is taken over
    A/C/G/T only: any other symbol (N, gaps, ambiguity codes) is left out
    of the denominator. A sequence containing no A/C/G/T at all yields a
    percentage of 0.0 instead of raising ZeroDivisionError.
    """
    bases = str(sequence).upper()
    c_count = bases.count('C')
    g_count = bases.count('G')
    total = c_count + g_count + bases.count('A') + bases.count('T')
    if total == 0:
        return c_count, g_count, 0.0
    # The C and G shares are computed separately and then summed so the
    # floating-point result matches what this script has always reported.
    gc_percentage = c_count / total * 100 + g_count / total * 100
    return c_count, g_count, gc_percentage


def main(input_file,output_file_name):
    """Report the C count, G count and GC percentage of every FASTA record.

    One CSV row ``[record id, C count, G count, GC percentage]`` is written
    per record (no header row), and the same figures are printed to stdout.

    Args:
        input_file: FASTA source, passed unchanged to ``Bio.SeqIO.parse``.
        output_file_name: Path of the CSV file to create; an existing file
            is overwritten.
    """
    # The parser is created before the output file is opened so that a bad
    # input is reported before an existing CSV could be truncated.
    records = Bio.SeqIO.parse(input_file, 'fasta')

    # newline='' hands line-ending control to the csv module; without it
    # every row is followed by a blank line on Windows.
    with open(output_file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        for seq_record in records:
            c_counter, g_counter, gcpercentage = _count_gc(seq_record.seq)

            print (seq_record.id)
            writer.writerow([seq_record.id, c_counter, g_counter, gcpercentage])
            print ('NUMBER OF G: ', g_counter)
            print ('NUMBER OF C: ', c_counter)
            print ('PERCENTAGE: ', gcpercentage,'%')
            print ('--------------------------------------------------')
    # NOTE: no MEAN / STANDARD DEVIATION summary is emitted. A correct one
    # needs every record's percentage first, then the mean, then the
    # deviations from that final mean.
def parse_cli_args(argv):
    """Extract the input and output paths from a command-line argument list.

    Args:
        argv: Arguments without the program name, e.g. ``sys.argv[1:]``.

    Returns:
        A ``(input_file, output_file_name)`` tuple taken from the values
        following ``-i`` and ``-o``.

    Raises:
        SystemExit: If ``-i`` or ``-o`` is missing or has no value; argparse
            prints a usage message and exits with status 2.
    """
    # Imported here so this block does not depend on the file's import list.
    import argparse

    parser = argparse.ArgumentParser(
        description='Report C/G counts and GC percentage per FASTA record.')
    parser.add_argument('-i', dest='input_file', required=True,
                        help='FASTA file to read')
    parser.add_argument('-o', dest='output_file_name', required=True,
                        help='CSV file to write')
    # parse_known_args keeps tolerating unrelated extra arguments, which
    # earlier versions of this script silently ignored.
    options, _ignored = parser.parse_known_args(argv)
    return options.input_file, options.output_file_name


# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    input_file, output_file_name = parse_cli_args(sys.argv[1:])
    main(input_file,output_file_name)