This repository has been archived by the owner on Jul 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Contig_stats.py
executable file
·80 lines (60 loc) · 1.83 KB
/
Contig_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
import os, sys, string, numpy
from Bio.Seq import Seq
from modules.Si_SeqIO import *
# Bases that take part in the dinucleotide table, in output column order.
BASES = ("A", "C", "G", "T")
# The 16 dinucleotide column names: AA, AC, AG, AT, CA, ... TT.
DINUCLEOTIDES = [first + second for first in BASES for second in BASES]


def contig_stats(seq):
    """Return per-sequence statistics for one contig.

    seq is the contig sequence (anything ``str()`` can convert); case is
    ignored.

    Returns a tuple ``(length, length_without_ns, gc_percent, counts)``:
      * length            -- total number of characters in the sequence
      * length_without_ns -- length after removing every "N"
      * gc_percent        -- G+C count as a percentage of length_without_ns;
                             0.0 when the sequence has no non-N characters
                             (avoids a division by zero on all-N contigs)
      * counts            -- list of 16 ints, overlapping dinucleotide counts
                             in DINUCLEOTIDES order (AA, AC, ... TT)

    Pairs containing anything other than A, C, G or T (N, IUPAC ambiguity
    codes, gap characters, ...) are skipped rather than raising KeyError.
    """
    seq = str(seq).upper()
    length = len(seq)
    length_without_ns = length - seq.count("N")

    # Count G and C explicitly so ambiguity codes are not mistaken for GC.
    gc_count = seq.count("G") + seq.count("C")
    if length_without_ns:
        # Same operation order as the historical formula so that printed
        # values stay identical for plain A/C/G/T/N input.
        gc_percent = (float(gc_count) / length_without_ns) * 100
    else:
        gc_percent = 0.0

    tally = dict.fromkeys(DINUCLEOTIDES, 0)
    for start in range(length - 1):
        pair = seq[start:start + 2]
        if pair in tally:
            tally[pair] += 1

    return length, length_without_ns, gc_percent, [tally[d] for d in DINUCLEOTIDES]


def n50_stats(lengths):
    """Return ``(N50, N50n)`` for a collection of contig lengths.

    Contigs are visited from longest to shortest while the running total is
    still below half of the overall total; N50 is the length of the last
    contig visited and N50n is how many contigs were visited.  Returns
    ``(0, 0)`` when there is nothing to accumulate (no contigs, or a total
    length of zero).
    """
    ordered = sorted(lengths, reverse=True)
    half_total = float(sum(ordered)) / 2
    running_total = 0
    n50_length = 0
    n50_count = 0
    for contig_length in ordered:
        if running_total >= half_total:
            break
        n50_length = contig_length
        running_total += contig_length
        n50_count += 1
    return n50_length, n50_count


def print_summary(lengths, gcs):
    """Print the whole-file length, N50 and GC% summary lines.

    lengths and gcs are non-empty lists holding one entry per contig.
    Each line is built as a single string so the output is the same whether
    ``print`` is a statement (Python 2) or a function (Python 3).
    """
    print("Total length = %s" % numpy.sum(lengths))
    print("Number of contigs = %s" % len(lengths))
    print("Mean length = %s" % numpy.mean(lengths))
    print("Standard deviation of lengths = %s" % numpy.std(lengths))
    print("Maximum length = %s" % numpy.max(lengths))
    print("Minimum length = %s" % numpy.min(lengths))
    n50_length, n50_count = n50_stats(lengths)
    print("N50 = %s" % n50_length)
    print("N50n = %s" % n50_count)
    print("Mean GC%% = %s" % numpy.mean(gcs))
    print("GC%% standard deviation = %s" % numpy.std(gcs))
    print("Median GC%% = %s" % numpy.median(gcs))
    print("Maximum GC%% = %s" % numpy.max(gcs))
    print("Minimum GC%% = %s" % numpy.min(gcs))


def main(filenames=None):
    """Print per-contig and summary statistics for each FASTA file.

    filenames defaults to the command-line arguments (``sys.argv[1:]``).
    For every file this prints the file name, a tab-separated header, one
    tab-separated row per contig, and then the summary block.  Files that
    cannot be opened are reported and skipped; files that yield no records
    are reported instead of crashing in the summary.
    """
    if filenames is None:
        filenames = sys.argv[1:]

    heading = ['contig', 'length', 'length without Ns', 'GC%'] + DINUCLEOTIDES

    for filename in filenames:
        try:
            handle = open(filename)
        except IOError:
            print("Could not open file %s" % filename)
            continue

        print(filename)
        print('\t'.join(heading))

        lengths = []
        gcs = []
        # The handle is closed as soon as parsing finishes.
        with handle:
            # SeqIO is provided by the star-import from modules.Si_SeqIO;
            # presumably it mirrors Biopython's SeqIO.parse -- TODO confirm.
            for contig in SeqIO.parse(handle, "fasta"):
                length, length_without_ns, gc_percent, counts = contig_stats(contig.seq)
                lengths.append(length)
                gcs.append(gc_percent)
                row = [contig.id, str(length), str(length_without_ns), str(gc_percent)]
                row.extend(str(count) for count in counts)
                print('\t'.join(row))

        if not lengths:
            # numpy.max/min raise on empty input and N50 is undefined.
            print("No sequences found in %s" % filename)
            continue

        print_summary(lengths, gcs)


if __name__ == "__main__":
    main()