-
Notifications
You must be signed in to change notification settings - Fork 44
/
count_spacers.py
115 lines (100 loc) · 4.42 KB
/
count_spacers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#Supplementary Data 3: count_spacers.py
from Bio import SeqIO
import csv
from collections import OrderedDict
import numpy as np
import sys
import argparse
KEY_REGION_START = 30 #start index of key region
KEY_REGION_END = 55 #end index of key region
KEY = "CGAAACACC" #identifies sequence before guide to determine guide position
def count_spacers(input_file, fastq_file, output_file, guide_g):
"""
creates a dictionary with guide counts from fastq_file, writes to output_file
fastq_file: forward read fastq file
output_file: csv file to write guide dictionary to
dictionary: guide sequence as key, guide count as entry
"""
num_reads = 0 #total number of reads processed
perfect_matches = 0 # guides with perfect match to library
non_perfect_matches = 0 #number of guides without a perfect match to the library
key_not_found = 0 #count of reads where key was not found
# add 'G' to key sequence if included in library
if guide_g:
global KEY
KEY += "G"
# open library sequences and initiate dictionary of read counts for each guide
try:
with open(input_file, mode='rU') as infile: #rU mode is necessary for excel!
reader = csv.reader(infile)
dictionary = {rows[0]:0 for rows in reader}
except:
print 'could not open', input_file
# open fastq file
try:
handle = open(fastq_file, "rU")
except:
print "could not find fastq file"
return
# process reads in fastq file
readiter = SeqIO.parse(handle, "fastq")
for record in readiter: #contains the seq and Qscore etc.
num_reads += 1
read_sequence = str.upper(str(record.seq))
key_region = read_sequence[KEY_REGION_START:KEY_REGION_END]
key_index = key_region.find(KEY)
if key_index >= 0:
start_index = key_index + KEY_REGION_START + len(KEY)
guide = read_sequence[start_index:(start_index + 20)]
if guide in dictionary:
dictionary[guide] += 1
perfect_matches += 1
else:
non_perfect_matches += 1
else:
key_not_found += 1
# create ordered dictionary with guides and respective counts and output as a csv file
dict_sorted = OrderedDict(sorted(dictionary.items(), key=lambda t: t[0]))
with open(output_file, 'w') as csvfile:
mywriter = csv.writer(csvfile, delimiter=',')
for guide in dict_sorted:
count = dict_sorted[guide]
mywriter.writerow([guide,count])
# percentage of guides that matched perfectly
percent_matched = round(perfect_matches/float(perfect_matches + non_perfect_matches) * 100, 1)
# percentage of undetected guides with no read counts
guides_with_reads = np.count_nonzero(dictionary.values())
guides_no_reads = len(dictionary.values()) - guides_with_reads
percent_no_reads = round(guides_no_reads/float(len(dictionary.values())) * 100, 1)
# skew ratio of top 10% to bottom 10% of guide counts
top_10 = np.percentile(dictionary.values(), 90)
bottom_10 = np.percentile(dictionary.values(), 10)
if top_10 != 0 and bottom_10 != 0:
skew_ratio = top_10/bottom_10
else:
skew_ratio = 'Not enough perfect matches to determine skew ratio'
# write analysis statistics to statistics.txt
with open('statistics.txt', 'w') as infile:
infile.write('Number of perfect guide matches: ' + str(perfect_matches) + '\n')
infile.write('Number of nonperfect guide matches: ' + str(non_perfect_matches) + '\n')
infile.write('Number of reads where key was not found: ' + str(key_not_found) + '\n')
infile.write('Number of reads processed: ' + str(num_reads) + '\n')
infile.write('Percentage of guides that matched perfectly: ' + str(percent_matched) + '\n')
infile.write('Percentage of undetected guides: ' + str(percent_no_reads) + '\n')
infile.write('Skew ratio of top 10% to bottom 10%: ' + str(skew_ratio))
infile.close()
handle.close()
return
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Analyze sequencing data for sgRNA library distribution')
parser.add_argument('-f', '--fastq', type=str, dest='fastq_file',
help='fastq file name', default='NGS.fastq')
parser.add_argument('-o', '--output', type=str, dest='output_file',
help='output file name', default='library_count.csv')
parser.add_argument('-i', '--input', type=str, dest='input_file',
help='input file name', default='library_sequences.csv')
parser.add_argument('-no-g', dest='guide_g', help='presence of guanine before spacer', action='store_false')
parser.set_defaults(guide_g=True)
args = parser.parse_args()
count_spacers(args.input_file, args.fastq_file, args.output_file, args.guide_g)