-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcdr3_hamming_accumulation.py
111 lines (99 loc) · 4.96 KB
/
cdr3_hamming_accumulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python
import operator
import os
import random
import sys
from collections import defaultdict
from itertools import imap

from blist import blist
__author__ = 'Csaba Kiss'
__email__ = 'csakis[at]lanl[dot]gov'
"""This script's input (.cdr3) is the file created by the cdr3_pipeline.py python script, each line
containing a single CDR3 sequence.
It finds the unique CDR3s and counts the occurrences of each of them.
After that, the script bins the CDR3s at a given Hamming distance."""
def dist(str1, str2):
    """Return the Hamming distance between two sequences.

    The sequences are compared position by position and the number of
    positions that differ is returned as an int.

    If the inputs differ in length, only the overlapping prefix is compared
    and the tail of the longer one is ignored, so callers should pass
    sequences of equal length.
    """
    # zip() stops at the shorter input exactly like the former
    # imap(operator.ne, ...) did, but does not rely on itertools.imap,
    # which only exists on Python 2.
    return sum(1 for left, right in zip(str1, str2) if left != right)
# Pick the input file: the first command-line argument when one was given,
# otherwise ask for it interactively.
if len(sys.argv) >= 2:
    cdr_in = sys.argv[1]
else:
    cdr_in = raw_input('Please input the sample name sd6.cdr3: ')
bins_out = 'bins_' + cdr_in
# Read the input file (one CDR3 sequence per line) and tally the sequences.
cdr3_dict = defaultdict(int)  # unique CDR3 sequence -> number of occurrences
full_cdr_list_dict = defaultdict(blist)  # a dict containing all the CDR3s by length
cdr3_length_list = blist()  # the distinct CDR3 lengths, in order of first appearance
no_of_all_cdr3 = 0
# The with-block closes the input file as soon as it has been read, also
# when an error interrupts the loop.
with open(cdr_in, 'rU') as f_in:
    for line in f_in:  # read CDR3s line by line
        # Strip only the line terminator. Unconditionally dropping the last
        # character would cut a residue off the final CDR3 whenever the file
        # does not end with a newline.
        line = line.rstrip('\n')
        no_of_all_cdr3 += 1
        cdr3_length = len(line)
        # A length is new exactly when it has no bucket yet; the dict lookup
        # replaces a linear scan of cdr3_length_list.
        if cdr3_length not in full_cdr_list_dict:
            cdr3_length_list.append(cdr3_length)
        full_cdr_list_dict[cdr3_length].append(line)
        cdr3_dict[line] += 1
print('There are %d CDR3s to cluster.' % no_of_all_cdr3)
print('There are %d unique CDR3s in the file.' % len(cdr3_dict))
# Walk the unique CDR3s from the most to the least frequent one, giving each
# a zeroed cluster counter and filing it under its length, so every per-length
# list ends up ordered by descending occurrence count.
unique_cdr3_name_count = {}
unique_cdr3list_by_count = defaultdict(blist)
ranked_cdr3s = sorted(cdr3_dict.items(), key=lambda item: item[1], reverse=True)
for sequence, _occurrences in ranked_cdr3s:
    unique_cdr3_name_count[sequence] = 0  # cluster counters start at zero
    unique_cdr3list_by_count[len(sequence)].append(sequence)
# Ask for the clustering threshold; a non-integer answer raises ValueError.
Hamming_distance = int(raw_input('Please enter the desired Hamming distance: '))
# Derive the sample name from the file name only. Splitting the whole path at
# the first '.' turns './sd6.cdr3' into an empty name and leaves the directory
# separator of 'data/sd6.cdr3' inside the output file names.
sample_name = os.path.basename(cdr_in).split('.')[0]
file_out_cluster_name = 'bins_Hamming_%d_%s.csv' % (Hamming_distance, sample_name)  # file that contains the CDR3 clusters
file_out_accumulation_name = 'accumulation_Hamming_%d_%s.csv' % (Hamming_distance, sample_name)  # file that contains the accumulation numbers
hamming_accumulation_dict = {}  # contains the accumulation numbers at a given Hamming distance
hamming_accumulation_counter = 0  # number of CDR3s processed so far
unique_no = 0  # number of clusters that have received at least one CDR3
# Cluster the CDR3s one length class at a time, longest sequences first, and
# record after every processed CDR3 how many clusters have been hit so far.
# NOTE(review): the indentation of this block was reconstructed; in particular
# the final `else` is assumed to belong to the `if ... == 0` test and not to
# the inner `for` loop -- confirm against the original file.
for cdr3_length in sorted(cdr3_length_list, reverse=True):
    all_cdr_list = blist(full_cdr_list_dict[cdr3_length]) # all the CDR3s with the particular length
    # random.shuffle(all_cdr_list)
    unique_cdrlist = blist(unique_cdr3list_by_count[cdr3_length]) #The unique CDR3 list of 'cdr3_length' length with the most common first
    """ We have to create a hamming cdr that contains all the unique CDR3s list for the given Hamming Distance"""
    print 'We will create the Hamming list now for length %d' %cdr3_length
    # hamming_cdrlist starts as a copy of the unique list and is thinned out
    # below until it only holds the cluster representatives.
    hamming_cdrlist = blist(unique_cdrlist)
    abs_counter = 0  # position of the representative currently being compared
    unique_length = len(unique_cdrlist)
    # Keep going while at least one sequence follows the current position.
    while unique_length - abs_counter >=2:
        # Drop every later sequence that lies within Hamming_distance of the
        # sequence at abs_counter. Indexing uses the unique_cdrlist snapshot,
        # removal happens on hamming_cdrlist.
        for j in range(abs_counter + 1, unique_length):
            if dist(unique_cdrlist[abs_counter],unique_cdrlist[j]) <= Hamming_distance:
                if unique_cdrlist[j] in hamming_cdrlist:
                    hamming_cdrlist.remove(unique_cdrlist[j])
        # Refresh the snapshot so the next pass only sees the survivors.
        unique_cdrlist = blist(hamming_cdrlist)
        unique_length = len(unique_cdrlist)
        abs_counter +=1
    for cdr3 in all_cdr_list: # we go through each cdr3 one by one
        hamming_accumulation_counter += 1
        # Progress report every 10000 processed CDR3s.
        if hamming_accumulation_counter % 10000 == 0:
            print "%d CDR3s have been checked so far, %d to go." % (hamming_accumulation_counter, (no_of_all_cdr3 - hamming_accumulation_counter))
        if unique_cdr3_name_count[cdr3] == 0: # if we have not found this CDR3 yet
            # Credit the first representative within Hamming_distance.
            for hamming_cdr in hamming_cdrlist:
                if dist(cdr3, hamming_cdr) <= Hamming_distance:
                    if unique_cdr3_name_count[hamming_cdr] == 0:
                        # First hit on this representative: a new cluster.
                        unique_no += 1
                        unique_cdr3_name_count[hamming_cdr] = 1
                    else:
                        unique_cdr3_name_count[hamming_cdr] += 1
                    break
        else:
            # This sequence already has a non-zero counter: bump it directly.
            unique_cdr3_name_count[cdr3] += 1
        # Accumulation point: clusters seen after this many CDR3s.
        hamming_accumulation_dict[hamming_accumulation_counter] = unique_no
# Write the cluster table: one "sequence, count" row for every cluster that
# received at least one CDR3, largest cluster first.
cdr3_count = 0
# The with-blocks close each output file even if a write fails part-way.
with open(file_out_cluster_name, 'w') as f_out:
    for cdr3, no_of_cdr3 in sorted(unique_cdr3_name_count.items(), key=operator.itemgetter(1), reverse=True):
        if int(no_of_cdr3) > 0:
            cdr3_count += 1
            f_out.write('%s, %s\n' % (cdr3, no_of_cdr3))  # the csv file contains unique seq, count
# Write the accumulation curve: one "CDR3 index, clusters seen so far" row per
# processed CDR3. Rows are ordered by CDR3 index; the cluster total never
# decreases, so they are still in ascending order of that total, but rows
# sharing a total no longer come out in arbitrary dict order.
with open(file_out_accumulation_name, 'w') as f_out:
    for cdr3_index, clusters_so_far in sorted(hamming_accumulation_dict.items()):
        f_out.write('%s, %s\n' % (cdr3_index, clusters_so_far))
print('There are %d unique clusters at Hamming distance %d' % (cdr3_count, Hamming_distance))
print('The %s file has been created successfully.' % file_out_accumulation_name)
print('The %s file has been created successfully.' % file_out_cluster_name)