# gtf_with_sequence_extract_flex.py
# gtf files may sometimes contain a sequence entry in the attribute column;
# this script extracts that entry along with the transcript, treating the gtf
# file as a tsv. This is definitely easier done with bedtools, especially in the
# general case where sequences are not available (which they typically are not).
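# for reference, the general-case extraction with bedtools looks roughly like this
# (a sketch; the file names and the -name/-s choices are assumptions):
#   bedtools getfasta -fi genome.fa -bed annotation.gtf -fo transcripts.fasta -name -s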
import os
import random

import pandas as pd

random.seed(31523)
class my_dictionary(dict):
    # a dict with an explicit add(key, value) helper
    def add(self, key, value):
        self[key] = value
# set up some variables
out_dir = "out/"
os.makedirs(out_dir, exist_ok=True)
fasta_out = out_dir + "unique_transcripts.fasta"
summary_out = out_dir + "summary.txt"
gtf_path = "synthetic/SPRMT_merge_031323_seq_remove_merged.gtf"
dup_remove = True
summary_sub_out = out_dir + "summary_SPRMT_RMD.txt"
sub_gtf = out_dir + "subset_SPRMT_RMD.gtf"
# read in the gtf as a tsv
gtf_data = pd.read_table(gtf_path, header=None)
gtf_data.columns = ["chr", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
gtf_data[['attribute', 'transcript_copy_id', 'sequence', 'biotype', 'bed_sequence']] = gtf_data['attribute'].str.split(';', expand=True)
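# the split above assumes the attribute column holds exactly five ';'-separated
# fields in this order (inferred from the column names and the string slices below):
#   transcript_id "..."; transcript_copy_id "..."; sequence "..."; biotype "..."; bed_sequence "..."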
# keep only non-duplicated entries for base testing (i.e. the pipelines can function
# with no external variables); keep=False drops every copy of a duplicated sequence,
# not just the extras
all_entry = len(gtf_data)
# gtf_unique = gtf_data.drop_duplicates(subset=['attribute'], keep=False)
# gtf_unique = gtf_unique.drop_duplicates(subset=['sequence'], keep=False)
gtf_unique = gtf_data.drop_duplicates(subset=['bed_sequence'], keep=False)
num_unique = len(gtf_unique)
print(str(all_entry - num_unique) + " transcripts have been removed")
# fasta conversion: keep just the columns needed for the fasta records
unique_fasta = gtf_unique[['attribute', 'bed_sequence', 'biotype']]
num_5S = 0
num_rRNA = 0
num_transcripts = 0
biotypes = my_dictionary()
with open(fasta_out, 'w') as f:
    for _, row in unique_fasta.iterrows():
        # the slices assume fields shaped like: transcript_id "NAME" / biotype "TYPE"
        name = str(row['attribute'])[15:-1]
        if "5S" in name:
            num_5S = num_5S + 1
            # continue
        if "rRNA" in name:
            num_rRNA = num_rRNA + 1
            # continue
        print(">" + name + "._." + str(row['biotype'])[10:-1], file=f)
        print(str(row['bed_sequence'])[15:-1], file=f)
        if str(row['biotype']) not in biotypes:
            biotypes.add(str(row['biotype']), 0)
        biotypes[str(row['biotype'])] = biotypes[str(row['biotype'])] + 1
        num_transcripts = num_transcripts + 1
with open(summary_out, 'w') as f:
    # with the 'continue' skips commented out above, 5S and rRNA transcripts are
    # counted rather than removed
    print("number of 5S RNA counted: " + str(num_5S), file=f)
    print("number of rRNA counted: " + str(num_rRNA), file=f)
    print("number of total transcripts removed: " + str(all_entry - num_unique), file=f)
    print("number of transcripts in fasta: " + str(num_transcripts), file=f)
    print("number of RNA types in fasta: " + str(biotypes), file=f)
# take a random subset of the fasta file & create a gtf file for the subset
subset_num = 1400
random_set = []
# set paths
transcript_path = 'out/unique_transcripts.fasta'
subset_path = 'out/subset_transcripts.fasta'
transcript_list = []
# pick the number of each transcript type based on the databases
# taking user input is not ideal, but it works for now; future versions may drop
# the reliance on human input, since prompts will be a pain for users. More
# datasets will be integrated with the annotation file, though, so this should
# stay at least somewhat applicable if things change (see the non-interactive
# sketch after the prompt loop below)
set_numb_types = my_dictionary()
for biotype in biotypes:  # 'biotype' rather than 'set', to avoid shadowing the built-in
    print(biotype)
    num_of_trans = input("How many transcripts would you like for the printed set: ")
    num_of_trans = int(num_of_trans)
    set_numb_types.add(biotype, num_of_trans)
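# a possible non-interactive alternative (hypothetical sketch; the counts and keys
# below are placeholders, not values from the real annotation):
# preset_counts = {' biotype "misc_RNA"': 500, ' biotype "rRNA"': 200}
# for biotype, n in preset_counts.items():
#     set_numb_types.add(biotype, n)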
# randomly pick transcripts of each requested type; not the fastest approach, but
# it should run at an okay speed, since users are unlikely to pick a subset of 40k
source_sum = 0
with open(transcript_path, 'r') as f:
    contents = f.readlines()
# populate the random index list one biotype at a time until each requested count
# is reached; header lines sit at even indices in the fasta, hence the * 2
for source in biotypes:
    source_sum = source_sum + set_numb_types[source]
    print(source_sum)
    while len(random_set) < source_sum:
        the_number = random.randint(0, num_transcripts - 1)
        the_number = the_number * 2
        if source[10:-1] in contents[the_number].strip():
            if the_number not in random_set:
                random_set.append(the_number)
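# note: the while loop above never terminates if a biotype has fewer unique
# transcripts than requested, so the prompts need realistic counts; a guard such
# as capping the number of draws would be a reasonable addition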
for num in random_set:
    transcript_list.append(contents[num].strip())
    transcript_list.append(contents[num + 1].strip())
subset_summary = my_dictionary()
rRNA_types = {"5S": 0, "rRNA": 0}
with open(subset_path, 'w') as f:
    for item in transcript_list:
        for source in biotypes:
            if source[10:-1] in item:
                if source not in subset_summary:
                    subset_summary.add(source, 0)
                subset_summary[source] = subset_summary[source] + 1
        # count the rRNA types (both keys are pre-seeded in rRNA_types, so a plain
        # increment is all that is needed)
        if "rRNA" in item:
            rRNA_types["rRNA"] = rRNA_types["rRNA"] + 1
        if "5S" in item:
            rRNA_types["5S"] = rRNA_types["5S"] + 1
        print(item.partition('._.')[0], file=f)
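# e.g. a header line ">tRNA-Ala._.tRNA" (hypothetical name) is written back as
# ">tRNA-Ala": partition('._.') strips the biotype tag added when the fasta was
# built, while sequence lines contain no '._.' and pass through unchanged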
with open(summary_sub_out, 'w') as f:
    print(subset_summary, file=f)
    print(rRNA_types, file=f)
# note: the following gtf file creation will only work if each transcript is
# unique, with a unique sequence
# length_fast = len(transcript_list)
# for i in range(int(length_fast)):
#     if i % 2 == 1:
#         transcript_list[i]
#     else:
#         continue
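# a possible completion (untested sketch; note that the attribute fields were split
# into separate columns earlier, so they would need rejoining to write a valid gtf):
# selected_names = {transcript_list[i].partition('._.')[0][1:]
#                   for i in range(0, len(transcript_list), 2)}
# mask = gtf_unique['attribute'].str[15:-1].isin(selected_names)
# gtf_unique[mask].to_csv(sub_gtf, sep='\t', header=False, index=False)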