-
Notifications
You must be signed in to change notification settings - Fork 0
/
DNA_MIDI.py
350 lines (300 loc) · 15.9 KB
/
DNA_MIDI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
from Bio.Seq import Seq
from midiutil.MidiFile import MIDIFile
import os, os.path
import enum
'''a module to convert DNA (in the form of a text file of nucleotides) into a MIDI file'''
CHANNEL = 0 # MIDI channel passed to every addNote call in this module; intended to sound as a piano (presumably the player's default instrument for this channel -- confirm)
class ChordQuality(enum.Enum):
    '''the kinds of triad that change_key can build on a tonic'''
    # enum.auto() numbers the members 1, 2, 3 in declaration order
    MAJOR = enum.auto()
    MINOR = enum.auto()
    DIMINISHED = enum.auto()
# note name -> MIDI pitch number handed to MIDIFile.addNote (MIDIUtil);
# the twelve names cover one chromatic octave, pitches 60 through 71
PITCH_DICTIONARY = {"C": 60,
"C#": 61,
"D": 62,
"Eb": 63,
"E": 64,
"F": 65,
"F#": 66,
"G": 67,
"Ab": 68,
"A": 69,
"Bb": 70,
"B": 71}
# the same twelve note names in ascending order; a note's index is its number of
# semitones above C, which is how change_key and get_submediant_note_name use it
KEYS = ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B"]
# codons that end translation in add_notes
STOP_CODONS = ["TAA", "TGA", "TAG"]
# codon -> note name of the key that add_notes changes to when that codon is read
# during translation; codons of the same amino acid share a key
AMINO_ACIDS = { #essential
"ATG":"C", #methionine/start codon
"ATT":"C#",
"ATC":"C#",
"ATA":"C#", #isoleucine
"TTA":"C#",
"TTG":"C#",
"CTT":"C#",
"CTC":"C#",
"CTA":"C#",
"CTG":"C#", #leucine -- group with isoleucine since they're most similar by Grantham's distance
"AAA":"D",
"AAG":"D", #lysine
"ACT":"Eb",
"ACC":"Eb",
"ACA":"Eb",
"ACG":"Eb", #threonine
"TTT":"E",
"TTC":"E", #phenylalanine
"TGG":"F", #tryptophan
"CAT":"F#",
"CAC":"F#", #histidine
"GTT":"G",
"GTC":"G",
"GTA":"G",
"GTG":"G", #valine
#nonessential
"AAT":"Ab",
"AAC":"Ab", #asparagine
"GAT":"A",
"GAC":"A", #aspartate
"GCT":"Bb",
"GCC":"Bb",
"GCA":"Bb",
"GCG":"Bb", #alanine
"GAA":"B",
"GAG":"B", #glutamate
#conditionally essential
"TAT":"C",
"TAC":"C", #tyrosine
"TGT":"D",
"TGC":"D", #cysteine
"TCC":"E",
"TCT":"E",
"TCA":"E",
"TCG":"E",
"AGT":"E",
"AGC":"E", #serine
"AGA":"F",
"AGG":"F",
"CGT":"F",
"CGC":"F",
"CGA":"F",
"CGG":"F", #arginine
"CCT":"G",
"CCC":"G",
"CCA":"G",
"CCG":"G", #proline
"CAA":"A",
"CAG":"A", #glutamine
"GGT":"B",
"GGC":"B",
"GGA":"B",
"GGG":"B", #glycine
#stop codons
"TAA":"C",
"TAG":"E",
"TGA":"G" }
# nucleotide -> its base-pairing partner; keys are upper-case only
COMPLEMENT = {
"A":"T",
"T":"A",
"C":"G",
"G":"C",
}
# the codons listed under "conditionally essential" in AMINO_ACIDS; add_notes raises
# the top note of the chord by one semitone for these codons (augmented chord)
CONDITIONALLY_ESSENTIAL = ["TAT",
"TAC", #tyrosine
"TGT",
"TGC", #cysteine
"TCC",
"TCT",
"TCA",
"TCG",
"AGT",
"AGC", #serine
"AGA",
"AGG",
"CGT",
"CGC",
"CGA",
"CGG", #arginine
"CCT",
"CCC",
"CCA",
"CCG", #proline
"CAA",
"CAG", #glutamine
"GGT",
"GGC",
"GGA",
"GGG"] #glycine
def change_key(curr_tonic, curr_tonic_note_name, new_key, chord_quality, dna_to_chromatic_dict):
    '''move from the current key to new_key and rebuild the nucleotide-to-note mapping.

    curr_tonic is the position of the current tonic in KEYS and curr_tonic_note_name is its note name;
    new_key is the note name of the new tonic. The distance between the two tonics is measured in
    semitones going up the scale, and the triad on the new tonic is built according to chord_quality:
    the middle note is 4 semitones up for ChordQuality.MAJOR and 3 otherwise, the top note is
    6 semitones up for ChordQuality.DIMINISHED and 7 otherwise.

    dna_to_chromatic_dict is updated in place: "A" maps to the tonic, "C" and "T" to the mediant and
    "G" to the dominant. Returns (new tonic position in KEYS, tonic name, mediant name, dominant name)'''
    octave = len(KEYS)
    semitones_diff = PITCH_DICTIONARY[curr_tonic_note_name] - PITCH_DICTIONARY[new_key]
    if semitones_diff < 0:
        # the new tonic lies above the current one
        steps_up = -semitones_diff
    else:
        # the new tonic lies at or below the current one, so go up and wrap around the octave
        steps_up = octave - semitones_diff
    new_tonic = (curr_tonic + steps_up) % octave

    third = 4 if chord_quality == ChordQuality.MAJOR else 3
    fifth = 6 if chord_quality == ChordQuality.DIMINISHED else 7
    tonic_name = KEYS[new_tonic]
    mediant_name = KEYS[(new_tonic + third) % octave]
    dominant_name = KEYS[(new_tonic + fifth) % octave]

    dna_to_chromatic_dict.update(A=tonic_name, C=mediant_name, T=mediant_name, G=dominant_name)
    return (new_tonic, tonic_name, mediant_name, dominant_name)
def get_submediant_note_name(curr_tonic):
    '''return the name of the note nine semitones above the tonic at position curr_tonic in KEYS,
    wrapping around once when that position runs past the end of KEYS'''
    octave = len(KEYS)
    position = curr_tonic + 9
    return KEYS[position - octave if position >= octave else position]
def process_UTR_frag(UTR_frag, midi_file, track_num, dna_to_chromatic_dict, time, duration, volume):
    '''play an untranslated fragment one nucleotide at a time.

    For every nucleotide in UTR_frag two notes are added to midi_file at the same time position:
    the note dna_to_chromatic_dict assigns to the nucleotide and the note it assigns to the
    nucleotide's base-pairing partner (see COMPLEMENT). time then advances by duration.
    Nucleotides may be upper or lower case.

    Returns the time position just after the last dyad (time itself for an empty fragment).
    Raises KeyError for a character that is not one of A, C, G, T (in either case)'''
    for nucleotide in UTR_frag:
        # normalise the case BEFORE the COMPLEMENT lookup: COMPLEMENT only has upper-case keys
        base = nucleotide.upper()
        for letter in (base, COMPLEMENT[base]):
            midi_file.addNote(track_num, CHANNEL, PITCH_DICTIONARY[dna_to_chromatic_dict[letter]], time, duration, volume)
        time += duration
    return time
def _add_chord(midi_file, track_num, pitches, time, duration, volume):
    '''add every pitch in pitches to the track at the same time position, i.e. as one chord'''
    for pitch in pitches:
        midi_file.addNote(track_num, CHANNEL, pitch, time, duration, volume)


def add_notes(folder_name, track_name, full_file_name, UTR_5prime_exons_list, CDS_list):
    '''turn one gene into a single-track MIDI file and write it to disk.

    CDS_list alternates regions: even positions are treated as exons, odd positions as introns.
    UTR_5prime_exons_list holds the 5' UTR fragments; one fragment is played at the start of each
    region visited until the list is used up, and translation starts once the last one has been played.

    - 5' UTR: key of C minor, each nucleotide and its base pair form a dyad (see process_UTR_frag).
    - translation: every codon changes the key to AMINO_ACIDS[codon] and the major triad of that key
      is played at volume 100 (duration 2 for the start codon, 1 otherwise). For codons in
      CONDITIONALLY_ESSENTIAL the top note is raised a semitone (augmented chord). A codon that is
      split by an intron is sounded once on each side of the intron with duration 0.5. Each key is
      also written, followed by a space, to <folder_name>/<track_name>_translation_keys.txt.
    - stop codon: the minor triad of its key at volume 50; the rest of that exon and every later
      exon are played as dyads again. Translation is not restarted afterwards.
    - introns: volume 50, the current key turned diminished; each nucleotide and its base pair are
      played together with the note returned by get_submediant_note_name.

    The finished file is written with write_to_disk(full_file_name, midi_file).

    Raises Exception if the first codon after the 5' UTR is not ATG, KeyError for a nucleotide or
    codon that is not in the lookup tables, and IndexError if a codon runs past the last exon'''
    tonic_note_name = "C"
    mediant_note_name = "Eb"
    dominant_note_name = "G"
    dna_to_chromatic_dict = {
        "A": tonic_note_name,
        "C": mediant_note_name,
        "T": mediant_note_name,
        "G": dominant_note_name,
    }

    volume = 50
    duration = 1
    time = 0
    tonic = 0
    track_num = 0

    midi_file = MIDIFile(1)
    midi_file.addTrackName(track_num, 0, track_name)
    midi_file.addTempo(track_num, 0, 200)

    UTR_frag_index = 0
    translation_initiated = False
    translation_completed = False
    start_codon_encountered = False
    remaining_length_in_codon = 0 # for codons across splice regions
    is_spliced = False

    # file the keys are written to during translation; the with-block closes it even if the
    # sequence turns out to be invalid part-way through
    keys_file_name = os.path.join(folder_name, track_name + '_translation_keys.txt')
    with open(keys_file_name, 'w') as translation_keys_file:
        regionIdx = 0
        while regionIdx < len(CDS_list):
            region = CDS_list[regionIdx]
            if UTR_frag_index < len(UTR_5prime_exons_list):
                # back to the minor triad of the current key for the untranslated fragment
                change_key(tonic, tonic_note_name, tonic_note_name, ChordQuality.MINOR, dna_to_chromatic_dict)
                time = process_UTR_frag(UTR_5prime_exons_list[UTR_frag_index], midi_file, track_num, dna_to_chromatic_dict, time, 1, volume)
                UTR_frag_index += 1
                if UTR_frag_index == len(UTR_5prime_exons_list): # translation begins when we finish the UTR
                    translation_initiated = True
                # an empty region here means the entire exon was part of the 5' UTR (based on how the
                # list is preprocessed), so skip it and process the intron at the next index right away;
                # the bounds check keeps an empty final region from indexing past the end of the list
                if len(region) == 0 and regionIdx + 1 < len(CDS_list):
                    regionIdx += 1
                    region = CDS_list[regionIdx]

            nucleotideIdx = 0
            is_exon = regionIdx % 2 == 0
            while nucleotideIdx < len(region):
                nucleotide = region[nucleotideIdx]
                # COMPLEMENT only has upper-case keys, so normalise the case before the lookup
                base_pair = COMPLEMENT[nucleotide.upper()]
                if is_exon:
                    # we are in the middle of translation
                    if translation_initiated and not translation_completed:
                        if nucleotideIdx == 0 and remaining_length_in_codon > 0: # new exon, and the codon began in the previous exon (end of splice)
                            prev_exon = CDS_list[regionIdx - 2] # bc the previous region in the list is an intron
                            initial_codon_frag_length = 3 - remaining_length_in_codon
                            initial_codon_frag = prev_exon[-initial_codon_frag_length:]
                            remaining_codon_frag = region[:remaining_length_in_codon]
                            triplet_codon = initial_codon_frag + remaining_codon_frag
                            nucleotideIdx += remaining_length_in_codon
                            remaining_length_in_codon = 0
                            # is_spliced remains true from the else clause
                        elif nucleotideIdx < len(region) - 2: # normal codon
                            triplet_codon = region[nucleotideIdx:nucleotideIdx+3]
                            nucleotideIdx += 3
                            is_spliced = False
                        else: # the codon begins across a splice region
                            initial_codon_frag = region[nucleotideIdx:len(region)]
                            remaining_length_in_codon = 3 - len(initial_codon_frag)
                            next_exon = CDS_list[regionIdx + 2] # bc the next region in the list is an intron
                            remaining_codon_frag = next_exon[:remaining_length_in_codon]
                            triplet_codon = initial_codon_frag + remaining_codon_frag
                            nucleotideIdx = len(region)
                            is_spliced = True

                        if not start_codon_encountered: # i.e. we're just beginning translation, the next codon should be ATG or the file is corrupt
                            if triplet_codon != "ATG":
                                raise Exception("Invalid sequence, 5' UTR is not followed by start codon, it is followed by", triplet_codon)
                            start_codon_encountered = True
                            volume = 100
                            duration = 0.5 if is_spliced else 2
                            (tonic, tonic_note_name, mediant_note_name, dominant_note_name) = change_key(tonic, tonic_note_name, AMINO_ACIDS[triplet_codon], ChordQuality.MAJOR, dna_to_chromatic_dict)
                            dominant_pitch = PITCH_DICTIONARY[dominant_note_name] + (1 if triplet_codon in CONDITIONALLY_ESSENTIAL else 0) # we want augmented chord in this case
                            # longer (major) chord to signify start of translation
                            _add_chord(midi_file, track_num, (PITCH_DICTIONARY[tonic_note_name], PITCH_DICTIONARY[mediant_note_name], dominant_pitch), time, duration, volume)
                            translation_keys_file.write(AMINO_ACIDS[triplet_codon] + " ")
                            time += duration
                        elif triplet_codon in STOP_CODONS:
                            volume = 50
                            duration = 0.5 if is_spliced else 2
                            (tonic, tonic_note_name, mediant_note_name, dominant_note_name) = change_key(tonic, tonic_note_name, AMINO_ACIDS[triplet_codon], ChordQuality.MINOR, dna_to_chromatic_dict)
                            # longer (minor) chord to signify end of translation
                            _add_chord(midi_file, track_num, (PITCH_DICTIONARY[tonic_note_name], PITCH_DICTIONARY[mediant_note_name], PITCH_DICTIONARY[dominant_note_name]), time, duration, volume)
                            translation_completed = True
                            translation_keys_file.write(AMINO_ACIDS[triplet_codon] + " ")
                            time += duration
                            # process the remainder of the exon as 3' UTR
                            duration = 1
                            time = process_UTR_frag(region[nucleotideIdx:], midi_file, track_num, dna_to_chromatic_dict, time, duration, volume)
                            nucleotideIdx = len(region)
                        else: # we are actively translating
                            volume = 100
                            duration = 0.5 if is_spliced else 1
                            (tonic, tonic_note_name, mediant_note_name, dominant_note_name) = change_key(tonic, tonic_note_name, AMINO_ACIDS[triplet_codon], ChordQuality.MAJOR, dna_to_chromatic_dict)
                            dominant_pitch = PITCH_DICTIONARY[dominant_note_name] + (1 if triplet_codon in CONDITIONALLY_ESSENTIAL else 0) # we want augmented chord in this case
                            _add_chord(midi_file, track_num, (PITCH_DICTIONARY[tonic_note_name], PITCH_DICTIONARY[mediant_note_name], dominant_pitch), time, duration, volume)
                            translation_keys_file.write(AMINO_ACIDS[triplet_codon] + " ")
                            time += duration
                    # we are dealing with entire exons outside of translation (e.g. the 3' UTR)
                    else:
                        # individual nucleotides, rather than codons, dictate the harmony here.
                        # process_UTR_frag consumes everything that is left of the exon in one call, so
                        # jump to the end of the region; stepping one nucleotide at a time would play
                        # the whole exon again for each of its nucleotides
                        time = process_UTR_frag(region[nucleotideIdx:], midi_file, track_num, dna_to_chromatic_dict, time, duration, volume)
                        nucleotideIdx = len(region)
                else: # we are in an intron
                    duration = 1
                    volume = 50
                    (tonic, _, _, _) = change_key(tonic, tonic_note_name, tonic_note_name, ChordQuality.DIMINISHED, dna_to_chromatic_dict)
                    intron_pitches = (
                        PITCH_DICTIONARY[dna_to_chromatic_dict[nucleotide.upper()]],
                        PITCH_DICTIONARY[dna_to_chromatic_dict[base_pair.upper()]],
                        PITCH_DICTIONARY[get_submediant_note_name(tonic)],
                    )
                    _add_chord(midi_file, track_num, intron_pitches, time, duration, volume)
                    nucleotideIdx += 1
                    time += duration
            duration = 1
            regionIdx += 1

    write_to_disk(full_file_name, midi_file)
def get_sequence_complement(sequence):
    '''return the complement (base pairs) of a genetic sequence as a plain string, computed with Biopython's Seq'''
    return str(Seq(sequence).complement())
def get_uppercase_prefix(input_str):
    '''return the leading run of upper-case characters of input_str.

    The result stops at the first character for which str.isupper() is false, so it is the
    empty string when input_str is empty or does not start with an upper-case character'''
    prefix_length = 0
    for character in input_str:
        if not character.isupper():
            break
        prefix_length += 1
    return input_str[:prefix_length]
def write_to_disk(full_file_name, midi_file):
    '''write the MIDI file to the disk.

    The output path is full_file_name with its extension replaced by ".mid" (".mid" is appended
    when there is no extension). midi_file must provide writeFile(binary_file), as MIDIUtil's
    MIDIFile does'''
    # splitext copes with extensions of any length, unlike cutting off a fixed four characters
    base_name, _extension = os.path.splitext(full_file_name)
    with open(base_name + ".mid", 'wb') as outf:
        midi_file.writeFile(outf)