-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.rb
107 lines (78 loc) · 2.6 KB
/
run.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
require 'concurrent'
require 'fileutils'
# Name of the protein FASTA file; used both as the local path and as the
# last segment of the download URL.
allele_sequences_file = 'hla_prot.fasta'

# Parser state shared with the FASTA reading loop further down.
current_allele_name = nil # allele name taken from the most recent header line
allele_sequences = {}     # allele name => concatenated sequence
skip_line = false         # true while the current record is being ignored

# Loci whose alleles are kept; frozen so the constant cannot be mutated by accident.
ALLOWED_LOCI = %w[DRB1 DRB3 DRB4 DRB5 DQA1 DQB1 DPA1 DPB1].freeze

# Seconds: passed to curl as --connect-timeout and used as the pause each
# worker takes after issuing a request.
ESM_TIMEOUT = 16
# Download the allele FASTA with wget unless a local copy is already present.
unless File.exist?(allele_sequences_file)
  # Kernel#system returns false on a non-zero exit and nil when wget cannot be run.
  downloaded = system('wget', "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/#{allele_sequences_file}")
  unless downloaded
    # Remove any partial file so the next run does not mistake it for a complete download.
    File.delete(allele_sequences_file) if File.exist?(allele_sequences_file)
    abort "Could not download #{allele_sequences_file} with wget"
  end
end
# Read the FASTA file into allele_sequences (allele name => sequence).
#
# A line containing '>' starts a new record. The allele name is the second
# whitespace-separated token of that line, cut down to its first two
# ':'-separated fields. A record is ignored when
#   * a sequence was already stored under the same two-field name,
#   * its locus (the part before '*') is not in ALLOWED_LOCI, or
#   * the two-field name ends in a letter.
# File.foreach closes the file when the block finishes, even on error.
#
# NOTE(review): the trailing-letter test runs after the name is cut to two
# fields, so a suffix letter on a name with more than two fields is dropped
# before the test and is not detected - confirm this is intended.
File.foreach(allele_sequences_file) do |line|
  if line.include?('>')
    current_allele_name = line.split(' ')[1].split(':')[0..1].join(':')
    current_locus = current_allele_name.split('*').first
    skip_line = allele_sequences.key?(current_allele_name) ||
                !ALLOWED_LOCI.include?(current_locus) ||
                current_allele_name[-1].match?(/[A-Za-z]/)
    # Unary plus yields an unfrozen string that can be appended to in place.
    allele_sequences[current_allele_name] = +'' unless skip_line
  elsif !skip_line && allele_sequences.key?(current_allele_name)
    # The key? guard ignores sequence lines that appear before any header.
    allele_sequences[current_allele_name] << line.strip
  end
end

# The downloaded file is removed once parsed, so every run fetches it again.
File.delete(allele_sequences_file)
puts "DOWNLOADED ALLELES QTD: #{allele_sequences.size}"
# Minimum sequence length required per locus (the part of the allele name
# before '*'); shorter sequences are discarded below.
min_sequence_sizes = {}
%w[A B C].each { |locus_name| min_sequence_sizes[locus_name] = 270 }
%w[DRB1 DRB3 DRB4 DRB5 DQA1 DQB1 DPA1 DPB1].each { |locus_name| min_sequence_sizes[locus_name] = 180 }

# Keep only the alleles whose sequence reaches the minimum for their locus.
filtered_allele_sequences = allele_sequences.select do |allele_name, residues|
  required_size = min_sequence_sizes[allele_name.split('*').first]
  residues.size >= required_size
end

puts "FILTERED ALLELES QTD: #{filtered_allele_sequences.size}"
# Write the retained sequences to input/sequences.fasta: one '>' header line
# per allele followed by the sequence wrapped at 60 characters per line.
# The directory is created first so a fresh checkout does not fail with ENOENT.
FileUtils.mkdir_p('input')

# The block form flushes and closes the file even if a write raises.
File.open('input/sequences.fasta', 'w') do |output|
  filtered_allele_sequences.each do |allele, sequence|
    output.puts ">#{allele}".strip
    # scan yields consecutive chunks of up to 60 characters without building
    # a per-character array first.
    sequence.scan(/.{1,60}/) { |chunk| output.puts chunk.strip }
  end
end
# Submit every retained sequence to the ESM Atlas folding endpoint, one
# request per pool task, and store each response under
# output/HLA_<locus>/<allele with '*' and ':' replaced by '_'>.pdb.
worker_count = Concurrent.processor_count
puts "MODELING WHAT IS POSSIBLE USING #{worker_count} THREADS..."
pool = Concurrent::FixedThreadPool.new(worker_count)

filtered_allele_sequences.each do |allele, sequence|
  pool.post do
    begin
      output_folder = "output/HLA_#{allele.split('*').first}"
      model_name = "#{allele.gsub('*', '_').gsub(':', '_')}.pdb"
      model_path = "#{output_folder}/#{model_name}"
      FileUtils.mkdir_p(output_folder)

      puts "[#{Time.now}] Running: curl -X POST -s --insecure --connect-timeout #{ESM_TIMEOUT} --data \"#{sequence}\" https://api.esmatlas.com/foldSequence/v1/pdb/ > #{output_folder}/#{model_name}"

      # Argument-vector form: no shell is involved, so text coming from the
      # downloaded file can never be interpreted as shell syntax. The `out:`
      # option redirects curl's stdout into the model file.
      # NOTE(review): --insecure disables TLS certificate verification; it is
      # kept to preserve existing behaviour - confirm it is still required.
      # --connect-timeout only bounds the connection phase, not the transfer.
      succeeded = system('curl', '-X', 'POST', '-s', '--insecure',
                         '--connect-timeout', ESM_TIMEOUT.to_s,
                         '--data', sequence,
                         'https://api.esmatlas.com/foldSequence/v1/pdb/',
                         out: model_path)
      warn "[#{Time.now}] curl failed for #{allele} (exit status: #{$?&.exitstatus.inspect})" unless succeeded

      # Each worker pauses after its request, as the original script did
      # (presumably to pace calls to the API).
      sleep ESM_TIMEOUT
    rescue StandardError => e
      # Report the failure instead of letting the pool discard it silently.
      warn "[#{Time.now}] Modeling #{allele} failed: #{e.class}: #{e.message}"
    end
  end
end

pool.shutdown
pool.wait_for_termination