forked from dePamphilis/PlantTribes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKaKsAnalysis
678 lines (645 loc) · 35.2 KB
/
KaKsAnalysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
#!/usr/bin/env perl
# Author: Eric Wafula
# Email: [email protected]
# Institution: Penn State University, Biology Dept, Claude dePamphilis Lab
# Date: 02-15-2017
use strict;
use warnings;
use File::Spec;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case);
use FindBin;
# Installation root: the parent of the directory holding this script.
my $home = "$FindBin::Bin/..";
# Help text shown (via die) whenever option parsing or validation fails.
# The species_2 options describe the SECOND species' input files.
my $usage = <<__EOUSAGE__;
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# DETERMINE PAIRWISE SEQUENCE SYNONYMOUS SUBSTITUTIONS (Ks) AND SIGNIFICANT DUPLICATION COMPONENTS
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Required Options:
#
# --coding_sequences_species_1 <string> : Coding sequences (CDS) fasta file for the first species (species1.fna)
#
#
# --proteins_species_1 <string> : Amino acids (proteins) sequences fasta file for the first species (species1.faa)
#
#
# --comparison <string> : pairwise sequence comparison to determine homolgous pairs
# If self species comparison: paralogs
# If cross species comparison: orthologs (requires second species data)
#
# # # # # # # # # # # # # # # # # #
# Others Options:
#
# --coding_sequences_species_2 <string> : Coding sequences (CDS) fasta file for the second species (species2.fna)
# requires "--comparison" to be set to "orthologs"
#
#
# --proteins_species_2 <string> : Amino acids (proteins) sequences fasta file for the second species (species2.faa)
# requires "--comparison" to be set to "orthologs"
#
# --crb_blast <string> : Use conditional reciprocal best BLAST to determine for cross-species orthologs
# instead of the default reciprocal best BLAST
# requires "--comparison" to be set to "orthologs"
#
# --min_coverage <float> : Minimum sequence pairwise coverage length between homologous pairs
# Default: 0.5 (50% coverage) - [0.3 to 1.0]
#
# --recalibration_rate <float> : Recalibrate synonymous substitution (ks) rates of a species using a predetermined evolutionary rate that
# can be determined from a species tree inferred from a collection single copy genes from taxa of interest
# (Cui et al., 2006) - mainly applies only paralogous ks analysis
#
# --codeml_ctl_file <string> : PAML's codeml control file to carry out ML analysis of protein-coding DNA sequences using codon
# substitution models. The defaults in the "codeml.ctl.args" template in the config directory of
# the installation will be used if not provided. NOTE: input (seqfile, treefile) and output (outfile)
# parameters of codeml are not included in the template.
#
#
# --fit_components : Fit a mixture model of multivariate normal components to synonymous (ks) distribution to identify
# significant duplication event(s) in a genome
#
#
# --num_of_components <int> : Number components to fit to synonymous substitutions (ks) distribution - required if "--fit_components"
#
#
# --min_ks <float> : Lower limit of synonymous substitutions (ks) - necessary if fitting components to the distribution to
# reduce background noise from young paralogous pairs due to normal gene births and deaths in a genome.
#
#
# --max_ks <float> : Upper limit of synonymous substitutions (ks) - necessary if fitting components to the distribution to
# exclude likey ancient paralogous pairs.
#
# --num_threads <int> : number of threads (CPUs)
# Default: 1
#
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Example Usage:
#
# KaKsAnalysis --coding_sequences_species_1 species1.fna --proteins_species_1 species1.faa --comparison paralogs --num_threads 4
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
__EOUSAGE__
;
# Command-line option holders; each stays undef unless given on the command line.
my $coding_sequences_species_1;
my $proteins_species_1;
my $comparison;
my $coding_sequences_species_2;
my $proteins_species_2;
my $crb_blast;
my $min_coverage;
my $codeml_ctl_file;
my $recalibration_rate;
my $fit_components;
my $num_of_components;
my $min_ks;
my $max_ks;
my $num_threads;
my $options = GetOptions (
    'coding_sequences_species_1=s' => \$coding_sequences_species_1,
    'proteins_species_1=s'         => \$proteins_species_1,
    'comparison=s'                 => \$comparison,
    'coding_sequences_species_2=s' => \$coding_sequences_species_2,
    'proteins_species_2=s'         => \$proteins_species_2,
    'crb_blast'                    => \$crb_blast,
    'min_coverage=f'               => \$min_coverage,
    'codeml_ctl_file=s'            => \$codeml_ctl_file,
    'recalibration_rate=f'         => \$recalibration_rate,
    'fit_components'               => \$fit_components,
    'num_of_components=i'          => \$num_of_components,
    'min_ks=f'                     => \$min_ks,
    'max_ks=f'                     => \$max_ks,
    'num_threads=i'                => \$num_threads
);

# validate options
unless ( $options ) { die $usage; }
unless ( $coding_sequences_species_1 and $proteins_species_1 and $comparison ) { print "\nOne or more required options not set\n"; die $usage; }
if ( $comparison and ($comparison ne "paralogs") and ($comparison ne "orthologs") ) {
    print "\nUnknown species comparison to determine homolgous pairs\n"; die $usage;
}
if ( $comparison eq "orthologs" and (!$coding_sequences_species_2 or !$proteins_species_2) ) {
    print "Orthologous synonymous (ks) analysis requires both coding (CDS) and amino acid (proteins) sequences for the second species\n"; die $usage;
}
if ( $crb_blast and ($comparison ne "orthologs") ) { print "Conditional reciprocal best BLAST ortholog assignment requires cross species comparison\n"; die $usage; }
# Test definedness, not truth: "--min_coverage 0" must be rejected rather than
# silently replaced by the default.
if ( defined $min_coverage and ($min_coverage < 0.3) ) { print "0.3 (30%) is the required minimum sequence pairwise coverage length between homologous pairs\n"; die $usage; }
# The documented range is [0.3 to 1.0]; enforce the upper end as well.
if ( defined $min_coverage and ($min_coverage > 1.0) ) { print "1.0 (100%) is the maximum sequence pairwise coverage length between homologous pairs\n"; die $usage; }
if ( $fit_components and !$num_of_components ) {
    print "Fitting a mixture model of multivariate normal components to synonymous (ks) distribution requires number of components to be set (--num_of_components)\n"; die $usage;
}
# An inverted ks window would filter out every pair.
if ( defined $min_ks and defined $max_ks and ($min_ks > $max_ks) ) { print "Lower limit of synonymous substitutions (--min_ks) can not be greater than the upper limit (--max_ks)\n"; die $usage; }

# defaults
if (!defined $min_coverage) { $min_coverage = 0.5; }
if (!$num_threads) { $num_threads = 1; }
# Without --codeml_ctl_file the codeml arguments below are used as inline text;
# compute_kaks prepends the seqfile/outfile lines for every pair.
if (!$codeml_ctl_file) {($codeml_ctl_file = qq{
noisy = 0 * 0,1,2,3,9: how much rubbish on the screen
verbose = 0 * 0: concise; 1: detailed, 2: too much
runmode = -2 * 0: user tree; 1: semi-automatic; 2: automatic
* 3: StepwiseAddition; (4,5):PerturbationNNI; -2: pairwise
seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs
CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
aaDist = 0 * 0:equal, +:geometric; -:linear, 1-6:G1974,Miyata,c,p,v,a
model = 1 * 0:one, 1:b, 2:2 or more dN/dS ratios for branches
NSsites = 0 * 0:one w;1:neutral;2:selection; 3:discrete;4:freqs;
* 5:gamma;6:2gamma;7:beta;8:beta&w;9:beta&gamma;
* 10:beta&gamma+1; 11:beta&normal>1; 12:0&2normal>1;
* 13:3normal>0
icode = 0 * 0:universal code; 1:mammalian mt; 2-10:see below
Mgene = 0 * 0:rates, 1:separate; 2:diff pi, 3:diff kapa, 4:all diff
fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated
kappa = 2 * initial or fixed kappa
fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate
omega = 2 * initial or fixed omega, for codons
fix_alpha = 1 * 0: estimate gamma shape parameter; 1: fix it at alpha
alpha = 0. * initial or fixed alpha, 0:infinity (constant rate)
Malpha = 0 * different alphas for genes
ncatG = 2 * # of categories in dG of NSsites models
clock = 0 * 0:no clock, 1:global clock; 2:local clock; 3:TipDate
getSE = 0 * 0: don't want them, 1: want S.E.s of estimates
RateAncestor = 0 * (0,1,2): rates (alpha>0) or ancestral states (1 or 2)
* Genetic codes: 0:universal, 1:mammalian mt., 2:yeast mt., 3:mold mt.,
* 4: invertebrate mt., 5: ciliate nuclear, 6: echinoderm mt.,
* 7: euplotid mt., 8: alternative yeast nu. 9: ascidian mt.,
* 10: blepharisma nu.
* These codes correspond to transl_table 1 to 11 of GENEBANK.
Small_Diff = .5e-6
* cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)?
* ndata = 2
* fix_blength = -1 * 0: ignore, -1: random, 1: initial, 2: fixed
method = 1 * 0: simultaneous; 1: one branch at a time})
}

print "\n";
print localtime()." - Starting nonsynonymous (ka) and synonymous (ks) analysis\n\n";

# create output directory; refuse to reuse an existing one
my $dirname = "./kaksAnalysis_dir";
if (-d $dirname) { die "Exiting...!\nKaKs analysis output directory ($dirname) already exists!\n\n"; }
make_directory($dirname);

# pipeline: homolog pairs -> pairwise ka/ks -> optional mixture-model fit
get_homologs ( $coding_sequences_species_1, $proteins_species_1, $coding_sequences_species_2, $proteins_species_2, $comparison, $crb_blast, $num_threads, $dirname );
compute_kaks ( $comparison, $codeml_ctl_file, $recalibration_rate, $min_coverage, $min_ks, $max_ks, $num_threads, $dirname );
if ( $fit_components ) {
    fit_components ( $num_of_components, $comparison, $dirname );
}
print localtime()." - Completed nonsynonymous (ka) and synonymous (ks) analysis\n\n";
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Create directory $new_dir with mode 0755 unless it already exists.
# Dies with the system error message when the directory cannot be created,
# so that later steps do not fail on a missing output directory.
sub make_directory {
    my ( $new_dir ) = @_;
    return if -d $new_dir;
    mkdir($new_dir, 0755) or die "can't create $new_dir directory: $!\n";
    return;
}
# Copy the input fasta files into $dirname and determine homologous pairs:
#   orthologs + --crb_blast : pairs are produced by the external crb-blast tool
#   orthologs               : reciprocal best BLASTN hits between the two species
#   paralogs                : reciprocal best BLASTN hits of species 1 against itself
# The pairs end up in "$dirname/species1and2.fna.blastn.orthologs.rbhb" or
# "$dirname/species1.fna.blastn.paralogs.rbhb". Exits with status 1 when an
# external BLAST step fails; dies when a file cannot be copied or opened.
sub get_homologs {
    my ( $coding_sequences_species_1, $proteins_species_1, $coding_sequences_species_2, $proteins_species_2, $comparison, $crb_blast, $num_threads, $dirname ) = @_;
    _copy_input($coding_sequences_species_1, "$dirname/species1.fna");
    _copy_input($proteins_species_1, "$dirname/species1.faa");
    if ( $comparison eq "orthologs") {
        _copy_input($coding_sequences_species_2, "$dirname/species2.fna");
        _copy_input($proteins_species_2, "$dirname/species2.faa");
        if ( $crb_blast ) {
            print localtime()." - Determining orthologous pairs using conditional reciprocal best BLAST\n\n";
            my $crb_blast_call = system "crb-blast --query $dirname/species1.fna --target $dirname/species2.fna --evalue 1e-20 --threads $num_threads --output $dirname/species1and2.fna.blastn.orthologs.rbhb --split >/dev/null";
            if ($crb_blast_call != 0) { print "\n-- ".localtime()." - Running CRB-BLAST failed\n\n"; exit(1); }
            # crb-blast leaves its intermediate files in the current directory
            system "mv species1_into_species2.1.blast $dirname/species1.fna.blastn.orthologs >/dev/null";
            system "mv species2_into_species1.2.blast $dirname/species2.fna.blastn.orthologs >/dev/null";
            system "rm evalues_data fitting_data *.nhr *.nin *.nsq >/dev/null";
        }
        else {
            print localtime()." - Determining orthologous pairs using reciprocal best BLAST\n\n";
            _run_blastn("$dirname/species1.fna", "$dirname/species2.fna", "$dirname/species1.fna.blastn.orthologs", $num_threads);
            _run_blastn("$dirname/species2.fna", "$dirname/species1.fna", "$dirname/species2.fna.blastn.orthologs", $num_threads);
            my $best1 = _best_blast_hits("$dirname/species1.fna.blastn.orthologs");
            my $best2 = _best_blast_hits("$dirname/species2.fna.blastn.orthologs");
            _write_reciprocal_pairs("$dirname/species1and2.fna.blastn.orthologs.rbhb", $best1, $best2);
            system "rm $dirname/*.nhr $dirname/*.nin $dirname/*.nsq >/dev/null";
        }
    }
    else { # $comparison eq "paralogs"
        print localtime()." - Determining paralogous pairs using reciprocal best BLAST\n\n";
        _run_blastn("$dirname/species1.fna", "$dirname/species1.fna", "$dirname/species1.fna.blastn.paralogs", $num_threads);
        my $best1 = _best_blast_hits("$dirname/species1.fna.blastn.paralogs");
        # self comparison: forward and reverse best hits come from the same table
        _write_reciprocal_pairs("$dirname/species1.fna.blastn.paralogs.rbhb", $best1, $best1);
        system "rm $dirname/*.nhr $dirname/*.nin $dirname/*.nsq >/dev/null";
    }
}

# Copy an input file to its working location. The list form of system keeps
# paths with spaces or shell metacharacters intact.
sub _copy_input {
    my ( $source, $target ) = @_;
    system("cp", $source, $target) == 0 or die "can't copy $source to $target\n";
    return;
}

# Build a nucleotide BLAST database from $db, then search $query against it,
# writing tabular (outfmt 6) hits to $out. Exits with status 1 on failure.
sub _run_blastn {
    my ( $query, $db, $out, $num_threads ) = @_;
    my $makeblastdb_call = system "makeblastdb -in $db -dbtype nucl >/dev/null";
    if ($makeblastdb_call != 0) { print "\n-- ".localtime()." - Running makeblastdb failed\n\n"; exit(1); }
    my $blastn_call = system "blastn -outfmt 6 -evalue 1e-20 -num_threads $num_threads -query $query -db $db -out $out >/dev/null";
    if ($blastn_call != 0) { print "\n-- ".localtime()." - Running BLASTN failed\n\n"; exit(1); }
    return;
}

# Read tabular BLAST hits and return a hash reference mapping each query id to
# its hit line with the highest value in column 12, ignoring self hits. On a
# tie the first line seen wins.
sub _best_blast_hits {
    my ( $blast_file ) = @_;
    my ( %best, %max );
    open(my $in, '<', $blast_file) or die "can't open $blast_file file\n";
    while (my $hit = <$in>) {
        chomp $hit;
        my @F = split(/\t/, $hit);
        next if @F < 12;             # blank or truncated line
        next if $F[0] eq $F[1];      # self hit
        if (!exists $best{$F[0]} or $F[11] > $max{$F[0]}) {
            $best{$F[0]} = $hit;
            $max{$F[0]} = $F[11];
        }
    }
    close $in;
    return \%best;
}

# Write every reciprocal best pair (query's best subject whose own best
# subject is the query) to $rbhb_file as "query<TAB>subject", once per pair.
sub _write_reciprocal_pairs {
    my ( $rbhb_file, $forward, $reverse ) = @_;
    my %paired;
    open(my $out, '>', $rbhb_file) or die "can't open $rbhb_file file\n";
    foreach my $query (keys %$forward) {
        next if $paired{$query};
        my @F = split(/\t/, $forward->{$query});
        # Look the reverse hit up fresh on every iteration; a conditional
        # "my @S = ... if ..." may keep the previous iteration's value.
        my $back_hit = $reverse->{$F[1]};
        next unless $back_hit;
        my @S = split(/\t/, $back_hit);
        next unless $S[1];
        if ($F[0] eq $S[1]) {
            print {$out} "$F[0]\t$F[1]\n";
            $paired{$F[0]} = 1;
            $paired{$F[1]} = 1;
        }
    }
    close $out;
    return;
}
# For every homologous pair listed in the reciprocal-best-hit file: align the
# two proteins with MAFFT, thread the coding sequences onto that alignment,
# run PAML codeml on the codon alignment and append the pair's Ka, Ks and
# Ka/Ks to "<rbhb file>.kaks".
#   $codeml_ctl_file    : either multi-line codeml control text or the path of
#                         a control file (absolute or relative)
#   $recalibration_rate : when set, Ks is multiplied by it and Ka and Ka/Ks are
#                         written as "-"
#   $min_coverage       : pairs below this alignment coverage are not analysed
#   $min_ks / $max_ks   : optional inclusive Ks limits (an unset or zero limit
#                         is not applied)
# Exits with status 1 when MAFFT or codeml fails.
sub compute_kaks {
    my ( $comparison, $codeml_ctl_file, $recalibration_rate, $min_coverage, $min_ks, $max_ks, $num_threads, $dirname ) = @_;
    my %cds1 = get_sequences ("$dirname/species1.fna");
    my %pep1 = get_sequences ("$dirname/species1.faa");
    # $cds2/$pep2 hold the second member's sequences: species 2 for orthologs,
    # species 1 again for paralogs.
    my ( $cds2, $pep2, $rbhb_file );
    if ( $comparison eq "orthologs" ) {
        print localtime()." - Performing orthologous pairwise alignment and kaks analysis with MAFFT and CODEML respectively\n\n";
        my %cds_other = get_sequences ("$dirname/species2.fna");
        my %pep_other = get_sequences ("$dirname/species2.faa");
        ( $cds2, $pep2 ) = ( \%cds_other, \%pep_other );
        $rbhb_file = "$dirname/species1and2.fna.blastn.orthologs.rbhb";
    }
    else { # $comparison eq "paralogs"
        print localtime()." - Performing paralogous pairwise alignment and kaks analysis with MAFFT and CODEML respectively\n\n";
        ( $cds2, $pep2 ) = ( \%cds1, \%pep1 );
        $rbhb_file = "$dirname/species1.fna.blastn.paralogs.rbhb";
    }
    my $codeml_args = _codeml_control_text($codeml_ctl_file);

    open(my $kaks_out, '>', "$rbhb_file.kaks") or die "can't open $rbhb_file.kaks file\n";
    print {$kaks_out} "SEQ1\tSEQ2\tKa\tKs\tKa\\Ks\n";
    open(my $rbhb_in, '<', $rbhb_file) or die "can't open $rbhb_file file\n";
    PAIR: while (my $entry = <$rbhb_in>) {
        chomp $entry;
        # keep the first two columns only (crb-blast output carries more)
        next PAIR unless $entry =~ /(\S+\s+\S+)/;
        my $pair = $1;
        my ( $id1, $id2 ) = split(/\t/, $pair);
        # A pair without usable sequences would divide by zero below.
        unless ( defined $id2
                 and defined $cds1{$id1} and length $cds1{$id1}
                 and defined $cds2->{$id2} and length $cds2->{$id2}
                 and defined $pep1{$id1} and defined $pep2->{$id2} ) {
            print "Skipping $pair pair: coding or protein sequence not found\n\n";
            next PAIR;
        }
        my ( $longer, $shorter, $longer_len, $shorter_len );
        if (length($cds1{$id1}) > length($cds2->{$id2})) {
            ( $longer, $shorter ) = ( $id1, $id2 );
            ( $longer_len, $shorter_len ) = ( length($cds1{$id1}), length($cds2->{$id2}) );
        }
        else {
            ( $longer, $shorter ) = ( $id2, $id1 );
            ( $longer_len, $shorter_len ) = ( length($cds2->{$id2}), length($cds1{$id1}) );
        }
        open(my $dna_out, '>', "$dirname/dna.tmp") or die "can't open $dirname/dna.tmp file\n";
        open(my $aa_out, '>', "$dirname/aa.tmp") or die "can't open $dirname/aa.tmp file\n";
        print {$dna_out} ">$id1\n$cds1{$id1}\n";
        print {$aa_out} ">$id1\n$pep1{$id1}\n";
        print {$dna_out} ">$id2\n$cds2->{$id2}\n";
        print {$aa_out} ">$id2\n$pep2->{$id2}\n";
        close $dna_out;
        close $aa_out;
        # run alignment on the pair
        my $mafft_call = system "mafft --maxiterate 1000 --thread $num_threads --localpair $dirname/aa.tmp > $dirname/aa.tmp.aln";
        if ($mafft_call != 0) { print "\n-- ".localtime()." - Running MAFFT failed\n\n"; exit(1); }
        if ( -z "$dirname/aa.tmp.aln" ) { print "Mafft could not align $longer and $shorter pair...\nskipping $longer and $shorter pair...\n\n"; next PAIR; }
        create_codon_alignments ( "$dirname/aa.tmp.aln", "$dirname/dna.tmp.aln", "$dirname/dna.tmp" );
        my %seq = get_sequences ("$dirname/dna.tmp.aln");
        # create_codon_alignments drops sequences it cannot put in frame; such
        # a pair must not reach codeml with an empty sequence.
        unless ( defined $seq{$longer} and defined $seq{$shorter} ) {
            print "Skipping $pair pair: no codon alignment for both sequences\n\n";
            next PAIR;
        }
        open(my $phy, '>', "$dirname/dna.tmp.aln.phylip") or die "can't open $dirname/dna.tmp.aln.phylip file\n";
        my $aln_len = length($seq{$longer});
        print {$phy} "2 $aln_len\n";
        print {$phy} "$longer $seq{$longer}\n";
        print {$phy} "$shorter $seq{$shorter}\n";
        close $phy;
        my $coverage = 1.0 - (($aln_len - $longer_len) / $shorter_len);
        next PAIR unless $coverage >= $min_coverage;
        # run paml on the alignment; codeml reads codeml.ctl from the current directory
        open(my $ctl, '>', "codeml.ctl") or die "can't open codeml.ctl file\n";
        print {$ctl} "seqfile = $dirname/dna.tmp.aln.phylip\n";
        print {$ctl} "outfile = $dirname/dna.tmp.aln.phylip.codeml\n";
        print {$ctl} $codeml_args;
        close $ctl;
        my $codeml_call = system "codeml >/dev/null";
        if ($codeml_call != 0) { print "\n-- ".localtime()." - Running CODEML failed\n\n"; exit(1); }
        # remove paml files
        system "rm rst rst1 rub 2ML.dN 2ML.dS 2ML.t 2NG.dN 2NG.dS 2NG.t codeml.ctl >/dev/null";
        # parse paml-codeml results and write to output file
        open(my $codeml_in, '<', "$dirname/dna.tmp.aln.phylip.codeml") or die "can't open $dirname/dna.tmp.aln.phylip.codeml file\n";
        while (my $result = <$codeml_in>) {
            # e.g. t= 1.6915 S= 278.8 N= 942.2 dN/dS= 0.4533 dN = 0.4421 dS = 0.9752
            next unless $result =~ /^t=\s+\d+\.\d+\s+S=\s+\d+\.\d+\s+N=\s+\d+\.\d+\s+dN\/dS=\s+(\d+\.\d+)\s+dN\s+=\s+(\d+\.\d+)\s+dS\s+=\s+(\d+\.\d+)/;
            my ( $kaks, $ka, $ks ) = ( $1, $2, $3 );
            if ($recalibration_rate) {
                # recalibrated ks is reported without ka and ka/ks
                $ks = sprintf("%.4f", ($ks * $recalibration_rate));
                ( $ka, $kaks ) = ( "-", "-" );
            }
            next if $min_ks and $ks < $min_ks;
            next if $max_ks and $ks > $max_ks;
            print {$kaks_out} "$pair\t$ka\t$ks\t$kaks\n";
        }
        close $codeml_in;
    }
    close $rbhb_in;
    close $kaks_out;
    # remove tmp files
    system "rm $dirname/aa.tmp* $dirname/dna.tmp* >/dev/null";
}

# Return the codeml control arguments as text. A value containing a newline is
# taken to be the control text itself; anything else is a file path (absolute
# or relative) whose content is returned. Dies when that file cannot be read.
sub _codeml_control_text {
    my ( $codeml_ctl_file ) = @_;
    return $codeml_ctl_file if $codeml_ctl_file =~ /\n/;
    open(my $in, '<', $codeml_ctl_file) or die "can't open $codeml_ctl_file file\n";
    my $text = do { local $/; <$in> };
    close $in;
    return defined $text ? $text : "";
}
# Fit a mixture model to the log-transformed Ks values of the ".kaks" table
# with the external EMMIX program and write the fitted components to
# "<kaks file>.components". EMMIX input, control and output files
# (emmix.ks.dat, emmix.ks.ctr, emmix.ks.mix) are created in the current
# directory and removed afterwards. Dies when no usable Ks value is found;
# exits with status 1 when EMMIX fails.
# NOTE(review): the parsing below depends on the exact layout of the EMMIX
# report, which is not visible here - confirm against real EMMIX output
# before changing it.
sub fit_components {
    my ( $num_of_components, $comparison, $dirname ) = @_;
    my ( $kind, $species, $kaks_file );
    if ($comparison eq "orthologs") {
        ( $kind, $species ) = ( "orthologous", "species1and2" );
        $kaks_file = "$dirname/species1and2.fna.blastn.orthologs.rbhb.kaks";
    }
    else { # $comparison eq "paralogs"
        ( $kind, $species ) = ( "paralogous", "species1" );
        $kaks_file = "$dirname/species1.fna.blastn.paralogs.rbhb.kaks";
    }
    print localtime()." - Fitting a mixture model of multivariate normal components to $kind ks distribution\n\n";
    open(my $out, '>', "$kaks_file.components") or die "can't open $kaks_file.components file\n";
    open(my $in, '<', $kaks_file) or die "can't open $kaks_file file\n";
    # Generate input data file for EMMIX assuming log-normal ks
    open(my $dat, '>', "emmix.ks.dat") or die "can't open emmix.ks.dat file\n";
    my $rows = 0;
    while (my $record = <$in>) {
        chomp $record;
        next if $record =~ /^SEQ1/;
        my @F = split(/\t/, $record);
        # log() dies on zero; skip blank lines and pairs with ks of 0
        next unless defined $F[3] and $F[3] > 0;
        print {$dat} log($F[3]),"\n";
        $rows++;
    }
    close $in;
    close $dat;
    # Generate input control file for EMMIX with three random starts <30000
    if (!$rows) { die "Error! no ks data read for fitting components!\nExiting...\n\n"; }
    open(my $ctr, '>', "emmix.ks.ctr") or die "can't open emmix.ks.ctr file\n";
    print {$ctr} "3\nemmix.ks.dat\nemmix.ks.mix\nYes\n$rows\n1\n1\n1\n$num_of_components\n4\n100\n100\n10\nN\n";
    foreach (1..3) { my $rand = int(rand() * 30000); print {$ctr} $rand,"\n"; }
    close $ctr;
    # run EMMIX
    my $emmix_call = system "EMMIX < emmix.ks.ctr > /dev/null";
    if ($emmix_call != 0) { print "\n-- ".localtime()." - Running EMMIX failed\n\n"; exit(1); }
    open(my $emmix, '<', "emmix.ks.mix") or die "can't open emmix.ks.mix file\n";
    print {$out} "species\tn\tnumber_comp\tlnL\tAIC\tBIC\tmean\tvariance\tporportion\n";
    # $mean/$variance/$mix/$crit flag which section of the report the next
    # line belongs to.
    my ( @means, @variances, $mean, $variance, $mix, @mixpi, $oldbic, $bic, $aic, $lnL, $crit, $nobs, $val );
    $mixpi[0] = 1;
    while (<$emmix>) {
        $_ =~ s/^\s+//;
        if (/number of entities:\s+(\d+)/) { $nobs = $1; next; }
        if (/Mixing proportion/) { $mix = 1; next; }
        if (/Final Log-likelihood is\s+(-\d+\.\d+)/i) { $lnL = $1; next; }
        if (/Estimated mean.*for each component/) { $mean = 1; next; }
        if (/Estimated\s.*matrix/) { $variance = 1; $mean = 0; next; }
        if (/AIC BIC/) { $crit = 1; next; }
        if ($mean) { $val = $_; $val =~ s/\s//; chomp($val); push(@means, $val); next; }
        if ($variance) { $val = $_; $val =~ s/\s//; chomp($val); push(@variances, $val); $variance = 0; next; }
        if ($mix) { chomp($_); @mixpi = split(/\s+/, $_); $mix = 0; next; }
        if (/(\d+\.\d+)\s+(\d+\.\d+)/ and $crit) {
            $aic = $1; $bic = $2;
            # print out mixture
            my $len = $#means;
            # stop at the first fit whose BIC is larger than the previous one
            if ($oldbic && $bic > $oldbic) { last; }
            # means and variances were fitted on log(ks); transform back
            printf {$out} ("%s\t%d\t%d\t%.4f\t%.2f\t%.2f\t%.4f\t%.4f\t%.2f\n", $species, $nobs, $len, $lnL, $aic, $bic, exp($means[0]), $variances[0] * exp(2 * $means[0]), $mixpi[0]);
            $oldbic = $bic;
            foreach my $i (1..$len-1) {
                #transform to exp(x)
                printf {$out} ("\t\t\t\t\t\t%.4f\t%.4f\t%.2f\n", exp($means[$i]), $variances[$i] * exp(2 * $means[$i]), $mixpi[$i]);
            }
            @means = ();
            @variances = ();
            @mixpi = ();
            $crit = 0;
        }
    }
    close $emmix;
    close $out;
    system "rm emmix.ks.dat emmix.ks.ctr emmix.ks.mix >/dev/null";
}
# Thread coding sequences onto a protein alignment.
#   $aa_aln  : aligned protein fasta (gaps as "-")
#   $dna_aln : output fasta; each residue is replaced by its codon, each gap
#              by "---", wrapped at 60 columns
#   $cds     : unaligned coding sequences with the same ids as $aa_aln
# A single trailing stop codon (TAA, TAG or TGA, any case) is removed from a
# coding sequence before threading. Sequences that are missing, too short or
# not in codon frame are reported on STDOUT and left out of $dna_aln.
sub create_codon_alignments {
    my ( $aa_aln, $dna_aln, $cds ) = @_;
    my ( %seq, @seq_order, $aa_id, $dna_id );
    # Label for the diagnostic messages only: first run of digits in the
    # alignment file name, or "NA" when there is none.
    my $ortho_id = ( $aa_aln =~ /(\d+)/ ) ? $1 : "NA";

    open(my $aa_in, '<', $aa_aln) or die "can't open $aa_aln file\n";
    while (my $row = <$aa_in>) {
        chomp $row;
        if ($row =~ /^>(\S+)/) { $aa_id = $1; push (@seq_order, $aa_id); }
        elsif (defined $aa_id) { $row =~ s/\s+//g; $seq{$aa_id}{aa} .= $row; }
    }
    close $aa_in;

    open(my $dna_in, '<', $cds) or die "can't open $cds file\n";
    while (my $row = <$dna_in>) {
        chomp $row;
        if ($row =~ /^>(\S+)/) { $dna_id = $1; }
        elsif (defined $dna_id) {
            $row =~ s/\s+//g;
            # X is not a nucleotide code; treat it as the unknown base N
            $row =~ s/X/N/g;
            $row =~ s/x/n/g;
            $seq{$dna_id}{dna} .= $row;
        }
    }
    close $dna_in;

    open(my $out, '>', $dna_aln) or die "can't open $dna_aln file\n";
    foreach my $id (@seq_order) {
        unless ($seq{$id}{aa} && $seq{$id}{dna}) { print "Skipping sequence $id in orthogroup $ortho_id does not have a valid aa and dna\n\n"; next; }
        my $aln = $seq{$id}{aa};
        my $aln_no_gaps = $aln;
        $aln_no_gaps =~ s/-//g;
        my $aa_len = length($aln_no_gaps);
        # an alignment row made of gaps only has no residues to thread
        if ($aa_len == 0) { print "Skipping sequence $id in orthogroup $ortho_id does not have a valid aa and dna\n\n"; next; }
        my $dna = $seq{$id}{dna};
        if (int(length($dna)/3) < $aa_len) {
            print "Skipping sequence $id in orthogroup $ortho_id has dna condon length less than aa length\n\n";
            next;
        }
        # Expect exactly three bases per residue; the only tolerated extra is
        # a trailing stop codon -> TAA,TAG,TGA
        if (length($dna) != 3 * $aa_len) {
            if (uc(substr($dna, -3)) =~ /^(?:TAA|TAG|TGA)$/) {
                $dna = substr($dna, 0, -3);
            }
            else { print "Skipping sequence $id in orthogroup $ortho_id not in codon frame\n\n"; next; }
        }
        my @codon = unpack("a3" x int(length($dna)/3), $dna);
        if ($aa_len != scalar(@codon)) { print "Skipping sequence $id in orthogroup $ortho_id not in codon frame\n\n"; next; }
        my $index = 0;
        my $codon_aln = "";
        foreach my $aa (split("", $aln)) {
            if ($aa eq "-") { $codon_aln .= "---"; }
            elsif ($aa eq "?") { $codon_aln .= "???"; $index++; }
            else { $codon_aln .= $codon[$index]; $index++; }
        }
        # wrap at 60 columns without adding a trailing empty line
        $codon_aln =~ s/(.{60})(?=.)/$1\n/g;
        print {$out} ">$id\n$codon_aln\n";
    }
    close $out;
}
# Read a fasta file and return a hash (as a list) mapping each sequence id -
# the first whitespace-delimited word after ">" - to its sequence with all
# whitespace removed. Lines that appear before the first header are ignored.
# Dies when the file cannot be opened.
sub get_sequences {
    my ( $file ) = @_;
    my ( %sequences, $id );
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode and no global handle is clobbered.
    open(my $in, '<', $file) or die "can't open $file file\n";
    while (my $row = <$in>) {
        if ($row =~ />(\S+)/) { $id = $1; next; }
        next unless defined $id;
        $row =~ s/\s+//g;
        $sequences{$id} .= $row;
    }
    close $in;
    return %sequences;
}