-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate_fasta.pl
139 lines (131 loc) · 4.89 KB
/
translate_fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/perl -w
# translate_fasta.pl
#
# Specify a FASTA file to produce all 6 translations of the sequences
#
# main program follows the subroutine definitions
# (reverseComplement, translate, formatSeq)
#
# Revision 4 seq in hash, hash->hash->array, simplified loop length
# 11 February 2011 Copyright Lenna Xiao Ping Peterson
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
# reverseComplement -----------------------------------------------------------
# finds reverse complement of a DNA sequence
# the result is always upper case; IUPAC ambiguity codes are complemented
# too (R<->Y, K<->M, B<->V, D<->H), while self-complementary codes (N, S, W)
# and any other character (gaps etc.) are kept as they are
# an undefined argument yields an empty string
# USAGE:
# $reverse = reverseComplement( $dna );
sub reverseComplement {
    my ($seq) = @_;
    return "" unless defined $seq;

    # scalar forces string reversal regardless of the caller's context
    my $rev_seq = uc scalar reverse $seq;

    # complement every base; both lists are upper case because of the uc above
    $rev_seq =~ tr/ACGTRYKMBDHV/TGCAYRMKVHDB/;
    return $rev_seq;
}
# translate -------------------------------------------------------------------
# translates a DNA sequence into 1-letter amino acid codes (standard table,
# stop codons shown as "*")
# input is case-insensitive
# a codon that is not in the table (e.g. it contains N or another ambiguity
# code) becomes "X", so the residues after it keep their positions
# trailing bases that do not fill a whole codon are ignored
# returns array of the 3 reading frames (offsets 0, 1 and 2)
# USAGE:
# @proteins = translate( $dna );
sub translate {
    my ($seq) = @_;

    # the codon table is upper case only, so normalise the input first
    $seq = defined $seq ? uc $seq : "";

    my %aa_list = (
        TTT => "F", TTC => "F", TTA => "L", TTG => "L",
        TCT => "S", TCC => "S", TCA => "S", TCG => "S",
        TAT => "Y", TAC => "Y", TAA => "*", TAG => "*",
        TGT => "C", TGC => "C", TGA => "*", TGG => "W",
        CTT => "L", CTC => "L", CTA => "L", CTG => "L",
        CCT => "P", CCC => "P", CCA => "P", CCG => "P",
        CAT => "H", CAC => "H", CAA => "Q", CAG => "Q",
        CGT => "R", CGC => "R", CGA => "R", CGG => "R",
        ATT => "I", ATC => "I", ATA => "I", ATG => "M",
        ACT => "T", ACC => "T", ACA => "T", ACG => "T",
        AAT => "N", AAC => "N", AAA => "K", AAG => "K",
        AGT => "S", AGC => "S", AGA => "R", AGG => "R",
        GTT => "V", GTC => "V", GTA => "V", GTG => "V",
        GCT => "A", GCC => "A", GCA => "A", GCG => "A",
        GAT => "D", GAC => "D", GAA => "E", GAG => "E",
        GGT => "G", GGC => "G", GGA => "G", GGG => "G"
    );

    my @proteins;
    my $length = length $seq;
    for my $frame ( 0 .. 2 ) {
        my $protein = "";

        # start at the frame offset and stop while a full codon still fits,
        # so substr never reads past the end of the sequence
        for ( my $k = $frame; $k + 3 <= $length; $k += 3 ) {
            my $codon = substr $seq, $k, 3;
            $protein .= exists $aa_list{$codon} ? $aa_list{$codon} : "X";
        }
        push @proteins, $protein;
    }
    return @proteins;
}
# formatSeq -------------------------------------------------------------------
# formats a sequence to be printed with a specified block size and line length
# blocks on one line are separated by a single space, lines by a newline;
# there is no trailing newline
# if line length is not an even multiple of block size, rounds down
# if line length is less than block size, treated as # of blocks
# an empty (or undefined) sequence gives an empty string
# USAGE:
# $output = formatSeq( $proteins[0], 10, 50 );
sub formatSeq {
    my ($seq, $block_len, $line_len) = @_;
    $seq = "" unless defined $seq;

    # force line_len to be a whole number of blocks
    if    ($line_len < $block_len) { $line_len *= $block_len; }
    elsif ($line_len % $block_len) { $line_len -= $line_len % $block_len; }

    my $out = "";    # start defined so an empty sequence returns "" and not undef
    for my $pos ( 0 .. length($seq) - 1 ) {

        # at every block boundary except the very first character:
        # newline when the boundary also ends a line, otherwise a space
        if ( $pos && $pos % $block_len == 0 ) {
            $out .= $pos % $line_len == 0 ? "\n" : " ";
        }
        $out .= substr $seq, $pos, 1;
    }
    return $out;
}
### grab sequence name and full sequence from input
### the key of a record is its header text up to the first whitespace or
### pipe character; a later record with the same key replaces the earlier one
my ($seq, $key, %sequences, %proteins);
$seq = "";

# stores the residues collected so far under the current key and resets them;
# a header that was followed by no residues stores nothing
my $store_seq = sub {
    return unless length $seq;
    if ( !defined $key ) {
        # data before the first header, or a header with an empty name
        warn "translate_fasta.pl: sequence data without a header name\n";
        $key = "";
    }
    warn "translate_fasta.pl: duplicate sequence name '$key', ",
        "earlier sequence replaced\n"
        if exists $sequences{$key};
    $sequences{$key} = $seq;
    $seq = "";
};

while ( my $line = <> ) {
    $line =~ s/\s+\z//;                     # remove LF, CR-LF, trailing blanks
    if ( $line =~ s/^>// ) {                # header line: remove leading >
        $store_seq->();                     # finish the previous record
        ( my $seq_name ) = split " ", $line, 2;    # part before space
        $seq_name = "" unless defined $seq_name;   # header was just ">"
        ( $key ) = split /\|/, $seq_name, 2;       # part before pipe
    }
    else {
        $line =~ s/\s+//g;                  # blanks inside a line are not residues
        $seq .= $line;                      # concatenate the sequence
    }
}
$store_seq->();                             # store last sequence
### build the nested protein table, struct: {$key}{$direction}[$frame]
### each direction holds the three reading frames returned by translate
foreach my $name ( sort keys %sequences ) {
    my $dna = $sequences{$name};

    my @forward_frames = translate($dna);
    my @reverse_frames = translate( reverseComplement($dna) );

    $proteins{$name} = {
        forward => \@forward_frames,
        reverse => \@reverse_frames,
    };
}
#### write every translation as a FASTA record
#### header: >KEY_<first letter of direction><frame number>, then a description
my $block_len = 10;
my $line_len = $block_len * 5;
foreach my $name ( sort keys %proteins ) {              # each input sequence
    my $by_direction = $proteins{$name};
    foreach my $direction ( sort keys %$by_direction ) {    # forward, reverse
        my $tag    = substr( $direction, 0, 1 );
        my $frames = $by_direction->{$direction};
        foreach my $index ( 0 .. 2 ) {                  # the three reading frames
            my $number = $index + 1;                    # frames are shown 1-based
            my $header = ">$name" . "_$tag$number "
                       . "$name $direction reading frame $number\n";
            my $body = formatSeq( $frames->[$index], $block_len, $line_len );
            print $header, $body, "\n";
        }
    }
}
### end of translate_fasta.pl ----------------------------------------------------