forked from carloartieri/fraser_lab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BED2FASTA.pl
executable file
·142 lines (107 loc) · 2.95 KB
/
BED2FASTA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/perl
# The purpose of this script is to extract FASTA sequences of all of the annotations (e.g.,
# spliced transcripts) within a BED file.
#USAGE: perl BED2FASTA.pl --genome [GENOME FASTA FILE] --bed [BED FILE] --out [OUTFILE] [--rev]
############################################
# UPDATE HISTORY AND LIST OF THINGS TO FIX #
############################################
#11.03.15
# - Charted out the initial concept and finished script.
#12.11.09
# - Modified script to use options and allow the user to request reverse complementing of
# genes on the neg. strand.
#15.03.12
# - Cleaned up and put in shared script directory
#
#15.03.27
# - Added ANSIColor module
###############
# SUBROUTINES #
###############
use Getopt::Long;
use Term::ANSIColor;
# fasta2hash(FILE)
#
# Reads the FASTA file at path FILE and returns a hash (as a flat key/value list) that maps
# each header (all text following the leading '>') to its sequence. The sequence lines that
# belong to one record are concatenated into a single string.
#
# Behaviour worth knowing:
# - The file is parsed line by line instead of being slurped into an array first, so the
#   sequence data is not held twice while parsing.
# - Both "\n" and "\r\n" line endings are removed, so files written on Windows do not leave
#   carriage returns inside header keys or sequences.
# - Only a line that STARTS with '>' is a header; the whole remainder of that line is the key.
# - A header that is immediately followed by another header (no sequence in between) is not
#   stored. The last header of the file is always stored, even when its sequence is empty.
# - Sequence lines that appear before the first header are ignored, and a file without any
#   header yields an empty hash.
# - Dies with a message naming FILE when the file cannot be opened.
sub fasta2hash {
    my ($file) = @_;
    my %fastahash;
    my $curhead;        #Header of the record currently being collected.
    my $seq = "";       #Sequence collected so far for $curhead.

    #Three-argument open with a lexical handle: the file name can never be read as a mode.
    open(my $fasta_fh, '<', $file) or die "Cannot open FASTA file '$file': $!\n";

    while (my $line = <$fasta_fh>) {
        $line =~ s/\r?\n\z//;   #Strip Unix and Windows line endings alike.

        if ($line =~ /\A>(.*)\z/s) {
            #A new header closes the previous record (if it had any sequence).
            $fastahash{$curhead} = $seq if defined($curhead) && $seq ne "";
            $curhead = $1;
            $seq = "";
        }
        else {
            $seq .= $line;
        }
    }
    close($fasta_fh);

    #The final FASTA record has no following header to trigger its storage.
    $fastahash{$curhead} = $seq if defined($curhead);

    return %fastahash;
}
# Command-line handling.
#
# Options read here:
#   --genome <file>   stored in $genome (required)
#   --bed <file>      stored in $bed    (required)
#   --out <file>      stored in $out    (required)
#   --rev             flag, stored in $rev (1 when given, 0 otherwise)
#   --help or --h     flag, stored in $help
#
# The usage text is printed and the script stops when --help is given, when one of the three
# required options is missing or empty, or when GetOptions reports a parsing problem (for
# example an unknown option). Only an explicit --help exits with status 0 and prints to
# STDOUT; every other case prints to STDERR and exits with status 1 so that calling scripts
# can detect the failure.
my $rev  = 0;
my $help = 0;
my ($genome, $bed, $out);

my $options_ok = GetOptions(
    'genome=s' => \$genome,
    'bed=s'    => \$bed,
    'out=s'    => \$out,
    'rev'      => \$rev,
    'help'     => \$help,
    'h'        => \$help,
);

#Number of required options that were not supplied (undefined or empty string).
my $missing = grep { !defined($_) || $_ eq "" } ($genome, $bed, $out);

if ($help || !$options_ok || $missing) {
    my $usage = '
This script takes an annotation file in BED format as well as a genome in FASTA format and
outputs a new FASTA file containing the sequences of the annotations (e.g., spliced genes).
USAGE: perl BED2FASTA.pl --genome <genome.fa> --bed <annotation.bed> --out <annotation.fa> --rev
Options and formatting are as follows:
--rev
If this option is specified, reverse transcribe genes on the negative strand so that their
sequence is 5\' - 3\'.
--help or --h
Print this text.
';

    if ($help) {
        print colored(['bright_red'], $usage);
        exit 0;
    }

    print STDERR colored(['bright_red'], $usage);
    exit 1;
}
##########
# SCRIPT #
##########
# Loads the genome with fasta2hash, then turns every feature line of the BED file into one
# FASTA record (">name" followed by the sequence) in the output file.
#
# For each feature the sequence is assembled from its blocks: column 11 (blockSizes) and
# column 12 (blockStarts, relative to chromStart in column 2) are comma-separated lists and
# each block is cut out of the chromosome with substr. When the block columns are absent
# (BED files with fewer than 12 columns) the interval chromStart..chromEnd is used as a
# single block. With --rev, features on the '-' strand are reverse complemented.
#
# Lines that are blank, start with '#', or are 'track'/'browser' lines are skipped silently.
# Malformed lines and features on a chromosome that is not in the genome are skipped with a
# warning on STDERR instead of producing an empty FASTA record.

#Read in the genome
my %genomefasta = fasta2hash($genome);

open(my $out_fh, '>', $out) or die "Cannot open output file '$out' for writing: $!\n";

#Open the BED file.
open(my $bed_fh, '<', $bed) or die "Cannot open BED file '$bed': $!\n";

while (my $record = <$bed_fh>) {
    $record =~ s/\r?\n\z//;     #Strip Unix and Windows line endings alike.

    #Skip everything that is not a feature line.
    next if $record =~ /\A\s*\z/;
    next if $record =~ /\A(?:#|track\s|browser\s)/;

    my @fields = split(/\t/, $record);
    my ($chr, $start, $end, $gene, $or) = @fields[0, 1, 2, 3, 5];

    unless (defined($start) && $start =~ /\A\d+\z/) {
        warn "Skipping malformed BED line $.: $record\n";
        next;
    }
    unless (exists $genomefasta{$chr}) {
        warn "Skipping BED line $.: chromosome '$chr' is not in the genome FASTA\n";
        next;
    }

    my @lengths;
    my @starts;
    if (defined($fields[10]) && $fields[10] ne "") {
        @lengths = split(/,/, $fields[10]);
        @starts  = defined($fields[11]) ? split(/,/, $fields[11]) : ();
    }
    elsif (defined($end) && $end =~ /\A\d+\z/ && $end >= $start) {
        #No block columns: the whole interval is one block starting at chromStart.
        @lengths = ($end - $start);
        @starts  = (0);
    }
    else {
        warn "Skipping BED line $.: no block columns and no usable end coordinate\n";
        next;
    }

    #Features without a name column are labelled by their coordinates.
    unless (defined($gene) && $gene ne "") {
        $gene = "$chr:$start" . (defined($end) ? "-$end" : "");
    }

    my $seq = "";
    for my $i (0 .. $#lengths) {
        my $offset = $start + (defined($starts[$i]) ? $starts[$i] : 0);
        my $piece  = substr($genomefasta{$chr}, $offset, $lengths[$i]);
        $seq .= $piece if defined($piece);  #substr yields undef past the chromosome end.
    }

    if ($rev && defined($or) && $or eq "-") {
        $seq = reverse($seq);
        #Complement, including the IUPAC ambiguity codes (R<->Y, K<->M, B<->V, D<->H).
        $seq =~ tr/ACGTRYKMBDHVacgtrykmbdhv/TGCAYRMKVHDBtgcayrmkvhdb/;
    }

    print {$out_fh} ">$gene\n$seq\n";
}

close($bed_fh);
#Buffered write errors only surface at close, so the result has to be checked.
close($out_fh) or die "Error while writing output file '$out': $!\n";