-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_region_from_depthfile.pl
156 lines (120 loc) · 3.59 KB
/
get_region_from_depthfile.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/perl -w
use strict;
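=head1 get_region_from_depthfile.pl

Author: Ian Beddows
Date: 3/9/18

Short description: for every region listed in -region_file, write the rows of
the -depth_file table whose position lies between exon_start and exon_end to
-outfile, each prefixed with the gene name, the position and a running
TSS-distance counter.

Example:

  ./get_region_from_depthfile.pl -region_file regions.txt -depth_file deliverables/depth_of_coverage.filtered.txt -outfile region_depth.txt

=cut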
use Getopt::Long;
# Usage text, printed for -h/--help and whenever the command line is unusable.
# Only options that GetOptions below really accepts are listed.
my $usage = <<EOF;
OPTIONS:
-region_file
-depth_file
-outfile
OPTIONAL:
-h|help = print usage
EOF
my ($help, $region_file, $depth_file, $outfile);
#=======================================================================
# GetOptions returns false on unknown or malformed options; stop there
# rather than running with a half-parsed command line.
GetOptions(
    'region_file=s' => \$region_file,    # string: file opened later as the region table
    'depth_file=s'  => \$depth_file,     # string: file opened later as the depth table
    'outfile=s'     => \$outfile,        # string: file opened later for writing
    'h|help'        => \$help,           # flag for help
) or die "Invalid command line.\n", $usage;

# Asking for help is not a failure: print the usage and exit with status 0.
if (defined $help) {
    print "HELP:\n", $usage;
    exit 0;
}

# All three paths are opened further down; report every missing one up front
# instead of failing on an anonymous open() later.
my @missing_options = grep { !defined $_->[1] } (
    [ '-region_file', $region_file ],
    [ '-depth_file',  $depth_file ],
    [ '-outfile',     $outfile ],
);
if (@missing_options) {
    die "Missing required option(s): ",
        join(', ', map { $_->[0] } @missing_options), "\n", $usage;
}
# First load all depth of coverage data:
# every tab-separated row after the header is stored as
#   $d{chr}{pos} = [ remaining columns ]
# where chr and pos are the first two columns of the row.
defined $depth_file or die "No -depth_file given\n";
defined $outfile    or die "No -outfile given\n";
open(my $in, '<', $depth_file)
    or die "Cannot open depth file '$depth_file' for reading: $!\n";
# The output handle is opened here (truncating any existing file) and is
# written to by the region pass further down.
open(my $out, '>', $outfile)
    or die "Cannot open output file '$outfile' for writing: $!\n";
my %samples = ();    # column index (after chr/pos are removed) => header label; filled here only
my %d       = ();    # depth
my $c       = 0;     # number of data rows stored
while (<$in>) {
    chomp;
    my ($chr, $pos, @data) = split /\t/, $_;
    if ($. == 1) {    # get header: first physical line of the depth file
        @samples{ 0 .. $#data } = @data;
        next;
    }
    # Blank or single-column lines have no chr/pos pair to key on; skip them
    # instead of storing them under undefined keys.
    next unless defined $chr && defined $pos;
    $d{$chr}{$pos} = [@data];
    $c++;
    # Progress heartbeat every million rows.
    if ($c % 1000000 == 0) {
        print STDOUT "$c\t$chr\t$pos\n";
    }
}
close($in);
# Report how many positions were loaded for each chromosome key of %d.
# Keys are sorted so the report order is the same on every run, and the loop
# variable has its own name so it does not shadow the row counter $c.
foreach my $chrom (sort keys %d) {
    my $position_count = scalar keys %{ $d{$chrom} };
    print "Found ", $position_count, " positions on chromosome |$chrom|\n";
}
#~ exit;
# Then load the regions and foreach subset the depth data and save that as the output.
# Each region row is tab-separated with the fields, in order:
#   ext_gene, ensembl_gene_id, exon_start, exon_end, distance, strand, chromosome_name
# For every position from exon_start to exon_end that exists in %d for that
# chromosome, one line is written to $out:
#   ext_gene <TAB> position <TAB> tss_distance <TAB> depth columns...
defined $region_file or die "No -region_file given\n";
open($in, '<', $region_file)
    or die "Cannot open region file '$region_file' for reading: $!\n";
while (<$in>) {
    chomp;
    next if $. == 1;       # first line of the region file is the header
    next unless length;    # ignore blank lines
    my ($ext_gene, $ensembl_gene_id, $exon_start, $exon_end, $distance, $strand, $chromosome_name)
        = split /\t/, $_;
    unless (defined $chromosome_name) {
        warn "Skipping region line $. with fewer than 7 fields: $_\n";
        next;
    }
    print "$ext_gene\n";
    # go through all positions that are in the gene body +/- 50kb, keeping track of how far they are
    my $is_forward = ($strand == 1);
    my $step       = $is_forward ? 1 : -1;    # add or subtract distance as needed
    # Forward strand always starts -50kb from the first exon; otherwise the
    # counter starts at distance - 50000 (because distance includes +/- 50k)
    # and runs downwards.
    my $tss_distance = $is_forward ? -50000 : $distance - 50000;
    # Look the chromosome up once; a chromosome missing from the depth data
    # gets an empty table so no entry is autovivified in %d.
    my $depth_at  = exists $d{$chromosome_name} ? $d{$chromosome_name} : {};
    my $found_pos = 0;
    print "\tnow searching for postions from $exon_start to $exon_end\n";
    for (my $p = $exon_start; $p <= $exon_end; $p++) {    # go from -50k to +50k of gene body
        if (exists $depth_at->{$p}) {
            print {$out} "$ext_gene\t$p\t$tss_distance\t",
                join("\t", @{ $depth_at->{$p} }), "\n";
            $found_pos++;
        }
        $tss_distance += $step;
    }
    print "\tFound $found_pos positions for $ext_gene\n";
}
close($in);
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($out) or die "Error while closing output file '$outfile': $!\n";
print "Done\n";
exit;
#=======================================================================
#( Subroutines )
# ------------------------------------
# o
# o \_\_ _/_/
# o \__/
# (oo)\_______
# (__)\ )\/\
# ||----w |
# || ||