-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaffy_chip_parse.pl
153 lines (145 loc) · 6.33 KB
/
affy_chip_parse.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/perl
##v0.1.1 Robert R Butler III on 3/6/2017
##parsing fields from ClinVar and adding to affy chip master file
##Field values must not contain ","; replace any commas with another character before running
##Reads each line and keys every value by its header field (header names have " " replaced with "_" and '"' removed)
##Fetches data from ClinVar using the Entrez Direct tools (epost, efetch, xtract), which must be installed
use strict;
use warnings;
use Timer::Runtime;
use Sort::Naturally;
use Data::Dumper qw(Dumper); #for debug
my $usage = 'USAGE = perl affy_chip_parse.pl *.csv '; #gimme the right inputs
##loading Affy chip hash (%table)
## Reads the CSV named by the first command-line argument into %table, a hash
## of row hashes keyed by the value in the second column (used as Probe.Set.ID).
## Also opens the OUT handle on "<input name minus .csv>.out.txt"; the output
## section at the end of the script writes to it.
my $file = $ARGV[0] or die $usage; #going through text file and making outfile
#list form keeps the shell from re-parsing file names with spaces or metacharacters;
#a failed conversion is only reported, the file is then read as-is
system('dos2unix', $file) == 0
    or warn "dos2unix failed on $file (status $?); reading the file as-is\n";
print "Loading Affy array table...\n";
open(IN, '<', $file) or die "Cannot open $file for reading: $!\n";
$file =~ s/\.csv\z//; #dropping extension for outfile (only a literal ".csv" at the very end)
open(OUT, '>', "$file.out.txt") or die "Cannot open $file.out.txt for writing: $!\n";
my %table; #hash of row hashes
my @names = (); #column headers
while (my $line = <IN>){ #reading lines for variant info
    chomp $line;
    if ($line =~ /^"?VariationID/){ #header row; the first field may still carry its quote
        @names = split(/,/, $line); #column names
        for (@names){
            s/ /_/g; #replacing " " with "_"
            s/"//g;  #getting rid of "
        }
    } elsif ($line =~ /\S+/){ #splitting variant info columns into variables
        next unless @names; #no header seen yet, so there is nothing to key the values by
        my @values = split(/,/, $line, -1); #-1 keeps trailing empty fields instead of dropping them
        s/"//g for @values; #getting rid of "
        #Probe.Set.ID (second column) is the unique key for the table hash
        my $affyID = $values[1] // '';
        #one cell per header; columns missing from a short row become empty strings
        my @cells = map { $_ // '' } @values[0 .. $#names];
        @{ $table{$affyID} }{@names} = @cells; #putting values into hash
    }
    #blank lines are ignored
}
close IN;
##end loading Affy chip hash (%table)
##fetch Entrez Direct for submitter data
## Collects the distinct non-"NA" VariationIDs found in %table, writes them to
## UIDs.temp, runs the Entrez Direct pipeline (epost | efetch | xtract) into
## efetch.txt, and then fills six new columns on the %table rows:
##   rows whose VariationID is "NA"         -> every new column is "NA"
##   rows whose VariationID is in efetch.txt -> values parsed from that line
##   all other rows                          -> left unset (the output section marks them)
print "Querying NCBI for Submitter Data...\n";
#Column Headers added
my @newnames = ("ClinVar_Significance", "Invitae_Sig", "GeneDx_Sig", "Ambry_Sig", "Number_of_Submissions", "Submission_Details"); #new columns
push (@names, @newnames); #adding the column headers to the affy table
#index the table once by VariationID so each ClinVar line only touches its own rows
my %probes_for; #VariationID => [ %table keys carrying that VariationID ]
for my $key (keys %table){ #grabbing all VariationIDs in the affy table
    my $vid = $table{$key}{VariationID};
    next unless defined $vid && length $vid; #nothing to query for
    if ($vid eq 'NA'){ #in affy array but no varID: every new column is "NA"
        @{ $table{$key} }{@newnames} = ("NA") x @newnames; #done here so it happens even if efetch.txt is empty
        next;
    }
    push @{ $probes_for{$vid} }, $key;
}
my @UIDlist = keys %probes_for; #hash keys are already free of redundant UIDs
open(OUT2, '>', 'UIDs.temp') or die "Cannot open UIDs.temp for writing: $!\n"; #write UIDs to file
print OUT2 "$_\n" for @UIDlist;
close OUT2 or die "Cannot finish writing UIDs.temp: $!\n";
#single-quoted on purpose: @VariationID, the backslashes and the quotes must reach the shell untouched
my $pipeline = 'epost -db clinvar -input UIDs.temp | efetch -db clinvar -format variation | xtract -pattern VariationReport -element @VariationID \
-block ObservationList -if ClinicalSignificance/Explanation -element ClinicalSignificance/Explanation -else -lbl "\-" \
-block ObservationList -sep "|" -element @VariationID -element ClinicalSignificance/Description \
-block Germline -tab "|" -element @SubmitterName ClinicalSignificance/Description > efetch.txt';
system($pipeline) == 0
    or warn "Entrez Direct pipeline exited with status $?; efetch.txt may be incomplete\n";
open(IN2, '<', 'efetch.txt') or die "Cannot open efetch.txt for reading: $!\n";
print "Adding submitter data to table...\n";
while (my $line = <IN2>){
    chomp $line;
    my ($VariationID, $subdetail, $observationIDs, $observsigs, $subsigs) = split ("\t", $line, 5); # breaking up clinvar results sections
    next unless defined $VariationID && exists $probes_for{$VariationID}; #no table row uses this VariationID
    $_ //= '' for ($subdetail, $observationIDs, $observsigs, $subsigs); #short lines leave trailing sections empty
    for ($observsigs, $subsigs){
        s/, /;/g; #changing ", " to ";"
        s/ /_/g;  #changing " " to "_" (this is why the Ambry key below is "Ambry_Genetics")
    }
    #ClinVar_Significance value
    my @OIDs = split(/\|/, $observationIDs); #VariationID for each clinical observation
    my @Osigs = split(/\|/, $observsigs); #clinical observations
    my %clinobs; #clinical observations hash by VariationID
    @clinobs{@OIDs} = @Osigs;
    #Invitae/Genedx/Ambry Clinical Significances
    my %clinsubs; #submitter => significance
    my @subs = split(/\|/, $subsigs); #already an ordered array "key, value, key, value"
    while (@subs){ #Have to deal with duplicate significances without deleting them
        my $submitter = shift @subs;
        my $significance = shift @subs;
        #keep appending "z" until the key is free: key, keyz, keyzz, ... so no submission is overwritten
        $submitter .= "z" while exists $clinsubs{$submitter};
        $clinsubs{$submitter} = $significance;
    }
    my $submittercount = scalar(keys %clinsubs); # of submissions
    #fill new array values on every row that carries this VariationID
    for my $key (@{ $probes_for{$VariationID} }){
        my $entry = $table{$key};
        $entry->{ClinVar_Significance} = $clinobs{$VariationID};
        $entry->{Number_of_Submissions} = $submittercount;
        $entry->{Submission_Details} = $subdetail;
        #either they have submitter data or "NA"
        $entry->{Invitae_Sig} = exists $clinsubs{Invitae} ? $clinsubs{Invitae} : "NA";
        $entry->{GeneDx_Sig} = exists $clinsubs{GeneDx} ? $clinsubs{GeneDx} : "NA";
        $entry->{Ambry_Sig} = exists $clinsubs{Ambry_Genetics} ? $clinsubs{Ambry_Genetics} : "NA";
    }
}
close IN2;
##end fetch Entrez Direct for submitter data
##print Affy chip hash to file
## Writes %table to the OUT handle as tab-separated text: one header line made
## of @names, then one line per %table key, sorted by VariationID (natural
## order) and then by Probe.Set.ID. Cells a row never received are written as
## "NA*"; a single notice is printed to STDOUT if that happened at all.
print "Writing output file...\n";
print OUT join("\t", @names), "\n";#printing header
shift@names; #remove VariationID from column list; it is written separately as the first cell of each row
my $missing_cells = 0; #how many cells had to be written as "NA*"
for my $key (sort { ncmp($table{$a}{VariationID},$table{$b}{VariationID}) || $table{$a}{"Probe.Set.ID"} cmp $table{$b}{"Probe.Set.ID"} } keys %table){
    my @cells = ($table{$key}{VariationID} // ''); #no tab in front of first column
    for my $name (@names){
        if (exists $table{$key}{$name}){
            push @cells, $table{$key}{$name} // ''; #a present-but-undefined value is written as an empty cell
        } else {
            push @cells, "NA*"; #variantID not in ClinVar query, so the cell was never filled
            $missing_cells++;
        }
    }
    print OUT join("\t", @cells), "\n"; #tab separated row plus line return
}
close OUT or die "Cannot finish writing the output file: $!\n"; #buffered write errors only surface here
#say it once instead of once per missing cell
print "Some VariationIDs were not in the ClinVar query \"NA\*\"\n" if $missing_cells;
##end print Affy chip hash to file
#print Dumper \%clinsubs; #for debug
#print Dumper \%clinobs; #for debug