diff --git a/resources/int2bnt.pl b/resources/int2bnt.pl index b07b0e9..a39067c 100644 --- a/resources/int2bnt.pl +++ b/resources/int2bnt.pl @@ -1,12 +1,16 @@ -#!/usr/bin/perl +#!/software/bin/perl ## Description: This script generates binary intensity files in the format required by Evoker ## Note: We assume that the input intensity file has the same snps and samples as the bim and fam files for the mathing bed file ## -## Usage: >./int2bnt.pl collection.chr.int --filetype="illuminus" +## Usage: >./int2bnt.pl --intensities collection.chr.int --filetype="illuminus" --out output_filename {--samples samples.fam} ## Input: Intensity file in one of the accepted formats named in the form collection.chromosome.int ## Output: Binary Intensity file collection.chromosome.bnt -## Arguments: --filetype [chaimo | affy | illuminus] +## Arguments: +## -i --input input intensity file +## -f --filetype [chaimo | affy | illuminus] +## -o --output output file name and path +## -s --samples .fam file for sample to include in output ## default format: ## A matrix of intensities with SNPs as rows and individuals as pairs of whitespaceÐseparated columns. ##Êchaimo input format: @@ -21,17 +25,30 @@ use strict; use Getopt::Long; -my $filetype = ''; +my $inputfile = ''; +my $samples = ''; +my $filetype = ''; +my $outfile = ''; -GetOptions( 'filetype=s' => \$filetype ); - -my $inputfile = $ARGV[0]; +GetOptions( 'input=s' => \$inputfile, + 'samples=s' => \$samples, + 'filetype=s' => \$filetype, + 'output=s' => \$outfile + ); + +unless ($inputfile && $outfile && $filetype) { + die "Missing required arguments: -i -o -f "; +} -$inputfile =~ /(.+)\.(.+)\.int/; -open (OUT, ">$1.$2.bnt") or die "Can't open output '$1.$2.bnt': $!"; +my $out_fh; +if ($outfile =~ /\.bnt$/i) { + open ($out_fh, ">$outfile") or die "Can't open output '$outfile': $!"; +} else { + open ($out_fh, ">$outfile.bnt") or die "Can't open output '$outfile.bnt': $!"; +} ## magic number to ensure the binary is a real evoker file not just garbage -print OUT pack('B*',"0001101000110001"); +print $out_fh pack('B*',"0001101000110001"); if ($inputfile =~ /\.gz$/){ open (IN, "zcat $inputfile |") or die "Can't open '>zcat $inputfile': $!"; @@ -39,13 +56,39 @@ open (IN, $inputfile) or die "Error: Can't open '$inputfile': $!"; } +if ($samples) { + if ($filetype =~ /illuminus/i || $filetype =~ /beagle/i) { + + my $aSamples = parse_sample_file($samples); + my $header = ; + + my $aSampPos = sample_position_array($header, $aSamples); + + while (my $line = ){ + chomp($line); + my @fields = split(/\s+/, $line); + + for my $pos (@$aSampPos) { + my $int = $fields[$pos]; + if ($int eq 'NaN') { + print $out_fh pack('f*', -1); + } else { + print $out_fh pack('f*', $int); + } + } + } + } else { + die "Sample filtering is not supported for file type '$filetype'\n"; + } +} + if ($filetype =~ /chiamo/i) { my $header = ; while (my $line = ){ chomp($line); my @fields = split(/\s+/, $line); for (my $i = 5; $i < scalar(@fields); $i++){ - print OUT pack('f*', $fields[$i]); + print $out_fh pack('f*', $fields[$i]); } } } elsif ($filetype =~ /affy/i) { @@ -83,7 +126,7 @@ $snp_id_b =~ s/-B$//; if ($snp_id_a eq $snp_id_b) { for (my $i=0; $i<@allele_a; $i++) { - print OUT pack('f*', ($allele_a[$i],$allele_b[$i])); + print $out_fh pack('f*', ($allele_a[$i],$allele_b[$i])); } @allele_a = (); @allele_b = (); @@ -100,18 +143,18 @@ for (my $i = 3; $i < scalar(@fields); $i++){ my $int = $fields[$i]; if ($int eq 'NaN') { - print OUT pack('f*', -1); + print $out_fh pack('f*', -1); } else { - print OUT pack('f*', $int); + print $out_fh pack('f*', $int); } } } -}else { +}elsif ($filetype =~ /default/) { ; while (){ my @fields = split; for (my $i = 1; $i <= $#fields; $i++){ - print OUT pack('f*',$fields[$i]); + print $out_fh pack('f*',$fields[$i]); } } } @@ -119,4 +162,43 @@ close IN; close OUT; - \ No newline at end of file +sub parse_sample_file { + my $file = shift; + open(FAM, $file) or die "Can't open '$file': $!\n"; + my @samples; + while (my $line = ) { + chomp $line; + my @vals = split(/\s+/, $line); + push(@samples, $vals[1]); + } + close FAM; + return \@samples; +} + +##Êcreate a sample_position array to match each of the samples to a position on the vcf line +sub sample_position_array { + my $header = shift; + my $aSamples = shift; + + my @sample_position; + my @hsamples = split( /\s+/, $header ); + ## create a hash of all the samples with their position on the line + my %sample_pos; + for (my $i = 0; $i <= $#hsamples; $i++) { + $sample_pos{$hsamples[$i]} = $i; + } + + for my $sample (@$aSamples) { + ## get both A and B + for my $channel ('A','B') { + my $test = $sample.$channel; + if (exists $sample_pos{$test}) { + push(@sample_position, $sample_pos{$test}); + print "$test $sample_pos{$test}\n"; + } else { + warn "Sample '$test' not in data\n"; + } + } + } + return \@sample_position; +} \ No newline at end of file