users can now pass in the name of their output file.

wtsi-medical-genomics · Apr 5, 2012 · ee09229 · ee09229
1 parent a049396
commit ee09229
Showing 1 changed file with 99 additions and 17 deletions.
diff --git a/resources/int2bnt.pl b/resources/int2bnt.pl
@@ -1,12 +1,16 @@
-#!/usr/bin/perl
+#!/software/bin/perl
 
 ## Description: This script generates binary intensity files in the format required by Evoker
 ## Note: We assume that the input intensity file has the same SNPs and samples as the bim and fam files for the matching bed file
 ##
-## Usage: >./int2bnt.pl collection.chr.int --filetype="illuminus"
+## Usage: >./int2bnt.pl --input collection.chr.int --filetype="illuminus" --output output_filename {--samples samples.fam}
 ## Input: Intensity file in one of the accepted formats named in the form collection.chromosome.int
 ## Output: Binary intensity file written to the --output path (".bnt" is appended when the name does not already end in it)
-## Arguments: --filetype [chaimo | affy | illuminus]
+## Arguments: 
+## -i --input input intensity file 
+## -f --filetype [chiamo | affy | illuminus | default]
+## -o --output output file name and path
+## -s --samples optional .fam file listing the samples to include in the output (illuminus or beagle filetype only)
 ## default format:
 ##	A matrix of intensities with SNPs as rows and individuals as pairs of whitespace-separated columns.
 ## chiamo input format:
@@ -21,31 +25,70 @@
 use strict;
 use Getopt::Long;
 
-my $filetype = '';
+my $inputfile = '';
+my $samples   = '';
+my $filetype  = '';
+my $outfile   = '';
 
-GetOptions(	'filetype=s' => \$filetype );
-
-my $inputfile = $ARGV[0];
+GetOptions(	'input=s'    => \$inputfile,
+			'samples=s'  => \$samples,
+			'filetype=s' => \$filetype,
+			'output=s'   => \$outfile 
+		  );
+
+unless ($inputfile && $outfile && $filetype) {
+	die "Missing required arguments: -i <intensity file> -o <output file> -f <filetype>";	
+}
 
-$inputfile =~ /(.+)\.(.+)\.int/;
-open (OUT, ">$1.$2.bnt") or die "Can't open output '$1.$2.bnt': $!";
+my $out_fh;
+if ($outfile =~ /\.bnt$/i) {
+	open ($out_fh, ">$outfile") or die "Can't open output '$outfile': $!";
+} else {
+	open ($out_fh, ">$outfile.bnt") or die "Can't open output '$outfile.bnt': $!";	
+}
 
 ## magic number to ensure the binary is a real evoker file not just garbage
-print OUT pack('B*',"0001101000110001");
+print $out_fh pack('B*',"0001101000110001");
 
 if ($inputfile =~ /\.gz$/){
 	open (IN, "zcat $inputfile |") or die "Can't open '>zcat $inputfile': $!";
 }else{
 	open (IN, $inputfile) or die "Error: Can't open '$inputfile': $!";
 }
 
+if ($samples) {
+	if ($filetype =~ /illuminus/i || $filetype =~ /beagle/i) {
+
+  		my $aSamples = parse_sample_file($samples);
+		my $header   = <IN>;  		
+
+  		my $aSampPos = sample_position_array($header, $aSamples);
+
+		while (my $line = <IN>){
+  			chomp($line);
+  			my @fields = split(/\s+/, $line);
+
+  			for my $pos (@$aSampPos) {
+				my $int = $fields[$pos];
+   	 			if ($int eq 'NaN') {
+   	 				print $out_fh pack('f*', -1);
+   	 			} else {
+   	 				print $out_fh pack('f*', $int);	
+   	 			}
+			}
+		}
+	} else {
+		die "Sample filtering is not supported for file type '$filetype'\n";
+	}	
+}
+
 if ($filetype =~ /chiamo/i) {	 
 	my $header = <IN>;
 	while (my $line = <IN>){
   		chomp($line);
   		my @fields = split(/\s+/, $line);
   		for (my $i = 5; $i < scalar(@fields); $i++){
-   	 		print OUT pack('f*', $fields[$i]);
+   	 		print $out_fh pack('f*', $fields[$i]);
   		}  		
 	}
 } elsif ($filetype =~ /affy/i) {
@@ -83,7 +126,7 @@
 			$snp_id_b =~ s/-B$//;
 			if ($snp_id_a eq $snp_id_b) {
 				for (my $i=0; $i<@allele_a; $i++) {
-					print OUT pack('f*', ($allele_a[$i],$allele_b[$i]));
+					print $out_fh pack('f*', ($allele_a[$i],$allele_b[$i]));
 				}
 				@allele_a = ();
 				@allele_b = ();	
@@ -100,23 +143,62 @@
   		for (my $i = 3; $i < scalar(@fields); $i++){
    	 		my $int = $fields[$i];
    	 		if ($int eq 'NaN') {
-   	 			print OUT pack('f*', -1);
+   	 			print $out_fh pack('f*', -1);
    	 		} else {
-   	 			print OUT pack('f*', $int);	
+   	 			print $out_fh pack('f*', $int);	
    	 		}
   		}  		
 	}						
-}else {
+}elsif ($filetype =~ /default/) {
 	<IN>;
 	while (<IN>){
   		my @fields = split; 		
   		for (my $i = 1; $i <= $#fields; $i++){
-   	 		print OUT pack('f*',$fields[$i]);
+   	 		print $out_fh pack('f*',$fields[$i]);
   		}	
 	}
 }
 
 close IN;
-close OUT;
+## The output is written through the lexical handle $out_fh; the bareword
+## handle OUT is no longer opened anywhere in this script. Buffered write
+## errors only surface when the handle is closed, so the result is checked.
+close $out_fh or die "Error: Can't close output file: $!";
 
-
+## Read a sample (.fam) file and return the sample IDs it lists.
+##
+## Arguments: $file - path to a whitespace-delimited sample file
+## Returns:   array reference holding the second column of every line that
+##            has at least two columns, in file order
+## Dies if the file cannot be opened.
+sub parse_sample_file {
+	my $file = shift;
+	## three-argument open with a lexical handle, so characters in the file
+	## name are never interpreted as an open mode or a pipe
+	open(my $fam_fh, '<', $file) or die "Can't open '$file': $!\n";
+	my @samples;
+	while (my $line = <$fam_fh>) {
+		chomp $line;
+		## split ' ' ignores leading whitespace, so an indented line still
+		## yields the sample ID as its second field
+		my @vals = split(' ', $line);
+		## blank or single-column lines carry no sample ID; skipping them
+		## avoids pushing undef into the list
+		next if @vals < 2;
+		push(@samples, $vals[1]);
+	}
+	close $fam_fh;
+	return \@samples;
+}
+
+## Map each requested sample to the columns that hold its intensities.
+##
+## Arguments: $header   - header line of the intensity file
+##            $aSamples - array reference of sample IDs
+## Returns:   array reference of zero-based column positions; for every
+##            sample the 'A' column comes first, then the 'B' column
+## Each match is reported on STDOUT. A column that is missing from the header
+## only triggers a warning and is left out of the result.
+sub sample_position_array {
+	my ($header, $aSamples) = @_;
+
+	## the header is split with the same pattern as the data lines, so an
+	## index into the header fields is also an index into a data line
+	my @header_fields = split( /\s+/, $header );
+
+	## column name => position on the line (a repeated name keeps its last position)
+	my %sample_pos;
+	@sample_pos{@header_fields} = (0 .. $#header_fields);
+
+	my @positions;
+	for my $sample (@$aSamples) {
+		## every sample is looked up under two column names: <sample>A and <sample>B
+		for my $channel (qw(A B)) {
+			my $test = $sample . $channel;
+			unless (exists $sample_pos{$test}) {
+				warn "Sample '$test' not in data\n";
+				next;
+			}
+			push @positions, $sample_pos{$test};
+			print "$test $sample_pos{$test}\n";	
+		}
+	}
+	return \@positions;
+}