Skip to content

Commit

Permalink
users can now pass in the name of their output file.
Browse files Browse the repository at this point in the history
  • Loading branch information
james_morris81 committed Apr 5, 2012
1 parent a049396 commit ee09229
Showing 1 changed file with 99 additions and 17 deletions.
116 changes: 99 additions & 17 deletions resources/int2bnt.pl
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
#!/usr/bin/perl
#!/software/bin/perl

## Description: This script generates binary intensity files in the format required by Evoker
## Note: We assume that the input intensity file has the same SNPs and samples as the bim and fam files for the matching bed file
##
## Usage: >./int2bnt.pl collection.chr.int --filetype="illuminus"
## Usage: >./int2bnt.pl --input collection.chr.int --filetype="illuminus" --output output_filename {--samples samples.fam}
## Input: Intensity file in one of the accepted formats named in the form collection.chromosome.int
## Output: Binary Intensity file collection.chromosome.bnt
## Arguments: --filetype [chaimo | affy | illuminus]
## Arguments:
## -i --input input intensity file
## -f --filetype [chiamo | affy | illuminus]
## -o --output output file name and path
## -s --samples .fam file for sample to include in output
## default format:
## A matrix of intensities with SNPs as rows and individuals as pairs of whitespace-separated columns.
## chiamo input format:
Expand All @@ -21,31 +25,70 @@
use strict;
use Getopt::Long;

my $filetype = '';
my $inputfile = '';
my $samples = '';
my $filetype = '';
my $outfile = '';

GetOptions( 'filetype=s' => \$filetype );

my $inputfile = $ARGV[0];
GetOptions( 'input=s' => \$inputfile,
'samples=s' => \$samples,
'filetype=s' => \$filetype,
'output=s' => \$outfile
);

unless ($inputfile && $outfile && $filetype) {
die "Missing required arguments: -i <intensity file> -o <output file> -f <filetype>";
}

$inputfile =~ /(.+)\.(.+)\.int/;
open (OUT, ">$1.$2.bnt") or die "Can't open output '$1.$2.bnt': $!";
my $out_fh;
if ($outfile =~ /\.bnt$/i) {
open ($out_fh, ">$outfile") or die "Can't open output '$outfile': $!";
} else {
open ($out_fh, ">$outfile.bnt") or die "Can't open output '$outfile.bnt': $!";
}

## magic number to ensure the binary is a real evoker file not just garbage
print OUT pack('B*',"0001101000110001");
print $out_fh pack('B*',"0001101000110001");

if ($inputfile =~ /\.gz$/){
open (IN, "zcat $inputfile |") or die "Can't open '>zcat $inputfile': $!";
}else{
open (IN, $inputfile) or die "Error: Can't open '$inputfile': $!";
}

if ($samples) {
if ($filetype =~ /illuminus/i || $filetype =~ /beagle/i) {

my $aSamples = parse_sample_file($samples);
my $header = <IN>;

my $aSampPos = sample_position_array($header, $aSamples);

while (my $line = <IN>){
chomp($line);
my @fields = split(/\s+/, $line);

for my $pos (@$aSampPos) {
my $int = $fields[$pos];
if ($int eq 'NaN') {
print $out_fh pack('f*', -1);
} else {
print $out_fh pack('f*', $int);
}
}
}
} else {
die "Sample filtering is not supported for file type '$filetype'\n";
}
}

if ($filetype =~ /chiamo/i) {
my $header = <IN>;
while (my $line = <IN>){
chomp($line);
my @fields = split(/\s+/, $line);
for (my $i = 5; $i < scalar(@fields); $i++){
print OUT pack('f*', $fields[$i]);
print $out_fh pack('f*', $fields[$i]);
}
}
} elsif ($filetype =~ /affy/i) {
Expand Down Expand Up @@ -83,7 +126,7 @@
$snp_id_b =~ s/-B$//;
if ($snp_id_a eq $snp_id_b) {
for (my $i=0; $i<@allele_a; $i++) {
print OUT pack('f*', ($allele_a[$i],$allele_b[$i]));
print $out_fh pack('f*', ($allele_a[$i],$allele_b[$i]));
}
@allele_a = ();
@allele_b = ();
Expand All @@ -100,23 +143,62 @@
for (my $i = 3; $i < scalar(@fields); $i++){
my $int = $fields[$i];
if ($int eq 'NaN') {
print OUT pack('f*', -1);
print $out_fh pack('f*', -1);
} else {
print OUT pack('f*', $int);
print $out_fh pack('f*', $int);
}
}
}
}else {
}elsif ($filetype =~ /default/) {
<IN>;
while (<IN>){
my @fields = split;
for (my $i = 1; $i <= $#fields; $i++){
print OUT pack('f*',$fields[$i]);
print $out_fh pack('f*',$fields[$i]);
}
}
}

close IN;
close OUT;


## Read a .fam sample file and collect one sample ID per line.
##
## Arguments:
##   $file - path to the sample file; every line is whitespace-separated and
##           the second column is taken as the sample ID
## Returns:
##   array reference of sample IDs, in file order
## Dies:
##   if the file cannot be opened
sub parse_sample_file {
    my $file = shift;

    ## three-arg open with a lexical handle: the file name can never be
    ## interpreted as an open mode, and the handle cannot clash with a
    ## bareword handle used elsewhere in the script
    open(my $fam_fh, '<', $file) or die "Can't open '$file': $!\n";

    my @samples;
    while (my $line = <$fam_fh>) {
        chomp $line;
        ## split ' ' discards leading whitespace, so an indented line still
        ## yields the sample ID from the second column
        my @vals = split(' ', $line);
        ## blank or single-column lines carry no sample ID; skip them rather
        ## than pushing undef into the list
        next unless defined $vals[1];
        push(@samples, $vals[1]);
    }
    close $fam_fh;
    return \@samples;
}

## Map each requested sample to the column positions of its two intensity
## channels on the header line.
##
## Arguments:
##   $header   - header line, split on runs of whitespace into column names
##   $aSamples - array reference of sample IDs
## Returns:
##   array reference of 0-based column indices: for every sample, in the
##   order given, the index of column "<sample>A" and then "<sample>B"
## Side effects:
##   prints "<column name> <index>" to STDOUT for every column found and
##   warns for every column that is absent from the header
sub sample_position_array {
    my ($header, $aSamples) = @_;

    ## index every header column by name; when a name repeats, the last
    ## occurrence is the one remembered
    my @columns = split( /\s+/, $header );
    my %sample_pos;
    foreach my $idx (0 .. $#columns) {
        $sample_pos{ $columns[$idx] } = $idx;
    }

    my @positions;
    foreach my $sample (@$aSamples) {
        ## each sample owns two columns, suffixed A and B
        foreach my $suffix (qw(A B)) {
            my $test = $sample . $suffix;

            unless (exists $sample_pos{$test}) {
                warn "Sample '$test' not in data\n";
                next;
            }

            push(@positions, $sample_pos{$test});
            print "$test $sample_pos{$test}\n";
        }
    }
    return \@positions;
}

0 comments on commit ee09229

Please sign in to comment.