Skip to content

Commit

Permalink
Adding merge_data -uniq along with some test files
Browse files Browse the repository at this point in the history
Former-commit-id: 42b4295e640332ce925d706df3f8d8007d08ac75
  • Loading branch information
barrymoore committed Jun 18, 2015
1 parent 2c6ffb3 commit 7ad7b74
Show file tree
Hide file tree
Showing 12 changed files with 170 additions and 23 deletions.
1 change: 1 addition & 0 deletions Build.PL
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ my $builder = $class->new(
bin/t/gtf2gff3.t
bin/t/gvf_validator.t
bin/t/map_seqids.t
bin/t/merge_data.t
bin/t/sam_inspector.t
bin/t/script_test_template.t
bin/t/ucsc2gff.t
Expand Down
2 changes: 2 additions & 0 deletions bin/fastqc_reporter
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ sub build_basic_report {
my @parts = split m|/|, $file;
pop @parts if scalar @parts > 1;
my $path = join '/', @parts;
$path =~ s/\.zip$//;
my $name = $parts[-1];
$data{files}{$file}{path} = $path;
$data{files}{$file}{name} = $name;
Expand Down Expand Up @@ -160,6 +161,7 @@ sub build_summary_report {
my @parts = split m|/|, $file;
pop @parts if scalar @parts > 1;
my $path = join '/', @parts;
$path =~ s/\.zip$//;
my $name = $parts[-1];
$data{files}{$file}{path} = $path;
$data{files}{$file}{name} = $name;
Expand Down
4 changes: 3 additions & 1 deletion bin/gff_tool
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ sub blend_gff {
'blend_gff_not_thouroughly_tested',
'blend has not been thouroughly tested. Please help',
'us improve gff_tool by carefully evaluating your',
'output and contacting us if you find errors. Thanks',
'output and contacting us if you find errors.',
);

my %features;
Expand Down Expand Up @@ -3821,6 +3821,8 @@ sub send_message {
my $comment = join ' ', @comments;

my $message = join ' : ', ($class, $code, $comment);
$message .= "\n";
$message =~ s/\n+$/\n/;

print STDERR $message;

Expand Down
68 changes: 46 additions & 22 deletions bin/merge_data
Original file line number Diff line number Diff line change
Expand Up @@ -24,35 +24,55 @@ Merge together the data from two files based on a shared ID columns;
Options:
col1: The ID column(s) in the 1st data file (use comma separated
list for multiple column keys).
--col1
col2: The ID column in the 2nd data file (use comma separated
list for multiple column keys).
The ID column(s) in the 1st data file (use comma separated list
for multiple column keys).
split: The charachter to split columns on.
--col2
pcol1: The columns to print from file 1 (defaults to all).
The ID column in the 2nd data file (use comma separated list for
multiple column keys).
pcol1: The columns to print from file 2 (defaults to all).
--split, -s
The charachter to split columns on.
--pcol1
The columns to print from file 1 (defaults to all).
--pcol2
The columns to print from file 2 (defaults to all).
--uniq, -u
Ensure that output lines are unique.
";


my ($help, $col1, $col2, $split, $pcol1, $pcol2);
my $opt_success = GetOptions('help' => \$help,
'col1=s' => \$col1,
'col2=s' => \$col2,
'split=s' => \$split,
'pcol1=s' => \$pcol1,
'pcol2=s' => \$pcol2,
my ($help, $col1, $col2, $split, $pcol1, $pcol2, $print_uniq);
my $opt_success = GetOptions('help' => \$help,
'col1=s' => \$col1,
'col2=s' => \$col2,
'split|s=s' => \$split,
'pcol1=s' => \$pcol1,
'pcol2=s' => \$pcol2,
'uniq|u' => \$print_uniq,
);

die $usage if ! $opt_success;

if ($help) {
print $usage;
exit(0);
}

$split ||= "\t";
$split = qr|$split|;

die $usage if $help || ! $opt_success;

my ($file1, $file2) = @ARGV;
die $usage unless $file1 && $file2;

Expand All @@ -62,30 +82,34 @@ $col2 ||= 0;
my @cols1 = split /,/, $col1;
my @cols2 = split /,/, $col2;
my @pcols1;
@pcols1 = split /,/, $pcol1 if $pcol1;
@pcols1 = split /,/, $pcol1 if defined $pcol1;
my @pcols2;
@pcols2 = split /,/, $pcol2 if $pcol2;
@pcols2 = split /,/, $pcol2 if defined $pcol2;

my $index = parse_file(\@cols1, $file1);

open (my $IN, '<', $file2) or die "Can't open $file2 for reading\n$!\n";

my %uniq;
while (<$IN>) {

chomp;
my @columns2 = split /$split/, $_;
my $key = join ':', @columns2[@cols2];
next unless $index->{$key};
my @print_columns2 = @pcols2 ? @columns2[@pcols2] : @columns2;
my $column1_set = $index->{$key};
SET:
for my $columns1 (@{$column1_set}) {
my @print_columns1 = @pcols1 ? @{$columns1}[@pcols1] : @{$columns1};
my @print_columns2 = @pcols2 ? @columns2[@pcols2] : @columns2;
print join "\t", (@print_columns1, @print_columns2);
print "\n";
my $output = join "\t", (@print_columns1, @print_columns2);
next SET if $print_uniq && $uniq{$output}++;
print "$output\n";
}

}

exit(0);

#-----------------------------------------------------------------------------
#-------------------------------- SUBROUTINES --------------------------------
#-----------------------------------------------------------------------------
Expand Down
14 changes: 14 additions & 0 deletions bin/random_list
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ With the options described below you can pick a given number of values
from the list and print just those, and you can permute the shuffle
pick sequence a given number of times.
Options:
--permute, -m 1
Lists are randomized by a Fisher-Yates shuffle. The permute
option describes how many rounds of shuffling are done. The
default is 1 which is sufficient for most purposes.
--pick, -p 100
Provide an integer value to --print and the script will print only
that given number of values from the top of the shuffled list.
Default is to print the entire shuffled list.
";

my ($help, $permute, $pick);
Expand Down
76 changes: 76 additions & 0 deletions bin/t/merge_data.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/perl

use strict;
use warnings;

use Test::More;
use FindBin;
use lib "$FindBin::RealBin/../../lib";
use lib "$FindBin::RealBin/../../lib/cpan";
use GAL::Run;

# Run the tests from this script's own directory.  A failed chdir is fatal:
# continuing from some other directory would make any later failure
# misleading.
chdir $FindBin::Bin
    or die "Can't chdir to $FindBin::Bin\n$!\n";

# The script under test is looked up one directory above this test file.
my $path = "$FindBin::Bin/..";

my $tool = GAL::Run->new(
    path    => $path,
    command => 'merge_data',
);

################################################################################
# Testing that merge_data compiles and returns usage statement
################################################################################

# run() is expected to return a false value on success (presumably the exit
# status of the command -- confirm against GAL::Run), hence the negation.
ok(! $tool->run(cl_args => ['--help']), 'merge_data compiles');
like($tool->get_stdout, qr/Synopsis/, 'merge_data prints usage statement');

################################################################################
# Testing that merge_data merges two files and honours --uniq
################################################################################

my $file1 = "$FindBin::Bin/data/file1.txt";
my $file2 = "$FindBin::Bin/data/file2.txt";

# Print column 0 of file 1 and column 1 of file 2, suppressing duplicate
# output lines.
my @cl_args = (
    '--pcol1 0',
    '--pcol2 1',
    '--uniq',
    $file1,
    $file2,
);

ok(! $tool->run(cl_args => \@cl_args), 'merge_data does something');

# Compare the number of output lines with is() rather than ok(... == ...)
# so that a failure reports both the count received and the count expected.
my @lines = split /\n/, $tool->get_stdout;
is(scalar @lines, 8, 'merge_data has the correct line count');

$tool->clean_up;
done_testing();

################################################################################
################################# Ways to Test #################################
################################################################################

__END__
# Various other ways to say "ok"
ok($this eq $that, $test_name);
is ($this, $that, $test_name);
isnt($this, $that, $test_name);
# Rather than print STDERR "# here's what went wrong\n"
diag("here's what went wrong");
like ($this, qr/that/, $test_name);
unlike($this, qr/that/, $test_name);
cmp_ok($this, '==', $that, $test_name);
is_deeply($complex_structure1, $complex_structure2, $test_name);
can_ok($module, @methods);
isa_ok($object, $class);
pass($test_name);
fail($test_name);
BAIL_OUT($why);
Binary file added dev/GAL_Indexing_Benchmarks.xlsx
Binary file not shown.
11 changes: 11 additions & 0 deletions t/data/file1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
1 A
2 B
3 C
4 D
5 E
6 F
7 G
8 H
3 S
4 T
5 U
8 changes: 8 additions & 0 deletions t/data/file2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
1 Q
2 R
3 S
4 T
5 U
6 V
7 W
8 X
4 changes: 4 additions & 0 deletions t/data/test_blend1.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chr22 . SNV 14432513 14432513 . + . ID=1;Variant_seq=C,G;Variant_effect=gene_variant;
chr22 . SNV 14433409 14433409 . + . ID=2;Variant_seq=C,T;Variant_effect=transcript_variant;
chr22 . SNV 14433624 14433624 . + . ID=3;Variant_seq=A,G;Variant_effect=exon_variant;
chr22 . SNV 14433863 14433863 . + . ID=4;Variant_seq=A,G;
4 changes: 4 additions & 0 deletions t/data/test_blend2.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chr22 . SNV 14432513 14432513 . + . ID=1;Variant_seq=C,G;Variant_effect=exon_variant;
chr22 . SNV 14433409 14433409 . + . ID=2;Variant_seq=C,T;Variant_effect=exon_variant,transcript_variant;
chr22 . SNV 14433624 14433624 . + . ID=3;Variant_seq=A,G;
chr22 . SNV 14433863 14433863 . + . ID=4;Variant_seq=A,G;Variant_effect=gene_variant;
1 change: 1 addition & 0 deletions t/data/test_blend3.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr22 . SNV 14433863 14433863 . + . ID=4;Variant_seq=A,G;Variant_effect=missense_variant;test_attribute=5;

0 comments on commit 7ad7b74

Please sign in to comment.