-
Notifications
You must be signed in to change notification settings - Fork 2
/
a2b.pl
120 lines (106 loc) · 3.29 KB
/
a2b.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env perl
######################################
# compare lines between a and b
#
######################################
use strict;
use warnings;
use File::Basename qw(basename dirname);
use File::Spec::Functions qw(catfile catdir);
use Getopt::Std;
use Data::Dumper;
# ---- Main script ----------------------------------------------------------
# Compares two whitespace-delimited text files by a key column and prints a
# small summary table to STDOUT.
#
# Options:
#   -m <a-col>    1-based key column of fileA (default: 1)
#   -n <b-col>    1-based key column of fileB (default: 1)
#   -o <out-dir>  when given, the shared / unique line sets of each input are
#                 written there as <name>_both.<ext> and <name>_only.<ext>
my $usage = "Usage: a2b.pl [-m <a-col>] [-n <b-col>] [-o <out-dir>] fileA fileB\n";

my %opts = (m => 1, n => 1);
# getopts() is the Getopt::Std call that understands the "x:" notation;
# getopt() would treat ':' itself as an option letter and silently accept
# unknown switches.
getopts('m:n:o:', \%opts) or die $usage;
die $usage if @ARGV != 2;

my $file_a = shift @ARGV;
my $file_b = shift @ARGV;

my ($stat, $a_only, $b_only, $a_both, $b_both) =
    cmp_both_files($file_a, $file_b, $opts{m}, $opts{n});
print $stat, "\n";

if ($opts{o}) {
    my $outdir = $opts{o};
    unless (-d $outdir) {
        mkdir $outdir
            or die "Error: cannot create directory [$outdir]: $!\n";
    }

    # Same write order as before: A-both, A-only, B-both, B-only.
    my @outputs = (
        [ $file_a, 'both', $a_both ],
        [ $file_a, 'only', $a_only ],
        [ $file_b, 'both', $b_both ],
        [ $file_b, 'only', $b_only ],
    );

    my %seen_path;
    for my $output (@outputs) {
        my ($source, $tag, $content) = @{$output};
        my $base = basename($source);

        # Insert the tag before the extension; a name without an extension
        # simply gets the tag appended instead of producing "_both.".
        my ($prefix, $ext) = $base =~ /(.*)\.(\w+)$/;
        my $out_name = defined $ext ? "${prefix}_$tag.$ext" : "${base}_$tag";
        my $out_path = catfile($outdir, $out_name);

        # Inputs sharing a basename map to the same output names.
        warn "Warning: [$out_path] is written more than once; "
            . "earlier content is overwritten\n"
            if $seen_path{$out_path}++;

        write_file($content, $out_path);
    }
}
# Subroutines #
# write_file($var, $file)
#
# Writes the string $var followed by a single newline to $file, truncating
# any existing content.
#
#   $var   text to write
#   $file  destination path, used verbatim
#
# Dies with the file name and OS error if the file cannot be opened,
# written or closed. Returns 1 on success.
sub write_file {
    my ($var, $file) = @_;

    # Three-argument open keeps the path from being parsed for mode
    # characters or stripped of surrounding whitespace.
    open my $out, '>', $file
        or die "Error: cannot open [$file] for writing: $!\n";
    print {$out} $var, "\n"
        or die "Error: cannot write to [$file]: $!\n";
    # Buffered write errors only surface at close time.
    close $out
        or die "Error: cannot close [$file]: $!\n";

    return 1;
}
# cmp_both_files($path_a, $path_b, $col_a, $col_b)
#
# Loads both files through readfile(), keyed by the given columns, and
# partitions the lines of each file into those whose key also occurs in the
# other file ("both") and those whose key does not ("only").
#
#   $path_a, $path_b  input file paths
#   $col_a,  $col_b   key column passed to readfile() for each file
#
# Returns a five-element list:
#   ($stat, $a_only, $b_only, $a_both, $b_both)
# where $stat is a tab-separated summary table (header, Total, Both, Uniq)
# and the other four are the matching lines, string-sorted and joined with
# "\n" (an empty string when there are none).
sub cmp_both_files {
    # Deliberately not named $a/$b: lexicals with those names shadow the
    # package variables that sort() relies on.
    my ($path_a, $path_b, $col_a, $col_b) = @_;

    my %id_a = readfile($path_a, $col_a);
    my %id_b = readfile($path_b, $col_b);

    my (@a_both, @a_only, @b_both, @b_only);

    # Lines of A, split by whether their key is present in B.
    for my $key (keys %id_a) {
        if (exists $id_b{$key}) {
            push @a_both, $id_a{$key};
        }
        else {
            push @a_only, $id_a{$key};
        }
    }

    # Lines of B, split by whether their key is present in A.
    for my $key (keys %id_b) {
        if (exists $id_a{$key}) {
            push @b_both, $id_b{$key};
        }
        else {
            push @b_only, $id_b{$key};
        }
    }

    # Summary counts; keys are unique per file, so the shared count is the
    # same from either side.
    my $len_a      = scalar keys %id_a;
    my $len_b      = scalar keys %id_b;
    my $len_both   = scalar @a_both;
    my $len_a_only = $len_a - $len_both;
    my $len_b_only = $len_b - $len_both;

    my $a_name = basename($path_a);
    my $b_name = basename($path_b);

    my $stat_out = join "\n",
        "\#\t$a_name\t$b_name",
        "Total:\t$len_a\t$len_b",
        "Both:\t$len_both\t$len_both",
        "Uniq:\t$len_a_only\t$len_b_only";

    my $text_a_only = join "\n", sort @a_only;
    my $text_b_only = join "\n", sort @b_only;
    my $text_a_both = join "\n", sort @a_both;
    my $text_b_both = join "\n", sort @b_both;

    return ($stat_out, $text_a_only, $text_b_only, $text_a_both, $text_b_both);
}
# readfile($in, $col)
#
# Reads $in line by line and indexes each line by the value found in its
# $col-th whitespace-separated field.
#
#   $in   path of the file to read
#   $col  1-based column number holding the key
#
# Blank lines and lines starting with '#' are skipped. Fields are split on
# runs of whitespace, so a line with leading whitespace has an empty first
# field.
#
# Returns a flat key => line list (assign it to a hash); each line is stored
# without its trailing newline.
#
# Dies if $col is not a positive integer, if the file cannot be opened, if a
# data line has no such column, or if a key occurs more than once.
sub readfile {
    my ($in, $col) = @_;

    # Column 0 or a negative column would silently index from the end of
    # the row, so reject anything that is not a positive integer.
    die "Error: column must be a positive integer, got ["
        . (defined $col ? $col : 'undef') . "]\n"
        unless defined $col && $col =~ /\A[0-9]+\z/ && $col >= 1;
    my $col_index = $col - 1;

    open my $fh, '<', $in
        or die "Error: cannot open [$in] for reading: $!\n";

    my %line_of;
    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ /(^\s*$)|(^\#)/;

        my @fields = split /\s+/, $line;
        my $name   = $fields[$col_index];
        die "Error: line $. of [$in] has no column [$col]\n"
            unless defined $name;

        # Keys must be unique within a file.
        die "Error: found replicates in column [$col] of [$in]\n"
            if exists $line_of{$name};

        $line_of{$name} = $line;
    }
    close $fh;

    return %line_of;
}