-
Notifications
You must be signed in to change notification settings - Fork 0
/
wranglR.r
196 lines (175 loc) · 8.28 KB
/
wranglR.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
################ Within-Rank Analysis of Novel Groups by Likeness in R ###############
###################### 1987 Nei-Saitou-inspired special edition ######################
# Read map and taxonomy files in old QIIME format (no comments), OR provide directly
read_map_taxa = function(mapPath, taxaPath, GroupCol, taxThres) {
if (length(mapPath) > 1) { map = mapPath; }
else if (!length(mapPath)) { map = 0; }
else map = read.delim(mapPath,sep='\t',comment='',head=T,row.names=1,check.names=F);
if (length(taxaPath) > 1) { taxa = taxaPath; } else taxa =
read.delim(taxaPath,sep='\t',comment='',head=T,row.names=1,check.names=F);
if (map) {
# map = map[!is.na(match(rownames(map),colnames(taxa))),]; # asymmetry
taxa = as.matrix(taxa[,match(rownames(map),colnames(taxa))]);
}
if (length(GroupCol) > 1) { Groups = GroupCol; } else Groups = map[[GroupCol]];
return(list(map = map, XT = taxa[rowMeans(taxa) > taxThres,], Groups = factor(Groups)));
}
# Setup function; this generates initial matrices upon which later algorithms operate
setup = function(mapPath, taxaPath, GroupCol, taxThres = 0, randSubset = 0, mat.tt = NULL,
A=T, hardCut = F, shift = 1) {
data = read_map_taxa(mapPath, taxaPath, GroupCol, taxThres);
map = data$map; XT = data$XT; Groups = data$Groups;
num = dim(XT)[1]; area = num * num; mdim = num; whole = seq(1,dim(XT)[2]);
G1ix = Groups == levels(Groups)[1]; G2ix = Groups == levels(Groups)[2];
if (randSubset) { # Randomly subset a portion of samples to examine
whole = sample(seq(1,dim(XT)[2]),dim(XT)[2]);
part = whole[seq(1,length(whole)*randSubset)];
XT = XT[,part]; Groups = Groups[part];
}
# Generate baseline t-test p-values between (binary) groups for each feature
baseline.tt = apply(XT,1,function(xx) t.test(xx[G1ix], xx[G2ix])$p.value);
# Generate matrix of pairwise additive t-test p values for each feature pair
if (is.null(mat.tt)) {
print("Generating baseline matrix...");
mat.tt = matrix(rep(0,area),num); count = 0; timer = 0;
for (i in 1:num) {
for (j in i:num) {
mat.tt[i,j] = t.test(XT[i,G1ix] + XT[j,G1ix], XT[i,G2ix] + XT[j,G2ix])$p.value;
mat.tt[j,i] = mat.tt[i,j];
q = ifelse(j == i, 1, 2);
count = count + q; timer = timer + q;
}
if (timer >= area*0.1)
{ print(paste("Progress:",count/area*100,"%...")); timer = 0; }
}
}
print("Baseline matrix generated.");
# Calculate pairwise distances based on ratio of improvement
mat.tt.dist = matrix(rep(0,area),num);
for (i in 1:mdim) for (j in i:mdim) {
if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j]+mat.tt[i,j]/baseline.tt[i]);
if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = shift +
ifelse(hardCut, 0, log10(mat.tt.dist[i,j]));
mat.tt.dist[j,i] = mat.tt.dist[i,j];
}
# stor = mat.tt.dist;
# mat.tt.dist = apply(mat.tt,1,function(xx) xx / baseline.tt)
# for (j in 1:mdim) for (k in j:mdim) {
# mat.tt.dist[j,k] = 0.5*(mat.tt.dist[j,k]+mat.tt.dist[k,j]);
# if (mat.tt.dist[j,k] > 1) mat.tt.dist[j,k] = 1+log10(mat.tt.dist[j,k]);
# mat.tt.dist[k,j]=mat.tt.dist[j,k];
# }
# print(paste("premat similarity:",sum(mat.tt.dist==stor)/length(stor)))
# Calculate D, the Nei-Saitou neighbor-finder, based on improvement distances
D = matrix(rep(0,area),num);
r = rowSums(mat.tt.dist);
for (i in 1:mdim) for (j in i:mdim) {
d = mat.tt.dist[i,j];
ri = 1 / (mdim - 2) * r[i] - d;
rj = 1 / (mdim - 2) * r[j] - d;
D[i,j] = d - ri - rj;
D[j,i] = D[i,j];
}
## faster version, only use if not reusing ri, rj in later dist calculation
# for (i in 1:mdim) for (j in i:mdim) {
# D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
# D[j,i] = D[i,j];
# }
for (i in 1:mdim) D[i,i] = Inf; # Set diagonal to infinity to avert self-selection
return(list(G1ix = G1ix, G2ix = G2ix, XT = XT, baseline.tt = baseline.tt, mat.tt = mat.tt,
mat.tt.dist = mat.tt.dist, D = D, r = r, randSubset = randSubset, whole = whole));
}
# The main function to be called from R or commandline; only binary groups are supported
wranglR = function(mapPath, taxaPath, GroupCol = "Group", taxThres = 0, randSubset = 0, dat=NA,
A=T, hardCut = F, shift = 1) {
if (!is.na(dat)) { dt = dat; }
else dt = setup(mapPath, taxaPath, GroupCol, taxThres, randSubset, NULL, A, hardCut, shift);
G1ix = dt$G1ix; G2ix = dt$G2ix; XT = dt$XT; baseline.tt = dt$baseline.tt; D = dt$D;
mat.tt = dt$mat.tt; mat.tt.dist = dt$mat.tt.dist; randSubset = dt$randSubset; r = dt$r;
num = dim(XT)[1]; mdim = num; # Cache the initial number of species/entries
iter = 1; found = 0;
numTests = num*(num-1)/2 + num; # The number of t-tests performed thus far
while (mdim > 2) {
pair = which.min(D);
px = ceiling(pair/mdim); py = pair - mdim * (px - 1);
low = min(px,py); high = max(px, py); # to shrink matrix later
tt.both = mat.tt[low,high];
if (low == high) { print(paste("match exact at:",low)); break; } # Error condition
found = found + 1;
XT[low,] = XT[low,] + XT[high,]; # Merge rows into lower ix row
# distG = 0.5 * (mat.tt.dist[low,high] + (1 / (mdim - 2) * r[low] -
# mat.tt.dist[low,high]) - (1 / (mdim - 2) * r[high] - mat.tt.dist[low,high]));
distG = mat.tt.dist[low,high];
print(paste("found:",found,"at:",low,high,"Dist:",mat.tt.dist[low,high],
"New, l, h:",tt.both,baseline.tt[low], baseline.tt[high]));
# Assign the new row a Newick name with distances according to Nei-Saitou NJ
rownames(XT)[low] = paste('(',rownames(XT)[low],':',distG,',',
rownames(XT)[high],':',distG,')',sep='');
# Remove the trailing extra row (and column, for the matrices based on XT)
XT = XT[-high,];
mat.tt = mat.tt[-high,-high];
mat.tt.dist = mat.tt.dist[-high,-high];
baseline.tt = baseline.tt[-high];
mdim = mdim - 1;
# The baseline of this row becomes this round's combined score
baseline.tt[low] = tt.both;
for (j in 1:mdim) {
mat.tt[low,j] = t.test(XT[low,G1ix] + XT[j,G1ix],XT[low,G2ix] + XT[j,G2ix])$p.value;
if (A) { mat.tt.dist[low,j] = mat.tt[low,j] / (.5 * (baseline.tt[j] + baseline.tt[low])); }
else mat.tt.dist[low,j] = 0.5*(mat.tt[low,j]/baseline.tt[j]+mat.tt[low,j]/baseline.tt[low]);
if (mat.tt.dist[low,j] > 1) mat.tt.dist[low,j] = shift +
ifelse(hardCut, 0, log10(mat.tt.dist[low,j]));
}
mat.tt[,low] = mat.tt[low,] # row and col the same
mat.tt.dist[,low] = mat.tt.dist[low,];
# stor = mat.tt.dist;
# mat.tt.dist = matrix(rep(0,mdim*mdim),mdim);
# for (i in 1:mdim) for (j in i:mdim) {
# if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
# else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j]+mat.tt[i,j]/baseline.tt[i]);
# if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = 1 + log10(mat.tt.dist[i,j]);
# mat.tt.dist[j,i] = mat.tt.dist[i,j];
# }
# print(sum(mat.tt.dist==stor)/length(stor))
# This bit repeats the D calculation code. Functionalize?
D = matrix(rep(0,mdim*mdim),mdim);
r = rowSums(mat.tt.dist);
for (i in 1:mdim) for (j in i:mdim) {
d = mat.tt.dist[i,j];
ri = 1 / (mdim - 2) * r[i] - d;
rj = 1 / (mdim - 2) * r[j] - d;
D[i,j] = d - ri - rj;
D[j,i] = D[i,j];
}
# for (i in 1:mdim) for (j in i:mdim) {
# D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
# D[j,i] = D[i,j];
# }
for (i in 1:mdim) D[i,i] = Inf; # Set diagonal to infinity
numTests = numTests + mdim - 1; # Increment the number of tests performed
iter = iter + 1;
}
names(baseline.tt) = rownames(XT); # Assign new names back to tt object
distG = mat.tt.dist[1,2]; # Set the final distance to the remaining
tree_pre = paste('(',rownames(XT)[1],':',distG,',',rownames(XT)[2],':',distG,')',sep='');
tree = paste(gsub(";","-",tree_pre),";",sep="");
tree = gsub('[[]',"<",tree);
tree = gsub('[]]',">",tree);
tree = gsub(" ","_",tree);
return(list(tree = tree, baseline.tt = baseline.tt, combinedData = XT));
}
# Re-combine any dataset of same row order from given path (w/o branch lengths)
fitTree = function(YT, path) {
endp = length(path) / 2;
for (i in 1:endp) {
high = path[i*2]; low = path[i*2 - 1];
YT[low,] = YT[low,] + YT[high,];
rownames(YT)[low] = paste('(',rownames(YT)[low],',',
rownames(XT)[high],')',sep='');
YT = YT[-high,];
}
}
# Cuts the tree at a given branch depth and returns subtrees with paths
cutTree = function (XT, depth, useDist = F) {
}