-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathem.c
169 lines (132 loc) · 4.03 KB
/
em.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <string.h>
#include <math.h>
#include "d_model.h"
static double comp_ll (em_data *data)
{
double logsum = 0.0;
int n,i,j;
all_f *arr = data->arr;
for (n = 0; n < arr->poscount; ++n) {
pos_f p = arr->a[n];
for (i = 0; i < p.num; ++i) {
double freq = p.pos[i];
double possum = 0.0;
for( j = 0; j < 3; ++j ) {
possum = possum + ( data->a[j] * prob_norm( data->m[j], data->w[j], freq ));
}
possum = possum + data->a[3] * prob_unif( arr->minfrac, freq );
if (possum == 0.0) possum = DBL_MIN;
logsum = logsum + log( possum );
}
}
return logsum;
}
static void iter_EM (em_data *data, double *x)
{
double tmp;
all_f *arr = data->arr;
int i, j;
/* i: number of positions, j: number of normals in mixture */
double g[3];
/* stores each row g[j] in M[i][j] matrix while iterating i */
memset(g, 0, sizeof(double) * 3 );
double cs[3];
/* calculates column sums cs[j] of M[i][j] */
memset(cs, 0, sizeof(double) * 3 );
double csx[3];
/* calculates column sum of g[j] * x[i] */
memset(csx, 0, sizeof(double) * 3 );
double csd[3];
/* calculates column sum of g[j] * (x[i] - m[j])^2 */
memset(csd, 0, sizeof(double) * 3 );
double sum = 0;
/* calculates sum of all values in M[i][j] */
for (i = 0; i < arr->basecount; ++i) {
/*
This loop is the E-step of EM and computes the sums of the
posterior gamma(j) that will be neccessary for the
calculations done in M.
*/
double rs = 0;
/* row sums of g[j] for normalization */
for( j = 0; j < 3; ++j ) {
g[j] = data->a[j] * ((tmp = prob_norm( data->m[j], data->w[j], x[i] )) == 0 ? DBL_MIN : tmp);
rs = rs + g[j];
}
double ug;
/* ug stores the positionwise posterior of the uniform mixture component*/
ug = data->a[3] * prob_unif( arr->minfrac, x[i] );
/* ug needs to be included in the row sum, it can be thought of as a fourth column */
rs = rs + ug;
for( j = 0; j < 3; ++j ) {
g[j] = g[j] / rs;
cs[j] = cs[j] + g[j];
csx[j] = csx[j] + (g[j] * x[i]);
csd[j] = csd[j] + (g[j] * pow(x[i] - data->m[j],2));
sum = sum + g[j];
}
/* ug of course also is included in the total sum */
sum = sum + (ug / rs);
}
for( j = 0; j < 3; ++j ) {
/*
This loop is the M-step of EM which gets new estimates
for a, m and w based on the different sums calculated in E.
*/
double a = cs[j] / sum;
double m = (1.0 / cs[j]) * csx[j];
double w = sqrt( (1.0 / cs[j]) * csd[j] );
if ((data->f >> 0) & 1 && !isnan(a))
data->a[j] = a;
if ((data->f >> 1) & 1 && !isnan(m))
data->m[j] = m;
if ((data->f >> 2) & 1 && !isnan(w))
data->w[j] = w;
}
/* ua is the proportion of the uniform mixture that is included in the
* em_data struct at the fourth position of the "a" array a[3]. */
double ua = 1 - (data->a[0] + data->a[1] + data->a[2]);
if ((data->f >> 3) & 1 && !isnan(ua))
data->a[3] = ua;
}
void run_EM (em_data *data)
{
double ll = comp_ll(data);
double delta_ll = 1;
double last = ll;
all_f *arr = data->arr;
double *x = malloc( sizeof(double) * arr->basecount );
if (flatten_arr( x, arr ) == 0) {
fprintf(stderr, "problem allocating Data in EM\n");
return;
}
int c = 0;
while ( delta_ll > 0.01 ) {
iter_EM(data, x);
if (c % 10 == 0) {
last = ll;
ll = comp_ll(data);
delta_ll = ll - last;
}
// if (c % 100 == 0) fprintf(stderr, "EM iteration %d, delta loglik: %f\n", c, delta_ll);
++c;
}
// fprintf(stderr, "EM iteration %d, delta loglik: %f\n", c, delta_ll);
free(x);
data->ll = ll;
}
void setup_EMstr (em_data *data,
const double a[4],
const double m[3],
const double w[3],
const int f)
{
memcpy(data->a, a, sizeof a[0] * 4);
memcpy(data->m, m, sizeof m[0] * 3);
memcpy(data->w, w, sizeof w[0] * 3);
data->f = f;
}