-
Notifications
You must be signed in to change notification settings - Fork 0
/
simple_good_turing_estimator.c
308 lines (273 loc) · 10.4 KB
/
simple_good_turing_estimator.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/*
*
*
* Simple Good-Turing Frequency Estimator
*
*
* Geoffrey Sampson, with help from Miles Dennis
*
* Department of Informatics
* Sussex University
*
* www.grsampson.net
*
*
* First release: 27 June 1995
* Revised release: 24 July 2000
* This header information revised: 23 March 2005
* Further revised release: 8 April 2008
* Further revised release: 11 July 2008
*
*
* Takes a set of (frequency, frequency-of-frequency) pairs, and
* applies the "Simple Good-Turing" technique for estimating
* the probabilities corresponding to the observed frequencies,
* and P.0, the joint probability of all unobserved species.
* The Simple Good-Turing technique was devised by the late William
* A. Gale of AT&T Bell Labs, and described in Gale & Sampson,
* "Good-Turing Frequency Estimation Without Tears" (JOURNAL
* OF QUANTITATIVE LINGUISTICS, vol. 2, pp. 217-37 -- reprinted in
* Geoffrey Sampson, EMPIRICAL LINGUISTICS, Continuum, 2001).
*
* Anyone is welcome to take copies of this program and use it
* for any purpose, at his or her own risk. If it is used in
* connexion with published work, acknowledgment of Sampson and
* the University of Sussex would be a welcome courtesy.
*
* The program is written to take input from "stdin" and send output
* to "stdout"; redirection can be used to take input from and
* send output to permanent files. The code is in ANSI standard C.
*
* The input file should be a series of lines separated by newline
* characters, where all nonblank lines contain two positive integers
* (an observed frequency, followed by the frequency of that frequency)
* separated by whitespace. (Blank lines are ignored.)
* The lines should be in ascending order of frequency, and must
* begin with frequency 1.
*
* No checks are made for linearity; the program simply assumes that the
* requirements for using the SGT estimator are met.
*
* The output is a series of lines each containing an integer followed
* by a probability (a real number between zero and one), separated by a
* tab. In the first line, the integer is 0 and the real number is the
* estimate for P.0. In subsequent lines, the integers are the
* successive observed frequencies, and the reals are the estimated
* probabilities corresponding to those frequencies.
*
* Later releases cure bugs to which my attention has kindly been
* drawn at different times by Martin Jansche of Ohio State University
* and Steve Arons of New York City. No warranty is given
* as to absence of further bugs.
*
* Fan Yang of Next IT Inc., Spokane, Washington, has suggested to me
* that in the light of his experience with the SGT technique, for some
* data-sets it could be preferable to use the 0.1 significance criterion
* actually used in the experiments reported in the Gale & Sampson
* paper, rather than the 0.05 criterion suggested in that paper
* for the sake of conformity with standard statistical convention.
* (See note 8 of the paper.) Neither Fan Yang nor I have pursued
* this far enough to formulate a definite recommendation; but, in
* order to make it easier for users of the software to experiment
* with alternative confidence levels, the July 2008 release moves
* the relevant "magic number" out of the middle of the program into
* a #define line near the beginning where it is given the constant
* name CONFID_FACTOR. The value 1.96 corresponds to the p < 0.05
* criterion; in order to use the p < 0.1 criterion, 1.96 in the
* #define line should be changed to 1.65.
*
*
*/
#include <stdio.h>
#include <math.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#define TRUE 1
#define FALSE 0
#define MAX_LINE 100
#define MAX_ROWS 200
#define MIN_INPUT 5
#define CONFID_FACTOR 1.96
int r[MAX_ROWS], n[MAX_ROWS];
double Z[MAX_ROWS], log_r[MAX_ROWS], log_Z[MAX_ROWS],
rStar[MAX_ROWS], p[MAX_ROWS];
int rows, bigN;
double PZero, bigNprime, slope, intercept;
int main(void)
{
int readValidInput(void);
void analyseInput(void);
if ((rows = readValidInput()) >= 0)
{
if (rows < MIN_INPUT)
printf("\nFewer than %d input value-pairs\n",
MIN_INPUT);
else
analyseInput();
}
return(TRUE);
}
double sq(double x)
{
return(x * x);
}
int readValidInput(void)
/*
* returns number of rows if input file is valid, else -1
* NB: number of rows is one more than index of last row
*
*/
{
char line[MAX_LINE];
const char* whiteSpace = " \t\n\v\f\r";
int lineNumber = 0;
int rowNumber = 0;
const int error = -1;
while (fgets(line, MAX_LINE, stdin) != NULL && rowNumber < MAX_ROWS)
{
char* ptr = line;
char* integer;
int i;
++lineNumber;
while (isspace(*ptr))
++ptr; /* skip white space at the start of a line */
if (*ptr == '\0')
continue;
if ((integer = strtok(ptr, whiteSpace)) == NULL ||
(i = atoi(integer)) < 1)
{
fprintf(stderr, "Invalid field 1, line %d\n",
lineNumber);
return(error);
}
if (rowNumber > 0 && i <= r[rowNumber - 1])
{
fprintf(stderr,
"Frequency not in ascending order, line %d\n",
lineNumber);
return(error);
}
r[rowNumber] = i;
if ((integer = strtok(NULL, whiteSpace)) == NULL ||
(i = atoi(integer)) < 1)
{
fprintf(stderr, "Invalid field 2, line %d\n",
lineNumber);
return(error);
}
n[rowNumber] = i;
if (strtok(NULL, whiteSpace) != NULL)
{
fprintf(stderr, "Invalid extra field, line %d\n",
lineNumber);
return(error);
}
++rowNumber;
}
if (rowNumber >= MAX_ROWS)
{
fprintf(stderr, "\nInsufficient memory reserved for input\
values\nYou need to change the definition of\
MAX_ROWS\n");
return(error);
}
return(rowNumber);
}
void findBestFit(void)
{
double XYs, Xsquares, meanX, meanY;
double sq(double);
int i;
XYs = Xsquares = meanX = meanY = 0.0;
for (i = 0; i < rows; ++i)
{
meanX += log_r[i];
meanY += log_Z[i];
}
meanX /= rows;
meanY /= rows;
for (i = 0; i < rows; ++i)
{
XYs += (log_r[i] - meanX) * (log_Z[i] - meanY);
Xsquares += sq(log_r[i] - meanX);
}
slope = XYs / Xsquares;
intercept = meanY - slope * meanX;
}
double smoothed(int i)
{
return(exp(intercept + slope * log(i)));
}
int row(int i)
{
int j = 0;
while (j < rows && r[j] < i)
++j;
return((j < rows && r[j] == i) ? j : -1);
}
void showEstimates(void)
{
int i;
printf("intercept %f slope %f\n", intercept, slope);
printf("0\t%.4g\n", PZero);
for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], p[i]);
for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], smoothed(r[i]));
}
void analyseInput(void)
{
int i, j, next_n;
double k, x, y;
int indiffValsSeen = FALSE;
int row(int);
void findBestFit(void);
double smoothed(int);
double sq(double);
void showEstimates(void);
bigN = 0;
for (j = 0; j < rows; ++j)
bigN += r[j] * n[j];
next_n = row(1);
PZero = (next_n < 0) ? 0 : n[next_n] / (double) bigN;
for (j = 0; j < rows; ++j)
{
i = (j == 0 ? 0 : r[j - 1]);
if (j == rows - 1)
k = (double) (2 * r[j] - i);
else
k = (double) r[j + 1];
Z[j] = 2 * n[j] / (k - i);
log_r[j] = log(r[j]);
log_Z[j] = log(Z[j]);
}
for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], Z[i]);
findBestFit();
for (j = 0; j < rows; ++j)
{
y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
if (row(r[j] + 1) < 0)
indiffValsSeen = TRUE;
if (! indiffValsSeen)
{
x = (r[j] + 1) * (next_n = n[row(r[j] + 1)]) /
(double) n[j];
if (fabs(x - y) <= CONFID_FACTOR * sqrt(sq(r[j] + 1.0)
* next_n / (sq((double) n[j]))
* (1 + next_n / (double) n[j])))
indiffValsSeen = TRUE;
else
rStar[j] = x;
}
if (indiffValsSeen)
rStar[j] = y;
}
bigNprime = 0.0;
for (j = 0; j < rows; ++j)
bigNprime += n[j] * rStar[j];
for (j = 0; j < rows; ++j)
p[j] = (1 - PZero) * rStar[j] / bigNprime;
showEstimates();
}