forked from erelsgl-nlp/languagemodel
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLanguageModel.js
178 lines (150 loc) · 6.17 KB
/
LanguageModel.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
var util = require("util");
var logSumExp = require('./logSumExp');
/**
 * This class represents a simple, unigram-based language model.
 * Based on:
 *
 * Leuski Anton, Traum David. A Statistical Approach for Text Processing in Virtual Humans. Tech. rep., University of Southern California, Institute for Creative Technologies, 2008.
 * http://www.citeulike.org/user/erelsegal-halevi/article/12540655
 *
 * Sentences are represented as hashes {word1: count1, word2: count2, ...}.
 * The key "_total" is reserved: it caches the sum of the counts of a sentence
 * and is never treated as a word when training or scoring whole sentences.
 *
 * @author Erel Segal-Halevi
 * @since 2013-08
 *
 * @param opts - optional; may contain the following options:
 *  * smoothingCoefficient - the lambda-factor for smoothing the unigram probabilities.
 *    A missing or falsy value (including 0) falls back to 0.9.
 */
var LanguageModel = function(opts) {
	var options = opts || {}; // allow "new LanguageModel()" without arguments
	this.smoothingCoefficient = options.smoothingCoefficient || 0.9;
};

/** Reserved key that holds the cached total count of a sentence / of the dataset. */
var TOTAL_KEY = "_total";

/**
 * Own-property test that ignores members inherited from Object.prototype,
 * so that words such as "constructor" or "toString" are handled like any other word.
 */
function hasOwn(object, key) {
	return Object.prototype.hasOwnProperty.call(object, key);
}

LanguageModel.prototype = {
	/**
	 * Train the language model with all the given documents.
	 * Replaces any previously trained state.
	 *
	 * Side effects: the dataset is stored by reference, and each sentence object
	 * gets a "_total" entry with the sum of its word counts.
	 * Training on a dataset with no words leaves a total of 0, so the global
	 * smoothing factor (1/total) is not finite.
	 *
	 * @param dataset
	 *   an array with hashes of the format:
	 *   {word1:count1, word2:count2,...}
	 *   each object represents a sentence (it should be tokenized in advance).
	 */
	trainBatch : function(dataset) {
		// calculate counts for equation (3):
		var mapWordToTotalCount = {};
		var totalNumberOfWordsInDataset = 0;
		for (var i in dataset) {
			if (!hasOwn(dataset, i)) continue;
			var datum = dataset[i];
			var totalPerDatum = 0;
			// for each input sentence, count the total number of words in it:
			for (var word in datum) {
				if (!hasOwn(datum, word)) continue;
				// A "_total" left by a previous training run is bookkeeping, not a word.
				if (word === TOTAL_KEY) continue;
				var count = datum[word];
				// Plain addition (not "|= 0") so fractional or very large counts are not truncated to int32.
				mapWordToTotalCount[word] = (hasOwn(mapWordToTotalCount, word)? mapWordToTotalCount[word]: 0) + count;
				totalPerDatum += count;
			}
			datum[TOTAL_KEY] = totalPerDatum;
			totalNumberOfWordsInDataset += totalPerDatum;
		}
		mapWordToTotalCount[TOTAL_KEY] = totalNumberOfWordsInDataset;
		this.dataset = dataset;
		this.mapWordToTotalCount = mapWordToTotalCount;

		// calculate smoothing factor for equation (3):
		var mapWordToSmoothingFactor = {};
		for (var key in mapWordToTotalCount) {
			mapWordToSmoothingFactor[key] =
				(1-this.smoothingCoefficient) * mapWordToTotalCount[key] / totalNumberOfWordsInDataset;
		}
		this.mapWordToSmoothingFactor = mapWordToSmoothingFactor;
		this.globalSmoothingFactor = (1/totalNumberOfWordsInDataset); // a global smoother, for totally unseen words.
	},

	/**
	 * @return the map of all words in the training Dataset, each word with its total count in the Dataset.
	 *         The map also holds the grand total under the reserved key "_total".
	 */
	getAllWordCounts: function() {
		return this.mapWordToTotalCount;
	},

	/**
	 * @param sentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @return the log-probability of that sentence, given the model built from the Dataset.
	 */
	logProbSentenceGivenDataset: function(sentenceCounts) { // (2) log P(w1...wn) = ...
		var logProducts = [];
		for (var i in this.dataset) {
			if (!hasOwn(this.dataset, i)) continue;
			logProducts.push(this.logProbSentenceGivenSentence(sentenceCounts, this.dataset[i]));
		}
		var logSentenceLikelihood = logSumExp(logProducts);
		// The last term (the division by the dataset size) is not needed in practice (see eq. (5)).
		return logSentenceLikelihood - Math.log(this.dataset.length);
	},

	/**
	 * @param sentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @param givenSentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @return the (smoothed) log product probabilities that the words in sentenceCounts appear in the givenSentenceCounts.
	 */
	logProbSentenceGivenSentence: function(sentenceCounts, givenSentenceCounts) {
		var logProduct = 0;
		for (var word in sentenceCounts) {
			// The optional "_total" entry is a cached sum, not a word of the sentence.
			if (word === TOTAL_KEY || !hasOwn(sentenceCounts, word)) continue;
			logProduct += sentenceCounts[word] * this.logProbWordGivenSentence(word, givenSentenceCounts);
		}
		return logProduct;
	},

	/**
	 * @param word a word from the INPUT domain.
	 * @param givenSentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 *        If "_total" is missing, it is computed as the sum of all the counts.
	 * @return the log of the (smoothed) probability that the word appears in the sentence.
	 * @throws Error if givenSentenceCounts is not an object, or if the probability is NaN.
	 */
	logProbWordGivenSentence: function(word, givenSentenceCounts) { // (3) p_s(w) =~ pi_s(w) = ...
		if (givenSentenceCounts!==Object(givenSentenceCounts))
			throw new Error("expected givenSentenceCounts to be an object, but found "+JSON.stringify(givenSentenceCounts));

		var totalGivenSentenceCounts;
		if (hasOwn(givenSentenceCounts, TOTAL_KEY)) {
			totalGivenSentenceCounts = givenSentenceCounts[TOTAL_KEY];
		} else {
			totalGivenSentenceCounts = Object.keys(givenSentenceCounts).reduce(function(sum, key) {
				return sum + givenSentenceCounts[key];
			}, 0);
		}

		// Each term is added only when it exists: a word that is missing from the sentence
		// contributes no sentence term, and a word that is missing from the training
		// vocabulary contributes no dataset term (instead of "undefined", which gave NaN).
		var prob = 0;
		if (hasOwn(givenSentenceCounts, word))
			prob = this.smoothingCoefficient * givenSentenceCounts[word] / totalGivenSentenceCounts;
		if (hasOwn(this.mapWordToSmoothingFactor, word))
			prob += this.mapWordToSmoothingFactor[word];
		prob += this.globalSmoothingFactor;

		if (isNaN(prob)) {
			// Dump the model state to help diagnose the bad input before failing.
			console.log(util.inspect(this,{depth:3}));
			throw new Error("logProbWordGivenSentence("+word+", "+JSON.stringify(givenSentenceCounts)+") is NaN!");
		}
		return Math.log(prob);
	},

	/**
	 * @return a plain object with the trained state of the model, suitable for JSON.stringify.
	 *         It includes the smoothingCoefficient that the smoothing factors were computed with.
	 */
	toJSON: function() {
		return {
			smoothingCoefficient: this.smoothingCoefficient,
			mapWordToTotalCount: this.mapWordToTotalCount,
			mapWordToSmoothingFactor: this.mapWordToSmoothingFactor,
			globalSmoothingFactor: this.globalSmoothingFactor,
			dataset: this.dataset,
		};
	},

	/**
	 * Restore the trained state from an object produced by toJSON.
	 *
	 * @param json an object with the fields mapWordToTotalCount, mapWordToSmoothingFactor,
	 *        globalSmoothingFactor and dataset; smoothingCoefficient is optional - when it is
	 *        missing, the current coefficient of this model is kept.
	 * @return this model.
	 * @throws Error if json is not an object.
	 */
	fromJSON: function(json) {
		if (json!==Object(json))
			throw new Error("expected json to be an object, but found "+JSON.stringify(json));
		if (json.smoothingCoefficient !== undefined)
			this.smoothingCoefficient = json.smoothingCoefficient;
		this.mapWordToTotalCount = json.mapWordToTotalCount;
		this.mapWordToSmoothingFactor = json.mapWordToSmoothingFactor;
		this.globalSmoothingFactor = json.globalSmoothingFactor;
		this.dataset = json.dataset;
		return this;
	},
};
module.exports = LanguageModel;
// Demo / self-check: runs only when this file is executed directly
// (e.g. "node LanguageModel.js"), not when it is loaded with require().
if (require.main === module) {
	console.log("LanguageModel.js demo start");

	var model = new LanguageModel({
		smoothingCoefficient : 0.9, // the option the constructor actually reads
	});

	var wordcounts = require('./wordcounts');
	model.trainBatch([
		wordcounts("I want aa"),
		wordcounts("I want bb"),
		wordcounts("I want cc")
	]);

	/**
	 * Print a warning if the probability that the model gives to the sentence
	 * differs from the expected probability by more than 1% (relative error).
	 */
	var assertProbSentence = function(sentence, expected) {
		var p = Math.exp(model.logProbSentenceGivenDataset(wordcounts(sentence)));
		if (Math.abs(p-expected)/expected>0.01) {
			console.warn("p("+sentence+") = "+p, " should be "+expected);
		}
	};

	// NOTE(review): these expected values look like they were computed without the
	// globalSmoothingFactor term that logProbWordGivenSentence adds to every word
	// (e.g. 0.9*1/3 + 0.1*3/9 = 1/3 for "I") - confirm whether the warnings are intended.
	assertProbSentence("I", 1/3);
	assertProbSentence("I want", 1/9);
	assertProbSentence("I want aa", 0.0123456);
	assertProbSentence("I want aa bb",0.00026);
	assertProbSentence("I want aa bb cc",0.00000427);

	console.log("LanguageModel.js demo end");
}