-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpellChecker.cpp
126 lines (83 loc) · 3.16 KB
/
SpellChecker.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "SpellChecker.h"
#include "FileReader.h"
#include "Indexer.h"
#include "Stemmer.h"
// Adds `count_of_word` to the counter stored under `word` in `ngram`,
// creating the entry with that count when the key is not present yet.
//
// The lookup and the insertion are two separate map operations, so the key
// may be inserted by someone else in between. A failed insert() is therefore
// retried through the lookup path instead of silently dropping the count.
//
// NOTE(review): the `+=` on the mapped value is a plain read-modify-write on
// a `size_t`; if this helper is really executed by several threads at once,
// concurrent increments of the same key can still be lost. Confirm whether
// the mapped type should become atomic or the map's update() should be used.
static void add_count_or_init(cds::container::FeldmanHashMap<cds::gc::HP, std::string, size_t, map_traits>& ngram,
                              const std::string& word,
                              size_t count_of_word) {
    for (;;) {
        if (auto gp = ngram.get(word); gp) {
            gp->second += count_of_word;
            return;
        }
        // insert() returns false when it could not add the key (presumably
        // because it already exists); loop back and accumulate instead.
        if (ngram.insert(word, count_of_word)) {
            return;
        }
    }
}
// Stores the two settings used when printing results.
//   n          - threshold or row limit, depending on `n_or_count`.
//   n_or_count - true: print_result() lists words whose index is greater than n;
//                false: print_result() lists n words with the largest index.
SpellChecker::SpellChecker(size_t n, bool n_or_count)
    : n(n),
      n_or_count(n_or_count) {
}
void SpellChecker::spell_check(const std::string& filename) {
init_map(filename);
init_ngrams();
init_map_indices();
}
// Reads `filename` word by word and fills `cds_words_count_index`.
// Each word is passed through stemmer::add_dots_and_clean() first; the first
// element of the stored pair counts occurrences, the second starts at 0 and
// is filled in later by init_map_indices().
void SpellChecker::init_map(const std::string& filename) {
    FileReader reader(filename);
    // read_word() yields an empty optional once there is nothing left to read.
    for (std::optional<std::string> raw = reader.read_word(); raw; raw = reader.read_word()) {
        std::string word = stemmer::add_dots_and_clean(*raw);
        auto entry = cds_words_count_index.get(word);
        if (!entry) {
            // First occurrence: count 1, index not computed yet.
            cds_words_count_index.insert(word, std::make_pair(1, 0));
            continue;
        }
        ++entry->second.first;
    }
}
void SpellChecker::init_ngrams() {
#pragma omp parallel for schedule(static, 500)
for (const auto& [word, count_index] : cds_words_count_index) {
std::vector<std::string> bgrams = stemmer::get_bigrams(word);
std::vector<std::string> tgrams = stemmer::get_trigrams(word);
for (const auto& bgram : bgrams) {
add_count_or_init(cds_bigrams, bgram, count_index.first);
}
for (const auto& tgram : tgrams) {
add_count_or_init(cds_threegrams, tgram, count_index.first);
}
}
}
// Computes the index of every counted word via indexer::get_index(), using
// the bigram and trigram tables, and stores it as the second element of the
// word's entry. Expects init_ngrams() to have filled those tables already
// (spell_check() calls it right before this function).
//
// NOTE(review): the pragma only has an effect when the file is built with
// OpenMP enabled; confirm the compiler accepts a range-based for over this
// map's iterators as the associated loop.
void SpellChecker::init_map_indices() {
#pragma omp parallel for schedule(static, 500)
    for (auto& [token, stats] : cds_words_count_index) {
        auto& index_slot = stats.second;
        index_slot = indexer::get_index(token, cds_bigrams, cds_threegrams);
    }
}
// Dumps every entry of the word map to stdout, one per line, formatted as
//   word <count index>
// Lines end with '\n' and the stream is flushed once at the end, so the
// output is fully written on return without paying for a flush per line.
void SpellChecker::print_map() {
    for (const auto& [word, count_index] : cds_words_count_index) {
        std::cout << word << " <" << count_index.first << " " << count_index.second << ">" << '\n';
    }
    std::cout << std::flush;
}
// Prints up to `n` words with the largest index, highest first, one per line
// formatted as
//   word <index>
// Words are ranked by (index, word) through a max-heap. When the map holds
// fewer than `n` words, all of them are printed; the loop stops as soon as
// the heap is empty, because top()/pop() on an empty queue is undefined.
void SpellChecker::print_result_if_count() {
    std::priority_queue<std::pair<double, std::string>> ranked;
    for (const auto& [word, count_index] : cds_words_count_index) {
        ranked.push({count_index.second, word});
    }

    for (size_t printed = 0; printed < n && !ranked.empty(); ++printed) {
        // Bind by reference: the element stays valid until pop() below.
        const auto& [index, word] = ranked.top();
        std::cout << word << " <" << index << ">" << '\n';
        ranked.pop();
    }
    // One flush for the whole listing instead of one per line.
    std::cout << std::flush;
}
// Prints every word whose index is strictly greater than `n`, one per line,
// formatted as
//   word <count index>
// in the map's iteration order. Lines end with '\n' and the stream is
// flushed once at the end rather than after every line.
void SpellChecker::print_result_if_n() {
    for (const auto& [word, count_index] : cds_words_count_index) {
        if (count_index.second > n) {
            std::cout << word << " <" << count_index.first << " " << count_index.second << ">" << '\n';
        }
    }
    std::cout << std::flush;
}
// Prints the analysis result in the mode chosen at construction:
// threshold mode (index greater than n) when `n_or_count` is set,
// otherwise the n highest-ranked words.
void SpellChecker::print_result() {
    if (!n_or_count) {
        print_result_if_count();
        return;
    }
    print_result_if_n();
}