-
Notifications
You must be signed in to change notification settings - Fork 0
/
DatasetSentences.cpp
88 lines (80 loc) · 2.76 KB
/
DatasetSentences.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#include "DatasetSentences.h"
#include <iostream>
#include<fstream>
#include<cstdio>
#include <ctype.h>
#include<string.h>
using namespace std;
/**
* Map built from the sentence file ("Preprocessing.txt", formerly "datasetSentences.txt") that stores
* only the sentences used for training, together with their index. One can find which sentences belong
* to this split by looking in datasetSplit.txt and retrieving the sentences annotated with 1.
*/
// Nothing to release explicitly: the destructor performs no work of its own.
DatasetSentences::~DatasetSentences() = default;
namespace {

// Removes one trailing '\r' so that files saved with CRLF line endings parse
// exactly like files with LF line endings.
void stripTrailingCarriageReturn(std::string& text) {
    if (!text.empty() && text.back() == '\r') {
        text.pop_back();
    }
}

// Parses text[begin, end) as a non-negative decimal integer into `value`.
// Returns false (leaving `value` untouched) when the range is empty, contains
// a non-digit character, or is too long to be accumulated without overflow.
bool parseDecimal(const std::string& text,
                  std::string::size_type begin,
                  std::string::size_type end,
                  long long& value) {
    // 18 decimal digits always fit in a signed 64-bit integer.
    const std::string::size_type kMaxDigits = 18;
    if (begin >= end || end - begin > kMaxDigits) {
        return false;
    }
    long long parsed = 0;
    for (std::string::size_type i = begin; i < end; ++i) {
        const char c = text[i];
        if (c < '0' || c > '9') {
            return false;
        }
        parsed = parsed * 10 + (c - '0');
    }
    value = parsed;
    return true;
}

}  // namespace

/**
 * Builds the sentence map from two files that are read in lockstep, one line
 * of each per iteration:
 *   - "Preprocessing.txt": lines of the form "<index>\t<sentence>";
 *   - "stanfordSentimentTreebank/datasetSplit.txt": lines of the form
 *     "<index>,<split label>".
 * The first line of each file is skipped. A sentence is stored (lower-cased,
 * with tab characters removed) when both lines carry the same index and the
 * split label is 1. Every stored sentence is also appended to "debug.txt".
 *
 * Lines whose numeric fields are missing or malformed are skipped. If either
 * input file cannot be opened, a message is appended to "debug.txt" and the
 * map is left empty.
 */
DatasetSentences::DatasetSentences() {
    // Alternative (raw) input kept for reference:
    //ifstream input("stanfordSentimentTreebank/datasetSentences.txt");
    std::ifstream input("Preprocessing.txt");
    std::ifstream input_datasplit("stanfordSentimentTreebank/datasetSplit.txt");
    std::ofstream outputFile("debug.txt", std::ios_base::app);

    if (!input.is_open() || !input_datasplit.is_open()) {
        outputFile << "Could not open Preprocessing.txt or "
                      "stanfordSentimentTreebank/datasetSplit.txt; "
                      "the sentence dataset is empty." << '\n';
        return;
    }

    // Only sentences carrying this label in datasetSplit.txt are kept.
    const long long kStoredSplitLabel = 1;

    std::string line;
    std::string line_dataset;

    // Skip the first line of each file.
    std::getline(input, line);
    std::getline(input_datasplit, line_dataset);

    // Stop as soon as either file runs out, so a short split file can never
    // cause a stale split line to be reused for later sentences.
    while (std::getline(input, line) && std::getline(input_datasplit, line_dataset)) {
        stripTrailingCarriageReturn(line);
        stripTrailingCarriageReturn(line_dataset);

        // Sentence line: "<index>\t<sentence>"; a line without a tab is
        // treated as an index with an empty sentence.
        const std::string::size_type tab = line.find('\t');
        const std::string::size_type indexEnd =
            (tab == std::string::npos) ? line.size() : tab;

        // Split line: "<index>,<label>".
        const std::string::size_type comma = line_dataset.find(',');
        if (comma == std::string::npos) {
            continue;
        }

        long long number = 0;   // index from the sentence file
        long long number1 = 0;  // index from the split file
        long long number2 = 0;  // split label
        if (!parseDecimal(line, 0, indexEnd, number) ||
            !parseDecimal(line_dataset, 0, comma, number1) ||
            !parseDecimal(line_dataset, comma + 1, line_dataset.size(), number2)) {
            continue;
        }
        if (number != number1 || number2 != kStoredSplitLabel) {
            continue;
        }

        // Lower-case the sentence text; any further tab characters are dropped.
        std::string word;
        if (tab != std::string::npos) {
            word.reserve(line.size() - tab - 1);
            for (std::string::size_type i = tab + 1; i < line.size(); ++i) {
                const char c = line[i];
                if (c == '\t') {
                    continue;
                }
                // tolower requires a value representable as unsigned char;
                // passing a negative plain char is undefined behaviour.
                word += static_cast<char>(tolower(static_cast<unsigned char>(c)));
            }
        }

        sentences.insert(std::make_pair(word, number));
        // '\n' instead of endl: avoids flushing the log after every sentence;
        // the stream is flushed when it is closed at the end of the constructor.
        outputFile << word << " " << number << '\n';
    }
}
/**
 * Looks up the index stored for the given phrase.
 *
 * @param phrase the sentence text to search for (exact match against the stored keys).
 * @return the index stored with the phrase, or -1 when the phrase is not in the map;
 *         in the latter case a message is appended to "debug.txt".
 */
long long DatasetSentences::retrieveSentenceIndex(string phrase) {
    const auto match = sentences.find(phrase);
    if (match != sentences.end()) {
        return match->second;
    }

    // Miss: record the phrase in the debug log before reporting failure.
    ofstream debugLog("debug.txt", std::ios_base::app);
    debugLog << "The given phrase: " << phrase
             << " was not found in the sentence dataset." << endl;
    return -1;
}
unordered_map<string, long long> DatasetSentences::getSentencesMap() {
return sentences;
}
/**
 * Returns the contents of the sentence map as a vector of (sentence, index) pairs,
 * in the map's iteration order.
 */
vector<pair<string, long long>> DatasetSentences::getVectorOfSentences() {
    // The iterator-range constructor copies every map entry into the vector.
    return vector<pair<string, long long>>(sentences.begin(), sentences.end());
}