-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_tokenizer.h
229 lines (192 loc) · 7.53 KB
/
graph_tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#ifndef _GRAPH_TOKENIZER_H
#define _GRAPH_TOKENIZER_H
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <utility>
#include <vector>
#include "element_parser.h"
#include "tokenizer_utils.h"
using namespace std;
template <typename PAT>
class graph_tokenizer
{
public:
graph_tokenizer(const int max=LINE_SZ): MAXLINE(max) {} /**<constructor for tokenizer */
template<class SM_T>
int parse_next_trans(ifstream& infile, pat_fam<GRAPH_PATTERN>& freq_pats, storage_manager<GRAPH_PATTERN,
VAT, ALLOC, SM_T>& vat_hmap) {
char* line=new char[MAXLINE];
char word[MAXLINE];
char* startline=line;
int len;
int count; //# of words parsed from line
int tid=-1;
int num_items=-1; //# of words to be read from this line
int pos; //stores starting position of input stream's get pointer
GRAPH_PATTERN* g1;
map<int, typename GRAPH_PATTERN::VERTEX_T> vid_to_lbl; // map from vertex-id
// to its label
typename map<int, typename GRAPH_PATTERN::VERTEX_T>::iterator tmp_it;
while(1) {
pos=infile.tellg();
line=startline;
*line='\0';
infile.getline(line, MAXLINE-1);
len=strlen(line);
if(!len || !line) {
delete[] startline;
return tid;
}
line[len++]='\0';
count=0;
if(line[0]=='#') // comment line
continue;
if(!(line=parse_word()(line, word))) {
//parse_word() failed
delete[] startline;
return -1;
}
if(word[0]=='t') { // this is the tid line
if(tid!=-1) { // this is a new tid, stop here
infile.seekg(pos);
delete[] startline;
return tid; // this is the line from where function should
// return on most calls
}
line=parse_word()(line, word); // read in the '#'
if(!line) {
//parse_word() failed
delete[] startline;
return -1;
}
line=parse_word()(line, word); // read in the tid
if(!line) {
//parse_word() failed
delete[] startline;
return -1;
}
tid=atoi(word);
}//if word[0]=='t'
else if(word[0]=='v') { // this is a vid-line
num_items=2; // 2 more words to be parsed from this line
int vid=0;
typename GRAPH_PATTERN::VERTEX_T v_lbl;
while(count<num_items) {
if(!(line=parse_word()(line, word))) {
// parse_word() failed
delete[] startline;
return -1;
}
switch(count) {
case 0: vid=atoi(word); break;
case 1:
v_lbl=el_prsr.parse_element(word);
/// INPUT-FORMAT: if the datafile format is to append
/// vertex labels with a letter (as is true for data
/// files in /dmtl/ascii_data on hd-01)
/// then simply change the
/// above line to:
/// v_lbl=el_prsr.parse_element(word+1);
vid_to_lbl.insert(make_pair(vid, v_lbl));
}
count++;
}//while(count<..)
}//if word[0]=='v'
else if(word[0]=='e') { // undirected edge
/// INPUT-FORMAT: if running for files in /dmtl/ascii_data on hd-01
/// simply change the above line to:
/// else if(word[0]=='u')
int vid1, vid2;
typename GRAPH_PATTERN::EDGE_T e_lbl;
typename GRAPH_PATTERN::VERTEX_T v_lbl1, v_lbl2;
num_items=3; // 3 more words to be parsed
bool swap_vids; // flag=false if v_lbl1<v_lbl2
while(count<num_items) {
if(!(line=parse_word()(line, word))) {
// parse_word() failed
delete[] startline;
return -1;
}
switch(count) {
case 0:
vid1=atoi(word);
if((tmp_it=vid_to_lbl.find(vid1))==vid_to_lbl.end()) {
cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid1<<" not found in vid_to_lbl"<<endl;
return -1;
}
v_lbl1=tmp_it->second;
break;
case 1:
vid2=atoi(word);
if((tmp_it=vid_to_lbl.find(vid2))==vid_to_lbl.end()) {
cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid2<<" not found in vid_to_lbl"<<endl;
return -1;
}
v_lbl2=tmp_it->second;
break;
case 2:
e_lbl=edge_prsr.parse_element(word);
/// INPUT-FORMAT: if the datafile format is to append
/// edge labels with a letter (as is true for data
/// files in /dmtl/ascii_data on hd-01)
/// then simply change the
/// above line to:
/// e_lbl=el_prsr.parse_element(word+1);
/// prepare pattern ///
g1=new GRAPH_PATTERN;
if(v_lbl1<=v_lbl2) {
make_edge(g1, v_lbl1, v_lbl2, e_lbl);
swap_vids=0;
}
else {
make_edge(g1, v_lbl2, v_lbl1, e_lbl);
swap_vids=1;
}
/// if g1's vat is present, check if this tid is also
/// present. If yes, then insert pair of vids.
/// If tid not present, create a new entry in vat and
/// insert it
if(!(gvat=vat_hmap.get_vat(g1))) { // vat not found
gvat=new VAT;
if(!swap_vids)
gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
else
gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
gvat->insert_vid_tid(vid1);
gvat->insert_vid(vid2);
vat_hmap.add_vat(g1, gvat); // add pattern-vat mapping
freq_pats.push_back(g1); // this is the first time
// this pattern has been encountered, so add it
}
else if(gvat->back().first!=tid) { // or, new tid
if(!swap_vids)
gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
else
gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
gvat->insert_vid_tid(vid1);
gvat->insert_vid(vid2);
delete g1;
}
else { // assert: gvat->back().first=tid
if(!swap_vids)
gvat->insert_occurrence(make_pair(vid1, vid2));
else
gvat->insert_occurrence(make_pair(vid2, vid1));
gvat->insert_vid_hs(vid1);
gvat->insert_vid(vid2);
delete g1;
}
}//switch
count++;
}//while(count<..)
}//if(word[0]=='u')
else {
cerr<<"graph.tokenizer.parse_next_trans: Unidentifiable line="<<line<<endl;
return -1;
}
}//while(1)
return tid;
}//parse_next_trans()
private:
int MAXLINE; /**< max length of line to be parsed */
element_parser<typename GRAPH_PATTERN::VERTEX_T> el_prsr; /**< parses an element of desired type */
element_parser<typename GRAPH_PATTERN::EDGE_T> edge_prsr; /**< parses an element of desired type */
}; //end class tokenizer
#endif