forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIndexTable.h
309 lines (252 loc) · 10.6 KB
/
IndexTable.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// Matt Wells, copyright Aug 2001
// . calls for intersecting a bunch of IndexLists to generate docIds
// . IndexLists are data-less lists (keys only)
// . each key in an IndexList is a termId/score/adultBit/docId tuple
// . we try to use as small a sublist of each IndexList as possible to avoid
// wasting network bandwidth
// . TODO: split into 2+ classes
// TODO: implement site clustering??????? in getNumResults()
// TODO: if we have in cache we can hash right into the table, but
// we must do that before blocking on something in case it disappears
// from the cache
// TODO: it is possible to get a better scoring result, even if we found
// 10 docIds in the heads of ALL the IndexLists. Because it may have
// a really high score in 3 of the IndexLists, but a low score in the
// fourth, but it's sum may be the highest of all docIds.
// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
// be quote forced anyway. Some pages will match all but "cable hp"
// so we should break that up into it's 2 terms, cable and hp
// 6912 (30%) of queries are 1 word queries 138
// 7825 (34%) of queries are 2 word queries 156
// 4512 (20%) of queries are 3 word queries 90
// 1771 ( 8%) of queries are 4 word queries 32
// 869 ( 4%) of queries are 5 word queries 17
// 391 ( 2%) of queries are 6 word queries 8
// 290 ( 1%) of queries are 7 word queries 6
// 183 ( 1%) of queries are 8 word queries (4 per second)
#ifndef _INDEXTABLE_H_
#define _INDEXTABLE_H_
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
#include "Indexdb.h" // makeStartKey(), getTruncationLimit()
#include "IndexList.h" // for m_lists[]
#include "Titledb.h" // g_titledb.getTotalNumDocs()
#include "IndexReadInfo.h" // MAX_TIERS
// max # search results that can be viewed
#define MAX_RESULTS 1000
class IndexTable {
public:
// . returns false on error and sets errno
// . we now support multiple plus signs before the query term
// . start/endTermNums apply to phrase termIds only
// . allows us to set multiple bits when a phrase termId is matched
// in case the singleton was truncated, but doc has the phrase
// . if you want Default AND behaviour set requireAllTerms to true
// it is much faster, too
void init (Query *q,bool isDebug,void *logstate,bool requireAllTerms,
class TopTree *topTree );
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
// sets m_positiveBits, etc.
//void prepareToAddLists ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
void addLists_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
long numTiers ,
long numListsPerTier ,
Query *q ,
long docsWanted ,
long *totalListSizes ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight );
// . these are set from calling addLists() above
// . we log all matching topDocIds if isDebug is true
long long *getTopDocIds ( long tier ) { return m_topDocIds[tier]; };
unsigned char *getTopBitScores ( long tier )
{ return m_topBitScores[tier]; };
char *getTopExplicits ( long tier ) { return m_topExplicits[tier]; };
long *getTopScores ( long tier ) { return m_topScores[tier]; };
//unsigned long *getTopBitScores () { return m_finalTopBitScores; };
// make sure to call getTopDocIds() before calling this
long getNumTopDocIds ( long tier ) { return m_numTopDocIds[tier]; };
// . get how many results we have in the topDocIds list
// . if "thatIncludeAllTerms" is true, results must have all terms
// from all indexLists that we haven't read ALL of yet
long getNumExactExplicitMatches ( long tier ) {
return m_numExactExplicitMatches[tier];};
long getNumExactImplicitMatches ( long tier ) {
return m_numExactImplicitMatches[tier];};
// some generic stuff
IndexTable();
~IndexTable();
void reset();
// . call to set the m_final* member vars from the m_top* member vars
// . ALWAYS call this BEFORE calling
// getTopDocIds(), getNumTopDocIds() or getNumExactMatches()
void filterTopDocIds ( ) ;
// how long to add the last batch of lists
long long m_addListsTime;
unsigned long m_totalDocIds;
long m_numPanics;
long m_numCollisions;
long m_numPtrs; // in the beginning at least
long m_numLoops;
// how long to get top docIds
long long m_setTopDocIdsTime;
long long m_estimatedTotalHits;
long m_numSlots;
// Msg39 needs to call these
void freeMem ( ) ;
bool alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
long numTiers ,
long numListsPerTier ,
long docsWanted ,
bool sortByDate );
bool doRecall() { return m_doRecall; };
long getNumDocsInTier ( long i ) { return m_numDocsInTier[i]; };
// . sets m_scoreWeights[] based on termFreqs (IDF)
void setScoreWeights ( Query *q );
void setScoreWeights ( Query *q , bool phrase );
long *getScoreWeights ( ) { return m_scoreWeights; };
private:
void addLists2_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
long numTiers ,
long numListsPerTier ,
Query *q ,
long docsWanted ,
long *imap ,
bool lastRound ,
long numBaseLists ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight,
long *minHardCountPtr );
void hashTopDocIds2 ( unsigned long *maxDocId ,
char **docIdPtrs ,
long *scores ,
qvec_t *explicitBits ,
short *hardCounts ,
unsigned long mask ,
long numSlots ) ;
// . used for getting which topDocId to kick out of the top list
long getWeakestTopDocId ( char **topp ,
long *tops ,
unsigned char *topb ,
long numTop ,
unsigned char *minBitScore2 ,
long *score ,
char **docIdPtr ) ;
// . get the termBits for the termId represented by this list
// . only phrases may set multiple bits
qvec_t getTermImplicitBitMask_r ( long i );
// . set the m_bitScores[] array
// . "count" is the # of query term (single or phrase) bit combinations
void setBitScores ( long count );
// are lists swapped?
bool m_swapped [ MAX_TIERS ] [ MAX_QUERY_TERMS ] ;
// these describe the lists associated with each m_termId
long m_scoreWeights [ MAX_QUERY_TERMS ];
// for each tier we have a list of the top docids
long long m_topDocIds [ MAX_TIERS ] [ MAX_RESULTS ];
char *m_topDocIdPtrs [ MAX_TIERS ] [ MAX_RESULTS ];
long m_topScores [ MAX_TIERS ] [ MAX_RESULTS ];
//short m_topHardCounts [ MAX_TIERS ] [ MAX_RESULTS ];
unsigned char m_topBitScores [ MAX_TIERS ] [ MAX_RESULTS ];
char m_topExplicits [ MAX_TIERS ] [ MAX_RESULTS ];
long m_numTopDocIds [ MAX_TIERS ] ;
long m_numExactExplicitMatches [ MAX_TIERS ];
long m_numExactImplicitMatches [ MAX_TIERS ];
long m_numTiers;
// when filterTopDocIds() is called it uniquifies and combines
// m_topDocIds[*][] into m_finalDocIds
/* long long m_finalTopDocIds [ MAX_RESULTS ];
long m_finalTopScores [ MAX_RESULTS ];
//unsigned long m_finalTopBitScores [ MAX_RESULTS ];
long m_finalNumExactExplicitMatches ;
long m_finalNumExactImplicitMatches ;
long m_finalNumTopDocIds ;*/
// a reference to the query
Query *m_q;
// has init() been called?
bool m_initialized;
// are we in debug mode?
bool m_isDebug;
// for debug msgs
long m_logstate;
// . did we already call m_q->setBitScores() for this query?
// . don't call it more than once
bool m_alreadySet;
bool m_doRecalc;
bool m_requireAllTerms;
char **m_topDocIdPtrs2;
long *m_topScores2;
qvec_t *m_topExplicits2;
short *m_topHardCounts2;
long m_maxTopDocIds2;
long m_numTopDocIds2;
long m_nexti;
long m_oldnexti;
bool m_doRecall;
// allocated memory
char *m_buf;
long m_bufSize;
char *m_bufMiddle;
// for large hashtable for sortByDate
char *m_bigBuf;
long m_bigBufSize;
// the imap stuff
long m_imap [ MAX_QUERY_TERMS ];
long m_sizes [ MAX_QUERY_TERMS ];
long m_blocksize [ MAX_QUERY_TERMS ];
long m_nb;
class TopTree *m_topTree;
// these are for removing component lists replaced by their compounds
long *m_componentCodes;
//char *m_ignore;
//bool m_scoresSet;
long m_numDocsInTier [ MAX_TIERS ] ;
};
// . get the LOWEST scoring docId from our list of top docIds
// . set "minBitScore22" and "score" for that lowest docId
// . inline this for speed
// . BUT lower docIds are considered higher scoring than higher docIds
inline long IndexTable::getWeakestTopDocId ( char **topp ,
long *tops ,
unsigned char *topb ,
long numTop ,
unsigned char *minBitScore2 ,
long *score ,
char **docIdPtr ) {
long long tmp = 0LL;
long minScore = 0x7fffffff;
unsigned char minBitScore = 0xff;
char *minDocIdPtr = (char *)&tmp;
long mini = 0;
for ( long i = 0 ; i < numTop ; i++ ) {
if ( topb [i] > minBitScore ) continue;
if ( topb [i] < minBitScore ) goto gotIt;
if ( tops [i] > minScore ) continue;
if ( tops [i] < minScore ) goto gotIt;
if ( *(unsigned long *)(topp[i]+1 ) <
*(unsigned long *)(minDocIdPtr+1) ) continue;
if ( *(unsigned long *)(topp[i]+1 ) >
*(unsigned long *)(minDocIdPtr+1) ) goto gotIt;
if ( (*(unsigned char *)(topp[i] ) & 0xfc) <
(*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
// ties should not be happening for docid, unless
// it tied with initial setting of minDocIdPtr, in that
// case we should add it!
gotIt:
minScore = tops [i];
minBitScore = topb [i];
minDocIdPtr = topp [i];
mini = i;
}
// set the callers ptrs
*minBitScore2 = minBitScore;
*score = minScore;
*docIdPtr = minDocIdPtr;
// return the lowest scoring docId's position
return mini;
}
#endif