Cachedb.cpp
#include "Cachedb.h"
#include "Threads.h"

// for seo-related objects:
Cachedb g_cachedb;
// for seo serps:
Cachedb g_serpdb;

void Cachedb::reset() {
	m_rdb.reset();
}
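
// Initialize this Cachedb instance: size the in-memory tree, set up the
// disk page cache and the underlying Rdb. The same code path serves both
// g_cachedb and g_serpdb; only the name and rdb id differ.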
bool Cachedb::init ( ) {
	// we use the same disk page size as indexdb (for RdbMap.cpp)
	long pageSize = GB_INDEXDB_PAGE_SIZE;
	// set this for debugging
	//long long maxTreeMem = 1000000;
	// we've seen debug entries as large as 33MB because
	// m_debugDocIdScoreBuf and m_origDocIdScoreBuf take up so much
	// space, so don't cache those any more!
	long long maxTreeMem = 40000000; // 40MB g_serpdb, 40MB g_cachedb
	// . what's the max # of tree nodes?
	// . key+4+left+right+parents+dataPtr = sizeof(key96_t)+4+4+4+4+4
	// . 32 bytes per record when in the tree
	// . >1000 bytes of data per rec
	long maxTreeNodes = maxTreeMem / ( sizeof(key96_t) + 16 + 1000 );
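	// with the 40MB default and a 12-byte key96_t that works out to
	// roughly 40,000,000 / (12+16+1000) ~= 38,900 nodes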
	// disk page cache mem, 100MB on gk0 now
	long pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
	// keep this low if we are the tmp cluster
	//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	// TODO: would be nice to just do page caching on the satellite
	// files; look into "minimizeDiskSeeks" at some point...
	m_name  = "cachedb";
	m_rdbId = RDB_CACHEDB;
	if ( this == &g_serpdb ) {
		m_name  = "serpdb";
		m_rdbId = RDB_SERPDB;
	}
	// init the disk page cache
	if ( ! m_pc.init ( m_name ,
			   m_rdbId ,  // RDB_CACHEDB
			   pcmem ,
			   pageSize ,
			   true ,     // use shared mem?
			   false ) )  // minimizeDiskSeeks?
		return log("db: %s init failed.",m_name);
	// init the rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    m_name ,
			    true  ,  // dedup
			    -1    ,  // fixedDataSize (-1 = variable-size data)
			    4     ,  // g_conf.m_cachedbMinFilesToMerge
			    // fix this to 15 and rely on the page cache of
			    // just the satellite files and the daily merge
			    // to keep things fast.
			    //15 ,
			    maxTreeMem ,
			    maxTreeNodes ,
			    true  ,  // isTreeBalanced
			    0     ,  // cache mem
			    0     ,  // cache nodes
			    false ,  // use half keys? (was true)
			    false ,  // load cache from disk
			    &m_pc ,
			    false ,
			    false ,  // preload page cache
			    sizeof(key96_t) ,
			    true  ,  // bias page cache? (true!)
			    true  ); // is collectionless
}
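
// Register a collection with the underlying Rdb. If doVerify is true we
// also sanity-check that the records already on disk belong to this
// host's group; a failed check is only fatal when g_conf.m_allowScale
// is off.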
bool Cachedb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
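
// Read up to 64KB worth of records from the tree and disk files (threads
// are disabled so the Msg5 read completes synchronously) and confirm that
// every key maps to this host's group id. A mismatch usually means the
// data files in this directory belong to a different group/shard.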
bool Cachedb::verify ( char *coll ) {
	log ( LOG_INFO, "db: Verifying %s for coll %s...", m_name, coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key224_t startKey;
	key224_t endKey;
	startKey.setMin();
	endKey.setMax();
	long minRecSizes = 64000;
	if ( ! msg5.getList ( m_rdbId , // RDB_CACHEDB
			      coll ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      minRecSizes ,
			      true ,   // includeTree?
			      false ,  // add to cache?
			      0 ,      // max cache age
			      0 ,      // startFileNum
			      -1 ,     // numFiles
			      NULL ,   // state
			      NULL ,   // callback
			      0 ,      // niceness
			      false ,  // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true ) ) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}
	long count = 0;
	long got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key224_t k;
		list.getCurrentKey((char *)&k);
		count++;
		uint32_t groupId = getGroupId ( m_rdbId , &k ); // RDB_CACHEDB
		if ( groupId == g_hostdb.m_groupId ) got++;
	}
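	// if any record hashes to another group, the rdb files in this
	// directory were probably copied from the wrong host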
	if ( got != count ) {
		log ("db: Out of first %li records in %s, "
		     "only %li belong to our group.",count,m_name,got);
		/*
		// repeat with log
		for ( list.resetListPtr() ; ! list.isExhausted() ;
		      list.skipCurrentRecord() ) {
			key224_t k;
			list.getCurrentKey((char *)&k);
			uint32_t groupId = getGroupId ( RDB_CACHEDB , &k );
			long groupNum = g_hostdb.getGroupNum(groupId);
			unsigned long sh32 ;
			sh32 = g_cachedb.getLinkeeSiteHash32_uk(&k);
			uint16_t sh16 = sh32 >> 19;
			log("db: sh16=0x%lx group=%li",
			    (long)sh16,groupNum);
		}
		*/
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to inconsistency.");
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
log ( LOG_INFO, "db: %s passed verification successfully for "
"%li recs.", m_name,count );
// DONE
g_threads.enableThreads();
return true;
}