-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathConf.h
505 lines (395 loc) · 13.3 KB
/
Conf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
// Copyright Matt Wells, Apr 2001
// . every host has a config record
// . like tagdb, record in 100% xml
// . allows remote configuration of hosts through Msg4 class
// . remote user sends some xml, we set our member vars using that xml
// . when we save to disk we convert our mem vars to xml
// . is global so everybody can see it
// . conf record can be changed by director OR with the host's priv key
// . use Conf remotely to get setup info about a specific host
// . get your local ip/port/groupMask/etc. from this class not HostMap
#ifndef GB_CONF_H
#define GB_CONF_H
#include "max_coll_len.h"
#include "max_url_len.h"
#include "SafeBuf.h"
#include "BaseScoringParameters.h"
#include <sys/types.h> //mode_t etc.
// size in bytes of the Conf::m_spiderUserAgent and Conf::m_spiderBotName buffers
#define USERAGENTMAXSIZE 128
// capacity of the Conf::m_dnsIps / Conf::m_dnsPorts arrays
#define MAX_DNSIPS 16
// capacity of the Conf::m_rnsIps array
#define MAX_RNSIPS 13
// Publicly accessible and generally highly-available / reachable DNS servers.
// Use Google's servers - works reasonably well
#define PUBLICLY_AVAILABLE_DNS1 "8.8.8.8"
#define PUBLICLY_AVAILABLE_DNS2 "8.8.4.4"
// forward declarations; only pointers to these types are used in this header
class TcpSocket;
class HttpRequest;
// NOTE(review): presumably the permission bits to use when creating files
// and directories respectively - confirm against the definitions, which are
// not in this header
mode_t getFileCreationFlags();
mode_t getDirCreationFlags ();
// Per-host configuration record.
// . parameters are loaded from "{dir}/gb.conf" by init() and written back
//   to the conf file by save()
// . every parameter is a plain public data member
// . NOTE(review): member order determines object layout; do not reorder
//   members without checking code that may depend on it
class Conf {
public:
Conf();
// Permission checks for an incoming request (socket + parsed http request).
// The exact rules are in the implementation file, not visible from here.
bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr );
bool isCollAdminForColl (TcpSocket *sock, HttpRequest *hr, const char *coll );
bool isCollAdmin2 (TcpSocket *socket , HttpRequest *hr,
class CollectionRec *cr);
bool isMasterAdmin ( TcpSocket *socket , HttpRequest *hr );
bool hasMasterPwd ( HttpRequest *hr );
// NOTE(review): presumably checked against m_connectIps ("the new master
// ips") below - confirm in the implementation
bool isMasterIp ( uint32_t ip );
bool isConnectIp ( uint32_t ip );
// loads conf parms from this file "{dir}/gb.conf"
bool init ( char *dir );
// NOTE(review): presumably fills m_rnsIps / m_numRns - confirm
void setRootIps();
// saves any changes to the conf file
bool save ( );
// reset all values to their defaults
void reset();
// defaults to default collection
const char *getDefaultColl ( );
// max amount of memory we can use
size_t m_maxMem;
// NOTE(review): names mirror the MCL_CURRENT / MCL_FUTURE flags of
// mlockall(2) - confirm that is how they are used
bool m_mlockAllCurrent;
bool m_mlockAllFuture;
// if this is false, we do not save, used by dump routines
// in main.cpp so they can change parms here and not worry about
// a core dump saving them
bool m_save;
bool m_runAsDaemon;
bool m_logToFile;
// name of the default collection; buffer holds MAX_COLL_LEN chars + 1 byte
char m_defaultColl[MAX_COLL_LEN + 1];
// . dns parameters
// . dnsDir should hold our saved cached (TODO: save the dns cache)
// . m_dnsIps and m_dnsPorts are parallel arrays of capacity MAX_DNSIPS
// . NOTE(review): m_numDns is presumably the number of entries in use
// . NOTE(review): int16_t cannot represent port numbers above 32767
int32_t m_numDns;
int32_t m_dnsIps[MAX_DNSIPS];
int16_t m_dnsPorts[MAX_DNSIPS];
int64_t m_dnsCacheSize;
int64_t m_dnsCacheMaxAge;
int32_t m_dnsMaxCacheMem;
// NOTE(review): clusterdb setting declared here, apart from the other
// clusterdb members further down
int32_t m_clusterdbQuickCacheMem;
SafeBuf m_proxyIps;
SafeBuf m_proxyAuth;
// built-in dns parameters using name servers
// (m_rnsIps has capacity MAX_RNSIPS)
bool m_askRootNameservers;
int32_t m_numRns;
int32_t m_rnsIps[MAX_RNSIPS];
// The next four groups share one shape: a server name (64-byte buffer),
// a port, a "max outstanding" count and a timeout.
// NOTE(review): timeout units are not stated in this header - confirm.
// query language server
char m_queryLanguageServerName[64];
int32_t m_queryLanguageServerPort;
unsigned m_maxOutstandingQueryLanguage;
unsigned m_queryLanguageTimeout;
// site median page temperature server
char m_siteMedianPageTemperatureServerName[64];
int32_t m_siteMedianPageTemperatureServerPort;
unsigned m_maxOutstandingSiteMedianPageTemperature;
unsigned m_siteMedianPageTemperatureTimeout;
// site number-of-inlinks server
char m_siteNumInlinksServerName[64];
int32_t m_siteNumInlinksServerPort;
unsigned m_maxOutstandingSiteNumInlinks;
unsigned m_siteNumInlinksTimeout;
// url classification server
char m_urlClassificationServerName[64];
int32_t m_urlClassificationServerPort;
unsigned m_maxOutstandingUrlClassifications;
unsigned m_urlClassificationTimeout;
// used to limit all rdb's to one merge per machine at a time
int32_t m_mergeBufSize;
int32_t m_doledbNukeInterval;
// rdb settings
// (most rdbs below repeat the same trio: max lost positives percentage,
// file cache size, max tree mem)
// posdb
int32_t m_posdbMaxLostPositivesPercentage;
int64_t m_posdbFileCacheSize;
int32_t m_posdbMaxTreeMem;
// tagdb
int32_t m_tagdbMaxLostPositivesPercentage;
int64_t m_tagdbFileCacheSize;
int32_t m_tagdbMaxTreeMem;
// merge space: lock directory, minimum number of lock files, and the
// merge directory itself (both paths are 1024-byte buffers)
char m_mergespaceLockDirectory[1024];
int32_t m_mergespaceMinLockFiles;
char m_mergespaceDirectory[1024];
// clusterdb for site clustering, each rec is 16 bytes
int32_t m_clusterdbMaxLostPositivesPercentage;
int64_t m_clusterdbFileCacheSize;
int32_t m_clusterdbMaxTreeMem;
int32_t m_clusterdbMinFilesToMerge;
// titledb
int32_t m_titledbMaxLostPositivesPercentage;
int64_t m_titledbFileCacheSize;
int32_t m_titledbMaxTreeMem;
// spiderdb
int32_t m_spiderdbMaxLostPositivesPercentage;
int64_t m_spiderdbFileCacheSize;
int32_t m_spiderdbMaxTreeMem;
// linkdb for storing linking relations
// (no file cache size member is declared for linkdb)
int32_t m_linkdbMaxLostPositivesPercentage;
int32_t m_linkdbMaxTreeMem;
int32_t m_linkdbMinFilesToMerge;
// are we doing a command line thing like 'gb 0 dump s ....' in
// which case we do not want to log certain things
bool m_doingCommandLine;
// per-category thread limits
int32_t m_maxCoordinatorThreads;
int32_t m_maxCpuThreads;
int32_t m_maxSummaryThreads;
int32_t m_maxIOThreads;
int32_t m_maxExternalThreads;
int32_t m_maxFileMetaThreads;
int32_t m_maxMergeThreads;
// NOTE(review): units not stated in this header - confirm
int32_t m_maxJobCleanupTime;
// vagus settings
char m_vagusClusterId[128];
int32_t m_vagusPort;
int32_t m_vagusKeepaliveSendInterval; //milliseconds
int32_t m_vagusKeepaliveLifetime; //milliseconds
int32_t m_vagusMaxDeadTime; //minutes
// query / search-result limits
int32_t m_maxDocsWanted; //maximum number of results in one go. Puts a limit on SearchInput::m_docsWanted
int32_t m_maxFirstResultNum; //maximum document offset / result-page. Puts a limit on SearchInput::m_firstResultNum
// NOTE(review): the next two members lack the m_ prefix used everywhere
// else in this class; renaming would break users of these names
int32_t min_docid_splits; //minimum number of DocId splits using Msg40
int32_t max_docid_splits; //maximum number of DocId splits using Msg40
int64_t m_msg40_msg39_timeout; //timeout for entire get-docid-list phase, in milliseconds.
int64_t m_msg3a_msg39_network_overhead; //additional latency/overhead of sending request+response over network.
bool m_useHighFrequencyTermCache;
// feature on/off switches
bool m_spideringEnabled;
bool m_injectionsEnabled;
bool m_queryingEnabled;
bool m_returnResultsAnyway;
bool m_spiderIPUrl;
bool m_spiderAdultContent;
bool m_addUrlEnabled; // TODO: use at http interface level
bool m_doStripeBalancing;
// . true if the server is on the production cluster
// . we enforce the 'elvtune -w 32 /dev/sd?' cmd on all drives because
// that yields higher performance when dumping/merging on disk
bool m_isLive;
// spider limits and spider url cache settings
int32_t m_maxTotalSpiders;
int32_t m_spiderFilterableMaxWordCount;
int32_t m_spiderDeadHostCheckInterval;
int64_t m_spiderUrlCacheMaxAge;
int64_t m_spiderUrlCacheSize;
// indexdb has a max cached age for getting IndexLists (10 mins deflt)
int32_t m_indexdbMaxIndexListAge;
int32_t m_udpMaxSockets;
// TODO: parse these out!!!!
int32_t m_httpMaxSockets;
int32_t m_httpsMaxSockets;
int32_t m_httpMaxSendBufSize;
// a search results cache (for Msg40)
int64_t m_docSummaryWithDescriptionMaxCacheAge; //cache timeout for document summaries for documents with a meta-tag with description, in milliseconds
// for Weights.cpp
int32_t m_sliderParm;
float m_sameLangWeight;
float m_unknownLangWeight;
// scoring parameters; type comes from BaseScoringParameters.h
BaseScoringParameters m_baseScoringParameters;
int32_t m_numFlagScoreMultipliers; //constant = 26
int32_t m_numFlagRankAdjustments; //constant = 26
int32_t m_maxCorruptLists;
int32_t m_defaultQueryResultsValidityTime; //in seconds
bool m_useCollectionPasswords;
// if in read-only mode we do no spidering and load no saved trees
// so we can use all mem for caching index lists
bool m_readOnlyMode;
// if this is true we use /etc/hosts for hostname lookup before dns
bool m_useEtcHosts;
//verify integrity of tree/buckets after modification operations
bool m_verifyTreeIntegrity;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// verify validity of index while merging
bool m_verifyIndex;
// calls fsync(fd) if true after each write
bool m_flushWrites;
bool m_verifyWrites;
int32_t m_corruptRetries;
// NOTE(review): presumably the value for sqlite's "synchronous" setting -
// confirm where it is applied
int m_sqliteSynchronous;
// verify tagrec while indexing
bool m_verifyTagRec;
bool m_spiderHostToQueryHostFallbackAllowed;
bool m_queryHostToSpiderHostFallbackAllowed;
// document delete / rebuild / reindex: delay (milliseconds, per the Ms
// suffix) and maximum number pending
int64_t m_docDeleteDelayMs;
int64_t m_docRebuildDelayMs;
int64_t m_docReindexDelayMs;
int64_t m_docDeleteMaxPending;
int64_t m_docRebuildMaxPending;
int64_t m_docReindexMaxPending;
// log unfreed memory on exit
bool m_detectMemLeaks;
bool m_forceIt;
// if this is true we do not add indexdb keys that *should* already
// be in indexdb. but if you recently upped the m_truncationLimit
// then you can set this to false to add all indexdb keys.
// (the member this describes is currently commented out)
//bool m_onlyAddUnchangedTermIds;
bool m_doIncrementalUpdating;
// summary caches: size and max age for the "stable" and "unstable" caches
// NOTE(review): units are not stated in this header - confirm
int64_t m_stableSummaryCacheSize;
int64_t m_stableSummaryCacheMaxAge;
int64_t m_unstableSummaryCacheSize;
int64_t m_unstableSummaryCacheMaxAge;
bool m_useShotgun;
bool m_testMem;
bool m_doConsistencyTesting;
int32_t m_titleRecVersion;
// defaults to "Gigabot/1.0"
// (both buffers are USERAGENTMAXSIZE bytes)
char m_spiderUserAgent[USERAGENTMAXSIZE];
char m_spiderBotName[USERAGENTMAXSIZE];
int32_t m_autoSaveFrequency;
int32_t m_docCountAdjustment;
bool m_profilingEnabled;
//
// See Log.h for an explanation of the switches below
//
// GET and POST requests.
bool m_logHttpRequests;
bool m_logAutobannedQueries;
// time thresholds for logging
// NOTE(review): presumably milliseconds like the two documented
// thresholds below - confirm
int32_t m_logLoopTimeThreshold;
int32_t m_logRdbIndexAddListTimeThreshold;
int32_t m_logRdbMapAddListTimeThreshold;
// if query took this or more milliseconds, log its time
int32_t m_logQueryTimeThreshold;
// if disk read took this or more milliseconds, log its time
int32_t m_logDiskReadTimeThreshold;
int32_t m_logSqliteTransactionTimeThreshold;
bool m_logQueryReply;
// log what gets into the index
bool m_logSpideredUrls;
// log informational messages, they are not indicative of any error.
bool m_logInfo;
// when out of udp slots
bool m_logNetCongestion;
// doc quota limits, url truncation limits
bool m_logLimits;
// log debug switches
bool m_logDebugAddurl;
bool m_logDebugAdmin;
bool m_logDebugBuild;
bool m_logDebugBuildTime;
bool m_logDebugDate;
bool m_logDebugDb;
bool m_logDebugDetailed;
bool m_logDebugDirty;
bool m_logDebugDisk;
bool m_logDebugDns;
bool m_logDebugDownloads;
bool m_logDebugHttp;
bool m_logDebugImage;
bool m_logDebugLang;
bool m_logDebugLinkInfo;
bool m_logDebugLoop;
bool m_logDebugMem;
bool m_logDebugMemUsage;
bool m_logDebugMerge;
bool m_logDebugMsg13;
bool m_logDebugMsg20;
bool m_logDebugMulticast;
bool m_logDebugNet;
bool m_logDebugProxies;
bool m_logDebugQuery;
bool m_logDebugRepair;
bool m_logDebugRobots;
bool m_logDebugSections;
bool m_logDebugSpcache; // SpiderCache.cpp debug
bool m_logDebugSpeller;
bool m_logDebugSpider;
bool m_logDebugReindex;
bool m_logDebugSEO;
bool m_logDebugStats;
bool m_logDebugSummary;
bool m_logDebugTagdb;
bool m_logDebugTcp;
bool m_logDebugTcpBuf;
bool m_logDebugTitle;
bool m_logDebugTopDocs;
bool m_logDebugUdp;
bool m_logDebugUnicode;
bool m_logDebugUrlAttempts;
bool m_logDebugVagus;
// log trace switches
bool m_logTraceBigFile;
bool m_logTraceMatchList;
bool m_logTraceContentTypeBlockList;
bool m_logTraceDocid2FlagsAndSiteMap;
bool m_logTraceDocProcess;
bool m_logTraceDns;
bool m_logTraceDnsBlockList;
bool m_logTraceDnsCache;
bool m_logTraceFile;
bool m_logTraceHttpMime;
bool m_logTraceIpBlockList;
bool m_logTraceLanguageResultOverride;
bool m_logTraceMem;
bool m_logTraceMsg0;
bool m_logTraceMsg4In;
bool m_logTraceMsg4Out;
bool m_logTraceMsg4OutData;
bool m_logTraceMsg25;
bool m_logTracePageLinkdbLookup;
bool m_logTracePageSpiderdbLookup;
bool m_logTracePos;
bool m_logTracePosdb;
bool m_logTraceQuery;
bool m_logTraceQueryLanguage;
bool m_logTraceRdb;
bool m_logTraceRdbBase;
bool m_logTraceRdbBuckets;
bool m_logTraceRdbDump;
bool m_logTraceRdbIndex;
bool m_logTraceRdbList;
bool m_logTraceRdbMap;
bool m_logTraceRdbMerge;
bool m_logTraceRdbTree;
bool m_logTraceRepairs;
bool m_logTraceRobots;
bool m_logTraceRobotsCheckList;
bool m_logTraceSiteMedianPageTemperature;
bool m_logTraceSiteNumInlinks;
bool m_logTraceSpider;
bool m_logTraceSpiderUrlCache;
bool m_logTraceReindex;
bool m_logTraceSpiderdbRdbSqliteBridge;
bool m_logTraceSummary;
bool m_logTraceTitledb;
bool m_logTraceXmlDoc;
bool m_logTracePhrases;
bool m_logTraceTokenIndexing;
bool m_logTraceUrlMatchList;
bool m_logTraceUrlResultOverride;
bool m_logTraceWordSpam;
bool m_logTraceUrlClassification;
bool m_logTraceTopTree;
bool m_logTraceTermCheckList;
// expensive timing messages
bool m_logTimingAddurl;
bool m_logTimingAdmin;
bool m_logTimingBuild;
bool m_logTimingDb;
bool m_logTimingNet;
bool m_logTimingQuery;
bool m_logTimingLinkInfo;
bool m_logTimingRobots;
// programmer reminders.
bool m_logReminders;
// master passwords
// NOTE(review): storage format inside the SafeBuf is not visible here
SafeBuf m_masterPwds;
// these are the new master ips
SafeBuf m_connectIps;
// redirect url, buffer of MAX_URL_LEN bytes
// NOTE(review): when the redirect is applied is not visible here
char m_redirect[MAX_URL_LEN];
bool m_useCompressionProxy;
bool m_gzipDownloads;
// used by proxy to make proxy point to the temp cluster while
// the original cluster is updated
bool m_useTmpCluster;
// allow scaling up of hosts by removing recs not in the correct
// group. otherwise a sanity check will happen.
bool m_allowScale;
bool m_bypassValidation;
int32_t m_maxCallbackDelay;
// used by Repair.cpp
bool m_repairingEnabled;
int32_t m_maxRepairinjections;
int64_t m_repairMem;
SafeBuf m_collsToRepair;
// rebuild options: one switch per rebuild option / rdb to rebuild
bool m_fullRebuild;
bool m_rebuildAddOutlinks;
bool m_rebuildRecycleLinkInfo;
bool m_rebuildUseTitleRecTagRec;
bool m_rebuildTitledb;
bool m_rebuildPosdb;
bool m_rebuildClusterdb;
bool m_rebuildSpiderdb;
bool m_rebuildSpiderdbSmall;
bool m_rebuildLinkdb;
bool m_rebuildRoots;
bool m_rebuildNonRoots;
};
// the single global configuration instance ("is global so everybody can see
// it"); declared here only - the definition is in another translation unit
extern class Conf g_conf;
#endif // GB_CONF_H