-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCollectiondb.h
408 lines (291 loc) · 11.7 KB
/
Collectiondb.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
// Matt Wells, copyright Feb 2001
// maintains a simple array of CollectionRecs
#ifndef GB_COLLECTIONDB_H
#define GB_COLLECTIONDB_H
#include <atomic>
#include "SafeBuf.h"
#include "rdbid_t.h"
#include "collnum_t.h"
#include "spider_status_t.h"
#include "GbMutex.h"
#include "WordVariationsConfig.h"
// Pointer types that appear in the Collectiondb interface below.
class CollectionRec;
class HttpRequest;

// Collectiondb owns the table of CollectionRec pointers (m_recs[]) that is
// indexed by collnum_t, and offers lookups by collection name, by collnum
// and by HTTP request.
class Collectiondb {
public:
    Collectiondb();
    ~Collectiondb();

    void reset();

    // main.cpp calls this to populate m_recs[] from every
    // coll.*.*/coll.conf it can find
    bool loadAllCollRecs();

    // . writes back to disk every conf file that needs saving
    // . returns true on success; on error returns false and sets g_errno
    bool save();

    bool isInitializing() const {
        return m_initializing;
    }

    // . name -> collnum lookup: yields i such that m_recs[i].m_coll = coll
    // . the one-argument form expects "coll" to be NULL terminated
    collnum_t getCollnum(const char *coll, int32_t collLen) const;
    collnum_t getCollnum(const char *coll) const;

    const char *getCollName(collnum_t collnum) const;

    // look up the coll rec named in the HTTP request
    CollectionRec *getRec(HttpRequest *r, bool useDefaultRec = true);

    // hands back the given collection name, or the default collection
    // when the request did not specify one
    const char *getDefaultColl(const char *collname_from_httprequest);

    // . look up a CollectionRec by name or by collnum
    // . returns NULL if not available
    CollectionRec *getRec(const char *coll);
    CollectionRec *getRec(const char *coll, int32_t collLen);
    CollectionRec *getRec(collnum_t collnum);

    // disabled: CollectionRec *getDefaultRec();
    CollectionRec *getFirstRec();
    collnum_t getFirstCollnum() const;

    int32_t getNumRecs() const {
        return m_numRecs;
    }

    int32_t getNumRecsUsed() const {
        return m_numRecsUsed;
    }

    // which collnum will the next added coll receive?
    collnum_t reserveCollNum();

    bool addExistingColl(const char *coll, collnum_t collnum);
    bool addNewColl(const char *coll, collnum_t newCollnum);

    bool addRdbBaseToAllRdbsForEachCollRec();
    bool addRdbBasesForCollRec(CollectionRec *cr);

    // returns true unless it blocked, in which case it returns false
    bool deleteRec2(collnum_t collnum);

    // disabled: void deleteSpiderColl(class SpiderColl *sc);

    // returns true unless it blocked, in which case it returns false
    bool resetColl2(collnum_t oldCollnum, collnum_t newCollnum);

private:
    // main.cpp calls this once all rdb trees are loaded, to remove
    // bogus collnums from the trees i guess
    bool cleanTrees();

    bool registerCollRec(CollectionRec *cr);

    bool growRecPtrBuf(collnum_t collnum);
    bool setRecPtr(collnum_t collnum, CollectionRec *cr);

    // . array of CollectionRec pointers, stored inside m_recPtrBuf
    // . adding a new collection rec ptr may therefore require growing
    //   that safebuf first
    CollectionRec **m_recs;
    SafeBuf m_recPtrBuf;

    int32_t m_numRecs;
    int32_t m_numRecsUsed;

    int32_t m_wrapped;

    bool m_initializing;
};
// Process-wide Collectiondb instance. This is only the declaration; the
// object itself is defined in another translation unit
// (NOTE(review): presumably Collectiondb.cpp -- confirm).
extern class Collectiondb g_collectiondb;
// Matt Wells, copyright Feb 2002
// . a collection record specifies the spider/index/search parms of a
// collection of web pages
// . there's a Msg class to send an update signal to all the hosts once
// we've used Msg1 to add a new rec or delete an old. The update signal
// will make the receiving hosts flush their CollectionRec buf so they
// have to send out a Msg0 to get it again
// . we have a default collection record, a main collection record and
// then other collection records
// . the default collection record values override all
// . but the collection record values can override SiteRec values
// . so if spider is disabled in default collection record, then nobody
// can spider!
// . override the g_conf.* vars where * is in this class to use
// Collection db's default values
// . then add in the values of the specialized collection record
// . so change "if ( g_conf.m_spideringEnabled )" to something like
// Msg33 msg33;
// if ( ! msg33.getCollectionRec ( m_coll, m_collLen ) ) return false;
// CollectionRec *r = msg33.getRec();
// CollectionRec *d = msg33.getDefaultRec();
// if ( ! r->m_spideringEnabled || ! d->m_spideringEnabled ) continue;
// ... otherwise, spider for the m_coll collection
// ... pass msg33 to Msg14::spiderDoc(), etc...
// how many url filtering patterns?
// MAX_FILTERS is the fixed length of every per-rule url filter array in
// CollectionRec (m_regExs, m_spiderFreqs, m_spiderPriorities,
// m_maxSpidersPerRule, m_spiderIpWaits, m_spiderIpMaxSpiders,
// m_harvestLinks, m_forceDelete).
#define MAX_FILTERS 96 // up to 96 url regular expression patterns
// size in bytes of the CollectionRec::m_summaryFrontHighlightTag and
// CollectionRec::m_summaryBackHighlightTag char buffers
#define SUMMARYHIGHLIGHTTAGMAXSIZE 128
#include "max_coll_len.h"
#include "HashTableX.h"
// fake this for now
// RDB_END2 is the length of the per-rdb arrays in CollectionRec: m_bases,
// m_numPosKeysInTree and m_numNegKeysInTree.
// NOTE(review): presumably this must be at least the number of rdbid_t
// values used to index those arrays -- confirm against rdbid_t.h.
#define RDB_END2 80
// CollectionRec holds the settings and per-collection state for a single
// collection: its name and collnum, spider controls, ranking weights,
// merge thresholds, summary/title generation limits, the url filter table
// and the per-rdb RdbBase pointers. Most members are public and are
// accessed directly.
class CollectionRec {
public:
// active linked list of collectionrecs used by spider.cpp
class CollectionRec *m_nextActive;
// constructor / destructor
// NOTE(review): the previous comment here read "these just set m_xml to
// NULL", but no m_xml member is visible in this class -- likely stale,
// confirm against the .cpp
CollectionRec();
virtual ~CollectionRec();
int64_t getNumDocsIndexed();
// . stuff used by Collectiondb
// . do we need a save or not?
bool save();
// flags this record as needing a save by setting m_needsSave to true
void setNeedsSave() { m_needsSave = true; }
private:
// set to true by setNeedsSave(); atomic, so the store itself is not a
// data race
std::atomic<bool> m_needsSave;
public:
bool load ( const char *coll , int32_t collNum ) ;
void reset();
// Clear memory structures used by URL filters
void clearUrlFilters();
// for customcrawls
bool rebuildUrlFilters();
// for regular crawls
bool rebuildUrlFilters2();
bool rebuildLangRules( const char *lang , const char *tld );
bool rebuildPrivacoreRules();
bool rebuildPrivacoreDKOnlyRules();
bool rebuildPrivacoreOldOnlyRules();
bool m_urlFiltersHavePageCounts;
// the all important collection name, NULL terminated
// (the buffer has room for MAX_COLL_LEN chars plus the terminator)
char m_coll [ MAX_COLL_LEN + 1 ] ;
int32_t m_collLen;
// used by SpiderCache.cpp. g_collectiondb.m_recs[m_collnum] = this
collnum_t m_collnum;
// for doing DailyMerge.cpp stuff
int32_t m_dailyMergeStarted; // time_t
int32_t m_dailyMergeTrigger;
char m_dailyMergeDOWList[48];
int64_t m_spiderCorruptCount;
// holds ips that have been detected as being throttled and we need
// to backoff and use proxies on
HashTableX m_twitchyTable;
// spider controls for this collection
bool m_spideringEnabled ;
int32_t m_spiderDelayInMilliseconds;
int32_t m_spiderReindexDelayMS;
// is in active list in spider.cpp?
bool m_isActive;
bool m_makeImageThumbnails;
int32_t m_thumbnailMaxWidthHeight ;
bool m_indexBody;
bool m_dedupingEnabled ; // dedup content on same hostname
bool m_dedupURLByDefault ;
bool m_dupCheckWWW ;
bool m_useSimplifiedRedirects ;
bool m_oneVotePerIpDom ;
bool m_doUrlSpamCheck ; //filter urls w/ naughty hostnames
bool m_doLinkSpamCheck ; //filters dynamically generated pages
bool m_siteClusterByDefault ;
bool m_useRobotsTxt ;
bool m_obeyRelNoFollowLinks ;
bool m_forceUseFloaters ;
bool m_automaticallyUseProxies ;
bool m_automaticallyBackOff ;
bool m_recycleContent ;
bool m_getLinkInfo ; // turn off to save seeks
bool m_computeSiteNumInlinks ;
int32_t m_percentSimilarSummary ; // Dedup by summary similarity
int32_t m_summDedupNumLines ;
int32_t m_maxQueryTerms;
spider_status_t m_spiderStatus;
//ranking settings
float m_sameLangWeight;
float m_unknownLangWeight;
float m_siteRankMultiplier;
// Language stuff
char m_defaultSortLanguage2[6];
SafeBuf m_collectionPasswords;
SafeBuf m_collectionIps;
// from Conf.h
// per-rdb "min files to merge" settings
int32_t m_posdbMinFilesToMerge ;
int32_t m_titledbMinFilesToMerge ;
int32_t m_linkdbMinFilesToMerge ;
int32_t m_tagdbMinFilesToMerge ;
int32_t m_spiderdbMinFilesToMerge;
bool m_dedupResultsByDefault ;
bool m_doTagdbLookups ;
bool m_useCanonicalRedirects ;
int32_t m_maxNumSpiders ; // per local spider host
int32_t m_lastResetCount;
// controls for query-dependent summary/title generation
int32_t m_titleMaxLen;
int32_t m_summaryMaxLen;
int32_t m_summaryMaxNumLines;
int32_t m_summaryMaxNumCharsPerLine;
bool m_getDocIdScoringInfo;
// list of url patterns to be indexed.
SafeBuf m_siteListBuf;
// can be "web" "english" "romantic" "german" etc.
SafeBuf m_urlFiltersProfile;
// . now the url regular expressions
// . we chain down the regular expressions
// . if a url matches we use that tagdb rec #
// . if it doesn't match any of the patterns, we use the default site #
// . just one regexp per Pattern
// . all of these arrays should be the same size, but we need to
// include a count because Parms.cpp expects a count before each
// array since it handles them each individually
// . each array below is MAX_FILTERS long and is immediately preceded
// by its own m_num* count member
int32_t m_numRegExs;
// make this now use g_collectiondb.m_stringBuf safebuf and
// make Parms.cpp use that stringbuf rather than store into here...
SafeBuf m_regExs[ MAX_FILTERS ];
int32_t m_numSpiderFreqs; // useless, just for Parms::setParm()
float m_spiderFreqs[ MAX_FILTERS ];
int32_t m_numSpiderPriorities; // useless, just for Parms::setParm()
char m_spiderPriorities[ MAX_FILTERS ];
int32_t m_numMaxSpidersPerRule; // useless, just for Parms::setParm()
int32_t m_maxSpidersPerRule[ MAX_FILTERS ];
// same ip waits now here instead of "page priority"
int32_t m_numSpiderIpWaits; // useless, just for Parms::setParm()
int32_t m_spiderIpWaits[ MAX_FILTERS ];
// same goes for max spiders per ip
int32_t m_numSpiderIpMaxSpiders;
int32_t m_spiderIpMaxSpiders [ MAX_FILTERS ];
int32_t m_numHarvestLinks;
bool m_harvestLinks[ MAX_FILTERS ];
int32_t m_numForceDelete;
char m_forceDelete[ MAX_FILTERS ];
// dummy?
int32_t m_numRegExs9;
bool m_doQueryHighlighting;
// fixed-size buffers of SUMMARYHIGHLIGHTTAGMAXSIZE bytes each
char m_summaryFrontHighlightTag[SUMMARYHIGHLIGHTTAGMAXSIZE] ;
char m_summaryBackHighlightTag [SUMMARYHIGHLIGHTTAGMAXSIZE] ;
SafeBuf m_htmlRoot;
SafeBuf m_htmlHead;
SafeBuf m_htmlTail;
// NOTE(review): m_spiderCollMutex presumably guards m_spiderColl --
// confirm against the users in the .cpp files
class SpiderColl *m_spiderColl;
GbMutex m_spiderCollMutex;
int32_t m_overflow;
int32_t m_overflow2;
int32_t m_maxAddUrlsPerIpDomPerDay;
// . max content length of text/html or text/plain document
// . we will not download, index or store more than this many bytes
int32_t m_maxTextDocLen;
// . max content length of other (pdf, word, xls, ppt, ps)
// . we will not index or store more than this many bytes
int32_t m_maxOtherDocLen;
// . max content length of other (pdf, word, xls, ppt, ps)
// . we will not download more than this many bytes
// . if content would be truncated, we will not even download at all
// because the html converter needs 100% of the doc otherwise it
// will have an error
int32_t m_maxOtherDocDownloadLen;
// . puts <br>s in the summary to keep its width below this
// . but we exceed this width before we would split a word
int32_t m_summaryMaxWidth;
// how long a robots.txt can be in the cache (Msg13.cpp/Robotdb.cpp)
int32_t m_maxRobotsCacheAge;
int32_t m_crawlDelayDefaultForNoRobotsTxtMS;
int32_t m_crawlDelayDefaultForRobotsTxtMS;
// check URL filters for manual ban and force delete?
bool m_checkURLFilters;
// rewrite domain-like queries for this collection?
bool m_modifyDomainLikeSearches;
bool m_domainLikeSearchDisablesSiteCluster;
// rewrite API-like queries?
bool m_modifyAPILikeSearches;
WordVariationsConfig m_word_variations_config;
// read from cache
bool m_rcache;
bool m_hideAllClustered;
// use this not m_bases to get the RdbBase
class RdbBase *getBase(rdbid_t rdbId);
// Rdb.cpp uses this after deleting an RdbBase and adding new one
void setBasePtr(rdbid_t rdbId, class RdbBase *base);
private:
// . now chuck this into CollectionRec instead of having a fixed
// array of them in Rdb.h called m_bases[]
// . leave this out of any copy of course
// . RDB_END2 slots; reached from outside via getBase()/setBasePtr()
class RdbBase *m_bases[RDB_END2];
public:
// for populating the sortbydate table
class Msg5 *m_msg5;
// each Rdb has a tree, so keep the pos/neg key count here so
// that RdbTree does not have to have its own array limited by
// MAX_COLLS which we did away with because we made this dynamic.
int32_t m_numPosKeysInTree[RDB_END2];
int32_t m_numNegKeysInTree[RDB_END2];
};
#endif // GB_COLLECTIONDB_H