// Clusterdb.h
// (forked from gigablast/open-source-search-engine)
// Copyright Matt Wells, Jul 2002
// . a clusterRec now no longer exists, per se
// . it is the same thing as the key of the titleRec in titledb
// . titleRecs now contain the site and content hashes in the low bits
// of their key.
// . this allows us to store much cluster info in Titledb's RdbMap
// . so to get cluster info, just read in the titleRec, you do not even
// need to uncompress it, just get the info from its key
// . we still use the cache here, however, to cache the keys (clusterRecs)
// . later, i may have to do some fancy footwork if we want to store all
// clusterRecs (titleKeys) in memory.
// . TODO: what if stored file offsets in tfndb, too, then titledb RdbMap
// would not be necessary?
//
// . clusterdb will now serve to help do fast site clustering by retaining
// docids and site hashes in memory
//
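// . key layout (96 bits, most significant bits first):
//   00000000 00000000 0000000d dddddddd  d = docid
//   dddddddd dddddddd dddddddd dddddfll  f = family filter bit
//   llllssss ssssssss ssssssss sssssshz  l = language bits
//                                        s = site hash
//                                        h = half bit
//                                        z = del bit
//   (q = year quarter bits: listed in the original legend, but no q bits
//    appear in the layout above)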
#ifndef _CLUSTERDB_H_
#define _CLUSTERDB_H_
//#include "TitleRec.h" // SAMPLE_VECTOR_SIZE
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Titledb.h"
#include "DiskPageCache.h"
// size in bytes of a cluster rec: these are now just TitleRec keys, so a
// rec is exactly one key_t
#define CLUSTER_REC_SIZE (sizeof(key_t))
// size in bytes of a vector rec: one key_t plus the sample vector and, now,
// the gigabit vector as well
// NOTE(review): SAMPLE_VECTOR_SIZE and GIGABIT_VECTOR_SIZE are not defined in
// this header and the TitleRec.h include is commented out -- confirm they
// are still provided by another header before relying on this macro
#define VECTOR_REC_SIZE (sizeof(key_t)+SAMPLE_VECTOR_SIZE+GIGABIT_VECTOR_SIZE)
class Clusterdb {
public:
// reset rdb
void reset();
// set up our private rdb
bool init ( );
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool init2 ( long treeMem );
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
Rdb *getRdb ( ) { return &m_rdb; };
// make the cluster rec
void makeRecFromTitleRec ( char *rec,
class TitleRec *titleRec,
bool isDelKey );
// make the cluster rec
void makeRecFromTitleRecKey ( char *rec,
char *key,
bool isDelKey );
// make the cluster rec key
key_t makeClusterRecKey ( long long docId,
bool familyFilter,
uint8_t languageBits,
long siteHash,
bool isDelKey,
bool isHalfKey = false );
key_t makeFirstClusterRecKey ( long long docId ) {
return makeClusterRecKey ( docId, false, 0, 0, true ); };
key_t makeLastClusterRecKey ( long long docId ) {
return makeClusterRecKey ( docId, true, 0xff, 0xffffffff,
false, true ); };
// convert a titlerec key into a clusterec key
key_t convertTitleRecKey ( key_t titleKey );
/*
unsigned long getGroupId ( long long docId ) {
return g_titledb.getGroupId ( docId ); };
// cluster rec should be stored on same host as titleRec with the
// same docId that this key contains
unsigned long getGroupIdFromKey ( key_t *key ) {
return g_titledb.getGroupId ( getDocId ( *key ) ); };
*/
// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
// // docId occupies the most significant bytes of the key
// now docId occupies the bits after the first 23
long long getDocId ( void *k ) {
//long long docId = (k.n0) >> (32+24);
//docId |= ( ((unsigned long long)(k.n1)) << 8 );
long long docId = (((key_t *)k)->n0) >> 35;
docId |= ( ((unsigned long long)(((key_t *)k)->n1)) << 29 );
return docId;
};
//long long getDocId ( char *r ) {
// return getDocId(*(key_t*)r);
//}
unsigned long getSiteHash26 ( char *r ) {
//return g_titledb.getSiteHash ( (key_t *)r ); };
return ((unsigned long)(((key_t*)r)->n0 >> 2) & 0x03FFFFFF);
};
unsigned long hasAdultContent ( char *r ) {
//return g_titledb.hasAdultContent ( *(key_t *)r ); };
return ((unsigned long)(((key_t*)r)->n0 >> 34) & 0x00000001);
};
unsigned char getLanguage ( char *r ) {
return ((unsigned char)(((key_t*)r)->n0 >> 28) & 0x0000003F);
}
// NOTE: THESE USE THE OLD "CLUSTERDB" REC GENERATED BY MSG22 (VECTOR)
//unsigned long getContentHash ( char *r ) {
// return g_titledb.getContentHash ( *(key_t *)r ); };
char getFamilyFilter ( char *r ) {
if ( (*(long long *)r) & 0x0000000400000000LL ) return 1;
return 0;
};
//unsigned long hasAdultWords ( char *r ) {
// return g_titledb.hasAdultWords ( *(key_t *)r ); };
//unsigned long hasAdultCategory ( char *r ) {
// return g_titledb.hasAdultCategory ( *(key_t *)r ); };
//unsigned char getLanguageFromVector ( char *r ) {
// return 0;
//}
// the random sample vector
/*
void getSampleVector ( char *vec ,
class Doc *doc,
char *coll ,
long collLen ,
long niceness = 0 );
*/
//void getSampleVector ( char *vec , class TermTable *table );
char getSampleSimilarity ( char *vec0 , char *vec1 , long size );
// get the content vector from a cluster rec (used by Msg38.cpp)
//char *getSampleVector ( char *rec ) { return rec + sizeof(key_t); };
//char *getGigabitVector ( char *rec ) {
// return rec + sizeof(key_t) + SAMPLE_VECTOR_SIZE ; };
//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
// long *qtable , long numSlots ) ;
DiskPageCache *getDiskPageCache() { return &m_pc; };
private:
// this rdb holds urls waiting to be spidered or being spidered
Rdb m_rdb;
DiskPageCache m_pc;
};
// the global clusterdb instance (declared here, defined elsewhere)
extern class Clusterdb g_clusterdb;
// a second global instance
// NOTE(review): presumably the rebuild/secondary one that is set up through
// init2() for PageRepair.cpp -- confirm
extern class Clusterdb g_clusterdb2;
#endif