-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBits.h
130 lines (101 loc) · 3.91 KB
/
Bits.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Matt Wells, copyright Jun 2001
// . each word has several bits of information we like to keep track of
// . these bits are used for making phrases in Phrases.h
// . also used by spam detector in Spam.h
// . TODO: rename this class to PhraseBits
// . TODO: separate words in phrases w/ period OR space so a search for
// "chicken.rib" gives you the renderman file, not a recipe or something
#ifndef GB_BITS_H
#define GB_BITS_H
#include <inttypes.h>
//
// Bits set by Bits::set() and available via queryBits/clearBits/assignBits/setBits
// . these flags are stored in a wbit_t, one per word
// . they share their low numeric values with the summary flags used by
//   setForSummary(), so never mix the two sets in one bitmask
// . no punctuation or "big" numbers can be in a phrase
#define D_CAN_BE_IN_PHRASE 0x01
// . is this word a stop word?
#define D_IS_STOPWORD 0x02
// . used for phrasing
// . can we continue forming our phrase after this word?
// . some punctuation words and all stop words can be paired across
#define D_CAN_PAIR_ACROSS 0x04
// . set by Sections.cpp::setMenu() function
#define D_IN_LINK 0x08
// . is this word potentially part of a URL?
#define D_IS_IN_URL 0x10
// . if adding more bits make sure they fit in 8 bits or extend the wbit_t
//   type below
// . 0x20, 0x40 and 0x80 are the values still unused in this list
//
// Bits set by Bits::setForSummary() and available via querySWBits/setSWBits
// . these flags are stored in a swbit_t, one per word
// . their numeric values overlap with the flags used by Bits::set(), so
//   never mix the two sets in one bitmask
//
// . is this word a strong connector?
// . used by Summary.cpp so we don't split strongly connected things
// . right now, just single character punctuation that is not a space
// . i don't want to split possessive words at the apostrophe, or split
//   ip addresses at the period, etc. applies to unicode as well.
#define D_IS_STRONG_CONNECTOR 0x0001
// . does it start a sentence?
// . if our summary excerpt starts with this then it will get bonus points
#define D_STARTS_SENTENCE 0x0002
// . or does it start a sentence fragment, like after a comma or something
// . the summary excerpt will get *some* bonus points for this
#define D_STARTS_FRAGMENT 0x0004
// . does this word have a quote right before it?
#define D_IN_QUOTES 0x0008
// . more bits so we can get rid of Summary::setSummaryScores() so that
//   Summary::getBestWindow() just uses these bits to score the window now
#define D_IN_TITLE 0x0010
#define D_IN_PARENTHESES 0x0020
#define D_IN_HYPERLINK 0x0040
#define D_IN_BOLDORITALICS 0x0080
#define D_IN_LIST 0x0100
#define D_IN_SUP 0x0200
#define D_IN_PARAGRAPH 0x0400
#define D_IN_BLOCKQUOTE 0x0800
// . for Summary.cpp
#define D_USED 0x1000
// . if adding more bits make sure they fit in 16 bits or extend the swbit_t
//   type below
// . 0x2000, 0x4000 and 0x8000 are the values still unused in this list
// . bitmask type for normal bits (the 8-bit D_* flags, 0x01 style values)
// . widen this type if a flag above 0x80 is ever added
typedef uint8_t wbit_t;
// . bitmask type for summary bits (the 16-bit D_* flags, 0x0001 style values)
// . widen this type if a flag above 0x8000 is ever added
typedef uint16_t swbit_t;
// . only pointers to these classes appear in this header, so forward
//   declarations are enough
class TokenizerResult;
class Sections;
// . holds one wbit_t and/or one swbit_t of D_* flags per token
// . word bits (m_bits) are exposed via queryBits/assignBits/setBits/clearBits
// . summary bits (m_swbits) are exposed via querySWBits/setSWBits
// . NOTE: none of the inline accessors below range-check their index
class Bits {
public:
	Bits();
	~Bits();

	// . returns false and sets errno on error
	bool set(const TokenizerResult *tr);

	// . the summary-bit counterpart of set(); see m_swbits below
	bool setForSummary(const TokenizerResult *tr);

	void reset();

	// . single-flag tests on the word bits of token #i
	bool isStopWord(int32_t i) const {
		return (m_bits[i] & D_IS_STOPWORD) != 0;
	}

	bool canBeInPhrase(int32_t i) const {
		return (m_bits[i] & D_CAN_BE_IN_PHRASE) != 0;
	}

	bool canPairAcross(int32_t i) const {
		return (m_bits[i] & D_CAN_PAIR_ACROSS) != 0;
	}

	void setInLinkBits(const Sections *ss);
	void setInUrlBits();

	// . raw access to the word bits of token #i
	// . queryBits  returns the whole mask
	// . assignBits overwrites the whole mask with b
	// . setBits    turns on every flag present in b
	// . clearBits  turns off every flag present in b
	wbit_t queryBits(unsigned i) const {
		return m_bits[i];
	}

	void assignBits(unsigned i, wbit_t b) {
		m_bits[i] = b;
	}

	void setBits(unsigned i, wbit_t b) {
		m_bits[i] = static_cast<wbit_t>(m_bits[i] | b);
	}

	void clearBits(unsigned i, wbit_t b) {
		m_bits[i] = static_cast<wbit_t>(m_bits[i] & ~b);
	}

	// . raw access to the summary bits of token #i
	// . there is no clear/assign counterpart for summary bits
	swbit_t querySWBits(unsigned i) const {
		return m_swbits[i];
	}

	void setSWBits(unsigned i, swbit_t b) {
		m_swbits[i] = static_cast<swbit_t>(m_swbits[i] | b);
	}

private:
	const TokenizerResult *m_tr;

	// . the word bits, indexed by token number
	// . NOTE(review): whether m_bitsSize counts bytes or entries is not
	//   visible from this header; confirm in Bits.cpp
	wbit_t *m_bits;
	int32_t m_bitsSize;

	// . the summary bits ("wordbits"), indexed by token number
	// . used only by setForSummary() now to avoid having to update a lot
	//   of code
	swbit_t *m_swbits;
	int32_t m_swbitsSize;

	// . presumably remember that setInLinkBits()/setInUrlBits() already
	//   ran; confirm in Bits.cpp
	bool m_inLinkBitsSet;
	bool m_inUrlBitsSet;

	// . inline storage with room for 10 wbit_t entries
	// . presumably lets short inputs skip a heap allocation; confirm in
	//   Bits.cpp
	char m_localBuf[sizeof(wbit_t) * 10];

	// . compute the word bits for token #i
	// . presumably one handles alphanumeric tokens and the other handles
	//   everything else; confirm in Bits.cpp
	wbit_t getAlnumBits(int32_t i) const;
	wbit_t getNonAlnumBits(unsigned i) const;
};
#endif // GB_BITS_H