Skip to content

Commit d7089f6

Browse files
authored
Merge pull request #66 from farsightsec/next
Next Release (1.5.0)
2 parents 8a7e6ad + acb9b37 commit d7089f6

23 files changed

+645
-78
lines changed

COPYRIGHT

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
Copyright (c) 2022 DomainTools LLC
12
Copyright (c) 2012-2021 by Farsight Security, Inc.
23

34
Licensed under the Apache License, Version 2.0 (the "License");

ChangeLog

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
mtbl (1.5.0)
2+
3+
* Add reader filter function option to mtbl_fileset.
4+
* Use "galloping search" instead of full binary search for mtbl_iter_seek().
5+
* Make index block verification optional for more efficient reader
6+
initialization.
7+
* Fix underflow errors triggered by short keys and empty mtbl files.
8+
* mtbl_fileset_partition() use is deprecated in favor of
9+
mtbl_fileset_dup() with the fname_filter_func option set.
10+
111
mtbl (1.4.0)
212

313
* Add -l option to specify compression level for mtbl_merge.

Makefile.am

+12-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ AM_LDFLAGS =
2323
##
2424
#
2525

26-
LIBMTBL_VERSION_INFO=2:1:1
26+
LIBMTBL_VERSION_INFO=3:0:2
2727

2828
include_HEADERS = mtbl/mtbl.h
2929
lib_LTLIBRARIES = mtbl/libmtbl.la
@@ -123,6 +123,17 @@ t_test_fileset_partition_SOURCES = t/test-fileset-partition.c
123123
t_test_fileset_partition_LDADD = mtbl/libmtbl.la
124124
t/test-fileset-partition.sh: t/test-fileset-partition
125125

126+
TESTS += t/test-fileset-filter.sh
127+
EXTRA_DIST += t/test-fileset-filter.sh
128+
EXTRA_DIST += t/fileset-filter-data/animals.fileset
129+
EXTRA_DIST += t/fileset-filter-data/animals-1.mtbl
130+
EXTRA_DIST += t/fileset-filter-data/animals-2.mtbl
131+
EXTRA_DIST += t/fileset-filter-data/animals-3.mtbl
132+
check_PROGRAMS += t/test-fileset-filter
133+
t_test_fileset_filter_SOURCES = t/test-fileset-filter.c
134+
t_test_fileset_filter_LDADD = mtbl/libmtbl.la
135+
t/test-fileset-filter.sh: t/test-fileset-filter
136+
126137
TESTS += t/test-fixed
127138
check_PROGRAMS += t/test-fixed
128139
t_test_fixed_SOURCES = t/test-fixed.c

configure.ac

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
AC_PREREQ(2.64)
22
AC_INIT([mtbl],
3-
[1.4.0],
3+
[1.5.0],
44
[https://github.com/farsightsec/mtbl/issues],
55
[mtbl],
66
[https://github.com/farsightsec/mtbl])

man/mtbl_fileset.3

+22-3
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
.\" Title: mtbl_fileset
33
.\" Author: [FIXME: author] [see http://docbook.sf.net/el/author]
44
.\" Generator: DocBook XSL Stylesheets v1.79.1 <http://docbook.sf.net/>
5-
.\" Date: 03/27/2019
5+
.\" Date: 11/28/2022
66
.\" Manual: \ \&
77
.\" Source: \ \&
88
.\" Language: English
99
.\"
10-
.TH "MTBL_FILESET" "3" "03/27/2019" "\ \&" "\ \&"
10+
.TH "MTBL_FILESET" "3" "11/28/2022" "\ \&" "\ \&"
1111
.\" -----------------------------------------------------------------
1212
.\" * Define some portability stuff
1313
.\" -----------------------------------------------------------------
@@ -112,6 +112,14 @@ mtbl_fileset_options_set_filename_filter_func(
112112
.sp
113113
.nf
114114
\fBvoid
115+
mtbl_fileset_options_set_reader_filter_func(
116+
struct mtbl_fileset_options *\fR\fB\fIfopt\fR\fR\fB,
117+
mtbl_reader_filter_func \fR\fB\fIfp\fR\fR\fB,
118+
void *\fR\fB\fIclos\fR\fR\fB);\fR
119+
.fi
120+
.sp
121+
.nf
122+
\fBvoid
115123
mtbl_fileset_options_set_reload_interval(
116124
struct mtbl_fileset_options *\fR\fB\fIfopt\fR\fR\fB,
117125
uint32_t \fR\fB\fIreload_interval\fR\fR\fB);\fR
@@ -134,7 +142,7 @@ Accesses via the \fBmtbl_source\fR(3) interface will implicitly check for update
134142
.sp
135143
The \fBmtbl_fileset_reload\fR() function avoids checking for updates more frequently than every \fIreload_interval\fR seconds\&. If \fBreload_interval\fR is set to \fBMTBL_FILESET_RELOAD_INTERVAL_NEVER\fR, then the \fBmtbl_fileset_reload\fR() function will only load the fileset once\&. The \fBmtbl_fileset_reload_now\fR() function can be called to bypass the \fIreload_interval\fR check\&.
136144
.sp
137-
The \fBmtbl_fileset_partition\fR() function yields two \fBstruct mtbl_merger\fR objects that are split based on the output of a callback\&. The caller is responsible for calling \fBmtbl_merger_destroy\fR() on each of these mergers\&. Calls to \fBmtbl_source_*\fR() on the fileset\(cqs source object, and calls to \fBmtbl_fileset_reload\fR() and \fBmtbl_fileset_reload_now\fR() may leave these mergers in an inconsistent state\&.
145+
The \fBmtbl_fileset_partition\fR() function yields two \fBstruct mtbl_merger\fR objects that are split based on the output of a callback\&. The caller is responsible for calling \fBmtbl_merger_destroy\fR() on each of these mergers\&. Calls to \fBmtbl_source_*\fR() on the fileset\(cqs source object, and calls to \fBmtbl_fileset_reload\fR() and \fBmtbl_fileset_reload_now\fR(), including those implicitly performed by operations on the fileset source, may leave these mergers in an inconsistent state\&. For this reason, \fBmtbl_fileset_partition\fR() use is deprecated in favor of \fBmtbl_fileset_dup\fR() with the \fBfname_filter_func\fR option set\&.
138146
.SS "Fileset options"
139147
.sp
140148
.it 1 an-trap
@@ -175,6 +183,17 @@ Used to filter specific files by name from a fileset\&. If the function returns
175183
.nr an-break-flag 1
176184
.br
177185
.ps +1
186+
\fBreader_filter_func\fR
187+
.RS 4
188+
.sp
189+
Used to filter specific readers from a fileset\&. If the function returns \fBfalse\fR, the reader\(cqs data will not be included in the results returned by any iterators on the fileset\&.
190+
.RE
191+
.sp
192+
.it 1 an-trap
193+
.nr an-no-space-flag 1
194+
.nr an-break-flag 1
195+
.br
196+
.ps +1
178197
\fBreload_interval\fR
179198
.RS 4
180199
.sp

man/mtbl_fileset.3.txt

+16-2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ mtbl_fileset_options_set_filename_filter_func(
7373
mtbl_filename_filter_func 'fp',
7474
void *'clos');^
7575

76+
[verse]
77+
^void
78+
mtbl_fileset_options_set_reader_filter_func(
79+
struct mtbl_fileset_options *'fopt',
80+
mtbl_reader_filter_func 'fp',
81+
void *'clos');^
82+
7683
[verse]
7784
^void
7885
mtbl_fileset_options_set_reload_interval(
@@ -134,8 +141,11 @@ The ^mtbl_fileset_partition^() function yields two ^struct mtbl_merger^
134141
objects that are split based on the output of a callback. The caller is
135142
responsible for calling ^mtbl_merger_destroy^() on each of these mergers.
136143
Calls to ^mtbl_source_*^() on the fileset's source object, and calls to
137-
^mtbl_fileset_reload^() and ^mtbl_fileset_reload_now^() may leave these
138-
mergers in an inconsistent state.
144+
^mtbl_fileset_reload^() and ^mtbl_fileset_reload_now^(), including those
145+
implicitly performed by operations on the fileset source, may leave these
146+
mergers in an inconsistent state. For this reason, ^mtbl_fileset_partition^()
147+
use is deprecated in favor of ^mtbl_fileset_dup^() with the ^fname_filter_func^
148+
option set.
139149

140150
=== Fileset options ===
141151

@@ -150,6 +160,10 @@ See ^mtbl_merger^(3). Used to sort the entries with duplicate keys during the me
150160
Used to filter specific files by name from a fileset. If the function returns ^false^, the file's data
151161
will not be included in the results returned by any iterators on the fileset.
152162

163+
==== reader_filter_func ====
164+
Used to filter specific readers from a fileset. If the function returns ^false^, the reader's data
165+
will not be included in the results returned by any iterators on the fileset.
166+
153167
==== reload_interval ====
154168
Specifies the interval between checks for updates to the setfile, in seconds.
155169
Defaults to 60 seconds. ^MTBL_FILESET_RELOAD_INTERVAL_NEVER^ is a special value that indicates to never reload the fileset.

man/mtbl_reader.3

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
'\" t
22
.\" Title: mtbl_reader
33
.\" Author: [FIXME: author] [see http://docbook.sf.net/el/author]
4-
.\" Generator: DocBook XSL Stylesheets v1.78.1 <http://docbook.sf.net/>
5-
.\" Date: 02/03/2015
4+
.\" Generator: DocBook XSL Stylesheets v1.79.1 <http://docbook.sf.net/>
5+
.\" Date: 11/14/2022
66
.\" Manual: \ \&
77
.\" Source: \ \&
88
.\" Language: English
99
.\"
10-
.TH "MTBL_READER" "3" "02/03/2015" "\ \&" "\ \&"
10+
.TH "MTBL_READER" "3" "11/14/2022" "\ \&" "\ \&"
1111
.\" -----------------------------------------------------------------
1212
.\" * Define some portability stuff
1313
.\" -----------------------------------------------------------------
@@ -104,7 +104,7 @@ File\-level metadata may be accessed using the \fBmtbl_metadata\fR(3) interface,
104104
\fBverify_checksums\fR
105105
.RS 4
106106
.sp
107-
Specifies whether or not the CRC32C checksum on each data block should be verified or not\&. If \fIverify_checksums\fR is enabled, a checksum mismatch will cause a runtime error\&. Note that the checksum on the index block is always verified, since the overhead of doing this once when the reader object is instantiated is minimal\&. The default is to not verify data block checksums\&.
107+
Specifies whether or not the CRC32C checksum on the index block and each data block should be verified\&. If \fIverify_checksums\fR is enabled, a checksum mismatch will cause a fatal runtime error\&. The default is to not verify index or data block checksums\&.
108108
.RE
109109
.sp
110110
.it 1 an-trap

man/mtbl_reader.3.txt

+4-5
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,10 @@ metadata object is valid only as long as the reader object exists.
7676

7777
==== verify_checksums ====
7878

79-
Specifies whether or not the CRC32C checksum on each data block should be
80-
verified or not. If _verify_checksums_ is enabled, a checksum mismatch will
81-
cause a runtime error. Note that the checksum on the index block is always
82-
verified, since the overhead of doing this once when the reader object is
83-
instantiated is minimal. The default is to not verify data block checksums.
79+
Specifies whether or not the CRC32C checksum on the index block and each
80+
data block should be verified. If _verify_checksums_ is enabled, a
81+
checksum mismatch will cause a fatal runtime error. The default is to not verify
82+
index or data block checksums.
8483

8584
==== madvise_random ====
8685

mtbl.spec

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Name: mtbl
2-
Version: 1.3.0
2+
Version: 1.5.0
33
Release: 1%{?dist}
44
Summary: immutable sorted string table utilities
55

@@ -17,7 +17,7 @@ mtbl is a C library implementation of the Sorted String Table (SSTable)
1717
data structure. mtbl exposes primitives for creating, searching and
1818
merging SSTable files.
1919

20-
This package contains the shared library for libmbtl and the mtbl
20+
This package contains the shared library for libmtbl and the mtbl
2121
command-line tools.
2222

2323
%package devel

mtbl/block.c

+41-15
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
2-
* Copyright (c) 2012, 2014 by Farsight Security, Inc.
2+
* Copyright (c) 2022 DomainTools LLC
3+
* Copyright (c) 2012, 2014, 2017 by Farsight Security, Inc.
34
*
45
* Licensed under the Apache License, Version 2.0 (the "License");
56
* you may not use this file except in compliance with the License.
@@ -249,33 +250,58 @@ block_iter_seek_to_last(struct block_iter *bi)
249250
}
250251
}
251252

252-
void
253+
static int
254+
compare_restart_point(struct block_iter *bi, const uint32_t i, const uint8_t *target, size_t target_len) {
255+
uint32_t shared, non_shared, value_length;
256+
uint64_t region_offset = get_restart_point(bi, i);
257+
const uint8_t *key_ptr = decode_entry(bi->data + region_offset,
258+
bi->data + bi->restarts,
259+
&shared, &non_shared, &value_length);
260+
/* check for corruption */
261+
assert(key_ptr != NULL && shared == 0);
262+
return bytes_compare(key_ptr, non_shared, target, target_len);
263+
}
264+
265+
void
253266
block_iter_seek(struct block_iter *bi, const uint8_t *target, size_t target_len)
254267
{
255-
/* binary search in restart array to find the first restart point
256-
* with a key >= target
268+
/*
269+
* If the restart_index is not zero and not equal to the number of
270+
* restarts, then begin with galloping search in the restart array to find
271+
* the first restart point with a key >= target, otherwise just do binary
272+
* search from the start of the restart array
257273
*/
258274
uint32_t left = 0;
259275
uint32_t right = bi->num_restarts - 1;
276+
if (bi->num_restarts != bi->restart_index && bi->restart_index != 0) {
277+
/* Start galloping from the current restart index */
278+
uint32_t i = bi->restart_index;
279+
right = i;
280+
uint32_t incr = 1;
281+
while (compare_restart_point(bi, i, target, target_len) < 0) {
282+
left = i;
283+
i += incr;
284+
/* Stop galloping if i is past the end of the restart array */
285+
if (i > bi->num_restarts - 1) {
286+
right = bi->num_restarts - 1;
287+
break;
288+
}
289+
right = i;
290+
incr *= 2;
291+
}
292+
}
293+
294+
/* binary search */
260295
while (left < right) {
261296
uint32_t mid = (left + right + 1) / 2;
262-
uint64_t region_offset = get_restart_point(bi, mid);
263-
uint32_t shared, non_shared, value_length;
264-
const uint8_t *key_ptr = decode_entry(bi->data + region_offset,
265-
bi->data + bi->restarts,
266-
&shared, &non_shared, &value_length);
267-
if (key_ptr == NULL || (shared != 0)) {
268-
/* corruption */
269-
return;
270-
}
271-
if (bytes_compare(key_ptr, non_shared, target, target_len) < 0) {
297+
if (compare_restart_point(bi, mid, target, target_len) < 0) {
272298
/* key at "mid" is smaller than "target", therefore all
273299
* keys before "mid" are uninteresting
274300
*/
275301
left = mid;
276302
} else {
277303
/* key at "mid" is >= "target", therefore all
278-
* keys at or before "mid" are uninteresting
304+
* keys at or after "mid" are uninteresting
279305
*/
280306
right = mid - 1;
281307
}

mtbl/bytes.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ bytes_shortest_separator(ubuf *start, const uint8_t *limit, size_t len_limit)
2222
if (diff_byte < 0xFF && diff_byte + 1 < limit[diff_index]) {
2323
ubuf_data(start)[diff_index]++;
2424
ubuf_clip(start, diff_index + 1);
25-
} else if (diff_index < min_length - sizeof(uint16_t)) {
25+
} else if (diff_index + sizeof(uint16_t) < min_length) {
2626
/* awww yeah, big endian arithmetic on strings */
2727
uint16_t u_start, u_limit, u_between;
2828
memcpy(&u_start, &ubuf_data(start)[diff_index], sizeof(u_start));

mtbl/fileset.c

+32-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright (c) 2022 DomainTools LLC
23
* Copyright (c) 2012-2019 by Farsight Security, Inc.
34
*
45
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,6 +30,8 @@ struct mtbl_fileset_options {
2930
void *dupsort_clos;
3031
mtbl_filename_filter_func fname_filter;
3132
void *fname_filter_clos;
33+
mtbl_reader_filter_func reader_filter;
34+
void *reader_filter_clos;
3235
};
3336

3437
struct shared_fileset {
@@ -48,6 +51,8 @@ struct mtbl_fileset {
4851
struct mtbl_source *source;
4952
mtbl_filename_filter_func fname_filter;
5053
void *fname_filter_clos;
54+
mtbl_reader_filter_func reader_filter;
55+
void *reader_filter_clos;
5156
};
5257

5358
struct fileset_iter {
@@ -180,6 +185,14 @@ mtbl_fileset_options_set_filename_filter_func(struct mtbl_fileset_options *opt,
180185
opt->fname_filter_clos = clos;
181186
}
182187

188+
void
189+
mtbl_fileset_options_set_reader_filter_func(struct mtbl_fileset_options *opt,
190+
mtbl_reader_filter_func reader_filter, void *clos)
191+
{
192+
opt->reader_filter = reader_filter;
193+
opt->reader_filter_clos = clos;
194+
}
195+
183196
void
184197
mtbl_fileset_options_set_reload_interval(struct mtbl_fileset_options *opt,
185198
uint32_t reload_interval)
@@ -215,6 +228,8 @@ mtbl_fileset_set_options(struct mtbl_fileset *f, const struct mtbl_fileset_optio
215228
mtbl_merger_options_set_dupsort_func(f->mopt, opt->dupsort, opt->dupsort_clos);
216229
f->fname_filter = opt->fname_filter;
217230
f->fname_filter_clos = opt->fname_filter_clos;
231+
f->reader_filter = opt->reader_filter;
232+
f->reader_filter_clos = opt->reader_filter_clos;
218233
f->merger = mtbl_merger_init(f->mopt);
219234
f->source = mtbl_source_init(fileset_source_iter,
220235
fileset_source_get,
@@ -290,11 +305,25 @@ fs_reinit_merger(struct mtbl_fileset *f)
290305
f->merger = mtbl_merger_init(f->mopt);
291306
}
292307
assert(f->merger != NULL);
293-
while (my_fileset_get(f->shared_fs->my_fs, i++, &fname, (void **) &reader))
294-
if ((reader != NULL) && ((f->fname_filter == NULL) ||
295-
f->fname_filter(fname, f->fname_filter_clos))) {
308+
while (my_fileset_get(f->shared_fs->my_fs, i++, &fname, (void **) &reader)) {
309+
if (reader == NULL) {
310+
continue;
311+
}
312+
313+
/*
314+
* Add the reader's source to the merger unless filtered out by the
315+
* fname_filter or reader_filter.
316+
*/
317+
if (
318+
(f->fname_filter == NULL
319+
|| f->fname_filter(fname, f->fname_filter_clos))
320+
&&
321+
(f->reader_filter == NULL
322+
|| f->reader_filter(reader, f->reader_filter_clos))
323+
) {
296324
mtbl_merger_add_source(f->merger, mtbl_reader_source(reader));
297325
}
326+
}
298327
}
299328

300329
void

mtbl/libmtbl.sym

+5
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,8 @@ global:
9898
mtbl_fileset_options_set_dupsort_func;
9999
mtbl_fileset_dup;
100100
} LIBMTBL_1.1.0;
101+
102+
LIBMTBL_1.4.0 {
103+
global:
104+
mtbl_fileset_options_set_reader_filter_func;
105+
} LIBMTBL_1.2.0;

0 commit comments

Comments
 (0)