Skip to content

Commit f0eb304

Browse files
authored
Upgrade datasketches lib from 4.1.0 to 5.0.2 (#713)
1 parent 1cd7926 commit f0eb304

24 files changed

+657
-366
lines changed

3rd/datasketches/common/CMakeLists.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,10 @@ target_sources(common
3737
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
3838
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_forward.hpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/include/ceiling_power_of_2.hpp
40-
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
40+
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov.hpp
4141
${CMAKE_CURRENT_SOURCE_DIR}/include/kolmogorov_smirnov_impl.hpp
4242
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view.hpp
4343
${CMAKE_CURRENT_SOURCE_DIR}/include/quantiles_sorted_view_impl.hpp
44-
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
44+
${CMAKE_CURRENT_SOURCE_DIR}/include/optional.hpp
45+
${CMAKE_CURRENT_SOURCE_DIR}/include/version.hpp.in
4546
)

3rd/datasketches/common/include/common_defs.hpp

+11-8
Original file line numberDiff line numberDiff line change
@@ -28,27 +28,30 @@
2828
#include <chrono>
2929
#include <thread>
3030

31+
/// DataSketches namespace
3132
namespace datasketches {
3233

3334
static const uint64_t DEFAULT_SEED = 9001;
3435

3536
enum resize_factor { X1 = 0, X2, X4, X8 };
3637

37-
template<typename A> using AllocChar = typename std::allocator_traits<A>::template rebind_alloc<char>;
38-
template<typename A> using string = std::basic_string<char, std::char_traits<char>, AllocChar<A>>;
39-
40-
// thread-safe random bit
41-
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
42-
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
43-
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));
38+
template<typename A> using string = std::basic_string<char, std::char_traits<char>, typename std::allocator_traits<A>::template rebind_alloc<char>>;
4439

4540
// common random declarations
4641
namespace random_utils {
4742
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
4843
static thread_local std::mt19937_64 rand(rd());
4944
static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
50-
}
5145

46+
// thread-safe random bit
47+
static thread_local std::independent_bits_engine<std::mt19937, 1, uint32_t>
48+
random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()
49+
+ std::hash<std::thread::id>{}(std::this_thread::get_id())));
50+
51+
// Re-seeds the thread_local std::mt19937_64 engine `rand` declared above in this
// namespace with the given value. Because `rand` is thread_local, only the calling
// thread's engine is affected. The `random_bit` engine is not touched by this call.
inline void override_seed(uint64_t s) {
52+
rand.seed(s);
53+
}
54+
}
5255

5356
// utility function to hide unused compiler warning
5457
// usually has no additional cost

3rd/datasketches/common/include/count_zeros.hpp

-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222

2323
#include <cstdint>
2424

25-
#include <stdio.h>
26-
2725
namespace datasketches {
2826

2927
static const uint8_t byte_leading_zeros_table[256] = {

3rd/datasketches/common/include/kolmogorov_smirnov.hpp

+9-6
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,16 @@
2222

2323
namespace datasketches {
2424

25+
/**
26+
* Kolmogorov-Smirnov test for KLL or Quantiles sketches
27+
*/
2528
class kolmogorov_smirnov {
2629
public:
2730
/**
2831
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
2932
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
30-
* @param sketch1 KLL sketch 1
31-
* @param sketch2 KLL sketch 2
33+
* @param sketch1 sketch 1
34+
* @param sketch2 sketch 2
3235
* @return the raw delta between the two quantile sketches
3336
*/
3437
template<typename Sketch>
@@ -39,8 +42,8 @@ class kolmogorov_smirnov {
3942
* Adjusts the computed threshold by the error epsilons of the two given sketches.
4043
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
4144
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
42-
* @param sketch1 KLL sketch 1
43-
* @param sketch2 KLL sketch 2
45+
* @param sketch1 sketch 1
46+
* @param sketch2 sketch 2
4447
* @param p Target p-value. Typically .001 to .1, e.g., .05.
4548
* @return the adjusted threshold to be compared with the raw delta
4649
*/
@@ -52,8 +55,8 @@ class kolmogorov_smirnov {
5255
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
5356
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
5457
* this will return false.
55-
* @param sketch1 KLL sketch 1
56-
* @param sketch2 KLL sketch 2
58+
* @param sketch1 sketch 1
59+
* @param sketch2 sketch 2
5760
* @param p Target p-value. Typically .001 to .1, e.g., .05.
5861
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
5962
* reflect the same underlying distribution) using the provided p-value.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#ifndef _OPTIONAL_HPP_
21+
#define _OPTIONAL_HPP_
22+
23+
// This is a simplistic substitute for std::optional until we require C++17
24+
25+
#if (__cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L))
26+
#include <optional>
27+
using std::optional;
28+
#else
29+
30+
#include <new>
#include <type_traits>
#include <utility>
31+
32+
namespace datasketches {
33+
34+
/**
 * Simplistic substitute for std::optional for builds that predate C++17.
 *
 * Holds either no value or exactly one value of type T, stored in place
 * (inside an anonymous union) with a flag recording whether it is alive.
 * Accessors perform no checks: the caller must test the optional with
 * operator bool before dereferencing it.
 *
 * @tparam T type of the contained value
 */
template<typename T>
class optional {
public:

  /// Creates an empty optional (no contained value).
  optional() noexcept: initialized_(false) {}

  /// Creates an optional holding a copy of the given value.
  optional(const T& value) noexcept(std::is_nothrow_copy_constructible<T>::value) {
    new (&value_) T(value);
    initialized_ = true;
  }

  /// Creates an optional holding a value move-constructed from the given one.
  optional(T&& value) noexcept(std::is_nothrow_move_constructible<T>::value) {
    new (&value_) T(std::move(value));
    initialized_ = true;
  }

  /**
   * Conversion from an optional of a compatible type.
   * The contained value (if any) is copy-converted from const TT&, so the
   * exception specification is computed for exactly that construction.
   * @param other optional to convert from
   */
  template<typename TT>
  optional(const optional<TT>& other) noexcept(std::is_nothrow_constructible<T, const TT&>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /// Copy constructor: copies the contained value if the other optional has one.
  optional(const optional& other) noexcept(std::is_nothrow_copy_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(other.value_);
      initialized_ = true;
    }
  }

  /**
   * Move constructor: move-constructs the contained value if the other optional has one.
   * The other optional keeps holding its (moved-from) value.
   */
  optional(optional&& other) noexcept(std::is_nothrow_move_constructible<T>::value): initialized_(false) {
    if (other.initialized_) {
      new (&value_) T(std::move(other.value_));
      initialized_ = true;
    }
  }

  /// Destroys the contained value, if any.
  ~optional() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
  }

  /// @return true if a value is held
  explicit operator bool() const noexcept {
    return initialized_;
  }

  /**
   * Copy assignment.
   * Assigns into the existing value when both sides hold one, constructs a new
   * value when only the other side holds one, and resets when only this side does.
   * @param other optional to copy from
   * @return reference to this optional
   */
  optional& operator=(const optional& other)
      noexcept(std::is_nothrow_copy_constructible<T>::value && std::is_nothrow_copy_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = other.value_;
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(other.value_);
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Move assignment.
   * Same case analysis as copy assignment, but the value is moved.
   * The other optional keeps holding its (moved-from) value.
   * @param other optional to move from
   * @return reference to this optional
   */
  optional& operator=(optional&& other)
      noexcept(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value) {
    if (initialized_) {
      if (other.initialized_) {
        value_ = std::move(other.value_);
      } else {
        reset();
      }
    } else {
      if (other.initialized_) {
        new (&value_) T(std::move(other.value_));
        initialized_ = true;
      }
    }
    return *this;
  }

  /**
   * Constructs the contained value in place from the given arguments.
   * Any value currently held is destroyed first, so its resources are released
   * instead of being overwritten. Arguments are perfectly forwarded, which allows
   * move-only arguments and avoids copying rvalues.
   * If the constructor of T throws, the optional is left empty.
   * @param args arguments forwarded to the constructor of T
   */
  template<typename... Args>
  void emplace(Args&&... args)
      noexcept(std::is_nothrow_constructible<T, Args...>::value && std::is_nothrow_destructible<T>::value) {
    reset(); // destroy the previous value (if any) before reusing the storage
    new (&value_) T(std::forward<Args>(args)...);
    initialized_ = true;
  }

  // Unchecked access to the contained value: no test of the initialized flag is made.
  T& operator*() & noexcept { return value_; }
  const T& operator*() const & noexcept { return value_; }
  T&& operator*() && noexcept { return std::move(value_); }
  const T&& operator*() const && noexcept { return std::move(value_); }

  // Unchecked member access to the contained value.
  T* operator->() noexcept { return &value_; }
  const T* operator->() const noexcept { return &value_; }

  /// Destroys the contained value (if any) and leaves the optional empty.
  void reset() noexcept(std::is_nothrow_destructible<T>::value) {
    if (initialized_) value_.~T();
    initialized_ = false;
  }

private:
  // anonymous union provides raw storage: value_ is constructed and destroyed manually
  union {
    T value_;
  };
  bool initialized_;

  // for converting constructor
  template<typename TT> friend class optional;
};
143+
144+
} // namespace
145+
146+
#endif // C++17
147+
148+
#endif // _OPTIONAL_HPP_

3rd/datasketches/common/include/quantiles_sorted_view.hpp

+95-2
Original file line numberDiff line numberDiff line change
@@ -27,37 +27,129 @@
2727

2828
namespace datasketches {
2929

30+
/**
31+
* Sorted view for quantiles sketches (REQ, KLL and Quantiles)
32+
*/
3033
template<
3134
typename T,
3235
typename Comparator, // strict weak ordering function (see C++ named requirements: Compare)
3336
typename Allocator
3437
>
3538
class quantiles_sorted_view {
3639
public:
40+
/// Entry type
3741
using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type;
3842
using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>;
3943
using Container = std::vector<Entry, AllocEntry>;
4044

45+
/// @private
4146
quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator);
4247

48+
/// @private
4349
template<typename Iterator>
4450
void add(Iterator begin, Iterator end, uint64_t weight);
4551

52+
/// @private
4653
void convert_to_cummulative();
4754

4855
class const_iterator;
56+
57+
/**
58+
* Iterator pointing to the first entry in the view.
59+
* If the view is empty, the returned iterator must not be dereferenced or incremented.
60+
* @return iterator pointing to the first entry
61+
*/
4962
const_iterator begin() const;
63+
64+
/**
65+
* Iterator pointing to the past-the-end entry in the view.
66+
* The past-the-end entry is the hypothetical entry that would follow the last entry.
67+
* It does not point to any entry, and must not be dereferenced or incremented.
68+
* @return iterator pointing to the past-the-end entry
69+
*/
5070
const_iterator end() const;
5171

72+
/// @return size of the view
5273
size_t size() const;
5374

75+
/**
76+
* Returns an approximation to the normalized rank of the given item.
77+
*
78+
* <p>If the view is empty this throws std::runtime_error.
79+
*
80+
* @param item to be ranked
81+
* @param inclusive if true the weight of the given item is included into the rank.
82+
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
83+
* according to the Comparator.
84+
*
85+
* @return an approximate normalized rank of the given item (0 to 1 inclusive)
86+
*/
5487
double get_rank(const T& item, bool inclusive = true) const;
5588

89+
/**
90+
* Quantile return type.
91+
* This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types)
92+
*/
5693
using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type;
94+
95+
/**
96+
* Returns an item from the sketch that is the best approximation to an item
97+
* from the original stream with the given normalized rank.
98+
*
99+
* <p>If the view is empty this throws std::runtime_error.
100+
*
101+
* @param rank of an item in the hypothetical sorted stream.
102+
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
103+
*
104+
* @return approximate quantile associated with the given normalized rank
105+
*/
57106
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
58107

59108
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
109+
110+
/**
111+
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
112+
* cumulative analog of the PMF, of the input stream given a set of split points (items).
113+
*
114+
* <p>If the view is empty this throws std::runtime_error.
115+
*
116+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
117+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
118+
*
119+
* @param size the number of split points in the array
120+
*
121+
* @param inclusive if true the rank of an item includes its own weight, and therefore
122+
* if the sketch contains items equal to a split point, then in CDF such items are
123+
* included into the interval to the left of split point. Otherwise they are included into
124+
* the interval to the right of split point.
125+
*
126+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
127+
* of the input stream given the split_points. The value at array position j of the returned
128+
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
129+
* array. This can be viewed as array of ranks of the given split points plus one more value
130+
* that is always 1.
131+
*/
60132
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
133+
134+
/**
135+
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
136+
* given a set of split points (items).
137+
*
138+
* <p>If the view is empty this throws std::runtime_error.
139+
*
140+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
141+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
142+
*
143+
* @param size the number of split points in the array
144+
*
145+
* @param inclusive if true the rank of an item includes its own weight, and therefore
146+
* if the sketch contains items equal to a split point, then in PMF such items are
147+
* included into the interval to the left of split point. Otherwise they are included into the interval
148+
* to the right of split point.
149+
*
150+
* @return an array of m+1 doubles each of which is an approximation
151+
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
152+
*/
61153
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
62154

63155
private:
@@ -122,8 +214,6 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi
122214
using Base = typename quantiles_sorted_view<T, C, A>::Container::const_iterator;
123215
using value_type = typename std::conditional<std::is_arithmetic<T>::value, typename Base::value_type, std::pair<const T&, const uint64_t>>::type;
124216

125-
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
126-
127217
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
128218
const value_type operator*() const { return Base::operator*(); }
129219

@@ -147,6 +237,9 @@ class quantiles_sorted_view<T, C, A>::const_iterator: public quantiles_sorted_vi
147237

148238
private:
149239
Base begin;
240+
241+
friend class quantiles_sorted_view<T, C, A>;
242+
const_iterator(const Base& it, const Base& begin): Base(it), begin(begin) {}
150243
};
151244

152245
} /* namespace datasketches */

0 commit comments

Comments
 (0)