From fb1ed44857d33ccfe608bdc4208c88bf3ff91746 Mon Sep 17 00:00:00 2001
From: Serge Druzkin <sdruzkin@meta.com>
Date: Mon, 7 Oct 2024 22:05:56 -0700
Subject: [PATCH] Order dictionary indicies to prevent 2x-50x size regression
 (#94)

Summary:

I checked Vader [results](https://fburl.com/daiquery/4kk9jnm1) and found significant size differences between Nimble JNI and Velox rewrites for some files.

After some debugging I learned that we almost never get exactly the same Nimble files. You can do the rewrite 100 times and get 100 files with different sizes, meaning that Nimble writer has non-deterministic behavior. For example in this example P1630232570 **rewriting the same file multiple times yields files with sizes from 112MB to 364MB**.

After checking the encodings and enabling NIMBLE_ENCODING_SELECTION_DEBUG in the EncodingSelectionPolicy I **root-caused it to the dictionary encoding. All rewrites of the same files always had the same alphabet in different order, and that also caused dictionary indicies to be totally different.**

**Random order of the dictionary alphabet was caused by usage of the absl::flat_hash_map for the Statistic.uniqueCounts**. After switching Statistic.uniqueCounts type to F14FastMap, which has consistent iteration order, different rewrites of the same files became consistent.

After a bit of digging and comparing encoding selection logs I noticed that the **main size contributor in files with different sizes is the varint encoding of dictionary indicies**. Size of the encoded data depends on the encoded numeric value, the bigger it's the more bytes it needs in the encoded form.

So, **I sorted indicies by the alphabet key frequency and put the most frequent keys first**, that would mean that the most frequent keys would get the smallest indicies. In case of the first 127 indicies we would only need 1 byte, for the next 16383 need 2 bytes, then 3, etc. **It consistently produced small files, but still with different sizes overall. I also did a reverse experiment, and it consistently produced large files.**


# Samples
Sorting alphabet by frequency would obviously impact writer performance, but IMHO it's worth it. In some examples the size regression is up to 580x (!!!), instead of 60MB files we can get a 35GB file for ifr_test_hive_table.

Checkout https://docs.google.com/spreadsheets/d/1d7m--4x6e0YddyfTdJvjrkYz-pB11O3sBdNM8usxmZM/edit?usp=sharing. A list of corresponding files can be found here P1634630740.

I did testing on some top sized NImble tables and some tables where Vader shows significant size differences between JNI and Velox. `Lil` is the version from the diff, `big` is the version with reverse sorted indicies.

Some notable examples - top Nimble tables:
P1634619561

Some notable examples - top size differences:
P1634619725

# Other Notes 1
As you can see in the spreadsheet in some cases the original Nimble file is still much smaller, it means there are still some optimization and size fixing opportunities. May be sorting the alphabet would also lead to size decrease, or may be there is something else.

# Compression
It looks like we are not compressing Varint encoding, we should absolutely start doing that.

Differential Revision: D63964981
---
 dwio/nimble/common/Vector.h                |  7 ++++
 dwio/nimble/common/tests/VectorTests.cpp   | 10 +++++
 dwio/nimble/encodings/DictionaryEncoding.h | 47 ++++++++++++++++++----
 dwio/nimble/encodings/Statistics.h         |  6 +++
 4 files changed, 63 insertions(+), 7 deletions(-)
diff --git a/dwio/nimble/common/Vector.h b/dwio/nimble/common/Vector.h
index 3694bde..47bc9e4 100644
--- a/dwio/nimble/common/Vector.h
+++ b/dwio/nimble/common/Vector.h
@@ -64,6 +64,13 @@ class Vector {
     std::copy(first, last, dataRawPtr_);
   }
 
+  template <typename It>
+  Vector(velox::memory::MemoryPool* memoryPool, size_t size, It first, It last)
+      : memoryPool_{memoryPool} {
+    init(size);
+    std::copy(first, last, dataRawPtr_);
+  }
+
   Vector(const Vector& other) {
     *this = other;
   }
diff --git a/dwio/nimble/common/tests/VectorTests.cpp b/dwio/nimble/common/tests/VectorTests.cpp
index 75fe94d..de697d9 100644
--- a/dwio/nimble/common/tests/VectorTests.cpp
+++ b/dwio/nimble/common/tests/VectorTests.cpp
@@ -50,6 +50,16 @@ TEST_F(VectorTests, FromRange) {
   EXPECT_EQ(6, v1[2]);
 }
 
+TEST_F(VectorTests, FromRangeWithSize) {
+  std::vector<int32_t> source{4, 5, 6};
+  nimble::Vector<int32_t> v1(
+      pool_.get(), source.size(), source.begin(), source.end());
+  EXPECT_EQ(3, v1.size());
+  EXPECT_EQ(4, v1[0]);
+  EXPECT_EQ(5, v1[1]);
+  EXPECT_EQ(6, v1[2]);
+}
+
 TEST_F(VectorTests, EqualOp1) {
   nimble::Vector<int32_t> v1(pool_.get());
   v1.push_back(1);
diff --git a/dwio/nimble/encodings/DictionaryEncoding.h b/dwio/nimble/encodings/DictionaryEncoding.h
index 42568d7..1b62d2c 100644
--- a/dwio/nimble/encodings/DictionaryEncoding.h
+++ b/dwio/nimble/encodings/DictionaryEncoding.h
@@ -18,16 +18,13 @@
 #include <span>
 #include "dwio/nimble/common/Buffer.h"
 #include "dwio/nimble/common/EncodingPrimitives.h"
-#include "dwio/nimble/common/EncodingType.h"
 #include "dwio/nimble/common/Types.h"
 #include "dwio/nimble/common/Vector.h"
 #include "dwio/nimble/encodings/Encoding.h"
 #include "dwio/nimble/encodings/EncodingFactory.h"
 #include "dwio/nimble/encodings/EncodingIdentifier.h"
 #include "dwio/nimble/encodings/EncodingSelection.h"
-#include "dwio/nimble/encodings/TrivialEncoding.h"
 #include "folly/container/F14Map.h"
-#include "velox/common/memory/Memory.h"
 
 // A dictionary encoded stream is comprised of two pieces: a mapping from the
 // n unique values in a stream to the integers [0, n) and the vector of indices
@@ -205,10 +202,46 @@ std::string_view DictionaryEncoding<T>::encode(
   alphabetMapping.reserve(alphabetCount);
   Vector<physicalType> alphabet{&buffer.getMemoryPool()};
   alphabet.reserve(alphabetCount);
-  uint32_t index = 0;
-  for (const auto& pair : selection.statistics().uniqueCounts()) {
-    alphabet.push_back(pair.first);
-    alphabetMapping.emplace(pair.first, index++);
+
+  /// Indicies are usually stored with VARINT encoding which depends on the
+  /// number of set bits in the value.
+  ///
+  /// 127 (1 << 7) is the maximum number that can be stored in one byte with
+  /// VARINT encoding. Meaning that if the alphabet has less than 127 unique
+  /// values, then all of them can be stored as one byte per index value.
+  ///
+  /// If the alphabet has more than 127 unique values, then we need to put more
+  /// frequent alphabet values at the beginning of the alphabet to reduce the
+  /// number of bytes needed to store the indices encoded as VARINT.
+  ///
+  /// This sorting optimization gives 3-5x size reduction if you compare most
+  /// and least optimal order of indicies when they use VARINT encoding.
+  if (alphabetCount > (1 << 7)) {
+    const auto& uniqueCounts = selection.statistics().uniqueCounts();
+    Vector<std::pair<physicalType, uint64_t>> sortedAlphabet(
+        &buffer.getMemoryPool(),
+        uniqueCounts.size(),
+        uniqueCounts.cbegin(),
+        uniqueCounts.cend());
+    sort(
+        sortedAlphabet.begin(),
+        sortedAlphabet.end(),
+        [](const std::pair<physicalType, uint32_t>& a,
+           const std::pair<physicalType, uint32_t>& b) {
+          return a.second < b.second;
+        });
+
+    uint32_t index = 0;
+    for (const auto& pair : sortedAlphabet) {
+      alphabet.push_back(pair.first);
+      alphabetMapping.emplace(pair.first, index++);
+    }
+  } else {
+    uint32_t index = 0;
+    for (const auto& pair : selection.statistics().uniqueCounts()) {
+      alphabet.push_back(pair.first);
+      alphabetMapping.emplace(pair.first, index++);
+    }
   }
 
   Vector<uint32_t> indices{&buffer.getMemoryPool()};
diff --git a/dwio/nimble/encodings/Statistics.h b/dwio/nimble/encodings/Statistics.h
index a4009a0..37d990b 100644
--- a/dwio/nimble/encodings/Statistics.h
+++ b/dwio/nimble/encodings/Statistics.h
@@ -30,6 +30,12 @@ namespace facebook::nimble {
 template <typename T, typename InputType = T>
 class UniqueValueCounts {
  public:
+  // NOTICE: absl::flat_hash_map has incosistent iteration order
+  // every single time, this might be a problem for some encodings that depend
+  // on the order of the elements, such as dictionary indicies encoded with
+  // VARINT encoding. This also means that the order of the elements in the
+  // dictionary alphabet will be different every time giving incosistent file
+  // sizes.
   using MapType = absl::flat_hash_map<T, uint64_t>;
 
   struct Iterator {