From 7ea06dc05e95fd6e973d2dac6153f079babd5a24 Mon Sep 17 00:00:00 2001
From: Karteekmurthys
Date: Fri, 3 Nov 2023 15:36:10 -0700
Subject: [PATCH] Fix the HashTable::storeRowPointer to accept int64 bucket offset (#7308)

Summary:
The bucket offsets in a HashTable can exceed INT32_MAX. This leads to an
incorrect address calculation in `HashTable::storeRowPointer` while storing a
row pointer in the table.

We can reproduce this on a TPC-DS 1K (scale factor 1000) dataset.

Incorrect results:
```
select count(ws_order_number) as "order count"
from web_sales
where ws_order_number in (select ws_order_number from web_sales);

 order count
-------------
   289805994
(1 row)
```

Correct results:
```
presto:tpcds_sf_1000> select count(ws_order_number) from web_sales_on
where ws_order_number in (select ws_order_number from web_sales_on);

   _col0
-----------
 720000376
(1 row)
```

Resolves https://github.com/prestodb/presto/issues/20972

Pull Request resolved: https://github.com/facebookincubator/velox/pull/7308

Reviewed By: Yuhta

Differential Revision: D50988774

Pulled By: xiaoxmeng

fbshipit-source-id: 5233deec0dc803e80509cce7541053e501f8e859
---
 velox/exec/HashTable.cpp           | 12 ++++++------
 velox/exec/HashTable.h             | 16 ++++++++++++++--
 velox/exec/tests/HashTableTest.cpp | 25 +++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp
index 1fba2fde33e2..8ecbe18145ac 100644
--- a/velox/exec/HashTable.cpp
+++ b/velox/exec/HashTable.cpp
@@ -290,7 +290,7 @@ void HashTable<ignoreNullKeys>::storeKeys(
 
 template <bool ignoreNullKeys>
 void HashTable<ignoreNullKeys>::storeRowPointer(
-    int32_t index,
+    uint64_t index,
     uint64_t hash,
     char* row) {
   if (hashMode_ == HashMode::kArray) {
@@ -307,7 +307,7 @@ void HashTable<ignoreNullKeys>::storeRowPointer(
 template <bool ignoreNullKeys>
 char* HashTable<ignoreNullKeys>::insertEntry(
     HashLookup& lookup,
-    int32_t index,
+    uint64_t index,
     vector_size_t row) {
   char* group = rows_->newRow();
   lookup.hits[row] = group; // NOLINT
@@ -374,8 +374,8 @@ FOLLY_ALWAYS_INLINE void HashTable<ignoreNullKeys>::fullProbe(
           return RowContainer::normalizedKey(group) ==
               lookup.normalizedKeys[row];
         },
-        [&](int32_t index, int32_t row) {
-          return isJoin ? nullptr : insertEntry(lookup, row, index);
+        [&](int32_t row, uint64_t index) {
+          return isJoin ? nullptr : insertEntry(lookup, index, row);
         },
         numTombstones_,
         !isJoin && extraCheck);
@@ -386,8 +386,8 @@ FOLLY_ALWAYS_INLINE void HashTable<ignoreNullKeys>::fullProbe(
       *this,
       0,
       [&](char* group, int32_t row) { return compareKeys(group, lookup, row); },
-      [&](int32_t index, int32_t row) {
-        return isJoin ? nullptr : insertEntry(lookup, row, index);
+      [&](int32_t row, uint64_t index) {
+        return isJoin ? nullptr : insertEntry(lookup, index, row);
       },
       numTombstones_,
       !isJoin && extraCheck);
diff --git a/velox/exec/HashTable.h b/velox/exec/HashTable.h
index ee9dbf27c519..c0c01f7338b4 100644
--- a/velox/exec/HashTable.h
+++ b/velox/exec/HashTable.h
@@ -500,6 +500,18 @@ class HashTable : public BaseHashTable {
     setHashMode(mode, numNew);
   }
 
+  void testingStoreRowPointer(uint64_t index, uint64_t hash, char* row) {
+    storeRowPointer(index, hash, row);
+  }
+
+  auto testingLoadTags(uint64_t bucketOffset) {
+    return loadTags(bucketOffset);
+  }
+
+  auto testingGetRow(uint64_t bucketOffset, uint64_t slotIndex) {
+    return row(bucketOffset, slotIndex);
+  }
+
   auto& testingOtherTables() const {
     return otherTables_;
   }
@@ -622,7 +634,7 @@ class HashTable : public BaseHashTable {
   void rehash(bool initNormalizedKeys);
   void storeKeys(HashLookup& lookup, vector_size_t row);
 
-  void storeRowPointer(int32_t index, uint64_t hash, char* row);
+  void storeRowPointer(uint64_t index, uint64_t hash, char* row);
 
   // Allocates new tables for tags and payload pointers. The size must
   // a power of 2.
@@ -711,7 +723,7 @@ class HashTable : public BaseHashTable {
       bool initNormalizedKeys,
       raw_vector<uint64_t>& hashes);
 
-  char* insertEntry(HashLookup& lookup, int32_t index, vector_size_t row);
+  char* insertEntry(HashLookup& lookup, uint64_t index, vector_size_t row);
 
   bool compareKeys(const char* group, HashLookup& lookup, vector_size_t row);
 
diff --git a/velox/exec/tests/HashTableTest.cpp b/velox/exec/tests/HashTableTest.cpp
index e6bbc88a79ff..6e2de769ef7a 100644
--- a/velox/exec/tests/HashTableTest.cpp
+++ b/velox/exec/tests/HashTableTest.cpp
@@ -827,6 +827,31 @@ TEST_P(HashTableTest, offsetOverflowLoadTags) {
   }
 }
 
+TEST_P(HashTableTest, offsetOverflowStoreRowPointer) {
+  std::vector<std::unique_ptr<VectorHasher>> hashers;
+  hashers.push_back(std::make_unique<VectorHasher>(TINYINT(), 0));
+  auto table = HashTable<false>::createForJoin(
+      std::move(hashers),
+      {} /*dependentTypes=*/,
+      true,
+      false,
+      1'000,
+      pool_.get());
+  table->testingSetHashMode(
+      BaseHashTable::HashMode::kNormalizedKey, 700'000'000);
+  // Set greater than INT_MAX bucketOffset.
+  uint64_t bucketOffset = 2'848'622'336;
+  // Setting random value and hash value.
+  uint8_t value = 10;
+  uint64_t hash = 1'685'204'148'842'825'593;
+  table->testingStoreRowPointer(bucketOffset, hash, (char*)&value);
+  auto expectedTag = BaseHashTable::hashTag(hash);
+  auto actualTagSimdBatch = table->testingLoadTags(bucketOffset);
+  ASSERT_EQ(expectedTag, actualTagSimdBatch.get(0));
+  auto slotIndex = bucketOffset & (sizeof(BaseHashTable::TagVector) - 1);
+  ASSERT_EQ(value, *(uint8_t*)table->testingGetRow(bucketOffset, slotIndex));
+}
+
 DEBUG_ONLY_TEST_P(HashTableTest, failureInCreateRowPartitions) {
   // This tests an issue in parallelJoinBuild where an exception in
   // createRowPartitions could lead to concurrency issues in async table
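
For context on the failure mode, below is a minimal standalone sketch. It is plain C++, not Velox code, and its variable names are made up for illustration: it only demonstrates how a bucket offset above INT32_MAX wraps to a negative value when it passes through an int32_t parameter, which is the narrowing this patch removes from storeRowPointer and insertEntry. The offset value matches the one used in the new offsetOverflowStoreRowPointer test.
```
#include <cstdint>
#include <cstdio>

int main() {
  // Bucket offset above INT32_MAX, same value as in offsetOverflowStoreRowPointer.
  uint64_t bucketOffset = 2'848'622'336ULL;

  // Passing this through an int32_t parameter (the old signature) narrows it.
  // On two's-complement targets it wraps to a negative number, so an address
  // computed as tableBase + offset lands 2^32 bytes before the intended bucket.
  int32_t narrowed = static_cast<int32_t>(bucketOffset);

  std::printf("uint64_t offset : %llu\n",
              static_cast<unsigned long long>(bucketOffset));
  std::printf("int32_t narrowed: %d\n", narrowed);
  std::printf("address error   : %llu bytes\n",
              static_cast<unsigned long long>(
                  bucketOffset - static_cast<uint64_t>(narrowed)));
  return 0;
}
```
Widening the index parameters to uint64_t keeps the offset arithmetic in 64 bits end to end, which is also why the probe lambdas now take the index as uint64_t.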