Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor refactorings for the Join class #1560

Merged
merged 4 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/engine/IndexScan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ Permutation::IdTableGenerator IndexScan::lazyScanForJoinOfColumnWithScan(
std::span<const Id> joinColumn) const {
AD_EXPENSIVE_CHECK(std::ranges::is_sorted(joinColumn));
AD_CORRECTNESS_CHECK(numVariables_ <= 3 && numVariables_ > 0);
AD_CONTRACT_CHECK(joinColumn.empty() || !joinColumn[0].isUndefined());

auto metaBlocks1 = getMetadataForScan();

Expand Down
13 changes: 4 additions & 9 deletions src/engine/Join.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ using std::string;
// _____________________________________________________________________________
Join::Join(QueryExecutionContext* qec, std::shared_ptr<QueryExecutionTree> t1,
std::shared_ptr<QueryExecutionTree> t2, ColumnIndex t1JoinCol,
ColumnIndex t2JoinCol, bool keepJoinColumn)
ColumnIndex t2JoinCol)
: Operation(qec) {
AD_CONTRACT_CHECK(t1 && t2);
// Currently all join algorithms require both inputs to be sorted, so we
Expand Down Expand Up @@ -55,7 +55,6 @@ Join::Join(QueryExecutionContext* qec, std::shared_ptr<QueryExecutionTree> t1,
_leftJoinCol = t1JoinCol;
_right = std::move(t2);
_rightJoinCol = t2JoinCol;
_keepJoinColumn = keepJoinColumn;
_sizeEstimate = 0;
_sizeEstimateComputed = false;
_multiplicities.clear();
Expand Down Expand Up @@ -92,10 +91,7 @@ string Join::getDescriptor() const { return "Join on " + _joinVar.name(); }
// _____________________________________________________________________________
ProtoResult Join::computeResult([[maybe_unused]] bool requestLaziness) {
LOG(DEBUG) << "Getting sub-results for join result computation..." << endl;
size_t leftWidth = _left->getResultWidth();
size_t rightWidth = _right->getResultWidth();
IdTable idTable{getExecutionContext()->getAllocator()};
idTable.setNumColumns(leftWidth + rightWidth - 1);
IdTable idTable{getResultWidth(), getExecutionContext()->getAllocator()};

if (_left->knownEmptyResult() || _right->knownEmptyResult()) {
_left->getRootOperation()->updateRuntimeInformationWhenOptimizedOut();
Expand Down Expand Up @@ -158,7 +154,7 @@ ProtoResult Join::computeResult([[maybe_unused]] bool requestLaziness) {
std::shared_ptr<const Result> leftRes =
leftResIfCached ? leftResIfCached : _left->getResult();
checkCancellation();
if (leftRes->idTable().size() == 0) {
if (leftRes->idTable().empty()) {
_right->getRootOperation()->updateRuntimeInformationWhenOptimizedOut();
// TODO<joka921, hannahbast, SPARQL update> When we add triples to the
// index, the vocabularies of index scans will not necessarily be empty and
Expand Down Expand Up @@ -206,8 +202,7 @@ VariableToColumnMap Join::computeVariableToColumnMap() const {

// _____________________________________________________________________________
size_t Join::getResultWidth() const {
size_t res = _left->getResultWidth() + _right->getResultWidth() -
(_keepJoinColumn ? 1 : 2);
size_t res = _left->getResultWidth() + _right->getResultWidth() - 1;
AD_CONTRACT_CHECK(res > 0);
return res;
}
Expand Down
10 changes: 3 additions & 7 deletions src/engine/Join.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ class Join : public Operation {

Variable _joinVar{"?notSet"};

bool _keepJoinColumn;

bool _sizeEstimateComputed;
size_t _sizeEstimate;

Expand All @@ -33,7 +31,7 @@ class Join : public Operation {
public:
Join(QueryExecutionContext* qec, std::shared_ptr<QueryExecutionTree> t1,
std::shared_ptr<QueryExecutionTree> t2, ColumnIndex t1JoinCol,
ColumnIndex t2JoinCol, bool keepJoinColumn = true);
ColumnIndex t2JoinCol);

// A very explicit constructor, which initializes an invalid join object (it
// has no subtrees, which violates class invariants). These invalid Join
Expand Down Expand Up @@ -108,8 +106,8 @@ class Join : public Operation {
* @return The result is only sorted, if the bigger table is sorted.
* Otherwise it is not sorted.
**/
void hashJoin(const IdTable& dynA, ColumnIndex jc1, const IdTable& dynB,
ColumnIndex jc2, IdTable* dynRes);
static void hashJoin(const IdTable& dynA, ColumnIndex jc1,
const IdTable& dynB, ColumnIndex jc2, IdTable* dynRes);

protected:
virtual string getCacheKeyImpl() const override;
Expand All @@ -134,8 +132,6 @@ class Join : public Operation {
IndexScan& scan,
ColumnIndex joinColScan);

using ScanMethodType = std::function<IdTable(Id)>;

/*
* @brief Combines 2 rows like in a join and inserts the result in the
* given table.
Expand Down
45 changes: 15 additions & 30 deletions src/util/JoinAlgorithms/JoinAlgorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,6 @@ namespace ad_utility {

// Some helper concepts.

// A predicate type `Pred` fulfills `BinaryRangePredicate<Range>` if it can be
// called with two values of the `Range`'s `value_type` and produces a result
// that can be converted to `bool`.
template <typename Pred, typename Range>
concept BinaryRangePredicate =
std::indirect_binary_predicate<Pred, std::ranges::iterator_t<Range>,
std::ranges::iterator_t<Range>>;

// A function `F` fulfills `UnaryIteratorFunction` if it can be called with a
// single argument of the `Range`'s iterator type (NOT value type).
template <typename F, typename Range>
Expand Down Expand Up @@ -667,14 +659,14 @@ class BlockAndSubrange {
template <typename Iterator, typename End, typename Projection>
struct JoinSide {
using CurrentBlocks =
std::vector<detail::BlockAndSubrange<typename Iterator::value_type>>;
std::vector<detail::BlockAndSubrange<std::iter_value_t<Iterator>>>;
Iterator it_;
[[no_unique_address]] const End end_;
const Projection& projection_;
CurrentBlocks currentBlocks_{};

// Type aliases for a single element from a block from the left/right input.
using value_type = std::ranges::range_value_t<typename Iterator::value_type>;
using value_type = std::ranges::range_value_t<std::iter_value_t<Iterator>>;
Copy link
Collaborator Author

@RobinTF RobinTF Oct 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This helper type also works if the iterator type is a pointer.

// Type alias for the result of the projection.
using ProjectedEl =
std::decay_t<std::invoke_result_t<const Projection&, value_type>>;
Expand All @@ -691,7 +683,7 @@ template <typename Blocks>
auto makeJoinSide(Blocks& blocks, const auto& projection) {
return JoinSide{std::ranges::begin(blocks), std::ranges::end(blocks),
projection};
};
}

// A concept to identify instantiations of the `JoinSide` template.
template <typename T>
Expand Down Expand Up @@ -759,14 +751,10 @@ struct BlockZipperJoinImpl {
using ProjectedEl = LeftSide::ProjectedEl;
static_assert(std::same_as<ProjectedEl, typename RightSide::ProjectedEl>);

// The largest element for which all blocks are currently stored in the buffer
// and processed.
std::optional<ProjectedEl> currentMinEl_ = std::nullopt;

// Create an equality comparison from the `lessThan` predicate.
bool eq(const auto& el1, const auto& el2) {
return !lessThan_(el1, el2) && !lessThan_(el2, el1);
};
}

// Recompute the `currentEl`. It is the minimum of the last element in the
// first block of either of the join sides.
Expand All @@ -775,7 +763,7 @@ struct BlockZipperJoinImpl {
return side.projection_(side.currentBlocks_.front().back());
};
return std::min(getFirst(leftSide_), getFirst(rightSide_), lessThan_);
};
}

// Fill `side.currentBlocks_` with blocks from the range `[side.it_,
// side.end_)` and advance `side.it_` for each read buffer until all elements
Expand Down Expand Up @@ -823,7 +811,7 @@ struct BlockZipperJoinImpl {
} else {
return BlockStatus::allFilled;
}
};
}

// Remove all elements from `blocks` (either `leftSide_.currentBlocks_` or
// `rightSide_.currentBlocks`) s.t. only elements `> lastProcessedElement`
Expand All @@ -848,7 +836,7 @@ struct BlockZipperJoinImpl {
if (blocks.at(0).empty()) {
blocks.clear();
}
};
}

// For one of the inputs (`leftSide_.currentBlocks_` or
// `rightSide_.currentBlocks_`) obtain a tuple of the following elements:
Expand All @@ -860,7 +848,7 @@ struct BlockZipperJoinImpl {
const auto& first = currentBlocks.at(0);
auto it = std::ranges::lower_bound(first.subrange(), currentEl, lessThan_);
return std::tuple{std::ref(first.fullBlock()), first.subrange(), it};
};
}

// Call `compatibleRowAction` for all pairs of elements in the Cartesian
// product of the blocks in `blocksLeft` and `blocksRight`.
Expand Down Expand Up @@ -892,7 +880,7 @@ struct BlockZipperJoinImpl {
}
}
compatibleRowAction_.flush();
};
}

// Return a vector of subranges of all elements in `blocks` that are equal to
// `currentEl`. Effectively, these subranges cover all the blocks completely
Expand All @@ -907,7 +895,7 @@ struct BlockZipperJoinImpl {
last.setSubrange(
std::ranges::equal_range(last.subrange(), currentEl, lessThan_));
return result;
};
}

// Join the first block in `currentBlocksLeft` with the first block in
// `currentBlocksRight`, but ignore all elements that are `>= currentEl`
Expand Down Expand Up @@ -949,7 +937,7 @@ struct BlockZipperJoinImpl {
// Remove the joined elements.
currentBlocksLeft.at(0).setSubrange(currentElItL, subrangeLeft.end());
currentBlocksRight.at(0).setSubrange(currentElItR, subrangeRight.end());
};
}

// If the `targetBuffer` is empty, read the next nonempty block from `[it,
// end)` if there is one.
Expand All @@ -965,7 +953,7 @@ struct BlockZipperJoinImpl {
}
++it;
}
};
}

// Fill both buffers (left and right) until they contain at least one block.
// Then recompute the `currentEl()` and keep on filling the buffers until at
Expand All @@ -988,9 +976,7 @@ struct BlockZipperJoinImpl {
}

// Add the remaining blocks such that condition 3 from above is fulfilled.
auto blockStatus = fillEqualToCurrentElBothSides(getCurrentEl());
currentMinEl_ = getCurrentEl();
return blockStatus;
return fillEqualToCurrentElBothSides(getCurrentEl());
}

// Combine the above functionality and perform one round of joining.
Expand All @@ -999,11 +985,10 @@ struct BlockZipperJoinImpl {
void joinBuffers(BlockStatus& blockStatus) {
auto& currentBlocksLeft = leftSide_.currentBlocks_;
auto& currentBlocksRight = rightSide_.currentBlocks_;
ProjectedEl currentEl = getCurrentEl();
joinAndRemoveLessThanCurrentEl<DoOptionalJoin>(
currentBlocksLeft, currentBlocksRight, getCurrentEl());
currentBlocksLeft, currentBlocksRight, currentEl);

// TODO<joka921> This should still be the same.
ProjectedEl currentEl = getCurrentEl();
// At this point the `currentBlocksLeft/Right` only consist of elements `>=
// currentEl`. We now obtain a view on the elements `== currentEl` which are
// needed for the next step (the Cartesian product). In the last block,
Expand Down
2 changes: 1 addition & 1 deletion src/util/JoinAlgorithms/JoinColumnMapping.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct IdTableAndFirstCol {
auto begin() const { return col().begin(); }
auto end() const { return col().end(); }

bool empty() { return col().empty(); }
bool empty() const { return col().empty(); }

const Id& operator[](size_t idx) const { return col()[idx]; }

Expand Down
3 changes: 1 addition & 2 deletions test/engine/SpatialJoinTestHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ inline std::shared_ptr<QueryExecutionTree> buildJoin(
auto varCol2 = tree2->getVariableColumns();
size_t col1 = varCol1[joinVariable].columnIndex_;
size_t col2 = varCol2[joinVariable].columnIndex_;
return ad_utility::makeExecutionTree<Join>(qec, tree1, tree2, col1, col2,
true);
return ad_utility::makeExecutionTree<Join>(qec, tree1, tree2, col1, col2);
}

inline std::shared_ptr<QueryExecutionTree> buildMediumChild(
Expand Down
4 changes: 3 additions & 1 deletion test/engine/ValuesForTesting.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ class ValuesForTesting : public Operation {
}

size_t getResultWidth() const override {
return tables_.empty() ? 0 : tables_.at(0).numColumns();
// Assume a width of 1 if we have no tables and no other information to base
// it on because 0 would otherwise cause stuff to break.
return tables_.empty() ? 1 : tables_.at(0).numColumns();
}

vector<ColumnIndex> resultSortedOn() const override {
Expand Down
7 changes: 3 additions & 4 deletions test/util/JoinHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,8 @@ IdTable useJoinFunctionOnIdTables(const IdTableAndJoinColumn& tableA,
* `ad_utility::callFixedSize`.
*/
auto makeHashJoinLambda() {
Join J{Join::InvalidOnlyForTestingJoinTag{}, ad_utility::testing::getQec()};
return [J = std::move(J)]<int A, int B, int C>(auto&&... args) mutable {
return J.hashJoin(AD_FWD(args)...);
return []<int A, int B, int C>(auto&&... args) {
return Join::hashJoin(AD_FWD(args)...);
};
}

Expand All @@ -70,7 +69,7 @@ auto makeHashJoinLambda() {
*/
auto makeJoinLambda() {
Join J{Join::InvalidOnlyForTestingJoinTag{}, ad_utility::testing::getQec()};
return [J = std::move(J)]<int A, int B, int C>(auto&&... args) mutable {
return [J = std::move(J)]<int A, int B, int C>(auto&&... args) {
return J.join(AD_FWD(args)...);
};
}
Loading