diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 37b7cfda..56c881bd 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -510,8 +510,9 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse } } -std::pair ExecutionPlan::Impl::calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const { - std::pair sizePerRank; +std::pair ExecutionPlan::Impl::getSizeAndChunksForRank(int rank, size_t inputSize, + size_t outputSize) const { + std::pair sizePerRank; if (this->inputChunks.at(rank) == 0 && this->outputChunks.at(rank) == 0) { throw mscclpp::Error("Output or Input chunks must be greater than 0", mscclpp::ErrorCode::ExecutorError); } else if (this->inputChunks.at(rank) != 0 && this->outputChunks.at(rank) != 0) { @@ -534,15 +535,15 @@ size_t ExecutionPlan::Impl::getOffset(int rank, size_t inputSize, size_t outputS } const int nGroups = this->chunkGroups.at(rank); - auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize); - uint32_t nInputChunks = sizePerRank.second; - uint32_t nelems = sizePerRank.first / (alignment * sizeof(uint8_t)); + auto rankSizeAndChunks = getSizeAndChunksForRank(rank, inputSize, outputSize); + uint32_t nChunks = rankSizeAndChunks.second; + uint32_t nelems = rankSizeAndChunks.first / (alignment * sizeof(uint8_t)); if (nelems % nGroups != 0) { throw Error("Input size must be a multiple of nGroups", ErrorCode::ExecutorError); } int nelemsPerGroup = nelems / nGroups; - int nChunksPerGroup = nInputChunks / nGroups; + int nChunksPerGroup = nChunks / nGroups; uint32_t minNelems = nelemsPerGroup / nChunksPerGroup; uint32_t remainder = nelemsPerGroup % nChunksPerGroup; uint32_t groupIdx = chunkIndex / nChunksPerGroup; @@ -568,9 +569,17 @@ size_t ExecutionPlan::Impl::getNChunkSize(int rank, size_t inputSize, size_t out } size_t ExecutionPlan::Impl::getUpperBoundChunkSize(int rank, size_t inputSize, size_t outputSize) const { - auto sizePerRank = calcSizePerRank(rank, inputSize, outputSize); - uint32_t nChunks = sizePerRank.second; - return (sizePerRank.first + nChunks - 1) / nChunks; + size_t nInputChunks = this->inputChunks.at(rank); + size_t nOutputChunks = this->outputChunks.at(rank); + size_t inputChunkSize = 0; + size_t outputChunkSize = 0; + if (nInputChunks != 0) { + inputChunkSize = inputSize / nInputChunks; + } + if (nOutputChunks != 0) { + outputChunkSize = outputSize / nOutputChunks; + } + return std::max(inputChunkSize, outputChunkSize); } void ExecutionPlan::Impl::reset() { diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 95c3aadd..080a7688 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -113,7 +113,7 @@ struct ExecutionPlan::Impl { bool isInPlace; private: - std::pair calcSizePerRank(int rank, size_t inputSize, size_t outputSize) const; + std::pair getSizeAndChunksForRank(int rank, size_t inputSize, size_t outputSize) const; size_t getOffset(int rank, size_t inputSize, size_t outputSize, uint32_t chunkIndex, uint32_t alignment = 16) const; size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks, const std::vector offsets) const;