Refactor of beam search to process factor groups in parallel #772

Closed
3 changes: 3 additions & 0 deletions .gitmodules
@@ -17,3 +17,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/cub"]
path = src/3rd_party/cub
url = https://github.com/NVIDIA/cub
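After checking out this branch, the new submodule has to be fetched before building, e.g. with `git submodule update --init --recursive`.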
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- Add a fast path that performs a max reduction along the last axis on the GPU to reduce H2D communication.
- Refactor the beam search to batch the processing of secondary factors for factored-vocabulary models.
- Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss
- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument.
- Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation
1 change: 1 addition & 0 deletions src/3rd_party/cub
Submodule cub added at 52d58a
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -1,3 +1,4 @@
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP_DIALECT=1)
add_subdirectory(3rd_party)

include_directories(.)
8 changes: 8 additions & 0 deletions src/graph/node_operators_unary.h
@@ -1,3 +1,8 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#pragma once

#include "tensors/backend.h"
@@ -495,6 +500,9 @@ struct ReduceNodeOp : public UnaryNodeOp {
case ReduceNodeOpCode::min:
return {NodeOp(Reduce(_1, min(_1,_2), std::numeric_limits<float>::max(), val_, child(0)->val()))};
case ReduceNodeOpCode::max:
if(axis_ == child(0)->shape().size() - 1 && graph()->getBackend()->getDeviceId().type == DeviceType::gpu) {
return {NodeOp(ReduceMaxLastAxis(val_, child(0)->val()))};
}
return {NodeOp(Reduce(_1, max(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
case ReduceNodeOpCode::prod:
return {NodeOp(Reduce(_1, _1 * _2, 1.0f, val_, child(0)->val()))};
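With this change, a max reduction over the last axis of a GPU-resident tensor bypasses the generic `Reduce` functor and calls the new `ReduceMaxLastAxis` kernel. A minimal sketch of what does and does not trigger the fast path, assuming an already-built graph on a GPU device (variable names are illustrative, not from the PR):

```cpp
// Illustrative only: scores is [beam, 1, batch, vocab] as in Logits below.
Expr rowMax  = max(scores, /*axis=*/-1);  // last axis on GPU -> ReduceMaxLastAxis
Expr timeMax = max(scores, /*axis=*/-3);  // any other axis   -> generic Reduce functor
```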
43 changes: 41 additions & 2 deletions src/layers/generic.cpp
@@ -1,3 +1,9 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#include "graph/node_initializers.h"
#include "marian.h"

#include "layers/generic.h"
@@ -66,6 +72,37 @@ namespace marian {
// return logits_.front();
//}

std::vector<Expr> Logits::getSecondaryFactorLogits(std::vector<size_t> factorGroups,
const std::vector<IndexType>& hypIndices,
size_t batchSize, size_t beamSize,
const std::vector<Expr>& expandedPathScores,
float scorerWeight) const {
const size_t totalElts = batchSize * beamSize;
std::vector<Expr> updatedPathScores(factorGroups.size());
auto indices = graph()->indices(hypIndices);

for(int fgIndex = 0; fgIndex < (int)factorGroups.size(); ++fgIndex) {
size_t factorGroup = factorGroups[fgIndex];
ABORT_IF(factorGroup == 0, "Lemmas not supported");

// Find and subtract max from factor scores
auto sel = logits_[factorGroup]->loss(); // [localBeamSize, 1, dimBatch, dimFactorVocab]
sel = sel - max(sel, -1);

// Obtain slice for indices
int start = (int)totalElts * fgIndex;
int end = (int)totalElts * (fgIndex + 1);
Slice fgSlice(start, end, 1);
Expr fgIndices = slice(indices, 0, fgSlice);

// Select relevant scores
Expr logProbs = rnn::State::select(sel, fgIndices, (int)beamSize, /*isBatchMajor=*/false);
updatedPathScores[fgIndex] = expandedPathScores[fgIndex] + scorerWeight * logProbs;
}

return updatedPathScores;
}

// get logits for one factor group
// For groupIndex == 0, the function also requires the shortlist if there is one.
Expr Logits::getFactoredLogits(size_t groupIndex, Ptr<data::Shortlist> shortlist /*= nullptr*/, const std::vector<IndexType>& hypIndices /*= {}*/, size_t beamSize /*= 0*/) const {
@@ -88,8 +125,10 @@
}

// if selIdx are given, then we must reshuffle accordingly
if (!hypIndices.empty()) // use the same function that shuffles decoder state
sel = rnn::State::select(sel, hypIndices, (int)beamSize, /*isBatchMajor=*/false);
if (!hypIndices.empty()) { // use the same function that shuffles decoder state
auto indices = graph()->indices(hypIndices);
sel = rnn::State::select(sel, indices, (int)beamSize, /*isBatchMajor=*/false);
}
return sel;
}

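A hedged sketch of how a beam-search caller might drive this batched path. The layout of `hypIndices` (one contiguous block of `batchSize * beamSize` entries per factor group, concatenated in `factorGroups` order) is inferred from the `Slice` arithmetic above; all names are illustrative:

```cpp
// Illustrative caller, not code from this PR: score all secondary factor
// groups in one batched graph pass instead of looping group by group.
// hypIndices holds batchSize * beamSize entries per factor group,
// concatenated in factorGroups order (implied by the Slice arithmetic).
std::vector<size_t> factorGroups = {1, 2};  // secondary groups only; group 0 (lemmas) aborts
std::vector<Expr> updated = logits.getSecondaryFactorLogits(
    factorGroups, hypIndices, batchSize, beamSize, expandedPathScores, scorerWeight);
// updated[i] corresponds to factorGroups[i] and feeds the per-group top-k step.
```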
7 changes: 7 additions & 0 deletions src/layers/generic.h
@@ -1,3 +1,8 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#pragma once

#include "marian.h"
@@ -115,6 +120,8 @@ class Logits {
Logits(std::vector<Ptr<RationalLoss>>&& logits, Ptr<FactoredVocab> embeddingFactorMapping) // factored-output constructor
: logits_(std::move(logits)), factoredVocab_(embeddingFactorMapping) {}
Expr getLogits() const; // assume it holds logits: get them, possibly aggregating over factors
std::vector<Expr> getSecondaryFactorLogits(std::vector<size_t> factorGroups, const std::vector<IndexType>& hypIndices, size_t batchSize, size_t beamSize,
const std::vector<Expr>& expandedPathScores, float scorerWeight) const; // get logits for all secondary factor groups in the factorGroups vector
Expr getFactoredLogits(size_t groupIndex, Ptr<data::Shortlist> shortlist = nullptr, const std::vector<IndexType>& hypIndices = {}, size_t beamSize = 0) const; // get logits for only one factor group, with optional reshuffle
//Ptr<RationalLoss> getRationalLoss() const; // assume it holds a loss: get that
Expr applyLossFunction(const Words& labels, const std::function<Expr(Expr/*logits*/,Expr/*indices*/)>& lossFn) const;
38 changes: 33 additions & 5 deletions src/rnn/types.h
@@ -1,5 +1,12 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#pragma once

#include "common/definitions.h"
#include "common/shape.h"
#include "marian.h"

#include <iostream>
@@ -12,23 +19,22 @@ struct State {
Expr output;
Expr cell;

State select(const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
State select(Expr selIdx, // [beamIndex * activeBatchSize + batchIndex]
int beamSize, bool isBatchMajor) const {
return{ select(output, selIdx, beamSize, isBatchMajor),
select(cell, selIdx, beamSize, isBatchMajor) };
}

// this function is also called by Logits
static Expr select(Expr sel, // [beamSize, dimTime, dimBatch, dimDepth] or [beamSize, dimBatch, dimTime, dimDepth] (dimTime = 1 for RNN)
const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
Expr selIdx, // [beamIndex * activeBatchSize + batchIndex]
int beamSize, bool isBatchMajor)
{
if (!sel)
return sel; // keep nullptr untouched

sel = atleast_4d(sel);

int dimBatch = (int)selIdx.size() / beamSize;
int dimBatch = (int)selIdx->shape().elements() / beamSize;
int dimDepth = sel->shape()[-1];
int dimTime = isBatchMajor ? sel->shape()[-2] : sel->shape()[-3];

@@ -83,8 +89,30 @@ class States {
States select(const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
int beamSize, bool isBatchMajor) const {
States selected;
Expr indices;

// We need to check whether a state's cell or output field is non-null; in that case we need
// to select rows from at least one of the tensors. If only some exprs are non-null, the call to
// select handles this for us by naturally returning a null expr.
for (auto& state : states_) {
if (state.cell) {
indices = state.cell->graph()->indices(selIdx);
break;
}

if (state.output) {
indices = state.output->graph()->indices(selIdx);
break;
}
}

// If indices is null here, then all of the state.cell and state.output entries are null. Therefore,
// select will ignore the null indices expr and simply return a null pointer, which is the
// expected behavior.

// GPU OPT: Implement kernel to batch these on GPU
for(auto& state : states_)
selected.push_back(state.select(selIdx, beamSize, isBatchMajor));
selected.push_back(state.select(indices, beamSize, isBatchMajor));
return selected;
}

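The switch from `std::vector<IndexType>` to `Expr` moves the host-to-device upload of the selection indices out of `select()`: as the `States::select` loop above shows, the indices are uploaded once via `graph()->indices(...)` and the resulting expression is reused for every state tensor. A minimal sketch of the new calling convention, with illustrative names:

```cpp
// hypIndices: [beamIndex * activeBatchSize + batchIndex], produced by beam search
Expr indices = graph->indices(hypIndices);           // single H2D transfer
State next   = state.select(indices, beamSize, /*isBatchMajor=*/false);
```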
11 changes: 11 additions & 0 deletions src/tensors/cpu/tensor_operators.cpp
@@ -3,6 +3,12 @@
* SPDX-License-Identifier: MIT
*/

/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/


#include "tensors/tensor_operators.h"
#include "tensors/cpu/backend.h"
#include "tensors/allocator.h"
@@ -24,6 +30,11 @@ namespace cpu {
ABORT("Not implemented");
}

void ReduceMaxLastAxis(Tensor /*out*/,
const marian::Tensor& /*input*/) {
ABORT("Not implemented");
}

template <typename To, typename From>
void CopyCastTo(To* out, const From* in, int length) {
for(int i = 0; i < length; ++i)
86 changes: 86 additions & 0 deletions src/tensors/gpu/tensor_operators.cu
@@ -1,3 +1,8 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#include "common/types.h"
#include "tensors/tensor_operators.h"

@@ -8,6 +13,22 @@
#include "tensors/gpu/cuda_helpers.h"

#include "tensors/gpu/add_all.h"
#include "3rd_party/reduce_all.h"

#if COMPILE_FP16
#include <cuda_fp16.h>
__device__ __forceinline__ half max(const half a, const half b) {
return a > b ? a : b;
}
#endif


#if CUDA_VERSION >= 11000
#include <cub/cub.cuh>
#else
#include "cub/cub/cub.cuh"
#endif


namespace marian {

@@ -2930,5 +2951,70 @@ void PoolingWithMaskingBackward(Tensor adj,
width,
lastWidth);
}

template<typename T, int BLOCK_THREADS>
__global__ void gReduceMaxLastAxis(T* outTensor, const T* inputTensor, int innerDimSize) {

typedef cub::BlockReduce<T, BLOCK_THREADS> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

size_t inputBlockStartOffset = blockIdx.x * innerDimSize;
const T* blockInputPtr = inputTensor + inputBlockStartOffset;
T blockMax = cub::FpLimits<T>::Lowest();

for(int tid = threadIdx.x; tid < innerDimSize; tid += BLOCK_THREADS) {
blockMax = max(blockMax, blockInputPtr[tid]);
}

T aggregate = BlockReduce(temp_storage).Reduce(blockMax, cub::Max());
if(threadIdx.x == 0) outTensor[blockIdx.x] = aggregate;
}

#define CASE_THREADS(BLOCKS, THREADS) \
case THREADS: \
gReduceMaxLastAxis<T, THREADS><<<BLOCKS, THREADS>>>( \
out, \
input, \
sizeOfLastDim); \
break;

template<typename T>
void ReduceMaxLastAxisTyped(T* out, const T* input, int sizeOfLastDim, int blocks, int threads) {
threads = std::max(nextPow2((unsigned int)threads), 32U);
switch(threads) {
CASE_THREADS(blocks, 32);
CASE_THREADS(blocks, 64);
CASE_THREADS(blocks, 128);
CASE_THREADS(blocks, 256);
CASE_THREADS(blocks, 512);
CASE_THREADS(blocks, 1024);
default:
ABORT("Invalid number of threads in config for ReduceMaxLastAxis");
}
}

void ReduceMaxLastAxis(Tensor out,
const marian::Tensor& input) {

cudaSetDevice(out->getDeviceId().no);
int outputElts = out->shape().elements();
int inputElts = input->shape().elements();

int sizeOfLastDim = input->shape()[-1];
int blocks = inputElts / sizeOfLastDim;

ABORT_IF(blocks != outputElts, "Expected {} elts in output tensor but tensor has size {}", blocks, outputElts);
int threads = std::min(sizeOfLastDim, MAX_THREADS);

if(out->type() == Type::float32) {
ReduceMaxLastAxisTyped(out->data<float>(), input->data<float>(), sizeOfLastDim, blocks, threads);
#if COMPILE_FP16
} else if(out->type() == Type::float16) {
ReduceMaxLastAxisTyped(out->data<half>(), input->data<half>(), sizeOfLastDim, blocks, threads);
#endif
} else {
ABORT("ReduceMaxLastAxis not implemented for type {}", out->type());
}
}
} // namespace gpu
} // namespace marian
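As a worked example of the launch configuration (shapes are illustrative): reducing an input of shape `[beam=8, 1, batch=32, vocab=32000]` over its last axis yields `blocks = 8 * 1 * 32 = 256`, one CUDA block per output element, and `threads = min(32000, MAX_THREADS)`. `ReduceMaxLastAxisTyped` then rounds the thread count up to a power of two (clamped to at least 32) and selects the matching `gReduceMaxLastAxis<T, THREADS>` instantiation, so each thread strides across its row before `cub::BlockReduce` combines the per-thread partial maxima.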
6 changes: 6 additions & 0 deletions src/tensors/tensor_operators.h
@@ -1,3 +1,8 @@
/* Part of this file was contributed by NVIDIA under license:
* Copyright (C) 2020 NVIDIA Corporation
* SPDX-License-Identifier: MIT
*/

#pragma once

#include "common/definitions.h"
@@ -103,6 +108,7 @@ DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bo
DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
DISPATCH9(CSRProd, marian::Tensor, Ptr<Allocator>, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float)

DISPATCH2(ReduceMaxLastAxis, marian::Tensor, const marian::Tensor&)
DISPATCH2(Softmax, marian::Tensor, marian::Tensor)
DISPATCH3(SoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor)

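The new `DISPATCH2` line declares `ReduceMaxLastAxis` for both backends and generates a thin wrapper that routes on the output tensor's device type. Roughly, as a sketch of the macro's dispatch pattern rather than its verbatim expansion:

```cpp
// Approximate expansion of DISPATCH2(ReduceMaxLastAxis, marian::Tensor,
// const marian::Tensor&); a sketch of the pattern, not the exact macro output.
namespace gpu { void ReduceMaxLastAxis(marian::Tensor, const marian::Tensor&); }
namespace cpu { void ReduceMaxLastAxis(marian::Tensor, const marian::Tensor&); }

static inline void ReduceMaxLastAxis(marian::Tensor out, const marian::Tensor& in) {
  if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
    gpu::ReduceMaxLastAxis(out, in);  // CUB kernel added in this PR
  else
    cpu::ReduceMaxLastAxis(out, in);  // currently aborts with "Not implemented"
}
```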