Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faiss GPU: bfloat16 brute-force kNN support #4014

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions faiss/gpu/GpuDistance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
FAISS_THROW_IF_NOT_MSG(
args.vectorType == args.queryType,
"limitation: both vectorType and queryType must currently "
"be the same (F32 or F16");
"be the same (F32 / F16 / BF16");

#if defined USE_NVIDIA_RAFT
// Note: For now, RAFT bfknn requires queries and vectors to be same layout
Expand Down Expand Up @@ -374,6 +374,8 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
bfKnnConvert<float>(prov, args);
} else if (args.vectorType == DistanceDataType::F16) {
bfKnnConvert<half>(prov, args);
} else if (args.vectorType == DistanceDataType::BF16) {
bfKnnConvert<__nv_bfloat16>(prov, args);
} else {
FAISS_THROW_MSG("unknown vectorType");
}
Expand Down Expand Up @@ -440,8 +442,10 @@ void bfKnn_single_query_shard(
args.k > 0,
"bfKnn_tiling: tiling vectors is only supported for k > 0");
size_t distance_size = args.vectorType == DistanceDataType::F32 ? 4
: args.vectorType == DistanceDataType::F16 ? 2
: 0;
: (args.vectorType == DistanceDataType::F16 ||
args.vectorType == DistanceDataType::BF16)
? 2
: 0;
FAISS_THROW_IF_NOT_MSG(
distance_size > 0, "bfKnn_tiling: unknown vectorType");
size_t shard_size = vectorsMemoryLimit / (args.dims * distance_size);
Expand Down Expand Up @@ -498,8 +502,10 @@ void bfKnn_tiling(
args.k > 0,
"bfKnn_tiling: tiling queries is only supported for k > 0");
size_t distance_size = args.queryType == DistanceDataType::F32 ? 4
: args.queryType == DistanceDataType::F16 ? 2
: 0;
: (args.queryType == DistanceDataType::F16 ||
args.queryType == DistanceDataType::BF16)
? 2
: 0;
FAISS_THROW_IF_NOT_MSG(
distance_size > 0, "bfKnn_tiling: unknown queryType");
size_t label_size = args.outIndicesType == IndicesDataType::I64 ? 8
Expand Down
1 change: 1 addition & 0 deletions faiss/gpu/GpuDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class GpuResourcesProvider;
enum class DistanceDataType {
F32 = 1,
F16,
BF16,
};

// Scalar type of the indices data
Expand Down
89 changes: 89 additions & 0 deletions faiss/gpu/impl/Distance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,27 @@ void runAllPairwiseL2Distance(
outDistances);
}

void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances) {
runAllPairwiseDistance<__nv_bfloat16>(
true,
res,
stream,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
outDistances);
}

void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand Down Expand Up @@ -544,6 +565,26 @@ void runAllPairwiseIPDistance(
outDistances);
}

void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances) {
runAllPairwiseDistance<__nv_bfloat16>(
false,
res,
stream,
vectors,
vectorsRowMajor,
nullptr,
queries,
queriesRowMajor,
outDistances);
}

void runL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand Down Expand Up @@ -596,6 +637,32 @@ void runL2Distance(
ignoreOutDistances);
}

void runL2Distance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances) {
runL2Distance<__nv_bfloat16>(
res,
stream,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
k,
outDistances,
outIndices,
ignoreOutDistances);
}

void runIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand Down Expand Up @@ -640,5 +707,27 @@ void runIPDistance(
outIndices);
}

void runIPDistance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices) {
runIPDistance<__nv_bfloat16>(
res,
stream,
vectors,
vectorsRowMajor,
queries,
queriesRowMajor,
k,
outDistances,
outIndices);
}

} // namespace gpu
} // namespace faiss
43 changes: 43 additions & 0 deletions faiss/gpu/impl/Distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ void runAllPairwiseL2Distance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -59,6 +69,15 @@ void runAllPairwiseIPDistance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

/// Calculates brute-force L2 distance between `vectors` and
/// `queries`, returning the k closest results seen
void runL2Distance(
Expand Down Expand Up @@ -91,6 +110,19 @@ void runL2Distance(
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);

void runL2Distance(
GpuResources* resources,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);

/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
void runIPDistance(
Expand All @@ -115,6 +147,17 @@ void runIPDistance(
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);

void runIPDistance(
GpuResources* resources,
cudaStream_t stream,
Tensor<__nv_bfloat16, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);

//
// General distance implementation, assumes that all arguments are on the
// device. This is the top-level internal distance function to call to dispatch
Expand Down
8 changes: 4 additions & 4 deletions faiss/gpu/impl/GpuScalarQuantizer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {
inline __device__ void decode(void* data, idx_t vec, int d, float* out)
const {
half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec];
out[0] = Convert<half, float>()(p[d]);
out[0] = ConvertTo<float>::to(p[d]);
}

inline __device__ float decodePartial(
Expand All @@ -172,7 +172,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {
int d,
float v[kDimPerIter]) const {
half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec];
p[d] = Convert<float, half>()(v[0]);
p[d] = ConvertTo<half>::to(v[0]);
}

inline __device__ void encodePartial(
Expand All @@ -191,11 +191,11 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {
static constexpr int kEncodeBits = 16;

inline __device__ EncodeT encodeNew(int dim, float v) const {
return Convert<float, half>()(v);
return ConvertTo<half>::to(v);
}

inline __device__ float decodeNew(int dim, EncodeT v) const {
return Convert<half, float>()(v);
return ConvertTo<float>::to(v);
}

int bytesPerVec;
Expand Down
10 changes: 10 additions & 0 deletions faiss/gpu/impl/L2Norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -276,5 +276,15 @@ void runL2Norm(
runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
}

void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<__nv_bfloat16, __nv_bfloat162>(
input, inputRowMajor, output, normSquared, stream);
}

} // namespace gpu
} // namespace faiss
7 changes: 7 additions & 0 deletions faiss/gpu/impl/L2Norm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,12 @@ void runL2Norm(
bool normSquared,
cudaStream_t stream);

void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream);

} // namespace gpu
} // namespace faiss
8 changes: 2 additions & 6 deletions faiss/gpu/impl/VectorResidual.cu
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,8 @@ __global__ void gatherReconstructByIds(
auto vec = vecs[id];
auto outVec = out[blockIdx.x];

Convert<T, float> conv;

for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
outVec[i] = id == idx_t(-1) ? 0.0f : conv(vec[i]);
outVec[i] = id == idx_t(-1) ? 0.0f : ConvertTo<float>::to(vec[i]);
}
}

Expand All @@ -131,10 +129,8 @@ __global__ void gatherReconstructByRange(
auto vec = vecs[id];
auto outVec = out[blockIdx.x];

Convert<T, float> conv;

for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
outVec[i] = id == idx_t(-1) ? 0.0f : conv(vec[i]);
outVec[i] = id == idx_t(-1) ? 0.0f : ConvertTo<float>::to(vec[i]);
}
}

Expand Down
Loading
Loading