Skip to content

Commit

Permalink
Initial change for save/load as one file. (#504)
Browse files Browse the repository at this point in the history
Co-authored-by: REDMOND\ninchen <[email protected]>
  • Loading branch information
NingyuanChen and REDMOND\ninchen authored Jan 8, 2024
1 parent 720c45c commit ee62d8d
Show file tree
Hide file tree
Showing 12 changed files with 382 additions and 104 deletions.
3 changes: 2 additions & 1 deletion include/abstract_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@ template <typename data_t> class AbstractDataStore
virtual ~AbstractDataStore() = default;

// Return number of points returned
virtual location_t load(const std::string &filename) = 0;
virtual location_t load(const std::string &filename, size_t offset) = 0;

// Why does store take num_pts? Since store only has capacity, but we allow
// resizing we can end up in a situation where the store has spare capacity.
// To optimize disk utilization, we pass the number of points that are "true"
// points, so that the store can discard the empty locations before saving.
virtual size_t save(const std::string &filename, const location_t num_pts) = 0;
virtual size_t save(std::ofstream &writer, const location_t num_pts, size_t offset) = 0;

DISKANN_DLLEXPORT virtual location_t capacity() const;

Expand Down
7 changes: 5 additions & 2 deletions include/abstract_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ class AbstractGraphStore
virtual ~AbstractGraphStore() = default;

// returns tuple of <nodes_read, start, num_frozen_points>
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix,
const size_t num_points) = 0;
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix, const size_t num_points,
size_t offset) = 0;
virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_fz_points,
const uint32_t start) = 0;

virtual int store(std::ofstream &writer, const size_t num_points, const size_t num_fz_points, const uint32_t start,
size_t offset) = 0;

// not synchronised, user should use lock when necvessary.
virtual const std::vector<location_t> &get_neighbours(const location_t i) const = 0;
virtual void add_neighbour(const location_t i, location_t neighbour_id) = 0;
Expand Down
1 change: 1 addition & 0 deletions include/distance.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "windows_customizations.h"
#include <cstdint>
#include <cstring>

namespace diskann
Expand Down
9 changes: 5 additions & 4 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
InMemDataStore(const location_t capacity, const size_t dim, std::unique_ptr<Distance<data_t>> distance_fn);
virtual ~InMemDataStore();

virtual location_t load(const std::string &filename) override;
virtual size_t save(const std::string &filename, const location_t num_points) override;
virtual location_t load(const std::string &filename, size_t offset = 0) override;
virtual size_t save(const std::string &filename, const location_t num_pts) override;
virtual size_t save(std::ofstream &writer, const location_t num_pts, size_t offset) override;

virtual size_t get_aligned_dim() const override;

Expand Down Expand Up @@ -59,9 +60,9 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_
virtual location_t expand(const location_t new_size) override;
virtual location_t shrink(const location_t new_size) override;

virtual location_t load_impl(const std::string &filename);
virtual location_t load_impl(const std::string &filename, size_t offset);
#ifdef EXEC_ENV_OLS
virtual location_t load_impl(AlignedFileReader &reader);
virtual location_t load_impl(AlignedFileReader &reader, size_t offset);
#endif

private:
Expand Down
17 changes: 10 additions & 7 deletions include/in_mem_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ class InMemGraphStore : public AbstractGraphStore
InMemGraphStore(const size_t total_pts, const size_t reserve_graph_degree);

// returns tuple of <nodes_read, start, num_frozen_points>
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix,
const size_t num_points) override;
virtual std::tuple<uint32_t, uint32_t, size_t> load(const std::string &index_path_prefix, const size_t num_points,
size_t offset) override;
virtual int store(const std::string &index_path_prefix, const size_t num_points, const size_t num_frozen_points,
const uint32_t start) override;

virtual int store(std::ofstream &writer, const size_t num_points, const size_t num_fz_points, const uint32_t start,
size_t offset) override;
virtual const std::vector<location_t> &get_neighbours(const location_t i) const override;
virtual void add_neighbour(const location_t i, location_t neighbour_id) override;
virtual void clear_neighbours(const location_t i) override;
Expand All @@ -33,13 +34,15 @@ class InMemGraphStore : public AbstractGraphStore
virtual uint32_t get_max_observed_degree() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points);
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points,
size_t offset);
#ifdef EXEC_ENV_OLS
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(AlignedFileReader &reader, size_t expected_num_points);
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(AlignedFileReader &reader, size_t expected_num_points,
size_t offset);
#endif

int save_graph(const std::string &index_path_prefix, const size_t active_points, const size_t num_frozen_points,
const uint32_t start);
int save_graph(std::ofstream &writer, const size_t active_points, const size_t num_frozen_points,
const uint32_t start, size_t offset);

private:
size_t _max_range_of_graph = 0;
Expand Down
33 changes: 23 additions & 10 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@

namespace diskann
{
// This struct is used for storing metadata for save_as_one_file version 1.
struct SaveLoadMetaDataV1
{
uint64_t data_offset;
uint64_t delete_list_offset;
uint64_t tags_offset;
uint64_t graph_offset;
};

inline double estimate_ram_usage(size_t size, uint32_t dim, uint32_t datasize, uint32_t degree)
{
Expand Down Expand Up @@ -57,7 +65,9 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
const size_t num_frozen_pts = 0, const bool dynamic_index = false,
const bool enable_tags = false, const bool concurrent_consolidate = false,
const bool pq_dist_build = false, const size_t num_pq_chunks = 0,
const bool use_opq = false, const bool filtered_index = false);
const bool use_opq = false, const bool filtered_index = false,
bool save_as_one_file = false, uint64_t save_as_one_file_version = 1,
bool load_from_one_file = false, uint64_t load_from_one_file_version = 1);

DISKANN_DLLEXPORT Index(const IndexConfig &index_config, std::unique_ptr<AbstractDataStore<T>> data_store,
std::unique_ptr<AbstractGraphStore> graph_store);
Expand Down Expand Up @@ -313,15 +323,15 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas
DISKANN_DLLEXPORT size_t save_tags(std::string filename);
DISKANN_DLLEXPORT size_t save_delete_list(const std::string &filename);
#ifdef EXEC_ENV_OLS
DISKANN_DLLEXPORT size_t load_graph(AlignedFileReader &reader, size_t expected_num_points);
DISKANN_DLLEXPORT size_t load_data(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_tags(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_delete_set(AlignedFileReader &reader);
DISKANN_DLLEXPORT size_t load_graph(AlignedFileReader &reader, size_t expected_num_points, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_data(AlignedFileReader &reader, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_tags(AlignedFileReader &reader, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_delete_set(AlignedFileReader &reader, size_t offset = 0);
#else
DISKANN_DLLEXPORT size_t load_graph(const std::string filename, size_t expected_num_points);
DISKANN_DLLEXPORT size_t load_data(std::string filename0);
DISKANN_DLLEXPORT size_t load_tags(const std::string tag_file_name);
DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename);
DISKANN_DLLEXPORT size_t load_graph(const std::string filename, size_t expected_num_points, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_data(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_tags(const std::string &filename, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename, size_t offset = 0);
#endif

private:
Expand Down Expand Up @@ -360,7 +370,10 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

bool _has_built = false;
bool _saturate_graph = false;
bool _save_as_one_file = false; // plan to support in next version
bool _save_as_one_file; // plan to support filtered index in next version.
uint64_t _save_as_one_file_version; // Version used for save index to single file.
bool _load_from_one_file; // Whether to load index from single file.
uint64_t _load_from_one_file_version; // Version used for save index to single file.
bool _dynamic_index = false;
bool _enable_tags = false;
bool _normalize_vecs = false; // Using normalied L2 for cosine.
Expand Down
42 changes: 39 additions & 3 deletions include/index_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ struct IndexConfig
bool concurrent_consolidate;
bool use_opq;
bool filtered_index;
bool save_as_one_file;
uint64_t save_as_one_file_version;
bool load_from_one_file;
uint64_t load_from_one_file_version;

size_t num_pq_chunks;
size_t num_frozen_pts;
Expand All @@ -45,12 +49,15 @@ struct IndexConfig
IndexConfig(DataStoreStrategy data_strategy, GraphStoreStrategy graph_strategy, Metric metric, size_t dimension,
size_t max_points, size_t num_pq_chunks, size_t num_frozen_points, bool dynamic_index, bool enable_tags,
bool pq_dist_build, bool concurrent_consolidate, bool use_opq, bool filtered_index,
std::string &data_type, const std::string &tag_type, const std::string &label_type,
std::shared_ptr<IndexWriteParameters> index_write_params,
bool save_as_one_file, uint64_t save_as_one_file_version, bool load_from_one_file,
uint64_t load_from_one_file_version, std::string &data_type, const std::string &tag_type,
const std::string &label_type, std::shared_ptr<IndexWriteParameters> index_write_params,
std::shared_ptr<IndexSearchParams> index_search_params)
: data_strategy(data_strategy), graph_strategy(graph_strategy), metric(metric), dimension(dimension),
max_points(max_points), dynamic_index(dynamic_index), enable_tags(enable_tags), pq_dist_build(pq_dist_build),
concurrent_consolidate(concurrent_consolidate), use_opq(use_opq), filtered_index(filtered_index),
save_as_one_file(save_as_one_file), save_as_one_file_version(save_as_one_file_version),
load_from_one_file(load_from_one_file), load_from_one_file_version(load_from_one_file_version),
num_pq_chunks(num_pq_chunks), num_frozen_pts(num_frozen_points), label_type(label_type), tag_type(tag_type),
data_type(data_type), index_write_params(index_write_params), index_search_params(index_search_params)
{
Expand Down Expand Up @@ -194,6 +201,30 @@ class IndexConfigBuilder
return *this;
}

IndexConfigBuilder &with_save_as_single_file(bool save_as_one_file)
{
this->_save_as_one_file = save_as_one_file;
return *this;
}

IndexConfigBuilder &with_save_as_single_file_version(uint64_t save_as_one_file_version)
{
this->_save_as_one_file_version = save_as_one_file_version;
return *this;
}

IndexConfigBuilder &with_load_from_single_file(bool load_from_one_file)
{
this->_load_from_one_file = load_from_one_file;
return *this;
}

IndexConfigBuilder &with_load_from_single_file_version(uint64_t load_from_one_file_version)
{
this->_save_as_one_file_version = load_from_one_file_version;
return *this;
}

IndexConfig build()
{
if (_data_type == "" || _data_type.empty())
Expand All @@ -219,7 +250,8 @@ class IndexConfigBuilder

return IndexConfig(_data_strategy, _graph_strategy, _metric, _dimension, _max_points, _num_pq_chunks,
_num_frozen_pts, _dynamic_index, _enable_tags, _pq_dist_build, _concurrent_consolidate,
_use_opq, _filtered_index, _data_type, _tag_type, _label_type, _index_write_params,
_use_opq, _filtered_index, _save_as_one_file, _save_as_one_file_version, _load_from_one_file,
_load_from_one_file_version, _data_type, _tag_type, _label_type, _index_write_params,
_index_search_params);
}

Expand All @@ -240,6 +272,10 @@ class IndexConfigBuilder
bool _concurrent_consolidate = false;
bool _use_opq = false;
bool _filtered_index{defaults::HAS_LABELS};
bool _save_as_one_file;
uint64_t _save_as_one_file_version;
bool _load_from_one_file;
uint64_t _load_from_one_file_version;

size_t _num_pq_chunks = 0;
size_t _num_frozen_pts{defaults::NUM_FROZEN_POINTS_STATIC};
Expand Down
43 changes: 30 additions & 13 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -714,13 +714,8 @@ inline void open_file_to_write(std::ofstream &writer, const std::string &filenam
}
}

template <typename T>
inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t ndims, size_t offset = 0)
template <typename T> inline size_t save_bin(std::ofstream &writer, T *data, size_t npts, size_t ndims, size_t offset)
{
std::ofstream writer;
open_file_to_write(writer, filename);

diskann::cout << "Writing bin: " << filename.c_str() << std::endl;
writer.seekp(offset, writer.beg);
int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
size_t bytes_written = npts * ndims * sizeof(T) + 2 * sizeof(uint32_t);
Expand All @@ -730,11 +725,22 @@ inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t
<< std::endl;

writer.write((char *)data, npts * ndims * sizeof(T));
writer.close();
diskann::cout << "Finished writing bin." << std::endl;
return bytes_written;
}

template <typename T>
inline size_t save_bin(const std::string &filename, T *data, size_t npts, size_t ndims, size_t offset = 0)
{
std::ofstream writer;
open_file_to_write(writer, filename);
diskann::cout << "Writing bin file: " << filename.c_str() << std::endl;
size_t bytes_written = save_bin<T>(writer, data, npts, ndims, offset);
writer.close();
diskann::cout << "Close file " << filename << "." << std::endl;
return bytes_written;
}

inline void print_progress(double percentage)
{
int val = (int)(percentage * 100);
Expand Down Expand Up @@ -938,12 +944,11 @@ template <typename T> void save_Tvecs(const char *filename, T *data, size_t npts
writer.write((char *)cur_pt, ndims * sizeof(T));
}
}

template <typename T>
inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset = 0)
inline size_t save_data_in_base_dimensions(std::ofstream &writer, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset)
{
std::ofstream writer; //(filename, std::ios::binary | std::ios::out);
open_file_to_write(writer, filename);
int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
size_t bytes_written = 2 * sizeof(uint32_t) + npts * ndims * sizeof(T);
writer.seekp(offset, writer.beg);
Expand All @@ -953,10 +958,21 @@ inline size_t save_data_in_base_dimensions(const std::string &filename, T *data,
{
writer.write((char *)(data + i * aligned_dim), ndims * sizeof(T));
}
writer.close();
return bytes_written;
}

template <typename T>
inline size_t save_data_in_base_dimensions(const std::string &filename, T *data, size_t npts, size_t ndims,
size_t aligned_dim, size_t offset = 0)
{
std::ofstream writer; //(filename, std::ios::binary | std::ios::out);
open_file_to_write(writer, filename);
size_t file_size = save_data_in_base_dimensions(writer, data, npts, ndims, aligned_dim, offset);
writer.close();

return file_size;
}

template <typename T>
inline void copy_aligned_data_from_file(const char *bin_file, T *&data, size_t &npts, size_t &dim,
const size_t &rounded_dim, size_t offset = 0)
Expand All @@ -968,11 +984,12 @@ inline void copy_aligned_data_from_file(const char *bin_file, T *&data, size_t &
throw diskann::ANNException("Null pointer passed to copy_aligned_data_from_file function", -1, __FUNCSIG__,
__FILE__, __LINE__);
}

std::ifstream reader;
reader.exceptions(std::ios::badbit | std::ios::failbit);
reader.open(bin_file, std::ios::binary);
reader.seekg(offset, reader.beg);

reader.seekg(offset, reader.beg);
int npts_i32, dim_i32;
reader.read((char *)&npts_i32, sizeof(int));
reader.read((char *)&dim_i32, sizeof(int));
Expand Down
3 changes: 3 additions & 0 deletions src/distance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,4 +730,7 @@ template DISKANN_DLLEXPORT class SlowDistanceL2<float>;
template DISKANN_DLLEXPORT class SlowDistanceL2<int8_t>;
template DISKANN_DLLEXPORT class SlowDistanceL2<uint8_t>;

template DISKANN_DLLEXPORT Distance<float> *get_distance_function(Metric m);
template DISKANN_DLLEXPORT Distance<int8_t> *get_distance_function(Metric m);
template DISKANN_DLLEXPORT Distance<uint8_t> *get_distance_function(Metric m);
} // namespace diskann
Loading

0 comments on commit ee62d8d

Please sign in to comment.