Skip to content

Commit

Permalink
Quickly merge multiple non-overlapping GBWTs (samples still missing)
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Nov 15, 2017
1 parent 9db6fb6 commit 8f792df
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 18 deletions.
96 changes: 96 additions & 0 deletions gbwt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,102 @@ GBWT::copy(const GBWT& source)

//------------------------------------------------------------------------------

// Merging constructor: builds a GBWT from multiple source GBWTs that do not share nodes.
//
// The merged header sums the sequence counts and sizes of the non-empty sources and
// uses the smallest offset and the largest alphabet size among them. Every record in
// the effective range of a source is attributed to that source; if two sources claim
// the same merged record, an error is printed to std::cerr and the program exits with
// EXIT_FAILURE. The endmarker records of the sources are concatenated into a single
// record, and all other records are copied byte-for-byte from their source.
//
// Empty sources are ignored in every pass. If 'sources' is empty or contains only
// empty GBWTs, the constructor returns without building anything.
//
// Samples are not merged yet (see the FIXME at the end).
GBWT::GBWT(const std::vector<GBWT>& sources)
{
  if(sources.empty()) { return; }

  // Merge the headers.
  size_type valid_sources = 0;
  size_type data_size = 0;
  for(const GBWT& source : sources)
  {
    if(source.empty()) { continue; }
    this->header.sequences += source.header.sequences;
    this->header.size += source.header.size;
    if(valid_sources == 0)
    {
      this->header.offset = source.header.offset;
      this->header.alphabet_size = source.header.alphabet_size;
    }
    else
    {
      this->header.offset = std::min(this->header.offset, source.header.offset);
      this->header.alphabet_size = std::max(this->header.alphabet_size, source.header.alphabet_size);
    }
    data_size += source.bwt.data.size();
    valid_sources++;
  }
  if(valid_sources == 0) { return; }

  // The record array is filled in manually below. Nothing else in this constructor
  // sets the number of records, so set it here before any data is written.
  this->bwt.records = this->effective();

  // Determine the origin of each record. The value sources.size() means "no source".
  sdsl::int_vector<0> origin(this->effective(), sources.size(), bit_length(sources.size()));
  for(size_type source_id = 0; source_id < sources.size(); source_id++)
  {
    const GBWT& source = sources[source_id];
    // The header of an empty source was not merged, so its comp values may not map
    // into the merged range. Skip it, as in the header pass.
    if(source.empty()) { continue; }
    for(comp_type source_comp = 1; source_comp < source.effective(); source_comp++)
    {
      comp_type merged_comp = this->toCompInternal(source.toNodeInternal(source_comp));
      if(origin[merged_comp] != sources.size())
      {
        std::cerr << "GBWT::GBWT(): Sources " << origin[merged_comp] << " and " << source_id << " both have node " << this->toNode(merged_comp) << std::endl;
        std::exit(EXIT_FAILURE);
      }
      origin[merged_comp] = source_id;
    }
  }

  // Merge the endmarkers.
  std::vector<size_type> limits(sources.size(), 0); // Pointers to the end of the current records.
  {
    DynamicRecord merged;
    for(size_type i = 0; i < sources.size(); i++)
    {
      const GBWT& source = sources[i];
      // An empty source contributes no runs or edges; do not touch its records.
      if(source.empty()) { continue; }
      CompressedRecord record = source.record(ENDMARKER);
      // Shift the edge ranks of this source past the edges already in the merged record.
      // The shift must be computed before the outgoing edges of this source are appended.
      for(CompressedRecordIterator iter(record); !(iter.end()); ++iter)
      {
        run_type run = *iter; run.first += merged.outdegree();
        merged.body.push_back(run); merged.body_size += run.second;
      }
      for(edge_type outedge : record.outgoing)
      {
        merged.outgoing.push_back(outedge);
      }
      limits[i] = source.bwt.limit(0);
    }
    merged.recode();
    merged.writeBWT(this->bwt.data);
  }

  // Merge the BWTs. offsets[0] stays 0, because the endmarker record was written first.
  this->bwt.data.reserve(data_size + this->bwt.data.size());
  std::vector<size_type> offsets(this->effective(), 0);
  for(comp_type comp = 1; comp < this->effective(); comp++)
  {
    offsets[comp] = this->bwt.data.size();
    const size_type source_id = origin[comp];
    if(source_id >= sources.size())
    {
      this->bwt.data.push_back(0); // Empty record, outdegree 0.
      continue;
    }
    const GBWT& source = sources[source_id];
    // The record occupies the bytes between the end of the previous record copied
    // from this source and the end of the current record in the source.
    size_type start = limits[source_id], limit = source.bwt.limit(source.toCompInternal(this->toNodeInternal(comp)));
    limits[source_id] = limit;
    for(size_type i = start; i < limit; i++)
    {
      this->bwt.data.push_back(source.bwt.data[i]);
    }
  }

  // Build the index for the BWT.
  this->bwt.buildIndex(offsets);
  offsets = std::vector<size_type>(); // Release the memory.

  // FIXME samples
}

//------------------------------------------------------------------------------

size_type
GBWT::runs() const
{
Expand Down
1 change: 1 addition & 0 deletions include/gbwt/dynamic_gbwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class DynamicGBWT
}

// Maps a node identifier to a comp value. Node 0 maps to itself; every other
// node is shifted down by the header offset.
comp_type toComp(node_type node) const
{
  if(node == 0) { return node; }
  return node - this->header.offset;
}
// Maps a comp value back to a node identifier. Comp 0 maps to itself; every
// other comp is shifted up by the header offset.
node_type toNode(comp_type comp) const
{
  if(comp == 0) { return comp; }
  return comp + this->header.offset;
}

size_type nodeSize(node_type node) const { return this->record(node).size(); }

Expand Down
6 changes: 6 additions & 0 deletions include/gbwt/gbwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class GBWT
GBWT();
GBWT(const GBWT& source);
GBWT(GBWT&& source);
GBWT(const std::vector<GBWT>& sources);
~GBWT();

void swap(GBWT& another);
Expand Down Expand Up @@ -133,6 +134,7 @@ class GBWT
}

// Maps a node identifier to a comp value. Node 0 maps to itself; every other
// node is shifted down by the header offset.
comp_type toComp(node_type node) const
{
  if(node == 0) { return node; }
  return node - this->header.offset;
}
// Maps a comp value back to a node identifier. Comp 0 maps to itself; every
// other comp is shifted up by the header offset.
node_type toNode(comp_type comp) const
{
  if(comp == 0) { return comp; }
  return comp + this->header.offset;
}

size_type nodeSize(node_type node) const { return this->record(node).size(); }

Expand Down Expand Up @@ -211,6 +213,10 @@ class GBWT

private:
void copy(const GBWT& source);

// These assume that the node/comp is not the endmarker.
// Node -> comp without the special case for 0: always subtracts the header offset.
comp_type toCompInternal(node_type node) const
{
  return node - this->header.offset;
}
// Comp -> node without the special case for 0: always adds the header offset.
node_type toNodeInternal(comp_type comp) const
{
  return comp + this->header.offset;
}
}; // class GBWT

//------------------------------------------------------------------------------
Expand Down
7 changes: 7 additions & 0 deletions include/gbwt/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ struct DynamicRecord
// Sort the outgoing edges if they are not sorted.
void recode();

// Write the compressed representation.
void writeBWT(std::vector<byte_type>& data) const;

//------------------------------------------------------------------------------

// Returns (node, LF(i, node)) or invalid_edge() if the offset is invalid.
Expand Down Expand Up @@ -223,6 +226,10 @@ struct RecordArray

explicit RecordArray(const std::vector<DynamicRecord>& bwt);

// Set the number of records, build the data manually, and give the offsets to build the index.
explicit RecordArray(size_type array_size);
void buildIndex(const std::vector<size_type>& offsets);

void swap(RecordArray& another);
RecordArray& operator=(const RecordArray& source);
RecordArray& operator=(RecordArray&& source);
Expand Down
49 changes: 31 additions & 18 deletions support.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,26 @@ DynamicRecord::recode()
for(run_type& run : this->body) { run.first = this->edgeTo(run.first); }
}

// Appends the compressed representation of this record to 'data'.
//
// The layout is: the outdegree, then for each outgoing edge the difference between
// its first component and that of the previous edge (starting from 0) followed by
// its second component, and finally the body written with a Run encoder. The body
// is omitted when the record has no outgoing edges.
void
DynamicRecord::writeBWT(std::vector<byte_type>& data) const
{
  // Header: outdegree and the outgoing edges.
  ByteCode::write(data, this->outdegree());
  node_type previous = 0;
  for(edge_type edge : this->outgoing)
  {
    ByteCode::write(data, edge.first - previous);
    ByteCode::write(data, edge.second);
    previous = edge.first;
  }

  // A record without outgoing edges has no body to encode.
  if(this->outdegree() == 0) { return; }

  // Body: one encoded run at a time.
  Run encoder(this->outdegree());
  for(run_type current : this->body)
  {
    encoder.write(data, current);
  }
}
//------------------------------------------------------------------------------

edge_type
Expand Down Expand Up @@ -390,27 +410,20 @@ RecordArray::RecordArray(const std::vector<DynamicRecord>& bwt) :
for(size_type i = 0; i < bwt.size(); i++)
{
offsets[i] = this->data.size();
const DynamicRecord& current = bwt[i];
bwt[i].writeBWT(this->data);
}

// Write the outgoing edges.
ByteCode::write(this->data, current.outdegree());
node_type prev = 0;
for(edge_type outedge : current.outgoing)
{
ByteCode::write(this->data, outedge.first - prev);
prev = outedge.first;
ByteCode::write(this->data, outedge.second);
}
this->buildIndex(offsets);
}

// Write the body.
if(current.outdegree() > 0)
{
Run encoder(current.outdegree());
for(run_type run : current.body) { encoder.write(this->data, run); }
}
}
// Creates a record array for 'array_size' records without writing any record data.
// Only the record count is initialized here. The caller is expected to fill in the
// data manually and then call buildIndex() with the starting offset of each record.
RecordArray::RecordArray(size_type array_size) :
records(array_size)
{
}

// Compress the index.
void
RecordArray::buildIndex(const std::vector<size_type>& offsets)
{
sdsl::sd_vector_builder builder(this->data.size(), offsets.size());
for(size_type offset : offsets) { builder.set(offset); }
this->index = sdsl::sd_vector<>(builder);
Expand Down

0 comments on commit 8f792df

Please sign in to comment.