Skip to content

Commit

Permalink
Almost finished fast merging
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Nov 24, 2017
1 parent 8f792df commit 7cd8431
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 71 deletions.
20 changes: 6 additions & 14 deletions dynamic_gbwt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,23 +151,15 @@ DynamicGBWT::load(std::istream& in)
{
DASamples samples;
samples.load(in);
sdsl::sd_vector<>::select_1_type offset_select(&(samples.sampled_offsets));
size_type record_rank = 0, max_rank = samples.record_rank(samples.sampled_records.size());
size_type record_start = 0;
size_type sample_rank = 0;
for(comp_type comp = 0; comp < this->effective(); comp++)
SampleIterator sample_iter(samples);
for(SampleRangeIterator range_iter(samples); !(range_iter.end()); ++range_iter)
{
if(samples.sampled_records[comp] == 0) { continue; }
DynamicRecord& current = this->bwt[comp];
size_type limit = (record_rank + 1 < max_rank ? samples.bwt_select(record_rank + 2) : samples.bwt_ranges.size());
while(sample_rank < samples.size())
DynamicRecord& current = this->bwt[range_iter.record()];
while(!(sample_iter.end()) && sample_iter.offset() < range_iter.limit())
{
size_type sample_offset = offset_select(sample_rank + 1);
if(sample_offset >= limit) { break; }
current.ids.push_back(sample_type(sample_offset - record_start, samples.array[sample_rank]));
sample_rank++;
current.ids.push_back(sample_type(sample_iter.offset() - range_iter.start(), *sample_iter));
++sample_iter;
}
record_rank++; record_start = limit;
}
}

Expand Down
68 changes: 23 additions & 45 deletions gbwt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ GBWT::GBWT(const std::vector<GBWT>& sources)

// Merge the headers.
size_type valid_sources = 0;
size_type data_size = 0;
for(const GBWT& source : sources)
{
if(source.empty()) { continue; }
Expand All @@ -140,76 +139,55 @@ GBWT::GBWT(const std::vector<GBWT>& sources)
this->header.offset = std::min(this->header.offset, source.header.offset);
this->header.alphabet_size = std::max(this->header.alphabet_size, source.header.alphabet_size);
}
data_size += source.bwt.data.size();
valid_sources++;
}
if(valid_sources == 0) { return; }

// Determine the mapping between source comp values and merged comp values.
std::vector<size_type> record_offsets(sources.size());
for(size_type i = 0; i < sources.size(); i++)
{
record_offsets[i] = sources[i].header.offset - this->header.offset;
}

// Determine the origin of each record.
sdsl::int_vector<0> origin(this->effective(), sources.size(), bit_length(sources.size()));
sdsl::int_vector<0> origins(this->effective(), sources.size(), bit_length(sources.size()));
for(size_type source_id = 0; source_id < sources.size(); source_id++)
{
const GBWT& source = sources[source_id];
for(comp_type source_comp = 1; source_comp < source.effective(); source_comp++)
{
comp_type merged_comp = this->toCompInternal(source.toNodeInternal(source_comp));
if(origin[merged_comp] != sources.size())
comp_type merged_comp = source_comp + record_offsets[source_id];
if(origins[merged_comp] != sources.size())
{
std::cerr << "GBWT::GBWT(): Sources " << origin[merged_comp] << " and " << source_id << " both have node " << this->toNode(merged_comp) << std::endl;
std::cerr << "GBWT::GBWT(): Sources " << origins[merged_comp] << " and " << source_id << " both have node " << this->toNode(merged_comp) << std::endl;
std::exit(EXIT_FAILURE);
}
origin[merged_comp] = source_id;
origins[merged_comp] = source_id;
}
}

// Merge the endmarkers.
std::vector<size_type> limits(sources.size(), 0); // Pointers to the end of the current records.
// Interleave the BWTs.
{
DynamicRecord merged;
std::vector<RecordArray const*> bwt_sources(sources.size());
for(size_type i = 0; i < sources.size(); i++)
{
const GBWT& source = sources[i];
CompressedRecord record = source.record(ENDMARKER);
for(CompressedRecordIterator iter(record); !(iter.end()); ++iter)
{
run_type run = *iter; run.first += merged.outdegree();
merged.body.push_back(run); merged.body_size += run.second;
}
for(edge_type outedge : record.outgoing)
{
merged.outgoing.push_back(outedge);
}
limits[i] = source.bwt.limit(0);
bwt_sources[i] = &(sources[i].bwt);
}
merged.recode();
merged.writeBWT(this->bwt.data);
this->bwt = RecordArray(bwt_sources, origins, record_offsets);
}

// Merge the BWTs.
this->bwt.data.reserve(data_size + this->bwt.data.size());
std::vector<size_type> offsets(this->effective(), 0);
for(comp_type comp = 1; comp < this->effective(); comp++)
// Interleave the samples.
{
offsets[comp] = this->bwt.data.size();
if(origin[comp] >= sources.size())
{
this->bwt.data.push_back(0); // Empty record, outdegree 0.
continue;
}
const GBWT& source = sources[origin[comp]];
size_type start = limits[origin[comp]], limit = source.bwt.limit(source.toCompInternal(this->toNodeInternal(comp)));
limits[origin[comp]] = limit;
for(size_type i = start; i < limit; i++)
std::vector<DASamples const*> sample_sources(sources.size());
std::vector<size_type> sequence_counts(sources.size());
for(size_type i = 0; i < sources.size(); i++)
{
this->bwt.data.push_back(source.bwt.data[i]);
sample_sources[i] = &(sources[i].da_samples);
sequence_counts[i] = sources[i].sequences();
}
this->da_samples = DASamples(sample_sources, origins, record_offsets, sequence_counts);
}

// Build the index for the BWT.
this->bwt.buildIndex(offsets);
offsets = std::vector<size_type>();

// FIXME samples
}

//------------------------------------------------------------------------------
Expand Down
4 changes: 0 additions & 4 deletions include/gbwt/gbwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,6 @@ class GBWT

private:
void copy(const GBWT& source);

// These assume that the node/comp is not the endmarker.
comp_type toCompInternal(node_type node) const { return node - this->header.offset; }
node_type toNodeInternal(comp_type comp) const { return comp + this->header.offset; }
}; // class GBWT

//------------------------------------------------------------------------------
Expand Down
83 changes: 83 additions & 0 deletions include/gbwt/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,89 @@ struct CompressedRecordFullIterator

//------------------------------------------------------------------------------

/*
Iterator for DASamples. The iterator does not care about records. If the record
for the current sample starts at offset i, the correct sample_type is
(iter.offset() - i, *iter).
*/

struct SampleIterator
{
  // Builds an iterator over all document-array samples in 'source' and
  // positions it at the first sample.
  explicit SampleIterator(const DASamples& source) :
    data(source),
    pos(0), sample_offset(0),
    offset_select(&(source.sampled_offsets))
  {
    this->update();
  }

  // True when every sample has been visited.
  bool end() const { return (this->pos >= this->data.size()); }

  // Advances to the next sample and refreshes the cached BWT offset.
  void operator++() { this->pos++; this->update(); }

  // The value stored for the current sample.
  size_type operator*() const { return this->data.array[this->pos]; }

  // Global BWT offset of the current sample. The iterator does not track
  // records; subtract the record start to get a record-relative offset.
  size_type offset() const { return this->sample_offset; }

  const DASamples& data;          // The samples being iterated over.
  size_type pos, sample_offset;   // Current sample rank and its global BWT offset.

private:
  // Select support over the sampled offsets; select(pos + 1) yields the
  // BWT offset of sample 'pos'.
  sdsl::sd_vector<>::select_1_type offset_select;

  // Caches the BWT offset of the current sample; no-op past the end.
  void update()
  {
    if(!(this->end()))
    {
      this->sample_offset = this->offset_select(this->pos + 1);
    }
  }
};

/*
Iterator for sampled ranges in DASamples.
*/

struct SampleRangeIterator
{
  // Builds an iterator over the records of 'source' that contain samples
  // and positions it at the first such record.
  explicit SampleRangeIterator(const DASamples& source) :
    data(source),
    record_id(0), record_rank(0),
    record_start(0), record_limit(0)
  {
    this->advance();
  }

  // True when every record has been visited.
  bool end() const { return (this->record_id >= this->data.records()); }

  // Advances to the next record with samples. The rank is only incremented
  // here, so it counts sampled records, not all records.
  void operator++() { this->record_id++; this->record_rank++; this->advance(); }

  // Identifier of the current record.
  size_type record() const { return this->record_id; }
  // Rank of the current record among the records with samples.
  size_type rank() const { return this->record_rank; }
  // Start of the BWT range of the current record (inclusive).
  size_type start() const { return this->record_start; }
  // End of the BWT range of the current record (exclusive).
  size_type limit() const { return this->record_limit; }
  // Length of the BWT range of the current record.
  size_type length() const { return this->limit() - this->start(); }

  const DASamples& data;                  // The samples being iterated over.
  size_type record_id, record_rank;       // Current record and its rank among sampled records.
  size_type record_start, record_limit;   // BWT range [start, limit) of the current record.

private:
  // Skips records without samples. For a sampled record, the previous
  // limit becomes the new start, because the ranges of sampled records
  // are stored consecutively.
  void advance()
  {
    while(!(this->end()))
    {
      if(this->data.isSampled(this->record_id))
      {
        this->record_start = this->record_limit;
        this->record_limit = this->data.limit(this->record_rank);
        return;
      }
      this->record_id++; // Was a bare 'record_id++'; use 'this->' like every other member access in this file.
    }
  }
};

//------------------------------------------------------------------------------

} // namespace gbwt

#endif // GBWT_INTERNAL_H
11 changes: 11 additions & 0 deletions include/gbwt/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ struct RecordArray
~RecordArray();

explicit RecordArray(const std::vector<DynamicRecord>& bwt);
RecordArray(const std::vector<RecordArray const*> sources, const sdsl::int_vector<0>& origins, const std::vector<size_type>& record_offsets);

// Set the number of records, build the data manually, and give the offsets to build the index.
explicit RecordArray(size_type array_size);
Expand Down Expand Up @@ -274,6 +275,7 @@ struct DASamples
~DASamples();

explicit DASamples(const std::vector<DynamicRecord>& bwt);
DASamples(const std::vector<DASamples const*> sources, const sdsl::int_vector<0>& origins, const std::vector<size_type>& record_offsets, const std::vector<size_type>& sequence_counts);

void swap(DASamples& another);
DASamples& operator=(const DASamples& source);
Expand All @@ -282,6 +284,7 @@ struct DASamples
size_type serialize(std::ostream& out, sdsl::structure_tree_node* v = nullptr, std::string name = "") const;
void load(std::istream& in);

size_type records() const { return this->sampled_records.size(); }
size_type size() const { return this->array.size(); }

// Returns invalid_sequence() if there is no sample.
Expand All @@ -290,6 +293,14 @@ struct DASamples
// Returns the first sample at >= offset or invalid_sample() if there is no sample.
sample_type nextSample(size_type record, size_type offset) const;

bool isSampled(size_type record) const { return this->sampled_records[record]; }

// We assume that 'record' has samples.
size_type start(size_type record) const { return this->bwt_select(this->record_rank(record) + 1); }

// Upper bound for the range of a record, given its rank among records with samples.
size_type limit(size_type rank) const;

private:
void copy(const DASamples& source);
void setVectors();
Expand Down
Loading

0 comments on commit 7cd8431

Please sign in to comment.