Skip to content

Commit

Permalink
stop sorting the input mapping file
Browse files: browse the repository at this point in the history
ekg committed Jun 1, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 70e896a commit c738c1d
Showing 2 changed files with 4 additions and 71 deletions.
71 changes: 1 addition & 70 deletions src/interface/main.cpp
Original file line number | Diff line number | Diff line change
@@ -91,76 +91,7 @@ int main(int argc, char** argv) {
if (yeet_parameters.approx_mapping) {
return 0;
}
} else {
robin_hood::unordered_flat_map< std::string, std::pair<skch::seqno_t, uint64_t> > seqName_to_seqCounterAndLen;
skch::seqno_t seqCounter = 0;
for(const auto &fileName : map_parameters.querySequences) {
// check if there is a .fai
std::string fai_name = fileName + ".fai";
if (fs::exists(fai_name)) {
// if so, process the .fai to determine our sequence length
std::string line;
std::ifstream in(fai_name.c_str());
while (std::getline(in, line)) {
auto line_split = skch::CommonFunc::split(line, '\t');
const std::string seq_name = line_split[0];
const uint64_t seq_len = std::stoull(line_split[1]);
seqName_to_seqCounterAndLen[seq_name] = std::make_pair(seqCounter++, seq_len);
}
} else {
// if not, warn that this is expensive
std::cerr << "[wfmash::align] WARNING, no .fai index found for " << fileName << ", reading the file to sort the mappings (slow)" << std::endl;
for(const auto &fileName : map_parameters.querySequences)
{
seqiter::for_each_seq_in_file(
fileName, {}, "",
[&](const std::string& seq_name,
const std::string& seq) {
seqName_to_seqCounterAndLen[seq_name] = std::make_pair(seqCounter++, seq.length());
});
}
}
}


igzstream mappingListStream(map_parameters.outFileName.c_str());
std::string mappingRecordLine;
align::MappingBoundaryRow currentRecord;
std::vector<align::MappingBoundaryRow> allReadMappings;

while (!mappingListStream.eof()){
std::getline(mappingListStream, mappingRecordLine);
if( !mappingRecordLine.empty() ) {
align::Aligner::parseMashmapRow(mappingRecordLine, currentRecord);

allReadMappings.push_back(currentRecord);
}
}

std::sort(allReadMappings.begin(), allReadMappings.end(), [&seqName_to_seqCounterAndLen](const align::MappingBoundaryRow &a, const align::MappingBoundaryRow &b)
{
return (seqName_to_seqCounterAndLen[a.qId].first < seqName_to_seqCounterAndLen[b.qId].first);
});

std::ofstream outstrm(align_parameters.mashmapPafFile);
for(auto &e : allReadMappings)
{
outstrm << e.qId
<< "\t" << seqName_to_seqCounterAndLen[e.qId].second
<< "\t" << e.qStartPos
<< "\t" << e.qEndPos
<< "\t" << (e.strand == skch::strnd::FWD ? "+" : "-")
<< "\t" << e.refId
<< "\t" << seqName_to_seqCounterAndLen[e.refId].second
<< "\t" << e.rStartPos
<< "\t" << e.rEndPos
<< "\t" << 0
<< "\t" << std::max(e.rEndPos - e.rStartPos, e.qEndPos - e.qStartPos)
<< "\t" << 255
<< "\t" << "id:f:" << e.mashmap_estimated_identity
<< "\n";
}
}
}

if (align_parameters.sam_format) {
// Prepare SAM header
4 changes: 3 additions & 1 deletion src/interface/parse_args.hpp
Original file line number | Diff line number | Diff line change
@@ -638,10 +638,12 @@ void parse_args(int argc,
}

if (align_input_paf) {
// directly use the input mapping file
yeet_parameters.remapping = true;
map_parameters.outFileName = args::get(align_input_paf);
align_parameters.mashmapPafFile = temp_file::create();
align_parameters.mashmapPafFile = args::get(align_input_paf);
} else {
// make a temporary mapping file
map_parameters.outFileName = temp_file::create();
align_parameters.mashmapPafFile = map_parameters.outFileName;
}

0 comments on commit c738c1d

Please sign in to comment.