Skip to content

Commit

Permalink
Add a workflow to pick more central cluster representatives
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Feb 10, 2025
1 parent 1668032 commit 8ef870f
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 1 deletion.
1 change: 1 addition & 0 deletions data/workflow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ set(GENERATED_WORKFLOWS
workflow/taxonomy.sh
workflow/linsearch.sh
workflow/databases.sh
workflow/pickconsensusrep.sh
workflow/nucleotide_clustering.sh
workflow/iterativepp.sh
workflow/tsv2exprofiledb.sh
Expand Down
82 changes: 82 additions & 0 deletions data/workflow/pickconsensusrep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/sh -e

# Report a fatal error and stop the workflow.
#   $1 - message describing what went wrong; printed with an "Error: " prefix
# Always terminates the script with exit status 1.
fail() {
    echo "Error: ${1}"; exit 1
}

# Check whether a path is missing.
#   $1 - path to test
# Returns 0 when "$1" is NOT an existing regular file, 1 when it is.
notExists() {
    if [ -f "$1" ]; then
        return 1
    fi
    return 0
}

# Validate the environment and the four positional arguments:
#   $1 seqDB, $2 clusterDB, $3 outClusterDB, $4 tmp directory.
# Each check prints a message and exits with status 1 on failure, except the
# tmp-directory check, which creates the directory instead of aborting.
if [ -z "$MMSEQS" ]; then
    echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1
fi
if [ "$#" -ne 4 ]; then
    echo "Please provide <seqDB> <clusterDB> <outClusterDB> <tmp>" && exit 1
fi
if [ ! -f "$1.dbtype" ]; then
    echo "$1.dbtype not found!" && exit 1
fi
if [ ! -f "$2.dbtype" ]; then
    echo "$2.dbtype not found!" && exit 1
fi
if [ ! -d "$4" ]; then
    echo "tmp directory $4 not found!" && mkdir -p "$4"
fi
# All intermediate files of the workflow are written below this directory.
TMP_PATH="$4"

# The four steps below build ${TMP_PATH}/aln.tsv. Each step is skipped when
# its output already exists, so an interrupted run can be resumed.

# Step 1: run result2msa on the sequence DB (passed as both the first and the
# second database argument) and the input cluster DB; output: ${TMP_PATH}/msa.
if notExists "${TMP_PATH}/msa.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" result2msa "$1" "$1" "$2" "$TMP_PATH/msa" ${RESULT2MSA_PAR} \
        || fail "result2msa failed"
fi

# Step 2: convert the MSA database into a profile database.
if notExists "${TMP_PATH}/profile.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" msa2profile "$TMP_PATH/msa" "$TMP_PATH/profile" ${MSA2PROFILE_PAR} \
        || fail "msa2profile failed"
fi

# Step 3: align the profile DB against the sequence DB, using the input
# cluster DB as the result database that lists the pairs to align.
if notExists "${TMP_PATH}/aln.dbtype"; then
    "$MMSEQS" align "$TMP_PATH/profile" "$1" "$2" "${TMP_PATH}/aln" || fail "align failed"
fi

# Step 4: dump the alignment DB as TSV, each row prefixed with its entry key.
if notExists "${TMP_PATH}/aln.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" prefixid "${TMP_PATH}/aln" "${TMP_PATH}/aln.tsv" --tsv ${VERBOSITY} || fail "prefixid1 -tsv failed"
fi

# Pick one representative per cluster and write "<cluster>\t<representative>"
# rows to ${TMP_PATH}/rep_mapping.txt.
#   - First input (aln.index): every key in column 1 starts out as its own
#     representative with a baseline value of 1.
#   - Second input (aln.tsv): column 1 is the cluster key, column 2 the member
#     and column 3 the value being maximised (treated as the score). A member
#     replaces the current representative only when its value is strictly
#     greater, so the first row seen wins ties.
# The order of the output rows is whatever order awk iterates the array in.
awk '
    NR == FNR {
        top[$1] = 1
        chosen[$1] = $1
        next
    }
    !($1 in top) || $3 > top[$1] {
        top[$1] = $3
        chosen[$1] = $2
    }
    END {
        for (key in chosen) {
            print key "\t" chosen[key]
        }
    }
' "${TMP_PATH}/aln.index" "${TMP_PATH}/aln.tsv" > "${TMP_PATH}/rep_mapping.txt"

# Dump the input cluster DB as TSV: column 1 = cluster key, column 2 = member.
if notExists "${TMP_PATH}/clu.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" prefixid "$2" "${TMP_PATH}/clu.tsv" --tsv ${VERBOSITY} || fail "prefixid2 -tsv failed"
fi

# Re-key every cluster by its newly selected representative.
#   - First input (rep_mapping.txt): old cluster key -> new representative.
#   - Second input (clu.tsv): for each run of rows sharing a cluster key, emit
#     the representative as its own first member, then every other member.
# A cluster key missing from the mapping keeps its original representative.
# Reading f[$1] for a missing key would otherwise create an empty entry and
# emit rows with an empty representative.
if notExists "${TMP_PATH}/updated_clu.tsv"; then
    awk 'FNR == NR { f[$1] = $2; next }
    { if ($1 in f) rep = f[$1]; else rep = $1 }
    $1 != prev { print rep "\t" rep; prev = $1 }
    $2 != rep { print rep "\t" $2 }' "${TMP_PATH}/rep_mapping.txt" "${TMP_PATH}/clu.tsv" > "${TMP_PATH}/updated_clu.tsv"
fi

# Turn the rewritten TSV back into a database at the requested output path.
# NOTE(review): dbtype 6 is presumably the cluster-result type; confirm
# against the dbtype constants of the MMseqs2 version in use.
"$MMSEQS" tsv2db "${TMP_PATH}/updated_clu.tsv" "${3}" --output-dbtype 6 || fail "tsv2db failed"

# Optional cleanup: when REMOVE_TMP is non-empty, delete every intermediate
# file and database this workflow created in the tmp directory, and finally
# the copy of this script itself.
if [ -n "$REMOVE_TMP" ]; then
    rm -f "${TMP_PATH}/aln.tsv"
    rm -f "${TMP_PATH}/rep_mapping.txt"
    rm -f "${TMP_PATH}/clu.tsv"
    rm -f "${TMP_PATH}/updated_clu.tsv"
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/msa" ${VERBOSITY}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/profile" ${VERBOSITY}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}

    rm -f "${TMP_PATH}/pickconsensusrep.sh"
fi
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ extern int nrtotaxmapping(int argc, const char **argv, const Command& command);
extern int offsetalignment(int argc, const char **argv, const Command& command);
extern int orftocontig(int argc, const char **argv, const Command& command);
extern int touchdb(int argc, const char **argv, const Command& command);
extern int pickconsensusrep(int argc, const char **argv, const Command& command);
extern int prefilter(int argc, const char **argv, const Command& command);
extern int prefixid(int argc, const char **argv, const Command& command);
extern int profile2cs(int argc, const char **argv, const Command& command);
Expand Down
10 changes: 9 additions & 1 deletion src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,15 @@ std::vector<Command> baseCommands = {
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},



// Command table entry for the `pickconsensusrep` workflow (cluster category).
// Only the parameters in par.verbandcompression are exposed. The command takes
// four positional arguments: an input sequence DB, an input cluster DB, an
// output cluster DB and a temporary directory.
{"pickconsensusrep", pickconsensusrep, &par.verbandcompression, COMMAND_CLUSTER,
"Select new representatives for each cluster based on consensus",
NULL,
"Martin Steinegger <[email protected]> & Maria Hauser",
"<i:seqDb> <i:clusterDB> <o:clusterDB> <tmpDir>",
CITATION_MMSEQS2, {{"seqDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
{"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
{"prefilter", prefilter, &par.prefilter, COMMAND_PREFILTER,
"Double consecutive diagonal k-mer search",
NULL,
Expand Down
1 change: 1 addition & 0 deletions src/workflow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ set(workflow_source_files
workflow/Rbh.cpp
workflow/Search.cpp
workflow/Taxonomy.cpp
workflow/PickConsensusRep.cpp
workflow/EasyTaxonomy.cpp
workflow/CreateIndex.cpp
PARENT_SCOPE
Expand Down
45 changes: 45 additions & 0 deletions src/workflow/PickConsensusRep.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "Parameters.h"
#include "FileUtil.h"
#include "CommandCaller.h"
#include <cassert>
#include <string>
// Include the embedded shell script.
#include "pickconsensusrep.sh.h"

// Entry point of the `pickconsensusrep` workflow.
//
// Parses the command line, exports the parameter strings that the embedded
// pickconsensusrep.sh script reads as environment variables, writes that
// script into a hashed temporary directory and executes it with the
// positional arguments (the last one replaced by the resolved tmp directory).
//
// argc/argv: command-line arguments after the command name.
// command:   command table entry describing this workflow.
// Returns:   not expected to return, because the script takes over the
//            process. If execution does fall through, a non-zero status is
//            returned so the failure is not reported as success.
int pickconsensusrep(int argc, const char **argv, const Command &command) {
    Parameters &par = Parameters::getInstance();
    par.parseParameters(argc, argv, command, true, 0, 0);

    CommandCaller cmd;

    // Force allowDeletion for the result2msa step and mark it as explicitly
    // set before the parameter string is generated.
    par.allowDeletion = 1;
    par.PARAM_ALLOW_DELETION.wasSet = true;
    cmd.addVariable("RESULT2MSA_PAR", par.createParameterString(par.result2msa, true).c_str());

    // Force matchMode = 1 for the msa2profile step, again marked as set.
    par.matchMode = 1;
    par.PARAM_MATCH_MODE.wasSet = true;
    cmd.addVariable("MSA2PROFILE_PAR", par.createParameterString(par.msa2profile, true).c_str());

    // NOTE(review): the accompanying script does not read RENAMEDBKEYS_PAR;
    // kept for backward compatibility.
    cmd.addVariable("RENAMEDBKEYS_PAR", par.createParameterString(par.renamedbkeys).c_str());
    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());

    // The script's cleanup branch is guarded by REMOVE_TMP. Export it
    // explicitly so cleanup follows the parsed parameters rather than
    // whatever value happens to be inherited from the caller's environment.
    cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);

    // The temporary directory is provided as the 4th argument.
    std::string tmpDir = par.db4;
    // NOTE(review): the hash is computed over par.mapworkflow although the
    // command is registered with par.verbandcompression; confirm whether
    // this is intentional.
    std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.mapworkflow));
    if (par.reuseLatest) {
        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
    }
    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);

    // Hand the resolved tmp directory to the script as the last argument.
    par.filenames.pop_back();
    par.filenames.push_back(tmpDir);

    // Write out the embedded shell script to a file in the temporary directory.
    std::string program = tmpDir + "/pickconsensusrep.sh";
    FileUtil::writeFile(program, pickconsensusrep_sh, pickconsensusrep_sh_len);

    // Execute the shell script.
    cmd.execProgram(program.c_str(), par.filenames);

    // The shell script should not return; if it does, abort in debug builds
    // and report failure when assertions are compiled out.
    assert(false);
    return 1;
}

0 comments on commit 8ef870f

Please sign in to comment.