-
Notifications
You must be signed in to change notification settings - Fork 206
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a workflow to pick more central cluster representatives
- Loading branch information
1 parent
1668032
commit 8ef870f
Showing
6 changed files
with
139 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#!/bin/sh -e

# Report a fatal error and terminate the workflow.
#   $1 - message printed after the "Error: " prefix
# Always exits the script with status 1.
fail() {
    echo "Error: ${1}"
    exit 1
}
|
||
# Succeed (status 0) when "$1" is NOT an existing regular file, fail otherwise.
# Used below to skip workflow steps whose output is already present.
notExists() {
    ! test -f "$1"
}
|
||
# Validate the environment and the four positional arguments before any work is done.
#   $1 - sequence DB, $2 - cluster DB, $3 - output cluster DB, $4 - tmp directory
if [ -z "$MMSEQS" ]; then
    echo "Please set the environment variable \$MMSEQS to your MMSEQS binary."
    exit 1
fi

if [ "$#" -ne 4 ]; then
    echo "Please provide <seqDB> <clusterDB> <outClusterDB> <tmp>"
    exit 1
fi

if [ ! -f "$1.dbtype" ]; then
    echo "$1.dbtype not found!"
    exit 1
fi

if [ ! -f "$2.dbtype" ]; then
    echo "$2.dbtype not found!"
    exit 1
fi

# A missing tmp directory is not fatal: announce it and create it.
if [ ! -d "$4" ]; then
    echo "tmp directory $4 not found!"
    mkdir -p "$4"
fi

TMP_PATH="$4"
|
||
# Step 1: run result2msa on the sequence DB (as both query and target) and the
# cluster DB, writing ${TMP_PATH}/msa. Skipped when msa.dbtype already exists.
if notExists "${TMP_PATH}/msa.dbtype"; then
    # RESULT2MSA_PAR is intentionally unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    if ! "$MMSEQS" result2msa "$1" "$1" "$2" "${TMP_PATH}/msa" ${RESULT2MSA_PAR}; then
        fail "result2msa failed"
    fi
fi
|
||
# Step 2: convert ${TMP_PATH}/msa into ${TMP_PATH}/profile with msa2profile.
# Skipped when profile.dbtype already exists.
if notExists "${TMP_PATH}/profile.dbtype"; then
    # MSA2PROFILE_PAR is intentionally unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    "$MMSEQS" msa2profile "${TMP_PATH}/msa" "${TMP_PATH}/profile" ${MSA2PROFILE_PAR} \
        || fail "msa2profile failed"
fi
|
||
# Step 3: run align with the profile DB, the sequence DB ($1) and the cluster
# DB ($2) as inputs, writing ${TMP_PATH}/aln. Skipped when aln.dbtype already exists.
if notExists "${TMP_PATH}/aln.dbtype"; then
    if ! "$MMSEQS" align "${TMP_PATH}/profile" "$1" "$2" "${TMP_PATH}/aln"; then
        fail "align failed"
    fi
fi
|
||
# Step 4: dump ${TMP_PATH}/aln as a TSV (prefixid --tsv) for the awk step that follows.
# Skipped when aln.tsv already exists.
if notExists "${TMP_PATH}/aln.tsv"; then
    # VERBOSITY is intentionally unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    if ! "$MMSEQS" prefixid "${TMP_PATH}/aln" "${TMP_PATH}/aln.tsv" --tsv ${VERBOSITY}; then
        fail "prefixid1 -tsv failed"
    fi
fi
|
||
# Step 5: build rep_mapping.txt with one "<key>\t<representative>" line per key.
# Every key from aln.index starts out as its own representative; a row of
# aln.tsv replaces it when its third column beats the best value seen so far.
awk '
    # First input file (aln.index): seed each key with itself and a baseline of 1.
    NR == FNR {
        top[$1] = 1
        pick[$1] = $1
        next
    }

    # Second input file (aln.tsv): column 1 = key, column 2 = candidate, column 3 = value.
    {
        key = $1
        cand = $2
        val = $3
        if (!(key in top) || val > top[key]) {
            top[key] = val
            pick[key] = cand
        }
    }

    # Emit the final choice for every key (awk does not define the iteration order).
    END {
        for (key in pick) {
            print key "\t" pick[key]
        }
    }
' "${TMP_PATH}/aln.index" "${TMP_PATH}/aln.tsv" > "${TMP_PATH}/rep_mapping.txt"
|
||
# Step 6: dump the input cluster DB ($2) as a TSV (prefixid --tsv).
# Skipped when clu.tsv already exists.
if notExists "${TMP_PATH}/clu.tsv"; then
    # VERBOSITY is intentionally unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    if ! "$MMSEQS" prefixid "$2" "${TMP_PATH}/clu.tsv" --tsv ${VERBOSITY}; then
        fail "prefixid2 -tsv failed"
    fi
fi
|
||
# Step 7: rewrite clu.tsv so every cluster is keyed by its new representative.
# For each cluster the representative is printed first ("rep\trep"), followed by
# every other member ("rep\tmember"). Rows of clu.tsv must be grouped by column 1,
# because a new cluster is detected by a change in that column.
# Skipped when updated_clu.tsv already exists.
if notExists "${TMP_PATH}/updated_clu.tsv"; then
    awk '
        # First input file (rep_mapping.txt): old cluster key -> new representative.
        NR == FNR {
            newrep[$1] = $2
            next
        }

        # Second input file (clu.tsv): column 1 = old cluster key, column 2 = member.
        {
            # Merely referencing a missing array element would create it as an
            # empty string, so check membership first and keep the original key
            # when the mapping has no usable entry for this cluster.
            if (($1 in newrep) && newrep[$1] != "") {
                rep = newrep[$1]
            } else {
                rep = $1
            }

            if ($1 != prev) {
                print rep "\t" rep
                prev = $1
            }
            if ($2 != rep) {
                print rep "\t" $2
            }
        }
    ' "${TMP_PATH}/rep_mapping.txt" "${TMP_PATH}/clu.tsv" > "${TMP_PATH}/updated_clu.tsv"
fi
|
||
# Step 8: convert the rewritten TSV into the output DB ($3) with --output-dbtype 6.
if ! "$MMSEQS" tsv2db "${TMP_PATH}/updated_clu.tsv" "${3}" --output-dbtype 6; then
    fail "tsv2db failed"
fi
|
||
# Optional cleanup: when REMOVE_TMP is non-empty, delete every intermediate
# artifact this script created in TMP_PATH, and finally the script copy itself.
if [ -n "$REMOVE_TMP" ]; then
    # Plain text intermediates (each listed exactly once).
    rm -f "${TMP_PATH}/aln.tsv" \
          "${TMP_PATH}/rep_mapping.txt" \
          "${TMP_PATH}/clu.tsv" \
          "${TMP_PATH}/updated_clu.tsv"

    # Intermediate databases are removed through rmdb rather than rm.
    # VERBOSITY is intentionally unquoted so it splits into separate options.
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/msa" ${VERBOSITY}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/profile" ${VERBOSITY}
    # shellcheck disable=SC2086
    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}

    # The script is a regular file, so a recursive delete is not needed.
    rm -f "${TMP_PATH}/pickconsensusrep.sh"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -572,7 +572,15 @@ std::vector<Command> baseCommands = { | |
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}}, | ||
|
||
|
||
|
||
{"pickconsensusrep", pickconsensusrep, &par.verbandcompression, COMMAND_CLUSTER, | ||
"Select new representatives for each cluster based on consensus", | ||
NULL, | ||
"Martin Steinegger <[email protected]> & Maria Hauser", | ||
"<i:seqDb> <i:clusterDB> <o:clusterDB> <tmpDir>", | ||
CITATION_MMSEQS2, {{"seqDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | ||
{"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::clusterDb }, | ||
{"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb }, | ||
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, | ||
{"prefilter", prefilter, &par.prefilter, COMMAND_PREFILTER, | ||
"Double consecutive diagonal k-mer search", | ||
NULL, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#include "Parameters.h" | ||
#include "FileUtil.h" | ||
#include "CommandCaller.h" | ||
#include <cassert> | ||
#include <string> | ||
// Include the embedded shell script. | ||
#include "pickconsensusrep.sh.h" | ||
|
||
// Minimal workflow function that runs the pickcenterrep workflow. | ||
int pickconsensusrep(int argc, const char **argv, const Command &command) { | ||
Parameters &par = Parameters::getInstance(); | ||
par.parseParameters(argc, argv, command, true, 0, 0); | ||
|
||
CommandCaller cmd; | ||
par.allowDeletion = 1; | ||
par.PARAM_ALLOW_DELETION.wasSet = true; | ||
cmd.addVariable("RESULT2MSA_PAR", par.createParameterString(par.result2msa, true).c_str()); | ||
par.matchMode = 1; | ||
par.PARAM_MATCH_MODE.wasSet = true; | ||
cmd.addVariable("MSA2PROFILE_PAR", par.createParameterString(par.msa2profile, true).c_str()); | ||
cmd.addVariable("RENAMEDBKEYS_PAR", par.createParameterString(par.renamedbkeys).c_str()); | ||
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); | ||
|
||
// The temporary directory is provided as the 4th argument. | ||
std::string tmpDir = par.db4; | ||
std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.mapworkflow)); | ||
if (par.reuseLatest) { | ||
hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); | ||
} | ||
tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); | ||
par.filenames.pop_back(); | ||
par.filenames.push_back(tmpDir); | ||
|
||
// Write out the embedded shell script to a file in the temporary directory. | ||
std::string program = tmpDir + "/pickconsensusrep.sh"; | ||
FileUtil::writeFile(program, pickconsensusrep_sh, pickconsensusrep_sh_len); | ||
|
||
// Execute the shell script. | ||
cmd.execProgram(program.c_str(), par.filenames); | ||
|
||
// The shell script should not return; if it does, abort. | ||
assert(false); | ||
return 0; | ||
} | ||
|