Skip to content

Commit

Permalink
Merge pull request #22 from mourisl/centrifuger_download
Browse files Browse the repository at this point in the history
Supporting download refseq protein sequences in centrifuger-download
  • Loading branch information
mourisl authored Dec 18, 2024
2 parents 05283f0 + 55f0684 commit 8ae0e23
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 12 deletions.
55 changes: 52 additions & 3 deletions Taxonomy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,13 @@ class Taxonomy
}
}

// Assume we already have the taxonomy tree loaded
void ReadSeqNameFile(std::string fname, bool conversionTableAtFileLevel)
{
std::ifstream seqmap_file(fname.c_str(), std::ios::in);
std::map<std::string, uint64_t> rawSeqNameMap ;
SimpleVector<size_t> pathA ;
SimpleVector<size_t> pathB ;
if(seqmap_file.is_open()) {
char line[1024];
while(!seqmap_file.eof()) {
Expand All @@ -291,8 +294,34 @@ class Taxonomy
Utils::GetFileBaseName(seqIdStr.c_str(), "fna|fa|fasta|faa", buffer) ;
seqIdStr = buffer ;
}
_seqStrNameMap.Add(seqIdStr) ;
rawSeqNameMap[seqIdStr] = tid ;
if (!_seqStrNameMap.IsIn(seqIdStr))
{
_seqStrNameMap.Add(seqIdStr) ;
rawSeqNameMap[seqIdStr] = tid ;
}
else // a sequence ID maps is found in multiple taxonomy IDs.
{
size_t a = rawSeqNameMap[seqIdStr] ;
size_t b = tid ;

// Convert a, b to compact tax ID to work with the taxonomy tree
a = CompactTaxId(a) ;
b = CompactTaxId(b) ;
int sizeA = GetTaxLineagePath(a, pathA) ;
int sizeB = GetTaxLineagePath(b, pathB) ;

int i, j ;
for (i = sizeA - 1, j = sizeB - 1 ; i >= 0 && j >= 0 ; --i, --j)
{
if (pathA[i] != pathB[j])
break ;
}

if (i == sizeA - 1 || pathA[i + 1] != pathB[i + 1])
rawSeqNameMap[seqIdStr] = GetOrigTaxId(_rootCTaxId) ;
else
rawSeqNameMap[seqIdStr] = GetOrigTaxId(pathA[i + 1]) ;
}
}
seqmap_file.close();
} else {
Expand All @@ -305,7 +334,7 @@ class Taxonomy
for (std::map<std::string, uint64_t>::iterator iter = rawSeqNameMap.begin() ;
iter != rawSeqNameMap.end() ; ++iter)
{
_seqIdToTaxId[ _seqStrNameMap.Map(iter->first) ] = _taxIdMap.Map(iter->second) ;
_seqIdToTaxId[ _seqStrNameMap.Map(iter->first) ] = _taxIdMap.Map(iter->second) ;
}
_seqCnt = _seqStrNameMap.GetSize() ;
}
Expand Down Expand Up @@ -702,6 +731,26 @@ class Taxonomy
promotedTaxIds.PushBack(_rootCTaxId) ;
}

// Collect the chain of compact tax IDs reached by following parentTid links
// upward from a single starting node.
// @param ctid: compact tax ID to start from
// @param path: output vector; it is cleared first and then filled beginning
//   with ctid itself, each following entry being the parent of the previous one
// Note: the climb stops as soon as the next node is its own parent (presumably
//   the root -- confirm against how the tree is built). That node is NOT
//   appended, so it only shows up in path when ctid itself is that node.
//   A ctid outside the tree (>= _nodeCnt) produces a path that holds only
//   _rootCTaxId.
// @return: number of entries stored in path
int GetTaxLineagePath(size_t ctid, SimpleVector<size_t> &path)
{
  path.Clear() ;

  // IDs outside the tree are represented by the root alone.
  const bool inTree = (ctid < _nodeCnt) ;
  if (!inTree)
  {
    path.PushBack(_rootCTaxId) ;
    return 1 ;
  }

  size_t node = ctid ;
  for ( ; ; )
  {
    // The starting node is always recorded, even when it is self-parented.
    path.PushBack(node) ;
    node = _taxonomyTree[node].parentTid ;

    // Stop before recording a node whose parent is itself.
    if (node == _taxonomyTree[node].parentTid)
      break ;
  }

  return static_cast<int>(path.Size()) ;
}

// Promote the taxIds to the ranks defined in the "IsCanonicalRankNum" function
// dedup: if true, remove duplicated items from taxIds
void PromoteToCanonicalTaxRank(SimpleVector<size_t> &taxIds, bool dedup)
Expand Down
23 changes: 14 additions & 9 deletions centrifuger-download
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ function download_n_process() {
NAME=`basename $FILEPATH .gz`
GZIPPED_FILE="$LIBDIR/$DOMAIN/$NAME.gz"
UNZIPPED_FILE="$LIBDIR/$DOMAIN/$NAME"
DUSTMASKED_FILE="$LIBDIR/$DOMAIN/${NAME%.fna.gz}_dustmasked.fna.gz"
DUSTMASKED_FILE="$LIBDIR/$DOMAIN/${NAME%.fna}_dustmasked.fna.gz"
[[ "$DO_DUST" == "1" ]] && RES_FILE=$DUSTMASKED_FILE || RES_FILE=$GZIPPED_FILE

if [[ ! -s "$RES_FILE" || -n `gzip -t "$RES_FILE" 2>&1 >/dev/null` ]]; then
Expand All @@ -50,7 +50,7 @@ function download_n_process() {
$DL_CMD "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || \
$DL_CMD "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || \
$DL_CMD "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || \
{ printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
{ printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
fi

[[ -s "$GZIPPED_FILE" ]] || return;
Expand Down Expand Up @@ -158,6 +158,7 @@ BASE_DIR="."
N_PROC=1
CHANGE_HEADER=0
DOWNLOAD_RNA=0
DOWNLOAD_PROTEIN=0
DO_DUST=0
FILTER_UNPLACED=0
VERBOSE=0
Expand All @@ -173,14 +174,15 @@ ARGUMENT
COMMON OPTIONS
-o <directory> Folder to which the files are downloaded. Default: '$BASE_DIR'.
-P <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC'
-P <# of threads> Number of processes when downloading (uses xargs and flock). Default: '$N_PROC'
WHEN USING database refseq OR genbank:
-d <domain> What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
-a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
-c <refseq category> Only download genomes in the specified refseq category. Default: any.
-t <taxids> Only download the specified taxonomy IDs, comma separated. Default: any.
-g <program> Download using program. Options: rsync, curl, wget. Default $DL_PROG (auto-detected).
-p Download protein sequences
-r Download RNA sequences, too.
-u Filter unplaced sequences.
-m Mask low-complexity regions using dustmasker. Default: off.
Expand All @@ -189,7 +191,7 @@ WHEN USING database refseq OR genbank:
"

# arguments: $OPTFIND (current index), $OPTARG (argument for option), $OPTERR (bash-specific)
while getopts "o:P:d:a:c:t:g:urlmv" OPT "$@"; do
while getopts "o:P:d:a:c:t:g:uprlmv" OPT "$@"; do
case $OPT in
o) BASE_DIR="$OPTARG" ;;
P) N_PROC="$OPTARG" ;;
Expand All @@ -198,6 +200,7 @@ while getopts "o:P:d:a:c:t:g:urlmv" OPT "$@"; do
c) REFSEQ_CATEGORY="$OPTARG" ;;
g) DL_PROG="$OPTARG" ;;
t) TAXID="$OPTARG" ;;
p) DOWNLOAD_PROTEIN=1 ;;
r) DOWNLOAD_RNA=1 ;;
u) FILTER_UNPLACED=1 ;;
m) DO_DUST=1 ;;
Expand Down Expand Up @@ -349,6 +352,10 @@ if [[ "$CHANGE_HEADER" == "1" ]]; then
echo "Modifying header to include taxonomy ID" >&2
fi

FILE_EXTENSION="genomic.fna.gz"
if [[ "$DOWNLOAD_PROTEIN" == "1" ]]; then
FILE_EXTENSION="protein.faa.gz"
fi

for DOMAIN in $DOMAINS; do
if [[ -s .listing ]]; then
Expand Down Expand Up @@ -394,21 +401,19 @@ for DOMAIN in $DOMAINS; do
## Wrong columns in viral assembly summary files - the path is sometimes in field 20, sometimes 21
#cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
# sed 's/^\(.*\)\t\(ftp:.*\)\t.*/\1\t\2/;s/^\(.*\)\t.*\t\(ftp:.*\)/\1\t\2/' | \
#sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
#sed 's#\([^/]*\)$#\1/\1_${FILE_EXTENSION}#' |\
# tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
awk -F "\t" '{if ($2~/ftp/) print $1"\t"$2; if ($3~/ftp/) print $1"\t"$3}' | \
sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
sed 's#\([^/]*\)$#\1/\1_'"${FILE_EXTENSION}"'#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED

else
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_'"${FILE_EXTENSION}"'#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
fi
echo >&2



if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
echo "Downloadinging rna sequence files" >&2
cut -f $TAXID_FIELD,$FTP_PATH_FIELD "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
Expand Down

0 comments on commit 8ae0e23

Please sign in to comment.