Merge pull request #201 from liulab-dfci/airr_alignment

Output AIRR format from annotator
liulab-dfci · Jul 20, 2023 · 8418d9d · 8418d9d
2 parents 31311db + fe76427
commit 8418d9d
Show file tree

Hide file tree

Showing 4 changed files with 192 additions and 36 deletions.
diff --git a/Annotator.cpp b/Annotator.cpp
@@ -31,6 +31,7 @@ char usage[] = "./annotator [OPTIONS]:\n"
 		"\t--notIMGT: the receptor genome sequence is not in IMGT format (default: not set(in IMGT format))\n"
 		"\t--outputCDR3File: output CDR3 file when not using -r option (default: no output)\n"
 		"\t--needReverseComplement: reverse complement sequences on another strand (default: no)\n"
+    "\t--outputFormat INT: 0-fasta, 1-AIRR. (default: 0 (fasta))\n" 
 		"\t--readAssignment STRING: output the read assignment to the file (default: no output)\n";
 
 int nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, 
@@ -58,6 +59,7 @@ static struct option long_options[] = {
 			{ "needReverseComplement", no_argument, 0, 10010 },
 			{ "fastq", no_argument, 0, 10011 },
 			{ "airrAlignment", no_argument, 0, 10012 }, 
+			{ "outputFormat", required_argument, 0, 10013 }, 
 			{ (char *)0, 0, 0, 0} 
 			} ;
 
@@ -413,7 +415,8 @@ int main( int argc, char *argv[] )
 	bool outputCDR3File = false ; // whether output the cdr3 file when the input is fasta
 	bool needRC = false ; // need reverse complment
 	int format = 0 ; // 0-trust4 format. 1-fasta, 2-fastq
-	std::map<std::string, int> barcodeStrToInt ;
+	int outputFormat = 0 ; //0-fasta, 1-airr
+  std::map<std::string, int> barcodeStrToInt ;
 
 	while ( 1 )
 	{
@@ -497,6 +500,10 @@ int main( int argc, char *argv[] )
 		{
 			outputAirrAlignment = true ;
 		}
+    else if (c == 10013) // outputFormat
+    {
+      outputFormat = atoi(optarg) ;
+    }
 		else
 		{
 			fprintf( stderr, "%s", usage ) ;
@@ -643,39 +650,57 @@ int main( int argc, char *argv[] )
 	// Use global information to break ties
 	AnnotationTieBreak( annotations, seqSet, refSet ) ;
 
-	// Output the annotation of consensus assemblies
-	FILE *fpAirrAlignment = NULL ;
-	if (outputAirrAlignment) 
+  // Add other informations for annotation.
+	for ( i = 0 ;i < seqCnt ; ++i )
 	{
-		sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ;
-		fpAirrAlignment = fopen(buffer, "w") ;
+		annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ;
 	}
 
+	// Output the annotation of consensus assemblies
+  if (outputFormat == 1)
+  {
+    refSet.GetPartAirrHeader(buffer) ;
+    printf("sequence_id\t%s\tcomplete_vdj\n", buffer) ;
+  }
+
 	for ( i = 0 ; i < seqCnt ; ++i )
 	{
-		int weightSum = seqSet.GetSeqWeightSum( i ) ; 
-		int len = seqSet.GetSeqConsensusLen( i ) ;
-		sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ;
-		refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, 
-			annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ;
-		printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ;
-
-		if (outputAirrAlignment && annotations[i].cdr[2].seqIdx != -1)
-		{
-			refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, buffer) ;
-			fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), buffer) ;
-		}
+		if (outputFormat != 1)
+    {
+      int weightSum = seqSet.GetSeqWeightSum( i ) ; 
+      int len = seqSet.GetSeqConsensusLen( i ) ;
+      sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ;
+      refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, 
+          annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ;
+      printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ;
+    }
+    else
+    {
+      char *airrStr = refSet.AnnotationToAirrString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, annotations[i].cdr) ;
+      printf("%s\t%s\t%c\n", seqSet.GetSeqName( i ), 
+          airrStr, annotations[i].isFullLength ? 'T' : 'F') ;    
+      free(airrStr) ;
+    }
 	}
 
 	if (outputAirrAlignment)
+  {
+    FILE *fpAirrAlignment = NULL ;
+		sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ;
+		fpAirrAlignment = fopen(buffer, "w") ;
+
+    for (i = 0 ; i < seqCnt ; ++i)
+    {
+      if (annotations[i].cdr[2].seqIdx != -1)
+      {
+        char *airrAlignStr = refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, true) ;
+        fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), airrAlignStr) ;
+        free(airrAlignStr) ;
+      }
+    }
 		fclose( fpAirrAlignment ) ;
-
-	// Add other informations for annotation.
-	for ( i = 0 ;i < seqCnt ; ++i )
-	{
-		annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ;
-	}
-
+  }
+
 	// Output more CDR3 information 
 	if ( flrReads.IsOpen() )
 	{

diff --git a/README.md b/README.md
@@ -166,6 +166,12 @@ The last step of generating simple report can be done with the command:
 	perl trust-simplerep.pl trust_cdr3.out > trust_report.out
 
 If you are interested in a subset of chains, you can "grep" those from trust_cdr3.out and run trust-simplerep.pl on the subset.
+
+* #### Annotation only
+
+You can use the "annotator" program from TRUST4 to annotate the V, D, J, C genes and CDRs of any given sequences, similar to IgBLAST or IMGT/V-QUEST. To obtain the annotation in AIRR format for human sequences with eight threads, use the command:
+
+	./annotator -f human_IMGT+C.fa -a input.fa --fasta -t 8 --needReverseComplement --noImpute --outputFormat 1 > annotation.tsv
 
 ### Example
 

diff --git a/SeqSet.hpp b/SeqSet.hpp
@@ -464,9 +464,9 @@ class SeqSet
 	char DnaToAa( char a, char b, char c )
 	{
 		if ( a == 'N' || b == 'N' || c == 'N' )
-			return '-' ;
+			return '?' ;
 		if ( a == 'M' || b == 'M' || c == 'M' )
-			return '-' ;
+			return '?' ;
 
 		if ( a == 'A' )
 		{
@@ -546,7 +546,7 @@ class SeqSet
 			if ( b == 'A' )
 			{
 				if ( c == 'A' || c == 'G' )
-					return '*' ;
+					return '_' ;
 				else
 					return 'Y' ;
 			}
@@ -557,7 +557,7 @@ class SeqSet
 			else if ( b == 'G' )
 			{
 				if ( c == 'A' )
-					return '*' ;
+					return '_' ;
 				else if ( c == 'G' )
 					return 'W' ;
 				else
@@ -7837,13 +7837,16 @@ class SeqSet
 		free( r ) ;
 	}
 
-	// Output the information for AIRR format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment, cdr3_start in the sequence_alignment string.
-	void AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], char *buffer)
+	// Output the information for the AIRR alignment format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment and, when includeCDR3Coordinate is true, the CDR3 start and end in the sequence_alignment string.
+  // return: the AIRR alignment string. It is malloc'ed here and needs to be released by the caller with free().
+	char *AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], bool includeCDR3Coordinate)
 	{
 		int i, j, k, l, m ;
-		char *align[3] ; // align part for v, d, j genes.
-		char *buffer2 = new char[10023] ;
-		char *buffer3 = new char[10023] ;
+		int len = strlen(read) ;
+    char *align[3] ; // align part for v, d, j genes.
+    char *buffer = (char *)malloc(sizeof(char) * 5 * len) ;
+    char *buffer2 = new char[2 * len] ; // sequence_align
+		char *buffer3 = new char[2 * len] ; // germline_align
 		align[0] = align[1] = align[2] = NULL ;
 		for (i = 0 ; i < 3 ; ++i)
 			align[i] = GetGeneOverlapAlignment(read, geneOverlap[i]) ;
@@ -7942,15 +7945,137 @@ class SeqSet
 		}
 		buffer2[i] = '\0' ;
 		buffer3[i] = '\0' ;
-		sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
+    if (includeCDR3Coordinate)
+      sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
+    else
+      sprintf(buffer + strlen(buffer), "%s\t%s", buffer2, buffer3) ;
 
 		for (i = 0 ; i < 3 ; ++i)
 			if (align[i])
 				delete[] align[i] ;
 		delete[] buffer2 ;
 		delete[] buffer3 ;
+
+    return buffer ;
 	}
 
+  // Writes into buffer the header of the AIRR columns that SeqSet generates by itself.
+  // Columns handled by the caller, such as sequence_id and the full-length flag, are not included.
+  // buffer: caller-provided storage; it has to be large enough for the whole header.
+  void GetPartAirrHeader(char *buffer)
+  {
+    // 18 tab-separated column names
+    static const char partHeader[] = "sequence\trev_comp\tv_call\td_call\tj_call\tc_call\tv_cigar\td_cigar\tj_cigar\tsequence_alignment\tgermline_alignment\tcdr1\tcdr2\tjunction\tjunction_aa\tproductive\tv_identity\tj_identity" ;
+    memcpy(buffer, partHeader, sizeof(partHeader)) ; // sizeof counts the terminating '\0'
+  }
+
+  // Build the part of one AIRR row that SeqSet can generate for an annotated read.
+  // Notice the difference to AirrAlign, which only covers the alignment-related fields.
+  // The tab-separated fields are: sequence, rev_comp, v/d/j/c_call, v/d/j_cigar,
+  // sequence_alignment, germline_alignment, cdr1, cdr2, junction, junction_aa, productive,
+  // v_identity, j_identity (18 columns). sequence_id will be printed externally.
+  // read: the NUL-terminated sequence to report.
+  // geneOverlap: the v, d, j, c gene hits; seqIdx == -1 means the gene is missing.
+  // cdr: the cdr1, cdr2, cdr3 regions on read; seqIdx == -1 means the cdr is missing.
+  // return: the malloc'ed AIRR string. The caller needs to release it with free().
+  char *AnnotationToAirrString(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3])
+  {
+    int i ;
+    int len = strlen(read) ;
+
+    // Gene names are unrelated to the read length, so reserve room for them explicitly.
+    // The constant covers the separators, flags and identities when the read is very short.
+    size_t nameLen = 0 ;
+    for (i = 0 ; i < 4 ; ++i)
+      if (geneOverlap[i].seqIdx != -1)
+        nameLen += strlen(GetSeqName(geneOverlap[i].seqIdx)) ;
+    char *buffer = (char *)malloc(sizeof(char) * (20 * (size_t)len + nameLen + 128)) ;
+    char *p = buffer ; // position where the next field is written
+
+    p += sprintf(p, "%s", read) ;
+
+    // The first annotated gene is used to decide the strand.
+    int gidx = 0 ;
+    for (gidx = 0 ; gidx < 4 ; ++gidx)
+      if (geneOverlap[gidx].seqIdx != -1)
+        break ;
+
+    if (gidx >= 4 && cdr[2].seqIdx == -1)
+    {
+      // Nothing is annotated. The sequence is already written, so the other columns
+      // stay empty except productive (column 15), which is F.
+      for (i = 1 ; i < 18 ; ++i)
+      {
+        if (i != 15)
+          p += sprintf(p, "\t") ;
+        else
+          p += sprintf(p, "\tF") ;
+      }
+      // The row is complete. Continuing would append a second set of columns
+      // and access geneOverlap[4].
+      return buffer ;
+    }
+
+    char revComp = 'F' ;
+    if (gidx < 4 && geneOverlap[gidx].strand == -1)
+      revComp = 'T' ;
+    p += sprintf(p, "\t%c", revComp) ;
+
+    //v,d,j,c calls
+    for (i = 0 ; i < 4 ; ++i)
+    {
+      if (geneOverlap[i].seqIdx == -1)
+        p += sprintf(p, "\t") ;
+      else
+        p += sprintf(p, "\t%s", GetSeqName(geneOverlap[i].seqIdx)) ;
+    }
+
+    //v,d,j cigar, sequence alignment, germline alignment
+    char *alignStr = AnnotationToAirrAlign(read, geneOverlap, cdr, false) ;
+    p += sprintf(p, "\t%s", alignStr) ;
+    free(alignStr) ;
+
+    //cdr1, 2: print the region straight from the read, so no fixed-size scratch buffer is needed
+    for (i = 0 ; i <= 1 ; ++i)
+    {
+      if (cdr[i].seqIdx == -1)
+      {
+        p += sprintf(p, "\t") ;
+      }
+      else
+      {
+        int cdrLen = cdr[i].readEnd - cdr[i].readStart + 1 ;
+        p += sprintf(p, "\t%.*s", cdrLen, read + cdr[i].readStart) ;
+      }
+    }
+
+    // cdr3 (junction) nt, aa, productive
+    if (cdr[2].seqIdx == -1)
+    {
+      p += sprintf(p, "\t\t\tF") ;
+    }
+    else
+    {
+      int cdrLen = cdr[2].readEnd - cdr[2].readStart + 1 ;
+      p += sprintf(p, "\t%.*s", cdrLen, read + cdr[2].readStart) ;
+
+      if (cdrLen % 3 != 0)
+      {
+        p += sprintf(p, "\tout_of_frame\tF") ;
+      }
+      else
+      {
+        char productive = 'T' ;
+        *p = '\t' ;
+        ++p ;
+        for (i = cdr[2].readStart ; i <= cdr[2].readEnd ; i += 3)
+        {
+          char aa = DnaToAa(read[i], read[i + 1], read[i + 2]) ;
+          // Any non-letter from DnaToAa (e.g. '?' or '_') makes the junction unproductive.
+          if (aa < 'A' || aa > 'Z')
+            productive = 'F' ;
+
+          *p = aa ;
+          ++p ;
+        }
+        // This sprintf also terminates the string after the amino acids written above.
+        p += sprintf(p, "\t%c", productive) ;
+      }
+    }
+
+    //v,j identity
+    for (i = 0 ; i <= 2 ; i += 2)
+    {
+      if (geneOverlap[i].seqIdx != -1)
+        p += sprintf(p, "\t%.2lf", geneOverlap[i].similarity * 100) ;
+      else
+        p += sprintf(p, "\t") ;
+    }
+
+    return buffer ;
+  }
+
 	char *GetGeneOverlapAlignment(char *read, const struct _overlap gene )
 	{
 		if ( gene.seqIdx == -1 )

diff --git a/run-trust4 b/run-trust4
@@ -7,7 +7,7 @@ use Cwd 'cwd' ;
 use Cwd 'abs_path' ;
 use File::Basename ;
 
-die "TRUST4 v1.0.10.1-r454 usage: ./run-trust4 [OPTIONS]:\n".
+die "TRUST4 v1.0.11-r458 usage: ./run-trust4 [OPTIONS]:\n".
     "Required:\n".
     #"\t[Input]:\n".
     "\t-b STRING: path to bam file\n".