Skip to content

Commit

Permalink
Merge pull request #201 from liulab-dfci/airr_alignment
Browse files Browse the repository at this point in the history
Output AIRR format from annotator
  • Loading branch information
mourisl authored Jul 20, 2023
2 parents 31311db + fe76427 commit 8418d9d
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 36 deletions.
75 changes: 50 additions & 25 deletions Annotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ char usage[] = "./annotator [OPTIONS]:\n"
"\t--notIMGT: the receptor genome sequence is not in IMGT format (default: not set(in IMGT format))\n"
"\t--outputCDR3File: output CDR3 file when not using -r option (default: no output)\n"
"\t--needReverseComplement: reverse complement sequences on another strand (default: no)\n"
"\t--outputFormat INT: 0-fasta, 1-AIRR. (default: 0 (fasta))\n"
"\t--readAssignment STRING: output the read assignment to the file (default: no output)\n";

int nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
Expand Down Expand Up @@ -58,6 +59,7 @@ static struct option long_options[] = {
{ "needReverseComplement", no_argument, 0, 10010 },
{ "fastq", no_argument, 0, 10011 },
{ "airrAlignment", no_argument, 0, 10012 },
{ "outputFormat", required_argument, 0, 10013 },
{ (char *)0, 0, 0, 0}
} ;

Expand Down Expand Up @@ -413,7 +415,8 @@ int main( int argc, char *argv[] )
bool outputCDR3File = false ; // whether output the cdr3 file when the input is fasta
bool needRC = false ; // need reverse complment
int format = 0 ; // 0-trust4 format. 1-fasta, 2-fastq
std::map<std::string, int> barcodeStrToInt ;
int outputFormat = 0 ; //0-fasta, 1-airr
std::map<std::string, int> barcodeStrToInt ;

while ( 1 )
{
Expand Down Expand Up @@ -497,6 +500,10 @@ int main( int argc, char *argv[] )
{
outputAirrAlignment = true ;
}
else if (c == 10013) // outputFormat
{
outputFormat = atoi(optarg) ;
}
else
{
fprintf( stderr, "%s", usage ) ;
Expand Down Expand Up @@ -643,39 +650,57 @@ int main( int argc, char *argv[] )
// Use global information to break ties
AnnotationTieBreak( annotations, seqSet, refSet ) ;

// Output the annotation of consensus assemblies
FILE *fpAirrAlignment = NULL ;
if (outputAirrAlignment)
// Add other informations for annotation.
for ( i = 0 ;i < seqCnt ; ++i )
{
sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ;
fpAirrAlignment = fopen(buffer, "w") ;
annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ;
}

// Output the annotation of consensus assemblies
if (outputFormat == 1)
{
refSet.GetPartAirrHeader(buffer) ;
printf("sequence_id\t%s\tcomplete_vdj\n", buffer) ;
}

for ( i = 0 ; i < seqCnt ; ++i )
{
int weightSum = seqSet.GetSeqWeightSum( i ) ;
int len = seqSet.GetSeqConsensusLen( i ) ;
sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ;
refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap,
annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ;
printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ;

if (outputAirrAlignment && annotations[i].cdr[2].seqIdx != -1)
{
refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, buffer) ;
fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), buffer) ;
}
if (outputFormat != 1)
{
int weightSum = seqSet.GetSeqWeightSum( i ) ;
int len = seqSet.GetSeqConsensusLen( i ) ;
sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ;
refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap,
annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ;
printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ;
}
else
{
char *airrStr = refSet.AnnotationToAirrString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, annotations[i].cdr) ;
printf("%s\t%s\t%c\n", seqSet.GetSeqName( i ),
airrStr, annotations[i].isFullLength ? 'T' : 'F') ;
free(airrStr) ;
}
}

if (outputAirrAlignment)
{
FILE *fpAirrAlignment = NULL ;
sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ;
fpAirrAlignment = fopen(buffer, "w") ;

for (i = 0 ; i < seqCnt ; ++i)
{
if (annotations[i].cdr[2].seqIdx != -1)
{
char *airrAlignStr = refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, true) ;
fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), airrAlignStr) ;
free(airrAlignStr) ;
}
}
fclose( fpAirrAlignment ) ;

// Add other informations for annotation.
for ( i = 0 ;i < seqCnt ; ++i )
{
annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ;
}

}

// Output more CDR3 information
if ( flrReads.IsOpen() )
{
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ The last step of generating simple report can be done with the command:
perl trust-simplerep.pl trust_cdr3.out > trust_report.out

If you are interested in a subset of chains, you can "grep" those from trust_cdr3.out and run trust-simplerep.pl on the subset.

* #### Annotation only

You can use the "annotator" from TRUST4 to annotate the V,D,J,C genes and CDRs for any given sequences, just like using IgBLAST or IMGT/VQuest. To obtain the annotation in AIRR format for human sequences with eight threads, you can use the command

./annotator -f human_IMGT+C.fa -a input.fa --fasta -t 8 --needReverseComplement --noImpute --outputFormat 1 > annotation.tsv

### Example

Expand Down
145 changes: 135 additions & 10 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,9 +464,9 @@ class SeqSet
char DnaToAa( char a, char b, char c )
{
if ( a == 'N' || b == 'N' || c == 'N' )
return '-' ;
return '?' ;
if ( a == 'M' || b == 'M' || c == 'M' )
return '-' ;
return '?' ;

if ( a == 'A' )
{
Expand Down Expand Up @@ -546,7 +546,7 @@ class SeqSet
if ( b == 'A' )
{
if ( c == 'A' || c == 'G' )
return '*' ;
return '_' ;
else
return 'Y' ;
}
Expand All @@ -557,7 +557,7 @@ class SeqSet
else if ( b == 'G' )
{
if ( c == 'A' )
return '*' ;
return '_' ;
else if ( c == 'G' )
return 'W' ;
else
Expand Down Expand Up @@ -7837,13 +7837,16 @@ class SeqSet
free( r ) ;
}

// Output the information for AIRR format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment, cdr3_start in the sequence_alignment string.
void AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], char *buffer)
// Output the information for AIRR alignment format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment, and (when includeCDR3Coordinate is set) the CDR3 start and end in the sequence_alignment string.
// return: the AIRR alignment string. It is allocated with malloc and must be released by the caller with free().
char *AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], bool includeCDR3Coordinate)
{
int i, j, k, l, m ;
char *align[3] ; // align part for v, d, j genes.
char *buffer2 = new char[10023] ;
char *buffer3 = new char[10023] ;
int len = strlen(read) ;
char *align[3] ; // align part for v, d, j genes.
char *buffer = (char *)malloc(sizeof(char) * 5 * len) ;
char *buffer2 = new char[2 * len] ; // sequence_align
char *buffer3 = new char[2 * len] ; // germline_align
align[0] = align[1] = align[2] = NULL ;
for (i = 0 ; i < 3 ; ++i)
align[i] = GetGeneOverlapAlignment(read, geneOverlap[i]) ;
Expand Down Expand Up @@ -7942,15 +7945,137 @@ class SeqSet
}
buffer2[i] = '\0' ;
buffer3[i] = '\0' ;
sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
if (includeCDR3Coordinate)
sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
else
sprintf(buffer + strlen(buffer), "%s\t%s", buffer2, buffer3) ;

for (i = 0 ; i < 3 ; ++i)
if (align[i])
delete[] align[i] ;
delete[] buffer2 ;
delete[] buffer3 ;

return buffer ;
}

// sequence_id, isfulllength, etc will be handled outside
// this is only the information generated internally by seqset
void GetPartAirrHeader(char *buffer)
{
// 18 columns
strcpy(buffer, "sequence\trev_comp\tv_call\td_call\tj_call\tc_call\tv_cigar\td_cigar\tj_cigar\tsequence_alignment\tgermline_alignment\tcdr1\tcdr2\tjunction\tjunction_aa\tproductive\tv_identity\tj_identity") ;
}

// Notice the difference to AirrAlign, which is part of the airr fields focusing on the alignment information
// sequence_id will be printed externally
// Build the 18 tab-separated AIRR fields listed by GetPartAirrHeader for one sequence:
//   sequence, rev_comp, v/d/j/c_call, v/d/j_cigar, sequence_alignment, germline_alignment,
//   cdr1, cdr2, junction, junction_aa, productive, v_identity, j_identity.
// read: the nucleotide sequence being annotated (NUL-terminated).
// geneOverlap: annotation for the V, D, J, C genes; seqIdx == -1 marks a missing gene.
// cdr: CDR1, CDR2, CDR3 coordinates on read; seqIdx == -1 marks a missing CDR.
// return: a malloc-ed string without trailing newline. The caller must release it with free().
char *AnnotationToAirrString(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3])
{
	int i ;
	const int len = strlen(read) ;

	// First gene slot that carries an annotation; 4 when there is none.
	int gidx = 0 ;
	while (gidx < 4 && geneOverlap[gidx].seqIdx == -1)
		++gidx ;

	if (gidx >= 4 && cdr[2].seqIdx == -1)
	{
		// Nothing was annotated: emit the sequence followed by 17 empty columns,
		// except "productive" (column 15, 0-based) which is F. Return right away so
		// the row is not extended a second time and geneOverlap[4] is never read.
		char *emptyRow = (char *)malloc(sizeof(char) * (len + 64)) ;
		char *q = emptyRow + sprintf(emptyRow, "%s", read) ;
		for (i = 1 ; i < 18 ; ++i)
			q += sprintf(q, "%s", i == 15 ? "\tF" : "\t") ;
		return emptyRow ;
	}

	//v,d,j cigar, sequence align, germline align
	// Generated before the output buffer is allocated so its length can be accounted for.
	char *alignStr = AnnotationToAirrAlign(read, geneOverlap, cdr, false) ;

	// Number of nucleotides covered by each CDR (0 when missing or when the coordinates are inverted).
	int cdrLen[3] ;
	for (i = 0 ; i < 3 ; ++i)
	{
		cdrLen[i] = 0 ;
		if (cdr[i].seqIdx != -1 && cdr[i].readEnd >= cdr[i].readStart)
			cdrLen[i] = cdr[i].readEnd - cdr[i].readStart + 1 ;
	}

	// Size the buffer from the pieces that are actually written instead of a
	// fixed multiple of the read length. The constant slack covers the 17 tabs,
	// the rev_comp/productive flags, "out_of_frame", the two identity values and the NUL.
	size_t capacity = (size_t)len + strlen(alignStr) + 128 ;
	for (i = 0 ; i < 4 ; ++i)
	{
		if (geneOverlap[i].seqIdx != -1)
			capacity += strlen(GetSeqName(geneOverlap[i].seqIdx)) ;
	}
	capacity += cdrLen[0] + cdrLen[1] + cdrLen[2] + cdrLen[2] / 3 ;

	char *buffer = (char *)malloc(sizeof(char) * capacity) ;
	char *p = buffer ; // write cursor; always points at the current end of the row

	p += sprintf(p, "%s", read) ;

	// rev_comp: strand of the first annotated gene. When only a CDR3 was found
	// there is no gene to consult, so the flag stays F.
	char revComp = 'F' ;
	if (gidx < 4 && geneOverlap[gidx].strand == -1)
		revComp = 'T' ;
	p += sprintf(p, "\t%c", revComp) ;

	//v,d,j,c calls
	for (i = 0 ; i < 4 ; ++i)
	{
		if (geneOverlap[i].seqIdx == -1)
			p += sprintf(p, "\t") ;
		else
			p += sprintf(p, "\t%s", GetSeqName(geneOverlap[i].seqIdx)) ;
	}

	p += sprintf(p, "\t%s", alignStr) ;
	free(alignStr) ;

	//cdr1, 2: printed straight from read, so no fixed-size staging buffer is needed
	for (i = 0 ; i <= 1 ; ++i)
	{
		if (cdr[i].seqIdx == -1)
			p += sprintf(p, "\t") ;
		else
			p += sprintf(p, "\t%.*s", cdrLen[i], read + cdr[i].readStart) ;
	}

	// cdr3 (junction) nt, aa, productive
	if (cdr[2].seqIdx == -1)
	{
		p += sprintf(p, "\t\t\tF") ;
	}
	else
	{
		p += sprintf(p, "\t%.*s", cdrLen[2], read + cdr[2].readStart) ;

		if (cdrLen[2] % 3 != 0)
		{
			p += sprintf(p, "\tout_of_frame\tF") ;
		}
		else
		{
			// Translate codon by codon. Any residue that is not an upper-case letter
			// (DnaToAa's markers for stop codons or ambiguous bases) makes the junction unproductive.
			char productive = 'T' ;
			*p++ = '\t' ;
			for (i = cdr[2].readStart ; i <= cdr[2].readEnd ; i += 3)
			{
				char aa = DnaToAa(read[i], read[i + 1], read[i + 2]) ;
				if (aa < 'A' || aa > 'Z')
					productive = 'F' ;
				*p++ = aa ;
			}
			p += sprintf(p, "\t%c", productive) ; // also re-terminates the string
		}
	}

	//v,j identity (geneOverlap[0] and geneOverlap[2]), as a percentage
	for (i = 0 ; i <= 2 ; i += 2)
	{
		if (geneOverlap[i].seqIdx != -1)
		{
			// Formatted through a bounded buffer so the reserved slack can never be exceeded.
			char identity[32] ;
			snprintf(identity, sizeof(identity), "%.2lf", geneOverlap[i].similarity * 100) ;
			p += sprintf(p, "\t%s", identity) ;
		}
		else
			p += sprintf(p, "\t") ;
	}

	return buffer ;
}

char *GetGeneOverlapAlignment(char *read, const struct _overlap gene )
{
if ( gene.seqIdx == -1 )
Expand Down
2 changes: 1 addition & 1 deletion run-trust4
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use Cwd 'cwd' ;
use Cwd 'abs_path' ;
use File::Basename ;

die "TRUST4 v1.0.10.1-r454 usage: ./run-trust4 [OPTIONS]:\n".
die "TRUST4 v1.0.11-r458 usage: ./run-trust4 [OPTIONS]:\n".
"Required:\n".
#"\t[Input]:\n".
"\t-b STRING: path to bam file\n".
Expand Down

0 comments on commit 8418d9d

Please sign in to comment.