diff --git a/Annotator.cpp b/Annotator.cpp index b7a0945..deab4d8 100644 --- a/Annotator.cpp +++ b/Annotator.cpp @@ -31,6 +31,7 @@ char usage[] = "./annotator [OPTIONS]:\n" "\t--notIMGT: the receptor genome sequence is not in IMGT format (default: not set(in IMGT format))\n" "\t--outputCDR3File: output CDR3 file when not using -r option (default: no output)\n" "\t--needReverseComplement: reverse complement sequences on another strand (default: no)\n" + "\t--outputFormat INT: 0-fasta, 1-AIRR. (default: 0 (fasta))\n" "\t--readAssignment STRING: output the read assignment to the file (default: no output)\n"; int nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, @@ -58,6 +59,7 @@ static struct option long_options[] = { { "needReverseComplement", no_argument, 0, 10010 }, { "fastq", no_argument, 0, 10011 }, { "airrAlignment", no_argument, 0, 10012 }, + { "outputFormat", required_argument, 0, 10013 }, { (char *)0, 0, 0, 0} } ; @@ -413,7 +415,8 @@ int main( int argc, char *argv[] ) bool outputCDR3File = false ; // whether output the cdr3 file when the input is fasta bool needRC = false ; // need reverse complment int format = 0 ; // 0-trust4 format. 1-fasta, 2-fastq - std::map barcodeStrToInt ; + int outputFormat = 0 ; //0-fasta, 1-airr + std::map barcodeStrToInt ; while ( 1 ) { @@ -497,6 +500,10 @@ int main( int argc, char *argv[] ) { outputAirrAlignment = true ; } + else if (c == 10013) // outputFormat + { + outputFormat = atoi(optarg) ; + } else { fprintf( stderr, "%s", usage ) ; @@ -643,39 +650,57 @@ int main( int argc, char *argv[] ) // Use global information to break ties AnnotationTieBreak( annotations, seqSet, refSet ) ; - // Output the annotation of consensus assemblies - FILE *fpAirrAlignment = NULL ; - if (outputAirrAlignment) + // Add other informations for annotation. 
+ for ( i = 0 ;i < seqCnt ; ++i ) { - sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ; - fpAirrAlignment = fopen(buffer, "w") ; + annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ; } + // Output the annotation of consensus assemblies + if (outputFormat == 1) + { + refSet.GetPartAirrHeader(buffer) ; + printf("sequence_id\t%s\tcomplete_vdj\n", buffer) ; + } + for ( i = 0 ; i < seqCnt ; ++i ) { - int weightSum = seqSet.GetSeqWeightSum( i ) ; - int len = seqSet.GetSeqConsensusLen( i ) ; - sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ; - refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, - annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ; - printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ; - - if (outputAirrAlignment && annotations[i].cdr[2].seqIdx != -1) - { - refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, buffer) ; - fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), buffer) ; - } + if (outputFormat != 1) + { + int weightSum = seqSet.GetSeqWeightSum( i ) ; + int len = seqSet.GetSeqConsensusLen( i ) ; + sprintf( buffer, ">%s %d %.2lf", seqSet.GetSeqName( i ), len, (double)weightSum / 500.0 ) ; + refSet.AnnotationToString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, + annotations[i].cdr, &annotations[i].secondaryGeneOverlaps, outputGeneAlignment, buffer + strlen( buffer ) ) ; + printf( "%s\n%s\n", buffer, seqSet.GetSeqConsensus( i ) ) ; + } + else + { + char *airrStr = refSet.AnnotationToAirrString( seqSet.GetSeqConsensus( i ), annotations[i].geneOverlap, annotations[i].cdr) ; + printf("%s\t%s\t%c\n", seqSet.GetSeqName( i ), + airrStr, annotations[i].isFullLength ? 
'T' : 'F') ; + free(airrStr) ; + } } if (outputAirrAlignment) + { + FILE *fpAirrAlignment = NULL ; + sprintf(buffer, "%s_airr_align.tsv", outputPrefix) ; + fpAirrAlignment = fopen(buffer, "w") ; + + for (i = 0 ; i < seqCnt ; ++i) + { + if (annotations[i].cdr[2].seqIdx != -1) + { + char *airrAlignStr = refSet.AnnotationToAirrAlign(seqSet.GetSeqConsensus(i), annotations[i].geneOverlap, annotations[i].cdr, true) ; + fprintf(fpAirrAlignment, "%s\t%s\n", seqSet.GetSeqName(i), airrAlignStr) ; + free(airrAlignStr) ; + } + } fclose( fpAirrAlignment ) ; - - // Add other informations for annotation. - for ( i = 0 ;i < seqCnt ; ++i ) - { - annotations[i].isFullLength = IsFullLengthAssembly( seqSet.GetSeqConsensus(i), annotations[i], refSet ) ; - } - + } + // Output more CDR3 information if ( flrReads.IsOpen() ) { diff --git a/README.md b/README.md index b76eeec..b2c52fc 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,12 @@ The last step of generating simple report can be done with the command: perl trust-simplerep.pl trust_cdr3.out > trust_report.out If you are interested in a subset of chains, you can "grep" those from trust_cdr3.out and run trust-simplerep.pl on the subset. + +* #### Annotation only + +You can use the "annotator" from TRUST4 to annotate the V,D,J,C genes and CDRs for any given sequences, just like using IgBLAST or IMGT/VQuest. To obtain the annotation in AIRR format for human sequences with eight threads, you can use the command + + ./annotator -f human_IMGT+C.fa -a input.fa --fasta -t 8 --needReverseComplement --noImpute --outputFormat 1 > annotation.tsv ### Example diff --git a/SeqSet.hpp b/SeqSet.hpp index e63745e..c2d9cef 100644 --- a/SeqSet.hpp +++ b/SeqSet.hpp @@ -464,9 +464,9 @@ class SeqSet char DnaToAa( char a, char b, char c ) { if ( a == 'N' || b == 'N' || c == 'N' ) - return '-' ; + return '?' ; if ( a == 'M' || b == 'M' || c == 'M' ) - return '-' ; + return '?'
; if ( a == 'A' ) { @@ -546,7 +546,7 @@ class SeqSet if ( b == 'A' ) { if ( c == 'A' || c == 'G' ) - return '*' ; + return '_' ; else return 'Y' ; } @@ -557,7 +557,7 @@ class SeqSet else if ( b == 'G' ) { if ( c == 'A' ) - return '*' ; + return '_' ; else if ( c == 'G' ) return 'W' ; else @@ -7837,13 +7837,16 @@ class SeqSet free( r ) ; } - // Output the information for AIRR format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment, cdr3_start in the sequence_alignment string. - void AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], char *buffer) + // Output the information for AIRR alignment format: vcigar, dcigar, jcigar, sequence_alignment, germline_alignment, cdr3_start in the sequence_alignment string. + // return: the airr alignment string. Needs to be released by the caller with free(). + char *AnnotationToAirrAlign(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3], bool includeCDR3Coordinate) { int i, j, k, l, m ; - char *align[3] ; // align part for v, d, j genes. - char *buffer2 = new char[10023] ; - char *buffer3 = new char[10023] ; + int len = strlen(read) ; + char *align[3] ; // align part for v, d, j genes.
+		char *buffer = (char *)malloc(sizeof(char) * 5 * len) ;
+		char *buffer2 = new char[2 * len] ; // sequence_align
+		char *buffer3 = new char[2 * len] ; // germline_align
 		align[0] = align[1] = align[2] = NULL ;
 		for (i = 0 ; i < 3 ; ++i)
 			align[i] = GetGeneOverlapAlignment(read, geneOverlap[i]) ;
@@ -7942,15 +7945,157 @@ class SeqSet
 		}
 		buffer2[i] = '\0' ;
 		buffer3[i] = '\0' ;
-		sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
+		if (includeCDR3Coordinate)
+			sprintf(buffer + strlen(buffer), "%s\t%s\t%d\t%d", buffer2, buffer3, cdr3AdjustedStart, cdr3AdjustedEnd) ;
+		else
+			sprintf(buffer + strlen(buffer), "%s\t%s", buffer2, buffer3) ;
 
 		for (i = 0 ; i < 3 ; ++i)
 			if (align[i])
 				delete[] align[i] ;
 
 		delete[] buffer2 ;
 		delete[] buffer3 ;
+
+		return buffer ;
 	}
 
+	// sequence_id, isfulllength, etc will be handled outside
+	// this is only the information generated internally by seqset
+	// buffer: receives the tab-separated header; the caller must provide enough room for it.
+	void GetPartAirrHeader(char *buffer)
+	{
+		// 18 columns
+		strcpy(buffer, "sequence\trev_comp\tv_call\td_call\tj_call\tc_call\tv_cigar\td_cigar\tj_cigar\tsequence_alignment\tgermline_alignment\tcdr1\tcdr2\tjunction\tjunction_aa\tproductive\tv_identity\tj_identity") ;
+	}
+
+	// Build the SeqSet-generated part of one AIRR record for an annotated read.
+	// The 18 tab-separated fields follow the column order of GetPartAirrHeader(); sequence_id,
+	// complete_vdj, etc. are printed by the caller.
+	// Notice the difference to AnnotationToAirrAlign, which only covers the alignment-related fields.
+	// read: the nucleotide sequence that was annotated.
+	// geneOverlap: the v, d, j, c gene hits; an entry with seqIdx == -1 is printed as an empty call.
+	// cdr: cdr1, cdr2, cdr3 coordinates on read; an entry with seqIdx == -1 is printed as empty.
+	// return: the malloc'ed record string. Needs to be released by the caller with free().
+	char *AnnotationToAirrString(char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3])
+	{
+		int i ;
+		int len = strlen(read) ;
+
+		// 20 * len is the budget for the fields copied from the read; the extra room is for the tabs,
+		// flags and identities (independent of len) and for the gene names, so short reads do not overflow.
+		size_t bufferSize = 20 * (size_t)len + 64 ;
+		for (i = 0 ; i < 4 ; ++i)
+			if (geneOverlap[i].seqIdx != -1)
+				bufferSize += strlen(GetSeqName(geneOverlap[i].seqIdx)) ;
+		char *buffer = (char *)malloc(sizeof(char) * bufferSize) ;
+
+		sprintf(buffer, "%s", read) ;
+
+		int gidx = 0 ;
+		for (gidx = 0 ; gidx < 4 ; ++gidx)
+			if (geneOverlap[gidx].seqIdx != -1)
+				break ;
+
+		if (gidx >= 4 && cdr[2].seqIdx == -1)
+		{
+			// Nothing is annotated. The sequence is already written, so fill the remaining
+			// 17 columns with empty values ('F' for productive, column 15) and stop here;
+			// falling through would append the columns a second time and read geneOverlap[4].
+			for (i = 1 ; i < 18 ; ++i)
+			{
+				if (i != 15)
+					sprintf(buffer + strlen(buffer), "\t") ;
+				else
+					sprintf(buffer + strlen(buffer), "\tF") ;
+			}
+			return buffer ;
+		}
+
+		// gidx == 4 is still possible here (a cdr3 without any gene hit), so guard the access.
+		char revComp = 'F' ;
+		if (gidx < 4 && geneOverlap[gidx].strand == -1)
+			revComp = 'T' ;
+		sprintf(buffer + strlen(buffer), "\t%c", revComp) ;
+
+		//v,d,j,c calls
+		for (i = 0 ; i < 4 ; ++i)
+		{
+			if (geneOverlap[i].seqIdx == -1)
+				sprintf(buffer + strlen(buffer), "\t") ;
+			else
+				sprintf(buffer + strlen(buffer), "\t%s", GetSeqName(geneOverlap[i].seqIdx)) ;
+		}
+
+		//v,d,j cigar, sequence align, germline align
+		char *alignStr = AnnotationToAirrAlign(read, geneOverlap, cdr, false) ;
+		sprintf(buffer + strlen(buffer), "\t%s", alignStr) ;
+		free(alignStr) ;
+
+		//cdr1, 2
+		// Sized by the read (the cdrs are copied out of it) instead of a fixed 1024 bytes, so a long cdr cannot overflow it.
+		char *cdrBuffer = new char[len + 1] ;
+		for (i = 0 ; i <= 1 ; ++i)
+		{
+			if (cdr[i].seqIdx == -1)
+			{
+				sprintf(buffer + strlen(buffer), "\t") ;
+				continue ;
+			}
+			else
+			{
+				memcpy(cdrBuffer, read + cdr[i].readStart, cdr[i].readEnd - cdr[i].readStart + 1) ;
+				cdrBuffer[cdr[i].readEnd - cdr[i].readStart + 1] = '\0' ;
+				sprintf(buffer + strlen(buffer), "\t%s", cdrBuffer) ;
+			}
+		}
+
+		// cdr3 (junction) nt, aa, productive
+		if (cdr[2].seqIdx == -1)
+		{
+			sprintf(buffer + strlen(buffer), "\t\t\tF") ;
+		}
+		else
+		{
+			int cdrLen = cdr[2].readEnd - cdr[2].readStart + 1 ;
+			memcpy(cdrBuffer, read + cdr[2].readStart, cdrLen) ;
+			cdrBuffer[cdrLen] = '\0' ;
+			sprintf(buffer + strlen(buffer), "\t%s", cdrBuffer) ;
+
+			if (cdrLen % 3 != 0)
+			{
+				sprintf(buffer + strlen(buffer), "\tout_of_frame\tF") ;
+			}
+			else
+			{
+				int k = 0 ;
+				char productive = 'T' ;
+				for (i = cdr[2].readStart ; i <= cdr[2].readEnd ; i += 3)
+				{
+					char aa = DnaToAa(read[i], read[i + 1], read[i + 2]) ;
+					if (aa < 'A' || aa > 'Z')
+						productive = 'F' ;
+
+					cdrBuffer[k] = aa ;
+					++k ;
+				}
+				cdrBuffer[k] = '\0' ;
+				sprintf(buffer + strlen(buffer), "\t%s\t%c", cdrBuffer, productive) ;
+			}
+		}
+
+		//v,j identity
+		for (i = 0 ; i <= 2 ; i += 2)
+		{
+			if (geneOverlap[i].seqIdx != -1)
+				sprintf(buffer + strlen(buffer), "\t%.2lf", geneOverlap[i].similarity * 100) ;
+			else
+				sprintf(buffer + strlen(buffer), "\t") ;
+		}
+
+		delete[] cdrBuffer ;
+		return buffer ;
+	}
+
 	char *GetGeneOverlapAlignment(char *read, const struct _overlap gene )
 	{
 		if ( gene.seqIdx
== -1 ) diff --git a/run-trust4 b/run-trust4 index 50ff626..4c0c1c7 100755 --- a/run-trust4 +++ b/run-trust4 @@ -7,7 +7,7 @@ use Cwd 'cwd' ; use Cwd 'abs_path' ; use File::Basename ; -die "TRUST4 v1.0.10.1-r454 usage: ./run-trust4 [OPTIONS]:\n". +die "TRUST4 v1.0.11-r458 usage: ./run-trust4 [OPTIONS]:\n". "Required:\n". #"\t[Input]:\n". "\t-b STRING: path to bam file\n".