Skip to content

Commit

Permalink
Add fastx function, which prints fasta/fastq, originally in lh3/pull/12
Browse files Browse the repository at this point in the history
  • Loading branch information
hwalinga committed Aug 13, 2019
1 parent 25948f7 commit 7da360f
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 12 deletions.
26 changes: 21 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,27 +53,27 @@ See `awk.1`.

2. Extract unmapped reads without header:

bioawk -c sam 'and($flag,4)' aln.sam.gz
bioawk -c sam 'and($flag, 4)' aln.sam.gz

3. Extract mapped reads with header:

bioawk -Hc sam '!and($flag,4)'
bioawk -Hc sam '!and($flag, 4)'

4. Reverse complement FASTA:

bioawk -c fastx '{print ">"$name;print revcomp($seq)}' seq.fa.gz
bioawk -c fastx '{print fastx($name, revcomp($seq))}' seq.fa.gz

5. Create FASTA from SAM (uses revcomp if FLAG & 16)

samtools view aln.bam | \
bioawk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print ">"$qname"\n"s}'
bioawk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print fastx($qname, s)}'

6. Print the genotypes of sample `foo` and `bar` from a VCF:

grep -v ^## in.vcf | bioawk -tc hdr '{print $foo,$bar}'

7. Translate nucleotide into protein sequence

bioawk -c fastx '{print ">"$name;print translate($seq)}' seq.fa.gz
can also use different translation tables. To translate using the
bacteria/archaea code:
Expand All @@ -89,6 +89,22 @@ bactera/archaea code:
} \
if(tgs["NM"] < 3) print }' alignments.sam

9. Get the %GC from FASTA

bioawk -c fastx '{print $name, gc($seq)}' seq.fa.gz

10. Get the mean Phred quality score table from FASTQ:

bioawk -c fastx '{print $name, meanqual($qual)}' seq.fq.gz

11. Take column names from the first line of input.txt (where "age" appears in the first line):

bioawk -c header '{print $age}' input.txt

12. Use awk's redirection to split a FASTA file by sequence lengths.

bioawk -c fastx '{if (length($seq) < 35) {print fastx($name, $seq) > "short.fasta"} else {print fastx($name, $seq) > "long.fasta"}}' seq.fq.gz


### Potential limitations

Expand Down
34 changes: 31 additions & 3 deletions addon.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,14 @@ static char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};

/* The master codon/protein table.
/* The master codon/protein table.
*
* http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
* Tables 7,8,17,18,19,20 are deprecated and do not exist
* on the NCBI website
*
*/
const char codon_table[64][25] =
const char codon_table[64][25] =
{
/* 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25*/
/*ttt*/{'F', 'F', 'F', 'F', 'F', 'F', '\0', '\0', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', '\0', '\0', '\0', '\0', 'F', 'F', 'F', 'F', 'F'},
Expand Down Expand Up @@ -241,7 +241,7 @@ void bio_translate(char *dna, char *out, int table)
}
break;
}

int i;
int dnaSize;
int protSize = 0;
Expand Down Expand Up @@ -449,6 +449,34 @@ Cell *bio_func(int f, Cell *x, Node **a)
setsval(y, out);
free(out);

} else if (f == BIO_FFASTX) {
if (a[1]->nnext == 0) {
FATAL("fastx requires at least two arguments");
} else {
char *buf, *name, *seq, *qual;
int bufsz=3*recsize;
int has_qual;
z = execute(a[1]->nnext);
if ((has_qual = a[1]->nnext->nnext != 0)) {
y = execute(a[1]->nnext->nnext);
qual = getsval(y);
}

if ((buf = (char *) malloc(bufsz)) == NULL)
FATAL("out of memory in fastx");

name = getsval(x);
seq = getsval(z);
if (!has_qual) {
sprintf(buf, ">%s\n%s", name, seq);
} else {
if (strlen(seq) != strlen(qual))
WARNING("fastx arguments seq and qual are not same length");
sprintf(buf, "@%s\n%s\n+\n%s", name, seq, qual);
}
setsval(y, buf);
free(buf);
}
} /* else: never happens */
return y;
}
Expand Down
1 change: 1 addition & 0 deletions addon.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ int bio_getrec(char **pbuf, int *psize, int isrecord);
#define BIO_FMINQUAL 208
#define BIO_FMAXQUAL 209
#define BIO_FMEDIANQUAL 210
#define BIO_FFASTX 211


struct Cell;
Expand Down
9 changes: 5 additions & 4 deletions lex.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,12 @@ Keyword keywords[] ={ /* keep sorted: binary searched */
{ "else", ELSE, ELSE },
{ "exit", EXIT, EXIT },
{ "exp", FEXP, BLTIN },
{ "fastx", BIO_FFASTX, BLTIN },
{ "fflush", FFLUSH, BLTIN },
{ "for", FOR, FOR },
{ "func", FUNC, FUNC },
{ "function", FUNC, FUNC },
{ "gc", BIO_FGC, BLTIN },
{ "gc", BIO_FGC, BLTIN },
{ "getline", GETLINE, GETLINE },
{ "gsub", GSUB, GSUB },
{ "if", IF, IF },
Expand All @@ -75,7 +76,7 @@ Keyword keywords[] ={ /* keep sorted: binary searched */
{ "lshift", BIO_FLSHIFT, BLTIN },
{ "match", MATCHFCN, MATCHFCN },
{ "maxqual", BIO_FMAXQUAL, BLTIN },
{ "meanqual", BIO_FMEANQUAL, BLTIN },
{ "meanqual", BIO_FMEANQUAL, BLTIN },
{ "medianqual", BIO_FMEDIANQUAL, BLTIN },
{ "minqual", BIO_FMINQUAL, BLTIN },
{ "next", NEXT, NEXT },
Expand All @@ -86,8 +87,8 @@ Keyword keywords[] ={ /* keep sorted: binary searched */
{ "qualcount", BIO_FQUALCOUNT, BLTIN },
{ "rand", FRAND, BLTIN },
{ "return", RETURN, RETURN },
{ "revcomp",BIO_FREVCOMP, BLTIN },
{ "reverse",BIO_FREVERSE, BLTIN },
{ "revcomp", BIO_FREVCOMP, BLTIN },
{ "reverse", BIO_FREVERSE, BLTIN },
{ "rshift", BIO_FRSHIFT, BLTIN },
{ "sin", FSIN, BLTIN },
{ "split", SPLIT, SPLIT },
Expand Down

0 comments on commit 7da360f

Please sign in to comment.