diff --git a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java
new file mode 100644
index 000000000..96abbf20f
--- /dev/null
+++ b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java
@@ -0,0 +1,563 @@
+package org.qcmg.qmule;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.fastq.FastqConstants;
+import htsjdk.samtools.fastq.FastqReader;
+import htsjdk.samtools.fastq.FastqRecord;
+import htsjdk.samtools.util.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import picard.PicardException;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Converts a FASTQ file to an unaligned BAM or SAM file.
+ *
+ * Output read records will contain the original base calls and quality scores will be
+ * translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina.
+ *
+ *
+ * There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ
+ * (e.g. see RG or SM below).
+ *
+ * Inputs
+ *
+ * One FASTQ file name for single-end or two for pair-end sequencing input data.
+ * These files might be in gzip compressed format (when file name is ending with ".gz").
+ *
+ *
+ * Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name (see USE_SEQUENTIAL_FASTQS for details below).
+ *
+ *
+ * By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly
+ * using the QUALITY_FORMAT argument.
+ *
+ * Output
+ * A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name.
+ * Usage examples
+ *
+ * Example 1:
+ *
+ * Single-end sequencing FASTQ file conversion. All reads are annotated
+ * as belonging to the "rg0013" read group that in turn is part of the sample "sample001".
+ *
+ *
+ * java -jar picard.jar FastqToSam \
+ * F1=input_reads.fastq \
+ * O=unaligned_reads.bam \
+ * SM=sample001 \
+ * RG=rg0013
+ *
+ * Example 2:
+ *
+ * Similar to example 1 above, but for paired-end sequencing.
+ *
+ *
+ * java -jar picard.jar FastqToSam \
+ * F1=forward_reads.fastq \
+ * F2=reverse_reads.fastq \
+ * O=unaligned_read_pairs.bam \
+ * SM=sample001 \
+ * RG=rg0013
+ *
+ */
+@CommandLineProgramProperties(
+ summary = "" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY + ".
" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_DETAILS,
+ oneLineSummary = org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY,
+ programGroup = ReadDataManipulationProgramGroup.class)
+@DocumentedFeature
+public class FastqToSamWithHeaders extends CommandLineProgram {
+
+ /**
+  * Command-line entry point: runs the tool through {@code instanceMain} and exits the JVM
+  * with the status code it returns.
+  *
+  * @param argv the raw command-line arguments
+  */
+ public static void main(final String[] argv) {
+     System.exit(new FastqToSamWithHeaders().instanceMain(argv));
+ }
+ /** One-line description of the tool; used as the one-line summary in the program properties annotation. */
+ static final String USAGE_SUMMARY =
+         "Converts a FASTQ file to an unaligned BAM or SAM file";
+
+ /**
+  * Long-form usage text appended to the summary in the program properties annotation.
+  * Line breaks are written as explicit {@code \n} escapes because a Java string literal
+  * may not contain a raw line terminator.
+  */
+ static final String USAGE_DETAILS =
+         "Output read records will contain the original base calls and quality scores will be " +
+         "translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina.\n" +
+         "There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ " +
+         "(e.g see RG or SM below).\n" +
+         "Inputs\n" +
+         "One FASTQ file name for single-end or two for pair-end sequencing input data. " +
+         "These files might be in gzip compressed format (when file name is ending with \".gz\").\n" +
+         "Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name " +
+         "(see USE_SEQUENTIAL_FASTQS for details below).\n" +
+         "By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly " +
+         "using the QUALITY_FORMAT argument.\n" +
+         "Output\n" +
+         "A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name.\n" +
+         "Usage examples\n" +
+         "Example 1:\n" +
+         "Single-end sequencing FASTQ file conversion. All reads are annotated " +
+         "as belonging to the \"rg0013\" read group that in turn is part of the sample \"sample001\".\n" +
+         "java -jar picard.jar FastqToSam \\\n" +
+         " F1=input_reads.fastq \\\n" +
+         " O=unaligned_reads.bam \\\n" +
+         " SM=sample001 \\\n" +
+         " RG=rg0013\n" +
+         "Example 2:\n" +
+         "Similar to example 1 above, but for paired-end sequencing.\n" +
+         "java -jar picard.jar FastqToSam \\\n" +
+         " F1=forward_reads.fastq \\\n" +
+         " F2=reverse_reads.fastq \\\n" +
+         " O=unaligned_read_pairs.bam \\\n" +
+         " SM=sample001 \\\n" +
+         " RG=rg0013\n";
+
+ /** Class logger, keyed to this class so that log lines are attributed to FastqToSamWithHeaders rather than to Picard's FastqToSam. */
+ private static final Log LOG = Log.getInstance(FastqToSamWithHeaders.class);
+
+ @Argument(shortName="F1", doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.")
+ public File FASTQ; // checked for readability in doWork; also the base file when USE_SEQUENTIAL_FASTQS is set
+
+ @Argument(shortName="F2", doc="Input fastq file (optionally gzipped) for the second read of paired end data.", optional=true)
+ public File FASTQ2; // null selects single-end processing in doWork
+
+ @Argument(doc="Use sequential fastq files with the suffix _###.fastq or _###.fastq.gz." +
+ "The files should be named:\n" +
+ " _001., _002., ..., _XYZ.\n" +
+ " The base files should be:\n" +
+ " _001.\n" +
+ " An example would be:\n" +
+ " RUNNAME_S8_L005_R1_001.fastq\n" +
+ " RUNNAME_S8_L005_R1_002.fastq\n" +
+ " RUNNAME_S8_L005_R1_003.fastq\n" +
+ " RUNNAME_S8_L005_R1_004.fastq\n" +
+ "RUNNAME_S8_L005_R1_001.fastq should be provided as FASTQ.", optional=true)
+ public boolean USE_SEQUENTIAL_FASTQS = false; // when true, doWork expands FASTQ (and FASTQ2) through getSequentialFileList
+
+ @Argument(shortName="V", doc="A value describing how the quality values are encoded in the input FASTQ file. " +
+ "Either Solexa (phred scaling + 66), Illumina (phred scaling + 64) or Standard (phred scaling + 33). " +
+ "If this value is not specified, the quality format will be detected automatically.", optional = true)
+ public FastqQualityFormat QUALITY_FORMAT; // overwritten in doWork with the result of quality-format determination
+
+ @Argument(doc="Output SAM/BAM file. ", shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME)
+ public File OUTPUT ; // checked for writability in doWork before the writer is created
+
+ @Argument(shortName="RG", doc="Read group name")
+ public String READ_GROUP_NAME = "A"; // used both as the header read group id and as the RG attribute of every record
+
+ @Argument(shortName="SM", doc="Sample name to insert into the read group header")
+ public String SAMPLE_NAME; // set on the read group unconditionally (no null check) in createSamFileHeader
+
+ @Argument(shortName="LB", doc="The library name to place into the LB attribute in the read group header", optional=true)
+ public String LIBRARY_NAME; // copied to the read group only when non-null
+
+ @Argument(shortName="PU", doc="The platform unit (often run_barcode.lane) to insert into the read group header", optional=true)
+ public String PLATFORM_UNIT; // copied to the read group only when non-null
+
+ @Argument(shortName="PL", doc="The platform type (e.g. ILLUMINA, SOLID) to insert into the read group header", optional=true)
+ public String PLATFORM; // copied to the read group only when non-null
+
+ @Argument(shortName="CN", doc="The sequencing center from which the data originated", optional=true)
+ public String SEQUENCING_CENTER; // copied to the read group only when non-null
+
+ @Argument(shortName = "PI", doc = "Predicted median insert size, to insert into the read group header", optional = true)
+ public Integer PREDICTED_INSERT_SIZE; // boxed so that "not supplied" is representable as null
+
+ @Argument(shortName = "PG", doc = "Program group to insert into the read group header.", optional=true)
+ public String PROGRAM_GROUP; // copied to the read group only when non-null
+
+ @Argument(shortName = "PM", doc = "Platform model to insert into the group header (free-form text providing further details of the platform/technology used)", optional=true)
+ public String PLATFORM_MODEL; // copied to the read group only when non-null
+
+ /** Header comment lines; each element is added to the output header by createSamFileHeader. */
+ @Argument(doc="Comment(s) to include in the merged output file's header.", optional=true, shortName="CO")
+ public List<String> COMMENT = new ArrayList<>();
+
+ @Argument(shortName = "DS", doc = "Inserted into the read group header", optional = true)
+ public String DESCRIPTION; // copied to the read group only when non-null
+
+ @Argument(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true)
+ public Iso8601Date RUN_DATE; // copied to the read group only when non-null
+
+ @Argument(shortName="SO", doc="The sort order for the output sam/bam file.")
+ public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.queryname; // applied to the header in createSamFileHeader
+
+ @Argument(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.")
+ public int MIN_Q = 0; // validated to be >= 0 in customCommandLineValidation
+
+ @Argument(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.")
+ public int MAX_Q = SAMUtils.MAX_PHRED_SCORE; // validated to be <= SAMUtils.MAX_PHRED_SCORE in customCommandLineValidation
+
+ @Deprecated
+ @Argument(doc="Deprecated (No longer used). If true and this is an unpaired fastq any occurrence of '/1' or '/2' will be removed from the end of a read name.")
+ public Boolean STRIP_UNPAIRED_MATE_NUMBER = false; // not read anywhere in this class
+
+ @Argument(doc="Allow (and ignore) empty lines")
+ public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; // passed to every FastqReader built by fileToFastqReader
+
+ public static final String ZT_ATTRIBUTE = "ZT"; // tag that createSamRecord fills with the FASTQ header text following the " TB:" marker
+ public static final String ZH_ATTRIBUTE = "ZH"; // tag that createSamRecord fills with the remaining FASTQ header text beyond the read name
+
+ public static final String TRIMMED_BASES = " TB:"; // marker searched for in the extra header text; note the leading space
+
+ private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); // used by convertQuality for the Solexa and Illumina encodings
+
+ /**
+  * Looks at fastq input(s) and attempts to determine the proper quality format.
+  *
+  * Closes the reader(s) by side effect, including when sampling the records throws.
+  *
+  * @param reader1 The first fastq input
+  * @param reader2 The second fastq input, if necessary. To not use this input, set it to null
+  * @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur
+  * @return the quality format chosen by the detector
+  */
+ public static FastqQualityFormat determineQualityFormat(final FastqReader reader1, final FastqReader reader2, final FastqQualityFormat expectedQuality) {
+     final QualityEncodingDetector detector = new QualityEncodingDetector();
+
+     try {
+         if (reader2 == null) {
+             detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1);
+         } else {
+             detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1, reader2);
+         }
+     } finally {
+         // Release both file handles even if a malformed record makes the detector throw.
+         // The nested finally guarantees reader1 is closed when closing reader2 fails.
+         try {
+             if (reader2 != null) {
+                 reader2.close();
+             }
+         } finally {
+             reader1.close();
+         }
+     }
+
+     final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality);
+     if (detector.isDeterminationAmbiguous()) {
+         LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities.");
+     }
+     LOG.info(String.format("Auto-detected quality format as: %s.", qualityFormat));
+
+     return qualityFormat;
+ }
+
+
+ /**
+ * Get a list of FASTQs that are sequentially numbered based on the first (base) fastq.
+ * The files should be named:
+ * _001., _002., ..., _XYZ.
+ * The base files should be:
+ * _001.
+ * An example would be:
+ * RUNNAME_S8_L005_R1_001.fastq
+ * RUNNAME_S8_L005_R1_002.fastq
+ * RUNNAME_S8_L005_R1_003.fastq
+ * RUNNAME_S8_L005_R1_004.fastq
+ * where `baseFastq` is the first in that list.
+ */
+ protected static List getSequentialFileList(final File baseFastq) {
+ final List files = new ArrayList<>();
+ files.add(baseFastq);
+
+ // Find the correct extension used in the base FASTQ
+ FastqConstants.FastqExtensions fastqExtensions = null;
+ String suffix = null; // store the suffix including the extension
+ for (final FastqConstants.FastqExtensions ext : FastqConstants.FastqExtensions.values()) {
+ suffix = "_001" + ext.getExtension();
+ if (baseFastq.getAbsolutePath().endsWith(suffix)) {
+ fastqExtensions = ext;
+ break;
+ }
+ }
+ if (null == fastqExtensions) {
+ throw new PicardException(String.format("Could not parse the FASTQ extension (expected '_001' + '%s'): %s", FastqConstants.FastqExtensions.values().toString(), baseFastq));
+ }
+
+ // Find all the files
+ for (int idx = 2; true; idx++) {
+ String fastq = baseFastq.getAbsolutePath();
+ fastq = String.format("%s_%03d%s", fastq.substring(0, fastq.length() - suffix.length()), idx, fastqExtensions.getExtension());
+ try {
+ IOUtil.assertFileIsReadable(new File(fastq));
+ } catch (final SAMException e) { // the file is not readable, so do not continue
+ break;
+ }
+ files.add(new File(fastq));
+ }
+
+ return files;
+ }
+
+ /**
+  * Validates the inputs, creates the output writer, determines the quality encoding, and then
+  * converts each FASTQ (or FASTQ pair) through {@link #makeItSo}.
+  *
+  * @return 0 on success
+  * @throws PicardException if sequential mode finds a different number of files for FASTQ and FASTQ2
+  */
+ @Override
+ protected int doWork() {
+     IOUtil.assertFileIsReadable(FASTQ);
+     if (FASTQ2 != null) {
+         IOUtil.assertFileIsReadable(FASTQ2);
+     }
+     IOUtil.assertFileIsWritable(OUTPUT);
+
+     final SAMFileHeader header = createSamFileHeader();
+     final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT);
+
+     // Set the quality format using this class's own determineQualityFormat.
+     // It consumes and closes the dedicated readers opened here.
+     QUALITY_FORMAT = determineQualityFormat(fileToFastqReader(FASTQ),
+             (FASTQ2 == null) ? null : fileToFastqReader(FASTQ2),
+             QUALITY_FORMAT);
+
+     // Lists for sequential files, but also used when not sequential
+     final List<FastqReader> readers1 = new ArrayList<>();
+     final List<FastqReader> readers2 = new ArrayList<>();
+
+     try {
+         if (USE_SEQUENTIAL_FASTQS) {
+             // Get all the files
+             for (final File fastq : getSequentialFileList(FASTQ)) {
+                 readers1.add(fileToFastqReader(fastq));
+             }
+             if (null != FASTQ2) {
+                 for (final File fastq : getSequentialFileList(FASTQ2)) {
+                     readers2.add(fileToFastqReader(fastq));
+                 }
+                 if (readers1.size() != readers2.size()) {
+                     throw new PicardException(String.format("Found %d files for FASTQ and %d files for FASTQ2.", readers1.size(), readers2.size()));
+                 }
+             }
+         }
+         else {
+             readers1.add(fileToFastqReader(FASTQ));
+             if (FASTQ2 != null) {
+                 readers2.add(fileToFastqReader(FASTQ2));
+             }
+         }
+
+         // Loop through the FASTQs
+         for (int idx = 0; idx < readers1.size(); idx++) {
+             makeItSo(readers1.get(idx),
+                     (readers2.isEmpty()) ? null : readers2.get(idx),
+                     writer);
+         }
+     } finally {
+         // Release every input handle that was opened, whether or not the conversion succeeded.
+         for (final FastqReader reader : readers1) reader.close();
+         for (final FastqReader reader : readers2) reader.close();
+     }
+
+     // The writer is closed only after a successful conversion, as before.
+     writer.close();
+
+     return 0;
+ }
+
+ /**
+  * Runs the conversion for one FASTQ (single-end) or one FASTQ pair (paired-end) and logs
+  * the count returned by the chosen conversion method.
+  *
+  * This can be called directly, bypassing command-line instantiation, but no argument
+  * validation is performed on that path. Closing the reader(s) is the caller's responsibility.
+  *
+  * @param reader1 The FastqReader for the first fastq file
+  * @param reader2 The second FastqReader if applicable. Pass in null if only using a single reader
+  * @param writer The SAMFileWriter where the new SAM file is written
+  */
+ public void makeItSo(final FastqReader reader1, final FastqReader reader2, final SAMFileWriter writer) {
+     final int processed;
+     if (reader2 != null) {
+         processed = doPaired(reader1, reader2, writer);
+     } else {
+         processed = doUnpaired(reader1, writer);
+     }
+     LOG.info("Processed " + processed + " fastq reads");
+ }
+
+ /**
+  * Writes one unpaired, unmapped SAM record per record of a single fastq file.
+  *
+  * @param freader source of the fastq records
+  * @param writer destination for the SAM records
+  * @return the number of fastq records converted
+  */
+ protected int doUnpaired(final FastqReader freader, final SAMFileWriter writer) {
+     final ProgressLogger progress = new ProgressLogger(LOG);
+     int converted = 0;
+     while (freader.hasNext()) {
+         final FastqRecord fastqRecord = freader.next();
+         final String samReadName = SequenceUtil.getSamReadNameFromFastqHeader(fastqRecord.getReadName());
+         final SAMRecord samRecord = createSamRecord(writer.getFileHeader(), samReadName, fastqRecord, false);
+         samRecord.setReadPairedFlag(false);
+         writer.addAlignment(samRecord);
+         progress.record(samRecord);
+         converted++;
+     }
+
+     return converted;
+ }
+
+ /**
+  * Reads two fastq files in lock-step and writes each pair as first-of-pair and second-of-pair
+  * SAM records sharing one base read name.
+  *
+  * @param freader1 source of the first reads
+  * @param freader2 source of the second reads
+  * @param writer destination for the SAM records
+  * @return the number of pairs converted
+  * @throws PicardException if one input still has records after the other is exhausted
+  */
+ protected int doPaired(final FastqReader freader1, final FastqReader freader2, final SAMFileWriter writer) {
+     final ProgressLogger progress = new ProgressLogger(LOG);
+     int pairs = 0;
+     while (freader1.hasNext() && freader2.hasNext()) {
+         final FastqRecord first = freader1.next();
+         final FastqRecord second = freader2.next();
+
+         final String firstName = SequenceUtil.getSamReadNameFromFastqHeader(first.getReadName());
+         final String secondName = SequenceUtil.getSamReadNameFromFastqHeader(second.getReadName());
+         final String sharedName = getBaseName(firstName, secondName, freader1, freader2);
+
+         final SAMRecord firstRecord = createSamRecord(writer.getFileHeader(), sharedName, first, true);
+         firstRecord.setFirstOfPairFlag(true);
+         firstRecord.setSecondOfPairFlag(false);
+         writer.addAlignment(firstRecord);
+         progress.record(firstRecord);
+
+         final SAMRecord secondRecord = createSamRecord(writer.getFileHeader(), sharedName, second, true);
+         secondRecord.setFirstOfPairFlag(false);
+         secondRecord.setSecondOfPairFlag(true);
+         writer.addAlignment(secondRecord);
+         progress.record(secondRecord);
+
+         pairs++;
+     }
+
+     // Leftover records on either side mean the two inputs are not mates of each other.
+     final boolean leftovers = freader1.hasNext() || freader2.hasNext();
+     if (leftovers) {
+         throw new PicardException("Input paired fastq files must be the same length");
+     }
+
+     return pairs;
+ }
+
+ private FastqReader fileToFastqReader(final File file) {
+ return new FastqReader(file, ALLOW_AND_IGNORE_EMPTY_LINES);
+ }
+
+
+ /**
+  * Builds an unmapped SAM record from a fastq record. Header text beyond the read name is
+  * kept in the ZH attribute, and text following a " TB:" marker in the ZT attribute.
+  *
+  * @param header the header the record belongs to
+  * @param baseName the read name to store on the record
+  * @param frec the fastq record supplying bases, qualities and the full header line
+  * @param paired whether to flag the record as paired with an unmapped mate
+  * @param readGroupName value for the record's read group attribute
+  * @param fqFormat encoding of the fastq quality string
+  * @param minQ lowest accepted converted quality
+  * @param maxQ highest accepted converted quality
+  * @return the populated record
+  * @throws PicardException if any converted base quality lies outside minQ..maxQ
+  */
+ public static SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired, String readGroupName, FastqQualityFormat fqFormat, int minQ, int maxQ) {
+     final SAMRecord srec = new SAMRecord(header);
+     srec.setReadName(baseName);
+     srec.setReadString(frec.getReadString());
+     srec.setReadUnmappedFlag(true);
+     srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, readGroupName);
+
+     // Strip only the leading read name. A blanket replace() would also delete any later
+     // occurrence of the name inside the header text and corrupt the stored header.
+     // The replace() fallback keeps the previous result for callers whose baseName is not a prefix.
+     final String fullName = frec.getReadName();
+     final String additionalHeader = fullName.startsWith(baseName)
+             ? fullName.substring(baseName.length())
+             : fullName.replace(baseName, "");
+     if (additionalHeader.length() > 0) {
+         // If this contains the trimmed bases flag (TB:) then put that in a separate tag
+         final int tbIndex = additionalHeader.indexOf(TRIMMED_BASES);
+         if (tbIndex == -1) {
+             srec.setAttribute(ZH_ATTRIBUTE, additionalHeader);
+         } else {
+             srec.setAttribute(ZT_ATTRIBUTE, additionalHeader.substring(tbIndex + TRIMMED_BASES.length()));
+             if (tbIndex > 0) {
+                 srec.setAttribute(ZH_ATTRIBUTE, additionalHeader.substring(0, tbIndex));
+             }
+         }
+     }
+
+     final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString());
+     convertQuality(quals, fqFormat);
+     for (final byte qual : quals) {
+         final int uQual = qual & 0xff; // treat the byte as unsigned
+         if (uQual < minQ || uQual > maxQ) {
+             throw new PicardException("Base quality " + uQual + " is not in the range " + minQ + ".." +
+                     maxQ + " for read " + frec.getReadName());
+         }
+     }
+     srec.setBaseQualities(quals);
+
+     if (paired) {
+         srec.setReadPairedFlag(true);
+         srec.setMateUnmappedFlag(true);
+     }
+     return srec;
+ }
+ /** Instance convenience overload that supplies the read group, quality format and quality bounds from this tool's arguments. */
+ private SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired) {
+     return createSamRecord(header, baseName, frec, paired,
+             READ_GROUP_NAME, QUALITY_FORMAT, MIN_Q, MAX_Q);
+ }
+
+ /**
+  * Builds the output header: one read group populated from the command-line arguments,
+  * the configured comments, and the configured sort order.
+  *
+  * @return the new header
+  */
+ public SAMFileHeader createSamFileHeader() {
+     final SAMReadGroupRecord readGroup = new SAMReadGroupRecord(READ_GROUP_NAME);
+     readGroup.setSample(SAMPLE_NAME);
+     // Optional attributes are written only when supplied; the setter order is kept as-is.
+     if (LIBRARY_NAME != null) {
+         readGroup.setLibrary(LIBRARY_NAME);
+     }
+     if (PLATFORM != null) {
+         readGroup.setPlatform(PLATFORM);
+     }
+     if (PLATFORM_UNIT != null) {
+         readGroup.setPlatformUnit(PLATFORM_UNIT);
+     }
+     if (SEQUENCING_CENTER != null) {
+         readGroup.setSequencingCenter(SEQUENCING_CENTER);
+     }
+     if (PREDICTED_INSERT_SIZE != null) {
+         readGroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE);
+     }
+     if (DESCRIPTION != null) {
+         readGroup.setDescription(DESCRIPTION);
+     }
+     if (RUN_DATE != null) {
+         readGroup.setRunDate(RUN_DATE);
+     }
+     if (PLATFORM_MODEL != null) {
+         readGroup.setPlatformModel(PLATFORM_MODEL);
+     }
+     if (PROGRAM_GROUP != null) {
+         readGroup.setProgramGroup(PROGRAM_GROUP);
+     }
+
+     final SAMFileHeader samHeader = new SAMFileHeader();
+     samHeader.addReadGroup(readGroup);
+
+     for (final String comment : COMMENT) {
+         samHeader.addComment(comment);
+     }
+
+     samHeader.setSortOrder(SORT_ORDER);
+     return samHeader;
+ }
+
+ /**
+  * Converts fastq quality characters, in place, to numeric phred-scaled bytes according to
+  * the given encoding.
+  *
+  * @param quals the quality characters; overwritten with the converted values
+  * @param version the encoding of the input qualities
+  */
+ static void convertQuality(final byte[] quals, final FastqQualityFormat version) {
+     switch (version) {
+         case Illumina:
+             solexaQualityConverter.convertSolexa_1_3_QualityCharsToPhredBinary(quals);
+             break;
+         case Solexa:
+             solexaQualityConverter.convertSolexaQualityCharsToPhredBinary(quals);
+             break;
+         case Standard:
+             SAMUtils.fastqToPhred(quals);
+             break;
+         default:
+             // any other value leaves the array untouched
+             break;
+     }
+ }
+
+ /**
+  * Returns the read base name and asserts the correct paired read name format:
+  *
+  * - Paired reads must either have the exact same read names, or the first read name must
+  *   end with "/1" and the second read name with "/2".
+  * - The base name (the part of the read name before the "/") must be the same for both read names.
+  * - If only one of the two names carries a pair number, or a number is on the wrong side,
+  *   an exception is thrown.
+  *
+  * @param readName1 name of the first read
+  * @param readName2 name of the second read
+  * @param freader1 reader of the first read, used for error locations
+  * @param freader2 reader of the second read, used for error locations
+  * @return the shared base name
+  * @throws PicardException if the names do not form a valid pair
+  */
+ String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) {
+     String [] toks = getReadNameTokens(readName1, 1, freader1);
+     final String baseName1 = toks[0] ;
+     final String num1 = toks[1] ;
+
+     toks = getReadNameTokens(readName2, 2, freader2);
+     final String baseName2 = toks[0] ;
+     final String num2 = toks[1];
+
+     if (!baseName1.equals(baseName2)) {
+         throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2));
+     }
+
+     final boolean num1Blank = StringUtil.isBlank(num1);
+     final boolean num2Blank = StringUtil.isBlank(num2);
+     if (num1Blank || num2Blank) {
+         // Exactly one number present: report the side whose number is actually absent.
+         if (!num1Blank) {
+             // num1 present, num2 blank -> the second read lacks its number
+             throw new PicardException(error(freader2, "Pair 2 number is missing (" + readName2 + "). Both pair numbers must be present or neither."));
+         } else if (!num2Blank) {
+             // num1 blank, num2 present -> the first read lacks its number
+             throw new PicardException(error(freader1, "Pair 1 number is missing (" + readName1 + "). Both pair numbers must be present or neither."));
+         }
+     } else {
+         if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 ("+readName1+")"));
+         if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 ("+readName2+")"));
+     }
+
+     return baseName1 ;
+ }
+
+ /**
+  * Splits a read name at its last '/' into { baseName, pairNumber }. The split happens only
+  * when the text after the slash is exactly "1" or "2"; otherwise the whole name is the
+  * base name and the pair number is null.
+  *
+  * @param readName the read name to split
+  * @param pairNum which end the name belongs to; used in the error message only
+  * @param freader the reader the name came from, used for error locations
+  * @return a two-element array: base name, then pair number or null
+  * @throws PicardException if the read name is empty
+  */
+ private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) {
+     if (readName.equals("")) {
+         throw new PicardException(error(freader,"Pair read name "+pairNum+" cannot be empty: "+readName));
+     }
+
+     final int slash = readName.lastIndexOf('/');
+     if (slash != -1) {
+         final String mate = readName.substring(slash + 1); // should be a 1 or 2
+         if (mate.equals("1") || mate.equals("2")) {
+             return new String[] { readName.substring(0, slash), mate };
+         }
+     }
+
+     // No slash, or something other than 1/2 after it: the names must then be identical.
+     return new String[] { readName, null };
+ }
+
+ /** Little utility to give error messages corresponding to line numbers in the input files. */
+ private String error(final FastqReader freader, final String str) {
+ return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath();
+ }
+
+ @Override
+ protected String[] customCommandLineValidation() {
+ if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"};
+ if (MAX_Q > SAMUtils.MAX_PHRED_SCORE) return new String[]{"MAX_Q must be <= " + SAMUtils.MAX_PHRED_SCORE};
+ return null;
+ }
+}
diff --git a/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java
new file mode 100644
index 000000000..95644f450
--- /dev/null
+++ b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java
@@ -0,0 +1,627 @@
+package org.qcmg.qmule;
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.fastq.FastqRecord;
+import htsjdk.samtools.fastq.FastqWriter;
+import htsjdk.samtools.fastq.FastqWriterFactory;
+import htsjdk.samtools.util.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import picard.PicardException;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ * Extracts read sequences and qualities from the input SAM/BAM file and writes them into
+ * the output file in Sanger FASTQ format.
+ * See MAQ FASTQ specification for details.
+ * This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently.
+ *
+ * In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome,
+ * the read's sequence from the input SAM file will be reverse-complemented prior to writing it to FASTQ in order to correctly
+ * restore the original read sequence as it was generated by the sequencer.
+ *
+ *
+ * Usage example:
+ *
+ * java -jar picard.jar SamToFastqWithHeaders \
+ * I=input.bam \
+ * FASTQ=output.fastq
+ *
+ *
+ */
+@CommandLineProgramProperties(
+ summary = SamToFastqWithHeaders.USAGE_SUMMARY + SamToFastqWithHeaders.USAGE_DETAILS,
+ oneLineSummary = SamToFastqWithHeaders.USAGE_SUMMARY,
+ programGroup = ReadDataManipulationProgramGroup.class)
+@DocumentedFeature
+public class SamToFastqWithHeaders extends CommandLineProgram {
+ /** One-line description of the tool; used as the one-line summary in the program properties annotation. */
+ static final String USAGE_SUMMARY = "Converts a SAM/BAM/CRAM file to FASTQ.";
+
+ /**
+  * Long-form usage text; the annotation appends it directly to USAGE_SUMMARY, hence the leading space.
+  * Adjacent fragments end in a space so sentences do not run together, and line breaks are
+  * written as {@code \n} escapes because a Java string literal may not contain a raw line terminator.
+  */
+ static final String USAGE_DETAILS = " Extracts read sequences and qualities from the input SAM/BAM/CRAM file and writes them into " +
+         "the output file in Sanger FASTQ format. " +
+         "See MAQ FASTQ specification for details. " +
+         "This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently. " +
+         "In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, " +
+         "the read's sequence from input sam file will be reverse-complemented prior to writing it to FASTQ in order restore correctly " +
+         "the original read sequence as it was generated by the sequencer.\n" +
+         "\n" +
+         "Usage example:\n" +
+         "java -jar picard.jar SamToFastqWithHeaders\n" +
+         " I=input.bam\n" +
+         " FASTQ=output.fastq" +
+         "\n" +
+         "\n";
+ @Argument(doc = "Input SAM/BAM/CRAM file to extract reads from", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
+ public File INPUT; // checked for readability and opened at the start of doWork
+
+ @Argument(shortName = "F", doc = "Output FASTQ file (single-end fastq or, if paired, first end of the pair FASTQ).",
+ mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG", "OUTPUT_DIR"})
+ public File FASTQ; // declared mutually exclusive with the per-read-group output options
+
+ @Argument(shortName = "F2", doc = "Output FASTQ file (if paired, second end of the pair FASTQ).", optional = true,
+ mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"})
+ public File SECOND_END_FASTQ; // optional; declared mutually exclusive with the per-read-group output options
+
+ @Argument(shortName = "FU", doc = "Output FASTQ file for unpaired reads; may only be provided in paired-FASTQ mode", optional = true,
+ mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"})
+ public File UNPAIRED_FASTQ; // optional; declared mutually exclusive with the per-read-group output options
+
+ @Argument(shortName = "OPRG", doc = "Output a FASTQ file per read group (two FASTQ files per read group if the group is paired).",
+ optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"})
+ public boolean OUTPUT_PER_RG; // also selects the wording of the "does not contain Read Groups" error in doWork
+
+ @Argument(shortName = "GZOPRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.",
+ mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"})
+ public Boolean COMPRESS_OUTPUTS_PER_RG = false; // NOTE(review): consumer is outside this chunk
+
+ @Argument(shortName = "RGT", doc = "The read group tag (PU or ID) to be used to output a FASTQ file per read group.")
+ public String RG_TAG = "PU"; // NOTE(review): consumer is outside this chunk
+
+ @Argument(shortName = "ODIR", doc = "Directory in which to output the FASTQ file(s). Used only when OUTPUT_PER_RG is true.",
+ optional = true)
+ public File OUTPUT_DIR; // listed in FASTQ's mutex set, but declares no mutex of its own
+
+ @Argument(shortName = "RC", doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to FASTQ",
+ optional = true)
+ public boolean RE_REVERSE = true; // on by default
+
+ @Argument(shortName = "INTER", doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from")
+ public boolean INTERLEAVE = false; // off by default
+
+ @Argument(shortName = "NON_PF", doc = "Include non-PF reads from the SAM file into the output " +
+ "FASTQ files. PF means 'passes filtering'. Reads whose 'not passing quality controls' " +
+ "flag is set are non-PF reads. See GATK Dictionary for more info.")
+ public boolean INCLUDE_NON_PF_READS = false; // off by default
+
+ @Argument(shortName = "CLIP_ATTR", doc = "The attribute that stores the position at which " +
+ "the SAM record should be clipped", optional = true)
+ public String CLIPPING_ATTRIBUTE; // null when not supplied
+
+ @Argument(shortName = "CLIP_ACT", doc = "The action that should be taken with clipped reads: " +
+ "'X' means the reads and qualities should be trimmed at the clipped position; " +
+ "'N' means the bases should be changed to Ns in the clipped region; and any " +
+ "integer means that the base qualities should be set to that value in the " +
+ "clipped region.", optional = true)
+ public String CLIPPING_ACTION; // the 'X' and 'N' values match the CLIP_TRIM and CLIP_TO_N constants of this class
+
+ @Argument(shortName = "CLIP_MIN", doc = "When performing clipping with the CLIPPING_ATTRIBUTE and CLIPPING_ACTION " +
+ "parameters, ensure that the resulting reads after clipping are at least CLIPPING_MIN_LENGTH bases long. " +
+ "If the original read is shorter than CLIPPING_MIN_LENGTH then the original read length will be maintained.")
+ public int CLIPPING_MIN_LENGTH = 0; // 0 imposes no minimum
+
+ @Argument(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.")
+ public int READ1_TRIM = 0; // 0 trims nothing
+
+ @Argument(shortName = "R1_MAX_BASES", doc = "The maximum number of bases to write from read 1 after trimming. " +
+ "If there are fewer than this many bases left after trimming, all will be written. If this " +
+ "value is null then all bases left after trimming will be written.", optional = true)
+ public Integer READ1_MAX_BASES_TO_WRITE; // boxed so that "no limit" is representable as null
+
+ @Argument(shortName = "R2_TRIM", doc = "The number of bases to trim from the beginning of read 2.")
+ public int READ2_TRIM = 0; // 0 trims nothing
+
+ @Argument(shortName = "R2_MAX_BASES", doc = "The maximum number of bases to write from read 2 after trimming. " +
+ "If there are fewer than this many bases left after trimming, all will be written. If this " +
+ "value is null then all bases left after trimming will be written.", optional = true)
+ public Integer READ2_MAX_BASES_TO_WRITE; // boxed so that "no limit" is representable as null
+
+ @Argument(shortName = "Q", doc = "End-trim reads using the phred/bwa quality trimming algorithm and this quality.", optional = true)
+ public Integer QUALITY; // null when not supplied
+
+ @Argument(doc = "If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq " +
+ "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.")
+ public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS = false; // off by default
+
+ private static final String CLIP_TRIM = "X"; // the 'X' value described in the CLIPPING_ACTION doc: trim at the clipped position
+ private static final String CLIP_TO_N = "N"; // the 'N' value described in the CLIPPING_ACTION doc: change clipped bases to Ns
+
+ private static final short ZH_ATTRIBUTE = SAMTag.makeBinaryTag("ZH"); // binary form of the "ZH" tag; presumably the header text stored by FastqToSamWithHeaders — confirm
+ private static final short ZT_ATTRIBUTE = SAMTag.makeBinaryTag("ZT"); // binary form of the "ZT" tag; presumably the trimmed-bases text stored by FastqToSamWithHeaders — confirm
+ private static final short OQ_ATTRIBUTE = SAMTag.makeBinaryTag("OQ"); // binary form of the "OQ" tag; NOTE(review): consumer is outside this chunk
+
+ private static final String[] EMPTY_STRING_ARRAY = {}; // NOTE(review): consumer is outside this chunk
+
+ private final Log log = Log.getInstance(SamToFastqWithHeaders.class); // per-instance logger; feeds the ProgressLogger in doWork
+
+ /**
+  * Reads every record from INPUT and writes it, converted to FASTQ, through the configured writers.
+  *
+  * Paired reads are buffered by read name until their mate is seen; any mates still buffered once the
+  * input is exhausted are reported as a MATE_NOT_FOUND validation error, honouring VALIDATION_STRINGENCY.
+  * The reader and every distinct writer are closed exactly once, even if record handling fails part-way.
+  *
+  * @return 0 when the conversion completes
+  * @throws PicardException if no writers could be generated because INPUT declares no read groups
+  */
+ protected int doWork() {
+     IOUtil.assertFileIsReadable(INPUT);
+     final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
+     // Mates whose partner has not been read yet, keyed by read name.
+     final Map<String, SAMRecord> firstSeenMates = new HashMap<>();
+     try {
+         final FastqWriterFactory factory = new FastqWriterFactory();
+         factory.setCreateMd5(CREATE_MD5_FILE);
+
+         initializeAdditionalWriters();
+         final List<SAMReadGroupRecord> readGroups = reader.getFileHeader().getReadGroups();
+         final Map<SAMReadGroupRecord, FastqWriters> writers = generateWriters(readGroups, factory);
+         final Map<SAMReadGroupRecord, List<FastqWriter>> additionalWriters = generateAdditionalWriters(readGroups, factory);
+         try {
+             if (writers.isEmpty()) {
+                 final String msgBase = INPUT + " does not contain Read Groups";
+                 final String msg = OUTPUT_PER_RG ? msgBase + ", consider not using the OUTPUT_PER_RG option" : msgBase;
+                 throw new PicardException(msg);
+             }
+
+             final ProgressLogger progress = new ProgressLogger(log);
+             for (final SAMRecord currentRecord : reader) {
+                 handleRecord(currentRecord, writers, additionalWriters, firstSeenMates);
+                 progress.record(currentRecord);
+             }
+         } finally {
+             // Several read groups may share one FastqWriters instance, so de-duplicate before closing.
+             for (final FastqWriters writerMapping : new HashSet<>(writers.values())) {
+                 writerMapping.closeAll();
+             }
+
+             // Same for the additional writers: close each distinct writer only once.
+             final Set<FastqWriter> additionalWriterSet = new HashSet<>();
+             additionalWriters.values().forEach(additionalWriterSet::addAll);
+             for (final FastqWriter fastqWriter : additionalWriterSet) {
+                 fastqWriter.close();
+             }
+         }
+     } finally {
+         CloserUtil.close(reader);
+     }
+
+     if (!firstSeenMates.isEmpty()) {
+         SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.MATE_NOT_FOUND,
+                 "Found " + firstSeenMates.size() + " unpaired mates", null), VALIDATION_STRINGENCY);
+     }
+
+     return 0;
+ }
+
+ /**
+  * Generates the writers for the given read groups or, if we are not emitting per-read-group, just returns
+  * the single set of writers registered under every read group (and under the null key for reads without one).
+  *
+  * @param samReadGroupRecords read groups declared in the input header
+  * @param factory             factory used to create each FASTQ writer
+  * @return map from read group to the writers that accept its reads; empty when OUTPUT_PER_RG is set and
+  *         the header declares no read groups
+  */
+ private Map<SAMReadGroupRecord, FastqWriters> generateWriters(final List<SAMReadGroupRecord> samReadGroupRecords,
+                                                              final FastqWriterFactory factory) {
+
+     final Map<SAMReadGroupRecord, FastqWriters> writerMap = new HashMap<>();
+
+     if (!OUTPUT_PER_RG) {
+         IOUtil.assertFileIsWritable(FASTQ);
+         final FastqWriter firstOfPairWriter = factory.newWriter(FASTQ);
+
+         final FastqWriter secondOfPairWriter;
+         if (INTERLEAVE) {
+             secondOfPairWriter = firstOfPairWriter;
+         } else if (SECOND_END_FASTQ != null) {
+             IOUtil.assertFileIsWritable(SECOND_END_FASTQ);
+             secondOfPairWriter = factory.newWriter(SECOND_END_FASTQ);
+         } else {
+             secondOfPairWriter = null;
+         }
+
+         /* Prepare the writer that will accept unpaired reads. If we're emitting a single fastq - and assuming single-ended reads -
+          * then this is simply that one fastq writer. Otherwise, if we're doing paired-end, we emit to a third new writer, since
+          * the other two fastqs are accepting only paired end reads. */
+         final FastqWriter unpairedWriter;
+         if (UNPAIRED_FASTQ == null) {
+             unpairedWriter = firstOfPairWriter;
+         } else {
+             // Check writability up front, as is done for FASTQ and SECOND_END_FASTQ.
+             IOUtil.assertFileIsWritable(UNPAIRED_FASTQ);
+             unpairedWriter = factory.newWriter(UNPAIRED_FASTQ);
+         }
+
+         final FastqWriters fastqWriters = new FastqWriters(firstOfPairWriter, secondOfPairWriter, unpairedWriter);
+
+         // For all read groups we may find in the sam, register this single set of writers for them.
+         writerMap.put(null, fastqWriters);
+         for (final SAMReadGroupRecord rg : samReadGroupRecords) {
+             writerMap.put(rg, fastqWriters);
+         }
+     } else {
+         // When we're creating a fastq-group per readgroup, by convention we do not emit a special fastq for unpaired reads.
+         for (final SAMReadGroupRecord rg : samReadGroupRecords) {
+             final FastqWriter firstOfPairWriter = factory.newWriter(makeReadGroupFile(rg, "_1"));
+             // Create this writer on-the-fly; if we find no second-of-pair reads, don't bother making a writer (or delegating,
+             // if we're interleaving).
+             final Lazy<FastqWriter> lazySecondOfPairWriter =
+                     new Lazy<>(() -> INTERLEAVE ? firstOfPairWriter : factory.newWriter(makeReadGroupFile(rg, "_2")));
+
+             writerMap.put(rg, new FastqWriters(firstOfPairWriter, lazySecondOfPairWriter, firstOfPairWriter));
+         }
+     }
+     return writerMap;
+ }
+
+ /**
+  * Extension hook called by doWork before any writers are generated.
+  * This implementation does nothing.
+  */
+ protected void initializeAdditionalWriters() {
+ }
+
+ /**
+  * Extension hook that supplies extra writers per read group, in addition to the standard FASTQ writers.
+  * This implementation supplies none.
+  *
+  * @param readGroups read groups declared in the input header
+  * @param factory    factory an overriding implementation can use to create writers
+  * @return an empty, immutable map
+  */
+ protected Map<SAMReadGroupRecord, List<FastqWriter>> generateAdditionalWriters(final List<SAMReadGroupRecord> readGroups,
+                                                                               final FastqWriterFactory factory) {
+     return Collections.emptyMap();
+ }
+
+ /**
+  * Routes one SAM record to the appropriate FASTQ writer.
+  *
+  * Secondary/supplementary and non-PF records are dropped unless the matching INCLUDE_* option is set.
+  * An unpaired record is written immediately. A paired record is buffered in firstSeenMates until its
+  * mate arrives, at which point both are written (read 1 first). The additional-record hook is invoked
+  * for every record that is not dropped, with read1/read2 non-null only when a pair was just completed.
+  *
+  * @param currentRecord     record to process
+  * @param writers           writers per read group, as built by generateWriters
+  * @param additionalWriters extra writers per read group, passed through to handleAdditionalRecords
+  * @param firstSeenMates    buffer of mates awaiting their partner, keyed by read name; updated in place
+  * @throws PicardException if no writers are registered for the record's read group, if the mates'
+  *                         first/second flags are inconsistent, or if a pair is found but there is no
+  *                         second-of-pair writer
+  */
+ private void handleRecord(final SAMRecord currentRecord, final Map<SAMReadGroupRecord, FastqWriters> writers,
+                           final Map<SAMReadGroupRecord, List<FastqWriter>> additionalWriters,
+                           final Map<String, SAMRecord> firstSeenMates) {
+     if (currentRecord.isSecondaryOrSupplementary() && !INCLUDE_NON_PRIMARY_ALIGNMENTS) {
+         return;
+     }
+
+     // Skip non-PF reads as necessary
+     if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) {
+         return;
+     }
+
+     SAMRecord read1 = null;
+     SAMRecord read2 = null;
+     if (currentRecord.getReadPairedFlag()) {
+         final String currentReadName = currentRecord.getReadName();
+         final SAMRecord firstRecord = firstSeenMates.remove(currentReadName);
+         if (firstRecord == null) {
+             // First mate of this name: hold on to it until its partner shows up.
+             firstSeenMates.put(currentReadName, currentRecord);
+         } else {
+             assertPairedMates(firstRecord, currentRecord);
+             final FastqWriters fq = requireWriters(writers, currentRecord);
+
+             final boolean currentIsFirst = currentRecord.getFirstOfPairFlag();
+             read1 = currentIsFirst ? currentRecord : firstRecord;
+             read2 = currentIsFirst ? firstRecord : currentRecord;
+             writeRecord(read1, 1, fq.getFirstOfPair(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
+             final FastqWriter secondOfPairWriter = fq.getSecondOfPair();
+             if (secondOfPairWriter == null) {
+                 throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified.");
+             }
+             writeRecord(read2, 2, secondOfPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE);
+         }
+     } else {
+         final FastqWriters fq = requireWriters(writers, currentRecord);
+         writeRecord(currentRecord, null, fq.getUnpaired(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE);
+     }
+
+     handleAdditionalRecords(currentRecord, additionalWriters, read1, read2);
+ }
+
+ /**
+  * Looks up the writers for a record's read group, failing with a descriptive error instead of a
+  * NullPointerException when none are registered (e.g. a read without a read group in per-read-group mode).
+  */
+ private static FastqWriters requireWriters(final Map<SAMReadGroupRecord, FastqWriters> writers, final SAMRecord record) {
+     final FastqWriters found = writers.get(record.getReadGroup());
+     if (found == null) {
+         throw new PicardException("No FASTQ writers are registered for the read group of read " + record.getReadName());
+     }
+     return found;
+ }
+
+ /**
+  * Extension hook invoked by handleRecord for every record that was not filtered out.
+  * This implementation does nothing.
+  *
+  * @param currentRecord     the record just processed
+  * @param additionalWriters extra writers per read group, as returned by generateAdditionalWriters
+  * @param read1             first-of-pair read when a pair was just completed, otherwise null
+  * @param read2             second-of-pair read when a pair was just completed, otherwise null
+  */
+ protected void handleAdditionalRecords(final SAMRecord currentRecord,
+                                        final Map<SAMReadGroupRecord, List<FastqWriter>> additionalWriters,
+                                        final SAMRecord read1, final SAMRecord read2) {
+ }
+
+ /**
+  * Builds the per-read-group output file: the value of the read-group tag selected by RG_TAG (PU or ID),
+  * made file-name safe, followed by the optional suffix and a ".fastq" or ".fastq.gz" extension.
+  * The file is placed in OUTPUT_DIR when that is set and is checked for writability before being returned.
+  *
+  * @param readGroup    read group supplying the file name stem
+  * @param preExtSuffix text inserted before the extension, or null for none
+  * @return the writable output file
+  * @throws PicardException if the selected tag has no value for this read group
+  */
+ private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String preExtSuffix) {
+     final String tagValue;
+     if (RG_TAG.equalsIgnoreCase("PU")) {
+         tagValue = readGroup.getPlatformUnit();
+     } else if (RG_TAG.equalsIgnoreCase("ID")) {
+         tagValue = readGroup.getReadGroupId();
+     } else {
+         tagValue = null;
+     }
+     if (tagValue == null) {
+         throw new PicardException("The selected RG_TAG: " + RG_TAG + " is not present in the header.");
+     }
+
+     final StringBuilder name = new StringBuilder(IOUtil.makeFileNameSafe(tagValue));
+     if (preExtSuffix != null) {
+         name.append(preExtSuffix);
+     }
+     name.append(COMPRESS_OUTPUTS_PER_RG ? ".fastq.gz" : ".fastq");
+
+     final File target;
+     if (OUTPUT_DIR == null) {
+         target = new File(name.toString());
+     } else {
+         target = new File(OUTPUT_DIR, name.toString());
+     }
+     IOUtil.assertFileIsWritable(target);
+     return target;
+ }
+
+ /**
+  * Finds the index of the first character whose lower-case status differs from that of the first character.
+  *
+  * @param s string to scan; may be null
+  * @return -1 for a null or empty string, 0 when every character has the same case status as the first,
+  *         otherwise the index at which the case first changes
+  */
+ public static int getPositionOfCaseChange(String s) {
+     if (s == null || s.length() == 0) {
+         return -1;
+     }
+
+     final boolean startsLower = Character.isLowerCase(s.charAt(0));
+     int idx = 1;
+     while (idx < s.length()) {
+         if (Character.isLowerCase(s.charAt(idx)) != startsLower) {
+             return idx;
+         }
+         idx++;
+     }
+
+     // No change anywhere in the string: report 0, i.e. the "change" is at the very first character.
+     return 0;
+ }
+
+ /**
+  * Splits a "bases+qualities" tag value into its lower-case and upper-case runs.
+  *
+  * The bases are divided at the first change of case. The result always lists the lower-case run first:
+  * [0] lower-case bases, [1] their qualities, [2] upper-case bases, [3] their qualities. A run that is
+  * absent is returned as an empty string.
+  *
+  * @param tag tag value of the form bases, '+', qualities; may be null
+  * @return a four-element array as described above, or an empty array when the tag is null, empty,
+  *         has no '+' separator, has no bases before the separator, or has too few qualities to split
+  */
+ public static String[] getTrimmedBaseDetailsFromTag(String tag) {
+     if (null == tag || tag.length() == 0) {
+         return EMPTY_STRING_ARRAY;
+     }
+
+     final int plusIndex = tag.indexOf('+');
+     // -1: no separator at all; 0: nothing before the separator. Neither can be split into bases and qualities.
+     if (plusIndex < 1) {
+         return EMPTY_STRING_ARRAY;
+     }
+
+     final String bases = tag.substring(0, plusIndex);
+     final String quals = tag.substring(plusIndex + 1);
+     final int caseChangeIndex = getPositionOfCaseChange(bases);
+     final boolean startsLower = Character.isLowerCase(bases.charAt(0));
+     final String[] results = new String[4];
+
+     if (caseChangeIndex > 0) {
+         // The qualities are cut at the same index as the bases, so they must reach at least that far.
+         if (caseChangeIndex > quals.length()) {
+             return EMPTY_STRING_ARRAY;
+         }
+         final String leadingBases = bases.substring(0, caseChangeIndex);
+         final String trailingBases = bases.substring(caseChangeIndex);
+         final String leadingQuals = quals.substring(0, caseChangeIndex);
+         final String trailingQuals = quals.substring(caseChangeIndex);
+         results[0] = startsLower ? leadingBases : trailingBases;
+         results[1] = startsLower ? leadingQuals : trailingQuals;
+         results[2] = startsLower ? trailingBases : leadingBases;
+         results[3] = startsLower ? trailingQuals : leadingQuals;
+     } else {
+         // Single-case tag: everything belongs to one run, the other run is empty.
+         results[0] = startsLower ? bases : "";
+         results[1] = startsLower ? quals : "";
+         results[2] = startsLower ? "" : bases;
+         results[3] = startsLower ? "" : quals;
+     }
+     return results;
+ }
+
+
+
+ /**
+  * Builds the FASTQ record for a SAM record, applying the steps below in this order:
+  * header construction, optional reverse-complement, re-attachment of bases stored in the ZT tag,
+  * attribute-driven clipping, leading trim, quality trim, and finally the maximum-length cap.
+  *
+  * @param read              record to convert
+  * @param mateNumber        mate number appended to the name as "/n" when the record has no ZH value; may be null
+  * @param basesToTrim       number of leading bases (and qualities) to drop; values <= 0 drop nothing
+  * @param maxBasesToWrite   cap on the number of bases written, or null for no cap
+  * @param clippingAttribute name of the attribute holding the clip point, or null to skip clipping
+  * @param clippingMinLength lower bound applied to the clip point
+  * @param clippingAction    "X" to trim, "N" to mask bases, otherwise parsed as an integer quality;
+  *                          only consulted when a clip point is found
+  * @param reReverse         when true, negative-strand reads are reverse-complemented
+  * @param quality           threshold for quality trimming, or null to skip it
+  * @return the FASTQ record; its quality-header field is always the empty string
+  */
+ public static FastqRecord getFastqRecordFromSamRecord(final SAMRecord read, final Integer mateNumber, final int basesToTrim,
+ final Integer maxBasesToWrite, final String clippingAttribute,
+ int clippingMinLength, final String clippingAction, boolean reReverse, final Integer quality) {
+ /*
+ Header: read name followed by the ZH value when present and non-empty,
+ otherwise by "/" + mateNumber when a mate number was supplied.
+ */
+ String seqHeader = read.getReadName();
+ String additionalHeader = (String)read.getAttribute(ZH_ATTRIBUTE);
+ String trimmedAdapterSequenceAndQual = (String)read.getAttribute(ZT_ATTRIBUTE);
+ if (null != additionalHeader && additionalHeader.length() > 0) {
+ seqHeader += additionalHeader;
+ } else if (null != mateNumber){
+ seqHeader += "/" + mateNumber;
+ }
+
+ String readString = read.getReadString();
+ // Use the OQ qualities only when they line up one-to-one with the bases; otherwise use the record's own.
+ String origBaseQuals = (String)read.getAttribute(OQ_ATTRIBUTE);
+ String baseQualities = (null != origBaseQuals && origBaseQuals.length() == readString.length()) ? origBaseQuals : read.getBaseQualityString();
+
+
+ if (reReverse && read.getReadNegativeStrandFlag()) {
+ readString = SequenceUtil.reverseComplement(readString);
+ baseQualities = StringUtil.reverseString(baseQualities);
+ }
+
+ // Re-attach bases recorded in the ZT tag: the lower-case run goes in front of the read,
+ // the upper-case run behind it, both upper-cased, each with its own qualities.
+ if (null != trimmedAdapterSequenceAndQual && trimmedAdapterSequenceAndQual.length() > 0) {
+ String [] adapterInfo = getTrimmedBaseDetailsFromTag(trimmedAdapterSequenceAndQual);
+ if (null != adapterInfo && adapterInfo.length == 4) {
+ if (adapterInfo[0].length() > 0) {
+ readString = adapterInfo[0].toUpperCase() + readString;
+ baseQualities = adapterInfo[1] + baseQualities;
+ }
+ if (adapterInfo[2].length() > 0) {
+ readString = readString + adapterInfo[2].toUpperCase();
+ baseQualities = baseQualities + adapterInfo[3];
+ }
+ }
+ }
+
+ // If we're clipping, do the right thing to the bases or qualities
+ if (clippingAttribute != null) {
+ Integer clipPoint = (Integer) read.getAttribute(clippingAttribute);
+ // A clip point below the minimum is raised to the minimum, but never beyond the read length.
+ if (clipPoint != null && clipPoint < clippingMinLength) {
+ clipPoint = Math.min(readString.length(), clippingMinLength);
+ }
+
+ if (clipPoint != null) {
+ if (clippingAction.equalsIgnoreCase(CLIP_TRIM)) {
+ readString = clip(readString, clipPoint, null, !read.getReadNegativeStrandFlag());
+ baseQualities = clip(baseQualities, clipPoint, null, !read.getReadNegativeStrandFlag());
+ } else if (clippingAction.equalsIgnoreCase(CLIP_TO_N)) {
+ readString = clip(readString, clipPoint, CLIP_TO_N.charAt(0), !read.getReadNegativeStrandFlag());
+ } else {
+ // Any other action is taken to be a phred score that replaces the clipped qualities.
+ final char newQual = SAMUtils.phredToFastq(new byte[]{(byte) Integer.parseInt(clippingAction)}).charAt(0);
+ baseQualities = clip(baseQualities, clipPoint, newQual, !read.getReadNegativeStrandFlag());
+ }
+ }
+ }
+
+ // NOTE(review): substring throws if basesToTrim exceeds the current read length - confirm callers guard this.
+ if (basesToTrim > 0) {
+ readString = readString.substring(basesToTrim);
+ baseQualities = baseQualities.substring(basesToTrim);
+ }
+
+ // Perform quality trimming if desired, making sure to leave at least one base!
+ if (quality != null) {
+ final byte[] quals = SAMUtils.fastqToPhred(baseQualities);
+ final int qualityTrimIndex = Math.max(1, TrimmingUtil.findQualityTrimPoint(quals, quality));
+ if (qualityTrimIndex < quals.length) {
+ readString = readString.substring(0, qualityTrimIndex);
+ baseQualities = baseQualities.substring(0, qualityTrimIndex);
+ }
+ }
+
+ // Apply the length cap last, after all trimming.
+ if (maxBasesToWrite != null && maxBasesToWrite < readString.length()) {
+ readString = readString.substring(0, maxBasesToWrite);
+ baseQualities = baseQualities.substring(0, maxBasesToWrite);
+ }
+
+ return new FastqRecord(seqHeader, readString, "", baseQualities);
+ }
+ private void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer,
+ final int basesToTrim, final Integer maxBasesToWrite) {
+
+
+ writer.write(getFastqRecordFromSamRecord(read, mateNumber, basesToTrim, maxBasesToWrite, CLIPPING_ATTRIBUTE, CLIPPING_MIN_LENGTH, CLIPPING_ACTION, RE_REVERSE, QUALITY));
+ }
+
+ /**
+  * Applies the clipping parameters to a base or quality string.
+  *
+  * On the positive strand the first {@code point - 1} characters are kept; on the negative strand the
+  * last {@code point - 1} characters are kept. With a replacement, the removed positions are filled
+  * with that character so the length is unchanged; without one, the string is simply shortened.
+  *
+  * @param src         The string to clip
+  * @param point       The 1-based position of the first clipped base in the read
+  * @param replacement If non-null, the character written into the clipped positions
+  *                    (a quality score or 'N'). If null, just trim src
+  * @param posStrand   Whether the read is on the positive strand
+  * @return String The clipped read or qualities
+  */
+ private static String clip(final String src, final int point, final Character replacement, final boolean posStrand) {
+     final int len = src.length();
+     final String kept = posStrand ? src.substring(0, point - 1) : src.substring(len - point + 1);
+     if (replacement == null) {
+         return kept;
+     }
+
+     // One filler character per clipped position.
+     final int clippedCount = Math.max(0, len - point + 1);
+     final StringBuilder filler = new StringBuilder(clippedCount);
+     for (int n = 0; n < clippedCount; n++) {
+         filler.append(replacement);
+     }
+
+     // The filler sits where the clipped bases were: after the kept part on the positive strand, before it otherwise.
+     return posStrand ? kept + filler : filler + kept;
+ }
+
+ /**
+  * Verifies that two records form a proper pair: one flagged first-of-pair and the other second-of-pair.
+  *
+  * @param record1 one mate
+  * @param record2 the other mate
+  * @throws PicardException if neither ordering of the two records is first-then-second
+  */
+ protected static void assertPairedMates(final SAMRecord record1, final SAMRecord record2) {
+     if (record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag()) {
+         return;
+     }
+     if (record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag()) {
+         return;
+     }
+     throw new PicardException("Illegal mate state: " + record1.getReadName());
+ }
+
+ /**
+ * Put any custom command-line validation in an override of this method.
+ * clp is initialized at this point and can be used to print usage and access argv.
+ * Any options set by command-line parser can be validated.
+ *
+ * @return null if command line is valid. If command line is invalid, returns an array of error
+ * messages to be written to the appropriate place.
+ */
+ protected String[] customCommandLineValidation() {
+
+ List errors = new ArrayList<>();
+
+ if (INTERLEAVE && SECOND_END_FASTQ != null) {
+ errors.add("Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ");
+ }
+
+ if (UNPAIRED_FASTQ != null && SECOND_END_FASTQ == null) {
+ errors.add("UNPAIRED_FASTQ may only be set when also emitting read1 and read2 fastqs (so SECOND_END_FASTQ must also be set).");
+ }
+
+ if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) ||
+ (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) {
+ errors.add("Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set.");
+ }
+
+ if (CLIPPING_ACTION != null) {
+ if (!CLIPPING_ACTION.equals(CLIP_TO_N) && !CLIPPING_ACTION.equals(CLIP_TRIM)) {
+ try {
+ Integer.parseInt(CLIPPING_ACTION);
+ } catch (NumberFormatException nfe) {
+ errors.add("CLIPPING ACTION must be one of: N, X, or an integer");
+ }
+ }
+ }
+
+ if ((OUTPUT_PER_RG && OUTPUT_DIR == null) || ((!OUTPUT_PER_RG) && OUTPUT_DIR != null)) {
+ errors.add("If OUTPUT_PER_RG is true, then OUTPUT_DIR should be set. If ");
+
+ }
+
+ if (OUTPUT_PER_RG) {
+ if (RG_TAG == null) {
+ errors.add("If OUTPUT_PER_RG is true, then RG_TAG should be set.");
+ } else if (!(RG_TAG.equalsIgnoreCase("PU") || RG_TAG.equalsIgnoreCase("ID"))) {
+ errors.add("RG_TAG must be: PU or ID");
+ }
+ }
+
+ return errors.isEmpty() ? super.customCommandLineValidation() : errors.toArray(new String[0]);
+ }
+
+ /**
+  * A collection of {@link htsjdk.samtools.fastq.FastqWriter}s for particular types of reads.
+  *
+  * Allows for lazy construction of the second-of-pair writer, since when we are in the "output per read group mode", we only wish to
+  * generate a second-of-pair fastq if we encounter a second-of-pair read.
+  */
+ private static final class FastqWriters {
+     private final FastqWriter firstOfPair, unpaired;
+     private final Lazy<FastqWriter> secondOfPair;
+
+     /**
+      * Constructor if the consumer wishes for the second-of-pair writer to be built on-the-fly.
+      *
+      * @param firstOfPair  writer for first-of-pair reads
+      * @param secondOfPair lazily created writer for second-of-pair reads
+      * @param unpaired     writer for unpaired reads
+      */
+     private FastqWriters(final FastqWriter firstOfPair, final Lazy<FastqWriter> secondOfPair, final FastqWriter unpaired) {
+         this.firstOfPair = firstOfPair;
+         this.unpaired = unpaired;
+         this.secondOfPair = secondOfPair;
+     }
+
+     /**
+      * Simple constructor; all writers are pre-initialized.
+      *
+      * @param firstOfPair  writer for first-of-pair reads
+      * @param secondOfPair writer for second-of-pair reads; may be null when none was configured
+      * @param unpaired     writer for unpaired reads
+      */
+     private FastqWriters(final FastqWriter firstOfPair, final FastqWriter secondOfPair, final FastqWriter unpaired) {
+         this(firstOfPair, new Lazy<>(() -> secondOfPair), unpaired);
+     }
+
+     /** @return the writer for first-of-pair reads */
+     private FastqWriter getFirstOfPair() {
+         return firstOfPair;
+     }
+
+     /** @return the writer for second-of-pair reads, creating it on first use; may be null */
+     private FastqWriter getSecondOfPair() {
+         return secondOfPair.get();
+     }
+
+     /** @return the writer for unpaired reads */
+     private FastqWriter getUnpaired() {
+         return unpaired;
+     }
+
+     /**
+      * Closes every distinct writer once. The same writer may serve several roles
+      * (e.g. interleaved output), hence the set.
+      */
+     private void closeAll() {
+         final Set<FastqWriter> fastqWriters = new HashSet<>();
+         fastqWriters.add(firstOfPair);
+         fastqWriters.add(unpaired);
+         // Make sure this is a no-op if the second writer was never fetched.
+         if (secondOfPair.isInitialized()) {
+             fastqWriters.add(secondOfPair.get());
+         }
+         for (final FastqWriter fastqWriter : fastqWriters) {
+             // A pre-initialized second-of-pair writer may legitimately be null; there is nothing to close then.
+             if (fastqWriter != null) {
+                 fastqWriter.close();
+             }
+         }
+     }
+ }
+
+ /**
+  * Command-line entry point: runs the tool with the given arguments and exits with its status code.
+  *
+  * @param argv command-line arguments
+  */
+ public static void main(final String[] argv) {
+     final SamToFastqWithHeaders tool = new SamToFastqWithHeaders();
+     System.exit(tool.instanceMain(argv));
+ }
+}
\ No newline at end of file
diff --git a/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java
new file mode 100644
index 000000000..a19e74b63
--- /dev/null
+++ b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java
@@ -0,0 +1,265 @@
+package org.qcmg.qmule;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.fastq.FastqReader;
+import htsjdk.samtools.fastq.FastqRecord;
+import htsjdk.samtools.util.FastqQualityFormat;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import picard.PicardException;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static htsjdk.samtools.SAMUtils.MAX_PHRED_SCORE;
+import static org.junit.Assert.*;
+
+ /**
+  * Tests for {@link FastqToSamWithHeaders}: SAM record creation from FASTQ records, read-pair base-name
+  * resolution, sequential FASTQ discovery, and end-to-end conversion via the command-line entry point.
+  */
+ public class FastqToSamWithHeadersTest {
+
+     @Rule
+     public TemporaryFolder testFolder = new TemporaryFolder();
+
+     private static final FastqToSamWithHeaders fastqToSam = new FastqToSamWithHeaders();
+
+     private static File newTempFile(final String filename) throws IOException {
+         return newTempFile(filename, ".tmp");
+     }
+
+     private static File newTempFile(final String filename, String suffix) throws IOException {
+         final File file = File.createTempFile(filename, suffix);
+         file.deleteOnExit();
+         return file;
+     }
+
+     // Readers over empty temp files; getBaseName only needs reader instances to be passed in.
+     private static FastqReader freader1;
+     private static FastqReader freader2;
+
+     static {
+         try {
+             freader1 = new FastqReader(newTempFile("dummyFile"));
+             freader2 = new FastqReader(newTempFile("dummyFile"));
+         } catch (IOException e) {
+             throw new RuntimeException(e);
+         }
+     }
+
+     @Test
+     public void createSAMRecordNoAdditionalHeader() {
+         FastqRecord fqRec = new FastqRecord("basename","ACGT", "", "????");
+         SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE);
+         assertEquals("basename", samRec.getReadName());
+         assertEquals("ACGT", samRec.getReadString());
+         assertEquals("????", samRec.getBaseQualityString());
+         assertTrue(samRec.getReadPairedFlag());
+         assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE));
+         assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE));
+
+         samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, false, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE);
+         assertEquals("basename", samRec.getReadName());
+         assertEquals("ACGT", samRec.getReadString());
+         assertEquals("????", samRec.getBaseQualityString());
+         assertFalse(samRec.getReadPairedFlag());
+         assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE));
+         assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE));
+     }
+
+     @Test
+     public void createSAMRecordAdditionalHeaderNoTrimming() {
+         String basename = "basename";
+         String pairing = "/1";
+         String additionalHeader = " 1.2.ACGTTGCA";
+         FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader, "ACGT", "", "????");
+         SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE);
+         assertEquals(basename, samRec.getReadName());
+         assertEquals("ACGT", samRec.getReadString());
+         assertEquals("????", samRec.getBaseQualityString());
+         assertTrue(samRec.getReadPairedFlag());
+         assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE));
+         assertEquals(pairing + additionalHeader, samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE));
+     }
+
+     @Test
+     public void createSAMRecordAdditionalHeader() {
+         String basename = "basename";
+         String pairing = "/1";
+         String additionalHeader = " 1.2.ACGTTGCA";
+         String trimmedBases = "aaaaAAAA+????????";
+         FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader + " TB:" + trimmedBases, "ACGT", "", "????");
+         SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE);
+         assertEquals(basename, samRec.getReadName());
+         assertEquals("ACGT", samRec.getReadString());
+         assertEquals("????", samRec.getBaseQualityString());
+         assertTrue(samRec.getReadPairedFlag());
+         assertEquals(trimmedBases, samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE));
+         assertEquals(pairing + additionalHeader, samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE));
+     }
+
+     @Test
+     public void readPairNameOk() {
+         assertEquals("aa", fastqToSam.getBaseName("aa/1", "aa/2" , freader1, freader2));
+         assertEquals("aa", fastqToSam.getBaseName("aa", "aa", freader1, freader2));
+         assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb", "aa/bb" , freader1, freader2));
+         assertEquals("aa/bb/", fastqToSam.getBaseName("aa/bb/", "aa/bb/" , freader1, freader2));
+         assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb/1", "aa/bb/2" , freader1, freader2));
+         assertEquals("aa/bb/cc/dd/ee/ff", fastqToSam.getBaseName("aa/bb/cc/dd/ee/ff/1", "aa/bb/cc/dd/ee/ff/2" , freader1, freader2));
+         assertEquals("///", fastqToSam.getBaseName("////1", "////2" , freader1, freader2));
+         assertEquals("/", fastqToSam.getBaseName("/", "/" , freader1, freader2));
+         assertEquals("////", fastqToSam.getBaseName("////", "////", freader1, freader2));
+         assertEquals("/aa", fastqToSam.getBaseName("/aa", "/aa" , freader1, freader2));
+         assertEquals("aa/", fastqToSam.getBaseName("aa/", "aa/" , freader1, freader2));
+         assertEquals("ab/c", fastqToSam.getBaseName("ab/c", "ab/c", freader1, freader2));
+     }
+
+     @Test
+     public void readPairNamesBad() {
+         try {
+             fastqToSam.getBaseName("", "" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa/1", "bb/2" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa", "bb" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa/1", "aa" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa", "aa/2" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa/1", "aa/1" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+         try {
+             fastqToSam.getBaseName("aa/2", "aa/2" , freader1, freader2);
+             Assert.fail("Should have thrown an exception");
+         } catch (PicardException ignored) {}
+     }
+
+     @Test
+     public void testSequentialFiles() throws Exception {
+         File singleEnd = testFolder.newFile("single_end_R1_001.fastq");
+         File singleEnd2 = testFolder.newFile("single_end_R1_002.fastq");
+         File pairedEnd1 = testFolder.newFile("paired_end_R1_001.fastq");
+         File pairedEnd12 = testFolder.newFile("paired_end_R1_002.fastq");
+         File pairedEnd2 = testFolder.newFile("paired_end_R2_001.fastq");
+         File pairedEnd22 = testFolder.newFile("paired_end_R2_002.fastq");
+
+         assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(singleEnd).size());
+         assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd1).size());
+         assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd2).size());
+
+         populateFile(Arrays.asList(singleEnd, singleEnd2, pairedEnd1, pairedEnd12, pairedEnd2, pairedEnd22), Arrays.asList("@FAKE0001 Original version has PHRED scores from 93 to 0 inclusive (in that order)",
+                 "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
+                 "+",
+                 "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"));
+
+         File singleEnd1Output = testFolder.newFile("singleEnd1.sam");
+         File singleEnd2Output = testFolder.newFile("singleEnd2.sam");
+         File pairedEnd1Output = testFolder.newFile("pairedEnd1.sam");
+         File pairedEnd2Output = testFolder.newFile("pairedEnd2.sam");
+         convertFileAndVerifyRecordCount(1, singleEnd, null, singleEnd1Output, FastqQualityFormat.Illumina, true, false);
+         convertFileAndVerifyRecordCount(2, singleEnd, null, singleEnd2Output,FastqQualityFormat.Illumina, true, true);
+         convertFileAndVerifyRecordCount(2, pairedEnd1, pairedEnd2, pairedEnd1Output, FastqQualityFormat.Illumina, true, false);
+         convertFileAndVerifyRecordCount(4, pairedEnd1, pairedEnd2, pairedEnd2Output, FastqQualityFormat.Illumina, true, true);
+     }
+
+     /** Writes the given lines, each terminated by a newline, into every one of the given files. */
+     private void populateFile(List<File> files, List<String> data) {
+         for (File f : files) {
+             try (FileWriter fw = new FileWriter(f)) {
+                 for (String s : data) {
+                     fw.write(s + "\n");
+                 }
+             } catch (IOException e) {
+                 throw new RuntimeException(e);
+             }
+         }
+     }
+
+     @Test
+     public void testEmptyFastq() throws IOException {
+         final File emptyFastq = testFolder.newFile("empty.fastq");
+         final File emptyFastqSam = testFolder.newFile("empty.fastq.sam");
+         convertFile(emptyFastq, null, emptyFastqSam, FastqQualityFormat.Illumina, false, false, false, false);
+     }
+
+     /** Runs a conversion that is expected to succeed. */
+     private void convertFile(final File fastq1,
+                              final File fastq2,
+                              final File outputFile,
+                              final FastqQualityFormat version,
+                              final boolean permissiveFormat,
+                              final boolean useSequentialFastqs,
+                              final boolean allowEmptyFastq) throws IOException {
+         convertFile(fastq1, fastq2, outputFile, version, permissiveFormat, useSequentialFastqs, allowEmptyFastq, true);
+     }
+
+     /**
+      * Runs the tool through its command-line entry point and checks the outcome.
+      *
+      * @param expectSuccess when true the run must exit with 0; when false it must throw
+      */
+     private void convertFile(final File fastq1,
+                              final File fastq2,
+                              final File outputFile,
+                              final FastqQualityFormat version,
+                              final boolean permissiveFormat,
+                              final boolean useSequentialFastqs,
+                              final boolean allowEmptyFastq,
+                              final boolean expectSuccess) throws IOException {
+
+         final List<String> args = new ArrayList<>();
+
+         args.add("FASTQ=" + fastq1.getAbsolutePath());
+         args.add("OUTPUT=" + outputFile.getAbsolutePath());
+         args.add("QUALITY_FORMAT=" + version);
+         args.add("READ_GROUP_NAME=rg");
+         args.add("SAMPLE_NAME=s1");
+
+         if (fastq2 != null) args.add("FASTQ2=" + fastq2.getAbsolutePath());
+         if (permissiveFormat) args.add("ALLOW_AND_IGNORE_EMPTY_LINES=true");
+         if (useSequentialFastqs) args.add("USE_SEQUENTIAL_FASTQS=true");
+         if (allowEmptyFastq) args.add("ALLOW_EMPTY_FASTQ=true");
+         int exitStatus = 1;
+         try {
+             exitStatus = new FastqToSamWithHeaders().instanceMain(args.toArray(new String[]{}));
+             if ( ! expectSuccess) {
+                 Assert.fail("Should have thrown a PicardException");
+             }
+         } catch (Exception e) {
+             // An exception is the expected outcome of a failing run; for a run that should have
+             // succeeded, surface the cause instead of only reporting a mismatched exit status.
+             if (expectSuccess) {
+                 throw new AssertionError("Conversion was expected to succeed", e);
+             }
+         }
+         assertEquals(expectSuccess ? 0 : 1, exitStatus);
+     }
+
+     /** Converts the input(s), then checks that the output holds exactly {@code expectedCount} records. */
+     private void convertFileAndVerifyRecordCount(final int expectedCount,
+                                                  final File fastqFilename1,
+                                                  final File fastqFilename2,
+                                                  final File outputFile,
+                                                  final FastqQualityFormat version,
+                                                  final boolean permissiveFormat,
+                                                  final boolean useSequentialFastqs) throws IOException {
+
+         convertFile(fastqFilename1, fastqFilename2, outputFile, version, permissiveFormat, useSequentialFastqs, false);
+         int actualCount = 0;
+         // try-with-resources so the reader is released even if iteration fails.
+         try (SamReader samReader = SamReaderFactory.makeDefault().open(outputFile)) {
+             final SAMRecordIterator iterator = samReader.iterator();
+             while (iterator.hasNext()) {
+                 iterator.next();
+                 actualCount++;
+             }
+         }
+         Assert.assertEquals(expectedCount, actualCount);
+     }
+
+     @Test
+     public void runWithNoArgs() {
+         int exitStatus = new FastqToSamWithHeaders().instanceMain(new String[]{});
+         assertEquals(1, exitStatus);
+     }
+ }
diff --git a/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java
new file mode 100644
index 000000000..40c6f5c31
--- /dev/null
+++ b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java
@@ -0,0 +1,504 @@
+package org.qcmg.qmule;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.fastq.FastqReader;
+import htsjdk.samtools.fastq.FastqRecord;
+import htsjdk.samtools.util.IOUtil;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import picard.PicardException;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.qcmg.qmule.SamToFastqWithHeaders.getTrimmedBaseDetailsFromTag;
+
+public class SamToFastqWithHeadersTest {
+
+ /** JUnit rule that supplies the temporary files and folders the tests in this class write to. */
+ @Rule
+ public TemporaryFolder testFolder = new TemporaryFolder();
+
+ /**
+  * Table-driven check of {@code SamToFastqWithHeaders.getPositionOfCaseChange} for null, empty,
+  * single-case and mixed-case inputs.
+  */
+ @Test
+ public void getPositionOfCaseChange() {
+     // inputs[i] is expected to produce expected[i]
+     final String[] inputs = {null, "", "ACBD", "abcd", "abcdABCD", "ABCDxyz", "xYZ", "Xyz", "xyZ", "XYz"};
+     final int[] expected = {-1, -1, 0, 0, 4, 4, 1, 1, 2, 2};
+
+     for (int i = 0; i < inputs.length; i++) {
+         assertEquals(expected[i], SamToFastqWithHeaders.getPositionOfCaseChange(inputs[i]));
+     }
+ }
+
+ @Test
+ public void getTrimmedBasesFromTag() {
+ assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag(null));
+ assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag(""));
+ assertArrayEquals(new String[]{"","","ABCD","!!!!"}, getTrimmedBaseDetailsFromTag("ABCD+!!!!"));
+ assertArrayEquals(new String[]{"abcd","!!!!","",""}, getTrimmedBaseDetailsFromTag("abcd+!!!!"));
+ assertArrayEquals(new String[]{"abcd","!!!!","XYZ","%%%"}, getTrimmedBaseDetailsFromTag("abcdXYZ+!!!!%%%"));
+ assertArrayEquals(new String[]{"xyz","%%%","ABCD","!!!!"}, getTrimmedBaseDetailsFromTag("ABCDxyz+!!!!%%%"));
+ }
+
+ /**
+  * Converts a forward-strand record (flag 99) carrying ZH and ZT tags. The expected FASTQ name
+  * is the read name followed by the ZH value, and the expected bases are the read bases
+  * followed by the bases held in the ZT tag.
+  */
+ @Test
+ public void getFastqFromSam() {
+     final String bases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT";
+     final String quals = "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????";
+
+     final SAMRecord sam = new SAMRecord(null);
+     sam.setReadName("ERR194147.1758538");
+
+     // first read
+     sam.setReadBases(bases.getBytes());
+     sam.setBaseQualityString(quals);
+     sam.setFlags(99);
+     sam.setMappingQuality(60);
+     sam.setCigarString("97M");
+     sam.setInferredInsertSize(330);
+     sam.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????");
+     sam.setAttribute("MQ", 60);
+     sam.setAttribute("MC", "3S98M");
+     sam.setAttribute("ZH", "/1 ACGTACGT");
+     sam.setAttribute("ZT", "GCGA+???'");
+
+     final FastqRecord fastq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(sam, 1, 0, null, null, 0, null, true, null);
+
+     final String expectedName = "ERR194147.1758538/1 ACGTACGT";
+     final String expectedBases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA";
+     final String expectedQuals = "???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'";
+     assertEquals(expectedName, fastq.getReadName());
+     assertEquals(expectedBases, fastq.getReadString());
+     assertEquals(expectedQuals, fastq.getBaseQualityString());
+ }
+
+ /**
+  * Same record as {@code getFastqFromSam}, except that the OQ tag differs from the base
+  * qualities (it starts with "55555"). The expected FASTQ qualities start with the OQ value.
+  */
+ @Test
+ public void getFastqFromSamOQ() {
+     final String bases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT";
+     final String quals = "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????";
+     final String originalQuals = "55555??????????????5???????????????????????????????????????&5?????????+??55??????????????????????";
+
+     final SAMRecord sam = new SAMRecord(null);
+     sam.setReadName("ERR194147.1758538");
+
+     // first read
+     sam.setReadBases(bases.getBytes());
+     sam.setBaseQualityString(quals);
+     sam.setFlags(99);
+     sam.setMappingQuality(60);
+     sam.setCigarString("97M");
+     sam.setInferredInsertSize(330);
+     sam.setAttribute("OQ", originalQuals);
+     sam.setAttribute("MQ", 60);
+     sam.setAttribute("MC", "3S98M");
+     sam.setAttribute("ZH", "/1 ACGTACGT");
+     sam.setAttribute("ZT", "GCGA+???'");
+
+     final FastqRecord fastq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(sam, 1, 0, null, null, 0, null, true, null);
+
+     final String expectedName = "ERR194147.1758538/1 ACGTACGT";
+     final String expectedBases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA";
+     final String expectedQuals = "55555??????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'";
+     assertEquals(expectedName, fastq.getReadName());
+     assertEquals(expectedBases, fastq.getReadString());
+     assertEquals(expectedQuals, fastq.getBaseQualityString());
+ }
+
+ /**
+  * Converts a record with flag 83. The expected FASTQ bases are not the stored bases: they
+  * read as the reverse complement of the stored sequence followed by the ZT bases ("CTG").
+  */
+ @Test
+ public void getFastqFromSamReverse() {
+     final String bases = "TTCTTTGGCCTAATGACATGGCTATTAGTGCACAAGGAAATGGTCAAAAATGGGAAGAAATGTAGGTCACAAAATATTGCACAAAGCTATACTTACTT";
+     final String quals = "??????????????????????????????????????????????????????????????????????????????????????????????????";
+
+     final SAMRecord sam = new SAMRecord(null);
+     sam.setReadName("ERR194147.421820475");
+
+     // first read
+     sam.setReadBases(bases.getBytes());
+     sam.setBaseQualityString(quals);
+     sam.setFlags(83);
+     sam.setMappingQuality(60);
+     sam.setCigarString("98M");
+     sam.setInferredInsertSize(-323);
+     sam.setAttribute("OQ", "??????????????????????????????????????????????????????????????????????????????????????????????????");
+     sam.setAttribute("MQ", 60);
+     sam.setAttribute("MC", "3S98M");
+     sam.setAttribute("ZH", "/1 XYZ.123.25");
+     sam.setAttribute("ZT", "CTG+???");
+
+     final FastqRecord fastq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(sam, 1, 0, null, null, 0, null, true, null);
+
+     final String expectedName = "ERR194147.421820475/1 XYZ.123.25";
+     final String expectedBases = "AAGTAAGTATAGCTTTGTGCAATATTTTGTGACCTACATTTCTTCCCATTTTTGACCATTTCCTTGTGCACTAATAGCCATGTCATTAGGCCAAAGAACTG";
+     final String expectedQuals = "?????????????????????????????????????????????????????????????????????????????????????????????????????";
+     assertEquals(expectedName, fastq.getReadName());
+     assertEquals(expectedBases, fastq.getReadString());
+     assertEquals(expectedQuals, fastq.getBaseQualityString());
+ }
+ /**
+  * Same record as {@code getFastqFromSam} but with a ZH value that contains an extra space
+  * ("/1 foo bar"). The expected FASTQ name is still the read name followed by the whole ZH value.
+  */
+ @Test
+ public void getFastqFromSamDodgyTag() {
+     final String bases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT";
+     final String quals = "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????";
+
+     final SAMRecord sam = new SAMRecord(null);
+     sam.setReadName("ERR194147.1758538");
+
+     // first read
+     sam.setReadBases(bases.getBytes());
+     sam.setBaseQualityString(quals);
+     sam.setFlags(99);
+     sam.setMappingQuality(60);
+     sam.setCigarString("97M");
+     sam.setInferredInsertSize(330);
+     sam.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????");
+     sam.setAttribute("MQ", 60);
+     sam.setAttribute("MC", "3S98M");
+     sam.setAttribute("ZH", "/1 foo bar");
+     sam.setAttribute("ZT", "GCGA+???'");
+
+     final FastqRecord fastq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(sam, 1, 0, null, null, 0, null, true, null);
+
+     final String expectedName = "ERR194147.1758538/1 foo bar";
+     final String expectedBases = "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA";
+     final String expectedQuals = "???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'";
+     assertEquals(expectedName, fastq.getReadName());
+     assertEquals(expectedBases, fastq.getReadString());
+     assertEquals(expectedQuals, fastq.getBaseQualityString());
+ }
+
+ /**
+  * Runs the tool with OUTPUT_PER_RG=true against a freshly created input file that is never
+  * populated, and expects the conversion to fail.
+  */
+ @Test
+ public void testMissingRgFileOutputPerRg() throws IOException {
+     final File unpopulatedSam = testFolder.newFile("testMissingRgFileOutputPerRg.sam");
+     final File perRgDir = testFolder.newFolder();
+
+     final String[] arguments = {
+             "INPUT=" + unpopulatedSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + perRgDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     };
+     convertFile(arguments, false);
+ }
+
+ /**
+  * Exercises per-read-group and interleaved output on grouped input.
+  *
+  * The first SAM file contains reads whose mate is absent (e.g. foo:record:2 only has its
+  * second-of-pair record), and the conversion is expected to fail. The file is then rewritten
+  * with every mate present; that conversion must succeed and its output is verified per read
+  * group and, in a second run, as a single interleaved FASTQ.
+  */
+ @Test
+ public void groupedUnpairedMate() throws IOException {
+     final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
+     final File outputDir = testFolder.newFolder();
+
+     // some reads lack a mate: conversion must fail
+     populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:unsorted",
+             "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA",
+             "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA",
+             "foo:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1",
+             "foo:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1",
+             "foo:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1",
+             "foo:record:3\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1",
+             "bar:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg2",
+             "bar:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg2"));
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, false);
+
+     // every read has both mates: conversion must succeed
+     populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:unsorted",
+             "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA",
+             "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA",
+             "foo:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1",
+             "foo:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1",
+             "foo:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1",
+             "foo:record:2\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1",
+             "bar:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg2",
+             "bar:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg2"));
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, true);
+
+     for (final String readGroup : new String[]{"rg1", "rg2"}) {
+         final File[] fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith(readGroup));
+         // a JUnit assertion is used because the Java assert keyword is a no-op unless -ea is set
+         Assert.assertNotNull(fastqFiles);
+         assertEquals(2, fastqFiles.length);
+         // listFiles gives no ordering guarantee, so work out which file holds the first mates
+         final boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq");
+         verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam, readGroup);
+     }
+
+     final File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq");
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "FASTQ=" + outputFastq.getAbsolutePath(),
+             "INTERLEAVE=true"
+     }, true);
+     final Set<String> outputHeaderSet = createFastqReadHeaderSet(outputFastq);
+     // Create map of mate pairs from SAM records
+     final Map<String, MatePair> map = createSamMatePairsMap(inputSam, null);
+     Assert.assertEquals(map.size() * 2, outputHeaderSet.size());
+
+     // Ensure that both mates of each pair in the SAM file are present in the interleaved fastq
+     for (final MatePair mpair : map.values()) {
+         Assert.assertNotNull(mpair.mate1); // ensure we have two mates
+         Assert.assertNotNull(mpair.mate2);
+         Assert.assertEquals(mpair.mate1.getReadName(), mpair.mate2.getReadName());
+         final String readName = mpair.mate1.getReadName();
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/1"));
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/2"));
+     }
+ }
+
+ /**
+  * The first-of-pair record of bar:record:1 is the first record in the file and its mate is
+  * the last one, with four complete pairs in between. Conversion must succeed for both
+  * per-read-group output and interleaved output.
+  */
+ @Test
+ public void firstMateAtStartLastMateAtEnd() throws IOException {
+     final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
+     final File outputDir = testFolder.newFolder();
+
+     // SAM fields must be TAB-separated; these suffixes hold everything after the read name
+     final String mate1 = "\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1";
+     final String mate2 = "\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1";
+     populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:unsorted",
+             "@RG\tID:rg1\tSM:s1\tPU:blah\tPL:ILLUMINA",
+             "bar:record:1" + mate1,
+             "bar:record:2" + mate1,
+             "bar:record:2" + mate2,
+             "bar:record:3" + mate1,
+             "bar:record:3" + mate2,
+             "bar:record:4" + mate1,
+             "bar:record:4" + mate2,
+             "bar:record:5" + mate1,
+             "bar:record:5" + mate2,
+             "bar:record:1" + mate2));
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, true);
+     final File[] fastqFiles = outputDir.listFiles((dir, file) -> file.endsWith(".fastq"));
+     // a JUnit assertion is used because the Java assert keyword is a no-op unless -ea is set
+     Assert.assertNotNull(fastqFiles);
+     assertEquals(2, fastqFiles.length);
+     // listFiles gives no ordering guarantee, so work out which file holds the first mates
+     final boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq");
+     verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam);
+
+     final File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq");
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "FASTQ=" + outputFastq.getAbsolutePath(),
+             "INTERLEAVE=true"
+     }, true);
+     final Set<String> outputHeaderSet = createFastqReadHeaderSet(outputFastq);
+     // Create map of mate pairs from SAM records
+     final Map<String, MatePair> map = createSamMatePairsMap(inputSam, null);
+     Assert.assertEquals(map.size() * 2, outputHeaderSet.size());
+
+     // Ensure that both mates of each pair in the SAM file are present in the interleaved fastq
+     for (final MatePair mpair : map.values()) {
+         Assert.assertNotNull(mpair.mate1); // ensure we have two mates
+         Assert.assertNotNull(mpair.mate2);
+         Assert.assertEquals(mpair.mate1.getReadName(), mpair.mate2.getReadName());
+         final String readName = mpair.mate1.getReadName();
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/1"));
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/2"));
+     }
+ }
+
+ /**
+  * Converts two SAM files to per-read-group FASTQ and checks that the outputs are identical.
+  *
+  * The "orig" file holds full-length reads and full read names. The second file holds the same
+  * reads, but some have shortened sequences with a ZT tag and some have a shortened read name
+  * with a ZH tag. Each FASTQ file produced from the original must have the same MD5 as the
+  * equally named file produced from the second input.
+  */
+ @Test
+ public void trimmedHeaders() throws IOException {
+     final File inputOrigSam = testFolder.newFile("trimmedHeadersOrig.sam");
+     final File inputSam = testFolder.newFile("trimmedHeaders.sam");
+     final File outputOrigDir = testFolder.newFolder();
+     final File outputDir = testFolder.newFolder();
+
+     // SAM fields must be TAB-separated. Building blocks shared by both fixtures:
+     final String cols = "\t*\t0\t0\t*\t*\t0\t0\t"; // RNAME..TLEN columns between FLAG and SEQ
+     final String mate1 = "\t77" + cols + "AAAAAAAAAAAAA\t1111111111111\t";
+     final String mate2 = "\t141" + cols + "CCCCCCCCCCCCC\t2222222222222\t";
+     final String rg1Tags = "RG:Z:rg1\tCR:Z:AAAAA\tUR:Z:TTT\tCY:Z:11111\tUY:Z:222";
+     final String rg2Tags = "RG:Z:rg2\tCR:Z:CCCCC\tUR:Z:GGG\tCY:Z:22222\tUY:Z:111";
+     final String hd = "@HD\tVN:1.0\tSO:queryname";
+     final String rg1Header = "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA";
+     final String rg2Header = "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA";
+
+     populateSamFile(inputOrigSam, Arrays.asList(hd, rg1Header, rg2Header,
+             "foo:record:1" + mate1 + rg1Tags,
+             "foo:record:1" + mate2 + rg1Tags,
+             "foo:record:2" + mate1 + rg1Tags,
+             "foo:record:2" + mate2 + rg1Tags,
+             "foo:record:3" + mate1 + rg1Tags,
+             "foo:record:3" + mate2 + rg1Tags,
+             "bar:record:1" + mate1 + rg2Tags,
+             "bar:record:1" + mate2 + rg2Tags,
+             "bar:record:2" + mate2 + rg2Tags,
+             "bar:record:2" + mate1 + rg2Tags));
+     populateSamFile(inputSam, Arrays.asList(hd, rg1Header, rg2Header,
+             "foo:record:1\t77" + cols + "AAAAAAAAA\t111111111\tRG:Z:rg1\tZT:Z:AAAA+1111\tCR:Z:AAAAA\tUR:Z:TTT\tCY:Z:11111\tUY:Z:222",
+             "foo:record:1\t141" + cols + "CCCCCCCCC\t222222222\tRG:Z:rg1\tCR:Z:AAAAA\tZT:Z:cccc+2222\tUR:Z:TTT\tCY:Z:11111\tUY:Z:222",
+             "foo:record:2" + mate1 + rg1Tags,
+             "foo:record:2" + mate2 + rg1Tags,
+             "foo:record" + mate1 + rg1Tags + "\tZH:Z::3/1",
+             "foo:record" + mate2 + "RG:Z:rg1\tCR:Z:AAAAA\tUR:Z:TTT\tZH:Z::3/2\tCY:Z:11111\tUY:Z:222",
+             "bar:record:1\t77" + cols + "AAAAAAAAA\t111111111\tRG:Z:rg2\tCR:Z:CCCCC\tZT:Z:aaaa+1111\tUR:Z:GGG\tCY:Z:22222\tUY:Z:111",
+             "bar:record:1" + mate2 + rg2Tags,
+             "bar:record:2\t141" + cols + "CCCCCCCCCC\t2222222222\tRG:Z:rg2\tCR:Z:CCCCC\tUR:Z:GGG\tCY:Z:22222\tZT:Z:CCC+222\tUY:Z:111",
+             "bar:record:2" + mate1 + rg2Tags));
+
+     convertFile(new String[]{
+             "INPUT=" + inputOrigSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputOrigDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, true);
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, true);
+     final File[] fastqOrigFiles = outputOrigDir.listFiles((dir, file) -> file.endsWith(".fastq"));
+     final File[] fastqFiles = outputDir.listFiles((dir, file) -> file.endsWith(".fastq"));
+     // JUnit assertions are used because the Java assert keyword is a no-op unless -ea is set
+     Assert.assertNotNull(fastqOrigFiles);
+     Assert.assertNotNull(fastqFiles);
+
+     // compute an MD5 per output file, then compare equally named files
+     final Map<String, String> mapOrig = md5ByFileName(fastqOrigFiles);
+     final Map<String, String> map = md5ByFileName(fastqFiles);
+
+     // without this check the loop below would pass vacuously if nothing had been written
+     Assert.assertFalse("no fastq files were produced from the original input", mapOrig.isEmpty());
+     for (final Map.Entry<String, String> entry : mapOrig.entrySet()) {
+         assertEquals("MD5 mismatch for " + entry.getKey(), entry.getValue(), map.get(entry.getKey()));
+     }
+ }
+
+ /** Maps each file's name to the hexadecimal MD5 digest of its contents. */
+ private static Map<String, String> md5ByFileName(final File[] files) {
+     return Arrays.stream(files).collect(Collectors.toMap(File::getName, f -> {
+         try {
+             return new BigInteger(1, MessageDigest.getInstance("MD5").digest(Files.readAllBytes(f.toPath()))).toString(16);
+         } catch (NoSuchAlgorithmException | IOException e) {
+             throw new RuntimeException(e);
+         }
+     }));
+ }
+
+ /**
+  * Every read has both mates, but the last pair (bar:record:2) lists its second-of-pair record
+  * before its first-of-pair record. Conversion must succeed for per-read-group output and for
+  * interleaved output.
+  */
+ @Test
+ public void groupedLastPairMatesFlipped() throws IOException {
+     final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
+     final File outputDir = testFolder.newFolder();
+
+     // SAM fields must be TAB-separated; these pieces hold everything after the read name
+     final String mate1 = "\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\t";
+     final String mate2 = "\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\t";
+     final String rg1Tags = "RG:Z:rg1\tCR:Z:AAAAA\tUR:Z:TTT\tCY:Z:11111\tUY:Z:222";
+     final String rg2Tags = "RG:Z:rg2\tCR:Z:CCCCC\tUR:Z:GGG\tCY:Z:22222\tUY:Z:111";
+     populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:queryname",
+             "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA",
+             "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA",
+             "foo:record:1" + mate1 + rg1Tags,
+             "foo:record:1" + mate2 + rg1Tags,
+             "foo:record:2" + mate1 + rg1Tags,
+             "foo:record:2" + mate2 + rg1Tags,
+             "foo:record:3" + mate1 + rg1Tags,
+             "foo:record:3" + mate2 + rg1Tags,
+             "bar:record:1" + mate1 + rg2Tags,
+             "bar:record:1" + mate2 + rg2Tags,
+             "bar:record:2" + mate2 + rg2Tags,
+             "bar:record:2" + mate1 + rg2Tags));
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
+             "OUTPUT_PER_RG=true"
+     }, true);
+
+     for (final String readGroup : new String[]{"rg1", "rg2"}) {
+         final File[] fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith(readGroup));
+         // a JUnit assertion is used because the Java assert keyword is a no-op unless -ea is set
+         Assert.assertNotNull(fastqFiles);
+         assertEquals(2, fastqFiles.length);
+         // listFiles gives no ordering guarantee, so work out which file holds the first mates
+         final boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq");
+         verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam, readGroup);
+     }
+
+     final File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq");
+     convertFile(new String[]{
+             "INPUT=" + inputSam.getAbsolutePath(),
+             "FASTQ=" + outputFastq.getAbsolutePath(),
+             "INTERLEAVE=true"
+     }, true);
+     final Set<String> outputHeaderSet = createFastqReadHeaderSet(outputFastq);
+     // Create map of mate pairs from SAM records
+     final Map<String, MatePair> map = createSamMatePairsMap(inputSam, null);
+     Assert.assertEquals(map.size() * 2, outputHeaderSet.size());
+
+     // Ensure that both mates of each pair in the SAM file are present in the interleaved fastq
+     for (final MatePair mpair : map.values()) {
+         Assert.assertNotNull(mpair.mate1); // ensure we have two mates
+         Assert.assertNotNull(mpair.mate2);
+         Assert.assertEquals(mpair.mate1.getReadName(), mpair.mate2.getReadName());
+         final String readName = mpair.mate1.getReadName();
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/1"));
+         Assert.assertTrue(outputHeaderSet.contains(readName + "/2"));
+     }
+ }
+
+ private void convertFile(final String [] args, boolean expectSuccess) {
+ int exitStatus = 1;
+ try {
+ exitStatus = new SamToFastqWithHeaders().instanceMain(args);
+ if ( ! expectSuccess) {
+ Assert.fail("Should have thrown a PicardException");
+ }
+ } catch (Exception ignored) {System.out.println("ignored exception: " + ignored);}
+ assertEquals(expectSuccess ? 0 : 1, exitStatus);
+ }
+
+ /**
+  * Overwrites {@code sam} with the given lines, each followed by a single {@code '\n'}.
+  *
+  * @param sam  file to write; existing content is replaced
+  * @param data lines to write, in order
+  * @throws RuntimeException wrapping the {@link IOException} if the file cannot be written
+  */
+ private void populateSamFile(File sam, List<String> data) {
+     // the element type is required: a raw List cannot be iterated as String
+     try (FileWriter fw = new FileWriter(sam)) {
+         for (String s : data) {
+             fw.write(s + "\n");
+         }
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     }
+ }
+
+ private void verifyFastq(final File pair1File, final File pair2File, final File samFile) throws IOException {
+ verifyFastq(pair1File, pair2File, samFile, null);
+ }
+ /**
+  * Verifies a pair of FASTQ files against the mate pairs found in a SAM file.
+  *
+  * Both FASTQ files must contain the same number of distinct read headers, and that number must
+  * equal the number of mate pairs in the SAM file. For every pair, {@code name/1} must appear in
+  * the first FASTQ file and {@code name/2} in the second.
+  *
+  * @param pair1File FASTQ file expected to hold the first-of-pair reads
+  * @param pair2File FASTQ file expected to hold the second-of-pair reads
+  * @param samFile   SAM file the FASTQ files were generated from
+  * @param readGroup read group id to restrict the SAM records to, or null for all records
+  * @throws IOException if the SAM file cannot be read
+  */
+ private void verifyFastq(final File pair1File, final File pair2File, final File samFile, String readGroup) throws IOException {
+     // Check that paired fastq files are same size
+     final Set<String> outputHeaderSet1 = createFastqReadHeaderSet(pair1File);
+     final Set<String> outputHeaderSet2 = createFastqReadHeaderSet(pair2File);
+     Assert.assertEquals(outputHeaderSet1.size(), outputHeaderSet2.size());
+
+     // Create map of mate pairs from SAM records
+     final Map<String, MatePair> map = createSamMatePairsMap(samFile, readGroup);
+     Assert.assertEquals(map.size(), outputHeaderSet2.size());
+
+     // Ensure that each mate of each pair in SAM file is in the correct fastq pair file
+     for (final MatePair mpair : map.values()) {
+         Assert.assertNotNull(mpair.mate1); // ensure we have two mates
+         Assert.assertNotNull(mpair.mate2);
+         Assert.assertEquals(mpair.mate1.getReadName(), mpair.mate2.getReadName());
+         final String readName = mpair.mate1.getReadName();
+         Assert.assertTrue(outputHeaderSet1.contains(readName + "/1")); // ensure mate is in correct file
+         Assert.assertTrue(outputHeaderSet2.contains(readName + "/2"));
+     }
+ }
+
+ private Map createSamMatePairsMap(final File samFile, final String readGroup) throws IOException {
+ IOUtil.assertFileIsReadable(samFile);
+ final SamReader reader = SamReaderFactory.makeDefault().open(samFile);
+
+ final Map map = new LinkedHashMap<>();
+ for (final SAMRecord record : reader ) {
+ if (null == readGroup || record.getReadGroup().getReadGroupId().equals(readGroup)) {
+ MatePair mpair = map.get(record.getReadName());
+ if (mpair == null) {
+ mpair = new MatePair();
+ map.put(record.getReadName(), mpair);
+ }
+ mpair.add(record);
+ }
+ }
+ reader.close();
+ return map;
+ }
+
+ /**
+  * Collects the read names of every record in a FASTQ file.
+  *
+  * @param file FASTQ file to read
+  * @return the distinct read names found in {@code file}
+  */
+ protected static Set<String> createFastqReadHeaderSet(final File file) {
+     final Set<String> set = new HashSet<>();
+     try (final FastqReader freader = new FastqReader(file)) {
+         while (freader.hasNext()) {
+             final FastqRecord frec = freader.next();
+             set.add(frec.getReadName());
+         }
+     }
+     return set;
+ }
+
+ /** Holds the first-of-pair and second-of-pair records that share one read name. */
+ static class MatePair {
+     SAMRecord mate1;
+     SAMRecord mate2;
+
+     /**
+      * Stores {@code record} in the slot matching its pair flag.
+      *
+      * @throws PicardException if the record is not paired, has neither pair flag set,
+      *                         or its slot is already occupied
+      */
+     void add(final SAMRecord record) {
+         if (!record.getReadPairedFlag()) {
+             throw new PicardException("Record "+record.getReadName()+" is not paired");
+         }
+
+         if (record.getFirstOfPairFlag()) {
+             if (mate1 != null) {
+                 throw new PicardException("Mate 1 already set for record: "+record.getReadName());
+             }
+             mate1 = record;
+             return;
+         }
+
+         if (!record.getSecondOfPairFlag()) {
+             throw new PicardException("Neither FirstOfPairFlag or SecondOfPairFlag is set for a paired record");
+         }
+         if (mate2 != null) {
+             throw new PicardException("Mate 2 already set for record: "+record.getReadName());
+         }
+         mate2 = record;
+     }
+ }
+}