diff --git a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java new file mode 100644 index 000000000..96abbf20f --- /dev/null +++ b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java @@ -0,0 +1,563 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.*; +import htsjdk.samtools.fastq.FastqConstants; +import htsjdk.samtools.fastq.FastqReader; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * Converts a FASTQ file to an unaligned BAM or SAM file. + *

+ * Output read records will contain the original base calls and quality scores will be + * translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina. + *

+ *

+ * There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ + * (e.g. see RG or SM below). + *

+ *

Inputs

+ *

+ * One FASTQ file name for single-end, or two for paired-end, sequencing input data. + * These files may be gzip-compressed (when the file name ends with ".gz"). + *

+ *

+ * Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name (see USE_SEQUENTIAL_FASTQS for details below). + *

+ *

+ * By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly + * using the QUALITY_FORMAT argument. + *

+ *

Output

+ * A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name. + *

Usage examples

+ * + *

Example 1:

+ *

+ * Single-end sequencing FASTQ file conversion. All reads are annotated + * as belonging to the "rg0013" read group that in turn is part of the sample "sample001". + *

+ *
+     * java -jar picard.jar FastqToSam \
+     *      F1=input_reads.fastq \
+     *      O=unaligned_reads.bam \
+     *      SM=sample001 \
+     *      RG=rg0013
+     * 
+ *

Example 2:

+ *

+ * Similar to example 1 above, but for paired-end sequencing. + *

+ *
+     * java -jar picard.jar FastqToSam \
+     *      F1=forward_reads.fastq \
+     *      F2=reverse_reads.fastq \
+     *      O=unaligned_read_pairs.bam \
+     *      SM=sample001 \
+     *      RG=rg0013
+     * 
+ */ +@CommandLineProgramProperties( + summary = "

" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY + ".

" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_DETAILS, + oneLineSummary = org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY, + programGroup = ReadDataManipulationProgramGroup.class) +@DocumentedFeature +public class FastqToSamWithHeaders extends CommandLineProgram { + + public static void main(final String[] argv) { + + + int exitStatus = new FastqToSamWithHeaders().instanceMain(argv); + + System.exit(exitStatus); + } + static final String USAGE_SUMMARY = + "Converts a FASTQ file to an unaligned BAM or SAM file"; + static final String USAGE_DETAILS = + "

Output read records will contain the original base calls and quality scores will be " + + "translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina.

" + + "

There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ " + + "(e.g see RG or SM below).

" + + "

Inputs

" + + "

One FASTQ file name for single-end or two for pair-end sequencing input data. " + + "These files might be in gzip compressed format (when file name is ending with \".gz\").

" + + "

Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name " + + "(see USE_SEQUENCIAL_FASTQ for details below).

" + + "

By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly " + + "using the QUALITY_FORMAT argument.

" + + "

Output

" + + "

A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name.

" + + "

Usage examples

" + + "

Example 1:

" + + "

Single-end sequencing FASTQ file conversion. All reads are annotated " + + "as belonging to the \"rg0013\" read group that in turn is part of the sample \"sample001\".

" + + "
java -jar picard.jar FastqToSam \\\n" +
+                        "        F1=input_reads.fastq \\\n" +
+                        "        O=unaligned_reads.bam \\\n" +
+                        "        SM=sample001 \\\n" +
+                        "        RG=rg0013
" + + "

Example 2:

" + + "

Similar to example 1 above, but for paired-end sequencing.

" + + "
java -jar picard.jar FastqToSam \\\n" +
+                        "       F1=forward_reads.fastq \\\n" +
+                        "       F2=reverse_reads.fastq \\\n" +
+                        "       O=unaligned_read_pairs.bam \\\n" +
+                        "       SM=sample001 \\\n" +
+                        "       RG=rg0013

"; + + private static final Log LOG = Log.getInstance(picard.sam.FastqToSam.class); + + @Argument(shortName="F1", doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.") + public File FASTQ; + + @Argument(shortName="F2", doc="Input fastq file (optionally gzipped) for the second read of paired end data.", optional=true) + public File FASTQ2; + + @Argument(doc="Use sequential fastq files with the suffix _###.fastq or _###.fastq.gz." + + "The files should be named:\n" + + " _001., _002., ..., _XYZ.\n" + + " The base files should be:\n" + + " _001.\n" + + " An example would be:\n" + + " RUNNAME_S8_L005_R1_001.fastq\n" + + " RUNNAME_S8_L005_R1_002.fastq\n" + + " RUNNAME_S8_L005_R1_003.fastq\n" + + " RUNNAME_S8_L005_R1_004.fastq\n" + + "RUNNAME_S8_L005_R1_001.fastq should be provided as FASTQ.", optional=true) + public boolean USE_SEQUENTIAL_FASTQS = false; + + @Argument(shortName="V", doc="A value describing how the quality values are encoded in the input FASTQ file. " + + "Either Solexa (phred scaling + 66), Illumina (phred scaling + 64) or Standard (phred scaling + 33). " + + "If this value is not specified, the quality format will be detected automatically.", optional = true) + public FastqQualityFormat QUALITY_FORMAT; + + @Argument(doc="Output SAM/BAM file. ", shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME) + public File OUTPUT ; + + @Argument(shortName="RG", doc="Read group name") + public String READ_GROUP_NAME = "A"; + + @Argument(shortName="SM", doc="Sample name to insert into the read group header") + public String SAMPLE_NAME; + + @Argument(shortName="LB", doc="The library name to place into the LB attribute in the read group header", optional=true) + public String LIBRARY_NAME; + + @Argument(shortName="PU", doc="The platform unit (often run_barcode.lane) to insert into the read group header", optional=true) + public String PLATFORM_UNIT; + + @Argument(shortName="PL", doc="The platform type (e.g. 
ILLUMINA, SOLID) to insert into the read group header", optional=true) + public String PLATFORM; + + @Argument(shortName="CN", doc="The sequencing center from which the data originated", optional=true) + public String SEQUENCING_CENTER; + + @Argument(shortName = "PI", doc = "Predicted median insert size, to insert into the read group header", optional = true) + public Integer PREDICTED_INSERT_SIZE; + + @Argument(shortName = "PG", doc = "Program group to insert into the read group header.", optional=true) + public String PROGRAM_GROUP; + + @Argument(shortName = "PM", doc = "Platform model to insert into the group header (free-form text providing further details of the platform/technology used)", optional=true) + public String PLATFORM_MODEL; + + @Argument(doc="Comment(s) to include in the merged output file's header.", optional=true, shortName="CO") + public List COMMENT = new ArrayList<>(); + + @Argument(shortName = "DS", doc = "Inserted into the read group header", optional = true) + public String DESCRIPTION; + + @Argument(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true) + public Iso8601Date RUN_DATE; + + @Argument(shortName="SO", doc="The sort order for the output sam/bam file.") + public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.queryname; + + @Argument(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.") + public int MIN_Q = 0; + + @Argument(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.") + public int MAX_Q = SAMUtils.MAX_PHRED_SCORE; + + @Deprecated + @Argument(doc="Deprecated (No longer used). 
If true and this is an unpaired fastq any occurrence of '/1' or '/2' will be removed from the end of a read name.") + public Boolean STRIP_UNPAIRED_MATE_NUMBER = false; + + @Argument(doc="Allow (and ignore) empty lines") + public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; + + public static final String ZT_ATTRIBUTE = "ZT"; + public static final String ZH_ATTRIBUTE = "ZH"; + + public static final String TRIMMED_BASES = " TB:"; + + private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); + + /** + * Looks at fastq input(s) and attempts to determine the proper quality format + * + * Closes the reader(s) by side effect + * + * @param reader1 The first fastq input + * @param reader2 The second fastq input, if necessary. To not use this input, set it to null + * @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur + */ + public static FastqQualityFormat determineQualityFormat(final FastqReader reader1, final FastqReader reader2, final FastqQualityFormat expectedQuality) { + final QualityEncodingDetector detector = new QualityEncodingDetector(); + + if (reader2 == null) { + detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1); + } else { + detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1, reader2); + reader2.close(); + } + + reader1.close(); + + final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality); + if (detector.isDeterminationAmbiguous()) { + LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities."); + } + LOG.info(String.format("Auto-detected quality format as: %s.", qualityFormat)); + + return qualityFormat; + } + + + /** + * Get a list of FASTQs that are sequentially numbered based on the first (base) fastq. 
+ * The files should be named: + * _001., _002., ..., _XYZ. + * The base files should be: + * _001. + * An example would be: + * RUNNAME_S8_L005_R1_001.fastq + * RUNNAME_S8_L005_R1_002.fastq + * RUNNAME_S8_L005_R1_003.fastq + * RUNNAME_S8_L005_R1_004.fastq + * where `baseFastq` is the first in that list. + */ + protected static List getSequentialFileList(final File baseFastq) { + final List files = new ArrayList<>(); + files.add(baseFastq); + + // Find the correct extension used in the base FASTQ + FastqConstants.FastqExtensions fastqExtensions = null; + String suffix = null; // store the suffix including the extension + for (final FastqConstants.FastqExtensions ext : FastqConstants.FastqExtensions.values()) { + suffix = "_001" + ext.getExtension(); + if (baseFastq.getAbsolutePath().endsWith(suffix)) { + fastqExtensions = ext; + break; + } + } + if (null == fastqExtensions) { + throw new PicardException(String.format("Could not parse the FASTQ extension (expected '_001' + '%s'): %s", FastqConstants.FastqExtensions.values().toString(), baseFastq)); + } + + // Find all the files + for (int idx = 2; true; idx++) { + String fastq = baseFastq.getAbsolutePath(); + fastq = String.format("%s_%03d%s", fastq.substring(0, fastq.length() - suffix.length()), idx, fastqExtensions.getExtension()); + try { + IOUtil.assertFileIsReadable(new File(fastq)); + } catch (final SAMException e) { // the file is not readable, so do not continue + break; + } + files.add(new File(fastq)); + } + + return files; + } + + /* Simply invokes the right method for unpaired or paired data. 
*/ + protected int doWork() { + IOUtil.assertFileIsReadable(FASTQ); + if (FASTQ2 != null) { + IOUtil.assertFileIsReadable(FASTQ2); + } + IOUtil.assertFileIsWritable(OUTPUT); + + final SAMFileHeader header = createSamFileHeader(); + final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); + + // Set the quality format + QUALITY_FORMAT = picard.sam.FastqToSam.determineQualityFormat(fileToFastqReader(FASTQ), + (FASTQ2 == null) ? null : fileToFastqReader(FASTQ2), + QUALITY_FORMAT); + + // Lists for sequential files, but also used when not sequential + final List readers1 = new ArrayList<>(); + final List readers2 = new ArrayList<>(); + + if (USE_SEQUENTIAL_FASTQS) { + // Get all the files + for (final File fastq : getSequentialFileList(FASTQ)) { + readers1.add(fileToFastqReader(fastq)); + } + if (null != FASTQ2) { + for (final File fastq : getSequentialFileList(FASTQ2)) { + readers2.add(fileToFastqReader(fastq)); + } + if (readers1.size() != readers2.size()) { + throw new PicardException(String.format("Found %d files for FASTQ and %d files for FASTQ2.", readers1.size(), readers2.size())); + } + } + } + else { + readers1.add(fileToFastqReader(FASTQ)); + if (FASTQ2 != null) { + readers2.add(fileToFastqReader(FASTQ2)); + } + } + + // Loop through the FASTQs + for (int idx = 0; idx < readers1.size(); idx++) { + makeItSo(readers1.get(idx), + (readers2.isEmpty()) ? null : readers2.get(idx), + writer); + } + + // Close all the things + for (final FastqReader reader : readers1) reader.close(); + for (final FastqReader reader : readers2) reader.close(); + writer.close(); + + return 0; + } + + /** + * Handles the FastqToSam execution on the FastqReader(s). + * + * In some circumstances it might be useful to circumvent the command line based instantiation of this + * class, however note that there is no handholding or guardrails to running in this manner. 
+ * + * It is the caller's responsibility to close the reader(s) + * + * @param reader1 The FastqReader for the first fastq file + * @param reader2 The second FastqReader if applicable. Pass in null if only using a single reader + * @param writer The SAMFileWriter where the new SAM file is written + * + */ + public void makeItSo(final FastqReader reader1, final FastqReader reader2, final SAMFileWriter writer) { + final int readCount = (reader2 == null) ? doUnpaired(reader1, writer) : doPaired(reader1, reader2, writer); + LOG.info("Processed " + readCount + " fastq reads"); + } + + /** Creates a simple SAM file from a single fastq file. */ + protected int doUnpaired(final FastqReader freader, final SAMFileWriter writer) { + int readCount = 0; + final ProgressLogger progress = new ProgressLogger(LOG); + for ( ; freader.hasNext() ; readCount++) { + final FastqRecord frec = freader.next(); + final String frecName = SequenceUtil.getSamReadNameFromFastqHeader(frec.getReadName()); + final SAMRecord srec = createSamRecord(writer.getFileHeader(), frecName , frec, false) ; + srec.setReadPairedFlag(false); + writer.addAlignment(srec); + progress.record(srec); + } + + return readCount; + } + + /** More complicated method that takes two fastq files and builds pairing information in the SAM. 
*/ + protected int doPaired(final FastqReader freader1, final FastqReader freader2, final SAMFileWriter writer) { + int readCount = 0; + final ProgressLogger progress = new ProgressLogger(LOG); + for ( ; freader1.hasNext() && freader2.hasNext() ; readCount++) { + final FastqRecord frec1 = freader1.next(); + final FastqRecord frec2 = freader2.next(); + + final String frec1Name = SequenceUtil.getSamReadNameFromFastqHeader(frec1.getReadName()); + final String frec2Name = SequenceUtil.getSamReadNameFromFastqHeader(frec2.getReadName()); + final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2); + + final SAMRecord srec1 = createSamRecord(writer.getFileHeader(), baseName, frec1, true) ; + srec1.setFirstOfPairFlag(true); + srec1.setSecondOfPairFlag(false); + writer.addAlignment(srec1); + progress.record(srec1); + + final SAMRecord srec2 = createSamRecord(writer.getFileHeader(), baseName, frec2, true) ; + srec2.setFirstOfPairFlag(false); + srec2.setSecondOfPairFlag(true); + writer.addAlignment(srec2); + progress.record(srec2); + } + + if (freader1.hasNext() || freader2.hasNext()) { + throw new PicardException("Input paired fastq files must be the same length"); + } + + return readCount; + } + + private FastqReader fileToFastqReader(final File file) { + return new FastqReader(file, ALLOW_AND_IGNORE_EMPTY_LINES); + } + + + public static SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired, String readGroupName, FastqQualityFormat fqFormat, int minQ, int maxQ) { + final SAMRecord srec = new SAMRecord(header); + srec.setReadName(baseName); + srec.setReadString(frec.getReadString()); + srec.setReadUnmappedFlag(true); + srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, readGroupName); + String additionalHeader = frec.getReadName().replace(baseName, ""); + if (additionalHeader.length() > 0) { + /* + If this contains the trimmed bases flag (TB:) then put that in a separate tag + */ + int 
tbIndex = additionalHeader.indexOf(TRIMMED_BASES); + if (tbIndex == -1) { + srec.setAttribute(ZH_ATTRIBUTE, additionalHeader); + } else { + srec.setAttribute(ZT_ATTRIBUTE, additionalHeader.substring(tbIndex + 4)); + if (tbIndex > 0) { + srec.setAttribute(ZH_ATTRIBUTE, additionalHeader.substring(0, tbIndex)); + } + } + } + final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString()); + convertQuality(quals, fqFormat); + for (final byte qual : quals) { + final int uQual = qual & 0xff; + if (uQual < minQ || uQual > maxQ) { + throw new PicardException("Base quality " + uQual + " is not in the range " + minQ + ".." + + maxQ + " for read " + frec.getReadName()); + } + } + srec.setBaseQualities(quals); + + if (paired) { + srec.setReadPairedFlag(true); + srec.setMateUnmappedFlag(true); + } + return srec; + } + private SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired) { + return FastqToSamWithHeaders.createSamRecord(header, baseName, frec, paired, READ_GROUP_NAME, QUALITY_FORMAT, MIN_Q, MAX_Q); + } + + /** Creates a simple header with the values provided on the command line. 
*/ + public SAMFileHeader createSamFileHeader() { + final SAMReadGroupRecord rgroup = new SAMReadGroupRecord(this.READ_GROUP_NAME); + rgroup.setSample(this.SAMPLE_NAME); + if (this.LIBRARY_NAME != null) rgroup.setLibrary(this.LIBRARY_NAME); + if (this.PLATFORM != null) rgroup.setPlatform(this.PLATFORM); + if (this.PLATFORM_UNIT != null) rgroup.setPlatformUnit(this.PLATFORM_UNIT); + if (this.SEQUENCING_CENTER != null) rgroup.setSequencingCenter(SEQUENCING_CENTER); + if (this.PREDICTED_INSERT_SIZE != null) rgroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE); + if (this.DESCRIPTION != null) rgroup.setDescription(this.DESCRIPTION); + if (this.RUN_DATE != null) rgroup.setRunDate(this.RUN_DATE); + if (this.PLATFORM_MODEL != null) rgroup.setPlatformModel(this.PLATFORM_MODEL); + if (this.PROGRAM_GROUP != null) rgroup.setProgramGroup(this.PROGRAM_GROUP); + + final SAMFileHeader header = new SAMFileHeader(); + header.addReadGroup(rgroup); + + for (final String comment : COMMENT) { + header.addComment(comment); + } + + header.setSortOrder(this.SORT_ORDER); + return header ; + } + + /** Based on the type of quality scores coming in, converts them to a numeric byte[] in phred scale. */ + static void convertQuality(final byte[] quals, final FastqQualityFormat version) { + switch (version) { + case Standard: + SAMUtils.fastqToPhred(quals); + break ; + case Solexa: + solexaQualityConverter.convertSolexaQualityCharsToPhredBinary(quals); + break ; + case Illumina: + solexaQualityConverter.convertSolexa_1_3_QualityCharsToPhredBinary(quals); + break ; + } + } + + /** Returns read baseName and asserts correct pair read name format: + *
    + *
  • Paired reads must either have the exact same read names or they must contain at least one "/" + *
  • and the First pair read name must end with "/1" and second pair read name ends with "/2" + *
  • The baseName (read name part before the /) must be the same for both read names + *
  • If the read names are exactly the same but end in "/2" or "/1" then an exception will be thrown + *
+ */ + String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) { + String [] toks = getReadNameTokens(readName1, 1, freader1); + final String baseName1 = toks[0] ; + final String num1 = toks[1] ; + + toks = getReadNameTokens(readName2, 2, freader2); + final String baseName2 = toks[0] ; + final String num2 = toks[1]; + + if (!baseName1.equals(baseName2)) { + throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2)); + } + + final boolean num1Blank = StringUtil.isBlank(num1); + final boolean num2Blank = StringUtil.isBlank(num2); + if (num1Blank || num2Blank) { + if(!num1Blank) throw new PicardException(error(freader1,"Pair 1 number is missing (" +readName1+ "). Both pair numbers must be present or neither.")); //num1 != blank and num2 == blank + else if(!num2Blank) throw new PicardException(error(freader2, "Pair 2 number is missing (" +readName2+ "). Both pair numbers must be present or neither.")); //num1 == blank and num =2 != blank + } else { + if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 ("+readName1+")")); + if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 ("+readName2+")")); + } + + return baseName1 ; + } + + /** Breaks up read name into baseName and number separated by the last / */ + private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) { + if(readName.equals("")) throw new PicardException(error(freader,"Pair read name "+pairNum+" cannot be empty: "+readName)); + + final int idx = readName.lastIndexOf('/'); + final String[] result = new String[2]; + + if (idx == -1) { + result[0] = readName; + result[1] = null; + } else { + result[1] = readName.substring(idx + 1); // should be a 1 or 2 + + if(!result[1].equals("1") && !result[1].equals("2")) { //if not a 1 or 2 then names must be identical 
+ result[0] = readName; + result[1] = null; + } + else { + result[0] = readName.substring(0,idx); // baseName + } + } + + return result ; + } + + /** Little utility to give error messages corresponding to line numbers in the input files. */ + private String error(final FastqReader freader, final String str) { + return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath(); + } + + @Override + protected String[] customCommandLineValidation() { + if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"}; + if (MAX_Q > SAMUtils.MAX_PHRED_SCORE) return new String[]{"MAX_Q must be <= " + SAMUtils.MAX_PHRED_SCORE}; + return null; + } +} diff --git a/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java new file mode 100644 index 000000000..95644f450 --- /dev/null +++ b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java @@ -0,0 +1,627 @@ +package org.qcmg.qmule; +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +import htsjdk.samtools.*; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.fastq.FastqWriter; +import htsjdk.samtools.fastq.FastqWriterFactory; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.io.File; +import java.util.*; + +/** + *

Extracts read sequences and qualities from the input SAM/BAM/CRAM file and writes them into + * the output file in Sanger FASTQ format. + * See MAQ FASTQ specification for details. + * This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently. + *

In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, + * the read's sequence from the input SAM file will be reverse-complemented prior to writing it to FASTQ in order to correctly restore + * the original read sequence as it was generated by the sequencer. + *
+ *

Usage example:

+ *
+ * java -jar picard.jar SamToFastqWithHeaders \
+ *     I=input.bam \
+ *     FASTQ=output.fastq
+ * 
+ *
+ */ +@CommandLineProgramProperties( + summary = SamToFastqWithHeaders.USAGE_SUMMARY + SamToFastqWithHeaders.USAGE_DETAILS, + oneLineSummary = SamToFastqWithHeaders.USAGE_SUMMARY, + programGroup = ReadDataManipulationProgramGroup.class) +@DocumentedFeature +public class SamToFastqWithHeaders extends CommandLineProgram { + static final String USAGE_SUMMARY = "Converts a SAM/BAM/CRAM file to FASTQ."; + static final String USAGE_DETAILS = " Extracts read sequences and qualities from the input SAM/BAM/CRAM file and writes them into" + + "the output file in Sanger FASTQ format." + + "See MAQ FASTQ specification for details." + + "This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently.

" + + "

In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome," + + "the read's sequence from input sam file will be reverse-complemented prior to writing it to FASTQ in order restore correctly" + + "the original read sequence as it was generated by the sequencer.

" + + "
" + + "

Usage example:

" + + "
" +
+            "java -jar picard.jar SamToFastqWithHeaders 
" + + " I=input.bam
" + + " FASTQ=output.fastq" + + "
" + + "
"; + @Argument(doc = "Input SAM/BAM/CRAM file to extract reads from", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) + public File INPUT; + + @Argument(shortName = "F", doc = "Output FASTQ file (single-end fastq or, if paired, first end of the pair FASTQ).", + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG", "OUTPUT_DIR"}) + public File FASTQ; + + @Argument(shortName = "F2", doc = "Output FASTQ file (if paired, second end of the pair FASTQ).", optional = true, + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) + public File SECOND_END_FASTQ; + + @Argument(shortName = "FU", doc = "Output FASTQ file for unpaired reads; may only be provided in paired-FASTQ mode", optional = true, + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) + public File UNPAIRED_FASTQ; + + @Argument(shortName = "OPRG", doc = "Output a FASTQ file per read group (two FASTQ files per read group if the group is paired).", + optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) + public boolean OUTPUT_PER_RG; + + @Argument(shortName = "GZOPRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.", + mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) + public Boolean COMPRESS_OUTPUTS_PER_RG = false; + + @Argument(shortName = "RGT", doc = "The read group tag (PU or ID) to be used to output a FASTQ file per read group.") + public String RG_TAG = "PU"; + + @Argument(shortName = "ODIR", doc = "Directory in which to output the FASTQ file(s). 
Used only when OUTPUT_PER_RG is true.", + optional = true) + public File OUTPUT_DIR; + + @Argument(shortName = "RC", doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to FASTQ", + optional = true) + public boolean RE_REVERSE = true; + + @Argument(shortName = "INTER", doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from") + public boolean INTERLEAVE = false; + + @Argument(shortName = "NON_PF", doc = "Include non-PF reads from the SAM file into the output " + + "FASTQ files. PF means 'passes filtering'. Reads whose 'not passing quality controls' " + + "flag is set are non-PF reads. See GATK Dictionary for more info.") + public boolean INCLUDE_NON_PF_READS = false; + + @Argument(shortName = "CLIP_ATTR", doc = "The attribute that stores the position at which " + + "the SAM record should be clipped", optional = true) + public String CLIPPING_ATTRIBUTE; + + @Argument(shortName = "CLIP_ACT", doc = "The action that should be taken with clipped reads: " + + "'X' means the reads and qualities should be trimmed at the clipped position; " + + "'N' means the bases should be changed to Ns in the clipped region; and any " + + "integer means that the base qualities should be set to that value in the " + + "clipped region.", optional = true) + public String CLIPPING_ACTION; + + @Argument(shortName = "CLIP_MIN", doc = "When performing clipping with the CLIPPING_ATTRIBUTE and CLIPPING_ACTION " + + "parameters, ensure that the resulting reads after clipping are at least CLIPPING_MIN_LENGTH bases long. 
" + + "If the original read is shorter than CLIPPING_MIN_LENGTH then the original read length will be maintained.") + public int CLIPPING_MIN_LENGTH = 0; + + @Argument(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.") + public int READ1_TRIM = 0; + + @Argument(shortName = "R1_MAX_BASES", doc = "The maximum number of bases to write from read 1 after trimming. " + + "If there are fewer than this many bases left after trimming, all will be written. If this " + + "value is null then all bases left after trimming will be written.", optional = true) + public Integer READ1_MAX_BASES_TO_WRITE; + + @Argument(shortName = "R2_TRIM", doc = "The number of bases to trim from the beginning of read 2.") + public int READ2_TRIM = 0; + + @Argument(shortName = "R2_MAX_BASES", doc = "The maximum number of bases to write from read 2 after trimming. " + + "If there are fewer than this many bases left after trimming, all will be written. If this " + + "value is null then all bases left after trimming will be written.", optional = true) + public Integer READ2_MAX_BASES_TO_WRITE; + + @Argument(shortName = "Q", doc = "End-trim reads using the phred/bwa quality trimming algorithm and this quality.", optional = true) + public Integer QUALITY; + + @Argument(doc = "If true, include non-primary alignments in the output. 
Support of non-primary alignments in SamToFastq " + + "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.") + public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS = false; + + private static final String CLIP_TRIM = "X"; + private static final String CLIP_TO_N = "N"; + + private static final short ZH_ATTRIBUTE = SAMTag.makeBinaryTag("ZH"); + private static final short ZT_ATTRIBUTE = SAMTag.makeBinaryTag("ZT"); + private static final short OQ_ATTRIBUTE = SAMTag.makeBinaryTag("OQ"); + + private static final String[] EMPTY_STRING_ARRAY = {}; + + private final Log log = Log.getInstance(SamToFastqWithHeaders.class); + + protected int doWork() { + IOUtil.assertFileIsReadable(INPUT); + final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); + final Map firstSeenMates = new HashMap<>(); + final FastqWriterFactory factory = new FastqWriterFactory(); + factory.setCreateMd5(CREATE_MD5_FILE); + + initializeAdditionalWriters(); + final Map writers = generateWriters(reader.getFileHeader().getReadGroups(), + factory); + final Map> additionalWriters = generateAdditionalWriters(reader.getFileHeader().getReadGroups(), factory); + if (writers.isEmpty()) { + final String msgBase = INPUT + " does not contain Read Groups"; + final String msg = OUTPUT_PER_RG ? msgBase + ", consider not using the OUTPUT_PER_RG option" : msgBase; + throw new PicardException(msg); + } + + final ProgressLogger progress = new ProgressLogger(log); + + for (final SAMRecord currentRecord : reader) { + handleRecord(currentRecord, writers, additionalWriters, firstSeenMates); + progress.record(currentRecord); + } + + CloserUtil.close(reader); + + // Close all the fastq writers being careful to close each one only once! 
+ for (final FastqWriters writerMapping : new HashSet<>(writers.values())) { + writerMapping.closeAll(); + } + + // close all `additionalWriters` only once + final Set additionalWriterSet = new HashSet<>(); + additionalWriters.values().forEach(additionalWriterSet::addAll); + for (final FastqWriter fastqWriter : additionalWriterSet) { + fastqWriter.close(); + } + + if (!firstSeenMates.isEmpty()) { + SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.MATE_NOT_FOUND, + "Found " + firstSeenMates.size() + " unpaired mates", null), VALIDATION_STRINGENCY); + } + + return 0; + } + + /** + * Generates the writers for the given read groups or, if we are not emitting per-read-group, just returns the single set of writers. + */ + private Map generateWriters(List samReadGroupRecords, + FastqWriterFactory factory) { + + final Map writerMap = new HashMap<>(); + + final FastqWriters fastqWriters; + if (!OUTPUT_PER_RG) { + IOUtil.assertFileIsWritable(FASTQ); + final FastqWriter firstOfPairWriter = factory.newWriter(FASTQ); + + final FastqWriter secondOfPairWriter; + if (INTERLEAVE) { + secondOfPairWriter = firstOfPairWriter; + } else if (SECOND_END_FASTQ != null) { + IOUtil.assertFileIsWritable(SECOND_END_FASTQ); + secondOfPairWriter = factory.newWriter(SECOND_END_FASTQ); + } else { + secondOfPairWriter = null; + } + + /* Prepare the writer that will accept unpaired reads. If we're emitting a single fastq - and assuming single-ended reads - + * then this is simply that one fastq writer. Otherwise, if we're doing paired-end, we emit to a third new writer, since + * the other two fastqs are accepting only paired end reads. */ + final FastqWriter unpairedWriter = UNPAIRED_FASTQ == null ? firstOfPairWriter : factory.newWriter(UNPAIRED_FASTQ); + + fastqWriters = new FastqWriters(firstOfPairWriter, secondOfPairWriter, unpairedWriter); + + // For all read groups we may find in the sam, register this single set of writers for them. 
+ writerMap.put(null, fastqWriters); + for (final SAMReadGroupRecord rg : samReadGroupRecords) { + writerMap.put(rg, fastqWriters); + } + } else { + // When we're creating a fastq-group per readgroup, by convention we do not emit a special fastq for unpaired reads. + for (final SAMReadGroupRecord rg : samReadGroupRecords) { + final FastqWriter firstOfPairWriter = factory.newWriter(makeReadGroupFile(rg, "_1")); + // Create this writer on-the-fly; if we find no second-of-pair reads, don't bother making a writer (or delegating, + // if we're interleaving). + final Lazy lazySecondOfPairWriter = new Lazy<>(() -> INTERLEAVE ? firstOfPairWriter : factory.newWriter(makeReadGroupFile(rg, "_2"))); + + writerMap.put(rg, new FastqWriters(firstOfPairWriter, lazySecondOfPairWriter, firstOfPairWriter)); + } + } + return writerMap; + } + + protected void initializeAdditionalWriters() { + } + + protected Map> generateAdditionalWriters(List readGroups, + FastqWriterFactory factory) { + return Collections.emptyMap(); + } + + private void handleRecord(final SAMRecord currentRecord, final Map writers, + final Map> additionalWriters, + final Map firstSeenMates) { + if (currentRecord.isSecondaryOrSupplementary() && !INCLUDE_NON_PRIMARY_ALIGNMENTS) { + return; + } + + // Skip non-PF reads as necessary + if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) { + return; + } + + final FastqWriters fq = writers.get(currentRecord.getReadGroup()); + SAMRecord read1 = null; + SAMRecord read2 = null; + if (currentRecord.getReadPairedFlag()) { + final String currentReadName = currentRecord.getReadName(); + final SAMRecord firstRecord = firstSeenMates.remove(currentReadName); + if (firstRecord == null) { + firstSeenMates.put(currentReadName, currentRecord); + } else { + assertPairedMates(firstRecord, currentRecord); + + read1 = currentRecord.getFirstOfPairFlag() ? currentRecord : firstRecord; + read2 = currentRecord.getFirstOfPairFlag() ? 
firstRecord : currentRecord; + writeRecord(read1, 1, fq.getFirstOfPair(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); + final FastqWriter secondOfPairWriter = fq.getSecondOfPair(); + if (secondOfPairWriter == null) { + throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified."); + } + writeRecord(read2, 2, secondOfPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE); + } + } else { + writeRecord(currentRecord, null, fq.getUnpaired(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); + } + + handleAdditionalRecords(currentRecord, additionalWriters, read1, read2); + } + + protected void handleAdditionalRecords(SAMRecord currentRecord, Map> additionalWriters, SAMRecord read1, SAMRecord read2) { + } + + private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String preExtSuffix) { + String fileName = null; + if (RG_TAG.equalsIgnoreCase("PU")) { + fileName = readGroup.getPlatformUnit(); + } else if (RG_TAG.equalsIgnoreCase("ID")) { + fileName = readGroup.getReadGroupId(); + } + if (fileName == null) { + throw new PicardException("The selected RG_TAG: " + RG_TAG + " is not present in the header."); + } + fileName = IOUtil.makeFileNameSafe(fileName); + if (preExtSuffix != null) { + fileName += preExtSuffix; + } + fileName += COMPRESS_OUTPUTS_PER_RG ? ".fastq.gz" : ".fastq"; + + final File result = (OUTPUT_DIR != null) + ? 
new File(OUTPUT_DIR, fileName) + : new File(fileName); + IOUtil.assertFileIsWritable(result); + return result; + } + + public static int getPositionOfCaseChange(String s) { + if (null != s && s.length() > 0) { + boolean isLower = Character.isLowerCase(s.charAt(0)); + for (int i = 1, len = s.length(); i < len; i++) { + if (Character.isLowerCase(s.charAt(i)) != isLower) { + return i; + } + } + /* + If we are here, then there has been no case change in the string + Return 0 which will convey to the user that the case changed at the first character in the string + */ + return 0; + } + return -1; + } + + public static String[] getTrimmedBaseDetailsFromTag(String tag) { + if (null != tag && tag.length() > 0) { + String [] results = new String[4]; + int plusIndex = tag.indexOf('+'); + + String bases = tag.substring(0, plusIndex); + String quals = tag.substring(plusIndex + 1); + int caseChangeIndex = getPositionOfCaseChange(bases); + if (caseChangeIndex > 0) { + String bases1 = bases.substring(0, caseChangeIndex); + String bases2 = bases.substring(caseChangeIndex); + if (Character.isLowerCase(bases1.charAt(0))) { + results[0] = bases1; + results[1] = quals.substring(0, caseChangeIndex); + results[2] = bases2; + results[3] = quals.substring(caseChangeIndex); + } else { + results[0] = bases2; + results[1] = quals.substring(caseChangeIndex); + results[2] = bases1; + results[3] = quals.substring(0, caseChangeIndex); + } + } else { + boolean isLower = Character.isLowerCase(bases.charAt(0)); + results[0] = isLower? bases : ""; + results[1] = isLower ? quals : ""; + results[2] = isLower ? "" : bases; + results[3] = isLower ? 
"" : quals; + } + return results; + } + return EMPTY_STRING_ARRAY; + } + + + + public static FastqRecord getFastqRecordFromSamRecord(final SAMRecord read, final Integer mateNumber, final int basesToTrim, + final Integer maxBasesToWrite, final String clippingAttribute, + int clippingMinLength, final String clippingAction, boolean reReverse, final Integer quality) { + /* + get ZH tag and add to header + */ + String seqHeader = read.getReadName(); + String additionalHeader = (String)read.getAttribute(ZH_ATTRIBUTE); + String trimmedAdapterSequenceAndQual = (String)read.getAttribute(ZT_ATTRIBUTE); + if (null != additionalHeader && additionalHeader.length() > 0) { + seqHeader += additionalHeader; + } else if (null != mateNumber){ + seqHeader += "/" + mateNumber; + } + + String readString = read.getReadString(); + String origBaseQuals = (String)read.getAttribute(OQ_ATTRIBUTE); + String baseQualities = (null != origBaseQuals && origBaseQuals.length() == readString.length()) ? origBaseQuals : read.getBaseQualityString(); + + + if (reReverse && read.getReadNegativeStrandFlag()) { + readString = SequenceUtil.reverseComplement(readString); + baseQualities = StringUtil.reverseString(baseQualities); + } + + if (null != trimmedAdapterSequenceAndQual && trimmedAdapterSequenceAndQual.length() > 0) { + String [] adapterInfo = getTrimmedBaseDetailsFromTag(trimmedAdapterSequenceAndQual); + if (null != adapterInfo && adapterInfo.length == 4) { + if (adapterInfo[0].length() > 0) { + readString = adapterInfo[0].toUpperCase() + readString; + baseQualities = adapterInfo[1] + baseQualities; + } + if (adapterInfo[2].length() > 0) { + readString = readString + adapterInfo[2].toUpperCase(); + baseQualities = baseQualities + adapterInfo[3]; + } + } + } + + // If we're clipping, do the right thing to the bases or qualities + if (clippingAttribute != null) { + Integer clipPoint = (Integer) read.getAttribute(clippingAttribute); + if (clipPoint != null && clipPoint < clippingMinLength) { + 
clipPoint = Math.min(readString.length(), clippingMinLength); + } + + if (clipPoint != null) { + if (clippingAction.equalsIgnoreCase(CLIP_TRIM)) { + readString = clip(readString, clipPoint, null, !read.getReadNegativeStrandFlag()); + baseQualities = clip(baseQualities, clipPoint, null, !read.getReadNegativeStrandFlag()); + } else if (clippingAction.equalsIgnoreCase(CLIP_TO_N)) { + readString = clip(readString, clipPoint, CLIP_TO_N.charAt(0), !read.getReadNegativeStrandFlag()); + } else { + final char newQual = SAMUtils.phredToFastq(new byte[]{(byte) Integer.parseInt(clippingAction)}).charAt(0); + baseQualities = clip(baseQualities, clipPoint, newQual, !read.getReadNegativeStrandFlag()); + } + } + } + + if (basesToTrim > 0) { + readString = readString.substring(basesToTrim); + baseQualities = baseQualities.substring(basesToTrim); + } + + // Perform quality trimming if desired, making sure to leave at least one base! + if (quality != null) { + final byte[] quals = SAMUtils.fastqToPhred(baseQualities); + final int qualityTrimIndex = Math.max(1, TrimmingUtil.findQualityTrimPoint(quals, quality)); + if (qualityTrimIndex < quals.length) { + readString = readString.substring(0, qualityTrimIndex); + baseQualities = baseQualities.substring(0, qualityTrimIndex); + } + } + + if (maxBasesToWrite != null && maxBasesToWrite < readString.length()) { + readString = readString.substring(0, maxBasesToWrite); + baseQualities = baseQualities.substring(0, maxBasesToWrite); + } + + return new FastqRecord(seqHeader, readString, "", baseQualities); + } + private void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer, + final int basesToTrim, final Integer maxBasesToWrite) { + + + writer.write(getFastqRecordFromSamRecord(read, mateNumber, basesToTrim, maxBasesToWrite, CLIPPING_ATTRIBUTE, CLIPPING_MIN_LENGTH, CLIPPING_ACTION, RE_REVERSE, QUALITY)); + } + + /** + * Utility method to handle the changes required to the base/quality strings by the clipping + * 
parameters. + * + * @param src The string to clip + * @param point The 1-based position of the first clipped base in the read + * @param replacement If non-null, the character to replace in the clipped positions + * in the string (a quality score or 'N'). If null, just trim src + * @param posStrand Whether the read is on the positive strand + * @return String The clipped read or qualities + */ + private static String clip(final String src, final int point, final Character replacement, final boolean posStrand) { + final int len = src.length(); + StringBuilder result = new StringBuilder(posStrand ? src.substring(0, point - 1) : src.substring(len - point + 1)); + if (replacement != null) { + if (posStrand) { + for (int i = point; i <= len; i++) { + result.append(replacement); + } + } else { + for (int i = 0; i <= len - point; i++) { + result.insert(0, replacement); + } + } + } + return result.toString(); + } + + protected static void assertPairedMates(final SAMRecord record1, final SAMRecord record2) { + if (!(record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() || + record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag())) { + throw new PicardException("Illegal mate state: " + record1.getReadName()); + } + } + + /** + * Put any custom command-line validation in an override of this method. + * clp is initialized at this point and can be used to print usage and access argv. + * Any options set by command-line parser can be validated. + * + * @return null if command line is valid. If command line is invalid, returns an array of error + * messages to be written to the appropriate place. 
+ */ + protected String[] customCommandLineValidation() { + + List errors = new ArrayList<>(); + + if (INTERLEAVE && SECOND_END_FASTQ != null) { + errors.add("Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ"); + } + + if (UNPAIRED_FASTQ != null && SECOND_END_FASTQ == null) { + errors.add("UNPAIRED_FASTQ may only be set when also emitting read1 and read2 fastqs (so SECOND_END_FASTQ must also be set)."); + } + + if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) || + (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) { + errors.add("Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set."); + } + + if (CLIPPING_ACTION != null) { + if (!CLIPPING_ACTION.equals(CLIP_TO_N) && !CLIPPING_ACTION.equals(CLIP_TRIM)) { + try { + Integer.parseInt(CLIPPING_ACTION); + } catch (NumberFormatException nfe) { + errors.add("CLIPPING ACTION must be one of: N, X, or an integer"); + } + } + } + + if ((OUTPUT_PER_RG && OUTPUT_DIR == null) || ((!OUTPUT_PER_RG) && OUTPUT_DIR != null)) { + errors.add("If OUTPUT_PER_RG is true, then OUTPUT_DIR should be set. If "); + + } + + if (OUTPUT_PER_RG) { + if (RG_TAG == null) { + errors.add("If OUTPUT_PER_RG is true, then RG_TAG should be set."); + } else if (!(RG_TAG.equalsIgnoreCase("PU") || RG_TAG.equalsIgnoreCase("ID"))) { + errors.add("RG_TAG must be: PU or ID"); + } + } + + return errors.isEmpty() ? super.customCommandLineValidation() : errors.toArray(new String[0]); + } + + /** + * A collection of {@link htsjdk.samtools.fastq.FastqWriter}s for particular types of reads. + *

+ * Allows for lazy construction of the second-of-pair writer, since when we are in the "output per read group mode", we only wish to + * generate a second-of-pair fastq if we encounter a second-of-pair read. + */ + private static final class FastqWriters { + private final FastqWriter firstOfPair, unpaired; + private final Lazy secondOfPair; + + /** + * Constructor if the consumer wishes for the second-of-pair writer to be built on-the-fly. + */ + private FastqWriters(final FastqWriter firstOfPair, final Lazy secondOfPair, final FastqWriter unpaired) { + this.firstOfPair = firstOfPair; + this.unpaired = unpaired; + this.secondOfPair = secondOfPair; + } + + /** + * Simple constructor; all writers are pre-initialized.. + */ + private FastqWriters(final FastqWriter firstOfPair, final FastqWriter secondOfPair, final FastqWriter unpaired) { + this(firstOfPair, new Lazy<>(() -> secondOfPair), unpaired); + } + + private FastqWriter getFirstOfPair() { + return firstOfPair; + } + + private FastqWriter getSecondOfPair() { + return secondOfPair.get(); + } + + private FastqWriter getUnpaired() { + return unpaired; + } + + private void closeAll() { + final Set fastqWriters = new HashSet<>(); + fastqWriters.add(firstOfPair); + fastqWriters.add(unpaired); + // Make sure this is a no-op if the second writer was never fetched. 
+ if (secondOfPair.isInitialized()) { + fastqWriters.add(secondOfPair.get()); + } + for (final FastqWriter fastqWriter : fastqWriters) { + fastqWriter.close(); + } + } + } + + public static void main(final String[] argv) { + + + int exitStatus = new SamToFastqWithHeaders().instanceMain(argv); + + System.exit(exitStatus); + } +} \ No newline at end of file diff --git a/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java new file mode 100644 index 000000000..a19e74b63 --- /dev/null +++ b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java @@ -0,0 +1,265 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.fastq.FastqReader; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.util.FastqQualityFormat; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import picard.PicardException; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static htsjdk.samtools.SAMUtils.MAX_PHRED_SCORE; +import static org.junit.Assert.*; + +public class FastqToSamWithHeadersTest { + + + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + + private static final FastqToSamWithHeaders fastqToSam = new FastqToSamWithHeaders(); + private static File newTempFile(final String filename) throws IOException { + return newTempFile(filename, ".tmp"); + } + private static File newTempFile(final String filename, String suffix) throws IOException { + final File file = File.createTempFile(filename, suffix); + file.deleteOnExit(); + return file; + } + + private static FastqReader freader1; + private static FastqReader freader2; + + static { + try { + freader1 = new 
FastqReader(newTempFile("dummyFile")); + freader2 = new FastqReader(newTempFile("dummyFile")); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Test + public void createSAMRecordNoAdditionalHeader() { + FastqRecord fqRec = new FastqRecord("basename","ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + + samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, false, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertFalse(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void createSAMRecordAdditionalHeaderNoTrimming() { + String basename = "basename"; + String pairing = "/1"; + String additionalHeader = " 1.2.ACGTTGCA"; + FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader, "ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals(basename, samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertEquals(pairing + additionalHeader, 
samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void createSAMRecordAdditionalHeader() { + String basename = "basename"; + String pairing = "/1"; + String additionalHeader = " 1.2.ACGTTGCA"; + String trimmedBases = "aaaaAAAA+????????"; + FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader + " TB:" + trimmedBases, "ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals(basename, samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertEquals(trimmedBases, samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertEquals(pairing + additionalHeader, samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void readPairNameOk() { + assertEquals("aa", fastqToSam.getBaseName("aa/1", "aa/2" , freader1, freader2)); + assertEquals("aa", fastqToSam.getBaseName("aa", "aa", freader1, freader2)); + assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb", "aa/bb" , freader1, freader2)); + assertEquals("aa/bb/", fastqToSam.getBaseName("aa/bb/", "aa/bb/" , freader1, freader2)); + assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb/1", "aa/bb/2" , freader1, freader2)); + assertEquals("aa/bb/cc/dd/ee/ff", fastqToSam.getBaseName("aa/bb/cc/dd/ee/ff/1", "aa/bb/cc/dd/ee/ff/2" , freader1, freader2)); + assertEquals("///", fastqToSam.getBaseName("////1", "////2" , freader1, freader2)); + assertEquals("/", fastqToSam.getBaseName("/", "/" , freader1, freader2)); + assertEquals("////", fastqToSam.getBaseName("////", "////", freader1, freader2)); + assertEquals("/aa", fastqToSam.getBaseName("/aa", "/aa" , freader1, freader2)); + assertEquals("aa/", fastqToSam.getBaseName("aa/", "aa/" , freader1, freader2)); + assertEquals("ab/c", fastqToSam.getBaseName("ab/c", "ab/c", 
freader1, freader2)); + } + + @Test + public void readPairNamesBad() { + try { + fastqToSam.getBaseName("", "" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "bb/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa", "bb" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "aa" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa", "aa/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "aa/1" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/2", "aa/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + } + + @Test + public void testSequentialFiles() throws Exception { + File singleEnd = testFolder.newFile("single_end_R1_001.fastq"); + File singleEnd2 = testFolder.newFile("single_end_R1_002.fastq"); + File pairedEnd1 = testFolder.newFile("paired_end_R1_001.fastq"); + File pairedEnd12 = testFolder.newFile("paired_end_R1_002.fastq"); + File pairedEnd2 = testFolder.newFile("paired_end_R2_001.fastq"); + File pairedEnd22 = testFolder.newFile("paired_end_R2_002.fastq"); + + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(singleEnd).size()); + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd1).size()); + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd2).size()); + + populateFile(Arrays.asList(singleEnd, singleEnd2, pairedEnd1, pairedEnd12, pairedEnd2, 
pairedEnd22), Arrays.asList("@FAKE0001 Original version has PHRED scores from 93 to 0 inclusive (in that order)", + "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG", + "+", + "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~")); + + File singleEnd1Output = testFolder.newFile("singleEnd1.sam"); + File singleEnd2Output = testFolder.newFile("singleEnd2.sam"); + File pairedEnd1Output = testFolder.newFile("pairedEnd1.sam"); + File pairedEnd2Output = testFolder.newFile("pairedEnd2.sam"); + convertFileAndVerifyRecordCount(1, singleEnd, null, singleEnd1Output, FastqQualityFormat.Illumina, true, false); + convertFileAndVerifyRecordCount(2, singleEnd, null, singleEnd2Output,FastqQualityFormat.Illumina, true, true); + convertFileAndVerifyRecordCount(2, pairedEnd1, pairedEnd2, pairedEnd1Output, FastqQualityFormat.Illumina, true, false); + convertFileAndVerifyRecordCount(4, pairedEnd1, pairedEnd2, pairedEnd2Output, FastqQualityFormat.Illumina, true, true); + } + + private void populateFile(List files, List data) { + for (File f : files) { + try (FileWriter fw = new FileWriter(f)) { + for (String s : data) { + fw.write(s + "\n"); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + @Test + public void testEmptyFastq() throws IOException { + final File emptyFastq = testFolder.newFile("empty.fastq"); + final File emptyFastqSam = testFolder.newFile("empty.fastq.sam"); + convertFile(emptyFastq, null, emptyFastqSam, FastqQualityFormat.Illumina, false, false, false, false); + } + + private void convertFile(final File fastq1, + final File fastq2, + final File outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs, + final boolean allowEmptyFastq) throws IOException { + convertFile(fastq1, fastq2, outputFile, version, permissiveFormat, useSequentialFastqs, allowEmptyFastq, true); + } + private void convertFile(final File fastq1, + final File fastq2, + final File 
outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs, + final boolean allowEmptyFastq, + final boolean expectSuccess) throws IOException { + + final List args = new ArrayList<>(); + + args.add("FASTQ=" + fastq1.getAbsolutePath()); + args.add("OUTPUT=" + outputFile.getAbsolutePath()); + args.add("QUALITY_FORMAT=" + version); + args.add("READ_GROUP_NAME=rg"); + args.add("SAMPLE_NAME=s1"); + + if (fastq2 != null) args.add("FASTQ2=" + fastq2.getAbsolutePath()); + if (permissiveFormat) args.add("ALLOW_AND_IGNORE_EMPTY_LINES=true"); + if (useSequentialFastqs) args.add("USE_SEQUENTIAL_FASTQS=true"); + if (allowEmptyFastq) args.add("ALLOW_EMPTY_FASTQ=true"); + int exitStatus = 1; + try { + exitStatus = new FastqToSamWithHeaders().instanceMain(args.toArray(new String[]{})); + if ( ! expectSuccess) { + Assert.fail("Should have thrown a PicardException"); + } + } catch (Exception ignored) {} + assertEquals(expectSuccess ? 0 : 1, exitStatus); + } + + private void convertFileAndVerifyRecordCount(final int expectedCount, + final File fastqFilename1, + final File fastqFilename2, + final File outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs) throws IOException { + + convertFile(fastqFilename1, fastqFilename2, outputFile, version, permissiveFormat, useSequentialFastqs, false); + final SamReader samReader = SamReaderFactory.makeDefault().open(outputFile); + final SAMRecordIterator iterator = samReader.iterator(); + int actualCount = 0; + while (iterator.hasNext()) { + iterator.next(); + actualCount++; + } + samReader.close(); + Assert.assertEquals(expectedCount, actualCount); + } + + @Test + public void runWithNoArgs() { + int exitStatus = new FastqToSamWithHeaders().instanceMain(new String[]{}); + assertEquals(1, exitStatus); + } +} diff --git a/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java 
// File: qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java (new file)
package org.qcmg.qmule;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.IOUtil;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import picard.PicardException;

import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.stream.Collectors;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.qcmg.qmule.SamToFastqWithHeaders.getTrimmedBaseDetailsFromTag;

/**
 * Tests for {@link SamToFastqWithHeaders}.
 * <p>
 * The first group of tests exercises the static helpers directly on in-memory
 * {@link SAMRecord}s. The second group writes small SAM files into a temporary
 * folder, runs the converter through {@code instanceMain} and inspects the
 * FASTQ files that come out.
 * <p>
 * SAM is a tab-delimited format, so every fixture line is assembled by joining
 * its fields with an explicit {@code \t}; the fixtures therefore do not depend
 * on invisible whitespace inside string literals.
 */
public class SamToFastqWithHeadersTest {

    /** SAM flag used by the fixtures for the first mate (value taken from the original fixture lines). */
    private static final int FIRST_MATE_FLAG = 77;
    /** SAM flag used by the fixtures for the second mate (value taken from the original fixture lines). */
    private static final int SECOND_MATE_FLAG = 141;

    /** Length of an untrimmed fixture read. */
    private static final int FIXTURE_READ_LENGTH = 13;

    private static final String MATE1_BASES = repeat('A', FIXTURE_READ_LENGTH);
    private static final String MATE1_QUALS = repeat('1', FIXTURE_READ_LENGTH);
    private static final String MATE2_BASES = repeat('C', FIXTURE_READ_LENGTH);
    private static final String MATE2_QUALS = repeat('2', FIXTURE_READ_LENGTH);

    private static final String HD_UNSORTED = samLine("@HD", "VN:1.0", "SO:unsorted");
    private static final String HD_QUERYNAME = samLine("@HD", "VN:1.0", "SO:queryname");
    private static final String RG1_HEADER = readGroupLine("rg1", "s1", "rg1");
    private static final String RG2_HEADER = readGroupLine("rg2", "s2", "rg2");

    /** Optional tags carried by every rg1 record in the barcode-bearing fixtures. */
    private static final String[] RG1_TAGS = {"RG:Z:rg1", "CR:Z:AAAAA", "UR:Z:TTT", "CY:Z:11111", "UY:Z:222"};
    /** Optional tags carried by every rg2 record in the barcode-bearing fixtures. */
    private static final String[] RG2_TAGS = {"RG:Z:rg2", "CR:Z:CCCCC", "UR:Z:GGG", "CY:Z:22222", "UY:Z:111"};

    /** Bases of the forward-strand record shared by the in-memory conversion tests. */
    private static final String FORWARD_BASES =
            "AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT";
    /** Base qualities of the forward-strand record shared by the in-memory conversion tests. */
    private static final String FORWARD_QUALS =
            "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????";

    /** Selects the FASTQ files in an output directory. */
    private static final FilenameFilter FASTQ_FILES = (dir, name) -> name.endsWith(".fastq");

    @Rule
    public TemporaryFolder testFolder = new TemporaryFolder();

    // ------------------------------------------------------------------
    // Static helper tests
    // ------------------------------------------------------------------

    @Test
    public void getPositionOfCaseChange() {
        assertEquals(-1, SamToFastqWithHeaders.getPositionOfCaseChange(null));
        assertEquals(-1, SamToFastqWithHeaders.getPositionOfCaseChange(""));
        assertEquals(0, SamToFastqWithHeaders.getPositionOfCaseChange("ACBD"));
        assertEquals(0, SamToFastqWithHeaders.getPositionOfCaseChange("abcd"));
        assertEquals(4, SamToFastqWithHeaders.getPositionOfCaseChange("abcdABCD"));
        assertEquals(4, SamToFastqWithHeaders.getPositionOfCaseChange("ABCDxyz"));
        assertEquals(1, SamToFastqWithHeaders.getPositionOfCaseChange("xYZ"));
        assertEquals(1, SamToFastqWithHeaders.getPositionOfCaseChange("Xyz"));
        assertEquals(2, SamToFastqWithHeaders.getPositionOfCaseChange("xyZ"));
        assertEquals(2, SamToFastqWithHeaders.getPositionOfCaseChange("XYz"));
    }

    @Test
    public void getTrimmedBasesFromTag() {
        assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag(null));
        assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag(""));
        assertArrayEquals(new String[]{"", "", "ABCD", "!!!!"}, getTrimmedBaseDetailsFromTag("ABCD+!!!!"));
        assertArrayEquals(new String[]{"abcd", "!!!!", "", ""}, getTrimmedBaseDetailsFromTag("abcd+!!!!"));
        assertArrayEquals(new String[]{"abcd", "!!!!", "XYZ", "%%%"}, getTrimmedBaseDetailsFromTag("abcdXYZ+!!!!%%%"));
        assertArrayEquals(new String[]{"xyz", "%%%", "ABCD", "!!!!"}, getTrimmedBaseDetailsFromTag("ABCDxyz+!!!!%%%"));
    }

    @Test
    public void getFastqFromSam() {
        final SAMRecord record = newAlignedRecord("ERR194147.1758538", FORWARD_BASES, FORWARD_QUALS, 99, "97M", 330);
        record.setAttribute("OQ", FORWARD_QUALS);
        record.setAttribute("ZH", "/1 ACGTACGT");
        record.setAttribute("ZT", "GCGA+???'");

        final FastqRecord fq = toFastq(record);
        assertEquals("ERR194147.1758538/1 ACGTACGT", fq.getReadName());
        // the upper-case ZT bases and their qualities are expected at the end of the read
        assertEquals(FORWARD_BASES + "GCGA", fq.getReadString());
        assertEquals(FORWARD_QUALS + "???'", fq.getBaseQualityString());
    }

    @Test
    public void getFastqFromSamOQ() {
        // OQ differs from the record's current qualities in its first five positions;
        // the FASTQ is expected to carry the OQ values.
        final String originalQuals = "55555" + FORWARD_QUALS.substring(5);
        final SAMRecord record = newAlignedRecord("ERR194147.1758538", FORWARD_BASES, FORWARD_QUALS, 99, "97M", 330);
        record.setAttribute("OQ", originalQuals);
        record.setAttribute("ZH", "/1 ACGTACGT");
        record.setAttribute("ZT", "GCGA+???'");

        final FastqRecord fq = toFastq(record);
        assertEquals("ERR194147.1758538/1 ACGTACGT", fq.getReadName());
        assertEquals(FORWARD_BASES + "GCGA", fq.getReadString());
        assertEquals(originalQuals + "???'", fq.getBaseQualityString());
    }

    @Test
    public void getFastqFromSamReverse() {
        final String bases =
                "TTCTTTGGCCTAATGACATGGCTATTAGTGCACAAGGAAATGGTCAAAAATGGGAAGAAATGTAGGTCACAAAATATTGCACAAAGCTATACTTACTT";
        final String quals = repeat('?', 98);
        // flag 83 marks the read as reverse-strand, so the FASTQ bases are the reverse complement
        final SAMRecord record = newAlignedRecord("ERR194147.421820475", bases, quals, 83, "98M", -323);
        record.setAttribute("OQ", quals);
        record.setAttribute("ZH", "/1 XYZ.123.25");
        record.setAttribute("ZT", "CTG+???");

        final FastqRecord fq = toFastq(record);
        assertEquals("ERR194147.421820475/1 XYZ.123.25", fq.getReadName());
        assertEquals("AAGTAAGTATAGCTTTGTGCAATATTTTGTGACCTACATTTCTTCCCATTTTTGACCATTTCCTTGTGCACTAATAGCCATGTCATTAGGCCAAAGAACTG",
                fq.getReadString());
        assertEquals(repeat('?', 101), fq.getBaseQualityString());
    }

    @Test
    public void getFastqFromSamDodgyTag() {
        final SAMRecord record = newAlignedRecord("ERR194147.1758538", FORWARD_BASES, FORWARD_QUALS, 99, "97M", 330);
        record.setAttribute("OQ", FORWARD_QUALS);
        // header suffix containing an extra space
        record.setAttribute("ZH", "/1 foo bar");
        record.setAttribute("ZT", "GCGA+???'");

        final FastqRecord fq = toFastq(record);
        assertEquals("ERR194147.1758538/1 foo bar", fq.getReadName());
        assertEquals(FORWARD_BASES + "GCGA", fq.getReadString());
        assertEquals(FORWARD_QUALS + "???'", fq.getBaseQualityString());
    }

    // ------------------------------------------------------------------
    // End-to-end conversion tests
    // ------------------------------------------------------------------

    @Test
    public void testMissingRgFileOutputPerRg() throws IOException {
        // the input file is created but left empty
        final File inputSam = testFolder.newFile("testMissingRgFileOutputPerRg.sam");
        final File outputDir = testFolder.newFolder();
        convertFile(perReadGroupArgs(inputSam, outputDir), false);
    }

    @Test
    public void groupedUnpairedMate() throws IOException {
        final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
        final File outputDir = testFolder.newFolder();

        // foo:record:2, foo:record:3 and both bar records lack a mate: conversion must fail
        populateSamFile(inputSam, Arrays.asList(
                HD_UNSORTED, RG1_HEADER, RG2_HEADER,
                firstMate("foo:record:1", "RG:Z:rg1"),
                secondMate("foo:record:1", "RG:Z:rg1"),
                secondMate("foo:record:2", "RG:Z:rg1"),
                firstMate("foo:record:3", "RG:Z:rg1"),
                firstMate("bar:record:1", "RG:Z:rg2"),
                secondMate("bar:record:2", "RG:Z:rg2")));
        convertFile(perReadGroupArgs(inputSam, outputDir), false);

        // every read now has its mate (foo:record:2 with second mate listed first)
        populateSamFile(inputSam, Arrays.asList(
                HD_UNSORTED, RG1_HEADER, RG2_HEADER,
                firstMate("foo:record:1", "RG:Z:rg1"),
                secondMate("foo:record:1", "RG:Z:rg1"),
                secondMate("foo:record:2", "RG:Z:rg1"),
                firstMate("foo:record:2", "RG:Z:rg1"),
                firstMate("bar:record:1", "RG:Z:rg2"),
                secondMate("bar:record:1", "RG:Z:rg2")));
        convertFile(perReadGroupArgs(inputSam, outputDir), true);
        verifyFastqPair(outputDir, namesStartingWith("rg1"), inputSam, "rg1");
        verifyFastqPair(outputDir, namesStartingWith("rg2"), inputSam, "rg2");

        verifyInterleavedConversion(inputSam);
    }

    @Test
    public void firstMateAtStartLastMateAtEnd() throws IOException {
        final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
        final File outputDir = testFolder.newFolder();
        // the two mates of bar:record:1 are the first and the last record of the file
        populateSamFile(inputSam, Arrays.asList(
                HD_UNSORTED,
                readGroupLine("rg1", "s1", "blah"),
                firstMate("bar:record:1", "RG:Z:rg1"),
                firstMate("bar:record:2", "RG:Z:rg1"),
                secondMate("bar:record:2", "RG:Z:rg1"),
                firstMate("bar:record:3", "RG:Z:rg1"),
                secondMate("bar:record:3", "RG:Z:rg1"),
                firstMate("bar:record:4", "RG:Z:rg1"),
                secondMate("bar:record:4", "RG:Z:rg1"),
                firstMate("bar:record:5", "RG:Z:rg1"),
                secondMate("bar:record:5", "RG:Z:rg1"),
                secondMate("bar:record:1", "RG:Z:rg1")));
        convertFile(perReadGroupArgs(inputSam, outputDir), true);
        verifyFastqPair(outputDir, FASTQ_FILES, inputSam, null);

        verifyInterleavedConversion(inputSam);
    }

    @Test
    public void trimmedHeaders() throws IOException {
        final File inputOrigSam = testFolder.newFile("trimmedHeadersOrig.sam");
        final File inputSam = testFolder.newFile("trimmedHeaders.sam");
        final File outputOrigDir = testFolder.newFolder();
        final File outputDir = testFolder.newFolder();

        // reference input: full-length reads with complete read names
        populateSamFile(inputOrigSam, twoReadGroupQueryNameSam());

        // same reads, but some bases/qualities have been moved into ZT and part of
        // one read name into ZH; tag order is deliberately varied
        populateSamFile(inputSam, Arrays.asList(
                HD_QUERYNAME, RG1_HEADER, RG2_HEADER,
                unmappedRead("foo:record:1", FIRST_MATE_FLAG, repeat('A', 9), repeat('1', 9),
                        "RG:Z:rg1", "ZT:Z:AAAA+1111", "CR:Z:AAAAA", "UR:Z:TTT", "CY:Z:11111", "UY:Z:222"),
                unmappedRead("foo:record:1", SECOND_MATE_FLAG, repeat('C', 9), repeat('2', 9),
                        "RG:Z:rg1", "CR:Z:AAAAA", "ZT:Z:cccc+2222", "UR:Z:TTT", "CY:Z:11111", "UY:Z:222"),
                firstMate("foo:record:2", RG1_TAGS),
                secondMate("foo:record:2", RG1_TAGS),
                firstMate("foo:record",
                        "RG:Z:rg1", "CR:Z:AAAAA", "UR:Z:TTT", "CY:Z:11111", "UY:Z:222", "ZH:Z::3/1"),
                secondMate("foo:record",
                        "RG:Z:rg1", "CR:Z:AAAAA", "UR:Z:TTT", "ZH:Z::3/2", "CY:Z:11111", "UY:Z:222"),
                unmappedRead("bar:record:1", FIRST_MATE_FLAG, repeat('A', 9), repeat('1', 9),
                        "RG:Z:rg2", "CR:Z:CCCCC", "ZT:Z:aaaa+1111", "UR:Z:GGG", "CY:Z:22222", "UY:Z:111"),
                secondMate("bar:record:1", RG2_TAGS),
                unmappedRead("bar:record:2", SECOND_MATE_FLAG, repeat('C', 10), repeat('2', 10),
                        "RG:Z:rg2", "CR:Z:CCCCC", "UR:Z:GGG", "CY:Z:22222", "ZT:Z:CCC+222", "UY:Z:111"),
                firstMate("bar:record:2", RG2_TAGS)));

        convertFile(perReadGroupArgs(inputOrigSam, outputOrigDir), true);
        convertFile(perReadGroupArgs(inputSam, outputDir), true);

        final File[] fastqOrigFiles = outputOrigDir.listFiles(FASTQ_FILES);
        final File[] fastqFiles = outputDir.listFiles(FASTQ_FILES);
        Assert.assertNotNull("could not list " + outputOrigDir, fastqOrigFiles);
        Assert.assertNotNull("could not list " + outputDir, fastqFiles);
        // guard against a vacuous pass when nothing was written
        Assert.assertTrue("no FASTQ output produced for " + inputOrigSam.getName(), fastqOrigFiles.length > 0);

        // every file produced from the reference input must have an identically
        // named, byte-identical counterpart produced from the trimmed input
        final Map<String, String> expectedMd5 = md5ByFileName(fastqOrigFiles);
        final Map<String, String> actualMd5 = md5ByFileName(fastqFiles);
        for (final Map.Entry<String, String> expected : expectedMd5.entrySet()) {
            assertEquals("content mismatch for " + expected.getKey(),
                    expected.getValue(), actualMd5.get(expected.getKey()));
        }
    }

    @Test
    public void groupedLastPairMatesFlipped() throws IOException {
        final File inputSam = testFolder.newFile("groupedUnpairedMate.sam");
        final File outputDir = testFolder.newFolder();
        // the last pair (bar:record:2) lists its second mate before its first
        populateSamFile(inputSam, twoReadGroupQueryNameSam());
        convertFile(perReadGroupArgs(inputSam, outputDir), true);
        verifyFastqPair(outputDir, namesStartingWith("rg1"), inputSam, "rg1");
        verifyFastqPair(outputDir, namesStartingWith("rg2"), inputSam, "rg2");

        verifyInterleavedConversion(inputSam);
    }

    // ------------------------------------------------------------------
    // Fixture builders
    // ------------------------------------------------------------------

    /** Returns a string made of {@code count} copies of {@code c}. */
    private static String repeat(final char c, final int count) {
        final char[] chars = new char[count];
        Arrays.fill(chars, c);
        return new String(chars);
    }

    /** Joins the given fields with tabs, the field separator of the SAM text format. */
    private static String samLine(final String... fields) {
        return String.join("\t", fields);
    }

    /** Builds an {@code @RG} header line with platform ILLUMINA. */
    private static String readGroupLine(final String id, final String sample, final String platformUnit) {
        return samLine("@RG", "ID:" + id, "SM:" + sample, "PU:" + platformUnit, "PL:ILLUMINA");
    }

    /**
     * Builds an unaligned SAM record line: reference and mate reference {@code *},
     * all positions, MAPQ and TLEN {@code 0}, CIGAR {@code *}.
     *
     * @param tags optional fields in {@code TAG:TYPE:VALUE} form, written in the given order
     */
    private static String unmappedRead(final String name, final int flag, final String bases,
                                       final String quals, final String... tags) {
        final List<String> fields = new ArrayList<>(Arrays.asList(
                name, Integer.toString(flag), "*", "0", "0", "*", "*", "0", "0", bases, quals));
        fields.addAll(Arrays.asList(tags));
        return String.join("\t", fields);
    }

    /** Full-length first-mate fixture record (flag 77, all {@code A} / quality {@code 1}). */
    private static String firstMate(final String name, final String... tags) {
        return unmappedRead(name, FIRST_MATE_FLAG, MATE1_BASES, MATE1_QUALS, tags);
    }

    /** Full-length second-mate fixture record (flag 141, all {@code C} / quality {@code 2}). */
    private static String secondMate(final String name, final String... tags) {
        return unmappedRead(name, SECOND_MATE_FLAG, MATE2_BASES, MATE2_QUALS, tags);
    }

    /**
     * SAM content with two read groups and five complete pairs; the last pair
     * (bar:record:2) has its second mate listed first.
     */
    private static List<String> twoReadGroupQueryNameSam() {
        return Arrays.asList(
                HD_QUERYNAME, RG1_HEADER, RG2_HEADER,
                firstMate("foo:record:1", RG1_TAGS),
                secondMate("foo:record:1", RG1_TAGS),
                firstMate("foo:record:2", RG1_TAGS),
                secondMate("foo:record:2", RG1_TAGS),
                firstMate("foo:record:3", RG1_TAGS),
                secondMate("foo:record:3", RG1_TAGS),
                firstMate("bar:record:1", RG2_TAGS),
                secondMate("bar:record:1", RG2_TAGS),
                secondMate("bar:record:2", RG2_TAGS),
                firstMate("bar:record:2", RG2_TAGS));
    }

    /**
     * Creates a header-less record with the fields the in-memory conversion tests share:
     * mapping quality 60, {@code MQ=60} and {@code MC=3S98M}.
     */
    private static SAMRecord newAlignedRecord(final String readName, final String bases, final String quals,
                                              final int flags, final String cigar, final int insertSize) {
        final SAMRecord record = new SAMRecord(null);
        record.setReadName(readName);
        record.setReadBases(bases.getBytes(StandardCharsets.US_ASCII));
        record.setBaseQualityString(quals);
        record.setFlags(flags);
        record.setMappingQuality(60);
        record.setCigarString(cigar);
        record.setInferredInsertSize(insertSize);
        record.setAttribute("MQ", 60);
        record.setAttribute("MC", "3S98M");
        return record;
    }

    /** Converts a record with the argument values used by every in-memory conversion test. */
    private static FastqRecord toFastq(final SAMRecord record) {
        return SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null);
    }

    /** Command-line arguments for a one-file-pair-per-read-group conversion into {@code outputDir}. */
    private static String[] perReadGroupArgs(final File inputSam, final File outputDir) {
        return new String[]{
                "INPUT=" + inputSam.getAbsolutePath(),
                "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/",
                "OUTPUT_PER_RG=true"
        };
    }

    /** Filter accepting files whose name starts with {@code prefix}. */
    private static FilenameFilter namesStartingWith(final String prefix) {
        return (dir, name) -> name.startsWith(prefix);
    }

    // ------------------------------------------------------------------
    // Execution and verification helpers
    // ------------------------------------------------------------------

    /**
     * Runs the converter with the given arguments.
     *
     * @param expectSuccess when {@code true} the run must return exit status 0 without throwing;
     *                      when {@code false} the run must throw
     */
    private void convertFile(final String[] args, final boolean expectSuccess) {
        final int exitStatus;
        try {
            exitStatus = new SamToFastqWithHeaders().instanceMain(args);
        } catch (Exception e) {
            if (expectSuccess) {
                // keep the cause so that the failure is diagnosable from the test report
                throw new AssertionError("Conversion was expected to succeed but threw " + e, e);
            }
            return; // the expected failure
        }
        if (!expectSuccess) {
            Assert.fail("Should have thrown a PicardException");
        }
        assertEquals(0, exitStatus);
    }

    /** Writes {@code data} to {@code sam}, one element per line, replacing any previous content. */
    private void populateSamFile(final File sam, final List<String> data) {
        try (FileWriter fw = new FileWriter(sam)) {
            for (final String line : data) {
                fw.write(line + "\n");
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Finds the two FASTQ files in {@code outputDir} accepted by {@code filter}, works out
     * which one holds the first mates (its name ends in {@code 1.fastq}) and verifies
     * them against the SAM input.
     *
     * @param readGroup read group to restrict the SAM records to, or {@code null} for all records
     */
    private void verifyFastqPair(final File outputDir, final FilenameFilter filter,
                                 final File inputSam, final String readGroup) throws IOException {
        final File[] fastqFiles = outputDir.listFiles(filter);
        Assert.assertNotNull("could not list " + outputDir, fastqFiles);
        assertEquals(2, fastqFiles.length);
        // listFiles gives no ordering guarantee
        final boolean firstIsMate1 = fastqFiles[0].getName().endsWith("1.fastq");
        final File pair1File = firstIsMate1 ? fastqFiles[0] : fastqFiles[1];
        final File pair2File = firstIsMate1 ? fastqFiles[1] : fastqFiles[0];
        verifyFastq(pair1File, pair2File, inputSam, readGroup);
    }

    /**
     * Checks that the two FASTQ files hold the same number of reads, that this number equals
     * the number of pairs in the SAM file, and that every pair's {@code /1} and {@code /2}
     * headers are in the first and second file respectively.
     *
     * @param readGroup read group to restrict the SAM records to, or {@code null} for all records
     */
    private void verifyFastq(final File pair1File, final File pair2File, final File samFile,
                             final String readGroup) throws IOException {
        final Set<String> outputHeaderSet1 = createFastqReadHeaderSet(pair1File);
        final Set<String> outputHeaderSet2 = createFastqReadHeaderSet(pair2File);
        assertEquals(outputHeaderSet1.size(), outputHeaderSet2.size());

        final Map<String, MatePair> pairs = createSamMatePairsMap(samFile, readGroup);
        assertEquals(pairs.size(), outputHeaderSet2.size());

        assertMatesPresent(pairs, outputHeaderSet1, outputHeaderSet2);
    }

    /**
     * Converts {@code inputSam} into a single interleaved FASTQ file and checks that it
     * contains both mates of every pair in the SAM file.
     */
    private void verifyInterleavedConversion(final File inputSam) throws IOException {
        final File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq");
        convertFile(new String[]{
                "INPUT=" + inputSam.getAbsolutePath(),
                "FASTQ=" + outputFastq.getAbsolutePath(),
                "INTERLEAVE=true"
        }, true);
        final Set<String> outputHeaderSet = createFastqReadHeaderSet(outputFastq);
        final Map<String, MatePair> pairs = createSamMatePairsMap(inputSam, null);
        assertEquals(pairs.size() * 2, outputHeaderSet.size());

        assertMatesPresent(pairs, outputHeaderSet, outputHeaderSet);
    }

    /**
     * Asserts that every pair is complete, that both mates share a read name, and that
     * {@code name/1} is in {@code firstMateHeaders} and {@code name/2} in {@code secondMateHeaders}.
     */
    private static void assertMatesPresent(final Map<String, MatePair> pairs,
                                           final Set<String> firstMateHeaders,
                                           final Set<String> secondMateHeaders) {
        for (final MatePair pair : pairs.values()) {
            Assert.assertNotNull(pair.mate1); // ensure we have two mates
            Assert.assertNotNull(pair.mate2);
            assertEquals(pair.mate1.getReadName(), pair.mate2.getReadName());
            final String readName = pair.mate1.getReadName();
            Assert.assertTrue(firstMateHeaders.contains(readName + "/1")); // ensure mate is in correct file
            Assert.assertTrue(secondMateHeaders.contains(readName + "/2"));
        }
    }

    /**
     * Reads {@code samFile} and groups its records into mate pairs keyed by read name,
     * in order of first appearance.
     *
     * @param readGroup only records of this read group are kept; {@code null} keeps all records
     */
    private Map<String, MatePair> createSamMatePairsMap(final File samFile, final String readGroup)
            throws IOException {
        IOUtil.assertFileIsReadable(samFile);
        final Map<String, MatePair> pairs = new LinkedHashMap<>();
        // try-with-resources: the reader is released even if a record fails to parse
        try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) {
            for (final SAMRecord record : reader) {
                if (readGroup == null || record.getReadGroup().getReadGroupId().equals(readGroup)) {
                    pairs.computeIfAbsent(record.getReadName(), name -> new MatePair()).add(record);
                }
            }
        }
        return pairs;
    }

    /** Returns the set of read headers found in the given FASTQ file. */
    protected static Set<String> createFastqReadHeaderSet(final File file) {
        final Set<String> headers = new HashSet<>();
        try (final FastqReader reader = new FastqReader(file)) {
            while (reader.hasNext()) {
                headers.add(reader.next().getReadName());
            }
        }
        return headers;
    }

    /** Maps each file's name to the MD5 of its content. */
    private static Map<String, String> md5ByFileName(final File[] files) {
        return Arrays.stream(files)
                .collect(Collectors.toMap(File::getName, SamToFastqWithHeadersTest::md5Hex));
    }

    /**
     * Hex MD5 of a file's content. Used purely as a content fingerprint for equality
     * checks between generated files, not for anything security related.
     */
    private static String md5Hex(final File file) {
        try {
            final byte[] content = Files.readAllBytes(Paths.get(file.getPath()));
            return new BigInteger(1, MessageDigest.getInstance("MD5").digest(content)).toString(16);
        } catch (NoSuchAlgorithmException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** The two mates of a read pair, filled in one record at a time. */
    static class MatePair {
        SAMRecord mate1;
        SAMRecord mate2;

        /**
         * Stores {@code record} as first or second mate according to its flags.
         *
         * @throws PicardException if the record is not paired, is flagged as neither first nor
         *                         second of pair, or the corresponding slot is already taken
         */
        void add(final SAMRecord record) {
            if (!record.getReadPairedFlag()) {
                throw new PicardException("Record " + record.getReadName() + " is not paired");
            }
            if (record.getFirstOfPairFlag()) {
                if (mate1 != null) {
                    throw new PicardException("Mate 1 already set for record: " + record.getReadName());
                }
                mate1 = record;
            } else if (record.getSecondOfPairFlag()) {
                if (mate2 != null) {
                    throw new PicardException("Mate 2 already set for record: " + record.getReadName());
                }
                mate2 = record;
            } else {
                throw new PicardException("Neither FirstOfPairFlag or SecondOfPairFlag is set for a paired record");
            }
        }
    }
}