From 22332b1ab00c35b96c7f39e018d3f75a5e76f3e6 Mon Sep 17 00:00:00 2001 From: Oliver Holmes Date: Mon, 28 Aug 2023 12:34:54 +1000 Subject: [PATCH 1/3] feat(qmule): new classes for creating fastqs from BAMs and vice versa These new classes preserve trimmed bases and additional header information in the fastq file through to (unmapped) BAM and back to fastq --- .../org/qcmg/qmule/FastqToSamWithHeaders.java | 561 ++++++++++++++++ .../org/qcmg/qmule/SamToFastqWithHeaders.java | 625 ++++++++++++++++++ .../qcmg/qmule/FastqToSamWithHeadersTest.java | 56 ++ .../qcmg/qmule/SamToFastqWithHeadersTest.java | 131 ++++ 4 files changed, 1373 insertions(+) create mode 100644 qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java create mode 100644 qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java create mode 100644 qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java create mode 100644 qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java diff --git a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java new file mode 100644 index 000000000..1456c8092 --- /dev/null +++ b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java @@ -0,0 +1,561 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.*; +import htsjdk.samtools.fastq.FastqConstants; +import htsjdk.samtools.fastq.FastqReader; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * Converts a FASTQ file to an unaligned BAM or SAM file. + *

+ * Output read records will contain the original base calls and quality scores will be + * translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina. + *

+ *

+ * There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ + * (e.g. see RG or SM below). + *

+ *

Inputs

+ *

+ * One FASTQ file name for single-end or two for paired-end sequencing input data. + * These files may be gzip-compressed (when the file name ends with ".gz"). + *

+ *

+ * Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name (see USE_SEQUENTIAL_FASTQS for details below). + *

+ *

+ * By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly + * using the QUALITY_FORMAT argument. + *

+ *

Output

+ * A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name. + *

Usage examples

+ * + *

Example 1:

+ *

+ * Single-end sequencing FASTQ file conversion. All reads are annotated + * as belonging to the "rg0013" read group that in turn is part of the sample "sample001". + *

+ *
+     * java -jar picard.jar FastqToSam \
+     *      F1=input_reads.fastq \
+     *      O=unaligned_reads.bam \
+     *      SM=sample001 \
+     *      RG=rg0013
+     * 
+ *

Example 2:

+ *

+ * Similar to example 1 above, but for paired-end sequencing. + *

+ *
+     * java -jar picard.jar FastqToSam \
+     *      F1=forward_reads.fastq \
+     *      F2=reverse_reads.fastq \
+     *      O=unaligned_read_pairs.bam \
+     *      SM=sample001 \
+     *      RG=rg0013
+     * 
+ */ +@CommandLineProgramProperties( + summary = "

" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY + ".

" + org.qcmg.qmule.FastqToSamWithHeaders.USAGE_DETAILS, + oneLineSummary = org.qcmg.qmule.FastqToSamWithHeaders.USAGE_SUMMARY, + programGroup = ReadDataManipulationProgramGroup.class) +@DocumentedFeature +public class FastqToSamWithHeaders extends CommandLineProgram { + + public static void main(final String[] argv) { + + + int exitStatus = new FastqToSamWithHeaders().instanceMain(argv); + + System.exit(exitStatus); + } + static final String USAGE_SUMMARY = + "Converts a FASTQ file to an unaligned BAM or SAM file"; + static final String USAGE_DETAILS = + "

Output read records will contain the original base calls and quality scores will be " + + "translated depending on the base quality score encoding: FastqSanger, FastqSolexa and FastqIllumina.

" + + "

There are also arguments to provide values for SAM header and read attributes that are not present in FASTQ " + + "(e.g see RG or SM below).

" + + "

Inputs

" + + "

One FASTQ file name for single-end or two for pair-end sequencing input data. " + + "These files might be in gzip compressed format (when file name is ending with \".gz\").

" + + "

Alternatively, for larger inputs you can provide a collection of FASTQ files indexed by their name " + + "(see USE_SEQUENCIAL_FASTQ for details below).

" + + "

By default, this tool will try to guess the base quality score encoding. However you can indicate it explicitly " + + "using the QUALITY_FORMAT argument.

" + + "

Output

" + + "

A single unaligned BAM or SAM file. By default, the records are sorted by query (read) name.

" + + "

Usage examples

" + + "

Example 1:

" + + "

Single-end sequencing FASTQ file conversion. All reads are annotated " + + "as belonging to the \"rg0013\" read group that in turn is part of the sample \"sample001\".

" + + "
java -jar picard.jar FastqToSam \\\n" +
+                        "        F1=input_reads.fastq \\\n" +
+                        "        O=unaligned_reads.bam \\\n" +
+                        "        SM=sample001 \\\n" +
+                        "        RG=rg0013
" + + "

Example 2:

" + + "

Similar to example 1 above, but for paired-end sequencing.

" + + "
java -jar picard.jar FastqToSam \\\n" +
+                        "       F1=forward_reads.fastq \\\n" +
+                        "       F2=reverse_reads.fastq \\\n" +
+                        "       O=unaligned_read_pairs.bam \\\n" +
+                        "       SM=sample001 \\\n" +
+                        "       RG=rg0013

"; + + private static final Log LOG = Log.getInstance(picard.sam.FastqToSam.class); + + @Argument(shortName="F1", doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.") + public File FASTQ; + + @Argument(shortName="F2", doc="Input fastq file (optionally gzipped) for the second read of paired end data.", optional=true) + public File FASTQ2; + + @Argument(doc="Use sequential fastq files with the suffix _###.fastq or _###.fastq.gz." + + "The files should be named:\n" + + " _001., _002., ..., _XYZ.\n" + + " The base files should be:\n" + + " _001.\n" + + " An example would be:\n" + + " RUNNAME_S8_L005_R1_001.fastq\n" + + " RUNNAME_S8_L005_R1_002.fastq\n" + + " RUNNAME_S8_L005_R1_003.fastq\n" + + " RUNNAME_S8_L005_R1_004.fastq\n" + + "RUNNAME_S8_L005_R1_001.fastq should be provided as FASTQ.", optional=true) + public boolean USE_SEQUENTIAL_FASTQS = false; + + @Argument(shortName="V", doc="A value describing how the quality values are encoded in the input FASTQ file. " + + "Either Solexa (phred scaling + 66), Illumina (phred scaling + 64) or Standard (phred scaling + 33). " + + "If this value is not specified, the quality format will be detected automatically.", optional = true) + public FastqQualityFormat QUALITY_FORMAT; + + @Argument(doc="Output SAM/BAM file. ", shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME) + public File OUTPUT ; + + @Argument(shortName="RG", doc="Read group name") + public String READ_GROUP_NAME = "A"; + + @Argument(shortName="SM", doc="Sample name to insert into the read group header") + public String SAMPLE_NAME; + + @Argument(shortName="LB", doc="The library name to place into the LB attribute in the read group header", optional=true) + public String LIBRARY_NAME; + + @Argument(shortName="PU", doc="The platform unit (often run_barcode.lane) to insert into the read group header", optional=true) + public String PLATFORM_UNIT; + + @Argument(shortName="PL", doc="The platform type (e.g. 
ILLUMINA, SOLID) to insert into the read group header", optional=true) + public String PLATFORM; + + @Argument(shortName="CN", doc="The sequencing center from which the data originated", optional=true) + public String SEQUENCING_CENTER; + + @Argument(shortName = "PI", doc = "Predicted median insert size, to insert into the read group header", optional = true) + public Integer PREDICTED_INSERT_SIZE; + + @Argument(shortName = "PG", doc = "Program group to insert into the read group header.", optional=true) + public String PROGRAM_GROUP; + + @Argument(shortName = "PM", doc = "Platform model to insert into the group header (free-form text providing further details of the platform/technology used)", optional=true) + public String PLATFORM_MODEL; + + @Argument(doc="Comment(s) to include in the merged output file's header.", optional=true, shortName="CO") + public List COMMENT = new ArrayList<>(); + + @Argument(shortName = "DS", doc = "Inserted into the read group header", optional = true) + public String DESCRIPTION; + + @Argument(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true) + public Iso8601Date RUN_DATE; + + @Argument(shortName="SO", doc="The sort order for the output sam/bam file.") + public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.queryname; + + @Argument(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.") + public int MIN_Q = 0; + + @Argument(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.") + public int MAX_Q = SAMUtils.MAX_PHRED_SCORE; + + @Deprecated + @Argument(doc="Deprecated (No longer used). 
If true and this is an unpaired fastq any occurrence of '/1' or '/2' will be removed from the end of a read name.") + public Boolean STRIP_UNPAIRED_MATE_NUMBER = false; + + @Argument(doc="Allow (and ignore) empty lines") + public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; + + public static final String ZT_ATTRIBUTE = "ZT"; + public static final String ZH_ATTRIBUTE = "ZH"; + + private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); + + /** + * Looks at fastq input(s) and attempts to determine the proper quality format + * + * Closes the reader(s) by side effect + * + * @param reader1 The first fastq input + * @param reader2 The second fastq input, if necessary. To not use this input, set it to null + * @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur + */ + public static FastqQualityFormat determineQualityFormat(final FastqReader reader1, final FastqReader reader2, final FastqQualityFormat expectedQuality) { + final QualityEncodingDetector detector = new QualityEncodingDetector(); + + if (reader2 == null) { + detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1); + } else { + detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1, reader2); + reader2.close(); + } + + reader1.close(); + + final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality); + if (detector.isDeterminationAmbiguous()) { + LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities."); + } + LOG.info(String.format("Auto-detected quality format as: %s.", qualityFormat)); + + return qualityFormat; + } + + + /** + * Get a list of FASTQs that are sequentially numbered based on the first (base) fastq. + * The files should be named: + * _001., _002., ..., _XYZ. + * The base files should be: + * _001. 
+ * An example would be: + * RUNNAME_S8_L005_R1_001.fastq + * RUNNAME_S8_L005_R1_002.fastq + * RUNNAME_S8_L005_R1_003.fastq + * RUNNAME_S8_L005_R1_004.fastq + * where `baseFastq` is the first in that list. + */ + protected static List getSequentialFileList(final File baseFastq) { + final List files = new ArrayList<>(); + files.add(baseFastq); + + // Find the correct extension used in the base FASTQ + FastqConstants.FastqExtensions fastqExtensions = null; + String suffix = null; // store the suffix including the extension + for (final FastqConstants.FastqExtensions ext : FastqConstants.FastqExtensions.values()) { + suffix = "_001" + ext.getExtension(); + if (baseFastq.getAbsolutePath().endsWith(suffix)) { + fastqExtensions = ext; + break; + } + } + if (null == fastqExtensions) { + throw new PicardException(String.format("Could not parse the FASTQ extension (expected '_001' + '%s'): %s", FastqConstants.FastqExtensions.values().toString(), baseFastq)); + } + + // Find all the files + for (int idx = 2; true; idx++) { + String fastq = baseFastq.getAbsolutePath(); + fastq = String.format("%s_%03d%s", fastq.substring(0, fastq.length() - suffix.length()), idx, fastqExtensions.getExtension()); + try { + IOUtil.assertFileIsReadable(new File(fastq)); + } catch (final SAMException e) { // the file is not readable, so do not continue + break; + } + files.add(new File(fastq)); + } + + return files; + } + + /* Simply invokes the right method for unpaired or paired data. */ + protected int doWork() { + IOUtil.assertFileIsReadable(FASTQ); + if (FASTQ2 != null) { + IOUtil.assertFileIsReadable(FASTQ2); + } + IOUtil.assertFileIsWritable(OUTPUT); + + final SAMFileHeader header = createSamFileHeader(); + final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); + + // Set the quality format + QUALITY_FORMAT = picard.sam.FastqToSam.determineQualityFormat(fileToFastqReader(FASTQ), + (FASTQ2 == null) ? 
null : fileToFastqReader(FASTQ2), + QUALITY_FORMAT); + + // Lists for sequential files, but also used when not sequential + final List readers1 = new ArrayList<>(); + final List readers2 = new ArrayList<>(); + + if (USE_SEQUENTIAL_FASTQS) { + // Get all the files + for (final File fastq : getSequentialFileList(FASTQ)) { + readers1.add(fileToFastqReader(fastq)); + } + if (null != FASTQ2) { + for (final File fastq : getSequentialFileList(FASTQ2)) { + readers2.add(fileToFastqReader(fastq)); + } + if (readers1.size() != readers2.size()) { + throw new PicardException(String.format("Found %d files for FASTQ and %d files for FASTQ2.", readers1.size(), readers2.size())); + } + } + } + else { + readers1.add(fileToFastqReader(FASTQ)); + if (FASTQ2 != null) { + readers2.add(fileToFastqReader(FASTQ2)); + } + } + + // Loop through the FASTQs + for (int idx = 0; idx < readers1.size(); idx++) { + makeItSo(readers1.get(idx), + (readers2.isEmpty()) ? null : readers2.get(idx), + writer); + } + + // Close all the things + for (final FastqReader reader : readers1) reader.close(); + for (final FastqReader reader : readers2) reader.close(); + writer.close(); + + return 0; + } + + /** + * Handles the FastqToSam execution on the FastqReader(s). + * + * In some circumstances it might be useful to circumvent the command line based instantiation of this + * class, however note that there is no handholding or guardrails to running in this manner. + * + * It is the caller's responsibility to close the reader(s) + * + * @param reader1 The FastqReader for the first fastq file + * @param reader2 The second FastqReader if applicable. Pass in null if only using a single reader + * @param writer The SAMFileWriter where the new SAM file is written + * + */ + public void makeItSo(final FastqReader reader1, final FastqReader reader2, final SAMFileWriter writer) { + final int readCount = (reader2 == null) ? 
doUnpaired(reader1, writer) : doPaired(reader1, reader2, writer); + LOG.info("Processed " + readCount + " fastq reads"); + } + + /** Creates a simple SAM file from a single fastq file. */ + protected int doUnpaired(final FastqReader freader, final SAMFileWriter writer) { + int readCount = 0; + final ProgressLogger progress = new ProgressLogger(LOG); + for ( ; freader.hasNext() ; readCount++) { + final FastqRecord frec = freader.next(); + final String frecName = SequenceUtil.getSamReadNameFromFastqHeader(frec.getReadName()); + final SAMRecord srec = createSamRecord(writer.getFileHeader(), frecName , frec, false) ; + srec.setReadPairedFlag(false); + writer.addAlignment(srec); + progress.record(srec); + } + + return readCount; + } + + /** More complicated method that takes two fastq files and builds pairing information in the SAM. */ + protected int doPaired(final FastqReader freader1, final FastqReader freader2, final SAMFileWriter writer) { + int readCount = 0; + final ProgressLogger progress = new ProgressLogger(LOG); + for ( ; freader1.hasNext() && freader2.hasNext() ; readCount++) { + final FastqRecord frec1 = freader1.next(); + final FastqRecord frec2 = freader2.next(); + + final String frec1Name = SequenceUtil.getSamReadNameFromFastqHeader(frec1.getReadName()); + final String frec2Name = SequenceUtil.getSamReadNameFromFastqHeader(frec2.getReadName()); + final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2); + + final SAMRecord srec1 = createSamRecord(writer.getFileHeader(), baseName, frec1, true) ; + srec1.setFirstOfPairFlag(true); + srec1.setSecondOfPairFlag(false); + writer.addAlignment(srec1); + progress.record(srec1); + + final SAMRecord srec2 = createSamRecord(writer.getFileHeader(), baseName, frec2, true) ; + srec2.setFirstOfPairFlag(false); + srec2.setSecondOfPairFlag(true); + writer.addAlignment(srec2); + progress.record(srec2); + } + + if (freader1.hasNext() || freader2.hasNext()) { + throw new PicardException("Input paired 
fastq files must be the same length"); + } + + return readCount; + } + + private FastqReader fileToFastqReader(final File file) { + return new FastqReader(file, ALLOW_AND_IGNORE_EMPTY_LINES); + } + + + public static SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired, String readGroupName, FastqQualityFormat fqFormat, int minQ, int maxQ) { + final SAMRecord srec = new SAMRecord(header); + srec.setReadName(baseName); + srec.setReadString(frec.getReadString()); + srec.setReadUnmappedFlag(true); + srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, readGroupName); + String additionalHeader = frec.getReadName().replace(baseName, ""); + if (additionalHeader.length() > 0) { + /* + If this contains the trimmed bases flag (TB:) then put that in a separate tag + */ + int tbIndex = additionalHeader.indexOf(" TB:"); + if (tbIndex == -1) { + srec.setAttribute(ZH_ATTRIBUTE, additionalHeader); + } else { + srec.setAttribute(ZT_ATTRIBUTE, additionalHeader.substring(tbIndex + 4)); + if (tbIndex > 0) { + srec.setAttribute(ZH_ATTRIBUTE, additionalHeader.substring(0, tbIndex)); + } + } + } + final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString()); + convertQuality(quals, fqFormat); + for (final byte qual : quals) { + final int uQual = qual & 0xff; + if (uQual < minQ || uQual > maxQ) { + throw new PicardException("Base quality " + uQual + " is not in the range " + minQ + ".." 
+ + maxQ + " for read " + frec.getReadName()); + } + } + srec.setBaseQualities(quals); + + if (paired) { + srec.setReadPairedFlag(true); + srec.setMateUnmappedFlag(true); + } + return srec; + } + private SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired) { + return FastqToSamWithHeaders.createSamRecord(header, baseName, frec, paired, READ_GROUP_NAME, QUALITY_FORMAT, MIN_Q, MAX_Q); + } + + /** Creates a simple header with the values provided on the command line. */ + public SAMFileHeader createSamFileHeader() { + final SAMReadGroupRecord rgroup = new SAMReadGroupRecord(this.READ_GROUP_NAME); + rgroup.setSample(this.SAMPLE_NAME); + if (this.LIBRARY_NAME != null) rgroup.setLibrary(this.LIBRARY_NAME); + if (this.PLATFORM != null) rgroup.setPlatform(this.PLATFORM); + if (this.PLATFORM_UNIT != null) rgroup.setPlatformUnit(this.PLATFORM_UNIT); + if (this.SEQUENCING_CENTER != null) rgroup.setSequencingCenter(SEQUENCING_CENTER); + if (this.PREDICTED_INSERT_SIZE != null) rgroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE); + if (this.DESCRIPTION != null) rgroup.setDescription(this.DESCRIPTION); + if (this.RUN_DATE != null) rgroup.setRunDate(this.RUN_DATE); + if (this.PLATFORM_MODEL != null) rgroup.setPlatformModel(this.PLATFORM_MODEL); + if (this.PROGRAM_GROUP != null) rgroup.setProgramGroup(this.PROGRAM_GROUP); + + final SAMFileHeader header = new SAMFileHeader(); + header.addReadGroup(rgroup); + + for (final String comment : COMMENT) { + header.addComment(comment); + } + + header.setSortOrder(this.SORT_ORDER); + return header ; + } + + /** Based on the type of quality scores coming in, converts them to a numeric byte[] in phred scale. 
*/ + static void convertQuality(final byte[] quals, final FastqQualityFormat version) { + switch (version) { + case Standard: + SAMUtils.fastqToPhred(quals); + break ; + case Solexa: + solexaQualityConverter.convertSolexaQualityCharsToPhredBinary(quals); + break ; + case Illumina: + solexaQualityConverter.convertSolexa_1_3_QualityCharsToPhredBinary(quals); + break ; + } + } + + /** Returns read baseName and asserts correct pair read name format: + *
    + *
  • Paired reads must either have the exact same read names or they must contain at least one "/" + *
  • and the First pair read name must end with "/1" and second pair read name ends with "/2" + *
  • The baseName (read name part before the /) must be the same for both read names + *
  • If the read names are exactly the same but end in "/2" or "/1" then an exception will be thrown + *
+ */ + String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) { + String [] toks = getReadNameTokens(readName1, 1, freader1); + final String baseName1 = toks[0] ; + final String num1 = toks[1] ; + + toks = getReadNameTokens(readName2, 2, freader2); + final String baseName2 = toks[0] ; + final String num2 = toks[1]; + + if (!baseName1.equals(baseName2)) { + throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2)); + } + + final boolean num1Blank = StringUtil.isBlank(num1); + final boolean num2Blank = StringUtil.isBlank(num2); + if (num1Blank || num2Blank) { + if(!num1Blank) throw new PicardException(error(freader1,"Pair 1 number is missing (" +readName1+ "). Both pair numbers must be present or neither.")); //num1 != blank and num2 == blank + else if(!num2Blank) throw new PicardException(error(freader2, "Pair 2 number is missing (" +readName2+ "). Both pair numbers must be present or neither.")); //num1 == blank and num =2 != blank + } else { + if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 ("+readName1+")")); + if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 ("+readName2+")")); + } + + return baseName1 ; + } + + /** Breaks up read name into baseName and number separated by the last / */ + private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) { + if(readName.equals("")) throw new PicardException(error(freader,"Pair read name "+pairNum+" cannot be empty: "+readName)); + + final int idx = readName.lastIndexOf('/'); + final String[] result = new String[2]; + + if (idx == -1) { + result[0] = readName; + result[1] = null; + } else { + result[1] = readName.substring(idx + 1); // should be a 1 or 2 + + if(!result[1].equals("1") && !result[1].equals("2")) { //if not a 1 or 2 then names must be identical 
+ result[0] = readName; + result[1] = null; + } + else { + result[0] = readName.substring(0,idx); // baseName + } + } + + return result ; + } + + /** Little utility to give error messages corresponding to line numbers in the input files. */ + private String error(final FastqReader freader, final String str) { + return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath(); + } + + @Override + protected String[] customCommandLineValidation() { + if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"}; + if (MAX_Q > SAMUtils.MAX_PHRED_SCORE) return new String[]{"MAX_Q must be <= " + SAMUtils.MAX_PHRED_SCORE}; + return null; + } +} diff --git a/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java new file mode 100644 index 000000000..1c6dccf5a --- /dev/null +++ b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java @@ -0,0 +1,625 @@ +package org.qcmg.qmule; +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +import htsjdk.samtools.*; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.fastq.FastqWriter; +import htsjdk.samtools.fastq.FastqWriterFactory; +import htsjdk.samtools.util.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.io.File; +import java.util.*; + +/** + *

Extracts read sequences and qualities from the input SAM/BAM file and writes them into + * the output file in Sanger FASTQ format. + * See MAQ FASTQ specification for details. + * This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently. + *

In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, + * the read's sequence from input sam file will be reverse-complemented prior to writing it to FASTQ in order to correctly restore + * the original read sequence as it was generated by the sequencer. + *
+ *

Usage example:

+ *
+ * java -jar picard.jar SamToFastqWithHeaders \
+ *     I=input.bam \
+ *     FASTQ=output.fastq
+ * 
+ *
+ */ +@CommandLineProgramProperties( + summary = SamToFastqWithHeaders.USAGE_SUMMARY + SamToFastqWithHeaders.USAGE_DETAILS, + oneLineSummary = SamToFastqWithHeaders.USAGE_SUMMARY, + programGroup = ReadDataManipulationProgramGroup.class) +@DocumentedFeature +public class SamToFastqWithHeaders extends CommandLineProgram { + static final String USAGE_SUMMARY = "Converts a SAM/BAM/CRAM file to FASTQ."; + static final String USAGE_DETAILS = " Extracts read sequences and qualities from the input SAM/BAM/CRAM file and writes them into" + + "the output file in Sanger FASTQ format." + + "See MAQ FASTQ specification for details." + + "This tool can be used by way of a pipe to run BWA MEM on unmapped BAM (uBAM) files efficiently.

" + + "

In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome," + + "the read's sequence from input sam file will be reverse-complemented prior to writing it to FASTQ in order restore correctly" + + "the original read sequence as it was generated by the sequencer.

" + + "
" + + "

Usage example:

" + + "
" +
+            "java -jar picard.jar SamToFastqWithHeaders 
" + + " I=input.bam
" + + " FASTQ=output.fastq" + + "
" + + "
"; + @Argument(doc = "Input SAM/BAM/CRAM file to extract reads from", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) + public File INPUT; + + @Argument(shortName = "F", doc = "Output FASTQ file (single-end fastq or, if paired, first end of the pair FASTQ).", + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG", "OUTPUT_DIR"}) + public File FASTQ; + + @Argument(shortName = "F2", doc = "Output FASTQ file (if paired, second end of the pair FASTQ).", optional = true, + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) + public File SECOND_END_FASTQ; + + @Argument(shortName = "FU", doc = "Output FASTQ file for unpaired reads; may only be provided in paired-FASTQ mode", optional = true, + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) + public File UNPAIRED_FASTQ; + + @Argument(shortName = "OPRG", doc = "Output a FASTQ file per read group (two FASTQ files per read group if the group is paired).", + optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) + public boolean OUTPUT_PER_RG; + + @Argument(shortName = "GZOPRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.", + mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) + public Boolean COMPRESS_OUTPUTS_PER_RG = false; + + @Argument(shortName = "RGT", doc = "The read group tag (PU or ID) to be used to output a FASTQ file per read group.") + public String RG_TAG = "PU"; + + @Argument(shortName = "ODIR", doc = "Directory in which to output the FASTQ file(s). 
Used only when OUTPUT_PER_RG is true.", + optional = true) + public File OUTPUT_DIR; + + @Argument(shortName = "RC", doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to FASTQ", + optional = true) + public boolean RE_REVERSE = true; + + @Argument(shortName = "INTER", doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from") + public boolean INTERLEAVE = false; + + @Argument(shortName = "NON_PF", doc = "Include non-PF reads from the SAM file into the output " + + "FASTQ files. PF means 'passes filtering'. Reads whose 'not passing quality controls' " + + "flag is set are non-PF reads. See GATK Dictionary for more info.") + public boolean INCLUDE_NON_PF_READS = false; + + @Argument(shortName = "CLIP_ATTR", doc = "The attribute that stores the position at which " + + "the SAM record should be clipped", optional = true) + public String CLIPPING_ATTRIBUTE; + + @Argument(shortName = "CLIP_ACT", doc = "The action that should be taken with clipped reads: " + + "'X' means the reads and qualities should be trimmed at the clipped position; " + + "'N' means the bases should be changed to Ns in the clipped region; and any " + + "integer means that the base qualities should be set to that value in the " + + "clipped region.", optional = true) + public String CLIPPING_ACTION; + + @Argument(shortName = "CLIP_MIN", doc = "When performing clipping with the CLIPPING_ATTRIBUTE and CLIPPING_ACTION " + + "parameters, ensure that the resulting reads after clipping are at least CLIPPING_MIN_LENGTH bases long. 
" + + "If the original read is shorter than CLIPPING_MIN_LENGTH then the original read length will be maintained.") + public int CLIPPING_MIN_LENGTH = 0; + + @Argument(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.") + public int READ1_TRIM = 0; + + @Argument(shortName = "R1_MAX_BASES", doc = "The maximum number of bases to write from read 1 after trimming. " + + "If there are fewer than this many bases left after trimming, all will be written. If this " + + "value is null then all bases left after trimming will be written.", optional = true) + public Integer READ1_MAX_BASES_TO_WRITE; + + @Argument(shortName = "R2_TRIM", doc = "The number of bases to trim from the beginning of read 2.") + public int READ2_TRIM = 0; + + @Argument(shortName = "R2_MAX_BASES", doc = "The maximum number of bases to write from read 2 after trimming. " + + "If there are fewer than this many bases left after trimming, all will be written. If this " + + "value is null then all bases left after trimming will be written.", optional = true) + public Integer READ2_MAX_BASES_TO_WRITE; + + @Argument(shortName = "Q", doc = "End-trim reads using the phred/bwa quality trimming algorithm and this quality.", optional = true) + public Integer QUALITY; + + @Argument(doc = "If true, include non-primary alignments in the output. 
Support of non-primary alignments in SamToFastq " + + "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.") + public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS = false; + + private static final String CLIP_TRIM = "X"; + private static final String CLIP_TO_N = "N"; + + private static final short ZH_ATTRIBUTE = SAMTag.makeBinaryTag("ZH"); + private static final short ZT_ATTRIBUTE = SAMTag.makeBinaryTag("ZT"); + private static final short OQ_ATTRIBUTE = SAMTag.makeBinaryTag("OQ"); + + private static final String[] EMPTY_STRING_ARRAY = {}; + + private final Log log = Log.getInstance(SamToFastqWithHeaders.class); + + protected int doWork() { + IOUtil.assertFileIsReadable(INPUT); + final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); + final Map firstSeenMates = new HashMap<>(); + final FastqWriterFactory factory = new FastqWriterFactory(); + factory.setCreateMd5(CREATE_MD5_FILE); + + initializeAdditionalWriters(); + final Map writers = generateWriters(reader.getFileHeader().getReadGroups(), + factory); + final Map> additionalWriters = generateAdditionalWriters(reader.getFileHeader().getReadGroups(), factory); + if (writers.isEmpty()) { + final String msgBase = INPUT + " does not contain Read Groups"; + final String msg = OUTPUT_PER_RG ? msgBase + ", consider not using the OUTPUT_PER_RG option" : msgBase; + throw new PicardException(msg); + } + + final ProgressLogger progress = new ProgressLogger(log); + + for (final SAMRecord currentRecord : reader) { + handleRecord(currentRecord, writers, additionalWriters, firstSeenMates); + progress.record(currentRecord); + } + + CloserUtil.close(reader); + + // Close all the fastq writers being careful to close each one only once! 
+ for (final FastqWriters writerMapping : new HashSet<>(writers.values())) { + writerMapping.closeAll(); + } + + // close all `additionalWriters` only once + final Set additionalWriterSet = new HashSet<>(); + additionalWriters.values().forEach(additionalWriterSet::addAll); + for (final FastqWriter fastqWriter : additionalWriterSet) { + fastqWriter.close(); + } + + if (!firstSeenMates.isEmpty()) { + SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.MATE_NOT_FOUND, + "Found " + firstSeenMates.size() + " unpaired mates", null), VALIDATION_STRINGENCY); + } + + return 0; + } + + /** + * Generates the writers for the given read groups or, if we are not emitting per-read-group, just returns the single set of writers. + */ + private Map generateWriters(List samReadGroupRecords, + FastqWriterFactory factory) { + + final Map writerMap = new HashMap<>(); + + final FastqWriters fastqWriters; + if (!OUTPUT_PER_RG) { + IOUtil.assertFileIsWritable(FASTQ); + final FastqWriter firstOfPairWriter = factory.newWriter(FASTQ); + + final FastqWriter secondOfPairWriter; + if (INTERLEAVE) { + secondOfPairWriter = firstOfPairWriter; + } else if (SECOND_END_FASTQ != null) { + IOUtil.assertFileIsWritable(SECOND_END_FASTQ); + secondOfPairWriter = factory.newWriter(SECOND_END_FASTQ); + } else { + secondOfPairWriter = null; + } + + /* Prepare the writer that will accept unpaired reads. If we're emitting a single fastq - and assuming single-ended reads - + * then this is simply that one fastq writer. Otherwise, if we're doing paired-end, we emit to a third new writer, since + * the other two fastqs are accepting only paired end reads. */ + final FastqWriter unpairedWriter = UNPAIRED_FASTQ == null ? firstOfPairWriter : factory.newWriter(UNPAIRED_FASTQ); + + fastqWriters = new FastqWriters(firstOfPairWriter, secondOfPairWriter, unpairedWriter); + + // For all read groups we may find in the sam, register this single set of writers for them. 
+ writerMap.put(null, fastqWriters); + for (final SAMReadGroupRecord rg : samReadGroupRecords) { + writerMap.put(rg, fastqWriters); + } + } else { + // When we're creating a fastq-group per readgroup, by convention we do not emit a special fastq for unpaired reads. + for (final SAMReadGroupRecord rg : samReadGroupRecords) { + final FastqWriter firstOfPairWriter = factory.newWriter(makeReadGroupFile(rg, "_1")); + // Create this writer on-the-fly; if we find no second-of-pair reads, don't bother making a writer (or delegating, + // if we're interleaving). + final Lazy lazySecondOfPairWriter = new Lazy<>(() -> INTERLEAVE ? firstOfPairWriter : factory.newWriter(makeReadGroupFile(rg, "_2"))); + + writerMap.put(rg, new FastqWriters(firstOfPairWriter, lazySecondOfPairWriter, firstOfPairWriter)); + } + } + return writerMap; + } + + protected void initializeAdditionalWriters() { + } + + protected Map> generateAdditionalWriters(List readGroups, + FastqWriterFactory factory) { + return Collections.emptyMap(); + } + + private void handleRecord(final SAMRecord currentRecord, final Map writers, + final Map> additionalWriters, + final Map firstSeenMates) { + if (currentRecord.isSecondaryOrSupplementary() && !INCLUDE_NON_PRIMARY_ALIGNMENTS) { + return; + } + + // Skip non-PF reads as necessary + if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) { + return; + } + + final FastqWriters fq = writers.get(currentRecord.getReadGroup()); + SAMRecord read1 = null; + SAMRecord read2 = null; + if (currentRecord.getReadPairedFlag()) { + final String currentReadName = currentRecord.getReadName(); + final SAMRecord firstRecord = firstSeenMates.remove(currentReadName); + if (firstRecord == null) { + firstSeenMates.put(currentReadName, currentRecord); + } else { + assertPairedMates(firstRecord, currentRecord); + + read1 = currentRecord.getFirstOfPairFlag() ? currentRecord : firstRecord; + read2 = currentRecord.getFirstOfPairFlag() ? 
firstRecord : currentRecord; + writeRecord(read1, 1, fq.getFirstOfPair(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); + final FastqWriter secondOfPairWriter = fq.getSecondOfPair(); + if (secondOfPairWriter == null) { + throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified."); + } + writeRecord(read2, 2, secondOfPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE); + } + } else { + writeRecord(currentRecord, null, fq.getUnpaired(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); + } + + handleAdditionalRecords(currentRecord, additionalWriters, read1, read2); + } + + protected void handleAdditionalRecords(SAMRecord currentRecord, Map> additionalWriters, SAMRecord read1, SAMRecord read2) { + } + + private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String preExtSuffix) { + String fileName = null; + if (RG_TAG.equalsIgnoreCase("PU")) { + fileName = readGroup.getPlatformUnit(); + } else if (RG_TAG.equalsIgnoreCase("ID")) { + fileName = readGroup.getReadGroupId(); + } + if (fileName == null) { + throw new PicardException("The selected RG_TAG: " + RG_TAG + " is not present in the header."); + } + fileName = IOUtil.makeFileNameSafe(fileName); + if (preExtSuffix != null) { + fileName += preExtSuffix; + } + fileName += COMPRESS_OUTPUTS_PER_RG ? ".fastq.gz" : ".fastq"; + + final File result = (OUTPUT_DIR != null) + ? 
new File(OUTPUT_DIR, fileName) + : new File(fileName); + IOUtil.assertFileIsWritable(result); + return result; + } + + public static int getPositionOfCaseChange(String s) { + if (null != s && s.length() > 0) { + boolean isLower = Character.isLowerCase(s.charAt(0)); + for (int i = 1, len = s.length(); i < len; i++) { + if (Character.isLowerCase(s.charAt(i)) != isLower) { + return i; + } + } + /* + If we are here, then there has been no case change in the string + Return 0 which will convey to the user that the case changed at the first character in the string + */ + return 0; + } + return -1; + } + + public static String[] getTrimmedBaseDetailsFromTag(String tag) { + if (null != tag && tag.length() > 0) { + String [] results = new String[4]; + int plusIndex = tag.indexOf('+'); + + String bases = tag.substring(0, plusIndex); + String quals = tag.substring(plusIndex + 1); + int caseChangeIndex = getPositionOfCaseChange(bases); + if (caseChangeIndex > 0) { + String bases1 = bases.substring(0, caseChangeIndex); + String bases2 = bases.substring(caseChangeIndex); + if (Character.isLowerCase(bases1.charAt(0))) { + results[0] = bases1; + results[1] = quals.substring(0, caseChangeIndex); + results[2] = bases2; + results[3] = quals.substring(caseChangeIndex); + } else { + results[0] = bases2; + results[1] = quals.substring(caseChangeIndex); + results[2] = bases1; + results[3] = quals.substring(0, caseChangeIndex); + } + } else { + boolean isLower = Character.isLowerCase(bases.charAt(0)); + results[0] = isLower? bases : ""; + results[1] = isLower ? quals : ""; + results[2] = isLower ? "" : bases; + results[3] = isLower ? 
"" : quals; + } + return results; + } + return EMPTY_STRING_ARRAY; + } + + + + public static FastqRecord getFastqRecordFromSamRecord(final SAMRecord read, final Integer mateNumber, final int basesToTrim, + final Integer maxBasesToWrite, final String clippingAttribute, + int clippingMinLength, final String clippingAction, boolean reReverse, final Integer quality) { + /* + get ZH tag and add to header + */ + String seqHeader = read.getReadName(); + String additionalHeader = (String)read.getAttribute(ZH_ATTRIBUTE); + String trimmedAdapterSequenceAndQual = (String)read.getAttribute(ZT_ATTRIBUTE); + if (null != additionalHeader && additionalHeader.length() > 0) { + seqHeader += additionalHeader; + } + + String readString = read.getReadString(); + String origBaseQuals = (String)read.getAttribute(OQ_ATTRIBUTE); + String baseQualities = (null != origBaseQuals && origBaseQuals.length() == readString.length()) ? origBaseQuals : read.getBaseQualityString(); + + + if (reReverse && read.getReadNegativeStrandFlag()) { + readString = SequenceUtil.reverseComplement(readString); + baseQualities = StringUtil.reverseString(baseQualities); + } + + if (null != trimmedAdapterSequenceAndQual && trimmedAdapterSequenceAndQual.length() > 0) { + String [] adapterInfo = getTrimmedBaseDetailsFromTag(trimmedAdapterSequenceAndQual); + if (null != adapterInfo && adapterInfo.length == 4) { + if (adapterInfo[0].length() > 0) { + readString = adapterInfo[0].toUpperCase() + readString; + baseQualities = adapterInfo[1] + baseQualities; + } + if (adapterInfo[2].length() > 0) { + readString = readString + adapterInfo[2].toUpperCase(); + baseQualities = baseQualities + adapterInfo[3]; + } + } + } + + // If we're clipping, do the right thing to the bases or qualities + if (clippingAttribute != null) { + Integer clipPoint = (Integer) read.getAttribute(clippingAttribute); + if (clipPoint != null && clipPoint < clippingMinLength) { + clipPoint = Math.min(readString.length(), clippingMinLength); + } + + if 
(clipPoint != null) { + if (clippingAction.equalsIgnoreCase(CLIP_TRIM)) { + readString = clip(readString, clipPoint, null, !read.getReadNegativeStrandFlag()); + baseQualities = clip(baseQualities, clipPoint, null, !read.getReadNegativeStrandFlag()); + } else if (clippingAction.equalsIgnoreCase(CLIP_TO_N)) { + readString = clip(readString, clipPoint, CLIP_TO_N.charAt(0), !read.getReadNegativeStrandFlag()); + } else { + final char newQual = SAMUtils.phredToFastq(new byte[]{(byte) Integer.parseInt(clippingAction)}).charAt(0); + baseQualities = clip(baseQualities, clipPoint, newQual, !read.getReadNegativeStrandFlag()); + } + } + } + + if (basesToTrim > 0) { + readString = readString.substring(basesToTrim); + baseQualities = baseQualities.substring(basesToTrim); + } + + // Perform quality trimming if desired, making sure to leave at least one base! + if (quality != null) { + final byte[] quals = SAMUtils.fastqToPhred(baseQualities); + final int qualityTrimIndex = Math.max(1, TrimmingUtil.findQualityTrimPoint(quals, quality)); + if (qualityTrimIndex < quals.length) { + readString = readString.substring(0, qualityTrimIndex); + baseQualities = baseQualities.substring(0, qualityTrimIndex); + } + } + + if (maxBasesToWrite != null && maxBasesToWrite < readString.length()) { + readString = readString.substring(0, maxBasesToWrite); + baseQualities = baseQualities.substring(0, maxBasesToWrite); + } + + return new FastqRecord(seqHeader, readString, "", baseQualities); + } + private void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer, + final int basesToTrim, final Integer maxBasesToWrite) { + + + writer.write(getFastqRecordFromSamRecord(read, mateNumber, basesToTrim, maxBasesToWrite, CLIPPING_ATTRIBUTE, CLIPPING_MIN_LENGTH, CLIPPING_ACTION, RE_REVERSE, QUALITY)); + } + + /** + * Utility method to handle the changes required to the base/quality strings by the clipping + * parameters. 
+ * + * @param src The string to clip + * @param point The 1-based position of the first clipped base in the read + * @param replacement If non-null, the character to replace in the clipped positions + * in the string (a quality score or 'N'). If null, just trim src + * @param posStrand Whether the read is on the positive strand + * @return String The clipped read or qualities + */ + private static String clip(final String src, final int point, final Character replacement, final boolean posStrand) { + final int len = src.length(); + StringBuilder result = new StringBuilder(posStrand ? src.substring(0, point - 1) : src.substring(len - point + 1)); + if (replacement != null) { + if (posStrand) { + for (int i = point; i <= len; i++) { + result.append(replacement); + } + } else { + for (int i = 0; i <= len - point; i++) { + result.insert(0, replacement); + } + } + } + return result.toString(); + } + + protected static void assertPairedMates(final SAMRecord record1, final SAMRecord record2) { + if (!(record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() || + record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag())) { + throw new PicardException("Illegal mate state: " + record1.getReadName()); + } + } + + /** + * Put any custom command-line validation in an override of this method. + * clp is initialized at this point and can be used to print usage and access argv. + * Any options set by command-line parser can be validated. + * + * @return null if command line is valid. If command line is invalid, returns an array of error + * messages to be written to the appropriate place. 
+ */ + protected String[] customCommandLineValidation() { + + List errors = new ArrayList<>(); + + if (INTERLEAVE && SECOND_END_FASTQ != null) { + errors.add("Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ"); + } + + if (UNPAIRED_FASTQ != null && SECOND_END_FASTQ == null) { + errors.add("UNPAIRED_FASTQ may only be set when also emitting read1 and read2 fastqs (so SECOND_END_FASTQ must also be set)."); + } + + if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) || + (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) { + errors.add("Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set."); + } + + if (CLIPPING_ACTION != null) { + if (!CLIPPING_ACTION.equals(CLIP_TO_N) && !CLIPPING_ACTION.equals(CLIP_TRIM)) { + try { + Integer.parseInt(CLIPPING_ACTION); + } catch (NumberFormatException nfe) { + errors.add("CLIPPING ACTION must be one of: N, X, or an integer"); + } + } + } + + if ((OUTPUT_PER_RG && OUTPUT_DIR == null) || ((!OUTPUT_PER_RG) && OUTPUT_DIR != null)) { + errors.add("If OUTPUT_PER_RG is true, then OUTPUT_DIR should be set. If "); + + } + + if (OUTPUT_PER_RG) { + if (RG_TAG == null) { + errors.add("If OUTPUT_PER_RG is true, then RG_TAG should be set."); + } else if (!(RG_TAG.equalsIgnoreCase("PU") || RG_TAG.equalsIgnoreCase("ID"))) { + errors.add("RG_TAG must be: PU or ID"); + } + } + + return errors.isEmpty() ? super.customCommandLineValidation() : errors.toArray(new String[0]); + } + + /** + * A collection of {@link htsjdk.samtools.fastq.FastqWriter}s for particular types of reads. + *

+ * Allows for lazy construction of the second-of-pair writer, since when we are in the "output per read group mode", we only wish to + * generate a second-of-pair fastq if we encounter a second-of-pair read. + */ + private static final class FastqWriters { + private final FastqWriter firstOfPair, unpaired; + private final Lazy secondOfPair; + + /** + * Constructor if the consumer wishes for the second-of-pair writer to be built on-the-fly. + */ + private FastqWriters(final FastqWriter firstOfPair, final Lazy secondOfPair, final FastqWriter unpaired) { + this.firstOfPair = firstOfPair; + this.unpaired = unpaired; + this.secondOfPair = secondOfPair; + } + + /** + * Simple constructor; all writers are pre-initialized.. + */ + private FastqWriters(final FastqWriter firstOfPair, final FastqWriter secondOfPair, final FastqWriter unpaired) { + this(firstOfPair, new Lazy<>(() -> secondOfPair), unpaired); + } + + private FastqWriter getFirstOfPair() { + return firstOfPair; + } + + private FastqWriter getSecondOfPair() { + return secondOfPair.get(); + } + + private FastqWriter getUnpaired() { + return unpaired; + } + + private void closeAll() { + final Set fastqWriters = new HashSet<>(); + fastqWriters.add(firstOfPair); + fastqWriters.add(unpaired); + // Make sure this is a no-op if the second writer was never fetched. 
+ if (secondOfPair.isInitialized()) { + fastqWriters.add(secondOfPair.get()); + } + for (final FastqWriter fastqWriter : fastqWriters) { + fastqWriter.close(); + } + } + } + + public static void main(final String[] argv) { + + + int exitStatus = new SamToFastqWithHeaders().instanceMain(argv); + + System.exit(exitStatus); + } +} \ No newline at end of file diff --git a/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java new file mode 100644 index 000000000..f50307017 --- /dev/null +++ b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java @@ -0,0 +1,56 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.util.FastqQualityFormat; +import org.junit.Test; + +import static htsjdk.samtools.SAMUtils.MAX_PHRED_SCORE; +import static org.junit.Assert.*; + +public class FastqToSamWithHeadersTest { + + @Test + public void createSAMRecordNoAdditionalHeader() { + FastqRecord fqRec = new FastqRecord("basename","ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + + samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, false, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertFalse(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + 
assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void createSAMRecordAdditionalHeaderNoTrimming() { + FastqRecord fqRec = new FastqRecord("basename 1.2.ACGTTGCA/1", "ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertEquals(" 1.2.ACGTTGCA/1", samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void createSAMRecordAdditionalHeader() { + FastqRecord fqRec = new FastqRecord("basename 1.2.ACGTTGCA/1 TB:aaaaAAAA+????????", "ACGT", "", "????"); + SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); + assertEquals("basename", samRec.getReadName()); + assertEquals("ACGT", samRec.getReadString()); + assertEquals("????", samRec.getBaseQualityString()); + assertTrue(samRec.getReadPairedFlag()); + assertEquals("aaaaAAAA+????????", samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertEquals(" 1.2.ACGTTGCA/1", samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } +} diff --git a/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java new file mode 100644 index 000000000..36f5220b2 --- /dev/null +++ b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java @@ -0,0 +1,131 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.fastq.FastqRecord; +import org.junit.Test; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static 
org.qcmg.qmule.SamToFastqWithHeaders.getTrimmedBaseDetailsFromTag; + +public class SamToFastqWithHeadersTest { + + @Test + public void getPositionOfCaseChange() { + assertEquals(-1, SamToFastqWithHeaders.getPositionOfCaseChange(null)); + assertEquals(-1, SamToFastqWithHeaders.getPositionOfCaseChange("")); + assertEquals(0, SamToFastqWithHeaders.getPositionOfCaseChange("ACBD")); + assertEquals(0, SamToFastqWithHeaders.getPositionOfCaseChange("abcd")); + assertEquals(4, SamToFastqWithHeaders.getPositionOfCaseChange("abcdABCD")); + assertEquals(4, SamToFastqWithHeaders.getPositionOfCaseChange("ABCDxyz")); + assertEquals(1, SamToFastqWithHeaders.getPositionOfCaseChange("xYZ")); + assertEquals(1, SamToFastqWithHeaders.getPositionOfCaseChange("Xyz")); + assertEquals(2, SamToFastqWithHeaders.getPositionOfCaseChange("xyZ")); + assertEquals(2, SamToFastqWithHeaders.getPositionOfCaseChange("XYz")); + } + + @Test + public void getTrimmedBasesFromTag() { + assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag(null)); + assertArrayEquals(new String[]{}, getTrimmedBaseDetailsFromTag("")); + assertArrayEquals(new String[]{"","","ABCD","!!!!"}, getTrimmedBaseDetailsFromTag("ABCD+!!!!")); + assertArrayEquals(new String[]{"abcd","!!!!","",""}, getTrimmedBaseDetailsFromTag("abcd+!!!!")); + assertArrayEquals(new String[]{"abcd","!!!!","XYZ","%%%"}, getTrimmedBaseDetailsFromTag("abcdXYZ+!!!!%%%")); + assertArrayEquals(new String[]{"xyz","%%%","ABCD","!!!!"}, getTrimmedBaseDetailsFromTag("ABCDxyz+!!!!%%%")); + } + + @Test + public void getFastqFromSam() { + SAMRecord record = new SAMRecord(null); + record.setReadName("ERR194147.1758538"); + + // first read + record.setReadBases("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT".getBytes()); + record.setBaseQualityString("???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setFlags(99); + record.setMappingQuality(60); + 
record.setCigarString("97M"); + record.setInferredInsertSize(330); + record.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MC", "3S98M"); + record.setAttribute("ZH", "/1"); + record.setAttribute("ZT", "GCGA+???'"); + + FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); + assertEquals("ERR194147.1758538/1", fq.getReadName()); + assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); + assertEquals("???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); + } + + @Test + public void getFastqFromSamOQ() { + SAMRecord record = new SAMRecord(null); + record.setReadName("ERR194147.1758538"); + + // first read + record.setReadBases("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT".getBytes()); + record.setBaseQualityString("???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setFlags(99); + record.setMappingQuality(60); + record.setCigarString("97M"); + record.setInferredInsertSize(330); + record.setAttribute("OQ", "55555??????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MC", "3S98M"); + record.setAttribute("ZH", "/1"); + record.setAttribute("ZT", "GCGA+???'"); + + FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); + assertEquals("ERR194147.1758538/1", fq.getReadName()); + assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); + 
assertEquals("55555??????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); + } + + @Test + public void getFastqFromSamReverse() { + SAMRecord record = new SAMRecord(null); + record.setReadName("ERR194147.421820475"); + + // first read + record.setReadBases("TTCTTTGGCCTAATGACATGGCTATTAGTGCACAAGGAAATGGTCAAAAATGGGAAGAAATGTAGGTCACAAAATATTGCACAAAGCTATACTTACTT".getBytes()); + record.setBaseQualityString("??????????????????????????????????????????????????????????????????????????????????????????????????"); + record.setFlags(83); + record.setMappingQuality(60); + record.setCigarString("98M"); + record.setInferredInsertSize(-323); + record.setAttribute("OQ", "??????????????????????????????????????????????????????????????????????????????????????????????????"); + record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MC", "3S98M"); + record.setAttribute("ZH", "/1"); + record.setAttribute("ZT", "CTG+???"); + + FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); + assertEquals("ERR194147.421820475/1", fq.getReadName()); + assertEquals("AAGTAAGTATAGCTTTGTGCAATATTTTGTGACCTACATTTCTTCCCATTTTTGACCATTTCCTTGTGCACTAATAGCCATGTCATTAGGCCAAAGAACTG", fq.getReadString()); + assertEquals("?????????????????????????????????????????????????????????????????????????????????????????????????????", fq.getBaseQualityString()); + } + @Test + public void getFastqFromSamDodgyTag() { + SAMRecord record = new SAMRecord(null); + record.setReadName("ERR194147.1758538"); + + // first read + record.setReadBases("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT".getBytes()); + record.setBaseQualityString("???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setFlags(99); + record.setMappingQuality(60); + record.setCigarString("97M"); + 
record.setInferredInsertSize(330); + record.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); + record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MC", "3S98M"); + record.setAttribute("ZH", "/1"); + record.setAttribute("ZT", "GCGA+???'"); + + FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); + assertEquals("ERR194147.1758538/1", fq.getReadName()); + assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); + assertEquals("???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); + } +} From 85e106cac680a50af33ab6e5a69b43aa6fc29942 Mon Sep 17 00:00:00 2001 From: Oliver Holmes Date: Mon, 4 Sep 2023 15:24:15 +1000 Subject: [PATCH 2/3] test(qmule): added tests for FastqToSam and SamToFastq with headers classes --- .../org/qcmg/qmule/FastqToSamWithHeaders.java | 7 +- .../org/qcmg/qmule/SamToFastqWithHeaders.java | 2 + .../qcmg/qmule/FastqToSamWithHeadersTest.java | 223 +++++++++- .../qcmg/qmule/SamToFastqWithHeadersTest.java | 397 +++++++++++++++++- 4 files changed, 609 insertions(+), 20 deletions(-) diff --git a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java index 1456c8092..6a597f3bb 100644 --- a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java +++ b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java @@ -197,9 +197,14 @@ public static void main(final String[] argv) { @Argument(doc="Allow (and ignore) empty lines") public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; + @Argument(doc="Allow empty input fastq") + public Boolean ALLOW_EMPTY_FASTQ = false; + public static final String ZT_ATTRIBUTE = "ZT"; public static final String ZH_ATTRIBUTE = "ZH"; + public static final String TRIMMED_BASES = " TB:"; + 
private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); /** @@ -417,7 +422,7 @@ public static SAMRecord createSamRecord(final SAMFileHeader header, final String /* If this contains the trimmed bases flag (TB:) then put that in a separate tag */ - int tbIndex = additionalHeader.indexOf(" TB:"); + int tbIndex = additionalHeader.indexOf(TRIMMED_BASES); if (tbIndex == -1) { srec.setAttribute(ZH_ATTRIBUTE, additionalHeader); } else { diff --git a/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java index 1c6dccf5a..95644f450 100644 --- a/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java +++ b/qmule/src/org/qcmg/qmule/SamToFastqWithHeaders.java @@ -402,6 +402,8 @@ public static FastqRecord getFastqRecordFromSamRecord(final SAMRecord read, fina String trimmedAdapterSequenceAndQual = (String)read.getAttribute(ZT_ATTRIBUTE); if (null != additionalHeader && additionalHeader.length() > 0) { seqHeader += additionalHeader; + } else if (null != mateNumber){ + seqHeader += "/" + mateNumber; } String readString = read.getReadString(); diff --git a/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java index f50307017..a19e74b63 100644 --- a/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java +++ b/qmule/test/org/qcmg/qmule/FastqToSamWithHeadersTest.java @@ -1,15 +1,56 @@ package org.qcmg.qmule; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.util.FastqQualityFormat; +import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import picard.PicardException; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import static htsjdk.samtools.SAMUtils.MAX_PHRED_SCORE; import static org.junit.Assert.*; public class FastqToSamWithHeadersTest { + + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + + private static final FastqToSamWithHeaders fastqToSam = new FastqToSamWithHeaders(); + private static File newTempFile(final String filename) throws IOException { + return newTempFile(filename, ".tmp"); + } + private static File newTempFile(final String filename, String suffix) throws IOException { + final File file = File.createTempFile(filename, suffix); + file.deleteOnExit(); + return file; + } + + private static FastqReader freader1; + private static FastqReader freader2; + + static { + try { + freader1 = new FastqReader(newTempFile("dummyFile")); + freader2 = new FastqReader(newTempFile("dummyFile")); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @Test public void createSAMRecordNoAdditionalHeader() { FastqRecord fqRec = new FastqRecord("basename","ACGT", "", "????"); @@ -32,25 +73,193 @@ public void createSAMRecordNoAdditionalHeader() { @Test public void createSAMRecordAdditionalHeaderNoTrimming() { - FastqRecord fqRec = new FastqRecord("basename 1.2.ACGTTGCA/1", "ACGT", "", "????"); + String basename = "basename"; + String pairing = "/1"; + String additionalHeader = " 1.2.ACGTTGCA"; + FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader, "ACGT", "", "????"); SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); - assertEquals("basename", samRec.getReadName()); + assertEquals(basename, samRec.getReadName()); assertEquals("ACGT", samRec.getReadString()); assertEquals("????", samRec.getBaseQualityString()); assertTrue(samRec.getReadPairedFlag()); assertNull(samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); - assertEquals(" 1.2.ACGTTGCA/1", 
samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + assertEquals(pairing + additionalHeader, samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); } @Test public void createSAMRecordAdditionalHeader() { - FastqRecord fqRec = new FastqRecord("basename 1.2.ACGTTGCA/1 TB:aaaaAAAA+????????", "ACGT", "", "????"); + String basename = "basename"; + String pairing = "/1"; + String additionalHeader = " 1.2.ACGTTGCA"; + String trimmedBases = "aaaaAAAA+????????"; + FastqRecord fqRec = new FastqRecord(basename + pairing + additionalHeader + " TB:" + trimmedBases, "ACGT", "", "????"); SAMRecord samRec = FastqToSamWithHeaders.createSamRecord(null, "basename", fqRec, true, "A", FastqQualityFormat.Standard, 0, MAX_PHRED_SCORE); - assertEquals("basename", samRec.getReadName()); + assertEquals(basename, samRec.getReadName()); assertEquals("ACGT", samRec.getReadString()); assertEquals("????", samRec.getBaseQualityString()); assertTrue(samRec.getReadPairedFlag()); - assertEquals("aaaaAAAA+????????", samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); - assertEquals(" 1.2.ACGTTGCA/1", samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + assertEquals(trimmedBases, samRec.getAttribute(FastqToSamWithHeaders.ZT_ATTRIBUTE)); + assertEquals(pairing + additionalHeader, samRec.getAttribute(FastqToSamWithHeaders.ZH_ATTRIBUTE)); + } + + @Test + public void readPairNameOk() { + assertEquals("aa", fastqToSam.getBaseName("aa/1", "aa/2" , freader1, freader2)); + assertEquals("aa", fastqToSam.getBaseName("aa", "aa", freader1, freader2)); + assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb", "aa/bb" , freader1, freader2)); + assertEquals("aa/bb/", fastqToSam.getBaseName("aa/bb/", "aa/bb/" , freader1, freader2)); + assertEquals("aa/bb", fastqToSam.getBaseName("aa/bb/1", "aa/bb/2" , freader1, freader2)); + assertEquals("aa/bb/cc/dd/ee/ff", fastqToSam.getBaseName("aa/bb/cc/dd/ee/ff/1", "aa/bb/cc/dd/ee/ff/2" , freader1, freader2)); + assertEquals("///", 
fastqToSam.getBaseName("////1", "////2" , freader1, freader2)); + assertEquals("/", fastqToSam.getBaseName("/", "/" , freader1, freader2)); + assertEquals("////", fastqToSam.getBaseName("////", "////", freader1, freader2)); + assertEquals("/aa", fastqToSam.getBaseName("/aa", "/aa" , freader1, freader2)); + assertEquals("aa/", fastqToSam.getBaseName("aa/", "aa/" , freader1, freader2)); + assertEquals("ab/c", fastqToSam.getBaseName("ab/c", "ab/c", freader1, freader2)); + } + + @Test + public void readPairNamesBad() { + try { + fastqToSam.getBaseName("", "" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "bb/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa", "bb" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "aa" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa", "aa/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/1", "aa/1" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + try { + fastqToSam.getBaseName("aa/2", "aa/2" , freader1, freader2); + Assert.fail("Should have thrown an exception"); + } catch (PicardException ignored) {} + } + + @Test + public void testSequentialFiles() throws Exception { + File singleEnd = testFolder.newFile("single_end_R1_001.fastq"); + File singleEnd2 = testFolder.newFile("single_end_R1_002.fastq"); + File pairedEnd1 = testFolder.newFile("paired_end_R1_001.fastq"); + File pairedEnd12 = testFolder.newFile("paired_end_R1_002.fastq"); + File pairedEnd2 = 
testFolder.newFile("paired_end_R2_001.fastq"); + File pairedEnd22 = testFolder.newFile("paired_end_R2_002.fastq"); + + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(singleEnd).size()); + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd1).size()); + assertEquals(2, FastqToSamWithHeaders.getSequentialFileList(pairedEnd2).size()); + + populateFile(Arrays.asList(singleEnd, singleEnd2, pairedEnd1, pairedEnd12, pairedEnd2, pairedEnd22), Arrays.asList("@FAKE0001 Original version has PHRED scores from 93 to 0 inclusive (in that order)", + "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG", + "+", + "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~")); + + File singleEnd1Output = testFolder.newFile("singleEnd1.sam"); + File singleEnd2Output = testFolder.newFile("singleEnd2.sam"); + File pairedEnd1Output = testFolder.newFile("pairedEnd1.sam"); + File pairedEnd2Output = testFolder.newFile("pairedEnd2.sam"); + convertFileAndVerifyRecordCount(1, singleEnd, null, singleEnd1Output, FastqQualityFormat.Illumina, true, false); + convertFileAndVerifyRecordCount(2, singleEnd, null, singleEnd2Output,FastqQualityFormat.Illumina, true, true); + convertFileAndVerifyRecordCount(2, pairedEnd1, pairedEnd2, pairedEnd1Output, FastqQualityFormat.Illumina, true, false); + convertFileAndVerifyRecordCount(4, pairedEnd1, pairedEnd2, pairedEnd2Output, FastqQualityFormat.Illumina, true, true); + } + + private void populateFile(List files, List data) { + for (File f : files) { + try (FileWriter fw = new FileWriter(f)) { + for (String s : data) { + fw.write(s + "\n"); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + @Test + public void testEmptyFastq() throws IOException { + final File emptyFastq = testFolder.newFile("empty.fastq"); + final File emptyFastqSam = testFolder.newFile("empty.fastq.sam"); + convertFile(emptyFastq, null, emptyFastqSam, FastqQualityFormat.Illumina, false, false, false, false); + } 
+ + private void convertFile(final File fastq1, + final File fastq2, + final File outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs, + final boolean allowEmptyFastq) throws IOException { + convertFile(fastq1, fastq2, outputFile, version, permissiveFormat, useSequentialFastqs, allowEmptyFastq, true); + } + private void convertFile(final File fastq1, + final File fastq2, + final File outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs, + final boolean allowEmptyFastq, + final boolean expectSuccess) throws IOException { + + final List args = new ArrayList<>(); + + args.add("FASTQ=" + fastq1.getAbsolutePath()); + args.add("OUTPUT=" + outputFile.getAbsolutePath()); + args.add("QUALITY_FORMAT=" + version); + args.add("READ_GROUP_NAME=rg"); + args.add("SAMPLE_NAME=s1"); + + if (fastq2 != null) args.add("FASTQ2=" + fastq2.getAbsolutePath()); + if (permissiveFormat) args.add("ALLOW_AND_IGNORE_EMPTY_LINES=true"); + if (useSequentialFastqs) args.add("USE_SEQUENTIAL_FASTQS=true"); + if (allowEmptyFastq) args.add("ALLOW_EMPTY_FASTQ=true"); + int exitStatus = 1; + try { + exitStatus = new FastqToSamWithHeaders().instanceMain(args.toArray(new String[]{})); + if ( ! expectSuccess) { + Assert.fail("Should have thrown a PicardException"); + } + } catch (Exception ignored) {} + assertEquals(expectSuccess ? 
0 : 1, exitStatus); + } + + private void convertFileAndVerifyRecordCount(final int expectedCount, + final File fastqFilename1, + final File fastqFilename2, + final File outputFile, + final FastqQualityFormat version, + final boolean permissiveFormat, + final boolean useSequentialFastqs) throws IOException { + + convertFile(fastqFilename1, fastqFilename2, outputFile, version, permissiveFormat, useSequentialFastqs, false); + final SamReader samReader = SamReaderFactory.makeDefault().open(outputFile); + final SAMRecordIterator iterator = samReader.iterator(); + int actualCount = 0; + while (iterator.hasNext()) { + iterator.next(); + actualCount++; + } + samReader.close(); + Assert.assertEquals(expectedCount, actualCount); + } + + @Test + public void runWithNoArgs() { + int exitStatus = new FastqToSamWithHeaders().instanceMain(new String[]{}); + assertEquals(1, exitStatus); } } diff --git a/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java index 36f5220b2..40c6f5c31 100644 --- a/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java +++ b/qmule/test/org/qcmg/qmule/SamToFastqWithHeadersTest.java @@ -1,8 +1,27 @@ package org.qcmg.qmule; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.util.IOUtil; +import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import picard.PicardException; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.math.BigInteger; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.*; +import java.util.stream.Collectors; import static org.junit.Assert.assertArrayEquals; import static 
org.junit.Assert.assertEquals; @@ -10,6 +29,9 @@ public class SamToFastqWithHeadersTest { + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + @Test public void getPositionOfCaseChange() { assertEquals(-1, SamToFastqWithHeaders.getPositionOfCaseChange(null)); @@ -47,13 +69,13 @@ public void getFastqFromSam() { record.setCigarString("97M"); record.setInferredInsertSize(330); record.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); - record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MQ", 60); record.setAttribute("MC", "3S98M"); - record.setAttribute("ZH", "/1"); + record.setAttribute("ZH", "/1 ACGTACGT"); record.setAttribute("ZT", "GCGA+???'"); FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); - assertEquals("ERR194147.1758538/1", fq.getReadName()); + assertEquals("ERR194147.1758538/1 ACGTACGT", fq.getReadName()); assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); assertEquals("???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); } @@ -71,13 +93,13 @@ public void getFastqFromSamOQ() { record.setCigarString("97M"); record.setInferredInsertSize(330); record.setAttribute("OQ", "55555??????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); - record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MQ", 60); record.setAttribute("MC", "3S98M"); - record.setAttribute("ZH", "/1"); + record.setAttribute("ZH", "/1 ACGTACGT"); record.setAttribute("ZT", "GCGA+???'"); FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); - assertEquals("ERR194147.1758538/1", fq.getReadName()); + assertEquals("ERR194147.1758538/1 ACGTACGT", fq.getReadName()); 
assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); assertEquals("55555??????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); } @@ -95,13 +117,13 @@ public void getFastqFromSamReverse() { record.setCigarString("98M"); record.setInferredInsertSize(-323); record.setAttribute("OQ", "??????????????????????????????????????????????????????????????????????????????????????????????????"); - record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MQ", 60); record.setAttribute("MC", "3S98M"); - record.setAttribute("ZH", "/1"); + record.setAttribute("ZH", "/1 XYZ.123.25"); record.setAttribute("ZT", "CTG+???"); FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); - assertEquals("ERR194147.421820475/1", fq.getReadName()); + assertEquals("ERR194147.421820475/1 XYZ.123.25", fq.getReadName()); assertEquals("AAGTAAGTATAGCTTTGTGCAATATTTTGTGACCTACATTTCTTCCCATTTTTGACCATTTCCTTGTGCACTAATAGCCATGTCATTAGGCCAAAGAACTG", fq.getReadString()); assertEquals("?????????????????????????????????????????????????????????????????????????????????????????????????????", fq.getBaseQualityString()); } @@ -118,14 +140,365 @@ public void getFastqFromSamDodgyTag() { record.setCigarString("97M"); record.setInferredInsertSize(330); record.setAttribute("OQ", "???????????????????5???????????????????????????????????????&5?????????+??55??????????????????????"); - record.setAttribute("MQ", (Integer) 60); + record.setAttribute("MQ", 60); record.setAttribute("MC", "3S98M"); - record.setAttribute("ZH", "/1"); + record.setAttribute("ZH", "/1 foo bar"); record.setAttribute("ZT", "GCGA+???'"); FastqRecord fq = SamToFastqWithHeaders.getFastqRecordFromSamRecord(record, 1, 0, null, null, 0, null, true, null); - assertEquals("ERR194147.1758538/1", fq.getReadName()); + 
assertEquals("ERR194147.1758538/1 foo bar", fq.getReadName()); assertEquals("AAATGAGGGAAGAAAAGAGTTAAATGCATGTTGATTCCAAGCCCCCGCCTGCCGGGGGGACAGCGGGAGGTTGGAGCACGCAGCCCTGGTGCCTGGT" + "GCGA", fq.getReadString()); assertEquals("???????????????????5???????????????????????????????????????&5?????????+??55?????????????????????????'", fq.getBaseQualityString()); } + + @Test + public void testMissingRgFileOutputPerRg() throws IOException { + File inputSam = testFolder.newFile("testMissingRgFileOutputPerRg.sam"); + File outputDir = testFolder.newFolder(); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, false); + } + + @Test + public void groupedUnpairedMate() throws IOException { + File inputSam = testFolder.newFile("groupedUnpairedMate.sam"); + File outputDir = testFolder.newFolder(); + + populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:unsorted", + "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA", + "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA", + "foo:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1", + "foo:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1", + "foo:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1", + "foo:record:3\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1", + "bar:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg2", + "bar:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg2")); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, false); + + populateSamFile(inputSam, Arrays.asList("@HD\tVN:1.0\tSO:unsorted", + "@RG\tID:rg1\tSM:s1\tPU:rg1\tPL:ILLUMINA", + "@RG\tID:rg2\tSM:s2\tPU:rg2\tPL:ILLUMINA", + "foo:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1", + 
"foo:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1", + "foo:record:2\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg1", + "foo:record:2\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg1", + "bar:record:1\t77\t*\t0\t0\t*\t*\t0\t0\tAAAAAAAAAAAAA\t1111111111111\tRG:Z:rg2", + "bar:record:1\t141\t*\t0\t0\t*\t*\t0\t0\tCCCCCCCCCCCCC\t2222222222222\tRG:Z:rg2")); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, true); + File[] fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith("rg1")); + assert fastqFiles != null; + assertEquals(2, fastqFiles.length); + boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq"); + verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam, "rg1"); + + fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith("rg2")); + assert fastqFiles != null; + assertEquals(2, fastqFiles.length); + arraySorted = fastqFiles[0].getName().endsWith("1.fastq"); + verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? 
fastqFiles[1] : fastqFiles[0], inputSam, "rg2"); + + File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq"); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "FASTQ=" + outputFastq.getAbsolutePath(), + "INTERLEAVE=true" + }, true); + final Set outputHeaderSet = createFastqReadHeaderSet(outputFastq); + // Create map of mate pairs from SAM records + final Map map = createSamMatePairsMap(inputSam, null) ; + Assert.assertEquals(map.size() * 2, outputHeaderSet.size()); + + // Ensure that each mate of each pair in SAM file is in the correct fastq pair file + for (final Map.Entry entry : map.entrySet() ) { + final MatePair mpair = entry.getValue(); + Assert.assertNotNull(mpair.mate1); // ensure we have two mates + Assert.assertNotNull(mpair.mate2); + Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName()); + final String readName = mpair.mate1.getReadName() ; + Assert.assertTrue(outputHeaderSet.contains(readName + "/1")); // ensure mate is in correct file + Assert.assertTrue(outputHeaderSet.contains(readName + "/2")); + } + } + + @Test + public void firstMateAtStartLastMateAtEnd() throws IOException { + File inputSam = testFolder.newFile("groupedUnpairedMate.sam"); + File outputDir = testFolder.newFolder(); + populateSamFile(inputSam, Arrays.asList("@HD VN:1.0 SO:unsorted", + "@RG ID:rg1 SM:s1 PU:blah PL:ILLUMINA", + "bar:record:1 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1", + "bar:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1", + "bar:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1", + "bar:record:3 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1", + "bar:record:3 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1", + "bar:record:4 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1", + "bar:record:4 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1", + "bar:record:5 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1", + "bar:record:5 141 * 0 0 * * 0 0 
CCCCCCCCCCCCC 2222222222222 RG:Z:rg1", + "bar:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1")); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, true); + File[] fastqFiles = outputDir.listFiles((dir, file) -> file.endsWith(".fastq")); + assert fastqFiles != null; + assertEquals(2, fastqFiles.length); + boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq"); + verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam); + + File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq"); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "FASTQ=" + outputFastq.getAbsolutePath(), + "INTERLEAVE=true" + }, true); + final Set outputHeaderSet = createFastqReadHeaderSet(outputFastq); + // Create map of mate pairs from SAM records + final Map map = createSamMatePairsMap(inputSam, null) ; + Assert.assertEquals(map.size() * 2, outputHeaderSet.size()); + + // Ensure that each mate of each pair in SAM file is in the correct fastq pair file + for (final Map.Entry entry : map.entrySet() ) { + final MatePair mpair = entry.getValue(); + Assert.assertNotNull(mpair.mate1); // ensure we have two mates + Assert.assertNotNull(mpair.mate2); + Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName()); + final String readName = mpair.mate1.getReadName() ; + Assert.assertTrue(outputHeaderSet.contains(readName + "/1")); // ensure mate is in correct file + Assert.assertTrue(outputHeaderSet.contains(readName + "/2")); + } + } + + @Test + public void trimmedHeaders() throws IOException { + File inputOrigSam = testFolder.newFile("trimmedHeadersOrig.sam"); + File inputSam = testFolder.newFile("trimmedHeaders.sam"); + File outputOrigDir = testFolder.newFolder(); + File outputDir = testFolder.newFolder(); + populateSamFile(inputOrigSam, Arrays.asList("@HD VN:1.0 SO:queryname", + 
"@RG ID:rg1 SM:s1 PU:rg1 PL:ILLUMINA", + "@RG ID:rg2 SM:s2 PU:rg2 PL:ILLUMINA", + "foo:record:1 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:3 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:3 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "bar:record:1 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111")); + populateSamFile(inputSam, Arrays.asList("@HD VN:1.0 SO:queryname", + "@RG ID:rg1 SM:s1 PU:rg1 PL:ILLUMINA", + "@RG ID:rg2 SM:s2 PU:rg2 PL:ILLUMINA", + "foo:record:1 77 * 0 0 * * 0 0 AAAAAAAAA 111111111 RG:Z:rg1 ZT:Z:AAAA+1111\tCR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:1 141 * 0 0 * * 0 0 CCCCCCCCC 222222222 RG:Z:rg1 CR:Z:AAAAA ZT:Z:cccc+2222\tUR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222\tZH:Z::3/1", + "foo:record 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 
2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT\tZH:Z::3/2 CY:Z:11111 UY:Z:222", + "bar:record:1 77 * 0 0 * * 0 0 AAAAAAAAA 111111111 RG:Z:rg2 CR:Z:CCCCC ZT:Z:aaaa+1111\tUR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:2 141 * 0 0 * * 0 0 CCCCCCCCCC 2222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 ZT:Z:CCC+222\tUY:Z:111", + "bar:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111")); + convertFile(new String[]{ + "INPUT=" + inputOrigSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputOrigDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, true); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, true); + File[] fastqOrigFiles = outputOrigDir.listFiles((dir, file) -> file.endsWith(".fastq")); + File[] fastqFiles = outputDir.listFiles((dir, file) -> file.endsWith(".fastq")); + + /* + loop through each one, calculating md5, then compare for equality + */ + assert fastqOrigFiles != null; + Map mapOrig = Arrays.stream(fastqOrigFiles).collect(Collectors.toMap(File::getName, f -> { + try { + return new BigInteger(1, MessageDigest.getInstance("MD5").digest(Files.readAllBytes(Paths.get(f.getPath())))).toString(16); + } catch (NoSuchAlgorithmException | IOException e) { + throw new RuntimeException(e); + } + })); + assert fastqFiles != null; + Map map = Arrays.stream(fastqFiles).collect(Collectors.toMap(File::getName, f -> { + try { + return new BigInteger(1, MessageDigest.getInstance("MD5").digest(Files.readAllBytes(Paths.get(f.getPath())))).toString(16); + } catch (NoSuchAlgorithmException | IOException e) { + throw new RuntimeException(e); + } + })); + + for (Map.Entry entry : mapOrig.entrySet()) { + String md5 = map.get(entry.getKey()); + assertEquals(md5, entry.getValue()); + } + } + + @Test + public void 
groupedLastPairMatesFlipped() throws IOException { + File inputSam = testFolder.newFile("groupedUnpairedMate.sam"); + File outputDir = testFolder.newFolder(); + populateSamFile(inputSam, Arrays.asList("@HD VN:1.0 SO:queryname", + "@RG ID:rg1 SM:s1 PU:rg1 PL:ILLUMINA", + "@RG ID:rg2 SM:s2 PU:rg2 PL:ILLUMINA", + "foo:record:1 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:3 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "foo:record:3 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1 CR:Z:AAAAA UR:Z:TTT CY:Z:11111 UY:Z:222", + "bar:record:1 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:1 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:2 141 * 0 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111", + "bar:record:2 77 * 0 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2 CR:Z:CCCCC UR:Z:GGG CY:Z:22222 UY:Z:111")); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "OUTPUT_DIR=" + outputDir.getAbsolutePath() + "/", + "OUTPUT_PER_RG=true" + }, true); + File[] fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith("rg1")); + assert fastqFiles != null; + assertEquals(2, fastqFiles.length); + boolean arraySorted = fastqFiles[0].getName().endsWith("1.fastq"); + verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? 
fastqFiles[1] : fastqFiles[0], inputSam, "rg1"); + + fastqFiles = outputDir.listFiles((dir, file) -> file.startsWith("rg2")); + assert fastqFiles != null; + assertEquals(2, fastqFiles.length); + arraySorted = fastqFiles[0].getName().endsWith("1.fastq"); + verifyFastq(arraySorted ? fastqFiles[0] : fastqFiles[1], arraySorted ? fastqFiles[1] : fastqFiles[0], inputSam, "rg2"); + + File outputFastq = testFolder.newFile("groupedUnpairedMate.fastq"); + convertFile(new String[]{ + "INPUT=" + inputSam.getAbsolutePath(), + "FASTQ=" + outputFastq.getAbsolutePath(), + "INTERLEAVE=true" + }, true); + final Set outputHeaderSet = createFastqReadHeaderSet(outputFastq); + // Create map of mate pairs from SAM records + final Map map = createSamMatePairsMap(inputSam, null) ; + Assert.assertEquals(map.size() * 2, outputHeaderSet.size()); + + // Ensure that each mate of each pair in SAM file is in the correct fastq pair file + for (final Map.Entry entry : map.entrySet() ) { + final MatePair mpair = entry.getValue(); + Assert.assertNotNull(mpair.mate1); // ensure we have two mates + Assert.assertNotNull(mpair.mate2); + Assert.assertEquals(mpair.mate1.getReadName(),mpair.mate2.getReadName()); + final String readName = mpair.mate1.getReadName() ; + Assert.assertTrue(outputHeaderSet.contains(readName + "/1")); // ensure mate is in correct file + Assert.assertTrue(outputHeaderSet.contains(readName + "/2")); + } + }
+
+    /**
+     * Runs {@link SamToFastqWithHeaders} with the given command-line arguments and asserts on its exit status.
+     *
+     * @param args          arguments passed to {@code instanceMain}
+     * @param expectSuccess {@code true} if exit status 0 is expected; {@code false} if the call is expected
+     *                      to throw, in which case the status stays at its initial value of 1
+     */
+    private void convertFile(final String[] args, boolean expectSuccess) {
+        // Start from the failure status so that an exception thrown by instanceMain counts as a failed run.
+        int exitStatus = 1;
+        try {
+            exitStatus = new SamToFastqWithHeaders().instanceMain(args);
+            if (!expectSuccess) {
+                Assert.fail("Should have thrown a PicardException");
+            }
+        } catch (Exception e) {
+            // Deliberately not rethrown: the status assertion below decides whether the test fails.
+            System.out.println("ignored exception: " + e);
+        }
+        assertEquals(expectSuccess ? 0 : 1, exitStatus);
+    }
+
+    /**
+     * Writes every entry of {@code data} to {@code sam}, each followed by a single {@code '\n'}.
+     *
+     * @param sam  file to (over)write
+     * @param data lines to write, in order
+     * @throws RuntimeException wrapping any {@link IOException} raised while writing
+     */
+    private void populateSamFile(File sam, List<String> data) {
+        try (FileWriter fw = new FileWriter(sam)) {
+            for (String s : data) {
+                fw.write(s + "\n");
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Verifies a pair of fastq files against every record of the SAM file (no read group filter).
+     */
+    private void verifyFastq(final File pair1File, final File pair2File, final File samFile) throws IOException {
+        verifyFastq(pair1File, pair2File, samFile, null);
+    }
+
+    /**
+     * Verifies that two fastq files together hold both mates of every pair found in the SAM file.
+     *
+     * @param pair1File fastq expected to contain the {@code /1} read headers
+     * @param pair2File fastq expected to contain the {@code /2} read headers
+     * @param samFile   SAM file the fastq files were produced from
+     * @param readGroup read group id used to restrict the SAM records, or {@code null} for all records
+     * @throws IOException if the SAM file cannot be read
+     */
+    private void verifyFastq(final File pair1File, final File pair2File, final File samFile, String readGroup) throws IOException {
+        // Check that paired fastq files are same size
+        final Set<String> outputHeaderSet1 = createFastqReadHeaderSet(pair1File);
+        final Set<String> outputHeaderSet2 = createFastqReadHeaderSet(pair2File);
+        Assert.assertEquals(outputHeaderSet1.size(), outputHeaderSet2.size());
+
+        // Create map of mate pairs from SAM records
+        final Map<String, MatePair> map = createSamMatePairsMap(samFile, readGroup);
+        Assert.assertEquals(map.size(), outputHeaderSet2.size());
+
+        // Ensure that each mate of each pair in SAM file is in the correct fastq pair file
+        for (final Map.Entry<String, MatePair> entry : map.entrySet()) {
+            final MatePair mpair = entry.getValue();
+            Assert.assertNotNull(mpair.mate1); // ensure we have two mates
+            Assert.assertNotNull(mpair.mate2);
+            Assert.assertEquals(mpair.mate1.getReadName(), mpair.mate2.getReadName());
+            final String readName = mpair.mate1.getReadName();
+            Assert.assertTrue(outputHeaderSet1.contains(readName + "/1")); // ensure mate is in correct file
+            Assert.assertTrue(outputHeaderSet2.contains(readName + "/2"));
+        }
+    }
+
+    /**
+     * Groups the records of a SAM file into mate pairs keyed by read name, in order of first appearance.
+     *
+     * @param samFile   SAM file to read; must be readable
+     * @param readGroup read group id to keep, or {@code null} to keep every record
+     * @return insertion-ordered map from read name to its {@link MatePair}
+     * @throws IOException if the reader cannot be closed
+     */
+    private Map<String, MatePair> createSamMatePairsMap(final File samFile, final String readGroup) throws IOException {
+        IOUtil.assertFileIsReadable(samFile);
+        final Map<String, MatePair> map = new LinkedHashMap<>();
+        // try-with-resources so the reader is released even when MatePair.add rejects a record
+        try (SamReader reader = SamReaderFactory.makeDefault().open(samFile)) {
+            for (final SAMRecord record : reader) {
+                // A record without a resolvable read group cannot match a filter; skip it rather than
+                // dereferencing a null read group.
+                final boolean wanted = null == readGroup
+                        || (record.getReadGroup() != null
+                                && readGroup.equals(record.getReadGroup().getReadGroupId()));
+                if (wanted) {
+                    map.computeIfAbsent(record.getReadName(), name -> new MatePair()).add(record);
+                }
+            }
+        }
+        return map;
+    }
+
+    /**
+     * Collects the read name of every record in a fastq file.
+     *
+     * @param file fastq file to read
+     * @return set of the read names found (duplicates collapse)
+     */
+    protected static Set<String> createFastqReadHeaderSet(final File file) {
+        final Set<String> set = new HashSet<>();
+        try (final FastqReader freader = new FastqReader(file)) {
+            while (freader.hasNext()) {
+                set.add(freader.next().getReadName());
+            }
+        }
+        return set;
+    }
+
+    /**
+     * Holder for the two mates of a read pair; each slot may be filled exactly once.
+     */
+    static class MatePair {
+        SAMRecord mate1;
+        SAMRecord mate2;
+
+        /**
+         * Stores {@code record} in the slot selected by its first/second-of-pair flag.
+         *
+         * @throws PicardException if the record is not paired, carries neither pair flag,
+         *                         or its slot is already occupied
+         */
+        void add(final SAMRecord record) {
+            if (!record.getReadPairedFlag()) {
+                throw new PicardException("Record "+record.getReadName()+" is not paired");
+            }
+            if (record.getFirstOfPairFlag()) {
+                if (mate1 != null) {
+                    throw new PicardException("Mate 1 already set for record: "+record.getReadName());
+                }
+                mate1 = record;
+            } else if (record.getSecondOfPairFlag()) {
+                if (mate2 != null) {
+                    throw new PicardException("Mate 2 already set for record: "+record.getReadName());
+                }
+                mate2 = record;
+            } else {
+                throw new PicardException("Neither FirstOfPairFlag or SecondOfPairFlag is set for a paired record");
+            }
+        }
+    }
} From d0e8e586723608bef830309b18d7c4426c000c07 Mon Sep 17 00:00:00 2001 From: Oliver Holmes Date: Mon, 4 Sep 2023 15:42:14 +1000 Subject: [PATCH 3/3] refactor(qmule): removed unused option --- qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java index 6a597f3bb..96abbf20f 100644 --- a/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java +++ b/qmule/src/org/qcmg/qmule/FastqToSamWithHeaders.java @@ -197,9 +197,6 @@ public static void main(final String[] argv) { @Argument(doc="Allow (and ignore) empty lines") public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; - @Argument(doc="Allow empty input fastq") - public Boolean ALLOW_EMPTY_FASTQ = false; - public static final String ZT_ATTRIBUTE = "ZT"; public static final
String ZH_ATTRIBUTE = "ZH";