Skip to content

create universal reader and writer #209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 25, 2020
146 changes: 146 additions & 0 deletions qio/src/org/qcmg/qio/record/RecordReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/**
* © Copyright The University of Queensland 2010-2014.
* © Copyright QIMR Berghofer Medical Research Institute 2014-2016.
*
* This code is released under the terms outlined in the included LICENSE file.
*/

package org.qcmg.qio.record;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.qcmg.common.util.FileUtils;


/**
 * Base class for readers that turn a plain or gzip-compressed text file into a sequence of records.
 *
 * <p>On construction the reader opens the file, collects the leading header lines (lines that start
 * with the supplied header prefix) and parses the first record, so {@link #getHeader()} is usable
 * immediately. Records are then obtained by iterating over the reader.
 *
 * <p>Every iterator returned by {@link #iterator()} shares this reader's single underlying stream and
 * its look-ahead record; the file can only be traversed once per reader instance.
 *
 * <p>NOTE: the constructor invokes the overridable methods
 * {@link #readHeaderAndReturnFirstNonHeaderLine(CharSequence)} and {@link #getRecord(String)}, so
 * subclass overrides run before the subclass's own field initialisers and constructor body.
 *
 * @param <T> the record type produced by {@link #getRecord(String)}
 */
public abstract class RecordReader<T> implements Closeable, Iterable<T> {
    public static final int DEFAULT_BUFFER_SIZE = 65536;
    public static final String DEFAULT_HEADER_PREFIX = null; //no header line
    public static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    protected final File file;
    //created exactly once, in the constructor
    protected final BufferedReader bin;
    //look-ahead record: the value the next call to Iterator.next() will return; null when exhausted
    protected T next;

    protected List<String> headerLines = new ArrayList<>();

    /**
     * Opens {@code file} with the default buffer size, no header prefix and the default charset.
     *
     * @param file the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public RecordReader(final File file) throws IOException {
        this(file, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Opens {@code file} with the given buffer size, no header prefix and the default charset.
     *
     * @param file the file to read
     * @param bufferSize size of the read buffer, passed to {@link BufferedReader}
     * @throws IOException if the file cannot be opened or read
     */
    public RecordReader(final File file, int bufferSize) throws IOException {
        this(file, bufferSize, DEFAULT_HEADER_PREFIX, DEFAULT_CHARSET);
    }

    /**
     * Opens {@code file} with the default buffer size and charset, treating leading lines that start
     * with {@code headerPrefix} as header.
     *
     * @param file the file to read
     * @param headerPrefix prefix that marks a header line; {@code null} means the file has no header
     * @throws IOException if the file cannot be opened or read
     */
    public RecordReader(final File file, CharSequence headerPrefix) throws IOException {
        this(file, DEFAULT_BUFFER_SIZE, headerPrefix, DEFAULT_CHARSET);
    }

    /**
     * Opens {@code file}, reads its header and parses the first record.
     *
     * <p>If anything fails after the underlying stream has been opened, the stream is closed before
     * the exception is propagated, because the caller never receives an instance it could close.
     *
     * @param file the file to read; gzip input is detected via {@code FileUtils.isInputGZip}
     * @param bufferSize size of the read buffer, passed to {@link BufferedReader}
     * @param headerPrefix prefix that marks a header line; {@code null} means the file has no header
     * @param charset charset used to decode the file
     * @throws IOException if the file cannot be opened or read
     */
    public RecordReader(final File file, int bufferSize, CharSequence headerPrefix, Charset charset) throws IOException {
        this.file = file;
        this.bin = openReader(file, bufferSize, charset);

        try {
            String nextLine = readHeaderAndReturnFirstNonHeaderLine(headerPrefix);
            //get first record, set to null for empty file
            next = nextLine == null ? null : getRecord(nextLine);
        } catch (IOException | RuntimeException e) {
            closeAfterFailure(bin, e);
            throw e;
        }
    }

    /**
     * Builds the buffered reader over {@code file}, wrapping it in a gzip decoder when required.
     * The raw file stream is closed again if any of the wrapping steps fails.
     */
    private static BufferedReader openReader(File file, int bufferSize, Charset charset) throws IOException {
        boolean isGzip = FileUtils.isInputGZip(file);
        InputStream rawStream = new FileInputStream(file);
        try {
            InputStream inputStream = isGzip ? new GZIPInputStream(rawStream) : rawStream;
            return new BufferedReader(new InputStreamReader(inputStream, charset), bufferSize);
        } catch (IOException | RuntimeException e) {
            closeAfterFailure(rawStream, e);
            throw e;
        }
    }

    /**
     * Closes {@code resource} while a failure is being propagated; a secondary close failure is
     * attached to {@code failure} as a suppressed exception instead of replacing it.
     */
    private static void closeAfterFailure(Closeable resource, Exception failure) {
        try {
            resource.close();
        } catch (IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
    }

    /**
     * Consumes the header lines and returns the first line after them. Header lines are appended to
     * {@link #headerLines}.
     *
     * <p>This method is overridable in subclasses, e.g. for formats with a different header pattern.
     * It is called from the constructor.
     *
     * @param headerPrefix prefix that marks a header line; when {@code null} no line is treated as
     *        header and the first line of the file is returned
     * @return the first line after the header, or {@code null} if the end of the file was reached
     * @throws IOException if reading from the file fails
     */
    public String readHeaderAndReturnFirstNonHeaderLine(CharSequence headerPrefix) throws IOException {
        String nextLine = bin.readLine();

        //no prefix configured: keep the header empty and return the first line as is
        if (headerPrefix == null) {
            return nextLine;
        }

        //convert once instead of on every loop iteration
        final String prefix = headerPrefix.toString();

        //read the header, leaving the reader positioned on the line after the returned one
        while (nextLine != null && nextLine.startsWith(prefix)) {
            headerLines.add(nextLine);
            nextLine = bin.readLine();
        }

        return nextLine;
    }

    /**
     * Returns the header lines collected while the reader was constructed. The header is held in a
     * {@link List}, so it cannot hold more than {@link Integer#MAX_VALUE} lines.
     *
     * @return the live list of header lines (not a copy); empty if the file has no header
     */
    public List<String> getHeader() {
        return headerLines;
    }

    /**
     * Closes the underlying {@link BufferedReader}, which in turn closes the wrapped stream reader and
     * file stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        bin.close();
    }

    /**
     * @return the file this reader was opened on
     */
    public File getFile() {
        return file;
    }

    /**
     * Returns an iterator over the remaining records. The iterator works on this reader's shared
     * look-ahead record and stream, so it does not restart from the beginning of the file.
     *
     * <p>{@code next()} returns the stored look-ahead record even when it is {@code null} (i.e. when
     * {@code hasNext()} is false), and wraps any {@link IOException} in an
     * {@link UncheckedIOException}.
     */
    @Override
    public Iterator<T> iterator() {
        return new Iterator<T>() {
            @Override
            public boolean hasNext() {
                return next != null;
            }

            @Override
            public T next() {
                T current = next;
                //clear first so that a failing read does not hand out the same record repeatedly
                next = null;

                try {
                    //fetch the following record; getRecord may consume further lines itself
                    String line = bin.readLine();
                    if (line != null) {
                        next = getRecord(line);
                    }
                    return current;
                } catch (IOException e) {
                    //only I/O failures are translated; other exceptions propagate unchanged
                    throw new UncheckedIOException(e);
                }
            }
        };
    }

    /**
     * Converts a line of the file into a record. Records that span several lines (e.g. id line
     * followed by a sequence line) may read the additional lines from {@link #bin} inside this method.
     *
     * @param line the first line of the record
     * @return the parsed record
     */
    public abstract T getRecord(String line);

}
69 changes: 69 additions & 0 deletions qio/src/org/qcmg/qio/record/RecordWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* © Copyright The University of Queensland 2010-2014. This code is released under the terms outlined in the included LICENSE file.
*/

package org.qcmg.qio.record;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import org.qcmg.common.util.Constants;
import org.qcmg.common.util.FileUtils;

/**
 * Writes header lines and records to a text file, one per line. The output is gzip-compressed when
 * {@code FileUtils.isFileNameGZip} reports a gzip file name. Text is encoded as UTF-8, matching the
 * default charset of {@code RecordReader}.
 *
 * @param <T> the record type; records are written using their {@code toString()} representation
 */
public class RecordWriter<T> implements Closeable {
    private final File file;
    private final BufferedWriter bos;

    /**
     * Opens {@code file} for writing.
     *
     * @param file the file to write to
     * @param append {@code true} to append to an existing file, {@code false} to overwrite it
     * @throws IOException if the file cannot be opened
     */
    public RecordWriter(final File file, boolean append) throws IOException {
        this.file = file;
        this.bos = openWriter(file, append);
    }

    /**
     * Opens {@code file} for writing, overwriting any existing content.
     *
     * @param file the file to write to
     * @throws IOException if the file cannot be opened
     */
    public RecordWriter(final File file) throws IOException {
        this(file, false);
    }

    /**
     * Builds the buffered writer over {@code file}, adding gzip compression when required.
     * The raw file stream is closed again if any of the wrapping steps fails, so a failed
     * construction does not leak the file handle.
     */
    private static BufferedWriter openWriter(File file, boolean append) throws IOException {
        boolean gzip = FileUtils.isFileNameGZip(file);
        OutputStream rawStream = new FileOutputStream(file, append);
        try {
            OutputStream outputStream = gzip ? new GZIPOutputStream(rawStream) : rawStream;
            //explicit charset: do not depend on the platform default
            return new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            try {
                rawStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Writes {@code text}, appending a newline unless the text already ends with one.
     */
    private void writeLine(String text) throws IOException {
        bos.write(text.endsWith(Constants.NL_STRING) ? text : text + Constants.NL);
    }

    /**
     * Writes a header string. A newline is appended automatically if the string does not already end
     * with one. The string may itself contain several newline-separated header lines.
     *
     * @param header the header text; must not be {@code null}
     * @throws IOException if writing fails
     */
    public void addHeader(final String header) throws IOException {
        writeLine(header);
    }

    /**
     * Writes each element of {@code header} via {@link #addHeader(String)}, in list order.
     *
     * @param header the header lines; must not be {@code null}
     * @throws IOException if writing fails
     */
    public void addHeader(List<String> header) throws IOException {
        for (String str : header) {
            addHeader(str);
        }
    }

    /**
     * Converts the record to a string with {@code toString()} and writes it. A newline is appended
     * automatically if the string does not already end with one.
     *
     * @param record the record to write; must not be {@code null}
     * @throws IOException if writing fails
     * @throws NullPointerException if {@code record} is {@code null}
     */
    public void add(final T record) throws IOException {
        //String.toString() returns the string itself, so no special case is needed for String records
        writeLine(record.toString());
    }

    /**
     * Flushes and closes the underlying writer and the file stream beneath it.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        bos.close();
    }

    /**
     * @return the file this writer was opened on
     */
    public File getFile() {
        return file;
    }
}
30 changes: 30 additions & 0 deletions qio/src/org/qcmg/qio/record/StringFileReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.qcmg.qio.record;

import java.io.File;
import java.io.IOException;
import org.qcmg.common.util.Constants;

/**
 * A {@link RecordReader} whose records are simply the text lines of the file.
 */
public class StringFileReader extends RecordReader<String> {
    //header prefix applied by the single-argument constructor
    private static final String HEADER_PREFIX = Constants.HASH_STRING;

    /**
     * Opens {@code file} with an explicit read-buffer size.
     *
     * <p>NOTE(review): this constructor does not pass a header prefix, so the superclass default
     * (no header) applies here rather than {@code HEADER_PREFIX} - confirm that this is intended.
     *
     * @param file the file to read
     * @param bufferSize size of the read buffer
     * @throws IOException if the file cannot be opened or read
     */
    public StringFileReader(File file, int bufferSize) throws IOException {
        super(file, bufferSize);
    }

    /**
     * Opens {@code file}, treating leading lines that start with {@code headerPrefix} as header.
     *
     * @param file the file to read
     * @param headerPrefix prefix that marks a header line
     * @throws IOException if the file cannot be opened or read
     */
    public StringFileReader(File file, CharSequence headerPrefix) throws IOException {
        super(file, headerPrefix);
    }

    /**
     * Opens {@code file}, treating leading lines that start with {@code Constants.HASH_STRING} as
     * header.
     *
     * @param file the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public StringFileReader(File file) throws IOException {
        this(file, HEADER_PREFIX);
    }

    /**
     * Returns the line unchanged, even when it is {@code null}.
     */
    @Override
    public String getRecord(String line) {
        return line;
    }
}
130 changes: 130 additions & 0 deletions qio/test/org/qcmg/qio/record/RecordWriterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package org.qcmg.qio.record;

import static org.junit.Assert.*;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.qcmg.common.util.FileUtils;
import org.qcmg.common.vcf.VcfRecord;
import org.qcmg.common.vcf.header.VcfHeaderUtils;
import org.qcmg.qio.record.StringFileReader;
import org.qcmg.qio.record.RecordWriter;
import org.qcmg.common.util.Constants;

/**
 * Round-trip tests: content written with {@link RecordWriter} is read back with
 * {@link StringFileReader}.
 *
 * <p>Unexpected exceptions are allowed to propagate out of the test methods so that the test report
 * shows the real cause instead of a bare {@code fail()}.
 */
public class RecordWriterTest {

    public static final String[] vcfStrings = new String[] {"##test=test", VcfHeaderUtils.STANDARD_FINAL_HEADER_LINE};
    public static final String[] parms = {"chrY","2675826",".","TG","CA",".","COVN12;MIUN","SOMATIC;NNS=4;END=2675826","ACCS","TG,5,37,CA,0,2","AA,1,1,CA,4,1,CT,3,1,TA,11,76,TG,2,2,TG,0,1"};

    @Rule
    public TemporaryFolder testFolder = new TemporaryFolder();

    /**
     * A header written to a {@code .gz} file is gzip-compressed on disk and can be read back.
     */
    @Test
    public void getHeaderFromZippedVcfFile1() throws IOException {
        File file = testFolder.newFile("header.vcf.gz");

        try (RecordWriter<VcfRecord> writer = new RecordWriter<>(file)) {
            writer.addHeader(Arrays.stream(vcfStrings).collect(Collectors.joining("\n")));
        }
        assertTrue(FileUtils.isInputGZip(file));

        // should be able to get the header back out
        List<String> header;
        try (StringFileReader reader = new StringFileReader(file)) {
            header = reader.getHeader();
        }
        assertEquals(2, header.size());
        assertEquals("##test=test", header.get(0));
        assertEquals(VcfHeaderUtils.STANDARD_FINAL_HEADER_LINE, header.get(1));
    }

    /**
     * Records and plain strings can be mixed in one file via append mode; empty strings and bare
     * newlines each produce one empty line; a null record is rejected.
     */
    @Test
    public void VcfRecordTest() throws Exception {
        File file = testFolder.newFile("output.vcf");

        // create a new file holding the same vcf record twice
        try (RecordWriter<VcfRecord> writer = new RecordWriter<>(file)) {
            writer.add(new VcfRecord(parms));
            writer.add(new VcfRecord(parms));
        }

        // append four strings to the file
        try (RecordWriter<String> writer = new RecordWriter<>(file, true)) {
            writer.add("");
            writer.add("after empty");
            writer.add(Constants.NL_STRING);
            writer.add("after new line");

            // only the null record is expected to fail; the writes above must succeed
            try {
                writer.add(null);
                fail("adding a null record should throw");
            } catch (NullPointerException expected) {
                // null is not allowed
            }
        }

        // two records followed by: empty line, "after empty", empty line, "after new line"
        try (StringFileReader reader = new StringFileReader(file)) {
            int count = 0;
            for (String rec : reader) {
                count++;
                if (count == 4) {
                    assertEquals("after empty", rec);
                }
                if (count == 5) {
                    assertEquals("", rec);
                }
                if (count == 6) {
                    assertEquals("after new line", rec);
                }
            }
            assertEquals(6, count);
        }
    }

    /**
     * A file created with only a header has no records; appending the final header line and two
     * records afterwards yields a file with two header lines and two records.
     */
    @Test
    public void testCreateAppendVcfWriter() throws Exception {
        File file = testFolder.newFile("output.vcf");

        // create new file containing a single header line
        try (RecordWriter<VcfRecord> writer = new RecordWriter<>(file)) {
            writer.addHeader(vcfStrings[0]);
        }

        // header-only file: one header line, no records
        try (StringFileReader reader = new StringFileReader(file)) {
            assertEquals(1, reader.getHeader().size());
            int count = 0;
            for (String rec : reader) {
                count++;
            }
            assertEquals(0, count);
        }

        // append the final header line and the same record twice
        try (RecordWriter<VcfRecord> writer = new RecordWriter<>(file, true)) {
            writer.addHeader(vcfStrings[1]);
            writer.add(new VcfRecord(parms));
            writer.add(new VcfRecord(parms));
        }

        // the file now holds two header lines followed by two records
        try (StringFileReader reader = new StringFileReader(file)) {
            Assert.assertFalse(FileUtils.isInputGZip(file));
            assertEquals(2, reader.getHeader().size());
            final String expectedRecord = String.join(Constants.TAB_STRING, parms);
            int count = 0;
            for (String rec : reader) {
                assertEquals(expectedRecord, rec);
                count++;
            }
            assertEquals(2, count);
        }
    }

}