Skip to content

Commit

Permalink
Add fixed-width column support (#220)
Browse files Browse the repository at this point in the history
  • Loading branch information
kosak authored Nov 8, 2024
1 parent 1c52d59 commit f9293f1
Show file tree
Hide file tree
Showing 5 changed files with 948 additions and 13 deletions.
108 changes: 102 additions & 6 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
public abstract class CsvSpecs {
public interface Builder {
/**
* Copy all of the parameters from {@code specs} into {@code this} builder.
* Copy all the parameters from {@code specs} into {@code this} builder.
*/
Builder from(CsvSpecs specs);

Expand Down Expand Up @@ -117,6 +117,34 @@ public interface Builder {
*/
Builder headerValidator(Predicate<String> headerValidator);

/**
* True if the input is organized into fixed width columns rather than delimited by a delimiter.
*/
Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);

/**
* When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
* row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
* If the caller wants to specify them explicitly, they can use this method. It is an error to set this
* parameter if {@link #hasFixedWidthColumns} is false. Note that because the library is tolerant of the last
* cell being shorter or wider than expected, the value specified here for the width of the last column is
* simply a placeholder; its value is ignored.
*/
Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);

/**
* This setting controls what units fixed width columns are measured in. When true, fixed width columns are
* measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java
* chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For
* example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes two Java chars to
* represent. Along these lines, the string 💔💔💔 would fit in a column of width 3 when
* useUtf32CountingConvention is true, but would require a column width of at least 6 when
* useUtf32CountingConvention is false. The default setting of
* true is arguably more natural for users (the number of characters they see matches the visual width of the
* column). But some programs may want the value of false because they are counting Java chars. It is an error
* to set this parameter if {@link #hasFixedWidthColumns} is false.
*/
Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);

/**
* Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
* Typically used together with {@link Builder#numRows}. Defaults to 0.
Expand Down Expand Up @@ -160,7 +188,7 @@ public interface Builder {

/**
* The field delimiter character (the character that separates one column from the next). Must be 7-bit ASCII.
* Defaults to {code ','}.
* Defaults to {@code ','}. It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
*/
Builder delimiter(char delimiter);

Expand All @@ -179,6 +207,8 @@ public interface Builder {
* <li>hello, there
* <li>456
* </ul>
*
* It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
*/
Builder quote(char quote);

Expand All @@ -188,7 +218,8 @@ public interface Builder {
Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces);

/**
* Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}.
* Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}. It is an
* error to set this parameter if {@link #hasFixedWidthColumns} is true.
*/
Builder trim(boolean trim);

Expand Down Expand Up @@ -224,6 +255,38 @@ void check() {
if (!hasHeaderRow() && skipHeaderRows() > 0) {
problems.add("skipHeaderRows != 0 but hasHeaderRow is not set");
}

for (final Integer colWidth : fixedColumnWidths()) {
if (colWidth < 1) {
problems.add(String.format("Fixed column width %d is invalid", colWidth));
}
}

// Certain items must not be set in fixed-width column mode. Other items must not be set in delimited column
// mode.
if (hasFixedWidthColumns()) {
final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is true";
if (quote() != defaultQuote) {
problems.add(String.format(format, "quote"));
}

if (delimiter() != defaultDelimiter) {
problems.add(String.format(format, "delimiter"));
}

if (trim() != defaultTrim) {
problems.add(String.format(format, "trim"));
}
} else {
final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is false";
if (fixedColumnWidths().size() != 0) {
problems.add(String.format(format, "fixedColumnWidths"));
}

if (useUtf32CountingConvention() != defaultUtf32CountingConvention) {
problems.add(String.format(format, "useUtf32CountingConvention"));
}
}
if (problems.isEmpty()) {
return;
}
Expand Down Expand Up @@ -340,6 +403,32 @@ public Predicate<String> headerValidator() {
return c -> true;
}

/**
* See {@link Builder#hasFixedWidthColumns}. Defaults to {@code false}.
*/
@Default
public boolean hasFixedWidthColumns() {
return false;
}

/**
* See {@link Builder#fixedColumnWidths}. Defaults to an empty list.
*/
@Default
public List<Integer> fixedColumnWidths() {
return Collections.emptyList();
}

// Default value returned by useUtf32CountingConvention(); check() compares the configured value against it.
private static final boolean defaultUtf32CountingConvention = true;

/**
* See {@link Builder#useUtf32CountingConvention}. Defaults to {@code true}.
*/
@Default
public boolean useUtf32CountingConvention() {
return defaultUtf32CountingConvention;
}

/**
* See {@link Builder#skipRows}.
*/
Expand Down Expand Up @@ -396,20 +485,25 @@ public long skipHeaderRows() {
return 0;
}

// Default field delimiter returned by delimiter(); check() compares the configured value against it.
// Declared static so it is a true class-level constant, like the other default* fields.
private static final char defaultDelimiter = ',';

/**
* See {@link Builder#delimiter}.
*/
@Default
public char delimiter() {
return ',';
return defaultDelimiter;
}


private static final char defaultQuote = '"';

/**
* See {@link Builder#quote}.
*/
@Default
public char quote() {
return '"';
return defaultQuote;
}

/**
Expand All @@ -420,12 +514,14 @@ public boolean ignoreSurroundingSpaces() {
return true;
}

// Default value returned by trim(); check() compares the configured value against it.
// Declared final so this shared default cannot be reassigned, like the other default* fields.
private static final boolean defaultTrim = false;

/**
* See {@link Builder#trim}.
*/
@Default
public boolean trim() {
return false;
return defaultTrim;
}

/**
Expand Down
15 changes: 14 additions & 1 deletion src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.reading.cells.CellGrabber;
import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
import io.deephaven.csv.reading.cells.FixedCellGrabber;
import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
import io.deephaven.csv.reading.headers.FixedHeaderFinder;
import io.deephaven.csv.sinks.Sink;
import io.deephaven.csv.sinks.SinkFactory;
import io.deephaven.csv.util.*;
Expand Down Expand Up @@ -63,7 +65,8 @@ private CsvReader() {}
*/
public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
throws CsvReaderException {
return delimitedReadLogic(specs, stream, sinkFactory);
return specs.hasFixedWidthColumns() ? fixedReadLogic(specs, stream, sinkFactory)
: delimitedReadLogic(specs, stream, sinkFactory);
}

private static Result delimitedReadLogic(
Expand Down Expand Up @@ -97,6 +100,16 @@ private static Result delimitedReadLogic(
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
}

/**
 * Read path for fixed-width input: builds a whole-line grabber over {@code stream}, determines the headers (and
 * column widths) from it, wraps it in a {@link FixedCellGrabber}, and hands off to {@code commonReadLogic}.
 *
 * @param specs The parsing configuration.
 * @param stream The input to read.
 * @param sinkFactory Passed through to {@code commonReadLogic}.
 * @return The result produced by {@code commonReadLogic}.
 * @throws CsvReaderException Propagated from header determination or from the common read logic.
 */
private static Result fixedReadLogic(
        final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException {
    // Yields one whole line per call; shared by header discovery and by the cell grabber built below.
    final CellGrabber wholeLines = FixedCellGrabber.makeLineGrabber(stream);

    // Holder read back after the call below; presumably populated by determineHeadersToUse (verify there).
    final MutableObject<int[]> widthsHolder = new MutableObject<>();
    final String[] columnNames = FixedHeaderFinder.determineHeadersToUse(specs, wholeLines, widthsHolder);

    // The same count is used for both the input and output column counts.
    final int columnCount = columnNames.length;

    final CellGrabber cells = new FixedCellGrabber(
            wholeLines,
            widthsHolder.getValue(),
            specs.ignoreSurroundingSpaces(),
            specs.useUtf32CountingConvention());

    // null: no optional first data row is supplied on this path.
    return commonReadLogic(specs, cells, null, columnCount, columnCount, columnNames, sinkFactory);
}

private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
int numInputCols, int numOutputCols,
Expand Down
113 changes: 113 additions & 0 deletions src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package io.deephaven.csv.reading.cells;

import io.deephaven.csv.containers.ByteSlice;
import io.deephaven.csv.reading.ReaderUtil;
import io.deephaven.csv.util.CsvReaderException;
import io.deephaven.csv.util.MutableBoolean;
import io.deephaven.csv.util.MutableInt;

import java.io.InputStream;

/**
 * A {@link CellGrabber} for fixed-width input. It pulls one whole line at a time from an underlying line-oriented
 * grabber (see {@link #makeLineGrabber}) and then slices that line into cells according to the configured column
 * widths.
 */
public class FixedCellGrabber implements CellGrabber {
    /**
     * Builds a degenerate {@link DelimitedCellGrabber} whose quote and delimiter bytes are both {@code 0xff}, a byte
     * that never occurs in well-formed UTF-8, so each "cell" it returns is an entire line. This reuses the buffering
     * and newline handling of DelimitedCellGrabber instead of reimplementing it.
     *
     * @param stream The underlying stream.
     * @return The "line grabber"
     */
    public static CellGrabber makeLineGrabber(InputStream stream) {
        final byte neverInUtf8 = (byte) 0xff;
        // NOTE(review): the two trailing booleans are positional; confirm against DelimitedCellGrabber's
        // constructor that they carry the intended meaning for whole-line grabbing.
        return new DelimitedCellGrabber(stream, neverInUtf8, neverInUtf8, true, false);
    }

    /** Source of whole lines. */
    private final CellGrabber lineGrabber;
    /** Width of each column; the entry for the final column is never consulted. */
    private final int[] columnWidths;
    /** Whether each returned cell is trimmed of spaces and tabs. */
    private final boolean ignoreSurroundingSpaces;
    /** Counting convention handed to {@link ReaderUtil#getUtf8LengthAndCharLength}. */
    private final boolean utf32CountingMode;
    /** The not-yet-consumed remainder of the current line. */
    private final ByteSlice pendingRowText = new ByteSlice();
    /** True when the next call must first fetch a fresh line. */
    private boolean rowExhausted = true;
    /** Index of the column the next call will produce. */
    private int nextColumn = 0;
    /** Receives the underlying grabber's lastInRow flag, which is not used. */
    private final MutableBoolean ignoredLastInRow = new MutableBoolean();
    /** Scratch holder for the per-character unit count. */
    private final MutableInt scratchUnits = new MutableInt();

    /** Constructor. */
    public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
            boolean utf32CountingMode) {
        this.lineGrabber = lineGrabber;
        this.columnWidths = columnWidths;
        this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
        this.utf32CountingMode = utf32CountingMode;
    }

    /**
     * Produces the next cell. When the previous row is used up, the next line is fetched first; if that reports end
     * of input, {@code dest} is set to an empty slice, {@code endOfInput} stays true, and {@code lastInRow} is left
     * untouched. Otherwise the next column's worth of characters is split off the front of the current line.
     */
    @Override
    public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput)
            throws CsvReaderException {
        if (rowExhausted) {
            lineGrabber.grabNext(pendingRowText, ignoredLastInRow, endOfInput);

            if (endOfInput.booleanValue()) {
                // Hand back an empty slice positioned at the end of the line buffer.
                final byte[] rowBytes = pendingRowText.data();
                final int rowEnd = pendingRowText.end();
                dest.reset(rowBytes, rowEnd, rowEnd);
                return;
            }

            rowExhausted = false;
            nextColumn = 0;
        }

        // The final column is unbounded: it receives whatever is left on the line.
        final boolean isFinalColumn = nextColumn == columnWidths.length - 1;
        final int unitBudget;
        if (isFinalColumn) {
            unitBudget = Integer.MAX_VALUE;
        } else {
            unitBudget = columnWidths[nextColumn];
        }
        takeNCharactersInCharset(pendingRowText, dest, unitBudget, utf32CountingMode, scratchUnits);
        nextColumn++;

        // The row ends after the final column, or early if this cell came back empty (measured before trimming).
        rowExhausted = isFinalColumn || dest.size() == 0;
        lastInRow.setValue(rowExhausted);
        endOfInput.setValue(false);

        if (ignoreSurroundingSpaces) {
            ReaderUtil.trimSpacesAndTabs(dest);
        }
    }

    /**
     * Splits up to {@code numCharsToTake} counting units off the front of {@code src}: {@code dest} is set to the
     * taken prefix and {@code src} is advanced to the remainder. Stops early at the end of {@code src}, or when the
     * next character needs more units than remain.
     */
    private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
            boolean utf32CountingMode, MutableInt tempInt) {
        final byte[] bytes = src.data();
        final int start = src.begin();
        final int limit = src.end();

        int cursor = start;
        int unitsLeft = numCharsToTake;
        while (unitsLeft > 0 && cursor != limit) {
            final int byteLength = ReaderUtil.getUtf8LengthAndCharLength(bytes[cursor], limit - cursor,
                    utf32CountingMode, tempInt);
            final int unitLength = tempInt.intValue();
            if (unitLength > unitsLeft) {
                // The character does not fit in what remains of the field. This can happen under the UTF-16
                // counting convention when one unit is left and the next character lies outside the Basic
                // Multilingual Plane, which needs two units.
                break;
            }
            unitsLeft -= unitLength;
            cursor += byteLength;
        }

        dest.reset(bytes, start, cursor);
        src.reset(bytes, cursor, limit);
    }

    /** Delegates to the underlying line grabber. */
    @Override
    public int physicalRowNum() {
        return lineGrabber.physicalRowNum();
    }
}
Loading

0 comments on commit f9293f1

Please sign in to comment.