Add fixed-width column support (#220)

deephaven · Nov 8, 2024 · f9293f1 · f9293f1
1 parent 1c52d59
commit f9293f1
Show file tree

Hide file tree

Showing 5 changed files with 948 additions and 13 deletions.
diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -24,7 +24,7 @@
 public abstract class CsvSpecs {
     public interface Builder {
         /**
-         * Copy all of the parameters from {@code specs} into {@code this} builder.
+         * Copy all the parameters from {@code specs} into {@code this} builder.
          */
         Builder from(CsvSpecs specs);
 
@@ -117,6 +117,34 @@ public interface Builder {
          */
         Builder headerValidator(Predicate<String> headerValidator);
 
+        /**
+         * Whether the input is organized into fixed-width columns rather than into cells separated by a delimiter
+         * character. Defaults to {@code false}. When this is {@code true}, it is an error to also set
+         * {@link #quote}, {@link #delimiter}, or {@link #trim} to a non-default value.
+         */
+        Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);
+
+        /**
+         * When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
+         * row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
+         * If the caller wants to specify them explicitly, they can use this method. Widths are measured in the units
+         * selected by {@link #useUtf32CountingConvention}. Every width must be at least 1; validation rejects smaller
+         * values. It is an error to set this parameter if {@link #hasFixedWidthColumns} is false. Note that because
+         * the library is tolerant of the last cell being shorter or wider than expected, the value specified here for
+         * the width of the last column is simply a placeholder; its value is ignored (although it is still validated
+         * like the others). Defaults to the empty list.
+         */
+        Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);
+
+        /**
+         * This setting controls what units fixed width columns are measured in. When true, fixed width columns are
+         * measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java
+         * chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For
+         * example, the character 💔 (U+1F494) is one Unicode code point, but takes two Java chars to represent. Along
+         * these lines, the string 💔💔💔 would fit in a column of width 3 when useUtf32CountingConvention is true,
+         * but would require a column width of at least 6 when useUtf32CountingConvention is false. The default
+         * setting of true is arguably more natural for users (the number of characters they see matches the visual
+         * width of the column). But some programs may want the value of false because they are counting Java chars.
+         * It is an error to set this parameter to a non-default value if {@link #hasFixedWidthColumns} is false.
+         */
+        Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);
+
         /**
          * Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
          * Typically used together with {@link Builder#numRows}. Defaults to 0.
@@ -160,7 +188,7 @@ public interface Builder {
 
         /**
          * The field delimiter character (the character that separates one column from the next). Must be 7-bit ASCII.
-         * Defaults to {code ','}.
+         * Defaults to {@code ','}. It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
          */
         Builder delimiter(char delimiter);
 
@@ -179,6 +207,8 @@ public interface Builder {
          * <li>hello, there
          * <li>456
          * </ul>
+         *
+         * It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
          */
         Builder quote(char quote);
 
@@ -188,7 +218,8 @@ public interface Builder {
         Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces);
 
         /**
-         * Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}.
+         * Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}. It is an
+         * error to set this parameter if {@link #hasFixedWidthColumns} is true.
          */
         Builder trim(boolean trim);
 
@@ -224,6 +255,38 @@ void check() {
         if (!hasHeaderRow() && skipHeaderRows() > 0) {
             problems.add("skipHeaderRows != 0 but hasHeaderRow is not set");
         }
+
+        for (final Integer colWidth : fixedColumnWidths()) {
+            if (colWidth < 1) {
+                problems.add(String.format("Fixed column width %d is invalid", colWidth));
+            }
+        }
+
+        // Certain items must not be set in fixed-width column mode. Other items must not be set in delimited column
+        // mode.
+        if (hasFixedWidthColumns()) {
+            final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is true";
+            if (quote() != defaultQuote) {
+                problems.add(String.format(format, "quote"));
+            }
+
+            if (delimiter() != defaultDelimiter) {
+                problems.add(String.format(format, "delimiter"));
+            }
+
+            if (trim() != defaultTrim) {
+                problems.add(String.format(format, "trim"));
+            }
+        } else {
+            final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is false";
+            if (fixedColumnWidths().size() != 0) {
+                problems.add(String.format(format, "fixedColumnWidths"));
+            }
+
+            if (useUtf32CountingConvention() != defaultUtf32CountingConvention) {
+                problems.add(String.format(format, "useUtf32CountingConvention"));
+            }
+        }
         if (problems.isEmpty()) {
             return;
         }
@@ -340,6 +403,32 @@ public Predicate<String> headerValidator() {
         return c -> true;
     }
 
+    /**
+     * Whether the input is organized into fixed-width columns rather than delimited cells. See
+     * {@link Builder#hasFixedWidthColumns}.
+     *
+     * @return The setting's value; this default implementation returns {@code false}.
+     */
+    @Default
+    public boolean hasFixedWidthColumns() {
+        return false;
+    }
+
+    /**
+     * The explicitly-specified widths of the fixed-width columns, if any. See {@link Builder#fixedColumnWidths}.
+     *
+     * @return The setting's value; this default implementation returns an empty list.
+     */
+    @Default
+    public List<Integer> fixedColumnWidths() {
+        return Collections.emptyList();
+    }
+
+    private static final boolean defaultUtf32CountingConvention = true;
+
+    /**
+     * Whether fixed-width columns are measured in Unicode code points ({@code true}) or in UTF-16 units
+     * ({@code false}). See {@link Builder#useUtf32CountingConvention}.
+     *
+     * @return The setting's value; this default implementation returns {@code defaultUtf32CountingConvention}, which
+     *         is {@code true}.
+     */
+    @Default
+    public boolean useUtf32CountingConvention() {
+        return defaultUtf32CountingConvention;
+    }
+
     /**
      * See {@link Builder#skipRows}.
      */
@@ -396,20 +485,25 @@ public long skipHeaderRows() {
         return 0;
     }
 
+    /** Default field delimiter. Static like the other defaults; {@code check()} compares against it. */
+    private static final char defaultDelimiter = ',';
+
     /**
      * See {@link Builder#delimiter}.
      */
     @Default
     public char delimiter() {
-        return ',';
+        return defaultDelimiter;
     }
 
+
+    private static final char defaultQuote = '"';
+
     /**
      * See {@link Builder#quote}.
      */
     @Default
     public char quote() {
-        return '"';
+        return defaultQuote;
     }
 
     /**
@@ -420,12 +514,14 @@ public boolean ignoreSurroundingSpaces() {
         return true;
     }
 
+    private static boolean defaultTrim = false;
+
     /**
      * See {@link Builder#trim}.
      */
     @Default
     public boolean trim() {
-        return false;
+        return defaultTrim;
     }
 
     /**

diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -7,7 +7,9 @@
 import io.deephaven.csv.parsers.Parser;
 import io.deephaven.csv.reading.cells.CellGrabber;
 import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
+import io.deephaven.csv.reading.cells.FixedCellGrabber;
 import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
+import io.deephaven.csv.reading.headers.FixedHeaderFinder;
 import io.deephaven.csv.sinks.Sink;
 import io.deephaven.csv.sinks.SinkFactory;
 import io.deephaven.csv.util.*;
@@ -63,7 +65,8 @@ private CsvReader() {}
      */
     public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
             throws CsvReaderException {
-        return delimitedReadLogic(specs, stream, sinkFactory);
+        return specs.hasFixedWidthColumns() ? fixedReadLogic(specs, stream, sinkFactory)
+                : delimitedReadLogic(specs, stream, sinkFactory);
     }
 
     private static Result delimitedReadLogic(
@@ -97,6 +100,16 @@ private static Result delimitedReadLogic(
         return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
     }
 
+    /**
+     * Reads fixed-width input. Builds a whole-line grabber over {@code stream}, asks {@link FixedHeaderFinder} for the
+     * headers to use, wraps the line grabber in a {@link FixedCellGrabber} that cuts each line into cells, and then
+     * hands off to the common read logic.
+     *
+     * @param specs The parsing options.
+     * @param stream The input stream to read.
+     * @param sinkFactory The factory passed through to the common read logic.
+     * @return The result produced by the common read logic.
+     * @throws CsvReaderException Declared by this method; see the callees for the conditions that raise it.
+     */
+    private static Result fixedReadLogic(
+            final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException {
+        final CellGrabber wholeLineGrabber = FixedCellGrabber.makeLineGrabber(stream);
+        // Out-parameter: passed to determineHeadersToUse, then read back below to obtain the column widths.
+        final MutableObject<int[]> widthsHolder = new MutableObject<>();
+        final String[] headerNames = FixedHeaderFinder.determineHeadersToUse(specs, wholeLineGrabber, widthsHolder);
+        final CellGrabber cellGrabber = new FixedCellGrabber(wholeLineGrabber, widthsHolder.getValue(),
+                specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
+        // The optional first data row is null here, and the input and output column counts are the same.
+        final int columnCount = headerNames.length;
+        return commonReadLogic(specs, cellGrabber, null, columnCount, columnCount, headerNames, sinkFactory);
+    }
 
     private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
             int numInputCols, int numOutputCols,

diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -0,0 +1,113 @@
+package io.deephaven.csv.reading.cells;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+
+import java.io.InputStream;
+
+/**
+ * A {@link CellGrabber} for fixed-width input. It pulls one whole line at a time from an underlying "line grabber"
+ * (see {@link #makeLineGrabber}) and then hands that line back to the caller one fixed-width cell per call.
+ */
+public class FixedCellGrabber implements CellGrabber {
+    /** The byte 0xFF, which cannot occur in well-formed UTF-8 text. */
+    private static final byte ILLEGAL_UTF8 = (byte) 0xff;
+
+    /**
+     * Creates a degenerate {@link DelimitedCellGrabber} that is meant to have no delimiters or quotes, so that each
+     * "cell" it returns is a whole line. This is a quick way to reuse the buffering and newline handling of
+     * DelimitedCellGrabber rather than rewriting it. The byte 0xFF is passed for both of the constructor's byte
+     * arguments.
+     *
+     * @param stream The underlying stream.
+     * @return The "line grabber"
+     */
+    public static CellGrabber makeLineGrabber(InputStream stream) {
+        return new DelimitedCellGrabber(stream, ILLEGAL_UTF8, ILLEGAL_UTF8, true, false);
+    }
+
+    /** Source of whole lines. */
+    private final CellGrabber lineGrabber;
+    /** Width of each column; the entry for the final column is never read. */
+    private final int[] columnWidths;
+    /** When true, every returned cell is passed through {@link ReaderUtil#trimSpacesAndTabs}. */
+    private final boolean ignoreSurroundingSpaces;
+    /** Counting mode flag forwarded to {@link ReaderUtil#getUtf8LengthAndCharLength}. */
+    private final boolean utf32CountingMode;
+    /** The not-yet-consumed remainder of the current line. */
+    private final ByteSlice remainingLine;
+    /** True when the next call must first fetch a new line from {@link #lineGrabber}. */
+    private boolean lineFinished;
+    /** Index of the column that the next call will return. */
+    private int nextColumn;
+    /** Receives the line grabber's "last in row" flag, which is not used. */
+    private final MutableBoolean unusedLastInRow;
+    /** Scratch holder for the per-character unit count. */
+    private final MutableInt unitScratch;
+
+    /**
+     * Constructor.
+     *
+     * @param lineGrabber The grabber that supplies whole lines.
+     * @param columnWidths The column widths. The array is stored as-is, not copied.
+     * @param ignoreSurroundingSpaces Whether to trim each cell via {@link ReaderUtil#trimSpacesAndTabs}.
+     * @param utf32CountingMode The counting mode passed on to {@link ReaderUtil#getUtf8LengthAndCharLength}.
+     */
+    public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
+            boolean utf32CountingMode) {
+        this.lineGrabber = lineGrabber;
+        this.columnWidths = columnWidths;
+        this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
+        this.utf32CountingMode = utf32CountingMode;
+        this.remainingLine = new ByteSlice();
+        this.unusedLastInRow = new MutableBoolean();
+        this.unitScratch = new MutableInt();
+        // Start out "finished" so that the very first call fetches a line.
+        this.lineFinished = true;
+        this.nextColumn = 0;
+    }
+
+    /**
+     * Returns the next cell. A new line is fetched first when the previous one is used up. The final column receives
+     * everything that remains of the line. A row also ends early as soon as an empty cell is produced.
+     *
+     * @param dest Reset to cover the bytes of the cell (an empty slice at end of input).
+     * @param lastInRow Set to true when this cell ends the row. Left untouched at end of input.
+     * @param endOfInput Set to true when the line grabber reports end of input, otherwise set to false.
+     */
+    @Override
+    public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput)
+            throws CsvReaderException {
+        if (lineFinished && !fetchNextLine(dest, endOfInput)) {
+            // End of input: dest is empty and 'endOfInput' stays true.
+            return;
+        }
+
+        final boolean isFinalColumn = nextColumn == columnWidths.length - 1;
+        final int unitBudget;
+        if (isFinalColumn) {
+            // The final column takes whatever is left, so its configured width is never consulted.
+            unitBudget = Integer.MAX_VALUE;
+        } else {
+            unitBudget = columnWidths[nextColumn];
+        }
+        splitOffLeadingCell(remainingLine, dest, unitBudget, utf32CountingMode, unitScratch);
+        nextColumn++;
+
+        lineFinished = isFinalColumn || dest.size() == 0;
+        lastInRow.setValue(lineFinished);
+        endOfInput.setValue(false);
+
+        if (ignoreSurroundingSpaces) {
+            ReaderUtil.trimSpacesAndTabs(dest);
+        }
+    }
+
+    /**
+     * Fetches the next line from the line grabber into {@link #remainingLine}.
+     *
+     * @return true if a line was fetched; false at end of input, in which case dest has been reset to an empty slice.
+     */
+    private boolean fetchNextLine(ByteSlice dest, MutableBoolean endOfInput) throws CsvReaderException {
+        lineGrabber.grabNext(remainingLine, unusedLastInRow, endOfInput);
+        if (endOfInput.booleanValue()) {
+            final int end = remainingLine.end();
+            dest.reset(remainingLine.data(), end, end);
+            return false;
+        }
+        lineFinished = false;
+        nextColumn = 0;
+        return true;
+    }
+
+    /**
+     * Moves up to {@code unitBudget} counting units from the front of {@code src} into {@code dest}. Afterwards
+     * {@code dest} covers the characters taken and {@code src} covers the rest.
+     */
+    private static void splitOffLeadingCell(ByteSlice src, ByteSlice dest, int unitBudget,
+            boolean utf32CountingMode, MutableInt unitScratch) {
+        final byte[] bytes = src.data();
+        final int start = src.begin();
+        final int limit = src.end();
+        int pos = start;
+        while (unitBudget > 0 && pos != limit) {
+            final int byteLength =
+                    ReaderUtil.getUtf8LengthAndCharLength(bytes[pos], limit - pos, utf32CountingMode, unitScratch);
+            final int units = unitScratch.intValue();
+            if (units > unitBudget) {
+                // The character does not fit in what is left of the field. This can happen under the
+                // UTF-16 counting convention when one unit is left and the next character lies outside
+                // the Basic Multilingual Plane, which would require two units.
+                break;
+            }
+            unitBudget -= units;
+            pos += byteLength;
+        }
+        dest.reset(bytes, start, pos);
+        src.reset(bytes, pos, limit);
+    }
+
+    /** Delegates to the underlying line grabber. */
+    @Override
+    public int physicalRowNum() {
+        return lineGrabber.physicalRowNum();
+    }
+}