From 579de90d0f1e98249aa787dd404a49c07e852016 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Tue, 20 May 2014 19:39:55 -0700 Subject: [PATCH] Implement #26 --- release-notes/VERSION | 2 + .../jackson/dataformat/csv/CsvFactory.java | 5 +- .../jackson/dataformat/csv/CsvGenerator.java | 35 ++++++--- .../jackson/dataformat/csv/CsvMapper.java | 6 +- .../dataformat/csv/impl/CsvWriter.java | 72 +++++++++++++++++-- .../jackson/dataformat/csv/TestParser.java | 30 +++++++- .../dataformat/csv/TestParserQuotes.java | 2 +- .../csv/TestParserStrictQuoting.java | 47 ++++++++++++ 8 files changed, 175 insertions(+), 24 deletions(-) create mode 100644 src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserStrictQuoting.java diff --git a/release-notes/VERSION b/release-notes/VERSION index 24e1c91..ad32a02 100644 --- a/release-notes/VERSION +++ b/release-notes/VERSION @@ -1,6 +1,8 @@ Project: jackson-dataformat-csv Version: 2.4.0 (xx-xxx-2014) +#26: Inconsistent quoting of headers, values; add + `CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING` to allow more optimal checks. #32: Allow disabling of quoteChar (suggested by Jason D) #40: Allow (re)ordering of columns of Schema, using `CsvSchema.sortedBy(...)` diff --git a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java index 5859bec..22beb3e 100644 --- a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java +++ b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvFactory.java @@ -418,8 +418,7 @@ protected Writer _createWriter(OutputStream out, JsonEncoding enc, IOContext ctx protected CsvGenerator _createGenerator(IOContext ctxt, Writer out) throws IOException { - int feats = _csvGeneratorFeatures; - CsvGenerator gen = new CsvGenerator(ctxt, _generatorFeatures, feats, + CsvGenerator gen = new CsvGenerator(ctxt, _generatorFeatures, _csvGeneratorFeatures, _objectCodec, out, _cfgColumnSeparator, _cfgQuoteCharacter, _cfgLineSeparator); // any other initializations? No? @@ -434,7 +433,7 @@ protected Reader _createReader(InputStream in, JsonEncoding enc, IOContext ctxt) if (enc == null || enc == JsonEncoding.UTF8) { // 28-May-2012, tatu: Custom UTF8 reader should be faster, esp for small input: // return new InputStreamReader(in, UTF8); - boolean autoClose = ctxt.isResourceManaged() || this.isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE); + boolean autoClose = ctxt.isResourceManaged() || isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE); return new UTF8Reader(ctxt, in, autoClose); } return new InputStreamReader(in, enc.getJavaName()); diff --git a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java index 28de61e..0be4c1c 100644 --- a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java +++ b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvGenerator.java @@ -19,9 +19,26 @@ public class CsvGenerator extends GeneratorBase */ public enum Feature { /** - * Placeholder until we have actual features + * Feature that determines how much work is done before determining that + * a column value requires quoting: when set as true, full + * check is made to only use quoting when it is strictly necessary; + * but when false, a faster but more conservative check + * is made, and possibly quoting is used for values that might not need it. + * Trade-offs is basically between optimal/minimal quoting (true), and + * faster handling (false). + * Faster check involves only checking first N characters of value, as well + * as possible looser checks. + *

+ * Note, however, that regardless setting, all values that need to be quoted + * will be: it is just that when set to false, other values may + * also be quoted (to avoid having to do more expensive checks). + *

+ * Default value is false for "loose" (approximate, conservative) + * checking. + * + * @since 2.4 */ - BOGUS(false) // placeholder + STRICT_CHECK_FOR_QUOTING(false), ; protected final boolean _defaultState; @@ -41,15 +58,16 @@ public static int collectDefaults() } return flags; } - + private Feature(boolean defaultState) { _defaultState = defaultState; _mask = (1 << ordinal()); } + public boolean enabledIn(int flags) { return (flags & getMask()) != 0; } public boolean enabledByDefault() { return _defaultState; } public int getMask() { return _mask; } - }; + } protected final static long MIN_INT_AS_LONG = Integer.MIN_VALUE; protected final static long MAX_INT_AS_LONG = Integer.MAX_VALUE; @@ -112,7 +130,7 @@ public CsvGenerator(IOContext ctxt, int jsonFeatures, int csvFeatures, char columnSeparator, char quoteChar, char[] linefeed) { this(ctxt, jsonFeatures, csvFeatures, codec, - new CsvWriter(ctxt, out, columnSeparator, quoteChar, linefeed)); + new CsvWriter(ctxt, csvFeatures, out, columnSeparator, quoteChar, linefeed)); } public CsvGenerator(IOContext ctxt, int jsonFeatures, int csvFeatures, @@ -259,6 +277,7 @@ private final void _writeFieldName(String name) public CsvGenerator enable(Feature f) { _csvFeatures |= f.getMask(); + _writer.setFeatures(_csvFeatures); return this; } @@ -273,11 +292,9 @@ public final boolean isEnabled(Feature f) { public CsvGenerator configure(Feature f, boolean state) { if (state) { - enable(f); - } else { - disable(f); + return enable(f); } - return this; + return disable(f); } /* diff --git a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvMapper.java b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvMapper.java index 7139714..ab119a2 100644 --- a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvMapper.java +++ b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvMapper.java @@ -106,8 +106,7 @@ public final CsvFactory getFactory() { * Definition will not be strictly typed (that is, all columns are * just defined to be exposed as String tokens). */ - public CsvSchema schemaFor(JavaType pojoType) - { + public CsvSchema schemaFor(JavaType pojoType) { return _schemaFor(pojoType, _untypedSchemas, false); } @@ -126,8 +125,7 @@ public final CsvSchema schemaFor(TypeReference pojoTypeRef) { * determine type limitations which may make parsing more efficient * (especially for numeric types like java.lang.Integer). */ - public CsvSchema typedSchemaFor(JavaType pojoType) - { + public CsvSchema typedSchemaFor(JavaType pojoType) { return _schemaFor(pojoType, _typedSchemas, true); } diff --git a/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvWriter.java b/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvWriter.java index a73fc04..e6f0ee0 100644 --- a/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvWriter.java +++ b/src/main/java/com/fasterxml/jackson/dataformat/csv/impl/CsvWriter.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.core.io.IOContext; +import com.fasterxml.jackson.dataformat.csv.CsvGenerator; import com.fasterxml.jackson.dataformat.csv.CsvSchema; import java.io.IOException; @@ -22,7 +23,7 @@ public class CsvWriter /* Also: only do check for optional quotes for short * values; longer ones will always be quoted. */ - final protected static int MAX_QUOTE_CHECK = 20; + final protected static int MAX_QUOTE_CHECK = 24; final protected BufferedValue[] NO_BUFFERED = new BufferedValue[0]; @@ -36,7 +37,7 @@ public class CsvWriter */ final protected IOContext _ioContext; - + /** * Underlying {@link Writer} used for output. */ @@ -57,6 +58,14 @@ public class CsvWriter * quotes around value */ final protected int _cfgMinSafeChar; + + protected int _csvFeatures; + + /** + * Marker flag used to determine if to do optimal (aka "strict") quoting + * checks or not (looser conservative check) + */ + protected boolean _cfgOptimalQuoting; /* /********************************************************** @@ -123,10 +132,22 @@ public class CsvWriter /********************************************************** */ + @Deprecated // since 2.4, remove in 2.5 public CsvWriter(IOContext ctxt, Writer out, char columnSeparator, char quoteChar, char[] linefeed) + { + this(ctxt, CsvGenerator.Feature.collectDefaults(), + out, columnSeparator, quoteChar, linefeed); + + } + + public CsvWriter(IOContext ctxt, int csvFeatures, Writer out, + char columnSeparator, char quoteChar, char[] linefeed) { _ioContext = ctxt; + _csvFeatures = csvFeatures; + _cfgOptimalQuoting = CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING.enabledIn(csvFeatures); + _outputBuffer = ctxt.allocConcatBuffer(); _bufferRecyclable = true; _outputEnd = _outputBuffer.length; @@ -145,6 +166,9 @@ public CsvWriter(IOContext ctxt, Writer out, public CsvWriter(CsvWriter base, CsvSchema newSchema) { _ioContext = base._ioContext; + _csvFeatures = base._csvFeatures; + _cfgOptimalQuoting = base._cfgOptimalQuoting; + _outputBuffer = base._outputBuffer; _bufferRecyclable = base._bufferRecyclable; _outputEnd = base._outputEnd; @@ -172,6 +196,14 @@ public CsvWriter withSchema(CsvSchema schema) { return new CsvWriter(this, schema); } + public CsvWriter setFeatures(int feat) { + if (feat != _csvFeatures) { + _csvFeatures = feat; + _cfgOptimalQuoting = CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING.enabledIn(feat); + } + return this; + } + /* /********************************************************** /* Read-access to output state @@ -583,17 +615,49 @@ protected boolean _mayNeedQuotes(String value, int length) if (_cfgQuoteCharacter < 0) { return false; } - // let's not bother checking long Strings, just quote already: + // may skip checks unless we want exact checking + if (_cfgOptimalQuoting) { + return _needsQuotingStrict(value); + } if (length > _cfgMaxQuoteCheckChars) { return true; } - for (int i = 0; i < length; ++i) { + return _needsQuotingLoose(value); + } + + /** + *

+ * NOTE: final since checking is not expected to be changed here; override + * calling method (_mayNeedQuotes) instead, if necessary. + * + * @since 2.4 + */ + protected final boolean _needsQuotingLoose(String value) + { + for (int i = 0, len = value.length(); i < len; ++i) { if (value.charAt(i) < _cfgMinSafeChar) { return true; } } return false; } + + /** + * @since 2.4 + */ + protected boolean _needsQuotingStrict(String value) + { + for (int i = 0, len = value.length(); i < len; ++i) { + char c = value.charAt(i); + if (c < _cfgMinSafeChar) { + if (c == _cfgColumnSeparator || c == _cfgQuoteCharacter + || c == '\r' || c == '\n') { + return true; + } + } + } + return false; + } protected void _buffer(int index, BufferedValue v) { diff --git a/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParser.java b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParser.java index 74066f9..e96aa3a 100644 --- a/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParser.java +++ b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParser.java @@ -32,8 +32,21 @@ public void testSimpleExplicit() throws Exception .addColumn("userImage") .addColumn("verified") .build(); + ObjectReader r = mapper.reader(schema); + _testSimpleExplicit(r, false); + _testSimpleExplicit(r, true); + } - FiveMinuteUser user = mapper.reader(schema).withType(FiveMinuteUser.class).readValue("Bob,Robertson,MALE,AQIDBAU=,false\n"); + private void _testSimpleExplicit(ObjectReader r, boolean useBytes) throws Exception + { + r = r.withType(FiveMinuteUser.class); + FiveMinuteUser user; + final String INPUT = "Bob,Robertson,MALE,AQIDBAU=,false\n"; + if (useBytes) { + user = r.readValue(INPUT); + } else { + user = r.readValue(INPUT.getBytes("UTF-8")); + } assertEquals("Bob", user.firstName); assertEquals("Robertson", user.lastName); assertEquals(FiveMinuteUser.Gender.MALE, user.getGender()); @@ -79,7 +92,12 @@ public void testSimpleAsMaps() throws Exception } // Test for [Issue#10] - public void testMapsWithLinefeeds() throws Exception + public void testMapsWithLinefeeds() throws Exception { + _testMapsWithLinefeeds(false); + _testMapsWithLinefeeds(true); + } + + private void _testMapsWithLinefeeds(boolean useBytes) throws Exception { CsvMapper mapper = mapperForCsv(); String CSV = "A,B,C\n" @@ -91,7 +109,13 @@ public void testMapsWithLinefeeds() throws Exception CsvSchema cs = CsvSchema.emptySchema().withHeader(); ObjectReader or = mapper.reader(HashMap.class).with(cs); - MappingIterator> mi = or.readValues(CSV); + MappingIterator> mi; + + if (useBytes) { + mi = or.readValues(CSV.getBytes("UTF-8")); + } else { + mi = or.readValues(CSV); + } assertTrue(mi.hasNext()); Map map = mi.nextValue(); diff --git a/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserQuotes.java b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserQuotes.java index 6c0a23c..336aaa0 100644 --- a/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserQuotes.java +++ b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserQuotes.java @@ -6,7 +6,7 @@ public class TestParserQuotes extends ModuleTestBase { - @JsonPropertyOrder({"age, name"}) + @JsonPropertyOrder({"age", "name"}) protected static class AgeName { public int age; public String name; diff --git a/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserStrictQuoting.java b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserStrictQuoting.java new file mode 100644 index 0000000..a8d429e --- /dev/null +++ b/src/test/java/com/fasterxml/jackson/dataformat/csv/TestParserStrictQuoting.java @@ -0,0 +1,47 @@ +package com.fasterxml.jackson.dataformat.csv; + +import com.fasterxml.jackson.annotation.JsonPropertyOrder; + +// Tests for [Issue#26] +public class TestParserStrictQuoting extends ModuleTestBase +{ + @JsonPropertyOrder({"a", "b"}) + protected static class AB { + public String a, b; + + public AB() { } + public AB(String a, String b) { + this.a = a; + this.b = b; + } + } + + /* + /********************************************************************** + /* Test methods + /********************************************************************** + */ + + public void testStrictQuoting() throws Exception + { + final String NUMS = "12345 6789"; + final String LONG = NUMS + NUMS + NUMS + NUMS; // 40 chars should do it + + CsvMapper mapper = mapperForCsv(); + + assertFalse(mapper.getFactory().isEnabled(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING)); + CsvSchema schema = mapper.schemaFor(AB.class).withoutHeader(); + + final AB input = new AB("x", LONG); + + // with non-strict, should quote + String csv = mapper.writer(schema).writeValueAsString(input); + assertEquals(aposToQuotes("x,'"+LONG+"'"), csv.trim()); + + // should be possible to hot-swap + // and with strict/optimal, no quoting + mapper.configure(CsvGenerator.Feature.STRICT_CHECK_FOR_QUOTING, true); + csv = mapper.writer(schema).writeValueAsString(input); + assertEquals(aposToQuotes("x,"+LONG), csv.trim()); + } +}