diff --git a/Src/KBCsv.UnitTests/CsvReaderFixture.cs b/Src/KBCsv.UnitTests/CsvReaderFixture.cs
index 7e25064..bff47c0 100644
--- a/Src/KBCsv.UnitTests/CsvReaderFixture.cs
+++ b/Src/KBCsv.UnitTests/CsvReaderFixture.cs
@@ -181,6 +181,53 @@ public void value_delimiter_cannot_be_set_if_disposed()
             Assert.Throws<ObjectDisposedException>(() => reader.ValueDelimiter = '|');
         }
 
+        [Fact]
+        public void escape_character_defaults_to_null()
+        {
+            using (var reader = CsvReader.FromCsvString(string.Empty))
+            {
+                Assert.Null(reader.EscapeCharacter);
+            }
+        }
+
+        [Fact]
+        public void escape_character_cannot_be_gotten_if_disposed()
+        {
+            var ignore = ' ';
+            var reader = CsvReader.FromCsvString(string.Empty);
+            reader.Dispose();
+            Assert.Throws<ObjectDisposedException>(() => ignore = reader.EscapeCharacter.Value);
+        }
+
+        [Fact]
+        public void escape_character_can_be_set()
+        {
+            using (var reader = CsvReader.FromCsvString(string.Empty))
+            {
+                reader.EscapeCharacter = '\\';
+                Assert.Equal('\\', reader.EscapeCharacter);
+            }
+        }
+
+        [Fact]
+        public void escape_character_can_be_set_to_null()
+        {
+            using (var reader = CsvReader.FromCsvString(string.Empty))
+            {
+                reader.EscapeCharacter = null;
+                Assert.Null(reader.EscapeCharacter);
+            }
+        }
+
+        [Fact]
+        public void escape_character_cannot_be_set_if_disposed()
+        {
+            var reader = CsvReader.FromCsvString(string.Empty);
+            reader.Dispose();
+            Assert.Throws<ObjectDisposedException>(() => reader.EscapeCharacter = '\\');
+        }
+
+
         [Fact]
         public void header_record_defaults_to_null()
         {
@@ -1315,6 +1362,44 @@ public async Task read_data_records_async_returns_read_only_records()
             }
         }
 
+        [Fact]
+        public async Task read_data_records_async_with_escape_character()
+        {
+            var csv = @"first,""\""second\"""",third";
+
+            using (var reader = CsvReader.FromCsvString(csv))
+            {
+                reader.EscapeCharacter = '\\';
+
+                var buffer = new DataRecord[1];
+                Assert.Equal(1, await reader.ReadDataRecordsAsync(buffer, 0, buffer.Length));
+
+                var dataRecord = buffer[0];
+                Assert.Equal(3, dataRecord.Count);
+                Assert.Equal("first", dataRecord[0]);
+                Assert.Equal("\"second\"", dataRecord[1]);
+                Assert.Equal("third", dataRecord[2]);
+            }
+        }
+
+        [Fact]
+        public async Task read_data_records_async_with_no_escape_character()
+        {
+            var csv = @"first,\""second\"",third";
+
+            using (var reader = CsvReader.FromCsvString(csv))
+            {
+                var buffer = new DataRecord[1];
+                Assert.Equal(1, await reader.ReadDataRecordsAsync(buffer, 0, buffer.Length));
+
+                var dataRecord = buffer[0];
+                Assert.Equal(3, dataRecord.Count);
+                Assert.Equal("first", dataRecord[0]);
+                Assert.Equal("\\second\\", dataRecord[1]);
+                Assert.Equal("third", dataRecord[2]);
+            }
+        }
+
         [Fact]
         public void dispose_disposes_of_underlying_text_reader()
         {
diff --git a/Src/KBCsv/CsvReader.cs b/Src/KBCsv/CsvReader.cs
index cd12139..4a9ddc3 100644
--- a/Src/KBCsv/CsvReader.cs
+++ b/Src/KBCsv/CsvReader.cs
@@ -267,6 +267,30 @@ public char? ValueDelimiter
             }
         }
 
+        /// <summary>
+        /// Gets or sets the character used to escape values (normally to escape the ValueDelimiter).
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// This property specifies which character is used to escape values within the CSV. This is normally only required when the CSV does not follow RFC 4180,
+        /// e.g. the delimiter is escaped as \" rather than "". It defaults to null, meaning no escape character is in effect.
+        /// </para>
+        /// </remarks>
+        public char? EscapeCharacter
+        {
+            get
+            {
+                this.EnsureNotDisposed();
+                return this.parser.EscapeCharacter;
+            }
+
+            set
+            {
+                this.EnsureNotDisposed();
+                this.parser.EscapeCharacter = value;
+            }
+        }
+
         /// <summary>
         /// Gets or sets the header record.
         /// </summary>
diff --git a/Src/KBCsv/Internal/CsvParser.cs b/Src/KBCsv/Internal/CsvParser.cs
index 14c7dfa..638fba2 100644
--- a/Src/KBCsv/Internal/CsvParser.cs
+++ b/Src/KBCsv/Internal/CsvParser.cs
@@ -22,6 +22,7 @@ internal sealed partial class CsvParser
         private bool preserveTrailingWhiteSpace;
         private char valueSeparator;
         private char? valueDelimiter;
+        private char? escapeCharacter;
 
         public CsvParser(TextReader reader)
         {
@@ -33,6 +34,7 @@ public CsvParser(TextReader reader)
             this.valueBuilder = new ValueBuilder(this);
             this.valueSeparator = Constants.DefaultValueSeparator;
             this.valueDelimiter = Constants.DefaultValueDelimiter;
+            this.escapeCharacter = null;
             this.UpdateSpecialCharacterMask();
         }
 
@@ -78,6 +80,18 @@ public char? ValueDelimiter
             }
         }
 
+        public char? EscapeCharacter
+        {
+            get { return this.escapeCharacter; }
+            set
+            {
+                exceptionHelper.ResolveAndThrowIf(value == this.valueSeparator, "valueSeparatorAndEscapeCannotMatch");
+
+                this.escapeCharacter = value;
+                this.UpdateSpecialCharacterMask();
+            }
+        }
+
         public bool HasMoreRecords
         {
             get
@@ -106,6 +120,7 @@ public int SkipRecords(int skip)
 
             var skipped = 0;
             var delimited = false;
+            var escaped = false;
 
             while (skipped < skip)
             {
@@ -117,13 +132,24 @@ public int SkipRecords(int skip)
                 {
                     var ch = this.buffer[this.bufferIndex++];
 
-                    if (!this.IsPossiblySpecialCharacter(ch))
+                    if (escaped)
+                    {
+                        // the previous character was the escape character, so include this one literally, whatever it is
+                        this.valueBuilder.NotifyPreviousCharIncluded(true);
+                        escaped = false;
+                    }
+                    else if (ch == this.escapeCharacter)
+                    {
+                        // user-specified escape character: exclude it from the value; the next character read is not treated as special
+                        this.valueBuilder.NotifyPreviousCharExcluded();
+                        escaped = true;
+                    }
+                    else if (!this.IsPossiblySpecialCharacter(ch))
                     {
                         // if it's definitely not a special character, then we can just continue on with the loop
                         continue;
                     }
-
-                    if (!delimited)
+                    else if (!delimited)
                     {
                         if (ch == this.valueDelimiter)
                         {
@@ -200,6 +226,7 @@ public int ParseRecords(HeaderRecord headerRecord, DataRecord[] buffer, int offs
             var ch = char.MinValue;
             var recordsParsed = 0;
             var delimited = false;
+            var escaped = false;
 
             for (var i = offset; i < offset + count; ++i)
             {
@@ -209,14 +236,25 @@ public int ParseRecords(HeaderRecord headerRecord, DataRecord[] buffer, int offs
                     {
                         ch = this.buffer[this.bufferIndex++];
 
-                        if (!this.IsPossiblySpecialCharacter(ch))
+                        if (escaped)
+                        {
+                            // the previous character was the escape character, so include this one literally, whatever it is
+                            this.valueBuilder.NotifyPreviousCharIncluded(true);
+                            escaped = false;
+                        }
+                        else if (ch == this.escapeCharacter)
+                        {
+                            // user-specified escape character: exclude it from the value; the next character read is not treated as special
+                            this.valueBuilder.NotifyPreviousCharExcluded();
+                            escaped = true;
+                        }
+                        else if (!this.IsPossiblySpecialCharacter(ch))
                         {
                             // if it's definitely not a special character, then we can just append it and continue on with the loop
                             this.valueBuilder.NotifyPreviousCharIncluded(delimited);
                             continue;
                         }
-
-                        if (!delimited)
+                        else if (!delimited)
                         {
                             if (ch == this.valueSeparator)
                             {
@@ -227,7 +265,7 @@ public int ParseRecords(HeaderRecord headerRecord, DataRecord[] buffer, int offs
                                 this.valueBuilder.NotifyPreviousCharExcluded();
                                 delimited = true;
 
-                                // since we're in a delimited area, the only special character is the value delimiter
+                                // since we're in a delimited area, the only special characters are the value delimiter and the escape character (the latter is tested before IsPossiblySpecialCharacter)
                                 this.activeSpecialCharacterMask = this.valueDelimiter.Value;
                             }
                             else if (ch == Constants.CR)
diff --git a/Src/KBCsv/Internal/CsvParser_Async.cs b/Src/KBCsv/Internal/CsvParser_Async.cs
index 3b23bf8..dd806ea 100644
--- a/Src/KBCsv/Internal/CsvParser_Async.cs
+++ b/Src/KBCsv/Internal/CsvParser_Async.cs
@@ -15,6 +15,7 @@ public async Task<int> SkipRecordsAsync(int skip)
 
             var skipped = 0;
             var delimited = false;
+            var escaped = false;
 
             while (skipped < skip)
             {
@@ -26,13 +27,24 @@ public async Task<int> SkipRecordsAsync(int skip)
                 {
                     var ch = this.buffer[this.bufferIndex++];
 
-                    if (!this.IsPossiblySpecialCharacter(ch))
+                    if (escaped)
+                    {
+                        // the previous character was the escape character, so include this one literally, whatever it is
+                        this.valueBuilder.NotifyPreviousCharIncluded(true);
+                        escaped = false;
+                    }
+                    else if (ch == this.escapeCharacter)
+                    {
+                        // user-specified escape character: exclude it from the value; the next character read is not treated as special
+                        this.valueBuilder.NotifyPreviousCharExcluded();
+                        escaped = true;
+                    }
+                    else if (!this.IsPossiblySpecialCharacter(ch))
                     {
                         // if it's definitely not a special character, then we can just continue on with the loop
                         continue;
                     }
-
-                    if (!delimited)
+                    else if (!delimited)
                     {
                         if (ch == this.valueDelimiter)
                         {
@@ -109,6 +121,7 @@ public async Task<int> ParseRecordsAsync(HeaderRecord headerRecord, DataRecord[]
             var ch = char.MinValue;
             var recordsParsed = 0;
             var delimited = false;
+            var escaped = false;
 
             for (var i = offset; i < offset + count; ++i)
             {
@@ -118,14 +131,25 @@ public async Task<int> ParseRecordsAsync(HeaderRecord headerRecord, DataRecord[]
                     {
                         ch = this.buffer[this.bufferIndex++];
 
-                        if (!this.IsPossiblySpecialCharacter(ch))
+                        if (escaped)
+                        {
+                            // the previous character was the escape character, so include this one literally, whatever it is
+                            this.valueBuilder.NotifyPreviousCharIncluded(true);
+                            escaped = false;
+                        }
+                        else if (ch == this.escapeCharacter)
+                        {
+                            // user-specified escape character: exclude it from the value; the next character read is not treated as special
+                            this.valueBuilder.NotifyPreviousCharExcluded();
+                            escaped = true;
+                        }
+                        else if (!this.IsPossiblySpecialCharacter(ch))
                         {
                             // if it's definitely not a special character, then we can just append it and continue on with the loop
                             this.valueBuilder.NotifyPreviousCharIncluded(delimited);
                             continue;
                         }
-
-                        if (!delimited)
+                        else if (!delimited)
                         {
                             if (ch == this.valueSeparator)
                             {