diff --git a/PocketCsvReader.Benchmark/CustomerRecord.cs b/PocketCsvReader.Benchmark/CustomerRecord.cs index b93d17b..4394fd5 100644 --- a/PocketCsvReader.Benchmark/CustomerRecord.cs +++ b/PocketCsvReader.Benchmark/CustomerRecord.cs @@ -5,13 +5,14 @@ using System.Threading.Tasks; namespace PocketCsvReader.Benchmark; -internal class CustomerRecord -{ - public string Firstname { get; set; } - public string Lastname { get; set; } - public string Gender { get; set; } - public DateTime DateOfBirth { get; set; } - public int Year { get; set; } - public string Month { get; set; } - public decimal TotalOrder { get; set; } -} +internal record CustomerRecord +( + string Firstname, + string Lastname, + string Gender, + DateTime DateOfBirth, + int Year, + string Month, + decimal TotalOrder +) +{ } diff --git a/PocketCsvReader.Testing/CsvDataTableTest.cs b/PocketCsvReader.Testing/CsvDataTableTest.cs new file mode 100644 index 0000000..7ef21d0 --- /dev/null +++ b/PocketCsvReader.Testing/CsvDataTableTest.cs @@ -0,0 +1,259 @@ +using PocketCsvReader; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Data; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; + +namespace PocketCsvReader.Testing +{ + [TestFixture] + public class CsvDataTableTest + { + [Test] + [TestCase("Ansi")] + [TestCase("Utf16-BE")] + [TestCase("Utf16-LE")] + [TestCase("Utf8-BOM")] + [TestCase("Utf8")] + public void ToDataTable_Financial_CorrectRowsColumns(string filename) + { + var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true)); + + using (var stream = + Assembly.GetExecutingAssembly() + .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv") + ?? throw new FileNotFoundException() + ) + { + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Columns.Count, Is.EqualTo(14)); + Assert.That(dataTable.Rows.Count, Is.EqualTo(21)); + } + } + + [Test] + [TestCase("a+b+c#a+b#a#a+b", '+', "#", "?")] + public void Read_CsvWithCsvProfileMissingCell_CorrectResults(string text, char fieldSeparator, string recordSeparator, string missingCell) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var profile = new CsvProfile(fieldSeparator, '`', '`', recordSeparator, false, true, 4096, "_", missingCell); + var reader = new CsvReader(profile); + var dataTable = reader.ToDataTable(stream); + + Assert.That(dataTable.Rows[0].ItemArray[0], Is.EqualTo("a")); + Assert.That(dataTable.Rows[0].ItemArray[1], Is.EqualTo("b")); + Assert.That(dataTable.Rows[0].ItemArray[2], Is.EqualTo("c")); + + Assert.That(dataTable.Rows[1].ItemArray[0], Is.EqualTo("a")); + Assert.That(dataTable.Rows[1].ItemArray[1], Is.EqualTo("b")); + Assert.That(dataTable.Rows[1].ItemArray[2], Is.EqualTo("?")); + + Assert.That(dataTable.Rows[2].ItemArray[0], Is.EqualTo("a")); + Assert.That(dataTable.Rows[2].ItemArray[1], Is.EqualTo("?")); + Assert.That(dataTable.Rows[2].ItemArray[2], Is.EqualTo("?")); + + Assert.That(dataTable.Rows[3].ItemArray[0], Is.EqualTo("a")); + Assert.That(dataTable.Rows[3].ItemArray[1], Is.EqualTo("b")); + Assert.That(dataTable.Rows[3].ItemArray[2], Is.EqualTo("?")); + + + writer.Dispose(); + } + } + + [Test] + [TestCase("a+b+c#a++c", '+', "#", "?", "a", "?", "c")] + [TestCase("a+b+c#+b+c", '+', "#", "?", "?", "b", "c")] + [TestCase("a+b+c#+b+", '+', "#", "?", "?", "b", "?")] + public void Read_CsvWithCsvProfileEmptyCell_CorrectResults(string text, char fieldSeparator, string recordSeparator, string emptyCell, params string[] expected) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + var profile = new CsvProfile(fieldSeparator, '`', '`', recordSeparator, false, true, 4096, emptyCell, "_"); + var reader = new CsvReader(profile); + var dataTable = reader.ToDataTable(stream); + + Assert.That(dataTable.Rows[0].ItemArray[0], Is.EqualTo("a")); + Assert.That(dataTable.Rows[0].ItemArray[1], Is.EqualTo("b")); + Assert.That(dataTable.Rows[0].ItemArray[2], Is.EqualTo("c")); + + for (int i = 0; i < 3;i++) + Assert.That(dataTable.Rows[1].ItemArray[i], Is.EqualTo(expected[i])); + + writer.Dispose(); + } + } + + [Test] + [TestCase("abc\r\ndef\r\nghl\r\nijk", 1, 1)] + [TestCase("abc\r\ndef\r\nghl\r\nijk", 17, 1)] + [TestCase("abc\r\ndef\r\nghl\r\nijk", 18, 1)] + [TestCase("abc\r\ndef\r\nghl\r\nijk", 19, 1)] + [TestCase("abc\r\ndef\r\nghl\r\nijk", 512, 1)] + [TestCase("abc;xyz\r\ndef;xyz\r\nghl\r\n;ijk", 1, 2)] + [TestCase("abc;xyz\r\ndef;xyz\r\nghl\r\n;ijk", 512, 2)] + [TestCase("\"abc\";\"xyz\"\r\n\"def\";\"xyz\"\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] + [TestCase("abc;\"xyz\"\r\n\"def\";xyz\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] + [TestCase("abc;\"xyz\"\r\n\"def\";xyz\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] + public void Read_Csv_CorrectResult(string text, int bufferSize, int fieldCount) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var reader = new CsvReader(new CsvProfile(';', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"), bufferSize); + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Rows, Has.Count.EqualTo(4)); + Assert.That(dataTable.Columns, Has.Count.EqualTo(fieldCount)); + foreach (DataRow row in dataTable.Rows) + { + foreach (var cell in row.ItemArray) + Assert.That(cell!.ToString(), Has.Length.EqualTo(3).Or.EqualTo("(empty)").Or.EqualTo("(null)")); + } + Assert.That(dataTable.Rows[0][0], Is.EqualTo("abc")); + if (dataTable.Columns.Count == 2) + Assert.That(dataTable.Rows[0][1], Is.EqualTo("xyz")); + writer.Dispose(); + } + } + + [Test] + [TestCase("'azerty';'';'alpha'", 3)] + [TestCase("'azerty';;'alpha'", 3)] + public void Read_CsvWithTextQualifier_CorrectResult(string text, int columnCount) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var reader = new CsvReader(new CsvProfile(';', '\'', "\r\n", false, false, 4096, "foo", "(null)")); + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Columns, Has.Count.EqualTo(columnCount)); + Assert.That(dataTable.Rows[0][0], Is.EqualTo("azerty")); + Assert.That(dataTable.Rows[0][1], Is.EqualTo("foo")); + Assert.That(dataTable.Rows[0][2], Is.EqualTo("alpha")); + writer.Dispose(); + } + } + + [Test] + [TestCase("a;b;c\r\nd;e;f;g\r\n", 1, 1)] + [TestCase("a;b;c\r\nd;e;f\r\ng;h;i;j\r\n", 2, 1)] + [TestCase("a;b;c\r\nd;e;f\r\ng;h;i;j;k\r\n", 2, 2)] + public void Read_MoreFieldThanExpected_ExceptionMessage(string text, int rowNumber, int moreField) + { + using (var stream = new MemoryStream()) + { + using (var writer = new StreamWriter(stream)) + { + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var profile = CsvProfile.SemiColumnDoubleQuote; + var reader = new CsvReader(profile, 1024); + + var ex = Assert.Throws(() => reader.ToDataTable(stream)); + Assert.That(ex!.Message, Does.Contain(string.Format("record {0} ", rowNumber + 1))); + Assert.That(ex.Message, Does.Contain(string.Format("{0} more", moreField))); + } + } + } + + [Test] + public void Read_EmptyValue_MatchWithEmpty() + { + using (var stream = new MemoryStream()) + { + using (var writer = new StreamWriter(stream)) + { + writer.Write("a;;c"); + writer.Flush(); + + stream.Position = 0; + + var profile = CsvProfile.SemiColumnDoubleQuote; + var reader = new CsvReader(profile); + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Rows[0][1], Is.EqualTo(string.Empty)); + } + } + } + + [Test] + public void Read_MissingValue_MatchWithNullValue() + { + using (var stream = new MemoryStream()) + { + using (var writer = new StreamWriter(stream)) + { + writer.Write("a;b;c\r\na;b\r\na;b;c"); + writer.Flush(); + + stream.Position = 0; + + var profile = new CsvProfile(';', '"', "\r\n", false, true, 512, string.Empty, "(null)"); + var reader = new CsvReader(profile); + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Rows[1][2], Is.EqualTo("(null)")); + } + } + } + + [Test] + [TestCase("a;b;c\r\n1;2;3")] + [TestCase("a;b;c\r\n1;2;3\r\n")] + [TestCase("a;b;c\r\n#\r\n1;2;3")] + [TestCase("a;b;c\r\n#x;y;z\r\n1;2;3")] + [TestCase("a;b;c\r\n1;2;3\r\n#x;y;z")] + [TestCase("#x;y;z\r\na;b;c\r\n1;2;3")] + [TestCase("#x;y;z\r\n#x;y;z\r\na;b;c\r\n1;2;3")] + [TestCase("#x;y;z\r\n#x;y;z\r\na;b;c\r\n1;2;3\r\n#1;2;3")] + public void Read_Comment_CommentedLinesSkipped(string content) + { + using (var stream = new MemoryStream()) + { + using (var writer = new StreamWriter(stream)) + { + writer.Write(content); + writer.Flush(); + + stream.Position = 0; + + var profile = new CsvProfile(new CsvDialectDescriptor { Header = false, Delimiter = ';', CommentChar = '#', DoubleQuote = false }); + var reader = new CsvReader(profile); + var dataTable = reader.ToDataTable(stream); + Assert.That(dataTable.Rows.Count, Is.EqualTo(2)); + Assert.That(dataTable.Columns.Count, Is.EqualTo(3)); + } + } + } + } +} diff --git a/PocketCsvReader.Testing/CsvReaderTest.cs b/PocketCsvReader.Testing/CsvReaderTest.cs index d1c5a4b..2804491 100644 --- a/PocketCsvReader.Testing/CsvReaderTest.cs +++ b/PocketCsvReader.Testing/CsvReaderTest.cs @@ -1,4 +1,4 @@ -using PocketCsvReader; +using PocketCsvReader; using NUnit.Framework; using System; using System.Collections.Generic; @@ -14,27 +14,5 @@ namespace PocketCsvReader.Testing [TestFixture] public class CsvReaderTest { - [Test] - [TestCase("Ansi")] - [TestCase("Utf16-BE")] - [TestCase("Utf16-LE")] - [TestCase("Utf8-BOM")] - [TestCase("Utf8")] - public void ToDataTable_Financial_CorrectRowsColumns(string filename) - { - var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true)); - - using (var stream = - Assembly.GetExecutingAssembly() - .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv") - ?? throw new FileNotFoundException() - ) - { - var dataTable = reader.ToDataTable(stream); - Assert.That(dataTable.Columns.Count, Is.EqualTo(14)); - Assert.That(dataTable.Rows.Count, Is.EqualTo(21)); - } - - } } } diff --git a/PocketCsvReader.Testing/EncodingDetectorTest.cs b/PocketCsvReader.Testing/EncodingDetectorTest.cs new file mode 100644 index 0000000..851b93e --- /dev/null +++ b/PocketCsvReader.Testing/EncodingDetectorTest.cs @@ -0,0 +1,77 @@ +using PocketCsvReader; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Data; +using System.IO; +using System.Linq; +using System.Text; +using System.Reflection; +using Moq; + +namespace PocketCsvReader.Testing +{ + [TestFixture] + public class EncodingDetectorTest + { + [Test] + [TestCase("Utf16-BE", 2)] + [TestCase("Utf16-LE", 2)] + [TestCase("Utf8-BOM", 3)] + [TestCase("Utf8", 0)] + public void GetStreamEncoding_Financial_CorrectEncodingInfo(string filename, int BomLength) + { + using (var stream = + Assembly.GetExecutingAssembly() + .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv") + ?? throw new FileNotFoundException() + ) + { + var detector = new EncodingDetector(); + var result = detector.GetStreamEncoding(stream); + Assert.That(result.Encoding.BodyName, Is.EqualTo(filename).Using(new EncodingComparer())); + Assert.That(result.BomBytesCount, Is.EqualTo(BomLength)); + } + } + + public class EncodingComparer : IEqualityComparer + { + public bool Equals(string? x, string? y) + { + if (x is null || y is null) + return false; + + static string normalize(string s) => s.Replace("-LE", "").Replace("-BOM", "").ToLowerInvariant().Replace("-", ""); + return normalize(x) == normalize(y); + } + + public int GetHashCode(string obj) + => obj.ToLowerInvariant().Replace("-", "").GetHashCode(); + } + + private static readonly Encoding[] Encodings = + { + Encoding.Unicode, + Encoding.BigEndianUnicode, + Encoding.UTF8, + Encoding.UTF32, + new UTF32Encoding(true, true), + }; + + [TestCaseSource(nameof(Encodings))] + public void ToDataReader_Financial_CorrectRowsColumns(Encoding encoding) + { + using (var stream = new MemoryStream()) + { + using var writer = new StreamWriter(stream, encoding); + writer.Write("A,B,C\r\n1,2,3\r\n4,5,6\r\n"); + writer.Flush(); + stream.Position = 0; + + var detector = new EncodingDetector(); + var result = detector.GetStreamEncoding(stream); + Assert.That(result.Encoding, Is.EqualTo(encoding)); + } + } + } +} diff --git a/PocketCsvReader.Testing/FieldParserTest.cs b/PocketCsvReader.Testing/FieldParserTest.cs new file mode 100644 index 0000000..b864674 --- /dev/null +++ b/PocketCsvReader.Testing/FieldParserTest.cs @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace PocketCsvReader.Testing; +public class FieldParserTest +{ + [Test] + [TestCase("foo", "foo")] + public void ReadField_NotQualified_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, false, false); + Assert.That(value, Is.EqualTo(result)); + } + + [Test] + [TestCase("", "?")] + public void ReadField_Empty_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "?", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, false, false); + Assert.That(value, Is.EqualTo(result)); + } + + [Test] + [TestCase("(null)", null)] //Parse (null) to a real null value + [TestCase("\"(null)\"", "(null)")] //Explicitly quoted (null) should be (null) + public void ReadField_Null_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\"', '`', "\r\n", false, false, 4096, "?", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, item.StartsWith("\""), item.StartsWith("\"")); + Assert.That(value, Is.EqualTo(result)); + } + + [TestCase("`a`", "a")] + [TestCase("`foo`", "foo")] + [TestCase("`foo bar`", "foo bar")] + [TestCase("``", "?")] + public void ReadField_Qualified_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '`', '\\', "\r\n", false, false, 4096, "?", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, true, true); + Assert.That(value, Is.EqualTo(result)); + } + + [Test] + [TestCase("'a`'b'", "a'b")] + [TestCase("'`'a`'b`''", "'a'b'")] + public void ReadField_EscapedWithOtherChar_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, true, true); + Assert.That(value, Is.EqualTo(result)); + } + + [Test] + [TestCase("\"")] + [TestCase("\"a")] + public void ReadField_ContainsQualifierChar_CorrectString(string item) + { + var profile = new CsvProfile(';', '\"', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new FieldParser(profile); + var value = + Assert.Throws(() => + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + reader.ReadField(buffer, 0, item.Length, false, false); + }); + } + + [TestCase("\"ab\"", "ab")] + [TestCase("\"abc\"", "abc")] + [TestCase("\"a\"\"b\"", "a\"b")] + [TestCase("\"\"\"a\"\"b\"\"\"", "\"a\"b\"")] + public void ReadField_EscapedWithDoubleChar_CorrectString(string item, string result) + { + Span buffer = stackalloc char[64]; + item.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\"', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new FieldParser(profile); + var value = reader.ReadField(buffer, 0, item.Length, true, true); + Assert.That(value, Is.EqualTo(result)); + } +} diff --git a/PocketCsvReader.Testing/InternalCsvReaderTest.cs b/PocketCsvReader.Testing/InternalCsvReaderTest.cs deleted file mode 100644 index a0508bc..0000000 --- a/PocketCsvReader.Testing/InternalCsvReaderTest.cs +++ /dev/null @@ -1,680 +0,0 @@ -using PocketCsvReader; -using NUnit.Framework; -using System; -using System.Collections.Generic; -using System.Data; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.Diagnostics.CodeAnalysis; - -namespace PocketCsvReader.Testing -{ - [TestFixture] - public class InternalCsvReaderTest - { - class CsvReaderProxy : CsvReader - { - public CsvReaderProxy() - : base(new CsvProfile(false)) { } - - public CsvReaderProxy(CsvProfile profile) - : base(profile) { } - - public CsvReaderProxy(int bufferSize) - : base(bufferSize) { } - - public new int CountRecordSeparators(StreamReader reader) - => base.CountRecordSeparators(reader); - - public new string GetFirstRecord(StreamReader reader, string recordSeparator, int bufferSize) - => base.GetFirstRecord(reader, recordSeparator, bufferSize); - - public new (string?[], bool) ReadNextRecord(StreamReader reader, Span buffer, ref Span extra) - => base.ReadNextRecord(reader, buffer, ref extra); - - public new (string?[], bool) ReadNextRecord(Span buffer) - => base.ReadNextRecord(buffer); - } - - [Test] - [TestCase("foo", "foo")] - public void ReadField_NotQualified_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, false, false); - Assert.That(value, Is.EqualTo(result)); - } - - [Test] - [TestCase("", "?")] - public void ReadField_Empty_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "?", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, false, false); - Assert.That(value, Is.EqualTo(result)); - } - - [Test] - [TestCase("(null)", null)] //Parse (null) to a real null value - [TestCase("\"(null)\"", "(null)")] //Explicitly quoted (null) should be (null) - public void ReadField_Null_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\"', '`', "\r\n", false, false, 4096, "?", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, item.StartsWith("\""), item.StartsWith("\"")); - Assert.That(value, Is.EqualTo(result)); - } - - [TestCase("`a`", "a")] - [TestCase("`foo`", "foo")] - [TestCase("`foo bar`", "foo bar")] - [TestCase("``", "?")] - public void ReadField_Qualified_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '`', '\\', "\r\n", false, false, 4096, "?", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, true, true); - Assert.That(value, Is.EqualTo(result)); - } - - [Test] - [TestCase("'a`'b'", "a'b")] - [TestCase("'`'a`'b`''", "'a'b'")] - public void ReadField_EscapedWithOtherChar_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '`', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, true, true); - Assert.That(value, Is.EqualTo(result)); - } - - [Test] - [TestCase("\"")] - [TestCase("\"a")] - public void ReadField_ContainsQualifierChar_CorrectString(string item) - { - var profile = new CsvProfile(';', '\"', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = - Assert.Throws(() => - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - reader.ReadField(buffer, 0, item.Length, false, false); - }); - } - - [TestCase("\"ab\"", "ab")] - [TestCase("\"abc\"", "abc")] - [TestCase("\"a\"\"b\"", "a\"b")] - [TestCase("\"\"\"a\"\"b\"\"\"", "\"a\"b\"")] - public void ReadField_EscapedWithDoubleChar_CorrectString(string item, string result) - { - Span buffer = stackalloc char[64]; - item.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\"', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - var value = reader.ReadField(buffer, 0, item.Length, true, true); - Assert.That(value, Is.EqualTo(result)); - } - - [Test] - [TestCase("foo")] - [TestCase("'foo'")] - [TestCase("foo;")] - [TestCase("'foo';")] - public void ReadNextRecord_SingleField_CorrectParsing(string record) - { - Span buffer = stackalloc char[64]; - record.CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - var (values, eof) = reader.ReadNextRecord(buffer); - Assert.That(eof, Is.True); - Assert.That(values, Has.Length.EqualTo(1)); - Assert.That(values.First(), Is.EqualTo("foo")); - } - - [TestCase("foo\r\n", "foo")] - [TestCase("foo;bar\r\n", "foo", "bar")] - [TestCase("foo;bar;\r\n", "foo", "bar")] - public void ReadNextRecord_RecordWithLineTerminator_CorrectParsing(string record, params string[] tokens) - { - Span buffer = stackalloc char[64]; - record.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - (var values, var _) = reader.ReadNextRecord(buffer); - Assert.That(values, Has.Length.EqualTo(tokens.Length)); - for (int i = 0; i < tokens.Length; i++) - Assert.That(values[i], Is.EqualTo(tokens[i])); - } - - [TestCase("foo", "foo")] - [TestCase("foo;bar", "foo", "bar")] - [TestCase("foo;bar;", "foo", "bar")] - public void ReadNextRecord_RecordWithoutLineTerminator_CorrectParsing(string record, params string[] tokens) - { - Span buffer = stackalloc char[64]; - record.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "(empty)", "(null)"); - var reader = new CsvReaderProxy(profile); - (var values, var _) = reader.ReadNextRecord(buffer); - Assert.That(values, Has.Length.EqualTo(tokens.Length)); - for (int i = 0; i < tokens.Length; i++) - Assert.That(values[i], Is.EqualTo(tokens[i])); - } - - [Test] - [TestCase("'ab'';'c';'xyz'")] - [TestCase("'ab'';'c''';'xyz'")] - [TestCase("'ab'';'''c';'xyz'")] - public void ReadNextRecord_RecordWithUnescapedTextQualifier_ThrowException(string record) - { - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - Assert.Throws(() => - { - Span buffer = stackalloc char[64]; - record.CopyTo(buffer); - reader.ReadNextRecord(buffer); - }); - } - - [Test] - [TestCase("abc;xyz", "abc")] - [TestCase("'abc';'xyz'", "abc")] - [TestCase("abc;'xyz'", "abc")] - [TestCase("'abc';xyz", "abc")] - [TestCase("'ab;c';xyz", "ab;c")] - [TestCase("'ab;;c';xyz", "ab;;c")] - [TestCase("'ab;;;c';xyz", "ab;;;c")] - [TestCase("'a;b;;c';xyz", "a;b;;c")] - [TestCase(";'xyz'", "")] - [TestCase(";xyz", "")] - [TestCase("'ab'';''c';'xyz'", "ab';'c")] - [TestCase("'ab'';''''c';'xyz'", "ab';''c")] - [TestCase("'a''b'';c';'xyz'", "a'b';c")] - public void ReadNextRecord_RecordWithTwoFields_CorrectParsing(string record, string firstToken) - { - Span buffer = stackalloc char[64]; - record.AsSpan().CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "", "(null)"); - var reader = new CsvReaderProxy(profile); - (var values, var _) = reader.ReadNextRecord(buffer); - Assert.That(values[0], Is.EqualTo(firstToken)); - Assert.That(values[1], Is.EqualTo("xyz")); - } - - [Test] - [TestCase("'fo;o'", "fo;o")] - [TestCase("'fo;o';", "fo;o")] - [TestCase("';foo';", ";foo")] - [TestCase("'foo;';", "foo;")] - public void ReadNextRecord_SingleFieldWithTextQualifier_CorrectParsing(string record, string expected) - { - Span buffer = stackalloc char[64]; - record.CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - var (values, _) = reader.ReadNextRecord(buffer); - Assert.That(values, Has.Length.EqualTo(1)); - Assert.That(values.First(), Is.EqualTo(expected)); - } - - [Test] - [TestCase("'fo''o'", '\'')] - [TestCase("'fo?'o'", '?')] - [TestCase("'fo\\'o'", '\\')] - public void ReadNextRecord_SingleFieldWithTextEscaper_CorrectParsing(string record, char escapeTextQualifier) - { - Span buffer = stackalloc char[64]; - record.CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', escapeTextQualifier, "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - var (values, _) = reader.ReadNextRecord(buffer); - Assert.That(values, Has.Length.EqualTo(1)); - Assert.That(values.First(), Is.EqualTo("fo'o")); - } - - [Test] - [TestCase("abc;xyz;123", "123")] - [TestCase("'abc';'xyz';'123'", "123")] - [TestCase("abc;'xyz';123", "123")] - [TestCase("'abc';xyz;123", "123")] - [TestCase("'abc';xyz;'123'", "123")] - [TestCase("'ab;;;c';xyz;;", "")] - [TestCase("'a;b;;c';'x;;;y;;z';123", "123")] - [TestCase(";'xyz';;", "")] - [TestCase(";;;", "")] - public void ReadNextRecord_RecordWithThreeFields_CorrectParsing(string record, string thirdToken) - { - Span buffer = stackalloc char[64]; - record.CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - var (values, _) = reader.ReadNextRecord(buffer); - Assert.That(values, Has.Length.EqualTo(3)); - Assert.That(values[2], Is.EqualTo(thirdToken)); - } - - [Test] - public void ReadNextRecord_NullField_NullValue() - { - Span buffer = stackalloc char[64]; - "a;(null)".CopyTo(buffer); - - var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); - var reader = new CsvReaderProxy(profile); - var (values, eof) = reader.ReadNextRecord(buffer); - Assert.That(eof, Is.True); - Assert.That(values, Has.Length.EqualTo(2)); - Assert.That(values[1], Is.Null); - } - - [Test] - [TestCase("abc+abc+abc+abc", "+", 1, 4)] - [TestCase("abc+abc+abc+abc", "+", 2, 4)] - [TestCase("abc+abc+abc+abc", "+", 200, 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 1, 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 2, 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 4, 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 5, 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 200, 4)] - [TestCase("abc+@abc+abc+@abc", "+@", 1, 3)] - [TestCase("abc+@abc+abc+@abc", "+@", 2, 3)] - [TestCase("abc+@abc+abc+@abc", "+@", 4, 3)] - [TestCase("abc+@abc+abc+@abc", "+@", 5, 3)] - [TestCase("abc+@abc+abc+@abc", "+@", 200, 3)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 1, 3)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 2, 3)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 4, 3)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 5, 3)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 200, 3)] - [TestCase("abc", "+@", 200, 1)] - public void CountRecordSeparator_Csv_CorrectCount(string text, string recordSeparator, int bufferSize, int result) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var profile = new CsvProfile(';', recordSeparator); - var reader = new CsvReaderProxy(profile); - using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) - { - var value = reader.CountRecordSeparators(streamReader); - Assert.That(value, Is.EqualTo(result)); - } - writer.Dispose(); - } - } - - [Test] - [TestCase("abc+abc+abc+abc", "+", 1)] - [TestCase("abc+abc+abc+abc", "+", 2)] - [TestCase("abc+abc+abc+abc", "+", 200)] - [TestCase("abc+@abc+@abc+@abc", "+@", 1)] - [TestCase("abc+@abc+@abc+@abc", "+@", 2)] - [TestCase("abc+@abc+@abc+@abc", "+@", 4)] - [TestCase("abc+@abc+@abc+@abc", "+@", 5)] - [TestCase("abc+@abc+@abc+@abc", "+@", 200)] - [TestCase("abc+@abc+abc+@abc", "+@", 1)] - [TestCase("abc+@abc+abc+@abc", "+@", 2)] - [TestCase("abc+@abc+abc+@abc", "+@", 4)] - [TestCase("abc+@abc+abc+@abc", "+@", 5)] - [TestCase("abc+@abc+abc+@abc", "+@", 200)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 1)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 2)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 4)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 5)] - [TestCase("abc+@abc+abc+@abc+@", "+@", 200)] - [TestCase("abc", "+@", 200)] - public void GetFirstRecord_Csv_CorrectResult(string text, string recordSeparator, int bufferSize) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var reader = new CsvReaderProxy(); - using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) - { - var value = reader.GetFirstRecord(streamReader, recordSeparator, bufferSize); - Assert.That(value, Is.EqualTo("abc" + recordSeparator).Or.EqualTo("abc")); - } - writer.Dispose(); - } - } - - [Test] - [TestCase("abc+abc++abc+abc", "++", 1)] - public void GetFirstRecord_CsvWithSemiSeparator_CorrectResult(string text, string recordSeparator, int bufferSize) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var reader = new CsvReaderProxy(); - using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) - { - var value = reader.GetFirstRecord(streamReader, recordSeparator, bufferSize); - Assert.That(value, Is.EqualTo("abc+abc" + recordSeparator).Or.EqualTo("abc+abc")); - } - writer.Dispose(); - } - } - - [Test] - [TestCase("abc+abc", "+", 1)] - [TestCase("abc+abc", "+", 2)] - [TestCase("abc+abc", "+", 200)] - [TestCase("abc+@abc", "+@", 1)] - [TestCase("abc+@abc", "+@", 2)] - [TestCase("abc+@abc", "+@", 4)] - [TestCase("abc+@abc", "+@", 5)] - [TestCase("abc+@abc", "+@", 200)] - [TestCase("abc;abc+@abc", "+@", 1)] - [TestCase("abc;abc+@abc", "+@", 2)] - [TestCase("abc;abc+@abc", "+@", 3)] - [TestCase("abc;abc+@abc", "+@", 4)] - [TestCase("abc;abc+@abc", "+@", 5)] - [TestCase("abc", "+@", 200)] - public void ReadNextRecord_Csv_CorrectResults(string text, string recordSeparator, int bufferSize) - { - Span buffer = stackalloc char[bufferSize]; - Span extra = stackalloc char[0]; - - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var profile = new CsvProfile(';', recordSeparator); - var reader = new CsvReaderProxy(profile); - using (var streamReader = new StreamReader(stream, Encoding.UTF8, true)) - { - var (values, _) = reader.ReadNextRecord(streamReader, buffer, ref extra); - Assert.That(values, Has.Length.GreaterThan(0)); - foreach (var value in values) - Assert.That(value, Is.EqualTo("abc")); - } - writer.Dispose(); - } - } - - [Test] - [TestCase("a+b+c#a+b#a#a+b", '+', "#", "?")] - public void Read_CsvWithCsvProfileMissingCell_CorrectResults(string text, char fieldSeparator, string recordSeparator, string missingCell) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var profile = new CsvProfile(fieldSeparator, '`', '`', recordSeparator, false, true, 4096, "_", missingCell); - var reader = new CsvReader(profile); - var dataTable = reader.Read(stream, Encoding.UTF8, 0); - - Assert.That(dataTable.Rows[0].ItemArray[0], Is.EqualTo("a")); - Assert.That(dataTable.Rows[0].ItemArray[1], Is.EqualTo("b")); - Assert.That(dataTable.Rows[0].ItemArray[2], Is.EqualTo("c")); - - Assert.That(dataTable.Rows[1].ItemArray[0], Is.EqualTo("a")); - Assert.That(dataTable.Rows[1].ItemArray[1], Is.EqualTo("b")); - Assert.That(dataTable.Rows[1].ItemArray[2], Is.EqualTo("?")); - - Assert.That(dataTable.Rows[2].ItemArray[0], Is.EqualTo("a")); - Assert.That(dataTable.Rows[2].ItemArray[1], Is.EqualTo("?")); - Assert.That(dataTable.Rows[2].ItemArray[2], Is.EqualTo("?")); - - Assert.That(dataTable.Rows[3].ItemArray[0], Is.EqualTo("a")); - Assert.That(dataTable.Rows[3].ItemArray[1], Is.EqualTo("b")); - Assert.That(dataTable.Rows[3].ItemArray[2], Is.EqualTo("?")); - - - writer.Dispose(); - } - } - - [Test] - [TestCase("a+b+c#a++c", '+', "#", "?", "a", "?", "c")] - [TestCase("a+b+c#+b+c", '+', "#", "?", "?", "b", "c")] - [TestCase("a+b+c#+b+", '+', "#", "?", "?", "b", "?")] - public void Read_CsvWithCsvProfileEmptyCell_CorrectResults(string text, char fieldSeparator, string recordSeparator, string emptyCell, params string[] expected) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - var profile = new CsvProfile(fieldSeparator, '`', '`', recordSeparator, false, true, 4096, emptyCell, "_"); - var reader = new CsvReader(profile); - var dataTable = reader.Read(stream, Encoding.UTF8, 0); - - Assert.That(dataTable.Rows[0].ItemArray[0], Is.EqualTo("a")); - Assert.That(dataTable.Rows[0].ItemArray[1], Is.EqualTo("b")); - Assert.That(dataTable.Rows[0].ItemArray[2], Is.EqualTo("c")); - - for (int i = 0; i < 3;i++) - Assert.That(dataTable.Rows[1].ItemArray[i], Is.EqualTo(expected[i])); - - writer.Dispose(); - } - } - - [Test] - [TestCase("abc", "+@", "abc")] - [TestCase("abc+@", "+@", "abc")] - [TestCase("abc\0\0\0", "+@", "abc")] - public void CleanRecord_Record_CorrectResult(string text, string recordSeparator, string result) - { - Span buffer = stackalloc char[64]; - text.CopyTo(buffer); - - var profile = new CsvProfile(';', recordSeparator); - var reader = new CsvReaderProxy(profile); - var (value, _) = reader.ReadNextRecord(buffer); - Assert.That(value[0], Is.EqualTo(result)); - } - - [Test] - [TestCase("abc\r\ndef\r\nghl\r\nijk", 1, 1)] - [TestCase("abc\r\ndef\r\nghl\r\nijk", 17, 1)] - [TestCase("abc\r\ndef\r\nghl\r\nijk", 18, 1)] - [TestCase("abc\r\ndef\r\nghl\r\nijk", 19, 1)] - [TestCase("abc\r\ndef\r\nghl\r\nijk", 512, 1)] - [TestCase("abc;xyz\r\ndef;xyz\r\nghl\r\n;ijk", 1, 2)] - [TestCase("abc;xyz\r\ndef;xyz\r\nghl\r\n;ijk", 512, 2)] - [TestCase("\"abc\";\"xyz\"\r\n\"def\";\"xyz\"\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] - [TestCase("abc;\"xyz\"\r\n\"def\";xyz\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] - [TestCase("abc;\"xyz\"\r\n\"def\";xyz\r\n\"ghl\"\r\n;\"ijk\"", 512, 2)] - public void Read_Csv_CorrectResult(string text, int bufferSize, int fieldCount) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var reader = new CsvReader(new CsvProfile(';', '\"', "\r\n", false, false, 4096, "(empty)", "(null)"), bufferSize); - var dataTable = reader.Read(stream); - Assert.That(dataTable.Rows, Has.Count.EqualTo(4)); - Assert.That(dataTable.Columns, Has.Count.EqualTo(fieldCount)); - foreach (DataRow row in dataTable.Rows) - { - foreach (var cell in row.ItemArray) - Assert.That(cell!.ToString(), Has.Length.EqualTo(3).Or.EqualTo("(empty)").Or.EqualTo("(null)")); - } - Assert.That(dataTable.Rows[0][0], Is.EqualTo("abc")); - if (dataTable.Columns.Count == 2) - Assert.That(dataTable.Rows[0][1], Is.EqualTo("xyz")); - writer.Dispose(); - } - } - - [Test] - [TestCase("'azerty';'';'alpha'", 3)] - [TestCase("'azerty';;'alpha'", 3)] - public void Read_CsvWithTextQualifier_CorrectResult(string text, int columnCount) - { - using (var stream = new MemoryStream()) - { - var writer = new StreamWriter(stream); - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var reader = new CsvReader(new CsvProfile(';', '\'', "\r\n", false, false, 4096, "foo", "(null)")); - var dataTable = reader.Read(stream); - Assert.That(dataTable.Columns, Has.Count.EqualTo(columnCount)); - Assert.That(dataTable.Rows[0][0], Is.EqualTo("azerty")); - Assert.That(dataTable.Rows[0][1], Is.EqualTo("foo")); - Assert.That(dataTable.Rows[0][2], Is.EqualTo("alpha")); - writer.Dispose(); - } - } - - [Test] - [TestCase("a;b;c\r\nd;e;f;g\r\n", 1, 1)] - [TestCase("a;b;c\r\nd;e;f\r\ng;h;i;j\r\n", 2, 1)] - [TestCase("a;b;c\r\nd;e;f\r\ng;h;i;j;k\r\n", 2, 2)] - public void Read_MoreFieldThanExpected_ExceptionMessage(string text, int rowNumber, int moreField) - { - using (var stream = new MemoryStream()) - { - using (var writer = new StreamWriter(stream)) - { - writer.Write(text); - writer.Flush(); - - stream.Position = 0; - - var profile = CsvProfile.SemiColumnDoubleQuote; - var reader = new CsvReader(profile, 1024); - - var ex = Assert.Throws(() => reader.Read(stream)); - Assert.That(ex!.Message, Does.Contain(string.Format("record {0} ", rowNumber + 1))); - Assert.That(ex.Message, Does.Contain(string.Format("{0} more", moreField))); - } - } - } - - [Test] - public void Read_EmptyValue_MatchWithEmpty() - { - using (var stream = new MemoryStream()) - { - using (var writer = new StreamWriter(stream)) - { - writer.Write("a;;c"); - writer.Flush(); - - stream.Position = 0; - - var profile = CsvProfile.SemiColumnDoubleQuote; - var reader = new CsvReaderProxy(profile); - var dataTable = reader.Read(stream); - Assert.That(dataTable.Rows[0][1], Is.EqualTo(string.Empty)); - } - } - } - - [Test] - public void Read_MissingValue_MatchWithNullValue() - { - using (var stream = new MemoryStream()) - { - using (var writer = new StreamWriter(stream)) - { - writer.Write("a;b;c\r\na;b\r\na;b;c"); - writer.Flush(); - - stream.Position = 0; - - var profile = new CsvProfile(';', '"', "\r\n", false, true, 512, string.Empty, "(null)"); - var reader = new CsvReaderProxy(profile); - var dataTable = reader.Read(stream); - Assert.That(dataTable.Rows[1][2], Is.EqualTo("(null)")); - } - } - } - - [Test] - [TestCase("a;b;c\r\n1;2;3")] - [TestCase("a;b;c\r\n1;2;3\r\n")] - [TestCase("a;b;c\r\n#\r\n1;2;3")] - [TestCase("a;b;c\r\n#x;y;z\r\n1;2;3")] - [TestCase("a;b;c\r\n1;2;3\r\n#x;y;z")] - [TestCase("#x;y;z\r\na;b;c\r\n1;2;3")] - [TestCase("#x;y;z\r\n#x;y;z\r\na;b;c\r\n1;2;3")] - [TestCase("#x;y;z\r\n#x;y;z\r\na;b;c\r\n1;2;3\r\n#1;2;3")] - public void Read_Comment_CommentedLinesSkipped(string content) - { - using (var stream = new MemoryStream()) - { - using (var writer = new StreamWriter(stream)) - { - writer.Write(content); - writer.Flush(); - - stream.Position = 0; - - var profile = new CsvProfile(new CsvDialectDescriptor { Header = false, Delimiter = ';', CommentChar = '#', DoubleQuote = false }); - var reader = new CsvReaderProxy(profile); - var dataTable = reader.Read(stream); - Assert.That(dataTable.Rows.Count, Is.EqualTo(2)); - Assert.That(dataTable.Columns.Count, Is.EqualTo(3)); - } - } - } - } -} diff --git a/PocketCsvReader.Testing/RecordParserTest.cs b/PocketCsvReader.Testing/RecordParserTest.cs new file mode 100644 index 0000000..9324ed1 --- /dev/null +++ b/PocketCsvReader.Testing/RecordParserTest.cs @@ -0,0 +1,331 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace PocketCsvReader.Testing; +public class RecordParserTest +{ + [Test] + [TestCase("foo")] + [TestCase("'foo'")] + [TestCase("foo;")] + [TestCase("'foo';")] + public void ReadNextRecord_SingleField_CorrectParsing(string record) + { + Span buffer = stackalloc char[64]; + record.CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + var (values, eof) = reader.ReadNextRecord(buffer); + Assert.That(eof, Is.True); + Assert.That(values, Has.Length.EqualTo(1)); + Assert.That(values.First(), Is.EqualTo("foo")); + } + + [TestCase("foo\r\n", "foo")] + [TestCase("foo;bar\r\n", "foo", "bar")] + [TestCase("foo;bar;\r\n", "foo", "bar")] + public void ReadNextRecord_RecordWithLineTerminator_CorrectParsing(string record, params string[] tokens) + { + Span buffer = stackalloc char[64]; + record.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new RecordParser(profile); + (var values, var _) = reader.ReadNextRecord(buffer); + Assert.That(values, Has.Length.EqualTo(tokens.Length)); + for (int i = 0; i < tokens.Length; i++) + Assert.That(values[i], Is.EqualTo(tokens[i])); + } + + [TestCase("foo", "foo")] + [TestCase("foo;bar", "foo", "bar")] + [TestCase("foo;bar;", "foo", "bar")] + public void ReadNextRecord_RecordWithoutLineTerminator_CorrectParsing(string record, params string[] tokens) + { + Span buffer = stackalloc char[64]; + record.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "(empty)", "(null)"); + var reader = new RecordParser(profile); + (var values, var _) = reader.ReadNextRecord(buffer); + Assert.That(values, Has.Length.EqualTo(tokens.Length)); + for (int i = 0; i < tokens.Length; i++) + Assert.That(values[i], Is.EqualTo(tokens[i])); + } + + [Test] + [TestCase("'ab'';'c';'xyz'")] + [TestCase("'ab'';'c''';'xyz'")] + [TestCase("'ab'';'''c';'xyz'")] + public void ReadNextRecord_RecordWithUnescapedTextQualifier_ThrowException(string record) + { + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + Assert.Throws(() => + { + Span buffer = stackalloc char[64]; + record.CopyTo(buffer); + reader.ReadNextRecord(buffer); + }); + } + + [Test] + [TestCase("abc;xyz", "abc")] + [TestCase("'abc';'xyz'", "abc")] + [TestCase("abc;'xyz'", "abc")] + [TestCase("'abc';xyz", "abc")] + [TestCase("'ab;c';xyz", "ab;c")] + [TestCase("'ab;;c';xyz", "ab;;c")] + [TestCase("'ab;;;c';xyz", "ab;;;c")] + [TestCase("'a;b;;c';xyz", "a;b;;c")] + [TestCase(";'xyz'", "")] + [TestCase(";xyz", "")] + [TestCase("'ab'';''c';'xyz'", "ab';'c")] + [TestCase("'ab'';''''c';'xyz'", "ab';''c")] + [TestCase("'a''b'';c';'xyz'", "a'b';c")] + public void ReadNextRecord_RecordWithTwoFields_CorrectParsing(string record, string firstToken) + { + Span buffer = stackalloc char[64]; + record.AsSpan().CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, false, 4096, "", "(null)"); + var reader = new RecordParser(profile); + (var values, var _) = reader.ReadNextRecord(buffer); + Assert.That(values[0], Is.EqualTo(firstToken)); + Assert.That(values[1], Is.EqualTo("xyz")); + } + + [Test] + [TestCase("'fo;o'", "fo;o")] + [TestCase("'fo;o';", "fo;o")] + [TestCase("';foo';", ";foo")] + [TestCase("'foo;';", "foo;")] + public void ReadNextRecord_SingleFieldWithTextQualifier_CorrectParsing(string record, string expected) + { + Span buffer = stackalloc char[64]; + record.CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + var (values, _) = reader.ReadNextRecord(buffer); + Assert.That(values, Has.Length.EqualTo(1)); + Assert.That(values.First(), Is.EqualTo(expected)); + } + + [Test] + [TestCase("'fo''o'", '\'')] + [TestCase("'fo?'o'", '?')] + [TestCase("'fo\\'o'", '\\')] + public void ReadNextRecord_SingleFieldWithTextEscaper_CorrectParsing(string record, char escapeTextQualifier) + { + Span buffer = stackalloc char[64]; + record.CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', escapeTextQualifier, "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + var (values, _) = reader.ReadNextRecord(buffer); + Assert.That(values, Has.Length.EqualTo(1)); + Assert.That(values.First(), Is.EqualTo("fo'o")); + } + + [Test] + [TestCase("abc;xyz;123", "123")] + [TestCase("'abc';'xyz';'123'", "123")] + [TestCase("abc;'xyz';123", "123")] + [TestCase("'abc';xyz;123", "123")] + [TestCase("'abc';xyz;'123'", "123")] + [TestCase("'ab;;;c';xyz;;", "")] + [TestCase("'a;b;;c';'x;;;y;;z';123", "123")] + [TestCase(";'xyz';;", "")] + [TestCase(";;;", "")] + public void ReadNextRecord_RecordWithThreeFields_CorrectParsing(string record, string thirdToken) + { + Span buffer = stackalloc char[64]; + record.CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + var (values, _) = reader.ReadNextRecord(buffer); + Assert.That(values, Has.Length.EqualTo(3)); + Assert.That(values[2], Is.EqualTo(thirdToken)); + } + + [Test] + public void ReadNextRecord_NullField_NullValue() + { + Span buffer = stackalloc char[64]; + "a;(null)".CopyTo(buffer); + + var profile = new CsvProfile(';', '\'', '\'', "\r\n", false, true, 4096, string.Empty, string.Empty); + var reader = new RecordParser(profile); + var (values, eof) = reader.ReadNextRecord(buffer); + Assert.That(eof, Is.True); + Assert.That(values, Has.Length.EqualTo(2)); + Assert.That(values[1], Is.Null); + } + + [Test] + [TestCase("abc+abc", "+", 1)] + [TestCase("abc+abc", "+", 2)] + [TestCase("abc+abc", "+", 200)] + [TestCase("abc+@abc", "+@", 1)] + [TestCase("abc+@abc", "+@", 2)] + [TestCase("abc+@abc", "+@", 4)] + [TestCase("abc+@abc", "+@", 5)] + [TestCase("abc+@abc", "+@", 200)] + [TestCase("abc;abc+@abc", "+@", 1)] + [TestCase("abc;abc+@abc", "+@", 2)] + [TestCase("abc;abc+@abc", "+@", 3)] + [TestCase("abc;abc+@abc", "+@", 4)] + [TestCase("abc;abc+@abc", "+@", 5)] + [TestCase("abc", "+@", 200)] + public void ReadNextRecord_Csv_CorrectResults(string text, string recordSeparator, int bufferSize) + { + Span buffer = stackalloc char[bufferSize]; + Span extra = stackalloc char[0]; + + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var profile = new CsvProfile(';', recordSeparator); + var reader = new RecordParser(profile); + using (var streamReader = new StreamReader(stream, Encoding.UTF8, true)) + { + var (values, _) = reader.ReadNextRecord(streamReader, buffer, ref extra); + Assert.That(values, Has.Length.GreaterThan(0)); + foreach (var value in values) + Assert.That(value, Is.EqualTo("abc")); + } + writer.Dispose(); + } + } + [Test] + [TestCase("abc", "+@", "abc")] + [TestCase("abc+@", "+@", "abc")] + [TestCase("abc\0\0\0", "+@", "abc")] + public void CleanRecord_Record_CorrectResult(string text, string recordSeparator, string result) + { + Span buffer = stackalloc char[64]; + text.CopyTo(buffer); + + var profile = new CsvProfile(';', recordSeparator); + var reader = new RecordParser(profile); + var (value, _) = reader.ReadNextRecord(buffer); + Assert.That(value[0], Is.EqualTo(result)); + } + + + [Test] + [TestCase("abc+abc+abc+abc", "+", 1, 4)] + [TestCase("abc+abc+abc+abc", "+", 2, 4)] + [TestCase("abc+abc+abc+abc", "+", 200, 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 1, 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 2, 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 4, 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 5, 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 200, 4)] + [TestCase("abc+@abc+abc+@abc", "+@", 1, 3)] + [TestCase("abc+@abc+abc+@abc", "+@", 2, 3)] + [TestCase("abc+@abc+abc+@abc", "+@", 4, 3)] + [TestCase("abc+@abc+abc+@abc", "+@", 5, 3)] + [TestCase("abc+@abc+abc+@abc", "+@", 200, 3)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 1, 3)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 2, 3)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 4, 3)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 5, 3)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 200, 3)] + [TestCase("abc", "+@", 200, 1)] + public void CountRecordSeparator_Csv_CorrectCount(string text, string recordSeparator, int bufferSize, int result) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var profile = new CsvProfile(';', recordSeparator); + var reader = new RecordParser(profile); + using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) + { + var value = reader.CountRecordSeparators(streamReader); + Assert.That(value, Is.EqualTo(result)); + } + writer.Dispose(); + } + } + + + [Test] + [TestCase("abc+abc+abc+abc", "+", 1)] + [TestCase("abc+abc+abc+abc", "+", 2)] + [TestCase("abc+abc+abc+abc", "+", 200)] + [TestCase("abc+@abc+@abc+@abc", "+@", 1)] + [TestCase("abc+@abc+@abc+@abc", "+@", 2)] + [TestCase("abc+@abc+@abc+@abc", "+@", 4)] + [TestCase("abc+@abc+@abc+@abc", "+@", 5)] + [TestCase("abc+@abc+@abc+@abc", "+@", 200)] + [TestCase("abc+@abc+abc+@abc", "+@", 1)] + [TestCase("abc+@abc+abc+@abc", "+@", 2)] + [TestCase("abc+@abc+abc+@abc", "+@", 4)] + [TestCase("abc+@abc+abc+@abc", "+@", 5)] + [TestCase("abc+@abc+abc+@abc", "+@", 200)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 1)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 2)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 4)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 5)] + [TestCase("abc+@abc+abc+@abc+@", "+@", 200)] + [TestCase("abc", "+@", 200)] + public void GetFirstRecord_Csv_CorrectResult(string text, string recordSeparator, int bufferSize) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var reader = new RecordParser(CsvProfile.SemiColumnDoubleQuote); + using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) + { + var value = reader.GetFirstRecord(streamReader, recordSeparator, bufferSize); + Assert.That(value, Is.EqualTo("abc" + recordSeparator).Or.EqualTo("abc")); + } + writer.Dispose(); + } + } + + [Test] + [TestCase("abc+abc++abc+abc", "++", 1)] + public void GetFirstRecord_CsvWithSemiSeparator_CorrectResult(string text, string recordSeparator, int bufferSize) + { + using (var stream = new MemoryStream()) + { + var writer = new StreamWriter(stream); + writer.Write(text); + writer.Flush(); + + stream.Position = 0; + + var reader = new RecordParser(CsvProfile.SemiColumnDoubleQuote); + using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8, true)) + { + var value = reader.GetFirstRecord(streamReader, recordSeparator, bufferSize); + Assert.That(value, Is.EqualTo("abc+abc" + recordSeparator).Or.EqualTo("abc+abc")); + } + writer.Dispose(); + } + } +} diff --git a/PocketCsvReader.Testing/StreamExtensionsTest.cs b/PocketCsvReader.Testing/StreamExtensionsTest.cs new file mode 100644 index 0000000..064552f --- /dev/null +++ b/PocketCsvReader.Testing/StreamExtensionsTest.cs @@ -0,0 +1,36 @@ +using PocketCsvReader; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Data; +using System.IO; +using System.Linq; +using System.Text; +using System.Reflection; +using Moq; + +namespace PocketCsvReader.Testing +{ + [TestFixture] + public class StreamExtensionsTest + { + [Test] + public void GetStreamEncoding_Financial_CorrectEncodingInfo() + { + using (var stream = + Assembly.GetExecutingAssembly() + .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.Utf8.csv") + ?? throw new FileNotFoundException() + ) + { + using var reader = new StreamReader(stream); + for (int i = 0; i < 3; i++) + reader.Read(); + Assert.That(reader.BaseStream.Position, Is.GreaterThan(0)); + + reader.Rewind(); + Assert.That(reader.BaseStream.Position, Is.EqualTo(0)); + } + } + } +} diff --git a/PocketCsvReader/CsvDataReader.cs b/PocketCsvReader/CsvDataReader.cs index 1d5b44a..23d13c4 100644 --- a/PocketCsvReader/CsvDataReader.cs +++ b/PocketCsvReader/CsvDataReader.cs @@ -3,20 +3,18 @@ using System.Data; using System.Diagnostics.CodeAnalysis; using System.IO; -using System.Linq; using System.Text; using System.Threading.Tasks; -using System.Xml.Linq; namespace PocketCsvReader; public class CsvDataReader : IDataReader { private bool _isClosed = false; - protected CsvReader CsvReader { get; } + protected RecordParser RecordParser { get; } protected Stream Stream { get; } protected StreamReader? StreamReader { get; private set; } - protected FileEncoding? FileEncoding { get; private set; } + protected EncodingInfo? FileEncoding { get; private set; } protected bool IsEof { get; private set; } = false; public int RowCount { get; private set; } = 0; @@ -25,25 +23,19 @@ public class CsvDataReader : IDataReader public string[]? Fields { get; private set; } = null; public string?[]? Values { get; private set; } = null; - public CsvDataReader(CsvReader csvReader, Stream stream) + public CsvDataReader(RecordParser recordParser, Stream stream) { - CsvReader = csvReader; + RecordParser = recordParser; Stream = stream; } - protected void DetectEncoding() - { - var (encoding, encodingBytesCount) = CsvReader.GetStreamEncoding(Stream); - FileEncoding = new FileEncoding(encoding, encodingBytesCount); - } - public void Initialize() { - DetectEncoding(); + FileEncoding ??= new EncodingDetector().GetStreamEncoding(Stream); StreamReader = new StreamReader(Stream, FileEncoding!.Encoding, false); var bufferBOM = new char[1]; StreamReader.Read(bufferBOM, 0, bufferBOM.Length); - CsvReader.Rewind(StreamReader); + StreamReader.Rewind(); if (FileEncoding!.BomBytesCount > 0) StreamReader.BaseStream.Position = FileEncoding!.BomBytesCount; @@ -65,7 +57,7 @@ public bool Read() if (IsEof) return false; - (Values, IsEof) = CsvReader.ReadNextRecord(StreamReader, buffer, ref extra); + (Values, IsEof) = RecordParser.ReadNextRecord(StreamReader, buffer, ref extra); if (IsEof && Values!.Length == 0) { Values = null; @@ -80,7 +72,7 @@ public bool Read() if (RowCount == 0 && Fields is null) { int unnamedFieldIndex = 0; - if (CsvReader.Profile.Descriptor.Header) + if (RecordParser.Profile.Descriptor.Header) { Fields = Values.Select(value => value ?? $"field_{unnamedFieldIndex++}").ToArray(); return Read(); @@ -99,7 +91,7 @@ public bool Read() string.Format ( "The record {0} contains {1} more field{2} than expected." - , RowCount + Convert.ToInt32(CsvReader.Profile.Descriptor.Header) + , RowCount + Convert.ToInt32(RecordParser.Profile.Descriptor.Header) , Values.Length - Fields!.Length , Values.Length - Fields.Length > 1 ? "s" : string.Empty ) @@ -110,7 +102,7 @@ public bool Read() { var list = new List(Values); while (Fields!.Length > list.Count) - list.Add(CsvReader.Profile.MissingCell); + list.Add(RecordParser.Profile.MissingCell); Values = [.. list]; } } diff --git a/PocketCsvReader/CsvDataTable.cs b/PocketCsvReader/CsvDataTable.cs new file mode 100644 index 0000000..ecd9697 --- /dev/null +++ b/PocketCsvReader/CsvDataTable.cs @@ -0,0 +1,97 @@ +using System; +using System.Collections.Generic; +using System.Data; +using System.Linq; +using System.Text; + +namespace PocketCsvReader; +public class CsvDataTable +{ + protected RecordParser RecordParser { get; } + protected Stream Stream { get; } + protected int BufferSize { get; private set; } = 4 * 1024; + + public CsvDataTable(RecordParser recordParser, Stream stream) + { + RecordParser = recordParser; + Stream = stream; + } + + public DataTable CreateTable(string[] headers) + { + var table = new DataTable(); + foreach (var header in headers) + table.Columns.Add(header); + return table; + } + + public DataTable Read() + { + int i; + + using (var reader = new StreamReader(Stream)) + { + //Move and rewind to be sure that the BOM is not skipped by internal implementation of StreamReader + var bufferBOM = new char[1]; + reader.Read(bufferBOM, 0, bufferBOM.Length); + reader.Rewind(); + + var (encoding, encodingBytesCount) = new EncodingDetector().GetStreamEncoding(Stream); + reader.Rewind(); + + var count = RecordParser.CountRecords(reader); + if (count is not null) + reader.Rewind(encodingBytesCount); + + Span extra = stackalloc char[0]; + Span buffer = stackalloc char[BufferSize]; + var headers = RecordParser.ReadHeader(reader, buffer, ref extra); + var table = CreateTable(headers); + if (!RecordParser.Profile.Descriptor.Header) + { + reader.Rewind(encodingBytesCount); + extra = []; + } + + bool isEof = false; + i = 0; + while (!isEof) + { + i++; + buffer.Clear(); + var (fields, eof) = RecordParser.ReadNextRecord(reader, buffer, ref extra); + isEof = eof; + + if (!(isEof && fields.Length == 0)) + { + var row = table.NewRow(); + if (row.ItemArray.Length < fields.Length) + throw new InvalidDataException + ( + string.Format + ( + "The record {0} contains {1} more field{2} than expected." + , table.Rows.Count + 1 + Convert.ToInt32(RecordParser.Profile.Descriptor.Header) + , fields.Length - row.ItemArray.Length + , fields.Length - row.ItemArray.Length > 1 ? "s" : string.Empty + ) + ); + + //fill the missing cells + if (row.ItemArray.Length > fields.Length) + { + var list = new List(fields); + while (row.ItemArray.Length > list.Count) + list.Add(RecordParser.Profile.MissingCell); + fields = [.. list]; + } + + row.ItemArray = fields.ToArray(); + table.Rows.Add(row); + } + } + + return table; + } + } +} diff --git a/PocketCsvReader/CsvReader.cs b/PocketCsvReader/CsvReader.cs index d3b41aa..82480c1 100644 --- a/PocketCsvReader/CsvReader.cs +++ b/PocketCsvReader/CsvReader.cs @@ -1,11 +1,6 @@ -using System; -using System.Buffers; -using System.Collections.Generic; -using System.ComponentModel.Design; +using System.Buffers; using System.Data; using System.IO; -using System.Linq; -using System.Reflection; using System.Text; namespace PocketCsvReader @@ -13,6 +8,8 @@ namespace PocketCsvReader public class CsvReader { public event ProgressStatusHandler? ProgressStatusChanged; + protected IEncodingDetector EncodingDetector { get; set; } = new EncodingDetector(); + protected RecordParser RecordParser { get; set; } protected internal CsvProfile Profile { get; private set; } protected int BufferSize { get; private set; } @@ -31,7 +28,8 @@ public CsvReader(int bufferSize) public CsvReader(CsvProfile profile, int bufferSize) { - this.Profile = profile; + Profile = profile; + RecordParser = new RecordParser(profile); BufferSize = bufferSize; } @@ -49,10 +47,8 @@ protected void RaiseProgressStatus(string status, int current, int total) public DataTable ToDataTable(string filename) { CheckFileExists(filename); - var (encoding, encodingBytesCount) = GetFileEncoding(filename); - - using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize)) - return Read(stream, encoding, encodingBytesCount); + using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read)) + return ToDataTable(stream); } /// @@ -64,12 +60,23 @@ public DataTable ToDataTable(string filename) /// This method reads the entire CSV content, assuming that each line represents a new row and each comma-separated value represents a field within that row. /// public DataTable ToDataTable(Stream stream) + => new CsvDataTable(new RecordParser(Profile), stream).Read(); + + /// + /// Read the CSV file, overriding the value of isFirstRowHeader defined in the profile. + /// + /// Name of the CSV file + /// Overrides the value of isFirstRowHeader defined in the profile + /// A DataTable containing all the records (rows) and fields (columns) available in the CSV file + public DataTable ToDataTable(string filename, bool isFirstRowHeader) { - var (encoding, encodingBytesCount) = GetStreamEncoding(stream); + Profile.Descriptor.Header = isFirstRowHeader; - return Read(stream, encoding, encodingBytesCount); + using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read)) + return ToDataTable(stream); } + /// /// Reads the specified CSV file and returns an for iterating over its records and fields. /// @@ -79,14 +86,12 @@ public DataTable ToDataTable(Stream stream) /// This method provides an for efficient, read-only, forward-only access to CSV data, /// suitable for large files or cases where full file loading into memory is unnecessary. /// - //public IDataReader ToDataReader(string filename) - //{ - // CheckFileExists(filename); - // var (encoding, encodingBytesCount) = GetFileEncoding(filename); - - // using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize)) - // return Read(stream, encoding, encodingBytesCount); - //} + public IDataReader ToDataReader(string filename) + { + CheckFileExists(filename); + var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize); + return new CsvDataReader(new RecordParser(Profile), stream); + } /// /// Reads the CSV data from the provided stream and returns an for efficient record-by-record access. @@ -98,595 +103,12 @@ public DataTable ToDataTable(Stream stream) /// ideal for handling large datasets without loading the entire file into memory at once. /// public IDataReader ToDataReader(Stream stream) - { - var dataReader = new CsvDataReader(this, stream); - return dataReader; - } - - /// - /// Read the CSV file, overriding the value of isFirstRowHeader defined in the profile. - /// - /// Name of the CSV file - /// Overrides the value of isFirstRowHeader defined in the profile - /// A DataTable containing all the records (rows) and fields (columns) available in the CSV file - public DataTable ToDataTable(string filename, bool isFirstRowHeader) - { - CheckFileExists(filename); - var (encoding, encodingBytesCount) = GetFileEncoding(filename); - Profile.Descriptor.Header = isFirstRowHeader; - - using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read)) - return Read(stream, encoding, encodingBytesCount); - } + => new CsvDataReader(RecordParser, stream); protected virtual void CheckFileExists(string filename) { if (!File.Exists(filename)) throw new FileNotFoundException($"The file {filename} was not found.", filename); } - - protected internal DataTable Read(Stream stream) - => Read(stream, Encoding.UTF8, 0); - - protected internal DataTable Read(Stream stream, Encoding encoding, int encodingBytesCount) - { - RaiseProgressStatus("Starting to process the CSV file ..."); - int i; - - using (var reader = new StreamReader(stream, encoding, false)) - { - //Move and rewind to be sure that the BOM is not skipped by internal implementation of StreamReader - var bufferBOM = new char[1]; - reader.Read(bufferBOM, 0, bufferBOM.Length); - Rewind(reader); - - var count = CountRecords(reader); - Rewind(reader); - var table = DefineFields(reader, encodingBytesCount); - Rewind(reader); - - if (encodingBytesCount > 0) - reader.BaseStream.Position = encodingBytesCount; - - bool isEof = false; - i = 0; - Span buffer = stackalloc char[BufferSize]; - Span extra = stackalloc char[0]; - - while (!isEof) - { - if (count.HasValue) - RaiseProgressStatus($"Loading row {i} of {count} ...", i, count.Value); - else - RaiseProgressStatus($"Loading row {i}{(count.HasValue ? $" of {count}" : string.Empty)} ..."); - - i++; - buffer.Clear(); - var (fields, eof) = ReadNextRecord(reader, buffer, ref extra); - isEof = eof; - - if ((i != 1 || !Profile.Descriptor.Header) && !(isEof && fields.Length == 0)) - { - var row = table.NewRow(); - if (row.ItemArray.Length < fields.Length) - throw new InvalidDataException - ( - string.Format - ( - "The record {0} contains {1} more field{2} than expected." - , table.Rows.Count + 1 + Convert.ToInt32(Profile.Descriptor.Header) - , fields.Length - row.ItemArray.Length - , fields.Length - row.ItemArray.Length > 1 ? "s" : string.Empty - ) - ); - - //fill the missing cells - if (row.ItemArray.Length > fields.Length) - { - var list = new List(fields); - while (row.ItemArray.Length > list.Count) - list.Add(Profile.MissingCell); - fields = [.. list]; - } - - row.ItemArray = fields.ToArray(); - table.Rows.Add(row); - } - - } - RaiseProgressStatus("CSV file fully processed."); - - return table; - } - } - - protected internal static void Rewind(StreamReader reader) - { - reader.BaseStream.Position = 0; - reader.DiscardBufferedData(); - } - - protected internal virtual DataTable DefineFields(StreamReader reader, int encodingBytesCount) - { - //Get first record to know the count of fields - RaiseProgressStatus("Defining fields"); - var columnCount = 0; - var columnNames = new List(); - var firstLine = GetFirstRecord(reader, Profile.Descriptor.LineTerminator, BufferSize); - if (encodingBytesCount > 0) - firstLine = firstLine.Substring(encodingBytesCount, firstLine.Length - encodingBytesCount); - if (firstLine.EndsWith(Profile.Descriptor.LineTerminator)) - firstLine = firstLine.Substring(0, firstLine.Length - Profile.Descriptor.LineTerminator.Length); - columnCount = firstLine.Split(Profile.Descriptor.Delimiter).Length; - if (Profile.Descriptor.Header) - columnNames.AddRange(GetFields(firstLine, Profile.Descriptor.Delimiter, Profile.Descriptor.QuoteChar, Profile.Descriptor.EscapeChar, string.Empty)!); - - //Correctly define the columns for the table - var table = new DataTable(); - for (int c = 0; c < columnCount; c++) - { - if (columnNames.Count == 0) - table.Columns.Add(string.Format("No name {0}", c.ToString()), typeof(string)); - else - table.Columns.Add(columnNames[c], typeof(string)); - } - RaiseProgressStatus($"{table.Columns.Count} field{(table.Columns.Count > 1 ? "s were" : " was")} identified."); - - - return table; - } - - /// - /// Detects the byte order mark of a streams and returns - /// an appropriate encoding for the file. - /// - /// The stream to analyze for the encoding - /// - protected internal virtual (Encoding, int) GetStreamEncoding(Stream stream) - { - // Default = Ansi CodePage - var encoding = Encoding.Default; - - // Detect byte order mark if any - otherwise assume default - var buffer = new byte[5]; - var n = stream.Read(buffer, 0, 5); - - if (n < 2) - return (Encoding.ASCII, 0); - - var encodingBytesCount = 0; - - if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) - (encoding, encodingBytesCount) = (Encoding.UTF8, 3); - else if (buffer[0] == 0xff && buffer[1] == 0xfe) - (encoding, encodingBytesCount) = (Encoding.Unicode, 2); - else if (buffer[0] == 0xfe && buffer[1] == 0xff) - (encoding, encodingBytesCount) = (Encoding.BigEndianUnicode, 2); - else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff) - (encoding, encodingBytesCount) = (Encoding.UTF32, 4); - //else if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76) - // encoding = Encoding.UTF7; - - encoding = encoding.Equals(Encoding.Default) ? Encoding.UTF8 : encoding; - RaiseProgressStatus($"Encoding bytes was set to {encoding}{(encodingBytesCount > 0 ? $"and {encodingBytesCount} byte is used by the BOM" : string.Empty)}."); - return (encoding, encodingBytesCount); - } - - /// - /// Detects the byte order mark of a file and returns - /// an appropriate encoding for the file. - /// - /// - /// - protected virtual (Encoding, int) GetFileEncoding(string filename) - { - using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, 8, false)) - return GetStreamEncoding(stream); - } - - protected virtual int? CountRecords(StreamReader reader) - { - if (Profile.PerformanceOptmized) - return null; - - RaiseProgressStatus("Counting records ..."); - var count = CountRecordSeparators(reader); - count -= Convert.ToInt16(Profile.Descriptor.Header); - RaiseProgressStatus($"{count} record{(count > 1 ? "s were" : " was")} identified."); - - reader.BaseStream.Position = 0; - reader.DiscardBufferedData(); - return count; - } - - protected virtual int CountRecordSeparators(StreamReader reader) - { - int i = 0; - int n = 0; - int j = 0; - bool separatorAtEnd = false; - bool isCommentLine = false; - bool isFirstCharOfLine = true; - - do - { - char[] buffer = new char[BufferSize]; - n = reader.Read(buffer, 0, BufferSize); - if (n > 0 && i == 0) - i = 1; - - - foreach (var c in buffer) - { - if (c != '\0') - { - if (c == Profile.Descriptor.CommentChar && isFirstCharOfLine) - isCommentLine = true; - isFirstCharOfLine = false; - - separatorAtEnd = false; - if (c == Profile.Descriptor.LineTerminator[j]) - { - j++; - if (j == Profile.Descriptor.LineTerminator.Length) - { - if (!isCommentLine) - i++; - j = 0; - separatorAtEnd = true; - isCommentLine = false; - isFirstCharOfLine = true; - } - } - else - j = 0; - } - } - } while (n > 0); - - if (separatorAtEnd) - i -= 1; - - if (isCommentLine) - i -= 1; - - return i; - } - - protected virtual string?[] GetFields(ReadOnlySpan record, char fieldSeparator, char textQualifier, char escapeTextQualifier, string emptyCell) - { - var fields = new List(); - var fieldStart = 0; - var startsByTextQualifier = false; - var endsByTextQualifier = false; - var isEscaped = false; - - for (var fieldPos = 0; fieldPos < record.Length; fieldPos++) - { - if (fieldPos == fieldStart && record[fieldPos] == textQualifier) - startsByTextQualifier = true; - else if (record[fieldPos] == textQualifier && !isEscaped) - endsByTextQualifier = true; - else if (record[fieldPos] != fieldSeparator) - endsByTextQualifier = false; - - if (fieldPos == record.Length - 1 - || (record[fieldPos] == fieldSeparator - && startsByTextQualifier == endsByTextQualifier) - ) - { - - if (fieldPos == record.Length - 1 && record[fieldPos] != fieldSeparator) - fieldPos += 1; - - var field = startsByTextQualifier - ? record.Slice(fieldStart + 1, fieldPos - fieldStart - 2) - : record.Slice(fieldStart, fieldPos - fieldStart); - - if (field.Length == 0) - fields.Add(emptyCell); - else if (field.ToString() == "(null)") - fields.Add(null); - else if (field.Contains(escapeTextQualifier)) - { - var candidate = field.ToString(); - CheckTextQualifierEscapation(candidate, textQualifier, escapeTextQualifier); - fields.Add(candidate.Replace(new string(new[] { escapeTextQualifier, textQualifier }), textQualifier.ToString())); - } - else - fields.Add(field.ToString()); - fieldStart = fieldPos + 1; - startsByTextQualifier = false; - endsByTextQualifier = false; - } - - if (fieldPos < record.Length && record[fieldPos] == escapeTextQualifier && fieldPos != fieldStart) - isEscaped = true; - else - isEscaped = false; - } - return [.. fields]; - } - - - private static void CheckTextQualifierEscapation(string value, char textQualifier, char escapeTextQualifier) - { - if (string.IsNullOrEmpty(value)) - return; - - if (!value.Contains(textQualifier)) - return; - - var indexes = new List(); - int j = -1; - do - { - j = value.IndexOf(textQualifier, j + 1); - if (j != -1) - indexes.Add(j); - - } while (j != -1 && j < value.Length - 1); - - if (textQualifier == escapeTextQualifier) - { - if (indexes.Count() == 1) - throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {indexes[0]}"); - - var i = 1; - while (i < indexes.Count()) - { - if ((i + 1) % 2 == 0) - { - if (indexes[i - 1] != indexes[i] - 1) - throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {i}"); - } - else if (i == indexes.Count - 1 || indexes[i + 1] != indexes[i] + 1) - throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {i}"); - i += 1; - } - } - else - foreach (var index in indexes) - if (index == 0 || value[index - 1] != escapeTextQualifier) - throw new ArgumentException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {index}"); - } - - protected virtual string GetFirstRecord(StreamReader reader, string recordSeparator, int bufferSize) - { - var stringBuilder = new StringBuilder(); - int j = 0; - - while (true) - { - char[] buffer = new char[bufferSize]; - reader.Read(buffer, 0, bufferSize); - - foreach (var c in buffer) - { - - if (c != '\0') - { - stringBuilder.Append(c); - if (c == recordSeparator[j]) - { - j++; - if (j == recordSeparator.Length) - return stringBuilder.ToString(); - } - else - j = 0; - } - else - return stringBuilder.ToString(); - } - } - } - - private static ReadOnlySpan Prepend(string prefix, ReadOnlySpan value) - { - Span buffer = new char[prefix.Length + value.Length]; - prefix.AsSpan().CopyTo(buffer); - value.CopyTo(buffer.Slice(prefix.Length)); - return buffer; - } - protected virtual (string?[], bool) ReadNextRecord(Span buffer) - { - Span extra = buffer; - return ReadNextRecord(null, buffer, ref extra); - } - - protected internal virtual (string?[], bool) ReadNextRecord(StreamReader? reader, Span buffer, ref Span extra) - { - var bufferSize = 0; - var index = 0; - var eof = false; - var isFirstCharOfRecord = true; - var indexRecordSeparator = 0; - var isFirstCharOfField = true; - var fields = new List(); - var indexFieldStart = 0; - var isCommentLine = false; - var isFieldWithTextQualifier = false; - var isEndingByTextQualifier = false; - var isTextQualifierEscaped = false; - Span longField = stackalloc char[0]; - var longFieldIndex = 0; - var isLastCharDelimiter = false; - - if (extra.Length > 0) - { - extra.CopyTo(buffer); - bufferSize = extra.Length; - } - else - { - bufferSize = reader?.ReadBlock(buffer) ?? throw new ArgumentNullException(nameof(reader)); - eof = bufferSize == 0; - } - - while (!eof && index < bufferSize) - { - char c = buffer[index]; - if (c == '\0') - { - eof = true; - break; - } - - if (isFirstCharOfRecord) - { - isCommentLine = c == Profile.Descriptor.CommentChar; - isFirstCharOfRecord = false; - } - - if (isFirstCharOfField) - { - isFieldWithTextQualifier = c == Profile.Descriptor.QuoteChar; - isFirstCharOfField = false; - isEndingByTextQualifier = false; - isTextQualifierEscaped = false; - } - else if (c != Profile.Descriptor.Delimiter && c != Profile.Descriptor.LineTerminator[indexRecordSeparator] && !isFirstCharOfField) - { - isEndingByTextQualifier = c == Profile.Descriptor.QuoteChar && !isTextQualifierEscaped; - isTextQualifierEscaped = c == Profile.Descriptor.EscapeChar && !isTextQualifierEscaped; - } - - if (c == Profile.Descriptor.Delimiter && !isCommentLine && (isFieldWithTextQualifier == isEndingByTextQualifier)) - { - if (longFieldIndex == 0) - fields.Add(ReadField(buffer, indexFieldStart, index, isFieldWithTextQualifier, isEndingByTextQualifier)); - else - { - fields.Add(ReadField(longField, longFieldIndex, buffer, index, isFieldWithTextQualifier, isEndingByTextQualifier)); - longField = ArrayPool.Shared.Rent(0); - longFieldIndex = 0; - } - isFirstCharOfField = true; - indexFieldStart = index + 1; - } - - if (c == Profile.Descriptor.LineTerminator[indexRecordSeparator]) - { - indexRecordSeparator++; - if (indexRecordSeparator == Profile.Descriptor.LineTerminator.Length) - { - if (!isCommentLine) - { - if (indexFieldStart <= index + longFieldIndex - Profile.Descriptor.LineTerminator.Length) - { - if (longFieldIndex == 0) - fields.Add(ReadField(buffer, indexFieldStart, index - Profile.Descriptor.LineTerminator.Length + 1, isFieldWithTextQualifier, isEndingByTextQualifier)); - else - { - fields.Add(ReadField(longField, longFieldIndex, buffer, index - Profile.Descriptor.LineTerminator.Length + 1, isFieldWithTextQualifier, isEndingByTextQualifier)); - longField = ArrayPool.Shared.Rent(0); - longFieldIndex = 0; - } - } - - extra = ArrayPool.Shared.Rent(bufferSize - index - 1); - extra = extra.Slice(0, bufferSize - index - 1); - buffer.Slice(index + 1, bufferSize - index - 1).CopyTo(extra); - buffer.Clear(); - return (fields.ToArray(), false); - } - else - { - bufferSize = bufferSize - index; - buffer = buffer.Slice(index + 1); - isCommentLine = false; - index = -1; - indexFieldStart = 0; - } - isFirstCharOfRecord = true; - isFirstCharOfField = true; - indexRecordSeparator = 0; - isFieldWithTextQualifier = false; - isEndingByTextQualifier = false; - } - } - else - indexRecordSeparator = 0; - - - - if (++index == bufferSize) - { - if (longField.Length >= longFieldIndex + index - indexFieldStart) - { - buffer.Slice(indexFieldStart, index - indexFieldStart).CopyTo(longField.Slice(longFieldIndex)); - } - else - { - var newArray = ArrayPool.Shared.Rent(longFieldIndex + index - indexFieldStart); - longField.CopyTo(newArray); - buffer.Slice(indexFieldStart, index - indexFieldStart).ToArray().CopyTo(newArray, longFieldIndex); - longField = newArray; - } - - longFieldIndex += index - indexFieldStart; - indexFieldStart = 0; - bufferSize = reader?.ReadBlock(buffer) ?? throw new ArgumentNullException(nameof(reader)); - eof = bufferSize == 0; - index = 0; - if (eof) - isLastCharDelimiter = true; - } - } - - if (eof && (index != indexFieldStart || longFieldIndex > 0 || isLastCharDelimiter) && !isCommentLine) - if (longFieldIndex == 0) - if (isLastCharDelimiter) - fields.Add(Profile.EmptyCell); - else - fields.Add(ReadField(buffer, indexFieldStart, index, isFieldWithTextQualifier, isEndingByTextQualifier)); - else - fields.Add(ReadField(longField, longFieldIndex, buffer, index, isFieldWithTextQualifier, isEndingByTextQualifier)); - - return (fields.ToArray(), eof); - } - - protected internal string? ReadField(Span longField, int longFieldIndex, ReadOnlySpan buffer, int currentIndex, bool isFieldWithTextQualifier, bool isFieldEndingByTextQualifier) - { - if (longField.Length >= longFieldIndex + currentIndex) - { - buffer.Slice(0, currentIndex + 1).CopyTo(longField.Slice(longFieldIndex)); - } - else - { - var newArray = ArrayPool.Shared.Rent(longFieldIndex + currentIndex); - longField.CopyTo(newArray); - buffer.Slice(0, currentIndex).ToArray().CopyTo(newArray, longFieldIndex); - longField = newArray; - } - return ReadField(longField, 0, longFieldIndex + currentIndex, isFieldWithTextQualifier, isFieldEndingByTextQualifier); - } - - protected internal string? ReadField(ReadOnlySpan buffer, int indexFieldStart, int currentIndex, bool isFieldWithTextQualifier, bool isFieldEndingByTextQualifier) - { - if (isFieldWithTextQualifier != isFieldEndingByTextQualifier) - if (isFieldWithTextQualifier) - throw new InvalidDataException($"the token {buffer.Slice(indexFieldStart, currentIndex - indexFieldStart)} is starting by a text-qualifier but not ending by a text-qualifier."); - else - throw new InvalidDataException($"the token {buffer.Slice(indexFieldStart, currentIndex - indexFieldStart)} is ending by a text-qualifier but not starting by a text-qualifier."); - - var field = isFieldWithTextQualifier - ? buffer.Slice(indexFieldStart + 1, currentIndex - indexFieldStart - 2) - : buffer.Slice(indexFieldStart, currentIndex - indexFieldStart); - - if (field.Length == 0) - return Profile.EmptyCell; - else if (field.ToString() == "(null)" && !isFieldWithTextQualifier) - return null; - else if (field.Contains(Profile.Descriptor.EscapeChar)) - { - var candidate = field.ToString(); - CheckTextQualifierEscapation(candidate, Profile.Descriptor.QuoteChar, Profile.Descriptor.EscapeChar); - return candidate.Replace(new string(new[] { Profile.Descriptor.EscapeChar, Profile.Descriptor.QuoteChar }), Profile.Descriptor.QuoteChar.ToString()); - } - else - return field.ToString(); - } } } diff --git a/PocketCsvReader/EncodingDetector.cs b/PocketCsvReader/EncodingDetector.cs new file mode 100644 index 0000000..99159bb --- /dev/null +++ b/PocketCsvReader/EncodingDetector.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace PocketCsvReader; +public record EncodingInfo(Encoding Encoding, int BomBytesCount) +{ } + +public interface IEncodingDetector +{ + EncodingInfo GetStreamEncoding(Stream stream); + EncodingInfo GetFileEncoding(string filename); +} + +public class EncodingDetector : IEncodingDetector +{ + /// + /// Detects the byte order mark of a streams and returns + /// an appropriate encoding for the file. + /// + /// The stream to analyze for the encoding + /// + public virtual EncodingInfo GetStreamEncoding(Stream stream) + { + // Default = Ansi CodePage + var encoding = Encoding.Default; + + // Detect byte order mark if any - otherwise assume default + var buffer = new byte[5]; + var n = stream.Read(buffer, 0, 5); + + if (n < 2) + return new(Encoding.ASCII, 0); + + var encodingBytesCount = 0; + + if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) + (encoding, encodingBytesCount) = (Encoding.UTF8, 3); + else if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0) + (encoding, encodingBytesCount) = (Encoding.UTF32, 4); + else if (buffer[0] == 0xff && buffer[1] == 0xfe) + (encoding, encodingBytesCount) = (Encoding.Unicode, 2); + else if (buffer[0] == 0xfe && buffer[1] == 0xff) + (encoding, encodingBytesCount) = (Encoding.BigEndianUnicode, 2); + else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff) + (encoding, encodingBytesCount) = (new UTF32Encoding(true, true), 4); + //else if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76) + // encoding = Encoding.UTF7; + + encoding = encoding.Equals(Encoding.Default) ? Encoding.UTF8 : encoding; + return new(encoding, encodingBytesCount); + } + + /// + /// Detects the byte order mark of a file and returns + /// an appropriate encoding for the file. + /// + /// + /// + public virtual EncodingInfo GetFileEncoding(string filename) + { + using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, 8, false)) + return GetStreamEncoding(stream); + } +} diff --git a/PocketCsvReader/FieldParser.cs b/PocketCsvReader/FieldParser.cs new file mode 100644 index 0000000..cf77a27 --- /dev/null +++ b/PocketCsvReader/FieldParser.cs @@ -0,0 +1,98 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Data; +using System.Linq; +using System.Text; + +namespace PocketCsvReader; +public class FieldParser +{ + protected internal CsvProfile Profile { get; private set; } + public FieldParser(CsvProfile profile) + => Profile = profile; + + public string? ReadField(Span longField, int longFieldIndex, ReadOnlySpan buffer, int currentIndex, bool isFieldWithTextQualifier, bool isFieldEndingByTextQualifier) + { + if (longField.Length >= longFieldIndex + currentIndex) + { + buffer.Slice(0, currentIndex + 1).CopyTo(longField.Slice(longFieldIndex)); + } + else + { + var newArray = ArrayPool.Shared.Rent(longFieldIndex + currentIndex); + longField.CopyTo(newArray); + buffer.Slice(0, currentIndex).ToArray().CopyTo(newArray, longFieldIndex); + longField = newArray; + } + return ReadField(longField, 0, longFieldIndex + currentIndex, isFieldWithTextQualifier, isFieldEndingByTextQualifier); + } + + public string? ReadField(ReadOnlySpan buffer, int indexFieldStart, int currentIndex, bool isFieldWithTextQualifier, bool isFieldEndingByTextQualifier) + { + if (isFieldWithTextQualifier != isFieldEndingByTextQualifier) + if (isFieldWithTextQualifier) + throw new InvalidDataException($"the token {buffer.Slice(indexFieldStart, currentIndex - indexFieldStart)} is starting by a text-qualifier but not ending by a text-qualifier."); + else + throw new InvalidDataException($"the token {buffer.Slice(indexFieldStart, currentIndex - indexFieldStart)} is ending by a text-qualifier but not starting by a text-qualifier."); + + var field = isFieldWithTextQualifier + ? buffer.Slice(indexFieldStart + 1, currentIndex - indexFieldStart - 2) + : buffer.Slice(indexFieldStart, currentIndex - indexFieldStart); + + if (field.Length == 0) + return Profile.EmptyCell; + else if (field.ToString() == "(null)" && !isFieldWithTextQualifier) + return null; + else if (field.Contains(Profile.Descriptor.EscapeChar)) + { + var candidate = field.ToString(); + CheckTextQualifierEscapation(candidate, Profile.Descriptor.QuoteChar, Profile.Descriptor.EscapeChar); + return candidate.Replace(new string(new[] { Profile.Descriptor.EscapeChar, Profile.Descriptor.QuoteChar }), Profile.Descriptor.QuoteChar.ToString()); + } + else + return field.ToString(); + } + + + private static void CheckTextQualifierEscapation(string value, char textQualifier, char escapeTextQualifier) + { + if (string.IsNullOrEmpty(value)) + return; + + if (!value.Contains(textQualifier)) + return; + + var indexes = new List(); + int j = -1; + do + { + j = value.IndexOf(textQualifier, j + 1); + if (j != -1) + indexes.Add(j); + } while (j != -1 && j < value.Length - 1); + + if (textQualifier == escapeTextQualifier) + { + if (indexes.Count() == 1) + throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {indexes[0]}"); + + var i = 1; + while (i < indexes.Count()) + { + if ((i + 1) % 2 == 0) + { + if (indexes[i - 1] != indexes[i] - 1) + throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {i}"); + } + else if (i == indexes.Count - 1 || indexes[i + 1] != indexes[i] + 1) + throw new InvalidDataException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {i}"); + i += 1; + } + } + else + foreach (var index in indexes) + if (index == 0 || value[index - 1] != escapeTextQualifier) + throw new ArgumentException($"the token {value} contains a text-qualifier not preceded by a an escape-text-qualifier at the position {index}"); + } +} diff --git a/PocketCsvReader/FileEncoding.cs b/PocketCsvReader/FileEncoding.cs deleted file mode 100644 index 9527e5c..0000000 --- a/PocketCsvReader/FileEncoding.cs +++ /dev/null @@ -1,10 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace PocketCsvReader; -public record FileEncoding(Encoding Encoding, int BomBytesCount) -{ } diff --git a/PocketCsvReader/RecordParser.cs b/PocketCsvReader/RecordParser.cs new file mode 100644 index 0000000..06bf7e8 --- /dev/null +++ b/PocketCsvReader/RecordParser.cs @@ -0,0 +1,284 @@ +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Data; +using System.Linq; +using System.Text; + +namespace PocketCsvReader; +public class RecordParser +{ + protected internal CsvProfile Profile { get; private set; } + protected FieldParser FieldParser { get; private set; } + + public RecordParser(CsvProfile profile) + => (Profile, FieldParser) = (profile, new(profile)); + + public virtual (string?[] fields, bool eof) ReadNextRecord(Span buffer) + { + Span extra = buffer; + return ReadNextRecord(null, buffer, ref extra); + } + + public virtual (string?[] fields, bool eof) ReadNextRecord(StreamReader? reader, Span buffer, ref Span extra) + { + var bufferSize = 0; + var index = 0; + var eof = false; + var isFirstCharOfRecord = true; + var indexRecordSeparator = 0; + var isFirstCharOfField = true; + var fields = new List(); + var indexFieldStart = 0; + var isCommentLine = false; + var isFieldWithTextQualifier = false; + var isEndingByTextQualifier = false; + var isTextQualifierEscaped = false; + Span longField = stackalloc char[0]; + var longFieldIndex = 0; + var isLastCharDelimiter = false; + + if (extra.Length > 0) + { + extra.CopyTo(buffer); + bufferSize = extra.Length; + } + else + { + bufferSize = reader?.ReadBlock(buffer) ?? throw new ArgumentNullException(nameof(reader)); + eof = bufferSize == 0; + } + + while (!eof && index < bufferSize) + { + char c = buffer[index]; + if (c == '\0') + { + eof = true; + break; + } + + if (isFirstCharOfRecord) + { + isCommentLine = c == Profile.Descriptor.CommentChar; + isFirstCharOfRecord = false; + } + + if (isFirstCharOfField) + { + isFieldWithTextQualifier = c == Profile.Descriptor.QuoteChar; + isFirstCharOfField = false; + isEndingByTextQualifier = false; + isTextQualifierEscaped = false; + } + else if (c != Profile.Descriptor.Delimiter && c != Profile.Descriptor.LineTerminator[indexRecordSeparator] && !isFirstCharOfField) + { + isEndingByTextQualifier = c == Profile.Descriptor.QuoteChar && !isTextQualifierEscaped; + isTextQualifierEscaped = c == Profile.Descriptor.EscapeChar && !isTextQualifierEscaped; + } + + if (c == Profile.Descriptor.Delimiter && !isCommentLine && (isFieldWithTextQualifier == isEndingByTextQualifier)) + { + if (longFieldIndex == 0) + fields.Add(FieldParser.ReadField(buffer, indexFieldStart, index, isFieldWithTextQualifier, isEndingByTextQualifier)); + else + { + fields.Add(FieldParser.ReadField(longField, longFieldIndex, buffer, index, isFieldWithTextQualifier, isEndingByTextQualifier)); + longField = ArrayPool.Shared.Rent(0); + longFieldIndex = 0; + } + isFirstCharOfField = true; + indexFieldStart = index + 1; + } + + if (c == Profile.Descriptor.LineTerminator[indexRecordSeparator]) + { + indexRecordSeparator++; + if (indexRecordSeparator == Profile.Descriptor.LineTerminator.Length) + { + if (!isCommentLine) + { + if (indexFieldStart <= index + longFieldIndex - Profile.Descriptor.LineTerminator.Length) + { + if (longFieldIndex == 0) + fields.Add(FieldParser.ReadField(buffer, indexFieldStart, index - Profile.Descriptor.LineTerminator.Length + 1, isFieldWithTextQualifier, isEndingByTextQualifier)); + else + { + fields.Add(FieldParser.ReadField(longField, longFieldIndex, buffer, index - Profile.Descriptor.LineTerminator.Length + 1, isFieldWithTextQualifier, isEndingByTextQualifier)); + longField = ArrayPool.Shared.Rent(0); + longFieldIndex = 0; + } + } + + extra = ArrayPool.Shared.Rent(bufferSize - index - 1); + extra = extra.Slice(0, bufferSize - index - 1); + buffer.Slice(index + 1, bufferSize - index - 1).CopyTo(extra); + buffer.Clear(); + return (fields.ToArray(), false); + } + else + { + bufferSize = bufferSize - index; + buffer = buffer.Slice(index + 1); + isCommentLine = false; + index = -1; + indexFieldStart = 0; + } + isFirstCharOfRecord = true; + isFirstCharOfField = true; + indexRecordSeparator = 0; + isFieldWithTextQualifier = false; + isEndingByTextQualifier = false; + } + } + else + indexRecordSeparator = 0; + + + + if (++index == bufferSize) + { + if (longField.Length >= longFieldIndex + index - indexFieldStart) + { + buffer.Slice(indexFieldStart, index - indexFieldStart).CopyTo(longField.Slice(longFieldIndex)); + } + else + { + var newArray = ArrayPool.Shared.Rent(longFieldIndex + index - indexFieldStart); + longField.CopyTo(newArray); + buffer.Slice(indexFieldStart, index - indexFieldStart).ToArray().CopyTo(newArray, longFieldIndex); + longField = newArray; + } + + longFieldIndex += index - indexFieldStart; + indexFieldStart = 0; + bufferSize = reader?.ReadBlock(buffer) ?? throw new ArgumentNullException(nameof(reader)); + eof = bufferSize == 0; + index = 0; + if (eof) + isLastCharDelimiter = true; + } + } + + if (eof && (index != indexFieldStart || longFieldIndex > 0 || isLastCharDelimiter) && !isCommentLine) + if (longFieldIndex == 0) + if (isLastCharDelimiter) + fields.Add(Profile.EmptyCell); + else + fields.Add(FieldParser.ReadField(buffer, indexFieldStart, index, isFieldWithTextQualifier, isEndingByTextQualifier)); + else + fields.Add(FieldParser.ReadField(longField, longFieldIndex, buffer, index, isFieldWithTextQualifier, isEndingByTextQualifier)); + + return (fields.ToArray(), eof); + } + + + public int? CountRecords(StreamReader reader) + { + if (Profile.PerformanceOptmized) + return null; + + //RaiseProgressStatus("Counting records ..."); + var count = CountRecordSeparators(reader); + count -= Convert.ToInt16(Profile.Descriptor.Header); + //RaiseProgressStatus($"{count} record{(count > 1 ? "s were" : " was")} identified."); + + reader.BaseStream.Position = 0; + reader.DiscardBufferedData(); + return count; + } + + private int BufferSize = 4096; + public virtual int CountRecordSeparators(StreamReader reader) + { + int i = 0; + int n = 0; + int j = 0; + bool separatorAtEnd = false; + bool isCommentLine = false; + bool isFirstCharOfLine = true; + + do + { + char[] buffer = new char[BufferSize]; + n = reader.Read(buffer, 0, BufferSize); + if (n > 0 && i == 0) + i = 1; + + + foreach (var c in buffer) + { + if (c != '\0') + { + if (c == Profile.Descriptor.CommentChar && isFirstCharOfLine) + isCommentLine = true; + isFirstCharOfLine = false; + + separatorAtEnd = false; + if (c == Profile.Descriptor.LineTerminator[j]) + { + j++; + if (j == Profile.Descriptor.LineTerminator.Length) + { + if (!isCommentLine) + i++; + j = 0; + separatorAtEnd = true; + isCommentLine = false; + isFirstCharOfLine = true; + } + } + else + j = 0; + } + } + } while (n > 0); + + if (separatorAtEnd) + i -= 1; + + if (isCommentLine) + i -= 1; + + return i; + } + + public string GetFirstRecord(StreamReader reader, string recordSeparator, int bufferSize) + { + var stringBuilder = new StringBuilder(); + int j = 0; + + while (true) + { + char[] buffer = new char[bufferSize]; + reader.Read(buffer, 0, bufferSize); + + foreach (var c in buffer) + { + if (c != '\0') + { + stringBuilder.Append(c); + if (c == recordSeparator[j]) + { + j++; + if (j == recordSeparator.Length) + return stringBuilder.ToString(); + } + else + j = 0; + } + else + return stringBuilder.ToString(); + } + } + } + + public virtual string[] ReadHeader(StreamReader? reader, Span buffer, ref Span extra) + { + var unnamedFieldIndex = 0; + return ReadNextRecord(reader, buffer, ref extra).fields + .Select(value => value is null || !Profile.Descriptor.Header + ? $"field_{unnamedFieldIndex++}" + : value).ToArray(); + } +} diff --git a/PocketCsvReader/StreamExtensions.cs b/PocketCsvReader/StreamExtensions.cs new file mode 100644 index 0000000..39a54bd --- /dev/null +++ b/PocketCsvReader/StreamExtensions.cs @@ -0,0 +1,21 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace PocketCsvReader; +public static class StreamExtensions +{ + public static void Rewind(this StreamReader reader) + { + reader.BaseStream.Position = 0; + reader.DiscardBufferedData(); + } + + public static void Rewind(this StreamReader reader, int count) + { + reader.BaseStream.Position = count; + reader.DiscardBufferedData(); + } +}