From ffc6b283589e8db882c0b262eb305e473e4c4ab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20L=2E=20Charlier?=
Date: Sat, 9 Nov 2024 10:04:22 +0100
Subject: [PATCH] feat: read CSV as an iterator of an array of strings (#16)

* feat: read CSV as an iterator of an array of strings

---------

Co-authored-by: codefactor-io
---
 PocketCsvReader.Testing/CsvArrayStringTest.cs | 109 +++++++++++++
 PocketCsvReader/CsvArrayString.cs             | 178 ++++++++++++++++++++++
 PocketCsvReader/CsvReader.cs                  |  41 ++++-
 3 files changed, 326 insertions(+), 2 deletions(-)
 create mode 100644 PocketCsvReader.Testing/CsvArrayStringTest.cs
 create mode 100644 PocketCsvReader/CsvArrayString.cs

diff --git a/PocketCsvReader.Testing/CsvArrayStringTest.cs b/PocketCsvReader.Testing/CsvArrayStringTest.cs
new file mode 100644
index 0000000..3c06e9d
--- /dev/null
+++ b/PocketCsvReader.Testing/CsvArrayStringTest.cs
@@ -0,0 +1,109 @@
+using PocketCsvReader;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Data;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.Reflection;
+
+namespace PocketCsvReader.Testing
+{
+    /// <summary>
+    /// Exercises <see cref="CsvReader.ToArrayString(Stream)"/> against the embedded CSV
+    /// resources (Ansi, Utf16-BE, Utf16-LE, Utf8-BOM and Utf8).
+    /// </summary>
+    [TestFixture]
+    public class CsvArrayStringTest
+    {
+        private static MemoryStream CreateStream(string content)
+        {
+            var byteArray = Encoding.UTF8.GetBytes(content);
+            var stream = new MemoryStream(byteArray);
+            stream.Position = 0;
+            return stream;
+        }
+
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectRowsColumns(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream =
+                Assembly.GetExecutingAssembly()
+                    .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv")
+                    ?? throw new FileNotFoundException()
+            )
+            {
+                // Enumerating via Count() drives the lazy reader to the end of the stream.
+                var rows = reader.ToArrayString(stream);
+                Assert.That(rows.Count(), Is.EqualTo(21));
+            }
+        }
+
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectColumnByIndexer(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream =
+                Assembly.GetExecutingAssembly()
+                    .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv")
+                    ?? throw new FileNotFoundException()
+            )
+            {
+                foreach (var row in reader.ToArrayString(stream))
+                {
+                    Assert.Multiple(() =>
+                    {
+                        Assert.That(row[0], Is.EqualTo("2018"));
+                        Assert.That(row[1], Is.EqualTo("7"));
+                        Assert.That(row[2], Is.EqualTo("1"));
+                        Assert.That(row[13], Does.StartWith("2018-"));
+                    });
+                }
+            }
+        }
+
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectColumnWithGetStringIndex(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream =
+                Assembly.GetExecutingAssembly()
+                    .GetManifestResourceStream($"{Assembly.GetExecutingAssembly().GetName().Name}.Resources.{filename}.csv")
+                    ?? throw new FileNotFoundException()
+            )
+            {
+                foreach (var row in reader.ToArrayString(stream))
+                {
+                    Assert.Multiple(() =>
+                    {
+                        Assert.That(row[0], Is.EqualTo("2018"));
+                        Assert.That(row[1], Is.EqualTo("7"));
+                        Assert.That(row[2], Is.EqualTo("1"));
+                        Assert.That(row[13], Does.StartWith("2018-"));
+                    });
+                }
+            }
+        }
+    }
+}
diff --git a/PocketCsvReader/CsvArrayString.cs b/PocketCsvReader/CsvArrayString.cs
new file mode 100644
index 0000000..4981d45
--- /dev/null
+++ b/PocketCsvReader/CsvArrayString.cs
@@ -0,0 +1,178 @@
+using System;
+using System.Collections.Generic;
+using System.Data;
+using System.Diagnostics.CodeAnalysis;
+using System.IO;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace PocketCsvReader;
+
+/// <summary>
+/// Reads CSV records from a stream and exposes each record as an array of field values.
+/// </summary>
+public class CsvArrayString : IDisposable
+{
+    protected RecordParser RecordParser { get; }
+    protected Stream Stream { get; }
+    protected StreamReader? StreamReader { get; private set; }
+
+    protected EncodingInfo? EncodingInfo { get; private set; }
+
+    protected bool IsEof { get; private set; } = false;
+
+    /// <summary>Number of records read after the record that established <see cref="Fields"/>.</summary>
+    public int RowCount { get; private set; } = 0;
+    protected int BufferSize { get; private set; } = 4 * 1024;
+
+    /// <summary>
+    /// Field names taken from the header record, or generated as <c>field_N</c> when a header
+    /// value is null or the profile declares no header. Null until the first record is read.
+    /// </summary>
+    public string[]? Fields { get; private set; } = null;
+
+    // State handed to RecordParser.ReadNextRecord by reference and kept between calls
+    // (presumably characters read but not yet consumed by the previous record).
+    private Memory<char> Extra = Memory<char>.Empty;
+
+    /// <summary>Creates a reader whose encoding is detected from <paramref name="stream"/> on first use.</summary>
+    public CsvArrayString(RecordParser recordParser, Stream stream)
+    {
+        RecordParser = recordParser;
+        Stream = stream;
+    }
+
+    /// <summary>Creates a reader that uses the supplied encoding and BOM length instead of detecting them.</summary>
+    public CsvArrayString(RecordParser recordParser, Stream stream, Encoding encoding, int bomByteCount)
+    {
+        RecordParser = recordParser;
+        Stream = stream;
+        EncodingInfo = new EncodingInfo(encoding, bomByteCount);
+    }
+
+    /// <summary>
+    /// Resolves the encoding if it is not known yet, creates the <see cref="StreamReader"/>,
+    /// positions the stream after the BOM (if any) and resets <see cref="IsEof"/> and <see cref="RowCount"/>.
+    /// </summary>
+    public void Initialize()
+    {
+        EncodingInfo ??= new EncodingDetector().GetStreamEncoding(Stream);
+        StreamReader = new StreamReader(Stream, EncodingInfo!.Encoding, false);
+        var bufferBOM = new char[1];
+        StreamReader.Read(bufferBOM, 0, bufferBOM.Length);
+        StreamReader.Rewind();
+
+        if (EncodingInfo!.BomBytesCount > 0)
+            StreamReader.BaseStream.Position = EncodingInfo!.BomBytesCount;
+
+        IsEof = false;
+        RowCount = 0;
+    }
+
+    /// <summary>
+    /// Lazily yields the records of the stream, one array of field values per record.
+    /// </summary>
+    /// <returns>A forward-only sequence; enumeration stops at the end of the stream.</returns>
+    public IEnumerable<string?[]> Read()
+    {
+        // Key the lazy initialization on the reader, not on EncodingInfo: the constructor that
+        // receives an explicit encoding sets EncodingInfo, which would otherwise skip Initialize()
+        // and leave StreamReader null.
+        if (StreamReader is null)
+            Initialize();
+
+        while (!IsEof)
+        {
+            string?[]? values = ReadNextRecord();
+            if (values is null)
+                yield break;
+
+            yield return values;
+        }
+    }
+
+    /// <summary>
+    /// Parses the next record, using the first record to establish <see cref="Fields"/>, and
+    /// reconciles the width of every later record with that field list.
+    /// </summary>
+    /// <returns>The values of the record, or null once the end of the stream is reached.</returns>
+    /// <exception cref="InvalidDataException">The record has more values than <see cref="Fields"/>.</exception>
+    private string?[]? ReadNextRecord()
+    {
+        if (IsEof)
+            return null;
+
+        Span<char> buffer = stackalloc char[BufferSize];
+        Span<char> extra = stackalloc char[Extra.Length];
+        Extra.Span.CopyTo(extra);
+
+        string?[]? values;
+        (values, IsEof) = RecordParser.ReadNextRecord(StreamReader, buffer, ref extra);
+
+        if (IsEof && values!.Length == 0)
+        {
+            Extra = Memory<char>.Empty;
+            return null;
+        }
+
+        if (Extra.Length != extra.Length)
+            Extra = new char[extra.Length];
+        extra.CopyTo(Extra.Span);
+
+        if (RowCount == 0 && Fields is null)
+        {
+            int unnamedFieldIndex = 0;
+            if (RecordParser.Profile.Descriptor.Header)
+            {
+                Fields = values!.Select(value => value ?? $"field_{unnamedFieldIndex++}").ToArray();
+                return ReadNextRecord(); // Skip header and read next record
+            }
+            else
+            {
+                Fields = values!.Select(_ => $"field_{unnamedFieldIndex++}").ToArray();
+            }
+        }
+        else
+        {
+            RowCount++;
+
+            // Handle case with unexpected fields
+            if ((Fields?.Length ?? int.MaxValue) < values!.Length)
+                throw new InvalidDataException
+                (
+                    string.Format
+                    (
+                        "The record {0} contains {1} more field{2} than expected.",
+                        RowCount + Convert.ToInt32(RecordParser.Profile.Descriptor.Header),
+                        values.Length - Fields!.Length,
+                        values.Length - Fields.Length > 1 ? "s" : string.Empty
+                    )
+                );
+
+            // Fill the missing cells
+            if ((Fields?.Length ?? 0) > values.Length)
+            {
+                var list = new List<string?>(values);
+                while (Fields!.Length > list.Count)
+                    list.Add(RecordParser.Profile.MissingCell);
+                values = list.ToArray();
+            }
+        }
+
+        return values;
+    }
+
+    /// <summary>
+    /// Disposes the reader and the underlying stream.
+    /// </summary>
+    /// <remarks>
+    /// No finalizer is declared: the class only holds managed objects, and a finalizer must not
+    /// touch managed objects that may already have been finalized.
+    /// </remarks>
+    public void Dispose()
+    {
+        StreamReader?.Dispose();
+        Stream?.Dispose();
+        GC.SuppressFinalize(this);
+    }
+}
diff --git a/PocketCsvReader/CsvReader.cs b/PocketCsvReader/CsvReader.cs
index 82480c1..fd2e957 100644
--- a/PocketCsvReader/CsvReader.cs
+++ b/PocketCsvReader/CsvReader.cs
@@ -1,4 +1,4 @@
-using System.Buffers;
+using System.Buffers;
 using System.Data;
 using System.IO;
 using System.Text;
@@ -76,7 +76,6 @@ public DataTable ToDataTable(string filename, bool isFirstRowHeader)
         return ToDataTable(stream);
     }
 
-
     /// <summary>
     /// Reads the specified CSV file and returns an <see cref="IDataReader"/> for iterating over its records and fields.
     /// </summary>
@@ -105,6 +104,44 @@ public IDataReader ToDataReader(string filename)
     public IDataReader ToDataReader(Stream stream)
         => new CsvDataReader(RecordParser, stream);
 
+
+    /// <summary>
+    /// Reads the specified CSV file and returns a lazy sequence of records, each one an array of field values.
+    /// </summary>
+    /// <param name="filename">The name or full path of the CSV file to read.</param>
+    /// <returns>An <see cref="IEnumerable{T}"/> of string arrays, one per record of the CSV file.</returns>
+    /// <remarks>
+    /// The existence of the file is checked immediately. The file itself is opened when enumeration starts
+    /// and is closed when enumeration completes or the enumerator is disposed.
+    /// </remarks>
+    public IEnumerable<string?[]> ToArrayString(string filename)
+    {
+        CheckFileExists(filename);
+        return ReadRecords();
+
+        IEnumerable<string?[]> ReadRecords()
+        {
+            // CsvArrayString.Dispose also closes the stream; disposing it twice is harmless.
+            using var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize);
+            using var records = new CsvArrayString(new RecordParser(Profile), stream);
+            foreach (var record in records.Read())
+                yield return record;
+        }
+    }
+
+    /// <summary>
+    /// Reads the CSV data from the provided stream and returns a lazy sequence of records, each one an array of field values.
+    /// </summary>
+    /// <param name="stream">The <see cref="Stream"/> containing CSV data, positioned at the beginning of the content.</param>
+    /// <returns>An <see cref="IEnumerable{T}"/> of string arrays that allows sequential access to each record of the CSV data.</returns>
+    /// <remarks>
+    /// Records are parsed as the sequence is enumerated, so the stream must stay open until enumeration ends.
+    /// The stream is not disposed by this method; the caller remains responsible for it.
+    /// </remarks>
+    public IEnumerable<string?[]> ToArrayString(Stream stream)
+        => new CsvArrayString(RecordParser, stream).Read();
+
+
     protected virtual void CheckFileExists(string filename)
     {
         if (!File.Exists(filename))