From ffc6b283589e8db882c0b262eb305e473e4c4ab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20L=2E=20Charlier?= <seddryck@gmail.com>
Date: Sat, 9 Nov 2024 10:04:22 +0100
Subject: [PATCH] feat: read CSV as an iterator of an array of strings (#16)

* feat: read CSV as an iterator of an array of strings

---------

Co-authored-by: codefactor-io <support@codefactor.io>
---
 PocketCsvReader.Testing/CsvArrayStringTest.cs | 105 +++++++++++++
 PocketCsvReader/CsvArrayString.cs             | 145 ++++++++++++++++++
 PocketCsvReader/CsvReader.cs                  |  33 +++-
 3 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 PocketCsvReader.Testing/CsvArrayStringTest.cs
 create mode 100644 PocketCsvReader/CsvArrayString.cs

diff --git a/PocketCsvReader.Testing/CsvArrayStringTest.cs b/PocketCsvReader.Testing/CsvArrayStringTest.cs
new file mode 100644
index 0000000..3c06e9d
--- /dev/null
+++ b/PocketCsvReader.Testing/CsvArrayStringTest.cs
@@ -0,0 +1,105 @@
+﻿using PocketCsvReader;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Data;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.Reflection;
+
+namespace PocketCsvReader.Testing
+{
+    /// <summary>
+    /// Tests for <c>CsvReader.ToArrayString</c>, run against embedded CSV resources named after text encodings.
+    /// </summary>
+    [TestFixture]
+    public class CsvArrayStringTest
+    {
+        /// <summary>
+        /// Opens the resource <c>Resources.{filename}.csv</c> embedded in the executing assembly.
+        /// </summary>
+        /// <exception cref="FileNotFoundException">No resource with that name is embedded.</exception>
+        private static Stream OpenResource(string filename)
+        {
+            var assembly = Assembly.GetExecutingAssembly();
+            return assembly.GetManifestResourceStream($"{assembly.GetName().Name}.Resources.{filename}.csv")
+                ?? throw new FileNotFoundException();
+        }
+
+        /// <summary>Checks that the resource yields 21 records whatever its encoding.</summary>
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectRowsColumns(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream = OpenResource(filename))
+            {
+                // The sequence is lazy: Count() has to enumerate it while the stream is still open.
+                var rows = reader.ToArrayString(stream);
+                Assert.That(rows.Count(), Is.EqualTo(21));
+            }
+        }
+
+        /// <summary>Checks selected columns of every record through the array indexer.</summary>
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectColumnByIndexer(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream = OpenResource(filename))
+            {
+                foreach (var row in reader.ToArrayString(stream))
+                {
+                    Assert.Multiple(() =>
+                    {
+                        Assert.That(row[0], Is.EqualTo("2018"));
+                        Assert.That(row[1], Is.EqualTo("7"));
+                        Assert.That(row[2], Is.EqualTo("1"));
+                        Assert.That(row[13], Does.StartWith("2018-"));
+                    });
+                }
+            }
+        }
+
+        /// <summary>
+        /// Runs the same column checks as <see cref="ToDataReader_Financial_CorrectColumnByIndexer"/>;
+        /// records are plain string arrays, so columns are read through the indexer here as well.
+        /// </summary>
+        [Test]
+        [TestCase("Ansi")]
+        [TestCase("Utf16-BE")]
+        [TestCase("Utf16-LE")]
+        [TestCase("Utf8-BOM")]
+        [TestCase("Utf8")]
+        public void ToDataReader_Financial_CorrectColumnWithGetStringIndex(string filename)
+        {
+            var reader = new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));
+
+            using (var stream = OpenResource(filename))
+            {
+                foreach (var row in reader.ToArrayString(stream))
+                {
+                    Assert.Multiple(() =>
+                    {
+                        Assert.That(row[0], Is.EqualTo("2018"));
+                        Assert.That(row[1], Is.EqualTo("7"));
+                        Assert.That(row[2], Is.EqualTo("1"));
+                        Assert.That(row[13], Does.StartWith("2018-"));
+                    });
+                }
+            }
+        }
+    }
+}
diff --git a/PocketCsvReader/CsvArrayString.cs b/PocketCsvReader/CsvArrayString.cs
new file mode 100644
index 0000000..4981d45
--- /dev/null
+++ b/PocketCsvReader/CsvArrayString.cs
@@ -0,0 +1,145 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Data;
+using System.Diagnostics.CodeAnalysis;
+using System.IO;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace PocketCsvReader;
+/// <summary>Reads CSV records from a stream and exposes each of them as an array of strings.</summary>
+public class CsvArrayString : IDisposable
+{
+    protected RecordParser RecordParser { get; }
+    protected Stream Stream { get; }
+    protected StreamReader? StreamReader { get; private set; }
+    protected EncodingInfo? EncodingInfo { get; private set; }
+    protected bool IsEof { get; private set; } = false;
+    protected int BufferSize { get; private set; } = 4 * 1024;
+
+    /// <summary>Number of data records read so far; a header row is not counted.</summary>
+    public int RowCount { get; private set; } = 0;
+
+    /// <summary>Field names, read from the header row or generated as <c>field_N</c>; <c>null</c> before the first read.</summary>
+    public string[]? Fields { get; private set; } = null;
+
+    // Characters handed back by the record parser through its ref argument; passed in again on the next call.
+    Memory<char> Extra = Memory<char>.Empty;
+
+    /// <summary>Creates a reader whose encoding is detected from the stream on first use.</summary>
+    public CsvArrayString(RecordParser recordParser, Stream stream)
+    {
+        RecordParser = recordParser ?? throw new ArgumentNullException(nameof(recordParser));
+        Stream = stream ?? throw new ArgumentNullException(nameof(stream));
+    }
+
+    /// <summary>Creates a reader that uses the supplied encoding and BOM length instead of detecting them.</summary>
+    public CsvArrayString(RecordParser recordParser, Stream stream, Encoding encoding, int bomByteCount)
+        : this(recordParser, stream)
+    {
+        EncodingInfo = new EncodingInfo(encoding, bomByteCount);
+    }
+
+    /// <summary>Creates the stream reader, detecting the encoding when none was supplied, and resets the read state.</summary>
+    public void Initialize()
+    {
+        EncodingInfo ??= new EncodingDetector().GetStreamEncoding(Stream);
+        StreamReader = new StreamReader(Stream, EncodingInfo!.Encoding, false);
+        var bufferBOM = new char[1];
+        StreamReader.Read(bufferBOM, 0, bufferBOM.Length);
+        StreamReader.Rewind();
+
+        // Start just after the byte-order mark when the encoding information reports one.
+        if (EncodingInfo!.BomBytesCount > 0)
+            StreamReader.BaseStream.Position = EncodingInfo!.BomBytesCount;
+
+        IsEof = false;
+        RowCount = 0;
+    }
+
+    /// <summary>Lazily yields the records of the stream, one array of field values per record.</summary>
+    /// <remarks>When the profile declares a header, that row fills <see cref="Fields"/> and is not yielded.</remarks>
+    public IEnumerable<string?[]> Read()
+    {
+        // Test the reader, not EncodingInfo: the explicit-encoding constructor sets the latter without creating a reader.
+        if (StreamReader is null)
+            Initialize();
+
+        while (!IsEof)
+        {
+            string?[]? values = ReadNextRecord();
+            if (values is null)
+                yield break;
+
+            yield return values;
+        }
+    }
+
+    /// <summary>Parses the next record; returns <c>null</c> once the end of the stream is reached.</summary>
+    /// <exception cref="InvalidDataException">The record holds more fields than <see cref="Fields"/>.</exception>
+    private string?[]? ReadNextRecord()
+    {
+        if (IsEof)
+            return null;
+
+        Span<char> buffer = stackalloc char[BufferSize];
+        Span<char> extra = stackalloc char[Extra.Length];
+        Extra.Span.CopyTo(extra);
+
+        string?[]? values;
+        (values, IsEof) = RecordParser.ReadNextRecord(StreamReader, buffer, ref extra);
+
+        if (IsEof && values!.Length == 0)
+        {
+            Extra = Memory<char>.Empty;
+            return null;
+        }
+
+        if (Extra.Length != extra.Length)
+            Extra = new char[extra.Length];
+        extra.CopyTo(Extra.Span);
+
+        if (Fields is null)
+        {
+            int unnamedFieldIndex = 0;
+            if (RecordParser.Profile.Descriptor.Header)
+            {
+                Fields = values.Select(value => value ?? $"field_{unnamedFieldIndex++}").ToArray();
+                return ReadNextRecord(); // Skip header and read next record
+            }
+
+            // No header: this first record is data, so it falls through to be counted and returned.
+            Fields = values.Select(_ => $"field_{unnamedFieldIndex++}").ToArray();
+        }
+
+        RowCount++;
+
+        // Handle case with unexpected fields
+        if (Fields!.Length < values!.Length)
+        {
+            int surplus = values.Length - Fields.Length;
+            throw new InvalidDataException(string.Format(
+                "The record {0} contains {1} more field{2} than expected.",
+                RowCount + Convert.ToInt32(RecordParser.Profile.Descriptor.Header), surplus, surplus > 1 ? "s" : string.Empty));
+        }
+
+        // Fill the missing cells
+        if (Fields.Length > values.Length)
+        {
+            var list = new List<string?>(values);
+            while (Fields.Length > list.Count)
+                list.Add(RecordParser.Profile.MissingCell);
+            values = list.ToArray();
+        }
+
+        return values;
+    }
+
+    /// <summary>Disposes the stream reader and the underlying stream; only managed resources are held, so there is no finalizer.</summary>
+    public void Dispose()
+    {
+        StreamReader?.Dispose();
+        Stream?.Dispose();
+        GC.SuppressFinalize(this); // Spares the finalizer of a derived class once disposed.
+    }
+}
diff --git a/PocketCsvReader/CsvReader.cs b/PocketCsvReader/CsvReader.cs
index 82480c1..fd2e957 100644
--- a/PocketCsvReader/CsvReader.cs
+++ b/PocketCsvReader/CsvReader.cs
@@ -1,4 +1,4 @@
-﻿using System.Buffers;
+using System.Buffers;
 using System.Data;
 using System.IO;
 using System.Text;
@@ -76,7 +76,6 @@ public DataTable ToDataTable(string filename, bool isFirstRowHeader)
                 return ToDataTable(stream);
         }
 
-
         /// <summary>
         /// Reads the specified CSV file and returns an <see cref="IDataReader"/> for iterating over its records and fields.
         /// </summary>
@@ -105,6 +104,36 @@ public IDataReader ToDataReader(string filename)
         public IDataReader ToDataReader(Stream stream)
             => new CsvDataReader(RecordParser, stream);
 
+
+        /// <summary>Reads the specified CSV file and lazily yields each record as an array of strings.</summary>
+        /// <param name="filename">The name or full path of the CSV file to read.</param>
+        /// <returns>A lazily evaluated sequence holding one array of field values per record.</returns>
+        /// <remarks>The file is opened when enumeration starts and closed when the enumerator is disposed.</remarks>
+        public IEnumerable<string?[]> ToArrayString(string filename)
+        {
+            CheckFileExists(filename);
+            return ReadRecords();
+            IEnumerable<string?[]> ReadRecords()
+            {
+                using var csv = new CsvArrayString(new RecordParser(Profile), new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize));
+                foreach (var record in csv.Read())
+                    yield return record;
+            }
+        }
+
+        /// <summary>
+        /// Reads CSV data from the provided stream and lazily yields each record as an array of strings.
+        /// </summary>
+        /// <param name="stream">The <see cref="Stream"/> containing CSV data, positioned at the beginning of the content.</param>
+        /// <returns>A lazily evaluated sequence holding one array of field values per record.</returns>
+        /// <remarks>Records are parsed while the sequence is enumerated, so the stream must remain open until then.</remarks>
+        public IEnumerable<string?[]> ToArrayString(Stream stream)
+        {
+            var records = new CsvArrayString(RecordParser, stream);
+            return records.Read();
+        }
+
+
         protected virtual void CheckFileExists(string filename)
         {
             if (!File.Exists(filename))