Skip to content

Commit

Permalink
feat: read CSV as an iterator of an array of strings (#16)
Browse files Browse the repository at this point in the history
* feat: read CSV as an iterator of an array of strings

---------

Co-authored-by: codefactor-io <[email protected]>
  • Loading branch information
Seddryck and code-factor authored Nov 9, 2024
1 parent 41addbf commit ffc6b28
Show file tree
Hide file tree
Showing 3 changed files with 281 additions and 2 deletions.
105 changes: 105 additions & 0 deletions PocketCsvReader.Testing/CsvArrayStringTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
using PocketCsvReader;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Reflection;

namespace PocketCsvReader.Testing
{
/// <summary>
/// Tests for <c>CsvReader.ToArrayString</c>, run against the embedded financial sample files
/// stored in several encodings.
/// </summary>
[TestFixture]
public class CsvArrayStringTest
{
    /// <summary>Wraps <paramref name="content"/> in a UTF-8 encoded, in-memory stream.</summary>
    private static MemoryStream CreateStream(string content)
    {
        // A freshly created MemoryStream is already positioned at 0.
        return new MemoryStream(Encoding.UTF8.GetBytes(content));
    }

    /// <summary>Builds the reader used by every test: tab-delimited, double-quote quoted, CRLF records.</summary>
    private static CsvReader CreateReader()
        => new CsvReader(new CsvProfile('\t', '\"', "\r\n", true));

    /// <summary>
    /// Opens the embedded resource <c>{assembly name}.Resources.{filename}.csv</c>.
    /// </summary>
    /// <exception cref="FileNotFoundException">The resource is not embedded in the test assembly.</exception>
    private static Stream OpenResource(string filename)
    {
        var assembly = Assembly.GetExecutingAssembly();
        return assembly.GetManifestResourceStream($"{assembly.GetName().Name}.Resources.{filename}.csv")
            ?? throw new FileNotFoundException();
    }

    /// <summary>Checks the columns that hold the same value (or prefix) in every row of the sample files.</summary>
    private static void AssertFinancialRows(IEnumerable<string?[]> rows)
    {
        foreach (var row in rows)
        {
            Assert.Multiple(() =>
            {
                Assert.That(row[0], Is.EqualTo("2018"));
                Assert.That(row[1], Is.EqualTo("7"));
                Assert.That(row[2], Is.EqualTo("1"));
                Assert.That(row[13], Does.StartWith("2018-"));
            });
        }
    }

    [Test]
    [TestCase("Ansi")]
    [TestCase("Utf16-BE")]
    [TestCase("Utf16-LE")]
    [TestCase("Utf8-BOM")]
    [TestCase("Utf8")]
    public void ToDataReader_Financial_CorrectRowsColumns(string filename)
    {
        var reader = CreateReader();

        using var stream = OpenResource(filename);
        var rows = reader.ToArrayString(stream);

        // Enumerate explicitly rather than handing the Count method group to Assert.That.
        Assert.That(rows.Count(), Is.EqualTo(21));
    }

    [Test]
    [TestCase("Ansi")]
    [TestCase("Utf16-BE")]
    [TestCase("Utf16-LE")]
    [TestCase("Utf8-BOM")]
    [TestCase("Utf8")]
    public void ToDataReader_Financial_CorrectColumnByIndexer(string filename)
    {
        var reader = CreateReader();

        using var stream = OpenResource(filename);
        AssertFinancialRows(reader.ToArrayString(stream));
    }

    [Test]
    [TestCase("Ansi")]
    [TestCase("Utf16-BE")]
    [TestCase("Utf16-LE")]
    [TestCase("Utf8-BOM")]
    [TestCase("Utf8")]
    public void ToDataReader_Financial_CorrectColumnWithGetStringIndex(string filename)
    {
        var reader = CreateReader();

        using var stream = OpenResource(filename);

        // Only one enumerable is created over the stream: a second, never-enumerated
        // one would share (and could end up closing) the same underlying stream.
        AssertFinancialRows(reader.ToArrayString(stream));
    }
}
}
145 changes: 145 additions & 0 deletions PocketCsvReader/CsvArrayString.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace PocketCsvReader;
/// <summary>
/// Reads the records of a CSV stream and exposes each of them as an array of strings.
/// </summary>
public class CsvArrayString : IDisposable
{
    /// <summary>Parser that splits the character stream into records and fields.</summary>
    protected RecordParser RecordParser { get; }

    /// <summary>Stream holding the CSV content; disposed together with this instance.</summary>
    protected Stream Stream { get; }

    /// <summary>Text reader over <see cref="Stream"/>; created by <see cref="Initialize"/>.</summary>
    protected StreamReader? StreamReader { get; private set; }

    /// <summary>Encoding and BOM length, either supplied at construction or detected by <see cref="Initialize"/>.</summary>
    protected EncodingInfo? EncodingInfo { get; private set; }

    /// <summary>True once the parser has reported the end of the stream.</summary>
    protected bool IsEof { get; private set; } = false;

    /// <summary>Number of data records read so far; the header record is not counted.</summary>
    public int RowCount { get; private set; } = 0;

    /// <summary>Size, in characters, of the scratch buffer handed to the parser for each record.</summary>
    protected int BufferSize { get; private set; } = 4 * 1024;

    /// <summary>
    /// Field names, set when the first record is read: taken from the header record when the
    /// profile declares one (a null header cell becomes <c>field_N</c>, N counting the unnamed
    /// cells), otherwise generated as <c>field_0</c>, <c>field_1</c>, ...
    /// </summary>
    public string[]? Fields { get; private set; } = null;

    // Characters handed back by the parser through its `ref extra` argument, kept between calls.
    // NOTE(review): presumably the part of the buffer read past the end of the record - confirm in RecordParser.
    private Memory<char> Extra = Memory<char>.Empty;

    /// <summary>Creates a reader whose encoding is detected from the stream on first use.</summary>
    /// <param name="recordParser">Parser used to split the content into records.</param>
    /// <param name="stream">Stream containing the CSV content.</param>
    public CsvArrayString(RecordParser recordParser, Stream stream)
    {
        RecordParser = recordParser;
        Stream = stream;
    }

    /// <summary>Creates a reader for a stream whose encoding and BOM length are already known.</summary>
    /// <param name="recordParser">Parser used to split the content into records.</param>
    /// <param name="stream">Stream containing the CSV content.</param>
    /// <param name="encoding">Encoding of the stream.</param>
    /// <param name="bomByteCount">Number of bytes taken by the byte-order mark, 0 when there is none.</param>
    public CsvArrayString(RecordParser recordParser, Stream stream, Encoding encoding, int bomByteCount)
    {
        RecordParser = recordParser;
        Stream = stream;
        EncodingInfo = new EncodingInfo(encoding, bomByteCount);
    }

    /// <summary>
    /// Detects the encoding when none was supplied, creates the <see cref="StreamReader"/>,
    /// positions the base stream right after the byte-order mark and resets
    /// <see cref="IsEof"/> and <see cref="RowCount"/>.
    /// </summary>
    public void Initialize()
    {
        EncodingInfo ??= new EncodingDetector().GetStreamEncoding(Stream);
        StreamReader = new StreamReader(Stream, EncodingInfo!.Encoding, false);

        // Read a single character, then rewind.
        // NOTE(review): presumably primes the reader before repositioning the base stream - confirm against Rewind().
        var bufferBOM = new char[1];
        StreamReader.Read(bufferBOM, 0, bufferBOM.Length);
        StreamReader.Rewind();

        if (EncodingInfo!.BomBytesCount > 0)
            StreamReader.BaseStream.Position = EncodingInfo!.BomBytesCount;

        IsEof = false;
        RowCount = 0;
    }

    /// <summary>
    /// Lazily enumerates the data records of the stream, one array of field values per record.
    /// </summary>
    /// <returns>A sequence that parses the next record each time it is advanced.</returns>
    /// <exception cref="InvalidDataException">A record holds more fields than <see cref="Fields"/>.</exception>
    public IEnumerable<string?[]> Read()
    {
        // Test the reader, not the encoding: the constructor that receives an encoding sets
        // EncodingInfo up front, and the reader must still be created in that case.
        if (StreamReader is null)
            Initialize();

        while (!IsEof)
        {
            string?[]? values = ReadNextRecord();
            if (values is null)
                yield break;

            yield return values;
        }
    }

    /// <summary>
    /// Parses the next record, consuming the header record first when the profile declares one.
    /// </summary>
    /// <returns>The field values, padded with the profile's missing-cell value, or null at end of stream.</returns>
    private string?[]? ReadNextRecord()
    {
        if (IsEof)
            return null;

        Span<char> buffer = stackalloc char[BufferSize];
        Span<char> extra = stackalloc char[Extra.Length];
        Extra.Span.CopyTo(extra);

        string?[]? values;
        (values, IsEof) = RecordParser.ReadNextRecord(StreamReader, buffer, ref extra);

        if (IsEof && values!.Length == 0)
        {
            Extra = Memory<char>.Empty;
            return null;
        }

        // Persist what the parser handed back; the spans above do not outlive this call.
        if (Extra.Length != extra.Length)
            Extra = new char[extra.Length];
        extra.CopyTo(Extra.Span);

        bool hasHeader = RecordParser.Profile.Descriptor.Header;

        if (RowCount == 0 && Fields is null)
        {
            int unnamedFieldIndex = 0;
            if (hasHeader)
            {
                Fields = values!.Select(value => value ?? $"field_{unnamedFieldIndex++}").ToArray();
                return ReadNextRecord(); // Skip header and read next record
            }

            Fields = values!.Select(_ => $"field_{unnamedFieldIndex++}").ToArray();

            // Without a header the first record is already a data record, so it is counted.
            RowCount++;
            return values;
        }

        RowCount++;

        int expected = Fields?.Length ?? values!.Length;

        // Handle case with unexpected fields
        if (expected < values!.Length)
        {
            int surplus = values.Length - expected;
            throw new InvalidDataException(
                $"The record {RowCount + Convert.ToInt32(hasHeader)} contains {surplus} more field{(surplus > 1 ? "s" : string.Empty)} than expected.");
        }

        // Fill the missing cells
        if (expected > values.Length)
        {
            int actual = values.Length;
            Array.Resize(ref values, expected);
            for (int i = actual; i < expected; i++)
                values[i] = RecordParser.Profile.MissingCell;
        }

        return values;
    }

    /// <summary>Disposes the reader and the underlying stream.</summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this); // Keeps a finalizer declared by a derived class from running
    }

    /// <summary>Releases the resources held by this instance.</summary>
    /// <param name="disposing">
    /// True when called from <see cref="Dispose()"/>; false when called from a finalizer,
    /// in which case the managed reader and stream must not be touched.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (!disposing)
            return;

        StreamReader?.Dispose();
        Stream?.Dispose();
    }
}
33 changes: 31 additions & 2 deletions PocketCsvReader/CsvReader.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Buffers;
using System.Buffers;
using System.Data;
using System.IO;
using System.Text;
Expand Down Expand Up @@ -76,7 +76,6 @@ public DataTable ToDataTable(string filename, bool isFirstRowHeader)
return ToDataTable(stream);
}


/// <summary>
/// Reads the specified CSV file and returns an <see cref="IDataReader"/> for iterating over its records and fields.
/// </summary>
Expand Down Expand Up @@ -105,6 +104,36 @@ public IDataReader ToDataReader(string filename)
public IDataReader ToDataReader(Stream stream)
{
    // Hand the stream to a CsvDataReader that uses this reader's record parser.
    var dataReader = new CsvDataReader(RecordParser, stream);
    return dataReader;
}


/// <summary>
/// Reads the specified CSV file and returns its records as a sequence of string arrays.
/// </summary>
/// <param name="filename">The name or full path of the CSV file to read.</param>
/// <returns>A lazily evaluated sequence yielding one array of field values per record.</returns>
/// <remarks>
/// The file is opened when this method is called and records are parsed as the sequence is enumerated.
/// The file is closed once the enumeration completes or its enumerator is disposed
/// (which <c>foreach</c> does automatically), so the returned sequence can be enumerated only once.
/// </remarks>
public IEnumerable<string?[]> ToArrayString(string filename)
{
    CheckFileExists(filename);
    var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, Profile.BufferSize);
    var records = new CsvArrayString(new RecordParser(Profile), stream);
    return ReadThenDispose(records);

    // Iterator wrapper: the using block runs when the enumeration ends or the enumerator is
    // disposed, releasing the file handle deterministically instead of waiting for the GC.
    static IEnumerable<string?[]> ReadThenDispose(CsvArrayString source)
    {
        using (source)
        {
            foreach (var record in source.Read())
                yield return record;
        }
    }
}

/// <summary>
/// Reads the CSV data from the provided stream and returns its records as a sequence of string arrays.
/// </summary>
/// <param name="stream">The <see cref="Stream"/> containing the CSV data.</param>
/// <returns>A lazily evaluated sequence yielding one array of field values per record.</returns>
/// <remarks>
/// Records are parsed as the returned sequence is enumerated, using this reader's record parser.
/// </remarks>
public IEnumerable<string?[]> ToArrayString(Stream stream)
{
    var records = new CsvArrayString(RecordParser, stream);
    return records.Read();
}


protected virtual void CheckFileExists(string filename)
{
if (!File.Exists(filename))
Expand Down

0 comments on commit ffc6b28

Please sign in to comment.