diff --git a/src/FileIO/FileIO.csproj b/src/FileIO/FileIO.csproj index a15d9bc..99f4de5 100644 --- a/src/FileIO/FileIO.csproj +++ b/src/FileIO/FileIO.csproj @@ -8,6 +8,7 @@ + diff --git a/src/FileIO/WithCsvHelperLib.cs b/src/FileIO/WithCsvHelperLib.cs index 0a9e0b1..31e63e3 100644 --- a/src/FileIO/WithCsvHelperLib.cs +++ b/src/FileIO/WithCsvHelperLib.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Globalization; using System.IO; @@ -9,12 +10,28 @@ namespace FileIO public class WithCsvHelperLib { - public IEnumerable ProcessFileAsync(string filePath) + public IEnumerable ProcessFileAsync(string filePath) { using var reader = new StreamReader(filePath); + + Employee[] employees = new Employee[100000]; using var csv = new CsvReader(reader, CultureInfo.InvariantCulture); - var records = csv.GetRecords(); - return records.ToList(); + int idx = 0; + csv.Read(); + while (csv.Read()) + { + var emp = new Employee + { + Name = csv[0], + Email = csv[1], + DateOfJoining = DateTime.Parse(csv[2]), + Salary = double.Parse(csv[3]), + Age = int.Parse(csv[4]), + }; + employees[idx++] = emp; + + } + return employees; } } diff --git a/src/FileIO/WithPipeLines.cs b/src/FileIO/WithPipeLines.cs index 5a49eb4..f77d8fd 100644 --- a/src/FileIO/WithPipeLines.cs +++ b/src/FileIO/WithPipeLines.cs @@ -19,10 +19,10 @@ public class WithPipeLines /// PipeReader Sequence Position public async Task ProcessFileAsync(string filePath, Employee[] employeeRecords) { + const int BufferSize = 0x10000; var position = 0; - if (!File.Exists(filePath)) return position; - await using var fileStream = File.OpenRead(filePath); - var pipeReader = PipeReader.Create(fileStream); + await using var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, BufferSize); + var pipeReader = PipeReader.Create(fileStream, new StreamPipeReaderOptions(bufferSize: BufferSize)); while (true) { var fileData = await pipeReader.ReadAsync(); @@ -47,14 +47,16 @@ public async Task ProcessFileAsync(string filePath, Employee[] employeeReco private static SequencePosition ParseLines(Employee[] employeeRecords, in ReadOnlySequence buffer, ref int position) { var reader = new SequenceReader(buffer); + ReadOnlySpan line; + // skip the header row + reader.TryReadTo(out line, (byte)'\n', true); // Read the whole line till the new line is found - while (reader.TryReadTo(out ReadOnlySpan line, (byte)'\n', true)) + while (reader.TryReadTo(out line, (byte)'\n', true)) { var employee = LineParser.ParseLine(line); // we have a line to parse - if (employee is { }) // if the returned value is valid Employee object - employeeRecords[position++] = employee.Value; + employeeRecords[position++] = employee; } return reader.Position; // returning the Last position of the reader @@ -62,88 +64,48 @@ private static SequencePosition ParseLines(Employee[] employeeRecords, in ReadOn private static class LineParser { - private const byte Coma = (byte)','; - private static readonly byte[] ColumnHeaders = Encoding.UTF8.GetBytes("Name,Email,DateOfJoining,Salary,Age"); + private const byte Comma = (byte)','; - public static Employee? ParseLine(ReadOnlySpan line) + public static Employee ParseLine(ReadOnlySpan line) { - // REVIEW: There are better ways to do this - if (line.IndexOf(ColumnHeaders) >= 0) // Ignore the Header row - { - return null; - } - // Trim \r (if it exists) line = line.TrimEnd((byte)'\r'); - var fieldCount = 1; - var record = new Employee(); - while (fieldCount <= 5) // we have five fields in csv file + var idx = line.IndexOf(Comma); + + record.Name = Encoding.UTF8.GetString(line[..idx]); + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + record.Email = Encoding.UTF8.GetString(line[..idx]); + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + + // stand on our heads to avoid allocating a temp string to parse the date + var buffer = line[..idx]; + Span chars = stackalloc char[buffer.Length]; + for (int i = 0; i < buffer.Length; i++) + { + chars[i] = (char)buffer[i]; + } + if (DateTime.TryParse(chars, out var doj)) + { + record.DateOfJoining = doj; + } + line = line[(idx + 1)..]; + idx = line.IndexOf(Comma); + + if (Utf8Parser.TryParse(line[..idx], out double salary, out _)) + { + record.Salary = salary; + } + + line = line[(idx + 1)..]; + + if (Utf8Parser.TryParse(line, out short age, out _)) { - var comaAt = line.IndexOf(Coma); - if (comaAt < 0) // No more comas are found we have reached the last field. - { - comaAt = line.Length; - } - - switch (fieldCount) - { - case 1: - { - var value = Encoding.UTF8.GetString(line[..comaAt]); - record.Name = value; - break; - } - case 2: - { - var value = Encoding.UTF8.GetString(line[..comaAt]); - record.Email = value; - break; - } - case 3: - { - var buffer = line[..comaAt]; - if (DateTime.TryParse(Encoding.UTF8.GetString(line[..comaAt]), out var doj)) - - { - record.DateOfJoining = doj; - } - // Can't use Utf8 parser to extract datetime field because csv format doesn't have time - //https://docs.microsoft.com/en-us/dotnet/api/system.buffers.text.utf8parser.tryparse?view=net-5.0#System_Buffers_Text_Utf8Parser_TryParse_System_ReadOnlySpan_System_Byte__System_DateTime__System_Int32__System_Char_ - - // if (Utf8Parser.TryParse(buffer, out DateTime value, out var bytesConsumed)) - // { - // record.DateOfJoining = value; - // } - break; - } - - case 4: - { - var buffer = line[..comaAt]; - if (Utf8Parser.TryParse(buffer, out double value, out var bytesConsumed)) - { - record.Salary = value; - } - break; - } - - case 5: - { - var buffer = line[..comaAt]; - if (Utf8Parser.TryParse(buffer, out short value, out var bytesConsumed)) - { - record.Age = value; - } - return record; - } - } - - line = line[(comaAt + 1)..]; // slice past field - - fieldCount++; + record.Age = age; } return record; diff --git a/src/FileIO/WithSylvanLib.cs b/src/FileIO/WithSylvanLib.cs new file mode 100644 index 0000000..c431300 --- /dev/null +++ b/src/FileIO/WithSylvanLib.cs @@ -0,0 +1,57 @@ +using System.IO; +using Sylvan.Data.Csv; +using System.Buffers; +using System.Threading.Tasks; +using System.Text; + +namespace FileIO +{ + public class WithSylvanLib + { + public void ProcessFile(string filePath, Employee[] employeeRecords) + { + const int BufferSize = 0x10000; + using var reader = new StreamReader(filePath, Encoding.UTF8, false, BufferSize); + + char[] buffer = ArrayPool.Shared.Rent(BufferSize); + + using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions { Buffer = buffer }); + int idx = 0; + while (csv.Read()) + { + employeeRecords[idx++] = new Employee + { + Name = csv.GetString(0), + Email = csv.GetString(1), + DateOfJoining = csv.GetDateTime(2), + Salary = csv.GetDouble(3), + Age = csv.GetInt32(4), + }; + } + ArrayPool.Shared.Return(buffer); + } + + public async Task ProcessFileAsync(string filePath, Employee[] employeeRecords) + { + const int BufferSize = 0x10000; + using var reader = new StreamReader(filePath, Encoding.UTF8, false, BufferSize); + + char[] buffer = ArrayPool.Shared.Rent(BufferSize); + + await using var csv = await CsvDataReader.CreateAsync(reader, new CsvDataReaderOptions { Buffer = buffer }); + int idx = 0; + while (await csv.ReadAsync()) + { + employeeRecords[idx++] = new Employee + { + Name = csv.GetString(0), + Email = csv.GetString(1), + DateOfJoining = csv.GetDateTime(2), + Salary = csv.GetDouble(3), + Age = csv.GetInt32(4), + }; + } + ArrayPool.Shared.Return(buffer); + } + } +} diff --git a/tests/FileIO.Benchmarks/FileIOTest.cs b/tests/FileIO.Benchmarks/FileIOTest.cs index 2d55254..edcf8ae 100644 --- a/tests/FileIO.Benchmarks/FileIOTest.cs +++ b/tests/FileIO.Benchmarks/FileIOTest.cs @@ -13,18 +13,11 @@ namespace FileIO.Benchmarks [RankColumn()] public class FileIOTest { - private string _filePath; - [GlobalSetup] - public void Setup() - { - var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); - } + private string _filePath = "Employees.csv"; + [Benchmark] public async Task PipeLines() { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - //_filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); var pool = ArrayPool.Shared; var employeeRecords = pool.Rent(100000); var pipeLinesTest = new WithPipeLines(); @@ -41,22 +34,52 @@ public async Task PipeLines() [Benchmark] public async Task> AsyncStream() - { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - // _filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); + { var asyncStream = new WithAsyncStreams(); - var employees = await asyncStream.ProcessStreamAsync(_filePath); - return employees; + var employees = await asyncStream.ProcessStreamAsync(_filePath); + return employees; } [Benchmark] public void CsvHelper() { - // var directoryPath = Path.GetDirectoryName(Assembly.GetAssembly(typeof(Program))?.Location); - //_filePath = Path.Combine(directoryPath ?? string.Empty, "Employees.csv"); var csvHelper = new WithCsvHelperLib(); var employeesList = csvHelper.ProcessFileAsync(_filePath); } + + [Benchmark] + public void Sylvan() + { + var sylv = new WithSylvanLib(); + var pool = ArrayPool.Shared; + var employeeRecords = pool.Rent(100000); + + try + { + sylv.ProcessFile(_filePath, employeeRecords); + } + finally + { + pool.Return(employeeRecords, true); + } + } + + [Benchmark] + public async Task SylvanAsync() + { + var sylv = new WithSylvanLib(); + var pool = ArrayPool.Shared; + var employeeRecords = pool.Rent(100000); + + try + { + await sylv.ProcessFileAsync(_filePath, employeeRecords); + } + finally + { + pool.Return(employeeRecords, true); + } + } } -} \ No newline at end of file +}