diff --git a/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs b/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs index 0744f90..9cb5b70 100644 --- a/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs +++ b/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs @@ -309,16 +309,6 @@ public void WithCommentRows_ShouldSetCommentChar(params int[] rows) Assert.That(descriptor.CommentRows, Is.EqualTo(rows)); } - [Test] - public void WithCsvDdfVersion_ShouldSetCsvDdfVersion() - { - var descriptor = new DialectDescriptorBuilder() - .WithCsvDdfVersion("1.0") - .Build(); - - Assert.That(descriptor.CsvDdfVersion, Is.EqualTo("1.0")); - } - [Test] public void WithCsvDdfVersion_ShouldSetCsvDdfVersionToValue() { @@ -327,7 +317,6 @@ public void WithCsvDdfVersion_ShouldSetCsvDdfVersionToValue() .WithLineTerminator("\r") .WithQuoteChar('\'') .WithDoubleQuote() - .WithCsvDdfVersion("1.1") .Build(); var csvReader = new CsvReader(new CsvProfile(descriptor)); diff --git a/PocketCsvReader.Testing/EncodingDetectorTest.cs b/PocketCsvReader.Testing/EncodingDetectorTest.cs index 79d0d53..75666ad 100644 --- a/PocketCsvReader.Testing/EncodingDetectorTest.cs +++ b/PocketCsvReader.Testing/EncodingDetectorTest.cs @@ -59,7 +59,7 @@ public int GetHashCode(string obj) }; [TestCaseSource(nameof(Encodings))] - public void ToDataReader_Financial_CorrectRowsColumns(Encoding encoding) + public void GetStreamEncoding_Stream_CorrectResult(Encoding encoding) { using (var stream = new MemoryStream()) { @@ -73,5 +73,41 @@ public void ToDataReader_Financial_CorrectRowsColumns(Encoding encoding) Assert.That(result.Encoding, Is.EqualTo(encoding)); } } + + [TestCaseSource(nameof(Encodings))] + public void FromMime_StreamWithoutMime_CorrectResult(Encoding encoding) + { + using (var stream = new MemoryStream()) + { + using var writer = new StreamWriter(stream, encoding); + writer.Write("A,B,C\r\n1,2,3\r\n4,5,6\r\n"); + writer.Flush(); + stream.Position = 0; + + var detector = new EncodingDetector(); + var result = detector.GetStreamEncoding(stream, encoding.BodyName); + Assert.That(result.Encoding, Is.EqualTo(encoding)); + } + } + + [TestCase("ISO-8859-2")] + [TestCase("utf-8")] + public void FromMime_StreamWithMime_CorrectResult(string mime) + { + if (!Encoding.GetEncodings().Any(e => e.Name.Equals(mime, StringComparison.OrdinalIgnoreCase))) + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + + using (var stream = new MemoryStream()) + { + using var writer = new StreamWriter(stream, Encoding.GetEncoding(mime)); + writer.Write("A,B,C\r\n1,2,3\r\n4,5,6\r\n"); + writer.Flush(); + stream.Position = 0; + + var detector = new EncodingDetector(); + var result = detector.GetStreamEncoding(stream, mime); + Assert.That(result.Encoding.WebName, Is.EqualTo(mime).Using(StringComparer.OrdinalIgnoreCase)); + } + } } } diff --git a/PocketCsvReader/Configuration/CsvReaderBuilder.cs b/PocketCsvReader/Configuration/CsvReaderBuilder.cs index 2496319..3f7b2f7 100644 --- a/PocketCsvReader/Configuration/CsvReaderBuilder.cs +++ b/PocketCsvReader/Configuration/CsvReaderBuilder.cs @@ -9,6 +9,7 @@ public class CsvReaderBuilder { private DialectDescriptorBuilder _dialectBuilder = new(); private ISchemaDescriptorBuilder? _schemaBuilder; + private ResourceDescriptorBuilder? _resourceBuilder; public CsvReaderBuilder WithDialect(Func func) { @@ -33,10 +34,20 @@ public CsvReaderBuilder WithSchema(ISchemaDescriptorBuilder schemaBuilder) return this; } - public CsvReader Build() + + public CsvReaderBuilder WithResource(Func func) { - var csvReader = new CsvReader(new CsvProfile(_dialectBuilder.Build(), _schemaBuilder?.Build())); - return csvReader; + _resourceBuilder = func(new()); + return this; } + public CsvReaderBuilder WithResource(ResourceDescriptorBuilder resourceBuilder) + { + _resourceBuilder = resourceBuilder; + return this; + } + + public CsvReader Build() + => new (new CsvProfile(_dialectBuilder.Build(), _schemaBuilder?.Build(), _resourceBuilder?.Build())); + } diff --git a/PocketCsvReader/Configuration/DialectDescriptor.cs b/PocketCsvReader/Configuration/DialectDescriptor.cs index e991575..4aa97ac 100644 --- a/PocketCsvReader/Configuration/DialectDescriptor.cs +++ b/PocketCsvReader/Configuration/DialectDescriptor.cs @@ -8,7 +8,6 @@ namespace PocketCsvReader; public record DialectDescriptor ( - string Schema = "https://datapackage.org/profiles/1.0/tabledialect.json", bool Header = true, int[] HeaderRows = null!, string HeaderJoin = " ", @@ -20,8 +19,7 @@ public record DialectDescriptor bool DoubleQuote = true, char? EscapeChar = null, string? NullSequence = null, - bool SkipInitialSpace = false, - string CsvDdfVersion = "2.0" + bool SkipInitialSpace = false ) { public int[] HeaderRows { get; init; } = HeaderRows ?? [1]; diff --git a/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs b/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs index 2b218f1..ba22042 100644 --- a/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs +++ b/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs @@ -83,8 +83,6 @@ public DialectDescriptorBuilder WithCommentRows(int[] commentRows) => (Descriptor = Descriptor with { CommentRows = commentRows }, Builder: this).Builder; public DialectDescriptorBuilder WithoutCommentRows() => WithCommentRows([]); - public DialectDescriptorBuilder WithCsvDdfVersion(string version) - => (Descriptor = Descriptor with { CsvDdfVersion = version}, Builder: this).Builder; public DialectDescriptor Build() => Descriptor; diff --git a/PocketCsvReader/Configuration/ResourceDescriptor.cs b/PocketCsvReader/Configuration/ResourceDescriptor.cs new file mode 100644 index 0000000..863dee2 --- /dev/null +++ b/PocketCsvReader/Configuration/ResourceDescriptor.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace PocketCsvReader; + +public record ResourceDescriptor +( + string? Encoding = null +) +{ } diff --git a/PocketCsvReader/Configuration/ResourceDescriptorBuilder.cs b/PocketCsvReader/Configuration/ResourceDescriptorBuilder.cs new file mode 100644 index 0000000..f56cfc6 --- /dev/null +++ b/PocketCsvReader/Configuration/ResourceDescriptorBuilder.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.ComponentModel.Design; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace PocketCsvReader.Configuration; +public class ResourceDescriptorBuilder +{ + private ResourceDescriptor Descriptor { get; set; } = new(); + + public ResourceDescriptorBuilder WithEncoding(string? mime) + => (Descriptor = Descriptor with { Encoding = mime }, Builder: this).Builder; + public ResourceDescriptorBuilder WithoutEncoding() + => WithEncoding(null); + public ResourceDescriptor Build() + => Descriptor; +} diff --git a/PocketCsvReader/CsvDataReader.cs b/PocketCsvReader/CsvDataReader.cs index a031219..34b76d4 100644 --- a/PocketCsvReader/CsvDataReader.cs +++ b/PocketCsvReader/CsvDataReader.cs @@ -58,7 +58,7 @@ public CsvDataReader(Stream stream, CsvProfile profile) public void Initialize() { - FileEncoding ??= new EncodingDetector().GetStreamEncoding(Stream); + FileEncoding ??= new EncodingDetector().GetStreamEncoding(Stream, Profile.Resource?.Encoding); StreamReader = new StreamReader(Stream, FileEncoding!.Encoding, false); var bufferBOM = new char[1]; StreamReader.Read(bufferBOM, 0, bufferBOM.Length); diff --git a/PocketCsvReader/CsvProfile.cs b/PocketCsvReader/CsvProfile.cs index bc53a21..2dc7a29 100644 --- a/PocketCsvReader/CsvProfile.cs +++ b/PocketCsvReader/CsvProfile.cs @@ -1,4 +1,5 @@ using System; +using System.Reflection; using PocketCsvReader.Configuration; namespace PocketCsvReader; @@ -7,6 +8,7 @@ public class CsvProfile { public DialectDescriptor Dialect { get; private set; } public SchemaDescriptor? Schema { get; private set; } + public ResourceDescriptor? Resource { get; private set; } public ParserOptimizationOptions ParserOptimizations { get; set; } public Dictionary Sequences { get; } = new(); @@ -61,7 +63,7 @@ public CsvProfile(char fieldSeparator, char textQualifier, char escapeTextQualif MissingCell = missingCell; } - public CsvProfile(DialectDescriptor dialect) + public CsvProfile(DialectDescriptor dialect, SchemaDescriptor? schema = null, ResourceDescriptor? resource = null) { if (dialect.NullSequence is not null) Sequences.Add(dialect.NullSequence, null); @@ -70,12 +72,9 @@ public CsvProfile(DialectDescriptor dialect) ParserOptimizations = new ParserOptimizationOptions(); EmptyCell = string.Empty; MissingCell = string.Empty; - } - public CsvProfile(DialectDescriptor dialect, SchemaDescriptor? schema) - : this(dialect) - { Schema = schema; + Resource = resource; } private static CsvProfile? _commaDoubleQuote; diff --git a/PocketCsvReader/EncodingDetector.cs b/PocketCsvReader/EncodingDetector.cs index 3c54f7e..579a0da 100644 --- a/PocketCsvReader/EncodingDetector.cs +++ b/PocketCsvReader/EncodingDetector.cs @@ -11,46 +11,59 @@ public record EncodingInfo(Encoding Encoding, int BomBytesCount) public interface IEncodingDetector { - EncodingInfo GetStreamEncoding(Stream stream); - EncodingInfo GetFileEncoding(string filename); + EncodingInfo GetStreamEncoding(Stream stream, string? mime = null); + EncodingInfo GetFileEncoding(string filename, string? mime = null); } public class EncodingDetector : IEncodingDetector -{ +{ /// /// Detects the byte order mark of a streams and returns /// an appropriate encoding for the file. /// /// The stream to analyze for the encoding /// - public virtual EncodingInfo GetStreamEncoding(Stream stream) + public virtual EncodingInfo GetStreamEncoding(Stream stream, string? mime = null) { + if (stream == null || !stream.CanRead) + throw new ArgumentException("The stream is null or not readable."); + // Default = Ansi CodePage var encoding = Encoding.Default; + var encodingBytesCount = 0; // Detect byte order mark if any - otherwise assume default var buffer = new byte[5]; var n = stream.Read(buffer, 0, 5); if (n < 2) - return new(Encoding.ASCII, 0); - - var encodingBytesCount = 0; - - if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) - (encoding, encodingBytesCount) = (Encoding.UTF8, 3); - else if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0) - (encoding, encodingBytesCount) = (Encoding.UTF32, 4); - else if (buffer[0] == 0xff && buffer[1] == 0xfe) - (encoding, encodingBytesCount) = (Encoding.Unicode, 2); - else if (buffer[0] == 0xfe && buffer[1] == 0xff) - (encoding, encodingBytesCount) = (Encoding.BigEndianUnicode, 2); - else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff) - (encoding, encodingBytesCount) = (new UTF32Encoding(true, true), 4); - //else if (Buffer[0] == 0x2b && Buffer[1] == 0x2f && Buffer[2] == 0x76) - // encoding = Encoding.UTF7; + return new(Encoding.UTF8, 0); - encoding = encoding.Equals(Encoding.Default) ? Encoding.UTF8 : encoding; + if (mime is null) + { + foreach (var encodingInfo in Encoding.GetEncodings().OrderByDescending(e => e.GetEncoding().Preamble.Length)) + { + var preamble = encodingInfo.GetEncoding().Preamble; + if (preamble.Length > 0 && buffer.AsSpan(0, preamble.Length).SequenceEqual(preamble)) + { + encoding = encodingInfo.GetEncoding(); + encodingBytesCount = preamble.Length; + break; + } + } + // Fallback to UTF-8 if no BOM matches and it's not the default encoding + if (encoding.Equals(Encoding.Default)) + (encoding, encodingBytesCount) = (Encoding.UTF8, 0); + } + else + { + if (!Encoding.GetEncodings().Any(e => e.Name.Equals(mime, StringComparison.OrdinalIgnoreCase))) + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + encoding = Encoding.GetEncoding(mime); + encodingBytesCount = encoding.Preamble.Length > 0 && buffer.AsSpan(0, encoding.Preamble.Length).SequenceEqual(encoding.Preamble) + ? encoding.Preamble.Length + : 0; + } return new(encoding, encodingBytesCount); } @@ -60,9 +73,9 @@ public virtual EncodingInfo GetStreamEncoding(Stream stream) /// /// /// - public virtual EncodingInfo GetFileEncoding(string filename) + public virtual EncodingInfo GetFileEncoding(string filename, string? mime = null) { using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, 8, false)) - return GetStreamEncoding(stream); + return GetStreamEncoding(stream, mime); } } diff --git a/PocketCsvReader/PocketCsvReader.csproj b/PocketCsvReader/PocketCsvReader.csproj index ed17bc7..710cc7e 100644 --- a/PocketCsvReader/PocketCsvReader.csproj +++ b/PocketCsvReader/PocketCsvReader.csproj @@ -3,7 +3,7 @@ PocketCsvReader https://github.com/Seddryck/PocketCsvReader - CSV TSV DataTable Parser + CSV csvhelper TSV DataTable DataReader Parser Separated Delimited PocketCsvReader is a lightweight library dedicated to the parsing of delimited flat file such as CSV or TSV files. The main function is to read the content of the file and load it into a DataTable.