Skip to content

Commit

Permalink
feat: accept MIME when detecting encoding (#90)
Browse files Browse the repository at this point in the history
- if a MIME is supplied, the detector will check whether the corresponding BOM is present
- without a MIME, it will check the standard encodings based on the BOM
- if none matches, fall back to UTF-8 without BOM
  • Loading branch information
Seddryck authored Jan 6, 2025
1 parent e154b5c commit 631b769
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -309,16 +309,6 @@ public void WithCommentRows_ShouldSetCommentChar(params int[] rows)
Assert.That(descriptor.CommentRows, Is.EqualTo(rows));
}

// Checks that the version handed to WithCsvDdfVersion is exposed by the built descriptor.
[Test]
public void WithCsvDdfVersion_ShouldSetCsvDdfVersion()
{
    const string expectedVersion = "1.0";
    var builder = new DialectDescriptorBuilder().WithCsvDdfVersion(expectedVersion);

    var built = builder.Build();

    Assert.That(built.CsvDdfVersion, Is.EqualTo(expectedVersion));
}

[Test]
public void WithCsvDdfVersion_ShouldSetCsvDdfVersionToValue()
{
Expand All @@ -327,7 +317,6 @@ public void WithCsvDdfVersion_ShouldSetCsvDdfVersionToValue()
.WithLineTerminator("\r")
.WithQuoteChar('\'')
.WithDoubleQuote()
.WithCsvDdfVersion("1.1")
.Build();
var csvReader = new CsvReader(new CsvProfile(descriptor));

Expand Down
38 changes: 37 additions & 1 deletion PocketCsvReader.Testing/EncodingDetectorTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public int GetHashCode(string obj)
};

[TestCaseSource(nameof(Encodings))]
public void ToDataReader_Financial_CorrectRowsColumns(Encoding encoding)
public void GetStreamEncoding_Stream_CorrectResult(Encoding encoding)
{
using (var stream = new MemoryStream())
{
Expand All @@ -73,5 +73,41 @@ public void ToDataReader_Financial_CorrectRowsColumns(Encoding encoding)
Assert.That(result.Encoding, Is.EqualTo(encoding));
}
}

/// <summary>
/// Writes a small CSV payload using <paramref name="encoding"/> and checks that the detector,
/// given the encoding's <c>BodyName</c> as its second argument, reports that same encoding.
/// </summary>
/// <remarks>
/// NOTE(review): the method name says "WithoutMime" although a name is passed to the detector - confirm the intended name.
/// </remarks>
[TestCaseSource(nameof(Encodings))]
public void FromMime_StreamWithoutMime_CorrectResult(Encoding encoding)
{
    const string payload = "A,B,C\r\n1,2,3\r\n4,5,6\r\n";

    using var stream = new MemoryStream();
    using var writer = new StreamWriter(stream, encoding);
    writer.Write(payload);
    writer.Flush();
    // Rewind so that the detector reads from the first byte written.
    stream.Seek(0, SeekOrigin.Begin);

    var detected = new EncodingDetector().GetStreamEncoding(stream, encoding.BodyName);

    Assert.That(detected.Encoding, Is.EqualTo(encoding));
}

/// <summary>
/// Writes a small CSV payload with the encoding named by <paramref name="mime"/> and checks that
/// the detector, given the same name, returns an encoding whose <c>WebName</c> matches it
/// (case-insensitively).
/// </summary>
[TestCase("ISO-8859-2")]
[TestCase("utf-8")]
public void FromMime_StreamWithMime_CorrectResult(string mime)
{
    // Register the code-pages provider only when the requested name is not already listed.
    var alreadyListed = Encoding.GetEncodings()
        .Any(info => info.Name.Equals(mime, StringComparison.OrdinalIgnoreCase));
    if (!alreadyListed)
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    const string payload = "A,B,C\r\n1,2,3\r\n4,5,6\r\n";

    using var stream = new MemoryStream();
    using var writer = new StreamWriter(stream, Encoding.GetEncoding(mime));
    writer.Write(payload);
    writer.Flush();
    // Rewind so that the detector reads from the first byte written.
    stream.Seek(0, SeekOrigin.Begin);

    var detected = new EncodingDetector().GetStreamEncoding(stream, mime);

    Assert.That(detected.Encoding.WebName, Is.EqualTo(mime).Using(StringComparer.OrdinalIgnoreCase));
}
}
}
17 changes: 14 additions & 3 deletions PocketCsvReader/Configuration/CsvReaderBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ public class CsvReaderBuilder
{
private DialectDescriptorBuilder _dialectBuilder = new();
private ISchemaDescriptorBuilder? _schemaBuilder;
private ResourceDescriptorBuilder? _resourceBuilder;

public CsvReaderBuilder WithDialect(Func<DialectDescriptorBuilder, DialectDescriptorBuilder> func)
{
Expand All @@ -33,10 +34,20 @@ public CsvReaderBuilder WithSchema(ISchemaDescriptorBuilder schemaBuilder)
return this;
}

public CsvReader Build()

public CsvReaderBuilder WithResource(Func<ResourceDescriptorBuilder, ResourceDescriptorBuilder> func)
{
var csvReader = new CsvReader(new CsvProfile(_dialectBuilder.Build(), _schemaBuilder?.Build()));
return csvReader;
_resourceBuilder = func(new());
return this;
}

/// <summary>
/// Stores the supplied resource builder; its descriptor is produced when the reader is built.
/// </summary>
/// <param name="resourceBuilder">The resource descriptor builder to keep.</param>
/// <returns>This builder, so that calls can be chained.</returns>
public CsvReaderBuilder WithResource(ResourceDescriptorBuilder resourceBuilder)
    => (_resourceBuilder = resourceBuilder, Builder: this).Builder;

/// <summary>
/// Creates a <see cref="CsvReader"/> whose profile is assembled from the dialect builder and,
/// when they have been set, the schema and resource builders.
/// </summary>
/// <returns>A new reader configured with the built descriptors.</returns>
public CsvReader Build()
{
    var dialect = _dialectBuilder.Build();
    // Schema and resource are optional: a missing builder yields a null descriptor.
    var schema = _schemaBuilder?.Build();
    var resource = _resourceBuilder?.Build();

    return new CsvReader(new CsvProfile(dialect, schema, resource));
}

}
4 changes: 1 addition & 3 deletions PocketCsvReader/Configuration/DialectDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ namespace PocketCsvReader;

public record DialectDescriptor
(
string Schema = "https://datapackage.org/profiles/1.0/tabledialect.json",
bool Header = true,
int[] HeaderRows = null!,
string HeaderJoin = " ",
Expand All @@ -20,8 +19,7 @@ public record DialectDescriptor
bool DoubleQuote = true,
char? EscapeChar = null,
string? NullSequence = null,
bool SkipInitialSpace = false,
string CsvDdfVersion = "2.0"
bool SkipInitialSpace = false
)
{
public int[] HeaderRows { get; init; } = HeaderRows ?? [1];
Expand Down
2 changes: 0 additions & 2 deletions PocketCsvReader/Configuration/DialectDescriptorBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ public DialectDescriptorBuilder WithCommentRows(int[] commentRows)
=> (Descriptor = Descriptor with { CommentRows = commentRows }, Builder: this).Builder;
/// <summary>
/// Clears the comment rows by delegating to <c>WithCommentRows</c> with an empty collection.
/// </summary>
/// <returns>The builder returned by <c>WithCommentRows</c>.</returns>
public DialectDescriptorBuilder WithoutCommentRows()
{
    return WithCommentRows([]);
}
/// <summary>
/// Replaces the current descriptor with a copy whose <c>CsvDdfVersion</c> is <paramref name="version"/>.
/// </summary>
/// <param name="version">The version value to store on the descriptor.</param>
/// <returns>This builder, so that calls can be chained.</returns>
public DialectDescriptorBuilder WithCsvDdfVersion(string version)
{
    Descriptor = Descriptor with { CsvDdfVersion = version };
    return this;
}

/// <summary>
/// Returns the descriptor accumulated by this builder.
/// </summary>
/// <returns>The current <see cref="DialectDescriptor"/>.</returns>
public DialectDescriptor Build()
{
    return Descriptor;
}
Expand Down
13 changes: 13 additions & 0 deletions PocketCsvReader/Configuration/ResourceDescriptor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace PocketCsvReader;

/// <summary>
/// Immutable description of a resource; currently it only carries an optional encoding name.
/// </summary>
/// <param name="Encoding">Name of the encoding of the resource, or <c>null</c> when none is specified.</param>
public record ResourceDescriptor(string? Encoding = null);
19 changes: 19 additions & 0 deletions PocketCsvReader/Configuration/ResourceDescriptorBuilder.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System;
using System.Collections.Generic;
using System.ComponentModel.Design;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace PocketCsvReader.Configuration;
/// <summary>
/// Fluent builder for <see cref="ResourceDescriptor"/>; starts from a default descriptor and
/// replaces it with a modified copy on each configuration call.
/// </summary>
public class ResourceDescriptorBuilder
{
    private ResourceDescriptor Descriptor { get; set; } = new();

    /// <summary>
    /// Sets the encoding name of the descriptor being built.
    /// </summary>
    /// <param name="mime">The encoding name to store, or <c>null</c> to clear it.</param>
    /// <returns>This builder, so that calls can be chained.</returns>
    public ResourceDescriptorBuilder WithEncoding(string? mime)
    {
        Descriptor = Descriptor with { Encoding = mime };
        return this;
    }

    /// <summary>
    /// Clears the encoding name; equivalent to <c>WithEncoding(null)</c>.
    /// </summary>
    /// <returns>This builder, so that calls can be chained.</returns>
    public ResourceDescriptorBuilder WithoutEncoding()
    {
        return WithEncoding(null);
    }

    /// <summary>
    /// Returns the descriptor accumulated by this builder.
    /// </summary>
    /// <returns>The current <see cref="ResourceDescriptor"/>.</returns>
    public ResourceDescriptor Build()
    {
        return Descriptor;
    }
}
2 changes: 1 addition & 1 deletion PocketCsvReader/CsvDataReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public CsvDataReader(Stream stream, CsvProfile profile)

public void Initialize()
{
FileEncoding ??= new EncodingDetector().GetStreamEncoding(Stream);
FileEncoding ??= new EncodingDetector().GetStreamEncoding(Stream, Profile.Resource?.Encoding);
StreamReader = new StreamReader(Stream, FileEncoding!.Encoding, false);
var bufferBOM = new char[1];
StreamReader.Read(bufferBOM, 0, bufferBOM.Length);
Expand Down
9 changes: 4 additions & 5 deletions PocketCsvReader/CsvProfile.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Reflection;
using PocketCsvReader.Configuration;

namespace PocketCsvReader;
Expand All @@ -7,6 +8,7 @@ public class CsvProfile
{
public DialectDescriptor Dialect { get; private set; }
public SchemaDescriptor? Schema { get; private set; }
public ResourceDescriptor? Resource { get; private set; }
public ParserOptimizationOptions ParserOptimizations { get; set; }
public Dictionary<string, string?> Sequences { get; } = new();

Expand Down Expand Up @@ -61,7 +63,7 @@ public CsvProfile(char fieldSeparator, char textQualifier, char escapeTextQualif
MissingCell = missingCell;
}

public CsvProfile(DialectDescriptor dialect)
public CsvProfile(DialectDescriptor dialect, SchemaDescriptor? schema = null, ResourceDescriptor? resource = null)
{
if (dialect.NullSequence is not null)
Sequences.Add(dialect.NullSequence, null);
Expand All @@ -70,12 +72,9 @@ public CsvProfile(DialectDescriptor dialect)
ParserOptimizations = new ParserOptimizationOptions();
EmptyCell = string.Empty;
MissingCell = string.Empty;
}

public CsvProfile(DialectDescriptor dialect, SchemaDescriptor? schema)
: this(dialect)
{
Schema = schema;
Resource = resource;
}

private static CsvProfile? _commaDoubleQuote;
Expand Down
59 changes: 36 additions & 23 deletions PocketCsvReader/EncodingDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,46 +11,59 @@ public record EncodingInfo(Encoding Encoding, int BomBytesCount)

public interface IEncodingDetector
{
EncodingInfo GetStreamEncoding(Stream stream);
EncodingInfo GetFileEncoding(string filename);
EncodingInfo GetStreamEncoding(Stream stream, string? mime = null);
EncodingInfo GetFileEncoding(string filename, string? mime = null);
}

public class EncodingDetector : IEncodingDetector
{
{
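/// <summary>
/// Detects the byte order mark of a stream and returns
/// an appropriate encoding for the file.
/// </summary>
/// <param name="stream">The stream to analyze for the encoding</param>
/// <param name="mime">Optional name of the expected encoding. When supplied, that encoding is returned and the stream is only checked for its byte order mark; when <c>null</c>, the encoding is inferred from the byte order mark, falling back to UTF-8.</param>
/// <returns>The encoding together with the number of bytes taken by the byte order mark (0 when none is present).</returns>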
public virtual EncodingInfo GetStreamEncoding(Stream stream)
public virtual EncodingInfo GetStreamEncoding(Stream stream, string? mime = null)
{
if (stream == null || !stream.CanRead)
throw new ArgumentException("The stream is null or not readable.");

// Default = Ansi CodePage
var encoding = Encoding.Default;
var encodingBytesCount = 0;

// Detect byte order mark if any - otherwise assume default
var buffer = new byte[5];
var n = stream.Read(buffer, 0, 5);

if (n < 2)
return new(Encoding.ASCII, 0);

var encodingBytesCount = 0;

if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
(encoding, encodingBytesCount) = (Encoding.UTF8, 3);
else if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
(encoding, encodingBytesCount) = (Encoding.UTF32, 4);
else if (buffer[0] == 0xff && buffer[1] == 0xfe)
(encoding, encodingBytesCount) = (Encoding.Unicode, 2);
else if (buffer[0] == 0xfe && buffer[1] == 0xff)
(encoding, encodingBytesCount) = (Encoding.BigEndianUnicode, 2);
else if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
(encoding, encodingBytesCount) = (new UTF32Encoding(true, true), 4);
//else if (Buffer[0] == 0x2b && Buffer[1] == 0x2f && Buffer[2] == 0x76)
// encoding = Encoding.UTF7;
return new(Encoding.UTF8, 0);

encoding = encoding.Equals(Encoding.Default) ? Encoding.UTF8 : encoding;
if (mime is null)
{
foreach (var encodingInfo in Encoding.GetEncodings().OrderByDescending(e => e.GetEncoding().Preamble.Length))
{
var preamble = encodingInfo.GetEncoding().Preamble;
if (preamble.Length > 0 && buffer.AsSpan(0, preamble.Length).SequenceEqual(preamble))
{
encoding = encodingInfo.GetEncoding();
encodingBytesCount = preamble.Length;
break;
}
}
// Fallback to UTF-8 if no BOM matches and it's not the default encoding
if (encoding.Equals(Encoding.Default))
(encoding, encodingBytesCount) = (Encoding.UTF8, 0);
}
else
{
if (!Encoding.GetEncodings().Any(e => e.Name.Equals(mime, StringComparison.OrdinalIgnoreCase)))
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
encoding = Encoding.GetEncoding(mime);
encodingBytesCount = encoding.Preamble.Length > 0 && buffer.AsSpan(0, encoding.Preamble.Length).SequenceEqual(encoding.Preamble)
? encoding.Preamble.Length
: 0;
}
return new(encoding, encodingBytesCount);
}

Expand All @@ -60,9 +73,9 @@ public virtual EncodingInfo GetStreamEncoding(Stream stream)
/// </summary>
/// <param name="filename"></param>
/// <returns></returns>
public virtual EncodingInfo GetFileEncoding(string filename)
public virtual EncodingInfo GetFileEncoding(string filename, string? mime = null)
{
using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read, 8, false))
return GetStreamEncoding(stream);
return GetStreamEncoding(stream, mime);
}
}
2 changes: 1 addition & 1 deletion PocketCsvReader/PocketCsvReader.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<PackageId>PocketCsvReader</PackageId>
<RepositoryUrl>https://github.com/Seddryck/PocketCsvReader</RepositoryUrl>
<PackageTags>CSV TSV DataTable Parser</PackageTags>
<PackageTags>CSV csvhelper TSV DataTable DataReader Parser Separated Delimited</PackageTags>
<Description>PocketCsvReader is a lightweight library dedicated to the parsing of delimited flat files such as CSV or TSV files. The main function is to read the content of the file and load it into a DataTable.</Description>
</PropertyGroup>
<PropertyGroup>
Expand Down

0 comments on commit 631b769

Please sign in to comment.