diff --git a/appveyor.yml b/appveyor.yml
index 9faed95..eec9afb 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -4,7 +4,7 @@ environment:
version: 2.0.{build}
clone_folder: c:\utfUnknown
-image: Visual Studio 2019
+image: Visual Studio 2022
configuration: Release
platform: Any CPU
nuget:
@@ -12,7 +12,7 @@ nuget:
init:
- git config --global core.autocrlf true
build_script:
-- ps: dotnet build -c Release
+- ps: dotnet build -c Release
test_script:
- ps: cd .\tests\
- ps: dotnet test
diff --git a/example/ConsoleExample.csproj b/example/ConsoleExample.csproj
index d05a31d..abb50ac 100644
--- a/example/ConsoleExample.csproj
+++ b/example/ConsoleExample.csproj
@@ -2,7 +2,7 @@
Exe
- netcoreapp3.0
+ net6.0
diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs
index 47ddc5e..8f791b9 100644
--- a/src/CharsetDetector.cs
+++ b/src/CharsetDetector.cs
@@ -1,4 +1,4 @@
-/* ***** BEGIN LICENSE BLOCK *****
+/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
@@ -41,7 +41,8 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
-
+using System.Threading;
+using System.Threading.Tasks;
using UtfUnknown.Core;
using UtfUnknown.Core.Probers;
@@ -118,7 +119,7 @@ private CharsetDetector()
///
/// Detect the character encoding form this byte array.
- /// It searchs for BOM from bytes[0].
+ /// It searches for BOM from bytes[0].
///
/// The byte array containing the text
///
@@ -136,7 +137,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
///
/// Detect the character encoding form this byte array.
- /// It searchs for BOM from bytes[offset].
+ /// It searches for BOM from bytes[offset].
///
/// The byte array containing the text
/// The zero-based byte offset in buffer at which to begin reading the data from
@@ -166,8 +167,6 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
return detector.DataEnd();
}
-#if !NETSTANDARD1_0
-
///
/// Detect the character encoding by reading the stream.
///
@@ -210,37 +209,106 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
return detector.DataEnd();
}
+        /// <summary>
+        /// Asynchronously detect the character encoding by reading the stream.
+        /// <para>Note: stream position is not reset before and after.</para>
+        /// </summary>
+        /// <param name="stream">The stream.</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
+        public static Task DetectFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
+        {
+            // This overload is not async, so the null check throws directly at the call site.
+            if (stream is null)
+            {
+                throw new ArgumentNullException(nameof(stream));
+            }
+
+            // No byte limit: hand over to the overload that takes maxBytesToRead.
+            long? noLimit = null;
+            return DetectFromStreamAsync(stream, noLimit, cancellationToken);
+        }
+
+        /// <summary>
+        /// Asynchronously detect the character encoding by reading the stream.
+        /// <para>Note: stream position is not reset before and after.</para>
+        /// </summary>
+        /// <param name="stream">The stream.</param>
+        /// <param name="maxBytesToRead">Max bytes to read from <paramref name="stream"/>. If <c>null</c>, then no max.</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task that completes with the result produced by the detector once reading has stopped.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxBytesToRead"/> is 0 or lower.</exception>
+        public static async Task DetectFromStreamAsync(Stream stream, long? maxBytesToRead, CancellationToken cancellationToken = default)
+        {
+            if (stream == null)
+            {
+                throw new ArgumentNullException(nameof(stream));
+            }
+
+            // A null limit compares as false here, so "no max" passes the check.
+            if (maxBytesToRead <= 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxBytesToRead));
+            }
+
+            var detector = new CharsetDetector();
+
+            // Library code: do not resume on the caller's synchronization context.
+            await ReadStreamAsync(stream, maxBytesToRead, detector, cancellationToken).ConfigureAwait(false);
+            return detector.DataEnd();
+        }
+
+ private const int BufferSize = 1024;
+
private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector detector)
{
- const int bufferSize = 1024;
- byte[] buff = new byte[bufferSize];
+ byte[] buff = new byte[BufferSize];
int read;
long readTotal = 0;
- var toRead = CalcToRead(maxBytes, readTotal, bufferSize);
+ var toRead = CalcToRead(maxBytes, readTotal, BufferSize);
while ((read = stream.Read(buff, 0, toRead)) > 0)
{
- detector.Feed(buff, 0, read);
-
- if (maxBytes != null)
+ if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead))
{
- readTotal += read;
- if (readTotal >= maxBytes)
- {
- return;
- }
-
- toRead = CalcToRead(maxBytes, readTotal, bufferSize);
+ return;
}
+ }
+ }
- if (detector._done)
+        // Asynchronous counterpart of ReadStream: reads the stream in BufferSize chunks and feeds
+        // each chunk to the detector until the stream ends or FeedDetector signals to stop.
+        private static async Task ReadStreamAsync(Stream stream, long? maxBytes, CharsetDetector detector, CancellationToken cancellationToken = default)
+        {
+            byte[] buff = new byte[BufferSize];
+            int read;
+            long readTotal = 0;
+
+            var toRead = CalcToRead(maxBytes, readTotal, BufferSize);
+
+            // Library code: ConfigureAwait(false) avoids resuming on the caller's synchronization context.
+            while ((read = await stream.ReadAsync(buff, 0, toRead, cancellationToken).ConfigureAwait(false)) > 0)
+            {
+                if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead))
                 {
                     return;
                 }
             }
         }
+        /// <summary>
+        /// Feeds the first <paramref name="read"/> bytes of <paramref name="buff"/> to the detector and,
+        /// when a byte limit is set, updates <paramref name="readTotal"/> and <paramref name="toRead"/>.
+        /// </summary>
+        /// <returns>
+        /// <c>true</c> when the caller should stop reading: either the byte limit has been reached
+        /// or the detector reports that it is done.
+        /// </returns>
+        private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead)
+        {
+            detector.Feed(buff, 0, read);
+
+            if (maxBytes.HasValue)
+            {
+                readTotal += read;
+                if (readTotal >= maxBytes.Value)
+                {
+                    // Byte limit reached: stop regardless of the detector state.
+                    return true;
+                }
+
+                // Recompute the size of the next read from the updated total.
+                toRead = CalcToRead(maxBytes, readTotal, BufferSize);
+            }
+
+            return detector._done;
+        }
+
private static int CalcToRead(long? maxBytes, long readTotal, int bufferSize)
{
if (readTotal + bufferSize > maxBytes)
@@ -264,7 +332,7 @@ public static DetectionResult DetectFromFile(string filePath)
throw new ArgumentNullException(nameof(filePath));
}
- using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
+ using (FileStream fs = OpenFile(filePath))
{
return DetectFromStream(fs);
}
@@ -281,13 +349,57 @@ public static DetectionResult DetectFromFile(FileInfo file)
throw new ArgumentNullException(nameof(file));
}
- using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
+ using (FileStream fs = OpenFile(file.FullName))
{
return DetectFromStream(fs);
}
}
-#endif // !NETSTANDARD1_0
+        /// <summary>
+        /// Asynchronously detect the character encoding of this file.
+        /// </summary>
+        /// <param name="filePath">Path to file</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task that completes with the detection result for the file.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
+        public static async Task DetectFromFileAsync(string filePath, CancellationToken cancellationToken = default)
+        {
+            if (filePath == null)
+            {
+                throw new ArgumentNullException(nameof(filePath));
+            }
+
+            using (FileStream fs = OpenFile(filePath))
+            {
+                // Await inside the using block so the stream stays open until detection has finished;
+                // ConfigureAwait(false) because library code does not need the caller's context.
+                return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
+            }
+        }
+        /// <summary>
+        /// Asynchronously detect the character encoding of this file.
+        /// </summary>
+        /// <param name="file">The file</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task that completes with the detection result for the file.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
+        public static async Task DetectFromFileAsync(FileInfo file, CancellationToken cancellationToken = default)
+        {
+            if (file == null)
+            {
+                throw new ArgumentNullException(nameof(file));
+            }
+
+            using (FileStream fs = OpenFile(file.FullName))
+            {
+                // The task must be awaited here: returning it un-awaited would leave the using block
+                // immediately and dispose the FileStream while the asynchronous read is still pending.
+                return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
+            }
+        }
+
+        // Opens an existing file for reading with FileShare.ReadWrite.
+        // Shared by the synchronous and asynchronous DetectFromFile overloads.
+        private static FileStream OpenFile(string filePath) =>
+            new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
protected virtual void Feed(byte[] buf, int offset, int len)
{
@@ -478,7 +590,7 @@ private DetectionResult DataEnd()
return new DetectionResult();
}
- internal IList GetNewProbers()
+ private IList GetNewProbers()
{
switch (InputState)
{
@@ -499,4 +611,4 @@ internal IList GetNewProbers()
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/DetectionDetail.cs b/src/DetectionDetail.cs
index e188519..aa6b795 100644
--- a/src/DetectionDetail.cs
+++ b/src/DetectionDetail.cs
@@ -101,7 +101,7 @@ internal static Encoding GetEncoding(string encodingShortName)
(exception is ArgumentException || // unsupported name
exception is NotSupportedException)
{
-#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
+#if NETSTANDARD || NET6_0
return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
return null;
diff --git a/src/UTF-unknown.csproj b/src/UTF-unknown.csproj
index afafb21..7bed414 100644
--- a/src/UTF-unknown.csproj
+++ b/src/UTF-unknown.csproj
@@ -1,7 +1,7 @@
- net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0
+ netstandard2.0;net6.0
@@ -16,9 +16,12 @@
Library
-
+
+
+
+
diff --git a/tests/CharsetDetectorTest.cs b/tests/CharsetDetectorTest.cs
index 2d2df34..9eee669 100644
--- a/tests/CharsetDetectorTest.cs
+++ b/tests/CharsetDetectorTest.cs
@@ -4,15 +4,12 @@
// Rudi Pettazzi
//
-#region
-
using System.IO;
using System.Text;
+using System.Threading.Tasks;
using UtfUnknown.Core;
using NUnit.Framework;
-#endregion
-
namespace UtfUnknown.Tests
{
public class CharsetDetectorTest
@@ -34,6 +31,23 @@ public void TestAscii()
}
}
+        // Async variant of TestAscii: plain ASCII text must be detected as ASCII
+        // with full confidence and no BOM.
+        [Test]
+        public async Task TestAsciiAsync()
+        {
+            const string text = "The Documentation of the libraries is not complete " +
+                                "and your contributions would be greatly appreciated " +
+                                "the documentation you want to contribute to and " +
+                                "click on the [Edit] link to start writing";
+
+            using (var stream = AsciiToStream(text))
+            {
+                var detection = await CharsetDetector.DetectFromStreamAsync(stream);
+                var detected = detection.Detected;
+
+                Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
+                Assert.AreEqual(1.0f, detected.Confidence);
+                Assert.IsFalse(detected.HasBOM);
+            }
+        }
+
[Test]
public void TestAscii_with_HZ_sequence()
{
@@ -47,6 +61,19 @@ public void TestAscii_with_HZ_sequence()
}
}
+        // Async variant of TestAscii_with_HZ_sequence: text containing "~{" must still be reported as ASCII.
+        [Test]
+        public async Task TestAscii_with_HZ_sequenceAsync()
+        {
+            const string text = "virtual ~{{NETCLASS_NAME}}();";
+
+            using (var stream = AsciiToStream(text))
+            {
+                var detection = await CharsetDetector.DetectFromStreamAsync(stream);
+                var detected = detection.Detected;
+
+                Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
+                Assert.AreEqual(1.0f, detected.Confidence);
+            }
+        }
+
private static MemoryStream AsciiToStream(string s)
{
return new MemoryStream(Encoding.ASCII.GetBytes(s));
@@ -74,6 +101,28 @@ public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int st
Assert.AreEqual(expectedPosition, stream.Position);
}
+        // Verifies how far the async stream overload advances the stream position
+        // for different byte limits and start offsets.
+        [Test]
+        [TestCase(1024, 1024)]
+        [TestCase(2048, 2048)]
+        [TestCase(20, 20)]
+        [TestCase(20, 30, 10)]
+        [TestCase(null, 10000)]
+        [TestCase(1000000, 10000)]
+        [TestCase(null, 10000, 10)]
+        public async Task DetectFromStreamMaxBytesAsync(int? maxBytes, int expectedPosition, int start = 0)
+        {
+            // Arrange
+            var text = new string('a', 10000);
+            using (var stream = AsciiToStream(text))
+            {
+                stream.Position = start;
+
+                // Act
+                await CharsetDetector.DetectFromStreamAsync(stream, maxBytes);
+
+                // Assert (inside the using block: Position cannot be read once the stream is disposed)
+                Assert.AreEqual(expectedPosition, stream.Position);
+            }
+        }
+
[Test]
[TestCase(0, 10, CodepageName.ASCII)]
[TestCase(0, 100, CodepageName.UTF8)]
diff --git a/tests/CharsetDetectorTestBatch.cs b/tests/CharsetDetectorTestBatch.cs
index 51ca6fd..b048d50 100644
--- a/tests/CharsetDetectorTestBatch.cs
+++ b/tests/CharsetDetectorTestBatch.cs
@@ -7,6 +7,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
+using System.Threading.Tasks;
using Newtonsoft.Json;
using NUnit.Framework;
@@ -73,6 +74,12 @@ public void TestFile(TestCase testCase)
TestFile(testCase.ExpectedEncoding, testCase.InputFile.FullName);
}
+        // Async counterpart of TestFile: runs the async file detection against every test file.
+        [TestCaseSource(nameof(AllTestFiles))]
+        public Task TestFileAsync(TestCase testCase) =>
+            TestFileAsync(testCase.ExpectedEncoding, testCase.InputFile.FullName);
+
[TestCaseSource(nameof(AllTestFilesUnsupportedEncoding))]
public void TestFileUnsupportedEncodings(TestCase testCase)
{
@@ -93,6 +100,26 @@ public void TestFileUnsupportedEncodings(TestCase testCase)
$"({detected.Confidence * 100.0f:0.00############}% confidence)."));
}
+        // Async counterpart of TestFileUnsupportedEncodings: only the detected encoding name is checked.
+        [TestCaseSource(nameof(AllTestFilesUnsupportedEncoding))]
+        public async Task TestFileUnsupportedEncodingsAsync(TestCase testCase)
+        {
+            var result = await CharsetDetector.DetectFromFileAsync(testCase.InputFile.FullName);
+            var detected = result.Detected;
+
+            var serialized = JsonConvert.SerializeObject(result, Formatting.Indented, new EncodingJsonConverter());
+            var logLine = string.Concat(
+                $"- {testCase.InputFile.FullName} ({testCase.ExpectedEncoding}) -> ",
+                $"{serialized}");
+            _logWriter.WriteLine(logLine);
+
+            var failureMessage = string.Concat(
+                $"Charset detection failed for {testCase.InputFile.FullName}. ",
+                $"Expected: {testCase.ExpectedEncoding}. ",
+                $"Detected: {detected.EncodingName} ",
+                $"({detected.Confidence * 100.0f:0.00############}% confidence).");
+
+            StringAssert.AreEqualIgnoringCase(testCase.ExpectedEncoding, detected.EncodingName, failureMessage);
+        }
+
public class TestCase
{
/// Initializes a new instance of the class.
@@ -163,5 +190,16 @@ private void TestFile(string expectedCharset, string file)
$"Charset detection failed for {file}. Expected: {expectedCharset}, detected: {detected.EncodingName} ({detected.Confidence * 100.0f:0.00############}% confidence)");
Assert.NotNull(detected.Encoding);
}
+
+        // Detects the charset of the file asynchronously, logs the serialized result, then asserts
+        // that the detected name matches (ignoring case) and that an Encoding instance is available.
+        private async Task TestFileAsync(string expectedCharset, string file)
+        {
+            var result = await CharsetDetector.DetectFromFileAsync(file);
+            var detected = result.Detected;
+
+            var logLine = $"- {file} ({expectedCharset}) -> {JsonConvert.SerializeObject(result, Formatting.Indented, new EncodingJsonConverter())}";
+            _logWriter.WriteLine(logLine);
+
+            var failureMessage = $"Charset detection failed for {file}. Expected: {expectedCharset}, detected: {detected.EncodingName} ({detected.Confidence * 100.0f:0.00############}% confidence)";
+            StringAssert.AreEqualIgnoringCase(expectedCharset, detected.EncodingName, failureMessage);
+            Assert.NotNull(detected.Encoding);
+        }
}
-}
+}
\ No newline at end of file
diff --git a/tests/DetectionDetailTests.cs b/tests/DetectionDetailTests.cs
index 9945dd2..766bd08 100644
--- a/tests/DetectionDetailTests.cs
+++ b/tests/DetectionDetailTests.cs
@@ -19,12 +19,15 @@ public void DetectionDetailGetEncodingIsNotNull(string codepageName)
private static readonly HashSet UnsupportedEncodings = new HashSet
{
+ #if NET6_0
+ CodepageName.UTF7, // Support dropped in .NET 6
+ #endif
CodepageName.ISO_8859_10,
CodepageName.ISO_8859_16,
CodepageName.EUC_TW,
CodepageName.VISCII,
CodepageName.X_ISO_10646_UCS_4_2143,
- CodepageName.X_ISO_10646_UCS_4_3412,
+ CodepageName.X_ISO_10646_UCS_4_3412
};
private static readonly IReadOnlyList EncodingNames = typeof(CodepageName)
@@ -46,4 +49,4 @@ public void GetEncodingShouldHandleIncorrectEncoding()
Assert.AreEqual(null, result);
}
}
-}
+}
\ No newline at end of file
diff --git a/tests/UTF-unknown.Tests.csproj b/tests/UTF-unknown.Tests.csproj
index 87ba6bf..25b6849 100644
--- a/tests/UTF-unknown.Tests.csproj
+++ b/tests/UTF-unknown.Tests.csproj
@@ -1,7 +1,7 @@
- net452;netcoreapp2.1;netcoreapp3.0
+ net462;netcoreapp3.1;net6.0
UtfUnknown.Tests
UtfUnknown.Tests
true
@@ -9,10 +9,10 @@
-
+
-
-
+
+