diff --git a/appveyor.yml b/appveyor.yml index 9faed95..eec9afb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -4,7 +4,7 @@ environment: version: 2.0.{build} clone_folder: c:\utfUnknown -image: Visual Studio 2019 +image: Visual Studio 2022 configuration: Release platform: Any CPU nuget: @@ -12,7 +12,7 @@ nuget: init: - git config --global core.autocrlf true build_script: -- ps: dotnet build -c Release +- ps: dotnet build -c Release test_script: - ps: cd .\tests\ - ps: dotnet test diff --git a/example/ConsoleExample.csproj b/example/ConsoleExample.csproj index d05a31d..abb50ac 100644 --- a/example/ConsoleExample.csproj +++ b/example/ConsoleExample.csproj @@ -2,7 +2,7 @@ Exe - netcoreapp3.0 + net6.0 diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs index 47ddc5e..8f791b9 100644 --- a/src/CharsetDetector.cs +++ b/src/CharsetDetector.cs @@ -1,4 +1,4 @@ -/* ***** BEGIN LICENSE BLOCK ***** +/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version @@ -41,7 +41,8 @@ using System.Collections.Generic; using System.IO; using System.Linq; - +using System.Threading; +using System.Threading.Tasks; using UtfUnknown.Core; using UtfUnknown.Core.Probers; @@ -118,7 +119,7 @@ private CharsetDetector() /// /// Detect the character encoding form this byte array. - /// It searchs for BOM from bytes[0]. + /// It searches for BOM from bytes[0]. /// /// The byte array containing the text /// @@ -136,7 +137,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes) /// /// Detect the character encoding form this byte array. - /// It searchs for BOM from bytes[offset]. + /// It searches for BOM from bytes[offset]. 
/// /// The byte array containing the text /// The zero-based byte offset in buffer at which to begin reading the data from @@ -166,8 +167,6 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len) return detector.DataEnd(); } -#if !NETSTANDARD1_0 - /// /// Detect the character encoding by reading the stream. /// @@ -210,37 +209,106 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe return detector.DataEnd(); } + /// + /// Detect the character encoding by reading the stream. + /// + /// Note: stream position is not reset before and after. + /// + /// The stream. + /// The cancellation token for this operation. + public static Task DetectFromStreamAsync(Stream stream, CancellationToken cancellationToken = default) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + + return DetectFromStreamAsync(stream, null, cancellationToken); + } + + /// + /// Detect the character encoding by reading the stream. + /// + /// Note: stream position is not reset before and after. + /// + /// The stream. + /// max bytes to read from . If null, then no max + /// The cancellation token for this operation. + /// 0 or lower. + public static async Task DetectFromStreamAsync(Stream stream, long? maxBytesToRead, CancellationToken cancellationToken = default) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + + if (maxBytesToRead <= 0) + { + throw new ArgumentOutOfRangeException(nameof(maxBytesToRead)); + } + + var detector = new CharsetDetector(); + + await ReadStreamAsync(stream, maxBytesToRead, detector, cancellationToken).ConfigureAwait(false); + return detector.DataEnd(); + } + + private const int BufferSize = 1024; + private static void ReadStream(Stream stream, long? 
maxBytes, CharsetDetector detector) { - const int bufferSize = 1024; - byte[] buff = new byte[bufferSize]; + byte[] buff = new byte[BufferSize]; int read; long readTotal = 0; - var toRead = CalcToRead(maxBytes, readTotal, bufferSize); + var toRead = CalcToRead(maxBytes, readTotal, BufferSize); while ((read = stream.Read(buff, 0, toRead)) > 0) { - detector.Feed(buff, 0, read); - - if (maxBytes != null) + if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead)) { - readTotal += read; - if (readTotal >= maxBytes) - { - return; - } - - toRead = CalcToRead(maxBytes, readTotal, bufferSize); + return; } + } + } - if (detector._done) + private static async Task ReadStreamAsync(Stream stream, long? maxBytes, CharsetDetector detector, CancellationToken cancellationToken = default) + { + byte[] buff = new byte[BufferSize]; + int read; + long readTotal = 0; + + var toRead = CalcToRead(maxBytes, readTotal, BufferSize); + + while ((read = await stream.ReadAsync(buff, 0, toRead, cancellationToken).ConfigureAwait(false)) > 0) + { + if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead)) { return; } } } + private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead) + { + detector.Feed(buff, 0, read); + + if (maxBytes == null) + { + return detector._done; + } + + readTotal += read; + if (readTotal >= maxBytes) + { + return true; + } + + toRead = CalcToRead(maxBytes, readTotal, BufferSize); + + return detector._done; + } + private static int CalcToRead(long? 
maxBytes, long readTotal, int bufferSize) { if (readTotal + bufferSize > maxBytes) @@ -264,7 +332,7 @@ public static DetectionResult DetectFromFile(string filePath) throw new ArgumentNullException(nameof(filePath)); } - using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) + using (FileStream fs = OpenFile(filePath)) { return DetectFromStream(fs); } @@ -281,13 +349,57 @@ public static DetectionResult DetectFromFile(FileInfo file) throw new ArgumentNullException(nameof(file)); } - using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) + using (FileStream fs = OpenFile(file.FullName)) { return DetectFromStream(fs); } } -#endif // !NETSTANDARD1_0 + /// + /// Detect the character encoding of this file. + /// + /// Path to file + /// The cancellation token for this operation. + /// + public static async Task DetectFromFileAsync(string filePath, CancellationToken cancellationToken = default) + { + if (filePath == null) + { + throw new ArgumentNullException(nameof(filePath)); + } + + using (FileStream fs = OpenFile(filePath)) + { + return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false); + } + } + /// + /// Detect the character encoding of this file. + /// + /// The file + /// The cancellation token for this operation. 
+ /// + public static async Task DetectFromFileAsync(FileInfo file, CancellationToken cancellationToken = default) + { + if (file == null) + { + throw new ArgumentNullException(nameof(file)); + } + + using (FileStream fs = OpenFile(file.FullName)) + { + return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false); + } + } + + private static FileStream OpenFile(string filePath) + { + return new FileStream( + filePath, + FileMode.Open, + FileAccess.Read, + FileShare.ReadWrite); + } protected virtual void Feed(byte[] buf, int offset, int len) { @@ -478,7 +590,7 @@ private DetectionResult DataEnd() return new DetectionResult(); } - internal IList GetNewProbers() + private IList GetNewProbers() { switch (InputState) { @@ -499,4 +611,4 @@ internal IList GetNewProbers() } } } -} \ No newline at end of file +} diff --git a/src/DetectionDetail.cs b/src/DetectionDetail.cs index e188519..aa6b795 100644 --- a/src/DetectionDetail.cs +++ b/src/DetectionDetail.cs @@ -101,7 +101,7 @@ internal static Encoding GetEncoding(string encodingShortName) (exception is ArgumentException || // unsupported name exception is NotSupportedException) { -#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0 +#if NETSTANDARD || NET6_0 return CodePagesEncodingProvider.Instance.GetEncoding(encodingName); #else return null; diff --git a/src/UTF-unknown.csproj b/src/UTF-unknown.csproj index afafb21..7bed414 100644 --- a/src/UTF-unknown.csproj +++ b/src/UTF-unknown.csproj @@ -1,7 +1,7 @@ ﻿ - net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0 + netstandard2.0;net6.0 @@ -16,9 +16,12 @@ Library - + + + + diff --git a/tests/CharsetDetectorTest.cs b/tests/CharsetDetectorTest.cs index 2d2df34..9eee669 100644 --- a/tests/CharsetDetectorTest.cs +++ b/tests/CharsetDetectorTest.cs @@ -4,15 +4,12 @@ // Rudi Pettazzi // -#region - using System.IO; using System.Text; +using System.Threading.Tasks; using UtfUnknown.Core; using NUnit.Framework; -#endregion - namespace UtfUnknown.Tests { public class CharsetDetectorTest @@ 
-34,6 +31,23 @@ public void TestAscii() } } + [Test] + public async Task TestAsciiAsync() + { + const string text = "The Documentation of the libraries is not complete " + + "and your contributions would be greatly appreciated " + + "the documentation you want to contribute to and " + + "click on the [Edit] link to start writing"; + var stream = AsciiToStream(text); + using (stream) + { + var result = await CharsetDetector.DetectFromStreamAsync(stream); + Assert.AreEqual(CodepageName.ASCII, result.Detected.EncodingName); + Assert.AreEqual(1.0f, result.Detected.Confidence); + Assert.IsFalse(result.Detected.HasBOM); + } + } + [Test] public void TestAscii_with_HZ_sequence() { @@ -47,6 +61,19 @@ public void TestAscii_with_HZ_sequence() } } + [Test] + public async Task TestAscii_with_HZ_sequenceAsync() + { + const string text = "virtual ~{{NETCLASS_NAME}}();"; + var stream = AsciiToStream(text); + using (stream) + { + var result = await CharsetDetector.DetectFromStreamAsync(stream); + Assert.AreEqual(CodepageName.ASCII, result.Detected.EncodingName); + Assert.AreEqual(1.0f, result.Detected.Confidence); + } + } + private static MemoryStream AsciiToStream(string s) { return new MemoryStream(Encoding.ASCII.GetBytes(s)); @@ -74,6 +101,28 @@ public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int st Assert.AreEqual(expectedPosition, stream.Position); } + [Test] + [TestCase(1024, 1024)] + [TestCase(2048, 2048)] + [TestCase(20, 20)] + [TestCase(20, 30, 10)] + [TestCase(null, 10000)] + [TestCase(1000000, 10000)] + [TestCase(null, 10000, 10)] + public async Task DetectFromStreamMaxBytesAsync(int? 
maxBytes, int expectedPosition, int start = 0) + { + // Arrange + var text = new string('a', 10000); + var stream = AsciiToStream(text); + stream.Position = start; + + // Act + await CharsetDetector.DetectFromStreamAsync(stream, maxBytes); + + // Assert + Assert.AreEqual(expectedPosition, stream.Position); + } + [Test] [TestCase(0, 10, CodepageName.ASCII)] [TestCase(0, 100, CodepageName.UTF8)] diff --git a/tests/CharsetDetectorTestBatch.cs b/tests/CharsetDetectorTestBatch.cs index 51ca6fd..b048d50 100644 --- a/tests/CharsetDetectorTestBatch.cs +++ b/tests/CharsetDetectorTestBatch.cs @@ -7,6 +7,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Threading.Tasks; using Newtonsoft.Json; using NUnit.Framework; @@ -73,6 +74,12 @@ public void TestFile(TestCase testCase) TestFile(testCase.ExpectedEncoding, testCase.InputFile.FullName); } + [TestCaseSource(nameof(AllTestFiles))] + public Task TestFileAsync(TestCase testCase) + { + return TestFileAsync(testCase.ExpectedEncoding, testCase.InputFile.FullName); + } + [TestCaseSource(nameof(AllTestFilesUnsupportedEncoding))] public void TestFileUnsupportedEncodings(TestCase testCase) { @@ -93,6 +100,26 @@ public void TestFileUnsupportedEncodings(TestCase testCase) $"({detected.Confidence * 100.0f:0.00############}% confidence).")); } + [TestCaseSource(nameof(AllTestFilesUnsupportedEncoding))] + public async Task TestFileUnsupportedEncodingsAsync(TestCase testCase) + { + var result = await CharsetDetector.DetectFromFileAsync(testCase.InputFile.FullName); + var detected = result.Detected; + + _logWriter.WriteLine(string.Concat( + $"- {testCase.InputFile.FullName} ({testCase.ExpectedEncoding}) -> ", + $"{JsonConvert.SerializeObject(result, Formatting.Indented, new EncodingJsonConverter())}")); + + StringAssert.AreEqualIgnoringCase( + testCase.ExpectedEncoding, + detected.EncodingName, + string.Concat( + $"Charset detection failed for {testCase.InputFile.FullName}. 
", + $"Expected: {testCase.ExpectedEncoding}. ", + $"Detected: {detected.EncodingName} ", + $"({detected.Confidence * 100.0f:0.00############}% confidence).")); + } + public class TestCase { /// Initializes a new instance of the class. @@ -163,5 +190,16 @@ private void TestFile(string expectedCharset, string file) $"Charset detection failed for {file}. Expected: {expectedCharset}, detected: {detected.EncodingName} ({detected.Confidence * 100.0f:0.00############}% confidence)"); Assert.NotNull(detected.Encoding); } + + private async Task TestFileAsync(string expectedCharset, string file) + { + var result = await CharsetDetector.DetectFromFileAsync(file); + var detected = result.Detected; + + _logWriter.WriteLine($"- {file} ({expectedCharset}) -> {JsonConvert.SerializeObject(result, Formatting.Indented, new EncodingJsonConverter())}"); + StringAssert.AreEqualIgnoringCase(expectedCharset, detected.EncodingName, + $"Charset detection failed for {file}. Expected: {expectedCharset}, detected: {detected.EncodingName} ({detected.Confidence * 100.0f:0.00############}% confidence)"); + Assert.NotNull(detected.Encoding); + } } -} +} \ No newline at end of file diff --git a/tests/DetectionDetailTests.cs b/tests/DetectionDetailTests.cs index 9945dd2..766bd08 100644 --- a/tests/DetectionDetailTests.cs +++ b/tests/DetectionDetailTests.cs @@ -19,12 +19,15 @@ public void DetectionDetailGetEncodingIsNotNull(string codepageName) private static readonly HashSet UnsupportedEncodings = new HashSet { + #if NET6_0 + CodepageName.UTF7, // Support dropped in .NET 6 + #endif CodepageName.ISO_8859_10, CodepageName.ISO_8859_16, CodepageName.EUC_TW, CodepageName.VISCII, CodepageName.X_ISO_10646_UCS_4_2143, - CodepageName.X_ISO_10646_UCS_4_3412, + CodepageName.X_ISO_10646_UCS_4_3412 }; private static readonly IReadOnlyList EncodingNames = typeof(CodepageName) @@ -46,4 +49,4 @@ public void GetEncodingShouldHandleIncorrectEncoding() Assert.AreEqual(null, result); } } -} +} \ No newline at end of 
file diff --git a/tests/UTF-unknown.Tests.csproj b/tests/UTF-unknown.Tests.csproj index 87ba6bf..25b6849 100644 --- a/tests/UTF-unknown.Tests.csproj +++ b/tests/UTF-unknown.Tests.csproj @@ -1,7 +1,7 @@  - net452;netcoreapp2.1;netcoreapp3.0 + net462;netcoreapp3.1;net6.0 UtfUnknown.Tests UtfUnknown.Tests true @@ -9,10 +9,10 @@ - + - - + +