Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Async operations + updated frameworks #158

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ environment:
version: 2.0.{build}

clone_folder: c:\utfUnknown
image: Visual Studio 2019
image: Visual Studio 2022
configuration: Release
platform: Any CPU
nuget:
project_feed: true
init:
- git config --global core.autocrlf true
build_script:
- ps: dotnet build -c Release
- ps: dotnet build -c Release
test_script:
- ps: cd .\tests\
- ps: dotnet test
Expand Down
2 changes: 1 addition & 1 deletion example/ConsoleExample.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.0</TargetFramework>
<TargetFramework>net6.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
Expand Down
162 changes: 137 additions & 25 deletions src/CharsetDetector.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* ***** BEGIN LICENSE BLOCK *****
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
Expand Down Expand Up @@ -41,7 +41,8 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;

using System.Threading;
using System.Threading.Tasks;
using UtfUnknown.Core;
using UtfUnknown.Core.Probers;

Expand Down Expand Up @@ -118,7 +119,7 @@ private CharsetDetector()

/// <summary>
/// Detect the character encoding from this byte array.
/// It searchs for BOM from bytes[0].
/// It searches for BOM from bytes[0].
/// </summary>
/// <param name="bytes">The byte array containing the text</param>
/// <returns></returns>
Expand All @@ -136,7 +137,7 @@ public static DetectionResult DetectFromBytes(byte[] bytes)

/// <summary>
/// Detect the character encoding from this byte array.
/// It searchs for BOM from bytes[offset].
/// It searches for BOM from bytes[offset].
/// </summary>
/// <param name="bytes">The byte array containing the text</param>
/// <param name="offset">The zero-based byte offset in buffer at which to begin reading the data from</param>
Expand Down Expand Up @@ -166,8 +167,6 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
return detector.DataEnd();
}

#if !NETSTANDARD1_0

/// <summary>
/// Detect the character encoding by reading the stream.
///
Expand Down Expand Up @@ -210,37 +209,106 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
return detector.DataEnd();
}

/// <summary>
/// Asynchronously detects the character encoding by reading the stream,
/// with no limit on the number of bytes read.
///
/// Note: the stream position is not reset before or after detection.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task producing the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
public static Task<DetectionResult> DetectFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
{
    if (stream != null)
    {
        // A null byte limit means "no max" in the three-argument overload.
        return DetectFromStreamAsync(stream, null, cancellationToken);
    }

    throw new ArgumentNullException(nameof(stream));
}

/// <summary>
/// Asynchronously detects the character encoding by reading the stream.
///
/// Note: the stream position is not reset before or after detection.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="maxBytesToRead">Max bytes to read from <paramref name="stream"/>. If <c>null</c>, then no max.</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task producing the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxBytesToRead"/> is 0 or lower.</exception>
public static async Task<DetectionResult> DetectFromStreamAsync(Stream stream, long? maxBytesToRead, CancellationToken cancellationToken = default)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    // Lifted comparison: evaluates to false when maxBytesToRead is null, so "no max" passes through.
    if (maxBytesToRead <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxBytesToRead));
    }

    var detector = new CharsetDetector();

    // Library code: do not resume on the caller's synchronization context. This avoids
    // deadlocks when a caller blocks on the returned task from a UI/ASP.NET classic context.
    await ReadStreamAsync(stream, maxBytesToRead, detector, cancellationToken).ConfigureAwait(false);
    return detector.DataEnd();
}

private const int BufferSize = 1024;

private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector detector)
{
const int bufferSize = 1024;
byte[] buff = new byte[bufferSize];
byte[] buff = new byte[BufferSize];
int read;
long readTotal = 0;

var toRead = CalcToRead(maxBytes, readTotal, bufferSize);
var toRead = CalcToRead(maxBytes, readTotal, BufferSize);

while ((read = stream.Read(buff, 0, toRead)) > 0)
{
detector.Feed(buff, 0, read);

if (maxBytes != null)
if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead))
{
readTotal += read;
if (readTotal >= maxBytes)
{
return;
}

toRead = CalcToRead(maxBytes, readTotal, bufferSize);
return;
}
}
}

if (detector._done)
/// <summary>
/// Asynchronously reads <paramref name="stream"/> in chunks of up to <c>BufferSize</c> bytes
/// and feeds each chunk to <paramref name="detector"/> until the stream ends or
/// <c>FeedDetector</c> signals that reading should stop.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="maxBytes">Optional byte limit, forwarded to <c>CalcToRead</c> and <c>FeedDetector</c>.</param>
/// <param name="detector">The detector receiving the bytes.</param>
/// <param name="cancellationToken">The cancellation token passed to each read.</param>
private static async Task ReadStreamAsync(Stream stream, long? maxBytes, CharsetDetector detector, CancellationToken cancellationToken = default)
{
    byte[] buff = new byte[BufferSize];
    int read;
    long readTotal = 0;

    int toRead = CalcToRead(maxBytes, readTotal, BufferSize);

    // ConfigureAwait(false): library code has no need to resume on the caller's
    // synchronization context, and not capturing it avoids sync-over-async deadlocks.
    while ((read = await stream.ReadAsync(buff, 0, toRead, cancellationToken).ConfigureAwait(false)) > 0)
    {
        // FeedDetector updates readTotal/toRead and returns true when reading should stop.
        if (FeedDetector(detector, maxBytes, buff, read, ref readTotal, ref toRead))
        {
            return;
        }
    }
}

/// <summary>
/// Feeds the first <paramref name="read"/> bytes of <paramref name="buff"/> to the detector
/// and, when a byte limit is set, updates the running total and the size of the next read.
/// </summary>
/// <param name="detector">The detector receiving the bytes.</param>
/// <param name="maxBytes">Optional byte limit; <c>null</c> skips the limit bookkeeping.</param>
/// <param name="buff">Buffer holding the bytes just read.</param>
/// <param name="read">Number of valid bytes in <paramref name="buff"/>.</param>
/// <param name="readTotal">Running total of bytes read; only updated when a limit is set.</param>
/// <param name="toRead">Size of the next read; recomputed when a limit is set and not yet reached.</param>
/// <returns><c>true</c> when the caller should stop reading: the limit was reached or the detector is done.</returns>
private static bool FeedDetector(CharsetDetector detector, long? maxBytes, byte[] buff, int read, ref long readTotal, ref int toRead)
{
    detector.Feed(buff, 0, read);

    if (maxBytes.HasValue)
    {
        readTotal += read;

        // Limit reached: stop regardless of the detector state.
        if (readTotal >= maxBytes.Value)
        {
            return true;
        }

        toRead = CalcToRead(maxBytes, readTotal, BufferSize);
    }

    return detector._done;
}

private static int CalcToRead(long? maxBytes, long readTotal, int bufferSize)
{
if (readTotal + bufferSize > maxBytes)
Expand All @@ -264,7 +332,7 @@ public static DetectionResult DetectFromFile(string filePath)
throw new ArgumentNullException(nameof(filePath));
}

using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
using (FileStream fs = OpenFile(filePath))
{
return DetectFromStream(fs);
}
Expand All @@ -281,13 +349,57 @@ public static DetectionResult DetectFromFile(FileInfo file)
throw new ArgumentNullException(nameof(file));
}

using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
using (FileStream fs = OpenFile(file.FullName))
{
return DetectFromStream(fs);
}
}

#endif // !NETSTANDARD1_0
/// <summary>
/// Asynchronously detects the character encoding of this file.
/// </summary>
/// <param name="filePath">Path to file</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task producing the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is <c>null</c>.</exception>
public static async Task<DetectionResult> DetectFromFileAsync(string filePath, CancellationToken cancellationToken = default)
{
    if (filePath == null)
    {
        throw new ArgumentNullException(nameof(filePath));
    }

    using (FileStream fs = OpenFile(filePath))
    {
        // Await inside the using block so the stream stays open until detection completes.
        // ConfigureAwait(false): library code does not need the caller's synchronization context.
        return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Asynchronously detects the character encoding of this file.
/// </summary>
/// <param name="file">The file</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task producing the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is <c>null</c>.</exception>
public static async Task<DetectionResult> DetectFromFileAsync(FileInfo file, CancellationToken cancellationToken = default)
{
    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    using (FileStream fs = OpenFile(file.FullName))
    {
        // The task must be awaited inside the using block: returning it un-awaited would
        // dispose the FileStream while the asynchronous read is still in progress.
        return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
    }
}

/// <summary>
/// Opens an existing file for reading while allowing other handles to read and write it.
/// </summary>
/// <param name="filePath">Path of the file to open.</param>
/// <returns>The opened stream; the caller is responsible for disposing it.</returns>
private static FileStream OpenFile(string filePath) =>
    new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);

protected virtual void Feed(byte[] buf, int offset, int len)
{
Expand Down Expand Up @@ -478,7 +590,7 @@ private DetectionResult DataEnd()
return new DetectionResult();
}

internal IList<CharsetProber> GetNewProbers()
private IList<CharsetProber> GetNewProbers()
{
switch (InputState)
{
Expand All @@ -499,4 +611,4 @@ internal IList<CharsetProber> GetNewProbers()
}
}
}
}
}
2 changes: 1 addition & 1 deletion src/DetectionDetail.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ internal static Encoding GetEncoding(string encodingShortName)
(exception is ArgumentException || // unsupported name
exception is NotSupportedException)
{
#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
#if NETSTANDARD || NET6_0
return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
return null;
Expand Down
7 changes: 5 additions & 2 deletions src/UTF-unknown.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net6.0</TargetFrameworks>
</PropertyGroup>

<PropertyGroup>
Expand All @@ -16,9 +16,12 @@
<OutputType>Library</OutputType>
</PropertyGroup>

<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard1.3' Or '$(TargetFramework)' == 'netstandard2.0' ">
<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
<PackageReference Include="System.Text.Encoding.CodePages" Version="4.7.0" />
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net6.0' ">
<PackageReference Include="System.Text.Encoding.CodePages" Version="6.0.0" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All" />
</ItemGroup>
Expand Down
57 changes: 53 additions & 4 deletions tests/CharsetDetectorTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
// Rudi Pettazzi <[email protected]>
//

#region

using System.IO;
using System.Text;
using System.Threading.Tasks;
using UtfUnknown.Core;
using NUnit.Framework;

#endregion

namespace UtfUnknown.Tests
{
public class CharsetDetectorTest
Expand All @@ -34,6 +31,23 @@ public void TestAscii()
}
}

// Verifies that plain ASCII text read through the async stream API is detected
// as ASCII with full confidence and without a BOM.
[Test]
public async Task TestAsciiAsync()
{
    const string text = "The Documentation of the libraries is not complete " +
                        "and your contributions would be greatly appreciated " +
                        "the documentation you want to contribute to and " +
                        "click on the [Edit] link to start writing";

    using (var stream = AsciiToStream(text))
    {
        var result = await CharsetDetector.DetectFromStreamAsync(stream);

        Assert.AreEqual(CodepageName.ASCII, result.Detected.EncodingName);
        Assert.AreEqual(1.0f, result.Detected.Confidence);
        Assert.IsFalse(result.Detected.HasBOM);
    }
}

[Test]
public void TestAscii_with_HZ_sequence()
{
Expand All @@ -47,6 +61,19 @@ public void TestAscii_with_HZ_sequence()
}
}

// Text containing the "~{" sequence must still be reported as ASCII
// with full confidence when read through the async stream API.
[Test]
public async Task TestAscii_with_HZ_sequenceAsync()
{
    const string text = "virtual ~{{NETCLASS_NAME}}();";

    using (var input = AsciiToStream(text))
    {
        var detection = await CharsetDetector.DetectFromStreamAsync(input);
        var detected = detection.Detected;

        Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
        Assert.AreEqual(1.0f, detected.Confidence);
    }
}

private static MemoryStream AsciiToStream(string s)
{
return new MemoryStream(Encoding.ASCII.GetBytes(s));
Expand Down Expand Up @@ -74,6 +101,28 @@ public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int st
Assert.AreEqual(expectedPosition, stream.Position);
}

// Checks where the stream position ends up after async detection for various
// byte limits and start offsets over a 10000-byte ASCII payload.
[Test]
[TestCase(1024, 1024)]
[TestCase(2048, 2048)]
[TestCase(20, 20)]
[TestCase(20, 30, 10)]
[TestCase(null, 10000)]
[TestCase(1000000, 10000)]
[TestCase(null, 10000, 10)]
public async Task DetectFromStreamMaxBytesAsync(int? maxBytes, int expectedPosition, int start = 0)
{
    // Arrange
    var payload = AsciiToStream(new string('a', 10000));
    payload.Position = start;

    // Act
    await CharsetDetector.DetectFromStreamAsync(payload, maxBytes);

    // Assert
    var finalPosition = payload.Position;
    Assert.AreEqual(expectedPosition, finalPosition);
}

[Test]
[TestCase(0, 10, CodepageName.ASCII)]
[TestCase(0, 100, CodepageName.UTF8)]
Expand Down
Loading