From 4a9b8229390e09297495ee740d6f38f8428a7c72 Mon Sep 17 00:00:00 2001
From: Ares Lazarus
Date: Mon, 22 Jul 2024 11:29:17 +0300
Subject: [PATCH] added some unit tests for ITextTokenizer.GetTokens implementation

---
Reviewer notes (this section is ignored by git am):
- restored the generic argument of Assert.Throws<ArgumentNullException>
- the abstract base class constructor is now protected
- the theory parameter is declared as non-nullable string
- added XML documentation; hunk sizes and the diffstat were updated to match
- blob ids on the "index" lines predate these edits

 .../KernelMemory/ITextTokenizerTests.cs      | 100 ++++++++++++++++++
 .../LLamaSharpTextEmbeddingGeneratorTests.cs |  35 ++++++
 .../LlamaSharpTextGeneratorTests.cs          |  39 +++++++
 3 files changed, 174 insertions(+)
 create mode 100644 LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
 create mode 100644 LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
 create mode 100644 LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs

diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
new file mode 100644
index 000000000..08d8b8132
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -0,0 +1,100 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+    /// <summary>
+    /// Shared test cases for <c>ITextTokenizer</c> implementations. Derived test classes
+    /// construct a concrete tokenizer and assign it to <c>_generator</c>.
+    /// </summary>
+    public abstract class ITextTokenizerTests
+    {
+        private readonly ITestOutputHelper _testOutputHelper;
+
+#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+        protected ITextTokenizer? _generator;
+#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+
+        protected InferenceParams _infParams;
+        protected LLamaSharpConfig _lsConfig;
+
+        /// <summary>
+        /// Builds the inference parameters and model configuration shared by derived
+        /// test classes, and logs the file name of the model in use.
+        /// </summary>
+        /// <param name="testOutputHelper">xUnit sink used for diagnostic output.</param>
+        protected ITextTokenizerTests(ITestOutputHelper testOutputHelper)
+        {
+            _testOutputHelper = testOutputHelper;
+
+            _infParams = new() { AntiPrompts = ["\n\n"] };
+            _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
+
+            testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
+        }
+
+        /// <summary>
+        /// Joining the tokens must reproduce the input with a single leading space, and
+        /// <c>CountTokens</c> must agree with the number of tokens returned.
+        /// </summary>
+        [Theory]
+        [InlineData("The quick brown fox jumps over the lazy dog")]
+        [InlineData("Well, here're some special characters!!!")]
+        [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+        [InlineData(" \n \r\n \t ")]
+        public void GetTokens_ShouldReturnListOfTokensForInputString(string text)
+        {
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model
+            var actual = string.Join("", tokens);
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+
+        /// <summary>
+        /// A null input must be rejected with <see cref="ArgumentNullException"/>.
+        /// </summary>
+        [Fact]
+        public void GetToken_ShouldThrowForNull()
+        {
+            string? text = null;
+
+            Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
+        }
+
+        /// <summary>
+        /// An empty input must yield exactly one token, and that token must be empty.
+        /// </summary>
+        [Fact]
+        public void GetToken_EmptyStringYieldsOneEmptyToken()
+        {
+            var text = "";
+            var expected = "";
+
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+            var actual = tokens.Single();
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
new file mode 100644
index 000000000..91161b72c
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -0,0 +1,35 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+    /// <summary>
+    /// Runs the shared tokenizer tests against <see cref="LLamaSharpTextEmbeddingGenerator"/>.
+    /// </summary>
+    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    {
+        private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
+
+        /// <summary>Creates the embedding generator from the shared configuration and exposes it as the tokenizer under test.</summary>
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+            _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
+
+            _generator = _embeddingGenerator;
+        }
+
+        /// <summary>Disposes the embedding generator owned by this test instance.</summary>
+        public void Dispose()
+        {
+            _embeddingGenerator.Dispose();
+        }
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
new file mode 100644
index 000000000..02001f8cf
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -0,0 +1,39 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Reflection.Emit;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+using Xunit.Sdk;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace LLama.Unittest.KernelMemory
+{
+    /// <summary>
+    /// Runs the shared tokenizer tests against <see cref="LlamaSharpTextGenerator"/>.
+    /// </summary>
+    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    {
+        private readonly LlamaSharpTextGenerator _textGenerator;
+
+        /// <summary>Creates the text generator from the shared configuration and exposes it as the tokenizer under test.</summary>
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+            _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
+
+            _generator = _textGenerator;
+        }
+
+        /// <summary>Disposes the text generator owned by this test instance.</summary>
+        public void Dispose()
+        {
+            _textGenerator.Dispose();
+        }
+    }
+}