From 60eda6ef6c081122e3e6ece760e4699137b78e2c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Mon, 6 Jan 2025 10:05:27 -0500 Subject: [PATCH] Resolves #569 --- .../Services/PreprocessBuildJobTests.cs | 262 ++---------------- .../Services/CorpusService.cs | 8 +- .../Services/ICorpusService.cs | 4 +- .../ParallelCorpusPreprocessingService.cs | 27 +- 4 files changed, 41 insertions(+), 260 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index f318524f..26ebb18f 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -121,7 +121,27 @@ public async Task RunAsync_EnableKeyTerms() Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); Assert.That(trgCount, Is.EqualTo(1)); - Assert.That(termCount, Is.EqualTo(144)); + Assert.That(termCount, Is.EqualTo(3416)); + }); + } + + [Test] + public async Task RunAsync_EnableKeyTermsNoTrainingData() + { + using TestEnvironment env = new(); + ParallelCorpus corpus1 = env.DefaultParatextCorpus; + corpus1.SourceCorpora[0].TrainOnTextIds = new HashSet(); + corpus1.TargetCorpora[0].TrainOnTextIds = new HashSet(); + + await env.RunBuildJobAsync(corpus1, useKeyTerms: true); + + (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); + Assert.Multiple(() => + { + Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src2Count, Is.EqualTo(0)); + Assert.That(trgCount, Is.EqualTo(0)); + Assert.That(termCount, Is.EqualTo(0)); }); } @@ -508,246 +528,6 @@ public async Task ParallelCorpusAsync() }); } - [Test] - public async Task ParallelCorpusAsync_UseKeyTerms() - { - using TestEnvironment env = new(); - var corpora = new List() - { - new ParallelCorpus() - { - Id = "1", - SourceCorpora = new List() - { - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-source1") }, - TrainOnChapters = new() - { - { - "MAT", - new() { 1 } - }, - { - "LEV", - new() { } - } - }, - PretranslateChapters = new() - { - { - "1CH", - new() { } - } - } - }, - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-source2") }, - TrainOnChapters = new() - { - { - "MAT", - new() { 1 } - }, - { - "MRK", - new() { } - } - }, - PretranslateChapters = new() { } - }, - }, - TargetCorpora = new List() - { - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-target1") }, - TrainOnChapters = new() - { - { - "MAT", - new() { 1 } - }, - { - "MRK", - new() { } - } - } - }, - new() - { - Id = "_2", - Language = "en", - Files = new List { env.ParatextFile("pt-target2") }, - TrainOnChapters = new() - { - { - "MAT", - new() { 1 } - }, - { - "MRK", - new() { } - }, - { - "LEV", - new() { } - } - } - } - } - } - }; - await env.RunBuildJobAsync(corpora, useKeyTerms: true); - string source = await env.GetSourceExtractAsync(); - string target = await env.GetTargetExtractAsync(); - Assert.Multiple(() => - { - StringAssert.StartsWith( - @"Source one, chapter fourteen, verse fifty-five. Segment b. -Source one, chapter fourteen, verse fifty-six. -Source two, chapter one, verse one. -Source two, chapter one, verse two. -Source two, chapter one, verse three. -Source one, chapter one, verse four. -Source two, chapter one, verse five. Source two, chapter one, verse six. -Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. -Source two, chapter one, verse one. -", - source - ); - StringAssert.StartsWith( - @"Target two, chapter fourteen, verse fifty-five. -Target two, chapter fourteen, verse fifty-six. -Target one, chapter one, verse one. -Target one, chapter one, verse two. -Target one, chapter one, verse three. - -Target one, chapter one, verse five and six. -Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. - -", - target - ); - StringAssert.Contains("Abraham", source); - StringAssert.Contains("Abraham", target); - StringAssert.DoesNotContain("Zedekiah", source); - StringAssert.DoesNotContain("Zedekiah", target); - }); - JsonArray? pretranslations = await env.GetPretranslationsAsync(); - Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); - Assert.That( - pretranslations[2]!["translation"]!.ToString(), - Is.EqualTo("Source one, chapter twelve, verse one.") - ); - } - - [Test] - public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() - { - using TestEnvironment env = new(); - var corpora = new List() - { - new ParallelCorpus() - { - Id = "1", - SourceCorpora = new List() - { - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-source1") }, - TrainOnTextIds = ["MAT", "LEV"], - PretranslateTextIds = ["1CH"] - }, - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-source2") }, - TrainOnTextIds = ["MAT", "MRK"], - PretranslateTextIds = [] - }, - }, - TargetCorpora = new List() - { - new() - { - Id = "_1", - Language = "en", - Files = new List { env.ParatextFile("pt-target1") }, - TrainOnTextIds = ["MAT", "MRK"] - }, - new() - { - Id = "_2", - Language = "en", - Files = new List { env.ParatextFile("pt-target2") }, - TrainOnTextIds = ["MAT", "MRK", "LEV"] - } - } - } - }; - await env.RunBuildJobAsync(corpora, useKeyTerms: true); - string source = await env.GetSourceExtractAsync(); - string target = await env.GetTargetExtractAsync(); - Assert.Multiple(() => - { - StringAssert.StartsWith( - @"Source one, chapter fourteen, verse fifty-five. Segment b. -Source one, chapter fourteen, verse fifty-six. -Source two, chapter one, verse one. -Source two, chapter one, verse two. -Source two, chapter one, verse three. -Source one, chapter one, verse four. -Source two, chapter one, verse five. Source two, chapter one, verse six. -Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. -Source one, chapter two, verse one. -Source one, chapter two, verse two. - -Source two, chapter one, verse one. -", - source - ); - StringAssert.StartsWith( - @"Target two, chapter fourteen, verse fifty-five. -Target two, chapter fourteen, verse fifty-six. -Target one, chapter one, verse one. -Target one, chapter one, verse two. -Target one, chapter one, verse three. - -Target one, chapter one, verse five and six. -Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. -Target one, chapter two, verse one. - -Target one, chapter two, verse three. - -", - target - ); - StringAssert.Contains("Abraham", source); - StringAssert.Contains("Abraham", target); - StringAssert.DoesNotContain("Zedekiah", source); - StringAssert.DoesNotContain("Zedekiah", target); - }); - JsonArray? pretranslations = await env.GetPretranslationsAsync(); - Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); - Assert.That( - pretranslations[2]!["translation"]!.ToString(), - Is.EqualTo("Source one, chapter twelve, verse one.") - ); - } - private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 793e5046..2aff4a1e 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,16 +36,14 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora - ) + public IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles) { - foreach ((CorpusFile file, Dictionary> chapters) in corpora) + foreach (CorpusFile file in corpusFiles) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index 3f19fccc..6520d23a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,7 +3,5 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora - ); + IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index a5fb70ac..d8546c06 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -32,6 +32,8 @@ public async Task PreprocessAsync( bool useKeyTerms = false ) { + bool parallelTrainingDataPresent = false; + List keyTermTrainingData = new(); foreach (ParallelCorpus corpus in corpora) { (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus @@ -78,23 +80,19 @@ public async Task PreprocessAsync( foreach (Row row in CollapseRanges(trainingRows)) { await train(row); + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + { + parallelTrainingDataPresent = true; + } } if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora( - sourceCorpora - .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) - .ToArray() - ) + .CreateTermCorpora(sourceCorpora.SelectMany(corpus => corpus.Corpus.Files).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora( - targetCorpora - .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) - .ToArray() - ) + .CreateTermCorpora(targetCorpora.SelectMany(corpus => corpus.Corpus.Files).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -107,7 +105,7 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => ) ) { - await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); + keyTermTrainingData.Add(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); } } } @@ -123,6 +121,13 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => await pretranslate(row, corpus); } } + if (useKeyTerms && parallelTrainingDataPresent) + { + foreach (Row row in keyTermTrainingData) + { + await train(row); + } + } } private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile(