Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing tokenizers to correctly handle linux line endings (\n) #25

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 1 addition & 12 deletions src/Base/CasedTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,7 @@
using BERTTokenizers.Extensions;
using System;
using System.Collections.Generic;
using System.Linq;

namespace BERTTokenizers.Base
namespace BERTTokenizers.Base
{
/// <summary>
/// Base class for cased BERT tokenizers: splits sentences into word and punctuation
/// tokens while preserving the original character casing.
/// </summary>
public abstract class CasedTokenizer : TokenizerBase
{
    /// <param name="vocabularyFilePath">Path to the vocabulary file loaded by the base tokenizer.</param>
    protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { }

    /// <summary>
    /// Splits <paramref name="text"/> on whitespace separators and line endings, then
    /// splits each chunk on punctuation, keeping the punctuation characters as tokens.
    /// </summary>
    /// <param name="text">The sentence to tokenize.</param>
    /// <returns>The sequence of surface tokens (casing preserved).</returns>
    protected override IEnumerable<string> TokenizeSentence(string text)
    {
        // Fix: include "\n" so Unix line endings act as separators, not only Windows "\r\n".
        return text.Split(new string[] { " ", " ", "\r\n", "\n" }, StringSplitOptions.None)
                   .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()));
    }
}
}
11 changes: 8 additions & 3 deletions src/Base/TokenizerBase.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using BERTTokenizers.Helpers;
using BERTTokenizers.Extensions;
using BERTTokenizers.Helpers;
using System;
using System.Collections.Generic;
using System.Linq;
Expand Down Expand Up @@ -87,6 +88,12 @@ public List<string> Untokenize(List<string> tokens)
=> (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList();
}

/// <summary>
/// Default tokenization: splits <paramref name="text"/> on any whitespace
/// (spaces, tabs, "\r\n" and "\n" alike), then splits each chunk on punctuation,
/// keeping the punctuation characters as separate tokens.
/// </summary>
/// <param name="text">The sentence to tokenize.</param>
/// <returns>The sequence of surface tokens.</returns>
protected virtual IEnumerable<string> TokenizeSentence(string text)
{
    // An empty separator array makes Split break on all Unicode whitespace, which
    // covers both Windows ("\r\n") and Unix ("\n") line endings.
    // Array.Empty<char>() replaces new char[0] to avoid a per-call allocation.
    return text.Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries)
               .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()));
}

private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)
{
var segmentIndex = 0;
Expand Down Expand Up @@ -152,7 +159,5 @@ private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)

return tokens;
}

protected abstract IEnumerable<string> TokenizeSentence(string text);
}
}
8 changes: 2 additions & 6 deletions src/Base/UncasedTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using BERTTokenizers.Extensions;
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;

namespace BERTTokenizers.Base
Expand All @@ -13,9 +11,7 @@ protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath)

// Lower-cases every token produced by the whitespace/punctuation split.
protected override IEnumerable<string> TokenizeSentence(string text)
{
// NOTE(review): the first return makes the statement after it unreachable (CS0162).
// This looks like two alternative implementations merged together (old inline split
// vs. delegating to base.TokenizeSentence) — confirm which is intended and remove
// the other.
return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None)
.SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()))
.Select(o => o.ToLower());
return base.TokenizeSentence(text).Select(o => o.ToLower());
}
}
}
2 changes: 1 addition & 1 deletion tests/BERTTokenizers.Tests.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net7.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
Expand Down
15 changes: 14 additions & 1 deletion tests/BertBaseTokenizerUncasedShould.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,20 @@ public void Tokenize_sentence()
Assert.Equal(("love", 2293, 0), tokens[2]);
Assert.Equal(("you", 2017, 0), tokens[3]);
Assert.Equal(("[SEP]", 102, 0), tokens[4]);
}

[Fact]
public void Tokenize_text_with_linux_line_endings()
{
    // "\n"-separated input must be split the same way as spaces or "\r\n".
    var tokens = _tokenizer.Tokenize("Linux\nline\nendings");

    var expected = new (string Token, int Index, long Segment)[]
    {
        ("[CLS]", 101, 0),
        ("linux", 11603, 0),
        ("line", 2240, 0),
        ("endings", 21306, 0),
        ("[SEP]", 102, 0),
    };

    Assert.Equal(expected.Length, tokens.Count);
    for (var i = 0; i < expected.Length; i++)
    {
        Assert.Equal(expected[i], tokens[i]);
    }
}

[Fact]
Expand Down Expand Up @@ -61,7 +74,7 @@ public void Encode_sentence()
}

[Fact]
public void Unokenize_sentence()
public void Untokenize_sentence()
{
var tokens = new List<string>(){ "she", "##s" };

Expand Down