Skip to content

Commit 5c50319

Browse files
authored
Add Timeout to Regex used in the tokenizers (#7284)
* Add Timeout to Regex used in the tokenizers
* Address the feedback
1 parent 7cce753 commit 5c50319

File tree

4 files changed

+17
-12
lines changed

4 files changed

+17
-12
lines changed

src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ private SentencePieceTokenizer(ModelProto modelProto, IReadOnlyDictionary<string
8686
_specialTokensReverse.Add(item.Value, item.Key);
8787
}
8888

89+
// We create this Regex object without a timeout, as we expect the match operation to complete in O(N) time. Note that `specialTokens` are treated as constants after the tokenizer is created.
8990
_specialTokensRegex = new Regex(string.Join("|", specialTokens.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
9091
}
9192
}

src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1175,23 +1175,23 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
11751175
internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
11761176

11771177
#if NET7_0_OR_GREATER
1178-
[GeneratedRegex(Cl100kBaseRegexPattern)]
1178+
[GeneratedRegex(Cl100kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
11791179
private static partial Regex Cl100kBaseRegex();
11801180

1181-
[GeneratedRegex(P50kBaseRegexPattern)]
1181+
[GeneratedRegex(P50kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
11821182
internal static partial Regex P50kBaseRegex();
11831183

1184-
[GeneratedRegex(O200kBaseRegexPattern)]
1184+
[GeneratedRegex(O200kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)]
11851185
internal static partial Regex O200kBaseRegex();
11861186
#else
11871187
private static Regex? _cl100kBaseRegex;
1188-
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);
1188+
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
11891189

11901190
private static Regex? _p50kBaseRegex;
1191-
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
1191+
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
11921192

11931193
private static Regex? _o200kBaseRegex;
1194-
internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled);
1194+
internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds));
11951195
#endif
11961196

11971197
private static readonly ConcurrentDictionary<string, (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder)> _tiktokenCache = new(StringComparer.OrdinalIgnoreCase);

src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,16 @@ public abstract partial class PreTokenizer
4040
}
4141
}
4242

43+
// 30 seconds is a reasonable time to process any text and find the match.
44+
internal const int DefaultTimeOutInMilliseconds = 30_000;
45+
4346
private const string WhiteSpaceOrPunctuationPattern = @"\w+|[\p{P}]";
4447
private static PreTokenizer? _whiteSpaceOrPunctuationPreTokenizer;
4548
#if NET7_0_OR_GREATER
46-
[GeneratedRegex(WhiteSpaceOrPunctuationPattern)]
49+
[GeneratedRegex(WhiteSpaceOrPunctuationPattern, RegexOptions.None, DefaultTimeOutInMilliseconds)]
4750
private static partial Regex WhiteSpaceOrPunctuationRegex();
4851
#else
49-
private static Regex WhiteSpaceOrPunctuationRegex() => new Regex(WhiteSpaceOrPunctuationPattern, RegexOptions.Compiled);
52+
private static Regex WhiteSpaceOrPunctuationRegex() => new Regex(WhiteSpaceOrPunctuationPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds));
5053
#endif
5154

5255
/// <summary>
@@ -69,10 +72,10 @@ public static PreTokenizer CreateWhiteSpaceOrPunctuationPreTokenizer(IReadOnlyDi
6972
private static PreTokenizer? _wordOrNonWordPreTokenizer;
7073

7174
#if NET7_0_OR_GREATER
72-
[GeneratedRegex(WordOrNonWordPattern)]
75+
[GeneratedRegex(WordOrNonWordPattern, RegexOptions.None, DefaultTimeOutInMilliseconds)]
7376
private static partial Regex WordOrNonWordRegex();
7477
#else
75-
private static Regex WordOrNonWordRegex() => new Regex(WordOrNonWordPattern, RegexOptions.Compiled);
78+
private static Regex WordOrNonWordRegex() => new Regex(WordOrNonWordPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds));
7679
#endif
7780

7881
/// <summary>
@@ -96,10 +99,10 @@ public static PreTokenizer CreateWordOrNonWordPreTokenizer(IReadOnlyDictionary<s
9699
private static PreTokenizer? _whiteSpacePreTokenizer;
97100

98101
#if NET7_0_OR_GREATER
99-
[GeneratedRegex(WhiteSpacePattern)]
102+
[GeneratedRegex(WhiteSpacePattern, RegexOptions.None, DefaultTimeOutInMilliseconds)]
100103
private static partial Regex WhiteSpaceRegex();
101104
#else
102-
private static Regex WhiteSpaceRegex() => new Regex(WhiteSpacePattern, RegexOptions.Compiled);
105+
private static Regex WhiteSpaceRegex() => new Regex(WhiteSpacePattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds));
103106
#endif
104107

105108
/// <summary>

src/Microsoft.ML.Tokenizers/PreTokenizer/RegexPreTokenizer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public RegexPreTokenizer(Regex regex, IReadOnlyDictionary<string, int>? specialT
3535

3636
if (specialTokensEncoder is { Count: > 0 })
3737
{
38+
// We create this Regex object without a timeout, as we expect the match operation to complete in O(N) time. Note that the `specialTokensEncoder` entries are treated as constants after the pre-tokenizer is created.
3839
_specialTokensRegex = new Regex(string.Join("|", specialTokensEncoder.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled);
3940
}
4041
}

0 commit comments

Comments
 (0)