Commit e512c0d

Committed Feb 17, 2025
Add get_encoding_name_for_model to tiktoken
The `tiktoken-js` library includes a very helpful function, `getEncodingNameForModel()`. In the Rust-based `tiktoken` package, the equivalent mapping is buried inside the implementation of `encoding_for_model()`. Exposing it is very useful when implementing an encoding cache based on the model in use: mapping model -> encoding and then caching by encoding name conserves resources, since many models reuse the same encoding.

I've exposed a new `get_encoding_name_for_model()` function that behaves like the one in the `tiktoken-js` package, and used it inside `encoding_for_model()`. Finally, I've added a test to ensure that the function can be called from TypeScript code and that it throws an exception for invalid model names.

Fixes: #123
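For illustration, the caching pattern described above might look roughly like this from the TypeScript side (a sketch only; the `getEncoder` helper, the cache shape, and the `tiktoken` import path are assumptions, not part of this commit):

import { encoding_for_model, get_encoding_name_for_model, Tiktoken, TiktokenModel } from "tiktoken"; // assumed package entry point

// Cache encoders by encoding name rather than by model: many models share
// the same encoding (e.g. most gpt-4* models map to "cl100k_base"), so this
// keeps at most one live encoder per encoding.
const encoderCache = new Map<string, Tiktoken>();

function getEncoder(model: TiktokenModel): Tiktoken {
  const encodingName = get_encoding_name_for_model(model); // throws for unknown models
  let enc = encoderCache.get(encodingName);
  if (!enc) {
    enc = encoding_for_model(model);
    encoderCache.set(encodingName, enc);
  }
  return enc;
}

// Both calls below reuse the same cached "cl100k_base" encoder.
const a = getEncoder("gpt-4");
const b = getEncoder("gpt-3.5-turbo");

Keyed this way, gpt-4, gpt-4-turbo, and gpt-3.5-turbo all share a single cl100k_base encoder instance instead of each allocating their own.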
1 parent 8963e56 commit e512c0d

File tree

2 files changed: +108 -80 lines changed


wasm/src/lib.rs
+96 -79
@@ -439,7 +439,7 @@ export type TiktokenModel =
   | "gpt-4o-realtime-preview-2024-10-01"
 
 /**
- * @param {TiktokenModel} encoding
+ * @param {TiktokenModel} model
  * @param {Record<string, number>} [extend_special_tokens]
  * @returns {Tiktoken}
  */
@@ -452,84 +452,8 @@ pub fn encoding_for_model(
     model: &str,
     extend_special_tokens: JsValue,
 ) -> Result<Tiktoken, JsError> {
-    let encoding = match model {
-        "text-davinci-003" => Ok("p50k_base"),
-        "text-davinci-002" => Ok("p50k_base"),
-        "text-davinci-001" => Ok("r50k_base"),
-        "text-curie-001" => Ok("r50k_base"),
-        "text-babbage-001" => Ok("r50k_base"),
-        "text-ada-001" => Ok("r50k_base"),
-        "davinci" => Ok("r50k_base"),
-        "davinci-002" => Ok("cl100k_base"),
-        "curie" => Ok("r50k_base"),
-        "babbage" => Ok("r50k_base"),
-        "babbage-002" => Ok("cl100k_base"),
-        "ada" => Ok("r50k_base"),
-        "code-davinci-002" => Ok("p50k_base"),
-        "code-davinci-001" => Ok("p50k_base"),
-        "code-cushman-002" => Ok("p50k_base"),
-        "code-cushman-001" => Ok("p50k_base"),
-        "davinci-codex" => Ok("p50k_base"),
-        "cushman-codex" => Ok("p50k_base"),
-        "text-davinci-edit-001" => Ok("p50k_edit"),
-        "code-davinci-edit-001" => Ok("p50k_edit"),
-        "text-embedding-ada-002" => Ok("cl100k_base"),
-        "text-embedding-3-small" => Ok("cl100k_base"),
-        "text-embedding-3-large" => Ok("cl100k_base"),
-        "text-similarity-davinci-001" => Ok("r50k_base"),
-        "text-similarity-curie-001" => Ok("r50k_base"),
-        "text-similarity-babbage-001" => Ok("r50k_base"),
-        "text-similarity-ada-001" => Ok("r50k_base"),
-        "text-search-davinci-doc-001" => Ok("r50k_base"),
-        "text-search-curie-doc-001" => Ok("r50k_base"),
-        "text-search-babbage-doc-001" => Ok("r50k_base"),
-        "text-search-ada-doc-001" => Ok("r50k_base"),
-        "code-search-babbage-code-001" => Ok("r50k_base"),
-        "code-search-ada-code-001" => Ok("r50k_base"),
-        "gpt2" => Ok("gpt2"),
-        "gpt-3.5-turbo" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-0301" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-0613" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-16k" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-16k-0613" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-instruct" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-instruct-0914" => Ok("cl100k_base"),
-        "gpt-4" => Ok("cl100k_base"),
-        "gpt-4-0314" => Ok("cl100k_base"),
-        "gpt-4-0613" => Ok("cl100k_base"),
-        "gpt-4-32k" => Ok("cl100k_base"),
-        "gpt-4-32k-0314" => Ok("cl100k_base"),
-        "gpt-4-32k-0613" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-1106" => Ok("cl100k_base"),
-        "gpt-35-turbo" => Ok("cl100k_base"),
-        "gpt-4-1106-preview" => Ok("cl100k_base"),
-        "gpt-4-vision-preview" => Ok("cl100k_base"),
-        "gpt-3.5-turbo-0125" => Ok("cl100k_base"),
-        "gpt-4-turbo" => Ok("cl100k_base"),
-        "gpt-4-turbo-2024-04-09" => Ok("cl100k_base"),
-        "gpt-4-turbo-preview" => Ok("cl100k_base"),
-        "gpt-4-0125-preview" => Ok("cl100k_base"),
-        "gpt-4o" => Ok("o200k_base"),
-        "gpt-4o-2024-05-13" => Ok("o200k_base"),
-        "gpt-4o-2024-08-06" => Ok("o200k_base"),
-        "gpt-4o-2024-11-20" => Ok("o200k_base"),
-        "gpt-4o-mini-2024-07-18" => Ok("o200k_base"),
-        "gpt-4o-mini" => Ok("o200k_base"),
-        "o1" => Ok("o200k_base"),
-        "o1-2024-12-17" => Ok("o200k_base"),
-        "o1-mini" => Ok("o200k_base"),
-        "o1-preview" => Ok("o200k_base"),
-        "o1-preview-2024-09-12" => Ok("o200k_base"),
-        "o1-mini-2024-09-12" => Ok("o200k_base"),
-        "chatgpt-4o-latest" => Ok("o200k_base"),
-        "gpt-4o-realtime" => Ok("o200k_base"),
-        "gpt-4o-realtime-preview-2024-10-01" => Ok("o200k_base"),
-        "o3-mini" => Ok("o200k_base"),
-        "o3-mini-2025-01-31" => Ok("o200k_base"),
-        model => Err(JsError::new(
-            format!("Invalid model: {}", model.to_string()).as_str(),
-        )),
-    }?;
+    let binding = get_encoding_name_for_model(model)?;
+    let encoding = binding.as_str();
 
     Tiktoken::with_encoding(
         encoding,
@@ -538,3 +462,96 @@ pub fn encoding_for_model(
         .ok(),
     )
 }
+
+#[cfg(feature = "inline")]
+#[wasm_bindgen(typescript_custom_section)]
+const _: &'static str = r#"
+/**
+ * @param {TiktokenModel} model
+ * @returns {TiktokenEncoding}
+ */
+export function get_encoding_name_for_model(model: TiktokenModel): TiktokenEncoding;
+"#;
+
+#[cfg(feature="inline")]
+#[wasm_bindgen(skip_typescript)]
+pub fn get_encoding_name_for_model(model: &str) -> Result<String, JsError> {
+    match model {
+        "text-davinci-003" => Ok("p50k_base".into()),
+        "text-davinci-002" => Ok("p50k_base".into()),
+        "text-davinci-001" => Ok("r50k_base".into()),
+        "text-curie-001" => Ok("r50k_base".into()),
+        "text-babbage-001" => Ok("r50k_base".into()),
+        "text-ada-001" => Ok("r50k_base".into()),
+        "davinci" => Ok("r50k_base".into()),
+        "davinci-002" => Ok("cl100k_base".into()),
+        "curie" => Ok("r50k_base".into()),
+        "babbage" => Ok("r50k_base".into()),
+        "babbage-002" => Ok("cl100k_base".into()),
+        "ada" => Ok("r50k_base".into()),
+        "code-davinci-002" => Ok("p50k_base".into()),
+        "code-davinci-001" => Ok("p50k_base".into()),
+        "code-cushman-002" => Ok("p50k_base".into()),
+        "code-cushman-001" => Ok("p50k_base".into()),
+        "davinci-codex" => Ok("p50k_base".into()),
+        "cushman-codex" => Ok("p50k_base".into()),
+        "text-davinci-edit-001" => Ok("p50k_edit".into()),
+        "code-davinci-edit-001" => Ok("p50k_edit".into()),
+        "text-embedding-ada-002" => Ok("cl100k_base".into()),
+        "text-embedding-3-small" => Ok("cl100k_base".into()),
+        "text-embedding-3-large" => Ok("cl100k_base".into()),
+        "text-similarity-davinci-001" => Ok("r50k_base".into()),
+        "text-similarity-curie-001" => Ok("r50k_base".into()),
+        "text-similarity-babbage-001" => Ok("r50k_base".into()),
+        "text-similarity-ada-001" => Ok("r50k_base".into()),
+        "text-search-davinci-doc-001" => Ok("r50k_base".into()),
+        "text-search-curie-doc-001" => Ok("r50k_base".into()),
+        "text-search-babbage-doc-001" => Ok("r50k_base".into()),
+        "text-search-ada-doc-001" => Ok("r50k_base".into()),
+        "code-search-babbage-code-001" => Ok("r50k_base".into()),
+        "code-search-ada-code-001" => Ok("r50k_base".into()),
+        "gpt2" => Ok("gpt2".into()),
+        "gpt-3.5-turbo" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-0301" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-0613" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-16k" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-16k-0613" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-instruct" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-instruct-0914" => Ok("cl100k_base".into()),
+        "gpt-4" => Ok("cl100k_base".into()),
+        "gpt-4-0314" => Ok("cl100k_base".into()),
+        "gpt-4-0613" => Ok("cl100k_base".into()),
+        "gpt-4-32k" => Ok("cl100k_base".into()),
+        "gpt-4-32k-0314" => Ok("cl100k_base".into()),
+        "gpt-4-32k-0613" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-1106" => Ok("cl100k_base".into()),
+        "gpt-35-turbo" => Ok("cl100k_base".into()),
+        "gpt-4-1106-preview" => Ok("cl100k_base".into()),
+        "gpt-4-vision-preview" => Ok("cl100k_base".into()),
+        "gpt-3.5-turbo-0125" => Ok("cl100k_base".into()),
+        "gpt-4-turbo" => Ok("cl100k_base".into()),
+        "gpt-4-turbo-2024-04-09" => Ok("cl100k_base".into()),
+        "gpt-4-turbo-preview" => Ok("cl100k_base".into()),
+        "gpt-4-0125-preview" => Ok("cl100k_base".into()),
+        "gpt-4o" => Ok("o200k_base".into()),
+        "gpt-4o-2024-05-13" => Ok("o200k_base".into()),
+        "gpt-4o-2024-08-06" => Ok("o200k_base".into()),
+        "gpt-4o-2024-11-20" => Ok("o200k_base".into()),
+        "gpt-4o-mini-2024-07-18" => Ok("o200k_base".into()),
+        "gpt-4o-mini" => Ok("o200k_base".into()),
+        "o1" => Ok("o200k_base".into()),
+        "o1-2024-12-17" => Ok("o200k_base".into()),
+        "o1-mini" => Ok("o200k_base".into()),
+        "o1-preview" => Ok("o200k_base".into()),
+        "o1-preview-2024-09-12" => Ok("o200k_base".into()),
+        "o1-mini-2024-09-12" => Ok("o200k_base".into()),
+        "chatgpt-4o-latest" => Ok("o200k_base".into()),
+        "gpt-4o-realtime" => Ok("o200k_base".into()),
+        "gpt-4o-realtime-preview-2024-10-01" => Ok("o200k_base".into()),
+        "o3-mini" => Ok("o200k_base".into()),
+        "o3-mini-2025-01-31" => Ok("o200k_base".into()),
+        model => Err(JsError::new(
+            format!("Invalid model: {}", model.to_string()).as_str(),
+        )),
+    }
+}
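
Since `encoding_for_model()` now derives its encoding name from `get_encoding_name_for_model()`, the two stay consistent by construction. A quick sanity check from TypeScript might look like this (the import path is an assumption; the repository's own tests import from `../dist`):

import { encoding_for_model, get_encoding_name_for_model } from "tiktoken"; // assumed package entry point

const model = "gpt-4o";
const name = get_encoding_name_for_model(model); // "o200k_base"
const enc = encoding_for_model(model);
console.assert(enc.name === name, "model mapping and encoder name should agree");
enc.free(); // release the WASM-backed encoder when done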

wasm/test/test_simple_public.test.ts
+12 -1
@@ -1,5 +1,5 @@
 import { it, expect, describe } from "vitest";
-import { encoding_for_model, get_encoding } from "../dist";
+import { encoding_for_model, get_encoding, get_encoding_name_for_model } from "../dist";
 
 it("encoding_for_model initialization", () => {
   expect(() => encoding_for_model("gpt2")).not.toThrowError();
@@ -106,6 +106,17 @@ it("test_encoding_for_model", () => {
   expect(encoding_for_model("gpt-3.5-turbo").name).toEqual("cl100k_base");
 });
 
+it("test_get_encoding_name_for_model", () => {
+  expect(get_encoding_name_for_model("gpt2")).toEqual("gpt2");
+  expect(get_encoding_name_for_model("text-davinci-003")).toEqual("p50k_base");
+  expect(get_encoding_name_for_model("gpt-3.5-turbo")).toEqual("cl100k_base");
+
+  // @ts-expect-error - explicitly testing for invalid model
+  expect(() => get_encoding_name_for_model("gpt2-unknown")).toThrowError(
+    "Invalid model: gpt2-unknown"
+  );
+})
+
 it("test_custom_tokens", () => {
   const enc = encoding_for_model("gpt2", {
     "<|im_start|>": 100264,
