From 89e504d5c96f7bf08225f18e00568544e509b936 Mon Sep 17 00:00:00 2001 From: Daulet Zhanguzin Date: Fri, 9 Aug 2024 16:15:14 -0700 Subject: [PATCH] Update to huggingface/tokenizers v0.20.0 (#23) * update tokenizers lib to v0.20.0 * benchmark new release * update bazel builds --- Cargo.Bazel.lock | 14 +++--- Cargo.lock | 6 +-- Cargo.toml | 2 +- README.md | 2 + ...502b65573ea00125eac62fa301c480402be19c.txt | 45 +++++++++++++++++++ ...b47dd52e68ae3349c0461d494921d6a07f7181.txt | 45 +++++++++++++++++++ 6 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 test/benchmark/1b502b65573ea00125eac62fa301c480402be19c.txt create mode 100644 test/benchmark/7bb47dd52e68ae3349c0461d494921d6a07f7181.txt diff --git a/Cargo.Bazel.lock b/Cargo.Bazel.lock index 1d3f1bc..e319f99 100644 --- a/Cargo.Bazel.lock +++ b/Cargo.Bazel.lock @@ -1,5 +1,5 @@ { - "checksum": "f2d7467d755ff26d29addc5b9b7544ea4db8483d15efc59190337e6b75411f12", + "checksum": "274eda904d8c182522e7565c734cf8c608671ee2bdb346dccb07c10d44904563", "crates": { "aho-corasick 1.1.2": { "name": "aho-corasick", @@ -4486,13 +4486,13 @@ }, "license": "Apache-2.0 OR MIT" }, - "tokenizers 0.19.1": { + "tokenizers 0.20.0": { "name": "tokenizers", - "version": "0.19.1", + "version": "0.20.0", "repository": { "Http": { - "url": "https://static.crates.io/crates/tokenizers/0.19.1/download", - "sha256": "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" + "url": "https://static.crates.io/crates/tokenizers/0.20.0/download", + "sha256": "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70" } }, "targets": [ @@ -4628,7 +4628,7 @@ ], "selects": {} }, - "version": "0.19.1" + "version": "0.20.0" }, "license": "Apache-2.0" }, @@ -4649,7 +4649,7 @@ "target": "libc" }, { - "id": "tokenizers 0.19.1", + "id": "tokenizers 0.20.0", "target": "tokenizers" } ], diff --git a/Cargo.lock b/Cargo.lock index 0261517..7b8931d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -828,14 +828,14 @@ dependencies = [ "criterion", "libc", "rand", - "tokenizers 0.19.1", + "tokenizers 0.20.0", ] [[package]] name = "tokenizers" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" +checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70" dependencies = [ "aho-corasick", "derive_builder", diff --git a/Cargo.toml b/Cargo.toml index 20660e7..c3174e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ crate-type = ["staticlib"] [dependencies] libc = "0.2.140" -tokenizers = {version = "0.19.1" } +tokenizers = {version = "0.20.0" } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } diff --git a/README.md b/README.md index be95ed6..119606a 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ fmt.Println(tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, true ## Benchmarks +`go test . -run=^\$ -bench=. -benchmem -count=10 > test/benchmark/$(git rev-parse HEAD).txt` + Decoding overhead (due to CGO and extra allocations) is between 2% to 9% depending on the benchmark. ```bash diff --git a/test/benchmark/1b502b65573ea00125eac62fa301c480402be19c.txt b/test/benchmark/1b502b65573ea00125eac62fa301c480402be19c.txt new file mode 100644 index 0000000..30f42d2 --- /dev/null +++ b/test/benchmark/1b502b65573ea00125eac62fa301c480402be19c.txt @@ -0,0 +1,45 @@ +goos: darwin +goarch: arm64 +pkg: github.com/daulet/tokenizers +BenchmarkEncodeNTimes-10 95174 12667 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 94437 12580 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 93362 12583 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 94240 13372 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 92844 12868 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 92984 12766 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 92055 12654 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 91874 13204 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 93130 12686 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 93288 12528 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.374 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.651 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 1.993 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.169 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.282 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.348 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.028 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.013 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.200 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 1.957 ns/op 0 B/op 0 allocs/op +BenchmarkDecodeNTimes-10 250281 4474 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 268866 4501 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 260468 4422 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 264583 4455 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 262168 4552 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 262182 4455 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 262510 4511 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 263491 4524 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 265724 4396 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 259940 4430 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTokens-10 1804423 678.7 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1827415 654.8 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1850868 648.1 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1838286 650.1 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1853236 655.6 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1835120 657.1 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1838400 652.3 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1847911 659.2 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1808113 654.2 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1820958 666.3 ns/op 7 B/op 0 allocs/op +PASS +ok github.com/daulet/tokenizers 245.425s diff --git a/test/benchmark/7bb47dd52e68ae3349c0461d494921d6a07f7181.txt b/test/benchmark/7bb47dd52e68ae3349c0461d494921d6a07f7181.txt new file mode 100644 index 0000000..5bc6733 --- /dev/null +++ b/test/benchmark/7bb47dd52e68ae3349c0461d494921d6a07f7181.txt @@ -0,0 +1,45 @@ +goos: darwin +goarch: arm64 +pkg: github.com/daulet/tokenizers +BenchmarkEncodeNTimes-10 91389 12616 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 94416 12608 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95833 12702 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 93657 12692 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95575 12565 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95866 12700 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95568 12502 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95286 12625 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 95224 12739 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNTimes-10 93948 12949 ns/op 232 B/op 12 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.254 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 3.099 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.273 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.722 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 1.965 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.024 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 1.997 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.320 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 1.866 ns/op 0 B/op 0 allocs/op +BenchmarkEncodeNChars-10 1000000000 4.136 ns/op 0 B/op 0 allocs/op +BenchmarkDecodeNTimes-10 239275 4575 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 243561 4515 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 258657 4480 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 262723 4597 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 263178 4466 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 266382 4442 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 266616 4498 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 266132 4544 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 266750 4780 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTimes-10 266880 4454 ns/op 96 B/op 3 allocs/op +BenchmarkDecodeNTokens-10 1808430 655.3 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1832203 649.4 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1851890 648.7 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1836775 649.1 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1839984 650.7 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1854864 643.8 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1854836 647.9 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1866586 643.4 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1794544 666.8 ns/op 7 B/op 0 allocs/op +BenchmarkDecodeNTokens-10 1768803 666.9 ns/op 7 B/op 0 allocs/op +PASS +ok github.com/daulet/tokenizers 226.796s