Update to huggingface/tokenizers v0.20.0 (#23)
* update tokenizers lib to v0.20.0

* benchmark new release

* update bazel builds
daulet authored Aug 9, 2024
1 parent 7bb47dd commit 89e504d
Showing 6 changed files with 103 additions and 11 deletions.
14 changes: 7 additions & 7 deletions Cargo.Bazel.lock
@@ -1,5 +1,5 @@
{
"checksum": "f2d7467d755ff26d29addc5b9b7544ea4db8483d15efc59190337e6b75411f12",
"checksum": "274eda904d8c182522e7565c734cf8c608671ee2bdb346dccb07c10d44904563",
"crates": {
"aho-corasick 1.1.2": {
"name": "aho-corasick",
@@ -4486,13 +4486,13 @@
},
"license": "Apache-2.0 OR MIT"
},
"tokenizers 0.19.1": {
"tokenizers 0.20.0": {
"name": "tokenizers",
"version": "0.19.1",
"version": "0.20.0",
"repository": {
"Http": {
"url": "https://static.crates.io/crates/tokenizers/0.19.1/download",
"sha256": "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
"url": "https://static.crates.io/crates/tokenizers/0.20.0/download",
"sha256": "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
}
},
"targets": [
@@ -4628,7 +4628,7 @@
],
"selects": {}
},
"version": "0.19.1"
"version": "0.20.0"
},
"license": "Apache-2.0"
},
@@ -4649,7 +4649,7 @@
"target": "libc"
},
{
"id": "tokenizers 0.19.1",
"id": "tokenizers 0.20.0",
"target": "tokenizers"
}
],
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -8,7 +8,7 @@ crate-type = ["staticlib"]

[dependencies]
libc = "0.2.140"
tokenizers = {version = "0.19.1" }
tokenizers = {version = "0.20.0" }

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
2 changes: 2 additions & 0 deletions README.md
@@ -46,6 +46,8 @@ fmt.Println(tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, true

## Benchmarks

`go test . -run=^\$ -bench=. -benchmem -count=10 > test/benchmark/$(git rev-parse HEAD).txt`

Decoding overhead (due to CGO and extra allocations) is between 2% and 9%, depending on the benchmark.

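For reference, a minimal sketch of what one such Go benchmark could look like, in the spirit of the BenchmarkDecodeNTimes results recorded below. It relies on the `Decode` call shown in the README snippet above; the `tokenizers.FromFile` constructor, `Close` method, tokenizer file path, and benchmark name are assumptions here, not necessarily the repository's actual test code.

```go
package tokenizers_test

import (
	"testing"

	"github.com/daulet/tokenizers"
)

// BenchmarkDecodeSketch is a hypothetical benchmark in the spirit of the
// BenchmarkDecodeNTimes results below. The tokenizer file path is a
// placeholder, and FromFile/Decode/Close are assumed to follow the API
// used in the README example above.
func BenchmarkDecodeSketch(b *testing.B) {
	tk, err := tokenizers.FromFile("./data/bert-base-uncased.json") // hypothetical path
	if err != nil {
		b.Fatal(err)
	}
	defer tk.Close()

	ids := []uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tk.Decode(ids, true) // skipSpecialTokens = true, as in the README snippet
	}
}
```

Running the `go test` command added above with `-count=10` then produces one result file per commit hash, such as the two files added in this change.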
45 changes: 45 additions & 0 deletions test/benchmark/1b502b65573ea00125eac62fa301c480402be19c.txt
@@ -0,0 +1,45 @@
goos: darwin
goarch: arm64
pkg: github.com/daulet/tokenizers
BenchmarkEncodeNTimes-10 95174 12667 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 94437 12580 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 93362 12583 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 94240 13372 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 92844 12868 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 92984 12766 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 92055 12654 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 91874 13204 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 93130 12686 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 93288 12528 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.374 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.651 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 1.993 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.169 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.282 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.348 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.028 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.013 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.200 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 1.957 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeNTimes-10 250281 4474 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 268866 4501 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 260468 4422 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 264583 4455 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 262168 4552 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 262182 4455 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 262510 4511 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 263491 4524 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 265724 4396 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 259940 4430 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTokens-10 1804423 678.7 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1827415 654.8 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1850868 648.1 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1838286 650.1 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1853236 655.6 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1835120 657.1 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1838400 652.3 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1847911 659.2 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1808113 654.2 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1820958 666.3 ns/op 7 B/op 0 allocs/op
PASS
ok github.com/daulet/tokenizers 245.425s
45 changes: 45 additions & 0 deletions test/benchmark/7bb47dd52e68ae3349c0461d494921d6a07f7181.txt
@@ -0,0 +1,45 @@
goos: darwin
goarch: arm64
pkg: github.com/daulet/tokenizers
BenchmarkEncodeNTimes-10 91389 12616 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 94416 12608 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95833 12702 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 93657 12692 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95575 12565 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95866 12700 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95568 12502 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95286 12625 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 95224 12739 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNTimes-10 93948 12949 ns/op 232 B/op 12 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.254 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 3.099 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.273 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.722 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 1.965 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.024 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 1.997 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.320 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 1.866 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeNChars-10 1000000000 4.136 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeNTimes-10 239275 4575 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 243561 4515 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 258657 4480 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 262723 4597 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 263178 4466 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 266382 4442 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 266616 4498 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 266132 4544 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 266750 4780 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTimes-10 266880 4454 ns/op 96 B/op 3 allocs/op
BenchmarkDecodeNTokens-10 1808430 655.3 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1832203 649.4 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1851890 648.7 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1836775 649.1 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1839984 650.7 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1854864 643.8 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1854836 647.9 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1866586 643.4 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1794544 666.8 ns/op 7 B/op 0 allocs/op
BenchmarkDecodeNTokens-10 1768803 666.9 ns/op 7 B/op 0 allocs/op
PASS
ok github.com/daulet/tokenizers 226.796s
