diff --git a/.github/workflows/build_ruby_gem.yml b/.github/workflows/build_ruby_gem.yml new file mode 100644 index 00000000..f73c61a0 --- /dev/null +++ b/.github/workflows/build_ruby_gem.yml @@ -0,0 +1,107 @@ +name: Build ruby gem +on: workflow_dispatch +env: + CACHE_VERSION: "v0" + CARGO_CACHE_CLEAN: "true" + RUBY_VERSIONS: "3.2" +jobs: + release: + strategy: + fail-fast: false + matrix: + include: + - platform: x86_64-linux + target: x86_64-unknown-linux-gnu + - platform: x86_64-linux-musl + target: x86_64-unknown-linux-musl + - platform: aarch64-linux + target: aarch64-unknown-linux-gnu + - platform: x86_64-darwin + target: x86_64-apple-darwin + # Rust uses external command to strip symbols and debuginfo on Mac + # Do not do for arm64 since it interferes with code signing + # and codesign binary is not present to re-sign + setup: sudo ln -s /opt/osxcross/target/bin/x86_64-apple-darwin-strip /usr/local/bin/strip + - platform: arm64-darwin + target: aarch64-apple-darwin + # - platform: x64-mingw-ucrt + # target: x86_64-pc-windows-gnu + # - platform: x64-mingw32 + # target: x86_64-pc-windows-gnu + runs-on: ubuntu-latest + name: ${{ matrix.platform }} + steps: + - uses: actions/checkout@v3 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2 + - name: generate rank + run: bundle install && bundle exec rake rank + working-directory: "ruby" + + # Didn't use https://github.com/oxidize-rb/actions/tree/main/cross-gem due to weird directory structure, but this code is adapted from there. 
+ - name: Configure environment + run: | + : Configure environment + echo "RB_SYS_DOCK_UID=$(id -u)" >> $GITHUB_ENV + echo "RB_SYS_DOCK_GID=$(id -g)" >> $GITHUB_ENV + rb_sys_dock_cache_dir="$HOME/.cache/rb-sys-dock" + mkdir -p "$rb_sys_dock_cache_dir" + echo "RB_SYS_DOCK_CACHE_DIR=$rb_sys_dock_cache_dir" >> $GITHUB_ENV + - name: Setup caching + uses: actions/cache@v3 + with: + path: | + ${{ env.RB_SYS_DOCK_CACHE_DIR }} + ${{ github.workspace }}/ruby/tmp/rb-sys-dock/${{ matrix.platform }}/target + key: rb-sys-dock-${{ env.CACHE_VERSION }}-${{ matrix.platform }}-${{ hashFiles('**/Gemfile.lock', '**/Cargo.lock') }} + restore-keys: | + rb-sys-dock-${{ env.CACHE_VERSION }}-${{ matrix.platform }}- + - name: Install cargo-cache + uses: oxidize-rb/actions/cargo-binstall@v1 + id: install-cargo-cache + if: env.CARGO_CACHE_CLEAN == 'true' + with: + crate: cargo-cache + version: 0.8.3 + strategies: quick-install + + - name: Clean the cargo cache + if: env.CARGO_CACHE_CLEAN == 'true' + uses: oxidize-rb/actions/post-run@v1 + with: + run: cargo-cache --autoclean + cwd: ${{ github.workspace }} + + #- name: Start SSH session + # uses: luchihoratiu/debug-via-ssh@main + # with: + # NGROK_AUTH_TOKEN: ${{ secrets.NGROK_AUTH_TOKEN }} + # SSH_PASS: ${{ secrets.SSH_PASS }} + + - name: Build gem + env: + INPUT_RUBY_VERSIONS: "${{ env.RUBY_VERSIONS }}" + INPUT_PLATFORM: "${{ matrix.platform }}" + run: | + : Compile gem + set -x + args=() + args+=("--platform") + args+=("$INPUT_PLATFORM") + if [ "$INPUT_RUBY_VERSIONS" != "default" ]; then + args+=("--ruby-versions") + args+=("$INPUT_RUBY_VERSIONS") + fi + BUNDLE_GEMFILE=ruby/Gemfile bundle exec rb-sys-dock "${args[@]}" --build -- "cd ruby && bundle install && export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc && env | grep CARGO" + + - name: Set outputs + id: set-outputs + run: | + : Set output + echo "gem-path=$(find ${{ github.workspace }}/ruby/pkg -name '*-${{ matrix.platform }}.gem')" >> $GITHUB_OUTPUT + + - uses: 
actions/upload-artifact@v3 + with: + name: cross-gem + path: ${{ steps.set-outputs.outputs.gem-path }} \ No newline at end of file diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml deleted file mode 100644 index d2e8dc27..00000000 --- a/.github/workflows/build_wheels.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Build wheels - -on: [push, pull_request, workflow_dispatch] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build_wheels: - name: py${{ matrix.python-version }} on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - # cibuildwheel builds linux wheels inside a manylinux container - # it also takes care of procuring the correct python version for us - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: [38, 39, 310, 311] - - steps: - - uses: actions/checkout@v3 - - - uses: pypa/cibuildwheel@v2.11.3 - env: - CIBW_BUILD: "cp${{ matrix.python-version}}-*" - - - uses: actions/upload-artifact@v3 - with: - name: dist - path: ./wheelhouse/*.whl - - build_sdist: - name: sdist - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - name: Install Python - with: - python-version: "3.9" - - name: Run check-manifest - run: | - pip install check-manifest - check-manifest -v - - name: Build sdist - run: | - pip install --upgrade build - python -m build --sdist - - uses: actions/upload-artifact@v3 - with: - name: dist - path: ./dist/*.tar.gz diff --git a/.gitignore b/.gitignore index 9e090c8e..6b0a18bd 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,7 @@ htmlcov Cargo.lock target/ + +# WASM +ranks/ +node_modules \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 40a72b94..e9533ba6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,17 @@ -[package] -name = "tiktoken" -version = "0.3.0" -edition = "2021" -rust-version = "1.57.0" +[workspace] -[lib] 
-name = "_tiktoken" -crate-type = ["cdylib"] - -[dependencies] -pyo3 = { version = "0.17.3", features = ["extension-module"] } - -# tiktoken dependencies -fancy-regex = "0.10.0" -regex = "1.7.0" -rustc-hash = "1.1.0" -bstr = "1.0.1" +members = [ + "core", + "jni", + "js", + "python", + "ruby", +] [profile.release] incremental = true +opt-level = 's' # Optimize for size +lto = true # Enable link-time optimization +codegen-units = 1 # Reduce number of codegen units to increase optimizations +panic = 'abort' # Abort on panic +strip = true # Strip symbols from binary* diff --git a/MANIFEST.in b/MANIFEST.in index 7f25b271..a841992e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,4 +5,9 @@ include Makefile global-include py.typed recursive-include scripts *.py recursive-include tests *.py -recursive-include src *.rs +recursive-include core *.rs *.toml +recursive-include python *.rs *.toml +recursive-exclude jni * +recursive-exclude java * +recursive-exclude js * +include tiktoken *.json \ No newline at end of file diff --git a/README.md b/README.md index 6a5c5f25..be49218c 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py ``` my_tiktoken_extension ├── tiktoken_ext -│   └── my_encodings.py +│ └── my_encodings.py └── setup.py ``` @@ -101,4 +101,3 @@ setup( Then simply `pip install ./my_tiktoken_extension` and you should be able to use your custom encodings! Make sure **not** to use an editable install. 
- diff --git a/core/Cargo.toml b/core/Cargo.toml new file mode 100644 index 00000000..79fd5ce9 --- /dev/null +++ b/core/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "tiktoken_core" +version = "0.3.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "_tiktoken_core" +crate-type = ["lib"] + +[dependencies] +# tiktoken dependencies +fancy-regex = "0.10.0" +regex = "1.7.0" +rustc-hash = "1.1.0" +bstr = "1.0.1" +reqwest = { version = "0.11.14", features = ["rustls-tls", "blocking"], default-features = false } +sha1 = "0.10.5" +json = "0.12.4" +base64 = "0.21.0" +lazy_static = "1.4.0" + +[features] +default = [] +lazyload = [] +multithreading = [] \ No newline at end of file diff --git a/src/lib.rs b/core/src/lib.rs similarity index 63% rename from src/lib.rs rename to core/src/lib.rs index b44d9c8b..7ee05572 100644 --- a/src/lib.rs +++ b/core/src/lib.rs @@ -1,124 +1,25 @@ -// This check is new and seems buggy (possibly with PyO3 interaction) -#![allow(clippy::borrow_deref_ref)] - use std::collections::HashSet; use std::thread; use fancy_regex::Regex; -use pyo3::exceptions; -use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyList, PyTuple}; -use pyo3::PyResult; use rustc_hash::FxHashMap as HashMap; -fn _byte_pair_merge( - piece: &[u8], - ranks: &HashMap, usize>, - f: impl Fn(std::ops::Range) -> T, -) -> Vec { - // This is a vector of (start, rank). - // The rank is of the byte pair starting at position start. - // The rank of the last item in the vector is not a valid value. - let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect(); - - // NOTE: using a macro here because a closure fails to get inlined - // according to optimization remarks. - // A closure also cannot capture a reference to `piece` without - // the borrow checker complaining about the mutable borrows during - // the assignments later in this code. - macro_rules! 
get_rank { - ($start_idx:expr, $skip:expr) => {{ - let start_idx: usize = $start_idx; - let skip: usize = $skip; - if (start_idx + skip + 2) < parts.len() { - ranks - .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0]) - .map(|r| *r) - } else { - None - } - }}; - ($idx:expr) => {{ - get_rank!($idx, 0) - }}; - } +mod util; +#[cfg(feature = "lazyload")] +mod load; - // We look up the ranks once in the beggining and iteratively update - // them during each merge, which reduces the number of rank lookups. - for i in 0..parts.len() - 2 { - match get_rank!(i) { - Some(rank) => { - // usize::MAX is a sentinel value and cannot be a valid rank - debug_assert!(rank != usize::MAX); - parts[i].1 = rank; - } - None => { - continue; - } - }; - } +#[cfg(feature = "lazyload")] +pub mod openai_public; - // If you have n parts and m merges, this does O(mn) work. - // We could do something with a heap and do O(m log n) work. - // It is important to consider that n is often small (<100), and as such - // the cache-locality benefits outweigh the algorithmic complexity downsides - // of the `parts` vector data structure above. - - // Note that we hash bytes, not token pairs. As long as we train BPE the way we - // currently do, this is equivalent. An easy way to break this would be to decouple - // merge priority from token index or to prevent specific token merges. - loop { - if parts.len() == 1 { - break; - } +#[cfg(feature = "lazyload")] +#[macro_use] +extern crate lazy_static; - // usize::MAX is a sentinel rank value allowing us to - // take the min more quickly - let mut min_rank: (usize, usize) = (usize::MAX, 0); - for (i, &(_, rank)) in parts[..parts.len() - 1].iter().enumerate() { - if rank < min_rank.0 { - min_rank = (rank, i); - } - } - - if min_rank.0 != usize::MAX { - let i = min_rank.1; - - // NOTE: We are about to remove parts[i + 1]. 
We do not do it - // yet because there are cache-locality benefits to updating - // parts[i] and parts[i-1] before removing, which could thrash - // the cache. Thus, we update the rank calculation by skipping over - // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX); - if i > 0 { - parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX); - } - - parts.remove(i + 1); - } else { - break; - } - } - let mut out: Vec = Vec::with_capacity(parts.len() - 1); - for i in 0..parts.len() - 1 { - out.push(f(parts[i].0..parts[i + 1].0)); - } - out -} - -pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, usize>) -> Vec { - if piece.len() == 1 { - return vec![ranks[piece]]; - } - _byte_pair_merge(piece, ranks, |p| ranks[&piece[p.start..p.end]]) -} +#[cfg(feature = "multithreading")] +const MAX_NUM_THREADS: usize = 128; -pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, usize>) -> Vec<&'a [u8]> { - if piece.len() == 1 { - return vec![piece]; - } - _byte_pair_merge(piece, ranks, |p| &piece[p.start..p.end]) -} +#[cfg(not(feature = "multithreading"))] +const MAX_NUM_THREADS: usize = 1; // Various performance notes: // @@ -177,9 +78,7 @@ fn hash_current_thread() -> usize { u64::from(x) as usize } -const MAX_NUM_THREADS: usize = 128; -#[pyclass] -struct CoreBPE { +pub struct CoreBPENative { encoder: HashMap, usize>, special_tokens_encoder: HashMap, decoder: HashMap>, @@ -189,7 +88,7 @@ struct CoreBPE { sorted_token_bytes: Vec>, } -impl CoreBPE { +impl CoreBPENative { fn _get_tl_regex(&self) -> &Regex { // See performance notes above for what this is about // It's also a little janky, please make a better version of it! 
@@ -201,7 +100,7 @@ impl CoreBPE { &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS] } - fn _decode_native(&self, tokens: &[usize]) -> Vec { + pub fn _decode_native(&self, tokens: &[usize]) -> Vec { let mut ret = Vec::with_capacity(tokens.len() * 2); for token in tokens { let token_bytes = self @@ -213,7 +112,7 @@ impl CoreBPE { ret } - fn _encode_ordinary_native(&self, text: &str) -> Vec { + pub fn _encode_ordinary_native(&self, text: &str) -> Vec { // This is the core of the encoding logic; the other functions in here // just make things complicated :-) let regex = self._get_tl_regex(); @@ -224,12 +123,13 @@ impl CoreBPE { ret.push(*token); continue; } - ret.extend(&byte_pair_encode(piece, &self.encoder)); + ret.extend(&util::byte_pair_encode(piece, &self.encoder)); } ret } - fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec, usize) { + pub fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>, max_tokens: Option) -> (Vec, usize, usize) { + let max_tokens = max_tokens.unwrap_or(usize::MAX); let special_regex = self._get_tl_special_regex(); let regex = self._get_tl_regex(); let mut ret = vec![]; @@ -260,11 +160,20 @@ impl CoreBPE { if let Some(token) = self.encoder.get(piece) { last_piece_token_len = 1; ret.push(*token); + + if ret.len() >= max_tokens { + return (ret, last_piece_token_len, start); + } continue; } - let tokens = byte_pair_encode(piece, &self.encoder); + let tokens = util::byte_pair_encode(piece, &self.encoder); last_piece_token_len = tokens.len(); - ret.extend(&tokens); + for token in tokens { + ret.push(token); + if ret.len() >= max_tokens { + return (ret, last_piece_token_len, start); + } + } } match next_special { @@ -273,8 +182,12 @@ impl CoreBPE { let piece = m.as_str(); let token = self.special_tokens_encoder[piece]; ret.push(token); + start = m.end(); last_piece_token_len = 0; + if ret.len() >= max_tokens { + return (ret, last_piece_token_len, start); + } } None => break, } @@ -282,7 
+195,32 @@ impl CoreBPE { // last_piece_token_len is how many tokens came from the last regex split. This is used // for determining unstable tokens, since you can't merge across (stable) regex splits - (ret, last_piece_token_len) + (ret, last_piece_token_len, start) + } + + pub fn _encode_bytes(&self, bytes: &[u8]) -> Vec { + match std::str::from_utf8(bytes) { + Ok(text) => self._encode_ordinary_native(text), + Err(e) => { + let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; + let (tokens, last_piece_token_len, _) = self._encode_native(text, &HashSet::new(), None); + let (mut tokens, last_piece_token_len) = + self._increase_last_piece_token_len(tokens, last_piece_token_len); + if !tokens.is_empty() && last_piece_token_len > 0 { + // Lop off the tokens from the last piece and run BPE on the remaining bytes + // Somewhat niche, but this may not be correct if we'd have had a regex + // split between the valid UTF-8 and the invalid bytes, which is why this + // method is private + let mut unstable_bytes = + self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); + unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); + + tokens.truncate(tokens.len() - last_piece_token_len); + tokens.extend(util::byte_pair_encode(&unstable_bytes, &self.encoder)); + } + tokens + } + } } fn _increase_last_piece_token_len( @@ -324,12 +262,12 @@ impl CoreBPE { (tokens, last_piece_token_len) } - fn _encode_unstable_native( + pub fn _encode_unstable_native( &self, text: &str, allowed_special: &HashSet<&str>, ) -> (Vec, HashSet>) { - let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special); + let (tokens, last_piece_token_len, _) = self._encode_native(text, allowed_special, None); if last_piece_token_len == 0 { // If last_piece_token_len is zero, the last token was a special token and we have // no unstable bytes @@ -392,7 +330,7 @@ impl CoreBPE { // would be a regex split before the UTF-8 truncation point. 
// Probably niche enough that no one will ever notice (after all, people didn't // notice all the big holes in the previous unstable token implementation) - Err(_) => byte_pair_encode(&possibility, &self.encoder), + Err(_) => util::byte_pair_encode(&possibility, &self.encoder), // Something like the following is intriguing but incorrect: // Err(e) => self._encode_ordinary_native(unsafe { // std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()]) @@ -425,11 +363,11 @@ impl CoreBPE { if unstable_bytes.len() - last_decoded.1 > 0 && last_decoded.0.map_or(false, |c| c.is_whitespace()) { - let mut reencoded = byte_pair_encode( + let mut reencoded = util::byte_pair_encode( &unstable_bytes[..unstable_bytes.len() - last_decoded.1], &self.encoder, ); - reencoded.extend(byte_pair_encode( + reencoded.extend(util::byte_pair_encode( &unstable_bytes[unstable_bytes.len() - last_decoded.1..], &self.encoder, )); @@ -439,108 +377,8 @@ impl CoreBPE { (tokens, completions) } -} - -#[pymethods] -impl CoreBPE { - #[new] - fn new( - encoder: HashMap, usize>, - special_tokens_encoder: HashMap, - pattern: &str, - ) -> PyResult { - let regex = Regex::new(pattern) - .map_err(|e| PyErr::new::(e.to_string()))?; - - let special_regex = { - let _parts = special_tokens_encoder - .keys() - .map(|s| fancy_regex::escape(s)) - .collect::>(); - Regex::new(&_parts.join("|")) - .map_err(|e| PyErr::new::(e.to_string()))? 
- }; - - let decoder: HashMap> = - encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); - - assert!(encoder.len() == decoder.len()); - - let special_tokens_decoder: HashMap> = special_tokens_encoder - .iter() - .map(|(k, v)| (*v, k.as_bytes().to_vec())) - .collect(); - - // Clone because I don't know how to tell Rust I'm not going to change the map - let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); - sorted_token_bytes.sort(); - - Ok(CoreBPE { - encoder, - special_tokens_encoder, - decoder, - special_tokens_decoder, - regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), - special_regex_tls: (0..MAX_NUM_THREADS) - .map(|_| special_regex.clone()) - .collect(), - sorted_token_bytes, - }) - } - - // ==================== - // Encoding - // ==================== - - fn encode_ordinary(&self, py: Python, text: &str) -> Vec { - py.allow_threads(|| self._encode_ordinary_native(text)) - } - - fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec { - py.allow_threads(|| self._encode_native(text, &allowed_special).0) - } - fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec { - py.allow_threads(|| { - match std::str::from_utf8(bytes) { - Ok(text) => self._encode_ordinary_native(text), - Err(e) => { - let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; - let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new()); - let (mut tokens, last_piece_token_len) = - self._increase_last_piece_token_len(tokens, last_piece_token_len); - if !tokens.is_empty() && last_piece_token_len > 0 { - // Lop off the tokens from the last piece and run BPE on the remaining bytes - // Somewhat niche, but this may not be correct if we'd have had a regex - // split between the valid UTF-8 and the invalid bytes, which is why this - // method is private - let mut unstable_bytes = - self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); - 
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); - - tokens.truncate(tokens.len() - last_piece_token_len); - tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder)); - } - tokens - } - } - }) - } - - fn encode_with_unstable( - &self, - py: Python, - text: &str, - allowed_special: HashSet<&str>, - ) -> Py { - let (tokens, completions) = - py.allow_threads(|| self._encode_unstable_native(text, &allowed_special)); - let py_completions = - PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..]))); - (tokens, py_completions).into_py(py) - } - - fn encode_single_token(&self, piece: &[u8]) -> PyResult { + pub fn encode_single_token(&self, piece: &[u8]) -> Result> { if let Some(token) = self.encoder.get(piece).copied() { return Ok(token); } @@ -549,66 +387,80 @@ impl CoreBPE { return Ok(token); } } - Err(PyErr::new::(piece.to_owned())) + Err(piece.to_owned()) } fn encode_single_piece(&self, piece: &[u8]) -> Vec { if let Some(token) = self.encoder.get(piece) { return vec![*token]; } - byte_pair_encode(piece, &self.encoder) + util::byte_pair_encode(piece, &self.encoder) } // ==================== // Decoding // ==================== - fn decode_bytes(&self, py: Python, tokens: Vec) -> Py { - let bytes = py.allow_threads(|| self._decode_native(&tokens)); - PyBytes::new(py, &bytes).into() - } - - fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult> { + pub fn decode_single_token_bytes(&self, token: usize) -> Result<&[u8], String> { if let Some(bytes) = self.decoder.get(&token) { - return Ok(PyBytes::new(py, bytes).into()); + return Ok(bytes); } if let Some(bytes) = self.special_tokens_decoder.get(&token) { - return Ok(PyBytes::new(py, bytes).into()); + return Ok(bytes); } - Err(PyErr::new::(token.to_string())) + Err(token.to_string()) } // ==================== // Miscellaneous // ==================== - fn token_byte_values(&self, py: Python) -> Vec> { - self.sorted_token_bytes - .iter() - .map(|x| PyBytes::new(py, 
x).into()) - .collect() + pub fn token_byte_values(&self) -> &Vec> { + &self.sorted_token_bytes } -} -#[pymodule] -fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - Ok(()) -} + pub fn new( + encoder: HashMap, usize>, + special_tokens_encoder: HashMap, + pattern: &str, + ) -> Result { + let regex = Regex::new(pattern)?; + // .map_err(|e| PyErr::new::(e.to_string()))?; + + let special_regex = { + let _parts = special_tokens_encoder + .keys() + .map(|s| fancy_regex::escape(s)) + .collect::>(); + Regex::new(&_parts.join("|"))? -#[cfg(test)] -mod tests { - use rustc_hash::FxHashMap as HashMap; + // .map_err(|e| PyErr::new::(e.to_string()))? + }; + + let decoder: HashMap> = + encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); - use crate::byte_pair_split; + assert!(encoder.len() == decoder.len()); + + let special_tokens_decoder: HashMap> = special_tokens_encoder + .iter() + .map(|(k, v)| (*v, k.as_bytes().to_vec())) + .collect(); - #[test] - fn very_simple_test() { - let mut ranks = HashMap::default(); - ranks.insert(b"ab".to_vec(), 1); - ranks.insert(b"cd".to_vec(), 2); + // Clone because I don't know how to tell Rust I'm not going to change the map + let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); + sorted_token_bytes.sort(); - let res = byte_pair_split(b"abcd", &ranks); - assert_eq!(res, vec![b"ab", b"cd"]); + Ok(CoreBPENative { + encoder, + special_tokens_encoder, + decoder, + special_tokens_decoder, + regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), + special_regex_tls: (0..MAX_NUM_THREADS) + .map(|_| special_regex.clone()) + .collect(), + sorted_token_bytes, + }) } -} +} \ No newline at end of file diff --git a/core/src/load.rs b/core/src/load.rs new file mode 100644 index 00000000..975f5fcd --- /dev/null +++ b/core/src/load.rs @@ -0,0 +1,168 @@ + +use rustc_hash::FxHashMap as HashMap; +use std::{env, path::PathBuf}; +use sha1::{Sha1, Digest}; +use std::error::Error; +use json; + +type Result 
= std::result::Result>; + +fn read_file(blobpath: &str) -> Result> { + // TODO: support blobs? + + if !(blobpath.starts_with("http") || blobpath.starts_with("https")) { + return Ok(std::fs::read(blobpath)?); + } + + Ok(reqwest::blocking::get(blobpath)?.bytes()?.to_vec()) +} + +fn get_tiktoken_cache_dir() -> PathBuf { + match env::var_os("TIKTOKEN_CACHE_DIR") { + Some(v) => PathBuf::from(v), + None => { + match env::var_os("DATA_GYM_CACHE_DIR") { + Some(v) => PathBuf::from(v), + None => { + let mut temp_dir = env::temp_dir(); + temp_dir.push("data-gym-cache"); + + temp_dir + } + } + } + } +} + +fn sha1_as_hex(s: &str) -> String { + let mut hasher = Sha1::new(); + hasher.update(s.as_bytes()); + let result = hasher.finalize(); + + format!("{:x}", result) +} + +fn read_file_cached(blobpath: &str) -> Result> { + let mut cache_path = get_tiktoken_cache_dir(); + + if !cache_path.exists() { + std::fs::create_dir_all(&cache_path)?; + } + + cache_path.push(sha1_as_hex(blobpath)); + + println!("cache_path: {:?}", cache_path); + + if cache_path.exists() { + let catch_path_str = cache_path.into_os_string().into_string() + .or(Err( { + // let cache_path_lossy_str = cache_path.to_string_lossy().to_string(); + // format!("Unable to convert path {cache_path_lossy_str}") + format!("Unable to convert path") + }))?; + return read_file(&catch_path_str); + } + + let content = read_file(blobpath)?; + + std::fs::write(cache_path, &content)?; + + Ok(content) +} + +fn is_printable(u: u8) -> bool { + // printable ascii characters according to python + !(u <= 31 || (u >= 127 && u <= 160) || u == 173) +} + +pub fn data_gym_to_mergeable_bpe_ranks(vocab_bpe_file: &str, encoder_json_file: &str) -> Result, usize>> { + let mut rank_to_intbyte = (0..=255) + .filter(|x| is_printable(*x) && (*x as char) != ' ') + .collect::>(); + + let mut data_gym_byte_to_byte = rank_to_intbyte + .iter() + .map(|&x| (x as u32, x)) + .collect::>(); + + let mut n = 0; + for b in 0..=255 { + if 
!rank_to_intbyte.contains(&b) { + rank_to_intbyte.push(b); + data_gym_byte_to_byte.insert(256 + n, b); + n += 1; + } + } + assert!(rank_to_intbyte.len() == 256); + + // vocab_bpe contains the merges along with associated ranks + let cached_vocab = read_file_cached(vocab_bpe_file)?; + let vocab_bpe_contents = std::str::from_utf8(&cached_vocab)? + .split("\n").collect::>(); + + let bpe_merges = match vocab_bpe_contents[1..(vocab_bpe_contents.len() - 1)] + .iter() + .map(|&s| s.split_whitespace()) + .map(|mut sp| match (sp.next(), sp.next()) { + (Some(a), Some(b)) => Some((a, b)), + _ => None, + }) + .collect::>>() + { + Some(v) => v, + None => return Err("Unable to parse vocab_bpe file".into()), + }; + + let decode_data_gym = + |value: &str| value.chars().map(|c| { + data_gym_byte_to_byte[&(c as u32)] + } ).collect::>(); + + // # add the single byte tokens + let mut bpe_ranks = + rank_to_intbyte + .iter() + .enumerate() + .map(|(i, b)| (vec![*b], i)) + .collect::, usize>>(); + + // add the merged tokens + let mut n = bpe_ranks.len(); + for (first, second) in bpe_merges { + bpe_ranks.insert([decode_data_gym(first), decode_data_gym(second)].concat(), n); + n += 1; + } + + // check that the encoder file matches the merges file + // this sanity check is important since tiktoken assumes that ranks are ordered the same + // as merge priority + let cached_encoder = read_file_cached(encoder_json_file)?; + let encoder_json = json::parse(&std::str::from_utf8(&cached_encoder)?)?; + + let mut encoder_json_loaded = encoder_json.entries() + .map(|(k, v)| (decode_data_gym(k), v.as_usize().unwrap())) + .collect::, usize>>(); + + // drop these two special tokens if present, since they're not mergeable bpe tokens + encoder_json_loaded.remove(&decode_data_gym("<|endoftext|>")); + encoder_json_loaded.remove(&decode_data_gym("<|startoftext|>")); + + assert!(bpe_ranks == encoder_json_loaded); + + Ok(bpe_ranks) +} + +pub fn load_tiktoken_bpe(tiktoken_bpe_file: &str) -> Result, usize>> { + 
use base64::{engine::general_purpose, Engine as _}; + + let content = read_file_cached(tiktoken_bpe_file)?; + + Ok(std::str::from_utf8(&content)? + .lines() + .filter(|s| s.len() > 0) + .map(|s| s.split_whitespace()) + .map(|mut sp| (sp.next().unwrap(), sp.next().unwrap())) + .map(|(first, second)| (general_purpose::STANDARD.decode(&first).unwrap(), second.parse::().unwrap())) + .collect::, usize>>()) +} + diff --git a/core/src/openai_public.rs b/core/src/openai_public.rs new file mode 100644 index 00000000..24e0ab99 --- /dev/null +++ b/core/src/openai_public.rs @@ -0,0 +1,125 @@ + +use rustc_hash::FxHashMap as HashMap; +use std::error::Error; +use std::sync::RwLock; +use json; + +#[path = "load.rs"] +mod load; + +type Result = std::result::Result>; + +lazy_static! { + pub static ref REGISTRY: HashMap = { + json::parse(include_str!("../../tiktoken/registry.json")) + .expect("Failed to parse internal JSON") + .entries() + .map(|(key, value)| { + let loading_strategy = if value.has_key("data_gym_to_mergeable_bpe_ranks") { + EncoderLoadingStrategy::DataGym( + DataGymDef { + vocab_bpe_file: value["data_gym_to_mergeable_bpe_ranks"]["vocab_bpe_file"].as_str().expect("error").into(), + encoder_json_file: value["data_gym_to_mergeable_bpe_ranks"]["encoder_json_file"].as_str().expect("error").into() + }) + } + else if value.has_key("load_tiktoken_bpe") { + EncoderLoadingStrategy::BPE(value["load_tiktoken_bpe"].as_str().expect("fail").into()) + } + else { + panic!("Invalid encoding"); + }; + + EncodingLazy::new( + key.into(), + value["explicit_n_vocab"].as_usize(), + value["pat_str"].as_str().expect("foo").into(), + value["special_tokens"].entries() + .map(|(key, value)| (key.into(), value.as_usize().expect("foo"))) + .collect::>(), + loading_strategy + ) + }) + + .map(|enc| (enc.name.clone(), enc)) + .collect::>() + }; + + pub static ref MODEL_TO_ENCODING: HashMap = + json::parse(include_str!("../../tiktoken/model_to_encoding.json")) + .expect("Failed to parse internal 
JSON") + .entries() + .map(|(k, v)| (k.into(), v.as_str().expect("foo").into())) + .collect::>(); +} + +#[derive(Clone, PartialEq, Eq, Hash)] +struct DataGymDef { + vocab_bpe_file: String, + encoder_json_file: String, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +enum EncoderLoadingStrategy { + BPE(String), + DataGym(DataGymDef), +} + +pub struct EncodingLazy { + name: String, + explicit_n_vocab: Option, + pub pat_str: String, + pub special_tokens: HashMap, + mergeable_ranks: RwLock, usize>>>, + loading_strategy: EncoderLoadingStrategy, +} + +fn load_bpe(path: &str) -> Result, usize>> { + load::load_tiktoken_bpe(path) +} + +fn load_data_gym(def: &DataGymDef) -> Result, usize>> { + load::data_gym_to_mergeable_bpe_ranks(&def.vocab_bpe_file, &def.encoder_json_file) +} + +// #[memoize] +fn load_mergeable_ranks(loading_strategy: &EncoderLoadingStrategy) -> Result, usize>> +{ + match loading_strategy { + EncoderLoadingStrategy::BPE(path) => load_bpe(&path), + EncoderLoadingStrategy::DataGym(def) => load_data_gym(&def), + } +} + +impl EncodingLazy { + fn new(name: String, + explicit_n_vocab: Option, + pat_str: String, + special_tokens: HashMap, + loading_strategy: EncoderLoadingStrategy) -> Self { + EncodingLazy { + name, + explicit_n_vocab, + pat_str, + special_tokens, + mergeable_ranks: RwLock::new(None), + loading_strategy + } + } + + pub fn get(&self) -> Result, usize>> { + { + let read = self.mergeable_ranks.read().unwrap(); + if read.is_some() { + return Ok(read.as_ref().unwrap().clone()); + } + } + + let mut write = self.mergeable_ranks.write().unwrap(); + *write = Some(load_mergeable_ranks(&self.loading_strategy)?); + + Ok(write.as_ref().unwrap().clone()) + } +} + + + diff --git a/core/src/util.rs b/core/src/util.rs new file mode 100644 index 00000000..b9605a18 --- /dev/null +++ b/core/src/util.rs @@ -0,0 +1,136 @@ +use rustc_hash::FxHashMap as HashMap; + +fn _byte_pair_merge( + piece: &[u8], + ranks: &HashMap, usize>, + f: impl Fn(std::ops::Range) -> T, +) -> Vec 
{ + // This is a vector of (start, rank). + // The rank is of the byte pair starting at position start. + // The rank of the last item in the vector is not a valid value. + let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect(); + + // NOTE: using a macro here because a closure fails to get inlined + // according to optimization remarks. + // A closure also cannot capture a reference to `piece` without + // the borrow checker complaining about the mutable borrows during + // the assignments later in this code. + macro_rules! get_rank { + ($start_idx:expr, $skip:expr) => {{ + let start_idx: usize = $start_idx; + let skip: usize = $skip; + if (start_idx + skip + 2) < parts.len() { + ranks + .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0]) + .map(|r| *r) + } else { + None + } + }}; + ($idx:expr) => {{ + get_rank!($idx, 0) + }}; + } + + // We look up the ranks once in the beggining and iteratively update + // them during each merge, which reduces the number of rank lookups. + for i in 0..parts.len() - 2 { + match get_rank!(i) { + Some(rank) => { + // usize::MAX is a sentinel value and cannot be a valid rank + debug_assert!(rank != usize::MAX); + parts[i].1 = rank; + } + None => { + continue; + } + }; + } + + // If you have n parts and m merges, this does O(mn) work. + // We could do something with a heap and do O(m log n) work. + // It is important to consider that n is often small (<100), and as such + // the cache-locality benefits outweigh the algorithmic complexity downsides + // of the `parts` vector data structure above. + + // Note that we hash bytes, not token pairs. As long as we train BPE the way we + // currently do, this is equivalent. An easy way to break this would be to decouple + // merge priority from token index or to prevent specific token merges. 
+ loop { + if parts.len() == 1 { + break; + } + + // usize::MAX is a sentinel rank value allowing us to + // take the min more quickly + let mut min_rank: (usize, usize) = (usize::MAX, 0); + for (i, &(_, rank)) in parts[..parts.len() - 1].iter().enumerate() { + if rank < min_rank.0 { + min_rank = (rank, i); + } + } + + if min_rank.0 != usize::MAX { + let i = min_rank.1; + + // NOTE: We are about to remove parts[i + 1]. We do not do it + // yet because there are cache-locality benefits to updating + // parts[i] and parts[i-1] before removing, which could thrash + // the cache. Thus, we update the rank calculation by skipping over + // parts[i + 1], by invoking `get_rank!` with `skip = 1`. + parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX); + if i > 0 { + parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX); + } + + parts.remove(i + 1); + } else { + break; + } + } + let mut out: Vec = Vec::with_capacity(parts.len() - 1); + for i in 0..parts.len() - 1 { + out.push(f(parts[i].0..parts[i + 1].0)); + } + out +} + +pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, usize>) -> Vec { + if piece.len() == 1 { + return vec![ranks[piece]]; + } + _byte_pair_merge(piece, ranks, |p| ranks[&piece[p.start..p.end]]) +} + +pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, usize>) -> Vec<&'a [u8]> { + if piece.len() == 1 { + return vec![piece]; + } + _byte_pair_merge(piece, ranks, |p| &piece[p.start..p.end]) +} + +#[cfg(test)] +mod tests { + use rustc_hash::FxHashMap as HashMap; + + use crate::util::_byte_pair_merge; + pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, usize>) -> Vec<&'a [u8]> { + if piece.len() == 1 { + return vec![piece]; + } + _byte_pair_merge(piece, ranks) + .iter() + .map(|p| &piece[p.start..p.end]) + .collect() + } + + #[test] + fn very_simple_test() { + let mut ranks = HashMap::default(); + ranks.insert(b"ab".to_vec(), 1); + ranks.insert(b"cd".to_vec(), 2); + + let res = byte_pair_split(b"abcd", &ranks); + assert_eq!(res, 
vec![b"ab", b"cd"]); + } +} \ No newline at end of file diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 00000000..61cbf01c --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,99 @@ + + + + 4.0.0 + + com.openai + tiktoken + 1.0-SNAPSHOT + + tiktoken + https://github.com/openai/tiktoken + jar + + + UTF-8 + 1.7 + 1.7 + + + + + junit + junit + 4.11 + test + + + org.scijava + native-lib-loader + 2.4.0 + + + + + + + ${project.basedir}/../natives/ + ${project.build.directory}/classes/natives/ + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.22.1 + + + + integration-test + verify + + + + + + + + diff --git a/java/src/main/java/tiktoken/Encoding.java b/java/src/main/java/tiktoken/Encoding.java new file mode 100644 index 00000000..1773225d --- /dev/null +++ b/java/src/main/java/tiktoken/Encoding.java @@ -0,0 +1,34 @@ +package tiktoken; + +import org.scijava.nativelib.NativeLoader; +import java.io.IOException; + +public class Encoding implements AutoCloseable +{ + static { + try { + // load from JAR + NativeLoader.loadLibrary("_tiktoken_jni"); + } + catch(IOException e) { + throw new RuntimeException(e); + } + } + + // initialized by init + private long handle; + + private native void init(String modelName); + + private native void destroy(); + + public native long[] encode(String text, String[] allowedSpecialTokens, long maxTokenLength); + + public Encoding(String modelName) { + this.init(modelName); + } + + public void close() throws Exception { + destroy(); + } +} diff --git a/java/src/test/java/tiktoken/EncodingTestIT.java b/java/src/test/java/tiktoken/EncodingTestIT.java new file mode 100644 index 00000000..602a1ef9 --- 
/dev/null +++ b/java/src/test/java/tiktoken/EncodingTestIT.java @@ -0,0 +1,21 @@ +package tiktoken; + +import static org.junit.Assert.assertArrayEquals; + +import org.junit.Test; + +// run test: mvn failsafe:integration-test +public class EncodingTestIT +{ + @Test + public void shouldAnswerWithTrue() throws Exception + { + Encoding encoding = new Encoding("text-davinci-001"); + + long[] a = encoding.encode("test", new String[0], 0); + + encoding.close(); + + assertArrayEquals(new long[] {9288}, a); + } +} diff --git a/jni/Cargo.toml b/jni/Cargo.toml new file mode 100644 index 00000000..4309eef4 --- /dev/null +++ b/jni/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tiktoken_jni" +version = "0.2.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "_tiktoken_jni" +crate-type = ["cdylib"] + +[dependencies] +tiktoken_core = { path = "../core", features = ["multithreading", "lazyload"] } +rustc-hash = "1.1.0" +jni = "0.20.0" + +[build-dependencies] +json = "0.12.4" diff --git a/jni/build.rs b/jni/build.rs new file mode 100644 index 00000000..9c866413 --- /dev/null +++ b/jni/build.rs @@ -0,0 +1,7 @@ +use json; + +fn main() { + json::parse(include_str!("../tiktoken/registry.json")).expect("Failed to parse internal JSON"); + json::parse(include_str!("../tiktoken/model_to_encoding.json")).expect("Failed to parse internal JSON"); + println!("JSON Parsing validated"); +} diff --git a/jni/src/lib.rs b/jni/src/lib.rs new file mode 100644 index 00000000..6bd99d6d --- /dev/null +++ b/jni/src/lib.rs @@ -0,0 +1,114 @@ +use std::collections::HashSet; +use std::sync::MutexGuard; + +use _tiktoken_core::openai_public::EncodingLazy; +use jni::JNIEnv; +// These objects are what you should use as arguments to your native +// function. They carry extra lifetime information to prevent them escaping +// this context and getting used after being GC'd. +use jni::objects::{JObject, JString}; + +// This is just a pointer. We'll be returning it from our function. 
We +// can't return one of the objects with lifetime information because the +// lifetime checker won't let us. +use jni::sys::{jarray, jlong}; + +use _tiktoken_core::{self, CoreBPENative}; + +type Result = std::result::Result>; + +fn unwrap_or_throw(env: &JNIEnv, result: Result, default: T) -> T { + // Check if an exception is already thrown + if env.exception_check().expect("exception_check() failed") { + return default; + } + + match result { + Ok(tokenizer) => tokenizer, + Err(error) => { + let exception_class = env + .find_class("java/lang/Exception") + .expect("Unable to find exception class"); + env.throw_new(exception_class, format!("{}", error)) + .expect("Unable to throw exception"); + default + } + } +} + +#[no_mangle] +pub extern "system" fn Java_tiktoken_Encoding_init(env: JNIEnv, obj: JObject, model_name: JString) { + let result = || -> Result<()> { + // First, we have to get the string out of Java. Check out the `strings` + // module for more info on how this works. + let model_name: String = env + .get_string(model_name)? 
+ .into(); + + let encoding_name = _tiktoken_core::openai_public::MODEL_TO_ENCODING + .get(&model_name).ok_or("Unable to find model")?; + + let encoding = _tiktoken_core::openai_public::REGISTRY + .get(encoding_name).ok_or("Unable to find encoding")?; + + let bpe_native = CoreBPENative::new( + encoding.get()?, + encoding.special_tokens.clone(), + &encoding.pat_str, + )?; + + Ok(unsafe { + env.set_rust_field(obj, "handle", bpe_native)?; + }) + }(); + + unwrap_or_throw(&env, result, ()) +} + +#[no_mangle] +pub extern "system" fn Java_tiktoken_Encoding_destroy(env: JNIEnv, obj: JObject) { + unsafe { + let _: CoreBPENative = env.take_rust_field(obj, "handle").expect("Unable to get handle during destruction"); + } +} + +#[no_mangle] +pub extern "system" fn Java_tiktoken_Encoding_encode( + env: JNIEnv, + obj: JObject, + text: JString, + allowed_special_tokens: jarray, + max_token_length: jlong, +) -> jarray { + let result = || -> Result { + let encoding: MutexGuard = unsafe { env.get_rust_field(obj, "handle")? }; + + let enc = encoding; + let input: String = env + .get_string(text)? 
+ .into(); + + let len = env.get_array_length(allowed_special_tokens)?; + let mut strings: Vec = Vec::with_capacity(len as usize); + for i in 0..len { + let element: JObject = env + .get_object_array_element(allowed_special_tokens, i)?; + let current: String = env.get_string(element.into())?.into(); + strings.push(current); + } + + let v2: HashSet<&str> = strings.iter().map(|s| &**s).collect(); + + let (tokens, _, _) = enc._encode_native(&input, &v2, Some(max_token_length as usize)); + + let output = env + .new_long_array(tokens.len().try_into()?)?; + + let array_of_u64 = tokens.iter().map(|x| *x as i64).collect::>(); + env.set_long_array_region(output, 0, array_of_u64.as_slice())?; + + Ok(output) + }(); + + unwrap_or_throw(&env, result, JObject::null().into_raw()) +} diff --git a/js/.gitignore b/js/.gitignore new file mode 100644 index 00000000..d27528eb --- /dev/null +++ b/js/.gitignore @@ -0,0 +1,3 @@ +# WASM +ranks/ +node_modules \ No newline at end of file diff --git a/js/Cargo.toml b/js/Cargo.toml new file mode 100644 index 00000000..c0698810 --- /dev/null +++ b/js/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "tiktoken_js" +version = "0.2.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "tiktoken" +crate-type = ["cdylib"] + +[dependencies] +tiktoken_core = { path = "../core", features = [] } +# tiktoken dependencies +fancy-regex = "0.10.0" +regex = "1.7.0" +rustc-hash = "1.1.0" +bstr = "1.0.1" +wasm-bindgen = "0.2.83" +anyhow = "1.0.69" +base64 = "0.21.0" +gloo-utils = { version = "0.1", features = ["serde"] } + +[features] +default = ["inline"] +inline = [] \ No newline at end of file diff --git a/js/README.md b/js/README.md new file mode 100644 index 00000000..1756d4a9 --- /dev/null +++ b/js/README.md @@ -0,0 +1,255 @@ +# ⏳ tiktoken + +tiktoken is a [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with +OpenAI's models, forked from the original tiktoken library to provide NPM bindings for Node and other JS runtimes. 
+ +The open source version of `tiktoken` can be installed from NPM: + +``` +npm install @dqbd/tiktoken +``` + +## Usage + +Basic usage follows, which includes all the OpenAI encoders and ranks: + +```typescript +import assert from "node:assert"; +import { get_encoding, encoding_for_model } from "@dqbd/tiktoken"; + +const enc = get_encoding("gpt2"); +assert( + new TextDecoder().decode(enc.decode(enc.encode("hello world"))) === + "hello world" +); + +// To get the tokeniser corresponding to a specific model in the OpenAI API: +const enc = encoding_for_model("text-davinci-003"); + +// Extend existing encoding with custom special tokens +const enc = encoding_for_model("gpt2", { + "<|im_start|>": 100264, + "<|im_end|>": 100265, +}); + +// don't forget to free the encoder after it is not used +enc.free(); +``` + +In constrained environments (eg. Edge Runtime, Cloudflare Workers), where you don't want to load all the encoders at once, you can use the lightweight WASM binary via `@dqbd/tiktoken/lite`. 
+ +```typescript +const { Tiktoken } = require("@dqbd/tiktoken/lite"); +const cl100k_base = require("@dqbd/tiktoken/encoders/cl100k_base.json"); + +const encoding = new Tiktoken( + cl100k_base.bpe_ranks, + cl100k_base.special_tokens, + cl100k_base.pat_str +); +const tokens = encoding.encode("hello world"); +encoding.free(); +``` + +If you want to fetch the latest ranks, use the `load` function: + +```typescript +const { Tiktoken } = require("@dqbd/tiktoken/lite"); +const { load } = require("@dqbd/tiktoken/load"); +const registry = require("@dqbd/tiktoken/registry.json"); +const models = require("@dqbd/tiktoken/model_to_encoding.json"); + +async function main() { + const model = await load(registry[models["gpt-3.5-turbo"]]); + const encoder = new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); + const tokens = encoder.encode("hello world"); + encoder.free(); +} + +main(); +``` + +If desired, you can create a Tiktoken instance directly with custom ranks, special tokens and regex pattern: + +```typescript +import { Tiktoken } from "../pkg"; +import { readFileSync } from "fs"; + +const encoder = new Tiktoken( + readFileSync("./ranks/gpt2.tiktoken").toString("utf-8"), + { "<|endoftext|>": 50256, "<|im_start|>": 100264, "<|im_end|>": 100265 }, + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" +); +``` + +Finally, you can use a custom `init` function to override the WASM initialization logic for non-Node environments. This is useful if you are using a bundler that does not support WASM ESM integration. 
+ +```typescript +import { get_encoding, init } from "@dqbd/tiktoken/init"; + +async function main() { + const wasm = "..."; // fetch the WASM binary somehow + await init((imports) => WebAssembly.instantiate(wasm, imports)); + + const encoding = get_encoding("cl100k_base"); + const tokens = encoding.encode("hello world"); + encoding.free(); +} + +main(); +``` + +## Compatibility + +As this is a WASM library, there might be some issues with specific runtimes. If you encounter any issues, please open an issue. + +| Runtime | Status | Notes | +| ------------------- | ------ | ------------------------------------------ | +| Node.js | ✅ | | +| Bun | ✅ | | +| Vite | ✅ | See [here](#vite) for notes | +| Next.js | ✅ | See [here](#nextjs) for notes | +| Vercel Edge Runtime | ✅ | See [here](#vercel-edge-runtime) for notes | +| Cloudflare Workers | ✅ | See [here](#cloudflare-workers) for notes | +| Deno | ❌ | Currently unsupported | + +### [Vite](#vite) + +If you are using Vite, you will need to add both the `vite-plugin-wasm` and `vite-plugin-top-level-await`. Add the following to your `vite.config.js`: + +```js +import wasm from "vite-plugin-wasm"; +import topLevelAwait from "vite-plugin-top-level-await"; +import { defineConfig } from "vite"; + +export default defineConfig({ + plugins: [wasm(), topLevelAwait()], +}); +``` + +### [Next.js](#nextjs) + +Both API routes and `/pages` are supported with the following `next.config.js` configuration. + +```typescript +// next.config.js +const config = { + webpack(config, { isServer, dev }) { + config.experiments = { + asyncWebAssembly: true, + layers: true, + }; + + return config; + }, +}; +``` + +Usage in pages: + +```tsx +import { get_encoding } from "@dqbd/tiktoken"; +import { useState } from "react"; + +const encoding = get_encoding("cl100k_base"); + +export default function Home() { + const [input, setInput] = useState("hello world"); + const tokens = encoding.encode(input); + + return ( +
+ setInput(e.target.value)} + /> +
{tokens.toString()}
+
+ ); +} +``` + +Usage in API routes: + +```typescript +import { get_encoding } from "@dqbd/tiktoken"; +import { NextApiRequest, NextApiResponse } from "next"; + +export default function handler(req: NextApiRequest, res: NextApiResponse) { + const encoding = get_encoding("cl100k_base"); + const tokens = encoding.encode("hello world"); + encoding.free(); + return res.status(200).json({ tokens }); +} +``` + +### [Vercel Edge Runtime](#vercel-edge-runtime) + +Vercel Edge Runtime does support WASM modules by adding a `?module` suffix. Initialize the encoder with the following snippet: + +```typescript +// @ts-expect-error +import wasm from "@dqbd/tiktoken/lite/tiktoken_bg.wasm?module"; +import model from "@dqbd/tiktoken/encoders/cl100k_base.json"; +import { init, Tiktoken } from "@dqbd/tiktoken/lite/init"; + +export const config = { runtime: "edge" }; + +export default async function (req: Request) { + await init((imports) => WebAssembly.instantiate(wasm, imports)); + + const encoding = new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); + + const tokens = encoding.encode("hello world"); + encoding.free(); + + return new Response(`${tokens}`); +} +``` + +### [Cloudflare Workers](#cloudflare-workers) + +Similar to Vercel Edge Runtime, Cloudflare Workers must import the WASM binary file manually and use the `@dqbd/tiktoken/lite` version to fit the 1 MB limit. However, users need to point directly at the WASM binary via a relative path (including `./node_modules/`). 
+ +Add the following rule to the `wrangler.toml` to upload WASM during build: + +```toml +[[rules]] +globs = ["**/*.wasm"] +type = "CompiledWasm" +``` + +Initialize the encoder with the following snippet: + +```javascript +import { init, Tiktoken } from "@dqbd/tiktoken/lite/init"; +import wasm from "./node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm"; +import model from "@dqbd/tiktoken/encoders/cl100k_base.json"; + +export default { + async fetch() { + await init((imports) => WebAssembly.instantiate(wasm, imports)); + const encoder = new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); + const tokens = encoder.encode("test"); + encoder.free(); + return new Response(`${tokens}`); + }, +}; +``` + +## Acknowledgements + +- https://github.com/zurawiki/tiktoken-rs diff --git a/js/package.json b/js/package.json new file mode 100644 index 00000000..d45178a1 --- /dev/null +++ b/js/package.json @@ -0,0 +1,29 @@ +{ + "name": "@dqbd/tiktoken", + "version": "1.0.2", + "description": "Javascript bindings for tiktoken", + "license": "MIT", + "scripts": { + "build": "run-s build:*", + "build:cleanup": "rm -rf dist/", + "build:rank": "tsx scripts/inline_ranks.ts", + "build:wasm": "run-s wasm:*", + "build:postprocess": "tsx scripts/post_process.ts", + "wasm:bundler": "wasm-pack build --target bundler --release --out-dir dist && rm -rf dist/.gitignore dist/README.md dist/package.json", + "wasm:lite": "wasm-pack build --target bundler --release --out-dir dist/lite --no-default-features && rm -rf dist/lite/.gitignore dist/lite/README.md dist/lite/package.json", + "test": "yarn vitest" + }, + "repository": { + "type": "git", + "url": "https://github.com/dqbd/tiktoken" + }, + "dependencies": {}, + "devDependencies": { + "@types/node": "^18.14.4", + "npm-run-all": "^4.1.5", + "ts-morph": "^17.0.1", + "tsx": "^3.12.3", + "typescript": "^4.9.5", + "vitest": "^0.28.5" + } +} diff --git a/js/scripts/inline_ranks.ts b/js/scripts/inline_ranks.ts new file mode 100644 
index 00000000..f508fd85 --- /dev/null +++ b/js/scripts/inline_ranks.ts @@ -0,0 +1,107 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { load } from "../src/load"; + +function compress_tiktoken_bpe(tiktoken_bpe_file: string) { + const original = tiktoken_bpe_file + .split("\n") + .map((line) => line.trim() && line.split(" ")) + .filter((x): x is Array => !!x && Array.isArray(x)) + .map(([token, rank]) => [token, Number.parseInt(rank, 10)] as const) + .sort((a, b) => a[1] - b[1]); + + const newTokens = original.reduce< + Array<{ offset: number; tokens: string[] }> + >((memo, item) => { + if (memo.length === 0) return [{ offset: item[1], tokens: [item[0]] }]; + const lastSplit = memo[memo.length - 1]; + const nextOffset = lastSplit.offset + lastSplit.tokens.length; + + if (nextOffset === item[1]) { + lastSplit.tokens.push(item[0]); + return memo; + } + + return [...memo, { offset: item[1], tokens: [item[0]] }]; + }, []); + + const compressed = newTokens + .map((x) => `! 
${x.offset} ${x.tokens.join(" ")}`) + .join("\n"); + + // make sure the compressed and the original files are the same + const tiktokenOld = compressed + .split("\n") + .filter(Boolean) + .reduce>((memo, x) => { + const [_, offsetStr, ...tokens] = x.split(" "); + const offset = Number.parseInt(offsetStr, 10); + tokens.forEach((token, i) => (memo[token] = offset + i)); + return memo; + }, {}); + + function normalize_map(items: Record) { + return JSON.stringify( + Object.keys(items) + .sort() + .map((key) => [key, items[key]]) + ); + } + + if ( + normalize_map(tiktokenOld) !== normalize_map(Object.fromEntries(original)) + ) { + throw new Error("Invalid compression"); + } + + return compressed; +} + +async function main() { + try { + await fs.mkdir(path.resolve(__dirname, "../ranks"), { recursive: true }); + } catch {} + + const registry = JSON.parse( + await fs.readFile(path.resolve(__dirname, "../../tiktoken/registry.json"), { + encoding: "utf-8", + }) + ); + + for (const name in registry) { + console.log(name); + const data = registry[name]; + + const tiktokenFile = path.resolve(__dirname, `../ranks/${name}.tiktoken`); + const tiktokenCompressedFile = path.resolve( + __dirname, + `../ranks/${name}.compress.tiktoken` + ); + const jsonFile = path.resolve(__dirname, `../ranks/${name}.json`); + + try { + await Promise.all([ + fs.stat(tiktokenFile), + fs.stat(jsonFile), + fs.stat(tiktokenCompressedFile), + ]); + continue; + } catch {} + + const result = await load(data); + await fs.writeFile(tiktokenFile, result.bpe_ranks, { encoding: "utf-8" }); + + const compress = compress_tiktoken_bpe(result.bpe_ranks); + await fs.writeFile(tiktokenCompressedFile, compress, { + encoding: "utf-8", + }); + + await fs.writeFile( + jsonFile, + JSON.stringify({ ...result, bpe_ranks: compress }), + { encoding: "utf-8" } + ); + } +} + +main(); diff --git a/js/scripts/post_process.ts b/js/scripts/post_process.ts new file mode 100644 index 00000000..5b6cd969 --- /dev/null +++ 
b/js/scripts/post_process.ts @@ -0,0 +1,296 @@ +import { Project, ScriptTarget, StructureKind, ts } from "ts-morph"; +import * as fs from "node:fs"; +import * as path from "node:path"; + +for (const baseDir of [ + path.resolve(__dirname, "../dist"), + path.resolve(__dirname, "../dist/lite"), +]) { + let publicExports: string[] = []; + // fix `any` types + { + const sourceFile = new Project().addSourceFileAtPath( + path.resolve(baseDir, "tiktoken.d.ts") + ); + const cls = sourceFile.getFirstDescendantByKindOrThrow( + ts.SyntaxKind.ClassDeclaration + ); + + cls + .getConstructors()[0] + .getParameterOrThrow("special_tokens") + .set({ type: "Record" }); + + for (const method of ["encode", "encode_with_unstable"]) { + cls + .getMethodOrThrow(method) + .getParameterOrThrow("allowed_special") + .set({ type: `"all" | string[]`, hasQuestionToken: true }); + + cls + .getMethodOrThrow(method) + .getParameterOrThrow("disallowed_special") + .set({ type: `"all" | string[]`, hasQuestionToken: true }); + } + + cls + .getMemberOrThrow("token_byte_values") + .set({ returnType: "Array>" }); + + publicExports = sourceFile + .getExportSymbols() + .filter((sym) => + sym + .getDeclarations() + .some( + (dcl) => + dcl.isKind(ts.SyntaxKind.ClassDeclaration) || + dcl.isKind(ts.SyntaxKind.FunctionDeclaration) + ) + ) + .map((i) => i.getName()); + + sourceFile.saveSync(); + } + + // tiktoken_bg.cjs + { + const sourceFile = new Project().addSourceFileAtPath( + path.resolve(baseDir, "tiktoken_bg.js") + ); + + for (const cls of sourceFile.getClasses().filter((x) => x.isExported())) { + cls.set({ + ...cls.getStructure(), + kind: StructureKind.Class, + isExported: false, + }); + + sourceFile.insertStatements(cls.getChildIndex() + 1, [ + `module.exports.${cls.getName()} = ${cls.getName()};`, + ]); + } + + for (const fn of sourceFile.getFunctions().filter((f) => f.isExported())) { + fn.set({ + ...fn.getStructure(), + kind: StructureKind.Function, + isExported: false, + }); + + 
sourceFile.insertStatements(fn.getChildIndex(), [ + `module.exports.${fn.getName()} = ${fn.getText()};`, + ]); + + sourceFile + .getDescendantsOfKind(ts.SyntaxKind.FunctionExpression) + .filter((x) => x.getName() === fn.getName()) + .forEach((f) => f.removeName()); + + fn.remove(); + } + + sourceFile + .copy(path.resolve(baseDir, "tiktoken_bg.cjs"), { overwrite: true }) + .saveSync(); + } + + // tiktoken.js + { + fs.writeFileSync( + path.resolve(baseDir, "tiktoken.cjs"), + [ + `const wasm = require("./tiktoken_bg.cjs");`, + `let imports = {};`, + `imports["./tiktoken_bg.js"] = wasm;`, + `const path = require("path").join(__dirname, "tiktoken_bg.wasm");`, + `const bytes = require("fs").readFileSync(path);`, + `const wasmModule = new WebAssembly.Module(bytes);`, + `const wasmInstance = new WebAssembly.Instance(wasmModule, imports);`, + `wasm.__wbg_set_wasm(wasmInstance.exports);`, + ...publicExports.map((name) => `exports["${name}"] = wasm["${name}"];`), + ].join("\n"), + { encoding: "utf-8" } + ); + } + + // init.js and init.cjs + { + for (const module of [ts.ModuleKind.CommonJS, ts.ModuleKind.ES2022]) { + const sourceFile = new Project({ + compilerOptions: { + target: ScriptTarget.ES2022, + module, + moduleResolution: ts.ModuleResolutionKind.NodeJs, + strict: true, + declaration: true, + }, + }).addSourceFileAtPath(path.resolve(__dirname, "../src/init.ts")); + + const emitOutput = sourceFile.getEmitOutput(); + for (const file of emitOutput.getOutputFiles()) { + let targetFile = path.basename(file.getFilePath()); + + let source = file.getText(); + if (module === ts.ModuleKind.CommonJS) { + targetFile = targetFile.replace(".js", ".cjs"); + source = source + .replaceAll(`"./tiktoken_bg"`, `"./tiktoken_bg.cjs"`) + .replaceAll( + `exports.init = init;`, + `exports.init = init;\n${publicExports + .map((name) => `exports["${name}"] = imports["${name}"];`) + .join("\n")}` + ); + } + + fs.writeFileSync(path.resolve(baseDir, targetFile), source, { + encoding: "utf-8", + }); 
+ } + } + } + + // load.js and load.cjs + { + for (const module of [ts.ModuleKind.CommonJS, ts.ModuleKind.ES2022]) { + const sourceFile = new Project({ + compilerOptions: { + target: ScriptTarget.ES2022, + module, + moduleResolution: ts.ModuleResolutionKind.NodeJs, + strict: true, + declaration: true, + }, + }).addSourceFileAtPath(path.resolve(__dirname, "../src/load.ts")); + + const emitOutput = sourceFile.getEmitOutput(); + for (const file of emitOutput.getOutputFiles()) { + let targetFile = path.basename(file.getFilePath()); + + if (module === ts.ModuleKind.CommonJS) { + targetFile = targetFile.replace(".js", ".cjs"); + } + + fs.writeFileSync(path.resolve(baseDir, targetFile), file.getText(), { + encoding: "utf-8", + }); + } + } + } + + // tiktoken_bg.d.ts + { + fs.writeFileSync( + path.resolve(baseDir, "tiktoken_bg.d.ts"), + `export * from "./tiktoken";`.trim(), + { encoding: "utf-8" } + ); + } + + if (!baseDir.includes("/lite")) { + fs.writeFileSync( + path.resolve(baseDir, "lite.d.ts"), + `export * from "./lite/tiktoken";`.trim(), + { encoding: "utf-8" } + ); + } +} + +// package.json, README.md +{ + const pkg = JSON.parse( + fs.readFileSync(path.resolve(__dirname, "../package.json"), { + encoding: "utf-8", + }) + ); + + delete pkg.devDependencies; + delete pkg.scripts; + pkg.files = ["**/*"]; + + pkg["main"] = "tiktoken.cjs"; + pkg["types"] = "tiktoken.d.ts"; + pkg["exports"] = { + ".": { + types: "./tiktoken.d.ts", + node: "./tiktoken.cjs", + default: "./tiktoken.js", + }, + "./init": { + types: "./init.d.ts", + node: "./init.cjs", + default: "./init.js", + }, + "./load": { + types: "./load.d.ts", + node: "./load.cjs", + default: "./load.js", + }, + "./tiktoken_bg.wasm": { + types: "./tiktoken_bg.wasm.d.ts", + default: "./tiktoken_bg.wasm", + }, + "./lite": { + types: "./lite/tiktoken.d.ts", + node: "./lite/tiktoken.cjs", + default: "./lite/tiktoken.js", + }, + "./lite/init": { + types: "./lite/init.d.ts", + node: "./lite/init.cjs", + default: 
"./lite/init.js", + }, + "./lite/load": { + types: "./lite/load.d.ts", + node: "./lite/load.cjs", + default: "./lite/load.js", + }, + "./lite/tiktoken_bg.wasm": { + types: "./lite/tiktoken_bg.wasm.d.ts", + default: "./lite/tiktoken_bg.wasm", + }, + "./model_to_encoding.json": "./model_to_encoding.json", + "./registry.json": "./registry.json", + }; + + const registry = JSON.parse( + fs.readFileSync(path.resolve(__dirname, "../../tiktoken/registry.json"), { + encoding: "utf-8", + }) + ); + + fs.mkdirSync(path.resolve(__dirname, "../dist/encoders"), { + recursive: true, + }); + + for (const key in registry) { + fs.copyFileSync( + path.resolve(__dirname, `../ranks/${key}.json`), + path.resolve(__dirname, `../dist/encoders/${key}.json`) + ); + + pkg["exports"][`./encoders/${key}.json`] = `./encoders/${key}.json`; + } + + fs.copyFileSync( + path.resolve(__dirname, "../README.md"), + path.resolve(__dirname, "../dist/README.md") + ); + + fs.copyFileSync( + path.resolve(__dirname, "../../tiktoken/model_to_encoding.json"), + path.resolve(__dirname, "../dist/model_to_encoding.json") + ); + + fs.copyFileSync( + path.resolve(__dirname, "../../tiktoken/registry.json"), + path.resolve(__dirname, "../dist/registry.json") + ); + + fs.writeFileSync( + path.resolve(__dirname, "../dist/package.json"), + JSON.stringify(pkg, null, 2), + { encoding: "utf-8" } + ); +} diff --git a/js/scripts/tsconfig.json b/js/scripts/tsconfig.json new file mode 100644 index 00000000..55681af1 --- /dev/null +++ b/js/scripts/tsconfig.json @@ -0,0 +1,11 @@ +{ + "compilerOptions": { + "target": "ES2022", + "moduleResolution": "node", + "strict": true, + "declaration": true, + "allowSyntheticDefaultImports": true + }, + "include": ["./**/*.ts"], + "exclude": ["node_modules"] +} diff --git a/js/src/init.ts b/js/src/init.ts new file mode 100644 index 00000000..135099e7 --- /dev/null +++ b/js/src/init.ts @@ -0,0 +1,25 @@ +// @ts-expect-error +import * as imports from "./tiktoken_bg"; + +let isInitialized = 
false; +export async function init( + callback: ( + imports: WebAssembly.Imports + ) => Promise +): Promise { + if (isInitialized) return imports; + const result = await callback({ "./tiktoken_bg.js": imports }); + const instance = + "instance" in result && result.instance instanceof WebAssembly.Instance + ? result.instance + : result instanceof WebAssembly.Instance + ? result + : null; + if (instance == null) throw new Error("Missing instance"); + imports.__wbg_set_wasm(instance.exports); + isInitialized = true; + return imports; +} + +// @ts-expect-error +export * from "./tiktoken_bg"; diff --git a/js/src/lib.rs b/js/src/lib.rs new file mode 100644 index 00000000..e780f3ef --- /dev/null +++ b/js/src/lib.rs @@ -0,0 +1,448 @@ +use _tiktoken_core::CoreBPENative; +use anyhow::Error; +use base64::{engine::general_purpose, Engine as _}; +use fancy_regex::Regex; +use gloo_utils::format::JsValueSerdeExt; +use rustc_hash::FxHashMap as HashMap; +use std::collections::HashSet; +use std::result::Result; +use wasm_bindgen::prelude::*; + +#[cfg(feature = "inline")] +const ENDOFTEXT: &'static str = "<|endoftext|>"; + +#[cfg(feature = "inline")] +const FIM_PREFIX: &'static str = "<|fim_prefix|>"; + +#[cfg(feature = "inline")] +const FIM_MIDDLE: &'static str = "<|fim_middle|>"; + +#[cfg(feature = "inline")] +const FIM_SUFFIX: &'static str = "<|fim_suffix|>"; + +#[cfg(feature = "inline")] +const ENDOFPROMPT: &'static str = "<|endofprompt|>"; + +struct CoreBPEConstructor { + encoder: HashMap, usize>, + special_tokens: HashMap, + pat_str: String, +} + +impl CoreBPEConstructor { + fn new( + tiktoken_bfe: &str, + special_tokens: Option>, + pat_str: &str, + ) -> Self { + CoreBPEConstructor { + encoder: CoreBPEConstructor::parse_bfe(tiktoken_bfe).unwrap(), + special_tokens: special_tokens.unwrap_or_default(), + pat_str: String::from(pat_str), + } + } + + fn parse_bfe(tiktoken_bfe: &str) -> Result, usize>, Error> { + let mut encoder = HashMap::default(); + if 
tiktoken_bfe.chars().next().unwrap() == '!' { + for line in tiktoken_bfe.lines() { + let mut parts = line.split(' '); + parts.next().unwrap(); + + let offset: i32 = parts.next().unwrap().parse()?; + for (pos, token) in parts.enumerate() { + let token = &general_purpose::STANDARD.decode(token)?; + encoder.insert(token.clone(), (offset as usize) + pos); + } + } + } else { + for line in tiktoken_bfe.lines() { + let mut parts = line.split(' '); + let token = &general_purpose::STANDARD.decode(parts.next().unwrap())?; + let rank: usize = parts.next().unwrap().parse().unwrap(); + encoder.insert(token.clone(), rank); + } + } + + Ok(encoder) + } + + #[cfg(feature = "inline")] + fn gpt2() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../ranks/gpt2.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + #[cfg(feature = "inline")] + fn r50k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../ranks/r50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + #[cfg(feature = "inline")] + fn p50k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../ranks/p50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + #[cfg(feature = "inline")] + fn p50k_edit() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + special_tokens.insert(String::from(FIM_PREFIX), 50281); + 
special_tokens.insert(String::from(FIM_MIDDLE), 50282); + special_tokens.insert(String::from(FIM_SUFFIX), 50283); + + CoreBPEConstructor::new( + include_str!("../ranks/p50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + #[cfg(feature = "inline")] + fn cl100k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 100257); + special_tokens.insert(String::from(FIM_PREFIX), 100258); + special_tokens.insert(String::from(FIM_MIDDLE), 100259); + special_tokens.insert(String::from(FIM_SUFFIX), 100260); + special_tokens.insert(String::from(ENDOFPROMPT), 100276); + + CoreBPEConstructor::new( + include_str!("../ranks/cl100k_base.compress.tiktoken"), + Some(special_tokens), + "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + ) + } +} + +#[wasm_bindgen] +pub struct Tiktoken { + name: Option, + special_tokens_set: HashSet, + bpe: CoreBPENative, +} + +#[wasm_bindgen] +impl Tiktoken { + #[wasm_bindgen(constructor)] + pub fn new(tiktoken_bfe: &str, special_tokens: JsValue, pat_str: &str) -> Self { + let constructor = CoreBPEConstructor::new( + tiktoken_bfe, + special_tokens.into_serde::>().ok(), + pat_str, + ); + + Tiktoken { + name: None, + special_tokens_set: constructor + .special_tokens + .keys() + .map(|s| s.clone()) + .collect(), + bpe: CoreBPENative::new( + constructor.encoder, + constructor.special_tokens, + &constructor.pat_str, + ) + .unwrap(), + } + } + + #[cfg(feature = "inline")] + fn with_encoding( + encoding: &str, + extend_special_tokens: &Option>, + ) -> Result { + let mut constructor: CoreBPEConstructor = match encoding { + "gpt2" => Ok(CoreBPEConstructor::gpt2()), + "r50k_base" => Ok(CoreBPEConstructor::r50k_base()), + "p50k_base" => Ok(CoreBPEConstructor::p50k_base()), + "p50k_edit" => 
Ok(CoreBPEConstructor::p50k_edit()), + "cl100k_base" => Ok(CoreBPEConstructor::cl100k_base()), + &_ => Err(JsError::new("Invalid encoding")), + }?; + + if let Some(tokens) = extend_special_tokens { + constructor.special_tokens.extend(tokens.clone()); + } + + Ok(Tiktoken { + name: Some(String::from(encoding)), + // TODO: can we avoid cloning here? + special_tokens_set: constructor + .special_tokens + .keys() + .map(|s| s.clone()) + .collect(), + bpe: CoreBPENative::new( + constructor.encoder, + constructor.special_tokens, + &constructor.pat_str, + ) + .unwrap(), + }) + } + + #[wasm_bindgen(getter)] + pub fn name(&self) -> Option { + self.name.clone() + } + + pub fn encode( + &self, + text: &str, + allowed_special: JsValue, + disallowed_special: JsValue, + ) -> Result, JsError> { + let allowed_tokens = + self.validate_allowed_tokens(text, &allowed_special, &disallowed_special)?; + + Ok(self + .bpe + ._encode_native( + &text, + &allowed_tokens.iter().map(AsRef::as_ref).collect(), + None, + ) + .0) + } + + pub fn encode_ordinary(&self, text: &str) -> Vec { + self.bpe._encode_ordinary_native(&text) + } + + pub fn encode_with_unstable( + &self, + text: &str, + allowed_special: JsValue, + disallowed_special: JsValue, + ) -> Result { + let allowed_tokens = + self.validate_allowed_tokens(text, &allowed_special, &disallowed_special)?; + + JsValue::from_serde( + &self.bpe._encode_unstable_native( + &text, + &allowed_tokens.iter().map(AsRef::as_ref).collect(), + ), + ) + .map_err(|e| { + JsError::new(&format!( + "Failed to serialize encode_with_unstable result: {}", + e + )) + }) + } + + pub fn encode_single_token(&self, bytes: &[u8]) -> usize { + self.bpe.encode_single_token(&bytes).unwrap_throw() + } + + pub fn decode(&self, tokens: Vec) -> Vec { + self.bpe._decode_native(&tokens) + } + + pub fn decode_single_token_bytes(&self, token: usize) -> Vec { + self.bpe + .decode_single_token_bytes(token) + .unwrap_throw() + .to_vec() + } + + pub fn token_byte_values(&self) -> 
JsValue { + JsValue::from_serde(&self.bpe.token_byte_values()).unwrap_throw() + } + + fn validate_allowed_tokens( + &self, + text: &str, + allowed_special_param: &JsValue, + disallowed_special_param: &JsValue, + ) -> Result, JsError> { + let allowed_special: HashSet = match allowed_special_param.as_string() { + Some(value) => match value.as_str() { + "all" => Ok(self.special_tokens_set.clone()), + _ => Err(JsError::new("Invalid value for allowed_special")), + }, + _ => Ok(JsValue::into_serde(&allowed_special_param).unwrap_or_default()), + }?; + + let disallowed_special = JsValue::into_serde::>(&disallowed_special_param) + .or_else(|_| { + match disallowed_special_param + .as_string() + .unwrap_or(String::from("all")) + .as_str() + { + "all" => Ok(&self.special_tokens_set - &allowed_special), + _ => Err(JsError::new("Invalid value for disallowed_special")), + } + })?; + + if !disallowed_special.is_empty() { + if let Some(found) = Tiktoken::special_token_regex(&disallowed_special).find(text)? 
{ + return Err(JsError::new(&format!( + "The text contains a special token that is not allowed: {}", + found.as_str() + ))); + } + } + + return Ok(allowed_special); + } + + fn special_token_regex(tokens: &HashSet) -> Regex { + let inner = tokens + .iter() + .map(|token| regex::escape(token)) + .collect::>() + .join("|"); + + Regex::new(&format!("({})", inner)).unwrap_throw() + } +} + +#[cfg(feature = "inline")] +#[wasm_bindgen(typescript_custom_section)] +const _: &'static str = r#" +export type TiktokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "p50k_edit" | "cl100k_base"; + +/** + * @param {TiktokenEncoding} encoding + * @param {Record} [extend_special_tokens] + * @returns {Tiktoken} + */ +export function get_encoding(encoding: TiktokenEncoding, extend_special_tokens?: Record): Tiktoken; +"#; + +#[cfg(feature = "inline")] +#[wasm_bindgen(skip_typescript)] +pub fn get_encoding(encoding: &str, extend_special_tokens: JsValue) -> Result { + Tiktoken::with_encoding( + encoding, + &extend_special_tokens + .into_serde::>() + .ok(), + ) +} + +#[cfg(feature = "inline")] +#[wasm_bindgen(typescript_custom_section)] +const _: &'static str = r#" +export type TiktokenModel = + | "text-davinci-003" + | "text-davinci-002" + | "text-davinci-001" + | "text-curie-001" + | "text-babbage-001" + | "text-ada-001" + | "davinci" + | "curie" + | "babbage" + | "ada" + | "code-davinci-002" + | "code-davinci-001" + | "code-cushman-002" + | "code-cushman-001" + | "davinci-codex" + | "cushman-codex" + | "text-davinci-edit-001" + | "code-davinci-edit-001" + | "text-embedding-ada-002" + | "text-similarity-davinci-001" + | "text-similarity-curie-001" + | "text-similarity-babbage-001" + | "text-similarity-ada-001" + | "text-search-davinci-doc-001" + | "text-search-curie-doc-001" + | "text-search-babbage-doc-001" + | "text-search-ada-doc-001" + | "code-search-babbage-code-001" + | "code-search-ada-code-001" + | "gpt2" + | "gpt-4" + | "gpt-4-32k" + | "gpt-3.5-turbo" + | "gpt-3.5-turbo-0301"; + 
+/** + * @param {TiktokenModel} encoding + * @param {Record} [extend_special_tokens] + * @returns {Tiktoken} + */ +export function encoding_for_model(model: TiktokenModel, extend_special_tokens?: Record): Tiktoken; +"#; + +#[cfg(feature = "inline")] +#[wasm_bindgen(skip_typescript)] +pub fn encoding_for_model( + model: &str, + extend_special_tokens: JsValue, +) -> Result { + let encoding = match model { + "text-davinci-003" => Ok("p50k_base"), + "text-davinci-002" => Ok("p50k_base"), + "text-davinci-001" => Ok("r50k_base"), + "text-curie-001" => Ok("r50k_base"), + "text-babbage-001" => Ok("r50k_base"), + "text-ada-001" => Ok("r50k_base"), + "davinci" => Ok("r50k_base"), + "curie" => Ok("r50k_base"), + "babbage" => Ok("r50k_base"), + "ada" => Ok("r50k_base"), + "code-davinci-002" => Ok("p50k_base"), + "code-davinci-001" => Ok("p50k_base"), + "code-cushman-002" => Ok("p50k_base"), + "code-cushman-001" => Ok("p50k_base"), + "davinci-codex" => Ok("p50k_base"), + "cushman-codex" => Ok("p50k_base"), + "text-davinci-edit-001" => Ok("p50k_edit"), + "code-davinci-edit-001" => Ok("p50k_edit"), + "text-embedding-ada-002" => Ok("cl100k_base"), + "text-similarity-davinci-001" => Ok("r50k_base"), + "text-similarity-curie-001" => Ok("r50k_base"), + "text-similarity-babbage-001" => Ok("r50k_base"), + "text-similarity-ada-001" => Ok("r50k_base"), + "text-search-davinci-doc-001" => Ok("r50k_base"), + "text-search-curie-doc-001" => Ok("r50k_base"), + "text-search-babbage-doc-001" => Ok("r50k_base"), + "text-search-ada-doc-001" => Ok("r50k_base"), + "code-search-babbage-code-001" => Ok("r50k_base"), + "code-search-ada-code-001" => Ok("r50k_base"), + "gpt2" => Ok("gpt2"), + "gpt-3.5-turbo" => Ok("cl100k_base"), + "gpt-3.5-turbo-0301" => Ok("cl100k_base"), + "gpt-4" => Ok("cl100k_base"), + "gpt-4-32k" => Ok("cl100k_base"), + model => Err(JsError::new( + format!("Invalid model: {}", model.to_string()).as_str(), + )), + }?; + + Tiktoken::with_encoding( + encoding, + &extend_special_tokens 
+ .into_serde::>() + .ok(), + ) +} diff --git a/js/src/load.ts b/js/src/load.ts new file mode 100644 index 00000000..e4634006 --- /dev/null +++ b/js/src/load.ts @@ -0,0 +1,240 @@ +/** +Copyright (c) 2014 Jameson Little + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + */ +const lookup: string[] = []; +const revLookup: number[] = []; + +const code = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +for (var i = 0, len = code.length; i < len; ++i) { + lookup[i] = code[i]; + revLookup[code.charCodeAt(i)] = i; +} + +// Support decoding URL-safe base64 strings, as Node.js does. 
+// See: https://en.wikipedia.org/wiki/Base64#URL_applications +revLookup["-".charCodeAt(0)] = 62; +revLookup["_".charCodeAt(0)] = 63; + +function tripletToBase64(num: number) { + return ( + lookup[(num >> 18) & 0x3f] + + lookup[(num >> 12) & 0x3f] + + lookup[(num >> 6) & 0x3f] + + lookup[num & 0x3f] + ); +} + +function encodeChunk(uint8: number[], start: number, end: number) { + var tmp; + var output = []; + for (var i = start; i < end; i += 3) { + tmp = + ((uint8[i] << 16) & 0xff0000) + + ((uint8[i + 1] << 8) & 0xff00) + + (uint8[i + 2] & 0xff); + output.push(tripletToBase64(tmp)); + } + return output.join(""); +} + +function fromByteArray(uint8: number[]) { + var tmp; + var len = uint8.length; + var extraBytes = len % 3; // if we have 1 byte left, pad 2 bytes + var parts = []; + var maxChunkLength = 16383; // must be multiple of 3 + + // go through the array every three bytes, we'll deal with trailing stuff later + for (var i = 0, len2 = len - extraBytes; i < len2; i += maxChunkLength) { + parts.push( + encodeChunk( + uint8, + i, + i + maxChunkLength > len2 ? 
len2 : i + maxChunkLength + ) + ); + } + + // pad the end with zeros, but make sure to not forget the extra bytes + if (extraBytes === 1) { + tmp = uint8[len - 1]; + parts.push(lookup[tmp >> 2] + lookup[(tmp << 4) & 0x3f] + "=="); + } else if (extraBytes === 2) { + tmp = (uint8[len - 2] << 8) + uint8[len - 1]; + parts.push( + lookup[tmp >> 10] + + lookup[(tmp >> 4) & 0x3f] + + lookup[(tmp << 2) & 0x3f] + + "=" + ); + } + + return parts.join(""); +} + +function is_printable(u: number): boolean { + // printable ascii characters according to python + return !(u <= 31 || (u >= 127 && u <= 160) || u == 173); +} + +function data_gym_to_mergeable_bpe_ranks( + vocal_bpe_contents: string, + encoder_json_contents: string +) { + const rank_to_intbyte = Array.from({ length: 2 ** 8 }, (_, i) => i).filter( + (i) => is_printable(i) && String.fromCharCode(i) !== " " + ); + + const data_gym_byte_to_byte = rank_to_intbyte.reduce>( + (memo, item) => { + memo[String.fromCharCode(item)] = item; + return memo; + }, + {} + ); + + let n = 0; + for (let b = 0; b < 2 ** 8; b++) { + if (!rank_to_intbyte.includes(b)) { + rank_to_intbyte.push(b); + data_gym_byte_to_byte[String.fromCharCode(2 ** 8 + n)] = b; + n += 1; + } + } + + if (rank_to_intbyte.length !== 2 ** 8) { + throw new Error("rank_to_intbyte.length must be 2**8"); + } + + // vocab_bpe contains the merges along with associated ranks + const bpe_merges = vocal_bpe_contents + .split("\n") + .slice(1, -1) + .map((merge_str) => merge_str.split(" ")); + + function decode_data_gym(value: string) { + return value.split("").map((b) => data_gym_byte_to_byte[b]); + } + + // add the single byte tokens + const bpe_ranks = Object.fromEntries(rank_to_intbyte.map((b, i) => [b, i])); + + // add the merged tokens + n = rank_to_intbyte.length; + for (const [first, second] of bpe_merges) { + bpe_ranks[ + [...decode_data_gym(first), ...decode_data_gym(second)].join(",") + ] = n; + n += 1; + } + + // check that the encoder file matches the merges file + 
// this sanity check is important since tiktoken assumes that ranks are ordered the same + // as merge priority + const encoder_json: Record = JSON.parse( + encoder_json_contents + ); + + const encoder_json_loaded = Object.fromEntries( + Object.entries(encoder_json).map(([k, v]) => [ + decode_data_gym(k).join(","), + v, + ]) + ); + + // drop these two special tokens if present, since they're not mergeable bpe tokens + delete encoder_json_loaded[decode_data_gym("<|endoftext|>").join(",")]; + delete encoder_json_loaded[decode_data_gym("<|startoftext|>").join(",")]; + + function normalize_map(items: Record) { + return JSON.stringify( + Object.keys(items) + .sort() + .map((key) => [key, items[key]]) + ); + } + + if (normalize_map(bpe_ranks) !== normalize_map(encoder_json_loaded)) { + throw new Error("bpe_ranks !== encoder_json_loaded"); + } + + return bpe_ranks; +} + +function dump_tiktoken_bpe(bpe_ranks: Record) { + return ( + Object.entries(bpe_ranks) + .sort((a, b) => a[1] - b[1]) + .map(([token_str, rank]) => + [ + fromByteArray( + token_str.split(",").map((i) => Number.parseInt(i, 10)) + ), + rank, + ].join(" ") + ) + .join("\n") + "\n" + ); +} + +export async function load( + registry: ( + | { load_tiktoken_bpe: string } + | { + data_gym_to_mergeable_bpe_ranks: { + vocab_bpe_file: string; + encoder_json_file: string; + }; + } + ) & { + explicit_n_vocab?: number; + pat_str: string; + special_tokens: Record; + }, + customFetch?: (url: string) => Promise +) { + const ofetch = customFetch + ? 
customFetch + : (url: string) => fetch(url).then((r) => r.text()); + + if ("data_gym_to_mergeable_bpe_ranks" in registry) { + const [vocab_bpe, encoder_json] = await Promise.all([ + ofetch(registry.data_gym_to_mergeable_bpe_ranks.vocab_bpe_file), + ofetch(registry.data_gym_to_mergeable_bpe_ranks.encoder_json_file), + ]); + + return { + explicit_n_vocab: registry.explicit_n_vocab, + pat_str: registry.pat_str, + special_tokens: registry.special_tokens, + bpe_ranks: dump_tiktoken_bpe( + data_gym_to_mergeable_bpe_ranks(vocab_bpe, encoder_json) + ), + }; + } else { + return { + explicit_n_vocab: registry.explicit_n_vocab, + pat_str: registry.pat_str, + special_tokens: registry.special_tokens, + bpe_ranks: await ofetch(registry.load_tiktoken_bpe), + }; + } +} diff --git a/js/test/test_simple_public.test.ts b/js/test/test_simple_public.test.ts new file mode 100644 index 00000000..85dcd52a --- /dev/null +++ b/js/test/test_simple_public.test.ts @@ -0,0 +1,137 @@ +import { it, expect, describe } from "vitest"; +import { encoding_for_model, get_encoding } from "../dist"; + +it("encoding_for_model initialization", () => { + expect(() => encoding_for_model("gpt2")).not.toThrowError(); + // @ts-expect-error + expect(() => encoding_for_model("gpt2-unknown")).toThrowError( + "Invalid model" + ); +}); + +it("get_encoding initialization", () => { + expect(() => get_encoding("cl100k_base")).not.toThrowError(); + // @ts-expect-error + expect(() => get_encoding("unknown")).toThrowError("Invalid encoding"); +}); + +describe("gpt2", () => { + const enc = get_encoding("gpt2"); + + it("encodes hello world string", () => { + expect(enc.encode("hello world")).toStrictEqual( + new Uint32Array([31373, 995]) + ); + }); + + it("decodes hello world string", () => { + expect( + new TextDecoder().decode(enc.decode(new Uint32Array([31373, 995]))) + ).toStrictEqual("hello world"); + }); + + it("encodes hello world string, all allowed special characters", () => { + expect(enc.encode("hello 
<|endoftext|>", "all")).toStrictEqual( + new Uint32Array([31373, 220, 50256]) + ); + }); +}); + +describe("cl100k_base", () => { + const enc = get_encoding("cl100k_base"); + + it("encodes hello world string", () => { + expect(enc.encode("hello world")).toStrictEqual( + new Uint32Array([15339, 1917]) + ); + }); + + it("decodes hello world string", () => { + expect( + new TextDecoder().decode(enc.decode(new Uint32Array([15339, 1917]))) + ).toStrictEqual("hello world"); + }); + + it("encodes hello world string, all allowed special characters", () => { + expect(enc.encode("hello <|endoftext|>", "all")).toStrictEqual( + new Uint32Array([15339, 220, 100257]) + ); + }); +}); + +it("test_simple", () => { + const encodings = [ + "gpt2", + "r50k_base", + "p50k_base", + "p50k_edit", + "cl100k_base", + ] as const; + + for (const encoding of encodings) { + const enc = get_encoding(encoding); + for (let token = 0; token < 10_000; token++) { + expect( + enc.encode_single_token(enc.decode_single_token_bytes(token)) + ).toStrictEqual(token); + } + } +}); + +it("test_encoding_for_model", () => { + expect(encoding_for_model("gpt2").name).toEqual("gpt2"); + expect(encoding_for_model("text-davinci-003").name).toEqual("p50k_base"); + expect(encoding_for_model("gpt-3.5-turbo").name).toEqual("cl100k_base"); +}); + +it("test_custom_tokens", () => { + const enc = encoding_for_model("gpt2", { + "<|im_start|>": 100264, + "<|im_end|>": 100265, + }); + expect(enc.encode("<|im_start|>test<|im_end|>", "all")).toStrictEqual( + new Uint32Array([100264, 9288, 100265]) + ); +}); + +it("encode string tokens", () => { + const enc = get_encoding("gpt2", { "<|im_start|>": 100264 }); + + expect(enc.encode("hello world")).toStrictEqual( + new Uint32Array([31373, 995]) + ); + + expect(enc.encode("<|endoftext|>", ["<|endoftext|>"])).toStrictEqual( + new Uint32Array([50256]) + ); + + expect(enc.encode("<|endoftext|>", "all")).toStrictEqual( + new Uint32Array([50256]) + ); + + expect(() => 
enc.encode("<|endoftext|>")).toThrowError( + "The text contains a special token that is not allowed" + ); + + expect(() => enc.encode("<|im_start|>")).toThrowError( + "The text contains a special token that is not allowed" + ); + + expect(enc.encode("<|endoftext|>", [], [])).toStrictEqual( + new Uint32Array([27, 91, 437, 1659, 5239, 91, 29]) + ); +}); + +it("invalid (dis)allowed_tokens", () => { + const enc = get_encoding("gpt2"); + + // @ts-expect-error + expect(() => enc.encode("hello world", "invalid-string")).toThrowError( + "Invalid value for allowed_special" + ); + + // @ts-expect-error + expect(() => enc.encode("hello world", [], "invalid-string")).toThrowError( + "Invalid value for disallowed_special" + ); +}); diff --git a/js/tsconfig.json b/js/tsconfig.json new file mode 100644 index 00000000..cef970e8 --- /dev/null +++ b/js/tsconfig.json @@ -0,0 +1,14 @@ +{ + "compilerOptions": { + "target": "ES2022", + "lib": ["ESNext", "DOM"], + "module": "ES2020", + "moduleResolution": "node", + "strict": true, + "declaration": true, + "outDir": "./dist", + "allowSyntheticDefaultImports": true + }, + "include": ["./src/**/*.ts"], + "exclude": ["node_modules", "dist"] +} diff --git a/js/yarn.lock b/js/yarn.lock new file mode 100644 index 00000000..06883178 --- /dev/null +++ b/js/yarn.lock @@ -0,0 +1,1602 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +"@esbuild-kit/cjs-loader@^2.4.2": + version "2.4.2" + resolved "https://registry.yarnpkg.com/@esbuild-kit/cjs-loader/-/cjs-loader-2.4.2.tgz#cb4dde00fbf744a68c4f20162ea15a8242d0fa54" + integrity sha512-BDXFbYOJzT/NBEtp71cvsrGPwGAMGRB/349rwKuoxNSiKjPraNNnlK6MIIabViCjqZugu6j+xeMDlEkWdHHJSg== + dependencies: + "@esbuild-kit/core-utils" "^3.0.0" + get-tsconfig "^4.4.0" + +"@esbuild-kit/core-utils@^3.0.0": + version "3.1.0" + resolved "https://registry.yarnpkg.com/@esbuild-kit/core-utils/-/core-utils-3.1.0.tgz#49945d533dbd5e1b7620aa0fc522c15e6ec089c5" + integrity sha512-Uuk8RpCg/7fdHSceR1M6XbSZFSuMrxcePFuGgyvsBn+u339dk5OeL4jv2EojwTN2st/unJGsVm4qHWjWNmJ/tw== + dependencies: + esbuild "~0.17.6" + source-map-support "^0.5.21" + +"@esbuild-kit/esm-loader@^2.5.5": + version "2.5.5" + resolved "https://registry.yarnpkg.com/@esbuild-kit/esm-loader/-/esm-loader-2.5.5.tgz#b82da14fcee3fc1d219869756c06f43f67d1ca71" + integrity sha512-Qwfvj/qoPbClxCRNuac1Du01r9gvNOT+pMYtJDapfB1eoGN1YlJ1BixLyL9WVENRx5RXgNLdfYdx/CuswlGhMw== + dependencies: + "@esbuild-kit/core-utils" "^3.0.0" + get-tsconfig "^4.4.0" + +"@esbuild/android-arm64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.16.17.tgz#cf91e86df127aa3d141744edafcba0abdc577d23" + integrity sha512-MIGl6p5sc3RDTLLkYL1MyL8BMRN4tLMRCn+yRJJmEDvYZ2M7tmAf80hx1kbNEUX2KJ50RRtxZ4JHLvCfuB6kBg== + +"@esbuild/android-arm64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm64/-/android-arm64-0.17.10.tgz#ad2ee47dd021035abdfb0c38848ff77a1e1918c4" + integrity sha512-ht1P9CmvrPF5yKDtyC+z43RczVs4rrHpRqrmIuoSvSdn44Fs1n6DGlpZKdK6rM83pFLbVaSUwle8IN+TPmkv7g== + +"@esbuild/android-arm@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.16.17.tgz#025b6246d3f68b7bbaa97069144fb5fb70f2fff2" + integrity 
sha512-N9x1CMXVhtWEAMS7pNNONyA14f71VPQN9Cnavj1XQh6T7bskqiLLrSca4O0Vr8Wdcga943eThxnVp3JLnBMYtw== + +"@esbuild/android-arm@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/android-arm/-/android-arm-0.17.10.tgz#bb5a68af8adeb94b30eadee7307404dc5237d076" + integrity sha512-7YEBfZ5lSem9Tqpsz+tjbdsEshlO9j/REJrfv4DXgKTt1+/MHqGwbtlyxQuaSlMeUZLxUKBaX8wdzlTfHkmnLw== + +"@esbuild/android-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.16.17.tgz#c820e0fef982f99a85c4b8bfdd582835f04cd96e" + integrity sha512-a3kTv3m0Ghh4z1DaFEuEDfz3OLONKuFvI4Xqczqx4BqLyuFaFkuaG4j2MtA6fuWEFeC5x9IvqnX7drmRq/fyAQ== + +"@esbuild/android-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/android-x64/-/android-x64-0.17.10.tgz#751d5d8ae9ece1efa9627b689c888eb85b102360" + integrity sha512-CYzrm+hTiY5QICji64aJ/xKdN70IK8XZ6iiyq0tZkd3tfnwwSWTYH1t3m6zyaaBxkuj40kxgMyj1km/NqdjQZA== + +"@esbuild/darwin-arm64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.16.17.tgz#edef4487af6b21afabba7be5132c26d22379b220" + integrity sha512-/2agbUEfmxWHi9ARTX6OQ/KgXnOWfsNlTeLcoV7HSuSTv63E4DqtAc+2XqGw1KHxKMHGZgbVCZge7HXWX9Vn+w== + +"@esbuild/darwin-arm64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-arm64/-/darwin-arm64-0.17.10.tgz#85601ee7efb2129cd3218d5bcbe8da1173bc1e8b" + integrity sha512-3HaGIowI+nMZlopqyW6+jxYr01KvNaLB5znXfbyyjuo4lE0VZfvFGcguIJapQeQMS4cX/NEispwOekJt3gr5Dg== + +"@esbuild/darwin-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.16.17.tgz#42829168730071c41ef0d028d8319eea0e2904b4" + integrity sha512-2By45OBHulkd9Svy5IOCZt376Aa2oOkiE9QWUK9fe6Tb+WDr8hXL3dpqi+DeLiMed8tVXspzsTAvd0jUl96wmg== + +"@esbuild/darwin-x64@0.17.10": + version "0.17.10" + resolved 
"https://registry.yarnpkg.com/@esbuild/darwin-x64/-/darwin-x64-0.17.10.tgz#362c7e988c61fe72d5edef4f717e4b4fc728da98" + integrity sha512-J4MJzGchuCRG5n+B4EHpAMoJmBeAE1L3wGYDIN5oWNqX0tEr7VKOzw0ymSwpoeSpdCa030lagGUfnfhS7OvzrQ== + +"@esbuild/freebsd-arm64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.16.17.tgz#1f4af488bfc7e9ced04207034d398e793b570a27" + integrity sha512-mt+cxZe1tVx489VTb4mBAOo2aKSnJ33L9fr25JXpqQqzbUIw/yzIzi+NHwAXK2qYV1lEFp4OoVeThGjUbmWmdw== + +"@esbuild/freebsd-arm64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.17.10.tgz#e8a85a46ede7c3a048a12f16b9d551d25adc8bb1" + integrity sha512-ZkX40Z7qCbugeK4U5/gbzna/UQkM9d9LNV+Fro8r7HA7sRof5Rwxc46SsqeMvB5ZaR0b1/ITQ/8Y1NmV2F0fXQ== + +"@esbuild/freebsd-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.16.17.tgz#636306f19e9bc981e06aa1d777302dad8fddaf72" + integrity sha512-8ScTdNJl5idAKjH8zGAsN7RuWcyHG3BAvMNpKOBaqqR7EbUhhVHOqXRdL7oZvz8WNHL2pr5+eIT5c65kA6NHug== + +"@esbuild/freebsd-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/freebsd-x64/-/freebsd-x64-0.17.10.tgz#cd0a1b68bffbcb5b65e65b3fd542e8c7c3edd86b" + integrity sha512-0m0YX1IWSLG9hWh7tZa3kdAugFbZFFx9XrvfpaCMMvrswSTvUZypp0NFKriUurHpBA3xsHVE9Qb/0u2Bbi/otg== + +"@esbuild/linux-arm64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.16.17.tgz#a003f7ff237c501e095d4f3a09e58fc7b25a4aca" + integrity sha512-7S8gJnSlqKGVJunnMCrXHU9Q8Q/tQIxk/xL8BqAP64wchPCTzuM6W3Ra8cIa1HIflAvDnNOt2jaL17vaW+1V0g== + +"@esbuild/linux-arm64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm64/-/linux-arm64-0.17.10.tgz#13b183f432512ed9d9281cc89476caeebe9e9123" + integrity sha512-g1EZJR1/c+MmCgVwpdZdKi4QAJ8DCLP5uTgLWSAVd9wlqk9GMscaNMEViG3aE1wS+cNMzXXgdWiW/VX4J+5nTA== + 
+"@esbuild/linux-arm@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.16.17.tgz#b591e6a59d9c4fe0eeadd4874b157ab78cf5f196" + integrity sha512-iihzrWbD4gIT7j3caMzKb/RsFFHCwqqbrbH9SqUSRrdXkXaygSZCZg1FybsZz57Ju7N/SHEgPyaR0LZ8Zbe9gQ== + +"@esbuild/linux-arm@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-arm/-/linux-arm-0.17.10.tgz#dd11e0a5faa3ea94dc80278a601c3be7b4fdf1da" + integrity sha512-whRdrrl0X+9D6o5f0sTZtDM9s86Xt4wk1bf7ltx6iQqrIIOH+sre1yjpcCdrVXntQPCNw/G+XqsD4HuxeS+2QA== + +"@esbuild/linux-ia32@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.16.17.tgz#24333a11027ef46a18f57019450a5188918e2a54" + integrity sha512-kiX69+wcPAdgl3Lonh1VI7MBr16nktEvOfViszBSxygRQqSpzv7BffMKRPMFwzeJGPxcio0pdD3kYQGpqQ2SSg== + +"@esbuild/linux-ia32@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ia32/-/linux-ia32-0.17.10.tgz#4d836f87b92807d9292379963c4888270d282405" + integrity sha512-1vKYCjfv/bEwxngHERp7huYfJ4jJzldfxyfaF7hc3216xiDA62xbXJfRlradiMhGZbdNLj2WA1YwYFzs9IWNPw== + +"@esbuild/linux-loong64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.16.17.tgz#d5ad459d41ed42bbd4d005256b31882ec52227d8" + integrity sha512-dTzNnQwembNDhd654cA4QhbS9uDdXC3TKqMJjgOWsC0yNCbpzfWoXdZvp0mY7HU6nzk5E0zpRGGx3qoQg8T2DQ== + +"@esbuild/linux-loong64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-loong64/-/linux-loong64-0.17.10.tgz#92eb2ee200c17ef12c7fb3b648231948699e7a4c" + integrity sha512-mvwAr75q3Fgc/qz3K6sya3gBmJIYZCgcJ0s7XshpoqIAIBszzfXsqhpRrRdVFAyV1G9VUjj7VopL2HnAS8aHFA== + +"@esbuild/linux-mips64el@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.16.17.tgz#4e5967a665c38360b0a8205594377d4dcf9c3726" + integrity 
sha512-ezbDkp2nDl0PfIUn0CsQ30kxfcLTlcx4Foz2kYv8qdC6ia2oX5Q3E/8m6lq84Dj/6b0FrkgD582fJMIfHhJfSw== + +"@esbuild/linux-mips64el@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-mips64el/-/linux-mips64el-0.17.10.tgz#14f7d50c40fe7f7ee545a9bd07c6f6e4cba5570e" + integrity sha512-XilKPgM2u1zR1YuvCsFQWl9Fc35BqSqktooumOY2zj7CSn5czJn279j9TE1JEqSqz88izJo7yE4x3LSf7oxHzg== + +"@esbuild/linux-ppc64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.16.17.tgz#206443a02eb568f9fdf0b438fbd47d26e735afc8" + integrity sha512-dzS678gYD1lJsW73zrFhDApLVdM3cUF2MvAa1D8K8KtcSKdLBPP4zZSLy6LFZ0jYqQdQ29bjAHJDgz0rVbLB3g== + +"@esbuild/linux-ppc64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-ppc64/-/linux-ppc64-0.17.10.tgz#1ab5802e93ae511ce9783e1cb95f37df0f84c4af" + integrity sha512-kM4Rmh9l670SwjlGkIe7pYWezk8uxKHX4Lnn5jBZYBNlWpKMBCVfpAgAJqp5doLobhzF3l64VZVrmGeZ8+uKmQ== + +"@esbuild/linux-riscv64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.16.17.tgz#c351e433d009bf256e798ad048152c8d76da2fc9" + integrity sha512-ylNlVsxuFjZK8DQtNUwiMskh6nT0vI7kYl/4fZgV1llP5d6+HIeL/vmmm3jpuoo8+NuXjQVZxmKuhDApK0/cKw== + +"@esbuild/linux-riscv64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-riscv64/-/linux-riscv64-0.17.10.tgz#4fae25201ef7ad868731d16c8b50b0e386c4774a" + integrity sha512-r1m9ZMNJBtOvYYGQVXKy+WvWd0BPvSxMsVq8Hp4GzdMBQvfZRvRr5TtX/1RdN6Va8JMVQGpxqde3O+e8+khNJQ== + +"@esbuild/linux-s390x@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.16.17.tgz#661f271e5d59615b84b6801d1c2123ad13d9bd87" + integrity sha512-gzy7nUTO4UA4oZ2wAMXPNBGTzZFP7mss3aKR2hH+/4UUkCOyqmjXiKpzGrY2TlEUhbbejzXVKKGazYcQTZWA/w== + +"@esbuild/linux-s390x@0.17.10": + version "0.17.10" + resolved 
"https://registry.yarnpkg.com/@esbuild/linux-s390x/-/linux-s390x-0.17.10.tgz#126254d8335bb3586918b1ca60beb4abb46e6d54" + integrity sha512-LsY7QvOLPw9WRJ+fU5pNB3qrSfA00u32ND5JVDrn/xG5hIQo3kvTxSlWFRP0NJ0+n6HmhPGG0Q4jtQsb6PFoyg== + +"@esbuild/linux-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.16.17.tgz#e4ba18e8b149a89c982351443a377c723762b85f" + integrity sha512-mdPjPxfnmoqhgpiEArqi4egmBAMYvaObgn4poorpUaqmvzzbvqbowRllQ+ZgzGVMGKaPkqUmPDOOFQRUFDmeUw== + +"@esbuild/linux-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/linux-x64/-/linux-x64-0.17.10.tgz#7fa4667b2df81ea0538e1b75e607cf04e526ce91" + integrity sha512-zJUfJLebCYzBdIz/Z9vqwFjIA7iSlLCFvVi7glMgnu2MK7XYigwsonXshy9wP9S7szF+nmwrelNaP3WGanstEg== + +"@esbuild/netbsd-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.16.17.tgz#7d4f4041e30c5c07dd24ffa295c73f06038ec775" + integrity sha512-/PzmzD/zyAeTUsduZa32bn0ORug+Jd1EGGAUJvqfeixoEISYpGnAezN6lnJoskauoai0Jrs+XSyvDhppCPoKOA== + +"@esbuild/netbsd-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/netbsd-x64/-/netbsd-x64-0.17.10.tgz#2d24727ddc2305619685bf237a46d6087a02ee9a" + integrity sha512-lOMkailn4Ok9Vbp/q7uJfgicpDTbZFlXlnKT2DqC8uBijmm5oGtXAJy2ZZVo5hX7IOVXikV9LpCMj2U8cTguWA== + +"@esbuild/openbsd-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.16.17.tgz#970fa7f8470681f3e6b1db0cc421a4af8060ec35" + integrity sha512-2yaWJhvxGEz2RiftSk0UObqJa/b+rIAjnODJgv2GbGGpRwAfpgzyrg1WLK8rqA24mfZa9GvpjLcBBg8JHkoodg== + +"@esbuild/openbsd-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/openbsd-x64/-/openbsd-x64-0.17.10.tgz#bf3fc38ee6ecf028c1f0cfe11f61d53cc75fef12" + integrity sha512-/VE0Kx6y7eekqZ+ZLU4AjMlB80ov9tEz4H067Y0STwnGOYL8CsNg4J+cCmBznk1tMpxMoUOf0AbWlb1d2Pkbig== + +"@esbuild/sunos-x64@0.16.17": 
+ version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.16.17.tgz#abc60e7c4abf8b89fb7a4fe69a1484132238022c" + integrity sha512-xtVUiev38tN0R3g8VhRfN7Zl42YCJvyBhRKw1RJjwE1d2emWTVToPLNEQj/5Qxc6lVFATDiy6LjVHYhIPrLxzw== + +"@esbuild/sunos-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/sunos-x64/-/sunos-x64-0.17.10.tgz#8deabd6dfec6256f80bb101bc59d29dbae99c69b" + integrity sha512-ERNO0838OUm8HfUjjsEs71cLjLMu/xt6bhOlxcJ0/1MG3hNqCmbWaS+w/8nFLa0DDjbwZQuGKVtCUJliLmbVgg== + +"@esbuild/win32-arm64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.16.17.tgz#7b0ff9e8c3265537a7a7b1fd9a24e7bd39fcd87a" + integrity sha512-ga8+JqBDHY4b6fQAmOgtJJue36scANy4l/rL97W+0wYmijhxKetzZdKOJI7olaBaMhWt8Pac2McJdZLxXWUEQw== + +"@esbuild/win32-arm64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/win32-arm64/-/win32-arm64-0.17.10.tgz#1ec1ee04c788c4c57a83370b6abf79587b3e4965" + integrity sha512-fXv+L+Bw2AeK+XJHwDAQ9m3NRlNemG6Z6ijLwJAAVdu4cyoFbBWbEtyZzDeL+rpG2lWI51cXeMt70HA8g2MqIg== + +"@esbuild/win32-ia32@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.16.17.tgz#e90fe5267d71a7b7567afdc403dfd198c292eb09" + integrity sha512-WnsKaf46uSSF/sZhwnqE4L/F89AYNMiD4YtEcYekBt9Q7nj0DiId2XH2Ng2PHM54qi5oPrQ8luuzGszqi/veig== + +"@esbuild/win32-ia32@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/win32-ia32/-/win32-ia32-0.17.10.tgz#a362528d7f3ad5d44fa8710a96764677ef92ebe9" + integrity sha512-3s+HADrOdCdGOi5lnh5DMQEzgbsFsd4w57L/eLKKjMnN0CN4AIEP0DCP3F3N14xnxh3ruNc32A0Na9zYe1Z/AQ== + +"@esbuild/win32-x64@0.16.17": + version "0.16.17" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.16.17.tgz#c5a1a4bfe1b57f0c3e61b29883525c6da3e5c091" + integrity sha512-y+EHuSchhL7FjHgvQL/0fnnFmO4T1bhvWANX6gcnqTjtnKWbTvUMCpGnv2+t+31d7RzyEAYAd4u2fnIhHL6N/Q== + 
+"@esbuild/win32-x64@0.17.10": + version "0.17.10" + resolved "https://registry.yarnpkg.com/@esbuild/win32-x64/-/win32-x64-0.17.10.tgz#ac779220f2da96afd480fb3f3148a292f66e7fc3" + integrity sha512-oP+zFUjYNaMNmjTwlFtWep85hvwUu19cZklB3QsBOcZSs6y7hmH4LNCJ7075bsqzYaNvZFXJlAVaQ2ApITDXtw== + +"@nodelib/fs.scandir@2.1.5": + version "2.1.5" + resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" + integrity sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g== + dependencies: + "@nodelib/fs.stat" "2.0.5" + run-parallel "^1.1.9" + +"@nodelib/fs.stat@2.0.5", "@nodelib/fs.stat@^2.0.2": + version "2.0.5" + resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz#5bd262af94e9d25bd1e71b05deed44876a222e8b" + integrity sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A== + +"@nodelib/fs.walk@^1.2.3": + version "1.2.8" + resolved "https://registry.yarnpkg.com/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz#e95737e8bb6746ddedf69c556953494f196fe69a" + integrity sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg== + dependencies: + "@nodelib/fs.scandir" "2.1.5" + fastq "^1.6.0" + +"@ts-morph/common@~0.18.0": + version "0.18.1" + resolved "https://registry.yarnpkg.com/@ts-morph/common/-/common-0.18.1.tgz#ca40c3a62c3f9e17142e0af42633ad63efbae0ec" + integrity sha512-RVE+zSRICWRsfrkAw5qCAK+4ZH9kwEFv5h0+/YeHTLieWP7F4wWq4JsKFuNWG+fYh/KF+8rAtgdj5zb2mm+DVA== + dependencies: + fast-glob "^3.2.12" + minimatch "^5.1.0" + mkdirp "^1.0.4" + path-browserify "^1.0.1" + +"@types/chai-subset@^1.3.3": + version "1.3.3" + resolved "https://registry.yarnpkg.com/@types/chai-subset/-/chai-subset-1.3.3.tgz#97893814e92abd2c534de422cb377e0e0bdaac94" + integrity sha512-frBecisrNGz+F4T6bcc+NLeolfiojh5FxW2klu669+8BARtyQv2C/GkNW6FUodVe4BroGMP/wER/YDGc7rEllw== + dependencies: + "@types/chai" "*" + 
+"@types/chai@*", "@types/chai@^4.3.4": + version "4.3.4" + resolved "https://registry.yarnpkg.com/@types/chai/-/chai-4.3.4.tgz#e913e8175db8307d78b4e8fa690408ba6b65dee4" + integrity sha512-KnRanxnpfpjUTqTCXslZSEdLfXExwgNxYPdiO2WGUj8+HDjFi8R3k5RVKPeSCzLjCcshCAtVO2QBbVuAV4kTnw== + +"@types/node@*": + version "18.14.1" + resolved "https://registry.yarnpkg.com/@types/node/-/node-18.14.1.tgz#90dad8476f1e42797c49d6f8b69aaf9f876fc69f" + integrity sha512-QH+37Qds3E0eDlReeboBxfHbX9omAcBCXEzswCu6jySP642jiM3cYSIkU/REqwhCUqXdonHFuBfJDiAJxMNhaQ== + +"@types/node@^18.14.4": + version "18.14.4" + resolved "https://registry.yarnpkg.com/@types/node/-/node-18.14.4.tgz#0e64ec0b35a772e1e3d849f9a0ff61782d0cb647" + integrity sha512-VhCw7I7qO2X49+jaKcAUwi3rR+hbxT5VcYF493+Z5kMLI0DL568b7JI4IDJaxWFH0D/xwmGJNoXisyX+w7GH/g== + +"@vitest/expect@0.28.5": + version "0.28.5" + resolved "https://registry.yarnpkg.com/@vitest/expect/-/expect-0.28.5.tgz#d5a6eccd014e9ad66fe87a20d16426a2815c0e8a" + integrity sha512-gqTZwoUTwepwGIatnw4UKpQfnoyV0Z9Czn9+Lo2/jLIt4/AXLTn+oVZxlQ7Ng8bzcNkR+3DqLJ08kNr8jRmdNQ== + dependencies: + "@vitest/spy" "0.28.5" + "@vitest/utils" "0.28.5" + chai "^4.3.7" + +"@vitest/runner@0.28.5": + version "0.28.5" + resolved "https://registry.yarnpkg.com/@vitest/runner/-/runner-0.28.5.tgz#4a18fe0e40b25569763f9f1f64b799d1629b3026" + integrity sha512-NKkHtLB+FGjpp5KmneQjTcPLWPTDfB7ie+MmF1PnUBf/tGe2OjGxWyB62ySYZ25EYp9krR5Bw0YPLS/VWh1QiA== + dependencies: + "@vitest/utils" "0.28.5" + p-limit "^4.0.0" + pathe "^1.1.0" + +"@vitest/spy@0.28.5": + version "0.28.5" + resolved "https://registry.yarnpkg.com/@vitest/spy/-/spy-0.28.5.tgz#b69affa0786200251b9e5aac5c58bbfb1b3273c9" + integrity sha512-7if6rsHQr9zbmvxN7h+gGh2L9eIIErgf8nSKYDlg07HHimCxp4H6I/X/DPXktVPPLQfiZ1Cw2cbDIx9fSqDjGw== + dependencies: + tinyspy "^1.0.2" + +"@vitest/utils@0.28.5": + version "0.28.5" + resolved "https://registry.yarnpkg.com/@vitest/utils/-/utils-0.28.5.tgz#7b82b528df86adfbd4a1f6a3b72c39790e81de0d" + integrity 
sha512-UyZdYwdULlOa4LTUSwZ+Paz7nBHGTT72jKwdFSV4IjHF1xsokp+CabMdhjvVhYwkLfO88ylJT46YMilnkSARZA== + dependencies: + cli-truncate "^3.1.0" + diff "^5.1.0" + loupe "^2.3.6" + picocolors "^1.0.0" + pretty-format "^27.5.1" + +acorn-walk@^8.2.0: + version "8.2.0" + resolved "https://registry.yarnpkg.com/acorn-walk/-/acorn-walk-8.2.0.tgz#741210f2e2426454508853a2f44d0ab83b7f69c1" + integrity sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA== + +acorn@^8.8.1, acorn@^8.8.2: + version "8.8.2" + resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.8.2.tgz#1b2f25db02af965399b9776b0c2c391276d37c4a" + integrity sha512-xjIYgE8HBrkpd/sJqOGNspf8uHG+NOHGOw6a/Urj8taM2EXfdNAH2oFcPeIFfsv3+kz/mJrS5VuMqbNLjCa2vw== + +ansi-regex@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.1.tgz#082cb2c89c9fe8659a311a53bd6a4dc5301db304" + integrity sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ== + +ansi-regex@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-6.0.1.tgz#3183e38fae9a65d7cb5e53945cd5897d0260a06a" + integrity sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA== + +ansi-styles@^3.2.1: + version "3.2.1" + resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-3.2.1.tgz#41fbb20243e50b12be0f04b8dedbf07520ce841d" + integrity sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA== + dependencies: + color-convert "^1.9.0" + +ansi-styles@^5.0.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-5.2.0.tgz#07449690ad45777d1924ac2abb2fc8895dba836b" + integrity sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA== + +ansi-styles@^6.0.0: + version "6.2.1" + resolved 
"https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-6.2.1.tgz#0e62320cf99c21afff3b3012192546aacbfb05c5" + integrity sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug== + +assertion-error@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/assertion-error/-/assertion-error-1.1.0.tgz#e60b6b0e8f301bd97e5375215bda406c85118c0b" + integrity sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw== + +available-typed-arrays@^1.0.5: + version "1.0.5" + resolved "https://registry.yarnpkg.com/available-typed-arrays/-/available-typed-arrays-1.0.5.tgz#92f95616501069d07d10edb2fc37d3e1c65123b7" + integrity sha512-DMD0KiN46eipeziST1LPP/STfDU0sufISXmjSgvVsoU2tqxctQeASejWcfNtxYKqETM1UxQ8sp2OrSBWpHY6sw== + +balanced-match@^1.0.0: + version "1.0.2" + resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee" + integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw== + +brace-expansion@^1.1.7: + version "1.1.11" + resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" + integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA== + dependencies: + balanced-match "^1.0.0" + concat-map "0.0.1" + +brace-expansion@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae" + integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA== + dependencies: + balanced-match "^1.0.0" + +braces@^3.0.2: + version "3.0.2" + resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" + integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A== + 
dependencies: + fill-range "^7.0.1" + +buffer-from@^1.0.0: + version "1.1.2" + resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5" + integrity sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ== + +cac@^6.7.14: + version "6.7.14" + resolved "https://registry.yarnpkg.com/cac/-/cac-6.7.14.tgz#804e1e6f506ee363cb0e3ccbb09cad5dd9870959" + integrity sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ== + +call-bind@^1.0.0, call-bind@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/call-bind/-/call-bind-1.0.2.tgz#b1d4e89e688119c3c9a903ad30abb2f6a919be3c" + integrity sha512-7O+FbCihrB5WGbFYesctwmTKae6rOiIzmz1icreWJ+0aA7LJfuqhEso2T9ncpcFtzMQtzXf2QGGueWJGTYsqrA== + dependencies: + function-bind "^1.1.1" + get-intrinsic "^1.0.2" + +chai@^4.3.7: + version "4.3.7" + resolved "https://registry.yarnpkg.com/chai/-/chai-4.3.7.tgz#ec63f6df01829088e8bf55fca839bcd464a8ec51" + integrity sha512-HLnAzZ2iupm25PlN0xFreAlBA5zaBSv3og0DdeGA4Ar6h6rJ3A0rolRUKJhSF2V10GZKDgWF/VmAEsNWjCRB+A== + dependencies: + assertion-error "^1.1.0" + check-error "^1.0.2" + deep-eql "^4.1.2" + get-func-name "^2.0.0" + loupe "^2.3.1" + pathval "^1.1.1" + type-detect "^4.0.5" + +chalk@^2.4.1: + version "2.4.2" + resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" + integrity sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ== + dependencies: + ansi-styles "^3.2.1" + escape-string-regexp "^1.0.5" + supports-color "^5.3.0" + +check-error@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/check-error/-/check-error-1.0.2.tgz#574d312edd88bb5dd8912e9286dd6c0aed4aac82" + integrity sha512-BrgHpW9NURQgzoNyjfq0Wu6VFO6D7IZEmJNdtgNqpzGG8RuNFHt2jQxWlAs4HMe119chBnv+34syEZtc6IhLtA== + +cli-truncate@^3.1.0: + version "3.1.0" + resolved 
"https://registry.yarnpkg.com/cli-truncate/-/cli-truncate-3.1.0.tgz#3f23ab12535e3d73e839bb43e73c9de487db1389" + integrity sha512-wfOBkjXteqSnI59oPcJkcPl/ZmwvMMOj340qUIY1SKZCv0B9Cf4D4fAucRkIKQmsIuYK3x1rrgU7MeGRruiuiA== + dependencies: + slice-ansi "^5.0.0" + string-width "^5.0.0" + +code-block-writer@^11.0.3: + version "11.0.3" + resolved "https://registry.yarnpkg.com/code-block-writer/-/code-block-writer-11.0.3.tgz#9eec2993edfb79bfae845fbc093758c0a0b73b76" + integrity sha512-NiujjUFB4SwScJq2bwbYUtXbZhBSlY6vYzm++3Q6oC+U+injTqfPYFK8wS9COOmb2lueqp0ZRB4nK1VYeHgNyw== + +color-convert@^1.9.0: + version "1.9.3" + resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-1.9.3.tgz#bb71850690e1f136567de629d2d5471deda4c1e8" + integrity sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg== + dependencies: + color-name "1.1.3" + +color-name@1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.3.tgz#a7d0558bd89c42f795dd42328f740831ca53bc25" + integrity sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw== + +concat-map@0.0.1: + version "0.0.1" + resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" + integrity sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg== + +cross-spawn@^6.0.5: + version "6.0.5" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4" + integrity sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ== + dependencies: + nice-try "^1.0.4" + path-key "^2.0.1" + semver "^5.5.0" + shebang-command "^1.2.0" + which "^1.2.9" + +debug@^4.3.4: + version "4.3.4" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.4.tgz#1319f6579357f2338d3337d2cdd4914bb5dcc865" + integrity 
sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ== + dependencies: + ms "2.1.2" + +deep-eql@^4.1.2: + version "4.1.3" + resolved "https://registry.yarnpkg.com/deep-eql/-/deep-eql-4.1.3.tgz#7c7775513092f7df98d8df9996dd085eb668cc6d" + integrity sha512-WaEtAOpRA1MQ0eohqZjpGD8zdI0Ovsm8mmFhaDN8dvDZzyoUMcYDnf5Y6iu7HTXxf8JDS23qWa4a+hKCDyOPzw== + dependencies: + type-detect "^4.0.0" + +define-properties@^1.1.3, define-properties@^1.1.4: + version "1.2.0" + resolved "https://registry.yarnpkg.com/define-properties/-/define-properties-1.2.0.tgz#52988570670c9eacedd8064f4a990f2405849bd5" + integrity sha512-xvqAVKGfT1+UAvPwKTVw/njhdQ8ZhXK4lI0bCIuCMrp2up9nPnaDftrLtmpTazqd1o+UY4zgzU+avtMbDP+ldA== + dependencies: + has-property-descriptors "^1.0.0" + object-keys "^1.1.1" + +diff@^5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/diff/-/diff-5.1.0.tgz#bc52d298c5ea8df9194800224445ed43ffc87e40" + integrity sha512-D+mk+qE8VC/PAUrlAU34N+VfXev0ghe5ywmpqrawphmVZc1bEfn56uo9qpyGp1p4xpzOHkSW4ztBd6L7Xx4ACw== + +eastasianwidth@^0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/eastasianwidth/-/eastasianwidth-0.2.0.tgz#696ce2ec0aa0e6ea93a397ffcf24aa7840c827cb" + integrity sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA== + +emoji-regex@^9.2.2: + version "9.2.2" + resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-9.2.2.tgz#840c8803b0d8047f4ff0cf963176b32d4ef3ed72" + integrity sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg== + +error-ex@^1.3.1: + version "1.3.2" + resolved "https://registry.yarnpkg.com/error-ex/-/error-ex-1.3.2.tgz#b4ac40648107fdcdcfae242f428bea8a14d4f1bf" + integrity sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g== + dependencies: + is-arrayish "^0.2.1" + +es-abstract@^1.19.0, es-abstract@^1.20.4: + version "1.21.1" + resolved 
"https://registry.yarnpkg.com/es-abstract/-/es-abstract-1.21.1.tgz#e6105a099967c08377830a0c9cb589d570dd86c6" + integrity sha512-QudMsPOz86xYz/1dG1OuGBKOELjCh99IIWHLzy5znUB6j8xG2yMA7bfTV86VSqKF+Y/H08vQPR+9jyXpuC6hfg== + dependencies: + available-typed-arrays "^1.0.5" + call-bind "^1.0.2" + es-set-tostringtag "^2.0.1" + es-to-primitive "^1.2.1" + function-bind "^1.1.1" + function.prototype.name "^1.1.5" + get-intrinsic "^1.1.3" + get-symbol-description "^1.0.0" + globalthis "^1.0.3" + gopd "^1.0.1" + has "^1.0.3" + has-property-descriptors "^1.0.0" + has-proto "^1.0.1" + has-symbols "^1.0.3" + internal-slot "^1.0.4" + is-array-buffer "^3.0.1" + is-callable "^1.2.7" + is-negative-zero "^2.0.2" + is-regex "^1.1.4" + is-shared-array-buffer "^1.0.2" + is-string "^1.0.7" + is-typed-array "^1.1.10" + is-weakref "^1.0.2" + object-inspect "^1.12.2" + object-keys "^1.1.1" + object.assign "^4.1.4" + regexp.prototype.flags "^1.4.3" + safe-regex-test "^1.0.0" + string.prototype.trimend "^1.0.6" + string.prototype.trimstart "^1.0.6" + typed-array-length "^1.0.4" + unbox-primitive "^1.0.2" + which-typed-array "^1.1.9" + +es-set-tostringtag@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/es-set-tostringtag/-/es-set-tostringtag-2.0.1.tgz#338d502f6f674301d710b80c8592de8a15f09cd8" + integrity sha512-g3OMbtlwY3QewlqAiMLI47KywjWZoEytKr8pf6iTC8uJq5bIAH52Z9pnQ8pVL6whrCto53JZDuUIsifGeLorTg== + dependencies: + get-intrinsic "^1.1.3" + has "^1.0.3" + has-tostringtag "^1.0.0" + +es-to-primitive@^1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/es-to-primitive/-/es-to-primitive-1.2.1.tgz#e55cd4c9cdc188bcefb03b366c736323fc5c898a" + integrity sha512-QCOllgZJtaUo9miYBcLChTUaHNjJF3PYs1VidD7AwiEj1kYxKeQTctLAezAOH5ZKRH0g2IgPn6KwB4IT8iRpvA== + dependencies: + is-callable "^1.1.4" + is-date-object "^1.0.1" + is-symbol "^1.0.2" + +esbuild@^0.16.14: + version "0.16.17" + resolved 
"https://registry.yarnpkg.com/esbuild/-/esbuild-0.16.17.tgz#fc2c3914c57ee750635fee71b89f615f25065259" + integrity sha512-G8LEkV0XzDMNwXKgM0Jwu3nY3lSTwSGY6XbxM9cr9+s0T/qSV1q1JVPBGzm3dcjhCic9+emZDmMffkwgPeOeLg== + optionalDependencies: + "@esbuild/android-arm" "0.16.17" + "@esbuild/android-arm64" "0.16.17" + "@esbuild/android-x64" "0.16.17" + "@esbuild/darwin-arm64" "0.16.17" + "@esbuild/darwin-x64" "0.16.17" + "@esbuild/freebsd-arm64" "0.16.17" + "@esbuild/freebsd-x64" "0.16.17" + "@esbuild/linux-arm" "0.16.17" + "@esbuild/linux-arm64" "0.16.17" + "@esbuild/linux-ia32" "0.16.17" + "@esbuild/linux-loong64" "0.16.17" + "@esbuild/linux-mips64el" "0.16.17" + "@esbuild/linux-ppc64" "0.16.17" + "@esbuild/linux-riscv64" "0.16.17" + "@esbuild/linux-s390x" "0.16.17" + "@esbuild/linux-x64" "0.16.17" + "@esbuild/netbsd-x64" "0.16.17" + "@esbuild/openbsd-x64" "0.16.17" + "@esbuild/sunos-x64" "0.16.17" + "@esbuild/win32-arm64" "0.16.17" + "@esbuild/win32-ia32" "0.16.17" + "@esbuild/win32-x64" "0.16.17" + +esbuild@~0.17.6: + version "0.17.10" + resolved "https://registry.yarnpkg.com/esbuild/-/esbuild-0.17.10.tgz#3be050561b34c5dc05b46978f4e1f326d5cc9437" + integrity sha512-n7V3v29IuZy5qgxx25TKJrEm0FHghAlS6QweUcyIgh/U0zYmQcvogWROitrTyZId1mHSkuhhuyEXtI9OXioq7A== + optionalDependencies: + "@esbuild/android-arm" "0.17.10" + "@esbuild/android-arm64" "0.17.10" + "@esbuild/android-x64" "0.17.10" + "@esbuild/darwin-arm64" "0.17.10" + "@esbuild/darwin-x64" "0.17.10" + "@esbuild/freebsd-arm64" "0.17.10" + "@esbuild/freebsd-x64" "0.17.10" + "@esbuild/linux-arm" "0.17.10" + "@esbuild/linux-arm64" "0.17.10" + "@esbuild/linux-ia32" "0.17.10" + "@esbuild/linux-loong64" "0.17.10" + "@esbuild/linux-mips64el" "0.17.10" + "@esbuild/linux-ppc64" "0.17.10" + "@esbuild/linux-riscv64" "0.17.10" + "@esbuild/linux-s390x" "0.17.10" + "@esbuild/linux-x64" "0.17.10" + "@esbuild/netbsd-x64" "0.17.10" + "@esbuild/openbsd-x64" "0.17.10" + "@esbuild/sunos-x64" "0.17.10" + "@esbuild/win32-arm64" "0.17.10" + 
"@esbuild/win32-ia32" "0.17.10" + "@esbuild/win32-x64" "0.17.10" + +escape-string-regexp@^1.0.5: + version "1.0.5" + resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz#1b61c0562190a8dff6ae3bb2cf0200ca130b86d4" + integrity sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg== + +fast-glob@^3.2.12: + version "3.2.12" + resolved "https://registry.yarnpkg.com/fast-glob/-/fast-glob-3.2.12.tgz#7f39ec99c2e6ab030337142da9e0c18f37afae80" + integrity sha512-DVj4CQIYYow0BlaelwK1pHl5n5cRSJfM60UA0zK891sVInoPri2Ekj7+e1CT3/3qxXenpI+nBBmQAcJPJgaj4w== + dependencies: + "@nodelib/fs.stat" "^2.0.2" + "@nodelib/fs.walk" "^1.2.3" + glob-parent "^5.1.2" + merge2 "^1.3.0" + micromatch "^4.0.4" + +fastq@^1.6.0: + version "1.15.0" + resolved "https://registry.yarnpkg.com/fastq/-/fastq-1.15.0.tgz#d04d07c6a2a68fe4599fea8d2e103a937fae6b3a" + integrity sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw== + dependencies: + reusify "^1.0.4" + +fill-range@^7.0.1: + version "7.0.1" + resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" + integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ== + dependencies: + to-regex-range "^5.0.1" + +for-each@^0.3.3: + version "0.3.3" + resolved "https://registry.yarnpkg.com/for-each/-/for-each-0.3.3.tgz#69b447e88a0a5d32c3e7084f3f1710034b21376e" + integrity sha512-jqYfLp7mo9vIyQf8ykW2v7A+2N4QjeCeI5+Dz9XraiO1ign81wjiH7Fb9vSOWvQfNtmSa4H2RoQTrrXivdUZmw== + dependencies: + is-callable "^1.1.3" + +fsevents@~2.3.2: + version "2.3.2" + resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a" + integrity sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA== + +function-bind@^1.1.1: + version "1.1.1" + resolved 
"https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.1.tgz#a56899d3ea3c9bab874bb9773b7c5ede92f4895d" + integrity sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A== + +function.prototype.name@^1.1.5: + version "1.1.5" + resolved "https://registry.yarnpkg.com/function.prototype.name/-/function.prototype.name-1.1.5.tgz#cce0505fe1ffb80503e6f9e46cc64e46a12a9621" + integrity sha512-uN7m/BzVKQnCUF/iW8jYea67v++2u7m5UgENbHRtdDVclOUP+FMPlCNdmk0h/ysGyo2tavMJEDqJAkJdRa1vMA== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.3" + es-abstract "^1.19.0" + functions-have-names "^1.2.2" + +functions-have-names@^1.2.2: + version "1.2.3" + resolved "https://registry.yarnpkg.com/functions-have-names/-/functions-have-names-1.2.3.tgz#0404fe4ee2ba2f607f0e0ec3c80bae994133b834" + integrity sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ== + +get-func-name@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/get-func-name/-/get-func-name-2.0.0.tgz#ead774abee72e20409433a066366023dd6887a41" + integrity sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig== + +get-intrinsic@^1.0.2, get-intrinsic@^1.1.1, get-intrinsic@^1.1.3, get-intrinsic@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/get-intrinsic/-/get-intrinsic-1.2.0.tgz#7ad1dc0535f3a2904bba075772763e5051f6d05f" + integrity sha512-L049y6nFOuom5wGyRc3/gdTLO94dySVKRACj1RmJZBQXlbTMhtNIgkWkUHq+jYmZvKf14EW1EoJnnjbmoHij0Q== + dependencies: + function-bind "^1.1.1" + has "^1.0.3" + has-symbols "^1.0.3" + +get-symbol-description@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/get-symbol-description/-/get-symbol-description-1.0.0.tgz#7fdb81c900101fbd564dd5f1a30af5aadc1e58d6" + integrity sha512-2EmdH1YvIQiZpltCNgkuiUnyukzxM/R6NDJX31Ke3BG1Nq5b0S2PhX59UKi9vZpPDQVdqn+1IcaAwnzTT5vCjw== + dependencies: + call-bind "^1.0.2" + get-intrinsic "^1.1.1" + 
+get-tsconfig@^4.4.0: + version "4.4.0" + resolved "https://registry.yarnpkg.com/get-tsconfig/-/get-tsconfig-4.4.0.tgz#64eee64596668a81b8fce18403f94f245ee0d4e5" + integrity sha512-0Gdjo/9+FzsYhXCEFueo2aY1z1tpXrxWZzP7k8ul9qt1U5o8rYJwTJYmaeHdrVosYIVYkOy2iwCJ9FdpocJhPQ== + +glob-parent@^5.1.2: + version "5.1.2" + resolved "https://registry.yarnpkg.com/glob-parent/-/glob-parent-5.1.2.tgz#869832c58034fe68a4093c17dc15e8340d8401c4" + integrity sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow== + dependencies: + is-glob "^4.0.1" + +globalthis@^1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/globalthis/-/globalthis-1.0.3.tgz#5852882a52b80dc301b0660273e1ed082f0b6ccf" + integrity sha512-sFdI5LyBiNTHjRd7cGPWapiHWMOXKyuBNX/cWJ3NfzrZQVa8GI/8cofCl74AOVqq9W5kNmguTIzJ/1s2gyI9wA== + dependencies: + define-properties "^1.1.3" + +gopd@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/gopd/-/gopd-1.0.1.tgz#29ff76de69dac7489b7c0918a5788e56477c332c" + integrity sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA== + dependencies: + get-intrinsic "^1.1.3" + +graceful-fs@^4.1.2: + version "4.2.10" + resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.10.tgz#147d3a006da4ca3ce14728c7aefc287c367d7a6c" + integrity sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA== + +has-bigints@^1.0.1, has-bigints@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/has-bigints/-/has-bigints-1.0.2.tgz#0871bd3e3d51626f6ca0966668ba35d5602d6eaa" + integrity sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ== + +has-flag@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/has-flag/-/has-flag-3.0.0.tgz#b5d454dc2199ae225699f3467e5a07f3b955bafd" + integrity sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw== + 
+has-property-descriptors@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/has-property-descriptors/-/has-property-descriptors-1.0.0.tgz#610708600606d36961ed04c196193b6a607fa861" + integrity sha512-62DVLZGoiEBDHQyqG4w9xCuZ7eJEwNmJRWw2VY84Oedb7WFcA27fiEVe8oUQx9hAUJ4ekurquucTGwsyO1XGdQ== + dependencies: + get-intrinsic "^1.1.1" + +has-proto@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/has-proto/-/has-proto-1.0.1.tgz#1885c1305538958aff469fef37937c22795408e0" + integrity sha512-7qE+iP+O+bgF9clE5+UoBFzE65mlBiVj3tKCrlNQ0Ogwm0BjpT/gK4SlLYDMybDh5I3TCTKnPPa0oMG7JDYrhg== + +has-symbols@^1.0.2, has-symbols@^1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/has-symbols/-/has-symbols-1.0.3.tgz#bb7b2c4349251dce87b125f7bdf874aa7c8b39f8" + integrity sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A== + +has-tostringtag@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/has-tostringtag/-/has-tostringtag-1.0.0.tgz#7e133818a7d394734f941e73c3d3f9291e658b25" + integrity sha512-kFjcSNhnlGV1kyoGk7OXKSawH5JOb/LzUc5w9B02hOTO0dfFRjbHQKvg1d6cf3HbeUmtU9VbbV3qzZ2Teh97WQ== + dependencies: + has-symbols "^1.0.2" + +has@^1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/has/-/has-1.0.3.tgz#722d7cbfc1f6aa8241f16dd814e011e1f41e8796" + integrity sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw== + dependencies: + function-bind "^1.1.1" + +hosted-git-info@^2.1.4: + version "2.8.9" + resolved "https://registry.yarnpkg.com/hosted-git-info/-/hosted-git-info-2.8.9.tgz#dffc0bf9a21c02209090f2aa69429e1414daf3f9" + integrity sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw== + +internal-slot@^1.0.4: + version "1.0.5" + resolved "https://registry.yarnpkg.com/internal-slot/-/internal-slot-1.0.5.tgz#f2a2ee21f668f8627a4667f309dc0f4fb6674986" + integrity 
sha512-Y+R5hJrzs52QCG2laLn4udYVnxsfny9CpOhNhUvk/SSSVyF6T27FzRbF0sroPidSu3X8oEAkOn2K804mjpt6UQ== + dependencies: + get-intrinsic "^1.2.0" + has "^1.0.3" + side-channel "^1.0.4" + +is-array-buffer@^3.0.1: + version "3.0.2" + resolved "https://registry.yarnpkg.com/is-array-buffer/-/is-array-buffer-3.0.2.tgz#f2653ced8412081638ecb0ebbd0c41c6e0aecbbe" + integrity sha512-y+FyyR/w8vfIRq4eQcM1EYgSTnmHXPqaF+IgzgraytCFq5Xh8lllDVmAZolPJiZttZLeFSINPYMaEJ7/vWUa1w== + dependencies: + call-bind "^1.0.2" + get-intrinsic "^1.2.0" + is-typed-array "^1.1.10" + +is-arrayish@^0.2.1: + version "0.2.1" + resolved "https://registry.yarnpkg.com/is-arrayish/-/is-arrayish-0.2.1.tgz#77c99840527aa8ecb1a8ba697b80645a7a926a9d" + integrity sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg== + +is-bigint@^1.0.1: + version "1.0.4" + resolved "https://registry.yarnpkg.com/is-bigint/-/is-bigint-1.0.4.tgz#08147a1875bc2b32005d41ccd8291dffc6691df3" + integrity sha512-zB9CruMamjym81i2JZ3UMn54PKGsQzsJeo6xvN3HJJ4CAsQNB6iRutp2To77OfCNuoxspsIhzaPoO1zyCEhFOg== + dependencies: + has-bigints "^1.0.1" + +is-boolean-object@^1.1.0: + version "1.1.2" + resolved "https://registry.yarnpkg.com/is-boolean-object/-/is-boolean-object-1.1.2.tgz#5c6dc200246dd9321ae4b885a114bb1f75f63719" + integrity sha512-gDYaKHJmnj4aWxyj6YHyXVpdQawtVLHU5cb+eztPGczf6cjuTdwve5ZIEfgXqH4e57An1D1AKf8CZ3kYrQRqYA== + dependencies: + call-bind "^1.0.2" + has-tostringtag "^1.0.0" + +is-callable@^1.1.3, is-callable@^1.1.4, is-callable@^1.2.7: + version "1.2.7" + resolved "https://registry.yarnpkg.com/is-callable/-/is-callable-1.2.7.tgz#3bc2a85ea742d9e36205dcacdd72ca1fdc51b055" + integrity sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA== + +is-core-module@^2.9.0: + version "2.11.0" + resolved "https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.11.0.tgz#ad4cb3e3863e814523c96f3f58d26cc570ff0144" + integrity 
sha512-RRjxlvLDkD1YJwDbroBHMb+cukurkDWNyHx7D3oNB5x9rb5ogcksMC5wHCadcXoo67gVr/+3GFySh3134zi6rw== + dependencies: + has "^1.0.3" + +is-date-object@^1.0.1: + version "1.0.5" + resolved "https://registry.yarnpkg.com/is-date-object/-/is-date-object-1.0.5.tgz#0841d5536e724c25597bf6ea62e1bd38298df31f" + integrity sha512-9YQaSxsAiSwcvS33MBk3wTCVnWK+HhF8VZR2jRxehM16QcVOdHqPn4VPHmRK4lSr38n9JriurInLcP90xsYNfQ== + dependencies: + has-tostringtag "^1.0.0" + +is-extglob@^2.1.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/is-extglob/-/is-extglob-2.1.1.tgz#a88c02535791f02ed37c76a1b9ea9773c833f8c2" + integrity sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ== + +is-fullwidth-code-point@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-4.0.0.tgz#fae3167c729e7463f8461ce512b080a49268aa88" + integrity sha512-O4L094N2/dZ7xqVdrXhh9r1KODPJpFms8B5sGdJLPy664AgvXsreZUyCQQNItZRDlYug4xStLjNp/sz3HvBowQ== + +is-glob@^4.0.1: + version "4.0.3" + resolved "https://registry.yarnpkg.com/is-glob/-/is-glob-4.0.3.tgz#64f61e42cbbb2eec2071a9dac0b28ba1e65d5084" + integrity sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg== + dependencies: + is-extglob "^2.1.1" + +is-negative-zero@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.2.tgz#7bf6f03a28003b8b3965de3ac26f664d765f3150" + integrity sha512-dqJvarLawXsFbNDeJW7zAz8ItJ9cd28YufuuFzh0G8pNHjJMnY08Dv7sYX2uF5UpQOwieAeOExEYAWWfu7ZZUA== + +is-number-object@^1.0.4: + version "1.0.7" + resolved "https://registry.yarnpkg.com/is-number-object/-/is-number-object-1.0.7.tgz#59d50ada4c45251784e9904f5246c742f07a42fc" + integrity sha512-k1U0IRzLMo7ZlYIfzRu23Oh6MiIFasgpb9X76eqfFZAqwH44UI4KTBvBYIZ1dSL9ZzChTB9ShHfLkR4pdW5krQ== + dependencies: + has-tostringtag "^1.0.0" + +is-number@^7.0.0: + version "7.0.0" + resolved 
"https://registry.yarnpkg.com/is-number/-/is-number-7.0.0.tgz#7535345b896734d5f80c4d06c50955527a14f12b" + integrity sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng== + +is-regex@^1.1.4: + version "1.1.4" + resolved "https://registry.yarnpkg.com/is-regex/-/is-regex-1.1.4.tgz#eef5663cd59fa4c0ae339505323df6854bb15958" + integrity sha512-kvRdxDsxZjhzUX07ZnLydzS1TU/TJlTUHHY4YLL87e37oUA49DfkLqgy+VjFocowy29cKvcSiu+kIv728jTTVg== + dependencies: + call-bind "^1.0.2" + has-tostringtag "^1.0.0" + +is-shared-array-buffer@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/is-shared-array-buffer/-/is-shared-array-buffer-1.0.2.tgz#8f259c573b60b6a32d4058a1a07430c0a7344c79" + integrity sha512-sqN2UDu1/0y6uvXyStCOzyhAjCSlHceFoMKJW8W9EU9cvic/QdsZ0kEU93HEy3IUEFZIiH/3w+AH/UQbPHNdhA== + dependencies: + call-bind "^1.0.2" + +is-string@^1.0.5, is-string@^1.0.7: + version "1.0.7" + resolved "https://registry.yarnpkg.com/is-string/-/is-string-1.0.7.tgz#0dd12bf2006f255bb58f695110eff7491eebc0fd" + integrity sha512-tE2UXzivje6ofPW7l23cjDOMa09gb7xlAqG6jG5ej6uPV32TlWP3NKPigtaGeHNu9fohccRYvIiZMfOOnOYUtg== + dependencies: + has-tostringtag "^1.0.0" + +is-symbol@^1.0.2, is-symbol@^1.0.3: + version "1.0.4" + resolved "https://registry.yarnpkg.com/is-symbol/-/is-symbol-1.0.4.tgz#a6dac93b635b063ca6872236de88910a57af139c" + integrity sha512-C/CPBqKWnvdcxqIARxyOh4v1UUEOCHpgDa0WYgpKDFMszcrPcffg5uhwSgPCLD2WWxmq6isisz87tzT01tuGhg== + dependencies: + has-symbols "^1.0.2" + +is-typed-array@^1.1.10, is-typed-array@^1.1.9: + version "1.1.10" + resolved "https://registry.yarnpkg.com/is-typed-array/-/is-typed-array-1.1.10.tgz#36a5b5cb4189b575d1a3e4b08536bfb485801e3f" + integrity sha512-PJqgEHiWZvMpaFZ3uTc8kHPM4+4ADTlDniuQL7cU/UDA0Ql7F70yGfHph3cLNe+c9toaigv+DFzTJKhc2CtO6A== + dependencies: + available-typed-arrays "^1.0.5" + call-bind "^1.0.2" + for-each "^0.3.3" + gopd "^1.0.1" + has-tostringtag "^1.0.0" + +is-weakref@^1.0.2: + version "1.0.2" + 
resolved "https://registry.yarnpkg.com/is-weakref/-/is-weakref-1.0.2.tgz#9529f383a9338205e89765e0392efc2f100f06f2" + integrity sha512-qctsuLZmIQ0+vSSMfoVvyFe2+GSEvnmZ2ezTup1SBse9+twCCeial6EEi3Nc2KFcf6+qz2FBPnjXsk8xhKSaPQ== + dependencies: + call-bind "^1.0.2" + +isexe@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" + integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw== + +json-parse-better-errors@^1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz#bb867cfb3450e69107c131d1c514bab3dc8bcaa9" + integrity sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw== + +jsonc-parser@^3.2.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/jsonc-parser/-/jsonc-parser-3.2.0.tgz#31ff3f4c2b9793f89c67212627c51c6394f88e76" + integrity sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w== + +load-json-file@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/load-json-file/-/load-json-file-4.0.0.tgz#2f5f45ab91e33216234fd53adab668eb4ec0993b" + integrity sha512-Kx8hMakjX03tiGTLAIdJ+lL0htKnXjEZN6hk/tozf/WOuYGdZBJrZ+rCJRbVCugsjB3jMLn9746NsQIf5VjBMw== + dependencies: + graceful-fs "^4.1.2" + parse-json "^4.0.0" + pify "^3.0.0" + strip-bom "^3.0.0" + +local-pkg@^0.4.2: + version "0.4.3" + resolved "https://registry.yarnpkg.com/local-pkg/-/local-pkg-0.4.3.tgz#0ff361ab3ae7f1c19113d9bb97b98b905dbc4963" + integrity sha512-SFppqq5p42fe2qcZQqqEOiVRXl+WCP1MdT6k7BDEW1j++sp5fIY+/fdRQitvKgB5BrBcmrs5m/L0v2FrU5MY1g== + +loupe@^2.3.1, loupe@^2.3.6: + version "2.3.6" + resolved "https://registry.yarnpkg.com/loupe/-/loupe-2.3.6.tgz#76e4af498103c532d1ecc9be102036a21f787b53" + integrity sha512-RaPMZKiMy8/JruncMU5Bt6na1eftNoo++R4Y+N2FrxkDVTrGvcyzFTsaGif4QTeKESheMGegbhw6iUAq+5A8zA== + 
dependencies: + get-func-name "^2.0.0" + +memorystream@^0.3.1: + version "0.3.1" + resolved "https://registry.yarnpkg.com/memorystream/-/memorystream-0.3.1.tgz#86d7090b30ce455d63fbae12dda51a47ddcaf9b2" + integrity sha512-S3UwM3yj5mtUSEfP41UZmt/0SCoVYUcU1rkXv+BQ5Ig8ndL4sPoJNBUJERafdPb5jjHJGuMgytgKvKIf58XNBw== + +merge2@^1.3.0: + version "1.4.1" + resolved "https://registry.yarnpkg.com/merge2/-/merge2-1.4.1.tgz#4368892f885e907455a6fd7dc55c0c9d404990ae" + integrity sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg== + +micromatch@^4.0.4: + version "4.0.5" + resolved "https://registry.yarnpkg.com/micromatch/-/micromatch-4.0.5.tgz#bc8999a7cbbf77cdc89f132f6e467051b49090c6" + integrity sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA== + dependencies: + braces "^3.0.2" + picomatch "^2.3.1" + +minimatch@^3.0.4: + version "3.1.2" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" + integrity sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw== + dependencies: + brace-expansion "^1.1.7" + +minimatch@^5.1.0: + version "5.1.6" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.6.tgz#1cfcb8cf5522ea69952cd2af95ae09477f122a96" + integrity sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g== + dependencies: + brace-expansion "^2.0.1" + +mkdirp@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-1.0.4.tgz#3eb5ed62622756d79a5f0e2a221dfebad75c2f7e" + integrity sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw== + +mlly@^1.1.0, mlly@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/mlly/-/mlly-1.1.1.tgz#f1838b14795e2cc284aa4ebcc76a258a52e6f537" + integrity 
sha512-Jnlh4W/aI4GySPo6+DyTN17Q75KKbLTyFK8BrGhjNP4rxuUjbRWhE6gHg3bs33URWAF44FRm7gdQA348i3XxRw== + dependencies: + acorn "^8.8.2" + pathe "^1.1.0" + pkg-types "^1.0.1" + ufo "^1.1.0" + +ms@2.1.2: + version "2.1.2" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" + integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== + +nanoid@^3.3.4: + version "3.3.4" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.4.tgz#730b67e3cd09e2deacf03c027c81c9d9dbc5e8ab" + integrity sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw== + +nice-try@^1.0.4: + version "1.0.5" + resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366" + integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ== + +normalize-package-data@^2.3.2: + version "2.5.0" + resolved "https://registry.yarnpkg.com/normalize-package-data/-/normalize-package-data-2.5.0.tgz#e66db1838b200c1dfc233225d12cb36520e234a8" + integrity sha512-/5CMN3T0R4XTj4DcGaexo+roZSdSFW/0AOOTROrjxzCG1wrWXEsGbRKevjlIL+ZDE4sZlJr5ED4YW0yqmkK+eA== + dependencies: + hosted-git-info "^2.1.4" + resolve "^1.10.0" + semver "2 || 3 || 4 || 5" + validate-npm-package-license "^3.0.1" + +npm-run-all@^4.1.5: + version "4.1.5" + resolved "https://registry.yarnpkg.com/npm-run-all/-/npm-run-all-4.1.5.tgz#04476202a15ee0e2e214080861bff12a51d98fba" + integrity sha512-Oo82gJDAVcaMdi3nuoKFavkIHBRVqQ1qvMb+9LHk/cF4P6B2m8aP04hGf7oL6wZ9BuGwX1onlLhpuoofSyoQDQ== + dependencies: + ansi-styles "^3.2.1" + chalk "^2.4.1" + cross-spawn "^6.0.5" + memorystream "^0.3.1" + minimatch "^3.0.4" + pidtree "^0.3.0" + read-pkg "^3.0.0" + shell-quote "^1.6.1" + string.prototype.padend "^3.0.0" + +object-inspect@^1.12.2, object-inspect@^1.9.0: + version "1.12.3" + resolved 
"https://registry.yarnpkg.com/object-inspect/-/object-inspect-1.12.3.tgz#ba62dffd67ee256c8c086dfae69e016cd1f198b9" + integrity sha512-geUvdk7c+eizMNUDkRpW1wJwgfOiOeHbxBR/hLXK1aT6zmVSO0jsQcs7fj6MGw89jC/cjGfLcNOrtMYtGqm81g== + +object-keys@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/object-keys/-/object-keys-1.1.1.tgz#1c47f272df277f3b1daf061677d9c82e2322c60e" + integrity sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA== + +object.assign@^4.1.4: + version "4.1.4" + resolved "https://registry.yarnpkg.com/object.assign/-/object.assign-4.1.4.tgz#9673c7c7c351ab8c4d0b516f4343ebf4dfb7799f" + integrity sha512-1mxKf0e58bvyjSCtKYY4sRe9itRk3PJpquJOjeIkz885CczcI4IvJJDLPS72oowuSh+pBxUFROpX+TU++hxhZQ== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.4" + has-symbols "^1.0.3" + object-keys "^1.1.1" + +p-limit@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-4.0.0.tgz#914af6544ed32bfa54670b061cafcbd04984b644" + integrity sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ== + dependencies: + yocto-queue "^1.0.0" + +parse-json@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-4.0.0.tgz#be35f5425be1f7f6c747184f98a788cb99477ee0" + integrity sha512-aOIos8bujGN93/8Ox/jPLh7RwVnPEysynVFE+fQZyg6jKELEHwzgKdLRFHUgXJL6kylijVSBC4BvN9OmsB48Rw== + dependencies: + error-ex "^1.3.1" + json-parse-better-errors "^1.0.1" + +path-browserify@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/path-browserify/-/path-browserify-1.0.1.tgz#d98454a9c3753d5790860f16f68867b9e46be1fd" + integrity sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g== + +path-key@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/path-key/-/path-key-2.0.1.tgz#411cadb574c5a140d3a4b1910d40d80cc9f40b40" + integrity 
sha512-fEHGKCSmUSDPv4uoj8AlD+joPlq3peND+HRYyxFz4KPw4z926S/b8rIuFs2FYJg3BwsxJf6A9/3eIdLaYC+9Dw== + +path-parse@^1.0.7: + version "1.0.7" + resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.7.tgz#fbc114b60ca42b30d9daf5858e4bd68bbedb6735" + integrity sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw== + +path-type@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/path-type/-/path-type-3.0.0.tgz#cef31dc8e0a1a3bb0d105c0cd97cf3bf47f4e36f" + integrity sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg== + dependencies: + pify "^3.0.0" + +pathe@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/pathe/-/pathe-1.1.0.tgz#e2e13f6c62b31a3289af4ba19886c230f295ec03" + integrity sha512-ODbEPR0KKHqECXW1GoxdDb+AZvULmXjVPy4rt+pGo2+TnjJTIPJQSVS6N63n8T2Ip+syHhbn52OewKicV0373w== + +pathval@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/pathval/-/pathval-1.1.1.tgz#8534e77a77ce7ac5a2512ea21e0fdb8fcf6c3d8d" + integrity sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ== + +picocolors@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c" + integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ== + +picomatch@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42" + integrity sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA== + +pidtree@^0.3.0: + version "0.3.1" + resolved "https://registry.yarnpkg.com/pidtree/-/pidtree-0.3.1.tgz#ef09ac2cc0533df1f3250ccf2c4d366b0d12114a" + integrity sha512-qQbW94hLHEqCg7nhby4yRC7G2+jYHY4Rguc2bjw7Uug4GIJuu1tvf2uHaZv5Q8zdt+WKJ6qK1FOI6amaWUo5FA== + +pify@^3.0.0: + version "3.0.0" + resolved 
"https://registry.yarnpkg.com/pify/-/pify-3.0.0.tgz#e5a4acd2c101fdf3d9a4d07f0dbc4db49dd28176" + integrity sha512-C3FsVNH1udSEX48gGX1xfvwTWfsYWj5U+8/uK15BGzIGrKoUpghX8hWZwa/OFnakBiiVNmBvemTJR5mcy7iPcg== + +pkg-types@^1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/pkg-types/-/pkg-types-1.0.2.tgz#c233efc5210a781e160e0cafd60c0d0510a4b12e" + integrity sha512-hM58GKXOcj8WTqUXnsQyJYXdeAPbythQgEF3nTcEo+nkD49chjQ9IKm/QJy9xf6JakXptz86h7ecP2024rrLaQ== + dependencies: + jsonc-parser "^3.2.0" + mlly "^1.1.1" + pathe "^1.1.0" + +postcss@^8.4.21: + version "8.4.21" + resolved "https://registry.yarnpkg.com/postcss/-/postcss-8.4.21.tgz#c639b719a57efc3187b13a1d765675485f4134f4" + integrity sha512-tP7u/Sn/dVxK2NnruI4H9BG+x+Wxz6oeZ1cJ8P6G/PZY0IKk4k/63TDsQf2kQq3+qoJeLm2kIBUNlZe3zgb4Zg== + dependencies: + nanoid "^3.3.4" + picocolors "^1.0.0" + source-map-js "^1.0.2" + +pretty-format@^27.5.1: + version "27.5.1" + resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-27.5.1.tgz#2181879fdea51a7a5851fb39d920faa63f01d88e" + integrity sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ== + dependencies: + ansi-regex "^5.0.1" + ansi-styles "^5.0.0" + react-is "^17.0.1" + +queue-microtask@^1.2.2: + version "1.2.3" + resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243" + integrity sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A== + +react-is@^17.0.1: + version "17.0.2" + resolved "https://registry.yarnpkg.com/react-is/-/react-is-17.0.2.tgz#e691d4a8e9c789365655539ab372762b0efb54f0" + integrity sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w== + +read-pkg@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/read-pkg/-/read-pkg-3.0.0.tgz#9cbc686978fee65d16c00e2b19c237fcf6e38389" + integrity 
sha512-BLq/cCO9two+lBgiTYNqD6GdtK8s4NpaWrl6/rCO9w0TUS8oJl7cmToOZfRYllKTISY6nt1U7jQ53brmKqY6BA== + dependencies: + load-json-file "^4.0.0" + normalize-package-data "^2.3.2" + path-type "^3.0.0" + +regexp.prototype.flags@^1.4.3: + version "1.4.3" + resolved "https://registry.yarnpkg.com/regexp.prototype.flags/-/regexp.prototype.flags-1.4.3.tgz#87cab30f80f66660181a3bb7bf5981a872b367ac" + integrity sha512-fjggEOO3slI6Wvgjwflkc4NFRCTZAu5CnNfBd5qOMYhWdn67nJBBu34/TkD++eeFmd8C9r9jfXJ27+nSiRkSUA== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.3" + functions-have-names "^1.2.2" + +resolve@^1.10.0, resolve@^1.22.1: + version "1.22.1" + resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.1.tgz#27cb2ebb53f91abb49470a928bba7558066ac177" + integrity sha512-nBpuuYuY5jFsli/JIs1oldw6fOQCBioohqWZg/2hiaOybXOft4lonv85uDOKXdf8rhyK159cxU5cDcK/NKk8zw== + dependencies: + is-core-module "^2.9.0" + path-parse "^1.0.7" + supports-preserve-symlinks-flag "^1.0.0" + +reusify@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/reusify/-/reusify-1.0.4.tgz#90da382b1e126efc02146e90845a88db12925d76" + integrity sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw== + +rollup@^3.10.0: + version "3.17.2" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-3.17.2.tgz#a4ecd29c488672a0606e41ef57474fad715750a9" + integrity sha512-qMNZdlQPCkWodrAZ3qnJtvCAl4vpQ8q77uEujVCCbC/6CLB7Lcmvjq7HyiOSnf4fxTT9XgsE36oLHJBH49xjqA== + optionalDependencies: + fsevents "~2.3.2" + +run-parallel@^1.1.9: + version "1.2.0" + resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee" + integrity sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA== + dependencies: + queue-microtask "^1.2.2" + +safe-regex-test@^1.0.0: + version "1.0.0" + resolved 
"https://registry.yarnpkg.com/safe-regex-test/-/safe-regex-test-1.0.0.tgz#793b874d524eb3640d1873aad03596db2d4f2295" + integrity sha512-JBUUzyOgEwXQY1NuPtvcj/qcBDbDmEvWufhlnXZIm75DEHp+afM1r1ujJpJsV/gSM4t59tpDyPi1sd6ZaPFfsA== + dependencies: + call-bind "^1.0.2" + get-intrinsic "^1.1.3" + is-regex "^1.1.4" + +"semver@2 || 3 || 4 || 5", semver@^5.5.0: + version "5.7.1" + resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.1.tgz#a954f931aeba508d307bbf069eff0c01c96116f7" + integrity sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ== + +shebang-command@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-1.2.0.tgz#44aac65b695b03398968c39f363fee5deafdf1ea" + integrity sha512-EV3L1+UQWGor21OmnvojK36mhg+TyIKDh3iFBKBohr5xeXIhNBcx8oWdgkTEEQ+BEFFYdLRuqMfd5L84N1V5Vg== + dependencies: + shebang-regex "^1.0.0" + +shebang-regex@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-1.0.0.tgz#da42f49740c0b42db2ca9728571cb190c98efea3" + integrity sha512-wpoSFAxys6b2a2wHZ1XpDSgD7N9iVjg29Ph9uV/uaP9Ex/KXlkTZTeddxDPSYQpgvzKLGJke2UU0AzoGCjNIvQ== + +shell-quote@^1.6.1: + version "1.8.0" + resolved "https://registry.yarnpkg.com/shell-quote/-/shell-quote-1.8.0.tgz#20d078d0eaf71d54f43bd2ba14a1b5b9bfa5c8ba" + integrity sha512-QHsz8GgQIGKlRi24yFc6a6lN69Idnx634w49ay6+jA5yFh7a1UY+4Rp6HPx/L/1zcEDPEij8cIsiqR6bQsE5VQ== + +side-channel@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/side-channel/-/side-channel-1.0.4.tgz#efce5c8fdc104ee751b25c58d4290011fa5ea2cf" + integrity sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw== + dependencies: + call-bind "^1.0.0" + get-intrinsic "^1.0.2" + object-inspect "^1.9.0" + +siginfo@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/siginfo/-/siginfo-2.0.0.tgz#32e76c70b79724e3bb567cb9d543eb858ccfaf30" + integrity 
sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g== + +slice-ansi@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-5.0.0.tgz#b73063c57aa96f9cd881654b15294d95d285c42a" + integrity sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ== + dependencies: + ansi-styles "^6.0.0" + is-fullwidth-code-point "^4.0.0" + +source-map-js@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.0.2.tgz#adbc361d9c62df380125e7f161f71c826f1e490c" + integrity sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw== + +source-map-support@^0.5.21: + version "0.5.21" + resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.21.tgz#04fe7c7f9e1ed2d662233c28cb2b35b9f63f6e4f" + integrity sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w== + dependencies: + buffer-from "^1.0.0" + source-map "^0.6.0" + +source-map@^0.6.0, source-map@^0.6.1: + version "0.6.1" + resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.6.1.tgz#74722af32e9614e9c287a8d0bbde48b5e2f1a263" + integrity sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g== + +spdx-correct@^3.0.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/spdx-correct/-/spdx-correct-3.2.0.tgz#4f5ab0668f0059e34f9c00dce331784a12de4e9c" + integrity sha512-kN9dJbvnySHULIluDHy32WHRUu3Og7B9sbY7tsFLctQkIqnMh3hErYgdMjTYuqmcXX+lK5T1lnUt3G7zNswmZA== + dependencies: + spdx-expression-parse "^3.0.0" + spdx-license-ids "^3.0.0" + +spdx-exceptions@^2.1.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/spdx-exceptions/-/spdx-exceptions-2.3.0.tgz#3f28ce1a77a00372683eade4a433183527a2163d" + integrity sha512-/tTrYOC7PPI1nUAgx34hUpqXuyJG+DTHJTnIULG4rDygi4xu/tfgmq1e1cIRwRzwZgo4NLySi+ricLkZkw4i5A== + 
+spdx-expression-parse@^3.0.0: + version "3.0.1" + resolved "https://registry.yarnpkg.com/spdx-expression-parse/-/spdx-expression-parse-3.0.1.tgz#cf70f50482eefdc98e3ce0a6833e4a53ceeba679" + integrity sha512-cbqHunsQWnJNE6KhVSMsMeH5H/L9EpymbzqTQ3uLwNCLZ1Q481oWaofqH7nO6V07xlXwY6PhQdQ2IedWx/ZK4Q== + dependencies: + spdx-exceptions "^2.1.0" + spdx-license-ids "^3.0.0" + +spdx-license-ids@^3.0.0: + version "3.0.12" + resolved "https://registry.yarnpkg.com/spdx-license-ids/-/spdx-license-ids-3.0.12.tgz#69077835abe2710b65f03969898b6637b505a779" + integrity sha512-rr+VVSXtRhO4OHbXUiAF7xW3Bo9DuuF6C5jH+q/x15j2jniycgKbxU09Hr0WqlSLUs4i4ltHGXqTe7VHclYWyA== + +stackback@0.0.2: + version "0.0.2" + resolved "https://registry.yarnpkg.com/stackback/-/stackback-0.0.2.tgz#1ac8a0d9483848d1695e418b6d031a3c3ce68e3b" + integrity sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw== + +std-env@^3.3.1: + version "3.3.2" + resolved "https://registry.yarnpkg.com/std-env/-/std-env-3.3.2.tgz#af27343b001616015534292178327b202b9ee955" + integrity sha512-uUZI65yrV2Qva5gqE0+A7uVAvO40iPo6jGhs7s8keRfHCmtg+uB2X6EiLGCI9IgL1J17xGhvoOqSz79lzICPTA== + +string-width@^5.0.0: + version "5.1.2" + resolved "https://registry.yarnpkg.com/string-width/-/string-width-5.1.2.tgz#14f8daec6d81e7221d2a357e668cab73bdbca794" + integrity sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA== + dependencies: + eastasianwidth "^0.2.0" + emoji-regex "^9.2.2" + strip-ansi "^7.0.1" + +string.prototype.padend@^3.0.0: + version "3.1.4" + resolved "https://registry.yarnpkg.com/string.prototype.padend/-/string.prototype.padend-3.1.4.tgz#2c43bb3a89eb54b6750de5942c123d6c98dd65b6" + integrity sha512-67otBXoksdjsnXXRUq+KMVTdlVRZ2af422Y0aTyTjVaoQkGr3mxl2Bc5emi7dOQ3OGVVQQskmLEWwFXwommpNw== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.4" + es-abstract "^1.20.4" + +string.prototype.trimend@^1.0.6: + version "1.0.6" + resolved 
"https://registry.yarnpkg.com/string.prototype.trimend/-/string.prototype.trimend-1.0.6.tgz#c4a27fa026d979d79c04f17397f250a462944533" + integrity sha512-JySq+4mrPf9EsDBEDYMOb/lM7XQLulwg5R/m1r0PXEFqrV0qHvl58sdTilSXtKOflCsK2E8jxf+GKC0T07RWwQ== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.4" + es-abstract "^1.20.4" + +string.prototype.trimstart@^1.0.6: + version "1.0.6" + resolved "https://registry.yarnpkg.com/string.prototype.trimstart/-/string.prototype.trimstart-1.0.6.tgz#e90ab66aa8e4007d92ef591bbf3cd422c56bdcf4" + integrity sha512-omqjMDaY92pbn5HOX7f9IccLA+U1tA9GvtU4JrodiXFfYB7jPzzHpRzpglLAjtUV6bB557zwClJezTqnAiYnQA== + dependencies: + call-bind "^1.0.2" + define-properties "^1.1.4" + es-abstract "^1.20.4" + +strip-ansi@^7.0.1: + version "7.0.1" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.0.1.tgz#61740a08ce36b61e50e65653f07060d000975fb2" + integrity sha512-cXNxvT8dFNRVfhVME3JAe98mkXDYN2O1l7jmcwMnOslDeESg1rF/OZMtK0nRAhiari1unG5cD4jG3rapUAkLbw== + dependencies: + ansi-regex "^6.0.1" + +strip-bom@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/strip-bom/-/strip-bom-3.0.0.tgz#2334c18e9c759f7bdd56fdef7e9ae3d588e68ed3" + integrity sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA== + +strip-literal@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/strip-literal/-/strip-literal-1.0.1.tgz#0115a332710c849b4e46497891fb8d585e404bd2" + integrity sha512-QZTsipNpa2Ppr6v1AmJHESqJ3Uz247MUS0OjrnnZjFAvEoWqxuyFuXn2xLgMtRnijJShAa1HL0gtJyUs7u7n3Q== + dependencies: + acorn "^8.8.2" + +supports-color@^5.3.0: + version "5.5.0" + resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-5.5.0.tgz#e2e69a44ac8772f78a1ec0b35b689df6530efc8f" + integrity sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow== + dependencies: + has-flag "^3.0.0" + +supports-preserve-symlinks-flag@^1.0.0: + version "1.0.0" + resolved 
"https://registry.yarnpkg.com/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz#6eda4bd344a3c94aea376d4cc31bc77311039e09" + integrity sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w== + +tinybench@^2.3.1: + version "2.3.1" + resolved "https://registry.yarnpkg.com/tinybench/-/tinybench-2.3.1.tgz#14f64e6b77d7ef0b1f6ab850c7a808c6760b414d" + integrity sha512-hGYWYBMPr7p4g5IarQE7XhlyWveh1EKhy4wUBS1LrHXCKYgvz+4/jCqgmJqZxxldesn05vccrtME2RLLZNW7iA== + +tinypool@^0.3.1: + version "0.3.1" + resolved "https://registry.yarnpkg.com/tinypool/-/tinypool-0.3.1.tgz#a99c2e446aba9be05d3e1cb756d6aed7af4723b6" + integrity sha512-zLA1ZXlstbU2rlpA4CIeVaqvWq41MTWqLY3FfsAXgC8+f7Pk7zroaJQxDgxn1xNudKW6Kmj4808rPFShUlIRmQ== + +tinyspy@^1.0.2: + version "1.1.1" + resolved "https://registry.yarnpkg.com/tinyspy/-/tinyspy-1.1.1.tgz#0cb91d5157892af38cb2d217f5c7e8507a5bf092" + integrity sha512-UVq5AXt/gQlti7oxoIg5oi/9r0WpF7DGEVwXgqWSMmyN16+e3tl5lIvTaOpJ3TAtu5xFzWccFRM4R5NaWHF+4g== + +to-regex-range@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/to-regex-range/-/to-regex-range-5.0.1.tgz#1648c44aae7c8d988a326018ed72f5b4dd0392e4" + integrity sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ== + dependencies: + is-number "^7.0.0" + +ts-morph@^17.0.1: + version "17.0.1" + resolved "https://registry.yarnpkg.com/ts-morph/-/ts-morph-17.0.1.tgz#d85df4fcf9a1fcda1b331d52c00655f381c932d1" + integrity sha512-10PkHyXmrtsTvZSL+cqtJLTgFXkU43Gd0JCc0Rw6GchWbqKe0Rwgt1v3ouobTZwQzF1mGhDeAlWYBMGRV7y+3g== + dependencies: + "@ts-morph/common" "~0.18.0" + code-block-writer "^11.0.3" + +tsx@^3.12.3: + version "3.12.3" + resolved "https://registry.yarnpkg.com/tsx/-/tsx-3.12.3.tgz#b29f6c9246d4e3ea46451cd81d7cbc98f45c4b8a" + integrity sha512-Wc5BFH1xccYTXaQob+lEcimkcb/Pq+0en2s+ruiX0VEIC80nV7/0s7XRahx8NnsoCnpCVUPz8wrqVSPi760LkA== + dependencies: + "@esbuild-kit/cjs-loader" "^2.4.2" + 
"@esbuild-kit/core-utils" "^3.0.0" + "@esbuild-kit/esm-loader" "^2.5.5" + optionalDependencies: + fsevents "~2.3.2" + +type-detect@^4.0.0, type-detect@^4.0.5: + version "4.0.8" + resolved "https://registry.yarnpkg.com/type-detect/-/type-detect-4.0.8.tgz#7646fb5f18871cfbb7749e69bd39a6388eb7450c" + integrity sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g== + +typed-array-length@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/typed-array-length/-/typed-array-length-1.0.4.tgz#89d83785e5c4098bec72e08b319651f0eac9c1bb" + integrity sha512-KjZypGq+I/H7HI5HlOoGHkWUUGq+Q0TPhQurLbyrVrvnKTBgzLhIJ7j6J/XTQOi0d1RjyZ0wdas8bKs2p0x3Ng== + dependencies: + call-bind "^1.0.2" + for-each "^0.3.3" + is-typed-array "^1.1.9" + +typescript@^4.9.5: + version "4.9.5" + resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.9.5.tgz#095979f9bcc0d09da324d58d03ce8f8374cbe65a" + integrity sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g== + +ufo@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/ufo/-/ufo-1.1.0.tgz#a5c4c814b0a98f7e0ca42c478688663fd3e3c037" + integrity sha512-LQc2s/ZDMaCN3QLpa+uzHUOQ7SdV0qgv3VBXOolQGXTaaZpIur6PwUclF5nN2hNkiTRcUugXd1zFOW3FLJ135Q== + +unbox-primitive@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/unbox-primitive/-/unbox-primitive-1.0.2.tgz#29032021057d5e6cdbd08c5129c226dff8ed6f9e" + integrity sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw== + dependencies: + call-bind "^1.0.2" + has-bigints "^1.0.2" + has-symbols "^1.0.3" + which-boxed-primitive "^1.0.2" + +validate-npm-package-license@^3.0.1: + version "3.0.4" + resolved "https://registry.yarnpkg.com/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz#fc91f6b9c7ba15c857f4cb2c5defeec39d4f410a" + integrity sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew== 
+ dependencies: + spdx-correct "^3.0.0" + spdx-expression-parse "^3.0.0" + +vite-node@0.28.5: + version "0.28.5" + resolved "https://registry.yarnpkg.com/vite-node/-/vite-node-0.28.5.tgz#56d0f78846ea40fddf2e28390899df52a4738006" + integrity sha512-LmXb9saMGlrMZbXTvOveJKwMTBTNUH66c8rJnQ0ZPNX+myPEol64+szRzXtV5ORb0Hb/91yq+/D3oERoyAt6LA== + dependencies: + cac "^6.7.14" + debug "^4.3.4" + mlly "^1.1.0" + pathe "^1.1.0" + picocolors "^1.0.0" + source-map "^0.6.1" + source-map-support "^0.5.21" + vite "^3.0.0 || ^4.0.0" + +"vite@^3.0.0 || ^4.0.0": + version "4.1.4" + resolved "https://registry.yarnpkg.com/vite/-/vite-4.1.4.tgz#170d93bcff97e0ebc09764c053eebe130bfe6ca0" + integrity sha512-3knk/HsbSTKEin43zHu7jTwYWv81f8kgAL99G5NWBcA1LKvtvcVAC4JjBH1arBunO9kQka+1oGbrMKOjk4ZrBg== + dependencies: + esbuild "^0.16.14" + postcss "^8.4.21" + resolve "^1.22.1" + rollup "^3.10.0" + optionalDependencies: + fsevents "~2.3.2" + +vitest@^0.28.5: + version "0.28.5" + resolved "https://registry.yarnpkg.com/vitest/-/vitest-0.28.5.tgz#94410a8924cd7189e4f1adffa8c5cde809cbf2f9" + integrity sha512-pyCQ+wcAOX7mKMcBNkzDwEHRGqQvHUl0XnoHR+3Pb1hytAHISgSxv9h0gUiSiYtISXUU3rMrKiKzFYDrI6ZIHA== + dependencies: + "@types/chai" "^4.3.4" + "@types/chai-subset" "^1.3.3" + "@types/node" "*" + "@vitest/expect" "0.28.5" + "@vitest/runner" "0.28.5" + "@vitest/spy" "0.28.5" + "@vitest/utils" "0.28.5" + acorn "^8.8.1" + acorn-walk "^8.2.0" + cac "^6.7.14" + chai "^4.3.7" + debug "^4.3.4" + local-pkg "^0.4.2" + pathe "^1.1.0" + picocolors "^1.0.0" + source-map "^0.6.1" + std-env "^3.3.1" + strip-literal "^1.0.0" + tinybench "^2.3.1" + tinypool "^0.3.1" + tinyspy "^1.0.2" + vite "^3.0.0 || ^4.0.0" + vite-node "0.28.5" + why-is-node-running "^2.2.2" + +which-boxed-primitive@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" + integrity 
sha512-bwZdv0AKLpplFY2KZRX6TvyuN7ojjr7lwkg6ml0roIy9YeuSr7JS372qlNW18UQYzgYK9ziGcerWqZOmEn9VNg== + dependencies: + is-bigint "^1.0.1" + is-boolean-object "^1.1.0" + is-number-object "^1.0.4" + is-string "^1.0.5" + is-symbol "^1.0.3" + +which-typed-array@^1.1.9: + version "1.1.9" + resolved "https://registry.yarnpkg.com/which-typed-array/-/which-typed-array-1.1.9.tgz#307cf898025848cf995e795e8423c7f337efbde6" + integrity sha512-w9c4xkx6mPidwp7180ckYWfMmvxpjlZuIudNtDf4N/tTAUB8VJbX25qZoAsrtGuYNnGw3pa0AXgbGKRB8/EceA== + dependencies: + available-typed-arrays "^1.0.5" + call-bind "^1.0.2" + for-each "^0.3.3" + gopd "^1.0.1" + has-tostringtag "^1.0.0" + is-typed-array "^1.1.10" + +which@^1.2.9: + version "1.3.1" + resolved "https://registry.yarnpkg.com/which/-/which-1.3.1.tgz#a45043d54f5805316da8d62f9f50918d3da70b0a" + integrity sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ== + dependencies: + isexe "^2.0.0" + +why-is-node-running@^2.2.2: + version "2.2.2" + resolved "https://registry.yarnpkg.com/why-is-node-running/-/why-is-node-running-2.2.2.tgz#4185b2b4699117819e7154594271e7e344c9973e" + integrity sha512-6tSwToZxTOcotxHeA+qGCq1mVzKR3CwcJGmVcY+QE8SHy6TnpFnh8PAvPNHYr7EcuVeG0QSMxtYCuO1ta/G/oA== + dependencies: + siginfo "^2.0.0" + stackback "0.0.2" + +yocto-queue@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-1.0.0.tgz#7f816433fb2cbc511ec8bf7d263c3b58a1a3c251" + integrity sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g== diff --git a/pyproject.toml b/pyproject.toml index 791e3c7a..c6fa5d9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,12 @@ requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"] build-frontend = "build" build-verbosity = 1 -linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y" +linux.before-all = [ + "ulimit -n 1024", + "((command -v yum && yum install 
openssl-devel -y) || echo 'no yum found')", + "((command -v apk && apk add --no-cache openssl openssl-dev) || echo 'no apk found')", + "(curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y)" +] linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" } macos.before-all = "rustup target add aarch64-apple-darwin" @@ -36,5 +41,4 @@ macos.archs = ["x86_64", "arm64"] test-skip = "*-macosx_arm64" before-test = "pip install pytest" -test-command = "pytest {project}/tests" - +test-command = "pytest {project}/tests" \ No newline at end of file diff --git a/python/Cargo.toml b/python/Cargo.toml new file mode 100644 index 00000000..7e97d03b --- /dev/null +++ b/python/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "tiktoken" +version = "0.2.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "_tiktoken" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.17.3", features = ["extension-module"] } +tiktoken_core = { path = "../core", features = ["multithreading"] } +rustc-hash = "1.1.0" diff --git a/python/src/lib.rs b/python/src/lib.rs new file mode 100644 index 00000000..599105e2 --- /dev/null +++ b/python/src/lib.rs @@ -0,0 +1,97 @@ +#![allow(clippy::borrow_deref_ref)] + +use std::collections::HashSet; + +use pyo3::exceptions; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyList, PyTuple}; +use pyo3::PyResult; +use rustc_hash::FxHashMap as HashMap; + +use _tiktoken_core::CoreBPENative; + +#[pyclass] +struct CoreBPE { + native: CoreBPENative, +} + +#[pymethods] +impl CoreBPE { + #[new] + fn new( + encoder: HashMap, usize>, + special_tokens_encoder: HashMap, + pattern: &str, + ) -> PyResult { + let native = CoreBPENative::new(encoder, special_tokens_encoder, pattern) + .map_err(|e| PyErr::new::(e.to_string()))?; + Ok(CoreBPE { native }) + } + + // ==================== + // Encoding + // ==================== + + fn encode_ordinary(&self, py: Python, text: &str) -> Vec { + py.allow_threads(|| self.native._encode_ordinary_native(text)) 
+ } + + fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec { + py.allow_threads(|| self.native._encode_native(text, &allowed_special, None).0) + } + + fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec { + py.allow_threads(|| { + self.native._encode_bytes(bytes) + }) + } + + fn encode_with_unstable( + &self, + py: Python, + text: &str, + allowed_special: HashSet<&str>, + ) -> Py { + let (tokens, completions) = + py.allow_threads(|| self.native._encode_unstable_native(text, &allowed_special)); + let py_completions = + PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..]))); + (tokens, py_completions).into_py(py) + } + + fn encode_single_token(&self, piece: &[u8]) -> PyResult { + self.native.encode_single_token(piece).map_err(|e| PyErr::new::(e)) + } + + // ==================== + // Decoding + // ==================== + + fn decode_bytes(&self, py: Python, tokens: Vec) -> Py { + let bytes = py.allow_threads(|| self.native._decode_native(&tokens)); + PyBytes::new(py, &bytes).into() + } + + fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult> { + self.native.decode_single_token_bytes(token).map(|bytes| PyBytes::new(py, &bytes).into()) + .map_err(|e| PyErr::new::(e)) + } + + // ==================== + // Miscellaneous + // ==================== + + fn token_byte_values(&self, py: Python) -> Vec> { + self.native.token_byte_values() + .iter() + .map(|x| PyBytes::new(py, x).into()) + .collect() + } +} + + +#[pymodule] +fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + Ok(()) +} diff --git a/ruby/Cargo.toml b/ruby/Cargo.toml new file mode 100644 index 00000000..a9700f0a --- /dev/null +++ b/ruby/Cargo.toml @@ -0,0 +1,5 @@ +[workspace] +members = ["ext/tiktoken"] + +[profile.release] +strip = true \ No newline at end of file diff --git a/ruby/Gemfile b/ruby/Gemfile new file mode 100644 index 00000000..4eea8689 --- /dev/null +++ b/ruby/Gemfile @@ -0,0 +1,6 @@ +source 
"https://rubygems.org" + +gemspec + +gem "rake" +gem "rake-compiler" \ No newline at end of file diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock new file mode 100644 index 00000000..b6f436ae --- /dev/null +++ b/ruby/Gemfile.lock @@ -0,0 +1,25 @@ +PATH + remote: . + specs: + tiktoken (0.1.0) + rb_sys (~> 0.9) + +GEM + remote: https://rubygems.org/ + specs: + rake (13.0.6) + rake-compiler (1.2.1) + rake + rb_sys (0.9.68) + +PLATFORMS + arm64-darwin-21 + x86_64-linux + +DEPENDENCIES + rake + rake-compiler + tiktoken! + +BUNDLED WITH + 2.4.6 diff --git a/ruby/Rakefile b/ruby/Rakefile new file mode 100644 index 00000000..89c1a076 --- /dev/null +++ b/ruby/Rakefile @@ -0,0 +1,37 @@ +require "bundler/gem_tasks" +require "rake/extensiontask" + +platforms = [ + "x86_64-linux", + "x86_64-linux-musl", + "aarch64-linux", + "x86_64-darwin", + "arm64-darwin", + "x64-mingw-ucrt", + "x64-mingw32" +] + +gemspec = Bundler.load_gemspec("tiktoken.gemspec") +Rake::ExtensionTask.new("tiktoken", gemspec) do |ext| + ext.lib_dir = "lib/tiktoken" + ext.cross_compile = true + ext.cross_platform = platforms + ext.cross_compiling do |spec| + spec.dependencies.reject! { |dep| dep.name == "rb_sys" } + spec.files.reject! 
{ |file| File.fnmatch?("ext/*", file, File::FNM_EXTGLOB) } + end +end + +task :rank do + Dir.chdir('../js') do + system("npx tsx scripts/inline_ranks.ts") + system("mv ranks ../ruby") + end +end + +task :remove_ext do + path = "lib/tiktoken/tiktoken.bundle" + File.unlink(path) if File.exist?(path) +end + +Rake::Task["build"].enhance [:remove_ext] \ No newline at end of file diff --git a/ruby/ext/tiktoken/Cargo.toml b/ruby/ext/tiktoken/Cargo.toml new file mode 100644 index 00000000..fecc04a0 --- /dev/null +++ b/ruby/ext/tiktoken/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "tiktoken" +version = "0.2.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "tiktoken" +crate-type = ["cdylib"] + +[dependencies] +tiktoken_core = { path = "../../../core", features = ["multithreading"] } +rustc-hash = "1.1.0" +magnus = "0.5.1" +serde_magnus = "0.2.1" +fancy-regex = "0.10.0" +regex = "1.7.0" +base64 = "0.21.0" +anyhow = "1.0.69" + + +[build-dependencies] +json = "0.12.4" \ No newline at end of file diff --git a/ruby/ext/tiktoken/extconf.rb b/ruby/ext/tiktoken/extconf.rb new file mode 100644 index 00000000..8f626b3b --- /dev/null +++ b/ruby/ext/tiktoken/extconf.rb @@ -0,0 +1,4 @@ +require "mkmf" +require "rb_sys/mkmf" + +create_rust_makefile("tiktoken/tiktoken") \ No newline at end of file diff --git a/ruby/ext/tiktoken/src/lib.rs b/ruby/ext/tiktoken/src/lib.rs new file mode 100644 index 00000000..0a444063 --- /dev/null +++ b/ruby/ext/tiktoken/src/lib.rs @@ -0,0 +1,363 @@ +use _tiktoken_core::CoreBPENative; + +use base64::{engine::general_purpose, Engine as _}; +use fancy_regex::Regex; +use rustc_hash::FxHashMap as HashMap; +use std::collections::HashSet; +use std::result::Result; +use anyhow::Error; + +use magnus::{define_module, exception, function, memoize, method, prelude::*, Error as MError, RModule, Value, RString}; +use serde_magnus::deserialize; + +type RbResult = Result; + +const ENDOFTEXT: &'static str = "<|endoftext|>"; + +const FIM_PREFIX: &'static str = 
"<|fim_prefix|>"; + +const FIM_MIDDLE: &'static str = "<|fim_middle|>"; + +const FIM_SUFFIX: &'static str = "<|fim_suffix|>"; + +const ENDOFPROMPT: &'static str = "<|endofprompt|>"; + +struct CoreBPEConstructor { + encoder: HashMap, usize>, + special_tokens: HashMap, + pat_str: String, +} + +impl CoreBPEConstructor { + fn new( + tiktoken_bfe: &str, + special_tokens: Option>, + pat_str: &str, + ) -> Self { + CoreBPEConstructor { + encoder: CoreBPEConstructor::parse_bfe(tiktoken_bfe).unwrap(), + special_tokens: special_tokens.unwrap_or_default(), + pat_str: String::from(pat_str), + } + } + + fn parse_bfe(tiktoken_bfe: &str) -> Result, usize>, Error> { + let mut encoder = HashMap::default(); + if tiktoken_bfe.chars().next().unwrap() == '!' { + for line in tiktoken_bfe.lines() { + let mut parts = line.split(' '); + parts.next().unwrap(); + + let offset: i32 = parts.next().unwrap().parse()?; + for (pos, token) in parts.enumerate() { + let token = &general_purpose::STANDARD.decode(token)?; + encoder.insert(token.clone(), (offset as usize) + pos); + } + } + } else { + for line in tiktoken_bfe.lines() { + let mut parts = line.split(' '); + let token = &general_purpose::STANDARD.decode(parts.next().unwrap())?; + let rank: usize = parts.next().unwrap().parse().unwrap(); + encoder.insert(token.clone(), rank); + } + } + + Ok(encoder) + } + + fn gpt2() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../../../ranks/gpt2.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + fn r50k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../../../ranks/r50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| 
?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + fn p50k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + + CoreBPEConstructor::new( + include_str!("../../../ranks/p50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + fn p50k_edit() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 50256); + special_tokens.insert(String::from(FIM_PREFIX), 50281); + special_tokens.insert(String::from(FIM_MIDDLE), 50282); + special_tokens.insert(String::from(FIM_SUFFIX), 50283); + + CoreBPEConstructor::new( + include_str!("../../../ranks/p50k_base.compress.tiktoken"), + Some(special_tokens), + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + ) + } + + fn cl100k_base() -> Self { + let mut special_tokens = HashMap::default(); + special_tokens.insert(String::from(ENDOFTEXT), 100257); + special_tokens.insert(String::from(FIM_PREFIX), 100258); + special_tokens.insert(String::from(FIM_MIDDLE), 100259); + special_tokens.insert(String::from(FIM_SUFFIX), 100260); + special_tokens.insert(String::from(ENDOFPROMPT), 100276); + + CoreBPEConstructor::new( + include_str!("../../../ranks/cl100k_base.compress.tiktoken"), + Some(special_tokens), + "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + ) + } +} + +#[magnus::wrap(class = "Tiktoken::Encoder")] +pub struct Encoder { + name: Option, + special_tokens_set: HashSet, + bpe: CoreBPENative, +} + +impl Encoder { + fn from_encoding( + encoding: String, + extend_special_tokens: &Option>, + ) -> RbResult { + let mut constructor: CoreBPEConstructor = match encoding.as_str() { + "gpt2" => Ok(CoreBPEConstructor::gpt2()), + "r50k_base" => Ok(CoreBPEConstructor::r50k_base()), + "p50k_base" => 
Ok(CoreBPEConstructor::p50k_base()), + "p50k_edit" => Ok(CoreBPEConstructor::p50k_edit()), + "cl100k_base" => Ok(CoreBPEConstructor::cl100k_base()), + &_ => Err(MError::new(exception::arg_error(), "Invalid encoding")), + }?; + + if let Some(tokens) = extend_special_tokens { + constructor.special_tokens.extend(tokens.clone()); + } + + Ok(Encoder { + name: Some(String::from(encoding)), + // TODO: can we avoid cloning here? + special_tokens_set: constructor + .special_tokens + .keys() + .map(|s| s.clone()) + .collect(), + bpe: CoreBPENative::new( + constructor.encoder, + constructor.special_tokens, + &constructor.pat_str, + ) + .unwrap(), + }) + } + + pub fn name(&self) -> Option { + self.name.clone() + } + + pub fn encode( + &self, + text: RString, + allowed_special: Value, + disallowed_special: Value, + ) -> RbResult> { + unsafe { + let text_str = text.as_str().unwrap(); + let allowed_tokens = + self.validate_allowed_tokens(&text_str, &allowed_special, &disallowed_special)?; + + Ok(self + .bpe + ._encode_native( + &text_str, + &allowed_tokens.iter().map(AsRef::as_ref).collect(), + None, + ) + .0) + } + } + + pub fn encode_ordinary(&self, text: RString) -> RbResult> { + unsafe { + let text_str = text.as_str().unwrap(); + Ok(self.bpe._encode_ordinary_native(&text_str)) + } + } + + // TODO do we need this? + //pub fn encode_single_token(&self, bytes: &[u8]) -> usize { + // self.bpe.encode_single_token(&bytes).unwrap() + //} + + pub fn decode(&self, tokens: Vec) -> Vec { + self.bpe._decode_native(&tokens) + } + + // TODO do we need this? 
+ // pub fn decode_single_token_bytes(&self, token: usize) -> Vec { + // self.bpe + // .decode_single_token_bytes(token) + // .unwrap() + // .to_vec() + // } + + // TODO do we need this + // pub fn token_byte_values(&self) -> Value { + // JsValue::from_serde(&self.bpe.token_byte_values()).unwrap_throw() + // } + + fn validate_allowed_tokens( + &self, + text: &str, + allowed_special_param: &Value, + disallowed_special_param: &Value, + ) -> Result, MError> { + // If it's a string, only 'all' is allowed. Otherwise, needs to be a list of strings. + let allowed_special: HashSet = match allowed_special_param.class().inspect().as_str() { + "String" => { + let allowed_special_str: String = deserialize(allowed_special_param).unwrap_or_default(); + match allowed_special_str.as_str() { + "all" => Ok(self.special_tokens_set.clone()), + _ => Err(MError::new(exception::arg_error(), "Invalid value for allowed_special")), + } + }, + "Array" => Ok(deserialize(allowed_special_param).unwrap_or_default()), + _ => Err(MError::new(exception::arg_error(), "Invalid type for allowed_special")), + }?; + + let disallowed_special: HashSet = match disallowed_special_param.class().inspect().as_str() { + "String" => { + let disallowed_special_str: String = deserialize(disallowed_special_param).unwrap_or_default(); + match disallowed_special_str.as_str() { + "all" => Ok(&self.special_tokens_set - &allowed_special), + _ => Err(MError::new(exception::arg_error(), "Invalid value for disallowed_special")), + } + }, + "Array" => Ok(deserialize(disallowed_special_param).unwrap_or_default()), + _ => Err(MError::new(exception::arg_error(), "Invalid type for disallowed_special")), + }?; + + if !disallowed_special.is_empty() { + if let Some(found) = Encoder::special_token_regex(&disallowed_special).find(text).unwrap() { + let err: String = format!( + "The text contains a special token that is not allowed: {}", + found.as_str() + ); + return Err(MError::new(exception::arg_error(), err)); + } + } + + return 
Ok(allowed_special); + } + + fn special_token_regex(tokens: &HashSet) -> Regex { + let inner = tokens + .iter() + .map(|token| regex::escape(token)) + .collect::>() + .join("|"); + + Regex::new(&format!("({})", inner)).unwrap() + } +} + +pub fn get_encoding(encoding: String, extend_special_tokens: Value) -> RbResult { + let _extend_special_tokens: Option> = deserialize(&extend_special_tokens).ok(); + + Encoder::from_encoding( + encoding, + &_extend_special_tokens + ) +} + +pub fn encoding_for_model( + model: String, + extend_special_tokens: Value, +) -> RbResult { + let encoding = match model.as_str() { + "text-davinci-003" => Ok("p50k_base"), + "text-davinci-002" => Ok("p50k_base"), + "text-davinci-001" => Ok("r50k_base"), + "text-curie-001" => Ok("r50k_base"), + "text-babbage-001" => Ok("r50k_base"), + "text-ada-001" => Ok("r50k_base"), + "davinci" => Ok("r50k_base"), + "curie" => Ok("r50k_base"), + "babbage" => Ok("r50k_base"), + "ada" => Ok("r50k_base"), + "code-davinci-002" => Ok("p50k_base"), + "code-davinci-001" => Ok("p50k_base"), + "code-cushman-002" => Ok("p50k_base"), + "code-cushman-001" => Ok("p50k_base"), + "davinci-codex" => Ok("p50k_base"), + "cushman-codex" => Ok("p50k_base"), + "text-davinci-edit-001" => Ok("p50k_edit"), + "code-davinci-edit-001" => Ok("p50k_edit"), + "text-embedding-ada-002" => Ok("cl100k_base"), + "text-similarity-davinci-001" => Ok("r50k_base"), + "text-similarity-curie-001" => Ok("r50k_base"), + "text-similarity-babbage-001" => Ok("r50k_base"), + "text-similarity-ada-001" => Ok("r50k_base"), + "text-search-davinci-doc-001" => Ok("r50k_base"), + "text-search-curie-doc-001" => Ok("r50k_base"), + "text-search-babbage-doc-001" => Ok("r50k_base"), + "text-search-ada-doc-001" => Ok("r50k_base"), + "code-search-babbage-code-001" => Ok("r50k_base"), + "code-search-ada-code-001" => Ok("r50k_base"), + "gpt2" => Ok("gpt2"), + "gpt-3.5-turbo" => Ok("cl100k_base"), + "gpt-3.5-turbo-0301" => Ok("cl100k_base"), + "gpt-4" => 
Ok("cl100k_base"), + "gpt-4-32k" => Ok("cl100k_base"), + model => Err(MError::new(exception::arg_error(), + format!("Invalid model: {}", model.to_string()), + )), + }?; + + let _extend_special_tokens: Option> = deserialize(&extend_special_tokens).ok(); + + Encoder::from_encoding( + encoding.to_string(), + &_extend_special_tokens + ) +} + +fn module() -> RModule { + *memoize!(RModule: define_module("Tiktoken").unwrap()) +} + +#[magnus::init] +fn init() -> RbResult<()> { + let module = module(); + + module.define_module_function("_get_encoding", function!(get_encoding, 2))?; + module.define_module_function("_encoding_for_model", function!(encoding_for_model, 2))?; + + let class = module.define_class("Encoder", Default::default())?; + class.define_method("name", method!(Encoder::name, 0))?; + class.define_method("_encode", method!(Encoder::encode, 3))?; + class.define_method("_encode_ordinary", method!(Encoder::encode_ordinary, 1))?; + class.define_method("_decode", method!(Encoder::decode, 1))?; + + Ok(()) +} \ No newline at end of file diff --git a/ruby/lib/tiktoken.rb b/ruby/lib/tiktoken.rb new file mode 100644 index 00000000..27897fbe --- /dev/null +++ b/ruby/lib/tiktoken.rb @@ -0,0 +1,18 @@ +begin + require_relative "tiktoken/#{RUBY_VERSION.to_f}/tiktoken" +rescue LoadError + require_relative "tiktoken/tiktoken" +end + +require_relative "tiktoken/version" +require_relative "tiktoken/encoder" + +module Tiktoken + def self.get_encoding(encoding, extra_special_tokens={}) + Tiktoken._get_encoding(encoding, extra_special_tokens) + end + + def self.encoding_for_model(model, extra_special_tokens={}) + Tiktoken._encoding_for_model(model, extra_special_tokens) + end +end \ No newline at end of file diff --git a/ruby/lib/tiktoken/encoder.rb b/ruby/lib/tiktoken/encoder.rb new file mode 100644 index 00000000..cc153869 --- /dev/null +++ b/ruby/lib/tiktoken/encoder.rb @@ -0,0 +1,16 @@ +module Tiktoken + class Encoder + def encode(text, allowed_special=[], 
disallowed_special="all") + _encode(text, allowed_special, disallowed_special) + end + + def encode_ordinary(text) + _encode_ordinary(text) + end + + def decode(tokens, utf_opts={invalid: :replace, undef: :replace}) + _bytes = _decode(tokens) + _bytes.pack('C*').encode('UTF-8', **utf_opts) + end + end +end \ No newline at end of file diff --git a/ruby/lib/tiktoken/version.rb b/ruby/lib/tiktoken/version.rb new file mode 100644 index 00000000..7f0fabeb --- /dev/null +++ b/ruby/lib/tiktoken/version.rb @@ -0,0 +1,3 @@ +module Tiktoken + VERSION = "0.1.0" +end diff --git a/ruby/tiktoken.gemspec b/ruby/tiktoken.gemspec new file mode 100644 index 00000000..80e83b5e --- /dev/null +++ b/ruby/tiktoken.gemspec @@ -0,0 +1,20 @@ +require_relative "lib/tiktoken/version" + +Gem::Specification.new do |spec| + spec.name = "tiktoken" + spec.version = Tiktoken::VERSION + spec.summary = "Wrapper for OpenAI's tiktoken library" + spec.homepage = "https://github.com/volition-co/tiktoken" + spec.license = "MIT" + + spec.author = "Arjun Singh" + spec.email = "arjun@volition.co" + + spec.files = Dir["*.{md,txt}", "{ext,lib}/**/*", "Cargo.*"] + spec.require_path = "lib" + spec.extensions = ["ext/tiktoken/extconf.rb"] + + spec.required_ruby_version = ">= 2.7" + + spec.add_dependency "rb_sys", "~> 0.9" +end \ No newline at end of file diff --git a/setup.py b/setup.py index a22e8e5d..246487b0 100644 --- a/setup.py +++ b/setup.py @@ -7,12 +7,14 @@ RustExtension( "tiktoken._tiktoken", binding=Binding.PyO3, + path="python/Cargo.toml", # Between our use of editable installs and wanting to use Rust for performance sensitive # code, it makes sense to just always use --release debug=False, ) ], - package_data={"tiktoken": ["py.typed"]}, - packages=["tiktoken", "tiktoken_ext"], + include_package_data=True, + package_data={ "tiktoken": ["py.typed", "registry.json", "model_to_encoding.json"] }, + packages=["tiktoken"], zip_safe=False, ) diff --git a/tiktoken/load.py b/tiktoken/load.py index 
c5881068..5537ecf4 100644 --- a/tiktoken/load.py +++ b/tiktoken/load.py @@ -73,6 +73,7 @@ def decode_data_gym(value: str) -> bytes: # add the single byte tokens bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)} + # add the merged tokens n = len(bpe_ranks) for first, second in bpe_merges: diff --git a/tiktoken/model.py b/tiktoken/model.py index 33da3901..c65ca628 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -2,6 +2,13 @@ from .core import Encoding from .registry import get_encoding +import json + +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources`. + import importlib_resources as pkg_resources # TODO: these will likely be replaced by an API endpoint MODEL_PREFIX_TO_ENCODING: dict[str, str] = { @@ -9,46 +16,7 @@ "gpt-3.5-turbo-": "cl100k_base" # e.g, gpt-3.5-turbo-0301, -0401, etc. } -MODEL_TO_ENCODING: dict[str, str] = { - # chat - "gpt-3.5-turbo": "cl100k_base", - # text - "text-davinci-003": "p50k_base", - "text-davinci-002": "p50k_base", - "text-davinci-001": "r50k_base", - "text-curie-001": "r50k_base", - "text-babbage-001": "r50k_base", - "text-ada-001": "r50k_base", - "davinci": "r50k_base", - "curie": "r50k_base", - "babbage": "r50k_base", - "ada": "r50k_base", - # code - "code-davinci-002": "p50k_base", - "code-davinci-001": "p50k_base", - "code-cushman-002": "p50k_base", - "code-cushman-001": "p50k_base", - "davinci-codex": "p50k_base", - "cushman-codex": "p50k_base", - # edit - "text-davinci-edit-001": "p50k_edit", - "code-davinci-edit-001": "p50k_edit", - # embeddings - "text-embedding-ada-002": "cl100k_base", - # old embeddings - "text-similarity-davinci-001": "r50k_base", - "text-similarity-curie-001": "r50k_base", - "text-similarity-babbage-001": "r50k_base", - "text-similarity-ada-001": "r50k_base", - "text-search-davinci-doc-001": "r50k_base", - "text-search-curie-doc-001": "r50k_base", - "text-search-babbage-doc-001": "r50k_base", - 
"text-search-ada-doc-001": "r50k_base", - "code-search-babbage-code-001": "r50k_base", - "code-search-ada-code-001": "r50k_base", - # open source - "gpt2": "gpt2", -} +MODEL_TO_ENCODING: dict[str, str] = json.loads(pkg_resources.read_text("tiktoken", "model_to_encoding.json")) def encoding_for_model(model_name: str) -> Encoding: diff --git a/tiktoken/model_to_encoding.json b/tiktoken/model_to_encoding.json new file mode 100644 index 00000000..d4eccd9a --- /dev/null +++ b/tiktoken/model_to_encoding.json @@ -0,0 +1,34 @@ +{ + "text-davinci-003": "p50k_base", + "text-davinci-002": "p50k_base", + "text-davinci-001": "r50k_base", + "text-curie-001": "r50k_base", + "text-babbage-001": "r50k_base", + "text-ada-001": "r50k_base", + "davinci": "r50k_base", + "curie": "r50k_base", + "babbage": "r50k_base", + "ada": "r50k_base", + "code-davinci-002": "p50k_base", + "code-davinci-001": "p50k_base", + "code-cushman-002": "p50k_base", + "code-cushman-001": "p50k_base", + "davinci-codex": "p50k_base", + "cushman-codex": "p50k_base", + "text-davinci-edit-001": "p50k_edit", + "code-davinci-edit-001": "p50k_edit", + "text-embedding-ada-002": "cl100k_base", + "text-similarity-davinci-001": "r50k_base", + "text-similarity-curie-001": "r50k_base", + "text-similarity-babbage-001": "r50k_base", + "text-similarity-ada-001": "r50k_base", + "text-search-davinci-doc-001": "r50k_base", + "text-search-curie-doc-001": "r50k_base", + "text-search-babbage-doc-001": "r50k_base", + "text-search-ada-doc-001": "r50k_base", + "code-search-babbage-code-001": "r50k_base", + "code-search-ada-code-001": "r50k_base", + "gpt2": "gpt2", + "gpt-3.5-turbo": "cl100k_base", + "gpt-3.5-turbo-0301": "cl100k_base" +} \ No newline at end of file diff --git a/tiktoken/registry.json b/tiktoken/registry.json new file mode 100644 index 00000000..aa3ee530 --- /dev/null +++ b/tiktoken/registry.json @@ -0,0 +1,50 @@ +{ + "gpt2": { + "data_gym_to_mergeable_bpe_ranks": { + "vocab_bpe_file": 
"https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", + "encoder_json_file": "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json" + }, + "explicit_n_vocab": 50257, + "pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + "special_tokens": { + "<|endoftext|>": 50256 + } + }, + "r50k_base": { + "load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", + "explicit_n_vocab": 50257, + "pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + "special_tokens": { + "<|endoftext|>": 50256 + } + }, + "p50k_base": { + "load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", + "explicit_n_vocab": 50281, + "pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", + "special_tokens": { + "<|endoftext|>": 50256 + } + }, + "p50k_edit": { + "load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", + "special_tokens": { + "<|endoftext|>": 50256, + "<|fim_prefix|>": 50281, + "<|fim_middle|>": 50282, + "<|fim_suffix|>": 50283 + }, + "pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" + }, + "cl100k_base": { + "load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", + "special_tokens": { + "<|endoftext|>": 100257, + "<|fim_prefix|>": 100258, + "<|fim_middle|>": 100259, + "<|fim_suffix|>": 100260, + "<|endofprompt|>": 100276 + }, + "pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + } +} \ No newline at end of file diff --git a/tiktoken/registry.py b/tiktoken/registry.py index 52d8ec2d..0a55d27e 100644 --- a/tiktoken/registry.py +++ b/tiktoken/registry.py @@ -3,46 +3,32 @@ import importlib import pkgutil import threading 
+import json from typing import Any, Callable, Optional -import tiktoken_ext - from tiktoken.core import Encoding +from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe _lock = threading.RLock() ENCODINGS: dict[str, Encoding] = {} -ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None - +ENCODING_DEFS: dict[str, Any] = None -def _find_constructors() -> None: - global ENCODING_CONSTRUCTORS - with _lock: - if ENCODING_CONSTRUCTORS is not None: - return - ENCODING_CONSTRUCTORS = {} +def _load_encoding_defs(): + global ENCODING_DEFS + if not ENCODING_DEFS is None: + return ENCODING_DEFS - # tiktoken_ext is a namespace package - # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes - # - we use namespace package pattern so `pkgutil.iter_modules` is fast - # - it's a separate top-level package because namespace subpackages of non-namespace - # packages don't quite do what you want with editable installs - plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".") + try: + import importlib.resources as pkg_resources + except ImportError: + # Try backported to PY<37 `importlib_resources`. 
+ import importlib_resources as pkg_resources - for _, mod_name, _ in plugin_mods: - mod = importlib.import_module(mod_name) - try: - constructors = mod.ENCODING_CONSTRUCTORS - except AttributeError as e: - raise ValueError( - f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS" - ) from e - for enc_name, constructor in constructors.items(): - if enc_name in ENCODING_CONSTRUCTORS: - raise ValueError( - f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}" - ) - ENCODING_CONSTRUCTORS[enc_name] = constructor + # read registry.json + # note: was trying to place it into /data/registry.json but python packaging is always unhappy + ENCODING_DEFS = json.loads(pkg_resources.read_text("tiktoken", "registry.json")) + return ENCODING_DEFS def get_encoding(encoding_name: str) -> Encoding: if encoding_name in ENCODINGS: @@ -52,22 +38,26 @@ def get_encoding(encoding_name: str) -> Encoding: if encoding_name in ENCODINGS: return ENCODINGS[encoding_name] - if ENCODING_CONSTRUCTORS is None: - _find_constructors() - assert ENCODING_CONSTRUCTORS is not None - - if encoding_name not in ENCODING_CONSTRUCTORS: + _load_encoding_defs() + if encoding_name not in ENCODING_DEFS: raise ValueError(f"Unknown encoding {encoding_name}") - constructor = ENCODING_CONSTRUCTORS[encoding_name] - enc = Encoding(**constructor()) + encoding_def = dict(ENCODING_DEFS[encoding_name]) + encoding_def["name"] = encoding_name + + if "load_tiktoken_bpe" in encoding_def: + encoding_def["mergeable_ranks"] = load_tiktoken_bpe(encoding_def["load_tiktoken_bpe"]) + del encoding_def["load_tiktoken_bpe"] + elif "data_gym_to_mergeable_bpe_ranks" in encoding_def: + encoding_def["mergeable_ranks"] = data_gym_to_mergeable_bpe_ranks(**encoding_def["data_gym_to_mergeable_bpe_ranks"]) + del encoding_def["data_gym_to_mergeable_bpe_ranks"] + else: + raise ValueError(f"Unknown loader {encoding_name}") + enc = Encoding(**encoding_def) ENCODINGS[encoding_name] = enc return enc def list_encoding_names() 
-> list[str]: with _lock: - if ENCODING_CONSTRUCTORS is None: - _find_constructors() - assert ENCODING_CONSTRUCTORS is not None - return list(ENCODING_CONSTRUCTORS) + return list(_load_encoding_defs().keys()) diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py deleted file mode 100644 index 16a6ec50..00000000 --- a/tiktoken_ext/openai_public.py +++ /dev/null @@ -1,88 +0,0 @@ -from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe - -ENDOFTEXT = "<|endoftext|>" -FIM_PREFIX = "<|fim_prefix|>" -FIM_MIDDLE = "<|fim_middle|>" -FIM_SUFFIX = "<|fim_suffix|>" -ENDOFPROMPT = "<|endofprompt|>" - - -def gpt2(): - mergeable_ranks = data_gym_to_mergeable_bpe_ranks( - vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", - encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", - ) - return { - "name": "gpt2", - "explicit_n_vocab": 50257, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {"<|endoftext|>": 50256}, - } - - -def r50k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" - ) - return { - "name": "r50k_base", - "explicit_n_vocab": 50257, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {ENDOFTEXT: 50256}, - } - - -def p50k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" - ) - return { - "name": "p50k_base", - "explicit_n_vocab": 50281, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {ENDOFTEXT: 50256}, - } - - -def p50k_edit(): - mergeable_ranks = load_tiktoken_bpe( - 
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" - ) - special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283} - return { - "name": "p50k_edit", - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": special_tokens, - } - - -def cl100k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" - ) - special_tokens = { - ENDOFTEXT: 100257, - FIM_PREFIX: 100258, - FIM_MIDDLE: 100259, - FIM_SUFFIX: 100260, - ENDOFPROMPT: 100276, - } - return { - "name": "cl100k_base", - "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": special_tokens, - } - - -ENCODING_CONSTRUCTORS = { - "gpt2": gpt2, - "r50k_base": r50k_base, - "p50k_base": p50k_base, - "p50k_edit": p50k_edit, - "cl100k_base": cl100k_base, -}