Skip to content

Commit

Permalink
Add tiktoken library for token counting
Browse files: browse the repository at this point in the history
  • Loading branch information
sdan committed Apr 3, 2024
1 parent 6d8098a commit 363abc0
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions vlite/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import torch
import llama_cpp
from huggingface_hub import hf_hub_download
+ import tiktoken

class EmbeddingModel:
def __init__(self, model_name='mixedbread-ai/mxbai-embed-large-v1'):
Expand All @@ -16,9 +17,12 @@ def embed(self, texts, max_seq_length=512, device="cpu"):
embeddings_dict = self.model.create_embedding(texts)
return [item["embedding"] for item in embeddings_dict["data"]]

- def token_count(self, texts):
+ def token_count(texts):
+ enc = tiktoken.get_encoding("cl100k_base")
tokens = 0
for text in texts:
- tokens += len(self.tokenizer.tokenize(text))
+ token_ids = enc.encode(text, disallowed_special=())
+ tokens += len(token_ids)
return tokens


0 comments on commit 363abc0

Please sign in to comment.