-
VTRTokenizer
from Robust Open-Vocabulary Translation from Visual Text Representations -
PQRNNTokenizer
from Advancing NLP with Efficient Projection-Based Model Architectures -
CANINETokenizer
from CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation -
ByT5Tokenizer
from ByT5: Towards a token-free future with pre-trained byte-to-byte models -
GBST
and ByteTokenizer
from Charformer: Fast Character Transformers via Gradient-based Subword Tokenization -
LTPMultiHeadAttention
from Learned Token Pruning for Transformers -
X
and XLoss
, a model inspired by PonderNet and Perceiver, with Byte Embeddings.
pip install text-embeddings --upgrade
from text_embeddings.visual import VTRTokenizer
# Three short greetings in different scripts, tokenized as sentence pairs.
data = ["Hello world!", "¡Hola Mundo!", "你好,世界!"]

tokenizer = VTRTokenizer(
    font="resources/NotoSans-Regular.ttf",
    font_size=14,
    window_size=10,
    max_length=36,
)

# Each sentence is paired with itself, so token_type_ids are produced too.
results = tokenizer(
    text=data,
    text_pair=data,
    add_special_tokens=True,
    padding="longest",
    truncation="longest_first",
    return_tensors='pt',
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
    return_overflowing_tokens=False,
    prepend_batch_axis=True,
)

# The padded length is only known after tokenization, so read it back from
# the output and check that every per-token field agrees with it.
batch_size = len(data)
padded_length = results["input_ids"].shape[1]

# The trailing (14, 10) dims mirror the font_size / window_size values above.
assert results["input_ids"].shape == (batch_size, padded_length, 14, 10)
assert results["attention_mask"].shape == (batch_size, padded_length)
assert results["token_type_ids"].shape == (batch_size, padded_length)
assert results["length"].shape == (batch_size,)
import numpy as np
from typing import Optional, List, Dict
from text_embeddings.base import EmbeddingTokenizer
class MyOwnTokenizer(EmbeddingTokenizer):
    """Minimal template for a custom tokenizer built on ``EmbeddingTokenizer``.

    The class overrides ``text2embeddings`` and
    ``create_padding_token_embedding``; both return all-zero placeholder
    arrays so the example stays self-contained.
    """

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        """Pass every argument, unchanged and in order, to the base class."""
        super().__init__(model_input_names, special_tokens, max_length)

    def text2embeddings(self, text: str) -> np.ndarray:
        """Return a zero array of shape ``(10, 10, 10, 10)``.

        The first axis is the sequence length and the remaining three axes
        hold the per-token array. ``text`` is not inspected in this example.
        """
        token_shape = (10, 10, 10)  # each token is mapped to a 3-d array
        num_tokens = 10
        return np.zeros((num_tokens,) + token_shape)

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Return a ``(10, 10, 10)`` zero array; ``input_embeddings`` is unused."""
        # Same 3-d shape as a single token produced by text2embeddings.
        return np.zeros((10, 10, 10))
import torch.onnx # nightly torch only
from text_embeddings.byte.charformer import GBST, ByteTokenizer
# Module to be exported.
model = GBST(
    embed_size=128,
    max_block_size=4,
    downsampling_factor=2,
    score_calibration=True,
    vocab_size=259,
)

# Tokenize two sentences of different length; padding="longest" pads them
# so the batch can be turned into a single tensor for the export call.
tokenizer = ByteTokenizer()
results = tokenizer(
    ["Life is like a box of chocolates.", "Coding is fun."],
    add_special_tokens=True,
    padding="longest",
    truncation="longest_first",
)

# Export the model
torch.onnx.export(
    model,
    # The example input is cast to int64. `requires_grad=True` is intentionally
    # not passed: torch only allows floating-point/complex tensors to require
    # gradients (integer data raises RuntimeError), and the flag would be
    # discarded by .long() anyway.
    torch.tensor(results["input_ids"]).long(),
    "gbst.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["output"],
    # Mark batch and sequence dimensions as variable in the exported graph.
    dynamic_axes={
        "input": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)