-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.h
37 lines (27 loc) · 849 Bytes
/
tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
#include "huggingface_tokenizer.h"
// A flat sequence of 32-bit token ids.
using tokens = std::vector<int32_t>;
// A collection of token-id sequences, one `tokens` per element.
using tokens_batch = std::vector<tokens>;
// Short global alias for the encoding type declared by "huggingface_tokenizer.h".
using encoding = tokenizers::HFEncoding;
namespace embeddings {
/// Plain aggregate returned by `Tokenizer::Encode` / `Tokenizer::EncodeBatch`.
///
/// Still an aggregate (C++14 and later), so `Encoding{ids, mask, len}` keeps
/// working exactly as before.
struct Encoding {
  /// Token ids.
  std::vector<int32_t> ids;
  /// Attention mask values.
  /// NOTE(review): presumably one entry per element of `ids` — confirm
  /// against the encoder implementation.
  std::vector<int32_t> attention_mask;
  /// NOTE(review): presumably the sequence length excluding padding —
  /// confirm against the encoder implementation.
  /// Zero-initialised so a default-constructed `Encoding` never exposes an
  /// indeterminate value.
  size_t no_pad_len = 0;
};
/// Declares the tokenizer front end; it holds a pointer to a
/// `tokenizers::HFTokenizer`. All member functions are defined outside this
/// header.
///
/// NOTE(review): no destructor or copy control is declared here, so ownership
/// and lifetime of the pointee are not expressed by this header — confirm in
/// the implementation file before copying `Tokenizer` objects.
class Tokenizer {
 public:
  /// Constructs a tokenizer from `path`.
  /// NOTE(review): presumably the location of a tokenizer definition file —
  /// confirm in the implementation.
  Tokenizer(const std::string &path);

  /// Encodes one string into an `Encoding`.
  /// @param text               input string.
  /// @param add_special_tokens defaults to true; presumably forwarded to the
  ///                           underlying tokenizer — confirm.
  Encoding Encode(const std::string &text, bool add_special_tokens = true);

  /// Encodes several strings, returning a vector of `Encoding`.
  /// NOTE(review): presumably one result per input, in input order — confirm.
  /// @param texts              input strings.
  /// @param add_special_tokens defaults to true; see `Encode`.
  std::vector<Encoding> EncodeBatch(const std::vector<std::string> &texts,
                                    bool add_special_tokens = true);

  /// Converts a sequence of token ids back into a string.
  /// @param ids                 token ids (`::tokens`, i.e. a vector of int32_t).
  /// @param skip_special_tokens defaults to true; presumably drops special
  ///                            tokens from the output — confirm.
  std::string Decode(const tokens &ids, bool skip_special_tokens = true);

  /// Returns a pointer to a `tokenizers::HFTokenizer`.
  /// NOTE(review): presumably the internal `tok` member, non-owning for the
  /// caller — confirm in the implementation.
  tokenizers::HFTokenizer *GetFastTokenizer();

 private:
  /// Underlying tokenizer. Defaults to nullptr so the pointer is never
  /// indeterminate if a constructor path does not assign it.
  tokenizers::HFTokenizer *tok = nullptr;
};
} // namespace embeddings