data.py
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer


def prepare_datasets(args):
    datasets = load_dataset(args.data_path)
    # train_dataset = datasets["train"]
    val_dataset = datasets["validation"]
    test_dataset = datasets["test"]

    # If you have augmented data, run the code below
    augmented_train_dataset = load_dataset("csv", data_files="augmented_train.csv")["train"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Placeholder tokens that mask personal information (names, phone numbers,
    # account numbers, ...) in the data; registered as special tokens so the
    # tokenizer keeps each placeholder intact instead of splitting it.
    special_tokens_dict = {
        'additional_special_tokens': ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&',
                                      '&bank-account&', '&num&', '&online-account&']
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    # NOTE: adding special tokens grows the vocabulary, so the paired model is
    # expected to call model.resize_token_embeddings(len(tokenizer)) wherever
    # it is loaded.

    def tokenize_function(dataset):
        return tokenizer(
            dataset["input"],
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            max_length=256,
            add_special_tokens=True,
            return_token_type_ids=False,
        )

    return (
        # train_dataset.map(tokenize_function, batched=True).rename_column("output", "labels"),
        # If you have augmented data, use the line below instead of the one above
        augmented_train_dataset.map(tokenize_function, batched=True).rename_column("output", "labels"),
        val_dataset.map(tokenize_function, batched=True).rename_column("output", "labels"),
        test_dataset.map(tokenize_function, batched=True).rename_column("output", "labels"),
    )
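

# A minimal usage sketch, assuming args exposes the two attributes read above
# (data_path and model_name); the default values below are hypothetical
# placeholders, and the training code that consumes the returned datasets
# lives elsewhere in the repo.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", default="path-or-hub-id-of-dataset")  # hypothetical
    parser.add_argument("--model_name", default="hub-id-of-model")  # hypothetical
    args = parser.parse_args()

    train_dataset, val_dataset, test_dataset = prepare_datasets(args)
    print(train_dataset)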