
Commit 3481514

Add token authentication and |Assistant| tag for generation
1 parent eeceffb commit 3481514
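
For reference, a hypothetical invocation of the updated script exercising the new flags; the token value and repo id below are placeholders, not part of this commit:

    HF_TOKEN=hf_xxx python hf_bench/map_openthoughts_to_deepseekr1.py \
        --num_samples 1000 \
        --push_to_hub \
        --repo_id_name your-username/deepseek-r1-prompts-openthoughts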


hf_bench/map_openthoughts_to_deepseekr1.py

Lines changed: 32 additions & 6 deletions
@@ -1,17 +1,31 @@
+import argparse
 import json
+import os
 from transformers import AutoTokenizer
 from datasets import load_dataset
 from jinja2 import Template
+from huggingface_hub import HfApi
+
+
+# args to choose whether to push the dataset to the hub
+parser = argparse.ArgumentParser()
+parser.add_argument("--push_to_hub", action="store_true", help="Push the dataset to the hub")
+parser.add_argument("--repo_id_name", type=str, default=None)
+# dataset range
+parser.add_argument("--num_samples", type=int, default=None)
+args = parser.parse_args()
 
 
 def main():
     tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
     # Now load the dataset open-thoughts/OpenThoughts-114k and apply the chat template
-    dataset = load_dataset("open-thoughts/OpenThoughts-114k", split='train')
+    # slice the dataset to the number of samples if specified
+    if args.num_samples:
+        dataset = load_dataset("open-thoughts/OpenThoughts-114k", split='train').select(range(args.num_samples))
+    else:
+        dataset = load_dataset("open-thoughts/OpenThoughts-114k", split='train')
 
-    output_prompts = [] # To store generated prompts
     output_jsonl_file = "deepseek_r1_prompts_openthoughts.jsonl"
-
     # Get and print the chat template
     chat_template_deepseek = tokenizer.chat_template
     jinja_template = Template(chat_template_deepseek)
@@ -32,16 +46,28 @@ def main():
             elif role_from_dataset == 'assistant': # Map 'assistant' role
                 messages_input.append({'role': 'assistant', 'content': content})
 
-        # Keep `add_generation_prompt=False` for dataset preprocessing
+        # Keep `add_generation_prompt=True` to add the |Assistant| tag to the prompt
        bos_token = "<|startoftext|>"
        prompt = jinja_template.render(messages=messages_input,
-                                      bos_token=bos_token, add_generation_prompt=False)
-        output_prompts.append(prompt) # Store in a list
+                                      bos_token=bos_token, add_generation_prompt=True)
 
        with open(output_jsonl_file, 'a') as f: # 'a' for append mode
            json_record = {"prompt": prompt} # Or store more info if needed
            f.write(json.dumps(json_record) + '\n')
 
+    if args.push_to_hub:
+        try:
+            api = HfApi(token=os.environ["HF_TOKEN"])
+        except Exception as e:
+            raise Exception("HF_TOKEN not found in environment variables")
+
+        api.upload_file(
+            path_or_fileobj=output_jsonl_file,
+            path_in_repo="deepseek_r1_prompts_openthoughts.jsonl",
+            repo_id=args.repo_id_name,
+            repo_type="dataset",
+        )
+
 
 if __name__ == "__main__":
     main()
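
The effect of `add_generation_prompt=True` can also be checked with the tokenizer's built-in chat templating instead of rendering the Jinja template by hand; a minimal sketch (the message content is made up, and the exact tag text comes from the DeepSeek-R1 chat template shipped with the tokenizer):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
    messages = [{"role": "user", "content": "What is 2 + 2?"}]

    # With add_generation_prompt=True the rendered prompt ends with the assistant tag,
    # so a model generating from this prompt starts the assistant turn instead of
    # continuing the user turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)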
