1
+ import argparse
1
2
import json
3
+ import os
2
4
from transformers import AutoTokenizer
3
5
from datasets import load_dataset
4
6
from jinja2 import Template
7
+ from huggingface_hub import HfApi
8
+
9
+
10
# Command-line configuration, parsed at import time so that main() can read
# the module-level `args`.
#   --push_to_hub   upload the generated JSONL file to the Hugging Face Hub
#   --repo_id_name  target dataset repository on the Hub (needed with --push_to_hub)
#   --num_samples   optionally limit how many dataset rows are processed
parser = argparse.ArgumentParser(
    description="Render DeepSeek-R1 chat-template prompts from OpenThoughts-114k"
)
parser.add_argument("--push_to_hub", action="store_true",
                    help="Push the dataset to the hub")
parser.add_argument("--repo_id_name", type=str, default=None,
                    help="Hub dataset repo id to upload to (e.g. 'user/my-dataset'); "
                         "required when --push_to_hub is set")
# dataset range
parser.add_argument("--num_samples", type=int, default=None,
                    help="If set, only process the first N samples of the dataset")
args = parser.parse_args()
5
17
6
18
7
19
def main ():
8
20
tokenizer = AutoTokenizer .from_pretrained ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" )
9
21
# Now load the dataset open-thoughts/OpenThoughts-114k and apply the chat template
10
- dataset = load_dataset ("open-thoughts/OpenThoughts-114k" , split = 'train' )
22
+ # slice the dataset to the number of samples if specified
23
+ if args .num_samples :
24
+ dataset = load_dataset ("open-thoughts/OpenThoughts-114k" , split = 'train' ).select (range (args .num_samples ))
25
+ else :
26
+ dataset = load_dataset ("open-thoughts/OpenThoughts-114k" , split = 'train' )
11
27
12
- output_prompts = [] # To store generated prompts
13
28
output_jsonl_file = "deepseek_r1_prompts_openthoughts.jsonl"
14
-
15
29
# Get and print the chat template
16
30
chat_template_deepseek = tokenizer .chat_template
17
31
jinja_template = Template (chat_template_deepseek )
@@ -32,16 +46,28 @@ def main():
32
46
elif role_from_dataset == 'assistant' : # Map 'assistant' role
33
47
messages_input .append ({'role' : 'assistant' , 'content' : content })
34
48
35
- # Keep `add_generation_prompt=False` for dataset preprocessing
49
+ # Keep `add_generation_prompt=True` to add the |Assistant| tag to the prompt
36
50
bos_token = "<|startoftext|>"
37
51
prompt = jinja_template .render (messages = messages_input ,
38
- bos_token = bos_token , add_generation_prompt = False )
39
- output_prompts .append (prompt ) # Store in a list
52
+ bos_token = bos_token , add_generation_prompt = True )
40
53
41
54
with open (output_jsonl_file , 'a' ) as f : # 'a' for append mode
42
55
json_record = {"prompt" : prompt } # Or store more info if needed
43
56
f .write (json .dumps (json_record ) + '\n ' )
44
57
58
+ if args .push_to_hub :
59
+ try :
60
+ api = HfApi (token = os .environ ["HF_TOKEN" ])
61
+ except Exception as e :
62
+ raise Exception ("HF_TOKEN not found in environment variables" )
63
+
64
+ api .upload_file (
65
+ path_or_fileobj = output_jsonl_file ,
66
+ path_in_repo = "deepseek_r1_prompts_openthoughts.jsonl" ,
67
+ repo_id = args .repo_id_name ,
68
+ repo_type = "dataset" ,
69
+ )
70
+
45
71
46
72
if __name__ == "__main__" :
47
73
main ()
0 commit comments