From e42b8d077afe5ce2767da97b7747c864c01eb86a Mon Sep 17 00:00:00 2001
From: xiezipeng-ML
Date: Thu, 21 Mar 2024 22:33:29 +0800
Subject: [PATCH] refine

---
 projects/Qwen/utils/data_process.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/projects/Qwen/utils/data_process.py b/projects/Qwen/utils/data_process.py
index 06a7c263d..4c9d1946f 100644
--- a/projects/Qwen/utils/data_process.py
+++ b/projects/Qwen/utils/data_process.py
@@ -92,9 +92,9 @@ def qwen2_data_process(
     attention_mask = flow.where(attention_mask, flow.tensor(0.0), flow.tensor(-float("Inf")))
 
     return dict(
-        input_ids=input_ids,
-        labels=targets,
-        attention_mask=attention_mask,
+        input_ids=input_ids[0],
+        labels=targets[0],
+        attention_mask=attention_mask[0],
     )
 
 
@@ -109,11 +109,12 @@ def preprocess(input_file, targe_file, shuffle=False, tokenizer=None):
 
 
 if __name__ == "__main__":
+    from projects.mock_transformers.mock_tokenization import Qwen2Tokenizer
 
-    input_file = "/workspace/share/data/test-data.json"
-    target_file = "/workspace/libai/projects/Qwen"
-    model_file = "/workspace/share/Qwen1.5-14B-Chat"
+    input_file = "/data/home/xiezipeng/libai/projects/Qwen/subset.json"
+    target_file = "/data/home/xiezipeng/libai/projects/Qwen"
+    model_file = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B"
     tokenizer = Qwen2Tokenizer.from_pretrained(model_file)
     tokenizer.model_max_length = 2048