Skip to content

Commit

Permalink
refine
Browse files (browse the repository at this point in the history)
  • Loading branch information
xiezipeng-ML committed Mar 21, 2024
1 parent 400ea76 commit e42b8d0
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions projects/Qwen/utils/data_process.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,9 +92,9 @@ def qwen2_data_process(
attention_mask = flow.where(attention_mask, flow.tensor(0.0), flow.tensor(-float("Inf")))

return dict(
input_ids=input_ids,
labels=targets,
attention_mask=attention_mask,
input_ids=input_ids[0],
labels=targets[0],
attention_mask=attention_mask[0],
)


Expand All @@ -109,11 +109,12 @@ def preprocess(input_file, targe_file, shuffle=False, tokenizer=None):


if __name__ == "__main__":

from projects.mock_transformers.mock_tokenization import Qwen2Tokenizer

input_file = "/workspace/share/data/test-data.json"
target_file = "/workspace/libai/projects/Qwen"
model_file = "/workspace/share/Qwen1.5-14B-Chat"
input_file = "/data/home/xiezipeng/libai/projects/Qwen/subset.json"
target_file = "/data/home/xiezipeng/libai/projects/Qwen"
model_file = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B"

tokenizer = Qwen2Tokenizer.from_pretrained(model_file)
tokenizer.model_max_length = 2048
Expand Down

0 comments on commit e42b8d0

Please sign in to comment.