[PIR] Fix in-batch negative recall model for neural_search (#10352)

hanlintang · web-flow · commit 54c3ec31bf1e · 2025-04-10T19:36:28.000+08:00
diff --git a/slm/applications/neural_search/recall/in_batch_negative/README.md b/slm/applications/neural_search/recall/in_batch_negative/README.md
@@ -182,14 +182,15 @@ Recall@K 召回率是指预测的前 topK（top-k 是指从最后的按得分排
 
 如果使用 CPU 进行训练，则需要吧`--gpus`参数去除，然后吧`device`设置成 cpu 即可，详细请参考 train_batch_neg.sh 文件的训练设置
 
+如果不存在 `checkpoints/inbatch` 目录，需要先在命令行运行 `mkdir -p checkpoints/inbatch` 创建该目录（如果通过脚本进行训练则不需要）。
+
 然后运行下面的命令使用 GPU 训练，得到语义索引模型：
 
 ```
-root_path=inbatch
 python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
     train_batch_neg.py \
     --device gpu \
-    --save_dir ./checkpoints/${root_path} \
+    --save_dir ./checkpoints/inbatch \
     --batch_size 64 \
     --learning_rate 5E-5 \
     --epochs 3 \
@@ -464,7 +465,7 @@ python deploy/python/predict.py \
 也可以运行下面的 bash 脚本：
 
 ```
-sh deploy.sh
+sh deploy/python/deploy.sh
 ```
 最终输出的是256维度的特征向量和句子对的预测概率：
 
diff --git a/slm/applications/neural_search/recall/in_batch_negative/deploy/python/deploy.sh b/slm/applications/neural_search/recall/in_batch_negative/deploy/python/deploy.sh
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-python predict.py --model_dir=../../output
+python ./deploy/python/predict.py  --model_dir=./output \
+                   --model_name_or_path rocketqa-zh-base-query-encoder
diff --git a/slm/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py b/slm/applications/neural_search/recall/in_batch_negative/deploy/python/predict.py
@@ -22,6 +22,7 @@
 from paddlenlp.data import Pad, Tuple
 from paddlenlp.transformers import AutoTokenizer
 from paddlenlp.utils.log import logger
+from paddlenlp.utils.env import PADDLE_INFERENCE_MODEL_SUFFIX, PADDLE_INFERENCE_WEIGHTS_SUFFIX
 
 sys.path.append(".")
 
@@ -87,8 +88,8 @@ def __init__(
         self.max_seq_length = max_seq_length
         self.batch_size = batch_size
 
-        model_file = model_dir + "/inference.pdmodel"
-        params_file = model_dir + "/inference.pdiparams"
+        model_file = model_dir + f"/inference{PADDLE_INFERENCE_MODEL_SUFFIX}"
+        params_file = model_dir + f"/inference{PADDLE_INFERENCE_WEIGHTS_SUFFIX}"
         if not os.path.exists(model_file):
             raise ValueError("not find model file path {}".format(model_file))
         if not os.path.exists(params_file):
diff --git a/slm/applications/neural_search/recall/in_batch_negative/scripts/train.sh b/slm/applications/neural_search/recall/in_batch_negative/scripts/train.sh
@@ -0,0 +1,43 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ 
+# Checkpoint output directory; handed to train_batch_neg.py through --save_dir below.
+root_dir="./checkpoints/inbatch"
+if [ ! -d "$root_dir" ]; then
+    mkdir -p "$root_dir"
+    echo "Created directory: $root_dir"
+else
+    echo "Directory already exists: $root_dir"
+fi
+# Launch training on GPU 0. All relative paths resolve against the current working directory.
+python -u -m paddle.distributed.launch --gpus "0" \
+    train_batch_neg.py \
+    --device gpu \
+    --save_dir "${root_dir}" \
+    --batch_size 64 \
+    --learning_rate 5E-5 \
+    --epochs 3 \
+    --output_emb_size 256 \
+    --model_name_or_path rocketqa-zh-base-query-encoder \
+    --save_steps 10 \
+    --max_seq_length 64 \
+    --margin 0.2 \
+    --train_set_file recall/train.csv \
+    --recall_result_dir "recall_result_dir" \
+    --recall_result_file "recall_result.txt" \
+    --hnsw_m 100 \
+    --hnsw_ef 100 \
+    --recall_num 50 \
+    --similar_text_pair_file "recall/dev.csv" \
+    --corpus_file "recall/corpus.csv"