Description
As in the title: I looked through some earlier issues but they don't seem to give a clear answer, and I'm confused as well. I'm fine-tuning the 1B model on multi-image conversations with roughly 30k samples, so it shouldn't be a matter of overfitting or anything like that, right? (A sketch of the data format I'm assuming is included after the script.)
My fine-tuning script is as follows:
```bash
set -x
GPUS=${GPUS:-2}
BATCH_SIZE=${BATCH_SIZE:-1}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
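# Gradient accumulation steps = global batch size / (per-device batch size * number of GPUs).
# Note: with the defaults above (BATCH_SIZE=1, PER_DEVICE_BATCH_SIZE=1, GPUS=2),
# bash integer arithmetic evaluates this to 0.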
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34228
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch
OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora'
if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi
torchrun \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --nproc_per_node=${GPUS} \
  --master_port=${MASTER_PORT} \
  internvl/train/internvl_chat_finetune.py \
  --model_name_or_path "/DATA/workshop/personal/InternVL-main/pretrained/InternVL2-1B" \
  --conv_style "Hermes-2" \
  --output_dir ${OUTPUT_DIR} \
  --meta_path "/DATA/jupyter/personal/InternVL-main/internvl_chat/shell/data/mydata.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 4 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.0 \
  --freeze_llm True \
  --freeze_mlp True \
  --freeze_backbone True \
  --use_llm_lora 16 \
  --vision_select_layer -1 \
  --dataloader_num_workers 1 \
  --bf16 False \
  --num_train_epochs 1 \
  --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
  --gradient_accumulation_steps ${GRADIENT_ACC} \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 200 \
  --save_total_limit 1 \
  --learning_rate 4e-5 \
  --weight_decay 0.01 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 4096 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length True \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "zero_stage1_config.json" \
  --report_to "tensorboard" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
```
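For context on the data side, here is a rough sketch of how I understand the `--meta_path` file and a multi-image annotation line are supposed to look, based on my reading of the InternVL fine-tuning docs. This is not my actual data; the dataset name, paths, and values below are placeholders, and the field names are assumptions.

```python
import json

# Hypothetical meta file passed via --meta_path; keys follow the InternVL
# custom-dataset description (root / annotation / data_augment / repeat_time / length).
meta = {
    "my_multi_image_dataset": {                 # placeholder dataset name
        "root": "data/my_images/",              # image root directory
        "annotation": "data/my_annotations.jsonl",
        "data_augment": False,
        "repeat_time": 1,
        "length": 30000                         # roughly 30k samples, as above
    }
}
with open("data/mydata.json", "w") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

# One JSONL line for a multi-image conversation: the number of <image>
# placeholders in the human turn should match the number of entries in "image".
sample = {
    "id": 0,
    "image": ["scene/frame_0.jpg", "scene/frame_1.jpg"],
    "conversations": [
        {"from": "human",
         "value": "Image-1: <image>\nImage-2: <image>\nWhat changed between the two images?"},
        {"from": "gpt",
         "value": "The second image shows ..."}
    ]
}
with open("data/my_annotations.jsonl", "a") as f:
    f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```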