Description
Very early in training, the loss suddenly jumps to 0, and lowering the learning rate does not fix the problem.
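A loss that collapses to exactly 0 under bf16 is more typical of a numerical problem (NaN/Inf logits, or a batch whose labels are entirely masked out) than of a learning-rate issue, which would be consistent with a lower learning rate changing nothing. Since `logging_steps` is 1 and the `run` section of the config below mirrors HuggingFace `TrainingArguments`, one way to catch the offending step is a callback that halts training as soon as the logged loss becomes 0 or NaN. This is a hypothetical diagnostic sketch, not part of ST-LLM; the `trainer` object in the usage comment is assumed.

```python
# Hypothetical diagnostic sketch (not part of ST-LLM): stops an HF Trainer run
# as soon as the logged training loss is exactly 0 or NaN, so the offending
# step/batch can be inspected. Assumes the training loop is a standard
# transformers.Trainer, which the HF-style keys in the run config suggest.
import math

from transformers import TrainerCallback


class ZeroLossGuard(TrainerCallback):
    """Halt training when the reported loss is 0 or NaN."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if loss is not None and (loss == 0.0 or math.isnan(loss)):
            print(f"Suspicious loss {loss!r} at global step {state.global_step}")
            control.should_training_stop = True
        return control


# Usage (assumed trainer object): trainer.add_callback(ZeroLossGuard())
```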

The configuration file is as follows:
model:
  arch: st_llm_hf
  model_type: instructblip_vicuna0
  use_grad_checkpoint: True
  max_txt_len: 256
  end_sym: "###"
  #prompt_path: "prompts/alignment.txt"
  prompt_template: '###Human: {} ###Assistant: '
  llama_model: '/root/qfs/lmm/weights/stllm/pretrained/vicuna-7b-v1.1/'
  ckpt: '/root/qfs/lmm/weights/stllm/pretrained/instruct_blip_vicuna7b_trimmed.pth'
  q_former_model: '/root/qfs/lmm/weights/stllm/pretrained/instruct_blip_vicuna7b_trimmed.pth'
  qformer_text_input: True
  freeze_LLM: False
  video_input: "residual"
  residual_size: 16
  use_mask: True
  mvm_decode: True

datasets:
  caption_体育240402_en:
    num_frames: 64

run:
  task: video_text_it
  bf16: True
  tf32: False
  output_dir: "./output/instructblipbase_stllm_conversation"
  num_train_epochs: 4
  dataloader_num_workers: 2
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  gradient_accumulation_steps: 1
  evaluation_strategy: "no"
  learning_rate: 2e-5
  learning_rate: 1e-10
  weight_decay: 0.
  warmup_ratio: 0.03
  warmup_ratio: 0.3
  lr_scheduler_type: 'cosine'
  logging_steps: 1
  model_max_length: 1024
  save_steps: 3000
  #save_strategy: "epoch"
  save_total_limit: 10
  deepspeed: 'stllm/train/zero2.json'
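One detail worth noting in the pasted config: `learning_rate` and `warmup_ratio` each appear twice under `run` (presumably the original value and the attempted fix left side by side). A plain YAML load keeps only the last occurrence, so the run would effectively use `learning_rate: 1e-10` and `warmup_ratio: 0.3`. Below is a small check, assuming the file is read with stock PyYAML (`yaml.safe_load`); how ST-LLM actually loads it is not shown in the report, and stricter loaders may reject duplicate keys outright.

```python
# Hypothetical check, assuming the config is parsed with stock PyYAML.
# Duplicate mapping keys are resolved last-wins by yaml.safe_load. Also note
# the type of the result: PyYAML's YAML 1.1 resolver only recognizes scientific
# notation as a float when the mantissa has a dot (e.g. 1.0e-10), so 1e-10
# comes back as a string, a classic source of silent config bugs.
import yaml

snippet = """
run:
  learning_rate: 2e-5
  learning_rate: 1e-10
  warmup_ratio: 0.03
  warmup_ratio: 0.3
"""

run_cfg = yaml.safe_load(snippet)["run"]
print(run_cfg)                         # {'learning_rate': '1e-10', 'warmup_ratio': 0.3}
print(type(run_cfg["learning_rate"]))  # <class 'str'> with stock PyYAML
```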