Update training configuration

parent 897dcaf386
commit 9db6753ca3
@@ -210,10 +210,11 @@ def create_lora_config():
         task_type=TaskType.CAUSAL_LM,
         target_modules=["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
         inference_mode=False,
-        r=8,  # increase the rank to improve expressive capacity
-        lora_alpha=8,  # alpha = 2 * r
-        lora_dropout=0.1,
-        modules_to_save=["lm_head", "embed_tokens"]
+        r=8,  # rank
+        lora_alpha=8,  # lower alpha for better stability
+        lora_dropout=0.05,  # lower dropout to reduce instability
+        # modules_to_save removed to avoid NaN issues in the embed_tokens parameters
+        # modules_to_save=["lm_head", "embed_tokens"]
     )
     return config
 
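Taken together, the hunk above leaves create_lora_config returning roughly the configuration sketched below (a reconstruction assuming peft's LoraConfig and TaskType, not the literal file contents). Note that with lora_alpha equal to r, the effective LoRA scaling factor alpha / r is 1.0.

# Sketch of the updated LoRA configuration; assumes the peft library is installed.
from peft import LoraConfig, TaskType

def create_lora_config():
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=False,
        r=8,                # LoRA rank
        lora_alpha=8,       # scaling numerator; alpha / r = 1.0 here
        lora_dropout=0.05,  # lowered dropout to reduce instability
        # modules_to_save dropped to avoid NaN issues in embed_tokens
    )
    return config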
@@ -231,7 +232,10 @@ def prepare_dataset(data_path, tokenizer):
 
     # convert to the Dataset format
     dataset = Dataset.from_list(data)
+    # filter out None and empty strings (recommended)
+    dataset = dataset.filter(
+        lambda example: example.get("output") not in [None, ""]
+    )
     # apply the preprocessing function
     tokenized_dataset = dataset.map(
         lambda example: process_func(example, tokenizer),
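The new filter drops records whose "output" field is missing or empty before tokenization, so examples with no target text never reach training. A small self-contained illustration of that behaviour (the records below are made up for demonstration; only datasets.Dataset is assumed):

# Demonstration of the filtering step added in prepare_dataset.
from datasets import Dataset

records = [
    {"instruction": "greet", "output": "Hello there!"},
    {"instruction": "empty", "output": ""},      # dropped by the filter
    {"instruction": "missing", "output": None},  # dropped by the filter
]

dataset = Dataset.from_list(records)
dataset = dataset.filter(lambda example: example.get("output") not in [None, ""])
print(len(dataset))  # 1 - only the record with a non-empty output remains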
@@ -280,17 +284,32 @@ def train_lora_model(model_path, data_path, output_dir):
     # 3. apply LoRA
     model = get_peft_model(model, lora_config)
 
-    # 4. enable gradient computation
+    # 4. numerical-stability initialization - initialize the LoRA weights
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            if 'lora_A' in name:
+                # initialize the LoRA A matrices from a normal distribution
+                torch.nn.init.normal_(param, mean=0.0, std=0.01)
+            elif 'lora_B' in name:
+                # initialize the LoRA B matrices to zero
+                torch.nn.init.zeros_(param)
+
+            # check for abnormal values after initialization
+            if torch.isnan(param).any() or torch.isinf(param).any():
+                logger.error(f"Abnormal values detected in parameter {name} after initialization")
+                torch.nn.init.normal_(param, mean=0.0, std=0.001)
+
+    # 5. enable gradient computation
     for param in model.parameters():
         if param.requires_grad:
             param.requires_grad_(True)
 
     model.config.use_cache = False  # disable the KV cache to save GPU memory
 
-    # 5. prepare the dataset
+    # 6. prepare the dataset
     train_preparedataset = prepare_dataset(data_path, tokenizer)
 
-    # 6. configure the training arguments - tuned for an RTX 3080 GPU
+    # 7. configure the training arguments - tuned for an RTX 3080 GPU
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=2,  # reduce the batch size
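The loop above overwrites the adapter weights that get_peft_model has just created and falls back to a smaller std if NaN/Inf values are detected. A hypothetical helper (not part of this commit) for auditing the result afterwards might look like the sketch below; peft's model.print_trainable_parameters() covers the parameter counts in a similar way.

import torch

def audit_lora_parameters(model):
    """Hypothetical helper: count trainable parameters and flag NaN/Inf values."""
    trainable, total, bad = 0, 0, []
    for name, param in model.named_parameters():
        total += param.numel()
        if param.requires_grad:
            trainable += param.numel()
            if torch.isnan(param).any() or torch.isinf(param).any():
                bad.append(name)
    print(f"trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    return bad  # an empty list means no abnormal values remain after initialization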
@@ -298,9 +317,9 @@ def train_lora_model(model_path, data_path, output_dir):
         logging_steps=10,
         num_train_epochs=3,  # more epochs so the model fully learns the character traits
         save_steps=50,
-        learning_rate=1e-5,  # lower the learning rate for stability
+        learning_rate=5e-6,  # lower the learning rate further
         warmup_ratio=0.1,
-        max_grad_norm=1.0,  # keep gradient clipping
+        max_grad_norm=0.5,  # stricter gradient clipping
         save_on_each_node=True,
         gradient_checkpointing=True,
         gradient_checkpointing_kwargs={"use_reentrant": True},
@@ -310,6 +329,8 @@ def train_lora_model(model_path, data_path, output_dir):
         bf16=True,  # explicitly enable bf16 to match the dtype the model was loaded with
         #fp16=False,  # make sure fp16 is disabled
         save_total_limit=3,  # keep only the 3 most recent checkpoints
+        adam_epsilon=1e-8,  # improve numerical stability
+        weight_decay=0.01,  # add weight decay
     )
 
     # add SwanLab monitoring
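For reference, the hyperparameters visible across the TrainingArguments hunks assemble into roughly the sketch below. Only the arguments shown in this diff are included; the actual script may set others, and the output_dir value is a placeholder.

from transformers import TrainingArguments

# Sketch of the updated TrainingArguments, assembled from the hunks above.
training_args = TrainingArguments(
    output_dir="output",                  # placeholder; the script passes output_dir
    per_device_train_batch_size=2,        # reduced batch size
    logging_steps=10,
    num_train_epochs=3,
    save_steps=50,
    learning_rate=5e-6,                   # lowered further for stability
    warmup_ratio=0.1,
    max_grad_norm=0.5,                    # stricter gradient clipping
    save_on_each_node=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    bf16=True,
    save_total_limit=3,
    adam_epsilon=1e-8,                    # new: numerical stability
    weight_decay=0.01,                    # new: weight decay
)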
@@ -322,7 +343,7 @@ def train_lora_model(model_path, data_path, output_dir):
     # create the gradient-monitoring callback
     gradient_monitor = GradientMonitorCallback()
 
-    # 7. create the trainer
+    # 8. create the trainer
     trainer = Trainer(
         model=model,
         args=training_args,
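GradientMonitorCallback is defined elsewhere in the script and its body is not visible in this diff. Purely as an assumption, a gradient-monitoring TrainerCallback along these lines would log the global gradient norm and warn about NaN/Inf gradients; the on_pre_optimizer_step hook requires a recent transformers release, and the real callback may be implemented differently.

import torch
from transformers import TrainerCallback

class GradientMonitorSketch(TrainerCallback):
    """Hypothetical sketch only; not the actual GradientMonitorCallback."""

    def on_pre_optimizer_step(self, args, state, control, model=None, **kwargs):
        if model is None:
            return
        total_sq = 0.0
        for name, param in model.named_parameters():
            if param.grad is None:
                continue
            grad = param.grad.detach()
            if torch.isnan(grad).any() or torch.isinf(grad).any():
                print(f"step {state.global_step}: abnormal gradient in {name}")
            total_sq += grad.float().norm().item() ** 2
        print(f"step {state.global_step}: global grad norm = {total_sq ** 0.5:.4f}")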
@@ -331,7 +352,7 @@ def train_lora_model(model_path, data_path, output_dir):
         callbacks=[swanlab_callback, gradient_monitor]  # add the gradient-monitoring callback
     )
 
-    # 8. start training
+    # 9. start training
     print("Starting training...")
     logger.info("Starting training...")
 
@@ -344,7 +365,7 @@ def train_lora_model(model_path, data_path, output_dir):
         logger.error(traceback.format_exc())
         raise
 
-    # 9. save the final model
+    # 10. save the final model
     final_output_dir = os.path.join(output_dir, "final_model")
     trainer.save_model(final_output_dir)
     tokenizer.save_pretrained(final_output_dir)
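After training, final_model holds the LoRA adapter written by trainer.save_model together with the tokenizer files. A hedged sketch of loading them back for inference with peft follows; the base-model path is a placeholder, and the adapter path mirrors final_output_dir above.

# Sketch: reloading the saved adapter for inference. Paths are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_path = "path/to/base_model"   # placeholder for the original base model
adapter_path = "output_dir/final_model"  # corresponds to final_output_dir in the script

tokenizer = AutoTokenizer.from_pretrained(adapter_path)
model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, adapter_path)
model.eval()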