更新人工打分功能

This commit is contained in:
997146918 2025-08-23 18:13:45 +08:00
parent 5ba1d0dbdd
commit aefda38d12
3 changed files with 102 additions and 189 deletions

View File

@ -391,15 +391,16 @@ class DualAIDialogueEngine:
"""双AI对话引擎""" """双AI对话引擎"""
def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator, def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator,
enable_scoring: bool = True, base_model_path: str = None): enable_scoring: bool = True, base_model_path: str = None, use_manual_scoring: bool = False):
self.kb = knowledge_base self.kb = knowledge_base
self.conv_mgr = conversation_manager self.conv_mgr = conversation_manager
self.llm_generator = llm_generator self.llm_generator = llm_generator
self.enable_scoring = enable_scoring self.enable_scoring = enable_scoring
self.use_manual_scoring = use_manual_scoring
self.scorer = None self.scorer = None
# 初始化评分器 # 初始化评分器
if enable_scoring and base_model_path: if enable_scoring and base_model_path and not use_manual_scoring:
try: try:
from dialogue_scorer import DialogueAIScorer from dialogue_scorer import DialogueAIScorer
print("正在初始化对话评分系统...") print("正在初始化对话评分系统...")
@ -413,6 +414,74 @@ class DualAIDialogueEngine:
print(f"⚠ 对话评分系统初始化失败: {e}") print(f"⚠ 对话评分系统初始化失败: {e}")
self.enable_scoring = False self.enable_scoring = False
def _manual_score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
"""人工打分对话轮次
Args:
dialogue_content: 对话内容
speaker: 说话者
dialogue_history: 对话历史
Returns:
tuple: (总分, 详细分数JSON, 反馈意见)
"""
print("\n" + "="*60)
print("人工对话评分")
print("="*60)
# print(f"说话者: {speaker}")
# print(f"对话内容: {dialogue_content}")
print("-" * 40)
# # 显示最近的对话历史作为参考
# if dialogue_history:
# print("最近对话历史:")
# for i, turn in enumerate(dialogue_history[-3:], 1):
# print(f" {i}. {turn.speaker}: {turn.content[:100]}...")
# print("-" * 40)
# 五个评分维度
dimensions = {
'coherence': '逻辑连贯性 (1-10)',
'character_consistency': '角色一致性 (1-10)',
'naturalness': '自然流畅度 (1-10)',
'information_density': '信息密度 (1-10)',
'creativity': '创意新颖度 (1-10)'
}
scores = {}
print("\n请为以下维度打分 (输入1-10的分数直接回车跳过该维度):")
for key, desc in dimensions.items():
while True:
try:
score_input = input(f"{desc}: ").strip()
if score_input == "":
scores[key] = 7.0 # 默认分数
break
score = float(score_input)
if 1 <= score <= 10:
scores[key] = score
break
else:
print("请输入1-10之间的分数")
except ValueError:
print("请输入有效的数字")
# 计算总分
overall_score = sum(scores.values()) / len(scores)
# 获取反馈意见
print("\n请输入对该对话的评价和建议 (可选,直接回车跳过):")
feedback = input("反馈意见: ").strip()
if not feedback:
feedback = f"人工评分完成,总分: {overall_score:.1f}"
print(f"\n✓ 评分完成 - 总分: {overall_score:.1f}")
print("="*60)
return overall_score, json.dumps(scores), feedback
def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]: def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
"""对单条对话进行评分 """对单条对话进行评分
@ -424,9 +493,17 @@ class DualAIDialogueEngine:
Returns: Returns:
tuple: (总分, 详细分数JSON, 反馈意见) tuple: (总分, 详细分数JSON, 反馈意见)
""" """
if not self.enable_scoring or not self.scorer: if not self.enable_scoring:
return 0.0, "{}", "评分系统未启用" return 0.0, "{}", "评分系统未启用"
# 人工打分模式
if self.use_manual_scoring:
return self._manual_score_dialogue_turn(dialogue_content, speaker, dialogue_history)
# AI自动打分模式
if not self.scorer:
return 0.0, "{}", "AI评分器未初始化"
try: try:
# 获取角色数据 # 获取角色数据
character_data = self.kb.character_data.get(speaker, {}) character_data = self.kb.character_data.get(speaker, {})

View File

@ -121,7 +121,7 @@ def show_character_info():
except Exception as e: except Exception as e:
print(f"✗ 读取角色文件失败: {char_file} - {e}") print(f"✗ 读取角色文件失败: {char_file} - {e}")
def run_dialogue_system(enableScore: bool): def run_dialogue_system(enableScore: bool, useManualScoring: bool = False):
"""运行双AI对话系统""" """运行双AI对话系统"""
print("\n" + "="*60) print("\n" + "="*60)
print("启动双AI角色对话系统") print("启动双AI角色对话系统")
@ -192,7 +192,8 @@ def run_dialogue_system(enableScore: bool):
conv_mgr, conv_mgr,
dual_generator, dual_generator,
enable_scoring=enableScore, enable_scoring=enableScore,
base_model_path=base_model_path base_model_path=base_model_path,
use_manual_scoring=useManualScoring
) )
# 创建对话会话 # 创建对话会话
@ -261,176 +262,6 @@ def run_dialogue_system(enableScore: bool):
traceback.print_exc() traceback.print_exc()
def analyze_model_performance():
    """Print a performance report built from the scored dialogue turns.

    Opens the SQLite database at the ``db_path`` of a ``ConversationManager``
    created for ``./conversation_data/conversations.db`` and prints five
    sections based on rows of ``dialogue_turns`` with ``dialogue_score > 0``:

    1. Daily average score and high-quality rate for the last 7 days.
    2. Average score per dimension over the 100 most recent scored turns;
       dimensions averaging below 7.0 are flagged as weak.
    3. Per-speaker ranking by average score.
    4. Up to five lowest-scoring turns with a score below 6.0.
    5. Suggestions derived from the weak dimensions and the amount of
       scored data.

    Any exception is caught, reported on stdout, and its traceback printed;
    nothing is raised to the caller and nothing is returned.
    """
    print("\n" + "="*60)
    print("模型性能分析")
    print("="*60)

    try:
        import json
        import sqlite3
        from contextlib import closing

        from dual_ai_dialogue_system import ConversationManager

        conv_mgr = ConversationManager("./conversation_data/conversations.db")

        # sqlite3's own context manager only commits/rolls back a
        # transaction; closing() makes sure the connection is released.
        with closing(sqlite3.connect(conv_mgr.db_path)) as conn:
            print("\n1. 总体性能趋势分析:")

            # Per-day trend over the last week.
            cursor = conn.execute("""
                SELECT
                    DATE(timestamp) as date,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                AND timestamp >= datetime('now', '-7 days')
                GROUP BY DATE(timestamp)
                ORDER BY date DESC
            """)
            trend_data = cursor.fetchall()

            if trend_data:
                print(" 最近7天性能趋势:")
                for date, count, avg_score, hq_rate in trend_data:
                    print(f" {date}: 平均{avg_score:.2f}分 ({count}轮对话, {hq_rate*100:.1f}%高质量)")
            else:
                print(" 暂无足够数据进行趋势分析")

            print("\n2. 维度问题分析:")

            # Per-dimension scores from the most recent scored turns.
            cursor = conn.execute("""
                SELECT score_details
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND score_details != '{}'
                ORDER BY timestamp DESC
                LIMIT 100
            """)

            dimension_scores = {
                'coherence': [],
                'character_consistency': [],
                'naturalness': [],
                'information_density': [],
                'creativity': []
            }

            for (score_details,) in cursor.fetchall():
                try:
                    scores = json.loads(score_details)
                    for dim, score in scores.items():
                        if dim in dimension_scores:
                            dimension_scores[dim].append(float(score))
                except (ValueError, TypeError, AttributeError):
                    # Malformed JSON, a non-object payload or a
                    # non-numeric score: ignore the rest of this row.
                    continue

            dimension_names = {
                'coherence': '连贯性',
                'character_consistency': '角色一致性',
                'naturalness': '自然度',
                'information_density': '信息密度',
                'creativity': '创意性'
            }

            weak_dimensions = []
            for dim, scores in dimension_scores.items():
                if scores:
                    avg_score = sum(scores) / len(scores)
                    print(f" {dimension_names[dim]}: 平均{avg_score:.2f}分 ({len(scores)}个样本)")
                    if avg_score < 7.0:
                        weak_dimensions.append(dim)

            if weak_dimensions:
                print(f"\n ⚠ 发现薄弱维度: {[dimension_names[d] for d in weak_dimensions]}")
                print(" 建议进行针对性优化训练")

            print("\n3. 角色表现分析:")

            # Per-speaker statistics, best average first.
            cursor = conn.execute("""
                SELECT
                    speaker,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    MIN(dialogue_score) as min_score,
                    MAX(dialogue_score) as max_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                GROUP BY speaker
                ORDER BY avg_score DESC
            """)
            character_performance = cursor.fetchall()

            if character_performance:
                print(" 角色表现排名:")
                for i, (speaker, count, avg, min_s, max_s, hq_rate) in enumerate(character_performance, 1):
                    # NOTE(review): all three status markers are empty
                    # strings here - presumably symbols were lost; confirm.
                    status = "" if avg >= 7.5 else "" if avg >= 6.5 else ""
                    print(f" {i}. {speaker} {status}")
                    print(f" 平均{avg:.2f}分 (范围{min_s:.1f}-{max_s:.1f}, {hq_rate*100:.1f}%高质量, {count}轮)")

            print("\n4. 问题模式识别:")

            # Lowest-scoring turns, worst first.
            cursor = conn.execute("""
                SELECT content, dialogue_score, score_feedback
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND dialogue_score < 6.0
                ORDER BY dialogue_score ASC
                LIMIT 5
            """)
            low_score_examples = cursor.fetchall()

            if low_score_examples:
                print(" 低分对话示例:")
                for i, (content, score, feedback) in enumerate(low_score_examples, 1):
                    print(f" {i}. 分数{score:.1f}: {content[:50]}...")
                    if feedback:
                        print(f" 问题: {feedback[:80]}...")
            else:
                print(" 暂无低分对话样本")

            print("\n5. 优化建议:")

            # Advice per weak dimension; dict order fixes the print order.
            dimension_advice = {
                'character_consistency': "• 加强角色设定训练,增加角色特征描述的权重",
                'creativity': "• 增加创意性训练数据,提高对话的趣味性",
                'coherence': "• 优化上下文理解,加强对话逻辑连贯性",
                'naturalness': "• 增加自然语言训练,改善表达流畅度",
                'information_density': "• 优化信息组织,避免冗余表达",
            }
            suggestions = [
                advice
                for dim, advice in dimension_advice.items()
                if dim in weak_dimensions
            ]

            # Advice that depends on how much scored data exists.
            cursor = conn.execute("SELECT COUNT(*) FROM dialogue_turns WHERE dialogue_score > 0")
            total_scored = cursor.fetchone()[0]

            if total_scored < 50:
                suggestions.append("• 需要收集更多评分数据以进行准确分析")
            if total_scored >= 100:
                suggestions.append("• 数据量充足,建议开始模型迭代优化")

            if suggestions:
                for suggestion in suggestions:
                    print(f" {suggestion}")
            else:
                print(" 当前性能表现良好,继续保持!")

    except Exception as e:
        # Top-level boundary of a menu action: report and keep running.
        print(f"✗ 性能分析失败: {e}")
        import traceback
        traceback.print_exc()
def generate_training_dataset(): def generate_training_dataset():
"""生成训练数据集""" """生成训练数据集"""
@ -1271,18 +1102,19 @@ def main():
print("主菜单 - 请选择操作:") print("主菜单 - 请选择操作:")
print("1. 处理PDF世界观文档 (转换为RAG格式)") print("1. 处理PDF世界观文档 (转换为RAG格式)")
print("2. 查看角色设定信息") print("2. 查看角色设定信息")
print("3. 启动双AI对话系统 (开启ai打分)") print("3. 启动双AI对话系统 (开启AI打分)")
print("4. 启动双AI对话系统 (关闭ai打分)") print("4. 启动双AI对话系统 (关闭AI打分)")
print("5. 系统状态检查") print("5. 启动双AI对话系统 (开启人工打分)")
print("6. 查看对话评分统计") print("6. 系统状态检查")
print("7. 模型性能分析与优化") print("7. 查看对话评分统计")
print("8. 生成训练数据集") print("8. 模型性能分析与优化")
print("9. 模型迭代优化") print("9. 生成训练数据集")
print("10. 查看使用说明") print("10. 模型迭代优化")
print("11. 查看使用说明")
print("0. 退出") print("0. 退出")
print("="*50) print("="*50)
choice = input("请输入选择 (0-10): ").strip() choice = input("请输入选择 (0-11): ").strip()
if choice == '0': if choice == '0':
print("\n感谢使用双AI角色对话系统") print("\n感谢使用双AI角色对话系统")
@ -1301,21 +1133,25 @@ def main():
run_dialogue_system(enableScore = False) run_dialogue_system(enableScore = False)
elif choice == '5': elif choice == '5':
show_system_status() run_dialogue_system(enableScore = True, useManualScoring = True)
elif choice == '6': elif choice == '6':
show_scoring_statistics() show_system_status()
elif choice == '7': elif choice == '7':
analyze_model_performance() show_scoring_statistics()
elif choice == '8': elif choice == '8':
generate_training_dataset() # 模型性能分析与优化 - 待实现
print("模型性能分析与优化功能开发中...")
elif choice == '9': elif choice == '9':
run_model_optimization() generate_training_dataset()
elif choice == '10': elif choice == '10':
run_model_optimization()
elif choice == '11':
show_usage_guide() show_usage_guide()
else: else: