更新人工打分功能
This commit is contained in:
parent
5ba1d0dbdd
commit
aefda38d12
Binary file not shown.
@ -391,15 +391,16 @@ class DualAIDialogueEngine:
|
|||||||
"""双AI对话引擎"""
|
"""双AI对话引擎"""
|
||||||
|
|
||||||
def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator,
|
def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator,
|
||||||
enable_scoring: bool = True, base_model_path: str = None):
|
enable_scoring: bool = True, base_model_path: str = None, use_manual_scoring: bool = False):
|
||||||
self.kb = knowledge_base
|
self.kb = knowledge_base
|
||||||
self.conv_mgr = conversation_manager
|
self.conv_mgr = conversation_manager
|
||||||
self.llm_generator = llm_generator
|
self.llm_generator = llm_generator
|
||||||
self.enable_scoring = enable_scoring
|
self.enable_scoring = enable_scoring
|
||||||
|
self.use_manual_scoring = use_manual_scoring
|
||||||
self.scorer = None
|
self.scorer = None
|
||||||
|
|
||||||
# 初始化评分器
|
# 初始化评分器
|
||||||
if enable_scoring and base_model_path:
|
if enable_scoring and base_model_path and not use_manual_scoring:
|
||||||
try:
|
try:
|
||||||
from dialogue_scorer import DialogueAIScorer
|
from dialogue_scorer import DialogueAIScorer
|
||||||
print("正在初始化对话评分系统...")
|
print("正在初始化对话评分系统...")
|
||||||
@ -412,6 +413,74 @@ class DualAIDialogueEngine:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠ 对话评分系统初始化失败: {e}")
|
print(f"⚠ 对话评分系统初始化失败: {e}")
|
||||||
self.enable_scoring = False
|
self.enable_scoring = False
|
||||||
|
|
||||||
|
def _manual_score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List["DialogueTurn"]) -> Tuple[float, str, str]:
    """Interactively collect a human rating for one dialogue turn.

    The rater is prompted on stdin for a 1-10 score in each of five
    dimensions and then for an optional free-text comment. An empty answer
    (or a closed stdin) records the default score for that dimension, so the
    returned average is always computed over all five dimensions.

    Args:
        dialogue_content: Text of the turn being rated. Not read by this
            method; accepted so the signature matches ``score_dialogue_turn``.
        speaker: Name of the speaker of the turn. Not read by this method.
        dialogue_history: Earlier turns of the conversation. Not read by
            this method.

    Returns:
        tuple: ``(overall_score, scores_json, feedback)`` where
        ``overall_score`` is the arithmetic mean of the five dimension
        scores, ``scores_json`` is a JSON object mapping dimension key to
        score, and ``feedback`` is the rater's comment (or a generated
        summary line when no comment was entered).
    """
    # Score recorded when the rater just presses Enter or stdin is exhausted.
    default_score = 7.0

    def _read_line(prompt: str) -> str:
        """Read one stripped line; a closed stdin counts as an empty answer."""
        try:
            return input(prompt).strip()
        except EOFError:
            # Non-interactive runs must not abort the whole dialogue session.
            return ""

    banner = "=" * 60
    print("\n" + banner)
    print("人工对话评分")
    print(banner)
    print("-" * 40)

    # The five rating dimensions: key stored in the JSON -> prompt label.
    dimensions = {
        'coherence': '逻辑连贯性 (1-10)',
        'character_consistency': '角色一致性 (1-10)',
        'naturalness': '自然流畅度 (1-10)',
        'information_density': '信息密度 (1-10)',
        'creativity': '创意新颖度 (1-10)',
    }

    scores = {}
    # The prompt states the default explicitly: an empty answer is not
    # excluded from the average, it is counted as ``default_score``.
    print(f"\n请为以下维度打分 (输入1-10的分数,直接回车使用默认分{default_score:g}):")

    for key, desc in dimensions.items():
        # Re-prompt until the answer is empty or a number within 1-10.
        while True:
            raw = _read_line(f"{desc}: ")
            if not raw:
                scores[key] = default_score
                break

            try:
                score = float(raw)
            except ValueError:
                print("请输入有效的数字")
                continue

            if 1 <= score <= 10:
                scores[key] = score
                break
            print("请输入1-10之间的分数")

    # Overall score is the unweighted mean of all dimensions.
    overall_score = sum(scores.values()) / len(scores)

    print("\n请输入对该对话的评价和建议 (可选,直接回车跳过):")
    feedback = _read_line("反馈意见: ")
    if not feedback:
        feedback = f"人工评分完成,总分: {overall_score:.1f}"

    print(f"\n✓ 评分完成 - 总分: {overall_score:.1f}")
    print(banner)

    return overall_score, json.dumps(scores), feedback
|
||||||
|
|
||||||
def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
|
def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
|
||||||
"""对单条对话进行评分
|
"""对单条对话进行评分
|
||||||
@ -424,8 +493,16 @@ class DualAIDialogueEngine:
|
|||||||
Returns:
|
Returns:
|
||||||
tuple: (总分, 详细分数JSON, 反馈意见)
|
tuple: (总分, 详细分数JSON, 反馈意见)
|
||||||
"""
|
"""
|
||||||
if not self.enable_scoring or not self.scorer:
|
if not self.enable_scoring:
|
||||||
return 0.0, "{}", "评分系统未启用"
|
return 0.0, "{}", "评分系统未启用"
|
||||||
|
|
||||||
|
# 人工打分模式
|
||||||
|
if self.use_manual_scoring:
|
||||||
|
return self._manual_score_dialogue_turn(dialogue_content, speaker, dialogue_history)
|
||||||
|
|
||||||
|
# AI自动打分模式
|
||||||
|
if not self.scorer:
|
||||||
|
return 0.0, "{}", "AI评分器未初始化"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 获取角色数据
|
# 获取角色数据
|
||||||
|
|||||||
@ -121,7 +121,7 @@ def show_character_info():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"✗ 读取角色文件失败: {char_file} - {e}")
|
print(f"✗ 读取角色文件失败: {char_file} - {e}")
|
||||||
|
|
||||||
def run_dialogue_system(enableScore: bool):
|
def run_dialogue_system(enableScore: bool, useManualScoring: bool = False):
|
||||||
"""运行双AI对话系统"""
|
"""运行双AI对话系统"""
|
||||||
print("\n" + "="*60)
|
print("\n" + "="*60)
|
||||||
print("启动双AI角色对话系统")
|
print("启动双AI角色对话系统")
|
||||||
@ -192,7 +192,8 @@ def run_dialogue_system(enableScore: bool):
|
|||||||
conv_mgr,
|
conv_mgr,
|
||||||
dual_generator,
|
dual_generator,
|
||||||
enable_scoring=enableScore,
|
enable_scoring=enableScore,
|
||||||
base_model_path=base_model_path
|
base_model_path=base_model_path,
|
||||||
|
use_manual_scoring=useManualScoring
|
||||||
)
|
)
|
||||||
|
|
||||||
# 创建对话会话
|
# 创建对话会话
|
||||||
@ -261,176 +262,6 @@ def run_dialogue_system(enableScore: bool):
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
def analyze_model_performance():
    """Print a five-part performance report from the scored dialogue turns.

    Reads the ``dialogue_turns`` table of the conversation database and
    prints, in order: the daily score trend for the last 7 days, the average
    score per rating dimension (flagging those below 7.0), a per-speaker
    ranking, up to five of the lowest-scoring turns, and a list of
    optimisation suggestions derived from the above.

    Only rows with ``dialogue_score > 0`` are considered. Any exception
    raised while building the report is caught, reported on stdout together
    with its traceback, and not re-raised.
    """
    print("\n" + "="*60)
    print("模型性能分析")
    print("="*60)

    try:
        from dual_ai_dialogue_system import ConversationManager
        import sqlite3
        import json
        from contextlib import closing

        conv_mgr = ConversationManager("./conversation_data/conversations.db")

        # sqlite3's own context manager only commits/rolls back the
        # transaction; closing() is what actually releases the connection.
        with closing(sqlite3.connect(conv_mgr.db_path)) as conn:
            print("\n1. 总体性能趋势分析:")

            # Daily aggregates over the last 7 days.
            cursor = conn.execute("""
                SELECT
                    DATE(timestamp) as date,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                AND timestamp >= datetime('now', '-7 days')
                GROUP BY DATE(timestamp)
                ORDER BY date DESC
            """)

            trend_data = cursor.fetchall()
            if trend_data:
                print(" 最近7天性能趋势:")
                for date, count, avg_score, hq_rate in trend_data:
                    print(f" {date}: 平均{avg_score:.2f}分 ({count}轮对话, {hq_rate*100:.1f}%高质量)")
            else:
                print(" 暂无足够数据进行趋势分析")

            print("\n2. 维度问题分析:")

            # Per-dimension scores from the 100 most recent scored turns.
            cursor = conn.execute("""
                SELECT score_details
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND score_details != '{}'
                ORDER BY timestamp DESC
                LIMIT 100
            """)

            dimension_scores = {
                'coherence': [],
                'character_consistency': [],
                'naturalness': [],
                'information_density': [],
                'creativity': []
            }

            for (score_details,) in cursor.fetchall():
                try:
                    parsed = json.loads(score_details)
                    for dim, score in parsed.items():
                        if dim in dimension_scores:
                            dimension_scores[dim].append(float(score))
                except (ValueError, TypeError, AttributeError):
                    # Malformed / non-object JSON or a non-numeric score:
                    # skip the rest of this row, keep analysing the others.
                    continue

            dimension_names = {
                'coherence': '连贯性',
                'character_consistency': '角色一致性',
                'naturalness': '自然度',
                'information_density': '信息密度',
                'creativity': '创意性'
            }

            # Dimensions whose average falls below 7.0 drive the suggestions in part 5.
            weak_dimensions = []
            for dim, samples in dimension_scores.items():
                if samples:
                    avg_score = sum(samples) / len(samples)
                    print(f" {dimension_names[dim]}: 平均{avg_score:.2f}分 ({len(samples)}个样本)")
                    if avg_score < 7.0:
                        weak_dimensions.append(dim)

            if weak_dimensions:
                print(f"\n ⚠ 发现薄弱维度: {[dimension_names[d] for d in weak_dimensions]}")
                print(" 建议进行针对性优化训练")

            print("\n3. 角色表现分析:")

            # Per-speaker aggregates, best average first.
            cursor = conn.execute("""
                SELECT
                    speaker,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    MIN(dialogue_score) as min_score,
                    MAX(dialogue_score) as max_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                GROUP BY speaker
                ORDER BY avg_score DESC
            """)

            character_performance = cursor.fetchall()
            if character_performance:
                print(" 角色表现排名:")
                for i, (speaker, count, avg, min_s, max_s, hq_rate) in enumerate(character_performance, 1):
                    status = "✓" if avg >= 7.5 else "⚠" if avg >= 6.5 else "✗"
                    print(f" {i}. {speaker} {status}")
                    print(f" 平均{avg:.2f}分 (范围{min_s:.1f}-{max_s:.1f}, {hq_rate*100:.1f}%高质量, {count}轮)")

            print("\n4. 问题模式识别:")

            # The five lowest-scoring turns (score below 6.0) as examples.
            cursor = conn.execute("""
                SELECT content, dialogue_score, score_feedback
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND dialogue_score < 6.0
                ORDER BY dialogue_score ASC
                LIMIT 5
            """)

            low_score_examples = cursor.fetchall()
            if low_score_examples:
                print(" 低分对话示例:")
                for i, (content, score, feedback) in enumerate(low_score_examples, 1):
                    # content may be NULL in the database; slice an empty string instead.
                    print(f" {i}. 分数{score:.1f}: {(content or '')[:50]}...")
                    if feedback:
                        print(f" 问题: {feedback[:80]}...")
            else:
                print(" 暂无低分对话样本")

            print("\n5. 优化建议:")

            # One canned suggestion per weak dimension, in a fixed order.
            dimension_advice = (
                ('character_consistency', "• 加强角色设定训练,增加角色特征描述的权重"),
                ('creativity', "• 增加创意性训练数据,提高对话的趣味性"),
                ('coherence', "• 优化上下文理解,加强对话逻辑连贯性"),
                ('naturalness', "• 增加自然语言训练,改善表达流畅度"),
                ('information_density', "• 优化信息组织,避免冗余表达"),
            )
            suggestions = [advice for dim, advice in dimension_advice if dim in weak_dimensions]

            # Data-volume hints based on the total number of scored turns.
            cursor = conn.execute("SELECT COUNT(*) FROM dialogue_turns WHERE dialogue_score > 0")
            total_scored = cursor.fetchone()[0]

            if total_scored < 50:
                suggestions.append("• 需要收集更多评分数据以进行准确分析")

            if total_scored >= 100:
                suggestions.append("• 数据量充足,建议开始模型迭代优化")

            if suggestions:
                for suggestion in suggestions:
                    print(f" {suggestion}")
            else:
                print(" 当前性能表现良好,继续保持!")

    except Exception as e:
        # Top-level boundary of a menu action: report and return to the menu.
        print(f"✗ 性能分析失败: {e}")
        import traceback
        traceback.print_exc()
|
|
||||||
|
|
||||||
def generate_training_dataset():
|
def generate_training_dataset():
|
||||||
"""生成训练数据集"""
|
"""生成训练数据集"""
|
||||||
@ -1271,18 +1102,19 @@ def main():
|
|||||||
print("主菜单 - 请选择操作:")
|
print("主菜单 - 请选择操作:")
|
||||||
print("1. 处理PDF世界观文档 (转换为RAG格式)")
|
print("1. 处理PDF世界观文档 (转换为RAG格式)")
|
||||||
print("2. 查看角色设定信息")
|
print("2. 查看角色设定信息")
|
||||||
print("3. 启动双AI对话系统 (开启ai打分)")
|
print("3. 启动双AI对话系统 (开启AI打分)")
|
||||||
print("4. 启动双AI对话系统 (关闭ai打分)")
|
print("4. 启动双AI对话系统 (关闭AI打分)")
|
||||||
print("5. 系统状态检查")
|
print("5. 启动双AI对话系统 (开启人工打分)")
|
||||||
print("6. 查看对话评分统计")
|
print("6. 系统状态检查")
|
||||||
print("7. 模型性能分析与优化")
|
print("7. 查看对话评分统计")
|
||||||
print("8. 生成训练数据集")
|
print("8. 模型性能分析与优化")
|
||||||
print("9. 模型迭代优化")
|
print("9. 生成训练数据集")
|
||||||
print("10. 查看使用说明")
|
print("10. 模型迭代优化")
|
||||||
|
print("11. 查看使用说明")
|
||||||
print("0. 退出")
|
print("0. 退出")
|
||||||
print("="*50)
|
print("="*50)
|
||||||
|
|
||||||
choice = input("请输入选择 (0-10): ").strip()
|
choice = input("请输入选择 (0-11): ").strip()
|
||||||
|
|
||||||
if choice == '0':
|
if choice == '0':
|
||||||
print("\n感谢使用双AI角色对话系统!")
|
print("\n感谢使用双AI角色对话系统!")
|
||||||
@ -1301,21 +1133,25 @@ def main():
|
|||||||
run_dialogue_system(enableScore = False)
|
run_dialogue_system(enableScore = False)
|
||||||
|
|
||||||
elif choice == '5':
|
elif choice == '5':
|
||||||
show_system_status()
|
run_dialogue_system(enableScore = True, useManualScoring = True)
|
||||||
|
|
||||||
elif choice == '6':
|
elif choice == '6':
|
||||||
show_scoring_statistics()
|
show_system_status()
|
||||||
|
|
||||||
elif choice == '7':
|
elif choice == '7':
|
||||||
analyze_model_performance()
|
show_scoring_statistics()
|
||||||
|
|
||||||
elif choice == '8':
|
elif choice == '8':
|
||||||
generate_training_dataset()
|
# 模型性能分析与优化 - 待实现
|
||||||
|
print("模型性能分析与优化功能开发中...")
|
||||||
|
|
||||||
elif choice == '9':
|
elif choice == '9':
|
||||||
run_model_optimization()
|
generate_training_dataset()
|
||||||
|
|
||||||
elif choice == '10':
|
elif choice == '10':
|
||||||
|
run_model_optimization()
|
||||||
|
|
||||||
|
elif choice == '11':
|
||||||
show_usage_guide()
|
show_usage_guide()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user