更新人工打分功能
This commit is contained in:
parent
5ba1d0dbdd
commit
aefda38d12
Binary file not shown.
@ -391,15 +391,16 @@ class DualAIDialogueEngine:
|
||||
"""双AI对话引擎"""
|
||||
|
||||
def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator,
|
||||
enable_scoring: bool = True, base_model_path: str = None):
|
||||
enable_scoring: bool = True, base_model_path: str = None, use_manual_scoring: bool = False):
|
||||
self.kb = knowledge_base
|
||||
self.conv_mgr = conversation_manager
|
||||
self.llm_generator = llm_generator
|
||||
self.enable_scoring = enable_scoring
|
||||
self.use_manual_scoring = use_manual_scoring
|
||||
self.scorer = None
|
||||
|
||||
# 初始化评分器
|
||||
if enable_scoring and base_model_path:
|
||||
if enable_scoring and base_model_path and not use_manual_scoring:
|
||||
try:
|
||||
from dialogue_scorer import DialogueAIScorer
|
||||
print("正在初始化对话评分系统...")
|
||||
@ -412,6 +413,74 @@ class DualAIDialogueEngine:
|
||||
except Exception as e:
|
||||
print(f"⚠ 对话评分系统初始化失败: {e}")
|
||||
self.enable_scoring = False
|
||||
|
||||
def _manual_score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: "List[DialogueTurn]") -> Tuple[float, str, str]:
    """Interactively collect a human rating for one dialogue turn.

    The operator is prompted on stdin for a 1-10 score on each of five
    dimensions and then for optional free-text feedback. Pressing Enter on
    a dimension does not exclude it from the average: the dimension is
    recorded with the default score instead.

    If stdin is closed or non-interactive (``input`` raises ``EOFError``),
    the answer is treated as empty, so the remaining dimensions get the
    default score and the feedback falls back to the generated summary
    rather than crashing the dialogue run.

    Args:
        dialogue_content: Text of the turn being rated. Not used by this
            method; it is kept for signature parity with the scoring
            entry point.
        speaker: Name of the speaker of the turn. Not used by this method.
        dialogue_history: Earlier turns of the conversation. Not used by
            this method.

    Returns:
        tuple: (overall score as the arithmetic mean of the dimension
        scores, JSON string mapping dimension key to score, feedback text).
    """
    default_score = 7.0  # recorded when the operator skips a dimension
    min_score, max_score = 1, 10

    # Dimension key -> prompt label shown to the operator.
    dimensions = {
        'coherence': '逻辑连贯性 (1-10)',
        'character_consistency': '角色一致性 (1-10)',
        'naturalness': '自然流畅度 (1-10)',
        'information_density': '信息密度 (1-10)',
        'creativity': '创意新颖度 (1-10)'
    }

    def read_answer(prompt: str) -> str:
        """Return one stripped line from stdin, or "" when stdin is exhausted."""
        try:
            return input(prompt).strip()
        except EOFError:
            return ""

    def ask_score(label: str) -> float:
        """Prompt until a valid score is entered; empty input means default."""
        while True:
            answer = read_answer(f"{label}: ")
            if answer == "":
                return default_score
            try:
                value = float(answer)
            except ValueError:
                print("请输入有效的数字")
                continue
            # NaN fails this comparison as well, so it is re-prompted.
            if min_score <= value <= max_score:
                return value
            print("请输入1-10之间的分数")

    print("\n" + "="*60)
    print("人工对话评分")
    print("="*60)
    print("-" * 40)

    print("\n请为以下维度打分 (输入1-10的分数,直接回车跳过该维度):")
    scores = {key: ask_score(label) for key, label in dimensions.items()}

    # Overall score is the unweighted mean of all five dimensions.
    overall_score = sum(scores.values()) / len(scores)

    print("\n请输入对该对话的评价和建议 (可选,直接回车跳过):")
    feedback = read_answer("反馈意见: ")
    if not feedback:
        feedback = f"人工评分完成,总分: {overall_score:.1f}"

    print(f"\n✓ 评分完成 - 总分: {overall_score:.1f}")
    print("="*60)

    return overall_score, json.dumps(scores), feedback
|
||||
|
||||
def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
|
||||
"""对单条对话进行评分
|
||||
@ -424,8 +493,16 @@ class DualAIDialogueEngine:
|
||||
Returns:
|
||||
tuple: (总分, 详细分数JSON, 反馈意见)
|
||||
"""
|
||||
if not self.enable_scoring or not self.scorer:
|
||||
if not self.enable_scoring:
|
||||
return 0.0, "{}", "评分系统未启用"
|
||||
|
||||
# 人工打分模式
|
||||
if self.use_manual_scoring:
|
||||
return self._manual_score_dialogue_turn(dialogue_content, speaker, dialogue_history)
|
||||
|
||||
# AI自动打分模式
|
||||
if not self.scorer:
|
||||
return 0.0, "{}", "AI评分器未初始化"
|
||||
|
||||
try:
|
||||
# 获取角色数据
|
||||
|
||||
@ -121,7 +121,7 @@ def show_character_info():
|
||||
except Exception as e:
|
||||
print(f"✗ 读取角色文件失败: {char_file} - {e}")
|
||||
|
||||
def run_dialogue_system(enableScore: bool):
|
||||
def run_dialogue_system(enableScore: bool, useManualScoring: bool = False):
|
||||
"""运行双AI对话系统"""
|
||||
print("\n" + "="*60)
|
||||
print("启动双AI角色对话系统")
|
||||
@ -192,7 +192,8 @@ def run_dialogue_system(enableScore: bool):
|
||||
conv_mgr,
|
||||
dual_generator,
|
||||
enable_scoring=enableScore,
|
||||
base_model_path=base_model_path
|
||||
base_model_path=base_model_path,
|
||||
use_manual_scoring=useManualScoring
|
||||
)
|
||||
|
||||
# 创建对话会话
|
||||
@ -261,176 +262,6 @@ def run_dialogue_system(enableScore: bool):
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
def analyze_model_performance():
    """Print a five-part performance report built from scored dialogue turns.

    Reads the ``dialogue_turns`` table of the conversation database and
    prints, in order: the daily score trend for the last 7 days, the
    average of each scoring dimension over the 100 most recent scored
    turns, a per-speaker ranking, up to five of the lowest-scoring turns,
    and a list of optimisation suggestions derived from the above.

    Any exception raised while building the report is caught, reported on
    stdout together with a traceback, and not re-raised.
    """
    print("\n" + "="*60)
    print("模型性能分析")
    print("="*60)

    try:
        from contextlib import closing
        from dual_ai_dialogue_system import ConversationManager
        import sqlite3
        import json

        # Dimension key -> display name used in the report.
        dimension_names = {
            'coherence': '连贯性',
            'character_consistency': '角色一致性',
            'naturalness': '自然度',
            'information_density': '信息密度',
            'creativity': '创意性'
        }

        # Suggestion printed when the given dimension is weak; the order
        # here is the order in which suggestions are listed.
        dimension_suggestions = [
            ('character_consistency', "• 加强角色设定训练,增加角色特征描述的权重"),
            ('creativity', "• 增加创意性训练数据,提高对话的趣味性"),
            ('coherence', "• 优化上下文理解,加强对话逻辑连贯性"),
            ('naturalness', "• 增加自然语言训练,改善表达流畅度"),
            ('information_density', "• 优化信息组织,避免冗余表达"),
        ]

        conv_mgr = ConversationManager("./conversation_data/conversations.db")

        # sqlite3's own context manager only commits/rolls back; closing()
        # is what actually releases the connection when the report is done.
        with closing(sqlite3.connect(conv_mgr.db_path)) as conn:
            print("\n1. 总体性能趋势分析:")

            # Daily aggregates over the last 7 days.
            cursor = conn.execute("""
                SELECT
                    DATE(timestamp) as date,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                AND timestamp >= datetime('now', '-7 days')
                GROUP BY DATE(timestamp)
                ORDER BY date DESC
            """)

            trend_data = cursor.fetchall()
            if trend_data:
                print(" 最近7天性能趋势:")
                for date, count, avg_score, hq_rate in trend_data:
                    print(f" {date}: 平均{avg_score:.2f}分 ({count}轮对话, {hq_rate*100:.1f}%高质量)")
            else:
                print(" 暂无足够数据进行趋势分析")

            print("\n2. 维度问题分析:")

            # Per-dimension scores from the 100 most recent scored turns.
            cursor = conn.execute("""
                SELECT score_details
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND score_details != '{}'
                ORDER BY timestamp DESC
                LIMIT 100
            """)

            dimension_scores = {dim: [] for dim in dimension_names}

            for (score_details,) in cursor.fetchall():
                try:
                    scores = json.loads(score_details)
                    for dim, score in scores.items():
                        if dim in dimension_scores:
                            dimension_scores[dim].append(float(score))
                except (TypeError, ValueError, AttributeError):
                    # Malformed or non-dict score_details: skip this row
                    # (best effort), but let unrelated errors propagate.
                    continue

            weak_dimensions = []
            for dim, scores in dimension_scores.items():
                if scores:
                    avg_score = sum(scores) / len(scores)
                    print(f" {dimension_names[dim]}: 平均{avg_score:.2f}分 ({len(scores)}个样本)")
                    if avg_score < 7.0:
                        weak_dimensions.append(dim)

            if weak_dimensions:
                print(f"\n ⚠ 发现薄弱维度: {[dimension_names[d] for d in weak_dimensions]}")
                print(" 建议进行针对性优化训练")

            print("\n3. 角色表现分析:")

            # Per-speaker aggregates, best average first.
            cursor = conn.execute("""
                SELECT
                    speaker,
                    COUNT(*) as dialogue_count,
                    AVG(dialogue_score) as avg_score,
                    MIN(dialogue_score) as min_score,
                    MAX(dialogue_score) as max_score,
                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
                FROM dialogue_turns
                WHERE dialogue_score > 0
                GROUP BY speaker
                ORDER BY avg_score DESC
            """)

            character_performance = cursor.fetchall()
            if character_performance:
                print(" 角色表现排名:")
                for i, (speaker, count, avg, min_s, max_s, hq_rate) in enumerate(character_performance, 1):
                    status = "✓" if avg >= 7.5 else "⚠" if avg >= 6.5 else "✗"
                    print(f" {i}. {speaker} {status}")
                    print(f" 平均{avg:.2f}分 (范围{min_s:.1f}-{max_s:.1f}, {hq_rate*100:.1f}%高质量, {count}轮)")

            print("\n4. 问题模式识别:")

            # Up to five of the lowest-scoring turns (score below 6.0).
            cursor = conn.execute("""
                SELECT content, dialogue_score, score_feedback
                FROM dialogue_turns
                WHERE dialogue_score > 0 AND dialogue_score < 6.0
                ORDER BY dialogue_score ASC
                LIMIT 5
            """)

            low_score_examples = cursor.fetchall()
            if low_score_examples:
                print(" 低分对话示例:")
                for i, (content, score, feedback) in enumerate(low_score_examples, 1):
                    # content may be NULL in the database; show it as empty
                    # instead of aborting the rest of the report.
                    print(f" {i}. 分数{score:.1f}: {(content or '')[:50]}...")
                    if feedback:
                        print(f" 问题: {feedback[:80]}...")
            else:
                print(" 暂无低分对话样本")

            print("\n5. 优化建议:")

            suggestions = [
                text for dim, text in dimension_suggestions
                if dim in weak_dimensions
            ]

            # Advice on data volume depends on the total number of scored turns.
            cursor = conn.execute("SELECT COUNT(*) FROM dialogue_turns WHERE dialogue_score > 0")
            total_scored = cursor.fetchone()[0]

            if total_scored < 50:
                suggestions.append("• 需要收集更多评分数据以进行准确分析")

            if total_scored >= 100:
                suggestions.append("• 数据量充足,建议开始模型迭代优化")

            if suggestions:
                for suggestion in suggestions:
                    print(f" {suggestion}")
            else:
                print(" 当前性能表现良好,继续保持!")

    except Exception as e:
        # Top-level boundary for a menu action: report and return to the menu.
        print(f"✗ 性能分析失败: {e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
def generate_training_dataset():
|
||||
"""生成训练数据集"""
|
||||
@ -1271,18 +1102,19 @@ def main():
|
||||
print("主菜单 - 请选择操作:")
|
||||
print("1. 处理PDF世界观文档 (转换为RAG格式)")
|
||||
print("2. 查看角色设定信息")
|
||||
print("3. 启动双AI对话系统 (开启ai打分)")
|
||||
print("4. 启动双AI对话系统 (关闭ai打分)")
|
||||
print("5. 系统状态检查")
|
||||
print("6. 查看对话评分统计")
|
||||
print("7. 模型性能分析与优化")
|
||||
print("8. 生成训练数据集")
|
||||
print("9. 模型迭代优化")
|
||||
print("10. 查看使用说明")
|
||||
print("3. 启动双AI对话系统 (开启AI打分)")
|
||||
print("4. 启动双AI对话系统 (关闭AI打分)")
|
||||
print("5. 启动双AI对话系统 (开启人工打分)")
|
||||
print("6. 系统状态检查")
|
||||
print("7. 查看对话评分统计")
|
||||
print("8. 模型性能分析与优化")
|
||||
print("9. 生成训练数据集")
|
||||
print("10. 模型迭代优化")
|
||||
print("11. 查看使用说明")
|
||||
print("0. 退出")
|
||||
print("="*50)
|
||||
|
||||
choice = input("请输入选择 (0-10): ").strip()
|
||||
choice = input("请输入选择 (0-11): ").strip()
|
||||
|
||||
if choice == '0':
|
||||
print("\n感谢使用双AI角色对话系统!")
|
||||
@ -1301,21 +1133,25 @@ def main():
|
||||
run_dialogue_system(enableScore = False)
|
||||
|
||||
elif choice == '5':
|
||||
show_system_status()
|
||||
run_dialogue_system(enableScore = True, useManualScoring = True)
|
||||
|
||||
elif choice == '6':
|
||||
show_scoring_statistics()
|
||||
show_system_status()
|
||||
|
||||
elif choice == '7':
|
||||
analyze_model_performance()
|
||||
show_scoring_statistics()
|
||||
|
||||
elif choice == '8':
|
||||
generate_training_dataset()
|
||||
# 模型性能分析与优化 - 待实现
|
||||
print("模型性能分析与优化功能开发中...")
|
||||
|
||||
elif choice == '9':
|
||||
run_model_optimization()
|
||||
generate_training_dataset()
|
||||
|
||||
elif choice == '10':
|
||||
run_model_optimization()
|
||||
|
||||
elif choice == '11':
|
||||
show_usage_guide()
|
||||
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user