更新人工打分功能

2025-08-23 18:13:45 +08:00 · 2025-08-23 18:13:45 +08:00 · aefda38d12
commit aefda38d12
parent 5ba1d0dbdd
3 changed files with 102 additions and 189 deletions
--- a/AITrain/conversation_data/conversations.db
+++ b/AITrain/conversation_data/conversations.db
--- a/AITrain/dual_ai_dialogue_system.py
+++ b/AITrain/dual_ai_dialogue_system.py
@@ -391,15 +391,16 @@ class DualAIDialogueEngine:
    """双AI对话引擎"""
    
    def __init__(self, knowledge_base: RAGKnowledgeBase, conversation_manager: ConversationManager, llm_generator, 
-                 enable_scoring: bool = True, base_model_path: str = None):
+                 enable_scoring: bool = True, base_model_path: str = None, use_manual_scoring: bool = False):
        self.kb = knowledge_base
        self.conv_mgr = conversation_manager
        self.llm_generator = llm_generator
        self.enable_scoring = enable_scoring
+        self.use_manual_scoring = use_manual_scoring
        self.scorer = None
        
        # 初始化评分器
-        if enable_scoring and base_model_path:
+        if enable_scoring and base_model_path and not use_manual_scoring:
            try:
                from dialogue_scorer import DialogueAIScorer
                print("正在初始化对话评分系统...")
@@ -412,6 +413,74 @@ class DualAIDialogueEngine:
            except Exception as e:
                print(f"⚠ 对话评分系统初始化失败: {e}")
                self.enable_scoring = False
+
+    def _manual_score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
+        """Interactively collect a human score for one dialogue turn.
+
+        Prompts on stdin for five 1-10 dimension scores and optional feedback.
+        An empty answer, or a closed stdin (EOFError), records the default
+        score, so all five dimensions are always present in the result.
+
+        Args:
+            dialogue_content: Text of the turn; not used by this method.
+            speaker: Speaker of the turn; not used by this method.
+            dialogue_history: Earlier turns; not used by this method.
+
+        Returns:
+            tuple: (mean of the five scores, JSON object of per-dimension
+            scores, feedback text - auto-generated when none is entered).
+        """
+        default_score = 7.0  # recorded when the rater just presses Enter
+
+        def ask(prompt: str) -> str:
+            # A closed stdin must not abort the scoring run; treat as empty.
+            try:
+                return input(prompt).strip()
+            except EOFError:
+                return ""
+
+        print("\n" + "="*60)
+        print("人工对话评分")
+        print("="*60)
+        print("-" * 40)
+
+        # Scoring dimensions: JSON key -> prompt label shown to the rater.
+        dimensions = {
+            'coherence': '逻辑连贯性 (1-10)',
+            'character_consistency': '角色一致性 (1-10)',
+            'naturalness': '自然流畅度 (1-10)',
+            'information_density': '信息密度 (1-10)',
+            'creativity': '创意新颖度 (1-10)'
+        }
+
+        scores = {}
+        print(f"\n请为以下维度打分 (输入1-10的分数，直接回车使用默认分{default_score:g}):")
+
+        for key, desc in dimensions.items():
+            while key not in scores:
+                score_input = ask(f"{desc}: ")
+                if score_input == "":
+                    scores[key] = default_score
+                    continue
+                try:
+                    score = float(score_input)
+                except ValueError:
+                    print("请输入有效的数字")
+                    continue
+                if 1 <= score <= 10:  # also rejects NaN and infinities
+                    scores[key] = score
+                else:
+                    print("请输入1-10之间的分数")
+
+        # Overall score is the unweighted mean of the five dimensions.
+        overall_score = sum(scores.values()) / len(scores)
+
+        print("\n请输入对该对话的评价和建议 (可选，直接回车跳过):")
+        feedback = ask("反馈意见: ") or f"人工评分完成，总分: {overall_score:.1f}"
+
+        print(f"\n✓ 评分完成 - 总分: {overall_score:.1f}")
+        print("="*60)
+        return overall_score, json.dumps(scores), feedback
    
    def score_dialogue_turn(self, dialogue_content: str, speaker: str, dialogue_history: List[DialogueTurn]) -> Tuple[float, str, str]:
        """对单条对话进行评分
@@ -424,8 +493,16 @@ class DualAIDialogueEngine:
        Returns:
            tuple: (总分, 详细分数JSON, 反馈意见)
        """
-        if not self.enable_scoring or not self.scorer:
+        if not self.enable_scoring:
            return 0.0, "{}", "评分系统未启用"
+            
+        # 人工打分模式
+        if self.use_manual_scoring:
+            return self._manual_score_dialogue_turn(dialogue_content, speaker, dialogue_history)
+            
+        # AI自动打分模式
+        if not self.scorer:
+            return 0.0, "{}", "AI评分器未初始化"
        
        try:
            # 获取角色数据
--- a/AITrain/main_controller.py
+++ b/AITrain/main_controller.py
@@ -121,7 +121,7 @@ def show_character_info():
        except Exception as e:
            print(f"✗ 读取角色文件失败: {char_file} - {e}")

-def run_dialogue_system(enableScore: bool):
+def run_dialogue_system(enableScore: bool, useManualScoring: bool = False):
    """运行双AI对话系统"""
    print("\n" + "="*60)
    print("启动双AI角色对话系统")
@@ -192,7 +192,8 @@ def run_dialogue_system(enableScore: bool):
            conv_mgr, 
            dual_generator, 
            enable_scoring=enableScore, 
-            base_model_path=base_model_path
+            base_model_path=base_model_path,
+            use_manual_scoring=useManualScoring
        )
        
        # 创建对话会话
@@ -261,176 +262,6 @@ def run_dialogue_system(enableScore: bool):
        traceback.print_exc()


-def analyze_model_performance():
-    """分析模型性能"""
-    print("\n" + "="*60)
-    print("模型性能分析")
-    print("="*60)
-    
-    try:
-        from dual_ai_dialogue_system import ConversationManager
-        import sqlite3
-        import json
-        from datetime import datetime, timedelta
-        
-        conv_mgr = ConversationManager("./conversation_data/conversations.db")
-        
-        with sqlite3.connect(conv_mgr.db_path) as conn:
-            print("\n1. 总体性能趋势分析:")
-            
-            # 按时间段分析性能趋势
-            cursor = conn.execute("""
-                SELECT 
-                    DATE(timestamp) as date,
-                    COUNT(*) as dialogue_count,
-                    AVG(dialogue_score) as avg_score,
-                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
-                FROM dialogue_turns 
-                WHERE dialogue_score > 0 
-                    AND timestamp >= datetime('now', '-7 days')
-                GROUP BY DATE(timestamp)
-                ORDER BY date DESC
-            """)
-            
-            trend_data = cursor.fetchall()
-            if trend_data:
-                print(f"  最近7天性能趋势:")
-                for date, count, avg_score, hq_rate in trend_data:
-                    print(f"    {date}: 平均{avg_score:.2f}分 ({count}轮对话, {hq_rate*100:.1f}%高质量)")
-            else:
-                print("  暂无足够数据进行趋势分析")
-            
-            print("\n2. 维度问题分析:")
-            
-            # 分析各维度的问题
-            cursor = conn.execute("""
-                SELECT score_details 
-                FROM dialogue_turns 
-                WHERE dialogue_score > 0 AND score_details != '{}'
-                ORDER BY timestamp DESC 
-                LIMIT 100
-            """)
-            
-            dimension_scores = {
-                'coherence': [],
-                'character_consistency': [],
-                'naturalness': [],
-                'information_density': [],
-                'creativity': []
-            }
-            
-            for (score_details,) in cursor.fetchall():
-                try:
-                    scores = json.loads(score_details)
-                    for dim, score in scores.items():
-                        if dim in dimension_scores:
-                            dimension_scores[dim].append(float(score))
-                except:
-                    continue
-            
-            dimension_names = {
-                'coherence': '连贯性',
-                'character_consistency': '角色一致性',
-                'naturalness': '自然度',
-                'information_density': '信息密度',
-                'creativity': '创意性'
-            }
-            
-            weak_dimensions = []
-            for dim, scores in dimension_scores.items():
-                if scores:
-                    avg_score = sum(scores) / len(scores)
-                    print(f"    {dimension_names[dim]}: 平均{avg_score:.2f}分 ({len(scores)}个样本)")
-                    if avg_score < 7.0:
-                        weak_dimensions.append(dim)
-            
-            if weak_dimensions:
-                print(f"\n  ⚠ 发现薄弱维度: {[dimension_names[d] for d in weak_dimensions]}")
-                print("  建议进行针对性优化训练")
-            
-            print("\n3. 角色表现分析:")
-            
-            # 分析不同角色的表现
-            cursor = conn.execute("""
-                SELECT 
-                    speaker,
-                    COUNT(*) as dialogue_count,
-                    AVG(dialogue_score) as avg_score,
-                    MIN(dialogue_score) as min_score,
-                    MAX(dialogue_score) as max_score,
-                    AVG(CASE WHEN dialogue_score >= 8.0 THEN 1.0 ELSE 0.0 END) as high_quality_rate
-                FROM dialogue_turns 
-                WHERE dialogue_score > 0
-                GROUP BY speaker
-                ORDER BY avg_score DESC
-            """)
-            
-            character_performance = cursor.fetchall()
-            if character_performance:
-                print("  角色表现排名:")
-                for i, (speaker, count, avg, min_s, max_s, hq_rate) in enumerate(character_performance, 1):
-                    status = "✓" if avg >= 7.5 else "⚠" if avg >= 6.5 else "✗"
-                    print(f"    {i}. {speaker} {status}")
-                    print(f"       平均{avg:.2f}分 (范围{min_s:.1f}-{max_s:.1f}, {hq_rate*100:.1f}%高质量, {count}轮)")
-            
-            print("\n4. 问题模式识别:")
-            
-            # 识别低分对话的常见问题
-            cursor = conn.execute("""
-                SELECT content, dialogue_score, score_feedback
-                FROM dialogue_turns 
-                WHERE dialogue_score > 0 AND dialogue_score < 6.0
-                ORDER BY dialogue_score ASC
-                LIMIT 5
-            """)
-            
-            low_score_examples = cursor.fetchall()
-            if low_score_examples:
-                print("  低分对话示例:")
-                for i, (content, score, feedback) in enumerate(low_score_examples, 1):
-                    print(f"    {i}. 分数{score:.1f}: {content[:50]}...")
-                    if feedback:
-                        print(f"       问题: {feedback[:80]}...")
-            else:
-                print("  暂无低分对话样本")
-            
-            print("\n5. 优化建议:")
-            
-            # 生成优化建议
-            suggestions = []
-            
-            if weak_dimensions:
-                if 'character_consistency' in weak_dimensions:
-                    suggestions.append("• 加强角色设定训练，增加角色特征描述的权重")
-                if 'creativity' in weak_dimensions:
-                    suggestions.append("• 增加创意性训练数据，提高对话的趣味性")
-                if 'coherence' in weak_dimensions:
-                    suggestions.append("• 优化上下文理解，加强对话逻辑连贯性")
-                if 'naturalness' in weak_dimensions:
-                    suggestions.append("• 增加自然语言训练，改善表达流畅度")
-                if 'information_density' in weak_dimensions:
-                    suggestions.append("• 优化信息组织，避免冗余表达")
-            
-            # 检查是否需要数据收集
-            cursor = conn.execute("SELECT COUNT(*) FROM dialogue_turns WHERE dialogue_score > 0")
-            total_scored = cursor.fetchone()[0]
-            
-            if total_scored < 50:
-                suggestions.append("• 需要收集更多评分数据以进行准确分析")
-            
-            if total_scored >= 100:
-                suggestions.append("• 数据量充足，建议开始模型迭代优化")
-            
-            if suggestions:
-                for suggestion in suggestions:
-                    print(f"    {suggestion}")
-            else:
-                print("    当前性能表现良好，继续保持！")
-                
-    except Exception as e:
-        print(f"✗ 性能分析失败: {e}")
-        import traceback
-        traceback.print_exc()

 def generate_training_dataset():
    """生成训练数据集"""
@@ -1271,18 +1102,19 @@ def main():
        print("主菜单 - 请选择操作:")
        print("1. 处理PDF世界观文档 (转换为RAG格式)")
        print("2. 查看角色设定信息")
-        print("3. 启动双AI对话系统 (开启ai打分)")
-        print("4. 启动双AI对话系统 (关闭ai打分)")
-        print("5. 系统状态检查")
-        print("6. 查看对话评分统计")
-        print("7. 模型性能分析与优化")
-        print("8. 生成训练数据集")
-        print("9. 模型迭代优化")
-        print("10. 查看使用说明")
+        print("3. 启动双AI对话系统 (开启AI打分)")
+        print("4. 启动双AI对话系统 (关闭AI打分)")
+        print("5. 启动双AI对话系统 (开启人工打分)")
+        print("6. 系统状态检查")
+        print("7. 查看对话评分统计")
+        print("8. 模型性能分析与优化")
+        print("9. 生成训练数据集")
+        print("10. 模型迭代优化")
+        print("11. 查看使用说明")
        print("0. 退出")
        print("="*50)
        
-        choice = input("请输入选择 (0-10): ").strip()
+        choice = input("请输入选择 (0-11): ").strip()
        
        if choice == '0':
            print("\n感谢使用双AI角色对话系统！")
@@ -1301,21 +1133,25 @@ def main():
            run_dialogue_system(enableScore = False)
            
        elif choice == '5':
-            show_system_status()
+            run_dialogue_system(enableScore = True, useManualScoring = True)
            
        elif choice == '6':
-            show_scoring_statistics()
+            show_system_status()
            
        elif choice == '7':
-            analyze_model_performance()
+            show_scoring_statistics()
            
        elif choice == '8':
-            generate_training_dataset()
+            # 模型性能分析与优化 - 待实现
+            print("模型性能分析与优化功能开发中...")
            
        elif choice == '9':
-            run_model_optimization()
+            generate_training_dataset()
            
        elif choice == '10':
+            run_model_optimization()
+            
+        elif choice == '11':
            show_usage_guide()
            
        else: