497 lines
17 KiB
Python
497 lines
17 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
'''
|
|||
|
|
对话质量评分系统
|
|||
|
|
使用AI模型对生成的对话进行多维度质量评分
|
|||
|
|
'''
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
from typing import Dict, List, Tuple, Optional
|
|||
|
|
from dataclasses import dataclass
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
# 尝试导入numpy,如果失败则跳过
|
|||
|
|
try:
|
|||
|
|
import numpy as np
|
|||
|
|
NUMPY_AVAILABLE = True
|
|||
|
|
except ImportError:
|
|||
|
|
NUMPY_AVAILABLE = False
|
|||
|
|
|
|||
|
|
@dataclass
|
|||
|
|
class ScoreResult:
|
|||
|
|
"""单次打分结果"""
|
|||
|
|
dialogue_id: str
|
|||
|
|
session_id: str
|
|||
|
|
speaker: str
|
|||
|
|
content: str
|
|||
|
|
timestamp: str
|
|||
|
|
scores: Dict[str, float] # 各维度分数
|
|||
|
|
overall_score: float # 总分
|
|||
|
|
feedback: str # 反馈意见
|
|||
|
|
scorer_type: str # 打分器类型 (ai/human)
|
|||
|
|
|
|||
|
|
class DialogueAIScorer:
|
|||
|
|
"""AI对话质量评分器"""
|
|||
|
|
|
|||
|
|
def __init__(self, base_model_path: str, tokenizer=None, model=None):
|
|||
|
|
"""
|
|||
|
|
初始化AI评分器
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
base_model_path: 基础模型路径
|
|||
|
|
tokenizer: 分词器(可选,复用现有的)
|
|||
|
|
model: 模型(可选,复用现有的)
|
|||
|
|
"""
|
|||
|
|
self.base_model_path = base_model_path
|
|||
|
|
self.tokenizer = tokenizer
|
|||
|
|
self.model = model
|
|||
|
|
|
|||
|
|
# 如果没有传入模型,则加载
|
|||
|
|
if self.tokenizer is None or self.model is None:
|
|||
|
|
self._load_model()
|
|||
|
|
|
|||
|
|
# 评分维度定义
|
|||
|
|
self.score_dimensions = {
|
|||
|
|
"coherence": {
|
|||
|
|
"name": "连贯性",
|
|||
|
|
"description": "对话是否与上下文逻辑连贯",
|
|||
|
|
"weight": 0.25
|
|||
|
|
},
|
|||
|
|
"character_consistency": {
|
|||
|
|
"name": "角色一致性",
|
|||
|
|
"description": "是否符合角色设定和人格特征",
|
|||
|
|
"weight": 0.25
|
|||
|
|
},
|
|||
|
|
"naturalness": {
|
|||
|
|
"name": "自然度",
|
|||
|
|
"description": "语言表达是否自然流畅",
|
|||
|
|
"weight": 0.20
|
|||
|
|
},
|
|||
|
|
"information_density": {
|
|||
|
|
"name": "信息密度",
|
|||
|
|
"description": "是否包含有意义的信息,避免废话",
|
|||
|
|
"weight": 0.15
|
|||
|
|
},
|
|||
|
|
"creativity": {
|
|||
|
|
"name": "创意性",
|
|||
|
|
"description": "内容是否有趣、有创意",
|
|||
|
|
"weight": 0.15
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
def _load_model(self):
|
|||
|
|
"""加载模型和分词器"""
|
|||
|
|
try:
|
|||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|||
|
|
import torch
|
|||
|
|
|
|||
|
|
print(f"Loading scorer tokenizer from: {self.base_model_path}")
|
|||
|
|
self.tokenizer = AutoTokenizer.from_pretrained(
|
|||
|
|
self.base_model_path,
|
|||
|
|
use_fast=False,
|
|||
|
|
trust_remote_code=True
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
if self.tokenizer.pad_token is None:
|
|||
|
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|||
|
|
|
|||
|
|
print(f"Loading scorer model from: {self.base_model_path}")
|
|||
|
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|||
|
|
self.base_model_path,
|
|||
|
|
device_map="auto",
|
|||
|
|
torch_dtype=torch.bfloat16,
|
|||
|
|
trust_remote_code=True
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"✗ AI评分器模型加载失败: {e}")
|
|||
|
|
raise
|
|||
|
|
|
|||
|
|
def score_dialogue(self,
|
|||
|
|
dialogue_content: str,
|
|||
|
|
speaker: str,
|
|||
|
|
character_data: Dict,
|
|||
|
|
dialogue_history: List[Dict] = None,
|
|||
|
|
context_info: List[Dict] = None) -> ScoreResult:
|
|||
|
|
"""
|
|||
|
|
对单条对话进行AI评分
|
|||
|
|
|
|||
|
|
Args:
|
|||
|
|
dialogue_content: 对话内容
|
|||
|
|
speaker: 说话者
|
|||
|
|
character_data: 角色数据
|
|||
|
|
dialogue_history: 对话历史
|
|||
|
|
context_info: 上下文信息
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
ScoreResult: 评分结果
|
|||
|
|
"""
|
|||
|
|
# 构建评分提示
|
|||
|
|
scoring_prompt = self._build_scoring_prompt(
|
|||
|
|
dialogue_content, speaker, character_data, dialogue_history, context_info
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 使用AI模型生成评分
|
|||
|
|
try:
|
|||
|
|
scores, feedback = self._generate_ai_scores(scoring_prompt)
|
|||
|
|
|
|||
|
|
# 计算总分
|
|||
|
|
overall_score = self._calculate_overall_score(scores)
|
|||
|
|
|
|||
|
|
# 创建评分结果
|
|||
|
|
result = ScoreResult(
|
|||
|
|
dialogue_id=f"{speaker}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
|||
|
|
session_id="", # 由调用方设置
|
|||
|
|
speaker=speaker,
|
|||
|
|
content=dialogue_content,
|
|||
|
|
timestamp=datetime.now().isoformat(),
|
|||
|
|
scores=scores,
|
|||
|
|
overall_score=overall_score,
|
|||
|
|
feedback=feedback,
|
|||
|
|
scorer_type="ai"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
return result
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"✗ AI评分失败: {e}")
|
|||
|
|
# 返回默认评分
|
|||
|
|
return self._create_default_score(dialogue_content, speaker)
|
|||
|
|
|
|||
|
|
def _build_scoring_prompt(self,
|
|||
|
|
dialogue_content: str,
|
|||
|
|
speaker: str,
|
|||
|
|
character_data: Dict,
|
|||
|
|
dialogue_history: List[Dict] = None,
|
|||
|
|
context_info: List[Dict] = None) -> str:
|
|||
|
|
"""构建评分提示"""
|
|||
|
|
|
|||
|
|
# 基础角色信息
|
|||
|
|
character_info = ""
|
|||
|
|
if character_data:
|
|||
|
|
personality = character_data.get('personality', {})
|
|||
|
|
traits = personality.get('core_traits', [])
|
|||
|
|
occupation = character_data.get('basic_info', {}).get('occupation', '未知')
|
|||
|
|
character_info = f"角色职业: {occupation}, 性格特点: {', '.join(traits[:3])}"
|
|||
|
|
|
|||
|
|
# 对话历史
|
|||
|
|
history_text = ""
|
|||
|
|
if dialogue_history:
|
|||
|
|
history_text = "对话历史:\n"
|
|||
|
|
for turn in dialogue_history[-3:]: # 只取最近3轮
|
|||
|
|
history_text += f"{turn.get('speaker', '未知')}: {turn.get('content', '')}\n"
|
|||
|
|
|
|||
|
|
# 构建完整提示
|
|||
|
|
prompt = f"""请对以下对话内容进行质量评分。
|
|||
|
|
|
|||
|
|
角色设定:
|
|||
|
|
{character_info}
|
|||
|
|
|
|||
|
|
{history_text}
|
|||
|
|
|
|||
|
|
当前对话:
|
|||
|
|
{speaker}: {dialogue_content}
|
|||
|
|
|
|||
|
|
请从以下5个维度评分(1-10分):
|
|||
|
|
1. 连贯性 - 对话是否与上下文逻辑连贯
|
|||
|
|
2. 角色一致性 - 是否符合角色设定和人格特征
|
|||
|
|
3. 自然度 - 语言表达是否自然流畅
|
|||
|
|
4. 信息密度 - 是否包含有意义的信息,避免废话
|
|||
|
|
5. 创意性 - 内容是否有趣、有创意
|
|||
|
|
|
|||
|
|
请按以下格式输出:
|
|||
|
|
连贯性: X分
|
|||
|
|
角色一致性: X分
|
|||
|
|
自然度: X分
|
|||
|
|
信息密度: X分
|
|||
|
|
创意性: X分
|
|||
|
|
|
|||
|
|
总体评价: [具体的改进建议和优点分析]"""
|
|||
|
|
|
|||
|
|
return prompt
|
|||
|
|
|
|||
|
|
def _generate_ai_scores(self, prompt: str) -> Tuple[Dict[str, float], str]:
|
|||
|
|
"""使用AI模型生成评分"""
|
|||
|
|
import torch
|
|||
|
|
|
|||
|
|
# 准备消息
|
|||
|
|
messages = [
|
|||
|
|
{"role": "system", "content": "你是一个专业的对话质量评估专家,请客观公正地评分。"},
|
|||
|
|
{"role": "user", "content": prompt}
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
# 应用对话模板
|
|||
|
|
inputs = self.tokenizer.apply_chat_template(
|
|||
|
|
messages,
|
|||
|
|
add_generation_prompt=True,
|
|||
|
|
tokenize=True,
|
|||
|
|
return_tensors="pt",
|
|||
|
|
return_dict=True,
|
|||
|
|
enable_thinking=False
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 移动到设备
|
|||
|
|
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
|
|||
|
|
|
|||
|
|
# 生成评分
|
|||
|
|
with torch.no_grad():
|
|||
|
|
outputs = self.model.generate(
|
|||
|
|
**inputs,
|
|||
|
|
max_new_tokens=300,
|
|||
|
|
do_sample=True,
|
|||
|
|
temperature=0.3, # 较低温度确保评分稳定
|
|||
|
|
top_p=0.8,
|
|||
|
|
pad_token_id=self.tokenizer.eos_token_id,
|
|||
|
|
repetition_penalty=1.1
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 解码输出
|
|||
|
|
response = outputs[0][inputs['input_ids'].shape[1]:]
|
|||
|
|
result_text = self.tokenizer.decode(response, skip_special_tokens=True).strip()
|
|||
|
|
|
|||
|
|
# 解析评分结果
|
|||
|
|
scores, feedback = self._parse_score_response(result_text)
|
|||
|
|
|
|||
|
|
return scores, feedback
|
|||
|
|
|
|||
|
|
def _parse_score_response(self, response: str) -> Tuple[Dict[str, float], str]:
|
|||
|
|
"""解析AI评分响应"""
|
|||
|
|
scores = {}
|
|||
|
|
feedback = ""
|
|||
|
|
|
|||
|
|
# 定义维度映射
|
|||
|
|
dimension_map = {
|
|||
|
|
"连贯性": "coherence",
|
|||
|
|
"角色一致性": "character_consistency",
|
|||
|
|
"自然度": "naturalness",
|
|||
|
|
"信息密度": "information_density",
|
|||
|
|
"创意性": "creativity"
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
lines = response.split('\n')
|
|||
|
|
feedback_start = False
|
|||
|
|
|
|||
|
|
for line in lines:
|
|||
|
|
line = line.strip()
|
|||
|
|
|
|||
|
|
# 查找评分
|
|||
|
|
for chinese_name, english_key in dimension_map.items():
|
|||
|
|
if chinese_name in line and ':' in line:
|
|||
|
|
# 提取分数
|
|||
|
|
score_match = re.search(r'(\d+(?:\.\d+)?)', line)
|
|||
|
|
if score_match:
|
|||
|
|
score = float(score_match.group(1))
|
|||
|
|
# 确保分数在1-10范围内
|
|||
|
|
score = max(1.0, min(10.0, score))
|
|||
|
|
scores[english_key] = score
|
|||
|
|
|
|||
|
|
# 查找总体评价
|
|||
|
|
if '总体评价' in line or '评价' in line:
|
|||
|
|
feedback_start = True
|
|||
|
|
feedback_content = line.split(':', 1)
|
|||
|
|
if len(feedback_content) > 1:
|
|||
|
|
feedback += feedback_content[1].strip()
|
|||
|
|
elif feedback_start and line:
|
|||
|
|
feedback += " " + line
|
|||
|
|
|
|||
|
|
# 确保所有维度都有分数
|
|||
|
|
for english_key in dimension_map.values():
|
|||
|
|
if english_key not in scores:
|
|||
|
|
scores[english_key] = 5.0 # 默认中等分数
|
|||
|
|
|
|||
|
|
if not feedback:
|
|||
|
|
feedback = "AI评分完成,建议根据各维度分数进行改进。"
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"解析评分响应失败: {e}")
|
|||
|
|
# 使用默认分数
|
|||
|
|
for english_key in dimension_map.values():
|
|||
|
|
scores[english_key] = 5.0
|
|||
|
|
feedback = "评分解析失败,使用默认分数。"
|
|||
|
|
|
|||
|
|
return scores, feedback
|
|||
|
|
|
|||
|
|
def _calculate_overall_score(self, scores: Dict[str, float]) -> float:
|
|||
|
|
"""计算总分"""
|
|||
|
|
total_score = 0.0
|
|||
|
|
total_weight = 0.0
|
|||
|
|
|
|||
|
|
for dimension, score in scores.items():
|
|||
|
|
if dimension in self.score_dimensions:
|
|||
|
|
weight = self.score_dimensions[dimension]["weight"]
|
|||
|
|
total_score += score * weight
|
|||
|
|
total_weight += weight
|
|||
|
|
|
|||
|
|
if total_weight > 0:
|
|||
|
|
return round(total_score / total_weight, 2)
|
|||
|
|
else:
|
|||
|
|
return 5.0
|
|||
|
|
|
|||
|
|
def _create_default_score(self, dialogue_content: str, speaker: str) -> ScoreResult:
|
|||
|
|
"""创建默认评分结果"""
|
|||
|
|
default_scores = {
|
|||
|
|
"coherence": 5.0,
|
|||
|
|
"character_consistency": 5.0,
|
|||
|
|
"naturalness": 5.0,
|
|||
|
|
"information_density": 5.0,
|
|||
|
|
"creativity": 5.0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return ScoreResult(
|
|||
|
|
dialogue_id=f"{speaker}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
|||
|
|
session_id="",
|
|||
|
|
speaker=speaker,
|
|||
|
|
content=dialogue_content,
|
|||
|
|
timestamp=datetime.now().isoformat(),
|
|||
|
|
scores=default_scores,
|
|||
|
|
overall_score=5.0,
|
|||
|
|
feedback="使用默认评分",
|
|||
|
|
scorer_type="ai"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
def batch_score_dialogue(self, dialogue_list: List[Dict]) -> List[ScoreResult]:
|
|||
|
|
"""批量评分对话"""
|
|||
|
|
results = []
|
|||
|
|
|
|||
|
|
for i, dialogue_item in enumerate(dialogue_list):
|
|||
|
|
print(f"正在评分 {i+1}/{len(dialogue_list)}: {dialogue_item.get('speaker', '未知')}")
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
result = self.score_dialogue(
|
|||
|
|
dialogue_content=dialogue_item.get('content', ''),
|
|||
|
|
speaker=dialogue_item.get('speaker', '未知'),
|
|||
|
|
character_data=dialogue_item.get('character_data', {}),
|
|||
|
|
dialogue_history=dialogue_item.get('dialogue_history', []),
|
|||
|
|
context_info=dialogue_item.get('context_info', [])
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 设置session_id
|
|||
|
|
result.session_id = dialogue_item.get('session_id', '')
|
|||
|
|
results.append(result)
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"评分失败: {e}")
|
|||
|
|
# 添加默认评分
|
|||
|
|
default_result = self._create_default_score(
|
|||
|
|
dialogue_item.get('content', ''),
|
|||
|
|
dialogue_item.get('speaker', '未知')
|
|||
|
|
)
|
|||
|
|
default_result.session_id = dialogue_item.get('session_id', '')
|
|||
|
|
results.append(default_result)
|
|||
|
|
|
|||
|
|
return results
|
|||
|
|
|
|||
|
|
class HumanScorer:
|
|||
|
|
"""人工评分器"""
|
|||
|
|
|
|||
|
|
def __init__(self):
|
|||
|
|
self.score_dimensions = {
|
|||
|
|
"coherence": "连贯性",
|
|||
|
|
"character_consistency": "角色一致性",
|
|||
|
|
"naturalness": "自然度",
|
|||
|
|
"information_density": "信息密度",
|
|||
|
|
"creativity": "创意性"
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
def score_dialogue_interactive(self,
|
|||
|
|
dialogue_content: str,
|
|||
|
|
speaker: str,
|
|||
|
|
session_id: str = "") -> ScoreResult:
|
|||
|
|
"""交互式人工评分"""
|
|||
|
|
|
|||
|
|
print(f"\n=== 人工评分 ===")
|
|||
|
|
print(f"角色: {speaker}")
|
|||
|
|
print(f"对话: {dialogue_content}")
|
|||
|
|
print(f"请对以下维度评分 (1-10分):")
|
|||
|
|
|
|||
|
|
scores = {}
|
|||
|
|
|
|||
|
|
for dimension_key, dimension_name in self.score_dimensions.items():
|
|||
|
|
while True:
|
|||
|
|
try:
|
|||
|
|
score_input = input(f"{dimension_name} (1-10): ").strip()
|
|||
|
|
score = float(score_input)
|
|||
|
|
if 1 <= score <= 10:
|
|||
|
|
scores[dimension_key] = score
|
|||
|
|
break
|
|||
|
|
else:
|
|||
|
|
print("请输入1-10之间的分数")
|
|||
|
|
except ValueError:
|
|||
|
|
print("请输入有效的数字")
|
|||
|
|
|
|||
|
|
# 获取反馈
|
|||
|
|
feedback = input("请输入评价和建议 (可选): ").strip()
|
|||
|
|
if not feedback:
|
|||
|
|
feedback = "人工评分完成"
|
|||
|
|
|
|||
|
|
# 计算总分
|
|||
|
|
overall_score = sum(scores.values()) / len(scores)
|
|||
|
|
|
|||
|
|
return ScoreResult(
|
|||
|
|
dialogue_id=f"{speaker}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
|||
|
|
session_id=session_id,
|
|||
|
|
speaker=speaker,
|
|||
|
|
content=dialogue_content,
|
|||
|
|
timestamp=datetime.now().isoformat(),
|
|||
|
|
scores=scores,
|
|||
|
|
overall_score=round(overall_score, 2),
|
|||
|
|
feedback=feedback,
|
|||
|
|
scorer_type="human"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
class QuickScorer:
|
|||
|
|
"""快速规则评分器(用于实时反馈)"""
|
|||
|
|
|
|||
|
|
def __init__(self):
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
def quick_score(self,
|
|||
|
|
dialogue_content: str,
|
|||
|
|
speaker: str,
|
|||
|
|
dialogue_history: List[Dict] = None) -> float:
|
|||
|
|
"""快速评分(基于规则)"""
|
|||
|
|
|
|||
|
|
score = 5.0 # 基础分
|
|||
|
|
|
|||
|
|
# 长度检查
|
|||
|
|
content_length = len(dialogue_content.strip())
|
|||
|
|
if content_length < 10:
|
|||
|
|
score -= 2.0 # 太短
|
|||
|
|
elif content_length > 200:
|
|||
|
|
score -= 1.0 # 太长
|
|||
|
|
elif 30 <= content_length <= 100:
|
|||
|
|
score += 0.5 # 长度适中
|
|||
|
|
|
|||
|
|
# 重复检查
|
|||
|
|
if dialogue_history:
|
|||
|
|
recent_content = [turn.get('content', '') for turn in dialogue_history[-3:]]
|
|||
|
|
for prev_content in recent_content:
|
|||
|
|
if dialogue_content == prev_content:
|
|||
|
|
score -= 3.0 # 重复内容
|
|||
|
|
elif self._calculate_similarity(dialogue_content, prev_content) > 0.8:
|
|||
|
|
score -= 1.5 # 高度相似
|
|||
|
|
|
|||
|
|
# 内容质量检查
|
|||
|
|
if any(word in dialogue_content for word in ['...', '呃', '额', '嗯嗯']):
|
|||
|
|
score -= 0.5 # 含有填充词
|
|||
|
|
|
|||
|
|
if re.search(r'[。!?]', dialogue_content):
|
|||
|
|
score += 0.3 # 有标点符号
|
|||
|
|
|
|||
|
|
# 确保分数在合理范围内
|
|||
|
|
return max(1.0, min(10.0, score))
|
|||
|
|
|
|||
|
|
def _calculate_similarity(self, text1: str, text2: str) -> float:
|
|||
|
|
"""计算文本相似度(简单方法)"""
|
|||
|
|
words1 = set(text1)
|
|||
|
|
words2 = set(text2)
|
|||
|
|
|
|||
|
|
if not words1 and not words2:
|
|||
|
|
return 1.0
|
|||
|
|
|
|||
|
|
intersection = len(words1.intersection(words2))
|
|||
|
|
union = len(words1.union(words2))
|
|||
|
|
|
|||
|
|
return intersection / union if union > 0 else 0.0
|