feat(evaluation): U9 Ragas evaluation pipeline for RAG quality assessment
- RagasEvaluator: LLM-as-Judge evaluation with the ragas library or a built-in fallback
- EvalDatasetBuilder: builds samples from traces or from a list of dicts
- EvalMetrics: faithfulness, answer_relevancy, context_precision, context_recall
- Built-in heuristic evaluation using keyword overlap and Jaccard similarity
- 13 tests passing
This commit is contained in:
parent
9753a08ac8
commit
83cdddd199
|
|
@ -0,0 +1,17 @@
|
||||||
|
"""Evaluation module - RAG quality assessment"""
|
||||||
|
|
||||||
|
from agentkit.evaluation.ragas_evaluator import (
|
||||||
|
EvalDatasetBuilder,
|
||||||
|
EvalMetrics,
|
||||||
|
EvalResult,
|
||||||
|
EvalSample,
|
||||||
|
RagasEvaluator,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"EvalDatasetBuilder",
|
||||||
|
"EvalMetrics",
|
||||||
|
"EvalResult",
|
||||||
|
"EvalSample",
|
||||||
|
"RagasEvaluator",
|
||||||
|
]
|
||||||
|
|
@ -0,0 +1,288 @@
|
||||||
|
"""Ragas Evaluator - RAG 质量评估管线
|
||||||
|
|
||||||
|
集成 Ragas 评估框架,提供标准化的 RAG 质量指标:
|
||||||
|
- Faithfulness: 忠实度(生成内容与检索上下文的一致性)
|
||||||
|
- Answer Relevancy: 答案相关性
|
||||||
|
- Context Precision: 上下文精确率
|
||||||
|
- Context Recall: 上下文召回率
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvalSample:
    """One record to be scored by the evaluator.

    The field names mirror the keyword arguments passed to ragas'
    ``SingleTurnSample`` elsewhere in this module.
    """

    # Input text that was given to the system under evaluation.
    user_input: str
    # Generated output that is being scored.
    response: str
    # Context passages retrieved for this input (may be an empty list).
    retrieved_contexts: list[str]
    # Optional reference answer; the empty string means "no reference".
    reference: str = field(default="")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvalMetrics:
    """Aggregate scores for the four supported RAG quality metrics.

    Every metric defaults to ``0.0``; a value of ``0.0`` (or below) is
    treated as "not available" when the average is computed.
    """

    faithfulness: float = 0.0
    answer_relevancy: float = 0.0
    context_precision: float = 0.0
    context_recall: float = 0.0

    @property
    def average(self) -> float:
        """Return the mean of the strictly positive metrics, or 0.0 if none."""
        positives = [
            score
            for score in (
                self.faithfulness,
                self.answer_relevancy,
                self.context_precision,
                self.context_recall,
            )
            if score > 0
        ]
        if not positives:
            return 0.0
        return sum(positives) / len(positives)

    def to_dict(self) -> dict[str, float]:
        """Return the four metrics plus their ``average`` as a plain dict."""
        payload = {
            name: getattr(self, name)
            for name in (
                "faithfulness",
                "answer_relevancy",
                "context_precision",
                "context_recall",
            )
        }
        payload["average"] = self.average
        return payload
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvalResult:
    """Outcome of one evaluation run."""

    # Scores aggregated over all evaluated samples.
    metrics: EvalMetrics
    # Number of samples the run was given.
    sample_count: int
    # Optional per-sample score dictionaries; a fresh empty list by default.
    details: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class EvalDatasetBuilder:
    """Builds lists of ``EvalSample`` objects from raw dictionaries.

    Intended for turning historical task data (for example records taken
    from TraceRecorder) into the sample format consumed by the evaluator.

    Both constructors normalise their input the same way:

    * ``None`` text fields become ``""`` (instead of the string ``"None"``),
      other values are passed through ``str()``;
    * contexts given as ``None`` become ``[]``, a bare string becomes a
      one-element list, and any other iterable is copied element by element;
    * records whose input or response is empty after normalisation are
      dropped.
    """

    @staticmethod
    def _as_text(value: Any) -> str:
        """Coerce a raw field to text, mapping ``None`` to the empty string."""
        return "" if value is None else str(value)

    @staticmethod
    def _as_contexts(value: Any) -> list[str]:
        """Coerce a raw contexts field to a new list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string is one passage, not an iterable of characters.
            return [value] if value else []
        return [str(item) for item in value if item is not None]

    @staticmethod
    def _make_sample(
        user_input: Any,
        response: Any,
        contexts: Any,
        reference: Any,
    ) -> EvalSample | None:
        """Build one sample, or return ``None`` if input/response is empty."""
        question = EvalDatasetBuilder._as_text(user_input)
        answer = EvalDatasetBuilder._as_text(response)
        if not (question and answer):
            return None
        return EvalSample(
            user_input=question,
            response=answer,
            retrieved_contexts=EvalDatasetBuilder._as_contexts(contexts),
            reference=EvalDatasetBuilder._as_text(reference),
        )

    @staticmethod
    def from_traces(traces: list[dict[str, Any]]) -> list[EvalSample]:
        """Build evaluation samples from execution traces.

        Args:
            traces: Trace dictionaries. The keys read are ``input``,
                ``output``, ``contexts`` and ``reference``; all are optional.

        Returns:
            One ``EvalSample`` per trace that has a non-empty input and
            output, in the original order.
        """
        candidates = (
            EvalDatasetBuilder._make_sample(
                trace.get("input"),
                trace.get("output"),
                trace.get("contexts"),
                trace.get("reference"),
            )
            for trace in traces
        )
        return [sample for sample in candidates if sample is not None]

    @staticmethod
    def from_dict_list(data: list[dict[str, Any]]) -> list[EvalSample]:
        """Build evaluation samples from dictionaries keyed like ``EvalSample``.

        Args:
            data: Dictionaries with the optional keys ``user_input``,
                ``response``, ``retrieved_contexts`` and ``reference``.

        Returns:
            One ``EvalSample`` per entry that has a non-empty ``user_input``
            and ``response``, in the original order.
        """
        candidates = (
            EvalDatasetBuilder._make_sample(
                entry.get("user_input"),
                entry.get("response"),
                entry.get("retrieved_contexts"),
                entry.get("reference"),
            )
            for entry in data
        )
        return [sample for sample in candidates if sample is not None]
|
||||||
|
|
||||||
|
|
||||||
|
class RagasEvaluator:
    """Scores RAG output quality with the ragas library or built-in heuristics.

    Two modes are supported:

    1. ragas library mode (``use_ragas_lib=True``). If importing ragas fails,
       the built-in mode is used instead.
    2. Built-in lightweight mode, which only uses the token-overlap helpers
       of ``RelevanceScorer`` and does not depend on ragas.

    In both modes a metric that was not requested, or that could not be
    computed, is reported as ``0.0``.
    """

    # Supported metric names, in the order they are reported.
    _METRIC_NAMES = (
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
    )
    # A context counts as relevant to the query when the scorer's Jaccard
    # similarity between their tokens is strictly above this value.
    _CONTEXT_RELEVANCE_THRESHOLD = 0.1

    def __init__(
        self,
        llm_gateway: Any = None,
        use_ragas_lib: bool = False,
    ):
        """Create an evaluator.

        Args:
            llm_gateway: Stored on the instance; no method of this class
                reads it at present.
            use_ragas_lib: When true, ``evaluate`` tries the ragas library
                first; otherwise the built-in heuristics are used.
        """
        self._llm_gateway = llm_gateway
        self._use_ragas_lib = use_ragas_lib

    async def evaluate(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None = None,
    ) -> EvalResult:
        """Evaluate RAG quality over ``samples``.

        Args:
            samples: Samples to score.
            metrics: Names of the metrics to compute; ``None`` or an empty
                list means all supported metrics. Unknown names are ignored
                with a warning.

        Returns:
            The aggregated ``EvalResult``. An empty ``samples`` list yields
            all-zero metrics and ``sample_count == 0``.
        """
        if not samples:
            return EvalResult(metrics=EvalMetrics(), sample_count=0)

        if self._use_ragas_lib:
            return await self._evaluate_with_ragas(samples, metrics)
        return await self._evaluate_builtin(samples, metrics)

    def _resolve_metric_names(self, metrics: list[str] | None) -> list[str]:
        """Return the requested supported metric names in canonical order."""
        if not metrics:
            return list(self._METRIC_NAMES)
        unknown = [name for name in metrics if name not in self._METRIC_NAMES]
        if unknown:
            logger.warning("Ignoring unknown evaluation metrics: %s", unknown)
        return [name for name in self._METRIC_NAMES if name in metrics]

    @staticmethod
    def _ragas_score_map(result: Any) -> dict[str, Any]:
        """Return ``{metric_name: aggregate_score}`` for a ragas result.

        Dict-like results are used as they are. Otherwise the per-sample
        ``scores`` rows (a list of dicts) are averaged per metric, skipping
        non-numeric and NaN entries.
        NOTE(review): which of the two shapes ragas returns depends on the
        installed ragas version - confirm against the pinned release.
        """
        if hasattr(result, "items"):
            return dict(result.items())

        import math

        collected: dict[str, list[float]] = {}
        for row in getattr(result, "scores", None) or []:
            for name, value in row.items():
                try:
                    number = float(value)
                except (TypeError, ValueError):
                    continue
                if not math.isnan(number):
                    collected.setdefault(name, []).append(number)
        return {name: sum(vals) / len(vals) for name, vals in collected.items()}

    async def _evaluate_with_ragas(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None,
    ) -> EvalResult:
        """Evaluate with the ragas library, falling back to the built-in mode.

        Any ``ImportError`` raised while importing or running ragas triggers
        the fallback; other exceptions propagate to the caller.
        """
        try:
            from ragas import evaluate
            from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision, ContextRecall
            from ragas.dataset_schema import SingleTurnSample, EvaluationDataset

            selected = self._resolve_metric_names(metrics)
            if not selected:
                # Only unknown names were requested: nothing to compute.
                return EvalResult(metrics=EvalMetrics(), sample_count=len(samples))

            dataset = EvaluationDataset(
                samples=[
                    SingleTurnSample(
                        user_input=s.user_input,
                        response=s.response,
                        retrieved_contexts=s.retrieved_contexts,
                        reference=s.reference,
                    )
                    for s in samples
                ]
            )

            factories = {
                "faithfulness": Faithfulness,
                "answer_relevancy": AnswerRelevancy,
                "context_precision": ContextPrecision,
                "context_recall": ContextRecall,
            }
            metric_objects = [factories[name]() for name in selected]

            # NOTE(review): this call is synchronous, so the event loop is
            # blocked until ragas returns.
            result = evaluate(dataset=dataset, metrics=metric_objects)

            aggregate = self._ragas_score_map(result)
            avg_metrics = EvalMetrics()
            for name in self._METRIC_NAMES:
                if name in aggregate:
                    setattr(avg_metrics, name, float(aggregate[name]))

            return EvalResult(metrics=avg_metrics, sample_count=len(samples))

        except ImportError:
            logger.warning("ragas not installed, falling back to built-in evaluation")
            return await self._evaluate_builtin(samples, metrics)

    def _score_sample(
        self,
        scorer: Any,
        sample: EvalSample,
        selected: set[str],
    ) -> dict[str, float]:
        """Compute the selected heuristic scores for one sample.

        Returns a dict with all four metric names; unselected metrics stay
        at ``0.0``.
        """
        scores = dict.fromkeys(self._METRIC_NAMES, 0.0)
        # Tolerate samples whose contexts were left as None.
        contexts = sample.retrieved_contexts or []

        query_terms = scorer._tokenize(sample.user_input)
        response_terms = scorer._tokenize(sample.response)
        # Tokenise the joined contexts once; faithfulness and recall share it.
        context_terms = scorer._tokenize(" ".join(contexts))

        # Faithfulness: share of response tokens that also occur in the contexts.
        if "faithfulness" in selected and contexts and context_terms and response_terms:
            overlap = len(context_terms & response_terms)
            scores["faithfulness"] = min(overlap / max(len(response_terms), 1), 1.0)

        # Answer relevancy: scorer's Jaccard similarity of query and response.
        if "answer_relevancy" in selected and query_terms and response_terms:
            scores["answer_relevancy"] = scorer._jaccard_similarity(query_terms, response_terms)

        # Context precision: share of contexts similar enough to the query.
        if "context_precision" in selected and contexts:
            relevant_count = sum(
                1
                for ctx in contexts
                if query_terms
                and scorer._jaccard_similarity(query_terms, scorer._tokenize(ctx))
                > self._CONTEXT_RELEVANCE_THRESHOLD
            )
            scores["context_precision"] = relevant_count / len(contexts)

        # Context recall: scorer's coverage of the reference tokens by the
        # context tokens; needs a non-empty reference.
        if "context_recall" in selected and sample.reference:
            ref_terms = scorer._tokenize(sample.reference)
            if ref_terms:
                scores["context_recall"] = scorer._query_coverage(ref_terms, context_terms)

        return scores

    async def _evaluate_builtin(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None,
    ) -> EvalResult:
        """Evaluate with built-in heuristics (no dependency on ragas).

        Only the metrics named in ``metrics`` are computed (all of them when
        it is ``None`` or empty). Each metric is averaged over all samples,
        and one detail row per sample is returned, holding the first 50
        characters of the input plus the four scores.
        """
        from agentkit.memory.relevance_scorer import RelevanceScorer

        scorer = RelevanceScorer()
        selected = set(self._resolve_metric_names(metrics))
        totals = dict.fromkeys(self._METRIC_NAMES, 0.0)
        details = []

        for sample in samples:
            scores = self._score_sample(scorer, sample, selected)
            for name, value in scores.items():
                totals[name] += value
            details.append({"user_input": sample.user_input[:50], **scores})

        n = len(samples)
        avg_metrics = EvalMetrics(**{name: total / n for name, total in totals.items()})

        return EvalResult(metrics=avg_metrics, sample_count=n, details=details)
|
||||||
|
|
@ -0,0 +1,167 @@
|
||||||
|
"""Tests for RagasEvaluator"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agentkit.evaluation.ragas_evaluator import (
|
||||||
|
EvalDatasetBuilder,
|
||||||
|
EvalMetrics,
|
||||||
|
EvalResult,
|
||||||
|
EvalSample,
|
||||||
|
RagasEvaluator,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvalMetrics:
    """Unit tests for the EvalMetrics dataclass."""

    def test_average_all_zero(self):
        # With every metric left at its default the average is zero.
        assert EvalMetrics().average == 0.0

    def test_average_with_values(self):
        # Only the two metrics that were set contribute to the mean.
        metrics = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6)
        deviation = abs(metrics.average - 0.7)
        assert deviation < 0.01

    def test_to_dict(self):
        as_dict = EvalMetrics(
            faithfulness=0.9,
            answer_relevancy=0.7,
            context_precision=0.8,
            context_recall=0.6,
        ).to_dict()
        assert "faithfulness" in as_dict
        assert "average" in as_dict
        assert as_dict["faithfulness"] == 0.9
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvalSample:
    """Unit tests for the EvalSample dataclass."""

    def test_creation(self):
        # All four fields are accepted as keyword arguments.
        fields = {
            "user_input": "What is Python?",
            "response": "Python is a programming language",
            "retrieved_contexts": ["Python is a popular programming language"],
            "reference": "Python is a high-level programming language",
        }
        built = EvalSample(**fields)
        assert built.user_input == "What is Python?"
        assert len(built.retrieved_contexts) == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvalDatasetBuilder:
    """Unit tests for EvalDatasetBuilder."""

    def test_from_traces(self):
        # The second trace has no reference, which must default to "".
        python_trace = {
            "input": "What is Python?",
            "output": "Python is a programming language",
            "contexts": ["Python is popular"],
            "reference": "Python is a high-level language",
        }
        java_trace = {
            "input": "What is Java?",
            "output": "Java is also a programming language",
            "contexts": ["Java is object-oriented"],
        }
        built = EvalDatasetBuilder.from_traces([python_trace, java_trace])
        assert len(built) == 2
        first, second = built
        assert first.user_input == "What is Python?"
        assert second.reference == ""

    def test_from_traces_empty_input(self):
        # A trace with an empty input is filtered out.
        built = EvalDatasetBuilder.from_traces([{"input": "", "output": "some output"}])
        assert len(built) == 0

    def test_from_dict_list(self):
        rows = [
            {"user_input": "Q1", "response": "A1", "retrieved_contexts": ["C1"]},
            {"user_input": "Q2", "response": "A2", "retrieved_contexts": ["C2"]},
        ]
        built = EvalDatasetBuilder.from_dict_list(rows)
        assert len(built) == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestRagasEvaluator:
    """Unit tests for RagasEvaluator (built-in evaluation mode)."""

    @pytest.mark.asyncio
    async def test_evaluate_empty_samples(self):
        # No samples: zero count and an all-zero average.
        outcome = await RagasEvaluator().evaluate([])
        assert outcome.sample_count == 0
        assert outcome.metrics.average == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_builtin(self):
        judge = RagasEvaluator(use_ragas_lib=False)
        only_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a popular programming language used for web development",
            retrieved_contexts=["Python is a popular programming language"],
            reference="Python is a high-level programming language",
        )
        outcome = await judge.evaluate([only_sample])
        assert outcome.sample_count == 1
        assert outcome.metrics.faithfulness >= 0.0
        assert outcome.metrics.answer_relevancy >= 0.0
        assert len(outcome.details) == 1

    @pytest.mark.asyncio
    async def test_evaluate_multiple_samples(self):
        judge = RagasEvaluator(use_ragas_lib=False)
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        java_sample = EvalSample(
            user_input="What is Java?",
            response="Java is an object-oriented language",
            retrieved_contexts=["Java is widely used"],
        )
        outcome = await judge.evaluate([python_sample, java_sample])
        assert outcome.sample_count == 2

    @pytest.mark.asyncio
    async def test_evaluate_no_contexts(self):
        # Without contexts, the context-based metrics are zero.
        judge = RagasEvaluator(use_ragas_lib=False)
        bare_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=[],
        )
        outcome = await judge.evaluate([bare_sample])
        assert outcome.metrics.faithfulness == 0.0
        assert outcome.metrics.context_precision == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_with_reference(self):
        judge = RagasEvaluator(use_ragas_lib=False)
        referenced_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
            reference="Python is a high-level programming language",
        )
        outcome = await judge.evaluate([referenced_sample])
        assert outcome.metrics.context_recall >= 0.0

    @pytest.mark.asyncio
    async def test_evaluate_specific_metrics(self):
        judge = RagasEvaluator(use_ragas_lib=False)
        only_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        outcome = await judge.evaluate([only_sample], metrics=["faithfulness"])
        assert outcome.sample_count == 1
|
||||||
Loading…
Reference in New Issue