feat(evaluation): U9 Ragas evaluation pipeline for RAG quality assessment

- RagasEvaluator: LLM-as-Judge evaluation with ragas lib or built-in fallback
- EvalDatasetBuilder: from traces or dict list
- EvalMetrics: faithfulness, answer_relevancy, context_precision, context_recall
- Built-in heuristic evaluation using keyword overlap and Jaccard similarity
- 13 tests passing
This commit is contained in:
chiguyong 2026-06-06 22:49:27 +08:00
parent 9753a08ac8
commit 83cdddd199
3 changed files with 472 additions and 0 deletions

View File

@ -0,0 +1,17 @@
"""Evaluation module - RAG quality assessment"""
from agentkit.evaluation.ragas_evaluator import (
EvalDatasetBuilder,
EvalMetrics,
EvalResult,
EvalSample,
RagasEvaluator,
)
# Public API of the evaluation package: exactly the names re-exported from
# agentkit.evaluation.ragas_evaluator by the import above.
__all__ = [
    "EvalDatasetBuilder",
    "EvalMetrics",
    "EvalResult",
    "EvalSample",
    "RagasEvaluator",
]

View File

@ -0,0 +1,288 @@
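"""Ragas Evaluator - RAG quality evaluation pipeline.

Integrates the Ragas evaluation framework to provide standardized RAG quality metrics:
- Faithfulness: consistency between the generated content and the retrieved contexts
- Answer Relevancy: relevance of the answer
- Context Precision: precision of the retrieved contexts
- Context Recall: recall of the retrieved contexts
"""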
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class EvalSample:
    """One evaluation sample: a query, the generated answer and its supporting data."""

    # Query / task text that the response answers.
    user_input: str
    # Generated answer under evaluation.
    response: str
    # Context passages retrieved for this query.
    retrieved_contexts: list[str]
    # Optional reference answer; the empty string means "no reference available".
    reference: str = ""
@dataclass
class EvalMetrics:
    """Container for the four RAG quality scores; every score defaults to 0.0."""

    faithfulness: float = 0.0
    answer_relevancy: float = 0.0
    context_precision: float = 0.0
    context_recall: float = 0.0

    @property
    def average(self) -> float:
        """Mean of the strictly positive scores, or 0.0 when none is positive.

        Scores that are zero (or negative) are left out of the mean, so a
        metric that was never populated does not drag the average down.
        """
        positive_scores = [
            score
            for score in (
                self.faithfulness,
                self.answer_relevancy,
                self.context_precision,
                self.context_recall,
            )
            if score > 0
        ]
        if not positive_scores:
            return 0.0
        return sum(positive_scores) / len(positive_scores)

    def to_dict(self) -> dict[str, float]:
        """Return the four scores plus the derived ``average`` as a plain dict."""
        summary = {
            name: getattr(self, name)
            for name in (
                "faithfulness",
                "answer_relevancy",
                "context_precision",
                "context_recall",
            )
        }
        summary["average"] = self.average
        return summary
@dataclass
class EvalResult:
    """Outcome of one evaluation run."""

    # Metric values reported for the run.
    metrics: EvalMetrics
    # Number of samples the run covered.
    sample_count: int
    # Optional per-sample records; each instance gets its own empty list by default.
    details: list[dict[str, Any]] = field(default_factory=list)
class EvalDatasetBuilder:
    """Build lists of ``EvalSample`` from raw dictionaries.

    Two input layouts are supported: execution traces (``input`` / ``output`` /
    ``contexts`` keys) and dictionaries that already use the ``EvalSample``
    field names.  Both layouts go through the same normalisation, so every
    produced sample holds plain strings and a fresh list of strings.
    """

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as a string, mapping ``None`` to ``""``.

        A plain ``str(None)`` would yield the truthy text ``"None"`` and let
        records with a missing input or output slip past the emptiness filter.
        """
        return "" if value is None else str(value)

    @staticmethod
    def _as_contexts(value: Any) -> list[str]:
        """Normalise a raw contexts value into a new list of strings.

        ``None`` becomes an empty list, a bare string becomes a one-element
        list (instead of being iterated character by character later), any
        other iterable is copied element-wise, and a non-iterable scalar is
        wrapped in a list.  ``None`` elements are dropped.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        try:
            items = list(value)
        except TypeError:
            items = [value]
        return [str(item) for item in items if item is not None]

    @staticmethod
    def _build(
        records: list[dict[str, Any]],
        input_key: str,
        response_key: str,
        contexts_key: str,
    ) -> list[EvalSample]:
        """Convert *records* to samples, reading the given keys from each record.

        Records whose input or response is empty after normalisation are
        skipped.  The reference answer is always read from ``"reference"``.
        """
        samples = []
        for record in records:
            user_input = EvalDatasetBuilder._as_text(record.get(input_key))
            response = EvalDatasetBuilder._as_text(record.get(response_key))
            if not (user_input and response):
                continue
            samples.append(
                EvalSample(
                    user_input=user_input,
                    response=response,
                    retrieved_contexts=EvalDatasetBuilder._as_contexts(record.get(contexts_key)),
                    reference=EvalDatasetBuilder._as_text(record.get("reference")),
                )
            )
        return samples

    @staticmethod
    def from_traces(traces: list[dict[str, Any]]) -> list[EvalSample]:
        """Build evaluation samples from execution traces.

        Args:
            traces: Trace dictionaries; the keys read are ``input``,
                ``output``, ``contexts`` and ``reference``.

        Returns:
            One ``EvalSample`` per trace that has a non-empty input and output.
        """
        return EvalDatasetBuilder._build(traces, "input", "output", "contexts")

    @staticmethod
    def from_dict_list(data: list[dict[str, Any]]) -> list[EvalSample]:
        """Build evaluation samples from dictionaries keyed by ``EvalSample`` field names.

        Args:
            data: Dictionaries; the keys read are ``user_input``, ``response``,
                ``retrieved_contexts`` and ``reference``.

        Returns:
            One ``EvalSample`` per dictionary that has a non-empty
            ``user_input`` and ``response``.
        """
        return EvalDatasetBuilder._build(data, "user_input", "response", "retrieved_contexts")
class RagasEvaluator:
    """Evaluate RAG quality with the ragas library or built-in heuristics.

    Two modes are supported:

    1. ragas mode (``use_ragas_lib=True``): delegates to ``ragas.evaluate``
       and falls back to the built-in mode when ragas cannot be imported.
    2. built-in mode: token-overlap heuristics that do not need ragas.
    """

    # Canonical metric names, in the order they are evaluated and reported.
    _METRIC_NAMES = (
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
    )

    def __init__(
        self,
        llm_gateway: Any = None,
        use_ragas_lib: bool = False,
    ):
        """Store the configuration.

        Args:
            llm_gateway: Optional LLM gateway object.
                NOTE(review): it is stored but not read anywhere in this class.
            use_ragas_lib: When true, ``evaluate`` tries the ragas library first.
        """
        self._llm_gateway = llm_gateway
        self._use_ragas_lib = use_ragas_lib

    async def evaluate(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None = None,
    ) -> EvalResult:
        """Evaluate RAG quality for *samples*.

        Args:
            samples: Samples to evaluate.
            metrics: Names of the metrics to compute; ``None`` (or an empty
                list) means all four.

        Returns:
            The aggregated result; an all-zero result with ``sample_count=0``
            when *samples* is empty.
        """
        if not samples:
            return EvalResult(metrics=EvalMetrics(), sample_count=0)
        if self._use_ragas_lib:
            return await self._evaluate_with_ragas(samples, metrics)
        return await self._evaluate_builtin(samples, metrics)

    @classmethod
    def _resolve_metric_names(cls, metrics: list[str] | None) -> list[str]:
        """Return the requested metric names in canonical order.

        ``None`` or an empty list selects every metric.  Unrecognised names
        are dropped with a warning.
        """
        if not metrics:
            return list(cls._METRIC_NAMES)
        unknown = [name for name in metrics if name not in cls._METRIC_NAMES]
        if unknown:
            logger.warning("ignoring unknown evaluation metrics: %s", unknown)
        return [name for name in cls._METRIC_NAMES if name in metrics]

    @staticmethod
    def _to_float(value: Any) -> float | None:
        """Convert *value* to ``float``; return ``None`` when that is not possible."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _aggregate_ragas_scores(result: Any) -> dict[str, float]:
        """Reduce a ragas result object to one score per metric name.

        Handles two shapes: a mapping-like result whose ``items()`` yields
        ``(metric_name, score)`` pairs, and a result carrying a ``scores``
        list with one ``{metric_name: score}`` dict per sample, which is
        averaged here while skipping NaN and non-numeric entries.
        NOTE(review): which shape is returned depends on the installed ragas
        release - confirm against the version pinned by the project.
        """
        import math

        items = getattr(result, "items", None)
        if callable(items):
            aggregated = {}
            for name, value in items():
                number = RagasEvaluator._to_float(value)
                if number is not None:
                    aggregated[str(name)] = number
            return aggregated

        columns: dict[str, list[float]] = {}
        for row in getattr(result, "scores", None) or []:
            for name, value in row.items():
                number = RagasEvaluator._to_float(value)
                if number is not None and not math.isnan(number):
                    columns.setdefault(str(name), []).append(number)
        return {name: sum(values) / len(values) for name, values in columns.items()}

    async def _evaluate_with_ragas(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None,
    ) -> EvalResult:
        """Evaluate with the ragas library (requires ragas to be installed).

        Only the imports are guarded: when ragas is missing the built-in
        evaluation is used instead.  Errors raised while ragas is running are
        propagated rather than reported as a missing installation.
        """
        try:
            from ragas import evaluate
            from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
            from ragas.metrics import (
                AnswerRelevancy,
                ContextPrecision,
                ContextRecall,
                Faithfulness,
            )
        except ImportError:
            logger.warning("ragas not installed, falling back to built-in evaluation")
            return await self._evaluate_builtin(samples, metrics)

        selected = self._resolve_metric_names(metrics)
        if not selected:
            # Nothing recognised was requested; there is nothing for ragas to compute.
            return EvalResult(metrics=EvalMetrics(), sample_count=len(samples))

        dataset = EvaluationDataset(
            samples=[
                SingleTurnSample(
                    user_input=sample.user_input,
                    response=sample.response,
                    retrieved_contexts=sample.retrieved_contexts,
                    reference=sample.reference,
                )
                for sample in samples
            ]
        )

        metric_factories = {
            "faithfulness": Faithfulness,
            "answer_relevancy": AnswerRelevancy,
            "context_precision": ContextPrecision,
            "context_recall": ContextRecall,
        }
        metric_objects = [metric_factories[name]() for name in selected]

        # NOTE(review): evaluate() is invoked as a plain synchronous call, so
        # this coroutine does not yield to the event loop until it returns.
        result = evaluate(dataset=dataset, metrics=metric_objects)

        aggregated = self._aggregate_ragas_scores(result)
        avg_metrics = EvalMetrics()
        for field_name, metric in zip(selected, metric_objects):
            # Prefer the name the metric object reports for itself, then fall
            # back to our own field name.
            reported_name = getattr(metric, "name", field_name)
            if reported_name in aggregated:
                setattr(avg_metrics, field_name, aggregated[reported_name])
            elif field_name in aggregated:
                setattr(avg_metrics, field_name, aggregated[field_name])
        return EvalResult(metrics=avg_metrics, sample_count=len(samples))

    @staticmethod
    def _score_sample(scorer: Any, sample: EvalSample, selected: set[str]) -> dict[str, float]:
        """Compute the heuristic scores of one sample.

        Metrics not in *selected* are reported as 0.0 so every returned dict
        has the same four keys.
        NOTE(review): relies on the private helpers ``_tokenize``,
        ``_jaccard_similarity`` and ``_query_coverage`` of the scorer; their
        exact semantics are defined outside this file.
        """
        contexts = sample.retrieved_contexts or []
        query_terms = scorer._tokenize(sample.user_input)
        response_terms = scorer._tokenize(sample.response)
        context_terms = scorer._tokenize(" ".join(contexts))

        # Faithfulness: share of response terms that also occur in the contexts.
        faithfulness = 0.0
        if "faithfulness" in selected and contexts and context_terms and response_terms:
            overlap = len(context_terms & response_terms)
            faithfulness = min(overlap / len(response_terms), 1.0)

        # Answer relevancy: similarity between query terms and response terms.
        relevancy = 0.0
        if "answer_relevancy" in selected and query_terms and response_terms:
            relevancy = scorer._jaccard_similarity(query_terms, response_terms)

        # Context precision: share of contexts whose similarity to the query
        # exceeds the 0.1 threshold.
        precision = 0.0
        if "context_precision" in selected and contexts:
            relevant_count = 0
            for context in contexts:
                single_context_terms = scorer._tokenize(context)
                if query_terms and scorer._jaccard_similarity(query_terms, single_context_terms) > 0.1:
                    relevant_count += 1
            precision = relevant_count / len(contexts)

        # Context recall: coverage of the reference terms by the combined contexts.
        recall = 0.0
        if "context_recall" in selected and sample.reference:
            reference_terms = scorer._tokenize(sample.reference)
            if reference_terms:
                recall = scorer._query_coverage(reference_terms, context_terms)

        return {
            "faithfulness": faithfulness,
            "answer_relevancy": relevancy,
            "context_precision": precision,
            "context_recall": recall,
        }

    async def _evaluate_builtin(
        self,
        samples: list[EvalSample],
        metrics: list[str] | None,
    ) -> EvalResult:
        """Evaluate with built-in heuristics (no ragas dependency).

        Each sample is scored with token-based estimates:
        - Faithfulness: overlap between response terms and context terms
        - Answer Relevancy: similarity between query terms and response terms
        - Context Precision: share of contexts similar enough to the query
        - Context Recall: coverage of the reference answer by the contexts

        Args:
            samples: Samples to evaluate.
            metrics: Names of the metrics to compute; ``None`` (or an empty
                list) means all.  Metrics that are not requested are
                reported as 0.0.

        Returns:
            Per-metric means over all samples plus one detail record per
            sample; an all-zero result when *samples* is empty.
        """
        if not samples:
            return EvalResult(metrics=EvalMetrics(), sample_count=0)

        from agentkit.memory.relevance_scorer import RelevanceScorer

        scorer = RelevanceScorer()
        selected = set(self._resolve_metric_names(metrics))
        totals = dict.fromkeys(self._METRIC_NAMES, 0.0)
        details = []
        for sample in samples:
            scores = self._score_sample(scorer, sample, selected)
            for name, value in scores.items():
                totals[name] += value
            # Only the first 50 characters of the query are kept in the detail record.
            details.append({"user_input": sample.user_input[:50], **scores})

        n = len(samples)
        avg_metrics = EvalMetrics(**{name: total / n for name, total in totals.items()})
        return EvalResult(metrics=avg_metrics, sample_count=n, details=details)

View File

@ -0,0 +1,167 @@
"""Tests for RagasEvaluator"""
import pytest
from agentkit.evaluation.ragas_evaluator import (
EvalDatasetBuilder,
EvalMetrics,
EvalResult,
EvalSample,
RagasEvaluator,
)
class TestEvalMetrics:
    """Unit tests for the EvalMetrics score container."""

    def test_average_all_zero(self):
        """A default-constructed instance averages to zero."""
        assert EvalMetrics().average == 0.0

    def test_average_with_values(self):
        """Only the two populated scores contribute to the mean."""
        metrics = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6)
        deviation = abs(metrics.average - 0.7)
        assert deviation < 0.01

    def test_to_dict(self):
        """The dict form exposes the raw scores and the derived average."""
        as_dict = EvalMetrics(
            faithfulness=0.9,
            answer_relevancy=0.7,
            context_precision=0.8,
            context_recall=0.6,
        ).to_dict()
        for expected_key in ("faithfulness", "average"):
            assert expected_key in as_dict
        assert as_dict["faithfulness"] == 0.9
class TestEvalSample:
    """Unit tests for the EvalSample record."""

    def test_creation(self):
        """Constructor arguments are stored on the instance unchanged."""
        fields = {
            "user_input": "What is Python?",
            "response": "Python is a programming language",
            "retrieved_contexts": ["Python is a popular programming language"],
            "reference": "Python is a high-level programming language",
        }
        created = EvalSample(**fields)
        assert created.user_input == "What is Python?"
        assert len(created.retrieved_contexts) == 1
class TestEvalDatasetBuilder:
    """Unit tests for EvalDatasetBuilder."""

    def test_from_traces(self):
        """Traces with input and output become samples; a missing reference is empty."""
        python_trace = {
            "input": "What is Python?",
            "output": "Python is a programming language",
            "contexts": ["Python is popular"],
            "reference": "Python is a high-level language",
        }
        java_trace = {
            "input": "What is Java?",
            "output": "Java is also a programming language",
            "contexts": ["Java is object-oriented"],
        }
        built = EvalDatasetBuilder.from_traces([python_trace, java_trace])
        assert len(built) == 2
        first, second = built
        assert first.user_input == "What is Python?"
        assert second.reference == ""

    def test_from_traces_empty_input(self):
        """A trace with an empty input is filtered out."""
        built = EvalDatasetBuilder.from_traces([{"input": "", "output": "some output"}])
        assert len(built) == 0

    def test_from_dict_list(self):
        """Dictionaries keyed by EvalSample field names are converted one-to-one."""
        rows = [
            {"user_input": f"Q{i}", "response": f"A{i}", "retrieved_contexts": [f"C{i}"]}
            for i in (1, 2)
        ]
        built = EvalDatasetBuilder.from_dict_list(rows)
        assert len(built) == 2
class TestRagasEvaluator:
    """Unit tests for RagasEvaluator (built-in evaluation mode)."""

    @pytest.mark.asyncio
    async def test_evaluate_empty_samples(self):
        """No samples yields a zero-count, all-zero result."""
        outcome = await RagasEvaluator().evaluate([])
        assert outcome.sample_count == 0
        assert outcome.metrics.average == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_builtin(self):
        """A single fully populated sample produces one detail record."""
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a popular programming language used for web development",
            retrieved_contexts=["Python is a popular programming language"],
            reference="Python is a high-level programming language",
        )
        builtin_evaluator = RagasEvaluator(use_ragas_lib=False)
        outcome = await builtin_evaluator.evaluate([sample])
        assert outcome.sample_count == 1
        assert outcome.metrics.faithfulness >= 0.0
        assert outcome.metrics.answer_relevancy >= 0.0
        assert len(outcome.details) == 1

    @pytest.mark.asyncio
    async def test_evaluate_multiple_samples(self):
        """Every submitted sample is counted."""
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        java_sample = EvalSample(
            user_input="What is Java?",
            response="Java is an object-oriented language",
            retrieved_contexts=["Java is widely used"],
        )
        builtin_evaluator = RagasEvaluator(use_ragas_lib=False)
        outcome = await builtin_evaluator.evaluate([python_sample, java_sample])
        assert outcome.sample_count == 2

    @pytest.mark.asyncio
    async def test_evaluate_no_contexts(self):
        """Without retrieved contexts, the context-based scores are zero."""
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=[],
        )
        builtin_evaluator = RagasEvaluator(use_ragas_lib=False)
        outcome = await builtin_evaluator.evaluate([sample])
        assert outcome.metrics.faithfulness == 0.0
        assert outcome.metrics.context_precision == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_with_reference(self):
        """A reference answer allows a context recall score to be reported."""
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
            reference="Python is a high-level programming language",
        )
        builtin_evaluator = RagasEvaluator(use_ragas_lib=False)
        outcome = await builtin_evaluator.evaluate([sample])
        assert outcome.metrics.context_recall >= 0.0

    @pytest.mark.asyncio
    async def test_evaluate_specific_metrics(self):
        """Requesting a subset of metrics still evaluates the sample."""
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        builtin_evaluator = RagasEvaluator(use_ragas_lib=False)
        outcome = await builtin_evaluator.evaluate([sample], metrics=["faithfulness"])
        assert outcome.sample_count == 1