From 83cdddd199d360a10219d3d9925ae328300d7059 Mon Sep 17 00:00:00 2001
From: chiguyong
Date: Sat, 6 Jun 2026 22:49:27 +0800
Subject: [PATCH] feat(evaluation): U9 Ragas evaluation pipeline for RAG quality assessment

- RagasEvaluator: LLM-as-Judge evaluation with ragas lib or built-in fallback
- EvalDatasetBuilder: from traces or dict list
- EvalMetrics: faithfulness, answer_relevancy, context_precision, context_recall
- Built-in heuristic evaluation using keyword overlap and Jaccard similarity
- 13 tests passing
---
Note: the contents of ragas_evaluator.py and test_ragas_evaluator.py were
revised during review, so the blob ids on their `index` lines are stale.
Regenerate the patch with `git format-patch` if they are needed (e.g. for
`git am -3`).

 src/agentkit/evaluation/__init__.py        |  17 ++
 src/agentkit/evaluation/ragas_evaluator.py | 315 +++++++++++++++++++++
 tests/unit/test_ragas_evaluator.py         | 174 ++++++++++++
 3 files changed, 506 insertions(+)
 create mode 100644 src/agentkit/evaluation/__init__.py
 create mode 100644 src/agentkit/evaluation/ragas_evaluator.py
 create mode 100644 tests/unit/test_ragas_evaluator.py

diff --git a/src/agentkit/evaluation/__init__.py b/src/agentkit/evaluation/__init__.py
new file mode 100644
index 0000000..06ecc30
--- /dev/null
+++ b/src/agentkit/evaluation/__init__.py
@@ -0,0 +1,17 @@
+"""Evaluation module - RAG quality assessment"""
+
+from agentkit.evaluation.ragas_evaluator import (
+    EvalDatasetBuilder,
+    EvalMetrics,
+    EvalResult,
+    EvalSample,
+    RagasEvaluator,
+)
+
+__all__ = [
+    "EvalDatasetBuilder",
+    "EvalMetrics",
+    "EvalResult",
+    "EvalSample",
+    "RagasEvaluator",
+]
diff --git a/src/agentkit/evaluation/ragas_evaluator.py b/src/agentkit/evaluation/ragas_evaluator.py
new file mode 100644
index 0000000..7ec1da8
--- /dev/null
+++ b/src/agentkit/evaluation/ragas_evaluator.py
@@ -0,0 +1,315 @@
+"""Ragas Evaluator - RAG quality evaluation pipeline.
+
+Integrates the Ragas evaluation framework and provides standardized RAG
+quality metrics:
+- Faithfulness: consistency between the generated answer and the retrieved contexts
+- Answer Relevancy: relevance of the answer
+- Context Precision: precision of the retrieved contexts
+- Context Recall: recall of the retrieved contexts
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Metric names accepted by RagasEvaluator.evaluate(). Each one is also the name
+# of an EvalMetrics field; the order matches the field declaration order.
+_METRIC_NAMES = (
+    "faithfulness",
+    "answer_relevancy",
+    "context_precision",
+    "context_recall",
+)
+
+
+@dataclass
+class EvalSample:
+    """A single evaluation sample."""
+
+    user_input: str
+    response: str
+    retrieved_contexts: list[str]
+    reference: str = ""
+
+
+@dataclass
+class EvalMetrics:
+    """Evaluation metrics."""
+
+    faithfulness: float = 0.0
+    answer_relevancy: float = 0.0
+    context_precision: float = 0.0
+    context_recall: float = 0.0
+
+    @property
+    def average(self) -> float:
+        """Mean of the metrics greater than 0, or 0.0 when there are none."""
+        values = [self.faithfulness, self.answer_relevancy, self.context_precision, self.context_recall]
+        non_zero = [v for v in values if v > 0]
+        return sum(non_zero) / len(non_zero) if non_zero else 0.0
+
+    def to_dict(self) -> dict[str, float]:
+        """Return the four metrics plus their ``average`` as a plain dict."""
+        return {
+            "faithfulness": self.faithfulness,
+            "answer_relevancy": self.answer_relevancy,
+            "context_precision": self.context_precision,
+            "context_recall": self.context_recall,
+            "average": self.average,
+        }
+
+
+@dataclass
+class EvalResult:
+    """Evaluation result."""
+
+    metrics: EvalMetrics
+    sample_count: int
+    details: list[dict[str, Any]] = field(default_factory=list)
+
+
+class EvalDatasetBuilder:
+    """Evaluation dataset builder.
+
+    Converts historical task data (e.g. extracted from TraceRecorder) into
+    evaluation samples in the Ragas field layout.
+    """
+
+    @staticmethod
+    def from_traces(traces: list[dict[str, Any]]) -> list[EvalSample]:
+        """Build evaluation samples from execution traces.
+
+        Args:
+            traces: Execution traces; the keys read from each one are
+                ``input``, ``output``, ``contexts`` and ``reference``.
+
+        Returns:
+            One EvalSample per trace with a non-empty input and output.
+        """
+        samples = []
+        for trace in traces:
+            raw_input = trace.get("input")
+            raw_output = trace.get("output")
+            sample = EvalSample(
+                # str(None) is the truthy "None", which would slip past the
+                # emptiness filter below, so map None to "" first.
+                user_input="" if raw_input is None else str(raw_input),
+                response="" if raw_output is None else str(raw_output),
+                # An explicit None would later break " ".join() in the
+                # built-in evaluator.
+                retrieved_contexts=trace.get("contexts") or [],
+                reference=trace.get("reference") or "",
+            )
+            if sample.user_input and sample.response:
+                samples.append(sample)
+        return samples
+
+    @staticmethod
+    def from_dict_list(data: list[dict[str, Any]]) -> list[EvalSample]:
+        """Build evaluation samples from dicts keyed by EvalSample field names.
+
+        Entries whose ``user_input`` or ``response`` is missing or falsy are
+        skipped.
+        """
+        return [
+            EvalSample(
+                user_input=d.get("user_input", ""),
+                response=d.get("response", ""),
+                # Normalize explicit None values like from_traces() does.
+                retrieved_contexts=d.get("retrieved_contexts") or [],
+                reference=d.get("reference") or "",
+            )
+            for d in data
+            if d.get("user_input") and d.get("response")
+        ]
+
+
+class RagasEvaluator:
+    """Ragas evaluator.
+
+    Evaluates RAG quality in LLM-as-Judge style.
+    Two modes are supported:
+    1. Ragas library mode (requires ragas to be installed)
+    2. Built-in lightweight mode (no dependency on ragas)
+    """
+
+    def __init__(
+        self,
+        llm_gateway: Any = None,
+        use_ragas_lib: bool = False,
+    ):
+        self._llm_gateway = llm_gateway
+        self._use_ragas_lib = use_ragas_lib
+
+    async def evaluate(
+        self,
+        samples: list[EvalSample],
+        metrics: list[str] | None = None,
+    ) -> EvalResult:
+        """Evaluate RAG quality.
+
+        Args:
+            samples: Evaluation samples.
+            metrics: Names of the metrics to compute; None means all.
+
+        Returns:
+            EvalResult: The evaluation result. Metrics that were not
+            requested are reported as 0.0.
+        """
+        if not samples:
+            return EvalResult(metrics=EvalMetrics(), sample_count=0)
+
+        if self._use_ragas_lib:
+            return await self._evaluate_with_ragas(samples, metrics)
+        else:
+            return await self._evaluate_builtin(samples, metrics)
+
+    @staticmethod
+    def _select_metrics(metrics: list[str] | None) -> list[str]:
+        """Return the known metric names to compute.
+
+        None or an empty list selects every metric. Unknown names are
+        logged and ignored.
+        """
+        if not metrics:
+            return list(_METRIC_NAMES)
+        unknown = [name for name in metrics if name not in _METRIC_NAMES]
+        if unknown:
+            logger.warning("ignoring unknown metrics: %s", unknown)
+        return [name for name in _METRIC_NAMES if name in metrics]
+
+    @staticmethod
+    def _mean_scores(result: Any) -> dict[str, float]:
+        """Extract one mean score per known metric from a ragas result.
+
+        Handles mapping-style results as well as result objects that only
+        expose per-sample score dicts through a ``scores`` attribute.
+        """
+        if hasattr(result, "items"):
+            return {key: float(value) for key, value in result.items() if key in _METRIC_NAMES}
+
+        means: dict[str, float] = {}
+        rows = getattr(result, "scores", None) or []
+        for name in _METRIC_NAMES:
+            values = [float(row[name]) for row in rows if row.get(name) is not None]
+            # NaN != NaN: skip NaN entries so they do not turn the mean into NaN.
+            values = [v for v in values if v == v]
+            if values:
+                means[name] = sum(values) / len(values)
+        return means
+
+    async def _evaluate_with_ragas(
+        self,
+        samples: list[EvalSample],
+        metrics: list[str] | None,
+    ) -> EvalResult:
+        """Evaluate with the Ragas library (requires ragas to be installed).
+
+        Falls back to the built-in evaluation when an ImportError is raised.
+        """
+        try:
+            from ragas import evaluate
+            from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision, ContextRecall
+            from ragas.dataset_schema import SingleTurnSample, EvaluationDataset
+
+            # Build evaluation dataset
+            eval_samples = []
+            for s in samples:
+                eval_samples.append(SingleTurnSample(
+                    user_input=s.user_input,
+                    response=s.response,
+                    retrieved_contexts=s.retrieved_contexts,
+                    reference=s.reference,
+                ))
+            dataset = EvaluationDataset(samples=eval_samples)
+
+            # Select metrics
+            metric_classes = {
+                "faithfulness": Faithfulness,
+                "answer_relevancy": AnswerRelevancy,
+                "context_precision": ContextPrecision,
+                "context_recall": ContextRecall,
+            }
+            metric_objects = [metric_classes[name]() for name in self._select_metrics(metrics)]
+
+            result = evaluate(dataset=dataset, metrics=metric_objects)
+
+            # Extract metrics
+            avg_metrics = EvalMetrics()
+            for name, value in self._mean_scores(result).items():
+                setattr(avg_metrics, name, value)
+
+            return EvalResult(metrics=avg_metrics, sample_count=len(samples))
+
+        except ImportError:
+            logger.warning("ragas not installed, falling back to built-in evaluation")
+            return await self._evaluate_builtin(samples, metrics)
+
+    async def _evaluate_builtin(
+        self,
+        samples: list[EvalSample],
+        metrics: list[str] | None,
+    ) -> EvalResult:
+        """Built-in lightweight evaluation (no dependency on ragas).
+
+        Estimates the metrics with simple heuristics:
+        - Faithfulness: share of response terms that also occur in the contexts
+        - Answer Relevancy: scorer's Jaccard similarity of query and response terms
+        - Context Precision: share of contexts whose Jaccard similarity to
+          the query exceeds 0.1
+        - Context Recall: scorer's coverage of the reference terms by the context terms
+
+        Metrics that were not requested are reported as 0.0.
+        """
+        from agentkit.memory.relevance_scorer import RelevanceScorer
+
+        scorer = RelevanceScorer()
+        selected = self._select_metrics(metrics)
+        totals = dict.fromkeys(_METRIC_NAMES, 0.0)
+        details = []
+
+        for sample in samples:
+            scores = dict.fromkeys(_METRIC_NAMES, 0.0)
+            contexts = sample.retrieved_contexts or []
+            query_terms = scorer._tokenize(sample.user_input)
+            response_terms = scorer._tokenize(sample.response)
+
+            # Faithfulness: overlap between response and contexts
+            if "faithfulness" in selected and contexts:
+                context_terms = scorer._tokenize(" ".join(contexts))
+                if context_terms and response_terms:
+                    overlap = len(context_terms & response_terms)
+                    scores["faithfulness"] = min(overlap / max(len(response_terms), 1), 1.0)
+
+            # Answer Relevancy: query-answer overlap
+            if "answer_relevancy" in selected and query_terms and response_terms:
+                scores["answer_relevancy"] = scorer._jaccard_similarity(query_terms, response_terms)
+
+            # Context Precision: how many contexts are relevant to the query
+            if "context_precision" in selected and contexts:
+                relevant_count = 0
+                for ctx in contexts:
+                    ctx_terms = scorer._tokenize(ctx)
+                    if query_terms and scorer._jaccard_similarity(query_terms, ctx_terms) > 0.1:
+                        relevant_count += 1
+                scores["context_precision"] = relevant_count / len(contexts)
+
+            # Context Recall: reference coverage
+            if "context_recall" in selected and sample.reference:
+                ref_terms = scorer._tokenize(sample.reference)
+                if ref_terms:
+                    ctx_terms = scorer._tokenize(" ".join(contexts))
+                    scores["context_recall"] = scorer._query_coverage(ref_terms, ctx_terms)
+
+            for name in _METRIC_NAMES:
+                totals[name] += scores[name]
+            details.append({"user_input": sample.user_input[:50], **scores})
+
+        n = len(samples)
+        avg_metrics = EvalMetrics(**{name: total / n for name, total in totals.items()})
+
+        return EvalResult(metrics=avg_metrics, sample_count=n, details=details)
diff --git a/tests/unit/test_ragas_evaluator.py b/tests/unit/test_ragas_evaluator.py
new file mode 100644
index 0000000..bbc0e73
--- /dev/null
+++ b/tests/unit/test_ragas_evaluator.py
@@ -0,0 +1,174 @@
+"""Tests for RagasEvaluator"""
+
+import pytest
+
+from agentkit.evaluation.ragas_evaluator import (
+    EvalDatasetBuilder,
+    EvalMetrics,
+    EvalResult,
+    EvalSample,
+    RagasEvaluator,
+)
+
+
+class TestEvalMetrics:
+    """EvalMetrics unit tests"""
+
+    def test_average_all_zero(self):
+        m = EvalMetrics()
+        assert m.average == 0.0
+
+    def test_average_with_values(self):
+        m = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6)
+        assert abs(m.average - 0.7) < 0.01
+
+    def test_to_dict(self):
+        m = EvalMetrics(faithfulness=0.9, answer_relevancy=0.7, context_precision=0.8, context_recall=0.6)
+        d = m.to_dict()
+        assert "faithfulness" in d
+        assert "average" in d
+        assert d["faithfulness"] == 0.9
+
+
+class TestEvalSample:
+    """EvalSample unit tests"""
+
+    def test_creation(self):
+        sample = EvalSample(
+            user_input="What is Python?",
+            response="Python is a programming language",
+            retrieved_contexts=["Python is a popular programming language"],
+            reference="Python is a high-level programming language",
+        )
+        assert sample.user_input == "What is Python?"
+        assert len(sample.retrieved_contexts) == 1
+
+
+class TestEvalDatasetBuilder:
+    """EvalDatasetBuilder unit tests"""
+
+    def test_from_traces(self):
+        traces = [
+            {
+                "input": "What is Python?",
+                "output": "Python is a programming language",
+                "contexts": ["Python is popular"],
+                "reference": "Python is a high-level language",
+            },
+            {
+                "input": "What is Java?",
+                "output": "Java is also a programming language",
+                "contexts": ["Java is object-oriented"],
+            },
+        ]
+        samples = EvalDatasetBuilder.from_traces(traces)
+        assert len(samples) == 2
+        assert samples[0].user_input == "What is Python?"
+        assert samples[1].reference == ""
+
+    def test_from_traces_empty_input(self):
+        traces = [
+            {"input": "", "output": "some output"},
+            {"input": None, "output": "some output"},
+        ]
+        samples = EvalDatasetBuilder.from_traces(traces)
+        assert len(samples) == 0  # Empty or None input should be filtered
+
+    def test_from_dict_list(self):
+        data = [
+            {"user_input": "Q1", "response": "A1", "retrieved_contexts": ["C1"]},
+            {"user_input": "Q2", "response": "A2", "retrieved_contexts": ["C2"]},
+        ]
+        samples = EvalDatasetBuilder.from_dict_list(data)
+        assert len(samples) == 2
+
+
+class TestRagasEvaluator:
+    """RagasEvaluator unit tests"""
+
+    @pytest.mark.asyncio
+    async def test_evaluate_empty_samples(self):
+        evaluator = RagasEvaluator()
+        result = await evaluator.evaluate([])
+        assert result.sample_count == 0
+        assert result.metrics.average == 0.0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_builtin(self):
+        evaluator = RagasEvaluator(use_ragas_lib=False)
+        samples = [
+            EvalSample(
+                user_input="What is Python?",
+                response="Python is a popular programming language used for web development",
+                retrieved_contexts=["Python is a popular programming language"],
+                reference="Python is a high-level programming language",
+            ),
+        ]
+        result = await evaluator.evaluate(samples)
+        assert result.sample_count == 1
+        assert result.metrics.faithfulness >= 0.0
+        assert result.metrics.answer_relevancy >= 0.0
+        assert len(result.details) == 1
+
+    @pytest.mark.asyncio
+    async def test_evaluate_multiple_samples(self):
+        evaluator = RagasEvaluator(use_ragas_lib=False)
+        samples = [
+            EvalSample(
+                user_input="What is Python?",
+                response="Python is a programming language",
+                retrieved_contexts=["Python is popular"],
+            ),
+            EvalSample(
+                user_input="What is Java?",
+                response="Java is an object-oriented language",
+                retrieved_contexts=["Java is widely used"],
+            ),
+        ]
+        result = await evaluator.evaluate(samples)
+        assert result.sample_count == 2
+
+    @pytest.mark.asyncio
+    async def test_evaluate_no_contexts(self):
+        evaluator = RagasEvaluator(use_ragas_lib=False)
+        samples = [
+            EvalSample(
+                user_input="What is Python?",
+                response="Python is a programming language",
+                retrieved_contexts=[],
+            ),
+        ]
+        result = await evaluator.evaluate(samples)
+        assert result.metrics.faithfulness == 0.0
+        assert result.metrics.context_precision == 0.0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_with_reference(self):
+        evaluator = RagasEvaluator(use_ragas_lib=False)
+        samples = [
+            EvalSample(
+                user_input="What is Python?",
+                response="Python is a programming language",
+                retrieved_contexts=["Python is popular"],
+                reference="Python is a high-level programming language",
+            ),
+        ]
+        result = await evaluator.evaluate(samples)
+        assert result.metrics.context_recall >= 0.0
+
+    @pytest.mark.asyncio
+    async def test_evaluate_specific_metrics(self):
+        evaluator = RagasEvaluator(use_ragas_lib=False)
+        samples = [
+            EvalSample(
+                user_input="What is Python?",
+                response="Python is a programming language",
+                retrieved_contexts=["Python is popular"],
+            ),
+        ]
+        result = await evaluator.evaluate(samples, metrics=["faithfulness"])
+        assert result.sample_count == 1
+        # Metrics that were not requested stay at 0.0
+        assert result.metrics.answer_relevancy == 0.0
+        assert result.metrics.context_precision == 0.0
+        assert result.metrics.context_recall == 0.0