fischer-agentkit/tests/unit/test_ragas_evaluator.py

168 lines
5.6 KiB
Python

"""Tests for RagasEvaluator"""
import pytest
from agentkit.evaluation.ragas_evaluator import (
EvalDatasetBuilder,
EvalMetrics,
EvalResult,
EvalSample,
RagasEvaluator,
)
class TestEvalMetrics:
    """Checks for the EvalMetrics score container."""

    def test_average_all_zero(self):
        """A default-constructed EvalMetrics is expected to average 0.0."""
        metrics = EvalMetrics()
        assert metrics.average == 0.0

    def test_average_with_values(self):
        """With only 0.8 and 0.6 supplied, the average is expected near 0.7."""
        metrics = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6)
        # Compare with a tolerance rather than exact float equality.
        deviation = abs(metrics.average - 0.7)
        assert deviation < 0.01

    def test_to_dict(self):
        """to_dict() is expected to expose "faithfulness" and "average" keys."""
        metrics = EvalMetrics(
            faithfulness=0.9,
            answer_relevancy=0.7,
            context_precision=0.8,
            context_recall=0.6,
        )
        exported = metrics.to_dict()
        assert "faithfulness" in exported
        assert "average" in exported
        assert exported["faithfulness"] == 0.9
class TestEvalSample:
    """Checks for constructing EvalSample records."""

    def test_creation(self):
        """Values passed at construction are readable back as attributes."""
        field_values = {
            "user_input": "What is Python?",
            "response": "Python is a programming language",
            "retrieved_contexts": ["Python is a popular programming language"],
            "reference": "Python is a high-level programming language",
        }
        sample = EvalSample(**field_values)

        assert sample.user_input == "What is Python?"
        assert len(sample.retrieved_contexts) == 1
class TestEvalDatasetBuilder:
    """Checks for the EvalDatasetBuilder conversion helpers."""

    def test_from_traces(self):
        """Two traces yield two samples; an absent reference is expected as ""."""
        python_trace = {
            "input": "What is Python?",
            "output": "Python is a programming language",
            "contexts": ["Python is popular"],
            "reference": "Python is a high-level language",
        }
        # This trace deliberately carries no "reference" key.
        java_trace = {
            "input": "What is Java?",
            "output": "Java is also a programming language",
            "contexts": ["Java is object-oriented"],
        }

        samples = EvalDatasetBuilder.from_traces([python_trace, java_trace])

        assert len(samples) == 2
        assert samples[0].user_input == "What is Python?"
        assert samples[1].reference == ""

    def test_from_traces_empty_input(self):
        """A trace whose "input" is an empty string is expected to be dropped."""
        blank_trace = {"input": "", "output": "some output"}
        samples = EvalDatasetBuilder.from_traces([blank_trace])
        # Empty input should be filtered out, leaving no samples.
        assert len(samples) == 0

    def test_from_dict_list(self):
        """Two well-formed dicts are expected to produce two samples."""
        first_row = {"user_input": "Q1", "response": "A1", "retrieved_contexts": ["C1"]}
        second_row = {"user_input": "Q2", "response": "A2", "retrieved_contexts": ["C2"]}

        samples = EvalDatasetBuilder.from_dict_list([first_row, second_row])

        assert len(samples) == 2
class TestRagasEvaluator:
    """Checks for RagasEvaluator.evaluate().

    Every test except the empty-input one builds the evaluator with
    ``use_ragas_lib=False``.
    """

    @pytest.mark.asyncio
    async def test_evaluate_empty_samples(self):
        """An empty sample list is expected to give count 0 and average 0.0."""
        evaluator = RagasEvaluator()
        outcome = await evaluator.evaluate([])
        assert outcome.sample_count == 0
        assert outcome.metrics.average == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_builtin(self):
        """One full sample yields non-negative scores and one detail entry."""
        evaluator = RagasEvaluator(use_ragas_lib=False)
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a popular programming language used for web development",
            retrieved_contexts=["Python is a popular programming language"],
            reference="Python is a high-level programming language",
        )

        outcome = await evaluator.evaluate([sample])

        assert outcome.sample_count == 1
        assert outcome.metrics.faithfulness >= 0.0
        assert outcome.metrics.answer_relevancy >= 0.0
        assert len(outcome.details) == 1

    @pytest.mark.asyncio
    async def test_evaluate_multiple_samples(self):
        """Two samples in, sample_count of 2 out."""
        evaluator = RagasEvaluator(use_ragas_lib=False)
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        java_sample = EvalSample(
            user_input="What is Java?",
            response="Java is an object-oriented language",
            retrieved_contexts=["Java is widely used"],
        )

        outcome = await evaluator.evaluate([python_sample, java_sample])

        assert outcome.sample_count == 2

    @pytest.mark.asyncio
    async def test_evaluate_no_contexts(self):
        """With no retrieved contexts, faithfulness and precision should be 0.0."""
        evaluator = RagasEvaluator(use_ragas_lib=False)
        contextless_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=[],
        )

        outcome = await evaluator.evaluate([contextless_sample])

        assert outcome.metrics.faithfulness == 0.0
        assert outcome.metrics.context_precision == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_with_reference(self):
        """A sample carrying a reference yields a non-negative context_recall."""
        evaluator = RagasEvaluator(use_ragas_lib=False)
        referenced_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
            reference="Python is a high-level programming language",
        )

        outcome = await evaluator.evaluate([referenced_sample])

        assert outcome.metrics.context_recall >= 0.0

    @pytest.mark.asyncio
    async def test_evaluate_specific_metrics(self):
        """Requesting only "faithfulness" still evaluates the single sample."""
        evaluator = RagasEvaluator(use_ragas_lib=False)
        sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )

        outcome = await evaluator.evaluate([sample], metrics=["faithfulness"])

        assert outcome.sample_count == 1