168 lines
5.6 KiB
Python
168 lines
5.6 KiB
Python
"""Tests for RagasEvaluator"""
|
|
|
|
import pytest
|
|
|
|
from agentkit.evaluation.ragas_evaluator import (
|
|
EvalDatasetBuilder,
|
|
EvalMetrics,
|
|
EvalResult,
|
|
EvalSample,
|
|
RagasEvaluator,
|
|
)
|
|
|
|
|
|
class TestEvalMetrics:
    """Unit tests for the EvalMetrics score container."""

    def test_average_all_zero(self):
        """An EvalMetrics built with no arguments is expected to average 0.0."""
        metrics = EvalMetrics()
        assert metrics.average == 0.0

    def test_average_with_values(self):
        """With only 0.8 and 0.6 supplied, the average must land within 0.01 of 0.7."""
        metrics = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6)
        deviation = abs(metrics.average - 0.7)
        assert deviation < 0.01

    def test_to_dict(self):
        """to_dict() must expose 'faithfulness' and 'average' keys and keep the score."""
        scores = {
            "faithfulness": 0.9,
            "answer_relevancy": 0.7,
            "context_precision": 0.8,
            "context_recall": 0.6,
        }
        exported = EvalMetrics(**scores).to_dict()
        assert "faithfulness" in exported
        assert "average" in exported
        assert exported["faithfulness"] == 0.9
|
|
|
|
|
|
class TestEvalSample:
    """Unit tests for constructing EvalSample records."""

    def test_creation(self):
        """Constructor keyword arguments are readable back from the instance."""
        fields = {
            "user_input": "What is Python?",
            "response": "Python is a programming language",
            "retrieved_contexts": ["Python is a popular programming language"],
            "reference": "Python is a high-level programming language",
        }
        record = EvalSample(**fields)
        assert record.user_input == "What is Python?"
        assert len(record.retrieved_contexts) == 1
|
|
|
|
|
|
class TestEvalDatasetBuilder:
    """Unit tests for the EvalDatasetBuilder construction helpers."""

    def test_from_traces(self):
        """Two well-formed traces give two samples; an absent reference reads as ''."""
        python_trace = {
            "input": "What is Python?",
            "output": "Python is a programming language",
            "contexts": ["Python is popular"],
            "reference": "Python is a high-level language",
        }
        # This trace intentionally carries no "reference" key.
        java_trace = {
            "input": "What is Java?",
            "output": "Java is also a programming language",
            "contexts": ["Java is object-oriented"],
        }
        built = EvalDatasetBuilder.from_traces([python_trace, java_trace])
        assert len(built) == 2
        assert built[0].user_input == "What is Python?"
        assert built[1].reference == ""

    def test_from_traces_empty_input(self):
        """A trace whose "input" is the empty string is expected to be dropped."""
        blank_trace = {"input": "", "output": "some output"}
        built = EvalDatasetBuilder.from_traces([blank_trace])
        assert len(built) == 0  # Empty input should be filtered

    def test_from_dict_list(self):
        """Each dict in the input list is expected to yield one sample."""
        rows = [
            {
                "user_input": f"Q{idx}",
                "response": f"A{idx}",
                "retrieved_contexts": [f"C{idx}"],
            }
            for idx in (1, 2)
        ]
        built = EvalDatasetBuilder.from_dict_list(rows)
        assert len(built) == 2
|
|
|
|
|
|
class TestRagasEvaluator:
    """Unit tests for RagasEvaluator.evaluate()."""

    @pytest.mark.asyncio
    async def test_evaluate_empty_samples(self):
        """An empty sample list is expected to give zero samples and a 0.0 average."""
        runner = RagasEvaluator()
        outcome = await runner.evaluate([])
        assert outcome.sample_count == 0
        assert outcome.metrics.average == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_builtin(self):
        """With use_ragas_lib=False, one sample yields one detail row and non-negative scores."""
        runner = RagasEvaluator(use_ragas_lib=False)
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a popular programming language used for web development",
            retrieved_contexts=["Python is a popular programming language"],
            reference="Python is a high-level programming language",
        )
        outcome = await runner.evaluate([python_sample])
        assert outcome.sample_count == 1
        assert outcome.metrics.faithfulness >= 0.0
        assert outcome.metrics.answer_relevancy >= 0.0
        assert len(outcome.details) == 1

    @pytest.mark.asyncio
    async def test_evaluate_multiple_samples(self):
        """Two samples in must be reported as a sample_count of 2."""
        runner = RagasEvaluator(use_ragas_lib=False)
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        java_sample = EvalSample(
            user_input="What is Java?",
            response="Java is an object-oriented language",
            retrieved_contexts=["Java is widely used"],
        )
        outcome = await runner.evaluate([python_sample, java_sample])
        assert outcome.sample_count == 2

    @pytest.mark.asyncio
    async def test_evaluate_no_contexts(self):
        """With no retrieved contexts, faithfulness and context_precision must be 0.0."""
        runner = RagasEvaluator(use_ragas_lib=False)
        contextless_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=[],
        )
        outcome = await runner.evaluate([contextless_sample])
        assert outcome.metrics.faithfulness == 0.0
        assert outcome.metrics.context_precision == 0.0

    @pytest.mark.asyncio
    async def test_evaluate_with_reference(self):
        """A sample that carries a reference must produce a non-negative context_recall."""
        runner = RagasEvaluator(use_ragas_lib=False)
        referenced_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
            reference="Python is a high-level programming language",
        )
        outcome = await runner.evaluate([referenced_sample])
        assert outcome.metrics.context_recall >= 0.0

    @pytest.mark.asyncio
    async def test_evaluate_specific_metrics(self):
        """Passing metrics=["faithfulness"] must still report the single sample."""
        runner = RagasEvaluator(use_ragas_lib=False)
        python_sample = EvalSample(
            user_input="What is Python?",
            response="Python is a programming language",
            retrieved_contexts=["Python is popular"],
        )
        outcome = await runner.evaluate([python_sample], metrics=["faithfulness"])
        assert outcome.sample_count == 1
|