"""Tests for RagasEvaluator""" import pytest from agentkit.evaluation.ragas_evaluator import ( EvalDatasetBuilder, EvalMetrics, EvalResult, EvalSample, RagasEvaluator, ) class TestEvalMetrics: """EvalMetrics unit tests""" def test_average_all_zero(self): m = EvalMetrics() assert m.average == 0.0 def test_average_with_values(self): m = EvalMetrics(faithfulness=0.8, answer_relevancy=0.6) assert abs(m.average - 0.7) < 0.01 def test_to_dict(self): m = EvalMetrics(faithfulness=0.9, answer_relevancy=0.7, context_precision=0.8, context_recall=0.6) d = m.to_dict() assert "faithfulness" in d assert "average" in d assert d["faithfulness"] == 0.9 class TestEvalSample: """EvalSample unit tests""" def test_creation(self): sample = EvalSample( user_input="What is Python?", response="Python is a programming language", retrieved_contexts=["Python is a popular programming language"], reference="Python is a high-level programming language", ) assert sample.user_input == "What is Python?" assert len(sample.retrieved_contexts) == 1 class TestEvalDatasetBuilder: """EvalDatasetBuilder unit tests""" def test_from_traces(self): traces = [ { "input": "What is Python?", "output": "Python is a programming language", "contexts": ["Python is popular"], "reference": "Python is a high-level language", }, { "input": "What is Java?", "output": "Java is also a programming language", "contexts": ["Java is object-oriented"], }, ] samples = EvalDatasetBuilder.from_traces(traces) assert len(samples) == 2 assert samples[0].user_input == "What is Python?" assert samples[1].reference == "" def test_from_traces_empty_input(self): traces = [{"input": "", "output": "some output"}] samples = EvalDatasetBuilder.from_traces(traces) assert len(samples) == 0 # Empty input should be filtered def test_from_dict_list(self): data = [ {"user_input": "Q1", "response": "A1", "retrieved_contexts": ["C1"]}, {"user_input": "Q2", "response": "A2", "retrieved_contexts": ["C2"]}, ] samples = EvalDatasetBuilder.from_dict_list(data) assert len(samples) == 2 class TestRagasEvaluator: """RagasEvaluator unit tests""" @pytest.mark.asyncio async def test_evaluate_empty_samples(self): evaluator = RagasEvaluator() result = await evaluator.evaluate([]) assert result.sample_count == 0 assert result.metrics.average == 0.0 @pytest.mark.asyncio async def test_evaluate_builtin(self): evaluator = RagasEvaluator(use_ragas_lib=False) samples = [ EvalSample( user_input="What is Python?", response="Python is a popular programming language used for web development", retrieved_contexts=["Python is a popular programming language"], reference="Python is a high-level programming language", ), ] result = await evaluator.evaluate(samples) assert result.sample_count == 1 assert result.metrics.faithfulness >= 0.0 assert result.metrics.answer_relevancy >= 0.0 assert len(result.details) == 1 @pytest.mark.asyncio async def test_evaluate_multiple_samples(self): evaluator = RagasEvaluator(use_ragas_lib=False) samples = [ EvalSample( user_input="What is Python?", response="Python is a programming language", retrieved_contexts=["Python is popular"], ), EvalSample( user_input="What is Java?", response="Java is an object-oriented language", retrieved_contexts=["Java is widely used"], ), ] result = await evaluator.evaluate(samples) assert result.sample_count == 2 @pytest.mark.asyncio async def test_evaluate_no_contexts(self): evaluator = RagasEvaluator(use_ragas_lib=False) samples = [ EvalSample( user_input="What is Python?", response="Python is a programming language", retrieved_contexts=[], ), ] result = await evaluator.evaluate(samples) assert result.metrics.faithfulness == 0.0 assert result.metrics.context_precision == 0.0 @pytest.mark.asyncio async def test_evaluate_with_reference(self): evaluator = RagasEvaluator(use_ragas_lib=False) samples = [ EvalSample( user_input="What is Python?", response="Python is a programming language", retrieved_contexts=["Python is popular"], reference="Python is a high-level programming language", ), ] result = await evaluator.evaluate(samples) assert result.metrics.context_recall >= 0.0 @pytest.mark.asyncio async def test_evaluate_specific_metrics(self): evaluator = RagasEvaluator(use_ragas_lib=False) samples = [ EvalSample( user_input="What is Python?", response="Python is a programming language", retrieved_contexts=["Python is popular"], ), ] result = await evaluator.evaluate(samples, metrics=["faithfulness"]) assert result.sample_count == 1