fischer-agentkit/tests/unit/rag_platform/test_termbase.py

336 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""U5 测试 — 术语表管理 + jieba 自定义词典。
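Test scenarios:
1. add_term: jieba segments correctly once a term has been added
2. load_from_list: bulk-load terms
3. load_from_file: load terms from a jieba dictionary file
4. remove_term: jieba falls back to default segmentation after removal
5. list_terms: list every term
6. tokenize: segment text using the custom dictionary
7. Retrieval recall improves once the termbase is applied (simulated scenario)
8. Domain terms are segmented correctly (e.g. "知识图谱", "向量数据库")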
"""
from __future__ import annotations
from pathlib import Path
import pytest
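# jieba is treated as an optional dependency here (it is listed as a main
# dependency in pyproject.toml), but the test environment may not have it installed.
# importorskip keeps the collection phase from aborting, which the pre-commit
# gate described in project_rules.md requires.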
pytest.importorskip("jieba")
import jieba # noqa: E402 — 必须在 importorskip 之后
from agentkit.rag_platform.termbase import TermEntry, Termbase
# ---------------------------------------------------------------------------
# TermEntry 模型测试
# ---------------------------------------------------------------------------
class TestTermEntry:
    """Tests for the TermEntry model."""

    def test_defaults(self):
        """An entry built from a term alone leaves the optional fields unset."""
        created = TermEntry(term="知识图谱")
        assert created.term == "知识图谱"
        # Both optional attributes are expected to default to None.
        for optional_field in ("frequency", "pos"):
            assert getattr(created, optional_field) is None

    def test_with_all_fields(self):
        """Every field passed to the constructor is readable back unchanged."""
        expected = {"term": "向量数据库", "frequency": 100, "pos": "n"}
        created = TermEntry(**expected)
        for field_name, field_value in expected.items():
            assert getattr(created, field_name) == field_value
# ---------------------------------------------------------------------------
# Termbase 基础测试
# ---------------------------------------------------------------------------
class TestTermbaseBasic:
    """Basic add / remove / list behaviour of Termbase."""

    def test_empty_termbase(self):
        """A freshly constructed termbase contains nothing."""
        termbase = Termbase()
        assert "知识图谱" not in termbase
        assert termbase.list_terms() == []
        assert len(termbase) == 0

    def test_add_term(self):
        """add_term stores the term so it is countable, findable and listed."""
        termbase = Termbase()
        termbase.add_term("知识图谱")
        assert "知识图谱" in termbase
        assert len(termbase) == 1
        listed = termbase.list_terms()
        assert len(listed) == 1
        assert listed[0].term == "知识图谱"

    def test_add_term_with_freq_and_pos(self):
        """add_term keeps the supplied frequency and part-of-speech tag."""
        termbase = Termbase()
        termbase.add_term("向量数据库", frequency=100, pos="n")
        first = termbase.list_terms()[0]
        assert first.term == "向量数据库"
        assert first.frequency == 100
        assert first.pos == "n"

    def test_add_term_strips_whitespace(self):
        """Leading and trailing whitespace is expected to be trimmed on add."""
        termbase = Termbase()
        termbase.add_term(" 知识图谱 ")
        assert "知识图谱" in termbase
        first = termbase.list_terms()[0]
        assert first.term == "知识图谱"

    def test_add_empty_term_ignored(self):
        """Empty and whitespace-only strings are expected to be ignored."""
        termbase = Termbase()
        for blank in ("", " "):
            termbase.add_term(blank)
        assert len(termbase) == 0

    def test_add_duplicate_term_overwrites(self):
        """Adding the same term twice replaces the earlier entry."""
        termbase = Termbase()
        termbase.add_term("知识图谱", frequency=50)
        termbase.add_term("知识图谱", frequency=100, pos="n")
        assert len(termbase) == 1
        # The surviving entry carries the values from the second call.
        survivor = termbase.list_terms()[0]
        assert survivor.frequency == 100
        assert survivor.pos == "n"

    def test_remove_term(self):
        """remove_term drops a previously added term."""
        termbase = Termbase()
        termbase.add_term("知识图谱")
        assert len(termbase) == 1
        termbase.remove_term("知识图谱")
        assert len(termbase) == 0
        assert "知识图谱" not in termbase

    def test_remove_nonexistent_term_no_error(self):
        """Removing an unknown term must not raise."""
        termbase = Termbase()
        # The call itself is the check: any exception fails the test.
        termbase.remove_term("不存在的术语")
        assert len(termbase) == 0
class TestTermbaseLoadFromList:
    """Tests for Termbase.load_from_list."""

    def test_load_from_list(self):
        """Every string in the list becomes a term."""
        expected_terms = ("知识图谱", "向量数据库", "RAG")
        termbase = Termbase()
        termbase.load_from_list(list(expected_terms))
        assert len(termbase) == 3
        for name in expected_terms:
            assert name in termbase

    def test_load_from_empty_list(self):
        """Loading an empty list adds nothing."""
        termbase = Termbase()
        termbase.load_from_list([])
        assert len(termbase) == 0
class TestTermbaseLoadFromFile:
    """Tests for Termbase.load_from_file."""

    def test_load_from_file(self, tmp_path: Path):
        """Terms are loaded from a jieba-style dictionary file.

        Each line of the fixture is ``term [frequency [pos]]``; the test checks
        that all three terms are loaded and that frequency / pos are parsed.
        """
        dict_file = tmp_path / "terms.txt"
        dict_file.write_text(
            "知识图谱 100 n\n向量数据库 100 n\nRAG 50\n",
            encoding="utf-8",
        )
        tb = Termbase()
        tb.load_from_file(str(dict_file))
        assert len(tb) == 3
        assert "知识图谱" in tb
        assert "向量数据库" in tb
        assert "RAG" in tb
        # Verify that frequency and part-of-speech were parsed per line.
        terms = {t.term: t for t in tb.list_terms()}
        assert terms["知识图谱"].frequency == 100
        assert terms["知识图谱"].pos == "n"
        assert terms["RAG"].frequency == 50
        # The "RAG" line has no third column, so pos is expected to stay None.
        assert terms["RAG"].pos is None

    def test_load_from_file_skips_comments_and_empty(self, tmp_path: Path):
        """Comment lines (``#``) and blank lines in the file are skipped."""
        dict_file = tmp_path / "terms.txt"
        dict_file.write_text(
            "# 这是注释\n\n知识图谱 100 n\n\n# 另一个注释\n",
            encoding="utf-8",
        )
        tb = Termbase()
        tb.load_from_file(str(dict_file))
        # Only the single real entry survives.
        assert len(tb) == 1
        assert "知识图谱" in tb

    def test_load_from_nonexistent_file_raises(self):
        """A missing file raises FileNotFoundError."""
        tb = Termbase()
        # pytest.raises fails the test when nothing (or a different exception)
        # is raised, replacing the hand-written try / raise AssertionError / except.
        with pytest.raises(FileNotFoundError):
            tb.load_from_file("/nonexistent/path/terms.txt")
# ---------------------------------------------------------------------------
# jieba 分词集成测试 — 验证术语表对分词的影响
# ---------------------------------------------------------------------------
class TestTermbaseTokenization:
    """Tests for how termbase entries influence jieba segmentation."""

    def test_tokenize_without_termbase(self):
        """With an empty termbase, tokenize still returns a non-empty list.

        Domain terms may or may not be split by jieba's default dictionary, so
        only the shape of the result is checked.
        """
        # Drop any custom words so jieba is back to its default dictionary.
        for leftover in ("知识图谱", "向量数据库"):
            jieba.del_word(leftover)
        result = Termbase().tokenize("知识图谱是向量数据库的基础")
        assert isinstance(result, list)
        assert len(result) > 0

    def test_tokenize_with_termbase(self):
        """Added domain terms come back as single tokens."""
        domain_terms = ("知识图谱", "向量数据库")
        # Clear custom words that may already be registered with jieba.
        for word in domain_terms:
            jieba.del_word(word)
        termbase = Termbase()
        for word in domain_terms:
            termbase.add_term(word)
        result = termbase.tokenize("知识图谱是向量数据库的基础")
        # Each term must appear whole in the token list.
        for word in domain_terms:
            assert word in result

    def test_termbase_improves_tokenization(self):
        """A multi-character domain term is kept whole once it is in the termbase."""
        # Start from a state where the term is not a custom jieba word.
        jieba.del_word("检索增强生成")
        sentence = "检索增强生成是RAG的核心技术"
        enhanced = Termbase()
        enhanced.add_term("检索增强生成")
        assert "检索增强生成" in enhanced.tokenize(sentence)

    def test_tokenize_empty_string(self):
        """An empty string tokenizes to an empty list."""
        assert Termbase().tokenize("") == []

    def test_tokenize_english(self):
        """English text is split into its words."""
        result = Termbase().tokenize("hello world")
        for word in ("hello", "world"):
            assert word in result

    def test_remove_term_restores_default_tokenization(self):
        """After remove_term the term is no longer part of the termbase.

        Only Termbase membership is asserted after removal: jieba may still
        cache the deleted word, so the token list is not re-checked.
        """
        custom_term = "测试术语XYZ"
        termbase = Termbase()
        termbase.add_term(custom_term)
        # While registered, the term is emitted as one token.
        assert custom_term in termbase.tokenize("测试术语XYZ很重要")
        termbase.remove_term(custom_term)
        assert custom_term not in termbase
# ---------------------------------------------------------------------------
# 检索召回率提升模拟测试
# ---------------------------------------------------------------------------
class TestTermbaseRetrievalImprovement:
    """Simulated checks that a termbase improves retrieval-style matching."""

    def test_termbase_improves_keyword_matching(self):
        """Query and document share the whole term as a token.

        Simulated scenario: the user searches for a domain term that also
        occurs in a document. Without a termbase jieba may split the term into
        two shorter words, which lowers match precision; with the termbase the
        term is matched as a single token.
        """
        # Make sure the term is not already registered as a custom word.
        jieba.del_word("知识图谱")
        termbase = Termbase()
        termbase.add_term("知识图谱")
        query_tokens = set(termbase.tokenize("知识图谱"))
        doc_tokens = set(termbase.tokenize("知识图谱是人工智能的重要分支"))
        # The key check: the whole term shows up on both sides ...
        assert "知识图谱" in query_tokens
        assert "知识图谱" in doc_tokens
        # ... and therefore in their overlap.
        assert "知识图谱" in query_tokens.intersection(doc_tokens)

    def test_multiple_terms_improve_coverage(self):
        """Several domain terms loaded together are all kept whole."""
        domain_terms = ("知识图谱", "向量数据库", "嵌入模型")
        # Clear custom words that may already be registered with jieba.
        for word in domain_terms:
            jieba.del_word(word)
        termbase = Termbase()
        termbase.load_from_list(list(domain_terms))
        result = termbase.tokenize("知识图谱通常使用向量数据库和嵌入模型构建")
        for word in domain_terms:
            assert word in result