331 lines
11 KiB
Python
331 lines
11 KiB
Python
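"""U5 tests — termbase management + jieba custom dictionary.

Test scenarios:
1. add_term: jieba segments text correctly after a term is added
2. load_from_list: bulk-load terms from a list
3. load_from_file: load terms from a jieba dictionary file
4. remove_term: jieba falls back to default segmentation after removal
5. list_terms: list every registered term
6. tokenize: segment text using the custom dictionary
7. Retrieval recall improves once the termbase is applied (simulated scenario)
8. Domain terms are segmented correctly (e.g. "知识图谱", "向量数据库")
"""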
|
||
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
import jieba
|
||
|
||
from agentkit.rag_platform.termbase import TermEntry, Termbase
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# TermEntry model tests
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTermEntry:
    """Tests for the TermEntry model."""

    def test_defaults(self):
        """Frequency and pos are None when only the term is supplied."""
        bare = TermEntry(term="知识图谱")
        assert bare.term == "知识图谱"
        assert bare.frequency is None
        assert bare.pos is None

    def test_with_all_fields(self):
        """Every explicitly supplied field is stored as given."""
        populated = TermEntry(term="向量数据库", frequency=100, pos="n")
        observed = (populated.term, populated.frequency, populated.pos)
        assert observed == ("向量数据库", 100, "n")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Termbase basic tests
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTermbaseBasic:
    """Tests for the basic Termbase operations (add / remove / list / len / in)."""

    def test_empty_termbase(self):
        """A freshly built termbase has length 0 and lists no terms."""
        termbase = Termbase()
        assert len(termbase) == 0
        assert termbase.list_terms() == []
        assert "知识图谱" not in termbase

    def test_add_term(self):
        """add_term registers the term so it shows up in len, `in` and list_terms."""
        termbase = Termbase()
        termbase.add_term("知识图谱")

        assert len(termbase) == 1
        assert "知识图谱" in termbase
        listed = termbase.list_terms()
        assert len(listed) == 1
        assert listed[0].term == "知识图谱"

    def test_add_term_with_freq_and_pos(self):
        """add_term keeps the supplied frequency and part-of-speech tag."""
        termbase = Termbase()
        termbase.add_term("向量数据库", frequency=100, pos="n")

        first = termbase.list_terms()[0]
        assert first.term == "向量数据库"
        assert first.frequency == 100
        assert first.pos == "n"

    def test_add_term_strips_whitespace(self):
        """add_term trims leading and trailing whitespace from the term."""
        termbase = Termbase()
        termbase.add_term(" 知识图谱 ")

        assert "知识图谱" in termbase
        first = termbase.list_terms()[0]
        assert first.term == "知识图谱"

    def test_add_empty_term_ignored(self):
        """add_term ignores empty and whitespace-only strings."""
        termbase = Termbase()
        for blank in ("", " "):
            termbase.add_term(blank)

        assert len(termbase) == 0

    def test_add_duplicate_term_overwrites(self):
        """Adding the same term twice replaces the earlier entry."""
        termbase = Termbase()
        termbase.add_term("知识图谱", frequency=50)
        termbase.add_term("知识图谱", frequency=100, pos="n")

        assert len(termbase) == 1
        survivor = termbase.list_terms()[0]
        assert survivor.frequency == 100
        assert survivor.pos == "n"

    def test_remove_term(self):
        """remove_term drops a previously added term."""
        termbase = Termbase()
        termbase.add_term("知识图谱")
        assert len(termbase) == 1

        termbase.remove_term("知识图谱")
        assert len(termbase) == 0
        assert "知识图谱" not in termbase

    def test_remove_nonexistent_term_no_error(self):
        """Removing a term that was never added does not raise."""
        termbase = Termbase()
        termbase.remove_term("不存在的术语")  # must not raise
        assert len(termbase) == 0
|
||
|
||
|
||
class TestTermbaseLoadFromList:
    """Tests for Termbase.load_from_list."""

    def test_load_from_list(self):
        """Every string in the list ends up registered as a term."""
        termbase = Termbase()
        termbase.load_from_list(["知识图谱", "向量数据库", "RAG"])

        assert len(termbase) == 3
        for expected in ("知识图谱", "向量数据库", "RAG"):
            assert expected in termbase

    def test_load_from_empty_list(self):
        """Loading an empty list adds nothing."""
        termbase = Termbase()
        termbase.load_from_list([])
        assert len(termbase) == 0
|
||
|
||
|
||
class TestTermbaseLoadFromFile:
    """Tests for Termbase.load_from_file."""

    def test_load_from_file(self, tmp_path: Path):
        """Terms, frequencies and pos tags are read from a jieba-style dictionary file."""
        content = "知识图谱 100 n\n向量数据库 100 n\nRAG 50\n"
        dict_file = tmp_path / "terms.txt"
        dict_file.write_text(content, encoding="utf-8")

        termbase = Termbase()
        termbase.load_from_file(str(dict_file))

        assert len(termbase) == 3
        for expected in ("知识图谱", "向量数据库", "RAG"):
            assert expected in termbase

        # Check that frequency and part-of-speech columns were parsed.
        by_term = {entry.term: entry for entry in termbase.list_terms()}
        graph_entry = by_term["知识图谱"]
        assert graph_entry.frequency == 100
        assert graph_entry.pos == "n"
        rag_entry = by_term["RAG"]
        assert rag_entry.frequency == 50
        assert rag_entry.pos is None

    def test_load_from_file_skips_comments_and_empty(self, tmp_path: Path):
        """Comment lines and blank lines in the dictionary file are skipped."""
        content = "# 这是注释\n\n知识图谱 100 n\n\n# 另一个注释\n"
        dict_file = tmp_path / "terms.txt"
        dict_file.write_text(content, encoding="utf-8")

        termbase = Termbase()
        termbase.load_from_file(str(dict_file))

        assert len(termbase) == 1
        assert "知识图谱" in termbase

    def test_load_from_nonexistent_file_raises(self):
        """A missing dictionary file raises FileNotFoundError."""
        termbase = Termbase()
        try:
            termbase.load_from_file("/nonexistent/path/terms.txt")
        except FileNotFoundError:
            return
        # Reaching this point means the load silently succeeded.
        raise AssertionError("Expected FileNotFoundError")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# jieba tokenization integration tests — verify how the termbase affects segmentation
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTermbaseTokenization:
    """Tests for how the termbase influences jieba tokenization."""

    def test_tokenize_without_termbase(self):
        """With no terms registered, tokenize still returns a non-empty list."""
        # Drop any custom words left in jieba's dictionary by earlier tests.
        for leftover in ("知识图谱", "向量数据库"):
            jieba.del_word(leftover)

        termbase = Termbase()
        pieces = termbase.tokenize("知识图谱是向量数据库的基础")

        # Without the termbase, "知识图谱" may be split into "知识" + "图谱".
        # jieba's default dictionary may already contain some common words,
        # so only the shape of the result is checked here.
        assert isinstance(pieces, list)
        assert len(pieces) > 0

    def test_tokenize_with_termbase(self):
        """Registered domain terms come back from tokenize as single tokens."""
        # Clear any custom words that may already be present.
        for leftover in ("知识图谱", "向量数据库"):
            jieba.del_word(leftover)

        termbase = Termbase()
        termbase.add_term("知识图谱")
        termbase.add_term("向量数据库")

        pieces = termbase.tokenize("知识图谱是向量数据库的基础")

        # Both terms should appear as whole tokens once they are registered.
        assert "知识图谱" in pieces
        assert "向量数据库" in pieces

    def test_termbase_improves_tokenization(self):
        """A registered multi-word domain term is kept whole by tokenize."""
        # Clear the word before the test.
        jieba.del_word("检索增强生成")

        sentence = "检索增强生成是RAG的核心技术"

        # Register the term, then tokenize.
        termbase = Termbase()
        termbase.add_term("检索增强生成")
        pieces = termbase.tokenize(sentence)

        # The term should appear as one token.
        assert "检索增强生成" in pieces

    def test_tokenize_empty_string(self):
        """An empty string tokenizes to an empty list."""
        termbase = Termbase()
        assert termbase.tokenize("") == []

    def test_tokenize_english(self):
        """English text is tokenized into its words."""
        termbase = Termbase()
        pieces = termbase.tokenize("hello world")
        assert "hello" in pieces
        assert "world" in pieces

    def test_remove_term_restores_default_tokenization(self):
        """A removed term is no longer in the termbase.

        The term is first verified to tokenize as a whole; after removal this
        test only checks Termbase membership and does not re-tokenize.
        """
        # Register the term and confirm it is kept whole.
        termbase = Termbase()
        termbase.add_term("测试术语XYZ")
        pieces_with_term = termbase.tokenize("测试术语XYZ很重要")
        assert "测试术语XYZ" in pieces_with_term

        # Remove the term.
        termbase.remove_term("测试术语XYZ")
        # After removal the term is no longer expected to be kept whole (it
        # may be split). jieba may still cache after a deletion, so only the
        # removal from the Termbase itself is verified here.
        assert "测试术语XYZ" not in termbase
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Simulated retrieval-recall improvement tests
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTermbaseRetrievalImprovement:
    """Simulated tests for retrieval recall improving once the termbase is applied."""

    def test_termbase_improves_keyword_matching(self):
        """Keyword matching is more precise once the termbase is applied.

        Simulated scenario: the user queries "知识图谱" and the document
        contains "知识图谱". Without the termbase jieba may split the query
        into "知识" + "图谱", lowering match precision; with the termbase the
        term is matched as a whole.
        """
        # Clear any custom word that may already be present.
        jieba.del_word("知识图谱")

        user_query = "知识图谱"
        document = "知识图谱是人工智能的重要分支"

        # With the termbase, "知识图谱" is treated as a whole.
        termbase = Termbase()
        termbase.add_term("知识图谱")
        query_tokens = set(termbase.tokenize(user_query))
        doc_tokens = set(termbase.tokenize(document))

        # With the termbase, query and document share the "知识图谱" token;
        # without it they might share "知识" and "图谱" instead (if split).
        # Key check: the whole term shows up on both sides.
        assert "知识图谱" in query_tokens
        assert "知识图谱" in doc_tokens

        # The overlap must contain the whole term.
        shared = query_tokens.intersection(doc_tokens)
        assert "知识图谱" in shared

    def test_multiple_terms_improve_coverage(self):
        """Several domain terms registered together are all kept whole."""
        domain_terms = ("知识图谱", "向量数据库", "嵌入模型")

        # Clear any custom words that may already be present.
        for leftover in domain_terms:
            jieba.del_word(leftover)

        termbase = Termbase()
        termbase.load_from_list(list(domain_terms))

        sentence = "知识图谱通常使用向量数据库和嵌入模型构建"
        pieces = termbase.tokenize(sentence)

        # Every domain term should appear as a whole token.
        for expected in domain_terms:
            assert expected in pieces
|