"""U5 测试 — 术语表管理 + jieba 自定义词典。 测试场景: 1. add_term 添加术语后 jieba 正确分词 2. load_from_list 批量加载术语 3. load_from_file 从 jieba 词典文件加载 4. remove_term 删除术语后 jieba 恢复默认分词 5. list_terms 列出所有术语 6. tokenize 使用自定义词典分词 7. 术语表增强后检索召回率提升(模拟场景) 8. 领域术语被正确分词(如"知识图谱"、"向量数据库") """ from __future__ import annotations from pathlib import Path import pytest # jieba 是可选依赖(pyproject.toml 主依赖),但测试环境可能未安装。 # importorskip 确保收集阶段不中断,符合 project_rules.md 的 pre-commit 门禁。 pytest.importorskip("jieba") import jieba # noqa: E402 — 必须在 importorskip 之后 from agentkit.rag_platform.termbase import TermEntry, Termbase # --------------------------------------------------------------------------- # TermEntry 模型测试 # --------------------------------------------------------------------------- class TestTermEntry: """TermEntry 模型测试。""" def test_defaults(self): """默认值正确。""" entry = TermEntry(term="知识图谱") assert entry.term == "知识图谱" assert entry.frequency is None assert entry.pos is None def test_with_all_fields(self): """所有字段赋值正确。""" entry = TermEntry(term="向量数据库", frequency=100, pos="n") assert entry.term == "向量数据库" assert entry.frequency == 100 assert entry.pos == "n" # --------------------------------------------------------------------------- # Termbase 基础测试 # --------------------------------------------------------------------------- class TestTermbaseBasic: """Termbase 基础功能测试。""" def test_empty_termbase(self): """空术语表长度为 0。""" tb = Termbase() assert len(tb) == 0 assert tb.list_terms() == [] assert "知识图谱" not in tb def test_add_term(self): """add_term 添加术语到字典。""" tb = Termbase() tb.add_term("知识图谱") assert len(tb) == 1 assert "知识图谱" in tb terms = tb.list_terms() assert len(terms) == 1 assert terms[0].term == "知识图谱" def test_add_term_with_freq_and_pos(self): """add_term 带词频和词性。""" tb = Termbase() tb.add_term("向量数据库", frequency=100, pos="n") terms = tb.list_terms() assert terms[0].term == "向量数据库" assert terms[0].frequency == 100 assert terms[0].pos == "n" def test_add_term_strips_whitespace(self): """add_term 去除首尾空白。""" tb = Termbase() tb.add_term(" 知识图谱 ") assert "知识图谱" in tb terms = tb.list_terms() assert terms[0].term == "知识图谱" def test_add_empty_term_ignored(self): """add_term 忽略空字符串。""" tb = Termbase() tb.add_term("") tb.add_term(" ") assert len(tb) == 0 def test_add_duplicate_term_overwrites(self): """重复添加同一术语覆盖原条目。""" tb = Termbase() tb.add_term("知识图谱", frequency=50) tb.add_term("知识图谱", frequency=100, pos="n") assert len(tb) == 1 terms = tb.list_terms() assert terms[0].frequency == 100 assert terms[0].pos == "n" def test_remove_term(self): """remove_term 删除术语。""" tb = Termbase() tb.add_term("知识图谱") assert len(tb) == 1 tb.remove_term("知识图谱") assert len(tb) == 0 assert "知识图谱" not in tb def test_remove_nonexistent_term_no_error(self): """删除不存在的术语不报错。""" tb = Termbase() tb.remove_term("不存在的术语") # 不应抛异常 assert len(tb) == 0 class TestTermbaseLoadFromList: """load_from_list 测试。""" def test_load_from_list(self): """从字符串列表加载术语。""" tb = Termbase() tb.load_from_list(["知识图谱", "向量数据库", "RAG"]) assert len(tb) == 3 assert "知识图谱" in tb assert "向量数据库" in tb assert "RAG" in tb def test_load_from_empty_list(self): """空列表不添加任何术语。""" tb = Termbase() tb.load_from_list([]) assert len(tb) == 0 class TestTermbaseLoadFromFile: """load_from_file 测试。""" def test_load_from_file(self, tmp_path: Path): """从 jieba 词典文件加载术语。""" dict_file = tmp_path / "terms.txt" dict_file.write_text( "知识图谱 100 n\n向量数据库 100 n\nRAG 50\n", encoding="utf-8", ) tb = Termbase() tb.load_from_file(str(dict_file)) assert len(tb) == 3 assert "知识图谱" in tb assert "向量数据库" in tb assert "RAG" in tb # 验证词频和词性解析 terms = {t.term: t for t in tb.list_terms()} assert terms["知识图谱"].frequency == 100 assert terms["知识图谱"].pos == "n" assert terms["RAG"].frequency == 50 assert terms["RAG"].pos is None def test_load_from_file_skips_comments_and_empty(self, tmp_path: Path): """词典文件中的注释行和空行被跳过。""" dict_file = tmp_path / "terms.txt" dict_file.write_text( "# 这是注释\n\n知识图谱 100 n\n\n# 另一个注释\n", encoding="utf-8", ) tb = Termbase() tb.load_from_file(str(dict_file)) assert len(tb) == 1 assert "知识图谱" in tb def test_load_from_nonexistent_file_raises(self): """文件不存在时抛 FileNotFoundError。""" tb = Termbase() try: tb.load_from_file("/nonexistent/path/terms.txt") raise AssertionError("Expected FileNotFoundError") except FileNotFoundError: pass # --------------------------------------------------------------------------- # jieba 分词集成测试 — 验证术语表对分词的影响 # --------------------------------------------------------------------------- class TestTermbaseTokenization: """术语表对 jieba 分词的影响测试。""" def test_tokenize_without_termbase(self): """无术语表时 jieba 默认分词(领域术语可能被错误切分)。""" # 重置 jieba 词典到默认状态 jieba.del_word("知识图谱") jieba.del_word("向量数据库") tb = Termbase() tokens = tb.tokenize("知识图谱是向量数据库的基础") # 无术语表时,"知识图谱" 可能被切分为 "知识" + "图谱" # 注意:jieba 默认词典可能已包含部分常见词,这里只验证分词返回列表 assert isinstance(tokens, list) assert len(tokens) > 0 def test_tokenize_with_termbase(self): """添加术语表后,领域术语被正确识别为单个 token。""" # 先清除可能存在的自定义词 jieba.del_word("知识图谱") jieba.del_word("向量数据库") tb = Termbase() tb.add_term("知识图谱") tb.add_term("向量数据库") tokens = tb.tokenize("知识图谱是向量数据库的基础") # 添加术语后,"知识图谱" 和 "向量数据库" 应作为整体 token 出现 assert "知识图谱" in tokens assert "向量数据库" in tokens def test_termbase_improves_tokenization(self): """术语表增强后分词更准确 — 验证领域术语作为整体出现。""" # 测试前先清除 jieba.del_word("检索增强生成") text = "检索增强生成是RAG的核心技术" # 添加术语表 tb_after = Termbase() tb_after.add_term("检索增强生成") tokens_after = tb_after.tokenize(text) # 添加术语后,"检索增强生成" 应作为整体出现 assert "检索增强生成" in tokens_after def test_tokenize_empty_string(self): """空字符串返回空列表。""" tb = Termbase() assert tb.tokenize("") == [] def test_tokenize_english(self): """英文文本正常分词。""" tb = Termbase() tokens = tb.tokenize("hello world") assert "hello" in tokens assert "world" in tokens def test_remove_term_restores_default_tokenization(self): """删除术语后 jieba 恢复默认分词(术语不再作为整体)。""" # 添加术语 tb = Termbase() tb.add_term("测试术语XYZ") tokens_with = tb.tokenize("测试术语XYZ很重要") assert "测试术语XYZ" in tokens_with # 删除术语 tb.remove_term("测试术语XYZ") # 删除后,"测试术语XYZ" 不再作为整体(可能被切分) # 注意:jieba 删除词后可能仍缓存,但 del_word 会从词典移除 # 这里验证术语已从 Termbase 字典中删除 assert "测试术语XYZ" not in tb # --------------------------------------------------------------------------- # 检索召回率提升模拟测试 # --------------------------------------------------------------------------- class TestTermbaseRetrievalImprovement: """术语表增强后检索召回率提升的模拟测试。""" def test_termbase_improves_keyword_matching(self): """术语表增强后,关键词匹配更准确。 模拟场景:用户查询"知识图谱",文档中包含"知识图谱"。 无术语表时 jieba 可能将查询切分为"知识"+"图谱", 导致匹配精度下降;有术语表时整体匹配。 """ # 清除可能的自定义词 jieba.del_word("知识图谱") query = "知识图谱" doc = "知识图谱是人工智能的重要分支" # 有术语表 — "知识图谱" 作为整体 tb_with = Termbase() tb_with.add_term("知识图谱") query_tokens_with = set(tb_with.tokenize(query)) doc_tokens_with = set(tb_with.tokenize(doc)) # 有术语表时,查询和文档共享 "知识图谱" token # 无术语表时,可能共享 "知识" 和 "图谱"(如果被切分) # 关键验证:有术语表时 "知识图谱" 在两边都出现 assert "知识图谱" in query_tokens_with assert "知识图谱" in doc_tokens_with # 交集应包含 "知识图谱" intersection_with = query_tokens_with & doc_tokens_with assert "知识图谱" in intersection_with def test_multiple_terms_improve_coverage(self): """多个领域术语同时增强分词。""" # 清除可能的自定义词 for term in ["知识图谱", "向量数据库", "嵌入模型"]: jieba.del_word(term) tb = Termbase() tb.load_from_list(["知识图谱", "向量数据库", "嵌入模型"]) text = "知识图谱通常使用向量数据库和嵌入模型构建" tokens = tb.tokenize(text) # 所有领域术语都应作为整体 token 出现 assert "知识图谱" in tokens assert "向量数据库" in tokens assert "嵌入模型" in tokens