fischer-agentkit/tests/unit/memory/test_document_loader.py

228 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""DocumentLoader 单元测试 - 多格式文档解析器"""
import pytest
from agentkit.memory.document_loader import Document, DocumentLoader, _detect_format
class TestDetectFormat:
    """Format-detection tests for ``_detect_format`` (file name -> format label)."""

    def test_pdf_format(self):
        detected = _detect_format("report.pdf")
        assert detected == "pdf"

    def test_docx_format(self):
        # Both Word extensions are expected to yield the same "docx" label.
        for name in ("document.docx", "document.doc"):
            assert _detect_format(name) == "docx"

    def test_markdown_format(self):
        for name in ("readme.md", "notes.markdown"):
            assert _detect_format(name) == "markdown"

    def test_html_format(self):
        for name in ("page.html", "page.htm"):
            assert _detect_format(name) == "html"

    def test_text_format(self):
        for name in ("data.txt", "data.csv", "data.json"):
            assert _detect_format(name) == "text"

    def test_unknown_format_falls_back_to_text(self):
        # An unrecognised extension is expected to be labelled as plain text.
        detected = _detect_format("data.xyz")
        assert detected == "text"
class TestDocument:
    """Tests for the ``Document`` data class."""

    def test_default_metadata(self):
        # A document built without metadata should expose these default entries.
        doc = Document(doc_id="1", title="Test", content="Hello")
        expected_defaults = (("source", ""), ("format", "unknown"), ("page_count", 0))
        for key, value in expected_defaults:
            assert doc.metadata[key] == value
        assert "created_at" in doc.metadata

    def test_custom_metadata(self):
        # Metadata supplied by the caller must be readable back unchanged.
        doc = Document(
            doc_id="1",
            title="Test",
            content="Hello",
            metadata={"source": "test.pdf", "format": "pdf", "page_count": 5},
        )
        metadata = doc.metadata
        assert metadata["source"] == "test.pdf"
        assert metadata["format"] == "pdf"
        assert metadata["page_count"] == 5

    def test_to_dict(self):
        # ``to_dict`` should expose the top-level fields and the nested metadata.
        doc = Document(doc_id="1", title="Test", content="Hello", metadata={"format": "text"})
        serialized = doc.to_dict()
        for key, value in (("doc_id", "1"), ("title", "Test"), ("content", "Hello")):
            assert serialized[key] == value
        assert serialized["metadata"]["format"] == "text"
class TestDocumentLoaderText:
    """Plain-text loading tests."""

    def test_load_text_bytes(self):
        # Load from raw bytes; the title is expected to be the file name without extension.
        loader = DocumentLoader()
        payload = "Hello, world!\nThis is a test document.".encode("utf-8")
        doc = loader.load_bytes(payload, "test.txt")

        assert doc.title == "test"
        assert doc.content == "Hello, world!\nThis is a test document."

        metadata = doc.metadata
        assert metadata["format"] == "text"
        assert metadata["source"] == "test.txt"
        assert metadata["parser"] == "text"
        assert doc.doc_id  # must be non-empty (the original note calls it a UUID)

    def test_load_text_file(self, tmp_path):
        # Load from a real file created inside pytest's temporary directory.
        loader = DocumentLoader()
        sample_path = tmp_path / "sample.txt"
        sample_path.write_text("Sample text content", encoding="utf-8")

        doc = loader.load(sample_path)

        assert doc.content == "Sample text content"
        assert doc.metadata["format"] == "text"

    def test_load_nonexistent_file(self):
        # The loader is built outside the ``raises`` block so that only ``load`` is under test.
        loader = DocumentLoader()
        missing_path = "/nonexistent/path/file.txt"
        with pytest.raises(FileNotFoundError):
            loader.load(missing_path)
class TestDocumentLoaderMarkdown:
    """Markdown loading tests."""

    def test_load_markdown_bytes(self):
        # The first-level heading is expected to end up in metadata["title"].
        # The literal is kept flush-left so its content carries no indentation.
        markdown_source = """# Project Title
## Introduction
This is the introduction section.
## Details
Some details here.
"""
        loader = DocumentLoader()
        doc = loader.load_bytes(markdown_source.encode("utf-8"), "readme.md")

        assert doc.metadata["format"] == "markdown"
        assert doc.metadata["title"] == "Project Title"
        for section in ("Introduction", "Details"):
            assert section in doc.content

    def test_markdown_without_title(self):
        # Markdown without any heading should come back with its content unchanged.
        loader = DocumentLoader()
        raw = "Just some text without a heading.".encode("utf-8")
        doc = loader.load_bytes(raw, "notes.md")

        assert doc.metadata["format"] == "markdown"
        assert doc.content == "Just some text without a heading."
class TestDocumentLoaderHTML:
    """HTML loading tests."""

    def test_load_html_with_beautifulsoup(self):
        """Exercise BeautifulSoup-based parsing (when it is available)."""
        loader = DocumentLoader()
        # The literal is kept flush-left so its content carries no indentation.
        page_source = """<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<script>var x = 1;</script>
<style>.cls { color: red; }</style>
<h1>Hello</h1>
<p>This is a paragraph.</p>
</body>
</html>"""
        doc = loader.load_bytes(page_source.encode("utf-8"), "page.html")
        assert doc.metadata["format"] == "html"

        if doc.metadata.get("parser") != "beautifulsoup":
            # Plain-text fallback: the content may still contain HTML tags,
            # so only require that something was loaded.
            assert len(doc.content) > 0
            return

        # With BeautifulSoup, script/style bodies should be stripped from the content.
        assert "Test Page" in doc.metadata.get("title", "") or "Hello" in doc.content
        assert "var x" not in doc.content
        assert ".cls" not in doc.content
        assert "Hello" in doc.content

    def test_load_html_fallback_to_text(self):
        """HTML can still be loaded as text even without BeautifulSoup."""
        loader = DocumentLoader()
        raw = "<html><body>Simple content</body></html>".encode("utf-8")
        doc = loader.load_bytes(raw, "page.html")

        assert doc.metadata["format"] == "html"
        assert len(doc.content) > 0
class TestDocumentLoaderPDF:
    """PDF loading tests."""

    def test_load_pdf_without_parser(self):
        """Fall back to text when no PDF parser is available."""
        # Not a real PDF: only a PDF-looking prefix, used to simulate the
        # fallback path taken after a parse failure.
        fake_pdf = b"%PDF-1.4 fake pdf content"
        loader = DocumentLoader()
        doc = loader.load_bytes(fake_pdf, "report.pdf")

        assert doc.metadata["format"] == "pdf"
        # Even when parsing fails a document object should be returned
        # (its content may be empty or garbled).
        assert isinstance(doc, Document)
class TestDocumentLoaderDocx:
    """Word document loading tests."""

    def test_load_docx_without_parser(self):
        """Fall back to text when python-docx is not available."""
        # Not a real .docx: only a ZIP-style prefix followed by filler bytes.
        fake_docx = b"PK\x03\x04 fake docx content"
        loader = DocumentLoader()
        doc = loader.load_bytes(fake_docx, "document.docx")

        assert doc.metadata["format"] == "docx"
        assert isinstance(doc, Document)
class TestDocumentLoaderEdgeCases:
    """Edge-case tests for ``DocumentLoader.load_bytes``."""

    def test_empty_content(self):
        # Zero bytes should produce an empty document, not an error.
        loader = DocumentLoader()
        doc = loader.load_bytes(b"", "empty.txt")
        assert doc.content == ""
        assert doc.metadata["format"] == "text"

    def test_unicode_content(self):
        # Multi-byte UTF-8 text in three scripts must survive loading.
        loader = DocumentLoader()
        content = "中文内容测试\n日本語テスト\n한국어 테스트".encode("utf-8")
        doc = loader.load_bytes(content, "unicode.txt")
        assert "中文内容测试" in doc.content
        assert "日本語テスト" in doc.content
        # The Korean line is part of the input, so verify it as well.
        assert "한국어 테스트" in doc.content

    def test_large_content(self):
        # About 1 MB of ASCII text: the loaded content must not be truncated.
        loader = DocumentLoader()
        content = "A" * 1_000_000
        doc = loader.load_bytes(content.encode("utf-8"), "large.txt")
        assert len(doc.content) == 1_000_000

    def test_filename_with_spaces(self):
        # Spaces in the file name must be preserved in the derived title.
        loader = DocumentLoader()
        content = "Test content".encode("utf-8")
        doc = loader.load_bytes(content, "my document.txt")
        assert doc.title == "my document"

    def test_filename_with_path(self):
        # Directory components in the name must not disturb format detection.
        loader = DocumentLoader()
        content = "Test content".encode("utf-8")
        doc = loader.load_bytes(content, "reports/2024/summary.md")
        assert doc.metadata["format"] == "markdown"