# Listing metadata (code-viewer residue): 228 lines, 7.5 KiB, Python
"""DocumentLoader unit tests - multi-format document parser."""
|
||
|
||
import pytest
|
||
|
||
from agentkit.memory.document_loader import Document, DocumentLoader, _detect_format
|
||
|
||
|
||
class TestDetectFormat:
    """Format-detection tests for ``_detect_format``."""

    def test_pdf_format(self):
        detected = _detect_format("report.pdf")
        assert detected == "pdf"

    def test_docx_format(self):
        # Both Word extensions are expected to be reported as "docx".
        for filename in ("document.docx", "document.doc"):
            assert _detect_format(filename) == "docx"

    def test_markdown_format(self):
        for filename in ("readme.md", "notes.markdown"):
            assert _detect_format(filename) == "markdown"

    def test_html_format(self):
        for filename in ("page.html", "page.htm"):
            assert _detect_format(filename) == "html"

    def test_text_format(self):
        for filename in ("data.txt", "data.csv", "data.json"):
            assert _detect_format(filename) == "text"

    def test_unknown_format_falls_back_to_text(self):
        # An unrecognised extension is expected to be reported as "text".
        detected = _detect_format("data.xyz")
        assert detected == "text"
|
||
|
||
|
||
class TestDocument:
    """Tests for the ``Document`` data class."""

    def test_default_metadata(self):
        """A document built without metadata should expose the default keys."""
        metadata = Document(doc_id="1", title="Test", content="Hello").metadata

        assert metadata["source"] == ""
        assert metadata["format"] == "unknown"
        assert metadata["page_count"] == 0
        assert "created_at" in metadata

    def test_custom_metadata(self):
        """Metadata passed to the constructor should be readable afterwards."""
        supplied = {"source": "test.pdf", "format": "pdf", "page_count": 5}
        document = Document(
            doc_id="1",
            title="Test",
            content="Hello",
            metadata=supplied,
        )

        # Expected values are spelled out separately from ``supplied`` so the
        # check does not merely compare a dict with itself.
        expectations = (
            ("source", "test.pdf"),
            ("format", "pdf"),
            ("page_count", 5),
        )
        for key, expected in expectations:
            assert document.metadata[key] == expected

    def test_to_dict(self):
        """``to_dict`` should expose id, title, content and nested metadata."""
        document = Document(
            doc_id="1",
            title="Test",
            content="Hello",
            metadata={"format": "text"},
        )
        serialized = document.to_dict()

        assert serialized["doc_id"] == "1"
        assert serialized["title"] == "Test"
        assert serialized["content"] == "Hello"
        assert serialized["metadata"]["format"] == "text"
|
||
|
||
|
||
class TestDocumentLoaderText:
    """Plain-text parsing tests."""

    def test_load_text_bytes(self):
        reader = DocumentLoader()
        text = "Hello, world!\nThis is a test document."
        document = reader.load_bytes(text.encode("utf-8"), "test.txt")

        assert document.title == "test"
        assert document.content == text

        metadata = document.metadata
        assert metadata["format"] == "text"
        assert metadata["source"] == "test.txt"
        assert metadata["parser"] == "text"
        assert document.doc_id  # non-empty UUID

    def test_load_text_file(self, tmp_path):
        reader = DocumentLoader()
        sample_path = tmp_path / "sample.txt"
        sample_path.write_text("Sample text content", encoding="utf-8")

        document = reader.load(sample_path)

        assert document.content == "Sample text content"
        assert document.metadata["format"] == "text"

    def test_load_nonexistent_file(self):
        reader = DocumentLoader()
        missing_path = "/nonexistent/path/file.txt"
        # Loading a path that does not exist is expected to raise.
        with pytest.raises(FileNotFoundError):
            reader.load(missing_path)
|
||
|
||
|
||
class TestDocumentLoaderMarkdown:
    """Markdown parsing tests."""

    def test_load_markdown_bytes(self):
        reader = DocumentLoader()
        markdown_source = """# Project Title

## Introduction

This is the introduction section.

## Details

Some details here.
"""
        document = reader.load_bytes(markdown_source.encode("utf-8"), "readme.md")

        metadata = document.metadata
        assert metadata["format"] == "markdown"
        # The first-level heading is expected to be surfaced as the title.
        assert metadata["title"] == "Project Title"
        for heading in ("Introduction", "Details"):
            assert heading in document.content

    def test_markdown_without_title(self):
        reader = DocumentLoader()
        plain = "Just some text without a heading."
        document = reader.load_bytes(plain.encode("utf-8"), "notes.md")

        assert document.metadata["format"] == "markdown"
        assert document.content == plain
|
||
|
||
|
||
class TestDocumentLoaderHTML:
    """HTML parsing tests."""

    def test_load_html_with_beautifulsoup(self):
        """Test BeautifulSoup parsing (if available)."""
        reader = DocumentLoader()
        markup = """<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<script>var x = 1;</script>
<style>.cls { color: red; }</style>
<h1>Hello</h1>
<p>This is a paragraph.</p>
</body>
</html>"""
        document = reader.load_bytes(markup.encode("utf-8"), "page.html")
        metadata = document.metadata

        assert metadata["format"] == "html"

        if metadata.get("parser") != "beautifulsoup":
            # Plain-text fallback (BeautifulSoup not used): the content may
            # still contain HTML tags, so only require that it is non-empty.
            assert len(document.content) > 0
            return

        # BeautifulSoup should have removed the script/style tags.
        assert "Test Page" in metadata.get("title", "") or "Hello" in document.content
        assert "var x" not in document.content
        assert ".cls" not in document.content
        assert "Hello" in document.content

    def test_load_html_fallback_to_text(self):
        """Even without BeautifulSoup, HTML can still be loaded as text."""
        reader = DocumentLoader()
        markup = "<html><body>Simple content</body></html>"
        document = reader.load_bytes(markup.encode("utf-8"), "page.html")

        assert document.metadata["format"] == "html"
        assert len(document.content) > 0
|
||
|
||
|
||
class TestDocumentLoaderPDF:
    """PDF parsing tests."""

    def test_load_pdf_without_parser(self):
        """Fall back to text when no PDF parser is available."""
        reader = DocumentLoader()
        # Not a real PDF body: meant to simulate the fallback taken after a
        # parse failure.
        fake_pdf = b"%PDF-1.4 fake pdf content"
        result = reader.load_bytes(fake_pdf, "report.pdf")

        assert result.metadata["format"] == "pdf"
        # Even when parsing fails a Document should still be returned (its
        # content may be empty or garbled).
        assert isinstance(result, Document)
|
||
|
||
|
||
class TestDocumentLoaderDocx:
    """Word document parsing tests."""

    def test_load_docx_without_parser(self):
        """Fall back to text when python-docx is not available."""
        reader = DocumentLoader()
        # Not a real docx archive, only bytes that start like one.
        fake_docx = b"PK\x03\x04 fake docx content"
        result = reader.load_bytes(fake_docx, "document.docx")

        assert result.metadata["format"] == "docx"
        assert isinstance(result, Document)
|
||
|
||
|
||
class TestDocumentLoaderEdgeCases:
    """Edge-case tests for ``DocumentLoader.load_bytes``."""

    def test_empty_content(self):
        """An empty byte payload should yield an empty text document."""
        loader = DocumentLoader()
        doc = loader.load_bytes(b"", "empty.txt")
        assert doc.content == ""
        assert doc.metadata["format"] == "text"

    def test_unicode_content(self):
        """Every UTF-8 encoded line should survive loading.

        The payload contains Chinese, Japanese and Korean lines; each one is
        checked so that no script is silently left unverified.
        """
        loader = DocumentLoader()
        lines = ("中文内容测试", "日本語テスト", "한국어 테스트")
        content = "\n".join(lines).encode("utf-8")
        doc = loader.load_bytes(content, "unicode.txt")
        for line in lines:
            assert line in doc.content

    def test_large_content(self):
        """A payload of one million characters should be loaded in full."""
        loader = DocumentLoader()
        content = "A" * 1_000_000  # one million ASCII characters
        doc = loader.load_bytes(content.encode("utf-8"), "large.txt")
        assert len(doc.content) == 1_000_000

    def test_filename_with_spaces(self):
        """Spaces in the filename should be kept in the derived title."""
        loader = DocumentLoader()
        content = "Test content".encode("utf-8")
        doc = loader.load_bytes(content, "my document.txt")
        assert doc.title == "my document"

    def test_filename_with_path(self):
        """Directory components must not disturb format detection."""
        loader = DocumentLoader()
        content = "Test content".encode("utf-8")
        doc = loader.load_bytes(content, "reports/2024/summary.md")
        assert doc.metadata["format"] == "markdown"
|