"""U3+U7 测试 — 文档处理管道。
测试场景:
1. parse + segment 管道(mock 文件 I/O)
2. preview 返回 chunk 列表
3. vectorize 调用 embed model + vector store
4. 失败时设置 error 状态
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.rag_platform.document_processor import DocumentProcessor
from agentkit.rag_platform.models import DocumentStatus
from agentkit.rag_platform.preview import PreviewResult, generate_preview
class TestParseAndSegment:
"""parse + segment 管道测试。"""
def test_parse_extracts_text(self, tmp_path):
    """Verify that ``parse`` returns the text written to a ``txt`` file."""
    # Write a two-line fixture file under the temporary directory.
    sample_file = tmp_path / "test.txt"
    sample_file.write_text("Hello, world!\nThis is a test document.", encoding="utf-8")

    extracted = DocumentProcessor().parse(str(sample_file), "txt")

    # A fragment from each line of the fixture must appear in the result.
    for fragment in ("Hello, world!", "test document"):
        assert fragment in extracted
def test_parse_applies_sanitization(self, tmp_path):
"""parse 对文本格式应用内容净化。"""
file_path = tmp_path / "test.md"
file_path.write_text(
"Hello world",
encoding="utf-8",
)
processor = DocumentProcessor()
text = processor.parse(str(file_path), "md")
assert "