"""U3+U7 测试 — 文档处理管道。 测试场景: 1. parse + segment 管道(mock 文件 I/O) 2. preview 返回 chunk 列表 3. vectorize 调用 embed model + vector store 4. 失败时设置 error 状态 """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock, patch import pytest from agentkit.rag_platform.document_processor import DocumentProcessor from agentkit.rag_platform.models import DocumentStatus from agentkit.rag_platform.preview import PreviewResult, generate_preview class TestParseAndSegment: """parse + segment 管道测试。""" def test_parse_extracts_text(self, tmp_path): """parse 从文件提取文本。""" # 创建测试文件 file_path = tmp_path / "test.txt" file_path.write_text("Hello, world!\nThis is a test document.", encoding="utf-8") processor = DocumentProcessor() text = processor.parse(str(file_path), "txt") assert "Hello, world!" in text assert "test document" in text def test_parse_applies_sanitization(self, tmp_path): """parse 对文本格式应用内容净化。""" file_path = tmp_path / "test.md" file_path.write_text( "Hello world", encoding="utf-8", ) processor = DocumentProcessor() text = processor.parse(str(file_path), "md") assert "