"""Document upload service."""
|
|
import hashlib
|
|
import os
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
import shortuuid
|
|
|
|
from app.services.knowledge.parsers import ParserFactory, ParsedDocument
|
|
|
|
@dataclass
class UploadResult:
    """Outcome of a document upload.

    The per-field notes describe how ``DocumentUploader.upload`` fills the
    record; other producers may construct it differently.
    """

    # Identifier assigned to the stored document.
    document_id: str
    # Title reported by the parser.
    title: str
    # Text content extracted by the parser.
    content: str
    # Hex SHA-256 digest of ``content`` (encoded with the default UTF-8).
    content_hash: str
    # Size of the raw uploaded payload, in bytes.
    file_size: int
    # Lower-cased file extension including the leading dot, e.g. ".pdf".
    file_type: str
    # Parser metadata plus the "original_filename" and "stored_path" entries.
    metadata: dict
|
|
|
|
class DocumentUploader:
    """Validate, parse, and store uploaded documents.

    Attributes:
        SUPPORTED_FORMATS: Accepted file extensions, lower-case with a
            leading dot.
        MAX_SIZE_MB: Upper bound on upload size; compared against
            ``len(file_content) / (1024 * 1024)``.
        storage_path: Root directory for stored files. Each ``kb_id`` gets
            its own sub-directory beneath it.
    """

    SUPPORTED_FORMATS = {".pdf", ".docx", ".md", ".txt", ".html"}
    MAX_SIZE_MB = 50

    def __init__(self, storage_path: str = "/data/documents"):
        """Remember the root directory under which uploads are written."""
        self.storage_path = storage_path

    async def upload(
        self,
        file_content: bytes,
        filename: str,
        kb_id: str,
    ) -> "UploadResult":
        """Validate, parse, and persist one uploaded document.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Client-supplied file name; only its extension is used
                for validation and parser selection.
            kb_id: Name of the sub-directory of ``storage_path`` that
                receives the file (presumably the knowledge-base id).

        Returns:
            An ``UploadResult`` carrying the generated document id, the
            parsed title/content, the SHA-256 of the parsed content, the raw
            size in bytes, the extension, and the parser metadata extended
            with ``original_filename`` and ``stored_path``.

        Raises:
            ValueError: If the file has no extension, the extension is not
                supported, the payload exceeds ``MAX_SIZE_MB``, or ``kb_id``
                would place the file outside ``storage_path``.
        """
        # 1. Validate the format.
        ext = self._get_extension(filename)
        if ext not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {ext}")

        # 2. Validate the size.
        size_mb = len(file_content) / (1024 * 1024)
        if size_mb > self.MAX_SIZE_MB:
            raise ValueError(f"File too large: {size_mb:.1f}MB > {self.MAX_SIZE_MB}MB")

        # 3. Parse the document. Parsing happens before anything is written,
        #    so a payload the parser rejects is never stored.
        parser = ParserFactory.create(ext)
        parsed = await parser.parse(file_content)

        # 4. Generate the id and the content hash (hash covers the parsed
        #    text, not the raw bytes).
        doc_id = shortuuid.uuid()
        content_hash = hashlib.sha256(parsed.content.encode()).hexdigest()

        # 5. Persist the raw bytes.
        # NOTE(review): _save_file does blocking disk I/O on the event loop;
        # consider offloading it if large uploads are common.
        file_path = self._save_file(file_content, kb_id, doc_id, ext)

        return UploadResult(
            document_id=doc_id,
            title=parsed.title,
            content=parsed.content,
            content_hash=content_hash,
            file_size=len(file_content),
            file_type=ext,
            metadata={
                **parsed.metadata,
                "original_filename": filename,
                "stored_path": file_path,
            },
        )

    def _get_extension(self, filename: str) -> str:
        """Return the lower-cased extension of *filename*, with leading dot.

        Any directory part (``/`` or ``\\`` separated) is dropped first so a
        dot inside a folder name is never mistaken for the extension
        separator.

        Raises:
            ValueError: If the base name contains no dot or ends with one.
        """
        base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
        _, dot, suffix = base_name.rpartition(".")
        if not dot or not suffix:
            raise ValueError("File has no extension")
        return "." + suffix.lower()

    def _save_file(
        self,
        content: bytes,
        kb_id: str,
        doc_id: str,
        ext: str,
    ) -> str:
        """Write *content* to ``<storage_path>/<kb_id>/<doc_id><ext>``.

        Args:
            content: Raw bytes to store.
            kb_id: Sub-directory of ``storage_path`` to write into.
            doc_id: Base name of the stored file.
            ext: Extension appended to ``doc_id`` (including the dot).

        Returns:
            The path of the written file, built by joining ``storage_path``,
            ``kb_id`` and the file name.

        Raises:
            ValueError: If ``kb_id`` resolves outside ``storage_path`` or the
                file name contains a path separator.
        """
        root = os.path.realpath(self.storage_path)
        dir_path = os.path.join(self.storage_path, kb_id)

        # kb_id comes from the caller: reject "../" tricks and absolute
        # paths *before* touching the filesystem.
        try:
            inside_root = os.path.commonpath([root, os.path.realpath(dir_path)]) == root
        except ValueError:
            # commonpath refuses paths it cannot compare (e.g. different
            # drives on Windows); such a target is certainly not inside root.
            inside_root = False
        if not inside_root:
            raise ValueError(f"Invalid kb_id: {kb_id!r}")

        file_name = f"{doc_id}{ext}"
        if os.path.basename(file_name) != file_name:
            raise ValueError(f"Invalid file name: {file_name!r}")

        # Create the directory.
        os.makedirs(dir_path, exist_ok=True)

        # Save the file.
        file_path = os.path.join(dir_path, file_name)
        with open(file_path, "wb") as f:
            f.write(content)

        return file_path