# geo/backend/app/services/knowledge/uploader.py
#
# 96 lines
# 2.7 KiB
# Python

"""文档上传服务"""
import hashlib
import os
from dataclasses import dataclass
from typing import Optional
import shortuuid
from app.services.knowledge.parsers import ParserFactory, ParsedDocument
@dataclass
class UploadResult:
    """Outcome of one successful document upload.

    Instances are built by ``DocumentUploader.upload`` once the file has
    been validated, parsed and written to storage.
    """

    # Identifier generated for the stored document.
    document_id: str
    # Title reported by the parser.
    title: str
    # Text content extracted by the parser.
    content: str
    # Hex SHA-256 digest of the encoded ``content``.
    content_hash: str
    # Length of the raw uploaded payload, in bytes.
    file_size: int
    # Lower-cased file extension including the leading dot, e.g. ".pdf".
    file_type: str
    # Parser metadata plus "original_filename" and "stored_path".
    metadata: dict
class DocumentUploader:
    """Validate, parse and persist uploaded documents.

    Files are written to ``<storage_path>/<kb_id>/<document_id><ext>``.
    ``kb_id`` comes from the caller, so it is checked to make sure the
    resulting directory cannot escape ``storage_path``.
    """

    # Extensions (lower-case, with leading dot) accepted by ``upload``.
    SUPPORTED_FORMATS = {".pdf", ".docx", ".md", ".txt", ".html"}
    # Upper bound on the raw payload size, in MiB.
    MAX_SIZE_MB = 50

    def __init__(self, storage_path: str = "/data/documents"):
        """Remember the root directory under which documents are stored."""
        self.storage_path = storage_path

    async def upload(
        self,
        file_content: bytes,
        filename: str,
        kb_id: str,
    ) -> "UploadResult":
        """Upload and parse a document.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only its extension is used here.
            kb_id: Knowledge-base identifier, used as the sub-directory name.

        Returns:
            An ``UploadResult`` describing the parsed and stored document.

        Raises:
            ValueError: If the file has no extension, the extension is not
                supported, the payload exceeds ``MAX_SIZE_MB``, or ``kb_id``
                would place the file outside ``storage_path``.

        Note:
            The file is written with ordinary blocking I/O.
        """
        # 1. Validate the format.
        ext = self._get_extension(filename)
        if ext not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {ext}")

        # 2. Validate the size.
        size_mb = len(file_content) / (1024 * 1024)
        if size_mb > self.MAX_SIZE_MB:
            raise ValueError(f"File too large: {size_mb:.1f}MB > {self.MAX_SIZE_MB}MB")

        # 3. Validate the target directory up front, so a document that can
        #    never be stored is not parsed first.
        self._resolve_kb_dir(kb_id)

        # 4. Parse the document.
        parser = ParserFactory.create(ext)
        parsed = await parser.parse(file_content)

        # 5. Generate the id and the content hash.
        doc_id = shortuuid.uuid()
        content_hash = hashlib.sha256(parsed.content.encode()).hexdigest()

        # 6. Persist the original file.
        file_path = self._save_file(file_content, kb_id, doc_id, ext)

        return UploadResult(
            document_id=doc_id,
            title=parsed.title,
            content=parsed.content,
            content_hash=content_hash,
            file_size=len(file_content),
            file_type=ext,
            metadata={
                **parsed.metadata,
                "original_filename": filename,
                "stored_path": file_path,
            },
        )

    def _get_extension(self, filename: str) -> str:
        """Return the lower-cased extension of ``filename`` with its dot.

        Raises:
            ValueError: If ``filename`` contains no ``.`` at all.
        """
        if "." not in filename:
            raise ValueError("File has no extension")
        return "." + filename.rsplit(".", 1)[1].lower()

    def _resolve_kb_dir(self, kb_id: str) -> str:
        """Return the directory for ``kb_id``, rejecting paths that escape.

        The check is lexical (``os.path.abspath``): it catches ``..``
        components and absolute ``kb_id`` values, which ``os.path.join``
        would otherwise honour, without resolving symlinks.

        Raises:
            ValueError: If the directory would lie outside ``storage_path``.
        """
        dir_path = os.path.join(self.storage_path, kb_id)
        root = os.path.abspath(self.storage_path)
        target = os.path.abspath(dir_path)
        try:
            inside = os.path.commonpath([root, target]) == root
        except ValueError:
            # commonpath refuses paths on different drives (Windows); such a
            # target is by definition outside the storage root.
            inside = False
        if not inside:
            raise ValueError(f"Invalid knowledge base id: {kb_id!r}")
        return dir_path

    def _save_file(
        self,
        content: bytes,
        kb_id: str,
        doc_id: str,
        ext: str
    ) -> str:
        """Write ``content`` under the storage path and return the file path.

        Raises:
            ValueError: If ``kb_id`` would place the file outside
                ``storage_path``.
        """
        # Create the per-knowledge-base directory (after the escape check).
        dir_path = self._resolve_kb_dir(kb_id)
        os.makedirs(dir_path, exist_ok=True)

        # Write the file.
        file_path = os.path.join(dir_path, f"{doc_id}{ext}")
        with open(file_path, "wb") as f:
            f.write(content)
        return file_path