# GEO知识库系统 - 技术实施指南 ## 快速开始 ### 1. 环境设置 ```bash # 后端依赖 cd backend pip install langchain langchain-community langchain-postgres pip install llama-index llama-index-readers-web llama-index-embeddings-huggingface pip install unstructured[pdf,docx] sentence-transformers pip install pgvector sqlalchemy asyncpg pip install tiktoken # Token计数 # 前端依赖(如需开发UI) cd frontend npm install recharts # 检索结果可视化 ``` ### 2. PostgreSQL pgvector扩展 ```sql -- 在GEO数据库中启用pgvector CREATE EXTENSION IF NOT EXISTS vector; -- 创建知识库表 CREATE TABLE knowledge_chunks ( id BIGSERIAL PRIMARY KEY, chunk_id VARCHAR(255) UNIQUE NOT NULL, content TEXT NOT NULL, embedding vector(1024), -- bge-m3维度 metadata JSONB, knowledge_base_type VARCHAR(50), -- 'industry' 或 'company' source_id VARCHAR(255), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); -- 创建HNSW索引(高性能搜索) CREATE INDEX ON knowledge_chunks USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); -- 创建全文搜索索引 CREATE INDEX knowledge_chunks_content_tsvector_idx ON knowledge_chunks USING GIN (to_tsvector('english', content)); ``` ### 3. 核心RAG类实现 ```python # backend/app/services/rag_service.py from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_postgres import PGVector from sentence_transformers import CrossEncoder import asyncpg class RAGService: def __init__(self): self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=512, chunk_overlap=50, separators=["\n\n\n", "\n\n", "\n", " ", ""] ) self.embeddings = HuggingFaceEmbeddings( model_name="BAAI/bge-m3", model_kwargs={"device": "cuda"} ) self.vector_store = PGVector( connection_string="postgresql+psycopg://...", embedding_function=self.embeddings, collection_name="knowledge_base" ) self.cross_encoder = CrossEncoder("cross-encoder/mmarco-MiniLMv2-L12-H384") async def chunk_document(self, content: str) -> list: """文档分块""" chunks = self.text_splitter.split_text(content) return chunks async def hybrid_search( self, query: str, top_k: int = 10, alpha: float = 0.5 ) -> list: """混合搜索:BM25 + 向量 + 重排""" # 1. 向量搜索 vector_results = await self.vector_search(query, top_k * 2) # 2. BM25搜索 bm25_results = await self.bm25_search(query, top_k * 2) # 3. 分数融合 hybrid_scores = self._fuse_scores( vector_results, bm25_results, alpha ) # 4. 重排 ranked = self.cross_encoder.rank( query, [doc['content'] for doc in hybrid_scores], top_k=top_k ) return ranked async def vector_search(self, query: str, top_k: int) -> list: """纯向量搜索""" query_embedding = self.embeddings.embed_query(query) results = await self.vector_store.asimilarity_search_with_score( query, k=top_k ) return results async def bm25_search(self, query: str, top_k: int) -> list: """BM25关键词搜索""" # 使用SQL全文搜索 async with await asyncpg.connect(...) as conn: results = await conn.fetch( """ SELECT id, content, ts_rank(to_tsvector('english', content), plainto_tsquery('english', $1)) as score FROM knowledge_chunks WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $1) ORDER BY score DESC LIMIT $2 """, query, top_k ) return results def _fuse_scores(self, vector_results, bm25_results, alpha): """融合两种搜索的分数""" # 实现细节... pass ``` ### 4. FastAPI集成 ```python # backend/app/api/knowledge_base.py from fastapi import APIRouter, UploadFile, File from app.services.rag_service import RAGService router = APIRouter(prefix="/api/knowledge_base", tags=["knowledge_base"]) rag_service = RAGService() @router.post("/upload") async def upload_document(file: UploadFile = File(...)): """上传知识库文档""" content = await file.read() # 文档处理 chunks = await rag_service.chunk_document(content.decode()) # 存储到向量库 for chunk in chunks: embedding = rag_service.embeddings.embed_query(chunk) # 存储到pgvector return {"chunks": len(chunks), "status": "success"} @router.post("/search") async def search( query: str, kb_type: str = "both", # 'industry', 'company', 'both' hybrid: bool = True, alpha: float = 0.5 ): """检索知识库""" if hybrid: results = await rag_service.hybrid_search(query, alpha=alpha) else: results = await rag_service.vector_search(query) return { "query": query, "results": results, "count": len(results) } ``` ### 5. 知识库管理UI(Next.js组件参考) ```typescript // frontend/components/knowledge_base/KnowledgeBaseManager.tsx import React, { useState } from 'react' import { Upload, Search } from 'lucide-react' export const KnowledgeBaseManager = () => { const [uploadProgress, setUploadProgress] = useState(0) const [searchQuery, setSearchQuery] = useState('') const [searchResults, setSearchResults] = useState([]) const handleFileUpload = async (file: File) => { const formData = new FormData() formData.append('file', file) const response = await fetch('/api/knowledge_base/upload', { method: 'POST', body: formData }) const result = await response.json() console.log(`上传成功: ${result.chunks}个文档块`) } const handleSearch = async () => { const response = await fetch('/api/knowledge_base/search', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ query: searchQuery, hybrid: true, alpha: 0.5 }) }) const result = await response.json() setSearchResults(result.results) } return (

知识库管理

拖拽文件上传或点击选择
setSearchQuery(e.target.value)} placeholder="搜索知识库..." className="w-full px-4 py-2 border rounded-lg" />
{searchResults.map((result, idx) => (

{result.title}

{result.content}

相似度: {result.score.toFixed(3)}

))}
) } ``` --- ## 性能调优检查清单 - [ ] pgvector HNSW索引已创建 - [ ] 全文搜索索引已创建 - [ ] Embedding模型缓存已配置 - [ ] Hybrid search alpha参数已根据业务调整 - [ ] Cross-Encoder模型已选择(性能vs精度权衡) - [ ] Context Window管理已实现(防止溢出) - [ ] 查询日志已记录(用于后续优化) --- ## 监控和可观测性 ```python # 添加到FastAPI中间件 from opentelemetry import trace, metrics from opentelemetry.exporter.jaeger.thrift import JaegerExporter tracer = trace.get_tracer(__name__) @router.post("/search") async def search(query: str): with tracer.start_as_current_span("kb_search") as span: span.set_attribute("query", query) # 执行搜索... span.set_attribute("results.count", len(results)) span.set_attribute("search.latency_ms", latency) return results ``` --- ## 测试脚本 ```python # tests/test_rag_service.py import pytest from app.services.rag_service import RAGService @pytest.fixture def rag_service(): return RAGService() @pytest.mark.asyncio async def test_hybrid_search(rag_service): """测试混合搜索""" results = await rag_service.hybrid_search( "GEO平台品牌优化", top_k=10, alpha=0.5 ) assert len(results) > 0 assert results[0]['score'] > 0.5 @pytest.mark.asyncio async def test_chunking(rag_service): """测试文档分块""" test_content = "这是一个很长的测试文档。" * 100 chunks = await rag_service.chunk_document(test_content) assert len(chunks) > 1 assert all(len(chunk) <= 2048 for chunk in chunks) ```