geo/backend/app/workers/platforms/search_engine.py

174 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
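General-purpose search module: fetches real content related to a keyword when
the AI platform adapters cannot work properly.
Uses DuckDuckGo HTML search (no API key required) and returns result summaries.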
"""
import logging
import re
from urllib.parse import quote
import httpx
logger = logging.getLogger(__name__)
async def search_wikipedia(keyword: str, max_chars: int = 2000) -> str:
    """Return a plain-text extract of the best-matching Chinese Wikipedia page.

    Two calls are made against the zh.wikipedia.org MediaWiki API: a
    full-text search for ``keyword``, then an ``extracts`` query for the title
    of the first hit.

    Args:
        keyword: Search phrase passed as ``srsearch``.
        max_chars: Upper bound applied by slicing the cleaned extract.

    Returns:
        The cleaned extract truncated to ``max_chars`` characters, or an empty
        string when the search has no hits or no page carries an extract.

    Raises:
        Errors from httpx (including those raised by ``raise_for_status()``)
        are not caught here and propagate to the caller.
    """
    api_url = "https://zh.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "GEO-Citation-Bot/1.0 (contact@example.com)",
    }

    # A single client serves both requests so the connection to the API host
    # is reused instead of being set up twice.
    async with httpx.AsyncClient(timeout=30) as client:
        # 1. Look up pages matching the keyword.
        search_resp = await client.get(
            api_url,
            headers=headers,
            params={
                "action": "query",
                "list": "search",
                "srsearch": keyword,
                "srlimit": 3,
                "format": "json",
                "origin": "*",
            },
        )
        search_resp.raise_for_status()
        hits = search_resp.json().get("query", {}).get("search", [])
        if not hits:
            return ""

        # 2. Fetch the plain-text summary of the first matching page.
        # NOTE(review): the TextExtracts docs describe exsentences as limited
        # to 1-10 -- confirm how the API treats the value 15.
        extract_resp = await client.get(
            api_url,
            headers=headers,
            params={
                "action": "query",
                "prop": "extracts",
                "titles": hits[0]["title"],
                "explaintext": True,
                "exsentences": 15,
                "format": "json",
                "origin": "*",
            },
        )
        extract_resp.raise_for_status()
        pages = extract_resp.json().get("query", {}).get("pages", {})

    for page in pages.values():
        extract = page.get("extract", "")
        if not extract:
            continue
        # Drop citation markers such as "[1]", then collapse whitespace.
        extract = re.sub(r'\[\d+\]', '', extract)
        extract = re.sub(r'\s+', ' ', extract).strip()
        return extract[:max_chars]
    return ""
async def search_duckduckgo(query: str, max_results: int = 5) -> str:
    """Search DuckDuckGo's HTML endpoint and return titles with snippets.

    The result page is parsed with regular expressions: first whole result
    blocks, then (if that yields nothing) standalone title and snippet
    elements paired by position. Any failure on the DuckDuckGo side -- HTTP
    error, a page without result markers, or nothing parseable -- triggers a
    fallback to ``search_wikipedia``.

    Args:
        query: Search phrase; it is URL-quoted into the ``q`` parameter.
        max_results: Maximum number of results to include. Negative values
            are treated as zero.

    Returns:
        ``"title\\nsnippet"`` entries joined by blank lines, or the Wikipedia
        extract when the fallback was used.

    Raises:
        RuntimeError: DuckDuckGo failed and Wikipedia returned no text.
        Errors raised by ``search_wikipedia`` itself propagate unchanged.
    """
    url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    }
    # A negative limit would otherwise slice from the end of the result list.
    limit = max(max_results, 0)
    try:
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            page_html = resp.text

        # Quick check that this is a result page rather than a home or
        # verification page.
        markers = ("web-result", "result__snippet", "result__title")
        if not any(marker in page_html for marker in markers):
            raise RuntimeError("DuckDuckGo 返回了非结果页面")

        # Primary strategy: match complete result blocks (title + snippet).
        result_blocks = re.findall(
            r'<div class="result[^"]*"[^>]*>.*?<h[^>]*class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h[^>]*>.*?<a[^>]*class="result__snippet"[^>]*>(.*?)</a>.*?</div>',
            page_html,
            re.DOTALL | re.IGNORECASE,
        )
        results = _format_result_pairs(result_blocks[:limit])

        # Fallback strategy: grab titles and snippets separately and pair them
        # by position.
        if not results:
            snippets = re.findall(
                r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>', page_html, re.DOTALL | re.IGNORECASE
            )
            titles = re.findall(
                r'<h[^>]*class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h[^>]*>',
                page_html,
                re.DOTALL | re.IGNORECASE,
            )
            results = _format_result_pairs(list(zip(titles, snippets))[:limit])

        if results:
            return "\n\n".join(results)
        raise RuntimeError("DuckDuckGo 未解析到结果")
    except Exception as e:
        # Deliberate catch-all: whatever went wrong with DuckDuckGo, try
        # Wikipedia before giving up.
        logger.warning("DuckDuckGo 搜索失败: %s,回退到 Wikipedia", e)
        wiki_text = await search_wikipedia(query, max_chars=2000)
        if wiki_text:
            return wiki_text
        raise RuntimeError(f"所有搜索源均失败: {e}") from e


def _format_result_pairs(pairs: list[tuple[str, str]]) -> list[str]:
    """Turn raw (title, snippet) HTML pairs into ``"title\\nsnippet"`` strings.

    Pairs whose title and snippet are both empty after HTML stripping are
    dropped.
    """
    formatted: list[str] = []
    for title_raw, snippet_raw in pairs:
        title = _strip_html(title_raw)
        snippet = _strip_html(snippet_raw)
        if title or snippet:
            formatted.append(f"{title}\n{snippet}")
    return formatted
def _strip_html(raw: str) -> str:
"""去除 HTML 标签并将实体转义还原为可读文本。"""
# 先替换常见 HTML 实体
raw = raw.replace("&nbsp;", " ")
raw = raw.replace("&quot;", '"')
raw = raw.replace("&amp;", "&")
raw = raw.replace("&lt;", "<")
raw = raw.replace("&gt;", ">")
raw = raw.replace("&#39;", "'")
# 去除所有标签
text = re.sub(r"<[^>]+>", "", raw)
# 合并空白
text = re.sub(r"\s+", " ", text).strip()
return text
async def fetch_search_content(platform_name: str, keyword: str) -> str:
    """Fetch search content related to ``keyword`` on behalf of a platform.

    The keyword is passed straight to ``search_duckduckgo`` with a limit of
    five results, and whatever text that call produces is returned as-is.
    ``platform_name`` is used only to tag the log line.
    """
    logger.info(f"[{platform_name}] 搜索查询: {keyword}")
    return await search_duckduckgo(keyword, max_results=5)