174 lines
6.0 KiB
Python
174 lines
6.0 KiB
Python
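"""
General-purpose search module: fetches real content related to a keyword when
the AI-platform adapters cannot work normally.

Uses DuckDuckGo HTML search (no API key required) and returns result
summaries, falling back to the Wikipedia API when DuckDuckGo fails.
"""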
|
||
|
||
import logging
|
||
import re
|
||
from urllib.parse import quote
|
||
|
||
import httpx
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
async def search_wikipedia(keyword: str, max_chars: int = 2000) -> str:
    """Return a plain-text extract for ``keyword`` from Chinese Wikipedia.

    Two calls are made against the public MediaWiki API on
    ``zh.wikipedia.org`` (no API key is sent): a search for matching
    articles, then an extract request for the first hit.

    Args:
        keyword: Search terms passed as ``srsearch``.
        max_chars: The cleaned extract is sliced to this many characters.

    Returns:
        The cleaned extract, or ``""`` when the keyword is blank, the search
        has no hits, or the first hit has no extract.

    Raises:
        httpx.HTTPStatusError: If either response has an error status
            (raised by ``raise_for_status``).
    """
    # A blank query cannot match anything; skip the network round trip.
    if not keyword or not keyword.strip():
        return ""

    api_url = "https://zh.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "GEO-Citation-Bot/1.0 (contact@example.com)",
    }

    # One client serves both requests so the second can reuse the connection
    # opened by the first instead of setting up a new one.
    async with httpx.AsyncClient(timeout=30) as client:
        # 1. Find articles matching the keyword.
        search_resp = await client.get(
            api_url,
            headers=headers,
            params={
                "action": "query",
                "list": "search",
                "srsearch": keyword,
                "srlimit": 3,
                "format": "json",
                "origin": "*",
            },
        )
        search_resp.raise_for_status()
        search_results = search_resp.json().get("query", {}).get("search", [])
        if not search_results:
            return ""

        # 2. Fetch the plain-text extract of the best-ranked article.
        title = search_results[0]["title"]
        extract_resp = await client.get(
            api_url,
            headers=headers,
            params={
                "action": "query",
                "prop": "extracts",
                "titles": title,
                "explaintext": True,
                "exsentences": 15,
                "format": "json",
                "origin": "*",
            },
        )
        extract_resp.raise_for_status()
        extract_data = extract_resp.json()

    pages = extract_data.get("query", {}).get("pages", {})
    for page in pages.values():
        extract = page.get("extract", "")
        if extract:
            # Drop citation markers such as "[1]", then collapse whitespace.
            extract = re.sub(r'\[\d+\]', '', extract)
            extract = re.sub(r'\s+', ' ', extract).strip()
            return extract[:max_chars]

    return ""
|
||
|
||
|
||
async def search_duckduckgo(query: str, max_results: int = 5) -> str:
    """Search the DuckDuckGo HTML endpoint, falling back to Wikipedia.

    Args:
        query: Search terms; URL-quoted before being placed in the request URL.
        max_results: Upper bound on the number of results in the returned
            text. Negative values are treated as zero.

    Returns:
        Result entries joined by blank lines, each entry being the title and
        the snippet on separate lines. If the DuckDuckGo request or parsing
        fails, the text returned by ``search_wikipedia(query)`` instead.

    Raises:
        RuntimeError: If DuckDuckGo fails and Wikipedia returns no text; the
            DuckDuckGo failure is attached as ``__cause__``.
        Exception: Anything raised by ``search_wikipedia`` during the
            fallback propagates unchanged.
    """
    url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    }

    try:
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            resp = await client.get(url, headers=headers)
            resp.raise_for_status()
            page = resp.text

        # Quick check that this is a result page rather than a home or
        # verification page: at least one result marker must be present.
        if "web-result" not in page and "result__snippet" not in page and "result__title" not in page:
            raise RuntimeError("DuckDuckGo 返回了非结果页面")

        results = _parse_duckduckgo_results(page, max_results)
        if results:
            return "\n\n".join(results)

        raise RuntimeError("DuckDuckGo 未解析到结果")

    except Exception as e:
        # Deliberately broad: any DuckDuckGo failure (HTTP error, unexpected
        # page, nothing parsed) triggers the Wikipedia fallback.
        logger.warning(f"DuckDuckGo 搜索失败: {e},回退到 Wikipedia")
        wiki_text = await search_wikipedia(query, max_chars=2000)
        if wiki_text:
            return wiki_text
        raise RuntimeError(f"所有搜索源均失败: {e}") from e


def _parse_duckduckgo_results(page: str, max_results: int) -> list[str]:
    """Extract up to ``max_results`` title/snippet entries from result HTML.

    Each entry is the cleaned title and the cleaned snippet separated by a
    newline. Returns an empty list when nothing can be parsed.
    """
    # Clamp so a negative limit means "no results" instead of slicing from
    # the end of the match list.
    limit = max(max_results, 0)
    flags = re.DOTALL | re.IGNORECASE
    results: list[str] = []

    # First attempt: match whole result blocks so that each title stays
    # paired with its own snippet.
    result_blocks = re.findall(
        r'<div class="result[^"]*"[^>]*>.*?<h[^>]*class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h[^>]*>.*?<a[^>]*class="result__snippet"[^>]*>(.*?)</a>.*?</div>',
        page,
        flags,
    )
    for title_raw, snippet_raw in result_blocks[:limit]:
        title = _strip_html(title_raw)
        snippet = _strip_html(snippet_raw)
        if title or snippet:
            results.append(f"{title}\n{snippet}")
    if results:
        return results

    # Fallback: collect snippets and titles independently and pair them by
    # position.
    snippets = re.findall(
        r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>', page, flags
    )
    titles = re.findall(
        r'<h[^>]*class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>.*?</h[^>]*>',
        page,
        flags,
    )
    for title_raw, snippet_raw in zip(titles[:limit], snippets[:limit]):
        title = _strip_html(title_raw)
        snippet = _strip_html(snippet_raw)
        if title or snippet:
            results.append(f"{title}\n{snippet}")

    return results
|
||
|
||
|
||
def _strip_html(raw: str) -> str:
|
||
"""去除 HTML 标签并将实体转义还原为可读文本。"""
|
||
# 先替换常见 HTML 实体
|
||
raw = raw.replace(" ", " ")
|
||
raw = raw.replace(""", '"')
|
||
raw = raw.replace("&", "&")
|
||
raw = raw.replace("<", "<")
|
||
raw = raw.replace(">", ">")
|
||
raw = raw.replace("'", "'")
|
||
# 去除所有标签
|
||
text = re.sub(r"<[^>]+>", "", raw)
|
||
# 合并空白
|
||
text = re.sub(r"\s+", " ", text).strip()
|
||
return text
|
||
|
||
|
||
async def fetch_search_content(platform_name: str, keyword: str) -> str:
    """Fetch search text related to ``keyword`` on behalf of a platform.

    The keyword is passed straight to ``search_duckduckgo`` with at most five
    results requested, and whatever that call returns is handed back.

    Args:
        platform_name: Used only to tag the log line.
        keyword: The search query.

    Returns:
        The text returned by ``search_duckduckgo``.
    """
    logger.info(f"[{platform_name}] 搜索查询: {keyword}")
    return await search_duckduckgo(keyword, max_results=5)
|