# geo/backend/app/utils/json_extractor.py

"""JSON 提取工具函数

从可能包含 markdown 代码块或周围文本的 LLM 响应中提取 JSON 字符串。
采用深度计数器方式确保正确匹配嵌套括号。
"""
import json
import re


def extract_json(text: str) -> str:
    """Extract a JSON string from free-form text.

    Strategies, tried in priority order:

    1. Parse the whole text as JSON.
    2. Parse the content of each fenced code block (``` or ```json), in
       order of appearance, and use the first one that is valid JSON.
    3. Scan for the first ``[`` or ``{`` from which a complete JSON array
       or object can be decoded.

    Args:
        text: Text that may contain JSON.

    Returns:
        The extracted JSON string. When the whole text is valid JSON it is
        returned unchanged (surrounding whitespace included); a code-block
        payload is returned stripped; otherwise the exact substring that
        spans the decoded object or array is returned.

    Raises:
        ValueError: If no valid JSON can be extracted, including when the
            text is empty, whitespace-only, or None.
    """
    if not text or not text.strip():
        # Build the preview defensively so a None response raises the
        # documented ValueError rather than a TypeError from slicing.
        preview = text[:200] if text else ""
        raise ValueError(f"无法从响应中提取JSON: {preview}")

    # 1. The whole text may already be valid JSON.
    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return text

    # 2. Try every fenced code block, not just the first: a response may
    #    contain a non-JSON block (e.g. ```python) before the JSON one.
    for fence in re.finditer(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL):
        candidate = fence.group(1).strip()
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return candidate

    # 3. Decode from each opening bracket. raw_decode tokenizes the input
    #    properly, so brackets that appear inside string values (such as
    #    {"a": "}"}) do not end the match early the way naive bracket
    #    counting would.
    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char not in "[{":
            continue
        try:
            _, end = decoder.raw_decode(text, start)
        except (json.JSONDecodeError, RecursionError):
            # Not a complete JSON value from here (or nested too deeply to
            # decode); move on to the next opening bracket.
            continue
        return text[start:end]

    raise ValueError(f"无法从响应中提取JSON: {text[:200]}")