fischer-agentkit/src/agentkit/tools/bitable_tool.py

"""BitableTool — Agent tool for bitable data ingestion and CRUD via HTTP.

Implements KTD5 (REST API boundary even when co-deployed) and KTD11
(internal service token auth). The tool uses ``httpx.AsyncClient`` to call
the bitable REST API; it never imports BitableService directly.

Actions: create_table, import_excel, import_database, collect_api,
         upsert_records, query_records.

Batch chunking: upsert and import operations send at most ``BATCH_SIZE``
records per HTTP request. On partial failure, the result includes
``successful_count`` and ``resume_from`` for breakpoint continuation.
"""

from __future__ import annotations

import asyncio
import logging

import httpx

from agentkit.bitable.ingestion.excel import ParsedSheet, parse_excel, parse_excel_url
from agentkit.bitable.ingestion.database import import_table as import_db_table
from agentkit.bitable.ingestion.api_collector import transform_records
from agentkit.tools.base import Tool

logger = logging.getLogger(__name__)

BATCH_SIZE = 500


class BitableTool(Tool):
    """Agent tool for bitable operations via REST API.

    Args:
        base_url: Bitable API base URL (e.g. ``http://localhost:8001/api/v1/bitable``).
        internal_token: Service token for KTD11 auth. If ``None``, requests
            go unauthenticated (will fail if the server requires auth).
    """

    def __init__(self, base_url: str, internal_token: str | None = None) -> None:
        super().__init__(
            name="bitable",
            description=(
                "Create and manage bitable (multi-dimensional spreadsheet) tables, "
                "ingest data from Excel files, databases, or API responses, and "
                "query records. Actions: create_table, import_excel, "
                "import_database, collect_api, upsert_records, query_records."
            ),
            input_schema={
                "type": "object",
                "properties": {
                    "action": {
                        "type": "string",
                        "enum": [
                            "create_table",
                            "import_excel",
                            "import_database",
                            "collect_api",
                            "upsert_records",
                            "query_records",
                        ],
                        "description": "Bitable operation to perform.",
                    },
                    "table_name": {
                        "type": "string",
                        "description": "Name for the new bitable table (create_table, import_excel, import_database).",
                    },
                    "description": {
                        "type": "string",
                        "description": "Table description (create_table).",
                    },
                    "file_path": {
                        "type": "string",
                        "description": "Path to .xlsx file (import_excel).",
                    },
                    "file_url": {
                        "type": "string",
                        "description": "URL to download .xlsx file (import_excel).",
                    },
                    "connection_string": {
                        "type": "string",
                        "description": "Database connection string (import_database).",
                    },
                    "table_names": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Source table names to import (import_database).",
                    },
                    "table_id": {
                        "type": "string",
                        "description": "Target bitable table ID (collect_api, upsert_records, query_records).",
                    },
                    "records": {
                        "type": "array",
                        "description": "Records to write (collect_api, upsert_records).",
                    },
                    "field_mapping": {
                        "type": "object",
                        "description": "Mapping {source_key: bitable_field_id} (collect_api).",
                    },
                    "primary_key_field_id": {
                        "type": "string",
                        "description": "Field ID of the primary key (upsert_records, collect_api).",
                    },
                    "resume_from": {
                        "type": "integer",
                        "description": "Skip this many records before resuming a failed batch (upsert_records, collect_api).",
                    },
                    "cursor": {
                        "type": "string",
                        "description": "Pagination cursor (query_records).",
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Max records to return (query_records).",
                    },
                },
                "required": ["action"],
            },
        )
        self._base_url = base_url.rstrip("/")
        self._internal_token = internal_token
        self._client: httpx.AsyncClient | None = None

    async def _get_client(self) -> httpx.AsyncClient:
        if self._client is None or self._client.is_closed:
            headers: dict[str, str] = {}
            if self._internal_token:
                headers["X-Internal-Token"] = self._internal_token
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                headers=headers,
                timeout=60.0,
            )
        return self._client

    async def close(self) -> None:
        if self._client is not None and not self._client.is_closed:
            await self._client.aclose()

    async def execute(self, **kwargs) -> dict[str, object]:
        action = kwargs.get("action")
        handlers = {
            "create_table": self._create_table,
            "import_excel": self._import_excel,
            "import_database": self._import_database,
            "collect_api": self._collect_api,
            "upsert_records": self._upsert_records,
            "query_records": self._query_records,
        }
        handler = handlers.get(action)
        if handler is None:
            return {"success": False, "error": f"Unknown action: {action!r}"}
        try:
            return await handler(**kwargs)
        except httpx.HTTPStatusError as e:
            return {
                "success": False,
                "error": f"Bitable API error {e.response.status_code}: {e.response.text[:500]}",
            }
        except httpx.ConnectError as e:
            return {"success": False, "error": f"Cannot connect to bitable API: {e}"}
        except Exception as e:
            return {"success": False, "error": f"{action} failed: {e}"}

    # ------------------------------------------------------------------
    # create_table
    # ------------------------------------------------------------------

    async def _create_table(self, **kwargs) -> dict[str, object]:
        table_name = kwargs.get("table_name")
        if not table_name:
            return {"success": False, "error": "Missing required field: table_name"}
        client = await self._get_client()
        resp = await client.post(
            "/tables",
            json={"name": table_name, "description": kwargs.get("description", "")},
        )
        resp.raise_for_status()
        data = resp.json()
        return {"success": True, "table": data["table"]}

    # ------------------------------------------------------------------
    # import_excel
    # ------------------------------------------------------------------

    async def _import_excel(self, **kwargs) -> dict[str, object]:
        file_path = kwargs.get("file_path")
        file_url = kwargs.get("file_url")
        if not file_path and not file_url:
            return {"success": False, "error": "Either file_path or file_url is required"}

        # Parse Excel — offload sync I/O to thread pool (P2 #21-23).
        if file_path:
            sheets = await asyncio.to_thread(parse_excel, file_path)
        else:
            sheets = await asyncio.to_thread(parse_excel_url, file_url)

        if not sheets:
            return {"success": False, "error": "Excel file has no sheets with data"}

        results: list[dict[str, object]] = []
        for sheet in sheets:
            result = await self._import_sheet(sheet)
            results.append(result)
        return {"success": True, "sheets": results}

    async def _import_sheet(self, sheet: ParsedSheet) -> dict[str, object]:
        """Create a bitable table from a parsed sheet and upsert all rows."""
        client = await self._get_client()

        # 1. Create table
        resp = await client.post("/tables", json={"name": sheet.name})
        resp.raise_for_status()
        table_id = resp.json()["table"]["id"]

        # 2. Create fields
        field_name_to_id: dict[str, str] = {}
        for col_name, field_type in zip(sheet.columns, sheet.field_types):
            resp = await client.post(
                f"/tables/{table_id}/fields",
                json={"name": col_name, "field_type": field_type, "owner": "agent"},
            )
            resp.raise_for_status()
            field_id = resp.json()["field"]["id"]
            field_name_to_id[col_name] = field_id

        # 3. Map record keys to field IDs and batch upsert
        mapped_records = [
            {field_name_to_id[k]: v for k, v in rec.items() if k in field_name_to_id}
            for rec in sheet.records
        ]

        if not mapped_records:
            return {
                "table_id": table_id,
                "table_name": sheet.name,
                "field_count": len(field_name_to_id),
                "record_count": 0,
            }

        # Use first field as PK fallback (import_excel doesn't require a PK)
        # If no PK is set, upsert won't work — use create_records instead
        upsert_result = await self._batch_create_records(table_id, mapped_records)
        return {
            "table_id": table_id,
            "table_name": sheet.name,
            "field_count": len(field_name_to_id),
            "record_count": upsert_result["successful_count"],
            **upsert_result,
        }

    async def _batch_create_records(
        self, table_id: str, records: list[dict[str, object]]
    ) -> dict[str, object]:
        """Create records in batches via POST /tables/{id}/records."""
        client = await self._get_client()
        total = len(records)
        successful = 0
        errors: list[dict[str, object]] = []

        for start in range(0, total, BATCH_SIZE):
            batch = records[start : start + BATCH_SIZE]
            try:
                resp = await client.post(
                    f"/tables/{table_id}/records",
                    json={"records": batch},
                )
                resp.raise_for_status()
                successful += len(batch)
            except httpx.HTTPStatusError as e:
                errors.append(
                    {
                        "batch_start": start,
                        "batch_size": len(batch),
                        "status": e.response.status_code,
                        "error": e.response.text[:300],
                    }
                )
                break  # stop on first failure

        return {
            "successful_count": successful,
            "total": total,
            "resume_from": successful,
            **({"errors": errors} if errors else {}),
        }

    # ------------------------------------------------------------------
    # import_database
    # ------------------------------------------------------------------

    async def _import_database(self, **kwargs) -> dict[str, object]:
        conn_str = kwargs.get("connection_string")
        table_names = kwargs.get("table_names")
        if not conn_str:
            return {"success": False, "error": "Missing required field: connection_string"}
        if not table_names:
            return {"success": False, "error": "Missing required field: table_names"}

        results: list[dict[str, object]] = []
        for src_table in table_names:
            try:
                # Offload sync DB reflection to thread pool (P2 #21-23).
                reflected = await asyncio.to_thread(import_db_table, conn_str, src_table)
                result = await self._import_reflected_table(reflected)
                results.append(result)
            except ConnectionError as e:
                return {"success": False, "error": str(e), "imported": results}
            except Exception as e:
                results.append({"table_name": src_table, "success": False, "error": str(e)})
        return {"success": True, "tables": results}

    async def _import_reflected_table(self, reflected: dict[str, object]) -> dict[str, object]:
        """Create a bitable table from reflected DB data and upsert rows."""
        client = await self._get_client()
        table_name = reflected["table_name"]

        # 1. Create table
        resp = await client.post("/tables", json={"name": table_name})
        resp.raise_for_status()
        table_id = resp.json()["table"]["id"]

        # 2. Create fields
        field_name_to_id: dict[str, str] = {}
        pk_field_id: str | None = None
        for fdef in reflected["fields"]:
            resp = await client.post(
                f"/tables/{table_id}/fields",
                json={
                    "name": fdef["name"],
                    "field_type": fdef["field_type"],
                    "owner": "agent",
                },
            )
            resp.raise_for_status()
            fid = resp.json()["field"]["id"]
            field_name_to_id[fdef["name"]] = fid
            if fdef.get("is_primary_key"):
                pk_field_id = fid

        # 3. Set primary key
        if pk_field_id:
            await client.patch("/tables/" + table_id, json={"primary_key_field_id": pk_field_id})

        # 4. Map and upsert records
        mapped = [
            {field_name_to_id[k]: v for k, v in rec.items() if k in field_name_to_id}
            for rec in reflected["records"]
        ]

        if not mapped:
            return {
                "table_id": table_id,
                "table_name": table_name,
                "record_count": 0,
                "success": True,
            }

        if pk_field_id:
            upsert = await self._batch_upsert(table_id, mapped, pk_field_id)
        else:
            upsert = await self._batch_create_records(table_id, mapped)

        return {
            "table_id": table_id,
            "table_name": table_name,
            "record_count": upsert["successful_count"],
            "success": True,
            **upsert,
        }

    # ------------------------------------------------------------------
    # collect_api
    # ------------------------------------------------------------------

    async def _collect_api(self, **kwargs) -> dict[str, object]:
        table_id = kwargs.get("table_id")
        records = kwargs.get("records")
        field_mapping = kwargs.get("field_mapping")
        pk_field_id = kwargs.get("primary_key_field_id")
        resume_from = kwargs.get("resume_from", 0)

        if not table_id:
            return {"success": False, "error": "Missing required field: table_id"}
        if not records:
            return {"success": False, "error": "Missing required field: records"}
        if not field_mapping:
            return {"success": False, "error": "Missing required field: field_mapping"}
        if not pk_field_id:
            return {"success": False, "error": "Missing required field: primary_key_field_id"}

        transformed = transform_records(records, field_mapping)
        if resume_from > 0:
            transformed = transformed[resume_from:]

        result = await self._batch_upsert(table_id, transformed, pk_field_id)
        return {"success": True, **result}

    # ------------------------------------------------------------------
    # upsert_records
    # ------------------------------------------------------------------

    async def _upsert_records(self, **kwargs) -> dict[str, object]:
        table_id = kwargs.get("table_id")
        records = kwargs.get("records")
        pk_field_id = kwargs.get("primary_key_field_id")
        resume_from = kwargs.get("resume_from", 0)

        if not table_id:
            return {"success": False, "error": "Missing required field: table_id"}
        if not records:
            return {"success": False, "error": "Missing required field: records"}
        if not pk_field_id:
            return {"success": False, "error": "Missing required field: primary_key_field_id"}

        batch = records[resume_from:] if resume_from > 0 else records
        result = await self._batch_upsert(table_id, batch, pk_field_id)
        return {"success": True, **result}

    async def _batch_upsert(
        self, table_id: str, records: list[dict[str, object]], pk_field_id: str
    ) -> dict[str, object]:
        """Upsert records in batches of BATCH_SIZE via POST /tables/{id}/upsert."""
        client = await self._get_client()
        total = len(records)
        successful = 0
        errors: list[dict[str, object]] = []

        for start in range(0, total, BATCH_SIZE):
            batch = records[start : start + BATCH_SIZE]
            try:
                resp = await client.post(
                    f"/tables/{table_id}/upsert",
                    json={
                        "records": batch,
                        "primary_key_field_id": pk_field_id,
                    },
                )
                resp.raise_for_status()
                data = resp.json()
                successful += data.get("inserted", 0) + data.get("updated", 0)
            except httpx.HTTPStatusError as e:
                errors.append(
                    {
                        "batch_start": start,
                        "batch_size": len(batch),
                        "status": e.response.status_code,
                        "error": e.response.text[:300],
                    }
                )
                break

        return {
            "successful_count": successful,
            "total": total,
            "resume_from": successful,
            **({"errors": errors} if errors else {}),
        }

    # ------------------------------------------------------------------
    # query_records
    # ------------------------------------------------------------------

    async def _query_records(self, **kwargs) -> dict[str, object]:
        table_id = kwargs.get("table_id")
        if not table_id:
            return {"success": False, "error": "Missing required field: table_id"}

        client = await self._get_client()
        params: dict[str, object] = {}
        if kwargs.get("cursor"):
            params["cursor"] = kwargs["cursor"]
        if kwargs.get("limit"):
            params["limit"] = kwargs["limit"]

        resp = await client.get(f"/tables/{table_id}/records", params=params)
        resp.raise_for_status()
        data = resp.json()
        return {
            "success": True,
            "records": data["records"],
            "next_cursor": data.get("next_cursor"),
        }