fischer-agentkit/src/agentkit/bitable/ingestion/excel.py

249 lines
8.2 KiB
Python

"""Excel ingestion — parse .xlsx into structured sheets for bitable import.
Reuses openpyxl (already a dependency via ``agentkit.memory.document_loader``).
Returns a list of ``ParsedSheet`` objects (one per non-empty sheet, each with
its column names, inferred field types, and ``{col: val}`` records) rather
than Markdown text. The first row is treated as field names; field types are
auto-inferred from column values.
Known limitations (same as ``document_loader._parse_xlsx``):
- ``data_only=True`` returns ``None`` for formulas never opened in Excel.
- Merged cells: only the top-left cell has a value; others are ``None``.
"""
from __future__ import annotations
import io
import ipaddress
import logging
import socket
from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlparse
import httpx
# Module-level logger; used to warn when a sheet is truncated.
logger = logging.getLogger(__name__)
# Upper bound on data rows kept per sheet (the header row is not counted);
# rows past this limit are dropped by ``_parse_worksheet`` with a warning.
MAX_ROWS_PER_SHEET = 10_000
# Upper bound on the length of a string cell; longer strings are cut to this
# many characters by ``_coerce_value``.
MAX_CELL_CHARS = 10_000
@dataclass
class ParsedSheet:
    """A single worksheet converted into a bitable-importable structure.

    Attributes:
        name: Title of the worksheet.
        columns: Column (field) names, in sheet order.
        field_types: Inferred type for each entry of ``columns``, positionally
            aligned with it.
        records: One mapping per data row, keyed by column name.
    """

    name: str
    columns: list[str] = field(default_factory=list)
    # Each entry is one of: "text" | "number" | "date"
    field_types: list[str] = field(default_factory=list)
    records: list[dict[str, object]] = field(default_factory=list)
def parse_excel(file_path: str | Path) -> list[ParsedSheet]:
    """Read an .xlsx workbook from the local filesystem and parse its sheets.

    Args:
        file_path: Location of the workbook on disk.

    Returns:
        The result of ``parse_excel_bytes`` applied to the file's contents.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    workbook_path = Path(file_path)
    if workbook_path.exists():
        return parse_excel_bytes(workbook_path.read_bytes())
    raise FileNotFoundError(f"Excel file not found: {workbook_path}")
def parse_excel_url(url: str, *, timeout: float = 30.0) -> list[ParsedSheet]:
    """Fetch an .xlsx over HTTP(S) and parse it.

    SSRF guard: only ``http``/``https`` URLs with a hostname are accepted, and
    the host is passed through ``_assert_safe_host`` before every request.
    Automatic redirects are disabled; up to five redirects are followed by
    hand so that each ``Location`` target goes through the same checks.

    The host check runs before the HTTP client performs its own name
    resolution, so on its own it does not close the window between the check
    and the actual connection.

    Args:
        url: Address of the workbook to download.
        timeout: Timeout handed to each ``httpx.get`` call.

    Returns:
        The result of ``parse_excel_bytes`` applied to the response body.

    Raises:
        ValueError: If the URL or a redirect target has a disallowed scheme,
            lacks a host, or fails ``_assert_safe_host``.
        Exception: Whatever ``raise_for_status()`` raises for the final
            response (including a redirect left unfollowed after five hops).
    """
    origin = urlparse(url)
    if origin.scheme not in ("http", "https"):
        raise ValueError(f"Disallowed URL scheme: {origin.scheme!r} (only http/https)")
    if not origin.hostname:
        raise ValueError("URL has no hostname")
    _assert_safe_host(origin.hostname)

    response = httpx.get(url, timeout=timeout, follow_redirects=False)

    # Redirects are followed by hand so every hop is re-validated
    # (KTD: SSRF guard). At most five hops are taken.
    for _hop in range(5):
        if not response.is_redirect:
            break
        # Location may be relative; resolve it against the URL just fetched.
        target = httpx.URL(url).join(response.headers["location"])
        if not target.host or target.scheme not in ("http", "https"):
            raise ValueError(f"Unsafe redirect target: {target}")
        _assert_safe_host(target.host)
        response = httpx.get(target, timeout=timeout, follow_redirects=False)
        url = str(target)

    response.raise_for_status()
    return parse_excel_bytes(response.content)
def _assert_safe_host(host: str) -> None:
    """Raise ``ValueError`` if ``host`` is, or resolves to, an unsafe IP.

    Accepts IPv4/IPv6 literals and DNS names. A literal is checked directly.
    A DNS name is resolved with ``socket.getaddrinfo`` and every returned
    address is checked; a single unsafe address blocks the host.

    The check fails closed: a name that cannot be resolved, resolves to no
    addresses, or resolves to something that cannot be parsed as an IP
    address is rejected rather than skipped.

    Args:
        host: Hostname or IP literal (without brackets or port).

    Raises:
        ValueError: If the host is unsafe according to ``_is_unsafe_ip``,
            cannot be resolved, or yields an unparseable address.
    """
    # ponytail: blocks RFC1918, loopback, link-local, and reserved ranges.
    # Ceiling: does not defend against DNS rebinding after the check (TOCTOU);
    # upgrade path is to pin resolved IP in the httpx transport.
    try:
        literal = ipaddress.ip_address(host)
    except ValueError:
        literal = None

    if literal is not None:
        if _is_unsafe_ip(literal):
            raise ValueError(f"Host {host!r} is a private/loopback/reserved IP: {literal}")
        return

    # Hostname: resolve and check every returned address.
    try:
        infos = socket.getaddrinfo(host, None)
    except socket.gaierror as e:
        raise ValueError(f"Cannot resolve host {host!r}: {e}") from e
    if not infos:
        raise ValueError(f"Cannot resolve host {host!r}: no addresses returned")

    for info in infos:
        # info[4] is the sockaddr tuple; its first item is the address text.
        # IPv6 link-local results may carry a "%zone" suffix, which older
        # ``ipaddress`` versions reject, so drop it before parsing.
        ip_str = str(info[4][0]).split("%", 1)[0]
        try:
            addr = ipaddress.ip_address(ip_str)
        except ValueError as e:
            # Fail closed: an address we cannot classify must not be allowed.
            raise ValueError(
                f"Host {host!r} resolved to an unparseable address {ip_str!r}"
            ) from e
        if _is_unsafe_ip(addr):
            raise ValueError(f"Host {host!r} resolves to private/reserved IP {addr}")
def _is_unsafe_ip(addr: ipaddress._BaseAddress) -> bool:
"""True if the address is private, loopback, link-local, reserved, or multicast."""
return (
addr.is_private
or addr.is_loopback
or addr.is_link_local
or addr.is_reserved
or addr.is_multicast
or addr.is_unspecified
)
def parse_excel_bytes(content: bytes) -> list[ParsedSheet]:
    """Parse an in-memory .xlsx payload into a list of sheets.

    The workbook is opened with ``data_only=True`` and ``read_only=True``.
    Worksheets for which ``_parse_worksheet`` returns ``None`` are left out.

    Args:
        content: Raw bytes of the workbook.

    Returns:
        Parsed sheets in workbook order.

    Raises:
        ImportError: If ``openpyxl`` cannot be imported.
        ValueError: If the workbook cannot be opened (e.g. corrupt file).
    """
    try:
        from openpyxl import load_workbook
    except ImportError as exc:
        raise ImportError("openpyxl is required for Excel ingestion") from exc

    try:
        workbook = load_workbook(io.BytesIO(content), data_only=True, read_only=True)
    except Exception as exc:
        raise ValueError(f"Failed to parse Excel file: {exc}") from exc

    # Close the workbook whether or not sheet parsing succeeds.
    try:
        candidates = (_parse_worksheet(worksheet) for worksheet in workbook.worksheets)
        return [parsed for parsed in candidates if parsed is not None]
    finally:
        workbook.close()
def _parse_worksheet(ws) -> ParsedSheet | None:
    """Parse a single worksheet into a ``ParsedSheet``.

    The first row supplies the column names (see ``_unique_headers``). At
    most ``MAX_ROWS_PER_SHEET`` data rows are kept; a warning is logged when
    further rows are dropped. Column types are inferred from the kept rows
    and each non-``None`` cell is passed through ``_coerce_value``.

    Args:
        ws: Worksheet object exposing ``iter_rows(values_only=True)`` and
            ``title``.

    Returns:
        The parsed sheet, or ``None`` when the worksheet yields no rows.
    """
    rows_iter = ws.iter_rows(values_only=True)

    # First row = headers
    try:
        header_row = next(rows_iter)
    except StopIteration:
        return None  # empty sheet

    clean_headers = _unique_headers(header_row)

    # Collect data rows, bounded by MAX_ROWS_PER_SHEET.
    data_rows: list[tuple] = []
    for row in rows_iter:
        if len(data_rows) >= MAX_ROWS_PER_SHEET:
            logger.warning("Sheet %r truncated at %d rows", ws.title, MAX_ROWS_PER_SHEET)
            break
        data_rows.append(row)

    # Infer field types and build records.
    field_types = _infer_column_types(data_rows, len(clean_headers))
    records: list[dict[str, object]] = []
    for row in data_rows:
        rec: dict[str, object] = {}
        for i, col_name in enumerate(clean_headers):
            # Rows shorter than the header are padded with None.
            val = row[i] if i < len(row) else None
            rec[col_name] = None if val is None else _coerce_value(val, field_types[i])
        records.append(rec)

    return ParsedSheet(
        name=ws.title,
        columns=clean_headers,
        field_types=field_types,
        records=records,
    )


def _unique_headers(header_row) -> list[str]:
    """Turn raw header cells into non-empty, pairwise-distinct column names.

    ``None`` or blank cells become ``col_<index>``. A repeated name gets the
    first ``_<n>`` suffix (n = 1, 2, ...) that is not already in use, so a
    generated name can never collide with another header and overwrite its
    column in the records.
    """
    taken: set[str] = set()
    last_suffix: dict[str, int] = {}
    names: list[str] = []
    for i, cell in enumerate(header_row):
        base = str(cell).strip() if cell is not None else ""
        if not base:
            base = f"col_{i}"
        name = base
        if name in taken:
            n = last_suffix.get(base, 0)
            while name in taken:
                n += 1
                name = f"{base}_{n}"
            last_suffix[base] = n
        taken.add(name)
        names.append(name)
    return names
def _infer_column_types(rows: list[tuple], col_count: int) -> list[str]:
"""Infer bitable field type per column: 'number', 'date', or 'text'."""
from datetime import date, datetime
types: list[str] = []
for col_idx in range(col_count):
is_number = True
is_date = True
has_value = False
for row in rows:
if col_idx >= len(row):
continue
val = row[col_idx]
if val is None or val == "":
continue
has_value = True
if isinstance(val, bool):
is_number = False
is_date = False
break
if not isinstance(val, (int, float)):
is_number = False
if not isinstance(val, (datetime, date)):
is_date = False
if not is_number and not is_date:
break
if not has_value:
types.append("text")
elif is_number:
types.append("number")
elif is_date:
types.append("date")
else:
types.append("text")
return types
def _coerce_value(val: object, field_type: str) -> object:
"""Coerce a cell value to the inferred field type. Truncate long strings."""
if field_type == "date":
from datetime import datetime
if isinstance(val, datetime):
return val.isoformat()
if isinstance(val, str) and len(val) > MAX_CELL_CHARS:
return val[:MAX_CELL_CHARS]
return val