"""Excel ingestion — parse .xlsx into structured sheets for bitable import. Reuses openpyxl (already a dependency via ``agentkit.memory.document_loader``). Returns structured data ``{sheet_name: [{col: val}, ...]}`` rather than Markdown text. First row is treated as field names; types are auto-inferred from column values. Known limitations (same as ``document_loader._parse_xlsx``): - ``data_only=True`` returns ``None`` for formulas never opened in Excel. - Merged cells: only the top-left cell has a value; others are ``None``. """ from __future__ import annotations import io import ipaddress import logging import socket from dataclasses import dataclass, field from pathlib import Path from urllib.parse import urlparse import httpx logger = logging.getLogger(__name__) MAX_ROWS_PER_SHEET = 10_000 MAX_CELL_CHARS = 10_000 @dataclass class ParsedSheet: """One parsed Excel sheet ready for bitable import.""" name: str columns: list[str] = field(default_factory=list) field_types: list[str] = field(default_factory=list) # "text" | "number" | "date" records: list[dict[str, object]] = field(default_factory=list) def parse_excel(file_path: str | Path) -> list[ParsedSheet]: """Parse an .xlsx file from disk into structured sheets.""" path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"Excel file not found: {path}") content = path.read_bytes() return parse_excel_bytes(content) def parse_excel_url(url: str, *, timeout: float = 30.0) -> list[ParsedSheet]: """Download an .xlsx from a URL and parse it. Validates the URL to prevent SSRF: only http/https schemes are allowed, and the resolved host must not be a private, loopback, link-local, or reserved IP address. Hostnames are resolved before the request so the check covers DNS rebinding to internal IPs. """ parsed = urlparse(url) if parsed.scheme not in ("http", "https"): raise ValueError(f"Disallowed URL scheme: {parsed.scheme!r} (only http/https)") if not parsed.hostname: raise ValueError("URL has no hostname") _assert_safe_host(parsed.hostname) resp = httpx.get(url, timeout=timeout, follow_redirects=False) # Follow redirects manually, re-validating each Location (KTD: SSRF guard). seen_redirects = 0 while resp.is_redirect and seen_redirects < 5: seen_redirects += 1 next_url = httpx.URL(url).join(resp.headers["location"]) if next_url.scheme not in ("http", "https") or not next_url.host: raise ValueError(f"Unsafe redirect target: {next_url}") _assert_safe_host(next_url.host) resp = httpx.get(next_url, timeout=timeout, follow_redirects=False) url = str(next_url) resp.raise_for_status() return parse_excel_bytes(resp.content) def _assert_safe_host(host: str) -> None: """Raise ``ValueError`` if ``host`` resolves to a private/loopback/reserved IP. Accepts IPv4/IPv6 literals and DNS names. DNS names are resolved and every returned address is checked — any private/loopback/link-local/reserved address blocks the request. """ # ponytail: blocks RFC1918, loopback, link-local, and reserved ranges. # Ceiling: does not defend against DNS rebinding after the check (TOCTOU); # upgrade path is to pin resolved IP in the httpx transport. try: addr = ipaddress.ip_address(host) except ValueError: # Hostname — resolve and check all A/AAAA records. try: infos = socket.getaddrinfo(host, None) except socket.gaierror as e: raise ValueError(f"Cannot resolve host {host!r}: {e}") from e for info in infos: sockaddr = info[4] ip_str = sockaddr[0] try: addr = ipaddress.ip_address(ip_str) except ValueError: continue if _is_unsafe_ip(addr): raise ValueError(f"Host {host!r} resolves to private/reserved IP {addr}") return if _is_unsafe_ip(addr): raise ValueError(f"Host {host!r} is a private/loopback/reserved IP: {addr}") def _is_unsafe_ip(addr: ipaddress._BaseAddress) -> bool: """True if the address is private, loopback, link-local, reserved, or multicast.""" return ( addr.is_private or addr.is_loopback or addr.is_link_local or addr.is_reserved or addr.is_multicast or addr.is_unspecified ) def parse_excel_bytes(content: bytes) -> list[ParsedSheet]: """Parse Excel content from bytes. Raises ``ValueError`` on corrupt files.""" try: from openpyxl import load_workbook except ImportError as e: raise ImportError("openpyxl is required for Excel ingestion") from e try: wb = load_workbook(io.BytesIO(content), data_only=True, read_only=True) except Exception as e: raise ValueError(f"Failed to parse Excel file: {e}") from e sheets: list[ParsedSheet] = [] try: for ws in wb.worksheets: sheet = _parse_worksheet(ws) if sheet is not None: sheets.append(sheet) finally: wb.close() return sheets def _parse_worksheet(ws) -> ParsedSheet | None: """Parse a single worksheet. Returns ``None`` for completely empty sheets.""" rows_iter = ws.iter_rows(values_only=True) # First row = headers try: header_row = next(rows_iter) except StopIteration: return None # empty sheet headers = [str(v).strip() if v is not None else f"col_{i}" for i, v in enumerate(header_row)] # Deduplicate headers seen: dict[str, int] = {} clean_headers: list[str] = [] for h in headers: if h in seen: seen[h] += 1 clean_headers.append(f"{h}_{seen[h]}") else: seen[h] = 0 clean_headers.append(h) # Collect data rows data_rows: list[tuple] = [] for row in rows_iter: if len(data_rows) >= MAX_ROWS_PER_SHEET: logger.warning("Sheet %r truncated at %d rows", ws.title, MAX_ROWS_PER_SHEET) break data_rows.append(row) # Infer field types and build records col_count = len(clean_headers) field_types = _infer_column_types(data_rows, col_count) records: list[dict[str, object]] = [] for row in data_rows: rec: dict[str, object] = {} for i, col_name in enumerate(clean_headers): val = row[i] if i < len(row) else None if val is not None: val = _coerce_value(val, field_types[i]) rec[col_name] = val records.append(rec) return ParsedSheet( name=ws.title, columns=clean_headers, field_types=field_types, records=records, ) def _infer_column_types(rows: list[tuple], col_count: int) -> list[str]: """Infer bitable field type per column: 'number', 'date', or 'text'.""" from datetime import date, datetime types: list[str] = [] for col_idx in range(col_count): is_number = True is_date = True has_value = False for row in rows: if col_idx >= len(row): continue val = row[col_idx] if val is None or val == "": continue has_value = True if isinstance(val, bool): is_number = False is_date = False break if not isinstance(val, (int, float)): is_number = False if not isinstance(val, (datetime, date)): is_date = False if not is_number and not is_date: break if not has_value: types.append("text") elif is_number: types.append("number") elif is_date: types.append("date") else: types.append("text") return types def _coerce_value(val: object, field_type: str) -> object: """Coerce a cell value to the inferred field type. Truncate long strings.""" if field_type == "date": from datetime import datetime if isinstance(val, datetime): return val.isoformat() if isinstance(val, str) and len(val) > MAX_CELL_CHARS: return val[:MAX_CELL_CHARS] return val