"""Excel ingestion — parse .xlsx into structured sheets for bitable import.

Reuses openpyxl (already a dependency via ``agentkit.memory.document_loader``).
Returns structured data (one ``ParsedSheet`` per worksheet, each holding
``[{col: val}, ...]`` records) rather than Markdown text. The first row is
treated as field names; field types are auto-inferred from column values.

Known limitations (same as ``document_loader._parse_xlsx``):
- ``data_only=True`` returns ``None`` for formulas never opened in Excel.
- Merged cells: only the top-left cell has a value; others are ``None``.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import ipaddress
|
|
import logging
|
|
import socket
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
# Module-level logger; used to report worksheets that get truncated.
logger = logging.getLogger(__name__)

# Upper bound on the number of data rows (header excluded) kept per worksheet.
MAX_ROWS_PER_SHEET = 10_000
# Upper bound on the length of a string cell; longer strings are cut off.
MAX_CELL_CHARS = 10_000
|
|
|
|
|
|
@dataclass
class ParsedSheet:
    """A single worksheet converted into bitable-ready tabular data.

    Only ``name`` is required; the three list attributes default to fresh,
    independent empty lists for every instance.
    """

    # Title of the worksheet.
    name: str
    # Column (field) names, in sheet order.
    columns: list[str] = field(default_factory=list)
    # One type tag per column: "text" | "number" | "date".
    field_types: list[str] = field(default_factory=list)
    # One ``{column name: cell value}`` mapping per data row.
    records: list[dict[str, object]] = field(default_factory=list)
|
|
|
|
|
|
def parse_excel(file_path: str | Path) -> list[ParsedSheet]:
    """Load an .xlsx workbook from the local filesystem and parse it.

    The file is read fully into memory and handed to ``parse_excel_bytes``.

    Raises:
        FileNotFoundError: nothing exists at ``file_path``.
    """
    workbook_path = Path(file_path)
    if workbook_path.exists():
        return parse_excel_bytes(workbook_path.read_bytes())
    raise FileNotFoundError(f"Excel file not found: {workbook_path}")
|
|
|
|
|
|
def parse_excel_url(url: str, *, timeout: float = 30.0) -> list[ParsedSheet]:
    """Download an .xlsx from a URL and parse it.

    SSRF guard: only http/https URLs are accepted, and the target host must
    pass ``_assert_safe_host`` before any request is sent. Redirects are not
    followed automatically; up to five hops are followed by hand so that
    every ``Location`` target goes through the same validation.

    The host is validated both as ``urllib.parse`` reads it and as ``httpx``
    reads it, because ``httpx`` is the component that actually connects; if
    the two parsers ever disagree about the host, both readings must be safe.

    Limitation: the host is resolved once for validation and again by httpx
    when connecting, so a DNS answer that changes in between (DNS rebinding)
    is NOT detected here.

    Args:
        url: Absolute http(s) URL of the workbook.
        timeout: Timeout passed to each ``httpx.get`` call.

    Returns:
        The parsed sheets, as produced by ``parse_excel_bytes``.

    Raises:
        ValueError: disallowed scheme, missing host, unsafe host, or unsafe
            redirect target (also raised by ``parse_excel_bytes`` for
            unparseable content).
        Exception: errors raised by httpx itself (including whatever
            ``raise_for_status`` raises for the final response) propagate
            unchanged.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"Disallowed URL scheme: {parsed.scheme!r} (only http/https)")
    if not parsed.hostname:
        raise ValueError("URL has no hostname")
    _assert_safe_host(parsed.hostname)

    # httpx parses the URL on its own when connecting. Validate its reading of
    # the URL too, so a parser differential cannot route around the guard.
    current = httpx.URL(url)
    if current.scheme not in ("http", "https") or not current.host:
        raise ValueError(f"Unsafe URL: {url!r}")
    if current.host != parsed.hostname:
        _assert_safe_host(current.host)

    resp = httpx.get(current, timeout=timeout, follow_redirects=False)

    # Follow redirects manually, re-validating each Location (SSRF guard).
    max_redirects = 5
    hops = 0
    while resp.is_redirect and hops < max_redirects:
        hops += 1
        # Relative Location values are resolved against the URL just fetched.
        next_url = current.join(resp.headers["location"])
        if next_url.scheme not in ("http", "https") or not next_url.host:
            raise ValueError(f"Unsafe redirect target: {next_url}")
        _assert_safe_host(next_url.host)
        resp = httpx.get(next_url, timeout=timeout, follow_redirects=False)
        current = next_url

    resp.raise_for_status()
    return parse_excel_bytes(resp.content)
|
|
|
|
|
|
def _assert_safe_host(host: str) -> None:
    """Raise ``ValueError`` if ``host`` is, or resolves to, an unsafe IP.

    Accepts IPv4/IPv6 literals and DNS names. DNS names are resolved with
    ``socket.getaddrinfo`` and every returned address is checked with
    ``_is_unsafe_ip``; a single unsafe address blocks the request.

    The check fails closed: a host that cannot be resolved, resolves to no
    address, or resolves to something that cannot be parsed as an IP address
    is rejected rather than waved through.

    Limitation: this does not defend against DNS rebinding after the check
    (TOCTOU); the upgrade path is to pin the resolved IP in the httpx
    transport.

    Args:
        host: Hostname or IP literal (no brackets, no port).

    Raises:
        ValueError: the host is unsafe or cannot be verified as safe.
    """
    try:
        literal = ipaddress.ip_address(host)
    except ValueError:
        literal = None  # not an IP literal; handled as a DNS name below

    if literal is not None:
        if _is_unsafe_ip(literal):
            raise ValueError(f"Host {host!r} is a private/loopback/reserved IP: {literal}")
        return

    # Hostname: resolve and check every returned address.
    try:
        infos = socket.getaddrinfo(host, None)
    except socket.gaierror as e:
        raise ValueError(f"Cannot resolve host {host!r}: {e}") from e

    checked: set[str] = set()
    for info in infos:
        ip_str = str(info[4][0])
        if ip_str in checked:
            # getaddrinfo repeats an address once per socket type.
            continue
        checked.add(ip_str)
        try:
            # Drop an IPv6 zone id ("fe80::1%eth0") before parsing.
            addr = ipaddress.ip_address(ip_str.partition("%")[0])
        except ValueError as e:
            # Fail closed: an address we cannot classify is not known-safe.
            raise ValueError(
                f"Host {host!r} resolved to an unparseable address {ip_str!r}"
            ) from e
        if _is_unsafe_ip(addr):
            raise ValueError(f"Host {host!r} resolves to private/reserved IP {addr}")

    if not checked:
        raise ValueError(f"Cannot resolve host {host!r}: no addresses returned")
|
|
|
|
|
|
def _is_unsafe_ip(addr: ipaddress._BaseAddress) -> bool:
|
|
"""True if the address is private, loopback, link-local, reserved, or multicast."""
|
|
return (
|
|
addr.is_private
|
|
or addr.is_loopback
|
|
or addr.is_link_local
|
|
or addr.is_reserved
|
|
or addr.is_multicast
|
|
or addr.is_unspecified
|
|
)
|
|
|
|
|
|
def parse_excel_bytes(content: bytes) -> list[ParsedSheet]:
    """Parse an in-memory .xlsx payload into a list of ``ParsedSheet``.

    The workbook is opened read-only with ``data_only=True`` and is always
    closed again, even if parsing a worksheet fails. Worksheets for which
    ``_parse_worksheet`` returns ``None`` are left out of the result.

    Raises:
        ImportError: openpyxl is not installed.
        ValueError: the payload cannot be opened as a workbook.
    """
    try:
        from openpyxl import load_workbook
    except ImportError as e:
        raise ImportError("openpyxl is required for Excel ingestion") from e

    try:
        workbook = load_workbook(io.BytesIO(content), data_only=True, read_only=True)
    except Exception as e:
        raise ValueError(f"Failed to parse Excel file: {e}") from e

    try:
        # Lazy generator, fully consumed before ``finally`` closes the workbook.
        candidates = (_parse_worksheet(ws) for ws in workbook.worksheets)
        return [sheet for sheet in candidates if sheet is not None]
    finally:
        workbook.close()
|
|
|
|
|
|
def _parse_worksheet(ws) -> ParsedSheet | None:
|
|
"""Parse a single worksheet. Returns ``None`` for completely empty sheets."""
|
|
rows_iter = ws.iter_rows(values_only=True)
|
|
|
|
# First row = headers
|
|
try:
|
|
header_row = next(rows_iter)
|
|
except StopIteration:
|
|
return None # empty sheet
|
|
|
|
headers = [str(v).strip() if v is not None else f"col_{i}" for i, v in enumerate(header_row)]
|
|
# Deduplicate headers
|
|
seen: dict[str, int] = {}
|
|
clean_headers: list[str] = []
|
|
for h in headers:
|
|
if h in seen:
|
|
seen[h] += 1
|
|
clean_headers.append(f"{h}_{seen[h]}")
|
|
else:
|
|
seen[h] = 0
|
|
clean_headers.append(h)
|
|
|
|
# Collect data rows
|
|
data_rows: list[tuple] = []
|
|
for row in rows_iter:
|
|
if len(data_rows) >= MAX_ROWS_PER_SHEET:
|
|
logger.warning("Sheet %r truncated at %d rows", ws.title, MAX_ROWS_PER_SHEET)
|
|
break
|
|
data_rows.append(row)
|
|
|
|
# Infer field types and build records
|
|
col_count = len(clean_headers)
|
|
field_types = _infer_column_types(data_rows, col_count)
|
|
|
|
records: list[dict[str, object]] = []
|
|
for row in data_rows:
|
|
rec: dict[str, object] = {}
|
|
for i, col_name in enumerate(clean_headers):
|
|
val = row[i] if i < len(row) else None
|
|
if val is not None:
|
|
val = _coerce_value(val, field_types[i])
|
|
rec[col_name] = val
|
|
records.append(rec)
|
|
|
|
return ParsedSheet(
|
|
name=ws.title,
|
|
columns=clean_headers,
|
|
field_types=field_types,
|
|
records=records,
|
|
)
|
|
|
|
|
|
def _infer_column_types(rows: list[tuple], col_count: int) -> list[str]:
|
|
"""Infer bitable field type per column: 'number', 'date', or 'text'."""
|
|
from datetime import date, datetime
|
|
|
|
types: list[str] = []
|
|
for col_idx in range(col_count):
|
|
is_number = True
|
|
is_date = True
|
|
has_value = False
|
|
for row in rows:
|
|
if col_idx >= len(row):
|
|
continue
|
|
val = row[col_idx]
|
|
if val is None or val == "":
|
|
continue
|
|
has_value = True
|
|
if isinstance(val, bool):
|
|
is_number = False
|
|
is_date = False
|
|
break
|
|
if not isinstance(val, (int, float)):
|
|
is_number = False
|
|
if not isinstance(val, (datetime, date)):
|
|
is_date = False
|
|
if not is_number and not is_date:
|
|
break
|
|
if not has_value:
|
|
types.append("text")
|
|
elif is_number:
|
|
types.append("number")
|
|
elif is_date:
|
|
types.append("date")
|
|
else:
|
|
types.append("text")
|
|
return types
|
|
|
|
|
|
def _coerce_value(val: object, field_type: str) -> object:
|
|
"""Coerce a cell value to the inferred field type. Truncate long strings."""
|
|
if field_type == "date":
|
|
from datetime import datetime
|
|
|
|
if isinstance(val, datetime):
|
|
return val.isoformat()
|
|
if isinstance(val, str) and len(val) > MAX_CELL_CHARS:
|
|
return val[:MAX_CELL_CHARS]
|
|
return val
|