fischer-agentkit/tests/unit/bitable/test_ingestion_excel.py

183 lines
5.7 KiB
Python

"""Tests for Excel ingestion (U4).
Tests parse_excel_bytes with in-memory .xlsx files created via openpyxl.
No PostgreSQL required — these are pure parsing tests.
"""
from __future__ import annotations
import io
from datetime import datetime
import pytest
from agentkit.bitable.ingestion.excel import parse_excel_bytes
# Module-wide pytest marker: every test in this file is tagged `postgres`.
# NOTE(review): the module docstring states that these tests need no
# PostgreSQL, yet this marker files them under the PG test group — confirm
# that the group's selection/skip rules do not hide these pure parsing tests
# when no database is available.
pytestmark = pytest.mark.postgres # Reuse the same PG test group for consistency
def _make_xlsx(
    sheets: dict[str, list[list]],
) -> bytes:
    """Build an .xlsx workbook in memory and return its raw bytes.

    Args:
        sheets: Mapping of sheet title to that sheet's rows, each row being a
            list of cell values, e.g. ``{"Sheet1": [["h1", "h2"], [1, 2]]}``.

    Returns:
        The bytes written by ``Workbook.save`` into an in-memory buffer.
    """
    from openpyxl import Workbook

    workbook = Workbook()
    # Drop the default sheet so only the requested sheets end up in the file.
    workbook.remove(workbook.active)

    for title, sheet_rows in sheets.items():
        worksheet = workbook.create_sheet(title=title)
        for cells in sheet_rows:
            worksheet.append(cells)

    stream = io.BytesIO()
    workbook.save(stream)
    return stream.getvalue()
# ---------------------------------------------------------------------------
# Happy path: basic parsing
# ---------------------------------------------------------------------------
def test_parse_simple_sheet() -> None:
    """A header row plus two data rows yields the expected columns, types and records."""
    rows = [
        ["name", "age", "city"],
        ["Alice", 30, "NYC"],
        ["Bob", 25, "LA"],
    ]
    workbook_bytes = _make_xlsx({"Sheet1": rows})

    parsed = parse_excel_bytes(workbook_bytes)

    assert len(parsed) == 1
    only_sheet = parsed[0]
    assert only_sheet.name == "Sheet1"
    assert only_sheet.columns == ["name", "age", "city"]
    assert only_sheet.field_types == ["text", "number", "text"]
    assert len(only_sheet.records) == 2
    assert only_sheet.records[0] == {"name": "Alice", "age": 30, "city": "NYC"}
    assert only_sheet.records[1] == {"name": "Bob", "age": 25, "city": "LA"}
def test_parse_multiple_sheets() -> None:
    """A workbook with two worksheets produces two parsed sheets, in workbook order."""
    users_rows = [["id", "name"], [1, "Alice"]]
    orders_rows = [["order_id", "amount"], [101, 99.9]]
    workbook_bytes = _make_xlsx({"Users": users_rows, "Orders": orders_rows})

    parsed = parse_excel_bytes(workbook_bytes)

    assert len(parsed) == 2
    assert parsed[0].name == "Users"
    assert parsed[1].name == "Orders"
    assert parsed[1].records[0]["amount"] == 99.9
# ---------------------------------------------------------------------------
# Type inference
# ---------------------------------------------------------------------------
def test_type_inference_all_number() -> None:
    """A column holding only integers is inferred as 'number'."""
    rows = [["val"], [1], [2], [3]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["number"]
def test_type_inference_mixed_text_number() -> None:
    """A column mixing strings and numbers is inferred as 'text'."""
    rows = [["val"], [1], ["two"], [3]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["text"]
def test_type_inference_date_column() -> None:
    """A column holding only datetime values is inferred as 'date'."""
    rows = [
        ["when"],
        [datetime(2024, 1, 1)],
        [datetime(2024, 6, 15)],
    ]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))

    date_sheet = parsed[0]
    assert date_sheet.field_types == ["date"]
    # The parsed value must contain the ISO-style date of the first data row.
    assert "2024-01-01" in date_sheet.records[0]["when"]
def test_type_inference_empty_column() -> None:
    """A column whose cells are all empty falls back to 'text' (safe default)."""
    # Column "a" is numeric; column "b" has a header but no values at all.
    rows = [["a", "b"], [1, None], [2, None]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["number", "text"]
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
def test_empty_sheet_skipped() -> None:
    """A worksheet with no rows at all is left out of the parsed results."""
    workbook_bytes = _make_xlsx(
        {
            "Empty": [],
            "Data": [["x"], [1]],
        }
    )
    parsed = parse_excel_bytes(workbook_bytes)
    assert len(parsed) == 1
    assert parsed[0].name == "Data"
def test_header_only_no_data_rows() -> None:
    """A sheet with just a header row keeps its columns but has zero records."""
    header_only = [["name", "age"]]
    parsed = parse_excel_bytes(_make_xlsx({"S": header_only}))

    assert len(parsed) == 1
    sheet = parsed[0]
    assert sheet.columns == ["name", "age"]
    assert len(sheet.records) == 0
def test_duplicate_headers_deduplicated() -> None:
    """A repeated header name gets a numeric suffix (here: 'name', 'name_1')."""
    rows = [["name", "name"], ["Alice", "Bob"]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].columns == ["name", "name_1"]
def test_none_header_replaced() -> None:
    """An empty (None) header cell is replaced by an auto-generated column name."""
    rows = [[None, "real"], [1, 2]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))

    column_names = parsed[0].columns
    assert column_names[0] == "col_0"
    assert column_names[1] == "real"
def test_corrupt_file_raises_value_error() -> None:
    """Bytes that are not an .xlsx file are rejected with a clear ValueError."""
    garbage = b"not an excel file"
    with pytest.raises(ValueError, match="Failed to parse"):
        parse_excel_bytes(garbage)
# ---------------------------------------------------------------------------
# Merged cells (known limitation)
# ---------------------------------------------------------------------------
def test_merged_cells_only_top_left_has_value() -> None:
    """Known limitation: within a merged range only the top-left cell has a value."""
    from openpyxl import Workbook

    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = "Merged"
    for cells in (["a", "b", "c"], [1, 2, 3]):
        worksheet.append(cells)
    # Merge A2:B2 — afterwards only A2 holds a value.
    worksheet.merge_cells("A2:B2")

    stream = io.BytesIO()
    workbook.save(stream)

    parsed = parse_excel_bytes(stream.getvalue())
    first_record = parsed[0].records[0]
    # A2 keeps its value 1; B2 comes back as None (merged cell limitation).
    assert first_record["a"] == 1
    assert first_record["b"] is None