"""Tests for Excel ingestion (U4). Tests parse_excel_bytes with in-memory .xlsx files created via openpyxl. No PostgreSQL required — these are pure parsing tests. """ from __future__ import annotations import io from datetime import datetime import pytest from agentkit.bitable.ingestion.excel import parse_excel_bytes pytestmark = pytest.mark.postgres # Reuse the same PG test group for consistency def _make_xlsx( sheets: dict[str, list[list]], ) -> bytes: """Create an in-memory .xlsx file from sheet data. Args: sheets: {sheet_name: [[row1_col1, row1_col2], [row2_col1, ...]]} """ from openpyxl import Workbook wb = Workbook() # Remove default sheet wb.remove(wb.active) for name, rows in sheets.items(): ws = wb.create_sheet(title=name) for row in rows: ws.append(row) buf = io.BytesIO() wb.save(buf) return buf.getvalue() # --------------------------------------------------------------------------- # Happy path: basic parsing # --------------------------------------------------------------------------- def test_parse_simple_sheet() -> None: """One sheet with header + 2 data rows → correct columns, types, records.""" xlsx = _make_xlsx( { "Sheet1": [ ["name", "age", "city"], ["Alice", 30, "NYC"], ["Bob", 25, "LA"], ] } ) sheets = parse_excel_bytes(xlsx) assert len(sheets) == 1 sheet = sheets[0] assert sheet.name == "Sheet1" assert sheet.columns == ["name", "age", "city"] assert sheet.field_types == ["text", "number", "text"] assert len(sheet.records) == 2 assert sheet.records[0] == {"name": "Alice", "age": 30, "city": "NYC"} assert sheet.records[1] == {"name": "Bob", "age": 25, "city": "LA"} def test_parse_multiple_sheets() -> None: """Multiple sheets → multiple ParsedSheet objects.""" xlsx = _make_xlsx( { "Users": [["id", "name"], [1, "Alice"]], "Orders": [["order_id", "amount"], [101, 99.9]], } ) sheets = parse_excel_bytes(xlsx) assert len(sheets) == 2 assert sheets[0].name == "Users" assert sheets[1].name == "Orders" assert sheets[1].records[0]["amount"] == 99.9 # --------------------------------------------------------------------------- # Type inference # --------------------------------------------------------------------------- def test_type_inference_all_number() -> None: """Column with all integers → 'number'.""" xlsx = _make_xlsx({"S": [["val"], [1], [2], [3]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].field_types == ["number"] def test_type_inference_mixed_text_number() -> None: """Column with mixed text and number → 'text'.""" xlsx = _make_xlsx({"S": [["val"], [1], ["two"], [3]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].field_types == ["text"] def test_type_inference_date_column() -> None: """Column with all datetime values → 'date'.""" xlsx = _make_xlsx({"S": [["when"], [datetime(2024, 1, 1)], [datetime(2024, 6, 15)]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].field_types == ["date"] assert "2024-01-01" in sheets[0].records[0]["when"] def test_type_inference_empty_column() -> None: """Column with no values → 'text' (safe default).""" xlsx = _make_xlsx({"S": [["a", "b"], [1, None], [2, None]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].field_types == ["number", "text"] # --------------------------------------------------------------------------- # Edge cases # --------------------------------------------------------------------------- def test_empty_sheet_skipped() -> None: """Completely empty sheet → not included in results.""" xlsx = _make_xlsx({"Empty": [], "Data": [["x"], [1]]}) sheets = parse_excel_bytes(xlsx) assert len(sheets) == 1 assert sheets[0].name == "Data" def test_header_only_no_data_rows() -> None: """Sheet with only a header row → 0 records, fields still created.""" xlsx = _make_xlsx({"S": [["name", "age"]]}) sheets = parse_excel_bytes(xlsx) assert len(sheets) == 1 assert sheets[0].columns == ["name", "age"] assert len(sheets[0].records) == 0 def test_duplicate_headers_deduplicated() -> None: """Duplicate header names → suffixed with _1, _2, etc.""" xlsx = _make_xlsx({"S": [["name", "name"], ["Alice", "Bob"]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].columns == ["name", "name_1"] def test_none_header_replaced() -> None: """None header value → auto-generated column name.""" xlsx = _make_xlsx({"S": [[None, "real"], [1, 2]]}) sheets = parse_excel_bytes(xlsx) assert sheets[0].columns[0] == "col_0" assert sheets[0].columns[1] == "real" def test_corrupt_file_raises_value_error() -> None: """Non-xlsx bytes → ValueError with clear message.""" with pytest.raises(ValueError, match="Failed to parse"): parse_excel_bytes(b"not an excel file") # --------------------------------------------------------------------------- # Merged cells (known limitation) # --------------------------------------------------------------------------- def test_merged_cells_only_top_left_has_value() -> None: """Merged cell: only top-left has value, others are None (known limitation).""" from openpyxl import Workbook wb = Workbook() ws = wb.active ws.title = "Merged" ws.append(["a", "b", "c"]) ws.append([1, 2, 3]) ws.merge_cells("A2:B2") # merge A2:B2 — only A2 has value buf = io.BytesIO() wb.save(buf) sheets = parse_excel_bytes(buf.getvalue()) rec = sheets[0].records[0] # A2 has value 1, B2 is None (merged cell limitation) assert rec["a"] == 1 assert rec["b"] is None