183 lines
5.7 KiB
Python
183 lines
5.7 KiB
Python
"""Tests for Excel ingestion (U4).
|
|
|
|
Tests parse_excel_bytes with in-memory .xlsx files created via openpyxl.
|
|
No PostgreSQL required — these are pure parsing tests.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
from datetime import datetime
|
|
|
|
import pytest
|
|
|
|
from agentkit.bitable.ingestion.excel import parse_excel_bytes
|
|
|
|
# NOTE(review): the module docstring states these are pure parsing tests that
# need no PostgreSQL, yet the whole module carries the ``postgres`` marker.
# Per the inline note this is deliberate (same PG test group for consistency);
# confirm the marker does not cause these tests to be deselected or skipped
# when PostgreSQL is unavailable.
pytestmark = pytest.mark.postgres  # Reuse the same PG test group for consistency
|
|
|
|
|
|
def _make_xlsx(sheets: dict[str, list[list]]) -> bytes:
    """Build an .xlsx workbook in memory and return its serialized bytes.

    Args:
        sheets: Mapping of sheet title to that sheet's rows, where every row
            is a list of cell values, e.g.
            ``{"Sheet1": [["h1", "h2"], [1, 2]]}``.

    Returns:
        The bytes written by ``Workbook.save`` into an in-memory buffer.
    """
    from openpyxl import Workbook

    workbook = Workbook()
    # Drop the default sheet so the file contains only the requested sheets.
    workbook.remove(workbook.active)

    for title, sheet_rows in sheets.items():
        worksheet = workbook.create_sheet(title=title)
        for cells in sheet_rows:
            worksheet.append(cells)

    stream = io.BytesIO()
    workbook.save(stream)
    return stream.getvalue()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Happy path: basic parsing
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_parse_simple_sheet() -> None:
    """A header plus two data rows parse into matching columns, types and records."""
    rows = [
        ["name", "age", "city"],
        ["Alice", 30, "NYC"],
        ["Bob", 25, "LA"],
    ]
    workbook_bytes = _make_xlsx({"Sheet1": rows})

    parsed = parse_excel_bytes(workbook_bytes)

    assert len(parsed) == 1
    only_sheet = parsed[0]

    # Sheet-level metadata.
    assert only_sheet.name == "Sheet1"
    assert only_sheet.columns == ["name", "age", "city"]
    assert only_sheet.field_types == ["text", "number", "text"]

    # One record per data row, keyed by column name.
    assert len(only_sheet.records) == 2
    assert only_sheet.records[0] == {"name": "Alice", "age": 30, "city": "NYC"}
    assert only_sheet.records[1] == {"name": "Bob", "age": 25, "city": "LA"}
|
|
|
|
|
|
def test_parse_multiple_sheets() -> None:
    """A workbook with two sheets yields two parsed sheets, in workbook order."""
    workbook_data = {
        "Users": [["id", "name"], [1, "Alice"]],
        "Orders": [["order_id", "amount"], [101, 99.9]],
    }

    parsed = parse_excel_bytes(_make_xlsx(workbook_data))

    assert len(parsed) == 2
    users_sheet, orders_sheet = parsed[0], parsed[1]
    assert users_sheet.name == "Users"
    assert orders_sheet.name == "Orders"
    assert orders_sheet.records[0]["amount"] == 99.9
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Type inference
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_type_inference_all_number() -> None:
    """A column holding only integers is inferred as 'number'."""
    rows = [["val"], [1], [2], [3]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["number"]
|
|
|
|
|
|
def test_type_inference_mixed_text_number() -> None:
    """A column mixing text and numeric cells is inferred as 'text'."""
    rows = [["val"], [1], ["two"], [3]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["text"]
|
|
|
|
|
|
def test_type_inference_date_column() -> None:
    """A column holding only datetime cells is inferred as 'date'."""
    stamps = [datetime(2024, 1, 1), datetime(2024, 6, 15)]
    rows = [["when"]] + [[stamp] for stamp in stamps]

    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    date_sheet = parsed[0]

    assert date_sheet.field_types == ["date"]
    # The first record's value must contain the first date in ISO form.
    assert "2024-01-01" in date_sheet.records[0]["when"]
|
|
|
|
|
|
def test_type_inference_empty_column() -> None:
    """A column whose cells are all empty falls back to 'text'."""
    # Column "a" is numeric; column "b" has no values at all.
    rows = [["a", "b"], [1, None], [2, None]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].field_types == ["number", "text"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_empty_sheet_skipped() -> None:
    """A sheet with no rows at all is left out of the parsed results."""
    workbook_data = {"Empty": [], "Data": [["x"], [1]]}

    parsed = parse_excel_bytes(_make_xlsx(workbook_data))

    assert len(parsed) == 1
    assert parsed[0].name == "Data"
|
|
|
|
|
|
def test_header_only_no_data_rows() -> None:
    """A header-only sheet is still returned, with its columns and zero records."""
    workbook_bytes = _make_xlsx({"S": [["name", "age"]]})

    parsed = parse_excel_bytes(workbook_bytes)

    assert len(parsed) == 1
    header_only = parsed[0]
    assert header_only.columns == ["name", "age"]
    assert len(header_only.records) == 0
|
|
|
|
|
|
def test_duplicate_headers_deduplicated() -> None:
    """A repeated header name gets a numeric suffix (name, name_1, ...)."""
    rows = [["name", "name"], ["Alice", "Bob"]]
    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    assert parsed[0].columns == ["name", "name_1"]
|
|
|
|
|
|
def test_none_header_replaced() -> None:
    """A missing (None) header cell is replaced by a generated column name."""
    rows = [[None, "real"], [1, 2]]

    parsed = parse_excel_bytes(_make_xlsx({"S": rows}))
    column_names = parsed[0].columns

    assert column_names[0] == "col_0"
    assert column_names[1] == "real"
|
|
|
|
|
|
def test_corrupt_file_raises_value_error() -> None:
    """Bytes that are not an .xlsx file raise ValueError matching 'Failed to parse'."""
    garbage = b"not an excel file"
    with pytest.raises(ValueError, match="Failed to parse"):
        parse_excel_bytes(garbage)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Merged cells (known limitation)
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_merged_cells_only_top_left_has_value() -> None:
    """In a merged range only the top-left cell keeps a value (known limitation)."""
    from openpyxl import Workbook

    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = "Merged"
    for cells in (["a", "b", "c"], [1, 2, 3]):
        worksheet.append(cells)
    # Merge A2:B2 so that only A2 keeps its value.
    worksheet.merge_cells("A2:B2")

    stream = io.BytesIO()
    workbook.save(stream)

    parsed = parse_excel_bytes(stream.getvalue())
    merged_row = parsed[0].records[0]

    # A2 keeps its value 1; B2 comes back as None (merged cell limitation).
    assert merged_row["a"] == 1
    assert merged_row["b"] is None
|