Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions pandera/backends/pandas/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@
import pandas as pd

from pandera.api.base.error_handler import ErrorHandler
from pandera.api.pandas.components import Column
from pandera.api.pandas.types import (
is_field,
is_index,
is_multiindex,
is_table,
)
from pandera.backends.base import CoreCheckResult
from pandera.backends.pandas.array import ArraySchemaBackend
from pandera.backends.pandas.array import (
ArraySchemaBackend,
SeriesSchemaBackend,
)
from pandera.backends.pandas.base import PandasSchemaBackend
from pandera.backends.pandas.error_formatters import reshape_failure_cases
from pandera.errors import (
Expand Down Expand Up @@ -660,20 +664,47 @@ def _validate_level_with_full_materialization(
Used both as a fallback when optimization isn't possible and when
errors are identified in optimized validation
in order to provide proper error reporting with correct indices.

Validates a Series indexed by the full MultiIndex to ensure failure_cases
naturally contains the correct MultiIndex positions.
"""
# Materialize the full level values
full_values = multiindex.get_level_values(level_pos)
full_stub_df = pd.DataFrame(index=full_values)

# Run validation on full materialized values
index_schema.validate(
full_stub_df,
# Create a Series with level values as data, indexed by the full MultiIndex
level_series = pd.Series(
full_values.values, index=multiindex, name=index_schema.name
)

# Validate as a column (Series), rather than as an index
# to ensure that failure_cases will have all levels in the 'index' column
column_schema = Column(
dtype=index_schema.dtype,
checks=index_schema.checks,
parsers=index_schema.parsers,
nullable=index_schema.nullable,
unique=index_schema.unique,
report_duplicates=index_schema.report_duplicates,
coerce=index_schema.coerce,
name=index_schema.name,
title=index_schema.title,
description=index_schema.description,
default=index_schema.default,
metadata=index_schema.metadata,
drop_invalid_rows=index_schema.drop_invalid_rows,
)

# Use the SeriesSchemaBackend directly instead of column_schema.validate()
# because Column.validate() expects a DataFrame.
backend = SeriesSchemaBackend()
backend.validate(
check_obj=level_series,
schema=column_schema,
head=head,
tail=tail,
sample=sample,
random_state=random_state,
lazy=lazy,
inplace=True,
)

def _check_strict(
Expand Down
44 changes: 44 additions & 0 deletions tests/pandas/test_schema_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,50 @@ def test_multi_index_index() -> None:
schema.validate(df_fail)


def test_multi_index_failure_cases_show_full_tuples() -> None:
    """Check that MultiIndex failure cases report the complete index tuple.

    The first level holds the value ``"c"`` at positions 2 and 4, which the
    ``isin(["a", "b"])`` check on that level rejects. The ``index`` column of
    the resulting failure cases is expected to carry the full
    ``(level0, level1)`` tuple of each offending row rather than only the
    failing level value.
    """
    # Level values are spelled out separately so the failing rows are easy to
    # spot: "c" sits at positions 2 and 4, paired with 3 and 5 on level 1.
    level0_values = ["a", "a", "c", "b", "c", "a"]
    level1_values = [1, 2, 3, 4, 5, 6]
    multi_idx = pd.MultiIndex.from_arrays(
        [level0_values, level1_values],
        names=["level0", "level1"],
    )
    frame = pd.DataFrame({"col": range(6)}, index=multi_idx)

    # Only the first level carries a check; it accepts "a" and "b" only.
    level0_schema = Index(String, Check.isin(["a", "b"]), name="level0")
    level1_schema = Index(Int, name="level1")
    schema = DataFrameSchema(
        columns={"col": Column(int)},
        index=MultiIndex(indexes=[level0_schema, level1_schema]),
    )

    # Lazy validation gathers every error into a single SchemaErrors.
    with pytest.raises(errors.SchemaErrors) as caught:
        schema.validate(frame, lazy=True)

    collected = caught.value.schema_errors
    assert len(collected) == 1

    failure_cases = collected[0].failure_cases
    assert isinstance(failure_cases, pd.DataFrame)
    assert "index" in failure_cases.columns

    # Compare on a fresh positional index so only the reported values matter.
    reported = failure_cases["index"].reset_index(drop=True)
    expected = pd.Series(["('c', 3)", "('c', 5)"], name="index")
    pd.testing.assert_series_equal(reported, expected, check_names=False)


def test_single_index_multi_index_mismatch() -> None:
"""Tests the failure case that attempting to validate a MultiIndex DataFrame
against a single index schema raises a SchemaError with a constructive error
Expand Down
Loading