Skip to content

Commit c6d25eb

Browse files
Copilot and paulbkoch committed
Remove duplicate pd.StringDtype handling block
Co-authored-by: paulbkoch <46825734+paulbkoch@users.noreply.github.com>
1 parent c6ae93d commit c6d25eb

File tree

1 file changed

+10
-58
lines changed

1 file changed

+10
-58
lines changed

python/interpret-core/interpret/utils/_clean_x.py

Lines changed: 10 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,25 +1060,6 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
10601060
X_col.codes,
10611061
None,
10621062
)
1063-
elif isinstance(dt, pd.StringDtype):
1064-
# this handles pd.StringDtype both the numpy and arrow versions
1065-
# StringDtype is similar to object dtype but with proper NA handling
1066-
if X_col.hasnans:
1067-
# if hasnans is true then there is definitely a real missing value in there and not just a mask
1068-
return _process_ndarray(
1069-
X_col.dropna().values.astype(np.str_, copy=False),
1070-
X_col.notna().values,
1071-
is_initial,
1072-
feature_type,
1073-
min_unique_continuous,
1074-
)
1075-
return _process_ndarray(
1076-
X_col.values.astype(np.str_, copy=False),
1077-
None,
1078-
is_initial,
1079-
feature_type,
1080-
min_unique_continuous,
1081-
)
10821063
elif issubclass(tt, _intbool_types):
10831064
# this handles Int8Dtype to Int64Dtype, UInt8Dtype to UInt64Dtype, and BooleanDtype
10841065
if X_col.hasnans:
@@ -1101,51 +1082,22 @@ def _process_pandas_column(X_col, is_initial, feature_type, min_unique_continuou
11011082
)
11021083
elif isinstance(dt, pd.StringDtype):
11031084
# this handles pd.StringDtype both the numpy and arrow versions
1104-
# Pass StringArray to _process_arrayish to avoid inefficient conversion
1085+
# StringDtype is similar to object dtype but with proper NA handling
11051086
if X_col.hasnans:
11061087
# if hasnans is true then there is definitely a real missing value in there and not just a mask
1107-
if feature_type == "continuous":
1108-
# called under: fit or predict
1109-
# Convert to string for continuous processing
1110-
return (
1111-
feature_type,
1112-
None,
1113-
None,
1114-
*_process_continuous(
1115-
X_col.dropna().values.astype(np.str_, copy=False),
1116-
X_col.notna().values,
1117-
),
1118-
)
1119-
if is_predict:
1120-
# called under: predict. feature_type == "nominal" or feature_type == "ordinal"
1121-
return (
1122-
None,
1123-
*_encode_categorical_existing(
1124-
X_col.dropna().values, X_col.notna().values
1125-
),
1126-
None,
1127-
)
1128-
return _process_arrayish(
1129-
X_col.dropna().values,
1088+
return _process_ndarray(
1089+
X_col.dropna().values.astype(np.str_, copy=False),
11301090
X_col.notna().values,
1091+
is_initial,
11311092
feature_type,
11321093
min_unique_continuous,
11331094
)
1134-
1135-
if feature_type == "continuous":
1136-
# called under: fit or predict
1137-
# Convert to string for continuous processing
1138-
return (
1139-
feature_type,
1140-
None,
1141-
None,
1142-
*_process_continuous(X_col.values.astype(np.str_, copy=False), None),
1143-
)
1144-
if is_predict:
1145-
# called under: predict. feature_type == "nominal" or feature_type == "ordinal"
1146-
return None, *_encode_categorical_existing(X_col.values, None), None
1147-
return _process_arrayish(
1148-
X_col.values, None, feature_type, min_unique_continuous
1095+
return _process_ndarray(
1096+
X_col.values.astype(np.str_, copy=False),
1097+
None,
1098+
is_initial,
1099+
feature_type,
1100+
min_unique_continuous,
11491101
)
11501102

11511103
# TODO: implement pd.SparseDtype

0 commit comments

Comments
 (0)