BUG: frame.loc[2:, 'z'] not setting inplace when multi-block (#44345)
jbrockmendel authored Nov 13, 2021
1 parent 2d3644c commit 573c063
Showing 6 changed files with 65 additions and 23 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -575,6 +575,7 @@ Indexing
- Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`)
- Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`)
- Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`)
- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
-

Missing
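For illustration only (not part of the commit): a minimal sketch of the behavior the new whatsnew entry describes — a partial-column `.loc` assignment on a mixed-dtype (multi-block) frame writing into the existing column array instead of replacing it. The `np.shares_memory` check and the expected `True` assume pandas 1.4+ with the default BlockManager.

```python
import numpy as np
import pandas as pd

# Mixed dtypes -> the frame is backed by more than one block.
df = pd.DataFrame({"x": [1.1, 2.1, 3.1], "y": ["a", "b", "c"]})
df["z"] = np.nan

zvals = df["z"]._values  # underlying float array before the assignment

# Partial-column assignment with a value the existing dtype can hold;
# after GH#44345 this should modify zvals in place rather than replace it.
df.loc[df.index[1:], "z"] = 42.0

print(np.shares_memory(zvals, df["z"]._values))  # expected: True on 1.4+
```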
17 changes: 13 additions & 4 deletions pandas/core/indexing.py
@@ -1860,10 +1860,19 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
# in case of slice
ser = value[pi]
else:
# set the item, possibly having a dtype change
ser = ser.copy()
ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
ser._maybe_update_cacher(clear=True, inplace=True)
# set the item, first attempting to operate inplace, then
# falling back to casting if necessary; see
# _whatsnew_130.notable_bug_fixes.setitem_column_try_inplace

orig_values = ser._values
ser._mgr = ser._mgr.setitem((pi,), value)

if ser._values is orig_values:
# The setitem happened inplace, so the DataFrame's values
# were modified inplace.
return
self.obj._iset_item(loc, ser, inplace=True)
return

# reset the sliced object if unique
self.obj._iset_item(loc, ser, inplace=True)
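A standalone sketch of the "try in place, then fall back" pattern the new `_setitem_single_column` code follows. This is plain NumPy for illustration and does not reproduce the actual `BlockManager.setitem` machinery; the `set_values` helper and its fallback-to-object cast are hypothetical.

```python
import numpy as np

def set_values(col: np.ndarray, idx, value):
    """Write ``value`` into ``col`` at ``idx`` in place when the dtype can hold
    it, otherwise return a freshly cast copy. The caller compares the returned
    object with the original to know whether the write happened in place."""
    try:
        col[idx] = value              # cheap path: mutate the existing array
        return col                    # same object -> it was set in place
    except (ValueError, TypeError):
        upcast = col.astype(object)   # simplistic fallback cast for this sketch
        upcast[idx] = value
        return upcast                 # new object -> caller must swap it in

arr = np.array([1.0, 2.0, 3.0])
out = set_values(arr, slice(1, None), 42.0)
print(out is arr)                     # True: the float fits, set in place

out2 = set_values(arr, 0, "a")        # a string cannot live in a float array
print(out2 is arr)                    # False: fallback produced a new object array
```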
7 changes: 4 additions & 3 deletions pandas/io/stata.py
@@ -600,6 +600,8 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
# Cast from unsupported types to supported types
is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype))
orig = data[col]
# We need to find orig_missing before altering data below
orig_missing = orig.isna()
if is_nullable_int:
missing_loc = data[col].isna()
if missing_loc.any():
@@ -650,11 +652,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
f"supported by Stata ({float64_max})"
)
if is_nullable_int:
missing = orig.isna()
if missing.any():
if orig_missing.any():
# Replace missing by Stata sentinel value
sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
data.loc[missing, col] = sentinel
data.loc[orig_missing, col] = sentinel
if ws:
warnings.warn(ws, PossiblePrecisionLoss)

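For context (illustration only): the stata.py change records the missing-value mask before the column is rewritten, because after the nullable-int column is filled and cast, `isna()` can no longer report the original NA positions. A minimal sketch of that ordering; the `SENTINEL` value is a stand-in, whereas the real code looks it up in `StataMissingValue.BASE_MISSING_VALUES`.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"col": pd.array([1, None, 3], dtype="Int64")})

# Record where the NAs are *before* the column is altered; after the
# fill/cast below, isna() would no longer report them.
orig_missing = df["col"].isna()

# Stand-in for the unsupported-to-supported dtype conversion in
# _cast_to_stata_types: fill NA and cast to a plain numpy integer dtype.
df["col"] = df["col"].fillna(0).astype(np.int32)

# Now write the sentinel into the originally-missing positions.
SENTINEL = 2147483621  # hypothetical stand-in, not necessarily Stata's value
df.loc[orig_missing, "col"] = SENTINEL
print(df)
```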
36 changes: 28 additions & 8 deletions pandas/tests/frame/indexing/test_setitem.py
@@ -384,7 +384,7 @@ def test_setitem_frame_length_0_str_key(self, indexer):
expected["A"] = expected["A"].astype("object")
tm.assert_frame_equal(df, expected)

def test_setitem_frame_duplicate_columns(self, using_array_manager):
def test_setitem_frame_duplicate_columns(self, using_array_manager, request):
# GH#15695
cols = ["A", "B", "C"] * 2
df = DataFrame(index=range(3), columns=cols)
@@ -407,6 +407,11 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager):
expected["C"] = expected["C"].astype("int64")
# TODO(ArrayManager) .loc still overwrites
expected["B"] = expected["B"].astype("int64")

mark = pytest.mark.xfail(
reason="Both 'A' columns get set with 3 instead of 0 and 3"
)
request.node.add_marker(mark)
else:
# set these with unique columns to be extra-unambiguous
expected[2] = expected[2].astype(np.int64)
@@ -995,22 +1000,37 @@ def test_setitem_always_copy(self, float_frame):
float_frame["E"][5:10] = np.nan
assert notna(s[5:10]).all()

def test_setitem_clear_caches(self):
# see GH#304
@pytest.mark.parametrize("consolidate", [True, False])
def test_setitem_partial_column_inplace(self, consolidate, using_array_manager):
# This setting should be in-place, regardless of whether frame is
# single-block or multi-block
# GH#304 this used to be incorrectly not-inplace, in which case
# we needed to ensure _item_cache was cleared.

df = DataFrame(
{"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3]
)
df.insert(2, "z", np.nan)
if not using_array_manager:
if consolidate:
df._consolidate_inplace()
assert len(df._mgr.blocks) == 1
else:
assert len(df._mgr.blocks) == 2

# cache it
foo = df["z"]
df.loc[df.index[2:], "z"] = 42
zvals = df["z"]._values

expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
df.loc[2:, "z"] = 42

assert df["z"] is not foo
expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
tm.assert_series_equal(df["z"], expected)

# check setting occurred in-place
tm.assert_numpy_array_equal(zvals, expected.values)
assert np.shares_memory(zvals, df["z"]._values)
if not consolidate:
assert df["z"]._values is zvals

def test_setitem_duplicate_columns_not_inplace(self):
# GH#39510
cols = ["A", "B"] * 2
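Aside, for readers unfamiliar with the idiom used in `test_setitem_frame_duplicate_columns` above: `request.node.add_marker` lets a test xfail only the parameter combination that is known to be broken, rather than the whole parametrized test. A minimal, hypothetical example:

```python
import pytest

@pytest.mark.parametrize("flag", [True, False])
def test_conditional_xfail(flag, request):
    if flag:
        # Mark only this parametrization as an expected failure.
        mark = pytest.mark.xfail(reason="known-bad combination in this sketch")
        request.node.add_marker(mark)
    assert not flag  # fails (reported as xfail) only when flag is True
```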
15 changes: 8 additions & 7 deletions pandas/tests/frame/indexing/test_xs.py
@@ -366,20 +366,21 @@ def test_xs_droplevel_false_view(self, using_array_manager):
assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)
# modifying original df also modifies result when having a single block
df.iloc[0, 0] = 2
if not using_array_manager:
expected = DataFrame({"a": [2]})
else:
# TODO(ArrayManager) iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
expected = DataFrame({"a": [2]})
tm.assert_frame_equal(result, expected)

# with mixed dataframe, modifying the parent doesn't modify result
# TODO the "split" path behaves differently here as with single block
df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
df.iloc[0, 0] = 2
expected = DataFrame({"a": [1]})
if using_array_manager:
# Here the behavior is consistent
expected = DataFrame({"a": [2]})
else:
# FIXME: iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)

def test_xs_list_indexer_droplevel_false(self):
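For illustration (mirroring the test above, with pandas 1.4's default BlockManager and no copy-on-write): `xs(..., drop_level=False)` on a single-block frame returns a view, so writes through the parent are visible in the result, while on a mixed-dtype frame the "split" setitem path leaves a previously-taken result untouched.

```python
import numpy as np
import pandas as pd

# Single block: the xs result shares memory with the parent.
df = pd.DataFrame({"a": [1]})
result = df.xs("a", axis=1, drop_level=False)
print(np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values))  # True
df.iloc[0, 0] = 2
print(result)      # shows 2: the parent write is visible through the view

# Mixed dtypes: the "split" path writes into a new array, so the earlier
# xs result keeps the old value (the FIXME noted in the test).
df2 = pd.DataFrame([[1, 2.5, "a"]], columns=pd.Index(["a", "b", "c"]))
result2 = df2.xs("a", axis=1, drop_level=False)
df2.iloc[0, 0] = 2
print(result2)     # still shows 1 on the BlockManager path
```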
12 changes: 11 additions & 1 deletion pandas/tests/frame/test_reductions.py
@@ -789,6 +789,10 @@ def test_std_timedelta64_skipna_false(self):
# GH#37392
tdi = pd.timedelta_range("1 Day", periods=10)
df = DataFrame({"A": tdi, "B": tdi})
# Copy is needed for the ArrayManager case, otherwise setting df.iloc
# below edits tdi, altering both df['A'] and df['B']
# FIXME: passing copy=True to constructor does not fix this
df = df.copy()
df.iloc[-2, -1] = pd.NaT

result = df.std(skipna=False)
@@ -1017,7 +1021,9 @@ def test_idxmax_mixed_dtype(self):
# don't cast to object, which would raise in nanops
dti = date_range("2016-01-01", periods=3)

df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti})
# Copying dti is needed for ArrayManager otherwise when we set
# df.loc[0, 3] = pd.NaT below it edits dti
df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})

result = df.idxmax()
expected = Series([1, 0, 2], index=[1, 2, 3])
@@ -1074,6 +1080,10 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
def test_idxmax_dt64_multicolumn_axis1(self):
dti = date_range("2016-01-01", periods=3)
df = DataFrame({3: dti, 4: dti[::-1]})
# FIXME: copy needed for ArrayManager, otherwise setting with iloc
# below also sets df.iloc[-1, 1]; passing copy=True to DataFrame
# does not solve this.
df = df.copy()
df.iloc[0, 0] = pd.NaT

df._consolidate_inplace()
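For illustration of the defensive copies added above: when two columns are built from the same index object, the ArrayManager may back both columns (and the index) with the same underlying array, so an `iloc` write to one column would leak into the other. The `df.copy()` decouples them before mutating. A minimal sketch (behaviour without the copy differs between managers):

```python
import pandas as pd

tdi = pd.timedelta_range("1 Day", periods=3)

df = pd.DataFrame({"A": tdi, "B": tdi})
df = df.copy()            # decouple the columns from tdi and from each other

df.iloc[-1, -1] = pd.NaT  # only column "B" should change
print(df["A"].iloc[-1])   # Timedelta('3 days'): column "A" is unaffected
print(tdi[-1])            # Timedelta('3 days'): tdi itself is unaffected
```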