BUG: frame.loc[2:, 'z'] not setting inplace when multi-block (#44345)
jbrockmendel authored Nov 13, 2021
1 parent 2d3644c commit 573c063
Showing 6 changed files with 65 additions and 23 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -575,6 +575,7 @@ Indexing
- Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`)
- Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`)
- Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`)
- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
-

Missing
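For illustration only (not part of the commit): a minimal sketch of the behavior the new whatsnew entry describes — a partial-column `.loc` assignment on a mixed-dtype (multi-block) frame writing into the existing column array instead of replacing it. The `np.shares_memory` check and the expected `True` assume pandas 1.4+ with the default BlockManager.

```python
import numpy as np
import pandas as pd

# Mixed dtypes -> the frame is backed by more than one block.
df = pd.DataFrame({"x": [1.1, 2.1, 3.1], "y": ["a", "b", "c"]})
df["z"] = np.nan

zvals = df["z"]._values  # underlying float array before the assignment

# Partial-column assignment with a value the existing dtype can hold;
# after GH#44345 this should modify zvals in place rather than replace it.
df.loc[df.index[1:], "z"] = 42.0

print(np.shares_memory(zvals, df["z"]._values))  # expected: True on 1.4+
```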
17 changes: 13 additions & 4 deletions pandas/core/indexing.py
@@ -1860,10 +1860,19 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
# in case of slice
ser = value[pi]
else:
# set the item, possibly having a dtype change
ser = ser.copy()
ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
ser._maybe_update_cacher(clear=True, inplace=True)
# set the item, first attempting to operate inplace, then
# falling back to casting if necessary; see
# _whatsnew_130.notable_bug_fixes.setitem_column_try_inplace

orig_values = ser._values
ser._mgr = ser._mgr.setitem((pi,), value)

if ser._values is orig_values:
# The setitem happened inplace, so the DataFrame's values
# were modified inplace.
return
self.obj._iset_item(loc, ser, inplace=True)
return

# reset the sliced object if unique
self.obj._iset_item(loc, ser, inplace=True)
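A standalone sketch of the "try in place, then fall back" pattern the new `_setitem_single_column` code follows. This is plain NumPy for illustration and does not reproduce the actual `BlockManager.setitem` machinery; the `set_values` helper and its fallback-to-object cast are hypothetical.

```python
import numpy as np

def set_values(col: np.ndarray, idx, value):
    """Write ``value`` into ``col`` at ``idx`` in place when the dtype can hold
    it, otherwise return a freshly cast copy. The caller compares the returned
    object with the original to know whether the write happened in place."""
    try:
        col[idx] = value              # cheap path: mutate the existing array
        return col                    # same object -> it was set in place
    except (ValueError, TypeError):
        upcast = col.astype(object)   # simplistic fallback cast for this sketch
        upcast[idx] = value
        return upcast                 # new object -> caller must swap it in

arr = np.array([1.0, 2.0, 3.0])
out = set_values(arr, slice(1, None), 42.0)
print(out is arr)                     # True: the float fits, set in place

out2 = set_values(arr, 0, "a")        # a string cannot live in a float array
print(out2 is arr)                    # False: fallback produced a new object array
```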
7 changes: 4 additions & 3 deletions pandas/io/stata.py
@@ -600,6 +600,8 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
# Cast from unsupported types to supported types
is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype))
orig = data[col]
# We need to find orig_missing before altering data below
orig_missing = orig.isna()
if is_nullable_int:
missing_loc = data[col].isna()
if missing_loc.any():
@@ -650,11 +652,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
f"supported by Stata ({float64_max})"
)
if is_nullable_int:
missing = orig.isna()
if missing.any():
if orig_missing.any():
# Replace missing by Stata sentinel value
sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
data.loc[missing, col] = sentinel
data.loc[orig_missing, col] = sentinel
if ws:
warnings.warn(ws, PossiblePrecisionLoss)

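For context (illustration only): the stata.py change records the missing-value mask before the column is rewritten, because after the nullable-int column is filled and cast, `isna()` can no longer report the original NA positions. A minimal sketch of that ordering; the `SENTINEL` value is a stand-in, whereas the real code looks it up in `StataMissingValue.BASE_MISSING_VALUES`.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"col": pd.array([1, None, 3], dtype="Int64")})

# Record where the NAs are *before* the column is altered; after the
# fill/cast below, isna() would no longer report them.
orig_missing = df["col"].isna()

# Stand-in for the unsupported-to-supported dtype conversion in
# _cast_to_stata_types: fill NA and cast to a plain numpy integer dtype.
df["col"] = df["col"].fillna(0).astype(np.int32)

# Now write the sentinel into the originally-missing positions.
SENTINEL = 2147483621  # hypothetical stand-in, not necessarily Stata's value
df.loc[orig_missing, "col"] = SENTINEL
print(df)
```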
36 changes: 28 additions & 8 deletions pandas/tests/frame/indexing/test_setitem.py
@@ -384,7 +384,7 @@ def test_setitem_frame_length_0_str_key(self, indexer):
expected["A"] = expected["A"].astype("object")
tm.assert_frame_equal(df, expected)

def test_setitem_frame_duplicate_columns(self, using_array_manager):
def test_setitem_frame_duplicate_columns(self, using_array_manager, request):
# GH#15695
cols = ["A", "B", "C"] * 2
df = DataFrame(index=range(3), columns=cols)
@@ -407,6 +407,11 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager):
expected["C"] = expected["C"].astype("int64")
# TODO(ArrayManager) .loc still overwrites
expected["B"] = expected["B"].astype("int64")

mark = pytest.mark.xfail(
reason="Both 'A' columns get set with 3 instead of 0 and 3"
)
request.node.add_marker(mark)
else:
# set these with unique columns to be extra-unambiguous
expected[2] = expected[2].astype(np.int64)
@@ -995,22 +1000,37 @@ def test_setitem_always_copy(self, float_frame):
float_frame["E"][5:10] = np.nan
assert notna(s[5:10]).all()

def test_setitem_clear_caches(self):
# see GH#304
@pytest.mark.parametrize("consolidate", [True, False])
def test_setitem_partial_column_inplace(self, consolidate, using_array_manager):
# This setting should be in-place, regardless of whether frame is
# single-block or multi-block
# GH#304 this used to be incorrectly not-inplace, in which case
# we needed to ensure _item_cache was cleared.

df = DataFrame(
{"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3]
)
df.insert(2, "z", np.nan)
if not using_array_manager:
if consolidate:
df._consolidate_inplace()
assert len(df._mgr.blocks) == 1
else:
assert len(df._mgr.blocks) == 2

# cache it
foo = df["z"]
df.loc[df.index[2:], "z"] = 42
zvals = df["z"]._values

expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
df.loc[2:, "z"] = 42

assert df["z"] is not foo
expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
tm.assert_series_equal(df["z"], expected)

# check setting occurred in-place
tm.assert_numpy_array_equal(zvals, expected.values)
assert np.shares_memory(zvals, df["z"]._values)
if not consolidate:
assert df["z"]._values is zvals

def test_setitem_duplicate_columns_not_inplace(self):
# GH#39510
cols = ["A", "B"] * 2
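Aside, for readers unfamiliar with the idiom used in `test_setitem_frame_duplicate_columns` above: `request.node.add_marker` lets a test xfail only the parameter combination that is known to be broken, rather than the whole parametrized test. A minimal, hypothetical example:

```python
import pytest

@pytest.mark.parametrize("flag", [True, False])
def test_conditional_xfail(flag, request):
    if flag:
        # Mark only this parametrization as an expected failure.
        mark = pytest.mark.xfail(reason="known-bad combination in this sketch")
        request.node.add_marker(mark)
    assert not flag  # fails (reported as xfail) only when flag is True
```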
15 changes: 8 additions & 7 deletions pandas/tests/frame/indexing/test_xs.py
@@ -366,20 +366,21 @@ def test_xs_droplevel_false_view(self, using_array_manager):
assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)
# modifying original df also modifies result when having a single block
df.iloc[0, 0] = 2
if not using_array_manager:
expected = DataFrame({"a": [2]})
else:
# TODO(ArrayManager) iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
expected = DataFrame({"a": [2]})
tm.assert_frame_equal(result, expected)

# with mixed dataframe, modifying the parent doesn't modify result
# TODO the "split" path behaves differently here as with single block
df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
df.iloc[0, 0] = 2
expected = DataFrame({"a": [1]})
if using_array_manager:
# Here the behavior is consistent
expected = DataFrame({"a": [2]})
else:
# FIXME: iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)

def test_xs_list_indexer_droplevel_false(self):
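For illustration (mirroring the test above, with pandas 1.4's default BlockManager and no copy-on-write): `xs(..., drop_level=False)` on a single-block frame returns a view, so writes through the parent are visible in the result, while on a mixed-dtype frame the "split" setitem path leaves a previously-taken result untouched.

```python
import numpy as np
import pandas as pd

# Single block: the xs result shares memory with the parent.
df = pd.DataFrame({"a": [1]})
result = df.xs("a", axis=1, drop_level=False)
print(np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values))  # True
df.iloc[0, 0] = 2
print(result)      # shows 2: the parent write is visible through the view

# Mixed dtypes: the "split" path writes into a new array, so the earlier
# xs result keeps the old value (the FIXME noted in the test).
df2 = pd.DataFrame([[1, 2.5, "a"]], columns=pd.Index(["a", "b", "c"]))
result2 = df2.xs("a", axis=1, drop_level=False)
df2.iloc[0, 0] = 2
print(result2)     # still shows 1 on the BlockManager path
```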
12 changes: 11 additions & 1 deletion pandas/tests/frame/test_reductions.py
@@ -789,6 +789,10 @@ def test_std_timedelta64_skipna_false(self):
# GH#37392
tdi = pd.timedelta_range("1 Day", periods=10)
df = DataFrame({"A": tdi, "B": tdi})
# Copy is needed for the ArrayManager case, otherwise setting df.iloc
# below edits tdi, altering both df['A'] and df['B']
# FIXME: passing copy=True to constructor does not fix this
df = df.copy()
df.iloc[-2, -1] = pd.NaT

result = df.std(skipna=False)
@@ -1017,7 +1021,9 @@ def test_idxmax_mixed_dtype(self):
# don't cast to object, which would raise in nanops
dti = date_range("2016-01-01", periods=3)

df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti})
# Copying dti is needed for ArrayManager otherwise when we set
# df.loc[0, 3] = pd.NaT below it edits dti
df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})

result = df.idxmax()
expected = Series([1, 0, 2], index=[1, 2, 3])
@@ -1074,6 +1080,10 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
def test_idxmax_dt64_multicolumn_axis1(self):
dti = date_range("2016-01-01", periods=3)
df = DataFrame({3: dti, 4: dti[::-1]})
# FIXME: copy needed for ArrayManager, otherwise setting with iloc
# below also sets df.iloc[-1, 1]; passing copy=True to DataFrame
# does not solve this.
df = df.copy()
df.iloc[0, 0] = pd.NaT

df._consolidate_inplace()
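For illustration of the defensive copies added above: when two columns are built from the same index object, the ArrayManager may back both columns (and the index) with the same underlying array, so an `iloc` write to one column would leak into the other. The `df.copy()` decouples them before mutating. A minimal sketch (behaviour without the copy differs between managers):

```python
import pandas as pd

tdi = pd.timedelta_range("1 Day", periods=3)

df = pd.DataFrame({"A": tdi, "B": tdi})
df = df.copy()            # decouple the columns from tdi and from each other

df.iloc[-1, -1] = pd.NaT  # only column "B" should change
print(df["A"].iloc[-1])   # Timedelta('3 days'): column "A" is unaffected
print(tdi[-1])            # Timedelta('3 days'): tdi itself is unaffected
```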