BUG: fix concat of Sparse with non-sparse dtypes (#34338)

pandas-dev · May 29, 2020 · cc63484 · cc63484
1 parent 6e69ca4
commit cc63484
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 2 deletions.
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1063,7 +1063,9 @@ def astype(self, dtype=None, copy=True):
         """
         dtype = self.dtype.update_dtype(dtype)
         subtype = dtype._subtype_with_str
-        sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
+        # TODO copy=False is broken for astype_nansafe with int -> float, so cannot
+        # passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456
+        sp_values = astype_nansafe(self.sp_values, subtype, copy=True)
         if sp_values is self.sp_values and copy:
             sp_values = sp_values.copy()
 

diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -360,6 +360,13 @@ def _subtype_with_str(self):
         return self.subtype
 
     def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        # TODO for now only handle SparseDtypes and numpy dtypes => extend
+        # with other compatible extension dtypes
+        if any(
+            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
+            for x in dtypes
+        ):
+            return None
 
         fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
         fill_value = fill_values[0]
@@ -375,6 +382,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
                 stacklevel=6,
             )
 
-        # TODO also handle non-numpy other dtypes
         np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
         return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -1,6 +1,7 @@
 """
 Utility functions related to concat.
 """
+from typing import cast
 
 import numpy as np
 
@@ -21,6 +22,7 @@
 from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries
 
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.sparse import SparseArray
 from pandas.core.construction import array
 
 
@@ -81,6 +83,13 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
         except ValueError:
             return arr.astype(object, copy=False)
 
+    if is_sparse(arr) and not is_sparse(dtype):
+        # Problem case: SparseArray.astype(dtype) does not honor the requested
+        # dtype exactly but returns Sparse[dtype] instead, so densify first and
+        # then cast the dense array to the target dtype.
+        arr = cast(SparseArray, arr)
+        return arr.to_dense().astype(dtype, copy=False)
+
     if (
         isinstance(arr, np.ndarray)
         and arr.dtype.kind in ["m", "M"]

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -642,6 +642,8 @@ def _is_boolean(self) -> bool:
         return is_bool_dtype(self.categories)
 
     def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        from pandas.core.arrays.sparse import SparseDtype
+
         # check if we have all categorical dtype with identical categories
         if all(isinstance(x, CategoricalDtype) for x in dtypes):
             first = dtypes[0]
@@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         elif any(non_init_cats):
             return None
 
+        # categorical is aware of Sparse -> extract sparse subdtypes
+        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
         # extract the categories' dtype
         non_cat_dtypes = [
             x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes

diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays.sparse import SparseArray
 
@@ -29,3 +30,33 @@ def test_uses_first_kind(self, kind):
         expected = np.array([1, 2, 1, 2, 2], dtype="int64")
         tm.assert_numpy_array_equal(result.sp_values, expected)
         assert result.kind == kind
+
+
+@pytest.mark.parametrize(
+    "other, expected_dtype",
+    [
+        # compatible dtype -> preserve sparse
+        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
+        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
+        # incompatible dtype -> Sparse[common dtype]
+        (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
+        # incompatible dtype -> Sparse[object] dtype
+        (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
+        # categorical with compatible categories -> dtype of the categories
+        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
+        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
+        # categorical with incompatible categories -> object dtype
+        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
+    ],
+)
+def test_concat_with_non_sparse(other, expected_dtype):
+    # https://github.com/pandas-dev/pandas/issues/34336
+    s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))
+
+    result = pd.concat([s_sparse, other], ignore_index=True)
+    expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = pd.concat([other, s_sparse], ignore_index=True)
+    expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype)
+    tm.assert_series_equal(result, expected)