BUG: error in read_excel with some ods files pandas-dev#45598

dimitra-karadima · Feb 18, 2022 · ae5b1c9 · ae5b1c9
1 parent 1bd193e
commit ae5b1c9
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 2 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -359,6 +359,7 @@ I/O
 - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`)
 - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
 - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
+- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
 
 Period
 ^^^^^^

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -102,7 +102,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
         table: list[list[Scalar]] = []
 
         for sheet_row in sheet_rows:
-            sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
+            sheet_cells = [
+                x
+                for x in sheet_row.childNodes
+                if "qname" in dir(x) and x.qname in cell_names
+            ]
             empty_cells = 0
             table_row: list[Scalar] = []
 
@@ -231,5 +235,5 @@ def _get_cell_string_value(self, cell) -> str:
                     # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
                     value.append(self._get_cell_string_value(fragment))
             else:
-                value.append(str(fragment))
+                value.append(str(fragment).strip("\n"))
         return "".join(value)
diff --git a/pandas/tests/io/data/excel/test_newlines.ods b/pandas/tests/io/data/excel/test_newlines.ods
diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py
@@ -36,3 +36,16 @@ def test_read_writer_table():
     result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0)
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_read_newlines_between_xml_elements_table():
+    # Also test reading a table from an OpenDocument spreadsheet file
+    # (.ods) that contains newlines between xml elements.
+    expected = pd.DataFrame(
+        [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]],
+        columns=["Column 1", "Column 2", "Column 3"],
+    )
+
+    result = pd.read_excel("test_newlines.ods")
+
+    tm.assert_frame_equal(result, expected)