Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2308,6 +2308,13 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
dtype = "object"
elif types_mapper:
dtype = types_mapper(original_type)
elif _pandas_api.uses_string_dtype() and (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to move this check a bit more above, together with the if/elif block checking for types_mapper. Because then we get to if hasattr(dtype, '__from_arrow__'): and will avoid actually converting the pyarrow memory to a numpy object-dtype array of strings

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that was my first idea but I didn't think it through well before changing to what I have now. The thing is that I have hit this line https://github.com/AlenkaF/arrow/blob/6f1fda5ef1cfe7ee40ccd1ddefc3861c2718d920/python/pyarrow/array.pxi#L2319
and then the dtype I had set was reset back to None.

Will try putting the whole if/elif block further back, as suggested (if I do not break something else).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
elif _pandas_api.uses_string_dtype() and (
elif _pandas_api.uses_string_dtype() and not strings_to_categorical and (

like we do in the other place with this logic (which Raul quoted), only that you will have to get the option value out of the options object

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will do!

original_type.id == _Type_STRING or
original_type.id == _Type_LARGE_STRING or
original_type.id == _Type_STRING_VIEW
):
# for pandas 3.0+, use pandas' new default string dtype
dtype = _pandas_api.pd.StringDtype(na_value=np.nan)
else:
dtype = None

Expand Down
23 changes: 23 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4651,6 +4651,29 @@ def test_chunked_array_to_pandas_types_mapper():
assert result.dtype == np.dtype("int64")


@pytest.mark.parametrize(
    "string_type", [pa.string(), pa.large_string(), pa.string_view()]
)
@pytest.mark.parametrize("data", [[], [None]])
def test_array_to_pandas_string_dtype(string_type, data):
    # GH-49002
    if Version(pd.__version__) < Version("3.0.0"):
        pytest.skip("PyArrow backed string dtype missing")

    # On pandas 3.0+, Arrow string types should convert to pandas'
    # default string dtype by default.
    expected_dtype = pd.StringDtype(na_value=np.nan)

    arr = pa.array(data, type=string_type)
    chunked = pa.chunked_array([data], type=string_type)
    for arrow_obj in (arr, chunked):
        result = arrow_obj.to_pandas()
        assert result.dtype == expected_dtype

    # A user-supplied types_mapper takes precedence over the default:
    # mapping the type to None falls back to the object dtype.
    mapper = {string_type: None}.get
    result = chunked.to_pandas(types_mapper=mapper)
    assert result.dtype == np.dtype("object")


# ----------------------------------------------------------------------
# Legacy metadata compatibility tests

Expand Down
Loading