From 73c582cbcc0c17f05612ce88f94abcf42db623d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Dec 2025 08:41:36 -0800 Subject: [PATCH 1/2] API: to_datetime(ints, unit) give requested unit --- pandas/_libs/tslib.pyx | 27 ++++++++++--- pandas/core/tools/datetimes.py | 7 ++-- pandas/tests/io/json/test_pandas.py | 10 ++--- .../tests/resample/test_resampler_grouper.py | 2 +- pandas/tests/tools/test_to_datetime.py | 39 +++++++++++-------- 5 files changed, 54 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 2a53e604423d8..a97af3cbb6186 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -26,6 +26,7 @@ import numpy as np cnp.import_array() from pandas._libs.tslibs.dtypes cimport ( + abbrev_to_npy_unit, get_supported_reso, npy_unit_to_abbrev, ) @@ -312,7 +313,7 @@ cpdef array_to_datetime( _TSObject tsobj tzinfo tz, tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) - NPY_DATETIMEUNIT item_reso + NPY_DATETIMEUNIT item_reso, int_reso bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC DatetimeParseState state = DatetimeParseState(creso) str abbrev @@ -325,11 +326,11 @@ cpdef array_to_datetime( else: abbrev = npy_unit_to_abbrev(creso) - if unit_for_numerics is not None: - # either creso or unit_for_numerics should be passed, not both - assert creso == NPY_FR_ns - else: + if unit_for_numerics is None: unit_for_numerics = abbrev + int_reso = NPY_FR_ns + else: + int_reso = get_supported_reso(abbrev_to_npy_unit(unit_for_numerics)) result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() @@ -370,7 +371,20 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, creso) state.found_other = True - elif is_integer_object(val) or is_float_object(val): + elif is_integer_object(val): + if val == NPY_NAT: + iresult[i] = NPY_NAT + else: + item_reso = int_reso + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + + iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso) + + state.found_other = True + + elif is_float_object(val): # these must be ns unit by-definition if val != val or val == NPY_NAT: @@ -460,6 +474,7 @@ cpdef array_to_datetime( dayfirst=dayfirst, utc=utc, creso=state.creso, + unit_for_numerics=unit_for_numerics, ) elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # i.e. we never encountered anything non-NaT, default to "s". 
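
NOTE (illustration, not part of the patch): the tslib.pyx change above is the
core of the series: integer inputs now resolve to the nearest supported
resolution of the requested unit instead of always being coerced to
nanoseconds. A minimal sketch of the intended behavior, mirroring the test
updates later in this patch; exact reprs may differ:

    >>> import pandas as pd
    >>> pd.to_datetime([1, 2], unit="s").dtype   # requested unit is kept
    dtype('<M8[s]')
    >>> pd.to_datetime([1, 2], unit="D").dtype   # "D" is unsupported; rounds up to "s"
    dtype('<M8[s]')
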
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index c5c0aa4d61187..5078829308ce1 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -26,11 +26,11 @@
     Timedelta,
     Timestamp,
     astype_overflowsafe,
+    get_supported_dtype,
     is_supported_dtype,
     timezones as libtimezones,
 )
 from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
-from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
 from pandas._libs.tslibs.parsing import (
     DateParseError,
     guess_datetime_format,
@@ -503,8 +503,9 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
             # Note we can't do "f" here because that could induce unwanted
             # rounding GH#14156, GH#20445
             arr = arg.astype(f"datetime64[{unit}]", copy=False)
+            dtype = get_supported_dtype(arr.dtype)
             try:
-                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
+                arr = astype_overflowsafe(arr, dtype, copy=False)
             except OutOfBoundsDatetime:
                 if errors == "raise":
                     raise
@@ -534,7 +535,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
             utc=utc,
             errors=errors,
             unit_for_numerics=unit,
-            creso=cast(int, NpyDatetimeUnit.NPY_FR_ns.value),
+            # creso is deliberately not passed; unit_for_numerics determines it
         )
 
     result = DatetimeIndex(arr, name=name)
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 5a3ec254c96b0..92ff2357304d9 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -955,7 +955,7 @@ def test_date_format_frame_raises(self, datetime_frame):
         ],
     )
     def test_date_format_series(self, date, date_unit, datetime_series):
-        ts = Series(Timestamp(date).as_unit("ns"), index=datetime_series.index)
+        ts = Series(Timestamp(date), index=datetime_series.index)
         ts.iloc[1] = pd.NaT
         ts.iloc[5] = pd.NaT
         if date_unit:
@@ -1118,9 +1118,9 @@ def test_round_trip_exception(self, datapath):
     @pytest.mark.parametrize(
         "field,dtype",
         [
-            ["created_at", pd.DatetimeTZDtype(tz="UTC")],
-            ["closed_at", "datetime64[ns]"],
-            ["updated_at", pd.DatetimeTZDtype(tz="UTC")],
+            ["created_at", pd.DatetimeTZDtype(tz="UTC", unit="us")],
+            ["closed_at", "datetime64[us]"],
+            ["updated_at", pd.DatetimeTZDtype(tz="UTC", unit="us")],
         ],
     )
     def test_url(self, field, dtype, httpserver):
@@ -1756,7 +1756,7 @@ def test_read_timezone_information(self):
         result = read_json(
             StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index"
         )
-        exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]")
+        exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[us, UTC]")
         expected = Series([88], index=exp_dti)
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 1d319600e632d..862578decb782 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -502,7 +502,7 @@ def test_groupby_resample_empty_sum_string(
     result = gbrs.sum(min_count=min_count)
 
     index = pd.MultiIndex(
-        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
+        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns").as_unit("ns")]],
         codes=[[0, 1, 2], [0, 0, 0]],
         names=["A", None],
     )
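
NOTE (illustration, not part of the patch): the datetimes.py hunk above swaps
the hard-coded "M8[ns]" target for get_supported_dtype, which maps an
arbitrary numpy datetime unit to the nearest unit pandas supports (s, ms, us,
ns). A sketch, assuming this helper behaves as in pandas 2.x:

    >>> import numpy as np
    >>> from pandas._libs.tslibs import get_supported_dtype
    >>> get_supported_dtype(np.dtype("M8[D]"))   # coarser than "s" rounds up
    dtype('<M8[s]')
    >>> get_supported_dtype(np.dtype("M8[us]"))  # already supported, unchanged
    dtype('<M8[us]')
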
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 466ac5582dc65..c786c01b58292 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1782,7 +1782,8 @@ class TestToDatetimeUnit:
     def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
         # GH#50870 Note we have separate tests that pd.Timestamp gets these right
         ts = Timestamp(item, unit=unit)
-        expected = DatetimeIndex([ts], dtype="M8[ns]")
+        dtype = "M8[ns]" if isinstance(item, float) else "M8[s]"
+        expected = DatetimeIndex([ts], dtype=dtype)
 
         result = to_datetime([item], unit=unit, cache=cache)
         tm.assert_index_equal(result, expected)
@@ -1796,7 +1797,7 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
         # with a nan!
         result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
         assert result.isna()[1]
-        tm.assert_index_equal(result[:1], expected)
+        tm.assert_index_equal(result[:1], expected.astype("M8[ns]"))
 
     @pytest.mark.parametrize("unit", ["Y", "M"])
     def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
@@ -1820,12 +1821,12 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
         # In 3.0, the string "1.5" is parsed as it would be without unit,
         # which fails. With errors="coerce" this becomes NaT.
         res = to_datetime(["1.5"], unit=unit, errors="coerce")
-        expected = to_datetime([NaT]).as_unit("ns")
+        expected = to_datetime([NaT])
         tm.assert_index_equal(res, expected)
 
         # round floats are OK
         res = to_datetime([1.0], unit=unit)
-        expected = to_datetime([1], unit=unit)
+        expected = to_datetime([1], unit=unit).as_unit("ns")
         tm.assert_index_equal(res, expected)
 
     def test_unit(self, cache):
@@ -1853,7 +1854,7 @@ def test_unit_array_mixed_nans_large_int(self, cache):
         values = [1420043460000000000000000, iNaT, NaT, np.nan, "NaT"]
 
         result = to_datetime(values, errors="coerce", unit="s", cache=cache)
-        expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]")
+        expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[s]")
         tm.assert_index_equal(result, expected)
 
         msg = "cannot convert input 1420043460000000000000000 with the unit 's'"
@@ -1950,12 +1951,13 @@ def test_to_datetime_unit(self, dtype):
         epoch = 1370745748
         ser = Series([epoch + t for t in range(20)]).astype(dtype)
         result = to_datetime(ser, unit="s")
+        unit = "s" if dtype is int else "ns"
         expected = Series(
             [
                 Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t)
                 for t in range(20)
             ],
-            dtype="M8[ns]",
+            dtype=f"M8[{unit}]",
         )
         tm.assert_series_equal(result, expected)
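
NOTE (illustration, not part of the patch): the test updates above encode the
new rule that integer inputs keep the requested unit, while float inputs still
resolve to nanoseconds. Sketch, following test_to_datetime_unit:

    >>> import pandas as pd
    >>> pd.to_datetime([1370745748], unit="s").dtype    # int: requested unit
    dtype('<M8[s]')
    >>> pd.to_datetime([1370745748.0], unit="s").dtype  # float: ns, as before
    dtype('<M8[ns]')
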
+ unit = "ns" if null is np.nan else "s" expected = Series( [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + [NaT], - dtype="M8[ns]", + dtype=f"M8[{unit}]", ) tm.assert_series_equal(result, expected) @@ -1992,25 +1997,25 @@ def test_to_datetime_unit_na_values(self): result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D") expected = DatetimeIndex( [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3, - dtype="M8[ns]", + dtype="M8[s]", ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("bad_val", ["foo", 111111111]) + @pytest.mark.parametrize("bad_val", ["foo", 111111111111111]) def test_to_datetime_unit_invalid(self, bad_val): if bad_val == "foo": msg = f"Unknown datetime string format, unable to parse: {bad_val}" else: - msg = "cannot convert input 111111111 with the unit 'D'" + msg = "cannot convert input 111111111111111 with the unit 'D'" with pytest.raises(ValueError, match=msg): to_datetime([1, 2, bad_val], unit="D") - @pytest.mark.parametrize("bad_val", ["foo", 111111111]) + @pytest.mark.parametrize("bad_val", ["foo", 111111111111111]) def test_to_timestamp_unit_coerce(self, bad_val): # coerce we can process expected = DatetimeIndex( [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1, - dtype="M8[ns]", + dtype="M8[s]", ) result = to_datetime([1, 2, bad_val], unit="D", errors="coerce") tm.assert_index_equal(result, expected) @@ -3223,7 +3228,7 @@ def test_unix(self): result = Series(to_datetime([0, 1, 2], unit="D", origin="unix")) expected = Series( [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")], - dtype="M8[ns]", + dtype="M8[s]", ) tm.assert_series_equal(result, expected) @@ -3262,8 +3267,10 @@ def test_invalid_origin(self, unit): def test_epoch(self, units, epochs): epoch_1960 = Timestamp(1960, 1, 1) units_from_epochs = np.arange(5, dtype=np.int64) + exp_unit = "s" if units == "D" else units expected = Series( - [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] + [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs], + dtype=f"M8[{exp_unit}]", ) result = Series(to_datetime(units_from_epochs, unit=units, origin=epochs)) @@ -3358,7 +3365,7 @@ def test_arg_tz_ns_unit(self, offset, utc, exp): # GH 25546 arg = "2019-01-01T00:00:00.000" + offset result = to_datetime([arg], unit="ns", utc=utc) - expected = to_datetime([exp]).as_unit("ns") + expected = to_datetime([exp]).as_unit("us") tm.assert_index_equal(result, expected) @@ -3458,7 +3465,7 @@ def test_empty_string_datetime_coerce__unit(): # GH13044 # coerce empty string to pd.NaT result = to_datetime([1, ""], unit="s", errors="coerce") - expected = DatetimeIndex(["1970-01-01 00:00:01", "NaT"], dtype="datetime64[ns]") + expected = DatetimeIndex(["1970-01-01 00:00:01", "NaT"], dtype="datetime64[s]") tm.assert_index_equal(expected, result) # verify that no exception is raised even when errors='raise' is set From 0eda6d913b0efea9e1066188e419362558ce1d72 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 13 Dec 2025 10:34:14 -0800 Subject: [PATCH 2/2] fix json cases --- pandas/io/json/_json.py | 6 +++++- pandas/tests/io/json/test_pandas.py | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 193189eb624ec..b4409de70dde2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1312,7 +1312,11 @@ def _try_convert_to_date(self, data: Series) -> Series: date_units = (self.date_unit,) if self.date_unit else 
From 0eda6d913b0efea9e1066188e419362558ce1d72 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 13 Dec 2025 10:34:14 -0800
Subject: [PATCH 2/2] fix json cases

---
 pandas/io/json/_json.py             |  6 +++++-
 pandas/tests/io/json/test_pandas.py | 10 +++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 193189eb624ec..b4409de70dde2 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -1312,7 +1312,11 @@ def _try_convert_to_date(self, data: Series) -> Series:
         date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
         for date_unit in date_units:
             try:
-                return to_datetime(new_data, errors="raise", unit=date_unit)
+                # Without this as_unit cast, out-of-bounds values would not
+                # overflow and we would silently get much-too-large dates
+                return to_datetime(new_data, errors="raise", unit=date_unit).dt.as_unit(
+                    "ns"
+                )
             except (ValueError, OverflowError, TypeError):
                 continue
         return data
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 92ff2357304d9..ba24836e2672f 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -964,7 +964,7 @@ def test_date_format_series(self, date, date_unit, datetime_series):
 
         json = ts.to_json(date_format="iso")
         result = read_json(StringIO(json), typ="series")
-        expected = ts.copy()
+        expected = ts.copy().dt.as_unit("ns")
         tm.assert_series_equal(result, expected)
 
     def test_date_format_series_raises(self, datetime_series):
@@ -1118,9 +1118,9 @@ def test_round_trip_exception(self, datapath):
     @pytest.mark.parametrize(
         "field,dtype",
         [
-            ["created_at", pd.DatetimeTZDtype(tz="UTC", unit="us")],
-            ["closed_at", "datetime64[us]"],
-            ["updated_at", pd.DatetimeTZDtype(tz="UTC", unit="us")],
+            ["created_at", pd.DatetimeTZDtype(tz="UTC")],
+            ["closed_at", "datetime64[ns]"],
+            ["updated_at", pd.DatetimeTZDtype(tz="UTC")],
         ],
     )
     def test_url(self, field, dtype, httpserver):
@@ -1756,7 +1756,7 @@ def test_read_timezone_information(self):
         result = read_json(
             StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index"
        )
-        exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[us, UTC]")
+        exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]")
         expected = Series([88], index=exp_dti)
         tm.assert_series_equal(result, expected)
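
NOTE (illustration, not part of the patch): with the as_unit("ns") cast in
_json.py above, JSON date parsing keeps returning nanosecond-resolution data,
so read_json behavior is unchanged by PATCH 1 and the test expectations revert
to "ns". Sketch, following the updated test_read_timezone_information:

    >>> from io import StringIO
    >>> import pandas as pd
    >>> ser = pd.read_json(StringIO('{"2019-01-01T11:00:00.000Z":88}'),
    ...                    typ="series", orient="index")
    >>> ser.index.dtype
    datetime64[ns, UTC]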