Initial commit: 首次建仓,建立目录结构
This commit is contained in:
@ -0,0 +1,7 @@
|
||||
"""
|
||||
Test files dedicated to individual (stand-alone) DataFrame methods
|
||||
|
||||
Ideally these files/tests should correspond 1-to-1 with tests.series.methods
|
||||
|
||||
These may also present opportunities for sharing/de-duplicating test code.
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
|
||||
from pandas import Index
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_add_prefix_suffix(float_frame):
|
||||
with_prefix = float_frame.add_prefix("foo#")
|
||||
expected = Index([f"foo#{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_suffix = float_frame.add_suffix("#foo")
|
||||
expected = Index([f"{c}#foo" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_suffix.columns, expected)
|
||||
|
||||
with_pct_prefix = float_frame.add_prefix("%")
|
||||
expected = Index([f"%{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("%")
|
||||
expected = Index([f"{c}%" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_axis(float_frame):
|
||||
# GH 47819
|
||||
with_prefix = float_frame.add_prefix("foo#", axis=0)
|
||||
expected = Index([f"foo#{c}" for c in float_frame.index])
|
||||
tm.assert_index_equal(with_prefix.index, expected)
|
||||
|
||||
with_prefix = float_frame.add_prefix("foo#", axis=1)
|
||||
expected = Index([f"foo#{c}" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("#foo", axis=0)
|
||||
expected = Index([f"{c}#foo" for c in float_frame.index])
|
||||
tm.assert_index_equal(with_pct_suffix.index, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("#foo", axis=1)
|
||||
expected = Index([f"{c}#foo" for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_invalid_axis(float_frame):
|
||||
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
|
||||
float_frame.add_prefix("foo#", axis=2)
|
||||
|
||||
with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"):
|
||||
float_frame.add_suffix("foo#", axis=2)
|
||||
@ -0,0 +1,338 @@
|
||||
from datetime import timezone
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameAlign:
|
||||
def test_frame_align_aware(self):
|
||||
idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern")
|
||||
idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern")
|
||||
df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1)
|
||||
df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2)
|
||||
new1, new2 = df1.align(df2)
|
||||
assert df1.index.tz == new1.index.tz
|
||||
assert df2.index.tz == new2.index.tz
|
||||
|
||||
# different timezones convert to UTC
|
||||
|
||||
# frame with frame
|
||||
df1_central = df1.tz_convert("US/Central")
|
||||
new1, new2 = df1.align(df1_central)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
# frame with Series
|
||||
new1, new2 = df1.align(df1_central[0], axis=0)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
df1[0].align(df1_central, axis=0)
|
||||
assert new1.index.tz is timezone.utc
|
||||
assert new2.index.tz is timezone.utc
|
||||
|
||||
def test_align_float(self, float_frame):
|
||||
af, bf = float_frame.align(float_frame)
|
||||
assert af._mgr is not float_frame._mgr
|
||||
|
||||
af, bf = float_frame.align(float_frame)
|
||||
assert af._mgr is not float_frame._mgr
|
||||
|
||||
# axis = 0
|
||||
other = float_frame.iloc[:-5, :3]
|
||||
af, bf = float_frame.align(other, axis=0, fill_value=-1)
|
||||
|
||||
tm.assert_index_equal(bf.columns, other.columns)
|
||||
|
||||
# test fill value
|
||||
join_idx = float_frame.index.join(other.index)
|
||||
diff_a = float_frame.index.difference(join_idx)
|
||||
diff_a_vals = af.reindex(diff_a).values
|
||||
assert (diff_a_vals == -1).all()
|
||||
|
||||
af, bf = float_frame.align(other, join="right", axis=0)
|
||||
tm.assert_index_equal(bf.columns, other.columns)
|
||||
tm.assert_index_equal(bf.index, other.index)
|
||||
tm.assert_index_equal(af.index, other.index)
|
||||
|
||||
# axis = 1
|
||||
other = float_frame.iloc[:-5, :3].copy()
|
||||
af, bf = float_frame.align(other, axis=1)
|
||||
tm.assert_index_equal(bf.columns, float_frame.columns)
|
||||
tm.assert_index_equal(bf.index, other.index)
|
||||
|
||||
# test fill value
|
||||
join_idx = float_frame.index.join(other.index)
|
||||
diff_a = float_frame.index.difference(join_idx)
|
||||
diff_a_vals = af.reindex(diff_a).values
|
||||
|
||||
assert (diff_a_vals == -1).all()
|
||||
|
||||
af, bf = float_frame.align(other, join="inner", axis=1)
|
||||
tm.assert_index_equal(bf.columns, other.columns)
|
||||
|
||||
# Try to align DataFrame to Series along bad axis
|
||||
msg = "No axis named 2 for object type DataFrame"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame.align(af.iloc[0, :3], join="inner", axis=2)
|
||||
|
||||
def test_align_frame_with_series(self, float_frame):
|
||||
# align dataframe to series with broadcast or not
|
||||
idx = float_frame.index
|
||||
s = Series(range(len(idx)), index=idx)
|
||||
|
||||
left, right = float_frame.align(s, axis=0)
|
||||
tm.assert_index_equal(left.index, float_frame.index)
|
||||
tm.assert_index_equal(right.index, float_frame.index)
|
||||
assert isinstance(right, Series)
|
||||
|
||||
def test_align_series_condition(self):
|
||||
# see gh-9558
|
||||
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
result = df[df["a"] == 2]
|
||||
expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.where(df["a"] == 2, 0)
|
||||
expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_align_mixed_float(self, mixed_float_frame):
|
||||
# mixed floats/ints
|
||||
other = DataFrame(index=range(5), columns=["A", "B", "C"])
|
||||
af, bf = mixed_float_frame.align(
|
||||
other.iloc[:, 0], join="inner", axis=1, fill_value=0
|
||||
)
|
||||
tm.assert_index_equal(bf.index, Index([]))
|
||||
|
||||
def test_align_mixed_int(self, mixed_int_frame):
|
||||
other = DataFrame(index=range(5), columns=["A", "B", "C"])
|
||||
af, bf = mixed_int_frame.align(
|
||||
other.iloc[:, 0], join="inner", axis=1, fill_value=0
|
||||
)
|
||||
tm.assert_index_equal(bf.index, Index([]))
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"l_ordered,r_ordered,expected",
|
||||
[
|
||||
[True, True, pd.CategoricalIndex],
|
||||
[True, False, Index],
|
||||
[False, True, Index],
|
||||
[False, False, pd.CategoricalIndex],
|
||||
],
|
||||
)
|
||||
def test_align_categorical(self, l_ordered, r_ordered, expected):
|
||||
# GH-28397
|
||||
df_1 = DataFrame(
|
||||
{
|
||||
"A": np.arange(6, dtype="int64"),
|
||||
"B": Series(list("aabbca")).astype(
|
||||
pd.CategoricalDtype(list("cab"), ordered=l_ordered)
|
||||
),
|
||||
}
|
||||
).set_index("B")
|
||||
df_2 = DataFrame(
|
||||
{
|
||||
"A": np.arange(5, dtype="int64"),
|
||||
"B": Series(list("babca")).astype(
|
||||
pd.CategoricalDtype(list("cab"), ordered=r_ordered)
|
||||
),
|
||||
}
|
||||
).set_index("B")
|
||||
|
||||
aligned_1, aligned_2 = df_1.align(df_2)
|
||||
assert isinstance(aligned_1.index, expected)
|
||||
assert isinstance(aligned_2.index, expected)
|
||||
tm.assert_index_equal(aligned_1.index, aligned_2.index)
|
||||
|
||||
def test_align_multiindex(self):
|
||||
# GH#10665
|
||||
# same test cases as test_align_multiindex in test_series.py
|
||||
|
||||
midx = pd.MultiIndex.from_product(
|
||||
[range(2), range(3), range(2)], names=("a", "b", "c")
|
||||
)
|
||||
idx = Index(range(2), name="b")
|
||||
df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
|
||||
df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)
|
||||
|
||||
# these must be the same results (but flipped)
|
||||
res1l, res1r = df1.align(df2, join="left")
|
||||
res2l, res2r = df2.align(df1, join="right")
|
||||
|
||||
expl = df1
|
||||
tm.assert_frame_equal(expl, res1l)
|
||||
tm.assert_frame_equal(expl, res2r)
|
||||
expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
|
||||
tm.assert_frame_equal(expr, res1r)
|
||||
tm.assert_frame_equal(expr, res2l)
|
||||
|
||||
res1l, res1r = df1.align(df2, join="right")
|
||||
res2l, res2r = df2.align(df1, join="left")
|
||||
|
||||
exp_idx = pd.MultiIndex.from_product(
|
||||
[range(2), range(2), range(2)], names=("a", "b", "c")
|
||||
)
|
||||
expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
|
||||
tm.assert_frame_equal(expl, res1l)
|
||||
tm.assert_frame_equal(expl, res2r)
|
||||
expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
|
||||
tm.assert_frame_equal(expr, res1r)
|
||||
tm.assert_frame_equal(expr, res2l)
|
||||
|
||||
def test_align_series_combinations(self):
|
||||
df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
|
||||
s = Series([1, 2, 4], index=list("ABD"), name="x")
|
||||
|
||||
# frame + series
|
||||
res1, res2 = df.align(s, axis=0)
|
||||
exp1 = DataFrame(
|
||||
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
|
||||
index=list("ABCDE"),
|
||||
)
|
||||
exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
|
||||
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
tm.assert_series_equal(res2, exp2)
|
||||
|
||||
# series + frame
|
||||
res1, res2 = s.align(df)
|
||||
tm.assert_series_equal(res1, exp2)
|
||||
tm.assert_frame_equal(res2, exp1)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series([1, 2], index=bar_index, name="foo_series")
|
||||
df = DataFrame(
|
||||
{"col": np.arange(6)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series(
|
||||
[1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
|
||||
)
|
||||
df = DataFrame(
|
||||
{"col": np.arange(6)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2, 3, 4], name="bar")
|
||||
|
||||
series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
|
||||
df = DataFrame(
|
||||
{"col": np.arange(12)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series(
|
||||
[1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
|
||||
)
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 3, 4], name="bar")
|
||||
|
||||
series = Series(
|
||||
[1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
|
||||
)
|
||||
df = DataFrame(
|
||||
{"col": np.arange(9)},
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
|
||||
expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
|
||||
result_l, result_r = df.align(series, axis=0)
|
||||
|
||||
tm.assert_frame_equal(result_l, df)
|
||||
tm.assert_series_equal(result_r, expected_r)
|
||||
|
||||
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
|
||||
# GH-46001
|
||||
foo_index = Index([1, 2, 3], name="foo")
|
||||
bar_index = Index([1, 2], name="bar")
|
||||
|
||||
series = Series([1, 2], index=bar_index, name="foo_series")
|
||||
df = DataFrame(
|
||||
np.arange(18).reshape(6, 3),
|
||||
index=pd.MultiIndex.from_product([foo_index, bar_index]),
|
||||
)
|
||||
df.columns = ["cfoo", "cbar", "cfoo"]
|
||||
|
||||
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
|
||||
result_left, result_right = df.align(series, axis=0)
|
||||
|
||||
tm.assert_series_equal(result_right, expected)
|
||||
tm.assert_index_equal(result_left.columns, df.columns)
|
||||
|
||||
def test_missing_axis_specification_exception(self):
|
||||
df = DataFrame(np.arange(50).reshape((10, 5)))
|
||||
series = Series(np.arange(5))
|
||||
|
||||
with pytest.raises(ValueError, match=r"axis=0 or 1"):
|
||||
df.align(series)
|
||||
|
||||
def test_align_series_check_copy(self):
|
||||
# GH#
|
||||
df = DataFrame({0: [1, 2]})
|
||||
ser = Series([1], name=0)
|
||||
expected = ser.copy()
|
||||
result, other = df.align(ser, axis=1)
|
||||
ser.iloc[0] = 100
|
||||
tm.assert_series_equal(other, expected)
|
||||
|
||||
def test_align_identical_different_object(self):
|
||||
# GH#51032
|
||||
df = DataFrame({"a": [1, 2]})
|
||||
ser = Series([3, 4])
|
||||
result, result2 = df.align(ser, axis=0)
|
||||
tm.assert_frame_equal(result, df)
|
||||
tm.assert_series_equal(result2, ser)
|
||||
assert df is not result
|
||||
assert ser is not result2
|
||||
|
||||
def test_align_identical_different_object_columns(self):
|
||||
# GH#51032
|
||||
df = DataFrame({"a": [1, 2]})
|
||||
ser = Series([1], index=["a"])
|
||||
result, result2 = df.align(ser, axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
tm.assert_series_equal(result2, ser)
|
||||
assert df is not result
|
||||
assert ser is not result2
|
||||
@ -0,0 +1,296 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs.offsets import MonthEnd
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.tseries import offsets
|
||||
|
||||
|
||||
class TestAsFreq:
|
||||
def test_asfreq2(self, frame_or_series):
|
||||
ts = frame_or_series(
|
||||
[0.0, 1.0, 2.0],
|
||||
index=DatetimeIndex(
|
||||
[
|
||||
datetime(2009, 10, 30),
|
||||
datetime(2009, 11, 30),
|
||||
datetime(2009, 12, 31),
|
||||
],
|
||||
dtype="M8[ns]",
|
||||
freq="BME",
|
||||
),
|
||||
)
|
||||
|
||||
daily_ts = ts.asfreq("B")
|
||||
monthly_ts = daily_ts.asfreq("BME")
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
daily_ts = ts.asfreq("B", method="pad")
|
||||
monthly_ts = daily_ts.asfreq("BME")
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
daily_ts = ts.asfreq(offsets.BDay())
|
||||
monthly_ts = daily_ts.asfreq(offsets.BMonthEnd())
|
||||
tm.assert_equal(monthly_ts, ts)
|
||||
|
||||
result = ts[:0].asfreq("ME")
|
||||
assert len(result) == 0
|
||||
assert result is not ts
|
||||
|
||||
if frame_or_series is Series:
|
||||
daily_ts = ts.asfreq("D", fill_value=-1)
|
||||
result = daily_ts.value_counts().sort_index()
|
||||
expected = Series(
|
||||
[60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count"
|
||||
).sort_index()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_asfreq_datetimeindex_empty(self, frame_or_series):
|
||||
# GH#14320
|
||||
index = DatetimeIndex(["2016-09-29 11:00"])
|
||||
expected = frame_or_series(index=index, dtype=object).asfreq("h")
|
||||
result = frame_or_series([3], index=index.copy()).asfreq("h")
|
||||
tm.assert_index_equal(expected.index, result.index)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_tz_aware_asfreq_smoke(self, tz, frame_or_series):
|
||||
dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz)
|
||||
|
||||
obj = frame_or_series(
|
||||
np.random.default_rng(2).standard_normal(len(dr)), index=dr
|
||||
)
|
||||
|
||||
# it works!
|
||||
obj.asfreq("min")
|
||||
|
||||
def test_asfreq_normalize(self, frame_or_series):
|
||||
rng = date_range("1/1/2000 09:30", periods=20)
|
||||
norm = date_range("1/1/2000", periods=20)
|
||||
|
||||
vals = np.random.default_rng(2).standard_normal((20, 3))
|
||||
|
||||
obj = DataFrame(vals, index=rng)
|
||||
expected = DataFrame(vals, index=norm)
|
||||
if frame_or_series is Series:
|
||||
obj = obj[0]
|
||||
expected = expected[0]
|
||||
|
||||
result = obj.asfreq("D", normalize=True)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_keep_index_name(self, frame_or_series):
|
||||
# GH#9854
|
||||
index_name = "bar"
|
||||
index = date_range("20130101", periods=20, name=index_name)
|
||||
obj = DataFrame(list(range(20)), columns=["foo"], index=index)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
assert index_name == obj.index.name
|
||||
assert index_name == obj.asfreq("10D").index.name
|
||||
|
||||
def test_asfreq_ts(self, frame_or_series):
|
||||
index = period_range(freq="Y", start="1/1/2001", end="12/31/2010")
|
||||
obj = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 3)), index=index
|
||||
)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
result = obj.asfreq("D", how="end")
|
||||
exp_index = index.asfreq("D", how="end")
|
||||
assert len(result) == len(obj)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = obj.asfreq("D", how="start")
|
||||
exp_index = index.asfreq("D", how="start")
|
||||
assert len(result) == len(obj)
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
def test_asfreq_resample_set_correct_freq(self, frame_or_series):
|
||||
# GH#5613
|
||||
# we test if .asfreq() and .resample() set the correct value for .freq
|
||||
dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"])
|
||||
obj = DataFrame({"col": [1, 2, 3]}, index=dti)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
# testing the settings before calling .asfreq() and .resample()
|
||||
assert obj.index.freq is None
|
||||
assert obj.index.inferred_freq == "D"
|
||||
|
||||
# does .asfreq() set .freq correctly?
|
||||
assert obj.asfreq("D").index.freq == "D"
|
||||
|
||||
# does .resample() set .freq correctly?
|
||||
assert obj.resample("D").asfreq().index.freq == "D"
|
||||
|
||||
def test_asfreq_empty(self, datetime_frame):
|
||||
# test does not blow up on length-0 DataFrame
|
||||
zero_length = datetime_frame.reindex([])
|
||||
result = zero_length.asfreq("BME")
|
||||
assert result is not zero_length
|
||||
|
||||
def test_asfreq(self, datetime_frame):
|
||||
offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd())
|
||||
rule_monthly = datetime_frame.asfreq("BME")
|
||||
|
||||
tm.assert_frame_equal(offset_monthly, rule_monthly)
|
||||
|
||||
rule_monthly.asfreq("B", method="pad")
|
||||
# TODO: actually check that this worked.
|
||||
|
||||
# don't forget!
|
||||
rule_monthly.asfreq("B", method="pad")
|
||||
|
||||
def test_asfreq_datetimeindex(self):
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 3]},
|
||||
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
|
||||
)
|
||||
df = df.asfreq("B")
|
||||
assert isinstance(df.index, DatetimeIndex)
|
||||
|
||||
ts = df["A"].asfreq("B")
|
||||
assert isinstance(ts.index, DatetimeIndex)
|
||||
|
||||
def test_asfreq_fillvalue(self):
|
||||
# test for fill value during upsampling, related to issue 3715
|
||||
|
||||
# setup
|
||||
rng = date_range("1/1/2016", periods=10, freq="2s")
|
||||
# Explicit cast to 'float' to avoid implicit cast when setting None
|
||||
ts = Series(np.arange(len(rng)), index=rng, dtype="float")
|
||||
df = DataFrame({"one": ts})
|
||||
|
||||
# insert pre-existing missing value
|
||||
df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
|
||||
actual_df = df.asfreq(freq="1s", fill_value=9.0)
|
||||
expected_df = df.asfreq(freq="1s").fillna(9.0)
|
||||
expected_df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
tm.assert_frame_equal(expected_df, actual_df)
|
||||
|
||||
expected_series = ts.asfreq(freq="1s").fillna(9.0)
|
||||
actual_series = ts.asfreq(freq="1s", fill_value=9.0)
|
||||
tm.assert_series_equal(expected_series, actual_series)
|
||||
|
||||
def test_asfreq_with_date_object_index(self, frame_or_series):
|
||||
rng = date_range("1/1/2000", periods=20, unit="ns")
|
||||
ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng)
|
||||
|
||||
ts2 = ts.copy()
|
||||
ts2.index = [x.date() for x in ts2.index]
|
||||
|
||||
result = ts2.asfreq("4h", method="ffill")
|
||||
expected = ts.asfreq("4h", method="ffill")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_with_unsorted_index(self, frame_or_series):
|
||||
# GH#39805
|
||||
# Test that rows are not dropped when the datetime index is out of order
|
||||
index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"])
|
||||
result = frame_or_series(range(4), index=index)
|
||||
|
||||
expected = result.reindex(sorted(index))
|
||||
expected.index = expected.index._with_freq("infer")
|
||||
|
||||
result = result.asfreq("D")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_asfreq_after_normalize(self, unit):
|
||||
# https://github.com/pandas-dev/pandas/issues/50727
|
||||
result = DatetimeIndex(
|
||||
date_range("2000", periods=2).as_unit(unit).normalize(), freq="D"
|
||||
)
|
||||
expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit)
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"freq, freq_half",
|
||||
[
|
||||
("2ME", "ME"),
|
||||
(MonthEnd(2), MonthEnd(1)),
|
||||
],
|
||||
)
|
||||
def test_asfreq_2ME(self, freq, freq_half):
|
||||
index = date_range("1/1/2000", periods=6, freq=freq_half)
|
||||
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)})
|
||||
expected = df.asfreq(freq=freq)
|
||||
|
||||
index = date_range("1/1/2000", periods=3, freq=freq)
|
||||
result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"freq, freq_depr",
|
||||
[
|
||||
("2ME", "2M"),
|
||||
("2ME", "2m"),
|
||||
("2QE", "2Q"),
|
||||
("2QE-SEP", "2Q-SEP"),
|
||||
("1BQE", "1BQ"),
|
||||
("2BQE-SEP", "2BQ-SEP"),
|
||||
("2BQE-SEP", "2bq-sep"),
|
||||
("1YE", "1y"),
|
||||
("2YE-MAR", "2Y-MAR"),
|
||||
],
|
||||
)
|
||||
def test_asfreq_frequency_M_Q_Y_raises(self, freq, freq_depr):
|
||||
msg = f"Invalid frequency: {freq_depr}"
|
||||
|
||||
index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}")
|
||||
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.asfreq(freq=freq_depr)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"freq, error_msg",
|
||||
[
|
||||
(
|
||||
"2MS",
|
||||
"Invalid frequency: 2MS",
|
||||
),
|
||||
(
|
||||
offsets.MonthBegin(),
|
||||
r"\<MonthBegin\> is not supported as period frequency",
|
||||
),
|
||||
(
|
||||
offsets.DateOffset(months=2),
|
||||
r"\<DateOffset: months=2\> is not supported as period frequency",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_asfreq_unsupported_freq(self, freq, error_msg):
|
||||
# https://github.com/pandas-dev/pandas/issues/56718
|
||||
index = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M")
|
||||
df = DataFrame({"a": Series([0, 1], index=index)})
|
||||
|
||||
with pytest.raises(ValueError, match=error_msg):
|
||||
df.asfreq(freq=freq)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"freq, freq_depr",
|
||||
[
|
||||
("2YE", "2A"),
|
||||
("2BYE-MAR", "2BA-MAR"),
|
||||
],
|
||||
)
|
||||
def test_asfreq_frequency_A_BA_raises(self, freq, freq_depr):
|
||||
msg = f"Invalid frequency: {freq_depr}"
|
||||
|
||||
index = date_range("1/1/2000", periods=4, freq=freq)
|
||||
df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.asfreq(freq=freq_depr)
|
||||
@ -0,0 +1,185 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import IncompatibleFrequency
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Period,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def date_range_frame():
    """
    Fixture for DataFrame of ints with date_range index

    Columns are ['A', 'B'].
    """
    n_rows = 50
    # 53-second spacing starting at 1990-01-01.
    index = date_range("1/1/1990", periods=n_rows, freq="53s")
    data = {"A": np.arange(n_rows), "B": np.arange(n_rows)}
    return DataFrame(data, index=index)
|
||||
|
||||
|
||||
class TestFrameAsof:
|
||||
def test_basic(self, date_range_frame):
|
||||
# Explicitly cast to float to avoid implicit cast when setting np.nan
|
||||
df = date_range_frame.astype({"A": "float"})
|
||||
N = 50
|
||||
df.loc[df.index[15:30], "A"] = np.nan
|
||||
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
|
||||
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(axis=1).all()
|
||||
lb = df.index[14]
|
||||
ub = df.index[30]
|
||||
|
||||
dates = list(dates)
|
||||
|
||||
result = df.asof(dates)
|
||||
assert result.notna().all(axis=1).all()
|
||||
|
||||
mask = (result.index >= lb) & (result.index < ub)
|
||||
rs = result[mask]
|
||||
assert (rs == 14).all(axis=1).all()
|
||||
|
||||
def test_subset(self, date_range_frame):
|
||||
N = 10
|
||||
# explicitly cast to float to avoid implicit upcast when setting to np.nan
|
||||
df = date_range_frame.iloc[:N].copy().astype({"A": "float"})
|
||||
df.loc[df.index[4:8], "A"] = np.nan
|
||||
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
|
||||
|
||||
# with a subset of A should be the same
|
||||
result = df.asof(dates, subset="A")
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# same with A/B
|
||||
result = df.asof(dates, subset=["A", "B"])
|
||||
expected = df.asof(dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# B gives df.asof
|
||||
result = df.asof(dates, subset="B")
|
||||
expected = df.resample("25s", closed="right").ffill().reindex(dates)
|
||||
expected.iloc[20:] = 9
|
||||
# no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent)
|
||||
expected["B"] = expected["B"].astype(df["B"].dtype)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_missing(self, date_range_frame):
|
||||
# GH 15118
|
||||
# no match found - `where` value before earliest date in index
|
||||
N = 10
|
||||
# Cast to 'float64' to avoid upcast when introducing nan in df.asof
|
||||
df = date_range_frame.iloc[:N].copy().astype("float64")
|
||||
|
||||
result = df.asof("1989-12-31")
|
||||
|
||||
expected = Series(
|
||||
index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.asof(to_datetime(["1989-12-31"]))
|
||||
expected = DataFrame(
|
||||
index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Check that we handle PeriodIndex correctly, dont end up with
|
||||
# period.ordinal for series name
|
||||
df = df.to_period("D")
|
||||
result = df.asof("1989-12-31")
|
||||
assert isinstance(result.name, Period)
|
||||
|
||||
def test_asof_all_nans(self, frame_or_series):
|
||||
# GH 15713
|
||||
# DataFrame/Series is all nans
|
||||
result = frame_or_series([np.nan]).asof([0])
|
||||
expected = frame_or_series([np.nan])
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_all_nans(self, date_range_frame):
|
||||
# GH 15713
|
||||
# DataFrame is all nans
|
||||
|
||||
# testing non-default indexes, multiple inputs
|
||||
N = 150
|
||||
rng = date_range_frame.index
|
||||
dates = date_range("1/1/1990", periods=N, freq="25s")
|
||||
result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=["A"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing multiple columns
|
||||
dates = date_range("1/1/1990", periods=N, freq="25s")
|
||||
result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates)
|
||||
expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# testing scalar input
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
|
||||
expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
|
||||
expected = Series(np.nan, index=["A", "B"], name=3)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"stamp,expected",
|
||||
[
|
||||
(
|
||||
Timestamp("2018-01-01 23:22:43.325+00:00"),
|
||||
Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
|
||||
),
|
||||
(
|
||||
Timestamp("2018-01-01 22:33:20.682+01:00"),
|
||||
Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_time_zone_aware_index(self, stamp, expected):
|
||||
# GH21194
|
||||
# Testing awareness of DataFrame index considering different
|
||||
# UTC and timezone
|
||||
df = DataFrame(
|
||||
data=[1, 2],
|
||||
index=[
|
||||
Timestamp("2018-01-01 21:00:05.001+00:00"),
|
||||
Timestamp("2018-01-01 22:35:10.550+00:00"),
|
||||
],
|
||||
)
|
||||
|
||||
result = df.asof(stamp)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_asof_periodindex_mismatched_freq(self):
|
||||
N = 50
|
||||
rng = period_range("1/1/1990", periods=N, freq="h")
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng)
|
||||
|
||||
# Mismatched freq
|
||||
msg = "Input has different freq"
|
||||
with pytest.raises(IncompatibleFrequency, match=msg):
|
||||
df.asof(rng.asfreq("D"))
|
||||
|
||||
def test_asof_preserves_bool_dtype(self):
|
||||
# GH#16063 was casting bools to floats
|
||||
dti = date_range("2017-01-01", freq="MS", periods=4)
|
||||
ser = Series([True, False, True], index=dti[:-1])
|
||||
|
||||
ts = dti[-1]
|
||||
res = ser.asof([ts])
|
||||
|
||||
expected = Series([True], index=[ts])
|
||||
tm.assert_series_equal(res, expected)
|
||||
@ -0,0 +1,84 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAssign:
    """Tests for ``DataFrame.assign``."""

    def test_assign(self):
        """New and overwritten columns from Series, callables and array-likes."""
        frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        snapshot = frame.copy()

        with_ratio = frame.copy()
        with_ratio["C"] = [4, 2.5, 2]

        # the same new column given as a Series, then with lambda syntax
        tm.assert_frame_equal(frame.assign(C=frame.B / frame.A), with_ratio)
        tm.assert_frame_equal(frame.assign(C=lambda x: x.B / x.A), with_ratio)

        # original is unmodified
        tm.assert_frame_equal(frame, snapshot)

        # Non-Series array-like
        tm.assert_frame_equal(frame.assign(C=[4, 2.5, 2]), with_ratio)
        # original is unmodified
        tm.assert_frame_equal(frame, snapshot)

        # assigning to an existing label replaces that column
        ratio_as_b = with_ratio.drop("B", axis=1).rename(columns={"C": "B"})
        tm.assert_frame_equal(frame.assign(B=frame.B / frame.A), ratio_as_b)

        # overwrite, via a Series and via a lambda
        summed = frame.copy()
        summed["A"] = [5, 7, 9]
        tm.assert_frame_equal(frame.assign(A=frame.A + frame.B), summed)
        tm.assert_frame_equal(frame.assign(A=lambda x: x.A + x.B), summed)

    def test_assign_multiple(self):
        """Several keywords of different kinds can be assigned in one call."""
        frame = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])

        assigned = frame.assign(C=[7, 8, 9], D=frame.A, E=lambda x: x.B)

        rows = [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]]
        tm.assert_frame_equal(assigned, DataFrame(rows, columns=list("ABCDE")))

    def test_assign_order(self):
        """New columns appear in keyword order."""
        # GH 9818
        frame = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        total = frame.A + frame.B
        diff = frame.A - frame.B

        d_then_c = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
        tm.assert_frame_equal(frame.assign(D=total, C=diff), d_then_c)

        c_then_d = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
        tm.assert_frame_equal(frame.assign(C=diff, D=total), c_then_d)

    def test_assign_bad(self):
        """Positional arguments and references to not-yet-created columns fail."""
        frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # non-keyword argument
        positional_msg = r"assign\(\) takes 1 positional argument but 2 were given"
        with pytest.raises(TypeError, match=positional_msg):
            frame.assign(lambda x: x.A)

        # ``frame.C`` is evaluated eagerly, before ``assign`` has created "C"
        missing_msg = "'DataFrame' object has no attribute 'C'"
        with pytest.raises(AttributeError, match=missing_msg):
            frame.assign(C=frame.A, D=frame.A + frame.C)

    def test_assign_dependent(self):
        """A callable may read a column created earlier in the same call."""
        frame = DataFrame({"A": [1, 2], "B": [3, 4]})
        wanted = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))

        mixed = frame.assign(C=frame.A, D=lambda x: x["A"] + x["C"])
        tm.assert_frame_equal(mixed, wanted)

        wanted = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
        all_callables = frame.assign(
            C=lambda df: df.A, D=lambda df: df["A"] + df["C"]
        )
        tm.assert_frame_equal(all_callables, wanted)
|
||||
@ -0,0 +1,920 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
DatetimeTZDtype,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalDtype,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
concat,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def _check_cast(df, v):
|
||||
"""
|
||||
Check if all dtypes of df are equal to v
|
||||
"""
|
||||
assert all(s.dtype.name == v for _, s in df.items())
|
||||
|
||||
|
||||
class TestAstype:
|
||||
def test_astype_float(self, float_frame):
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
casted = float_frame.astype(np.int32)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(np.int32),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
float_frame["foo"] = "5"
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_mixed_float(self, mixed_float_frame):
    """Columns "A" and "B" of the mixed-float frame cast to one float width."""
    # mixed casting
    for target in ("float32", "float16"):
        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype(target)
        _check_cast(casted, target)
|
||||
|
||||
def test_astype_mixed_type(self):
    """Columns of differing numeric width all cast to each requested dtype."""
    # mixed casting
    frame = DataFrame(
        {
            "a": 1.0,
            "b": 2,
            "c": "foo",
            "float32": np.array([1.0] * 10, dtype="float32"),
            "int32": np.array([1] * 10, dtype="int32"),
        },
        index=np.arange(10),
    )
    numeric = frame._get_numeric_data().copy()
    numeric["little_float"] = np.array(12345.0, dtype="float16")
    numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

    for target in ("float64", "int64"):
        _check_cast(numeric.astype(target), target)

    # float16 is only checked on the column that already holds a float16 value
    little_only = numeric.reindex(columns=["little_float"])
    _check_cast(little_only.astype("float16"), "float16")

    for target in ("float32", "int32"):
        _check_cast(numeric.astype(target), target)

    # to object
    _check_cast(numeric.astype("O"), "object")
|
||||
|
||||
def test_astype_with_exclude_string(self, float_frame):
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(int)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(int, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(np.int32)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(np.int32, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_with_view_float(self, float_frame):
|
||||
# this is the only real reason to do it this way
|
||||
tf = np.round(float_frame).astype(np.int32)
|
||||
tf.astype(np.float32)
|
||||
|
||||
# TODO(wesm): verification?
|
||||
tf = float_frame.astype(np.float64)
|
||||
tf.astype(np.int64)
|
||||
|
||||
def test_astype_with_view_mixed_float(self, mixed_float_frame):
|
||||
tf = mixed_float_frame.reindex(columns=["A", "B", "C"])
|
||||
|
||||
tf.astype(np.int64)
|
||||
tf.astype(np.float32)
|
||||
|
||||
@pytest.mark.parametrize("val", [np.nan, np.inf])
|
||||
def test_astype_cast_nan_inf_int(self, val, any_int_numpy_dtype):
|
||||
# see GH#14265
|
||||
#
|
||||
# Check NaN and inf --> raise error when converting to int.
|
||||
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
|
||||
df = DataFrame([val])
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(any_int_numpy_dtype)
|
||||
|
||||
def test_astype_str(self):
|
||||
# see GH#9757
|
||||
a = Series(date_range("2010-01-04", periods=5))
|
||||
b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
|
||||
c = Series([Timedelta(x, unit="D") for x in range(5)])
|
||||
d = Series(range(5))
|
||||
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
|
||||
df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
|
||||
|
||||
# Datetime-like
|
||||
result = df.astype(str)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": list(map(str, (Timestamp(x)._date_repr for x in a._values))),
|
||||
"b": list(map(str, map(Timestamp, b._values))),
|
||||
"c": [Timedelta(x)._repr_base() for x in c._values],
|
||||
"d": list(map(str, d._values)),
|
||||
"e": list(map(str, e._values)),
|
||||
},
|
||||
dtype="str",
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_str_float(self, using_infer_string):
|
||||
# see GH#11302
|
||||
result = DataFrame([np.nan]).astype(str)
|
||||
expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = DataFrame([1.12345678901234567890]).astype(str)
|
||||
|
||||
val = "1.1234567890123457"
|
||||
expected = DataFrame([val], dtype="str")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_class", [dict, Series])
|
||||
def test_astype_dict_like(self, dtype_class):
|
||||
# GH7271 & GH16717
|
||||
a = Series(date_range("2010-01-04", periods=5))
|
||||
b = Series(range(5))
|
||||
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
d = Series(["1.0", "2", "3.14", "4", "5.4"])
|
||||
df = DataFrame({"a": a, "b": b, "c": c, "d": d})
|
||||
original = df.copy(deep=True)
|
||||
|
||||
# change type of a subset of columns
|
||||
dt1 = dtype_class({"b": "str", "d": "float32"})
|
||||
result = df.astype(dt1)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": a,
|
||||
"b": Series(["0", "1", "2", "3", "4"], dtype="str"),
|
||||
"c": c,
|
||||
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
|
||||
result = df.astype(dt2)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": a,
|
||||
"b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
|
||||
"c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
|
||||
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# change all columns
|
||||
dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
|
||||
tm.assert_frame_equal(df.astype(dt3), df.astype(str))
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# error should be raised when using something other than column labels
|
||||
# in the keys of the dtype dict
|
||||
dt4 = dtype_class({"b": str, 2: str})
|
||||
dt5 = dtype_class({"e": str})
|
||||
msg_frame = (
|
||||
"Only a column name can be used for the key in a dtype mappings argument. "
|
||||
"'{}' not found in columns."
|
||||
)
|
||||
with pytest.raises(KeyError, match=msg_frame.format(2)):
|
||||
df.astype(dt4)
|
||||
with pytest.raises(KeyError, match=msg_frame.format("e")):
|
||||
df.astype(dt5)
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# if the dtypes provided are the same as the original dtypes, the
|
||||
# resulting DataFrame should be the same as the original DataFrame
|
||||
dt6 = dtype_class({col: df[col].dtype for col in df.columns})
|
||||
equiv = df.astype(dt6)
|
||||
tm.assert_frame_equal(df, equiv)
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# GH#16717
|
||||
# if dtypes provided is empty, the resulting DataFrame
|
||||
# should be the same as the original DataFrame
|
||||
dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object)
|
||||
equiv = df.astype(dt7)
|
||||
tm.assert_frame_equal(df, equiv)
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
def test_astype_duplicate_col(self):
|
||||
a1 = Series([1, 2, 3, 4, 5], name="a")
|
||||
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
|
||||
a2 = Series([0, 1, 2, 3, 4], name="a")
|
||||
df = concat([a1, b, a2], axis=1)
|
||||
|
||||
result = df.astype("str")
|
||||
a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
|
||||
b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b")
|
||||
a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
|
||||
expected = concat([a1_str, b_str, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.astype({"a": "str"})
|
||||
expected = concat([a1_str, b, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_duplicate_col_series_arg(self):
|
||||
# GH#44417
|
||||
vals = np.random.default_rng(2).standard_normal((3, 4))
|
||||
df = DataFrame(vals, columns=["A", "B", "C", "A"])
|
||||
dtypes = df.dtypes
|
||||
dtypes.iloc[0] = str
|
||||
dtypes.iloc[2] = "Float64"
|
||||
|
||||
result = df.astype(dtypes)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series(vals[:, 0].astype(str), dtype="str"),
|
||||
1: vals[:, 1],
|
||||
2: pd.array(vals[:, 2], dtype="Float64"),
|
||||
3: vals[:, 3],
|
||||
}
|
||||
)
|
||||
expected.columns = df.columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"category",
|
||||
CategoricalDtype(),
|
||||
CategoricalDtype(ordered=True),
|
||||
CategoricalDtype(ordered=False),
|
||||
CategoricalDtype(categories=list("abcdef")),
|
||||
CategoricalDtype(categories=list("edba"), ordered=False),
|
||||
CategoricalDtype(categories=list("edcb"), ordered=True),
|
||||
],
|
||||
ids=repr,
|
||||
)
|
||||
@pytest.mark.filterwarnings(
|
||||
"ignore:Constructing a Categorical with a dtype and values"
|
||||
)
|
||||
def test_astype_categorical(self, dtype):
|
||||
# GH#18099
|
||||
d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
|
||||
df = DataFrame(d)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame({k: Categorical(v, dtype=dtype) for k, v in d.items()})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
|
||||
def test_astype_categoricaldtype_class_raises(self, cls):
|
||||
df = DataFrame({"A": ["a", "a", "b", "c"]})
|
||||
xpr = f"Expected an instance of {cls.__name__}"
|
||||
with pytest.raises(TypeError, match=xpr):
|
||||
df.astype({"A": cls})
|
||||
|
||||
with pytest.raises(TypeError, match=xpr):
|
||||
df["A"].astype(cls)
|
||||
|
||||
def test_astype_extension_dtypes(self, any_int_ea_dtype):
|
||||
# GH#22578
|
||||
dtype = any_int_ea_dtype
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
|
||||
|
||||
expected1 = DataFrame(
|
||||
{
|
||||
"a": pd.array([1, 3, 5], dtype=dtype),
|
||||
"b": pd.array([2, 4, 6], dtype=dtype),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)
|
||||
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
|
||||
df["b"] = df["b"].astype(dtype)
|
||||
expected2 = DataFrame(
|
||||
{"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
|
||||
)
|
||||
tm.assert_frame_equal(df, expected2)
|
||||
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
def test_astype_extension_dtypes_1d(self, any_int_ea_dtype):
|
||||
# GH#22578
|
||||
dtype = any_int_ea_dtype
|
||||
df = DataFrame({"a": [1.0, 2.0, 3.0]})
|
||||
|
||||
expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
df = DataFrame({"a": [1.0, 2.0, 3.0]})
|
||||
df["a"] = df["a"].astype(dtype)
|
||||
expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
|
||||
tm.assert_frame_equal(df, expected2)
|
||||
|
||||
tm.assert_frame_equal(df.astype(dtype), expected1)
|
||||
tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["category", "Int64"])
|
||||
def test_astype_extension_dtypes_duplicate_col(self, dtype, using_nan_is_na):
|
||||
# GH#24704
|
||||
a1 = Series([0, np.nan, 4], name="a")
|
||||
a2 = Series([np.nan, 3, 5], name="a")
|
||||
df = concat([a1, a2], axis=1)
|
||||
|
||||
if dtype == "Int64" and not using_nan_is_na:
|
||||
msg = "Cannot cast NaN value to Integer dtype"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
a1.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
a2.astype(dtype)
|
||||
return
|
||||
|
||||
result = df.astype(dtype)
|
||||
expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
|
||||
)
|
||||
def test_astype_column_metadata(self, dtype):
|
||||
# GH#19920
|
||||
columns = Index([100, 200, 300], dtype=np.uint64, name="foo")
|
||||
df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
|
||||
df = df.astype(dtype)
|
||||
tm.assert_index_equal(df.columns, columns)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
|
||||
def test_astype_from_object_to_datetime_unit(self, unit):
|
||||
vals = [
|
||||
["2015-01-01", "2015-01-02", "2015-01-03"],
|
||||
["2017-01-01", "2017-01-02", "2017-02-03"],
|
||||
]
|
||||
df = DataFrame(vals, dtype=object)
|
||||
msg = (
|
||||
rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
|
||||
r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
|
||||
r"'datetime64\[ns\]' or DatetimeTZDtype"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(f"M8[{unit}]")
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
|
||||
def test_astype_from_object_to_timedelta_unit(self, unit):
|
||||
vals = [
|
||||
["1 Day", "2 Days", "3 Days"],
|
||||
["4 Days", "5 Days", "6 Days"],
|
||||
]
|
||||
df = DataFrame(vals, dtype=object)
|
||||
msg = (
|
||||
r"Cannot convert from timedelta64\[us\] to timedelta64\[.*\]. "
|
||||
"Supported resolutions are 's', 'ms', 'us', 'ns'"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
# TODO: this is ValueError while for DatetimeArray it is TypeError;
|
||||
# get these consistent
|
||||
df.astype(f"m8[{unit}]")
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_from_datetimelike_to_object(self, dtype, unit):
|
||||
# tests astype to object dtype
|
||||
# GH#19223 / GH#12425
|
||||
dtype = f"{dtype}[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(object)
|
||||
assert (result.dtypes == object).all()
|
||||
|
||||
if dtype.startswith("M8"):
|
||||
assert result.iloc[0, 0] == Timestamp(1, unit=unit)
|
||||
else:
|
||||
assert result.iloc[0, 0] == Timedelta(1, unit=unit)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_datetimelike_unit(self, any_real_numpy_dtype, dtype, unit):
|
||||
# tests all units from numeric origination
|
||||
# GH#19223 / GH#12425
|
||||
dtype = f"{dtype}[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=any_real_numpy_dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_datetime_unit(self, unit):
|
||||
# tests all units from datetime origination
|
||||
# GH#19223
|
||||
dtype = f"M8[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
ser = df.iloc[:, 0]
|
||||
idx = Index(ser)
|
||||
dta = ser._values
|
||||
|
||||
if unit in ["ns", "us", "ms", "s"]:
|
||||
# GH#48928
|
||||
result = df.astype(dtype)
|
||||
else:
|
||||
# we use the nearest supported dtype (i.e. M8[s])
|
||||
msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ser.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg.replace("Array", "Index")):
|
||||
idx.astype(dtype)
|
||||
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
dta.astype(dtype)
|
||||
|
||||
return
|
||||
|
||||
exp_df = DataFrame(arr.astype(dtype))
|
||||
assert (exp_df.dtypes == dtype).all()
|
||||
tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
res_ser = ser.astype(dtype)
|
||||
exp_ser = exp_df.iloc[:, 0]
|
||||
assert exp_ser.dtype == dtype
|
||||
tm.assert_series_equal(res_ser, exp_ser)
|
||||
|
||||
exp_dta = exp_ser._values
|
||||
|
||||
res_index = idx.astype(dtype)
|
||||
exp_index = Index(exp_ser)
|
||||
assert exp_index.dtype == dtype
|
||||
tm.assert_index_equal(res_index, exp_index)
|
||||
|
||||
res_dta = dta.astype(dtype)
|
||||
assert exp_dta.dtype == dtype
|
||||
tm.assert_extension_array_equal(res_dta, exp_dta)
|
||||
|
||||
def test_astype_to_timedelta_unit_ns(self):
|
||||
# preserver the timedelta conversion
|
||||
# GH#19223
|
||||
dtype = "m8[ns]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
result = df.astype(dtype)
|
||||
expected = DataFrame(arr.astype(dtype))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_timedelta_unit(self, unit):
|
||||
# coerce to float
|
||||
# GH#19223 until 2.0 used to coerce to float
|
||||
dtype = f"m8[{unit}]"
|
||||
arr = np.array([[1, 2, 3]], dtype=dtype)
|
||||
df = DataFrame(arr)
|
||||
ser = df.iloc[:, 0]
|
||||
tdi = Index(ser)
|
||||
tda = tdi._values
|
||||
|
||||
if unit in ["us", "ms", "s"]:
|
||||
assert (df.dtypes == dtype).all()
|
||||
result = df.astype(dtype)
|
||||
else:
|
||||
# We get the nearest supported unit, i.e. "s"
|
||||
assert (df.dtypes == "m8[s]").all()
|
||||
|
||||
msg = (
|
||||
rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
|
||||
"Supported resolutions are 's', 'ms', 'us', 'ns'"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ser.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
tdi.astype(dtype)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
tda.astype(dtype)
|
||||
|
||||
return
|
||||
|
||||
result = df.astype(dtype)
|
||||
# The conversion is a no-op, so we just get a copy
|
||||
expected = df
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
|
||||
def test_astype_to_incorrect_datetimelike(self, unit):
|
||||
# trying to astype an m to an M, or vice-versa
|
||||
# GH#19224
|
||||
dtype = f"M8[{unit}]"
|
||||
other = f"m8[{unit}]"
|
||||
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
|
||||
msg = rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.astype(other)
|
||||
|
||||
msg = rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]"
|
||||
df = DataFrame(np.array([[1, 2, 3]], dtype=other))
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.astype(dtype)
|
||||
|
||||
def test_astype_arg_for_errors(self):
|
||||
# GH#14878
|
||||
|
||||
df = DataFrame([1, 2, 3])
|
||||
|
||||
msg = (
|
||||
"Expected value of kwarg 'errors' to be one of "
|
||||
"['raise', 'ignore']. Supplied value is 'True'"
|
||||
)
|
||||
with pytest.raises(ValueError, match=re.escape(msg)):
|
||||
df.astype(np.float64, errors=True)
|
||||
|
||||
df.astype(np.int8, errors="ignore")
|
||||
|
||||
def test_astype_invalid_conversion(self):
|
||||
# GH#47571
|
||||
df = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})
|
||||
|
||||
msg = (
|
||||
"invalid literal for int() with base 10: 'text': "
|
||||
"Error while type casting for column 'a'"
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=re.escape(msg)):
|
||||
df.astype({"a": int})
|
||||
|
||||
def test_astype_arg_for_errors_dictlist(self):
|
||||
# GH#25905
|
||||
df = DataFrame(
|
||||
[
|
||||
{"a": "1", "b": "16.5%", "c": "test"},
|
||||
{"a": "2.2", "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
{"a": 1.0, "b": "16.5%", "c": "test"},
|
||||
{"a": 2.2, "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected["c"] = expected["c"].astype("object")
|
||||
type_dict = {"a": "float64", "b": "float64", "c": "object"}
|
||||
|
||||
result = df.astype(dtype=type_dict, errors="ignore")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_dt64tz(self, timezone_frame):
|
||||
# astype
|
||||
expected = np.array(
|
||||
[
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00"),
|
||||
Timestamp("2013-01-02 00:00:00"),
|
||||
Timestamp("2013-01-03 00:00:00"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
|
||||
],
|
||||
],
|
||||
dtype=object,
|
||||
).T
|
||||
expected = DataFrame(
|
||||
expected,
|
||||
index=timezone_frame.index,
|
||||
columns=timezone_frame.columns,
|
||||
dtype=object,
|
||||
)
|
||||
result = timezone_frame.astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
# dt64tz->dt64 deprecated
|
||||
timezone_frame.astype("datetime64[ns]")
|
||||
|
||||
def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string):
|
||||
# str formatting
|
||||
result = timezone_frame.astype(str)
|
||||
na_value = np.nan if using_infer_string else "NaT"
|
||||
expected = DataFrame(
|
||||
[
|
||||
[
|
||||
"2013-01-01",
|
||||
"2013-01-01 00:00:00-05:00",
|
||||
"2013-01-01 00:00:00+01:00",
|
||||
],
|
||||
["2013-01-02", na_value, na_value],
|
||||
[
|
||||
"2013-01-03",
|
||||
"2013-01-03 00:00:00-05:00",
|
||||
"2013-01-03 00:00:00+01:00",
|
||||
],
|
||||
],
|
||||
columns=timezone_frame.columns,
|
||||
dtype="str",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with option_context("display.max_columns", 20):
|
||||
result = str(timezone_frame)
|
||||
assert (
|
||||
"0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
|
||||
) in result
|
||||
assert (
|
||||
"1 2013-01-02 NaT NaT"
|
||||
) in result
|
||||
assert (
|
||||
"2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
|
||||
) in result
|
||||
|
||||
def test_astype_empty_dtype_dict(self):
|
||||
# issue mentioned further down in the following issue's thread
|
||||
# https://github.com/pandas-dev/pandas/issues/33113
|
||||
df = DataFrame()
|
||||
result = df.astype({})
|
||||
tm.assert_frame_equal(result, df)
|
||||
assert result is not df
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, dtype",
|
||||
[
|
||||
(["x", "y", "z"], "string[python]"),
|
||||
pytest.param(
|
||||
["x", "y", "z"],
|
||||
"string[pyarrow]",
|
||||
marks=td.skip_if_no("pyarrow"),
|
||||
),
|
||||
(["x", "y", "z"], "category"),
|
||||
(3 * [Timestamp("2020-01-01", tz="UTC")], None),
|
||||
(3 * [Interval(0, 1)], None),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("errors", ["raise", "ignore"])
|
||||
def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
|
||||
# https://github.com/pandas-dev/pandas/issues/35471
|
||||
df = DataFrame(Series(data, dtype=dtype))
|
||||
if errors == "ignore":
|
||||
expected = df
|
||||
result = df.astype(float, errors=errors)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
msg = "(Cannot cast)|(could not convert)"
|
||||
with pytest.raises((ValueError, TypeError), match=msg):
|
||||
df.astype(float, errors=errors)
|
||||
|
||||
def test_astype_tz_conversion(self):
|
||||
# GH 35973, GH#58998
|
||||
msg = "'d' is deprecated and will be removed in a future version."
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg):
|
||||
val = {
|
||||
"tz": date_range(
|
||||
"2020-08-30", freq="d", periods=2, tz="Europe/London", unit="ns"
|
||||
)
|
||||
}
|
||||
df = DataFrame(val)
|
||||
result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})
|
||||
|
||||
expected = df
|
||||
expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
|
||||
def test_astype_tz_object_conversion(self, tz):
|
||||
# GH 35973
|
||||
val = {
|
||||
"tz": date_range(
|
||||
"2020-08-30", freq="D", periods=2, tz="Europe/London", unit="ns"
|
||||
)
|
||||
}
|
||||
expected = DataFrame(val)
|
||||
|
||||
# convert expected to object dtype from other tz str (independently tested)
|
||||
result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
|
||||
result = result.astype({"tz": "object"})
|
||||
|
||||
# do real test: object dtype to a specified tz, different from construction tz.
|
||||
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
|
||||
# GH#41409
|
||||
tz = tz_naive_fixture
|
||||
|
||||
dti = date_range("2016-01-01", periods=3, tz=tz)
|
||||
dta = dti._data
|
||||
dta[0] = NaT
|
||||
|
||||
obj = frame_or_series(dta)
|
||||
result = obj.astype("string")
|
||||
|
||||
# Check that Series/DataFrame.astype matches DatetimeArray.astype
|
||||
expected = frame_or_series(dta.astype("string"))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
item = result.iloc[0]
|
||||
if frame_or_series is DataFrame:
|
||||
item = item.iloc[0]
|
||||
assert item is pd.NA
|
||||
|
||||
# For non-NA values, we should match what we get for non-EA str
|
||||
alt = obj.astype(str)
|
||||
assert np.all(alt.iloc[1:] == result.iloc[1:])
|
||||
|
||||
def test_astype_td64_to_string(self, frame_or_series):
|
||||
# GH#41409
|
||||
tdi = pd.timedelta_range("1 Day", periods=3)
|
||||
obj = frame_or_series(tdi)
|
||||
|
||||
expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
|
||||
result = obj.astype("string")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_astype_bytes(self):
|
||||
# GH#39474
|
||||
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
|
||||
assert result.dtypes[0] == np.dtype("S3")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_slice",
|
||||
[
|
||||
np.s_[:2, :2],
|
||||
np.s_[:1, :2],
|
||||
np.s_[:2, :1],
|
||||
np.s_[::2, ::2],
|
||||
np.s_[::1, ::2],
|
||||
np.s_[::2, ::1],
|
||||
],
|
||||
)
|
||||
def test_astype_noncontiguous(self, index_slice):
|
||||
# GH#42396
|
||||
data = np.arange(16).reshape(4, 4)
|
||||
df = DataFrame(data)
|
||||
|
||||
result = df.iloc[index_slice].astype("int16")
|
||||
expected = df.iloc[index_slice]
|
||||
tm.assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
def test_astype_retain_attrs(self, any_numpy_dtype):
|
||||
# GH#44414
|
||||
df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
|
||||
df.attrs["Location"] = "Michigan"
|
||||
|
||||
result = df.astype({"a": any_numpy_dtype}).attrs
|
||||
expected = df.attrs
|
||||
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
|
||||
class TestAstypeCategorical:
|
||||
def test_astype_from_categorical3(self):
|
||||
df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
|
||||
cats = Categorical([1, 2, 3, 4, 5, 6])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_astype_from_categorical4(self):
|
||||
df = DataFrame(
|
||||
{"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
|
||||
)
|
||||
cats = Categorical(["a", "b", "b", "a", "a", "d"])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_categorical_astype_to_int(self, any_int_dtype):
|
||||
# GH#39402
|
||||
|
||||
df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
|
||||
df.col1 = df.col1.astype("category")
|
||||
df.col1 = df.col1.astype(any_int_dtype)
|
||||
expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_astype_categorical_to_string_missing(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/41797
|
||||
df = DataFrame(["a", "b", np.nan])
|
||||
expected = df.astype(str)
|
||||
cat = df.astype("category")
|
||||
result = cat.astype(str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """``IntegerArray`` whose ``copy`` always raises (GH 42501)."""

    def copy(self):
        """Refuse to copy; any call raises ``NotImplementedError``."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` that reports ``IntegerArrayNoCopy`` as its array type (GH 42501)."""

    def construct_array_type(self):
        """Return the array class associated with this dtype."""
        array_type = IntegerArrayNoCopy
        return array_type
|
||||
|
||||
|
||||
def test_frame_astype_no_copy():
    """Dict-based ``astype`` leaves the unlisted column sharing memory (GH 42501).

    Column ``a`` is cast through ``Int16DtypeNoCopy``; column ``b`` is not
    named in the mapping and must share its buffer with the input frame.
    """
    source = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)

    converted = source.astype({"a": Int16DtypeNoCopy()})

    assert converted.a.dtype == pd.Int16Dtype()
    assert np.shares_memory(source.b.values, converted.b.values)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    """Mutating the source after ``astype`` to a pyarrow dtype leaves the result intact.

    GH#50984. Skipped when pyarrow is not installed.
    """
    pytest.importorskip("pyarrow")
    source = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    converted = source.astype("int64[pyarrow]")

    # Write into the source; the converted frame must still hold 1, 2, 3.
    source.iloc[0, 0] = 100

    untouched = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(converted, untouched)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
|
||||
def test_astype_to_string_not_modifying_input(string_storage, val):
|
||||
# GH#51073
|
||||
df = DataFrame({"a": ["a", "b", val]})
|
||||
expected = df.copy()
|
||||
with option_context("mode.string_storage", string_storage):
|
||||
df.astype("string")
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
|
||||
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
|
||||
# GH#51073 - variant of the above test with explicit dtype instances
|
||||
df = DataFrame({"a": ["a", "b", val]})
|
||||
expected = df.copy()
|
||||
df.astype(any_string_dtype)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
@ -0,0 +1,153 @@
|
||||
from datetime import (
|
||||
time,
|
||||
timezone,
|
||||
)
|
||||
import zoneinfo
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAtTime:
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_localized_at_time(self, tzstr, frame_or_series):
|
||||
tz = timezones.maybe_get_tz(tzstr)
|
||||
|
||||
rng = date_range("4/16/2012", "5/1/2012", freq="h")
|
||||
ts = frame_or_series(
|
||||
np.random.default_rng(2).standard_normal(len(rng)), index=rng
|
||||
)
|
||||
|
||||
ts_local = ts.tz_localize(tzstr)
|
||||
|
||||
result = ts_local.at_time(time(10, 0))
|
||||
expected = ts.at_time(time(10, 0)).tz_localize(tzstr)
|
||||
tm.assert_equal(result, expected)
|
||||
assert timezones.tz_compare(result.index.tz, tz)
|
||||
|
||||
def test_at_time(self, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time("9:30")
|
||||
expected = ts.at_time(time(9, 30))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_at_time_midnight(self, frame_or_series):
|
||||
# midnight, everything
|
||||
rng = date_range("1/1/2000", "1/31/2000")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
tm.assert_equal(result, ts)
|
||||
|
||||
def test_at_time_nonexistent(self, frame_or_series):
|
||||
# time doesn't exist
|
||||
rng = date_range("1/1/2012", freq="23Min", periods=384)
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal(len(rng)), rng)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time("16:00")
|
||||
assert len(rs) == 0
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=timezone.utc)]
|
||||
)
|
||||
def test_at_time_errors(self, hour):
|
||||
# GH#24043
|
||||
dti = date_range("2018", periods=3, freq="h")
|
||||
df = DataFrame(list(range(len(dti))), index=dti)
|
||||
if getattr(hour, "tzinfo", None) is None:
|
||||
result = df.at_time(hour)
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Index must be timezone"):
|
||||
df.at_time(hour)
|
||||
|
||||
def test_at_time_tz(self):
|
||||
# GH#24043
|
||||
dti = date_range("2018", periods=3, freq="h", tz="US/Pacific")
|
||||
df = DataFrame(list(range(len(dti))), index=dti)
|
||||
result = df.at_time(time(4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")))
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_raises(self, frame_or_series):
|
||||
# GH#20725
|
||||
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
msg = "Index must be DatetimeIndex"
|
||||
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
|
||||
obj.at_time("00:00")
|
||||
|
||||
def test_at_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", "1/2/2000", freq="5min")
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
|
||||
ts.index, ts.columns = rng, rng
|
||||
|
||||
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
|
||||
|
||||
if axis in ["index", 0]:
|
||||
expected = ts.loc[indices, :]
|
||||
elif axis in ["columns", 1]:
|
||||
expected = ts.loc[:, indices]
|
||||
|
||||
result = ts.at_time("9:30", axis=axis)
|
||||
|
||||
# Without clearing freq, result has freq 1440T and expected 5T
|
||||
result.index = result.index._with_freq(None)
|
||||
expected.index = expected.index._with_freq(None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
akey = time(12, 0, 0)
|
||||
ainds = [24, 72, 120, 168]
|
||||
|
||||
result = df.at_time(akey)
|
||||
expected = df.loc[akey]
|
||||
expected2 = df.iloc[ainds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 4
|
||||
|
||||
def test_at_time_ambiguous_format_deprecation(self):
|
||||
# GH#50839
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="125min")
|
||||
ts = DataFrame(list(range(len(rng))), index=rng)
|
||||
|
||||
msg1 = "The string '.*' cannot be parsed"
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg1):
|
||||
ts.at_time("2022-12-12 00:00:00")
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg1):
|
||||
ts.at_time("2022-12-12 00:00:00 +09:00")
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg1):
|
||||
ts.at_time("2022-12-12 00:00:00.000000")
|
||||
|
||||
# The dateutil parser raises on these, so we can give the future behavior
|
||||
# immediately using pd.core.tools.to_time
|
||||
ts.at_time("235500")
|
||||
ts.at_time("115500PM")
|
||||
@ -0,0 +1,227 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
time,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestBetweenTime:
|
||||
@td.skip_if_not_us_locale
|
||||
def test_between_time_formats(self, frame_or_series):
|
||||
# GH#11818
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
strings = [
|
||||
("2:00", "2:30"),
|
||||
("0200", "0230"),
|
||||
("2:00am", "2:30am"),
|
||||
("0200am", "0230am"),
|
||||
("2:00:00", "2:30:00"),
|
||||
("020000", "023000"),
|
||||
("2:00:00am", "2:30:00am"),
|
||||
("020000am", "023000am"),
|
||||
]
|
||||
expected_length = 28
|
||||
|
||||
for time_string in strings:
|
||||
assert len(ts.between_time(*time_string)) == expected_length
|
||||
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_localized_between_time(self, tzstr, frame_or_series):
|
||||
tz = timezones.maybe_get_tz(tzstr)
|
||||
|
||||
rng = date_range("4/16/2012", "5/1/2012", freq="h")
|
||||
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
|
||||
if frame_or_series is DataFrame:
|
||||
ts = ts.to_frame()
|
||||
|
||||
ts_local = ts.tz_localize(tzstr)
|
||||
|
||||
t1, t2 = time(10, 0), time(11, 0)
|
||||
result = ts_local.between_time(t1, t2)
|
||||
expected = ts.between_time(t1, t2).tz_localize(tzstr)
|
||||
tm.assert_equal(result, expected)
|
||||
assert timezones.tz_compare(result.index.tz, tz)
|
||||
|
||||
def test_between_time_types(self, frame_or_series):
|
||||
# GH11818
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
obj = DataFrame({"A": 0}, index=rng)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5))
|
||||
|
||||
def test_between_time(self, inclusive_endpoints_fixture, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inclusive = inclusive_endpoints_fixture
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = 13 * 4 + 1
|
||||
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 5
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time("00:00", "01:00")
|
||||
expected = ts.between_time(stime, etime)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 4
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self, frame_or_series):
|
||||
# GH#20725
|
||||
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = "Index must be DatetimeIndex"
|
||||
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
|
||||
obj.between_time(start_time="00:00", end_time="12:00")
|
||||
|
||||
def test_between_time_axis(self, frame_or_series):
|
||||
# GH#8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
|
||||
if frame_or_series is DataFrame:
|
||||
ts = ts.to_frame()
|
||||
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
expected_length = 7
|
||||
|
||||
assert len(ts.between_time(stime, etime)) == expected_length
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == expected_length
|
||||
msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.between_time(stime, etime, axis=ts.ndim)
|
||||
|
||||
def test_between_time_axis_aliases(self, axis):
|
||||
# GH#8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
exp_len = 7
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.index = rng
|
||||
assert len(ts.between_time(stime, etime)) == exp_len
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
|
||||
|
||||
if axis in ["columns", 1]:
|
||||
ts.columns = rng
|
||||
selected = ts.between_time(stime, etime, axis=1).columns
|
||||
assert len(selected) == exp_len
|
||||
|
||||
def test_between_time_axis_raises(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
mask = np.arange(0, len(rng))
|
||||
rand_data = np.random.default_rng(2).standard_normal((len(rng), len(rng)))
|
||||
ts = DataFrame(rand_data, index=rng, columns=rng)
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
|
||||
msg = "Index must be DatetimeIndex"
|
||||
if axis in ["columns", 1]:
|
||||
ts.index = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=0)
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.columns = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=1)
|
||||
|
||||
def test_between_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
bkey = slice(time(13, 0, 0), time(14, 0, 0))
|
||||
binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
|
||||
|
||||
result = df.between_time(bkey.start, bkey.stop)
|
||||
expected = df.loc[bkey]
|
||||
expected2 = df.iloc[binds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 12
|
||||
|
||||
def test_between_time_incorrect_arg_inclusive(self):
|
||||
# GH40245
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inclusive = "bad_string"
|
||||
msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.between_time(stime, etime, inclusive=inclusive)
|
||||
@ -0,0 +1,200 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameClip:
|
||||
def test_clip(self, float_frame):
|
||||
median = float_frame.median().median()
|
||||
original = float_frame.copy()
|
||||
|
||||
double = float_frame.clip(upper=median, lower=median)
|
||||
assert not (double.values != median).any()
|
||||
|
||||
# Verify that float_frame was not changed inplace
|
||||
assert (float_frame.values == original.values).all()
|
||||
|
||||
def test_inplace_clip(self, float_frame):
|
||||
# GH#15388
|
||||
median = float_frame.median().median()
|
||||
frame_copy = float_frame.copy()
|
||||
|
||||
result = frame_copy.clip(upper=median, lower=median, inplace=True)
|
||||
assert result is frame_copy
|
||||
assert not (frame_copy.values != median).any()
|
||||
|
||||
def test_dataframe_clip(self):
|
||||
# GH#2747
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
|
||||
for lb, ub in [(-1, 1), (1, -1)]:
|
||||
clipped_df = df.clip(lb, ub)
|
||||
|
||||
lb, ub = min(lb, ub), max(ub, lb)
|
||||
lb_mask = df.values <= lb
|
||||
ub_mask = df.values >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
assert (clipped_df.values[lb_mask] == lb).all()
|
||||
assert (clipped_df.values[ub_mask] == ub).all()
|
||||
assert (clipped_df.values[mask] == df.values[mask]).all()
|
||||
|
||||
def test_clip_mixed_numeric(self):
|
||||
# clip on mixed integer or floats
|
||||
# GH#24162, clipping now preserves numeric types per column
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
|
||||
result = df.clip(1, 2)
|
||||
expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
|
||||
expected = df.dtypes
|
||||
result = df.clip(upper=3).dtypes
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
|
||||
def test_clip_against_series(self, inplace):
|
||||
# GH#6966
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
lb = Series(np.random.default_rng(2).standard_normal(1000))
|
||||
ub = lb + 1
|
||||
|
||||
original = df.copy()
|
||||
clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)
|
||||
|
||||
if inplace:
|
||||
assert clipped_df is df
|
||||
|
||||
for i in range(2):
|
||||
lb_mask = original.iloc[:, i] <= lb
|
||||
ub_mask = original.iloc[:, i] >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
|
||||
result = clipped_df.loc[lb_mask, i]
|
||||
tm.assert_series_equal(result, lb[lb_mask], check_names=False)
|
||||
assert result.name == i
|
||||
|
||||
result = clipped_df.loc[ub_mask, i]
|
||||
tm.assert_series_equal(result, ub[ub_mask], check_names=False)
|
||||
assert result.name == i
|
||||
|
||||
tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
|
||||
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
|
||||
@pytest.mark.parametrize(
|
||||
"axis,res",
|
||||
[
|
||||
(0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
|
||||
(1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
|
||||
],
|
||||
)
|
||||
def test_clip_against_list_like(self, inplace, lower, axis, res):
|
||||
# GH#15390
|
||||
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
|
||||
|
||||
original = DataFrame(
|
||||
arr, columns=["one", "two", "three"], index=["a", "b", "c"]
|
||||
)
|
||||
|
||||
result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)
|
||||
|
||||
expected = DataFrame(res, columns=original.columns, index=original.index)
|
||||
if inplace:
|
||||
assert result is original
|
||||
tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1, None])
|
||||
def test_clip_against_frame(self, axis):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
lb = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
ub = lb + 1
|
||||
|
||||
clipped_df = df.clip(lb, ub, axis=axis)
|
||||
|
||||
lb_mask = df <= lb
|
||||
ub_mask = df >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
|
||||
tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
|
||||
tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
|
||||
tm.assert_frame_equal(clipped_df[mask], df[mask])
|
||||
|
||||
def test_clip_against_unordered_columns(self):
|
||||
# GH#20911
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["D", "A", "B", "C"],
|
||||
)
|
||||
df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
|
||||
result_upper = df1.clip(lower=0, upper=df2)
|
||||
expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
|
||||
result_lower = df1.clip(lower=df3, upper=3)
|
||||
expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
|
||||
result_lower_upper = df1.clip(lower=df3, upper=df2)
|
||||
expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
|
||||
tm.assert_frame_equal(result_upper, expected_upper)
|
||||
tm.assert_frame_equal(result_lower, expected_lower)
|
||||
tm.assert_frame_equal(result_lower_upper, expected_lower_upper)
|
||||
|
||||
def test_clip_with_na_args(self, float_frame):
|
||||
"""Should process np.nan argument as None"""
|
||||
# GH#17276
|
||||
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
|
||||
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)
|
||||
|
||||
# GH#19992 and adjusted in GH#40420
|
||||
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
|
||||
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=0)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"col_0": Series([4, 5, 3], dtype="float"),
|
||||
"col_1": [4, 5, 6],
|
||||
"col_2": [7, 8, 9],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=1)
|
||||
expected = DataFrame(
|
||||
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH#40420
|
||||
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
|
||||
df = DataFrame(data)
|
||||
t = Series([2, -4, np.nan, 6, 3])
|
||||
result = df.clip(lower=t, axis=0)
|
||||
expected = DataFrame(
|
||||
{"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}, dtype="float"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_int_data_with_float_bound(self):
|
||||
# GH51472
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
result = df.clip(lower=1.5)
|
||||
expected = DataFrame({"a": [1.5, 2.0, 3.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_with_list_bound(self):
|
||||
# GH#54817
|
||||
df = DataFrame([1, 5])
|
||||
expected = DataFrame([3, 5])
|
||||
result = df.clip([3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([1, 3])
|
||||
result = df.clip(upper=[3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,63 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCombine:
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
pd.date_range("2000", periods=4),
|
||||
pd.date_range("2000", periods=4, tz="US/Central"),
|
||||
pd.period_range("2000", periods=4),
|
||||
pd.timedelta_range(0, periods=4),
|
||||
],
|
||||
)
|
||||
def test_combine_datetlike_udf(self, data):
|
||||
# GH#23079
|
||||
df = pd.DataFrame({"A": data})
|
||||
other = df.copy()
|
||||
df.iloc[1, 0] = None
|
||||
|
||||
def combiner(a, b):
|
||||
return b
|
||||
|
||||
result = df.combine(other, combiner)
|
||||
tm.assert_frame_equal(result, other)
|
||||
|
||||
def test_combine_generic(self, float_frame):
|
||||
df1 = float_frame
|
||||
df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
combined = df1.combine(df2, np.add)
|
||||
combined2 = df2.combine(df1, np.add)
|
||||
assert combined["D"].isna().all()
|
||||
assert combined2["D"].isna().all()
|
||||
|
||||
chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
|
||||
chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
exp = (
|
||||
float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
|
||||
* 2
|
||||
)
|
||||
tm.assert_frame_equal(chunk, exp)
|
||||
tm.assert_frame_equal(chunk2, exp)
|
||||
|
||||
def test_combine_nonunique_columns(self):
|
||||
# GH#51340
|
||||
|
||||
df = pd.DataFrame({"A": range(5), "B": range(5)})
|
||||
df.columns = ["A", "A"]
|
||||
|
||||
other = df.copy()
|
||||
df.iloc[1, :] = None
|
||||
|
||||
def combiner(a, b):
|
||||
return b
|
||||
|
||||
result = df.combine(other, combiner)
|
||||
expected = other.astype("float64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,597 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.cast import find_common_type
|
||||
from pandas.core.dtypes.common import is_dtype_equal
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst:
|
||||
def test_combine_first_mixed(self):
|
||||
a = Series(["a", "b"], index=range(2))
|
||||
b = Series(range(2), index=range(2))
|
||||
f = DataFrame({"A": a, "B": b})
|
||||
|
||||
a = Series(["a", "b"], index=range(5, 7))
|
||||
b = Series(range(2), index=range(5, 7))
|
||||
g = DataFrame({"A": a, "B": b})
|
||||
|
||||
exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_frame_equal(combined, exp)
|
||||
|
||||
def test_combine_first_disjoint(self, float_frame):
|
||||
head, tail = float_frame[:5], float_frame[5:]
|
||||
combined = head.combine_first(tail)
|
||||
reordered_frame = float_frame.reindex(combined.index)
|
||||
|
||||
tm.assert_frame_equal(combined, reordered_frame)
|
||||
tm.assert_index_equal(combined.columns, float_frame.columns)
|
||||
tm.assert_series_equal(combined["A"], reordered_frame["A"])
|
||||
|
||||
tm.assert_series_equal(combined["A"].reindex(head.index), head["A"])
|
||||
tm.assert_series_equal(combined["A"].reindex(tail.index), tail["A"])
|
||||
|
||||
def test_combine_first_same_index(self, float_frame):
|
||||
fcopy = float_frame.copy()
|
||||
fcopy["A"] = 1
|
||||
del fcopy["C"]
|
||||
|
||||
fcopy2 = float_frame.copy()
|
||||
fcopy2["B"] = 0
|
||||
del fcopy2["D"]
|
||||
|
||||
combined = fcopy.combine_first(fcopy2)
|
||||
|
||||
assert (combined["A"] == 1).all()
|
||||
tm.assert_series_equal(combined["B"], fcopy["B"])
|
||||
tm.assert_series_equal(combined["C"], fcopy2["C"])
|
||||
tm.assert_series_equal(combined["D"], fcopy["D"])
|
||||
|
||||
def test_combine_first_overlap(self, float_frame):
|
||||
combined = float_frame[:5].combine_first(float_frame[5:])
|
||||
reordered_frame = float_frame.reindex(combined.index)
|
||||
head, tail = reordered_frame[:10].copy(), reordered_frame.copy()
|
||||
head["A"] = 1
|
||||
combined = head.combine_first(tail)
|
||||
assert (combined["A"][:10] == 1).all()
|
||||
|
||||
def test_combine_first_reverse_overlap(self, float_frame):
|
||||
combined = float_frame[:5].combine_first(float_frame[5:])
|
||||
reordered_frame = float_frame.reindex(combined.index)
|
||||
head, tail = reordered_frame[:10].copy(), reordered_frame
|
||||
|
||||
tail.iloc[:10, tail.columns.get_loc("A")] = 0
|
||||
combined = tail.combine_first(head)
|
||||
assert (combined["A"][:10] == 0).all()
|
||||
|
||||
def test_combine_first_with_empty(self, float_frame):
|
||||
comb = float_frame.combine_first(DataFrame())
|
||||
tm.assert_frame_equal(comb, float_frame)
|
||||
|
||||
comb = DataFrame().combine_first(float_frame)
|
||||
tm.assert_frame_equal(comb, float_frame.sort_index())
|
||||
|
||||
def test_combine_first_with_new_index(self, float_frame):
|
||||
comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
|
||||
assert "faz" in comb.index
|
||||
|
||||
def test_combine_first_column_union(self):
|
||||
# GH#2525
|
||||
df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
|
||||
df2 = DataFrame(columns=["b"])
|
||||
result = df.combine_first(df2)
|
||||
assert "b" in result
|
||||
|
||||
def test_combine_first_mixed_bug(self):
|
||||
idx = Index(["a", "b", "c", "e"])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
|
||||
ser2 = Series(["a", "b", "c", "e"], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
|
||||
|
||||
idx = Index(["a", "b", "c", "f"])
|
||||
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
|
||||
ser2 = Series(["a", "b", "c", "f"], index=idx)
|
||||
ser3 = Series([12, 4, 5, 97], index=idx)
|
||||
|
||||
frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
|
||||
|
||||
combined = frame1.combine_first(frame2)
|
||||
assert len(combined.columns) == 5
|
||||
|
||||
def test_combine_first_same_as_in_update(self):
|
||||
# gh 3016 (same as in update)
|
||||
df = DataFrame(
|
||||
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
|
||||
columns=["A", "B", "bool1", "bool2"],
|
||||
)
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
|
||||
result = df.combine_first(other)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
df.loc[0, "A"] = np.nan
|
||||
result = df.combine_first(other)
|
||||
df.loc[0, "A"] = 45
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_combine_first_doc_example(self):
|
||||
# doc example
|
||||
df1 = DataFrame(
|
||||
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
|
||||
"B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
|
||||
}
|
||||
)
|
||||
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_combine_first_return_obj_type_with_bools(self):
|
||||
# GH3552
|
||||
|
||||
df1 = DataFrame(
|
||||
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
|
||||
)
|
||||
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
|
||||
|
||||
expected = Series([True, True, False], name=2, dtype=bool)
|
||||
|
||||
result_12 = df1.combine_first(df2)[2]
|
||||
tm.assert_series_equal(result_12, expected)
|
||||
|
||||
result_21 = df2.combine_first(df1)[2]
|
||||
tm.assert_series_equal(result_21, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data1, data2, data_expected",
|
||||
(
|
||||
(
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
[pd.NaT, pd.NaT, pd.NaT],
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
),
|
||||
(
|
||||
[pd.NaT, pd.NaT, pd.NaT],
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
),
|
||||
(
|
||||
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
),
|
||||
(
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
|
||||
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
||||
),
|
||||
),
|
||||
)
|
||||
def test_combine_first_convert_datatime_correctly(
|
||||
self, data1, data2, data_expected
|
||||
):
|
||||
# GH 3593
|
||||
|
||||
df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame({"a": data_expected})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_combine_first_align_nan(self):
|
||||
# GH 7509 (not fixed)
|
||||
ts = pd.Timestamp("2011-01-01").as_unit("s")
|
||||
dfa = DataFrame([[ts, 2]], columns=["a", "b"])
|
||||
dfb = DataFrame([[4], [5]], columns=["b"])
|
||||
assert dfa["a"].dtype == "datetime64[s]"
|
||||
assert dfa["b"].dtype == "int64"
|
||||
|
||||
res = dfa.combine_first(dfb)
|
||||
exp = DataFrame(
|
||||
{"a": [ts, pd.NaT], "b": [2, 5]},
|
||||
columns=["a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["a"].dtype == "datetime64[s]"
|
||||
# TODO: this must be int64
|
||||
assert res["b"].dtype == "int64"
|
||||
|
||||
res = dfa.iloc[:0].combine_first(dfb)
|
||||
exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
# TODO: this must be datetime64
|
||||
assert res["a"].dtype == "float64"
|
||||
# TODO: this must be int64
|
||||
assert res["b"].dtype == "int64"
|
||||
|
||||
def test_combine_first_timezone(self, unit):
|
||||
# see gh-7630
|
||||
data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit)
|
||||
df1 = DataFrame(
|
||||
columns=["UTCdatetime", "abc"],
|
||||
data=data1,
|
||||
index=pd.date_range("20140627", periods=1, unit=unit),
|
||||
)
|
||||
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit)
|
||||
df2 = DataFrame(
|
||||
columns=["UTCdatetime", "xyz"],
|
||||
data=data2,
|
||||
index=pd.date_range("20140628", periods=1, unit=unit),
|
||||
)
|
||||
res = df2[["UTCdatetime"]].combine_first(df1)
|
||||
exp = DataFrame(
|
||||
{
|
||||
"UTCdatetime": [
|
||||
pd.Timestamp("2010-01-01 01:01", tz="UTC"),
|
||||
pd.Timestamp("2012-12-12 12:12", tz="UTC"),
|
||||
],
|
||||
"abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
|
||||
},
|
||||
columns=["UTCdatetime", "abc"],
|
||||
index=pd.date_range("20140627", periods=2, freq="D", unit=unit),
|
||||
dtype=f"datetime64[{unit}, UTC]",
|
||||
)
|
||||
assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]"
|
||||
assert res["abc"].dtype == f"datetime64[{unit}, UTC]"
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_combine_first_timezone2(self, unit):
|
||||
# see gh-10567
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit)
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit)
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res["DATE"].dtype == f"datetime64[{unit}, UTC]"
|
||||
|
||||
def test_combine_first_timezone3(self, unit):
|
||||
dts1 = pd.DatetimeIndex(
|
||||
["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
|
||||
).as_unit(unit)
|
||||
df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
|
||||
dts2 = pd.DatetimeIndex(
|
||||
["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
|
||||
).as_unit(unit)
|
||||
df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.DatetimeIndex(
|
||||
[
|
||||
"2011-01-01",
|
||||
"2012-01-01",
|
||||
"NaT",
|
||||
"2012-01-02",
|
||||
"2011-01-03",
|
||||
"2011-01-04",
|
||||
],
|
||||
tz="US/Eastern",
|
||||
).as_unit(unit)
|
||||
exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_combine_first_timezone4(self, unit):
|
||||
# different tz
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern", unit=unit)
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-03", "2015-01-05", unit=unit)
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
# if df1 doesn't have NaN, keep its dtype
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res["DATE"].dtype == f"datetime64[{unit}, US/Eastern]"
|
||||
|
||||
def test_combine_first_timezone5(self, unit):
|
||||
dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit)
|
||||
df1 = DataFrame({"DATE": dts1})
|
||||
dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit)
|
||||
df2 = DataFrame({"DATE": dts2})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [
|
||||
pd.Timestamp("2015-01-01", tz="US/Eastern"),
|
||||
pd.Timestamp("2015-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2015-01-03"),
|
||||
]
|
||||
exp = DataFrame({"DATE": exp_dts})
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["DATE"].dtype == "object"
|
||||
|
||||
def test_combine_first_timedelta(self):
|
||||
data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
|
||||
df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
|
||||
df2 = DataFrame({"TD": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.TimedeltaIndex(
|
||||
["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
|
||||
)
|
||||
exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["TD"].dtype == "timedelta64[us]"
|
||||
|
||||
def test_combine_first_period(self):
|
||||
data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
|
||||
df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
|
||||
df2 = DataFrame({"P": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.PeriodIndex(
|
||||
["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
|
||||
)
|
||||
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == data1.dtype
|
||||
|
||||
# different freq
|
||||
dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
|
||||
df2 = DataFrame({"P": dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [
|
||||
pd.Period("2011-01", freq="M"),
|
||||
pd.Period("2012-01-01", freq="D"),
|
||||
pd.NaT,
|
||||
pd.Period("2012-01-02", freq="D"),
|
||||
pd.Period("2011-03", freq="M"),
|
||||
pd.Period("2011-04", freq="M"),
|
||||
]
|
||||
exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == "object"
|
||||
|
||||
def test_combine_first_int(self):
|
||||
# GH14687 - integer series that do no align exactly
|
||||
|
||||
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
|
||||
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
|
||||
|
||||
result_12 = df1.combine_first(df2)
|
||||
expected_12 = DataFrame({"a": [0, 1, 3, 5]})
|
||||
tm.assert_frame_equal(result_12, expected_12)
|
||||
|
||||
result_21 = df2.combine_first(df1)
|
||||
expected_21 = DataFrame({"a": [1, 4, 3, 5]})
|
||||
tm.assert_frame_equal(result_21, expected_21)
|
||||
|
||||
@pytest.mark.parametrize("val", [1, 1.0])
|
||||
def test_combine_first_with_asymmetric_other(self, val):
|
||||
# see gh-20699
|
||||
df1 = DataFrame({"isNum": [val]})
|
||||
df2 = DataFrame({"isBool": [True]})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp = DataFrame({"isNum": [val], "isBool": [True]})
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
|
||||
# GH: 37519
|
||||
df = DataFrame(
|
||||
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
|
||||
)
|
||||
df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
|
||||
df.set_index(["a", "b"], inplace=True)
|
||||
df2.set_index(["a", "b"], inplace=True)
|
||||
result = df.combine_first(df2)
|
||||
expected = DataFrame(
|
||||
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
|
||||
).set_index(["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"wide_val, dtype",
|
||||
(
|
||||
(1666880195890293744, "UInt64"),
|
||||
(-1666880195890293744, "Int64"),
|
||||
),
|
||||
)
|
||||
def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
|
||||
# GH#60128
|
||||
df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype)
|
||||
df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype)
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_combine_first_non_unique_columns(self):
|
||||
# GH#29135
|
||||
df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
|
||||
df2 = DataFrame(
|
||||
[[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"]
|
||||
)
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame(
|
||||
[[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"scalar1, scalar2",
|
||||
[
|
||||
(datetime(2020, 1, 1), datetime(2020, 1, 2)),
|
||||
(pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
|
||||
(pd.Timedelta("89 days"), pd.Timedelta("60 min")),
|
||||
(pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
|
||||
],
|
||||
)
|
||||
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
|
||||
# GH28481
|
||||
na_value = nulls_fixture
|
||||
|
||||
frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
|
||||
other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])
|
||||
|
||||
common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]])
|
||||
|
||||
if (
|
||||
is_dtype_equal(common_dtype, "object")
|
||||
or frame.dtypes["b"] == other.dtypes["b"]
|
||||
or frame.dtypes["b"].kind == frame.dtypes["b"].kind == "M"
|
||||
):
|
||||
val = scalar1
|
||||
else:
|
||||
val = na_value
|
||||
|
||||
result = frame.combine_first(other)
|
||||
|
||||
expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
|
||||
|
||||
expected["b"] = expected["b"].astype(common_dtype)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_timestamp_bug_NaT():
    """A NaT-only caller column is filled with the datetime from ``other`` (GH28481)."""
    first_day = datetime(2020, 1, 1)
    second_day = datetime(2020, 1, 2)
    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame([[first_day, second_day]], columns=["b", "c"])

    result = frame.combine_first(other)

    # "a" has no counterpart in ``other`` and stays NaT.
    expected = DataFrame(
        [[pd.NaT, first_day, second_day]], columns=["a", "b", "c"]
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_with_nan_multiindex():
    """A NaN label in a MultiIndex level survives combine_first (gh-36562)."""
    level_names = ["a", "b"]
    nan = np.nan

    caller_index = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", nan], [1, 2, 3, 4, 5, 6]], names=level_names
    )
    df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=caller_index)

    other_index = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=level_names
    )
    other = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=other_index)})

    res = df.combine_first(other)

    mi_expected = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    expected = DataFrame(
        {
            "c": [nan, nan, 1, 1, 1, 1, 1, nan, 1, nan, 1],
            "d": [1.0, 4.0, nan, 2.0, 5.0, nan, nan, 3.0, nan, 6.0, nan],
        },
        index=mi_expected,
    )
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_combine_preserve_dtypes():
    """Frames with disjoint rows keep their column dtypes when combined (GH7509)."""
    df1 = DataFrame(
        {
            "A": Series(["a", "b"], index=range(2)),
            "B": Series(range(2), index=range(2)),
        }
    )

    # Build "C" before "B", as the original does, but list "B" first in the frame.
    c_values = Series(["a", "b"], index=range(5, 7))
    b_values = Series(range(-1, 1), index=range(5, 7))
    df2 = DataFrame({"B": b_values, "C": c_values})

    expected = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )

    result = df1.combine_first(df2)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_duplicates_rows_for_nan_index_values():
|
||||
# GH39881
|
||||
df1 = DataFrame(
|
||||
{"x": [9, 10, 11]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{"y": [12, 13, 14]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"x": [9.0, 10.0, 11.0, np.nan],
|
||||
"y": [12.0, 13.0, np.nan, 14.0],
|
||||
},
|
||||
index=MultiIndex.from_arrays(
|
||||
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
|
||||
),
|
||||
)
|
||||
combined = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(combined, expected)
|
||||
|
||||
|
||||
def test_combine_first_int64_not_cast_to_float64():
    """Integer columns stay integer when ``other`` adds a new column (GH 28613)."""
    narrow = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    wide = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})

    result = narrow.combine_first(wide)

    # "A" and "B" come from the caller; "C" comes from ``wide``.
    expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_midx_losing_dtype():
    """combine_first over MultiIndexes whose second level is all NaN (GH#49830)."""
    nan = np.nan
    caller_idx = MultiIndex.from_arrays([[0, 0], [nan, nan]])
    other_idx = MultiIndex.from_arrays([[1, 1], [nan, nan]])
    df1 = DataFrame({"a": [None, 4]}, index=caller_idx)
    df2 = DataFrame({"a": [3, 3]}, index=other_idx)

    result = df1.combine_first(df2)

    expected_midx = MultiIndex.from_arrays([[0, 0, 1, 1], [nan, nan, nan, nan]])
    expected = DataFrame({"a": [nan, 4, 3, 3]}, index=expected_midx)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_empty_columns():
|
||||
left = DataFrame(columns=["a", "b"])
|
||||
right = DataFrame(columns=["a", "c"])
|
||||
result = left.combine_first(right)
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_preserve_column_order():
|
||||
# GH#60427
|
||||
df1 = DataFrame({"B": [1, 2, 3], "A": [4, None, 6]})
|
||||
df2 = DataFrame({"A": [5]}, index=[1])
|
||||
|
||||
result = df1.combine_first(df2)
|
||||
expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,304 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """``align_axis`` decides whether self/other are laid out as rows or columns (GH#30429)."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    sides = ["self", "other"]
    if align_axis not in (1, "columns"):
        # self/other are stacked as an inner index level.
        expected = pd.DataFrame(
            [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
            index=pd.MultiIndex.from_product([range(0, 4, 2), sides]),
            columns=pd.Index(["col1", "col3"]),
        )
    else:
        # self/other are placed side by side as an inner column level.
        expected = pd.DataFrame(
            [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
            index=pd.RangeIndex(0, 4, 2),
            columns=pd.MultiIndex.from_product([["col1", "col3"], sides]),
        )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Check ``compare`` output for the ``keep_shape`` / ``keep_equal`` combinations."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal)

    sides = ["self", "other"]
    if not keep_shape:
        # Only differing rows and columns remain; this expectation shows equal values.
        data = [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]]
        indices = pd.RangeIndex(0, 4, 2)
        columns = pd.MultiIndex.from_product([["col1", "col3"], sides])
    else:
        # All rows and columns are retained.
        indices = pd.RangeIndex(3)
        columns = pd.MultiIndex.from_product([["col1", "col2", "col3"], sides])
        if keep_equal:
            data = [
                ["a", "c", 1.0, 1.0, 1.0, 1.0],
                ["b", "b", 2.0, 2.0, 2.0, 2.0],
                ["c", "c", np.nan, np.nan, 3.0, 4.0],
            ]
        else:
            data = [
                ["a", "c", np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
            ]
    expected = pd.DataFrame(data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_with_equal_nulls():
    """Two NaNs in the same position count as equal and are dropped where applicable."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"

    result = left.compare(right)

    # Only (0, "col1") differs; the NaN in "col2" is present on both sides.
    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_with_non_equal_nulls():
    """NaNs that mark a real difference are kept, even if a whole row or column is NaN."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = np.nan

    result = left.compare(right)

    # Row 2 differs because 3.0 became NaN on the other side.
    expected = pd.DataFrame(
        [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
        index=pd.Index([0, 2]),
        columns=pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """``compare`` appends the self/other level to MultiIndex rows or columns."""
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]])
    df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    df2 = df.copy()
    df2.iloc[0, 0] = "c"
    df2.iloc[2, 2] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    sides = ["self", "other", "self", "other"]
    if align_axis != 0:
        # self/other become the innermost column level.
        indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        columns = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                sides,
            ]
        )
        data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
    else:
        # self/other become the innermost index level.
        indices = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], sides]
        )
        columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]

    expected = pd.DataFrame(data=data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_different_indices():
    """Frames whose row labels differ cannot be compared and raise ValueError."""
    left = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    right = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )
    with pytest.raises(ValueError, match=msg):
        left.compare(right)
|
||||
|
||||
|
||||
def test_compare_different_shapes():
    """Frames of different shapes cannot be compared and raise ValueError."""
    three_by_three = pd.DataFrame(np.ones((3, 3)))
    two_by_one = pd.DataFrame(np.zeros((2, 1)))
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )
    with pytest.raises(ValueError, match=msg):
        three_by_three.compare(two_by_one)
|
||||
|
||||
|
||||
def test_compare_result_names():
    """``result_names`` replaces the default self/other column labels (GH 44354)."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    result = left.compare(right, result_names=("left", "right"))
    # The result index is overwritten before the comparison.
    result.index = pd.Index([0, 2])

    nan = np.nan
    expected = pd.DataFrame(
        {
            ("col1", "left"): {0: "a", 2: nan},
            ("col1", "right"): {0: "c", 2: nan},
            ("col3", "left"): {0: nan, 2: 3.0},
            ("col3", "right"): {0: nan, 2: nan},
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """A non-tuple ``result_names`` raises TypeError naming the given type (GH 44354)."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )
    msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=msg):
        left.compare(right, result_names=result_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"val1,val2",
|
||||
[(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
|
||||
)
|
||||
def test_compare_ea_and_np_dtype(val1, val2):
|
||||
# GH 48966
|
||||
arr = [4.0, val1]
|
||||
ser = pd.Series([1, val2], dtype="Int64")
|
||||
|
||||
df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]})
|
||||
df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]})
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
("a", "self"): arr,
|
||||
("a", "other"): ser,
|
||||
("b", "self"): np.nan,
|
||||
("b", "other"): np.nan,
|
||||
}
|
||||
)
|
||||
if val1 is pd.NA and val2 is pd.NA:
|
||||
# GH#18463 TODO: is this really the desired behavior?
|
||||
expected.loc[1, ("a", "self")] = np.nan
|
||||
|
||||
if val1 is pd.NA:
|
||||
# can't compare with numpy array if it contains pd.NA
|
||||
with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
|
||||
result = df1.compare(df2, keep_shape=True)
|
||||
else:
|
||||
result = df1.compare(df2, keep_shape=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """Compare two nullable ``Int64`` columns with ``keep_shape=True`` (GH 48966)."""
    left = pd.DataFrame(
        {"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}
    )
    right = left.copy()
    right.loc[0, "a"] = df2_val

    # Row 1 of "a" is NA on both sides; "b" is identical, so it is all NaN.
    expected = pd.DataFrame(
        {
            ("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"),
            ("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"),
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    result = left.compare(right, keep_shape=True)
    tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,255 @@
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestConvertDtypes:
|
||||
@pytest.mark.parametrize(
|
||||
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
|
||||
)
|
||||
def test_convert_dtypes(self, convert_integer, expected, string_storage):
|
||||
# Specific types are tested in tests/series/test_dtypes.py
|
||||
# Just check that it works for DataFrame here
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
|
||||
"b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
|
||||
}
|
||||
)
|
||||
with pd.option_context("string_storage", string_storage):
|
||||
result = df.convert_dtypes(True, True, convert_integer, False)
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"a": pd.Series([1, 2, 3], dtype=expected),
|
||||
"b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_empty(self):
|
||||
# Empty DataFrame can pass convert_dtypes, see GH#40393
|
||||
empty_df = pd.DataFrame()
|
||||
tm.assert_frame_equal(empty_df, empty_df.convert_dtypes())
|
||||
|
||||
@td.skip_if_no("pyarrow")
|
||||
def test_convert_empty_categorical_to_pyarrow(self):
|
||||
# GH#59934
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"A": pd.Categorical([None] * 5),
|
||||
"B": pd.Categorical([None] * 5, categories=["B1", "B2"]),
|
||||
}
|
||||
)
|
||||
converted = df.convert_dtypes(dtype_backend="pyarrow")
|
||||
expected = df
|
||||
tm.assert_frame_equal(converted, expected)
|
||||
|
||||
def test_convert_dtypes_retain_column_names(self):
|
||||
# GH#41435
|
||||
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
|
||||
df.columns.name = "cols"
|
||||
|
||||
result = df.convert_dtypes()
|
||||
tm.assert_index_equal(result.columns, df.columns)
|
||||
assert result.columns.name == "cols"
|
||||
|
||||
def test_pyarrow_dtype_backend(self, using_nan_is_na):
|
||||
pa = pytest.importorskip("pyarrow")
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
|
||||
"b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
|
||||
"c": pd.Series([True, False, None], dtype=np.dtype("O")),
|
||||
"d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
|
||||
"e": pd.Series(pd.date_range("2022", periods=3, unit="ns")),
|
||||
"f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
|
||||
"g": pd.Series(pd.timedelta_range("1D", periods=3)),
|
||||
}
|
||||
)
|
||||
result = df.convert_dtypes(dtype_backend="pyarrow")
|
||||
|
||||
item = None if using_nan_is_na else np.nan
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"a": pd.arrays.ArrowExtensionArray(
|
||||
pa.array([1, 2, 3], type=pa.int32())
|
||||
),
|
||||
"b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])),
|
||||
"c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])),
|
||||
"d": pd.arrays.ArrowExtensionArray(pa.array([item, 100.5, 200.0])),
|
||||
"e": pd.arrays.ArrowExtensionArray(
|
||||
pa.array(
|
||||
[
|
||||
datetime.datetime(2022, 1, 1),
|
||||
datetime.datetime(2022, 1, 2),
|
||||
datetime.datetime(2022, 1, 3),
|
||||
],
|
||||
type=pa.timestamp(unit="ns"),
|
||||
)
|
||||
),
|
||||
"f": pd.arrays.ArrowExtensionArray(
|
||||
pa.array(
|
||||
[
|
||||
datetime.datetime(2022, 1, 1),
|
||||
datetime.datetime(2022, 1, 2),
|
||||
datetime.datetime(2022, 1, 3),
|
||||
],
|
||||
type=pa.timestamp(unit="s", tz="UTC"),
|
||||
)
|
||||
),
|
||||
"g": pd.arrays.ArrowExtensionArray(
|
||||
pa.array(
|
||||
[
|
||||
datetime.timedelta(1),
|
||||
datetime.timedelta(2),
|
||||
datetime.timedelta(3),
|
||||
],
|
||||
type=pa.duration("us"),
|
||||
)
|
||||
),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_pyarrow_dtype_backend_already_pyarrow(self):
|
||||
pytest.importorskip("pyarrow")
|
||||
expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
|
||||
result = expected.convert_dtypes(dtype_backend="pyarrow")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_pyarrow_dtype_backend_from_pandas_nullable(self):
    """Masked-dtype columns are compared against explicitly built Arrow arrays."""
    pa = pytest.importorskip("pyarrow")
    nullable_columns = {
        "a": pd.Series([1, 2, None], dtype="Int32"),
        "b": pd.Series(["x", "y", None], dtype="string[python]"),
        "c": pd.Series([True, False, None], dtype="boolean"),
        "d": pd.Series([None, 100.5, 200], dtype="Float64"),
    }
    df = pd.DataFrame(nullable_columns)
    result = df.convert_dtypes(dtype_backend="pyarrow")

    # Local alias keeps the expected-frame literal readable.
    arrow_ext = pd.arrays.ArrowExtensionArray
    expected = pd.DataFrame(
        {
            "a": arrow_ext(pa.array([1, 2, None], type=pa.int32())),
            "b": arrow_ext(pa.array(["x", "y", None])),
            "c": arrow_ext(pa.array([True, False, None])),
            "d": arrow_ext(pa.array([None, 100.5, 200.0])),
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_pyarrow_dtype_empty_object(self):
    """An empty frame with one column label converts to an equal frame."""
    # GH 50970
    pytest.importorskip("pyarrow")
    empty_frame = pd.DataFrame(columns=[0])
    converted = empty_frame.convert_dtypes(dtype_backend="pyarrow")
    tm.assert_frame_equal(converted, empty_frame)
|
||||
|
||||
def test_pyarrow_engine_lines_false(self):
    """Passing ``dtype_backend="numpy"`` must raise ``ValueError``."""
    # GH 48893
    frame = pd.DataFrame({"a": [1, 2, 3]})
    expected_message = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_message):
        frame.convert_dtypes(dtype_backend="numpy")
|
||||
|
||||
def test_pyarrow_backend_no_conversion(self):
    """With every ``convert_*`` flag off, the pyarrow backend changes nothing."""
    # GH#52872
    pytest.importorskip("pyarrow")
    frame = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
    untouched = frame.copy()

    # Switch off each individual conversion family.
    disabled = dict.fromkeys(
        (
            "convert_floating",
            "convert_integer",
            "convert_boolean",
            "convert_string",
        ),
        False,
    )
    converted = frame.convert_dtypes(dtype_backend="pyarrow", **disabled)
    tm.assert_frame_equal(converted, untouched)
|
||||
|
||||
def test_convert_dtypes_pyarrow_to_np_nullable(self):
    """An ``int32[pyarrow]`` frame converts to ``Int32`` under numpy_nullable."""
    # GH 53648
    pytest.importorskip("pyarrow")
    arrow_frame = pd.DataFrame(range(2), dtype="int32[pyarrow]")
    converted = arrow_frame.convert_dtypes(dtype_backend="numpy_nullable")
    masked_frame = pd.DataFrame(range(2), dtype="Int32")
    tm.assert_frame_equal(converted, masked_frame)
|
||||
|
||||
def test_convert_dtypes_pyarrow_timestamp(self):
    """A ``timestamp[ms][pyarrow]`` Series is unchanged by the pyarrow backend."""
    # GH 54191
    pytest.importorskip("pyarrow")
    minute_stamps = pd.date_range("2020-01-01", "2020-01-02", freq="1min")
    arrow_series = pd.Series(minute_stamps).astype("timestamp[ms][pyarrow]")
    converted = arrow_series.convert_dtypes(dtype_backend="pyarrow")
    tm.assert_series_equal(converted, arrow_series)
|
||||
|
||||
def test_convert_dtypes_avoid_block_splitting(self):
|
||||
# GH#55341
|
||||
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
|
||||
result = df.convert_dtypes(convert_integer=False)
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"b": [4, 5, 6],
|
||||
"c": pd.Series(["a"] * 3, dtype="string"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert result._mgr.nblocks == 2
|
||||
|
||||
def test_convert_dtypes_from_arrow(self):
|
||||
# GH#56581
|
||||
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
|
||||
result = df.convert_dtypes()
|
||||
expected = df.astype({"a": "string"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_dtype_pyarrow_timezone_preserve(self):
    """A tz-aware pyarrow timestamp column is unchanged by ``convert_dtypes``."""
    # GH 60237
    pytest.importorskip("pyarrow")
    hourly_utc = pd.to_datetime(range(5), utc=True, unit="h")
    stamp_column = pd.Series(hourly_utc, dtype="timestamp[ns, tz=UTC][pyarrow]")
    frame = pd.DataFrame({"timestamps": stamp_column})

    converted = frame.convert_dtypes(dtype_backend="pyarrow")
    unchanged = frame.copy()
    tm.assert_frame_equal(converted, unchanged)
|
||||
|
||||
def test_convert_dtypes_complex(self):
|
||||
# GH 60129
|
||||
df = pd.DataFrame({"a": [1.0 + 5.0j, 1.5 - 3.0j], "b": [1, 2]})
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"a": pd.array([1.0 + 5.0j, 1.5 - 3.0j], dtype="complex128"),
|
||||
"b": pd.array([1, 2], dtype="Int64"),
|
||||
}
|
||||
)
|
||||
result = df.convert_dtypes()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_dtypes_mixed_column_after_slice(self):
|
||||
# GH#64702
|
||||
df = pd.DataFrame(data=[[1, "a"], [2, "b"], ["c", 3]], columns=["col1", "col2"])
|
||||
df = df.loc[[0, 1]].copy()
|
||||
result = df.convert_dtypes()
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"col1": pd.array([1, 2], dtype="Int64"),
|
||||
"col2": pd.array(["a", "b"], dtype="string"),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,41 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class TestCopy:
    """Tests for ``DataFrame.copy``."""

    @pytest.mark.parametrize("attr", ["index", "columns"])
    def test_copy_index_name_checking(self, float_frame, attr):
        # Renaming an axis on the copy must not leak back into the axis
        # object held by the original frame.
        original_axis = getattr(float_frame, attr)
        original_axis.name = None

        duplicate = float_frame.copy()
        copied_axis = getattr(duplicate, attr)
        copied_axis.name = "foo"

        assert getattr(float_frame, attr).name is None

    def test_copy(self, float_frame, float_string_frame):
        # Adding a column to the copy leaves the original untouched.
        duplicate = float_frame.copy()
        duplicate["E"] = duplicate["A"]
        assert "E" not in float_frame

        # copy objects
        mixed_duplicate = float_string_frame.copy()
        assert mixed_duplicate._mgr is not float_string_frame._mgr

    def test_copy_consolidates(self):
        # GH#42477
        def draw():
            # Every call builds a freshly seeded generator, as the test
            # only cares about the block layout, not the values.
            return np.random.default_rng(2).integers(0, 100, size=55)

        frame = DataFrame({"a": draw(), "b": draw()})

        for i in range(10):
            frame.loc[:, f"n_{i}"] = draw()

        # One block for the initial pair plus one per column added above.
        assert len(frame._mgr.blocks) == 11
        copied = frame.copy()
        assert len(copied._mgr.blocks) == 1
|
||||
@ -0,0 +1,39 @@
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCount:
|
||||
def test_count(self):
|
||||
# corner case
|
||||
frame = DataFrame()
|
||||
ct1 = frame.count(1)
|
||||
assert isinstance(ct1, Series)
|
||||
|
||||
ct2 = frame.count(0)
|
||||
assert isinstance(ct2, Series)
|
||||
|
||||
# GH#423
|
||||
df = DataFrame(index=range(10))
|
||||
result = df.count(1)
|
||||
expected = Series(0, index=df.index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame(columns=range(10))
|
||||
result = df.count(0)
|
||||
expected = Series(0, index=df.columns)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame()
|
||||
result = df.count()
|
||||
expected = Series(dtype="int64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_count_objects(self, float_string_frame):
|
||||
dm = DataFrame(float_string_frame._series)
|
||||
df = DataFrame(float_string_frame._series)
|
||||
|
||||
tm.assert_series_equal(dm.count(), df.count())
|
||||
tm.assert_series_equal(dm.count(1), df.count(1))
|
||||
@ -0,0 +1,495 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCov:
    """Tests for ``DataFrame.cov``."""

    def test_cov(self, float_frame, float_string_frame):
        """Cover ``min_periods``, NaN handling, non-numeric data and 1-column frames."""
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame.iloc[:5, frame.columns.get_loc("A")] = np.nan
        frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        # A and B share only len(frame) - 10 complete rows, which is below
        # the requested min_periods, so that pair is expected to be NaN.
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)

        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)

        # fails on non-numeric types
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.cov()
        result = float_string_frame.cov(numeric_only=True)
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
        )
        tm.assert_frame_equal(result, expected)
        df.loc[0] = np.nan
        result = df.cov()
        # The NaN row is excluded from the reference computation.
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        """``DataFrame.cov(ddof=...)`` must agree with ``np.cov`` for each ddof."""
        # GH#34611
        # Draw both columns from ONE generator. Two generators seeded with
        # the same value would produce identical columns, making every entry
        # of the covariance matrix the same number and hiding any mix-up
        # between variances and covariances.
        rng = np.random.default_rng(2)
        np_array1 = rng.random(10)
        np_array2 = rng.random(10)
        df = DataFrame({0: np_array1, 1: np_array2})
        result = df.cov(ddof=test_ddof)
        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
    )
    def test_cov_nullable_integer(self, other_column):
        """A nullable-integer column with a missing value takes part in ``cov``."""
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_cov_numeric_only(self, numeric_only):
        """String columns are dropped with ``numeric_only=True`` and raise otherwise."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(0.5, index=["a"], columns=["a"])
        if numeric_only:
            result = df.cov(numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
        else:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.cov(numeric_only=numeric_only)
|
||||
|
||||
|
||||
class TestDataFrameCorr:
    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    def test_corr_scipy_method(self, float_frame, method):
        pytest.importorskip("scipy")
        labels = float_frame.index
        float_frame.loc[labels[:5], "A"] = np.nan
        float_frame.loc[labels[5:10], "B"] = np.nan
        float_frame.loc[labels[:10], "A"] = float_frame["A"][10:20].copy()

        matrix = float_frame.corr(method=method)
        pairwise = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(matrix["A"]["C"], pairwise)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.corr()

        numeric_columns = ["A", "B", "C", "D"]
        result = float_string_frame.corr(numeric_only=True)
        expected = float_string_frame.loc[:, numeric_columns].corr()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        # nothing in common
        pytest.importorskip("scipy")
        missing = [np.nan, np.nan, np.nan]
        values = [1, 1.5, 1]
        df = DataFrame(
            {
                "A": values + missing,
                "B": missing + values,
                "C": missing + missing,
            }
        )
        rs = df.corr(meth)
        # Off-diagonal pairs never overlap; "C" has no data at all.
        for row, col in (("A", "B"), ("B", "A")):
            assert isna(rs.loc[row, col])
        for label in ("A", "B"):
            assert rs.loc[label, label] == 1
        assert isna(rs.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        # constant --> all NA
        missing = [np.nan, np.nan, np.nan]
        ones = [1, 1, 1]
        df = DataFrame({"A": ones + missing, "B": missing + ones})
        correlations = df.corr(meth)
        assert isna(correlations.values).all()

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        labels = ["a", "b"]
        df = DataFrame({"a": [True, False], "b": [1, 0]})

        result = df.corr(meth)
        expected = DataFrame(np.ones((2, 2)), index=labels, columns=labels)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        # GH#14617
        values = np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4)
        df = DataFrame(values, columns=list("abcd"))
        compute = getattr(df, method)
        result = compute()
        # Equal labels, but two distinct Index objects.
        assert result.index is not result.columns
        assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        # GH#22298
        df = DataFrame(np.random.default_rng(2).normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_corr_int(self):
        # dtypes other than float64 GH#1761
        sequence = [1, 2, 3, 4]
        df = DataFrame({"a": sequence, "b": sequence})

        # Smoke test: both reductions only need to run without raising.
        df.cov()
        df.corr()

    @pytest.mark.parametrize(
        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
    )
    @pytest.mark.parametrize(
        "other_column",
        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column, method):
        # https://github.com/pandas-dev/pandas/issues/33803
        pytest.importorskip("scipy")
        labels = ["a", "b"]
        data = DataFrame({"a": nullable_column, "b": other_column})
        result = data.corr(method=method)
        expected = DataFrame(np.ones((2, 2)), columns=labels, index=labels)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        # GH: 37448
        df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
        result = df.corr()

        nan_pair = [np.nan, np.nan]
        expected = DataFrame({"A": nan_pair, "B": nan_pair}, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    def test_calc_corr_small_numbers(self):
        # GH: 37452
        tiny = [1.0e-20, 2.0e-20, 3.0e-20]
        df = DataFrame({"A": tiny, "B": tiny})
        result = df.corr()

        unit_pair = [1.0, 1.0]
        expected = DataFrame({"A": unit_pair, "B": unit_pair}, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        pytest.importorskip("scipy")
        df = DataFrame({"A": [1, 2], "B": [1, 2]})
        result = df.corr(method=method, min_periods=3)

        nan_pair = [np.nan, np.nan]
        expected = DataFrame({"A": nan_pair, "B": nan_pair}, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corr_numeric_only(self, meth, numeric_only):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})

        if not numeric_only:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.corr(meth, numeric_only=numeric_only)
            return

        labels = ["a", "b"]
        expected = DataFrame(np.ones((2, 2)), index=labels, columns=labels)
        result = df.corr(meth, numeric_only=numeric_only)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameCorrWith:
    """Tests for ``DataFrame.corrwith`` (plus a few related corr/cov checks)."""

    @pytest.mark.parametrize(
        "dtype",
        [
            "float64",
            "Float64",
            pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
        ],
    )
    def test_corrwith(self, datetime_frame, dtype):
        """Column-wise and row-wise ``corrwith`` agree with pairwise ``Series.corr``."""
        datetime_frame = datetime_frame.astype(dtype)

        a = datetime_frame
        noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index)

        b = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][len(a) // 2 :])
        del b["B"]

        colcorr = a.corrwith(b, axis=0)
        tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))

        rowcorr = a.corrwith(b, axis=1)
        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))

        dropped = a.corrwith(b, axis=0, drop=True)
        tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
        # "B" was deleted from b above, so drop=True must omit it.
        assert "B" not in dropped

        dropped = a.corrwith(b, axis=1, drop=True)
        assert a.index[-1] not in dropped.index

    def test_corrwith_non_timeseries_data(self):
        """Row-wise ``corrwith`` matches ``Series.corr`` on string-labelled frames."""
        index = ["a", "b", "c", "d", "e"]
        columns = ["one", "two", "three", "four"]
        # Use ONE generator for both frames. Seeding a new generator per
        # frame would make df2 an exact copy of df1's first four rows, so
        # every correlation would trivially be 1.0 and the comparison below
        # would prove nothing.
        rng = np.random.default_rng(2)
        df1 = DataFrame(
            rng.standard_normal((5, 4)),
            index=index,
            columns=columns,
        )
        df2 = DataFrame(
            rng.standard_normal((4, 4)),
            index=index[:4],
            columns=columns,
        )
        correls = df1.corrwith(df2, axis=1)
        for row in index[:4]:
            tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))

    def test_corrwith_with_objects(self, using_infer_string):
        """String columns raise unless ``numeric_only=True`` is passed."""
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=date_range("2000-01-01", periods=10, freq="B"),
        )
        df2 = df1.copy()
        cols = ["A", "B", "C", "D"]

        df1["obj"] = "foo"
        df2["obj"] = "bar"

        # The error message differs depending on the string dtype in use.
        if using_infer_string:
            msg = "Cannot perform reduction 'mean' with string dtype"
            with pytest.raises(TypeError, match=msg):
                df1.corrwith(df2)
        else:
            with pytest.raises(TypeError, match="Could not convert"):
                df1.corrwith(df2)
        result = df1.corrwith(df2, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
        tm.assert_series_equal(result, expected)

        with pytest.raises(TypeError, match="unsupported operand type"):
            df1.corrwith(df2, axis=1)
        result = df1.corrwith(df2, axis=1, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
        tm.assert_series_equal(result, expected)

    def test_corrwith_series(self, datetime_frame):
        """``corrwith`` a Series equals applying ``Series.corr`` per column."""
        result = datetime_frame.corrwith(datetime_frame["A"])
        expected = datetime_frame.apply(datetime_frame["A"].corr)

        tm.assert_series_equal(result, expected)

    def test_corrwith_matches_corrcoef(self):
        """``corrwith`` agrees with ``np.corrcoef`` on a non-linear relationship."""
        df1 = DataFrame(np.arange(100), columns=["a"])
        df2 = DataFrame(np.arange(100) ** 2, columns=["a"])
        c1 = df1.corrwith(df2)["a"]
        c2 = np.corrcoef(df1["a"], df2["a"])[0][1]

        tm.assert_almost_equal(c1, c2)
        assert c1 < 1

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corrwith_mixed_dtypes(self, numeric_only):
        """String columns are skipped with ``numeric_only=True`` and raise otherwise."""
        # GH#18570
        df = DataFrame(
            {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
        )
        s = Series([0, 6, 7, 3])
        if numeric_only:
            result = df.corrwith(s, numeric_only=numeric_only)
            corrs = [df["a"].corr(s), df["b"].corr(s)]
            expected = Series(data=corrs, index=["a", "b"])
            tm.assert_series_equal(result, expected)
        else:
            with pytest.raises(
                ValueError,
                match="could not convert string to float",
            ):
                df.corrwith(s, numeric_only=numeric_only)

    def test_corrwith_index_intersection(self):
        """``drop=True`` keeps only the labels common to both frames."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=True).index.sort_values()
        expected = df1.columns.intersection(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_index_union(self):
        """``drop=False`` keeps the union of labels from both frames."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_dup_cols(self):
        """Duplicate column labels in ``other`` yield duplicate result labels."""
        # GH#21925
        df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)
        expected = Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)

    def test_corr_numerical_instabilities(self):
        """Perfectly anti-correlated columns give +/-1 within a tight tolerance."""
        # GH#45640
        df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
        result = df.corr()
        expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
        tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)

    def test_corrwith_spearman(self):
        """Spearman ``corrwith`` of a frame and its square is 1 for each column."""
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        result = df.corrwith(df**2, method="spearman")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_kendall(self):
        """Kendall ``corrwith`` of a frame and its square is 1 for each column."""
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        result = df.corrwith(df**2, method="kendall")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_spearman_with_tied_data(self):
        """Spearman ``corrwith`` on tied data, then default ``corrwith`` on booleans."""
        # GH#48826
        pytest.importorskip("scipy")
        df1 = DataFrame(
            {
                "A": [1, np.nan, 7, 8],
                "B": [False, True, True, False],
                "C": [10, 4, 9, 3],
            }
        )
        df2 = df1[["B", "C"]]
        result = (df1 + 1).corrwith(df2.B, method="spearman")
        expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

        df_bool = DataFrame(
            {"A": [True, True, False, False], "B": [True, False, False, True]}
        )
        ser_bool = Series([True, True, False, True])
        result = df_bool.corrwith(ser_bool)
        expected = Series([0.57735, 0.57735], index=["A", "B"])
        tm.assert_series_equal(result, expected)

    def test_corrwith_min_periods_method(self):
        """``min_periods`` combined with ``method="spearman"``."""
        # GH#9490
        pytest.importorskip("scipy")
        df1 = DataFrame(
            {
                "A": [1, np.nan, 7, 8],
                "B": [False, True, True, False],
                "C": [10, 4, 9, 3],
            }
        )
        df2 = df1[["B", "C"]]
        result = (df1 + 1).corrwith(df2.B, method="spearman", min_periods=2)
        expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

    def test_corrwith_min_periods_boolean(self):
        """``min_periods`` with boolean inputs and the default method."""
        # GH#9490
        df_bool = DataFrame(
            {"A": [True, True, False, False], "B": [True, False, False, True]}
        )
        ser_bool = Series([True, True, False, True])
        result = df_bool.corrwith(ser_bool, min_periods=3)
        expected = Series([0.57735, 0.57735], index=["A", "B"])
        tm.assert_series_equal(result, expected)

    def test_corr_within_bounds(self):
        """Correlation coefficients stay inside the closed interval [-1, 1]."""
        df1 = DataFrame({"x": [0, 1], "y": [1.35951, 1.3595100000000007]})
        result1 = df1.corr().max().max()
        expected1 = 1.0
        tm.assert_equal(result1, expected1)

        rng = np.random.default_rng(seed=42)
        df2 = DataFrame(rng.random((100, 4)))
        corr_matrix = df2.corr()
        assert corr_matrix.min().min() >= -1.0
        assert corr_matrix.max().max() <= 1.0

    def test_cov_with_missing_values(self):
        """``cov`` with a missing row matches ``cov`` after ``dropna``."""
        df = DataFrame({"A": [1, 2, None, 4], "B": [2, 4, None, 9]})
        expected = DataFrame(
            {"A": [2.333333, 5.500000], "B": [5.5, 13.0]}, index=["A", "B"]
        )
        result1 = df.cov()
        result2 = df.dropna().cov()
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
|
||||
@ -0,0 +1,463 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDescribe:
|
||||
def test_describe_bool_in_mixed_frame(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"string_data": ["a", "b", "c", "d", "e"],
|
||||
"bool_data": [True, True, False, False, False],
|
||||
"int_data": [10, 20, 30, 40, 50],
|
||||
}
|
||||
)
|
||||
|
||||
# Integer data are included in .describe() output,
|
||||
# Boolean and string data are not.
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Top value is a boolean value that is False
|
||||
result = df.describe(include=["bool"])
|
||||
|
||||
expected = DataFrame(
|
||||
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_empty_object(self):
|
||||
# GH#27183
|
||||
df = DataFrame({"A": [None, None]}, dtype=object)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"A": [0, 0, np.nan, np.nan]},
|
||||
dtype=object,
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.iloc[:0].describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_bool_frame(self):
|
||||
# GH#13891
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data_1": [False, False, True, True],
|
||||
"bool_data_2": [False, True, True, True],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data": [False, False, True, True, False],
|
||||
"int_data": [0, 1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_categorical(self):
|
||||
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
|
||||
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
|
||||
cat_labels = Categorical(labels, labels)
|
||||
|
||||
df = df.sort_values(by=["value"], ascending=True)
|
||||
df["value_group"] = pd.cut(
|
||||
df.value, range(0, 10500, 500), right=False, labels=cat_labels
|
||||
)
|
||||
cat = df
|
||||
|
||||
# Categoricals should not show up together with numerical columns
|
||||
result = cat.describe()
|
||||
assert len(result.columns) == 1
|
||||
|
||||
# In a frame, describe() for the cat should be the same as for string
|
||||
# arrays (count, unique, top, freq)
|
||||
|
||||
cat = Categorical(
|
||||
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
|
||||
)
|
||||
s = Series(cat)
|
||||
result = s.describe()
|
||||
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
cat = Series(Categorical(["a", "b", "c", "c"]))
|
||||
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
|
||||
result = df3.describe()
|
||||
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
|
||||
|
||||
def test_describe_empty_categorical_column(self):
|
||||
# GH#26397
|
||||
# Ensure the index of an empty categorical DataFrame column
|
||||
# also contains (count, unique, top, freq)
|
||||
df = DataFrame({"empty_col": Categorical([])})
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"empty_col": [0, 0, np.nan, np.nan]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
dtype="object",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# ensure NaN, not None
|
||||
assert np.isnan(result.iloc[2, 0])
|
||||
assert np.isnan(result.iloc[3, 0])
|
||||
|
||||
def test_describe_categorical_columns(self):
|
||||
# GH#11558
|
||||
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
|
||||
df = DataFrame(
|
||||
{
|
||||
"int1": [10, 20, 30, 40, 50],
|
||||
"int2": [10, 20, 30, 40, 50],
|
||||
"obj": ["A", 0, None, "X", 1],
|
||||
},
|
||||
columns=columns,
|
||||
)
|
||||
result = df.describe()
|
||||
|
||||
exp_columns = pd.CategoricalIndex(
|
||||
["int1", "int2"],
|
||||
categories=["int1", "int2", "obj"],
|
||||
ordered=True,
|
||||
name="XXX",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
|
||||
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
columns=exp_columns,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
|
||||
|
||||
def test_describe_datetime_columns(self):
    """
    describe() keeps tz-aware DatetimeIndex column labels.

    The expected summary has only the two integer columns; their labels
    must still report the "MS" freq and the original timezone.
    """
    tz_name = "US/Eastern"
    frame = DataFrame(
        {
            0: [10, 20, 30, 40, 50],
            1: [10, 20, 30, 40, 50],
            2: ["A", 0, None, "X", 1],
        }
    )
    frame.columns = pd.DatetimeIndex(
        ["2011-01-01", "2011-02-01", "2011-03-01"],
        freq="MS",
        tz=tz_name,
        name="XXX",
    )

    described = frame.describe()

    wanted = DataFrame(
        {
            pos: [5, 30, frame.iloc[:, pos].std(), 10, 20, 30, 40, 50]
            for pos in (0, 1)
        },
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    wanted.columns = pd.DatetimeIndex(
        ["2011-01-01", "2011-02-01"], freq="MS", tz=tz_name, name="XXX"
    )

    tm.assert_frame_equal(described, wanted)
    assert described.columns.freq == "MS"
    assert described.columns.tz == wanted.columns.tz
|
||||
|
||||
def test_describe_timedelta_values(self):
    """
    describe() summarises timedelta columns and the summary prints aligned.

    GH#6145. The expected frame is built from Timedelta values plus each
    column's own ``std()``; the repr of the result is then checked as text.
    """
    t1 = pd.timedelta_range("1 days", freq="D", periods=5)
    t2 = pd.timedelta_range("1 hours", freq="h", periods=5)
    df = DataFrame({"t1": t1, "t2": t2})

    expected = DataFrame(
        {
            "t1": [
                5,
                pd.Timedelta("3 days"),
                df.iloc[:, 0].std(),
                pd.Timedelta("1 days"),
                pd.Timedelta("2 days"),
                pd.Timedelta("3 days"),
                pd.Timedelta("4 days"),
                pd.Timedelta("5 days"),
            ],
            "t2": [
                5,
                pd.Timedelta("3 hours"),
                df.iloc[:, 1].std(),
                pd.Timedelta("1 hours"),
                pd.Timedelta("2 hours"),
                pd.Timedelta("3 hours"),
                pd.Timedelta("4 hours"),
                pd.Timedelta("5 hours"),
            ],
        },
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )

    result = df.describe()
    tm.assert_frame_equal(result, expected)

    # A DataFrame repr is column-aligned, so the expected text must carry the
    # padding. It is assembled from explicit field widths instead of
    # hand-counted runs of spaces: row labels are left-justified to the width
    # of "count" (5), values are right-justified to the width of the "std"
    # entries (22), and fields are separated by two spaces.
    repr_rows = [
        ("", "t1", "t2"),
        ("count", "5", "5"),
        ("mean", "3 days 00:00:00", "0 days 03:00:00"),
        ("std", "1 days 13:56:50.394919", "0 days 01:34:52.099788"),
        ("min", "1 days 00:00:00", "0 days 01:00:00"),
        ("25%", "2 days 00:00:00", "0 days 02:00:00"),
        ("50%", "3 days 00:00:00", "0 days 03:00:00"),
        ("75%", "4 days 00:00:00", "0 days 04:00:00"),
        ("max", "5 days 00:00:00", "0 days 05:00:00"),
    ]
    exp_repr = "\n".join(
        f"{label:<5}  {first:>22}  {second:>22}"
        for label, first, second in repr_rows
    )
    assert repr(result) == exp_repr
|
||||
|
||||
def test_describe_tz_values(self, tz_naive_fixture):
    """
    describe(include="all") on an integer column plus a datetime column.

    GH#21332. The datetime column is built with the timezone supplied by
    ``tz_naive_fixture``; its "std" entry is expected to be NaN.
    """
    zone = tz_naive_fixture
    first_day = Timestamp(2018, 1, 1)
    last_day = Timestamp(2018, 1, 5)
    ints = Series(range(5))
    stamps = Series(date_range(first_day, last_day, tz=zone))
    frame = DataFrame({"s1": ints, "s2": stamps})

    stamp_summary = [
        5,
        Timestamp(2018, 1, 3).tz_localize(zone),
        first_day.tz_localize(zone),
        stamps[1],
        stamps[2],
        stamps[3],
        last_day.tz_localize(zone),
        np.nan,
    ]
    wanted = DataFrame(
        {"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139], "s2": stamp_summary},
        index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
    )

    described = frame.describe(include="all")
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_datetime_is_numeric_includes_datetime(self):
    """
    A plain describe() call summarises datetime columns next to numeric ones.

    The datetime column gets count/mean/min/quantiles/max, with NaN for std.
    """
    frame = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})

    described = frame.describe()

    # mean, min, 25%, 50%, 75%, max of the datetime column, in row order
    stamp_texts = (
        "2012-01-02",
        "2012-01-01",
        "2012-01-01T12:00:00",
        "2012-01-02",
        "2012-01-02T12:00:00",
        "2012-01-03",
    )
    datetime_summary = [3, *[Timestamp(text) for text in stamp_texts], np.nan]
    wanted = DataFrame(
        {"a": datetime_summary, "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1]},
        index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
    )
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_describe_tz_values2(self):
    """
    Frame-level describe(include="all") matches the per-column summaries.

    The expected frame is the two Series.describe() results placed side by
    side and reindexed into the row order used by the frame-level call.
    """
    zone = "CET"
    ints = Series(range(5))
    stamps = Series(
        date_range(Timestamp(2018, 1, 1), Timestamp(2018, 1, 5), tz=zone)
    )
    frame = DataFrame({"s1": ints, "s2": stamps})

    row_order = ["count", "mean", "min", "25%", "50%", "75%", "max", "std"]
    per_column = [ints.describe(), stamps.describe()]
    wanted = pd.concat(per_column, axis=1, keys=["s1", "s2"]).reindex(row_order)

    described = frame.describe(include="all")
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_describe_percentiles_integer_idx(self):
    """
    Percentiles at every 10% step produce whole-number row labels (GH#26660).

    With a single value of 1 in the column, every statistic except std is 1.0.
    """
    frame = DataFrame({"x": [1]})
    steps = np.linspace(0, 1, 10 + 1)

    described = frame.describe(percentiles=steps)

    percent_labels = [f"{whole}%" for whole in range(0, 101, 10)]
    wanted = DataFrame(
        {"x": [1.0, 1.0, np.nan, 1.0] + [1.0] * len(steps) + [1.0]},
        index=["count", "mean", "std", "min", *percent_labels, "max"],
    )
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_describe_does_not_raise_error_for_dictlike_elements(self):
    """
    describe() handles a column whose cells are dicts (GH#32409).

    The expected "top" entry is the first dict and its "freq" is 1.
    """
    records = [{"test": {"a": "1"}}, {"test": {"a": "2"}}]
    frame = DataFrame(records)

    described = frame.describe()

    wanted = DataFrame(
        {"test": [2, 2, {"a": "1"}, 1]},
        index=["count", "unique", "top", "freq"],
    )
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
@pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
def test_describe_when_include_all_exclude_not_allowed(self, exclude):
    """
    Combining include='all' with any non-None exclude raises ValueError.
    """
    frame = DataFrame({"x": [1], "y": [2], "z": [3]})
    with pytest.raises(
        ValueError, match="exclude must be None when include is 'all'"
    ):
        frame.describe(include="all", exclude=exclude)
|
||||
|
||||
def test_describe_when_included_dtypes_not_present(self):
    """
    Asking for a dtype that no column has raises ValueError (GH#61863).
    """
    frame = DataFrame({"a": [1, 2, 3]})
    with pytest.raises(
        ValueError,
        match="No columns match the specified include or exclude data types",
    ):
        frame.describe(include=["datetime"])
|
||||
|
||||
def test_describe_with_duplicate_columns(self):
    """
    describe() returns one summary column per input column, duplicates included.
    """
    frame = DataFrame(
        [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
        columns=["bar", "a", "a"],
        dtype="float64",
    )

    described = frame.describe()

    # all three columns hold the same values, so one summary serves for each
    single = frame.iloc[:, 0].describe()
    wanted = pd.concat([single, single, single], keys=frame.columns, axis=1)
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_ea_with_na(self, any_numeric_ea_dtype):
    """
    describe() on nullable numeric columns that contain pd.NA (GH#48778).

    Column "a" has one valid value and column "b" has none; the expected
    summary uses the Float64 dtype.
    """
    frame = DataFrame(
        {"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype
    )

    described = frame.describe()

    a_stats = [1.0, 1.0, pd.NA, 1.0, 1.0, 1.0, 1.0, 1.0]
    b_stats = [0.0, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]
    wanted = DataFrame(
        {"a": a_stats, "b": b_stats},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype="Float64",
    )
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
def test_describe_exclude_pa_dtype(self):
    """
    include/exclude accept ArrowDtype instances (GH#52570).

    Including int8 and excluding int32 is expected to leave only column "a".
    The test is skipped when pyarrow cannot be imported.
    """
    pa = pytest.importorskip("pyarrow")
    int8_dtype = pd.ArrowDtype(pa.int8())
    int16_dtype = pd.ArrowDtype(pa.int16())
    int32_dtype = pd.ArrowDtype(pa.int32())
    frame = DataFrame(
        {
            "a": Series([1, 2, 3], dtype=int8_dtype),
            "b": Series([1, 2, 3], dtype=int16_dtype),
            "c": Series([1, 2, 3], dtype=int32_dtype),
        }
    )

    described = frame.describe(
        include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32())
    )

    wanted = DataFrame(
        {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype=pd.ArrowDtype(pa.float64()),
    )
    tm.assert_frame_equal(described, wanted)
|
||||
|
||||
@pytest.mark.parametrize("percentiles", [None, [], [0.2]])
def test_refine_percentiles(self, percentiles):
    """
    Check which percentile rows describe() emits for each `percentiles` value.

    - ``None`` falls back to the 25th, 50th and 75th percentiles
    - an empty list yields no percentile rows at all
    - a non-empty list yields exactly the requested percentiles
    """
    # GH#60550
    frame = DataFrame({"a": np.arange(0, 10, 1)})
    column = frame.a

    described = frame.describe(percentiles=percentiles)

    shown = [0.25, 0.5, 0.75] if percentiles is None else percentiles
    leading_labels = ["count", "mean", "std", "min"]
    leading_values = [len(column), column.mean(), column.std(), column.min()]
    wanted = DataFrame(
        leading_values + [column.quantile(p) for p in shown] + [column.max()],
        index=pd.Index(leading_labels + [f"{p:.0%}" for p in shown] + ["max"]),
        columns=["a"],
    )

    tm.assert_frame_equal(described, wanted)
|
||||
@ -0,0 +1,308 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDiff:
    """Tests for ``DataFrame.diff`` across dtypes, axes and ``periods`` values."""

    def test_diff_requires_integer(self):
        """A fractional ``periods`` value is rejected with ValueError."""
        frame = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
        msg = "periods must be an integer"
        with pytest.raises(ValueError, match=msg):
            frame.diff(1.5)

    # GH#44572 np.int64 is accepted
    @pytest.mark.parametrize("num", [1, np.int64(1)])
    def test_diff(self, datetime_frame, num):
        """diff(num) on column "A" equals the column minus its shift(num)."""
        diffed = datetime_frame.diff(num)

        column = datetime_frame["A"]
        wanted = column - column.shift(num)
        tm.assert_series_equal(diffed["A"], wanted)

    def test_diff_int_dtype(self):
        """Two large integers that differ by one still diff to exactly 1."""
        base = 10_000_000_000_000_000
        values = Series([base, base + 1])

        diffed = DataFrame({"s": values}).diff()
        assert diffed.s[1] == 1

    def test_diff_mixed_numeric(self, datetime_frame):
        """diff(1) on a float32 copy of the fixture matches minus-shift(1)."""
        as_float32 = datetime_frame.astype("float32")
        diffed = as_float32.diff(1)

        column = as_float32["A"]
        tm.assert_series_equal(diffed["A"], column - column.shift(1))

    def test_diff_axis1_nonconsolidated(self):
        """diff(axis=1) after inserting a column into the frame (GH#10907)."""
        frame = DataFrame({"y": Series([2]), "z": Series([3])})
        frame.insert(0, "x", 1)

        diffed = frame.diff(axis=1)

        wanted = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_timedelta64_with_nat(self):
        """diff on timedelta64 data whose first column is all NaT (GH#32441)."""
        values = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
        values[:, 0] = np.timedelta64("NaT", "ns")
        frame = DataFrame(values)

        # one period along the rows
        diffed = frame.diff(1, axis=0)
        two_units = pd.Timedelta(2)
        wanted = DataFrame({0: frame[0], 1: [pd.NaT, two_units, two_units]})
        tm.assert_equal(diffed, wanted)

        # zero periods: identical to frame - frame, NaT column stays all-NaT
        diffed = frame.diff(0)
        wanted = frame - frame
        assert wanted[0].isna().all()
        tm.assert_equal(diffed, wanted)

        # negative period along the columns
        diffed = frame.diff(-1, axis=1)
        wanted = frame * np.nan
        tm.assert_equal(diffed, wanted)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis0_with_nat(self, tz, unit):
        """Row-wise diff of a datetime column that starts with NaT (GH#32441)."""
        stamps = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz)
        frame = Series(stamps.as_unit(unit)).to_frame()

        diffed = frame.diff()

        deltas = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)])
        wanted = Series(deltas.as_unit(unit)).to_frame()
        tm.assert_frame_equal(diffed, wanted)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_with_nat_zero_periods(self, tz):
        """diff(0) over NaT values yields NaT, not a zero timedelta."""
        stamps = Series(date_range("2016-01-01", periods=4, tz=tz))
        frame = stamps.to_frame().copy()
        frame[1] = stamps.copy()
        frame.iloc[:, 0] = pd.NaT

        wanted = frame - frame
        assert wanted[0].isna().all()

        # the same expectation holds along the rows and along the columns
        for axis in (0, 1):
            diffed = frame.diff(0, axis=axis)
            tm.assert_frame_equal(diffed, wanted)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis0(self, tz):
        """Row-wise diff of two identical datetime columns (GH#18578)."""
        frame = DataFrame(
            {
                pos: date_range("2010", freq="D", periods=2, tz=tz)
                for pos in (0, 1)
            }
        )

        diffed = frame.diff(axis=0)

        wanted = DataFrame(
            {pos: pd.TimedeltaIndex(["NaT", "1 days"]) for pos in (0, 1)}
        )
        tm.assert_frame_equal(diffed, wanted)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis1(self, tz):
        """Column-wise diff of two identical datetime columns (GH#18578)."""
        frame = DataFrame(
            {
                pos: date_range("2010", freq="D", periods=2, tz=tz)
                for pos in (0, 1)
            }
        )

        diffed = frame.diff(axis=1)

        first_column = pd.TimedeltaIndex(["NaT", "NaT"], dtype="m8[us]")
        second_column = pd.TimedeltaIndex(["0 days", "0 days"])
        wanted = DataFrame({0: first_column, 1: second_column})
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_timedelta(self, unit):
        """diff over a datetime column and a float column together (GH#4533)."""
        frame = DataFrame(
            {
                "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
                "value": [1.0, 2.0],
            }
        )
        frame["time"] = frame["time"].dt.as_unit(unit)

        diffed = frame.diff()

        first_row = [pd.NaT, np.nan]
        second_row = [pd.Timedelta("00:01:00"), 1]
        wanted = DataFrame([first_row, second_row], columns=["time", "value"])
        wanted["time"] = wanted["time"].dt.as_unit(unit)
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_mixed_dtype(self):
        """Float columns stay float64 when an object column is also present."""
        frame = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
        frame["A"] = np.array([1, 2, 3, 4, 5], dtype=object)

        diffed = frame.diff()
        assert diffed[0].dtype == np.float64

    def test_diff_neg_n(self, datetime_frame):
        """diff(-1) equals the frame minus its shift(-1)."""
        diffed = datetime_frame.diff(-1)
        wanted = datetime_frame - datetime_frame.shift(-1)
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_float_n(self, datetime_frame):
        """An integral float ``periods`` (1.0) behaves like the integer 1."""
        from_float = datetime_frame.diff(1.0)
        from_int = datetime_frame.diff(1)
        tm.assert_frame_equal(from_float, from_int)

    def test_diff_axis(self):
        """diff along each axis of a small float frame (GH#9727)."""
        frame = DataFrame([[1.0, 2.0], [3.0, 4.0]])

        across_columns = DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
        tm.assert_frame_equal(frame.diff(axis=1), across_columns)

        down_rows = DataFrame([[np.nan, np.nan], [2.0, 2.0]])
        tm.assert_frame_equal(frame.diff(axis=0), down_rows)

    def test_diff_period(self):
        """Column-wise diff of a single Period column."""
        # GH#32995 Don't pass an incorrect axis
        periods = date_range("2016-01-01", periods=3).to_period("D")
        frame = DataFrame({"A": periods})

        diffed = frame.diff(1, axis=1)

        wanted = (frame - pd.NaT).astype(object)
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_axis1_mixed_dtypes(self):
        """Column-wise diff when the columns have different dtypes."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        wanted = DataFrame({"A": [np.nan, np.nan, np.nan], "B": frame["B"] / 2})

        diffed = frame.diff(axis=1)
        tm.assert_frame_equal(diffed, wanted)

        # GH#21437 mixed-float-dtypes
        frame = DataFrame(
            {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
        )
        diffed = frame.diff(axis=1)
        wanted = DataFrame({"a": frame["a"] * np.nan, "b": frame["b"] * 0})
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_axis1_mixed_dtypes_large_periods(self):
        """``periods`` larger than the column count gives an all-NaN result."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        wanted = frame * np.nan

        diffed = frame.diff(axis=1, periods=3)
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_axis1_mixed_dtypes_negative_periods(self):
        """Column-wise diff with periods=-1 on mixed-dtype columns."""
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        frame = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
        wanted = DataFrame({"A": -1.0 * frame["A"], "B": frame["B"] * np.nan})

        diffed = frame.diff(axis=1, periods=-1)
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_sparse(self):
        """diff() works on a sparse-dtype frame."""
        # GH#28813 .diff() should work for sparse dataframes as well
        sparse_frame = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")

        diffed = sparse_frame.diff()

        wanted = DataFrame(
            [[np.nan, np.nan], [1.0, -1.0]],
            dtype=pd.SparseDtype("float", 0.0),
        )
        tm.assert_frame_equal(diffed, wanted)

    @pytest.mark.parametrize(
        "axis,expected",
        [
            (
                0,
                DataFrame(
                    {
                        "a": [pd.NA, 0, 1, 0, pd.NA, pd.NA, pd.NA, 0],
                        "b": [pd.NA, 1, pd.NA, pd.NA, -2, 1, pd.NA, pd.NA],
                        "c": np.repeat(pd.NA, 8),  # type: ignore[call-overload]
                        "d": [pd.NA, 3, 5, 7, 9, 11, 13, 15],
                    },
                    dtype="Int64",
                ),
            ),
            (
                1,
                DataFrame(
                    {
                        "a": np.repeat(pd.NA, 8),  # type: ignore[call-overload]
                        "b": [0, 1, pd.NA, 1, pd.NA, pd.NA, pd.NA, 0],
                        "c": np.repeat(pd.NA, 8),  # type: ignore[call-overload]
                        "d": np.repeat(pd.NA, 8),  # type: ignore[call-overload]
                    },
                    dtype="Int64",
                ),
            ),
        ],
    )
    def test_diff_integer_na(self, axis, expected):
        """diff() along either axis of a nullable Int64 frame containing pd.NA."""
        # GH#24171 IntegerNA Support for DataFrame.diff()
        pattern = [0, 1, pd.NA, 2]
        frame = DataFrame(
            {
                "a": np.repeat(pattern, 2),
                "b": np.tile(pattern, 2),
                "c": np.repeat(pd.NA, 8),
                "d": np.arange(1, 9) ** 2,
            },
            dtype="Int64",
        )

        # only ``axis`` is passed; ``periods`` keeps its default
        diffed = frame.diff(axis=axis)
        tm.assert_frame_equal(diffed, expected)

    def test_diff_readonly(self):
        """diff() on a frame built from a read-only ndarray."""
        # https://github.com/pandas-dev/pandas/issues/35559
        values = np.random.default_rng(2).standard_normal((5, 2))
        values.flags.writeable = False
        frame = DataFrame(values)

        diffed = frame.diff()

        # reference result computed from a fresh array built out of the frame
        wanted = DataFrame(np.array(frame)).diff()
        tm.assert_frame_equal(diffed, wanted)

    def test_diff_all_int_dtype(self, any_int_numpy_dtype):
        """Result dtype of diff() for each numpy integer dtype."""
        # GH 14773
        frame = DataFrame(range(5)).astype(any_int_numpy_dtype)

        diffed = frame.diff()

        # int8 and int16 are expected to come back as float32
        if any_int_numpy_dtype in ("int8", "int16"):
            wanted_dtype = "float32"
        else:
            wanted_dtype = "float64"
        wanted = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=wanted_dtype)
        tm.assert_frame_equal(diffed, wanted)
|
||||
@ -0,0 +1,171 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class DotSharedTests:
    """
    ``dot`` tests shared by the Series and DataFrame test classes.

    Subclasses supply the ``obj``, ``other`` and ``expected`` fixtures and
    the ``reduced_dim_assert`` comparison helper.
    """

    @pytest.fixture
    def obj(self):
        """The object whose ``dot`` method is under test."""
        raise NotImplementedError

    @pytest.fixture
    def other(self) -> DataFrame:
        """
        A DataFrame indexed so that obj.dot(other) is valid.
        """
        raise NotImplementedError

    @pytest.fixture
    def expected(self, obj, other) -> DataFrame:
        """
        What obj.dot(other) is expected to return.
        """
        raise NotImplementedError

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Compare results that have one fewer dimension than self.obj.
        """
        raise NotImplementedError

    def test_dot_equiv_values_dot(self, obj, other, expected):
        """dot with a labelled operand matches the fixture's expectation."""
        # `expected` is constructed from obj.values.dot(other.values)
        product = obj.dot(other)
        tm.assert_equal(product, expected)

    def test_dot_2d_ndarray(self, obj, other, expected):
        """dot accepts a bare 2D ndarray as the right-hand operand."""
        # With an ndarray argument the values match, but the index/columns
        # of the result may not, so only the values are compared.
        product = obj.dot(other.values)
        assert np.all(product == expected.values)

    def test_dot_1d_ndarray(self, obj, expected):
        """dot accepts a 1D ndarray of the correct length."""
        if obj.ndim == 2:
            row = obj.iloc[0]
        else:
            row = obj

        from_array = obj.dot(row.values)
        from_labelled = obj.dot(row)
        self.reduced_dim_assert(from_array, from_labelled)

    def test_dot_series(self, obj, other, expected):
        """dot accepts a Series as the right-hand operand."""
        column = other["1"]
        self.reduced_dim_assert(obj.dot(column), expected["1"])

    def test_dot_series_alignment(self, obj, other, expected):
        """A Series operand in reversed order gives the same result."""
        reversed_column = other.iloc[::-1]["1"]
        self.reduced_dim_assert(obj.dot(reversed_column), expected["1"])

    def test_dot_aligns(self, obj, other, expected):
        """A DataFrame operand with reversed rows gives the same result."""
        reversed_other = other.iloc[::-1]
        product = obj.dot(reversed_other)
        tm.assert_equal(product, expected)

    def test_dot_shape_mismatch(self, obj):
        """An ndarray of the wrong length is rejected."""
        # exception raised is of type Exception
        with pytest.raises(Exception, match="Dot product shape mismatch"):
            obj.dot(obj.values[:3])

    def test_dot_misaligned(self, obj, other):
        """A transposed, and therefore misaligned, operand raises ValueError."""
        with pytest.raises(ValueError, match="matrices are not aligned"):
            obj.dot(other.T)
|
||||
|
||||
|
||||
class TestSeriesDot(DotSharedTests):
    """Runs the shared ``dot`` tests with a Series as the left operand."""

    @pytest.fixture
    def obj(self):
        """A length-4 Series of seeded random values labelled p, q, r, s."""
        values = np.random.default_rng(2).standard_normal(4)
        return Series(values, index=["p", "q", "r", "s"])

    @pytest.fixture
    def other(self):
        """The transpose of a 3x4 frame, so its index is p, q, r, s."""
        wide = DataFrame(
            np.random.default_rng(2).standard_normal((3, 4)),
            index=["1", "2", "3"],
            columns=["p", "q", "r", "s"],
        )
        return wide.T

    @pytest.fixture
    def expected(self, obj, other):
        """numpy's dot of the raw values, labelled by ``other``'s columns."""
        raw_product = np.dot(obj.values, other.values)
        return Series(raw_product, index=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Compare results that have one fewer dimension than self.obj.
        """
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameDot(DotSharedTests):
    """Runs the shared ``dot`` tests with a DataFrame as the left operand."""

    @pytest.fixture
    def obj(self):
        """A 3x4 frame of seeded random values, columns p, q, r, s."""
        values = np.random.default_rng(2).standard_normal((3, 4))
        return DataFrame(
            values, index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )

    @pytest.fixture
    def other(self):
        """A 4x2 frame of seeded random values, indexed p, q, r, s."""
        values = np.random.default_rng(2).standard_normal((4, 2))
        return DataFrame(values, index=["p", "q", "r", "s"], columns=["1", "2"])

    @pytest.fixture
    def expected(self, obj, other):
        """numpy's dot of the raw values, with obj's index and other's columns."""
        raw_product = np.dot(obj.values, other.values)
        return DataFrame(raw_product, index=obj.index, columns=other.columns)

    @classmethod
    def reduced_dim_assert(cls, result, expected):
        """
        Compare results that have one fewer dimension than self.obj.
        """
        # names are excluded from the comparison; the result must be unnamed
        tm.assert_series_equal(result, expected, check_names=False)
        assert result.name is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype,exp_dtype",
    [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")],
)
def test_arrow_dtype(dtype, exp_dtype):
    """
    DataFrame.dot of an int32 frame with an identity matrix of ``dtype``.

    The values are expected to be unchanged and the result dtype to be
    ``exp_dtype``. Skipped when pyarrow cannot be imported.
    """
    pytest.importorskip("pyarrow")

    labels = ["a", "b"]
    values = [[1, 2], [3, 4], [5, 6]]
    left = DataFrame(values, columns=labels, dtype="int32")
    identity = DataFrame([[1, 0], [0, 1]], index=labels, dtype=dtype)

    product = left.dot(identity)

    wanted = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype)
    tm.assert_frame_equal(product, wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype,exp_dtype",
    [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")],
)
def test_arrow_dtype_series(dtype, exp_dtype):
    """
    Series.dot of an int32 Series with an identity matrix of ``dtype``.

    The values are expected to be unchanged and the result dtype to be
    ``exp_dtype``. Skipped when pyarrow cannot be imported.
    """
    pytest.importorskip("pyarrow")

    labels = ["a", "b"]
    left = Series([1, 2], index=labels, dtype="int32")
    identity = DataFrame([[1, 0], [0, 1]], index=labels, dtype=dtype)

    product = left.dot(identity)

    wanted = Series([1, 2], dtype=exp_dtype)
    tm.assert_series_equal(product, wanted)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user