Initial commit: 首次建仓,建立目录结构
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,351 @@
|
||||
import datetime as dt
|
||||
from itertools import combinations
|
||||
|
||||
import dateutil
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
Timestamp,
|
||||
concat,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAppend:
    """Tests for row-wise appending through ``concat`` and ``_append_internal``."""

    def test_append(self, sort, float_frame):
        """Concatenating row slices of a frame restores the original data."""
        mixed_frame = float_frame.copy()
        mixed_frame["foo"] = "bar"

        begin_index = float_frame.index[:5]
        end_index = float_frame.index[5:]

        begin_frame = float_frame.reindex(begin_index)
        end_frame = float_frame.reindex(end_index)

        appended = concat([begin_frame, end_frame])
        tm.assert_almost_equal(appended["A"], float_frame["A"])

        # a column missing from one piece must still appear in the result
        del end_frame["A"]
        partial_appended = concat([begin_frame, end_frame], sort=sort)
        assert "A" in partial_appended

        partial_appended = concat([end_frame, begin_frame], sort=sort)
        assert "A" in partial_appended

        # mixed type handling
        appended = concat([mixed_frame[:5], mixed_frame[5:]])
        tm.assert_frame_equal(appended, mixed_frame)

        # what to test here
        mixed_appended = concat([mixed_frame[:5], float_frame[5:]], sort=sort)
        mixed_appended2 = concat([float_frame[:5], mixed_frame[5:]], sort=sort)

        # all equal except 'foo' column
        tm.assert_frame_equal(
            mixed_appended.reindex(columns=["A", "B", "C", "D"]),
            mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
        )

    def test_append_empty(self, float_frame):
        """Concatenating with an empty frame yields an equal, distinct frame."""
        empty = DataFrame()

        appended = concat([float_frame, empty])
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

        appended = concat([empty, float_frame])
        tm.assert_frame_equal(float_frame, appended)
        assert appended is not float_frame

    def test_append_overlap_raises(self, float_frame):
        """``verify_integrity=True`` raises ValueError on duplicated labels."""
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            concat([float_frame, float_frame], verify_integrity=True)

    def test_append_new_columns(self):
        """Appending a Series with an unseen label adds a new column."""
        # see gh-6129: new columns
        df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
        row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
        expected = DataFrame(
            {
                "a": {"x": 1, "y": 2, "z": 5},
                "b": {"x": 3, "y": 4, "z": 6},
                "c": {"z": 7},
            }
        )
        result = df._append_internal(row)
        tm.assert_frame_equal(result, expected)

    def test_append_length0_frame(self, sort):
        """A zero-row frame still contributes its columns to the result."""
        df = DataFrame(columns=["A", "B", "C"])
        df3 = DataFrame(index=[0, 1], columns=["A", "B"])
        df5 = concat([df, df3], sort=sort)

        expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
        tm.assert_frame_equal(df5, expected)

    def test_append_records(self):
        """Frames built from record arrays concatenate like the raw arrays."""
        arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
        arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

        arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
        arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]

        df1 = DataFrame(arr1)
        df2 = DataFrame(arr2)

        result = concat([df1, df2], ignore_index=True)
        expected = DataFrame(np.concatenate((arr1, arr2)))
        tm.assert_frame_equal(result, expected)

    # rewrite sort fixture, since we also want to test default of None
    def test_append_sorts(self, sort):
        """Column order follows ``sort``: sorted unless ``sort`` is False."""
        df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
        df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])

        result = concat([df1, df2], sort=sort)

        # for None / True
        expected = DataFrame(
            {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
            columns=["a", "b", "c"],
        )
        if sort is False:
            expected = expected[["b", "a", "c"]]
        tm.assert_frame_equal(result, expected)

    def test_append_different_columns(self, sort):
        """Columns absent from one input are filled with missing values."""
        df = DataFrame(
            {
                "bools": np.random.default_rng(2).standard_normal(10) > 0,
                "ints": np.random.default_rng(2).integers(0, 10, 10),
                "floats": np.random.default_rng(2).standard_normal(10),
                "strings": ["foo", "bar"] * 5,
            }
        )

        a = df[:5].loc[:, ["bools", "ints", "floats"]]
        b = df[5:].loc[:, ["strings", "ints", "floats"]]

        appended = concat([a, b], sort=sort)
        # ``a`` supplies the first five rows and has no "strings" column;
        # ``b`` supplies the last five rows and has no "bools" column.
        # Check every such row, using explicit positional slicing.
        assert isna(appended["strings"].iloc[:5]).all()
        assert isna(appended["bools"].iloc[5:]).all()

    def test_append_preserve_index_name(self):
        """The shared index name survives concatenation with an empty frame."""
        # #980
        df1 = DataFrame(columns=["A", "B", "C"])
        df1 = df1.set_index(["A"])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
        df2 = df2.set_index(["A"])

        result = concat([df1, df2])
        assert result.index.name == "A"

    # Index flavours used as column labels by the parametrized tests below.
    indexes_can_append = [
        pd.RangeIndex(3),
        Index([4, 5, 6]),
        Index([4.5, 5.5, 6.5]),
        Index(list("abc")),
        pd.CategoricalIndex("A B C".split()),
        pd.CategoricalIndex("D E F".split(), ordered=True),
        pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
        pd.DatetimeIndex(
            [
                dt.datetime(2013, 1, 3, 0, 0),
                dt.datetime(2013, 1, 3, 6, 10),
                dt.datetime(2013, 1, 3, 7, 12),
            ]
        ),
        pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
    ]

    @pytest.mark.parametrize(
        "index", indexes_can_append, ids=lambda x: type(x).__name__
    )
    def test_append_same_columns_type(self, index):
        """Appending a Series whose labels come from the frame's own columns."""
        # GH18359

        # df wider than ser
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
        ser_index = index[:2]
        ser = Series([7, 8], index=ser_index, name=2)
        result = df._append_internal(ser)
        expected = DataFrame(
            [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
        )
        # integer dtype is preserved for columns present in ser.index
        assert expected.dtypes.iloc[0].kind == "i"
        assert expected.dtypes.iloc[1].kind == "i"

        tm.assert_frame_equal(result, expected)

        # ser wider than df
        ser_index = index
        index = index[:2]
        df = DataFrame([[1, 2], [4, 5]], columns=index)
        ser = Series([7, 8, 9], index=ser_index, name=2)
        result = df._append_internal(ser)
        expected = DataFrame(
            [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
            index=[0, 1, 2],
            columns=ser_index,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "df_columns, series_index",
        combinations(indexes_can_append, r=2),
        ids=lambda x: type(x).__name__,
    )
    def test_append_different_columns_types(self, df_columns, series_index):
        """Appending a Series whose labels differ in type from the columns."""
        # GH18359
        # See also test 'test_append_different_columns_types_raises' below
        # for errors raised when appending

        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
        ser = Series([7, 8, 9], index=series_index, name=2)

        result = df._append_internal(ser)
        idx_diff = ser.index.difference(df_columns)
        combined_columns = Index(df_columns.tolist()).append(idx_diff)
        expected = DataFrame(
            [
                [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
                [4, 5, 6, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, 7, 8, 9],
            ],
            index=[0, 1, 2],
            columns=combined_columns,
        )
        tm.assert_frame_equal(result, expected)

    def test_append_dtype_coerce(self, sort):
        """Datetime columns stay datetime-like, with NaT for missing rows."""
        # GH 4993
        # appending with datetime will incorrectly convert datetime64

        df1 = DataFrame(
            index=[1, 2],
            data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
            columns=["start_time"],
        )
        df2 = DataFrame(
            index=[4, 5],
            data=[
                [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
                [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
            ],
            columns=["start_time", "end_time"],
        )

        expected = concat(
            [
                Series(
                    [
                        pd.NaT,
                        pd.NaT,
                        dt.datetime(2013, 1, 3, 6, 10),
                        dt.datetime(2013, 1, 4, 7, 10),
                    ],
                    name="end_time",
                ),
                Series(
                    [
                        dt.datetime(2013, 1, 1, 0, 0),
                        dt.datetime(2013, 1, 2, 0, 0),
                        dt.datetime(2013, 1, 3, 0, 0),
                        dt.datetime(2013, 1, 4, 0, 0),
                    ],
                    name="start_time",
                ),
            ],
            axis=1,
            sort=sort,
        )
        result = concat([df1, df2], ignore_index=True, sort=sort)
        if sort:
            expected = expected[["end_time", "start_time"]]
        else:
            expected = expected[["start_time", "end_time"]]

        tm.assert_frame_equal(result, expected)

    def test_append_missing_column_proper_upcast(self, sort):
        """Columns padded with missing rows upcast: int64 -> f8, bool -> object."""
        df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
        df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})

        appended = concat([df1, df2], sort=sort)
        assert appended["A"].dtype == "f8"
        assert appended["B"].dtype == "O"

    def test_append_empty_frame_to_series_with_dateutil_tz(self):
        """Appending a Series holding a dateutil-tz Timestamp to an empty frame."""
        # GH 23682
        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
        ser = Series({"a": 1.0, "b": 2.0, "date": date})
        df = DataFrame(columns=["c", "d"])
        result_a = df._append_internal(ser, ignore_index=True)
        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
        )
        # These columns get cast to object after append
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        tm.assert_frame_equal(result_a, expected)

        # appending the same row a second time
        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
        )
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        result_b = result_a._append_internal(ser, ignore_index=True)
        tm.assert_frame_equal(result_b, expected)

    def test_append_empty_tz_frame_with_datetime64ns(self):
        """Empty UTC frame + NaT of another datetime dtype gives object dtype."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")

        # typed (tz-naive) value to append
        other = Series({"a": pd.NaT}, dtype="datetime64[ns]").to_frame().T
        result = concat([df, other], ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}, dtype=object)
        tm.assert_frame_equal(result, expected)

        # mismatched tz
        other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]").to_frame().T
        result = concat([df, other], ignore_index=True)
        expected = DataFrame({"a": [pd.NaT]}).astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
        """Empty typed frame + timedelta64 frame yields the latter as object."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype(dtype_str)

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = concat([df, other])

        expected = other.astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
        """One-row typed frame + timedelta64 frame yields an object column."""
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = concat([df, other], ignore_index=True)

        expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
        tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,715 @@
|
||||
import zoneinfo
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        ("bool", [True, False, True]),
        ("int64", [1, 2, 3]),
        ("float64", [1.1, np.nan, 3.3]),
        ("category", Categorical(["X", "Y", "Z"])),
        ("object", ["a", "b", "c"]),
        (
            "datetime64[s]",
            [
                pd.Timestamp(day).as_unit("s")
                for day in ("2011-01-01", "2011-01-02", "2011-01-03")
            ],
        ),
        (
            "datetime64[s, US/Eastern]",
            [
                pd.Timestamp(day, tz="US/Eastern").as_unit("s")
                for day in ("2011-01-01", "2011-01-02", "2011-01-03")
            ],
        ),
        (
            "timedelta64[us]",
            [pd.Timedelta(span) for span in ("1 days", "2 days", "3 days")],
        ),
        (
            "period[M]",
            [
                pd.Period(month, freq="M")
                for month in ("2011-01", "2011-02", "2011-03")
            ],
        ),
    ]
)
def item(request):
    """
    Parametrized ``(dtype name, sample values)`` pair.

    Each parameter couples a dtype label with three sample values.
    """
    dtype_name, values = request.param
    return dtype_name, values
|
||||
|
||||
|
||||
@pytest.fixture
def item2(item):
    """
    Return the value of the ``item`` fixture unchanged.

    NOTE(review): this fixture depends on ``item`` instead of carrying its
    own ``params``, so a test requesting both ``item`` and ``item2``
    presumably receives the same pair twice. If so,
    ``test_concatlike_dtypes_coercion`` (which skips when both dtype names
    are equal) never exercises a mixed-dtype combination. Confirm whether
    an independent parametrization was intended.
    """
    return item
|
||||
|
||||
|
||||
class TestConcatAppendCommon:
|
||||
"""
|
||||
Test common dtype coercion rules between concat and append.
|
||||
"""
|
||||
|
||||
def test_dtypes(self, item, index_or_series, using_infer_string):
    """Confirm each fixture entry produces the dtype its label names."""
    dtype_name, values = item
    built = index_or_series(values)

    # with string inference enabled the "object" sample is labelled "string"
    if using_infer_string and dtype_name == "object":
        dtype_name = "string"

    if isinstance(built, Index):
        assert built.dtype == dtype_name
        return

    if isinstance(built, Series):
        expected = "Period[M]" if dtype_name.startswith("period") else dtype_name
        assert built.dtype == expected
|
||||
|
||||
def test_concatlike_same_dtypes(self, item):
    """
    Append/concat of same-dtype inputs matches building from joined data.

    Exercises ``Index.append``, ``Series._append_internal`` and
    ``pd.concat`` with two and three inputs, checks name handling, and
    checks the TypeError raised for inputs of the wrong kind.
    """
    # GH 13660
    typ1, vals1 = item
    vals2 = vals3 = vals1

    if typ1 != "category":
        doubled = vals1 + vals2
        tripled = vals1 + vals2 + vals3
    else:
        # rebuild a Categorical from the joined plain lists
        doubled = Categorical(list(vals1) + list(vals2))
        tripled = Categorical(list(vals1) + list(vals2) + list(vals3))

    # ----- Index ----- #

    # index.append
    tm.assert_index_equal(Index(vals1).append(Index(vals2)), Index(doubled))

    # 3 elements
    tm.assert_index_equal(
        Index(vals1).append([Index(vals2), Index(vals3)]), Index(tripled)
    )

    # index.append name mismatch -> no name; name match -> name kept
    for right_name, result_name in (("y", None), ("x", "x")):
        left_idx = Index(vals1, name="x")
        right_idx = Index(vals2, name=right_name)
        tm.assert_index_equal(
            left_idx.append(right_idx), Index(doubled, name=result_name)
        )

    # cannot append non-index
    for not_an_index in (vals2, [Index(vals2), vals3]):
        with pytest.raises(TypeError, match="all inputs must be Index"):
            Index(vals1).append(not_an_index)

    # ----- Series ----- #

    # series.append
    expected = Series(doubled)
    appended = Series(vals1)._append_internal(Series(vals2), ignore_index=True)
    tm.assert_series_equal(appended, expected, check_index_type=True)

    # concat
    concatenated = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
    tm.assert_series_equal(concatenated, expected, check_index_type=True)

    # 3 elements
    expected = Series(tripled)
    concatenated = pd.concat(
        [Series(vals1), Series(vals2), Series(vals3)],
        ignore_index=True,
    )
    tm.assert_series_equal(concatenated, expected)

    # name mismatch -> no name; name match -> name kept
    for right_name, result_name in (("y", None), ("x", "x")):
        left_ser = Series(vals1, name="x")
        right_ser = Series(vals2, name=right_name)

        appended = left_ser._append_internal(right_ser, ignore_index=True)
        expected = Series(doubled, name=result_name)
        tm.assert_series_equal(appended, expected, check_index_type=True)

        concatenated = pd.concat([left_ser, right_ser], ignore_index=True)
        tm.assert_series_equal(concatenated, expected, check_index_type=True)

    # cannot append non-index
    msg = (
        r"cannot concatenate object of type '.+'; "
        "only Series and DataFrame objs are valid"
    )
    for bad_inputs in (
        [Series(vals1), vals2],
        [Series(vals1), Series(vals2), vals3],
    ):
        with pytest.raises(TypeError, match=msg):
            pd.concat(bad_inputs)
|
||||
|
||||
def test_concatlike_dtypes_coercion(self, item, item2, request):
    """
    Append/concat of two different dtypes coerces to the expected dtype.

    Same-dtype and categorical combinations are skipped. bool combined
    with int64/float64 is marked xfail (GH#39187). Combinations involving
    a tz-aware datetime or a timedelta expect object dtype. Everything
    else relies on constructor inference over the joined data.
    """
    # GH 13660
    typ1, vals1 = item
    typ2, vals2 = item2

    vals3 = vals2

    # basically infer
    exp_index_dtype = None
    exp_series_dtype = None

    if typ1 == typ2:
        pytest.skip("same dtype is tested in test_concatlike_same_dtypes")
    elif typ1 == "category" or typ2 == "category":
        pytest.skip("categorical type tested elsewhere")

    def _expects_object(typ):
        # Match on dtype family instead of exact names such as
        # "timedelta64[ns]", so the check keeps working when the labels
        # carry another resolution (e.g. "[s, US/Eastern]", "[us]").
        if typ.startswith("timedelta64["):
            return True
        # a comma inside a datetime64 label separates unit and timezone
        return typ.startswith("datetime64[") and "," in typ

    # specify expected dtype
    if typ1 == "bool" and typ2 in ("int64", "float64"):
        # series coerces to numeric based on numpy rule
        # index doesn't because bool is object dtype
        exp_series_dtype = typ2
        mark = pytest.mark.xfail(reason="GH#39187 casting to object")
        request.applymarker(mark)
    elif typ2 == "bool" and typ1 in ("int64", "float64"):
        exp_series_dtype = typ1
        mark = pytest.mark.xfail(reason="GH#39187 casting to object")
        request.applymarker(mark)
    elif _expects_object(typ1) or _expects_object(typ2):
        exp_index_dtype = object
        exp_series_dtype = object

    exp_data = vals1 + vals2
    exp_data3 = vals1 + vals2 + vals3

    # ----- Index ----- #

    # index.append
    # GH#39817
    res = Index(vals1).append(Index(vals2))
    exp = Index(exp_data, dtype=exp_index_dtype)
    tm.assert_index_equal(res, exp)

    # 3 elements
    res = Index(vals1).append([Index(vals2), Index(vals3)])
    exp = Index(exp_data3, dtype=exp_index_dtype)
    tm.assert_index_equal(res, exp)

    # ----- Series ----- #

    # series._append
    # GH#39817
    res = Series(vals1)._append_internal(Series(vals2), ignore_index=True)
    exp = Series(exp_data, dtype=exp_series_dtype)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # concat
    # GH#39817
    res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True)
    tm.assert_series_equal(res, exp, check_index_type=True)

    # 3 elements
    # GH#39817
    exp = Series(exp_data3, dtype=exp_series_dtype)
    res = pd.concat(
        [Series(vals1), Series(vals2), Series(vals3)],
        ignore_index=True,
    )
    tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_concatlike_common_coerce_to_pandas_object(self):
    """Datetime + timedelta inputs keep Timestamp/Timedelta elements."""
    # GH 13626
    # result must be Timestamp/Timedelta, not datetime.datetime/timedelta
    stamps = pd.DatetimeIndex(["2011-01-01", "2011-01-02"])
    spans = pd.TimedeltaIndex(["1 days", "2 days"])

    expected_idx = Index(
        [pd.Timestamp(day) for day in ("2011-01-01", "2011-01-02")]
        + [pd.Timedelta(span) for span in ("1 days", "2 days")]
    )

    joined = stamps.append(spans)
    tm.assert_index_equal(joined, expected_idx)
    assert isinstance(joined[0], pd.Timestamp)
    assert isinstance(joined[-1], pd.Timedelta)

    stamp_ser = Series(stamps)
    span_ser = Series(spans)

    joined = stamp_ser._append_internal(span_ser)
    tm.assert_series_equal(joined, Series(expected_idx, index=[0, 1, 0, 1]))
    assert isinstance(joined.iloc[0], pd.Timestamp)
    assert isinstance(joined.iloc[-1], pd.Timedelta)

    joined = pd.concat([stamp_ser, span_ser])
    tm.assert_series_equal(joined, Series(expected_idx, index=[0, 1, 0, 1]))
    assert isinstance(joined.iloc[0], pd.Timestamp)
    assert isinstance(joined.iloc[-1], pd.Timedelta)
|
||||
|
||||
def test_concatlike_datetimetz(self, tz_aware_fixture):
    """Inputs sharing one timezone combine into that same timezone."""
    # GH 7795
    tz = tz_aware_fixture
    first = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
    second = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz)

    expected_idx = pd.DatetimeIndex(
        ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz
    )

    tm.assert_index_equal(first.append(second), expected_idx)

    first_ser = Series(first)
    second_ser = Series(second)

    appended = first_ser._append_internal(second_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([first_ser, second_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"])
def test_concatlike_datetimetz_short(self, tz):
    """Concatenating frames with tz-aware indexes keeps the timezone."""
    # GH#7795
    columns = ["A", "B"]
    range_index = pd.date_range(
        start="2014-07-15", end="2014-07-17", freq="D", tz=tz, unit="ns"
    )
    explicit_index = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz)
    top = DataFrame(0, index=range_index, columns=columns)
    bottom = DataFrame(0, index=explicit_index, columns=columns)

    all_days = ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"]
    expected_index = pd.DatetimeIndex(all_days, tz=tz).as_unit("ns")
    expected = DataFrame(0, index=expected_index, columns=columns)

    tm.assert_frame_equal(pd.concat([top, bottom]), expected)
|
||||
|
||||
def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
    """tz-aware data joined with naive or other-tz data becomes object."""
    # GH 13660
    tz = tz_aware_fixture
    aware = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz)
    aware_stamps = [
        pd.Timestamp("2011-01-01", tz=tz),
        pd.Timestamp("2011-01-02", tz=tz),
    ]

    # different tz coerces to object
    naive = pd.DatetimeIndex(["2012-01-01", "2012-01-02"])
    expected_idx = Index(
        aware_stamps + [pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-02")],
        dtype=object,
    )

    tm.assert_index_equal(aware.append(naive), expected_idx)

    aware_ser = Series(aware)
    naive_ser = Series(naive)

    appended = aware_ser._append_internal(naive_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([aware_ser, naive_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))

    # different tz
    tz_diff = zoneinfo.ZoneInfo("US/Hawaii")
    other_tz = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz_diff)
    expected_idx = Index(
        aware_stamps
        + [
            pd.Timestamp("2012-01-01", tz=tz_diff),
            pd.Timestamp("2012-01-02", tz=tz_diff),
        ],
        dtype=object,
    )

    tm.assert_index_equal(aware.append(other_tz), expected_idx)

    aware_ser = Series(aware)
    other_tz_ser = Series(other_tz)

    appended = aware_ser._append_internal(other_tz_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([aware_ser, other_tz_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))
|
||||
|
||||
def test_concatlike_common_period(self):
    """Periods of the same frequency combine into a PeriodIndex."""
    # GH 13660
    first = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
    second = pd.PeriodIndex(["2012-01", "2012-02"], freq="M")

    expected_idx = pd.PeriodIndex(
        ["2011-01", "2011-02", "2012-01", "2012-02"], freq="M"
    )

    tm.assert_index_equal(first.append(second), expected_idx)

    first_ser = Series(first)
    second_ser = Series(second)

    appended = first_ser._append_internal(second_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([first_ser, second_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))
|
||||
|
||||
def test_concatlike_common_period_diff_freq_to_object(self):
    """Periods of different frequencies combine into object dtype."""
    # GH 13221
    monthly = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
    daily = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D")

    expected_idx = Index(
        [pd.Period(month, freq="M") for month in ("2011-01", "2011-02")]
        + [pd.Period(day, freq="D") for day in ("2012-01-01", "2012-02-01")],
        dtype=object,
    )

    tm.assert_index_equal(monthly.append(daily), expected_idx)

    monthly_ser = Series(monthly)
    daily_ser = Series(daily)

    appended = monthly_ser._append_internal(daily_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([monthly_ser, daily_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))
|
||||
|
||||
def test_concatlike_common_period_mixed_dt_to_object(self):
    """Periods joined with timedeltas, in either order, become object."""
    # GH 13221
    # different datetimelike
    periods = pd.PeriodIndex(["2011-01", "2011-02"], freq="M")
    spans = pd.TimedeltaIndex(["1 days", "2 days"])

    period_items = [pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M")]
    span_items = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]

    expected_idx = Index(period_items + span_items, dtype=object)

    tm.assert_index_equal(periods.append(spans), expected_idx)

    period_ser = Series(periods)
    span_ser = Series(spans)

    appended = period_ser._append_internal(span_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([period_ser, span_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))

    # inverse
    expected_idx = Index(span_items + period_items, dtype=object)

    tm.assert_index_equal(spans.append(periods), expected_idx)

    period_ser = Series(periods)
    span_ser = Series(spans)

    appended = span_ser._append_internal(period_ser)
    tm.assert_series_equal(appended, Series(expected_idx, index=[0, 1, 0, 1]))

    concatenated = pd.concat([span_ser, period_ser])
    tm.assert_series_equal(concatenated, Series(expected_idx, index=[0, 1, 0, 1]))
|
||||
|
||||
def test_concat_categorical(self):
    """Two categorical Series stay categorical only with equal categories."""
    # GH 13524

    def check_both(left, right, expected):
        # the same expectation holds for concat and for _append_internal
        tm.assert_series_equal(pd.concat([left, right], ignore_index=True), expected)
        tm.assert_series_equal(
            left._append_internal(right, ignore_index=True), expected
        )

    # same categories -> category
    check_both(
        Series([1, 2, np.nan], dtype="category"),
        Series([2, 1, 2], dtype="category"),
        Series([1, 2, np.nan, 2, 1, 2], dtype="category"),
    )

    # partially different categories => not-category
    check_both(
        Series([3, 2], dtype="category"),
        Series([2, 1], dtype="category"),
        Series([3, 2, 2, 1]),
    )

    # completely different categories (same dtype) => not-category
    check_both(
        Series([10, 11, np.nan], dtype="category"),
        Series([np.nan, 1, 3, 2], dtype="category"),
        Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64),
    )
|
||||
|
||||
def test_union_categorical_same_categories_different_order(self):
    """Same categories in a different order still concat to categorical."""
    # https://github.com/pandas-dev/pandas/issues/19096
    values = ["a", "b", "c"]
    first = Series(Categorical(values, categories=["a", "b", "c"]))
    second = Series(Categorical(values, categories=["b", "a", "c"]))

    combined = pd.concat([first, second], ignore_index=True)

    # the expected result uses the category order of the first input
    wanted = Series(
        Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"])
    )
    tm.assert_series_equal(combined, wanted)
|
||||
|
||||
def test_concat_categorical_coercion(self):
    """Categorical joined with non-categorical data is never categorical."""
    # GH 13524

    def check_both(left, right, expected):
        # the same expectation holds for concat and for _append_internal
        tm.assert_series_equal(pd.concat([left, right], ignore_index=True), expected)
        tm.assert_series_equal(
            left._append_internal(right, ignore_index=True), expected
        )

    # category + not-category => not-category
    cat = Series([1, 2, np.nan], dtype="category")
    plain = Series([2, 1, 2])
    check_both(cat, plain, Series([1, 2, np.nan, 2, 1, 2], dtype=np.float64))

    # result shouldn't be affected by 1st elem dtype
    check_both(plain, cat, Series([2, 1, 2, 1, 2, np.nan], dtype=np.float64))

    # all values are not in category => not-category
    cat = Series([3, 2], dtype="category")
    plain = Series([2, 1])
    check_both(cat, plain, Series([3, 2, 2, 1]))
    check_both(plain, cat, Series([2, 1, 3, 2]))

    # completely different categories => not-category
    cat = Series([10, 11, np.nan], dtype="category")
    plain = Series([1, 3, 2])
    check_both(cat, plain, Series([10, 11, np.nan, 1, 3, 2], dtype=np.float64))
    check_both(plain, cat, Series([1, 3, 2, 10, 11, np.nan], dtype=np.float64))

    # different dtype => not-category
    cat = Series([10, 11, np.nan], dtype="category")
    plain = Series(["a", "b", "c"])
    check_both(cat, plain, Series([10, 11, np.nan, "a", "b", "c"]))
    check_both(plain, cat, Series(["a", "b", "c", 10, 11, np.nan]))

    # if normal series only contains NaN-likes => not-category
    cat = Series([10, 11], dtype="category")
    plain = Series([np.nan, np.nan, np.nan])
    check_both(cat, plain, Series([10, 11, np.nan, np.nan, np.nan]))
    check_both(plain, cat, Series([np.nan, np.nan, np.nan, 10, 11]))
|
||||
|
||||
def test_concat_categorical_3elem_coercion(self):
|
||||
# GH 13524
|
||||
|
||||
# mixed dtypes => not-category
|
||||
s1 = Series([1, 2, np.nan], dtype="category")
|
||||
s2 = Series([2, 1, 2], dtype="category")
|
||||
s3 = Series([1, 2, 1, 2, np.nan])
|
||||
|
||||
exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
|
||||
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
||||
|
||||
exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
|
||||
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
||||
|
||||
# values are all in either category => not-category
|
||||
s1 = Series([4, 5, 6], dtype="category")
|
||||
s2 = Series([1, 2, 3], dtype="category")
|
||||
s3 = Series([1, 3, 4])
|
||||
|
||||
exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
|
||||
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
||||
|
||||
exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
|
||||
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
||||
|
||||
# values are all in either category => not-category
|
||||
s1 = Series([4, 5, 6], dtype="category")
|
||||
s2 = Series([1, 2, 3], dtype="category")
|
||||
s3 = Series([10, 11, 12])
|
||||
|
||||
exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
|
||||
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
||||
|
||||
exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
|
||||
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
||||
|
||||
def test_concat_categorical_multi_coercion(self):
|
||||
# GH 13524
|
||||
|
||||
s1 = Series([1, 3], dtype="category")
|
||||
s2 = Series([3, 4], dtype="category")
|
||||
s3 = Series([2, 3])
|
||||
s4 = Series([2, 2], dtype="category")
|
||||
s5 = Series([1, np.nan])
|
||||
s6 = Series([1, 3, 2], dtype="category")
|
||||
|
||||
# mixed dtype, values are all in categories => not-category
|
||||
exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
|
||||
res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
|
||||
res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_concat_categorical_ordered(self):
|
||||
# GH 13524
|
||||
|
||||
s1 = Series(Categorical([1, 2, np.nan], ordered=True))
|
||||
s2 = Series(Categorical([2, 1, 2], ordered=True))
|
||||
|
||||
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
|
||||
exp = Series(Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True))
|
||||
tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
|
||||
|
||||
def test_concat_categorical_coercion_nan(self):
|
||||
# GH 13524
|
||||
|
||||
# some edge cases
|
||||
# category + not-category => not category
|
||||
s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category")
|
||||
s2 = Series([np.nan, 1])
|
||||
|
||||
exp = Series([np.nan, np.nan, np.nan, 1])
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
|
||||
s1 = Series([1, np.nan], dtype="category")
|
||||
s2 = Series([np.nan, np.nan])
|
||||
|
||||
exp = Series([1, np.nan, np.nan, np.nan], dtype="float")
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
|
||||
# mixed dtype, all nan-likes => not-category
|
||||
s1 = Series([np.nan, np.nan], dtype="category")
|
||||
s2 = Series([np.nan, np.nan])
|
||||
|
||||
exp = Series([np.nan, np.nan, np.nan, np.nan])
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
||||
|
||||
# all category nan-likes => category
|
||||
s1 = Series([np.nan, np.nan], dtype="category")
|
||||
s2 = Series([np.nan, np.nan], dtype="category")
|
||||
|
||||
exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category")
|
||||
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
|
||||
def test_concat_categorical_empty(self):
|
||||
# GH 13524
|
||||
|
||||
s1 = Series([], dtype="category")
|
||||
s2 = Series([1, 2], dtype="category")
|
||||
exp = s2.astype(object)
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), exp)
|
||||
|
||||
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
||||
tm.assert_series_equal(s2._append_internal(s1, ignore_index=True), exp)
|
||||
|
||||
s1 = Series([], dtype="category")
|
||||
s2 = Series([], dtype="category")
|
||||
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
|
||||
tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), s2)
|
||||
|
||||
s1 = Series([], dtype="category")
|
||||
s2 = Series([], dtype="object")
|
||||
|
||||
# different dtype => not-category
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
|
||||
tm.assert_series_equal(s1._append_internal(s2, ignore_index=True), s2)
|
||||
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
|
||||
tm.assert_series_equal(s2._append_internal(s1, ignore_index=True), s2)
|
||||
|
||||
s1 = Series([], dtype="category")
|
||||
s2 = Series([np.nan, np.nan])
|
||||
|
||||
exp = Series([np.nan, np.nan], dtype=object)
|
||||
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
||||
|
||||
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
||||
|
||||
def test_categorical_concat_append(self):
|
||||
cat = Categorical(["a", "b"], categories=["a", "b"])
|
||||
vals = [1, 2]
|
||||
df = DataFrame({"cats": cat, "vals": vals})
|
||||
cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
|
||||
vals2 = [1, 2, 1, 2]
|
||||
exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1]))
|
||||
|
||||
tm.assert_frame_equal(pd.concat([df, df]), exp)
|
||||
|
||||
# GH 13524 can concat different categories
|
||||
cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
|
||||
vals3 = [1, 2]
|
||||
df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
|
||||
|
||||
res = pd.concat([df, df_different_categories], ignore_index=True)
|
||||
exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]})
|
||||
tm.assert_frame_equal(res, exp)
|
||||
@ -0,0 +1,272 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCategoricalConcat:
|
||||
def test_categorical_concat(self, sort):
|
||||
# See GH 10177
|
||||
df1 = DataFrame(
|
||||
np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
|
||||
)
|
||||
|
||||
df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
|
||||
|
||||
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
|
||||
df2["h"] = Series(Categorical(cat_values))
|
||||
|
||||
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
|
||||
exp = DataFrame(
|
||||
{
|
||||
"a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
|
||||
"b": [
|
||||
1,
|
||||
4,
|
||||
7,
|
||||
10,
|
||||
13,
|
||||
16,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
np.nan,
|
||||
],
|
||||
"c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
|
||||
"h": [None] * 6 + cat_values,
|
||||
}
|
||||
)
|
||||
exp["h"] = exp["h"].astype(df2["h"].dtype)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_categorical_concat_dtypes(self, using_infer_string):
|
||||
# GH8143
|
||||
index = ["cat", "obj", "num"]
|
||||
cat = Categorical(["a", "b", "c"])
|
||||
obj = Series(["a", "b", "c"])
|
||||
num = Series([1, 2, 3])
|
||||
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
|
||||
|
||||
result = df.dtypes == (object if not using_infer_string else "str")
|
||||
expected = Series([False, True, False], index=index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.dtypes == "int64"
|
||||
expected = Series([False, False, True], index=index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = df.dtypes == "category"
|
||||
expected = Series([True, False, False], index=index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_categoricalindex(self):
|
||||
# GH 16111, categories that aren't lexsorted
|
||||
categories = [9, 0, 1, 2, 3]
|
||||
|
||||
a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
|
||||
b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
|
||||
c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
|
||||
|
||||
result = pd.concat([a, b, c], axis=1)
|
||||
|
||||
exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
|
||||
exp = DataFrame(
|
||||
{
|
||||
0: [1, 1, np.nan, np.nan],
|
||||
1: [np.nan, 2, 2, np.nan],
|
||||
2: [np.nan, np.nan, 3, 3],
|
||||
},
|
||||
columns=[0, 1, 2],
|
||||
index=exp_idx,
|
||||
)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
def test_categorical_concat_preserve(self):
|
||||
# GH 8641 series concat not preserving category dtype
|
||||
# GH 13524 can concat different categories
|
||||
s = Series(list("abc"), dtype="category")
|
||||
s2 = Series(list("abd"), dtype="category")
|
||||
|
||||
exp = Series(list("abcabd"))
|
||||
res = pd.concat([s, s2], ignore_index=True)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
exp = Series(list("abcabc"), dtype="category")
|
||||
res = pd.concat([s, s], ignore_index=True)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
|
||||
res = pd.concat([s, s])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
a = Series(np.arange(6, dtype="int64"))
|
||||
b = Series(list("aabbca"))
|
||||
|
||||
df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
|
||||
res = pd.concat([df2, df2])
|
||||
exp = DataFrame(
|
||||
{
|
||||
"A": pd.concat([a, a]),
|
||||
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_categorical_index_preserver(self):
|
||||
a = Series(np.arange(6, dtype="int64"))
|
||||
b = Series(list("aabbca"))
|
||||
|
||||
df2 = DataFrame(
|
||||
{"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
|
||||
).set_index("B")
|
||||
result = pd.concat([df2, df2])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": pd.concat([a, a]),
|
||||
"B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
|
||||
}
|
||||
).set_index("B")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# wrong categories -> uses concat_compat, which casts to object
|
||||
msg = "Constructing a Categorical with a dtype and values containing"
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg):
|
||||
df3 = DataFrame(
|
||||
{"A": a, "B": Categorical(b, categories=list("abe"))}
|
||||
).set_index("B")
|
||||
result = pd.concat([df2, df3])
|
||||
expected = pd.concat(
|
||||
[
|
||||
df2.set_axis(df2.index.astype(object), axis=0),
|
||||
df3.set_axis(df3.index.astype(object), axis=0),
|
||||
]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_categorical_tz(self):
|
||||
# GH-23816
|
||||
a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
|
||||
b = Series(["a", "b"], dtype="category")
|
||||
result = pd.concat([a, b], ignore_index=True)
|
||||
expected = Series(
|
||||
[
|
||||
pd.Timestamp("2017-01-01", tz="US/Pacific"),
|
||||
pd.Timestamp("2017-01-02", tz="US/Pacific"),
|
||||
"a",
|
||||
"b",
|
||||
]
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_categorical_datetime(self):
|
||||
# GH-39443
|
||||
df1 = DataFrame(
|
||||
{"x": Series(datetime(2021, 1, 1), index=[0], dtype="category")}
|
||||
)
|
||||
df2 = DataFrame(
|
||||
{"x": Series(datetime(2021, 1, 2), index=[1], dtype="category")}
|
||||
)
|
||||
|
||||
result = pd.concat([df1, df2])
|
||||
expected = DataFrame(
|
||||
{"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}
|
||||
)
|
||||
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_concat_categorical_unchanged(self):
|
||||
# GH-12007
|
||||
# test fix for when concat on categorical and float
|
||||
# coerces dtype categorical -> float
|
||||
df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
|
||||
ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
|
||||
result = pd.concat([df, ser], axis=1)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": Series(["a", "b", "c", np.nan], dtype="category"),
|
||||
"B": Series([0, 1, np.nan, 2], dtype="float"),
|
||||
}
|
||||
)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_categorical_concat_gh7864(self):
|
||||
# GH 7864
|
||||
# make sure ordering is preserved
|
||||
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
|
||||
df["grade"] = Categorical(df["raw_grade"])
|
||||
df["grade"].cat.set_categories(["e", "a", "b"])
|
||||
|
||||
df1 = df[0:3]
|
||||
df2 = df[3:]
|
||||
|
||||
tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
|
||||
tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
|
||||
|
||||
dfx = pd.concat([df1, df2])
|
||||
tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
|
||||
|
||||
def test_categorical_index_upcast(self):
|
||||
# GH 17629
|
||||
# test upcasting to object when concatenating on categorical indexes
|
||||
# with non-identical categories
|
||||
|
||||
a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
|
||||
b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))
|
||||
|
||||
res = pd.concat([a, b])
|
||||
exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])
|
||||
|
||||
tm.assert_equal(res, exp)
|
||||
|
||||
a = Series([1, 2], index=Categorical(["foo", "bar"]))
|
||||
b = Series([4, 3], index=Categorical(["baz", "bar"]))
|
||||
|
||||
res = pd.concat([a, b])
|
||||
exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])
|
||||
|
||||
tm.assert_equal(res, exp)
|
||||
|
||||
def test_categorical_missing_from_one_frame(self):
|
||||
# GH 25412
|
||||
df1 = DataFrame({"f1": [1, 2, 3]})
|
||||
df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
|
||||
result = pd.concat([df1, df2], sort=True)
|
||||
dtype = CategoricalDtype([4])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"f1": [1, 2, 3, 2, 3, 1],
|
||||
"f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
|
||||
},
|
||||
index=[0, 1, 2, 0, 1, 2],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_categorical_same_categories_different_order(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/24845
|
||||
|
||||
c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
|
||||
c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
|
||||
c3 = pd.CategoricalIndex(
|
||||
["a", "a", "b", "b"], categories=["a", "b"], ordered=False
|
||||
)
|
||||
|
||||
df1 = DataFrame({"A": [1, 2]}, index=c1)
|
||||
df2 = DataFrame({"A": [3, 4]}, index=c2)
|
||||
|
||||
result = pd.concat((df1, df2))
|
||||
expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,238 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameConcat:
|
||||
@pytest.mark.xfail(reason="GH#62888 the `mi[2][1] is 1` check fails")
|
||||
def test_concat_multiindex_level_bool_and_numeric(self):
|
||||
# GH#21108, GH#45101
|
||||
left = DataFrame([123, 456], columns=["data"], index=[True, False])
|
||||
right = DataFrame(
|
||||
[55, 983, 69, 112, 0], columns=["data"], index=[1, 2, 3, 4, 99]
|
||||
)
|
||||
result = concat({"One": left, "Two": right})
|
||||
|
||||
# in particular, the first two entries should not be cast to ints, the
|
||||
# other 1 should not cast to True
|
||||
mi = pd.MultiIndex.from_arrays(
|
||||
[
|
||||
["One"] * 2 + ["Two"] * 5,
|
||||
np.array([True, False, 1, 2, 3, 4, 99], dtype=object),
|
||||
],
|
||||
)
|
||||
assert mi[0][1] is True
|
||||
assert type(mi[2][1]) is int
|
||||
expected = DataFrame({"data": [123, 456, 55, 983, 69, 112, 0]}, index=mi)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_multiple_frames_dtypes(self):
|
||||
# GH#2759
|
||||
df1 = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
|
||||
df2 = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
|
||||
results = concat((df1, df2), axis=1).dtypes
|
||||
expected = Series(
|
||||
[np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
|
||||
index=["foo", "bar", 0, 1],
|
||||
)
|
||||
tm.assert_series_equal(results, expected)
|
||||
|
||||
def test_concat_tuple_keys(self):
|
||||
# GH#14438
|
||||
df1 = DataFrame(np.ones((2, 2)), columns=list("AB"))
|
||||
df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
|
||||
results = concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": {
|
||||
("bee", "bah", 0): 1.0,
|
||||
("bee", "bah", 1): 1.0,
|
||||
("bee", "boo", 0): 2.0,
|
||||
("bee", "boo", 1): 2.0,
|
||||
("bee", "boo", 2): 2.0,
|
||||
},
|
||||
"B": {
|
||||
("bee", "bah", 0): 1.0,
|
||||
("bee", "bah", 1): 1.0,
|
||||
("bee", "boo", 0): 2.0,
|
||||
("bee", "boo", 1): 2.0,
|
||||
("bee", "boo", 2): 2.0,
|
||||
},
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(results, expected)
|
||||
|
||||
def test_concat_named_keys(self):
|
||||
# GH#14252
|
||||
df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
|
||||
index = Index(["a", "b"], name="baz")
|
||||
concatted_named_from_keys = concat([df, df], keys=index)
|
||||
expected_named = DataFrame(
|
||||
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
|
||||
)
|
||||
tm.assert_frame_equal(concatted_named_from_keys, expected_named)
|
||||
|
||||
index_no_name = Index(["a", "b"], name=None)
|
||||
concatted_named_from_names = concat([df, df], keys=index_no_name, names=["baz"])
|
||||
tm.assert_frame_equal(concatted_named_from_names, expected_named)
|
||||
|
||||
concatted_unnamed = concat([df, df], keys=index_no_name)
|
||||
expected_unnamed = DataFrame(
|
||||
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
|
||||
)
|
||||
tm.assert_frame_equal(concatted_unnamed, expected_unnamed)
|
||||
|
||||
def test_concat_axis_parameter(self):
|
||||
# GH#14369
|
||||
df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2))
|
||||
df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2))
|
||||
|
||||
# Index/row/0 DataFrame
|
||||
expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
|
||||
|
||||
concatted_index = concat([df1, df2], axis="index")
|
||||
tm.assert_frame_equal(concatted_index, expected_index)
|
||||
|
||||
concatted_row = concat([df1, df2], axis="rows")
|
||||
tm.assert_frame_equal(concatted_row, expected_index)
|
||||
|
||||
concatted_0 = concat([df1, df2], axis=0)
|
||||
tm.assert_frame_equal(concatted_0, expected_index)
|
||||
|
||||
# Columns/1 DataFrame
|
||||
expected_columns = DataFrame(
|
||||
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
|
||||
)
|
||||
|
||||
concatted_columns = concat([df1, df2], axis="columns")
|
||||
tm.assert_frame_equal(concatted_columns, expected_columns)
|
||||
|
||||
concatted_1 = concat([df1, df2], axis=1)
|
||||
tm.assert_frame_equal(concatted_1, expected_columns)
|
||||
|
||||
series1 = Series([0.1, 0.2])
|
||||
series2 = Series([0.3, 0.4])
|
||||
|
||||
# Index/row/0 Series
|
||||
expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
|
||||
|
||||
concatted_index_series = concat([series1, series2], axis="index")
|
||||
tm.assert_series_equal(concatted_index_series, expected_index_series)
|
||||
|
||||
concatted_row_series = concat([series1, series2], axis="rows")
|
||||
tm.assert_series_equal(concatted_row_series, expected_index_series)
|
||||
|
||||
concatted_0_series = concat([series1, series2], axis=0)
|
||||
tm.assert_series_equal(concatted_0_series, expected_index_series)
|
||||
|
||||
# Columns/1 Series
|
||||
expected_columns_series = DataFrame(
|
||||
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
|
||||
)
|
||||
|
||||
concatted_columns_series = concat([series1, series2], axis="columns")
|
||||
tm.assert_frame_equal(concatted_columns_series, expected_columns_series)
|
||||
|
||||
concatted_1_series = concat([series1, series2], axis=1)
|
||||
tm.assert_frame_equal(concatted_1_series, expected_columns_series)
|
||||
|
||||
# Testing ValueError
|
||||
with pytest.raises(ValueError, match="No axis named"):
|
||||
concat([series1, series2], axis="something")
|
||||
|
||||
def test_concat_numerical_names(self):
|
||||
# GH#15262, GH#12223
|
||||
df = DataFrame(
|
||||
{"col": range(9)},
|
||||
dtype="int32",
|
||||
index=(
|
||||
pd.MultiIndex.from_product(
|
||||
[["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
|
||||
)
|
||||
),
|
||||
)
|
||||
result = concat((df.iloc[:2, :], df.iloc[-2:, :]))
|
||||
expected = DataFrame(
|
||||
{"col": [0, 1, 7, 8]},
|
||||
dtype="int32",
|
||||
index=pd.MultiIndex.from_tuples(
|
||||
[("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_astype_dup_col(self):
|
||||
# GH#23049
|
||||
df = DataFrame([{"a": "b"}])
|
||||
df = concat([df, df], axis=1)
|
||||
|
||||
result = df.astype("category")
|
||||
expected = DataFrame(
|
||||
np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
|
||||
).astype("category")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_dataframe_keys_bug(self, sort):
|
||||
t1 = DataFrame(
|
||||
{"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))}
|
||||
)
|
||||
t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))})
|
||||
|
||||
# it works
|
||||
result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
|
||||
assert list(result.columns) == [("t1", "value"), ("t2", "value")]
|
||||
|
||||
def test_concat_bool_with_int(self):
|
||||
# GH#42092 we may want to change this to return object, but that
|
||||
# would need a deprecation
|
||||
df1 = DataFrame(Series([True, False, True, True], dtype="bool"))
|
||||
df2 = DataFrame(Series([1, 0, 1], dtype="int64"))
|
||||
|
||||
result = concat([df1, df2])
|
||||
expected = concat([df1.astype("int64"), df2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_duplicates_in_index_with_keys(self):
|
||||
# GH#42651
|
||||
index = [1, 1, 3]
|
||||
data = [1, 2, 3]
|
||||
|
||||
df = DataFrame(data=data, index=index)
|
||||
result = concat([df], keys=["A"], names=["ID", "date"])
|
||||
mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
|
||||
expected = DataFrame(data=data, index=mi)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))
|
||||
|
||||
def test_outer_sort_columns(self):
|
||||
# GH#47127
|
||||
df1 = DataFrame({"A": [0], "B": [1], 0: 1})
|
||||
df2 = DataFrame({"A": [100]})
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_inner_sort_columns(self):
|
||||
# GH#47127
|
||||
df1 = DataFrame({"A": [0], "B": [1], 0: 1})
|
||||
df2 = DataFrame({"A": [100], 0: 2})
|
||||
result = concat([df1, df2], ignore_index=True, join="inner", sort=True)
|
||||
expected = DataFrame({0: [1, 2], "A": [0, 100]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sort_columns_one_df(self):
|
||||
# GH#47127
|
||||
df1 = DataFrame({"A": [100], 0: 2})
|
||||
result = concat([df1], ignore_index=True, join="inner", sort=True)
|
||||
expected = DataFrame({0: [2], "A": [100]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,611 @@
|
||||
import datetime as dt
|
||||
from datetime import datetime
|
||||
|
||||
import dateutil
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
concat,
|
||||
date_range,
|
||||
to_timedelta,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDatetimeConcat:
|
||||
def test_concat_datetime64_block(self):
|
||||
rng = date_range("1/1/2000", periods=10)
|
||||
|
||||
df = DataFrame({"time": rng})
|
||||
|
||||
result = concat([df, df])
|
||||
assert (result.iloc[:10]["time"] == rng).all()
|
||||
assert (result.iloc[10:]["time"] == rng).all()
|
||||
|
||||
def test_concat_datetime_datetime64_frame(self):
|
||||
# GH#2624
|
||||
rows = []
|
||||
rows.append([datetime(2010, 1, 1), 1])
|
||||
rows.append([datetime(2010, 1, 2), "hi"])
|
||||
|
||||
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
|
||||
|
||||
ind = date_range(start="2000/1/1", freq="D", periods=10)
|
||||
df1 = DataFrame({"date": ind, "test": range(10)})
|
||||
|
||||
# it works!
|
||||
concat([df1, df2_obj])
|
||||
|
||||
def test_concat_datetime_timezone(self):
|
||||
# GH 18523
|
||||
idx1 = date_range(
|
||||
"2011-01-01", periods=3, freq="h", tz="Europe/Paris", unit="ns"
|
||||
)
|
||||
idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h", unit="ns")
|
||||
df1 = DataFrame({"a": [1, 2, 3]}, index=idx1)
|
||||
df2 = DataFrame({"b": [1, 2, 3]}, index=idx2)
|
||||
result = concat([df1, df2], axis=1)
|
||||
|
||||
exp_idx = DatetimeIndex(
|
||||
[
|
||||
"2011-01-01 00:00:00+01:00",
|
||||
"2011-01-01 01:00:00+01:00",
|
||||
"2011-01-01 02:00:00+01:00",
|
||||
],
|
||||
dtype="M8[ns, Europe/Paris]",
|
||||
freq="h",
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"]
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo", unit="ns")
|
||||
df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
|
||||
msg = "Sorting by default when concatenating all DatetimeIndex"
|
||||
with tm.assert_produces_warning(Pandas4Warning, match=msg):
|
||||
result = concat([df1, df3], axis=1)
|
||||
|
||||
exp_idx = DatetimeIndex(
|
||||
[
|
||||
"2010-12-31 15:00:00+00:00",
|
||||
"2010-12-31 16:00:00+00:00",
|
||||
"2010-12-31 17:00:00+00:00",
|
||||
"2010-12-31 23:00:00+00:00",
|
||||
"2011-01-01 00:00:00+00:00",
|
||||
"2011-01-01 01:00:00+00:00",
|
||||
]
|
||||
).as_unit("ns")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[np.nan, 1],
|
||||
[np.nan, 2],
|
||||
[np.nan, 3],
|
||||
[1, np.nan],
|
||||
[2, np.nan],
|
||||
[3, np.nan],
|
||||
],
|
||||
index=exp_idx,
|
||||
columns=["a", "b"],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH 13783: Concat after resample
|
||||
result = concat([df1.resample("h").mean(), df2.resample("h").mean()], sort=True)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]},
|
||||
index=idx1.append(idx1),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_datetimeindex_freq(self):
|
||||
# GH 3232
|
||||
# Monotonic index result
|
||||
dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC")
|
||||
data = list(range(100))
|
||||
expected = DataFrame(data, index=dr)
|
||||
result = concat([expected[:50], expected[50:]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Non-monotonic index result
|
||||
result = concat([expected[50:], expected[:50]])
|
||||
expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
|
||||
expected.index._data.freq = None
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_multiindex_datetime_object_index(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/11058
|
||||
idx = Index(
|
||||
[dt.date(2013, 1, 1), dt.date(2014, 1, 1), dt.date(2015, 1, 1)],
|
||||
dtype="object",
|
||||
)
|
||||
|
||||
s = Series(
|
||||
["a", "b"],
|
||||
index=MultiIndex.from_arrays(
|
||||
[
|
||||
[1, 2],
|
||||
idx[:-1],
|
||||
],
|
||||
names=["first", "second"],
|
||||
),
|
||||
)
|
||||
s2 = Series(
|
||||
["a", "b"],
|
||||
index=MultiIndex.from_arrays(
|
||||
[[1, 2], idx[::2]],
|
||||
names=["first", "second"],
|
||||
),
|
||||
)
|
||||
mi = MultiIndex.from_arrays(
|
||||
[[1, 2, 2], idx],
|
||||
names=["first", "second"],
|
||||
)
|
||||
assert mi.levels[1].dtype == object
|
||||
|
||||
expected = DataFrame(
|
||||
[["a", "a"], ["b", np.nan], [np.nan, "b"]],
|
||||
index=mi,
|
||||
)
|
||||
result = concat([s, s2], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_NaT_series(self):
|
||||
# GH 11693
|
||||
# test for merging NaT series with datetime series.
|
||||
x = Series(
|
||||
date_range(
|
||||
"20151124 08:00",
|
||||
"20151124 09:00",
|
||||
freq="1h",
|
||||
tz="US/Eastern",
|
||||
unit="ns",
|
||||
)
|
||||
)
|
||||
y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
|
||||
expected = Series([x[0], x[1], pd.NaT, pd.NaT])
|
||||
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# all NaT with tz
|
||||
expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]")
|
||||
result = concat([y, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_NaT_series2(self):
|
||||
# without tz
|
||||
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", unit="ns"))
|
||||
y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h", unit="ns"))
|
||||
y[:] = pd.NaT
|
||||
expected = Series([x[0], x[1], pd.NaT, pd.NaT])
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# all NaT without tz
|
||||
x[:] = pd.NaT
|
||||
expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_concat_NaT_dataframes(self, tz):
|
||||
# GH 12396
|
||||
|
||||
dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz)
|
||||
first = DataFrame({0: dti})
|
||||
second = DataFrame(
|
||||
[[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]],
|
||||
index=[2, 3],
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
pd.NaT,
|
||||
pd.NaT,
|
||||
Timestamp("2015/01/01", tz=tz),
|
||||
Timestamp("2016/01/01", tz=tz),
|
||||
]
|
||||
)
|
||||
|
||||
result = concat([first, second], axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz1", [None, "UTC"])
|
||||
@pytest.mark.parametrize("tz2", [None, "UTC"])
|
||||
@pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")])
|
||||
def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item):
|
||||
# GH 12396
|
||||
|
||||
# tz-naive
|
||||
first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1))
|
||||
second = DataFrame([item]).apply(lambda x: x.dt.tz_localize(tz2))
|
||||
|
||||
result = concat([first, second], axis=0)
|
||||
expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0]))
|
||||
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
|
||||
if tz1 != tz2:
|
||||
expected = expected.astype(object)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz1", [None, "UTC"])
|
||||
@pytest.mark.parametrize("tz2", [None, "UTC"])
|
||||
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
|
||||
# GH 12396
|
||||
|
||||
first = DataFrame(Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
|
||||
second = DataFrame(Series([pd.NaT]).dt.tz_localize(tz2), columns=[1])
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
|
||||
1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2),
|
||||
}
|
||||
)
|
||||
result = concat([first, second], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz1", [None, "UTC"])
|
||||
@pytest.mark.parametrize("tz2", [None, "UTC"])
|
||||
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
|
||||
# GH 12396
|
||||
|
||||
# tz-naive
|
||||
first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
|
||||
second = DataFrame(
|
||||
[
|
||||
[Timestamp("2015/01/01", tz=tz2)],
|
||||
[Timestamp("2016/01/01", tz=tz2)],
|
||||
],
|
||||
index=[2, 3],
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
pd.NaT,
|
||||
pd.NaT,
|
||||
Timestamp("2015/01/01", tz=tz2),
|
||||
Timestamp("2016/01/01", tz=tz2),
|
||||
]
|
||||
)
|
||||
if tz1 != tz2:
|
||||
expected = expected.astype(object)
|
||||
|
||||
result = concat([first, second])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_compat_on_non_ns_datetime_EA(self):
|
||||
# GH#33331
|
||||
first = Series(np.array([datetime(2010, 1, 1)], dtype="datetime64[D]"))
|
||||
second = Series(pd.array(["a", "b"], dtype="category"))
|
||||
|
||||
expected = Series([Timestamp("2010-01-01 00:00:00"), "a", "b"])
|
||||
|
||||
result = concat([first, second], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestTimezoneConcat:
|
||||
def test_concat_tz_series(self):
|
||||
# gh-11755: tz and no tz
|
||||
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
|
||||
y = Series(date_range("2012-01-01", "2012-01-02"))
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_tz_series2(self):
|
||||
# gh-11887: concat tz and object
|
||||
x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC"))
|
||||
y = Series(["a", "b"])
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_tz_series3(self, unit, unit2):
|
||||
# see gh-12217 and gh-12306
|
||||
# Concatenating two UTC times
|
||||
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
|
||||
first[0] = first[0].dt.tz_localize("UTC")
|
||||
|
||||
second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
|
||||
second[0] = second[0].dt.tz_localize("UTC")
|
||||
|
||||
result = concat([first, second])
|
||||
exp_unit = tm.get_finest_unit(unit, unit2)
|
||||
assert result[0].dtype == f"datetime64[{exp_unit}, UTC]"
|
||||
|
||||
def test_concat_tz_series4(self, unit, unit2):
|
||||
# Concatenating two London times
|
||||
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
|
||||
first[0] = first[0].dt.tz_localize("Europe/London")
|
||||
|
||||
second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]")
|
||||
second[0] = second[0].dt.tz_localize("Europe/London")
|
||||
|
||||
result = concat([first, second])
|
||||
exp_unit = tm.get_finest_unit(unit, unit2)
|
||||
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
|
||||
|
||||
def test_concat_tz_series5(self, unit, unit2):
|
||||
# Concatenating 2+1 London times
|
||||
first = DataFrame(
|
||||
[[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]"
|
||||
)
|
||||
first[0] = first[0].dt.tz_localize("Europe/London")
|
||||
|
||||
second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]")
|
||||
second[0] = second[0].dt.tz_localize("Europe/London")
|
||||
|
||||
result = concat([first, second])
|
||||
exp_unit = tm.get_finest_unit(unit, unit2)
|
||||
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
|
||||
|
||||
def test_concat_tz_series6(self, unit, unit2):
|
||||
# Concatenating 1+2 London times
|
||||
first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]")
|
||||
first[0] = first[0].dt.tz_localize("Europe/London")
|
||||
|
||||
second = DataFrame(
|
||||
[[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]"
|
||||
)
|
||||
second[0] = second[0].dt.tz_localize("Europe/London")
|
||||
|
||||
result = concat([first, second])
|
||||
exp_unit = tm.get_finest_unit(unit, unit2)
|
||||
assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]"
|
||||
|
||||
def test_concat_tz_series_tzlocal(self):
    """Concat two Series of timestamps that carry a dateutil tzlocal() timezone."""
    # see gh-13583

    def local_stamps(*dates):
        # Each timestamp gets its own freshly created tzlocal() instance.
        return [Timestamp(date, tz=dateutil.tz.tzlocal()) for date in dates]

    first = local_stamps("2011-01-01", "2011-02-01")
    second = local_stamps("2012-01-01", "2012-02-01")

    result = concat([Series(first), Series(second)], ignore_index=True)
    tm.assert_series_equal(result, Series(first + second))
    assert result.dtype == "datetime64[us, tzlocal()]"
|
||||
|
||||
def test_concat_tz_series_with_datetimelike(self):
|
||||
# see gh-12620: tz and timedelta
|
||||
x = [
|
||||
Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
Timestamp("2011-02-01", tz="US/Eastern"),
|
||||
]
|
||||
y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")]
|
||||
result = concat([Series(x), Series(y)], ignore_index=True)
|
||||
tm.assert_series_equal(result, Series(x + y, dtype="object"))
|
||||
|
||||
# tz and period
|
||||
y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")]
|
||||
result = concat([Series(x), Series(y)], ignore_index=True)
|
||||
tm.assert_series_equal(result, Series(x + y, dtype="object"))
|
||||
|
||||
def test_concat_tz_frame(self):
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"A": Timestamp("20130102", tz="US/Eastern"),
|
||||
"B": Timestamp("20130603", tz="CET"),
|
||||
},
|
||||
index=range(5),
|
||||
)
|
||||
|
||||
# concat
|
||||
df3 = concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
|
||||
tm.assert_frame_equal(df2, df3)
|
||||
|
||||
def test_concat_multiple_tzs(self):
|
||||
# GH#12467
|
||||
# combining datetime tz-aware and naive DataFrames
|
||||
ts1 = Timestamp("2015-01-01", tz=None)
|
||||
ts2 = Timestamp("2015-01-01", tz="UTC")
|
||||
ts3 = Timestamp("2015-01-01", tz="EST")
|
||||
|
||||
df1 = DataFrame({"time": [ts1]})
|
||||
df2 = DataFrame({"time": [ts2]})
|
||||
df3 = DataFrame({"time": [ts3]})
|
||||
|
||||
results = concat([df1, df2]).reset_index(drop=True)
|
||||
expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
|
||||
tm.assert_frame_equal(results, expected)
|
||||
|
||||
results = concat([df1, df3]).reset_index(drop=True)
|
||||
expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
|
||||
tm.assert_frame_equal(results, expected)
|
||||
|
||||
results = concat([df2, df3]).reset_index(drop=True)
|
||||
expected = DataFrame({"time": [ts2, ts3]})
|
||||
tm.assert_frame_equal(results, expected)
|
||||
|
||||
def test_concat_multiindex_with_tz(self):
|
||||
# GH 6606
|
||||
df = DataFrame(
|
||||
{
|
||||
"dt": DatetimeIndex(
|
||||
[
|
||||
datetime(2014, 1, 1),
|
||||
datetime(2014, 1, 2),
|
||||
datetime(2014, 1, 3),
|
||||
],
|
||||
dtype="M8[ns, US/Pacific]",
|
||||
),
|
||||
"b": ["A", "B", "C"],
|
||||
"c": [1, 2, 3],
|
||||
"d": [4, 5, 6],
|
||||
}
|
||||
)
|
||||
df = df.set_index(["dt", "b"])
|
||||
|
||||
exp_idx1 = DatetimeIndex(
|
||||
["2014-01-01", "2014-01-02", "2014-01-03"] * 2,
|
||||
dtype="M8[ns, US/Pacific]",
|
||||
name="dt",
|
||||
)
|
||||
exp_idx2 = Index(["A", "B", "C"] * 2, name="b")
|
||||
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
||||
expected = DataFrame(
|
||||
{"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"]
|
||||
)
|
||||
|
||||
result = concat([df, df])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tz_not_aligned(self):
|
||||
# GH#22796
|
||||
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
|
||||
a = DataFrame({"A": ts})
|
||||
b = DataFrame({"A": ts, "B": ts})
|
||||
result = concat([a, b], sort=True, ignore_index=True)
|
||||
expected = DataFrame(
|
||||
{"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT, *list(ts)]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"t1",
|
||||
[
|
||||
"2015-01-01",
|
||||
pytest.param(
|
||||
pd.NaT,
|
||||
marks=pytest.mark.xfail(
|
||||
reason="GH23037 incorrect dtype when concatenating"
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_concat_tz_NaT(self, t1):
|
||||
# GH#22796
|
||||
# Concatenating tz-aware multicolumn DataFrames
|
||||
ts1 = Timestamp(t1, tz="UTC")
|
||||
ts2 = Timestamp("2015-01-01", tz="UTC")
|
||||
ts3 = Timestamp("2015-01-01", tz="UTC")
|
||||
|
||||
df1 = DataFrame([[ts1, ts2]])
|
||||
df2 = DataFrame([[ts3]])
|
||||
|
||||
result = concat([df1, df2])
|
||||
expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tz_with_empty(self):
|
||||
# GH 9188
|
||||
result = concat(
|
||||
[DataFrame(date_range("2000", periods=1, tz="UTC")), DataFrame()]
|
||||
)
|
||||
expected = DataFrame(date_range("2000", periods=1, tz="UTC"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestPeriodConcat:
|
||||
def test_concat_period_series(self):
|
||||
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
|
||||
y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D"))
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_period_multiple_freq_series(self):
|
||||
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
|
||||
y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M"))
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
assert result.dtype == "object"
|
||||
|
||||
def test_concat_period_other_series(self):
|
||||
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
|
||||
y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M"))
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
assert result.dtype == "object"
|
||||
|
||||
def test_concat_period_other_series2(self):
|
||||
# non-period
|
||||
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
|
||||
y = Series(DatetimeIndex(["2015-11-01", "2015-12-01"]))
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
assert result.dtype == "object"
|
||||
|
||||
def test_concat_period_other_series3(self):
|
||||
x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D"))
|
||||
y = Series(["A", "B"])
|
||||
expected = Series([x[0], x[1], y[0], y[1]], dtype="object")
|
||||
result = concat([x, y], ignore_index=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
assert result.dtype == "object"
|
||||
|
||||
|
||||
def test_concat_timedelta64_block():
    """Both halves of a self-concatenated timedelta frame equal the original."""
    frame = DataFrame({"time": to_timedelta(np.arange(10), unit="s")})

    doubled = concat([frame, frame])

    for half in (slice(None, 10), slice(10, None)):
        tm.assert_frame_equal(doubled.iloc[half], frame, check_index_type=False)
|
||||
|
||||
|
||||
def test_concat_multiindex_datetime_nat():
    """Column-wise concat of frames whose MultiIndex has NaT in the second level."""
    # GH#44900
    one_row = [(1, pd.NaT)]
    two_rows = [(1, pd.NaT), (2, pd.NaT)]

    left = DataFrame({"a": 1}, index=MultiIndex.from_tuples(one_row))
    right = DataFrame({"b": 2}, index=MultiIndex.from_tuples(two_rows))

    result = concat([left, right], axis="columns")

    # Row (2, NaT) exists only on the right, so "a" is NaN there.
    expected = DataFrame(
        {"a": [1.0, np.nan], "b": 2}, index=MultiIndex.from_tuples(two_rows)
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_concat_float_datetime64():
|
||||
# GH#32934
|
||||
df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
|
||||
df_float = DataFrame({"A": pd.array([1.0], dtype="float64")})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
pd.array(["2000"], dtype="datetime64[ns]")[0],
|
||||
pd.array([1.0], dtype="float64")[0],
|
||||
]
|
||||
},
|
||||
index=[0, 0],
|
||||
)
|
||||
result = concat([df_time, df_float])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame({"A": pd.array([], dtype="object")})
|
||||
result = concat([df_time.iloc[:0], df_float.iloc[:0]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame({"A": pd.array([1.0], dtype="object")})
|
||||
result = concat([df_time.iloc[:0], df_float])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
|
||||
object
|
||||
)
|
||||
|
||||
result = concat([df_time, df_float.iloc[:0]])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,293 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
RangeIndex,
|
||||
Series,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestEmptyConcat:
|
||||
def test_handle_empty_objects(self, sort, using_infer_string):
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
|
||||
)
|
||||
|
||||
dfcopy = df[:5].copy()
|
||||
dfcopy["foo"] = "bar"
|
||||
empty = df[5:5]
|
||||
|
||||
frames = [dfcopy, empty, empty, df[5:]]
|
||||
concatted = concat(frames, axis=0, sort=sort)
|
||||
|
||||
expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
|
||||
expected["foo"] = expected["foo"].astype(
|
||||
object if not using_infer_string else "str"
|
||||
)
|
||||
expected.loc[0:4, "foo"] = "bar"
|
||||
|
||||
tm.assert_frame_equal(concatted, expected)
|
||||
|
||||
# empty as first element with time series
|
||||
# GH3259
|
||||
df = DataFrame(
|
||||
{"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
|
||||
)
|
||||
empty = DataFrame()
|
||||
result = concat([df, empty], axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
result = concat([empty, df], axis=1)
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
result = concat([df, empty])
|
||||
tm.assert_frame_equal(result, df)
|
||||
result = concat([empty, df])
|
||||
tm.assert_frame_equal(result, df)
|
||||
|
||||
def test_concat_empty_series(self):
|
||||
# GH 11082
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name="y", dtype="float64")
|
||||
res = concat([s1, s2], axis=1)
|
||||
exp = DataFrame(
|
||||
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
|
||||
index=RangeIndex(3),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name="y", dtype="float64")
|
||||
res = concat([s1, s2], axis=0)
|
||||
# name will be reset
|
||||
exp = Series([1, 2, 3], dtype="float64")
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# empty Series with no name
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series(name=None, dtype="float64")
|
||||
res = concat([s1, s2], axis=1)
|
||||
exp = DataFrame(
|
||||
{"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
|
||||
columns=["x", 0],
|
||||
index=RangeIndex(3),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
@pytest.mark.parametrize("values", [[], [1, 2, 3]])
|
||||
def test_concat_empty_series_timelike(self, tz, values):
|
||||
# GH 18447
|
||||
|
||||
first = Series([], dtype="M8[ns]").dt.tz_localize(tz)
|
||||
dtype = None if values else np.float64
|
||||
second = Series(values, dtype=dtype)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
|
||||
1: values,
|
||||
}
|
||||
)
|
||||
result = concat([first, second], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left,right,expected",
|
||||
[
|
||||
# booleans
|
||||
(np.bool_, np.int32, np.object_), # changed from int32 in 2.0 GH#39817
|
||||
(np.bool_, np.float32, np.object_),
|
||||
# datetime-like
|
||||
("m8[ns]", np.bool_, np.object_),
|
||||
("m8[ns]", np.int64, np.object_),
|
||||
("M8[ns]", np.bool_, np.object_),
|
||||
("M8[ns]", np.int64, np.object_),
|
||||
# categorical
|
||||
("category", "category", "category"),
|
||||
("category", "object", "object"),
|
||||
],
|
||||
)
|
||||
def test_concat_empty_series_dtypes(self, left, right, expected):
|
||||
# GH#39817, GH#45101
|
||||
result = concat([Series(dtype=left), Series(dtype=right)])
|
||||
assert result.dtype == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
|
||||
)
|
||||
def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
|
||||
dtype = np.dtype(dtype)
|
||||
|
||||
result = concat([Series(dtype=dtype)])
|
||||
assert result.dtype == dtype
|
||||
|
||||
result = concat([Series(dtype=dtype), Series(dtype=dtype)])
|
||||
assert result.dtype == dtype
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype2",
|
||||
["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
|
||||
)
|
||||
def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
|
||||
# round-tripping with self & like self
|
||||
if dtype == dtype2:
|
||||
pytest.skip("same dtype is not applicable for test")
|
||||
|
||||
def int_result_type(dtype, dtype2):
|
||||
typs = {dtype.kind, dtype2.kind}
|
||||
if not len(typs - {"i", "u", "b"}) and (
|
||||
dtype.kind == "i" or dtype2.kind == "i"
|
||||
):
|
||||
return "i"
|
||||
elif not len(typs - {"u", "b"}) and (
|
||||
dtype.kind == "u" or dtype2.kind == "u"
|
||||
):
|
||||
return "u"
|
||||
return None
|
||||
|
||||
def float_result_type(dtype, dtype2):
|
||||
typs = {dtype.kind, dtype2.kind}
|
||||
if not len(typs - {"f", "i", "u"}) and (
|
||||
dtype.kind == "f" or dtype2.kind == "f"
|
||||
):
|
||||
return "f"
|
||||
return None
|
||||
|
||||
def get_result_type(dtype, dtype2):
|
||||
result = float_result_type(dtype, dtype2)
|
||||
if result is not None:
|
||||
return result
|
||||
result = int_result_type(dtype, dtype2)
|
||||
if result is not None:
|
||||
return result
|
||||
return "O"
|
||||
|
||||
dtype = np.dtype(dtype)
|
||||
dtype2 = np.dtype(dtype2)
|
||||
expected = get_result_type(dtype, dtype2)
|
||||
result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
|
||||
assert result.kind == expected
|
||||
|
||||
def test_concat_empty_series_dtypes_triple(self):
|
||||
assert (
|
||||
concat(
|
||||
[Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
|
||||
).dtype
|
||||
== np.object_
|
||||
)
|
||||
|
||||
def test_concat_empty_series_dtype_category_with_array(self):
|
||||
# GH#18515
|
||||
assert (
|
||||
concat(
|
||||
[Series(np.array([]), dtype="category"), Series(dtype="float64")]
|
||||
).dtype
|
||||
== "float64"
|
||||
)
|
||||
|
||||
def test_concat_empty_series_dtypes_sparse(self):
|
||||
result = concat(
|
||||
[
|
||||
Series(dtype="float64").astype("Sparse"),
|
||||
Series(dtype="float64").astype("Sparse"),
|
||||
]
|
||||
)
|
||||
assert result.dtype == "Sparse[float64]"
|
||||
|
||||
result = concat(
|
||||
[Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
|
||||
)
|
||||
expected = pd.SparseDtype(np.float64)
|
||||
assert result.dtype == expected
|
||||
|
||||
result = concat(
|
||||
[Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
|
||||
)
|
||||
expected = pd.SparseDtype("object")
|
||||
assert result.dtype == expected
|
||||
|
||||
def test_concat_empty_df_object_dtype(self):
|
||||
# GH 9149
|
||||
df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
|
||||
df_2 = DataFrame(columns=df_1.columns)
|
||||
result = concat([df_1, df_2], axis=0)
|
||||
expected = df_1.astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_empty_dataframe_dtypes(self):
|
||||
df = DataFrame(columns=list("abc"))
|
||||
df["a"] = df["a"].astype(np.bool_)
|
||||
df["b"] = df["b"].astype(np.int32)
|
||||
df["c"] = df["c"].astype(np.float64)
|
||||
|
||||
result = concat([df, df])
|
||||
assert result["a"].dtype == np.bool_
|
||||
assert result["b"].dtype == np.int32
|
||||
assert result["c"].dtype == np.float64
|
||||
|
||||
result = concat([df, df.astype(np.float64)])
|
||||
assert result["a"].dtype == np.object_
|
||||
assert result["b"].dtype == np.float64
|
||||
assert result["c"].dtype == np.float64
|
||||
|
||||
def test_concat_inner_join_empty(self):
|
||||
# GH 15328
|
||||
df_empty = DataFrame()
|
||||
df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
|
||||
df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
|
||||
|
||||
result = concat([df_a, df_empty], axis=1, join="inner")
|
||||
tm.assert_frame_equal(result, df_expected)
|
||||
|
||||
result = concat([df_a, df_empty], axis=1, join="outer")
|
||||
tm.assert_frame_equal(result, df_a)
|
||||
|
||||
def test_empty_dtype_coerce(self):
|
||||
# xref to #12411
|
||||
# xref to #12045
|
||||
# xref to #11594
|
||||
# see below
|
||||
|
||||
# 10571
|
||||
df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
|
||||
df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
|
||||
result = concat([df1, df2])
|
||||
expected = df1.dtypes
|
||||
tm.assert_series_equal(result.dtypes, expected)
|
||||
|
||||
def test_concat_empty_dataframe(self):
|
||||
# 39037
|
||||
df1 = DataFrame(columns=["a", "b"])
|
||||
df2 = DataFrame(columns=["b", "c"])
|
||||
result = concat([df1, df2, df1])
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df3 = DataFrame(columns=["a", "b"])
|
||||
df4 = DataFrame(columns=["b"])
|
||||
result = concat([df3, df4])
|
||||
expected = DataFrame(columns=["a", "b"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
|
||||
# 39037
|
||||
df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
|
||||
df2 = DataFrame({"a": [1, 2, 3]})
|
||||
|
||||
result = concat([df1[:0], df2[:0]])
|
||||
assert result["a"].dtype == np.int64
|
||||
assert result["b"].dtype == np.object_ if not using_infer_string else "str"
|
||||
|
||||
def test_concat_to_empty_ea(self):
|
||||
"""48510 `concat` to an empty EA should maintain type EA dtype."""
|
||||
df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
|
||||
df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
|
||||
expected = df_new.copy()
|
||||
result = concat([df_empty, df_new])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,456 @@
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestIndexConcat:
|
||||
def test_concat_ignore_index(self, sort):
|
||||
frame1 = DataFrame(
|
||||
{"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
|
||||
)
|
||||
frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
|
||||
frame1.index = Index(["x", "y", "z"])
|
||||
frame2.index = Index(["x", "y", "q"])
|
||||
|
||||
v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
|
||||
|
||||
nan = np.nan
|
||||
expected = DataFrame(
|
||||
[
|
||||
[nan, nan, nan, 4.3],
|
||||
["a", 1, 4.5, 5.2],
|
||||
["b", 2, 3.2, 2.2],
|
||||
["c", 3, 1.2, nan],
|
||||
],
|
||||
index=Index(["q", "x", "y", "z"]),
|
||||
)
|
||||
if not sort:
|
||||
expected = expected.loc[["x", "y", "z", "q"]]
|
||||
|
||||
tm.assert_frame_equal(v1, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name_in1,name_in2,name_in3,name_out",
|
||||
[
|
||||
("idx", "idx", "idx", "idx"),
|
||||
("idx", "idx", None, None),
|
||||
("idx", None, None, None),
|
||||
("idx1", "idx2", None, None),
|
||||
("idx1", "idx1", "idx2", None),
|
||||
("idx1", "idx2", "idx3", None),
|
||||
(None, None, None, None),
|
||||
],
|
||||
)
|
||||
def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
|
||||
# GH13475
|
||||
indices = [
|
||||
Index(["a", "b", "c"], name=name_in1),
|
||||
Index(["b", "c", "d"], name=name_in2),
|
||||
Index(["c", "d", "e"], name=name_in3),
|
||||
]
|
||||
frames = [
|
||||
DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
|
||||
]
|
||||
result = concat(frames, axis=1)
|
||||
|
||||
exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"x": [0, 1, 2, np.nan, np.nan],
|
||||
"y": [np.nan, 0, 1, 2, np.nan],
|
||||
"z": [np.nan, np.nan, 0, 1, 2],
|
||||
},
|
||||
index=exp_ind,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_rename_index(self):
|
||||
a = DataFrame(
|
||||
np.random.default_rng(2).random((3, 3)),
|
||||
columns=list("ABC"),
|
||||
index=Index(list("abc"), name="index_a"),
|
||||
)
|
||||
b = DataFrame(
|
||||
np.random.default_rng(2).random((3, 3)),
|
||||
columns=list("ABC"),
|
||||
index=Index(list("abc"), name="index_b"),
|
||||
)
|
||||
|
||||
result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
|
||||
|
||||
exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
|
||||
names = list(exp.index.names)
|
||||
names[1] = "lvl1"
|
||||
exp.index.set_names(names, inplace=True)
|
||||
|
||||
tm.assert_frame_equal(result, exp)
|
||||
assert result.index.names == exp.index.names
|
||||
|
||||
def test_concat_copy_index_series(self, axis):
|
||||
# GH 29879
|
||||
ser = Series([1, 2])
|
||||
comb = concat([ser, ser], axis=axis)
|
||||
if axis in [0, "index"]:
|
||||
assert comb.index is not ser.index
|
||||
else:
|
||||
assert comb.index is ser.index
|
||||
|
||||
def test_concat_copy_index_frame(self, axis):
|
||||
# GH 29879
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
|
||||
comb = concat([df, df], axis=axis)
|
||||
if axis in [0, "index"]:
|
||||
assert not comb.index.is_(df.index)
|
||||
assert comb.columns.is_(df.columns)
|
||||
elif axis in [1, "columns"]:
|
||||
assert comb.index.is_(df.index)
|
||||
assert not comb.columns.is_(df.columns)
|
||||
|
||||
def test_default_index(self):
|
||||
# is_series and ignore_index
|
||||
s1 = Series([1, 2, 3], name="x")
|
||||
s2 = Series([4, 5, 6], name="y")
|
||||
res = concat([s1, s2], axis=1, ignore_index=True)
|
||||
assert isinstance(res.columns, pd.RangeIndex)
|
||||
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
|
||||
# use check_index_type=True to check the result have
|
||||
# RangeIndex (default index)
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
# is_series and all inputs have no names
|
||||
s1 = Series([1, 2, 3])
|
||||
s2 = Series([4, 5, 6])
|
||||
res = concat([s1, s2], axis=1, ignore_index=False)
|
||||
assert isinstance(res.columns, pd.RangeIndex)
|
||||
exp = DataFrame([[1, 4], [2, 5], [3, 6]])
|
||||
exp.columns = pd.RangeIndex(2)
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
# is_dataframe and ignore_index
|
||||
df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
|
||||
df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
|
||||
|
||||
res = concat([df1, df2], axis=0, ignore_index=True)
|
||||
exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
res = concat([df1, df2], axis=1, ignore_index=True)
|
||||
exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
|
||||
tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
|
||||
|
||||
def test_dups_index(self):
|
||||
# GH 4771
|
||||
|
||||
# single dtypes
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).integers(0, 10, size=40).reshape(10, 4),
|
||||
columns=["A", "A", "C", "C"],
|
||||
)
|
||||
|
||||
result = concat([df, df], axis=1)
|
||||
tm.assert_frame_equal(result.iloc[:, :4], df)
|
||||
tm.assert_frame_equal(result.iloc[:, 4:], df)
|
||||
|
||||
result = concat([df, df], axis=0)
|
||||
tm.assert_frame_equal(result.iloc[:10], df)
|
||||
tm.assert_frame_equal(result.iloc[10:], df)
|
||||
|
||||
# multi dtypes
|
||||
df = concat(
|
||||
[
|
||||
DataFrame(
|
||||
np.random.default_rng(2).standard_normal((10, 4)),
|
||||
columns=["A", "A", "B", "B"],
|
||||
),
|
||||
DataFrame(
|
||||
np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
|
||||
columns=["A", "C"],
|
||||
),
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
result = concat([df, df], axis=1)
|
||||
tm.assert_frame_equal(result.iloc[:, :6], df)
|
||||
tm.assert_frame_equal(result.iloc[:, 6:], df)
|
||||
|
||||
result = concat([df, df], axis=0)
|
||||
tm.assert_frame_equal(result.iloc[:10], df)
|
||||
tm.assert_frame_equal(result.iloc[10:], df)
|
||||
|
||||
|
||||
class TestMultiIndexConcat:
|
||||
def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
|
||||
frame = multiindex_dataframe_random_data
|
||||
index = frame.index
|
||||
result = concat([frame, frame], keys=[0, 1], names=["iteration"])
|
||||
|
||||
assert result.index.names == ("iteration", *index.names)
|
||||
tm.assert_frame_equal(result.loc[0], frame)
|
||||
tm.assert_frame_equal(result.loc[1], frame)
|
||||
assert result.index.nlevels == 3
|
||||
|
||||
def test_concat_multiindex_with_none_in_index_names(self):
|
||||
# GH 15787
|
||||
index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
|
||||
df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
|
||||
|
||||
result = concat([df, df], keys=[1, 2], names=["level2"])
|
||||
index = MultiIndex.from_product(
|
||||
[[1, 2], [1], range(5)], names=["level2", "level1", None]
|
||||
)
|
||||
expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
|
||||
level2 = [1] * 5 + [2] * 2
|
||||
level1 = [1] * 7
|
||||
no_name = list(range(5)) + list(range(2))
|
||||
tuples = list(zip(level2, level1, no_name))
|
||||
index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
|
||||
expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_multiindex_rangeindex(self):
|
||||
# GH13542
|
||||
# when multi-index levels are RangeIndex objects
|
||||
# there is a bug in concat with objects of len 1
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((9, 2)))
|
||||
df.index = MultiIndex(
|
||||
levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
|
||||
codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
|
||||
)
|
||||
|
||||
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
|
||||
exp = df.iloc[[2, 3, 4, 5], :]
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_concat_multiindex_dfs_with_deepcopy(self):
|
||||
# GH 9967
|
||||
example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
|
||||
example_dataframe1 = DataFrame([0], index=example_multiindex1)
|
||||
|
||||
example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
|
||||
example_dataframe2 = DataFrame([1], index=example_multiindex2)
|
||||
|
||||
example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
|
||||
expected_index = MultiIndex(
|
||||
levels=[["s1", "s2"], ["a"], ["b", "c"]],
|
||||
codes=[[0, 1], [0, 0], [0, 1]],
|
||||
names=["testname", None, None],
|
||||
)
|
||||
expected = DataFrame([[0], [1]], index=expected_index)
|
||||
result_copy = concat(deepcopy(example_dict), names=["testname"])
|
||||
tm.assert_frame_equal(result_copy, expected)
|
||||
result_no_copy = concat(example_dict, names=["testname"])
|
||||
tm.assert_frame_equal(result_no_copy, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"mi1_list",
|
||||
[
|
||||
[["a"], range(2)],
|
||||
[["b"], np.arange(2.0, 4.0)],
|
||||
[["c"], ["A", "B"]],
|
||||
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"mi2_list",
|
||||
[
|
||||
[["a"], range(2)],
|
||||
[["b"], np.arange(2.0, 4.0)],
|
||||
[["c"], ["A", "B"]],
|
||||
[["d"], pd.date_range(start="2017", end="2018", periods=2)],
|
||||
],
|
||||
)
|
||||
def test_concat_with_various_multiindex_dtypes(
|
||||
self, mi1_list: list, mi2_list: list
|
||||
):
|
||||
# GitHub #23478
|
||||
mi1 = MultiIndex.from_product(mi1_list)
|
||||
mi2 = MultiIndex.from_product(mi2_list)
|
||||
|
||||
df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
|
||||
df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
|
||||
|
||||
if mi1_list[0] == mi2_list[0]:
|
||||
expected_mi = MultiIndex(
|
||||
levels=[mi1_list[0], list(mi1_list[1])],
|
||||
codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
|
||||
)
|
||||
else:
|
||||
expected_mi = MultiIndex(
|
||||
levels=[
|
||||
mi1_list[0] + mi2_list[0],
|
||||
list(mi1_list[1]) + list(mi2_list[1]),
|
||||
],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
|
||||
)
|
||||
|
||||
expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
result_df = concat((df1, df2), axis=1)
|
||||
|
||||
tm.assert_frame_equal(expected_df, result_df)
|
||||
|
||||
def test_concat_multiindex_(self):
|
||||
# GitHub #44786
|
||||
df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
|
||||
df = concat([df], keys=["X"])
|
||||
|
||||
iterables = [["X"], ["1", "2", "2"]]
|
||||
result_index = df.index
|
||||
expected_index = MultiIndex.from_product(iterables)
|
||||
|
||||
tm.assert_index_equal(result_index, expected_index)
|
||||
|
||||
result_df = df
|
||||
expected_df = DataFrame(
|
||||
{"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
|
||||
)
|
||||
tm.assert_frame_equal(result_df, expected_df)
|
||||
|
||||
def test_concat_with_key_not_unique(self, performance_warning):
|
||||
# GitHub #46519
|
||||
df1 = DataFrame({"name": [1]})
|
||||
df2 = DataFrame({"name": [2]})
|
||||
df3 = DataFrame({"name": [3]})
|
||||
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
|
||||
# the warning is caused by indexing unsorted multi-index
|
||||
with tm.assert_produces_warning(
|
||||
performance_warning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_a = df_a.loc[("x", 0), :]
|
||||
df_b = DataFrame(
|
||||
{"name": [1, 2, 3]},
|
||||
index=MultiIndex(
|
||||
levels=[["x", "y"], range(1)], codes=[[0, 1, 0], [0, 0, 0]]
|
||||
),
|
||||
)
|
||||
with tm.assert_produces_warning(
|
||||
performance_warning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_b = df_b.loc[("x", 0)]
|
||||
|
||||
tm.assert_frame_equal(out_a, out_b)
|
||||
|
||||
df1 = DataFrame({"name": ["a", "a", "b"]})
|
||||
df2 = DataFrame({"name": ["a", "b"]})
|
||||
df3 = DataFrame({"name": ["c", "d"]})
|
||||
df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
|
||||
with tm.assert_produces_warning(
|
||||
performance_warning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_a = df_a.loc[("x", 0), :]
|
||||
|
||||
df_b = DataFrame(
|
||||
{
|
||||
"a": ["x", "x", "x", "y", "y", "x", "x"],
|
||||
"b": [0, 1, 2, 0, 1, 0, 1],
|
||||
"name": list("aababcd"),
|
||||
}
|
||||
).set_index(["a", "b"])
|
||||
df_b.index.names = [None, None]
|
||||
with tm.assert_produces_warning(
|
||||
performance_warning, match="indexing past lexsort depth"
|
||||
):
|
||||
out_b = df_b.loc[("x", 0), :]
|
||||
|
||||
tm.assert_frame_equal(out_a, out_b)
|
||||
|
||||
def test_concat_with_duplicated_levels(self):
|
||||
# keyword levels should be unique
|
||||
df1 = DataFrame({"A": [1]}, index=["x"])
|
||||
df2 = DataFrame({"A": [1]}, index=["y"])
|
||||
msg = r"Level values not unique: \['x', 'y', 'y'\]"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
|
||||
|
||||
@pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
|
||||
def test_concat_with_levels_with_none_keys(self, levels):
|
||||
df1 = DataFrame({"A": [1]}, index=["x"])
|
||||
df2 = DataFrame({"A": [1]}, index=["y"])
|
||||
msg = "levels supported only when keys is not None"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
concat([df1, df2], levels=levels)
|
||||
|
||||
def test_concat_range_index_result(self):
|
||||
# GH#47501
|
||||
df1 = DataFrame({"a": [1, 2]})
|
||||
df2 = DataFrame({"b": [1, 2]})
|
||||
|
||||
result = concat([df1, df2], sort=True, axis=1)
|
||||
expected = DataFrame({"a": [1, 2], "b": [1, 2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
expected_index = pd.RangeIndex(0, 2)
|
||||
tm.assert_index_equal(result.index, expected_index, exact=True)
|
||||
|
||||
def test_concat_index_keep_dtype(self):
|
||||
# GH#47329
|
||||
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype="object"))
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="object"))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="object")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_index_keep_dtype_ea_numeric(self, any_numeric_ea_dtype):
|
||||
# GH#47329
|
||||
df1 = DataFrame(
|
||||
[[0, 1, 1]], columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype)
|
||||
)
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=any_numeric_ea_dtype))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]],
|
||||
columns=Index([1, 2, 3], dtype=any_numeric_ea_dtype),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int8", "Int16", "Int32"])
|
||||
def test_concat_index_find_common(self, dtype):
|
||||
# GH#47329
|
||||
df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
|
||||
df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype="Int32"))
|
||||
result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
|
||||
expected = DataFrame(
|
||||
[[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype="Int32")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string):
|
||||
# GH 46675
|
||||
s1 = Series(["a", "b", "c"])
|
||||
s2 = Series(["a", "b"])
|
||||
s3 = Series(["a", "b", "c", "d"])
|
||||
s4 = Series([], dtype=object if not using_infer_string else "str")
|
||||
result = concat(
|
||||
[s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
["a"] * 3 + [np.nan],
|
||||
["b"] * 3 + [np.nan],
|
||||
["c", np.nan] * 2,
|
||||
[np.nan] * 2 + ["d"] + [np.nan],
|
||||
],
|
||||
dtype=object if not using_infer_string else "str",
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
result, expected, check_index_type=True, check_column_type=True
|
||||
)
|
||||
@ -0,0 +1,54 @@
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
read_csv,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestInvalidConcat:
    """Tests for which first arguments / elements ``concat`` accepts or rejects."""

    @pytest.mark.parametrize("obj", [1, {}, [1, 2], (1, 2)])
    def test_concat_invalid(self, obj):
        """A non-pandas element inside the list raises ``TypeError``."""
        # trying to concat an ndframe with a non-ndframe
        df1 = DataFrame(range(2))
        msg = (
            f"cannot concatenate object of type '{type(obj)}'; "
            "only Series and DataFrame objs are valid"
        )
        with pytest.raises(TypeError, match=msg):
            concat([df1, obj])

    def test_concat_invalid_first_argument(self):
        """Passing a bare DataFrame instead of an iterable raises ``TypeError``."""
        df1 = DataFrame(range(2))
        msg = (
            "first argument must be an iterable of pandas "
            'objects, you passed an object of type "DataFrame"'
        )
        with pytest.raises(TypeError, match=msg):
            concat(df1)

    def test_concat_generator_obj(self):
        """A generator of frames is an acceptable first argument."""
        # generator ok though
        concat(DataFrame(np.random.default_rng(2).random((5, 5))) for _ in range(3))

    def test_concat_textreader_obj(self):
        """A chunked ``read_csv`` reader can be passed to ``concat`` directly."""
        # text reader ok
        # GH6583
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

        # chunksize=1 makes the reader yield one-row frames.
        with read_csv(StringIO(data), chunksize=1) as reader:
            result = concat(reader, ignore_index=True)
        expected = read_csv(StringIO(data))
        tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,184 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestSeriesConcat:
|
||||
@pytest.mark.parametrize("bool_dtype", [bool, "boolean"])
|
||||
@pytest.mark.parametrize("dtype", [np.int64, np.float64, "Int64", "Float64"])
|
||||
def test_concat_bool_and_numeric(self, bool_dtype, dtype):
|
||||
# GH#21108, GH#45101
|
||||
left = Series([True, False], dtype=bool_dtype)
|
||||
right = Series([1, 2], dtype=dtype)
|
||||
result = concat([left, right], ignore_index=True)
|
||||
expected = Series([True, False, 1, 2], dtype=object)
|
||||
assert result.iloc[0] is True
|
||||
assert type(result.iloc[2]) in [int, float] # i.e. not bool
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_series(self):
|
||||
ts = Series(
|
||||
np.arange(20, dtype=np.float64),
|
||||
index=date_range("2020-01-01", periods=20, unit="ns"),
|
||||
name="foo",
|
||||
)
|
||||
ts.name = "foo"
|
||||
|
||||
pieces = [ts[:5], ts[5:15], ts[15:]]
|
||||
|
||||
result = concat(pieces)
|
||||
tm.assert_series_equal(result, ts)
|
||||
assert result.name == ts.name
|
||||
|
||||
result = concat(pieces, keys=[0, 1, 2])
|
||||
expected = ts.copy()
|
||||
exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))]
|
||||
exp_index = MultiIndex(
|
||||
levels=[[0, 1, 2], DatetimeIndex(ts.index.to_numpy(dtype="M8[ns]"))],
|
||||
codes=exp_codes,
|
||||
)
|
||||
expected.index = exp_index
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_empty_and_non_empty_series_regression(self):
|
||||
# GH 18187 regression test
|
||||
s1 = Series([1])
|
||||
s2 = Series([], dtype=object)
|
||||
|
||||
expected = s1.astype(object)
|
||||
result = concat([s1, s2])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1(self):
|
||||
ts = Series(
|
||||
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
||||
)
|
||||
|
||||
pieces = [ts[:-2], ts[2:], ts[2:-2]]
|
||||
|
||||
result = concat(pieces, axis=1)
|
||||
expected = DataFrame(pieces).T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(pieces, keys=["A", "B", "C"], axis=1)
|
||||
expected = DataFrame(pieces, index=["A", "B", "C"]).T
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_preserves_series_names(self):
|
||||
# preserve series names, #2489
|
||||
s = Series(np.random.default_rng(2).standard_normal(5), name="A")
|
||||
s2 = Series(np.random.default_rng(2).standard_normal(5), name="B")
|
||||
|
||||
result = concat([s, s2], axis=1)
|
||||
expected = DataFrame({"A": s, "B": s2})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
s2.name = None
|
||||
result = concat([s, s2], axis=1)
|
||||
tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object"))
|
||||
|
||||
def test_concat_series_axis1_with_reindex(self, sort):
|
||||
# must reindex, #2603
|
||||
s = Series(
|
||||
np.random.default_rng(2).standard_normal(3), index=["c", "a", "b"], name="A"
|
||||
)
|
||||
s2 = Series(
|
||||
np.random.default_rng(2).standard_normal(4),
|
||||
index=["d", "a", "b", "c"],
|
||||
name="B",
|
||||
)
|
||||
result = concat([s, s2], axis=1, sort=sort)
|
||||
expected = DataFrame({"A": s, "B": s2}, index=["c", "a", "b", "d"])
|
||||
if sort:
|
||||
expected = expected.sort_index()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_names_applied(self):
|
||||
# ensure names argument is not ignored on axis=1, #23490
|
||||
s = Series([1, 2, 3])
|
||||
s2 = Series([4, 5, 6])
|
||||
result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"])
|
||||
expected = DataFrame(
|
||||
[[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"])
|
||||
expected = DataFrame(
|
||||
[[1, 4], [2, 5], [3, 6]],
|
||||
columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_axis1_same_names_ignore_index(self):
|
||||
dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1]
|
||||
s1 = Series(
|
||||
np.random.default_rng(2).standard_normal(len(dates)),
|
||||
index=dates,
|
||||
name="value",
|
||||
)
|
||||
s2 = Series(
|
||||
np.random.default_rng(2).standard_normal(len(dates)),
|
||||
index=dates,
|
||||
name="value",
|
||||
)
|
||||
|
||||
result = concat([s1, s2], axis=1, ignore_index=True)
|
||||
expected = Index(range(2))
|
||||
|
||||
tm.assert_index_equal(result.columns, expected, exact=True)
|
||||
|
||||
@pytest.mark.parametrize("s1name", [np.int64(190), 190])
|
||||
def test_concat_series_name_npscalar_tuple(self, s1name):
|
||||
# GH21015
|
||||
s2name = (43, 0)
|
||||
s1 = Series({"a": 1, "b": 2}, name=s1name)
|
||||
s2 = Series({"c": 5, "d": 6}, name=s2name)
|
||||
result = concat([s1, s2])
|
||||
expected = Series({"a": 1, "b": 2, "c": 5, "d": 6})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_concat_series_partial_columns_names(self):
|
||||
# GH10698
|
||||
named_series = Series([1, 2], name="foo")
|
||||
unnamed_series1 = Series([1, 2])
|
||||
unnamed_series2 = Series([4, 5])
|
||||
|
||||
result = concat([named_series, unnamed_series1, unnamed_series2], axis=1)
|
||||
expected = DataFrame(
|
||||
{"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(
|
||||
[named_series, unnamed_series1, unnamed_series2],
|
||||
axis=1,
|
||||
keys=["red", "blue", "yellow"],
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]},
|
||||
columns=["red", "blue", "yellow"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = concat(
|
||||
[named_series, unnamed_series1, unnamed_series2], axis=1, ignore_index=True
|
||||
)
|
||||
expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_series_length_one_reversed(self, frame_or_series):
|
||||
# GH39401
|
||||
obj = frame_or_series([100])
|
||||
result = concat([obj.iloc[::-1]])
|
||||
tm.assert_equal(result, obj)
|
||||
@ -0,0 +1,118 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestConcatSort:
|
||||
def test_concat_sorts_columns(self, sort):
|
||||
# GH-4588
|
||||
df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
|
||||
df2 = DataFrame({"a": [3, 4], "c": [5, 6]})
|
||||
|
||||
# for sort=True/None
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]},
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
|
||||
if sort is False:
|
||||
expected = expected[["b", "a", "c"]]
|
||||
|
||||
# default
|
||||
with tm.assert_produces_warning(None):
|
||||
result = pd.concat([df1, df2], ignore_index=True, sort=sort)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_sorts_index(self, sort):
|
||||
df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"])
|
||||
df2 = DataFrame({"b": [1, 2]}, index=["a", "b"])
|
||||
|
||||
# For True/None
|
||||
expected = DataFrame(
|
||||
{"a": [2, 3, 1], "b": [1, 2, None]},
|
||||
index=["a", "b", "c"],
|
||||
columns=["a", "b"],
|
||||
)
|
||||
if sort is False:
|
||||
expected = expected.loc[["c", "a", "b"]]
|
||||
|
||||
# Warn and sort by default
|
||||
with tm.assert_produces_warning(None):
|
||||
result = pd.concat([df1, df2], axis=1, sort=sort)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_inner_sort(self, sort):
|
||||
# https://github.com/pandas-dev/pandas/pull/20613
|
||||
df1 = DataFrame(
|
||||
{"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]
|
||||
)
|
||||
df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4])
|
||||
|
||||
with tm.assert_produces_warning(None):
|
||||
# unset sort should *not* warn for inner join
|
||||
# since that never sorted
|
||||
result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True)
|
||||
|
||||
expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"])
|
||||
if sort is True:
|
||||
expected = expected[["a", "b"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_aligned_sort(self):
|
||||
# GH-4588
|
||||
df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"])
|
||||
result = pd.concat([df, df], sort=True, ignore_index=True)
|
||||
expected = DataFrame(
|
||||
{"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]},
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = pd.concat(
|
||||
[df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True
|
||||
)
|
||||
expected = expected[["b", "c"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_aligned_sort_does_not_raise(self):
|
||||
# GH-4588
|
||||
# We catch TypeErrors from sorting internally and do not re-raise.
|
||||
df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"])
|
||||
expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"])
|
||||
result = pd.concat([df, df], ignore_index=True, sort=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_frame_with_sort_false(self):
|
||||
# GH 43375
|
||||
result = pd.concat(
|
||||
[DataFrame({i: i}, index=[i]) for i in range(2, 0, -1)], sort=False
|
||||
)
|
||||
expected = DataFrame([[2, np.nan], [np.nan, 1]], index=[2, 1], columns=[2, 1])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH 37937
|
||||
df1 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
|
||||
df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=[3, 1, 6])
|
||||
result = pd.concat([df2, df1], axis=1, sort=False)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[7.0, 10.0, 3.0, 6.0],
|
||||
[8.0, 11.0, 1.0, 4.0],
|
||||
[9.0, 12.0, np.nan, np.nan],
|
||||
[np.nan, np.nan, 2.0, 5.0],
|
||||
],
|
||||
index=[3, 1, 6, 2],
|
||||
columns=["c", "d", "a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_sort_none_raises(self):
|
||||
# GH#41518
|
||||
df = DataFrame({1: [1, 2], "a": [3, 4]})
|
||||
msg = "The 'sort' keyword only accepts boolean values; None was passed."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
pd.concat([df, df], sort=None)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,280 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.merge import merge
|
||||
|
||||
|
||||
def test_merge_antijoin():
|
||||
# GH#42916
|
||||
left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
|
||||
right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"])
|
||||
|
||||
result = merge(left, right, how="left_anti", left_index=True, right_index=True)
|
||||
expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_index=True, right_index=True)
|
||||
expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_on_different_columns():
|
||||
left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype({"B": object})
|
||||
right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype(
|
||||
{"D": object}
|
||||
)
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="B", right_on="D")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [3.0],
|
||||
"B": ["c"],
|
||||
"C": [np.nan],
|
||||
"D": [np.nan],
|
||||
},
|
||||
index=[2],
|
||||
).astype({"B": object, "D": object})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="B", right_on="D")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [np.nan],
|
||||
"B": [np.nan],
|
||||
"C": [2.0],
|
||||
"D": ["d"],
|
||||
},
|
||||
index=[1],
|
||||
).astype({"B": object, "D": object})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_nonunique_keys():
|
||||
left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype({"B": object})
|
||||
right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype(
|
||||
{"D": object}
|
||||
)
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="B", right_on="D")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [1.0],
|
||||
"B": ["a"],
|
||||
"C": [np.nan],
|
||||
"D": [np.nan],
|
||||
},
|
||||
index=[0],
|
||||
).astype({"B": object, "D": object})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="B", right_on="D")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [np.nan, np.nan],
|
||||
"B": [np.nan, np.nan],
|
||||
"C": [2.0, 4.0],
|
||||
"D": ["d", "d"],
|
||||
},
|
||||
index=[2, 3],
|
||||
).astype({"B": object, "D": object})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_same_df():
|
||||
left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64)
|
||||
result = merge(left, left, how="left_anti", left_index=True, right_index=True)
|
||||
expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=False)
|
||||
|
||||
|
||||
def test_merge_antijoin_nans():
|
||||
left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype(
|
||||
{"C": object}
|
||||
)
|
||||
right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype(
|
||||
{"D": object}
|
||||
)
|
||||
result = merge(left, right, how="left_anti", on="A")
|
||||
expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype(
|
||||
{"C": object, "D": object}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_on_datetime64tz():
|
||||
# GH11405
|
||||
left = DataFrame(
|
||||
{
|
||||
"key": pd.date_range("20151010", periods=2, tz="US/Eastern"),
|
||||
"value": [1.0, 2.0],
|
||||
}
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"key": pd.date_range("20151011", periods=3, tz="US/Eastern"),
|
||||
"value": [1.0, 2.0, 3.0],
|
||||
}
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": pd.date_range("20151010", periods=1, tz="US/Eastern"),
|
||||
"value_x": [1.0],
|
||||
"value_y": [np.nan],
|
||||
},
|
||||
index=[0],
|
||||
)
|
||||
result = merge(left, right, on="key", how="left_anti")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key": pd.date_range("20151012", periods=2, tz="US/Eastern"),
|
||||
"value_x": [np.nan, np.nan],
|
||||
"value_y": [2.0, 3.0],
|
||||
},
|
||||
index=[1, 2],
|
||||
)
|
||||
result = merge(left, right, on="key", how="right_anti")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_multiindex():
|
||||
left = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3],
|
||||
"B": [4, 5, 6],
|
||||
},
|
||||
index=MultiIndex.from_tuples(
|
||||
[("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"]
|
||||
),
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"C": [7, 8, 9],
|
||||
"D": [10, 11, 12],
|
||||
},
|
||||
index=MultiIndex.from_tuples(
|
||||
[("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"]
|
||||
),
|
||||
)
|
||||
|
||||
result = merge(left, right, how="left_anti", left_index=True, right_index=True)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [3],
|
||||
"B": [6],
|
||||
"C": [np.nan],
|
||||
"D": [np.nan],
|
||||
},
|
||||
index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_index=True, right_index=True)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A": [np.nan],
|
||||
"B": [np.nan],
|
||||
"C": [9],
|
||||
"D": [12],
|
||||
},
|
||||
index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype",
|
||||
[
|
||||
"Int64",
|
||||
pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
|
||||
],
|
||||
)
|
||||
def test_merge_antijoin_extension_dtype(dtype):
|
||||
left = DataFrame(
|
||||
{
|
||||
"join_col": [1, 3, 5],
|
||||
"left_val": [1, 2, 3],
|
||||
}
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"join_col": [2, 3, 4],
|
||||
"right_val": [1, 2, 3],
|
||||
}
|
||||
)
|
||||
left = left.astype({"join_col": dtype})
|
||||
right = right.astype({"join_col": dtype})
|
||||
result = merge(left, right, how="left_anti", on="join_col")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"join_col": [1, 5],
|
||||
"left_val": [1, 3],
|
||||
"right_val": [np.nan, np.nan],
|
||||
},
|
||||
index=[0, 2],
|
||||
)
|
||||
expected = expected.astype({"join_col": dtype})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_empty_dataframe():
|
||||
left = DataFrame({"A": [], "B": []})
|
||||
right = DataFrame({"C": [], "D": []})
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="A", right_on="C")
|
||||
expected = DataFrame({"A": [], "B": [], "C": [], "D": []})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="A", right_on="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_no_common_elements():
|
||||
left = DataFrame({"A": [1, 2, 3]})
|
||||
right = DataFrame({"B": [4, 5, 6]})
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_with_null_values():
|
||||
left = DataFrame({"A": [1.0, 2.0, None, 4.0]})
|
||||
right = DataFrame({"B": [2.0, None, 5.0]})
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_antijoin_with_mixed_dtypes():
|
||||
left = DataFrame({"A": [1, "2", 3.0]})
|
||||
right = DataFrame({"B": ["2", 3.0, 4]})
|
||||
|
||||
result = merge(left, right, how="left_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = merge(left, right, how="right_anti", left_on="A", right_on="B")
|
||||
expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,109 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.merge import (
|
||||
MergeError,
|
||||
merge,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_merge_cross(input_col, output_cols):
    """A cross merge yields the Cartesian product, suffixes clashing column
    names and leaves both inputs unmodified."""
    # GH#5401
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({input_col: [3, 4]})
    # Snapshots used to verify that merge does not mutate its inputs.
    left_copy = left.copy()
    right_copy = right.copy()
    result = merge(left, right, how="cross")
    expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(left, left_copy)
    tm.assert_frame_equal(right, right_copy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs",
    [
        {"left_index": True},
        {"right_index": True},
        {"on": "a"},
        {"left_on": "a"},
        {"right_on": "b"},
    ],
)
def test_merge_cross_error_reporting(kwargs):
    """Any key-selection argument combined with ``how="cross"`` raises
    ``MergeError``."""
    # GH#5401
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({"b": [3, 4]})
    msg = (
        "Can not pass on, right_on, left_on or set right_index=True or left_index=True"
    )
    with pytest.raises(MergeError, match=msg):
        merge(left, right, how="cross", **kwargs)
|
||||
|
||||
|
||||
def test_merge_cross_mixed_dtypes():
    """Cross merge a string column with an integer column."""
    # GH#5401
    left = DataFrame(["a", "b", "c"], columns=["A"])
    right = DataFrame(range(2), columns=["B"])

    result = merge(left, right, how="cross")

    expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_more_than_one_column():
    """Cross merge two frames that each carry two columns."""
    # GH#5401
    left = DataFrame({"A": list("ab"), "B": [2, 1]})
    right = DataFrame({"C": range(2), "D": range(4, 6)})

    result = merge(left, right, how="cross")

    expected = DataFrame(
        {
            "A": ["a", "a", "b", "b"],
            "B": [2, 2, 1, 1],
            "C": [0, 1, 0, 1],
            "D": [4, 5, 4, 5],
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_merge_cross_null_values(nulls_fixture):
|
||||
# GH#5401
|
||||
left = DataFrame({"a": [1, nulls_fixture]})
|
||||
right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]})
|
||||
result = merge(left, right, how="cross")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 1, nulls_fixture, nulls_fixture],
|
||||
"b": ["a", "b", "a", "b"],
|
||||
"c": [1.0, 2.0, 1.0, 2.0],
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_join_cross_error_reporting():
    """``DataFrame.join`` with ``how="cross"`` and ``on`` raises MergeError."""
    # GH#5401
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({"a": [3, 4]})
    msg = (
        "Can not pass on, right_on, left_on or set right_index=True or left_index=True"
    )
    with pytest.raises(MergeError, match=msg):
        left.join(right, how="cross", on="a")
|
||||
|
||||
|
||||
def test_merge_cross_series():
    """Cross merging two named Series matches merging their ``to_frame()`` forms."""
    # GH#54055
    ls = Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")
    rs = Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")

    res = merge(ls, rs, how="cross")

    expected = merge(ls.to_frame(), rs.to_frame(), how="cross")
    tm.assert_frame_equal(res, expected)
|
||||
@ -0,0 +1,186 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def df1():
    """Left-hand frame: 11 rows with ``outer``/``inner`` keys and a ``v1`` value."""
    return DataFrame(
        {
            "outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
            "inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
            "v1": np.linspace(0, 1, 11),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df2():
    """Right-hand frame: 12 rows with ``outer``/``inner`` keys and a ``v2`` value."""
    return DataFrame(
        {
            "outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
            "inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
            "v2": np.linspace(10, 11, 12),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
    """Construct left test DataFrame with specified levels
    (any of 'outer', 'inner', and 'v1')

    An empty parameter leaves ``df1`` with its default index.
    """
    levels = request.param
    return df1.set_index(levels) if levels else df1
|
||||
|
||||
|
||||
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
    """Construct right test DataFrame with specified levels
    (any of 'outer', 'inner', and 'v2')

    An empty parameter leaves ``df2`` with its default index.
    """
    levels = request.param
    return df2.set_index(levels) if levels else df2
|
||||
|
||||
|
||||
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
    """
    Compute the expected merge result for the test case.

    This method computes the expected result of merging two DataFrames on
    a combination of their columns and index levels. It does so by
    explicitly dropping/resetting their named index levels, performing a
    merge on their columns, and then finally restoring the appropriate
    index in the result.

    Parameters
    ----------
    df_left : DataFrame
        The left DataFrame (may have zero or more named index levels)
    df_right : DataFrame
        The right DataFrame (may have zero or more named index levels)
    on : list of str
        The on parameter to the merge operation. When given, it is used
        for both sides and ``left_on``/``right_on`` are ignored.
    left_on : list of str
        The left_on parameter to the merge operation
    right_on : list of str
        The right_on parameter to the merge operation
    how : str
        The how parameter to the merge operation

    Returns
    -------
    DataFrame
        The expected merge result
    """
    # ``on`` is shorthand for identical keys on both sides.
    if on is not None:
        left_on = right_on = on

    # Named index levels present on each input.
    left_levels = [name for name in df_left.index.names if name is not None]
    right_levels = [name for name in df_right.index.names if name is not None]

    # A key becomes an index level of the result only when it is an index
    # level on both inputs.
    output_levels = [
        key for key in left_on if key in right_levels and key in left_levels
    ]

    # Index levels that take no part in the merge are discarded outright.
    unused_left = [name for name in left_levels if name not in left_on]
    if unused_left:
        df_left = df_left.reset_index(level=unused_left, drop=True)

    unused_right = [name for name in right_levels if name not in right_on]
    if unused_right:
        df_right = df_right.reset_index(level=unused_right, drop=True)

    # Index levels that are merge keys become ordinary columns.
    key_levels_left = [name for name in left_levels if name in left_on]
    if key_levels_left:
        df_left = df_left.reset_index(level=key_levels_left)

    key_levels_right = [name for name in right_levels if name in right_on]
    if key_levels_right:
        df_right = df_right.reset_index(level=key_levels_right)

    # Plain column-on-column merge.
    expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)

    # Put the shared key levels back into the index.
    if output_levels:
        expected = expected.set_index(output_levels)

    return expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "on,how",
    [
        (["outer"], "inner"),
        (["inner"], "left"),
        (["outer", "inner"], "right"),
        (["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
    """Merge with ``on`` naming a mix of columns and index levels."""
    # Reference result, built from a plain column merge.
    expected = compute_expected(left_df, right_df, on=on, how=how)

    # Result of merging the frames as given.
    result = left_df.merge(right_df, on=on, how=how)
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "left_on,right_on,how",
    [
        (["outer"], ["outer"], "inner"),
        (["inner"], ["inner"], "right"),
        (["outer", "inner"], ["outer", "inner"], "left"),
        (["inner", "outer"], ["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_lefton_righton(
    left_df, right_df, left_on, right_on, how
):
    """Merge with ``left_on``/``right_on`` naming columns and index levels."""
    # Reference result, built from a plain column merge.
    expected = compute_expected(
        left_df, right_df, left_on=left_on, right_on=right_on, how=how
    )

    # Result of merging the frames as given.
    result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
    """``join`` with ``on`` keys that are partly or fully left index levels."""
    left_df = df1.set_index(left_index)
    right_df = df2.set_index(["outer", "inner"])

    # Both joins below use exactly the same arguments.
    join_kwargs = {
        "on": ["outer", "inner"],
        "how": join_type,
        "lsuffix": "_x",
        "rsuffix": "_y",
    }

    # Reference: join with every key as a column, then restore the left index.
    expected = (
        left_df.reset_index().join(right_df, **join_kwargs).set_index(left_index)
    )

    # Result of joining the indexed frame directly.
    result = left_df.join(right_df, **join_kwargs)

    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
@ -0,0 +1,241 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
merge_ordered,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    """Left frame with keys a, c, e and an ``lvalue`` column."""
    return DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
|
||||
|
||||
|
||||
@pytest.fixture
def right():
    """Right frame with keys b, c, d, f and an ``rvalue`` column."""
    return DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
|
||||
|
||||
|
||||
class TestMergeOrdered:
    """Tests for ``merge_ordered``, plus a few related merge/concat checks."""

    def test_basic(self, left, right):
        """Without fill_method, keys missing on one side give NaN there."""
        result = merge_ordered(left, right, on="key")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1, np.nan, 2, np.nan, 3, np.nan],
                "rvalue": [np.nan, 1, 2, 3, np.nan, 4],
            }
        )

        tm.assert_frame_equal(result, expected)

    def test_ffill(self, left, right):
        """``fill_method="ffill"`` carries the previous value into gaps."""
        result = merge_ordered(left, right, on="key", fill_method="ffill")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1.0, 1, 2, 2, 3, 3.0],
                "rvalue": [np.nan, 1, 2, 3, 3, 4],
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_multigroup(self, left, right):
        """Group-wise merge via ``left_by`` and, with sides swapped, ``right_by``."""
        left = pd.concat([left, left], ignore_index=True)

        left["group"] = ["a"] * 3 + ["b"] * 3

        result = merge_ordered(
            left, right, on="key", left_by="group", fill_method="ffill"
        )
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"] * 2,
                "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
                "rvalue": [np.nan, 1, 2, 3, 3, 4] * 2,
            }
        )
        expected["group"] = ["a"] * 6 + ["b"] * 6

        # Compare with columns aligned to the result's column order.
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        result2 = merge_ordered(
            right, left, on="key", right_by="group", fill_method="ffill"
        )
        tm.assert_frame_equal(result, result2.loc[:, result.columns])

        # Without filling, the group column must still be fully populated.
        result = merge_ordered(left, right, on="key", left_by="group")
        assert result["group"].notna().all()

    @pytest.mark.filterwarnings(
        "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
    )
    def test_merge_type(self, left, right):
        """``merge`` on a DataFrame subclass returns that subclass."""

        class NotADataFrame(DataFrame):
            @property
            def _constructor(self):
                return NotADataFrame

        nad = NotADataFrame(left)
        result = nad.merge(right, on="key")

        assert isinstance(result, NotADataFrame)

    @pytest.mark.parametrize(
        "df_seq, pattern",
        [
            ((), "[Nn]o objects"),
            ([], "[Nn]o objects"),
            ({}, "[Nn]o objects"),
            ([None], "objects.*None"),
            ([None, None], "objects.*None"),
        ],
    )
    def test_empty_sequence_concat(self, df_seq, pattern):
        """``concat`` raises ValueError for empty or all-None input."""
        # GH 9157
        with pytest.raises(ValueError, match=pattern):
            pd.concat(df_seq)

    @pytest.mark.parametrize(
        "arg", [[DataFrame()], [None, DataFrame()], [DataFrame(), None]]
    )
    def test_empty_sequence_concat_ok(self, arg):
        """``concat`` accepts input that contains at least one DataFrame."""
        pd.concat(arg)

    def test_doc_example(self):
        """Forward-filled, group-wise merge with no explicit ``on``."""
        left = DataFrame(
            {
                "group": list("aaabbb"),
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3] * 2,
            }
        )

        right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

        result = merge_ordered(left, right, fill_method="ffill", left_by="group")

        expected = DataFrame(
            {
                "group": list("aaaaabbbbb"),
                "key": ["a", "b", "c", "d", "e"] * 2,
                "lvalue": [1, 1, 2, 2, 3] * 2,
                "rvalue": [np.nan, 1, 2, 3, 3] * 2,
            }
        )

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "left, right, on, left_by, right_by, expected",
        [
            (
                {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]},
                {"T": [2], "E": [1]},
                ["T"],
                ["G", "H"],
                None,
                {
                    "G": ["g"] * 3,
                    "H": ["h"] * 3,
                    "T": [1, 2, 3],
                    "E": [np.nan, 1.0, np.nan],
                },
            ),
            (
                {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]},
                {"T": [2], "E": [1]},
                "T",
                ["G", "H"],
                None,
                {
                    "G": ["g"] * 3,
                    "H": ["h"] * 3,
                    "T": [1, 2, 3],
                    "E": [np.nan, 1.0, np.nan],
                },
            ),
            (
                {"T": [2], "E": [1]},
                {"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]},
                ["T"],
                None,
                ["G", "H"],
                {
                    "T": [1, 2, 3],
                    "E": [np.nan, 1.0, np.nan],
                    "G": ["g"] * 3,
                    "H": ["h"] * 3,
                },
            ),
        ],
    )
    def test_list_type_by(self, left, right, on, left_by, right_by, expected):
        """``left_by``/``right_by`` given as lists of column names."""
        # GH 35269
        left = DataFrame(left)
        right = DataFrame(right)
        result = merge_ordered(
            left=left,
            right=right,
            on=on,
            left_by=left_by,
            right_by=right_by,
        )
        expected = DataFrame(expected)

        tm.assert_frame_equal(result, expected)

    def test_left_by_length_equals_to_right_shape0(self):
        """Two ``left_by`` columns with a two-column, one-row right frame."""
        # GH 38166
        left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
        right = DataFrame([[2, 1]], columns=list("ET"))
        result = merge_ordered(left, right, on="E", left_by=["G", "H"])
        expected = DataFrame(
            {"G": ["g"] * 3, "H": ["h"] * 3, "E": [1, 2, 3], "T": [np.nan, 1.0, np.nan]}
        )

        tm.assert_frame_equal(result, expected)

    def test_elements_not_in_by_but_in_df(self):
        """A ``left_by`` entry that is not a left column raises KeyError."""
        # GH 38167
        left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHE"))
        right = DataFrame([[2, 1]], columns=list("ET"))
        msg = r"\{'h'\} not found in left columns"
        with pytest.raises(KeyError, match=msg):
            merge_ordered(left, right, on="E", left_by=["G", "h"])

    @pytest.mark.parametrize("invalid_method", ["linear", "carrot"])
    def test_ffill_validate_fill_method(self, left, right, invalid_method):
        """An unsupported ``fill_method`` raises ValueError."""
        # GH 55884
        with pytest.raises(
            ValueError, match=re.escape("fill_method must be 'ffill' or None")
        ):
            merge_ordered(left, right, on="key", fill_method=invalid_method)

    def test_ffill_left_merge(self):
        """Forward fill combined with ``how="left"`` and ``left_by``."""
        # GH 57010
        df1 = DataFrame(
            {
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3, 1, 2, 3],
                "group": ["a", "a", "a", "b", "b", "b"],
            }
        )
        df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
        result = merge_ordered(
            df1, df2, fill_method="ffill", left_by="group", how="left"
        )
        expected = DataFrame(
            {
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3, 1, 2, 3],
                "group": ["a", "a", "a", "b", "b", "b"],
                "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0],
            }
        )
        tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,930 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
RangeIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.reshape.concat import concat
|
||||
from pandas.core.reshape.merge import merge
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    """left dataframe (not multi-indexed) for multi-index join tests"""
    # a little relevant example with NAs
    key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
    key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]

    # Seeded generator, so the values are the same on every run.
    data = np.random.default_rng(2).standard_normal(len(key1))
    return DataFrame({"key1": key1, "key2": key2, "data": data})
|
||||
|
||||
|
||||
@pytest.fixture
def right(multiindex_dataframe_random_data):
    """right dataframe (multi-indexed) for multi-index join tests"""
    df = multiindex_dataframe_random_data
    # Rename the index levels and the columns on the incoming frame itself.
    df.index.names = ["key1", "key2"]

    df.columns = ["j_one", "j_two", "j_three"]
    return df
|
||||
|
||||
|
||||
@pytest.fixture
def left_multi():
    """Trips frame indexed by Origin, Destination, Period and TripPurp."""
    return DataFrame(
        {
            "Origin": ["A", "A", "B", "B", "C"],
            "Destination": ["A", "B", "A", "C", "A"],
            "Period": ["AM", "AM", "IP", "AM", "OP"],
            "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
            "Trips": [1987, 3647, 2470, 4296, 4444],
        },
        columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
    ).set_index(["Origin", "Destination", "Period", "TripPurp"])
|
||||
|
||||
|
||||
@pytest.fixture
def right_multi():
    """Distance frame indexed by Origin, Destination, Period and LinkType."""
    return DataFrame(
        {
            "Origin": ["A", "A", "B", "B", "C", "C", "E"],
            "Destination": ["A", "B", "A", "B", "A", "B", "F"],
            "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
            "LinkType": ["a", "b", "c", "b", "a", "b", "a"],
            "Distance": [100, 80, 90, 80, 75, 35, 55],
        },
        columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
    ).set_index(["Origin", "Destination", "Period", "LinkType"])
|
||||
|
||||
|
||||
@pytest.fixture
def on_cols_multi():
    """Names of the three index levels that ``left_multi`` and ``right_multi`` share."""
    return ["Origin", "Destination", "Period"]
|
||||
|
||||
|
||||
class TestMergeMulti:
|
||||
def test_merge_on_multikey(self, left, right, join_type):
    """``join`` on two key columns agrees with ``merge`` on the reset frame."""
    on_cols = ["key1", "key2"]

    # First with the default sort setting, then with sort=True.
    for extra in ({}, {"sort": True}):
        result = left.join(right, on=on_cols, how=join_type, **extra).reset_index(
            drop=True
        )

        expected = merge(
            left, right.reset_index(), on=on_cols, how=join_type, **extra
        )

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_left_join_multi_index(self, sort, infer_string):
    """Left join on three key columns, first on clean keys, then with NaNs."""
    with option_context("future.infer_string", infer_string):
        icols = ["1st", "2nd", "3rd"]

        def bind_cols(df):
            # Fold the three key columns into a single numeric column.
            def iord(a):
                # ``a != a`` is true only for NaN, which gets ordinal 0.
                return 0 if a != a else ord(a)

            def f(ts):
                return ts.map(iord) - ord("a")

            return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10

        def run_asserts(left, right, sort):
            res = left.join(right, on=icols, how="left", sort=sort)

            assert len(left) <= len(res)
            assert not res["4th"].isna().any()
            assert not res["5th"].isna().any()

            # "5th" was built as the negation of what "4th" holds.
            tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res["4th"], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))

            out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")

            res.index = RangeIndex(len(res))
            tm.assert_frame_equal(out, res)

        lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
        left = DataFrame(
            np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
        )
        # Explicit cast to float to avoid implicit cast when setting nan
        left.insert(
            1,
            "2nd",
            np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
        )
        right = left.sample(frac=1, random_state=np.random.default_rng(2))

        left["4th"] = bind_cols(left)
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

        # inject some nulls
        left.loc[1::4, "1st"] = np.nan
        left.loc[2::5, "2nd"] = np.nan
        left.loc[3::6, "3rd"] = np.nan
        left["4th"] = bind_cols(left)

        i = np.random.default_rng(2).permutation(len(left))
        right = left.iloc[i, :-1]
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)
|
||||
|
||||
def test_merge_right_vs_left(self, left, right, sort):
    """A left merge equals the mirrored right merge with operands swapped."""
    # compare left vs right merge with multikey
    on_cols = ["key1", "key2"]
    merged_left_right = left.merge(
        right, left_on=on_cols, right_index=True, how="left", sort=sort
    )

    merge_right_left = right.merge(
        left, right_on=on_cols, left_index=True, how="right", sort=sort
    )

    # Reorder columns
    merge_right_left = merge_right_left[merged_left_right.columns]

    tm.assert_frame_equal(merged_left_right, merge_right_left)
|
||||
|
||||
def test_merge_multiple_cols_with_mixed_cols_index(self):
|
||||
# GH29522
|
||||
s = Series(
|
||||
range(6),
|
||||
MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]),
|
||||
name="Amount",
|
||||
)
|
||||
df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0})
|
||||
result = merge(df, s.reset_index(), on=["lev1", "lev2"])
|
||||
expected = DataFrame(
|
||||
{
|
||||
"lev1": list("AAABBB"),
|
||||
"lev2": [1, 2, 3, 1, 2, 3],
|
||||
"col": [0] * 6,
|
||||
"Amount": range(6),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_compress_group_combinations(self):
|
||||
# ~ 40000000 possible unique groups
|
||||
key1 = [str(i) for i in range(10000)]
|
||||
key1 = np.tile(key1, 2)
|
||||
key2 = key1[::-1]
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"key1": key1,
|
||||
"key2": key2,
|
||||
"value1": np.random.default_rng(2).standard_normal(20000),
|
||||
}
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"key1": key1[::2],
|
||||
"key2": key2[::2],
|
||||
"value2": np.random.default_rng(2).standard_normal(10000),
|
||||
}
|
||||
)
|
||||
|
||||
# just to hit the label compression code path
|
||||
merge(df, df2, how="outer")
|
||||
|
||||
def test_left_join_index_preserve_order(self):
|
||||
on_cols = ["k1", "k2"]
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"v": np.array(np.arange(24), dtype=np.int64),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result.sort_values(on_cols, kind="mergesort", inplace=True)
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# test join with multi dtypes blocks
|
||||
left = DataFrame(
|
||||
{
|
||||
"k1": [0, 1, 2] * 8,
|
||||
"k2": ["foo", "bar"] * 12,
|
||||
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
|
||||
"v": np.array(np.arange(24), dtype=np.int32),
|
||||
}
|
||||
)
|
||||
|
||||
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
|
||||
right = DataFrame({"v2": [5, 7]}, index=index)
|
||||
|
||||
result = left.join(right, on=on_cols)
|
||||
|
||||
expected = left.copy()
|
||||
expected["v2"] = np.nan
|
||||
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
|
||||
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = result.sort_values(on_cols, kind="mergesort")
|
||||
expected = left.join(right, on=on_cols, sort=True)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match_multiindex(self):
|
||||
left = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a"],
|
||||
["W", "Y", "C", "e"],
|
||||
["V", "Q", "A", "h"],
|
||||
["V", "R", "D", "i"],
|
||||
["X", "Y", "D", "b"],
|
||||
["X", "Y", "A", "c"],
|
||||
["W", "Q", "B", "f"],
|
||||
["W", "R", "C", "g"],
|
||||
["V", "Y", "C", "j"],
|
||||
["X", "Y", "B", "d"],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag"],
|
||||
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["W", "R", "C", 0],
|
||||
["W", "Q", "B", 3],
|
||||
["W", "Q", "B", 8],
|
||||
["X", "Y", "A", 1],
|
||||
["X", "Y", "A", 4],
|
||||
["X", "Y", "B", 5],
|
||||
["X", "Y", "C", 6],
|
||||
["X", "Y", "C", 9],
|
||||
["X", "Q", "C", -6],
|
||||
["X", "R", "C", -9],
|
||||
["V", "Y", "C", 7],
|
||||
["V", "R", "D", 2],
|
||||
["V", "R", "D", -1],
|
||||
["V", "Q", "A", -3],
|
||||
],
|
||||
columns=["col1", "col2", "col3", "val"],
|
||||
).set_index(["col1", "col2", "col3"])
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["X", "Y", "C", "a", 6],
|
||||
["X", "Y", "C", "a", 9],
|
||||
["W", "Y", "C", "e", np.nan],
|
||||
["V", "Q", "A", "h", -3],
|
||||
["V", "R", "D", "i", 2],
|
||||
["V", "R", "D", "i", -1],
|
||||
["X", "Y", "D", "b", np.nan],
|
||||
["X", "Y", "A", "c", 1],
|
||||
["X", "Y", "A", "c", 4],
|
||||
["W", "Q", "B", "f", 3],
|
||||
["W", "Q", "B", "f", 8],
|
||||
["W", "R", "C", "g", 0],
|
||||
["V", "Y", "C", "j", 7],
|
||||
["X", "Y", "B", "d", 5],
|
||||
],
|
||||
columns=["cola", "colb", "colc", "tag", "val"],
|
||||
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
|
||||
|
||||
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_join_index_multi_match(self):
|
||||
left = DataFrame(
|
||||
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
|
||||
columns=["tag", "val"],
|
||||
index=[2, 0, 1, 3],
|
||||
)
|
||||
|
||||
right = DataFrame(
|
||||
[
|
||||
["a", "v"],
|
||||
["c", "w"],
|
||||
["c", "x"],
|
||||
["d", "y"],
|
||||
["a", "z"],
|
||||
["c", "r"],
|
||||
["e", "q"],
|
||||
["c", "s"],
|
||||
],
|
||||
columns=["tag", "char"],
|
||||
).set_index("tag")
|
||||
|
||||
result = left.join(right, on="tag", how="left")
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
["c", 0, "w"],
|
||||
["c", 0, "x"],
|
||||
["c", 0, "r"],
|
||||
["c", 0, "s"],
|
||||
["b", 1, np.nan],
|
||||
["a", 2, "v"],
|
||||
["a", 2, "z"],
|
||||
["b", 3, np.nan],
|
||||
],
|
||||
columns=["tag", "val", "char"],
|
||||
index=[2, 2, 2, 2, 0, 1, 1, 3],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = left.join(right, on="tag", how="left", sort=True)
|
||||
expected2 = expected.sort_values("tag", kind="mergesort")
|
||||
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
|
||||
# GH7331 - maintain left frame order in left merge
|
||||
result = merge(left, right.reset_index(), how="left", on="tag")
|
||||
expected.index = RangeIndex(len(expected))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_left_merge_na_buglet(self):
|
||||
left = DataFrame(
|
||||
{
|
||||
"id": list("abcde"),
|
||||
"v1": np.random.default_rng(2).standard_normal(5),
|
||||
"v2": np.random.default_rng(2).standard_normal(5),
|
||||
"dummy": list("abcde"),
|
||||
"v3": np.random.default_rng(2).standard_normal(5),
|
||||
},
|
||||
columns=["id", "v1", "v2", "dummy", "v3"],
|
||||
)
|
||||
right = DataFrame(
|
||||
{
|
||||
"id": ["a", "b", np.nan, np.nan, np.nan],
|
||||
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
result = merge(left, right, on="id", how="left")
|
||||
|
||||
rdf = right.drop(["id"], axis=1)
|
||||
expected = left.join(rdf)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_merge_na_keys(self):
|
||||
data = [
|
||||
[1950, "A", 1.5],
|
||||
[1950, "B", 1.5],
|
||||
[1955, "B", 1.5],
|
||||
[1960, "B", np.nan],
|
||||
[1970, "B", 4.0],
|
||||
[1950, "C", 4.0],
|
||||
[1960, "C", np.nan],
|
||||
[1965, "C", 3.0],
|
||||
[1970, "C", 4.0],
|
||||
]
|
||||
|
||||
frame = DataFrame(data, columns=["year", "panel", "data"])
|
||||
|
||||
other_data = [
|
||||
[1960, "A", np.nan],
|
||||
[1970, "A", np.nan],
|
||||
[1955, "A", np.nan],
|
||||
[1965, "A", np.nan],
|
||||
[1965, "B", np.nan],
|
||||
[1955, "C", np.nan],
|
||||
]
|
||||
other = DataFrame(other_data, columns=["year", "panel", "data"])
|
||||
|
||||
result = frame.merge(other, how="outer")
|
||||
|
||||
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
|
||||
expected = expected.replace(-999, np.nan)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
    """Merge on a vector of years taken from a DatetimeIndex.

    ``klass`` optionally wraps the year vector in another container type.
    """
    # see gh-19038
    df = DataFrame(
        [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
    )
    df.index = pd.to_datetime(df.index)
    years = df.index.year
    on_vector = years if klass is None else klass(years)

    exp_years = np.array([2016, 2017, 2018], dtype=np.int32)
    expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})

    result = df.merge(df, on=["a", on_vector], how="inner")
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})

    # This second merge uses the year index directly, not ``on_vector``.
    result = df.merge(df, on=[df.index.year], how="inner")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("merge_type", ["left", "right"])
def test_merge_datetime_multi_index_empty_df(self, merge_type):
    """Merge and join a (date, panel)-indexed frame with an empty one."""
    # see gh-36895
    index_tuples = [[Timestamp("1950-01-01"), "A"], [Timestamp("1950-01-02"), "B"]]

    left = DataFrame(
        data={
            "data": [1.5, 1.5],
        },
        index=MultiIndex.from_tuples(index_tuples, names=["date", "panel"]),
    )

    right = DataFrame(
        index=MultiIndex.from_tuples([], names=["date", "panel"]), columns=["state"]
    )

    expected_index = MultiIndex.from_tuples(index_tuples, names=["date", "panel"])

    # The two branches differ in operand order and in expected column order.
    if merge_type == "left":
        expected = DataFrame(
            data={
                "data": [1.5, 1.5],
                "state": np.array([np.nan, np.nan], dtype=object),
            },
            index=expected_index,
        )
        results_merge = left.merge(right, how="left", on=["date", "panel"])
        results_join = left.join(right, how="left")
    else:
        expected = DataFrame(
            data={
                "state": np.array([np.nan, np.nan], dtype=object),
                "data": [1.5, 1.5],
            },
            index=expected_index,
        )
        results_merge = right.merge(left, how="right", on=["date", "panel"])
        results_join = right.join(left, how="right")

    tm.assert_frame_equal(results_merge, expected)
    tm.assert_frame_equal(results_join, expected)
|
||||
|
||||
@pytest.fixture
def household(self):
    """Household frame indexed by ``household_id``."""
    household = DataFrame(
        {
            "household_id": [1, 2, 3],
            "male": [0, 1, 0],
            "wealth": [196087.3, 316478.7, 294750],
        },
        columns=["household_id", "male", "wealth"],
    ).set_index("household_id")
    return household
|
||||
|
||||
@pytest.fixture
def portfolio(self):
    """Frame of asset holdings indexed by (``household_id``, ``asset_id``).

    The last row (household 4) has NaN for both ``asset_id`` and ``name``.
    """
    asset_ids = [
        "nl0000301109",
        "nl0000289783",
        "gb00b03mlx29",
        "gb00b03mlx29",
        "lu0197800237",
        "nl0000289965",
        np.nan,
    ]
    asset_names = [
        "ABN Amro",
        "Robeco",
        "Royal Dutch Shell",
        "Royal Dutch Shell",
        "AAB Eastern Europe Equity Fund",
        "Postbank BioTech Fonds",
        np.nan,
    ]
    frame = DataFrame(
        {
            "household_id": [1, 2, 2, 3, 3, 3, 4],
            "asset_id": asset_ids,
            "name": asset_names,
            "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
        },
        columns=["household_id", "asset_id", "name", "share"],
    )
    return frame.set_index(["household_id", "asset_id"])
|
||||
|
||||
@pytest.fixture
def expected(self):
    """Expected inner join of the ``household`` and ``portfolio`` fixtures.

    Six rows indexed by (``household_id``, ``asset_id``) with columns
    ``male``, ``wealth``, ``name`` and ``share`` in that order.
    """
    frame = DataFrame(
        {
            "male": [0, 1, 1, 0, 0, 0],
            "wealth": [
                196087.3,
                316478.7,
                316478.7,
                294750.0,
                294750.0,
                294750.0,
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
            ],
            "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
            "household_id": [1, 2, 2, 3, 3, 3],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
            ],
        }
    )
    indexed = frame.set_index(["household_id", "asset_id"])
    return indexed.reindex(columns=["male", "wealth", "name", "share"])
|
||||
|
||||
def test_join_multi_levels(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# GH 3662
|
||||
# merge multi-levels
|
||||
result = household.join(portfolio, how="inner")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
# equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
portfolio.reset_index(),
|
||||
on=["household_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_multi_levels_outer(self, portfolio, household, expected):
|
||||
portfolio = portfolio.copy()
|
||||
household = household.copy()
|
||||
|
||||
result = household.join(portfolio, how="outer")
|
||||
expected = concat(
|
||||
[
|
||||
expected,
|
||||
(
|
||||
DataFrame(
|
||||
{"share": [1.00]},
|
||||
index=MultiIndex.from_tuples(
|
||||
[(4, np.nan)], names=["household_id", "asset_id"]
|
||||
),
|
||||
)
|
||||
),
|
||||
],
|
||||
axis=0,
|
||||
sort=True,
|
||||
).reindex(columns=expected.columns)
|
||||
tm.assert_frame_equal(result, expected, check_index_type=False)
|
||||
|
||||
def test_join_multi_levels_invalid(self, portfolio, household):
    """Check the errors raised by joins that cannot be performed.

    Two cases are covered: joining frames whose index names do not
    overlap, and joining MultiIndex frames whose columns overlap when no
    suffix is given.
    """
    portfolio = portfolio.copy()
    household = household.copy()

    # invalid cases
    household.index.name = "foo"

    with pytest.raises(
        ValueError, match="cannot join with no overlapping index names"
    ):
        household.join(portfolio, how="inner")

    portfolio2 = portfolio.copy()
    # Index.set_names returns a new index rather than renaming in place, so
    # the result has to be assigned back for the rename to take effect.
    portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])

    with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
        portfolio2.join(portfolio, how="inner")
|
||||
|
||||
def test_join_multi_levels2(self):
|
||||
# some more advanced merges
|
||||
# GH6360
|
||||
household = DataFrame(
|
||||
{
|
||||
"household_id": [1, 2, 2, 3, 3, 3, 4],
|
||||
"asset_id": [
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
np.nan,
|
||||
],
|
||||
"share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
|
||||
},
|
||||
columns=["household_id", "asset_id", "share"],
|
||||
).set_index(["household_id", "asset_id"])
|
||||
|
||||
log_return = DataFrame(
|
||||
{
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 180, 181],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
).set_index(["asset_id", "t"])
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [2, 2, 2, 3, 3, 3, 3, 3],
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
],
|
||||
"t": [233, 234, 235, 233, 234, 235, 180, 181],
|
||||
"share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
# this is the equivalency
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="inner",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = (
|
||||
DataFrame(
|
||||
{
|
||||
"household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4],
|
||||
"asset_id": [
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"gb00b03mlx29",
|
||||
"lu0197800237",
|
||||
"lu0197800237",
|
||||
"nl0000289965",
|
||||
"nl0000301109",
|
||||
"nl0000301109",
|
||||
None,
|
||||
],
|
||||
"t": [
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
233,
|
||||
234,
|
||||
235,
|
||||
180,
|
||||
181,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
"share": [
|
||||
0.6,
|
||||
0.6,
|
||||
0.6,
|
||||
0.15,
|
||||
0.15,
|
||||
0.15,
|
||||
0.6,
|
||||
0.6,
|
||||
0.25,
|
||||
1.0,
|
||||
0.4,
|
||||
1.0,
|
||||
],
|
||||
"log_return": [
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.09604978,
|
||||
-0.06524096,
|
||||
0.03532373,
|
||||
0.03025441,
|
||||
0.036997,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
],
|
||||
}
|
||||
)
|
||||
.set_index(["household_id", "asset_id", "t"])
|
||||
.reindex(columns=["share", "log_return"])
|
||||
)
|
||||
|
||||
result = merge(
|
||||
household.reset_index(),
|
||||
log_return.reset_index(),
|
||||
on=["asset_id"],
|
||||
how="outer",
|
||||
).set_index(["household_id", "asset_id", "t"])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestJoinMultiMulti:
    """Joins between two MultiIndex frames, checked against ``merge``."""

    def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
        """Compare ``DataFrame.join`` with ``merge`` on the reset indexes."""
        left_names = left_multi.index.names
        right_names = right_multi.index.names
        # The side named by the join type contributes its levels first.
        if join_type == "right":
            leading, trailing = right_names, left_names
        else:
            leading, trailing = left_names, right_names
        level_order = leading + trailing.difference(leading)

        # Multi-index join tests
        merged = merge(
            left_multi.reset_index(),
            right_multi.reset_index(),
            how=join_type,
            on=on_cols_multi,
        )
        expected = merged.set_index(level_order).sort_index()

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    def test_join_multi_empty_frames(
        self, left_multi, right_multi, join_type, on_cols_multi
    ):
        """Same comparison as above after dropping every column from both frames."""
        left_multi = left_multi.drop(columns=left_multi.columns)
        right_multi = right_multi.drop(columns=right_multi.columns)

        left_names = left_multi.index.names
        right_names = right_multi.index.names
        if join_type == "right":
            leading, trailing = right_names, left_names
        else:
            leading, trailing = left_names, right_names
        level_order = leading + trailing.difference(leading)

        merged = merge(
            left_multi.reset_index(),
            right_multi.reset_index(),
            how=join_type,
            on=on_cols_multi,
        )
        expected = merged.set_index(level_order).sort_index()

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, box):
        """Merge on a key vector derived from a DatetimeIndex."""
        # see gh-19038
        df = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        df.index = pd.to_datetime(df.index)
        years = df.index.year
        on_vector = years if box is None else box(years)

        exp_years = np.array([2016, 2017, 2018], dtype=np.int32)

        # Column name combined with the key vector.
        result = df.merge(df, on=["a", on_vector], how="inner")
        expected = DataFrame({"a": [1, 2, 3], "key_1": exp_years})
        tm.assert_frame_equal(result, expected)

        # Key vector on its own.
        result = df.merge(df, on=[df.index.year], how="inner")
        expected = DataFrame({"key_0": exp_years, "a_x": [1, 2, 3], "a_y": [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    def test_single_common_level(self):
        """Join two MultiIndex frames that share only the ``key`` level."""
        left = DataFrame(
            {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
            index=MultiIndex.from_tuples(
                [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
            ),
        )
        right = DataFrame(
            {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
            index=MultiIndex.from_tuples(
                [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")],
                names=["key", "Y"],
            ),
        )

        result = left.join(right)

        merged = merge(
            left.reset_index(), right.reset_index(), on=["key"], how="inner"
        )
        expected = merged.set_index(["key", "X", "Y"])

        tm.assert_frame_equal(result, expected)

    def test_join_multi_wrong_order(self):
        """Join frames whose MultiIndex levels are named alike but ordered differently."""
        # GH 25760
        # GH 28956
        midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
        midx3 = MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])

        left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
        right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})

        expected = DataFrame(
            index=midx1,
            data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
        )

        result = left.join(right)
        tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,879 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
crosstab,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Frame with categorical-like columns A/B/C and random columns D/E/F.

    The eleven base rows are stacked twice, so the returned frame has
    twenty-two rows with a fresh integer index.
    """
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    base = DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            # each column draws from its own generator seeded with 2
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    return pd.concat([base, base], ignore_index=True)
|
||||
|
||||
|
||||
class TestCrosstab:
|
||||
def test_crosstab_single(self, df):
|
||||
result = crosstab(df["A"], df["C"])
|
||||
expected = df.groupby(["A", "C"]).size().unstack()
|
||||
tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
|
||||
|
||||
def test_crosstab_multiple(self, df):
|
||||
result = crosstab(df["A"], [df["B"], df["C"]])
|
||||
expected = df.groupby(["A", "B", "C"]).size()
|
||||
expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = crosstab([df["B"], df["C"]], df["A"])
|
||||
expected = df.groupby(["B", "C", "A"]).size()
|
||||
expected = expected.unstack("A").fillna(0).astype(np.int64)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("box", [np.array, list, tuple])
def test_crosstab_ndarray(self, box):
    """Crosstab of boxed arrays matches crosstab of the equivalent Series."""
    # GH 44076
    # each vector is drawn from its own generator seeded with 2
    a, b, c = (
        box(np.random.default_rng(2).integers(0, high, size=100))
        for high in (5, 3, 10)
    )

    df = DataFrame({"a": a, "b": b, "c": c})

    from_arrays = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
    from_series = crosstab(df["a"], [df["b"], df["c"]])
    tm.assert_frame_equal(from_arrays, from_series)

    from_arrays = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
    from_series = crosstab([df["b"], df["c"]], df["a"])
    tm.assert_frame_equal(from_arrays, from_series)

    # assign arbitrary names
    from_arrays = crosstab(a, c)
    from_series = crosstab(df["a"], df["c"])
    from_series.index.names = ["row_0"]
    from_series.columns.names = ["col_0"]
    tm.assert_frame_equal(from_arrays, from_series)
|
||||
|
||||
def test_crosstab_non_aligned(self):
|
||||
# GH 17005
|
||||
a = Series([0, 1, 1], index=["a", "b", "c"])
|
||||
b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
|
||||
c = np.array([3, 4, 3], dtype=np.int64)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 0], [1, 1]],
|
||||
index=Index([0, 1], name="row_0"),
|
||||
columns=Index([3, 4], name="col_0"),
|
||||
)
|
||||
|
||||
result = crosstab(a, b)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = crosstab(a, c)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_margins(self):
|
||||
a = np.random.default_rng(2).integers(0, 7, size=100)
|
||||
b = np.random.default_rng(2).integers(0, 3, size=100)
|
||||
c = np.random.default_rng(2).integers(0, 5, size=100)
|
||||
|
||||
df = DataFrame({"a": a, "b": b, "c": c})
|
||||
|
||||
result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
|
||||
|
||||
assert result.index.names == ("a",)
|
||||
assert result.columns.names == ["b", "c"]
|
||||
|
||||
all_cols = result["All", ""]
|
||||
exp_cols = df.groupby(["a"]).size().astype("i8")
|
||||
# to keep index.name
|
||||
exp_margin = Series([len(df)], index=Index(["All"], name="a"))
|
||||
exp_cols = pd.concat([exp_cols, exp_margin])
|
||||
exp_cols.name = ("All", "")
|
||||
|
||||
tm.assert_series_equal(all_cols, exp_cols)
|
||||
|
||||
all_rows = result.loc["All"]
|
||||
exp_rows = df.groupby(["b", "c"]).size().astype("i8")
|
||||
exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
|
||||
exp_rows.name = "All"
|
||||
|
||||
exp_rows = exp_rows.reindex(all_rows.index)
|
||||
exp_rows = exp_rows.fillna(0).astype(np.int64)
|
||||
tm.assert_series_equal(all_rows, exp_rows)
|
||||
|
||||
def test_crosstab_margins_set_margin_name(self):
    """Check margins labelled through ``margins_name`` and reject non-string names."""
    # GH 15972
    a = np.random.default_rng(2).integers(0, 7, size=100)
    b = np.random.default_rng(2).integers(0, 3, size=100)
    c = np.random.default_rng(2).integers(0, 5, size=100)

    df = DataFrame({"a": a, "b": b, "c": c})
    total = len(df)

    result = crosstab(
        a,
        [b, c],
        rownames=["a"],
        colnames=("b", "c"),
        margins=True,
        margins_name="TOTAL",
    )

    assert result.index.names == ("a",)
    assert result.columns.names == ["b", "c"]

    # margin column: per-row counts followed by the grand total
    all_cols = result["TOTAL", ""]
    row_counts = df.groupby(["a"]).size().astype("i8")
    # to keep index.name
    row_total = Series([total], index=Index(["TOTAL"], name="a"))
    exp_cols = pd.concat([row_counts, row_total])
    exp_cols.name = ("TOTAL", "")

    tm.assert_series_equal(all_cols, exp_cols)

    # margin row: per-column counts followed by the grand total
    all_rows = result.loc["TOTAL"]
    col_counts = df.groupby(["b", "c"]).size().astype("i8")
    col_total = Series([total], index=[("TOTAL", "")])
    exp_rows = pd.concat([col_counts, col_total])
    exp_rows.name = "TOTAL"

    exp_rows = exp_rows.reindex(all_rows.index).fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)

    # anything other than a string is rejected
    msg = "margins_name argument must be a string"
    for margins_name in [666, None, ["a", "b"]]:
        with pytest.raises(ValueError, match=msg):
            crosstab(
                a,
                [b, c],
                rownames=["a"],
                colnames=("b", "c"),
                margins=True,
                margins_name=margins_name,
            )
|
||||
|
||||
def test_crosstab_pass_values(self):
|
||||
a = np.random.default_rng(2).integers(0, 7, size=100)
|
||||
b = np.random.default_rng(2).integers(0, 3, size=100)
|
||||
c = np.random.default_rng(2).integers(0, 5, size=100)
|
||||
values = np.random.default_rng(2).standard_normal(100)
|
||||
|
||||
table = crosstab(
|
||||
[a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"]
|
||||
)
|
||||
|
||||
df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
|
||||
|
||||
expected = df.pivot_table(
|
||||
"values", index=["foo", "bar"], columns="baz", aggfunc="sum"
|
||||
)
|
||||
tm.assert_frame_equal(table, expected)
|
||||
|
||||
def test_crosstab_dropna(self):
|
||||
# GH 3820
|
||||
a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
|
||||
b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
|
||||
c = np.array(
|
||||
["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
|
||||
)
|
||||
res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
|
||||
m = MultiIndex.from_tuples(
|
||||
[("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
|
||||
names=["b", "c"],
|
||||
)
|
||||
tm.assert_index_equal(res.columns, m)
|
||||
|
||||
def test_crosstab_no_overlap(self):
|
||||
# GS 10291
|
||||
|
||||
s1 = Series([1, 2, 3], index=[1, 2, 3])
|
||||
s2 = Series([4, 5, 6], index=[4, 5, 6])
|
||||
|
||||
actual = crosstab(s1, s2)
|
||||
expected = DataFrame(
|
||||
index=Index([], dtype="int64", name="row_0"),
|
||||
columns=Index([], dtype="int64", name="col_0"),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_margin_dropna(self):
|
||||
# GH 12577
|
||||
# pivot_table counts null into margin ('All')
|
||||
# when margins=true and dropna=true
|
||||
|
||||
df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
|
||||
actual = crosstab(df.a, df.b, margins=True, dropna=True)
|
||||
expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
|
||||
expected.index = Index([1.0, 2.0, "All"], name="a")
|
||||
expected.columns = Index([3, 4, "All"], name="b")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_margin_dropna2(self):
|
||||
df = DataFrame(
|
||||
{"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
|
||||
)
|
||||
actual = crosstab(df.a, df.b, margins=True, dropna=True)
|
||||
expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
|
||||
expected.index = Index([1.0, 2.0, "All"], name="a")
|
||||
expected.columns = Index([3.0, 4.0, "All"], name="b")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_margin_dropna3(self):
|
||||
df = DataFrame(
|
||||
{"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
|
||||
)
|
||||
actual = crosstab(df.a, df.b, margins=True, dropna=True)
|
||||
expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
|
||||
expected.index = Index([1.0, 2.0, "All"], name="a")
|
||||
expected.columns = Index([3, 4, "All"], name="b")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_margin_dropna4(self):
|
||||
# GH 12642
|
||||
# _add_margins raises KeyError: Level None not found
|
||||
# when margins=True and dropna=False
|
||||
# GH: 10772: Keep np.nan in result with dropna=False
|
||||
df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
|
||||
actual = crosstab(df.a, df.b, margins=True, dropna=False)
|
||||
expected = DataFrame([[1, 0, 1], [1, 3, 4], [0, 1, 1], [2, 4, 6]])
|
||||
expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
|
||||
expected.columns = Index([3, 4, "All"], name="b")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_margin_dropna5(self):
|
||||
# GH: 10772: Keep np.nan in result with dropna=False
|
||||
df = DataFrame(
|
||||
{"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
|
||||
)
|
||||
actual = crosstab(df.a, df.b, margins=True, dropna=False)
|
||||
expected = DataFrame(
|
||||
[[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, 4.0], [1, 4, 1, 6.0]]
|
||||
)
|
||||
expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
|
||||
expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b")
|
||||
tm.assert_frame_equal(actual, expected, check_dtype=False)
|
||||
|
||||
def test_margin_dropna6(self):
|
||||
# GH: 10772: Keep np.nan in result with dropna=False
|
||||
a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
|
||||
b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
|
||||
c = np.array(
|
||||
["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
|
||||
)
|
||||
|
||||
actual = crosstab(
|
||||
a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
|
||||
)
|
||||
m = MultiIndex.from_arrays(
|
||||
[
|
||||
["one", "one", "two", "two", np.nan, np.nan, "All"],
|
||||
["dull", "shiny", "dull", "shiny", "dull", "shiny", ""],
|
||||
],
|
||||
names=["b", "c"],
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 1, 7]],
|
||||
columns=m,
|
||||
)
|
||||
expected.index = Index(["bar", "foo", "All"], name="a")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
actual = crosstab(
|
||||
[a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
|
||||
)
|
||||
m = MultiIndex.from_arrays(
|
||||
[
|
||||
["bar", "bar", "bar", "foo", "foo", "foo", "All"],
|
||||
["one", "two", np.nan, "one", "two", np.nan, ""],
|
||||
],
|
||||
names=["a", "b"],
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1, 0, 1.0],
|
||||
[1, 0, 1.0],
|
||||
[0, 0, np.nan],
|
||||
[2, 0, 2.0],
|
||||
[1, 1, 2.0],
|
||||
[0, 1, 1.0],
|
||||
[5, 2, 7.0],
|
||||
],
|
||||
index=m,
|
||||
)
|
||||
expected.columns = Index(["dull", "shiny", "All"], name="c")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
actual = crosstab(
|
||||
[a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
|
||||
)
|
||||
m = MultiIndex.from_arrays(
|
||||
[["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
|
||||
names=["a", "b"],
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
|
||||
)
|
||||
expected.columns = Index(["dull", "shiny", "All"], name="c")
|
||||
tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_crosstab_normalize(self):
|
||||
# Issue 12578
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
|
||||
)
|
||||
|
||||
rindex = Index([1, 2], name="a")
|
||||
cindex = Index([3, 4], name="b")
|
||||
full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
|
||||
row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
|
||||
col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
|
||||
|
||||
# Check all normalize args
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
|
||||
tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=1),
|
||||
crosstab(df.a, df.b, normalize="columns"),
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
|
||||
)
|
||||
|
||||
row_normal_margins = DataFrame(
|
||||
[[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4], name="b", dtype="object"),
|
||||
)
|
||||
col_normal_margins = DataFrame(
|
||||
[[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
|
||||
index=Index([1, 2], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
|
||||
all_normal_margins = DataFrame(
|
||||
[[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
|
||||
)
|
||||
tm.assert_frame_equal(
|
||||
crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
|
||||
)
|
||||
|
||||
def test_crosstab_normalize_arrays(self):
|
||||
# GH#12578
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
|
||||
)
|
||||
|
||||
# Test arrays
|
||||
crosstab(
|
||||
[np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
|
||||
)
|
||||
|
||||
# Test with aggfunc
|
||||
norm_counts = DataFrame(
|
||||
[[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b"),
|
||||
)
|
||||
test_case = crosstab(
|
||||
df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
|
||||
)
|
||||
tm.assert_frame_equal(test_case, norm_counts)
|
||||
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
|
||||
)
|
||||
|
||||
norm_sum = DataFrame(
|
||||
[[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
|
||||
index=Index([1, 2, "All"], name="a", dtype="object"),
|
||||
columns=Index([3, 4, "All"], name="b", dtype="object"),
|
||||
)
|
||||
test_case = crosstab(
|
||||
df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
|
||||
)
|
||||
tm.assert_frame_equal(test_case, norm_sum)
|
||||
|
||||
def test_crosstab_with_empties(self):
|
||||
# Check handling of empties
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 2, 2, 2],
|
||||
"b": [3, 3, 4, 4, 4],
|
||||
"c": [np.nan, np.nan, np.nan, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
|
||||
empty = DataFrame(
|
||||
[[0.0, 0.0], [0.0, 0.0]],
|
||||
index=Index([1, 2], name="a", dtype="int64"),
|
||||
columns=Index([3, 4], name="b"),
|
||||
)
|
||||
|
||||
for i in [True, "index", "columns"]:
|
||||
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
|
||||
tm.assert_frame_equal(empty, calculated)
|
||||
|
||||
nans = DataFrame(
|
||||
[[0.0, np.nan], [0.0, 0.0]],
|
||||
index=Index([1, 2], name="a", dtype="int64"),
|
||||
columns=Index([3, 4], name="b"),
|
||||
)
|
||||
|
||||
calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
|
||||
tm.assert_frame_equal(nans, calculated)
|
||||
|
||||
def test_crosstab_errors(self):
    """Invalid argument combinations raise ``ValueError`` with a specific message."""
    # Issue 12578
    df = DataFrame(
        {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
    )

    # (expected message, keyword arguments that trigger it)
    bad_calls = [
        ("values cannot be used without an aggfunc.", {"values": df.c}),
        ("aggfunc cannot be used without values", {"aggfunc": np.mean}),
        ("Not a valid normalize argument", {"normalize": "42"}),
        ("Not a valid normalize argument", {"normalize": 42}),
        ("Not a valid margins argument", {"normalize": "all", "margins": 42}),
    ]
    for error, kwargs in bad_calls:
        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, **kwargs)
|
||||
|
||||
def test_crosstab_with_categorial_columns(self):
|
||||
# GH 8860
|
||||
df = DataFrame(
|
||||
{
|
||||
"MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
|
||||
"MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
|
||||
}
|
||||
)
|
||||
categories = ["Sedan", "Electric", "Pickup"]
|
||||
df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
|
||||
result = crosstab(df["MAKE"], df["MODEL"])
|
||||
|
||||
expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
|
||||
expected_columns = CategoricalIndex(
|
||||
categories, categories=categories, ordered=False, name="MODEL"
|
||||
)
|
||||
expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
|
||||
expected = DataFrame(
|
||||
expected_data, index=expected_index, columns=expected_columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_with_numpy_size(self):
|
||||
# GH 4003
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["one", "one", "two", "three"] * 6,
|
||||
"B": ["A", "B", "C"] * 8,
|
||||
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
|
||||
"D": np.random.default_rng(2).standard_normal(24),
|
||||
"E": np.random.default_rng(2).standard_normal(24),
|
||||
}
|
||||
)
|
||||
result = crosstab(
|
||||
index=[df["A"], df["B"]],
|
||||
columns=[df["C"]],
|
||||
margins=True,
|
||||
aggfunc=np.size,
|
||||
values=df["D"],
|
||||
)
|
||||
expected_index = MultiIndex(
|
||||
levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
|
||||
codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected_column = Index(["bar", "foo", "All"], name="C")
|
||||
expected_data = np.array(
|
||||
[
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, 2.0, 4.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[2.0, np.nan, 2.0],
|
||||
[np.nan, 2.0, 2.0],
|
||||
[12.0, 12.0, 24.0],
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
expected_data, index=expected_index, columns=expected_column
|
||||
)
|
||||
# aggfunc is np.size, resulting in integers
|
||||
expected["All"] = expected["All"].astype("int64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_duplicate_names(self):
|
||||
# GH 13279 / 22529
|
||||
|
||||
s1 = Series(range(3), name="foo")
|
||||
s2_foo = Series(range(1, 4), name="foo")
|
||||
s2_bar = Series(range(1, 4), name="bar")
|
||||
s3 = Series(range(3), name="waldo")
|
||||
|
||||
# check result computed with duplicate labels against
|
||||
# result computed with unique labels, then relabelled
|
||||
mapper = {"bar": "foo"}
|
||||
|
||||
# duplicate row, column labels
|
||||
result = crosstab(s1, s2_foo)
|
||||
expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# duplicate row, unique column labels
|
||||
result = crosstab([s1, s2_foo], s3)
|
||||
expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# unique row, duplicate column labels
|
||||
result = crosstab(s3, [s1, s2_foo])
|
||||
expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
|
||||
def test_crosstab_tuple_name(self, names):
|
||||
s1 = Series(range(3), name=names[0])
|
||||
s2 = Series(range(1, 4), name=names[1])
|
||||
|
||||
mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
|
||||
expected = Series(1, index=mi).unstack(1, fill_value=0)
|
||||
|
||||
result = crosstab(s1, s2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_both_tuple_names(self):
|
||||
# GH 18321
|
||||
s1 = Series(range(3), name=("a", "b"))
|
||||
s2 = Series(range(3), name=("c", "d"))
|
||||
|
||||
expected = DataFrame(
|
||||
np.eye(3, dtype="int64"),
|
||||
index=Index(range(3), name=("a", "b")),
|
||||
columns=Index(range(3), name=("c", "d")),
|
||||
)
|
||||
result = crosstab(s1, s2)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_unsorted_order(self):
|
||||
df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
|
||||
result = crosstab(df.index, [df.b, df.a])
|
||||
e_idx = Index(["A", "B", "C"], name="row_0")
|
||||
e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
|
||||
expected = DataFrame(
|
||||
[[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_crosstab_normalize_multiple_columns(self):
|
||||
# GH 15150
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["one", "one", "two", "three"] * 6,
|
||||
"B": ["A", "B", "C"] * 8,
|
||||
"C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
|
||||
"D": [0] * 24,
|
||||
"E": [0] * 24,
|
||||
}
|
||||
)
|
||||
|
||||
result = crosstab(
|
||||
[df.A, df.B],
|
||||
df.C,
|
||||
values=df.D,
|
||||
aggfunc=np.sum,
|
||||
normalize=True,
|
||||
margins=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
|
||||
columns=Index(["bar", "foo", "All"], name="C"),
|
||||
index=MultiIndex.from_tuples(
|
||||
[
|
||||
("one", "A"),
|
||||
("one", "B"),
|
||||
("one", "C"),
|
||||
("three", "A"),
|
||||
("three", "B"),
|
||||
("three", "C"),
|
||||
("two", "A"),
|
||||
("two", "B"),
|
||||
("two", "C"),
|
||||
("All", ""),
|
||||
],
|
||||
names=["A", "B"],
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_normalize(self):
|
||||
# GH 27500
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
|
||||
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
|
||||
"C": [
|
||||
"small",
|
||||
"large",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
],
|
||||
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
|
||||
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
|
||||
}
|
||||
)
|
||||
# normalize on index
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
|
||||
)
|
||||
expected = DataFrame(
|
||||
[[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
|
||||
)
|
||||
expected.index = MultiIndex(
|
||||
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
|
||||
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected.columns = Index(["large", "small"], name="C")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# normalize on columns
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.25, 0.2, 0.222222],
|
||||
[0.25, 0.2, 0.222222],
|
||||
[0.5, 0.2, 0.333333],
|
||||
[0, 0.4, 0.222222],
|
||||
]
|
||||
)
|
||||
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
|
||||
expected.index = MultiIndex(
|
||||
levels=[["bar", "foo"], ["one", "two"]],
|
||||
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# normalize on both index and column
|
||||
result = crosstab(
|
||||
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.111111, 0.111111, 0.222222],
|
||||
[0.111111, 0.111111, 0.222222],
|
||||
[0.222222, 0.111111, 0.333333],
|
||||
[0.000000, 0.222222, 0.222222],
|
||||
[0.444444, 0.555555, 1],
|
||||
]
|
||||
)
|
||||
expected.columns = Index(["large", "small", "Sub-Total"], name="C")
|
||||
expected.index = MultiIndex(
|
||||
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
|
||||
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_normalize_multiple_columns(self):
|
||||
# GH 35144
|
||||
# use multiple columns with margins and normalization
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
|
||||
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
|
||||
"C": [
|
||||
"small",
|
||||
"large",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
"small",
|
||||
"small",
|
||||
"large",
|
||||
],
|
||||
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
|
||||
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
|
||||
}
|
||||
)
|
||||
result = crosstab(
|
||||
index=df.C,
|
||||
columns=[df.A, df.B],
|
||||
margins=True,
|
||||
margins_name="margin",
|
||||
normalize=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
|
||||
[0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
|
||||
[0.222222, 0.222222, 0.333333, 0.222222, 1.0],
|
||||
],
|
||||
index=["large", "small", "margin"],
|
||||
)
|
||||
expected.columns = MultiIndex(
|
||||
levels=[["bar", "foo", "margin"], ["", "one", "two"]],
|
||||
codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
|
||||
names=["A", "B"],
|
||||
)
|
||||
expected.index.name = "C"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_support_Float(self):
|
||||
# GH 50313
|
||||
# use Float64 formats and function aggfunc with margins
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
|
||||
dtype="Float64",
|
||||
)
|
||||
result = crosstab(
|
||||
df["A"],
|
||||
df["B"],
|
||||
values=df["C"],
|
||||
aggfunc="sum",
|
||||
margins=True,
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
[-1.0, pd.NA, 10.0, 9.0],
|
||||
[10.0, 1.0, pd.NA, 11.0],
|
||||
[9.0, 1.0, 10.0, 20.0],
|
||||
],
|
||||
index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
|
||||
columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
|
||||
dtype="Float64",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_margin_with_ordered_categorical_column(self):
|
||||
# GH 25278
|
||||
df = DataFrame(
|
||||
{
|
||||
"First": ["B", "B", "C", "A", "B", "C"],
|
||||
"Second": ["C", "B", "B", "B", "C", "A"],
|
||||
}
|
||||
)
|
||||
df["First"] = df["First"].astype(CategoricalDtype(ordered=True))
|
||||
customized_categories_order = ["C", "A", "B"]
|
||||
df["First"] = df["First"].cat.reorder_categories(customized_categories_order)
|
||||
result = crosstab(df["First"], df["Second"], margins=True)
|
||||
|
||||
expected_index = Index(["C", "A", "B", "All"], name="First")
|
||||
expected_columns = Index(["A", "B", "C", "All"], name="Second")
|
||||
expected_data = [[1, 1, 0, 2], [0, 1, 0, 1], [0, 1, 2, 3], [1, 3, 2, 6]]
|
||||
expected = DataFrame(
|
||||
expected_data, index=expected_index, columns=expected_columns
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("a_dtype", ["category", "int64"])
@pytest.mark.parametrize("b_dtype", ["category", "int64"])
def test_categoricals(a_dtype, b_dtype):
    """Check crosstab margins for categorical and int64 inputs.

    See https://github.com/pandas-dev/pandas/issues/37465. The second half
    rewrites every 1 in the row key to 2 and checks the table again.
    """
    rng = np.random.default_rng(2)
    left = Series(rng.integers(0, 3, size=100)).astype(a_dtype)
    right = Series(rng.integers(0, 2, size=100)).astype(b_dtype)

    col_labels = Index([0, 1, "All"], dtype="object", name="col_0")
    row_labels = Index([0, 1, 2, "All"], dtype="object", name="row_0")

    result = crosstab(left, right, margins=True, dropna=False)
    expected = DataFrame(
        [[10, 18, 28], [23, 16, 39], [17, 16, 33], [50, 50, 100]],
        row_labels,
        col_labels,
    )
    tm.assert_frame_equal(result, expected)

    # Verify when categorical does not have all values present
    left.loc[left == 1] = 2
    left_is_cat = isinstance(left.dtype, CategoricalDtype)
    assert not left_is_cat or left.value_counts().loc[1] == 0

    result = crosstab(left, right, margins=True, dropna=False)
    expected = DataFrame(
        [[10, 18, 28], [0, 0, 0], [40, 32, 72], [50, 50, 100]],
        row_labels,
        col_labels,
    )
    if not left_is_cat:
        # for a non-categorical key the expectation has no row for 1
        expected = expected.loc[[0, 2, "All"]]
        expected["All"] = expected["All"].astype("int64")
    tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,828 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
Series,
|
||||
TimedeltaIndex,
|
||||
Timestamp,
|
||||
cut,
|
||||
date_range,
|
||||
interval_range,
|
||||
isna,
|
||||
qcut,
|
||||
timedelta_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import CategoricalDtype
|
||||
import pandas.core.reshape.tile as tmod
|
||||
|
||||
|
||||
def test_simple():
    """Cutting five identical values into 4 bins gives code 1 for each."""
    ones = np.ones(5, dtype="int64")
    codes = cut(ones, 4, labels=False)

    tm.assert_numpy_array_equal(
        codes, np.array([1, 1, 1, 1, 1]), check_dtype=False
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", [list, np.array])
def test_bins(func):
    """Check categories and returned bin edges of a 3-bin cut."""
    values = func([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
    result, edges = cut(values, 3, retbins=True)

    # bin position expected for each input value
    positions = [0, 0, 0, 1, 2, 0]
    rounded = IntervalIndex.from_breaks(edges.round(3))
    expected = Categorical(rounded.take(positions), ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(edges, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
|
||||
|
||||
|
||||
def test_right():
    """Check a 4-bin cut with right=True, including a value on an edge."""
    values = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, edges = cut(values, 4, right=True, retbins=True)

    # build the categorical first so all four categories are kept by take
    all_bins = Categorical(IntervalIndex.from_breaks(edges.round(3)), ordered=True)
    expected = all_bins.take([0, 0, 0, 2, 3, 0, 0])

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(edges, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
|
||||
|
||||
|
||||
def test_no_right():
    """Check a 4-bin cut with right=False (left-closed intervals)."""
    values = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, edges = cut(values, 4, right=False, retbins=True)

    left_closed = IntervalIndex.from_breaks(edges.round(3), closed="left")
    expected = Categorical(left_closed.take([0, 0, 0, 2, 3, 0, 1]), ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(edges, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
|
||||
|
||||
|
||||
def test_bins_from_interval_index():
    """Check that the categories of a cut can be reused as bins."""
    baseline = cut(range(5), 3)

    # same data, bins taken from the first result
    recut = cut(range(5), bins=baseline.categories)
    tm.assert_categorical_equal(recut, baseline)

    # one extra value: the expectation gives it code -1 (missing)
    extended = Categorical.from_codes(
        np.append(baseline.codes, -1),
        categories=baseline.categories,
        ordered=True,
    )
    recut = cut(range(6), bins=extended.categories)
    tm.assert_categorical_equal(recut, extended)
|
||||
|
||||
|
||||
def test_bins_from_interval_index_doc_example():
    """Make sure explicit bins are preserved when reused as an IntervalIndex."""
    ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
    age_groups = cut(ages, bins=[0, 18, 35, 70])

    expected_bins = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
    tm.assert_index_equal(age_groups.categories, expected_bins)

    regrouped = cut([25, 20, 50], bins=age_groups.categories)
    tm.assert_index_equal(regrouped.categories, expected_bins)
    tm.assert_numpy_array_equal(
        regrouped.codes, np.array([1, 1, 2], dtype="int8")
    )
|
||||
|
||||
|
||||
def test_bins_not_overlapping_from_interval_index():
    """Overlapping IntervalIndex bins must raise ValueError (gh-23980)."""
    overlapping = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])

    with pytest.raises(
        ValueError, match="Overlapping IntervalIndex is not accepted"
    ):
        cut([5, 6], bins=overlapping)
|
||||
|
||||
|
||||
def test_bins_not_monotonic():
    """Bin edges that decrease at some point must raise ValueError."""
    values = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    unsorted_edges = [0.1, 1.5, 1, 10]

    with pytest.raises(ValueError, match="bins must increase monotonically"):
        cut(values, unsorted_edges)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "x, bins, expected",
    [
        # datetime bins spanning Timestamp.min .. Timestamp.max
        (
            date_range("2017-12-31", periods=3),
            [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
            IntervalIndex.from_tuples(
                [
                    (Timestamp.min, Timestamp("2018-01-01")),
                    (Timestamp("2018-01-01"), Timestamp.max),
                ]
            ),
        ),
        # int64 bins spanning the full int64 range
        (
            [-1, 0, 1],
            np.array(
                [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
            ),
            IntervalIndex.from_tuples(
                [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
            ),
        ),
        # timedelta64 bins spanning -int64 max .. +int64 max nanoseconds
        (
            [
                np.timedelta64(-1, "ns"),
                np.timedelta64(0, "ns"),
                np.timedelta64(1, "ns"),
            ],
            np.array(
                [
                    np.timedelta64(-np.iinfo(np.int64).max, "ns"),
                    np.timedelta64(0, "ns"),
                    np.timedelta64(np.iinfo(np.int64).max, "ns"),
                ]
            ),
            IntervalIndex.from_tuples(
                [
                    (
                        np.timedelta64(-np.iinfo(np.int64).max, "ns"),
                        np.timedelta64(0, "ns"),
                    ),
                    (
                        np.timedelta64(0, "ns"),
                        np.timedelta64(np.iinfo(np.int64).max, "ns"),
                    ),
                ]
            ),
        ),
    ],
)
def test_bins_monotonic_not_overflowing(x, bins, expected):
    """Extreme-valued monotonic bins must yield the given categories (GH 26045)."""
    binned = cut(x, bins)
    tm.assert_index_equal(binned.categories, expected)
|
||||
|
||||
|
||||
def test_wrong_num_labels():
    """Three labels for two bins must raise ValueError."""
    values = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    expected_message = "Bin labels must be one fewer than the number of bin edges"

    with pytest.raises(ValueError, match=expected_message):
        cut(values, [0, 1, 10], labels=["foo", "bar", "baz"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "x,bins,msg",
    [
        ([], 2, "Cannot cut empty array"),
        ([1, 2, 3], 0.5, "`bins` should be a positive integer"),
    ],
)
def test_cut_corner(x, bins, msg):
    """Empty input and a non-integer bin count must each raise ValueError."""
    expect_failure = pytest.raises(ValueError, match=msg)
    with expect_failure:
        cut(x, bins)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
    """Scalar and 2-D inputs must be rejected by both cut and qcut."""
    with pytest.raises(ValueError, match="Input array must be 1 dimensional"):
        cut_func(arg, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        [0, 1, 2, 3, 4, np.inf],
        [-np.inf, 0, 1, 2, 3, 4],
        [-np.inf, 0, 1, 2, 3, 4, np.inf],
    ],
)
def test_int_bins_with_inf(data):
    """An integer bin count with infinite data must raise ValueError (GH 24314)."""
    expected_message = (
        "cannot specify integer `bins` when input data contains infinity"
    )
    with pytest.raises(ValueError, match=expected_message):
        cut(data, bins=3)
|
||||
|
||||
|
||||
def test_cut_out_of_range_more():
    """Values outside the bins come back as NaN, keeping the name (gh-1511)."""
    series_name = "x"
    values = Series([0, -1, 0, 1, -3], name=series_name)

    codes = cut(values, [0, 1], labels=False)

    # only the value 1 is expected to get a bin code
    missing = np.nan
    expected = Series([missing, missing, missing, 0, missing], name=series_name)
    tm.assert_series_equal(codes, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "right,breaks,closed",
    [
        (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
        (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
    ],
)
def test_labels(right, breaks, closed):
    """Check the interval categories of a 4-bin cut for both closed sides."""
    values = np.tile(np.arange(0, 1.01, 0.1), 4)

    binned, _ = cut(values, 4, retbins=True, right=right)

    expected_categories = IntervalIndex.from_breaks(breaks, closed=closed)
    tm.assert_index_equal(binned.categories, expected_categories)
|
||||
|
||||
|
||||
def test_cut_pass_series_name_to_factor():
    """The name of the input Series must carry over to the cut result."""
    series_name = "foo"
    samples = np.random.default_rng(2).standard_normal(100)

    binned = cut(Series(samples, name=series_name), 4)
    assert binned.name == series_name
|
||||
|
||||
|
||||
def test_label_precision():
    """Check the category breaks produced with precision=2."""
    values = np.arange(0, 0.73, 0.01)
    binned = cut(values, 4, precision=2)

    expected_categories = IntervalIndex.from_breaks(
        [-0.00072, 0.18, 0.36, 0.54, 0.72]
    )
    tm.assert_index_equal(binned.categories, expected_categories)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", [None, False])
def test_na_handling(labels):
    """NaN inputs must stay NaN in the cut output."""
    values = np.arange(0, 0.75, 0.01)
    values[::3] = np.nan  # every third value is missing

    binned = np.asarray(cut(values, 4, labels=labels))

    # force NaN at every originally missing position; the rest is unchanged
    expected = np.where(isna(values), np.nan, binned)
    tm.assert_almost_equal(binned, expected)
|
||||
|
||||
|
||||
def test_inf_handling():
    """Infinite outer bin edges are accepted for array and Series input."""
    values = np.arange(6)
    values_ser = Series(values, dtype="int64")
    edges = [-np.inf, 2, 4, np.inf]

    from_array = cut(values, edges)
    from_series = cut(values_ser, edges)

    tm.assert_index_equal(from_array.categories, IntervalIndex.from_breaks(edges))

    # last and first value land in the two unbounded bins
    for binned in (from_array, from_series):
        assert binned[5] == Interval(4, np.inf)
        assert binned[0] == Interval(-np.inf, 2)
|
||||
|
||||
|
||||
def test_cut_out_of_bounds():
    """Values outside [-1, 1] must be exactly the missing results."""
    samples = np.random.default_rng(2).standard_normal(100)
    binned = cut(samples, [-1, 0, 1])

    below = samples < -1
    above = samples > 1
    tm.assert_numpy_array_equal(isna(binned), below | above)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "get_labels,get_expected",
    [
        # labels given as a plain list
        (
            lambda labels: labels,
            lambda labels: Categorical(
                ["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
                categories=labels,
                ordered=True,
            ),
        ),
        # labels given as a Categorical
        (
            lambda labels: Categorical.from_codes([0, 1, 2], labels),
            lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
        ),
    ],
)
def test_cut_pass_labels(get_labels, get_expected):
    """Check cut with explicit labels passed as a list or as a Categorical."""
    label_names = ["Small", "Medium", "Large"]
    edges = [0, 25, 50, 100]
    values = [50, 5, 10, 15, 20, 30, 70]

    binned = cut(values, edges, labels=get_labels(label_names))
    tm.assert_categorical_equal(binned, get_expected(label_names))
|
||||
|
||||
|
||||
def test_cut_pass_labels_compat():
    """List labels and ordered Categorical labels must agree (gh-16459)."""
    values = [50, 5, 10, 15, 20, 30, 70]
    label_names = ["Good", "Medium", "Bad"]
    as_categorical = Categorical(label_names, categories=label_names, ordered=True)

    from_list = cut(values, 3, labels=label_names)
    from_categorical = cut(values, 3, labels=as_categorical)
    tm.assert_categorical_equal(from_list, from_categorical)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
def test_round_frac_just_works(x):
    """Smoke test: a 2-bin cut of these inputs must not raise."""
    # no assertion on the result; completing without an exception is the check
    cut(x, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "val,precision,expected",
    [
        (-117.9998, 3, -118),
        (117.9998, 3, 118),
        (117.9998, 2, 118),
        (0.000123456, 2, 0.00012),
    ],
)
def test_round_frac(val, precision, expected):
    """Check the private ``_round_frac`` helper on sample values (gh-1979)."""
    rounded = tmod._round_frac(val, precision=precision)
    assert rounded == expected
|
||||
|
||||
|
||||
def test_cut_return_intervals():
    """A 3-bin cut of a Series returns an ordered categorical of intervals."""
    values = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    binned = cut(values, 3)

    edges = np.linspace(0, 8, num=4).round(3)
    # the expected lowest edge sits slightly below the minimum value
    edges[0] -= 0.008

    intervals = IntervalIndex.from_breaks(edges, closed="right")
    per_value = intervals.take([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected = Series(per_value).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(binned, expected)
|
||||
|
||||
|
||||
def test_series_ret_bins():
    """Check the Series returned by cut with retbins=True (gh-8589)."""
    values = Series(np.arange(4))
    binned, _ = cut(values, 2, retbins=True)

    # two intervals, each expected twice in a row
    intervals = IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right")
    expected = Series(intervals.repeat(2)).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(binned, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"duplicates": "drop"}, None),
        ({}, "Bin edges must be unique"),
        ({"duplicates": "raise"}, "Bin edges must be unique"),
        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
    ],
)
def test_cut_duplicates_bin(kwargs, msg):
    """Check the ``duplicates`` option with repeated bin edges (gh-20947).

    A ``msg`` of None means the call must succeed and match a cut over
    the de-duplicated edges; otherwise ValueError with ``msg`` is expected.
    """
    edges = [0, 2, 4, 6, 10, 10]
    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])

    if msg is None:
        result = cut(values, edges, **kwargs)
        expected = cut(values, pd.unique(np.asarray(edges)))
        tm.assert_series_equal(result, expected)
        return

    with pytest.raises(ValueError, match=msg):
        cut(values, edges, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
@pytest.mark.parametrize("length", [1, 2])
def test_single_bin(data, length):
    """A one-bin cut of constant data gives code 0 (gh-14652, gh-15428)."""
    constant = Series([data] * length)
    codes = cut(constant, 1, labels=False)

    tm.assert_series_equal(codes, Series([0] * length, dtype=np.intp))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values,threshold",
    [
        ([0.1, 0.1, 0.1], 0.001),  # small positive values
        ([-0.1, -0.1, -0.1], 0.001),  # negative values
        ([0.01, 0.01, 0.01], 0.0001),  # very small values
    ],
)
def test_single_bin_edge_adjustment(values, threshold):
    """Constant input must give bins narrower than ``threshold`` (gh-58517)."""
    _, edges = cut(values, 3, retbins=True)

    span = edges[-1] - edges[0]
    assert span < threshold
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
)
def test_cut_read_only(array_1_writeable, array_2_writeable):
    """The writeable flag of the bins array must not affect cut (issue 18773)."""
    bins_a = np.arange(0, 100, 10)
    bins_b = np.arange(0, 100, 10)
    bins_a.flags.writeable = array_1_writeable
    bins_b.flags.writeable = array_2_writeable

    values = np.arange(100)
    tm.assert_categorical_equal(cut(values, bins_a), cut(values, bins_b))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "conv",
    [
        lambda v: Timestamp(v),
        lambda v: to_datetime(v),
        lambda v: np.datetime64(v),
        lambda v: Timestamp(v).to_pydatetime(),
    ],
)
def test_datetime_bin(conv):
    """Check cut of datetime64 data with bins built by ``conv``.

    ``conv`` turns each date string into one kind of datetime-like scalar
    (Timestamp, to_datetime result, np.datetime64 or datetime.datetime).
    """
    data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
    bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]

    expected = Series(
        IntervalIndex(
            [
                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
            ]
        )
    )

    bins = [conv(v) for v in bin_data]
    result = Series(cut(data, bins=bins))

    if type(bins[0]) is np.datetime64:
        # For np.datetime64 bins the expectation is cast to second resolution
        # (the earlier "microsecond" wording contradicted the cast below).
        expected = expected.astype("interval[datetime64[s]]")

    expected = expected.astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("box", [Series, Index, np.array, list])
|
||||
def test_datetime_cut(unit, box):
|
||||
# see gh-14714
|
||||
#
|
||||
# Testing time data when it comes in various collection types.
|
||||
data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
|
||||
data = box(data)
|
||||
result, _ = cut(data, 3, retbins=True)
|
||||
|
||||
if unit == "s":
|
||||
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
|
||||
# for why we round to 8 seconds instead of 7
|
||||
left = DatetimeIndex(
|
||||
["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
else:
|
||||
left = DatetimeIndex(
|
||||
[
|
||||
"2012-12-31 23:57:07.200000",
|
||||
"2013-01-01 16:00:00",
|
||||
"2013-01-02 08:00:00",
|
||||
],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
right = DatetimeIndex(
|
||||
["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
|
||||
exp_intervals = IntervalIndex.from_arrays(left, right)
|
||||
expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
|
||||
tm.assert_series_equal(Series(result), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
|
||||
def test_datetime_tz_cut_mismatched_tzawareness(box):
|
||||
# GH#54964
|
||||
bins = box(
|
||||
[
|
||||
Timestamp("2013-01-01 04:57:07.200000"),
|
||||
Timestamp("2013-01-01 21:00:00"),
|
||||
Timestamp("2013-01-02 13:00:00"),
|
||||
Timestamp("2013-01-03 05:00:00"),
|
||||
]
|
||||
)
|
||||
ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
|
||||
|
||||
msg = "Cannot use timezone-naive bins with timezone-aware values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
cut(ser, bins)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bins",
    [
        3,
        [
            Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
            Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
        ],
    ],
)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
    """Check cut of tz-aware datetimes with integer or explicit bins (gh-19872).

    Explicit bins are wrapped in ``box`` first; an integer bin count is
    passed through unchanged.
    """
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz, unit="ns"))

    int_bins = isinstance(bins, int)
    if not int_bins:
        bins = box(bins)

    result = cut(ser, bins)

    # consecutive edges pair up into the three expected intervals
    edges = [
        "2012-12-31 23:57:07.200000",
        "2013-01-01 16:00:00",
        "2013-01-02 08:00:00",
        "2013-01-03 00:00:00",
    ]
    ii = IntervalIndex(
        [
            Interval(Timestamp(start, tz=tz), Timestamp(stop, tz=tz))
            for start, stop in zip(edges[:-1], edges[1:])
        ]
    )
    if int_bins:
        # the dtype is inferred from ser, which has nanosecond unit
        ii = ii.astype("interval[datetime64[ns, US/Eastern]]")
    expected = Series(ii).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_datetime_nan_error():
    """Integer bin edges with datetime data must raise ValueError."""
    stamps = date_range("20130101", periods=3)

    with pytest.raises(ValueError, match="bins must be of datetime64 dtype"):
        cut(stamps, bins=[0, 2, 4])
|
||||
|
||||
|
||||
def test_datetime_nan_mask():
    """Datetimes outside the single bin are missing; the category is not."""
    stamps = date_range("20130102", periods=5)
    edges = date_range("20130101", periods=2)
    binned = cut(stamps, bins=edges)

    # two edges give one category, and it is not missing
    tm.assert_numpy_array_equal(binned.categories.isna(), np.array([False]))

    # only the first value is expected to be binned
    tm.assert_numpy_array_equal(
        binned.isna(), np.array([False, True, True, True, True])
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
|
||||
def test_datetime_cut_roundtrip(tz, unit):
|
||||
# see gh-19891
|
||||
ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
|
||||
result, result_bins = cut(ser, 2, retbins=True)
|
||||
|
||||
expected = cut(ser, result_bins)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
if unit == "s":
|
||||
# TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
|
||||
# the first entry here raises in array_to_datetime. Should truncate
|
||||
# instead of raising?
|
||||
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
|
||||
# for why we round to 8 seconds instead of 7
|
||||
expected_bins = DatetimeIndex(
|
||||
["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
else:
|
||||
expected_bins = DatetimeIndex(
|
||||
[
|
||||
"2017-12-31 23:57:07.200000",
|
||||
"2018-01-02 00:00:00",
|
||||
"2018-01-03 00:00:00",
|
||||
],
|
||||
dtype=f"M8[{unit}]",
|
||||
)
|
||||
expected_bins = expected_bins.tz_localize(tz)
|
||||
tm.assert_index_equal(result_bins, expected_bins)
|
||||
|
||||
|
||||
def test_timedelta_cut_roundtrip():
    """Timedelta bins returned by cut must reproduce the result (gh-19891)."""
    ser = Series(timedelta_range("1day", periods=3))
    first_pass, returned_bins = cut(ser, 2, retbins=True)

    second_pass = cut(ser, returned_bins)
    tm.assert_series_equal(first_pass, second_pass)

    expected_bins = TimedeltaIndex(
        ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
    )
    tm.assert_index_equal(returned_bins, expected_bins)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
    "box, compare",
    [
        (Series, tm.assert_series_equal),
        (np.array, tm.assert_categorical_equal),
        (list, tm.assert_equal),
    ],
)
def test_cut_bool_coercion_to_int(bins, box, compare):
    """Boolean data must cut exactly like the matching 0/1 data (issue 20303)."""
    as_ints = box([0, 1, 1, 0, 1] * 10)
    as_bools = box([False, True, True, False, True] * 10)

    expected = cut(as_ints, bins, duplicates="drop")
    result = cut(as_bools, bins, duplicates="drop")
    compare(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_cut_incorrect_labels(labels):
    """Scalar ``labels`` values must raise ValueError (GH 13318)."""
    expected_message = (
        "Bin labels must either be False, None or passed in as a list-like argument"
    )
    with pytest.raises(ValueError, match=expected_message):
        cut(range(5), 4, labels=labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
@pytest.mark.parametrize("right", [True, False])
@pytest.mark.parametrize("include_lowest", [True, False])
def test_cut_nullable_integer(bins, right, include_lowest):
    """An Int64 array with pd.NA must cut like floats with NaN."""
    with_nan = np.random.default_rng(2).integers(0, 10, size=50).astype(float)
    with_nan[::2] = np.nan

    # same values as objects, with pd.NA at the missing positions
    with_na = with_nan.astype(object)
    with_na[::2] = pd.NA

    options = {"right": right, "include_lowest": include_lowest}
    result = cut(pd.array(with_na, dtype="Int64"), bins, **options)
    expected = cut(with_nan, bins, **options)
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, bins, labels, expected_codes, expected_labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
    ],
)
def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
    """Repeated labels are accepted when ordered=False (GH 33141)."""
    expected = Categorical.from_codes(
        expected_codes, categories=expected_labels, ordered=False
    )
    binned = cut(data, bins=bins, labels=labels, ordered=False)
    tm.assert_categorical_equal(binned, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, bins, labels, expected_codes, expected_labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
    ],
)
def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
    """Unsorted labels keep their given order when ordered=False (GH 33141)."""
    expected = Categorical.from_codes(
        expected_codes, categories=expected_labels, ordered=False
    )
    binned = cut(data, bins=bins, labels=labels, ordered=False)
    tm.assert_categorical_equal(binned, expected)
|
||||
|
||||
|
||||
def test_cut_unordered_with_missing_labels_raises_error():
    """``ordered=False`` without ``labels`` raises ``ValueError`` (GH 33141)."""
    with pytest.raises(
        ValueError, match="'labels' must be provided if 'ordered = False'"
    ):
        cut([0.5, 3], bins=[0, 1, 2], ordered=False)
|
||||
|
||||
|
||||
def test_cut_unordered_with_series_labels():
    """``cut`` accepts ``Series`` objects for both ``bins`` and ``labels``."""
    # https://github.com/pandas-dev/pandas/issues/36603
    values = Series([1, 2, 3, 4, 5])
    edges = Series([0, 2, 4, 6])
    names = Series(["a", "b", "c"])

    expected = Series(["a", "a", "b", "b", "c"], dtype="category")
    result = cut(values, bins=edges, labels=names, ordered=False)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_no_warnings():
    """Cutting a column and assigning the result back emits no warning."""
    frame = DataFrame({"value": np.random.default_rng(2).integers(0, 100, 20)})
    decade_labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
    decade_edges = range(0, 105, 10)
    with tm.assert_produces_warning(False):
        frame["group"] = cut(
            frame.value, decade_edges, right=False, labels=decade_labels
        )
|
||||
|
||||
|
||||
def test_cut_with_duplicated_index_lowest_included():
    """``include_lowest=True`` works on a Series with a duplicated index (GH 42185)."""
    duplicated_index = [0, 1, 2, 3, 0]
    low_bin = Interval(-0.001, 2, closed="right")
    high_bin = Interval(2, 4, closed="right")

    ser = Series([0, 1, 2, 3, 0], index=duplicated_index)
    result = cut(ser, bins=[0, 2, 4], include_lowest=True)

    expected = Series(
        [low_bin, low_bin, low_bin, high_bin, low_bin],
        index=duplicated_index,
        dtype="category",
    ).cat.as_ordered()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
def test_cut_with_nonexact_categorical_indices():
    """Head and tail of binned value counts align on the full interval index (GH 42424)."""
    ser = Series(range(100))
    binned = cut(ser, 10)
    first_half = binned.value_counts().head(5)
    second_half = binned.value_counts().tail(5)
    result = DataFrame({"1": first_half, "2": second_half})

    # Break points are spelled out literally so the interval bounds match exactly.
    breaks = [-0.099, 9.9, 19.8, 29.7, 39.6, 49.5, 59.4, 69.3, 79.2, 89.1, 99]
    index = pd.CategoricalIndex(
        [
            Interval(left, right, closed="right")
            for left, right in zip(breaks[:-1], breaks[1:])
        ],
        ordered=True,
    )

    tens = [10] * 5
    missing = [np.nan] * 5
    expected = DataFrame({"1": tens + missing, "2": missing + tens}, index=index)

    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
def test_cut_with_timestamp_tuple_labels():
    """Tuples that wrap a ``Timestamp`` can be used as bin labels (GH 40661)."""
    labels = [(Timestamp(nanos),) for nanos in (10, 20, 30)]
    expected = Categorical.from_codes([0, 1, 2], labels, ordered=True)

    result = cut([2, 4, 6], bins=[1, 3, 5, 7], labels=labels)
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_bins_datetime_intervalindex():
    """A datetime ``Series`` can be cut with a datetime ``IntervalIndex`` as bins."""
    # https://github.com/pandas-dev/pandas/issues/46218
    start = Timestamp("2022-02-25")
    stop = Timestamp("2022-02-27")
    daily_bins = interval_range(start, stop, freq="1D")

    # passing Series instead of list is important to trigger bug
    stamps = Series([Timestamp("2022-02-26")])
    result = cut(stamps, bins=daily_bins)

    expected = Categorical.from_codes([0], daily_bins, ordered=True)
    tm.assert_categorical_equal(result.array, expected)
|
||||
|
||||
|
||||
def test_cut_with_nullable_int64():
    """``cut`` on an ``Int64`` Series maps NA and out-of-bin values to code -1 (GH 30787)."""
    edges = [0, 2, 4, 6, 8]
    nullable = Series([0, 1, 2, 3, 4, pd.NA, 6, 7], dtype="Int64")

    result = cut(nullable, bins=edges)

    # 0 lies outside the right-closed first bin and pd.NA is missing: both get -1.
    expected_codes = [-1, 0, 0, 1, 1, -1, 2, 3]
    expected = Series(
        Categorical.from_codes(
            expected_codes, IntervalIndex.from_breaks(edges), ordered=True
        )
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cut_datetime_array_no_attributeerror():
|
||||
# GH 55431
|
||||
ser = Series(to_datetime(["2023-10-06 12:00:00+0000", "2023-10-07 12:00:00+0000"]))
|
||||
|
||||
result = cut(ser.array, bins=2)
|
||||
|
||||
categories = result.categories
|
||||
expected = Categorical.from_codes([0, 1], categories=categories, ordered=True)
|
||||
|
||||
tm.assert_categorical_equal(
|
||||
result, expected, check_dtype=True, check_category_order=True
|
||||
)
|
||||
@ -0,0 +1,477 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
from_dummies,
|
||||
get_dummies,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def dummies_basic():
    """Dummy frame with ``col1_*`` and ``col2_*`` columns; every row has one flag per prefix."""
    indicator_columns = {
        "col1_a": [1, 0, 1],
        "col1_b": [0, 1, 0],
        "col2_a": [0, 1, 0],
        "col2_b": [1, 0, 0],
        "col2_c": [0, 0, 1],
    }
    return DataFrame(indicator_columns)
|
||||
|
||||
|
||||
@pytest.fixture
def dummies_with_unassigned():
    """Dummy frame where row 2 has no ``col1_*`` flag and row 0 has no ``col2_*`` flag."""
    indicator_columns = {
        "col1_a": [1, 0, 0],
        "col1_b": [0, 1, 0],
        "col2_a": [0, 1, 0],
        "col2_b": [0, 0, 0],
        "col2_c": [0, 0, 1],
    }
    return DataFrame(indicator_columns)
|
||||
|
||||
|
||||
def test_error_wrong_data_type():
    """Passing a plain list to ``from_dummies`` raises ``TypeError``."""
    msg = r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list"
    not_a_frame = [0, 1, 0]
    with pytest.raises(TypeError, match=msg):
        from_dummies(not_a_frame)
|
||||
|
||||
|
||||
def test_error_no_prefix_contains_unassigned():
    """A row with no flag set and no default category raises ``ValueError``."""
    msg = (
        r"Dummy DataFrame contains unassigned value\(s\); "
        r"First instance in row: 2"
    )
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
|
||||
|
||||
|
||||
def test_error_no_prefix_wrong_default_category_type():
    """A list passed as ``default_category`` raises ``TypeError``."""
    msg = (
        r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
        r"Received 'default_category' of type: list"
    )
    encoded = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(TypeError, match=msg):
        from_dummies(encoded, default_category=["c", "d"])
|
||||
|
||||
|
||||
def test_error_no_prefix_multi_assignment():
    """A row with more than one flag set raises ``ValueError``."""
    msg = (
        r"Dummy DataFrame contains multi-assignment\(s\); "
        r"First instance in row: 2"
    )
    encoded = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
|
||||
|
||||
|
||||
def test_error_no_prefix_contains_nan():
    """A NaN inside a dummy column raises ``ValueError`` naming that column."""
    msg = r"Dummy DataFrame contains NA value in column: 'b'"
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
|
||||
|
||||
|
||||
def test_error_contains_non_dummies():
    """Columns holding non-indicator values raise ``TypeError``."""
    msg = r"Passed DataFrame contains non-dummy data"
    not_dummies = DataFrame(
        {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(not_dummies)
|
||||
|
||||
|
||||
def test_error_with_prefix_multiple_separators():
    """Columns that do not contain the given ``sep`` raise ``ValueError``."""
    msg = r"Separator not specified for column: col2-a"
    mixed_separators = DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [0, 1, 0],
            "col2-a": [0, 1, 0],
            "col2-b": [1, 0, 1],
        },
    )
    with pytest.raises(ValueError, match=msg):
        from_dummies(mixed_separators, sep="_")
|
||||
|
||||
|
||||
def test_error_with_prefix_sep_wrong_type(dummies_basic):
    """A list passed as ``sep`` raises ``TypeError``."""
    msg = (
        r"Expected 'sep' to be of type 'str' or 'None'; "
        r"Received 'sep' of type: list"
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(dummies_basic, sep=["_"])
|
||||
|
||||
|
||||
def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
    """Unassigned rows under a prefix raise ``ValueError`` without a default category."""
    msg = (
        r"Dummy DataFrame contains unassigned value\(s\); "
        r"First instance in row: 2"
    )
    with pytest.raises(ValueError, match=msg):
        from_dummies(dummies_with_unassigned, sep="_")
|
||||
|
||||
|
||||
def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
    """A list passed as ``default_category`` together with ``sep`` raises ``TypeError``."""
    msg = (
        r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
        r"Received 'default_category' of type: list"
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
|
||||
|
||||
|
||||
def test_error_with_prefix_default_category_dict_not_complete(
    dummies_with_unassigned,
):
    """A ``default_category`` dict covering only one of two prefixes raises ``ValueError``."""
    msg = (
        r"Length of 'default_category' \(1\) did not match "
        r"the length of the columns being encoded \(2\)"
    )
    partial_defaults = {"col1": "x"}
    with pytest.raises(ValueError, match=msg):
        from_dummies(dummies_with_unassigned, sep="_", default_category=partial_defaults)
|
||||
|
||||
|
||||
def test_error_with_prefix_contains_nan(dummies_basic):
    """A NaN inside a prefixed dummy column raises ``ValueError`` naming that column."""
    column = "col2_c"
    msg = r"Dummy DataFrame contains NA value in column: 'col2_c'"

    # Set float64 dtype to avoid upcast when setting np.nan
    dummies_basic[column] = dummies_basic[column].astype("float64")
    dummies_basic.loc[2, column] = np.nan

    with pytest.raises(ValueError, match=msg):
        from_dummies(dummies_basic, sep="_")
|
||||
|
||||
|
||||
def test_error_with_prefix_contains_non_dummies(dummies_basic):
    """A string value inside a prefixed dummy column raises ``TypeError``."""
    column = "col2_c"
    msg = r"Passed DataFrame contains non-dummy data"

    # Set object dtype to avoid upcast when setting "str"
    dummies_basic[column] = dummies_basic[column].astype(object)
    dummies_basic.loc[2, column] = "str"

    with pytest.raises(TypeError, match=msg):
        from_dummies(dummies_basic, sep="_")
|
||||
|
||||
|
||||
def test_error_with_prefix_double_assignment():
    """Two flags set under the same prefix in one row raise ``ValueError``."""
    msg = (
        r"Dummy DataFrame contains multi-assignment\(s\); "
        r"First instance in row: 0"
    )
    # Row 0 has both col1_a and col1_b set.
    doubly_assigned = DataFrame(
        {
            "col1_a": [1, 0, 1],
            "col1_b": [1, 1, 0],
            "col2_a": [0, 1, 0],
            "col2_b": [1, 0, 0],
            "col2_c": [0, 0, 1],
        },
    )
    with pytest.raises(ValueError, match=msg):
        from_dummies(doubly_assigned, sep="_")
|
||||
|
||||
|
||||
def test_roundtrip_series_to_dataframe():
    """Decoding the dummies of a Series gives a one-column frame named ``""``."""
    values = ["a", "b", "c", "a"]
    encoded = get_dummies(Series(values))
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    tm.assert_frame_equal(from_dummies(encoded), expected)
|
||||
|
||||
|
||||
def test_roundtrip_single_column_dataframe():
    """``get_dummies`` then ``from_dummies`` reproduces a frame whose column is ``""``."""
    original = DataFrame({"": ["a", "b", "c", "a"]})
    encoded = get_dummies(original)
    decoded = from_dummies(encoded, sep="_")
    tm.assert_frame_equal(decoded, original)
|
||||
|
||||
|
||||
def test_roundtrip_with_prefixes():
    """``get_dummies`` then ``from_dummies`` reproduces a two-column frame."""
    original = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
    encoded = get_dummies(original)
    decoded = from_dummies(encoded, sep="_")
    tm.assert_frame_equal(decoded, original)
|
||||
|
||||
|
||||
def test_no_prefix_string_cats_basic():
    """String column labels of 0/1 dummies are decoded into a column named ``""``."""
    encoded = DataFrame(
        {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}
    )
    decoded = from_dummies(encoded)
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
def test_no_prefix_string_cats_basic_bool_values():
    """Boolean dummy columns are decoded the same way as 0/1 columns."""
    bool_flags = {
        "a": [True, False, False, True],
        "b": [False, True, False, False],
        "c": [False, False, True, False],
    }
    decoded = from_dummies(DataFrame(bool_flags))
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
def test_no_prefix_string_cats_basic_mixed_bool_values():
    """A mix of integer and boolean dummy columns is decoded correctly."""
    mixed_flags = {
        "a": [1, 0, 0, 1],
        "b": [False, True, False, False],
        "c": [0, 0, 1, 0],
    }
    decoded = from_dummies(DataFrame(mixed_flags))
    expected = DataFrame({"": ["a", "b", "c", "a"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
def test_no_prefix_int_cats_basic():
|
||||
dummies = DataFrame(
|
||||
{1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
|
||||
)
|
||||
expected = DataFrame({"": [1, 25, 2, 5]})
|
||||
result = from_dummies(dummies)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_prefix_float_cats_basic():
|
||||
dummies = DataFrame(
|
||||
{1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
|
||||
)
|
||||
expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]})
|
||||
result = from_dummies(dummies)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_no_prefix_mixed_cats_basic():
    """Column labels of mixed types are decoded into an object-dtype column."""
    mixed_labelled = {
        1.23: [1, 0, 0, 0, 0],
        "c": [0, 1, 0, 0, 0],
        2: [0, 0, 1, 0, 0],
        False: [0, 0, 0, 1, 0],
        None: [0, 0, 0, 0, 1],
    }
    decoded = from_dummies(DataFrame(mixed_labelled))
    expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
    """A column literally named ``"NaN"`` is decoded as the string ``"NaN"``."""
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
    decoded = from_dummies(encoded)
    expected = DataFrame({"": ["a", "b", "NaN"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "default_category, expected",
    [
        pytest.param(
            "c",
            {"": ["a", "b", "c"]},
            id="default_category is a str",
        ),
        pytest.param(
            1,
            {"": ["a", "b", 1]},
            id="default_category is an int",
        ),
        pytest.param(
            1.25,
            {"": ["a", "b", 1.25]},
            id="default_category is a float",
        ),
        pytest.param(
            0,
            {"": ["a", "b", 0]},
            id="default_category is a 0",
        ),
        pytest.param(
            False,
            {"": ["a", "b", False]},
            id="default_category is a bool",
        ),
        pytest.param(
            (1, 2),
            {"": ["a", "b", (1, 2)]},
            id="default_category is a tuple",
        ),
    ],
)
def test_no_prefix_string_cats_default_category(
    default_category, expected, using_infer_string
):
    """The unassigned row is filled with ``default_category``.

    The expected frame is built with the dtype of the dummy frame's columns.
    """
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    expected_frame = DataFrame(expected, dtype=encoded.columns.dtype)
    decoded = from_dummies(encoded, default_category=default_category)
    tm.assert_frame_equal(decoded, expected_frame)
|
||||
|
||||
|
||||
def test_with_prefix_basic(dummies_basic):
    """With ``sep="_"`` each prefix becomes one decoded column."""
    decoded = from_dummies(dummies_basic, sep="_")
    expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
def test_with_prefix_contains_get_dummies_NaN_column():
    """Prefixed ``*_NaN`` columns are decoded as the string ``"NaN"``."""
    indicator_columns = {
        "col1_a": [1, 0, 0],
        "col1_b": [0, 1, 0],
        "col1_NaN": [0, 0, 1],
        "col2_a": [0, 1, 0],
        "col2_b": [0, 0, 0],
        "col2_c": [0, 0, 1],
        "col2_NaN": [1, 0, 0],
    }
    decoded = from_dummies(DataFrame(indicator_columns), sep="_")
    expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
    tm.assert_frame_equal(decoded, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "default_category, expected",
    [
        pytest.param(
            "x",
            {"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]},
            id="default_category is a str",
        ),
        pytest.param(
            0,
            {"col1": ["a", "b", 0], "col2": [0, "a", "c"]},
            id="default_category is a 0",
        ),
        pytest.param(
            False,
            {"col1": ["a", "b", False], "col2": [False, "a", "c"]},
            id="default_category is a False",
        ),
        pytest.param(
            {"col2": 1, "col1": 2.5},
            {"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]},
            id="default_category is a dict with int and float values",
        ),
        pytest.param(
            {"col2": None, "col1": False},
            {"col1": ["a", "b", False], "col2": [None, "a", "c"]},
            id="default_category is a dict with bool and None values",
        ),
        pytest.param(
            {"col2": (1, 2), "col1": [1.25, False]},
            {"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]},
            id="default_category is a dict with list and tuple values",
        ),
    ],
)
def test_with_prefix_default_category(
    dummies_with_unassigned, default_category, expected, using_infer_string
):
    """Unassigned rows under each prefix are filled from ``default_category``."""
    expected_frame = DataFrame(expected)
    if using_infer_string:
        # Under infer_string the expected frame is compared as "str" dtype.
        expected_frame = expected_frame.astype("str")

    decoded = from_dummies(
        dummies_with_unassigned, sep="_", default_category=default_category
    )
    tm.assert_frame_equal(decoded, expected_frame)
|
||||
|
||||
|
||||
def test_ea_categories():
|
||||
# GH 54300
|
||||
df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
|
||||
df.columns = df.columns.astype("string[python]")
|
||||
result = from_dummies(df)
|
||||
expected = DataFrame({"": Series(list("abca"), dtype="string[python]")})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_ea_categories_with_sep():
|
||||
# GH 54300
|
||||
df = DataFrame(
|
||||
{
|
||||
"col1_a": [1, 0, 1],
|
||||
"col1_b": [0, 1, 0],
|
||||
"col2_a": [0, 1, 0],
|
||||
"col2_b": [1, 0, 0],
|
||||
"col2_c": [0, 0, 1],
|
||||
}
|
||||
)
|
||||
df.columns = df.columns.astype("string[python]")
|
||||
result = from_dummies(df, sep="_")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"col1": Series(list("aba"), dtype="string[python]"),
|
||||
"col2": Series(list("bac"), dtype="string[python]"),
|
||||
}
|
||||
)
|
||||
expected.columns = expected.columns.astype("string[python]")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_maintain_original_index():
|
||||
# GH 54300
|
||||
df = DataFrame(
|
||||
{"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd")
|
||||
)
|
||||
result = from_dummies(df)
|
||||
expected = DataFrame({"": list("abca")}, index=list("abcd"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_int_columns_with_float_default():
    """A float ``default_category`` with integer column labels raises ``ValueError``."""
    # https://github.com/pandas-dev/pandas/pull/60694
    int_labelled = DataFrame({3: [1, 0, 0], 4: [0, 1, 0]})
    msg = "Trying to coerce float values to integers"
    with pytest.raises(ValueError, match=msg):
        from_dummies(int_labelled, default_category=0.5)
|
||||
|
||||
|
||||
def test_object_dtype_preserved():
    """Object-dtype column labels give an object-dtype result."""
    # https://github.com/pandas-dev/pandas/pull/60694
    # When the input has object dtype, the result should as
    # well even when infer_string is True.
    encoded = DataFrame({"x": [1, 0, 0], "y": [0, 1, 0]})
    encoded.columns = encoded.columns.astype("object")

    decoded = from_dummies(encoded, default_category="z")

    expected = DataFrame({"": ["x", "y", "z"]}, dtype="object")
    tm.assert_frame_equal(decoded, expected)
|
||||
@ -0,0 +1,741 @@
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas.core.dtypes.common import is_integer_dtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
ArrowDtype,
|
||||
Categorical,
|
||||
CategoricalDtype,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
RangeIndex,
|
||||
Series,
|
||||
SparseDtype,
|
||||
get_dummies,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays.sparse import SparseArray
|
||||
|
||||
try:
|
||||
import pyarrow as pa
|
||||
except ImportError:
|
||||
pa = None
|
||||
|
||||
|
||||
class TestGetDummies:
|
||||
@pytest.fixture
def df(self):
    """Three-row frame with string columns ``A`` and ``B`` and integer column ``C``."""
    data = {"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]}
    return DataFrame(data)
|
||||
|
||||
@pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
def dtype(self, request):
    """Return each parametrized dtype spec passed through ``np.dtype``."""
    requested = request.param
    return np.dtype(requested)
|
||||
|
||||
@pytest.fixture(params=["dense", "sparse"])
def sparse(self, request):
    """Return True for the ``"sparse"`` param and False for ``"dense"``."""
    # params are strings to simplify reading test results,
    # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
    is_sparse = request.param == "sparse"
    return is_sparse
|
||||
|
||||
def effective_dtype(self, dtype):
    """Return ``np.uint8`` when ``dtype`` is None, otherwise ``dtype`` unchanged."""
    return np.uint8 if dtype is None else dtype
|
||||
|
||||
def test_get_dummies_raises_on_dtype_object(self, df):
    """``dtype="object"`` is rejected by ``get_dummies`` with ``ValueError``."""
    with pytest.raises(
        ValueError, match="dtype=object is not a valid dtype for get_dummies"
    ):
        get_dummies(df, dtype="object")
|
||||
|
||||
def test_get_dummies_basic(self, sparse, dtype):
|
||||
s_list = list("abc")
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list("ABC"))
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
|
||||
dtype=self.effective_dtype(dtype),
|
||||
)
|
||||
if sparse:
|
||||
if dtype.kind == "b":
|
||||
expected = expected.apply(SparseArray, fill_value=False)
|
||||
else:
|
||||
expected = expected.apply(SparseArray, fill_value=0.0)
|
||||
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = list("ABC")
|
||||
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string):
|
||||
# GH 10531
|
||||
s_list = list("abc")
|
||||
s_series = Series(s_list)
|
||||
s_df = DataFrame(
|
||||
{"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
|
||||
dtype=self.effective_dtype(dtype),
|
||||
columns=list("abc"),
|
||||
)
|
||||
if sparse:
|
||||
if is_integer_dtype(dtype):
|
||||
fill_value = 0
|
||||
elif dtype == bool:
|
||||
fill_value = False
|
||||
else:
|
||||
fill_value = 0.0
|
||||
|
||||
expected = expected.apply(SparseArray, fill_value=fill_value)
|
||||
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
|
||||
if sparse:
|
||||
dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
|
||||
else:
|
||||
dtype_name = self.effective_dtype(dtype).name
|
||||
|
||||
expected = Series({dtype_name: 8}, name="count")
|
||||
result = result.dtypes.value_counts()
|
||||
result.index = [str(i) for i in result.index]
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
|
||||
|
||||
key = "str" if using_infer_string else "object"
|
||||
expected_counts = {"int64": 1, key: 1}
|
||||
expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
|
||||
|
||||
expected = Series(expected_counts, name="count").sort_index()
|
||||
result = result.dtypes.value_counts()
|
||||
result.index = [str(i) for i in result.index]
|
||||
result = result.sort_index()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_get_dummies_just_na(self, sparse):
|
||||
just_na_list = [np.nan]
|
||||
just_na_series = Series(just_na_list)
|
||||
just_na_series_index = Series(just_na_list, index=["A"])
|
||||
|
||||
res_list = get_dummies(just_na_list, sparse=sparse)
|
||||
res_series = get_dummies(just_na_series, sparse=sparse)
|
||||
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
|
||||
|
||||
assert res_list.empty
|
||||
assert res_series.empty
|
||||
assert res_series_index.empty
|
||||
|
||||
assert res_list.index.tolist() == [0]
|
||||
assert res_series.index.tolist() == [0]
|
||||
assert res_series_index.index.tolist() == ["A"]
|
||||
|
||||
def test_get_dummies_include_na(self, sparse, dtype):
|
||||
s = ["a", "b", np.nan]
|
||||
res = get_dummies(s, sparse=sparse, dtype=dtype)
|
||||
exp = DataFrame(
|
||||
{"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
|
||||
)
|
||||
if sparse:
|
||||
if dtype.kind == "b":
|
||||
exp = exp.apply(SparseArray, fill_value=False)
|
||||
else:
|
||||
exp = exp.apply(SparseArray, fill_value=0.0)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# Sparse dataframes do not allow nan labelled columns, see #GH8822
|
||||
res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
|
||||
exp_na = DataFrame(
|
||||
{np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
|
||||
dtype=self.effective_dtype(dtype),
|
||||
)
|
||||
exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
|
||||
# hack (NaN handling in assert_index_equal)
|
||||
exp_na.columns = res_na.columns
|
||||
if sparse:
|
||||
if dtype.kind == "b":
|
||||
exp_na = exp_na.apply(SparseArray, fill_value=False)
|
||||
else:
|
||||
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
|
||||
tm.assert_frame_equal(res_na, exp_na)
|
||||
|
||||
res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
|
||||
exp_just_na = DataFrame(
|
||||
Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
|
||||
)
|
||||
tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
|
||||
|
||||
def test_get_dummies_unicode(self, sparse):
|
||||
# See GH 6885 - get_dummies chokes on unicode values
|
||||
e = "e"
|
||||
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
|
||||
s = [e, eacute, eacute]
|
||||
res = get_dummies(s, prefix="letter", sparse=sparse)
|
||||
exp = DataFrame(
|
||||
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
|
||||
)
|
||||
if sparse:
|
||||
exp = exp.apply(SparseArray, fill_value=False)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_dataframe_dummies_all_obj(self, df, sparse):
|
||||
df = df[["A", "B"]]
|
||||
result = get_dummies(df, sparse=sparse)
|
||||
expected = DataFrame(
|
||||
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
|
||||
dtype=bool,
|
||||
)
|
||||
if sparse:
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A_a": SparseArray([1, 0, 1], dtype="bool"),
|
||||
"A_b": SparseArray([0, 1, 0], dtype="bool"),
|
||||
"B_b": SparseArray([1, 1, 0], dtype="bool"),
|
||||
"B_c": SparseArray([0, 0, 1], dtype="bool"),
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
|
||||
# GH44965
|
||||
df = df[["A", "B"]]
|
||||
df = df.astype({"A": "str", "B": any_string_dtype})
|
||||
result = get_dummies(df)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A_a": [1, 0, 1],
|
||||
"A_b": [0, 1, 0],
|
||||
"B_b": [1, 1, 0],
|
||||
"B_c": [0, 0, 1],
|
||||
},
|
||||
dtype=bool,
|
||||
)
|
||||
if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
|
||||
expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
|
||||
result = get_dummies(df, sparse=sparse, dtype=dtype)
|
||||
if sparse:
|
||||
arr = SparseArray
|
||||
if dtype.kind == "b":
|
||||
typ = SparseDtype(dtype, False)
|
||||
else:
|
||||
typ = SparseDtype(dtype, 0)
|
||||
else:
|
||||
arr = np.array
|
||||
typ = dtype
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3],
|
||||
"A_a": arr([1, 0, 1], dtype=typ),
|
||||
"A_b": arr([0, 1, 0], dtype=typ),
|
||||
"B_b": arr([1, 1, 0], dtype=typ),
|
||||
"B_c": arr([0, 0, 1], dtype=typ),
|
||||
}
|
||||
)
|
||||
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_list(self, df, sparse):
|
||||
prefixes = ["from_A", "from_B"]
|
||||
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3],
|
||||
"from_A_a": [True, False, True],
|
||||
"from_A_b": [False, True, False],
|
||||
"from_B_b": [True, True, False],
|
||||
"from_B_c": [False, False, True],
|
||||
},
|
||||
)
|
||||
expected[["C"]] = df[["C"]]
|
||||
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
|
||||
expected = expected[["C", *cols]]
|
||||
|
||||
typ = SparseArray if sparse else Series
|
||||
expected[cols] = expected[cols].apply(lambda x: typ(x))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_str(self, df, sparse):
|
||||
# not that you should do this...
|
||||
result = get_dummies(df, prefix="bad", sparse=sparse)
|
||||
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1, True, False, True, False],
|
||||
[2, False, True, True, False],
|
||||
[3, True, False, False, True],
|
||||
],
|
||||
columns=["C", *bad_columns],
|
||||
)
|
||||
expected = expected.astype({"C": np.int64})
|
||||
if sparse:
|
||||
# work around astyping & assigning with duplicate columns
|
||||
# https://github.com/pandas-dev/pandas/issues/14427
|
||||
expected = pd.concat(
|
||||
[
|
||||
Series([1, 2, 3], name="C"),
|
||||
Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
|
||||
Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
|
||||
Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
|
||||
Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_subset(self, df, sparse):
|
||||
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"B": ["b", "b", "c"],
|
||||
"C": [1, 2, 3],
|
||||
"from_A_a": [1, 0, 1],
|
||||
"from_A_b": [0, 1, 0],
|
||||
},
|
||||
)
|
||||
cols = expected.columns
|
||||
expected[cols[1:]] = expected[cols[1:]].astype(bool)
|
||||
expected[["C"]] = df[["C"]]
|
||||
if sparse:
|
||||
cols = ["from_A_a", "from_A_b"]
|
||||
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_sep(self, df, sparse):
|
||||
result = get_dummies(df, prefix_sep="..", sparse=sparse)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3],
|
||||
"A..a": [True, False, True],
|
||||
"A..b": [False, True, False],
|
||||
"B..b": [True, True, False],
|
||||
"B..c": [False, False, True],
|
||||
},
|
||||
)
|
||||
expected[["C"]] = df[["C"]]
|
||||
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
|
||||
if sparse:
|
||||
cols = ["A..a", "A..b", "B..b", "B..c"]
|
||||
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
|
||||
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
    """A ``prefix`` list shorter than the encoded columns raises ``ValueError``."""
    # The message contains parentheses, so it is escaped before being used
    # as a regular expression by ``pytest.raises``.
    pattern = re.escape(
        "Length of 'prefix' (1) did not match the length of the columns being "
        "encoded (2)"
    )
    with pytest.raises(ValueError, match=pattern):
        get_dummies(df, prefix=["too few"], sparse=sparse)
|
||||
|
||||
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
    """A ``prefix_sep`` list shorter than the encoded columns raises ``ValueError``."""
    # Escaped because the expected message contains regex metacharacters.
    pattern = re.escape(
        "Length of 'prefix_sep' (1) did not match the length of the columns being "
        "encoded (2)"
    )
    with pytest.raises(ValueError, match=pattern):
        get_dummies(df, prefix_sep=["bad"], sparse=sparse)
|
||||
|
||||
def test_dataframe_dummies_prefix_dict(self, sparse):
|
||||
prefixes = {"A": "from_A", "B": "from_B"}
|
||||
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
|
||||
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3],
|
||||
"from_A_a": [1, 0, 1],
|
||||
"from_A_b": [0, 1, 0],
|
||||
"from_B_b": [1, 1, 0],
|
||||
"from_B_c": [0, 0, 1],
|
||||
}
|
||||
)
|
||||
|
||||
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
|
||||
expected[columns] = expected[columns].astype(bool)
|
||||
if sparse:
|
||||
expected[columns] = expected[columns].astype(SparseDtype("bool", False))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
|
||||
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||||
result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
|
||||
axis=1
|
||||
)
|
||||
|
||||
if sparse:
|
||||
arr = SparseArray
|
||||
if dtype.kind == "b":
|
||||
typ = SparseDtype(dtype, False)
|
||||
else:
|
||||
typ = SparseDtype(dtype, 0)
|
||||
else:
|
||||
arr = np.array
|
||||
typ = dtype
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3, np.nan],
|
||||
"A_a": arr([1, 0, 1, 0], dtype=typ),
|
||||
"A_b": arr([0, 1, 0, 0], dtype=typ),
|
||||
"A_nan": arr([0, 0, 0, 1], dtype=typ),
|
||||
"B_b": arr([1, 1, 0, 0], dtype=typ),
|
||||
"B_c": arr([0, 0, 1, 0], dtype=typ),
|
||||
"B_nan": arr([0, 0, 0, 1], dtype=typ),
|
||||
}
|
||||
).sort_index(axis=1)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
|
||||
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
|
||||
df["cat"] = Categorical(["x", "y", "y"])
|
||||
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
|
||||
if sparse:
|
||||
arr = SparseArray
|
||||
if dtype.kind == "b":
|
||||
typ = SparseDtype(dtype, False)
|
||||
else:
|
||||
typ = SparseDtype(dtype, 0)
|
||||
else:
|
||||
arr = np.array
|
||||
typ = dtype
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3],
|
||||
"A_a": arr([1, 0, 1], dtype=typ),
|
||||
"A_b": arr([0, 1, 0], dtype=typ),
|
||||
"B_b": arr([1, 1, 0], dtype=typ),
|
||||
"B_c": arr([0, 0, 1], dtype=typ),
|
||||
"cat_x": arr([1, 0, 0], dtype=typ),
|
||||
"cat_y": arr([0, 1, 1], dtype=typ),
|
||||
}
|
||||
).sort_index(axis=1)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "get_dummies_kwargs,expected",
    [
        ({"data": DataFrame({"ä": ["a"]})}, "ä_a"),
        ({"data": DataFrame({"x": ["ä"]})}, "x_ä"),
        ({"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, "ä_a"),
        ({"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, "xäa"),
    ],
)
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
    """Non-ASCII text in names, values, prefix or separator is preserved."""
    # GH22084 get_dummies incorrectly encodes unicode characters
    # in dataframe column names
    result = get_dummies(**get_dummies_kwargs)
    # ``expected`` arrives as the single resulting column label.
    expected_frame = DataFrame({expected: [True]})
    tm.assert_frame_equal(result, expected_frame)
|
||||
|
||||
def test_get_dummies_basic_drop_first(self, sparse):
|
||||
# GH12402 Add a new parameter `drop_first` to avoid collinearity
|
||||
# Basic case
|
||||
s_list = list("abc")
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list("ABC"))
|
||||
|
||||
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
if sparse:
|
||||
expected = expected.apply(SparseArray, fill_value=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = list("ABC")
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_basic_drop_first_one_level(self, sparse):
|
||||
# Test the case that categorical variable only has one level.
|
||||
s_list = list("aaa")
|
||||
s_series = Series(s_list)
|
||||
s_series_index = Series(s_list, list("ABC"))
|
||||
|
||||
expected = DataFrame(index=RangeIndex(3))
|
||||
|
||||
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(index=list("ABC"))
|
||||
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_basic_drop_first_NA(self, sparse):
|
||||
# Test NA handling together with drop_first
|
||||
s_NA = ["a", "b", np.nan]
|
||||
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
|
||||
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
|
||||
if sparse:
|
||||
exp = exp.apply(SparseArray, fill_value=False)
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
|
||||
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
|
||||
["b", np.nan], axis=1
|
||||
)
|
||||
if sparse:
|
||||
exp_na = exp_na.apply(SparseArray, fill_value=False)
|
||||
tm.assert_frame_equal(res_na, exp_na)
|
||||
|
||||
res_just_na = get_dummies(
|
||||
[np.nan], dummy_na=True, drop_first=True, sparse=sparse
|
||||
)
|
||||
exp_just_na = DataFrame(index=RangeIndex(1))
|
||||
tm.assert_frame_equal(res_just_na, exp_just_na)
|
||||
|
||||
def test_dataframe_dummies_drop_first(self, df, sparse):
|
||||
df = df[["A", "B"]]
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
|
||||
if sparse:
|
||||
expected = expected.apply(SparseArray, fill_value=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
|
||||
df["cat"] = Categorical(["x", "y", "y"])
|
||||
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||||
expected = DataFrame(
|
||||
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
|
||||
)
|
||||
cols = ["A_b", "B_c", "cat_y"]
|
||||
expected[cols] = expected[cols].astype(bool)
|
||||
expected = expected[["C", "A_b", "B_c", "cat_y"]]
|
||||
if sparse:
|
||||
for col in cols:
|
||||
expected[col] = SparseArray(expected[col])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
|
||||
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||||
result = get_dummies(
|
||||
df, dummy_na=True, drop_first=True, sparse=sparse
|
||||
).sort_index(axis=1)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"C": [1, 2, 3, np.nan],
|
||||
"A_b": [0, 1, 0, 0],
|
||||
"A_nan": [0, 0, 0, 1],
|
||||
"B_c": [0, 0, 1, 0],
|
||||
"B_nan": [0, 0, 0, 1],
|
||||
}
|
||||
)
|
||||
cols = ["A_b", "A_nan", "B_c", "B_nan"]
|
||||
expected[cols] = expected[cols].astype(bool)
|
||||
expected = expected.sort_index(axis=1)
|
||||
if sparse:
|
||||
for col in cols:
|
||||
expected[col] = SparseArray(expected[col])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
|
||||
expected = expected[["C", "A_b", "B_c"]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_int_int(self):
|
||||
data = Series([1, 2, 1])
|
||||
result = get_dummies(data)
|
||||
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
data = Series(Categorical(["a", "b", "a"]))
|
||||
result = get_dummies(data)
|
||||
expected = DataFrame(
|
||||
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_int_df(self, dtype):
|
||||
data = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 1],
|
||||
"B": Categorical(["a", "b", "a"]),
|
||||
"C": [1, 2, 1],
|
||||
"D": [1.0, 2.0, 1.0],
|
||||
}
|
||||
)
|
||||
columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
|
||||
expected = DataFrame(
|
||||
[[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
|
||||
columns=columns,
|
||||
)
|
||||
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
|
||||
result = get_dummies(data, columns=["A", "B"], dtype=dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("ordered", [True, False])
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
    """Result columns are a CategoricalIndex that includes the unused category."""
    # GH13854
    cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
    result = get_dummies(cat, dtype=dtype)

    # "z" never occurs, so its indicator column is all zeros.
    indicator = np.array(
        [[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)
    )
    columns = CategoricalIndex(
        cat.categories, categories=cat.categories, ordered=ordered
    )
    expected = DataFrame(
        indicator, columns=columns, dtype=self.effective_dtype(dtype)
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("sparse", [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
    """The pass-through ``GDP`` column matches its reindexed counterpart."""
    # GH18914
    source = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
    encoded = get_dummies(source, columns=["Nation"], sparse=sparse)
    reindexed = encoded.reindex(columns=["GDP"])

    tm.assert_frame_equal(encoded[["GDP"]], reindexed)
|
||||
|
||||
def test_get_dummies_duplicate_columns(self, df):
|
||||
# GH20839
|
||||
df.columns = ["A", "A", "A"]
|
||||
result = get_dummies(df).sort_index(axis=1)
|
||||
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1, True, False, True, False],
|
||||
[2, False, True, True, False],
|
||||
[3, True, False, False, True],
|
||||
],
|
||||
columns=["A", "A_a", "A_b", "A_b", "A_c"],
|
||||
).sort_index(axis=1)
|
||||
|
||||
expected = expected.astype({"A": np.int64})
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_all_sparse(self):
|
||||
df = DataFrame({"A": [1, 2]})
|
||||
result = get_dummies(df, columns=["A"], sparse=True)
|
||||
dtype = SparseDtype("bool", False)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"A_1": SparseArray([1, 0], dtype=dtype),
|
||||
"A_2": SparseArray([0, 1], dtype=dtype),
|
||||
}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("values", ["baz"])
def test_get_dummies_with_string_values(self, values):
    """Passing a bare string as ``columns`` raises ``TypeError``."""
    # issue #28383
    frame = DataFrame(
        {
            "bar": [1, 2, 3, 4, 5, 6],
            "foo": ["one", "one", "one", "two", "two", "two"],
            "baz": ["A", "B", "C", "A", "B", "C"],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )

    expected_message = "Input must be a list-like for parameter `columns`"
    with pytest.raises(TypeError, match=expected_message):
        get_dummies(frame, columns=values)
|
||||
|
||||
def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
|
||||
# GH#32430
|
||||
ser = Series(list("abca"))
|
||||
result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
|
||||
dtype=any_numeric_ea_and_arrow_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
|
||||
# GH#32430
|
||||
df = DataFrame({"x": list("abca")})
|
||||
result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
|
||||
expected = DataFrame(
|
||||
{"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
|
||||
dtype=any_numeric_ea_and_arrow_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_type", ["string", "category"])
def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object):
    """The dummy dtype follows the NA flavour of the string dtype being encoded."""
    # GH#56273
    dtype = string_dtype_no_object
    # pd.NA-backed string dtypes are expected to give nullable "boolean".
    if dtype.na_value is pd.NA:
        exp_dtype = "boolean"
    else:
        exp_dtype = "bool"
    if dtype_type == "category":
        dtype = CategoricalDtype(Index(["a"], dtype))

    frame = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
    result = get_dummies(frame)
    expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@td.skip_if_no("pyarrow")
def test_get_dummies_arrow_dtype(self):
    """Arrow-backed string columns produce ``bool[pyarrow]`` dummy columns."""
    # GH#56273
    arrow_string = ArrowDtype(pa.string())
    expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")})

    # Plain arrow string column.
    frame = DataFrame({"name": Series(["a"], dtype=arrow_string), "x": 1})
    result = get_dummies(frame)
    tm.assert_frame_equal(result, expected)

    # Categorical column whose categories are arrow strings.
    cat_dtype = CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string())))
    frame = DataFrame(
        {
            "name": Series(["a"], dtype=cat_dtype),
            "x": 1,
        }
    )
    result = get_dummies(frame)
    tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,301 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    (
        "input_index",
        "input_columns",
        "input_values",
        "expected_values",
        "expected_columns",
        "expected_index",
    ),
    [
        (
            ["lev4"],
            "lev3",
            "values",
            [
                [0.0, np.nan],
                [np.nan, 1.0],
                [2.0, np.nan],
                [np.nan, 3.0],
                [4.0, np.nan],
                [np.nan, 5.0],
                [6.0, np.nan],
                [np.nan, 7.0],
            ],
            Index([1, 2], name="lev3"),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev4"],
            "lev3",
            lib.no_default,
            [
                [1.0, np.nan, 1.0, np.nan, 0.0, np.nan],
                [np.nan, 1.0, np.nan, 1.0, np.nan, 1.0],
                [1.0, np.nan, 2.0, np.nan, 2.0, np.nan],
                [np.nan, 1.0, np.nan, 2.0, np.nan, 3.0],
                [2.0, np.nan, 1.0, np.nan, 4.0, np.nan],
                [np.nan, 2.0, np.nan, 1.0, np.nan, 5.0],
                [2.0, np.nan, 2.0, np.nan, 6.0, np.nan],
                [np.nan, 2.0, np.nan, 2.0, np.nan, 7.0],
            ],
            MultiIndex.from_tuples(
                [
                    ("lev1", 1),
                    ("lev1", 2),
                    ("lev2", 1),
                    ("lev2", 2),
                    ("values", 1),
                    ("values", 2),
                ],
                names=[None, "lev3"],
            ),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev1", "lev2"],
            "lev3",
            "values",
            [[0, 1], [2, 3], [4, 5], [6, 7]],
            Index([1, 2], name="lev3"),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
        (
            ["lev1", "lev2"],
            "lev3",
            lib.no_default,
            [[1, 2, 0, 1], [3, 4, 2, 3], [5, 6, 4, 5], [7, 8, 6, 7]],
            MultiIndex.from_tuples(
                [("lev4", 1), ("lev4", 2), ("values", 1), ("values", 2)],
                names=[None, "lev3"],
            ),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
    ],
)
def test_pivot_list_like_index(
    input_index,
    input_columns,
    input_values,
    expected_values,
    expected_columns,
    expected_index,
):
    """``DataFrame.pivot`` accepts a list of labels for ``index``."""
    # GH 21425, test when index is given a list
    frame = pd.DataFrame(
        {
            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
            "values": [0, 1, 2, 3, 4, 5, 6, 7],
        }
    )

    expected = pd.DataFrame(
        expected_values, columns=expected_columns, index=expected_index
    )
    result = frame.pivot(
        index=input_index, columns=input_columns, values=input_values
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    (
        "input_index",
        "input_columns",
        "input_values",
        "expected_values",
        "expected_columns",
        "expected_index",
    ),
    [
        (
            "lev4",
            ["lev3"],
            "values",
            [
                [0.0, np.nan],
                [np.nan, 1.0],
                [2.0, np.nan],
                [np.nan, 3.0],
                [4.0, np.nan],
                [np.nan, 5.0],
                [6.0, np.nan],
                [np.nan, 7.0],
            ],
            Index([1, 2], name="lev3"),
            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
        ),
        (
            ["lev1", "lev2"],
            ["lev3"],
            "values",
            [[0, 1], [2, 3], [4, 5], [6, 7]],
            Index([1, 2], name="lev3"),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
        (
            ["lev1"],
            ["lev2", "lev3"],
            "values",
            [[0, 1, 2, 3], [4, 5, 6, 7]],
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev2", "lev3"]
            ),
            Index([1, 2], name="lev1"),
        ),
        (
            ["lev1", "lev2"],
            ["lev3", "lev4"],
            "values",
            [
                [0.0, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, 2.0, 3.0, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 6.0, 7.0],
            ],
            MultiIndex.from_tuples(
                [(1, 1), (2, 2), (1, 3), (2, 4), (1, 5), (2, 6), (1, 7), (2, 8)],
                names=["lev3", "lev4"],
            ),
            MultiIndex.from_tuples(
                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
            ),
        ),
    ],
)
def test_pivot_list_like_columns(
    input_index,
    input_columns,
    input_values,
    expected_values,
    expected_columns,
    expected_index,
):
    """``DataFrame.pivot`` accepts a list of labels for ``columns``."""
    # GH 21425, test when columns is given a list
    frame = pd.DataFrame(
        {
            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
            "values": [0, 1, 2, 3, 4, 5, 6, 7],
        }
    )

    expected = pd.DataFrame(
        expected_values, columns=expected_columns, index=expected_index
    )
    result = frame.pivot(
        index=input_index, columns=input_columns, values=input_values
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_pivot_multiindexed_rows_and_cols():
    """``pivot_table`` on a frame with MultiIndex rows and MultiIndex columns."""
    # GH 36360
    col_index = MultiIndex.from_tuples(
        [(0, 0), (0, 1), (0, 2)], names=["col_L0", "col_L1"]
    )
    row_index = MultiIndex.from_tuples(
        [(0, 0, 0), (0, 0, 1), (1, 1, 1), (1, 0, 0)],
        names=["idx_L0", "idx_L1", "idx_L2"],
    )
    df = pd.DataFrame(
        data=np.arange(12).reshape(4, 3),
        columns=col_index,
        index=row_index,
    )

    # Pivot two of the row levels against each other for one value column.
    res = df.pivot_table(
        index=["idx_L0"],
        columns=["idx_L1"],
        values=[(0, 1)],
        aggfunc=lambda col: col.values.sum(),
    )

    expected_cols = MultiIndex.from_tuples(
        [(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"]
    )
    expected = pd.DataFrame(
        data=[[5, np.nan], [10, 7.0]],
        columns=expected_cols,
        index=Index([0, 1], dtype="int64", name="idx_L0"),
    )
    expected = expected.astype("float64")

    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_pivot_df_multiindex_index_none():
    """``pivot`` without ``index`` keeps the frame's existing MultiIndex."""
    # GH 23955
    records = [
        ["A", "A1", "label1", 1],
        ["A", "A2", "label2", 2],
        ["B", "A1", "label1", 3],
        ["B", "A2", "label2", 4],
    ]
    df = pd.DataFrame(
        records, columns=["index_1", "index_2", "label", "value"]
    ).set_index(["index_1", "index_2"])

    result = df.pivot(columns="label", values="value")
    expected = pd.DataFrame(
        [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]],
        index=df.index,
        columns=Index(["label1", "label2"], name="label"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index, columns, e_data, e_index, e_cols",
    [
        (
            "index",
            ["col", "value"],
            [
                [50.0, np.nan, 100.0, np.nan],
                [np.nan, 100.0, np.nan, 200.0],
            ],
            Index(data=["A", "B"], name="index"),
            MultiIndex.from_arrays(
                arrays=[[1, 1, 2, 2], [50, 100, 100, 200]], names=["col", "value"]
            ),
        ),
        (
            ["index", "value"],
            "col",
            [
                [50.0, np.nan],
                [np.nan, 100.0],
                [100.0, np.nan],
                [np.nan, 200.0],
            ],
            MultiIndex.from_arrays(
                arrays=[["A", "A", "B", "B"], [50, 100, 100, 200]],
                names=["index", "value"],
            ),
            Index(data=[1, 2], name="col"),
        ),
    ],
    ids=["values-and-columns", "values-and-index"],
)
def test_pivot_table_multiindex_values_as_two_params(
    index, columns, e_data, e_index, e_cols
):
    """The ``values`` column may also appear in ``index`` or ``columns``."""
    # GH#61292
    rows = [
        ["A", 1, 50, -1],
        ["B", 1, 100, -2],
        ["A", 2, 100, -2],
        ["B", 2, 200, -4],
    ]
    frame = pd.DataFrame(data=rows, columns=["index", "col", "value", "extra"])

    expected = pd.DataFrame(data=e_data, index=e_index, columns=e_cols)
    result = frame.pivot_table(values="value", index=index, columns=columns)
    tm.assert_frame_equal(result, expected)
|
||||
@ -0,0 +1,308 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DatetimeIndex,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
TimedeltaIndex,
|
||||
Timestamp,
|
||||
cut,
|
||||
date_range,
|
||||
isna,
|
||||
qcut,
|
||||
timedelta_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import CategoricalDtype
|
||||
|
||||
from pandas.tseries.offsets import Day
|
||||
|
||||
|
||||
def test_qcut():
    """Quartile bins agree with ``np.quantile`` edges and with ``cut`` on them."""
    data = np.random.default_rng(2).standard_normal(1000)

    # The bins are stored as an Index whose edges have been rounded,
    # so comparisons are a bit tricky.
    labels, _ = qcut(data, 4, retbins=True)
    edges = np.quantile(data, [0, 0.25, 0.5, 0.75, 1.0])

    left_edges = labels.categories.left.values
    assert np.allclose(left_edges, edges[:-1], atol=1e-2)

    right_edges = labels.categories.right.values
    assert np.allclose(right_edges, edges[1:], atol=1e-2)

    from_cut = cut(data, edges, include_lowest=True)
    tm.assert_categorical_equal(labels, from_cut)
|
||||
|
||||
|
||||
def test_qcut_bounds():
    """Ten quantile bins over random data are all populated."""
    data = np.random.default_rng(2).standard_normal(1000)

    bin_codes = qcut(data, 10, labels=False)
    distinct = np.unique(bin_codes)
    assert len(distinct) == 10
|
||||
|
||||
|
||||
def test_qcut_specify_quantiles():
    """Explicit quartile edges give the same result as ``q=4``."""
    data = np.random.default_rng(2).standard_normal(100)

    expected = qcut(data, 4)
    result = qcut(data, [0, 0.25, 0.5, 0.75, 1.0])
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_qcut_all_bins_same():
    """Constant input cannot be split into unique bins and raises ``ValueError``."""
    constant = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    with pytest.raises(ValueError, match="edges.*unique"):
        qcut(constant, 3)
|
||||
|
||||
|
||||
def test_qcut_include_lowest():
    """The first interval starts just below the minimum so that it is included."""
    result = qcut(np.arange(10), 4)

    expected_categories = IntervalIndex(
        [
            Interval(-0.001, 2.25),
            Interval(2.25, 4.5),
            Interval(4.5, 6.75),
            Interval(6.75, 9),
        ]
    )
    tm.assert_index_equal(result.categories, expected_categories)
|
||||
|
||||
|
||||
def test_qcut_nas():
    """NaN inputs stay missing in the binned result."""
    data = np.random.default_rng(2).standard_normal(100)
    data[:20] = np.nan

    binned = qcut(data, 4)
    leading = binned[:20]
    assert isna(leading).all()
|
||||
|
||||
|
||||
def test_qcut_index():
    """``qcut`` on a plain list returns an ordered interval Categorical."""
    expected = Categorical(
        [Interval(-0.001, 1), Interval(1, 2)], ordered=True
    )
    result = qcut([0, 2], 2)
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
def test_qcut_binning_issues(datapath):
    """Bins from the ``cut_data.csv`` fixture are non-empty and do not overlap."""
    # see gh-1978, gh-1979
    csv_path = datapath(os.path.join("reshape", "data", "cut_data.csv"))
    values = np.loadtxt(csv_path)
    categories = qcut(values, 20).categories

    starts, ends = categories.left, categories.right
    # every interval has positive width ...
    assert (starts < ends).all()
    # ... and no interval starts after the previous one ends
    assert (starts[1:] <= ends[:-1]).all()
|
||||
|
||||
|
||||
def test_qcut_return_intervals():
    """Series input yields an ordered categorical Series of intervals."""
    values = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    res = qcut(values, [0, 0.333, 0.666, 1])

    levels = np.array(
        [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
    )
    # Three consecutive values fall into each of the three intervals.
    per_row = levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])
    exp = Series(per_row).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(res, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", ["foo", 1, True])
def test_qcut_incorrect_labels(labels):
    """Scalar ``labels`` other than ``False``/``None`` raise ``ValueError``."""
    # GH 13318
    expected_message = (
        "Bin labels must either be False, None or passed in as a list-like argument"
    )
    with pytest.raises(ValueError, match=expected_message):
        qcut(range(5), 4, labels=labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
def test_qcut_wrong_length_labels(labels):
    """Three labels for four bins raise ``ValueError``."""
    # GH 13318
    expected_message = "Bin labels must be one fewer than the number of bin edges"
    with pytest.raises(ValueError, match=expected_message):
        qcut(range(10), 4, labels=labels)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "labels, expected",
    [
        (["a", "b", "c"], ["a", "b", "c"]),
        (list(range(3)), [0, 1, 2]),
    ],
)
def test_qcut_list_like_labels(labels, expected):
    """List-like ``labels`` become the categories of the ordered result."""
    # GH 13318
    result = qcut(range(3), 3, labels=labels)
    expected_cat = Categorical(expected, ordered=True)
    tm.assert_categorical_equal(result, expected_cat)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"duplicates": "drop"}, None),
        ({}, "Bin edges must be unique"),
        ({"duplicates": "raise"}, "Bin edges must be unique"),
        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
    ],
)
def test_qcut_duplicates_bin(kwargs, msg):
    """Duplicate bin edges are dropped or rejected according to ``duplicates``."""
    # see gh-7751
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is None:
        # No error expected: the duplicated edge is dropped, leaving two bins.
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)
    else:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
    """A single quantile over constant data yields one bin around the value."""
    # see gh-15431
    ser = Series([data] * length)
    result = qcut(ser, 1, labels=labels)

    if labels is not None:
        # labels=False returns integer bin codes instead of intervals.
        expected = Series([0] * length, dtype=np.intp)
    else:
        intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
        expected = Series(intervals).astype(CategoricalDtype(ordered=True))

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ser",
    [
        DatetimeIndex(["20180101", NaT, "20180103"]),
        TimedeltaIndex(["0 days", NaT, "2 days"]),
    ],
    ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser, unit):
    """``NaT`` entries stay missing when binning datetime-like data."""
    # see gh-19768
    ser = Series(ser).dt.as_unit(unit)
    # one tick of the resolution under test
    td = Timedelta(1, unit=unit).as_unit(unit)

    first, last = ser[0], ser[2]
    left = Series([first - td, np.nan, last - Day()], dtype=ser.dtype)
    right = Series([last - Day(), np.nan, last], dtype=ser.dtype)
    intervals = IntervalIndex.from_arrays(left, right)
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(ser, 2)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
    """Timezone-aware datetimes are binned into timezone-aware intervals."""
    # see gh-19872
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz, unit="ns"))
    result = qcut(ser, bins)

    # Consecutive bin edges; the first one lies one nanosecond before the
    # first timestamp.
    edges = [
        Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
        Timestamp("2013-01-01 16:00:00", tz=tz),
        Timestamp("2013-01-02 08:00:00", tz=tz),
        Timestamp("2013-01-03 00:00:00", tz=tz),
    ]
    intervals = IntervalIndex(
        [
            Interval(edges[0], edges[1]),
            Interval(edges[1], edges[2]),
            Interval(edges[2], edges[3]),
        ]
    )
    expected = Series(intervals).astype(CategoricalDtype(ordered=True))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg,expected_bins",
    [
        [
            timedelta_range("1day", periods=3),
            TimedeltaIndex(["1 days", "2 days", "3 days"]),
        ],
        [
            date_range("20180101", periods=3),
            DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
        ],
    ],
)
def test_date_like_qcut_bins(arg, expected_bins, unit):
    """``retbins=True`` returns datetime-like bin edges in the input's unit."""
    # see gh-19891
    ser = Series(arg.as_unit(unit))
    expected_bins = expected_bins.as_unit(unit)

    _, result_bins = qcut(ser, 2, retbins=True)
    tm.assert_index_equal(result_bins, expected_bins)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
    "box, compare",
    [
        (Series, tm.assert_series_equal),
        (np.array, tm.assert_categorical_equal),
        (list, tm.assert_equal),
    ],
)
def test_qcut_bool_coercion_to_int(bins, box, compare):
    """Boolean input is binned exactly like the equivalent 0/1 integers."""
    # issue 20303
    as_ints = box([0, 1, 1, 0, 1] * 10)
    as_bools = box([False, True, True, False, True] * 10)

    expected = qcut(as_ints, bins, duplicates="drop")
    result = qcut(as_bools, bins, duplicates="drop")
    compare(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [2, 5, 10])
def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
    """Masked numeric arrays are binned like their float equivalent."""
    values = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
    # Blank out every other element.
    values[::2] = pd.NA

    expected = qcut(values.astype(float), q)
    result = qcut(values, q)
    tm.assert_categorical_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
@pytest.mark.parametrize("q", [3, 7, 9])
@pytest.mark.parametrize("precision", [1, 3, 16])
def test_qcut_contains(scale, q, precision):
    """Every input value lies inside the interval it was assigned to."""
    # GH-59355
    values = (scale * np.arange(q + 1)).round(precision)
    buckets = qcut(values, q, precision=precision)

    for value, bucket in zip(values, buckets):
        assert value in bucket
|
||||
@ -0,0 +1,369 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.concat import union_categoricals
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestUnionCategoricals:
    """Tests exercising ``union_categoricals``."""

    @pytest.mark.parametrize(
        "a, b, combined",
        [
            (list("abc"), list("abd"), list("abcabd")),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (
                ["b", "b", np.nan, "a"],
                ["a", np.nan, "c"],
                ["b", "b", np.nan, "a", "a", np.nan, "c"],
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05"),
                pd.date_range("2014-01-06", "2014-01-07"),
                pd.date_range("2014-01-01", "2014-01-07"),
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
            ),
            (
                pd.period_range("2014-01-01", "2014-01-05"),
                pd.period_range("2014-01-06", "2014-01-07"),
                pd.period_range("2014-01-01", "2014-01-07"),
            ),
        ],
    )
    @pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
    def test_union_categorical(self, a, b, combined, box):
        """Union of two boxed categoricals equals ``Categorical(combined)``."""
        # GH 13361
        left = box(Categorical(a))
        right = box(Categorical(b))
        result = union_categoricals([left, right])
        tm.assert_categorical_equal(result, Categorical(combined))

    def test_union_categorical_ordered_appearance(self):
        """Categories of the union follow order of appearance."""
        # new categories ordered by appearance
        appearance_order = ["x", "y", "z", "a", "b", "c"]
        first = Categorical(["x", "y", "z"])
        second = Categorical(["a", "b", "c"])

        result = union_categoricals([first, second])
        expected = Categorical(appearance_order, categories=appearance_order)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_ordered_true(self):
        """Two ordered inputs built from the same values give an ordered union."""
        values = [0, 1.2, 2]
        first = Categorical(values, ordered=True)
        second = Categorical(values, ordered=True)

        result = union_categoricals([first, second])
        expected = Categorical(values * 2, ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_match_types(self):
        """Mixing float and integer categories raises ``TypeError``."""
        # must exactly match types
        float_cat = Categorical([0, 1.2, 2])
        int_cat = Categorical([2, 3, 4])

        with pytest.raises(TypeError, match="dtype of categories must be the same"):
            union_categoricals([float_cat, int_cat])

    def test_union_categorical_empty(self):
        """An empty list of inputs raises ``ValueError``."""
        with pytest.raises(ValueError, match="No Categoricals to union"):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        """Missing values in the inputs are carried through the union."""
        # GH 13759
        result = union_categoricals(
            [Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
        )
        expected = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(result, expected)

        result = union_categoricals(
            [Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
        )
        expected = Categorical(["A", "B", "B", "B", np.nan])
        tm.assert_categorical_equal(result, expected)

        # Timestamps with NaT: categories keep order of first appearance.
        jan = pd.Timestamp("2011-01-01")
        feb = pd.Timestamp("2011-02-01")
        mar = pd.Timestamp("2011-03-01")
        stamps_a = [jan, mar, pd.NaT]
        stamps_b = [pd.NaT, jan, feb]

        result = union_categoricals([Categorical(stamps_a), Categorical(stamps_b)])
        expected = Categorical(stamps_a + stamps_b, categories=[jan, mar, feb])
        tm.assert_categorical_equal(result, expected)

        # all NaN
        only_nan = Categorical(np.array([np.nan, np.nan], dtype=object))
        only_x = Categorical(["X"], categories=pd.Index(["X"], dtype=object))
        result = union_categoricals([only_nan, only_x])
        expected = Categorical([np.nan, np.nan, "X"])
        tm.assert_categorical_equal(result, expected)

        result = union_categoricals(
            [Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
        )
        expected = Categorical([np.nan] * 4)
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("val", [[], ["1"]])
    def test_union_categoricals_empty(self, val, request, using_infer_string):
        """Union of an empty categorical with ``Categorical(val)`` equals the latter."""
        # GH 13759
        expect_failure = using_infer_string and val == ["1"]
        if expect_failure:
            request.applymarker(
                pytest.mark.xfail(
                    reason="TDOD(infer_string) object and strings dont match"
                )
            )

        result = union_categoricals([Categorical([]), Categorical(val)])
        tm.assert_categorical_equal(result, Categorical(val))

    def test_union_categorical_same_category(self):
        """Inputs sharing identical integer categories keep those categories."""
        # check fastpath
        cats = [1, 2, 3, 4]
        first = Categorical([1, 2, 3, 4], categories=cats)
        second = Categorical([3, 2, 1, np.nan], categories=cats)

        result = union_categoricals([first, second])
        expected = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=cats)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_same_category_str(self):
        """Inputs sharing identical string categories keep those categories."""
        cats = ["x", "y", "z"]
        first = Categorical(["z"] * 3, categories=cats)
        second = Categorical(["x"] * 3, categories=cats)

        result = union_categoricals([first, second])
        expected = Categorical(["z", "z", "z", "x", "x", "x"], categories=cats)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_same_categories_different_order(self):
        """Same category set in a different order: result uses the first order."""
        # https://github.com/pandas-dev/pandas/issues/19096
        letters = ["a", "b", "c"]
        first = Categorical(letters, categories=letters)
        second = Categorical(letters, categories=["b", "a", "c"])

        result = union_categoricals([first, second])
        expected = Categorical(letters * 2, categories=letters)
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        """Ordered inputs must agree on ``ordered`` and on their categories."""
        ordered_cat = Categorical([1, 2, 3], ordered=True)
        unordered_cat = Categorical([1, 2, 3], ordered=False)

        with pytest.raises(TypeError, match="Categorical.ordered must be the same"):
            union_categoricals([ordered_cat, unordered_cat])

        result = union_categoricals([ordered_cat, ordered_cat])
        expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # Same categories, one input containing a missing value.
        with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
        subset = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        result = union_categoricals([with_nan, subset])
        expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # Same category set but reversed order is rejected for ordered inputs.
        ascending = Categorical([1, 2, 3], ordered=True)
        descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([ascending, descending])

    def test_union_categoricals_ignore_order(self):
        """``ignore_order=True`` yields an unordered union where the default raises."""
        # GH 15219
        ordered_cat = Categorical([1, 2, 3], ordered=True)
        unordered_cat = Categorical([1, 2, 3], ordered=False)

        result = union_categoricals([ordered_cat, unordered_cat], ignore_order=True)
        expected = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(result, expected)

        with pytest.raises(TypeError, match="Categorical.ordered must be the same"):
            union_categoricals([ordered_cat, unordered_cat], ignore_order=False)

        result = union_categoricals([ordered_cat, ordered_cat], ignore_order=True)
        expected = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(result, expected)

        result = union_categoricals([ordered_cat, ordered_cat], ignore_order=False)
        expected = Categorical(
            [1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

        # Ordered inputs, one containing a missing value.
        with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
        subset = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        result = union_categoricals([with_nan, subset], ignore_order=True)
        expected = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(result, expected)

        # Ordered inputs whose categories are in opposite order.
        ascending = Categorical([1, 2, 3], ordered=True)
        descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        result = union_categoricals([ascending, descending], ignore_order=True)
        expected = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(result, expected)

        result = union_categoricals(
            [descending, ascending], ignore_order=True, sort_categories=True
        )
        expected = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(result, expected)

        # Ordered inputs with disjoint categories.
        low = Categorical([1, 2, 3], ordered=True)
        high = Categorical([4, 5, 6], ordered=True)

        result = union_categoricals([low, high], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([low, high], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([low, high])

    def test_union_categoricals_sort(self):
        """``sort_categories=True`` sorts the resulting categories."""
        # GH 13846
        first = Categorical(["x", "y", "z"])
        second = Categorical(["a", "b", "c"])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
        )
        tm.assert_categorical_equal(result, expected)

        # fastpath: identical categories on both inputs; the last ordering is
        # already sorted ("fastpath - skip resort").
        shared_orderings = (
            ["b", "a", "c"],
            ["c", "a", "b"],
            ["a", "b", "c"],
        )
        for cats in shared_orderings:
            first = Categorical(["a", "b"], categories=cats)
            second = Categorical(["b", "c"], categories=cats)
            result = union_categoricals([first, second], sort_categories=True)
            expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
            tm.assert_categorical_equal(result, expected)

        first = Categorical(["x", np.nan])
        second = Categorical([np.nan, "b"])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
        tm.assert_categorical_equal(result, expected)

        first = Categorical([np.nan])
        second = Categorical([np.nan])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        first = Categorical([])
        second = Categorical([])
        result = union_categoricals([first, second], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        # Sorting is refused for ordered inputs.
        first = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        second = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        msg = "Cannot use sort_categories=True with ordered Categoricals"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([first, second], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        """``sort_categories=False`` keeps categories in order of appearance."""
        # GH 13846
        appearance_order = ["x", "y", "z", "a", "b", "c"]
        first = Categorical(["x", "y", "z"])
        second = Categorical(["a", "b", "c"])

        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(appearance_order, categories=appearance_order)
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_fastpath(self):
        """Shared unsorted categories are kept as-is with ``sort_categories=False``."""
        # fastpath
        cats = ["b", "a", "c"]
        first = Categorical(["a", "b"], categories=cats)
        second = Categorical(["b", "c"], categories=cats)

        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=cats)
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_skipresort(self):
        """Shared, already-sorted categories are unchanged in the union."""
        # fastpath - skip resort
        cats = ["a", "b", "c"]
        first = Categorical(["a", "b"], categories=cats)
        second = Categorical(["b", "c"], categories=cats)

        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=cats)
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_one_nan(self):
        """Inputs containing NaN: categories follow appearance order, NaN excluded."""
        first = Categorical(["x", np.nan])
        second = Categorical([np.nan, "b"])

        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_sort_false_only_nan(self):
        """Union of two NaN-only categoricals with ``sort_categories=False``."""
        first = Categorical([np.nan])
        second = Categorical([np.nan])

        result = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(result, Categorical([np.nan, np.nan]))

    def test_union_categoricals_sort_false_empty(self):
        """Union of two empty categoricals with ``sort_categories=False``."""
        first = Categorical([])
        second = Categorical([])

        result = union_categoricals([first, second], sort_categories=False)
        tm.assert_categorical_equal(result, Categorical([]))

    def test_union_categoricals_sort_false_ordered_true(self):
        """Ordered inputs with shared categories stay ordered when not sorting."""
        cats = ["b", "a", "c"]
        first = Categorical(["b", "a"], categories=cats, ordered=True)
        second = Categorical(["a", "c"], categories=cats, ordered=True)

        result = union_categoricals([first, second], sort_categories=False)
        expected = Categorical(["b", "a", "a", "c"], categories=cats, ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        """Series / CategoricalIndex inputs are accepted; a plain list is not."""
        # GH 14173
        plain = Categorical(["a", "b"])
        as_series = Series(["b", "c"], dtype="category")
        expected = Categorical(["a", "b", "b", "c"])

        result = union_categoricals([plain, as_series])
        tm.assert_categorical_equal(result, expected)

        as_index = CategoricalIndex(as_series)
        result = union_categoricals([plain, as_index])
        tm.assert_categorical_equal(result, expected)

        plain_in_series = Series(plain)
        result = union_categoricals([plain_in_series, as_index])
        tm.assert_categorical_equal(result, expected)

        msg = "all components to combine must be Categorical"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([plain_in_series, ["a", "b", "c"]])
|
||||
Reference in New Issue
Block a user