Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_is_first_last_distinct.py
6939 views
1
from __future__ import annotations
2
3
import datetime
4
from typing import TYPE_CHECKING, Any
5
6
import pytest
7
8
import polars as pl
9
from polars.exceptions import InvalidOperationError
10
from polars.testing import assert_frame_equal, assert_series_equal
11
12
if TYPE_CHECKING:
13
from polars._typing import PolarsDataType
14
15
16
def test_is_first_distinct() -> None:
    """Check ``is_first_distinct`` on a lazily evaluated integer column."""
    first_mask = pl.col("a").is_first_distinct()
    collected = pl.LazyFrame({"a": [4, 1, 4]}).select(first_mask).collect()
    # The trailing 4 repeats the value at index 0, so it is not flagged.
    assert_series_equal(collected["a"], pl.Series("a", [True, True, False]))
21
22
23
def test_is_first_distinct_bool_bit_chunk_index_calc() -> None:
    """Check first-distinct indexes for 64-element boolean series."""
    # The fast path activates on sizes >=64 and processes in chunks of 64-bits.
    # It calculates the indexes using the bit counts, which needs to be from the
    # correct side.
    cases: list[tuple[list[Any], list[int]]] = [
        ([True] + 63 * [False], [0, 1]),
        ([False] + 63 * [True], [0, 1]),
        (2 * [True] + 2 * [False] + 60 * [None], [0, 2, 4]),
        (2 * [False] + 2 * [None] + 60 * [True], [0, 2, 4]),
    ]
    for values, expected_indexes in cases:
        mask = pl.Series(values).is_first_distinct()
        # Filtering 0..63 by the mask yields the positions flagged as first.
        flagged_positions = pl.arange(0, 64, eager=True).filter(mask)
        assert flagged_positions.to_list() == expected_indexes
42
43
44
def test_is_first_distinct_struct() -> None:
    """Check ``is_first_distinct`` on a struct built from two lazy columns."""
    source = pl.LazyFrame(
        {
            "a": [1, 2, 3, 2, None, 2, 1],
            "b": [0, 2, 3, 2, None, 2, 0],
        }
    )
    combined = pl.struct("a", "b")
    flagged = source.select(combined.is_first_distinct())
    # Rows 3 and 5 repeat row 1, and row 6 repeats row 0.
    assert_frame_equal(
        flagged,
        pl.LazyFrame({"a": [True, True, True, False, True, False, False]}),
    )
49
50
51
@pytest.mark.parametrize(
    "data",
    [
        [[1, 2], [3], [1, 2], [4, None], [4, None], [], []],
        [[True, None], [True], [True, None], [False], [False], [], []],
        [[b"1", b"2"], [b"3"], [b"1", b"2"], [b"4", None], [b"4", None], [], []],
        [["a", "b"], ["&"], ["a", "b"], ["...", None], ["...", None], [], []],
        [
            [datetime.date(2000, 10, 1), datetime.date(2001, 1, 30)],
            [datetime.date(1949, 10, 1)],
            [datetime.date(2000, 10, 1), datetime.date(2001, 1, 30)],
            [datetime.date(1998, 7, 1), None],
            [datetime.date(1998, 7, 1), None],
            [],
            [],
        ],
    ],
)
def test_is_first_last_distinct_list(data: list[list[Any] | None]) -> None:
    """Check first/last-distinct flags on list columns of several inner types.

    Every parametrized input has the same duplicate layout: row 2 repeats
    row 0, row 4 repeats row 3, and row 6 repeats row 5.
    """
    column = pl.col("a")
    flags = pl.LazyFrame({"a": data}).select(
        first=column.is_first_distinct(),
        last=column.is_last_distinct(),
    )
    want = pl.LazyFrame(
        {
            "first": [True, True, False, True, False, True, False],
            "last": [False, True, True, False, True, False, True],
        }
    )
    assert_frame_equal(flags, want)
81
82
83
def test_is_first_last_distinct_list_inner_nested() -> None:
    """Both operations must raise on a list column whose items are lists."""
    nested = pl.DataFrame({"a": [[[1, 2]], [[1, 2]]]})
    for operation in (pl.Expr.is_first_distinct, pl.Expr.is_last_distinct):
        # Build and evaluate the expression inside the context so that any
        # part of the call may be the one that raises.
        with pytest.raises(
            InvalidOperationError,
            match="only allowed if the inner type is not nested",
        ):
            nested.select(operation(pl.col("a")))
90
91
92
def test_is_first_distinct_various() -> None:
    """Check ``Series.is_first_distinct`` across several dtypes with nulls.

    In each input the first null is flagged as distinct and the second is not.
    """
    # Numeric, string, struct and list inputs share one duplicate layout:
    # indexes 1, 4 and 6 repeat an earlier value.
    shared_flags = [True, False, True, True, False, True, False]

    numeric = pl.Series([1, 1, None, 2, None, 3, 3])
    assert numeric.is_first_distinct().to_list() == shared_flags

    strings = pl.Series(["x", "x", None, "y", None, "z", "z"])
    assert strings.is_first_distinct().to_list() == shared_flags

    # Booleans have only three possible values (True, None, False), so the
    # tail of the input consists of repeats.
    booleans = pl.Series([True, True, None, False, None, False, False])
    boolean_flags = [True, False, True, True, False, False, False]
    assert booleans.is_first_distinct().to_list() == boolean_flags

    structs = pl.Series(
        [
            {"x": 1, "y": 2},
            {"x": 1, "y": 2},
            None,
            {"x": 2, "y": 1},
            None,
            {"x": 3, "y": 2},
            {"x": 3, "y": 2},
        ]
    )
    assert structs.is_first_distinct().to_list() == shared_flags

    lists = pl.Series([[1, 2], [1, 2], None, [2, 3], None, [3, 4], [3, 4]])
    assert lists.is_first_distinct().to_list() == shared_flags
123
124
125
def test_is_last_distinct() -> None:
    """Check ``Series.is_last_distinct`` across several dtypes with nulls.

    Covers the same dtypes as ``test_is_first_distinct_various``: numeric,
    string, boolean, struct and list. In each input the last null is flagged
    as distinct and the earlier one is not.
    """
    # numeric
    s = pl.Series([1, 1, None, 2, None, 3, 3])
    expected = [False, True, False, True, True, False, True]
    assert s.is_last_distinct().to_list() == expected
    # str
    s = pl.Series(["x", "x", None, "y", None, "z", "z"])
    expected = [False, True, False, True, True, False, True]
    assert s.is_last_distinct().to_list() == expected
    # boolean
    s = pl.Series([True, True, None, False, None, False, False])
    expected = [False, True, False, False, True, False, True]
    assert s.is_last_distinct().to_list() == expected
    # struct
    s = pl.Series(
        [
            {"x": 1, "y": 2},
            {"x": 1, "y": 2},
            None,
            {"x": 2, "y": 1},
            None,
            {"x": 3, "y": 2},
            {"x": 3, "y": 2},
        ]
    )
    expected = [False, True, False, True, True, False, True]
    assert s.is_last_distinct().to_list() == expected
    # list (mirrors the list case of the first-distinct test)
    s = pl.Series([[1, 2], [1, 2], None, [2, 3], None, [3, 4], [3, 4]])
    expected = [False, True, False, True, True, False, True]
    assert s.is_last_distinct().to_list() == expected
152
153
154
@pytest.mark.parametrize("dtypes", [pl.Int32, pl.String, pl.Boolean, pl.List(pl.Int32)])
def test_is_first_last_distinct_all_null(dtypes: PolarsDataType) -> None:
    """In an all-null series only the first (or last) null is flagged."""
    all_null = pl.Series([None, None, None], dtype=dtypes)

    first_flags = all_null.is_first_distinct().to_list()
    assert first_flags == [True, False, False]

    last_flags = all_null.is_last_distinct().to_list()
    assert last_flags == [False, False, True]
159
160