# Path: blob/main/py-polars/tests/unit/operations/test_is_first_last_distinct.py
# 6939 views
from __future__ import annotations12import datetime3from typing import TYPE_CHECKING, Any45import pytest67import polars as pl8from polars.exceptions import InvalidOperationError9from polars.testing import assert_frame_equal, assert_series_equal1011if TYPE_CHECKING:12from polars._typing import PolarsDataType131415def test_is_first_distinct() -> None:16lf = pl.LazyFrame({"a": [4, 1, 4]})17result = lf.select(pl.col("a").is_first_distinct()).collect()["a"]18expected = pl.Series("a", [True, True, False])19assert_series_equal(result, expected)202122def test_is_first_distinct_bool_bit_chunk_index_calc() -> None:23# The fast path activates on sizes >=64 and processes in chunks of 64-bits.24# It calculates the indexes using the bit counts, which needs to be from the25# correct side.26assert pl.arange(0, 64, eager=True).filter(27pl.Series([True] + 63 * [False]).is_first_distinct()28).to_list() == [0, 1]2930assert pl.arange(0, 64, eager=True).filter(31pl.Series([False] + 63 * [True]).is_first_distinct()32).to_list() == [0, 1]3334assert pl.arange(0, 64, eager=True).filter(35pl.Series(2 * [True] + 2 * [False] + 60 * [None]).is_first_distinct()36).to_list() == [0, 2, 4]3738assert pl.arange(0, 64, eager=True).filter(39pl.Series(2 * [False] + 2 * [None] + 60 * [True]).is_first_distinct()40).to_list() == [0, 2, 4]414243def test_is_first_distinct_struct() -> None:44lf = pl.LazyFrame({"a": [1, 2, 3, 2, None, 2, 1], "b": [0, 2, 3, 2, None, 2, 0]})45result = lf.select(pl.struct("a", "b").is_first_distinct())46expected = pl.LazyFrame({"a": [True, True, True, False, True, False, False]})47assert_frame_equal(result, expected)484950@pytest.mark.parametrize(51"data",52[53[[1, 2], [3], [1, 2], [4, None], [4, None], [], []],54[[True, None], [True], [True, None], [False], [False], [], []],55[[b"1", b"2"], [b"3"], [b"1", b"2"], [b"4", None], [b"4", None], [], []],56[["a", "b"], ["&"], ["a", "b"], ["...", None], ["...", None], [], []],57[58[datetime.date(2000, 10, 1), datetime.date(2001, 1, 
30)],59[datetime.date(1949, 10, 1)],60[datetime.date(2000, 10, 1), datetime.date(2001, 1, 30)],61[datetime.date(1998, 7, 1), None],62[datetime.date(1998, 7, 1), None],63[],64[],65],66],67)68def test_is_first_last_distinct_list(data: list[list[Any] | None]) -> None:69lf = pl.LazyFrame({"a": data})70result = lf.select(71first=pl.col("a").is_first_distinct(), last=pl.col("a").is_last_distinct()72)73expected = pl.LazyFrame(74{75"first": [True, True, False, True, False, True, False],76"last": [False, True, True, False, True, False, True],77}78)79assert_frame_equal(result, expected)808182def test_is_first_last_distinct_list_inner_nested() -> None:83df = pl.DataFrame({"a": [[[1, 2]], [[1, 2]]]})84err_msg = "only allowed if the inner type is not nested"85with pytest.raises(InvalidOperationError, match=err_msg):86df.select(pl.col("a").is_first_distinct())87with pytest.raises(InvalidOperationError, match=err_msg):88df.select(pl.col("a").is_last_distinct())899091def test_is_first_distinct_various() -> None:92# numeric93s = pl.Series([1, 1, None, 2, None, 3, 3])94expected = [True, False, True, True, False, True, False]95assert s.is_first_distinct().to_list() == expected96# str97s = pl.Series(["x", "x", None, "y", None, "z", "z"])98expected = [True, False, True, True, False, True, False]99assert s.is_first_distinct().to_list() == expected100# boolean101s = pl.Series([True, True, None, False, None, False, False])102expected = [True, False, True, True, False, False, False]103assert s.is_first_distinct().to_list() == expected104# struct105s = pl.Series(106[107{"x": 1, "y": 2},108{"x": 1, "y": 2},109None,110{"x": 2, "y": 1},111None,112{"x": 3, "y": 2},113{"x": 3, "y": 2},114]115)116expected = [True, False, True, True, False, True, False]117assert s.is_first_distinct().to_list() == expected118# list119s = pl.Series([[1, 2], [1, 2], None, [2, 3], None, [3, 4], [3, 4]])120expected = [True, False, True, True, False, True, False]121assert s.is_first_distinct().to_list() == 
expected122123124def test_is_last_distinct() -> None:125# numeric126s = pl.Series([1, 1, None, 2, None, 3, 3])127expected = [False, True, False, True, True, False, True]128assert s.is_last_distinct().to_list() == expected129# str130s = pl.Series(["x", "x", None, "y", None, "z", "z"])131expected = [False, True, False, True, True, False, True]132assert s.is_last_distinct().to_list() == expected133# boolean134s = pl.Series([True, True, None, False, None, False, False])135expected = [False, True, False, False, True, False, True]136assert s.is_last_distinct().to_list() == expected137# struct138s = pl.Series(139[140{"x": 1, "y": 2},141{"x": 1, "y": 2},142None,143{"x": 2, "y": 1},144None,145{"x": 3, "y": 2},146{"x": 3, "y": 2},147]148)149expected = [False, True, False, True, True, False, True]150assert s.is_last_distinct().to_list() == expected151152153@pytest.mark.parametrize("dtypes", [pl.Int32, pl.String, pl.Boolean, pl.List(pl.Int32)])154def test_is_first_last_distinct_all_null(dtypes: PolarsDataType) -> None:155s = pl.Series([None, None, None], dtype=dtypes)156assert s.is_first_distinct().to_list() == [True, False, False]157assert s.is_last_distinct().to_list() == [False, False, True]158159160