CoCalc -- test_lazy

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_lazy_ipc.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
import io
4
from typing import TYPE_CHECKING, Any
5

6
import pyarrow.ipc
7
import pytest
8

9
import polars as pl
10
from polars.interchange.protocol import CompatLevel
11
from polars.testing.asserts.frame import assert_frame_equal
12

13
if TYPE_CHECKING:
14
    from pathlib import Path
15

16

17
@pytest.fixture
18
def foods_ipc_path(io_files_path: Path) -> Path:
19
    return io_files_path / "foods1.ipc"
20

21

22
def test_row_index(foods_ipc_path: Path) -> None:
23
    df = pl.read_ipc(foods_ipc_path, row_index_name="row_index", use_pyarrow=False)
24
    assert df["row_index"].to_list() == list(range(27))
25

26
    df = (
27
        pl.scan_ipc(foods_ipc_path, row_index_name="row_index")
28
        .filter(pl.col("category") == pl.lit("vegetables"))
29
        .collect()
30
    )
31

32
    assert df["row_index"].to_list() == [0, 6, 11, 13, 14, 20, 25]
33

34
    df = (
35
        pl.scan_ipc(foods_ipc_path, row_index_name="row_index")
36
        .with_row_index("foo", 10)
37
        .filter(pl.col("category") == pl.lit("vegetables"))
38
        .collect()
39
    )
40

41
    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
42

43

44
def test_is_in_type_coercion(foods_ipc_path: Path) -> None:
45
    out = (
46
        pl.scan_ipc(foods_ipc_path)
47
        .filter(pl.col("category").is_in(("vegetables", "ice cream")))
48
        .collect()
49
    )
50
    assert out.shape == (7, 4)
51
    out = (
52
        pl.scan_ipc(foods_ipc_path)
53
        .select(pl.col("category").alias("cat"))
54
        .filter(pl.col("cat").is_in(["vegetables"]))
55
        .collect()
56
    )
57
    assert out.shape == (7, 1)
58

59

60
def test_row_index_schema(foods_ipc_path: Path) -> None:
61
    assert (
62
        pl.scan_ipc(foods_ipc_path, row_index_name="id")
63
        .select(["id", "category"])
64
        .collect()
65
    ).dtypes == [pl.UInt32, pl.String]
66

67

68
def test_glob_n_rows(io_files_path: Path) -> None:
69
    file_path = io_files_path / "foods*.ipc"
70
    df = pl.scan_ipc(file_path, n_rows=40).collect()
71

72
    # 27 rows from foods1.ipc and 13 from foods2.ipc
73
    assert df.shape == (40, 4)
74

75
    # take first and last rows
76
    assert df[[0, 39]].to_dict(as_series=False) == {
77
        "category": ["vegetables", "seafood"],
78
        "calories": [45, 146],
79
        "fats_g": [0.5, 6.0],
80
        "sugars_g": [2, 2],
81
    }
82

83

84
def test_ipc_list_arg(io_files_path: Path) -> None:
85
    first = io_files_path / "foods1.ipc"
86
    second = io_files_path / "foods2.ipc"
87

88
    df = pl.scan_ipc(source=[first, second]).collect()
89
    assert df.shape == (54, 4)
90
    assert df.row(-1) == ("seafood", 194, 12.0, 1)
91
    assert df.row(0) == ("vegetables", 45, 0.5, 2)
92

93

94
def test_scan_ipc_local_with_async(
95
    monkeypatch: Any,
96
    io_files_path: Path,
97
) -> None:
98
    monkeypatch.setenv("POLARS_VERBOSE", "1")
99
    monkeypatch.setenv("POLARS_FORCE_ASYNC", "1")
100

101
    assert_frame_equal(
102
        pl.scan_ipc(io_files_path / "foods1.ipc").head(1).collect(),
103
        pl.DataFrame(
104
            {
105
                "category": ["vegetables"],
106
                "calories": [45],
107
                "fats_g": [0.5],
108
                "sugars_g": [2],
109
            }
110
        ),
111
    )
112

113

114
def test_sink_ipc_compat_level_22930() -> None:
115
    df = pl.DataFrame({"a": ["foo"]})
116

117
    f1 = io.BytesIO()
118
    f2 = io.BytesIO()
119

120
    df.lazy().sink_ipc(f1, compat_level=CompatLevel.oldest(), engine="in-memory")
121
    df.lazy().sink_ipc(f2, compat_level=CompatLevel.oldest(), engine="streaming")
122

123
    f1.seek(0)
124
    f2.seek(0)
125

126
    t1 = pyarrow.ipc.open_file(f1)
127
    assert "large_string" in str(t1.schema)
128
    assert_frame_equal(pl.DataFrame(t1.read_all()), df)
129

130
    t2 = pyarrow.ipc.open_file(f2)
131
    assert "large_string" in str(t2.schema)
132
    assert_frame_equal(pl.DataFrame(t2.read_all()), df)
133

134

135
def test_scan_file_info_cache(
136
    capfd: Any, monkeypatch: Any, foods_ipc_path: Path
137
) -> None:
138
    monkeypatch.setenv("POLARS_VERBOSE", "1")
139
    a = pl.scan_ipc(foods_ipc_path)
140
    b = pl.scan_ipc(foods_ipc_path)
141

142
    a.join(b, how="cross").explain()
143

144
    captured = capfd.readouterr().err
145
    assert "FILE_INFO CACHE HIT" in captured
146

147
Product

Resources

Company