Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_regex.py
6939 views
1
from __future__ import annotations
2
3
from pathlib import Path
4
5
import pytest
6
7
import polars as pl
8
from polars.exceptions import SQLSyntaxError
9
10
11
@pytest.fixture
12
def foods_ipc_path() -> Path:
13
return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
14
15
16
@pytest.mark.parametrize(
17
("regex_op", "expected"),
18
[
19
("RLIKE", [0, 3]),
20
("REGEXP", [0, 3]),
21
("NOT RLIKE", [1, 2, 4]),
22
("NOT REGEXP", [1, 2, 4]),
23
],
24
)
25
def test_regex_expr_match(regex_op: str, expected: list[int]) -> None:
26
# note: the REGEXP and RLIKE operators can also use another
27
# column/expression as the source of the match pattern
28
df = pl.DataFrame(
29
{
30
"idx": [0, 1, 2, 3, 4],
31
"str": ["ABC", "abc", "000", "A0C", "a0c"],
32
"pat": ["^A", "^A", "^A", r"[AB]\d.*$", ".*xxx$"],
33
}
34
)
35
with pl.SQLContext(df=df, eager=True) as ctx:
36
out = ctx.execute(f"SELECT idx, str FROM df WHERE str {regex_op} pat")
37
assert out.to_series().to_list() == expected
38
39
40
@pytest.mark.parametrize(
41
("op", "pattern", "expected"),
42
[
43
("~", "^veg", "vegetables"),
44
("~", "^VEG", None),
45
("~*", "^VEG", "vegetables"),
46
("!~", "(t|s)$", "seafood"),
47
("!~*", "(T|S)$", "seafood"),
48
("!~*", "^.E", "fruit"),
49
("!~*", "[aeiOU]", None),
50
("RLIKE", "^veg", "vegetables"),
51
("RLIKE", "^VEG", None),
52
("RLIKE", "(?i)^VEG", "vegetables"),
53
("NOT RLIKE", "(t|s)$", "seafood"),
54
("NOT RLIKE", "(?i)(T|S)$", "seafood"),
55
("NOT RLIKE", "(?i)^.E", "fruit"),
56
("NOT RLIKE", "(?i)[aeiOU]", None),
57
("REGEXP", "^veg", "vegetables"),
58
("REGEXP", "^VEG", None),
59
("REGEXP", "(?i)^VEG", "vegetables"),
60
("NOT REGEXP", "(t|s)$", "seafood"),
61
("NOT REGEXP", "(?i)(T|S)$", "seafood"),
62
("NOT REGEXP", "(?i)^.E", "fruit"),
63
("NOT REGEXP", "(?i)[aeiOU]", None),
64
],
65
)
66
def test_regex_operators(
67
foods_ipc_path: Path, op: str, pattern: str, expected: str | None
68
) -> None:
69
lf = pl.scan_ipc(foods_ipc_path)
70
71
with pl.SQLContext(foods=lf, eager=True) as ctx:
72
out = ctx.execute(
73
f"""
74
SELECT DISTINCT category FROM foods
75
WHERE category {op} '{pattern}'
76
"""
77
)
78
assert out.rows() == ([(expected,)] if expected else [])
79
80
81
def test_regex_operators_error() -> None:
82
df = pl.LazyFrame({"sval": ["ABC", "abc", "000", "A0C", "a0c"]})
83
with pl.SQLContext(df=df, eager=True) as ctx:
84
with pytest.raises(
85
SQLSyntaxError, match="invalid pattern for '~' operator: dyn .*12345"
86
):
87
ctx.execute("SELECT * FROM df WHERE sval ~ 12345")
88
with pytest.raises(
89
SQLSyntaxError,
90
match=r"""invalid pattern for '!~\*' operator: col\("abcde"\)""",
91
):
92
ctx.execute("SELECT * FROM df WHERE sval !~* abcde")
93
94
95
@pytest.mark.parametrize(
96
("not_", "pattern", "flags", "expected"),
97
[
98
("", "^veg", None, "vegetables"),
99
("", "^VEG", None, None),
100
("", "(?i)^VEG", None, "vegetables"),
101
("NOT", "(t|s)$", None, "seafood"),
102
("NOT", "T|S$", "i", "seafood"),
103
("NOT", "^.E", "i", "fruit"),
104
("NOT", "[aeiOU]", "i", None),
105
],
106
)
107
def test_regexp_like(
108
foods_ipc_path: Path,
109
not_: str,
110
pattern: str,
111
flags: str | None,
112
expected: str | None,
113
) -> None:
114
lf = pl.scan_ipc(foods_ipc_path)
115
flags = "" if flags is None else f",'{flags}'"
116
with pl.SQLContext(foods=lf, eager=True) as ctx:
117
out = ctx.execute(
118
f"""
119
SELECT DISTINCT category FROM foods
120
WHERE {not_} REGEXP_LIKE(category,'{pattern}'{flags})
121
"""
122
)
123
assert out.rows() == ([(expected,)] if expected else [])
124
125
126
def test_regexp_like_errors() -> None:
127
with pl.SQLContext(df=pl.DataFrame({"scol": ["xyz"]})) as ctx:
128
with pytest.raises(
129
SQLSyntaxError,
130
match="invalid/empty 'flags' for REGEXP_LIKE",
131
):
132
ctx.execute("SELECT * FROM df WHERE REGEXP_LIKE(scol,'[x-z]+','')")
133
134
with pytest.raises(
135
SQLSyntaxError,
136
match="invalid arguments for REGEXP_LIKE",
137
):
138
ctx.execute("SELECT * FROM df WHERE REGEXP_LIKE(scol,999,999)")
139
140
with pytest.raises(
141
SQLSyntaxError,
142
match=r"REGEXP_LIKE expects 2-3 arguments \(found 1\)",
143
):
144
ctx.execute("SELECT * FROM df WHERE REGEXP_LIKE(scol)")
145
146