Path: blob/main/docs/source/src/python/user-guide/expressions/lists.py
7890 views
# Example script for the list/array expressions user guide.
#
# The `# --8<-- [start:NAME]` / `# --8<-- [end:NAME]` comments delimit named
# sections of this file (presumably consumed as snippet markers by the
# documentation build -- keep them byte-identical). Explanatory comments are
# placed between an `[end:...]` and the next `[start:...]` marker so that the
# marked sections themselves stay unchanged.
#
# The sections share module-level names (`pl`, `df`, `weather`, `result`) and
# therefore rely on being executed top to bottom in this order.

# --8<-- [start:list-example]
from datetime import datetime
import polars as pl

df = pl.DataFrame(
    {
        "names": [
            ["Anne", "Averill", "Adams"],
            ["Brandon", "Brooke", "Borden", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        "children_ages": [
            [5, 7],
            [],
            [],
            [8, 11, 18],
        ],
        "medical_appointments": [
            [],
            [],
            [],
            [datetime(2022, 5, 22, 16, 30)],
        ],
    }
)

print(df)
# --8<-- [end:list-example]

# Same-length nested lists combined with an explicit `pl.Array` schema:
# a width-5 boolean array and a (3, 3) string array per row.
# --8<-- [start:array-example]
df = pl.DataFrame(
    {
        "bit_flags": [
            [True, True, True, True, False],
            [False, True, True, True, True],
        ],
        "tic_tac_toe": [
            [
                [" ", "x", "o"],
                [" ", "x", " "],
                ["o", "x", " "],
            ],
            [
                ["o", "x", "x"],
                [" ", "o", "x"],
                [" ", " ", "o"],
            ],
        ],
    },
    schema={
        "bit_flags": pl.Array(pl.Boolean, 5),
        "tic_tac_toe": pl.Array(pl.String, (3, 3)),
    },
)

print(df)
# --8<-- [end:array-example]

# Prints the dtype `pl.Series` assigns to a 4D NumPy array. The NumPy import
# lives inside the section on purpose so the section is self-contained.
# --8<-- [start:numpy-array-inference]
import numpy as np

array = np.arange(0, 120).reshape((5, 2, 3, 4))  # 4D array

print(pl.Series(array).dtype)  # Column with the 3D subarrays
# --8<-- [end:numpy-array-inference]

# Five stations; each station's readings are one space-separated string in
# which some tokens are not numbers (e.g. "E1").
# --8<-- [start:weather]
weather = pl.DataFrame(
    {
        "station": [f"Station {idx}" for idx in range(1, 6)],
        "temperatures": [
            "20 5 5 E1 7 13 19 9 6 20",
            "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
            "19 24 E9 16 6 12 10 22",
            "E2 E0 15 7 8 10 E1 24 17 13 6",
            "14 8 E0 16 22 24 E1",
        ],
    }
)

print(weather)
# --8<-- [end:weather]

# Rebinds `weather`: the "temperatures" strings are split on single spaces.
# Every later weather section works on this split version.
# --8<-- [start:split]
weather = weather.with_columns(
    pl.col("temperatures").str.split(" "),
)
print(weather)
# --8<-- [end:split]

# --8<-- [start:explode]
result = weather.explode("temperatures")
print(result)
# --8<-- [end:explode]

# Adds three columns derived from "temperatures": `list.head(3)`,
# `list.tail(3)` and `list.slice(-3, 2)`.
# --8<-- [start:list-slicing]
result = weather.with_columns(
    pl.col("temperatures").list.head(3).alias("head"),
    pl.col("temperatures").list.tail(3).alias("tail"),
    pl.col("temperatures").list.slice(-3, 2).alias("two_next_to_last"),
)
print(result)
# --8<-- [end:list-slicing]

# Per list: non-strict cast of every element to Int64, flag the elements that
# are null afterwards, and sum the flags into an "errors" column.
# --8<-- [start:element-wise-casting]
result = weather.with_columns(
    pl.col("temperatures")
    .list.eval(pl.element().cast(pl.Int64, strict=False).is_null())
    .list.sum()
    .alias("errors"),
)
print(result)
# --8<-- [end:element-wise-casting]

# Builds the same "errors" column with a case-insensitive letter regex and
# prints whether it equals `result` from the section directly above, so the
# two sections must stay adjacent and in this order.
# --8<-- [start:element-wise-regex]
result2 = weather.with_columns(
    pl.col("temperatures")
    .list.eval(pl.element().str.contains("(?i)[a-z]"))
    .list.sum()
    .alias("errors"),
)
print(result.equals(result2))
# --8<-- [end:element-wise-regex]

# Rebinds `df`: one "children" column whose rows are lists of
# {"name", "age"} dicts.
# --8<-- [start:children]
df = pl.DataFrame(
    {
        "children": [
            [
                {"name": "Anne", "age": 5},
                {"name": "Averill", "age": 7},
            ],
            [
                {"name": "Brandon", "age": 12},
                {"name": "Brooke", "age": 9},
                {"name": "Branson", "age": 11},
            ],
            [{"name": "Camila", "age": 19}],
            [
                {"name": "Dennis", "age": 8},
                {"name": "Doyle", "age": 11},
                {"name": "Dina", "age": 18},
            ],
        ],
    }
)

print(df)
# --8<-- [end:children]

# Names ordered by descending age, plus min/max age, all through `list.eval`.
# --8<-- [start:list-sorting]
result = df.select(
    pl.col("children")
    .list.eval(
        pl.element()
        .sort_by(pl.element().struct.field("age"), descending=True)
        .struct.field("name")
    )
    .alias("names_by_age"),
    pl.col("children")
    .list.eval(pl.element().struct.field("age").min())
    .alias("min_age"),
    pl.col("children")
    .list.eval(pl.element().struct.field("age").max())
    .alias("max_age"),
)
print(result)
# --8<-- [end:list-sorting]

# Same query as the previous section, except that min/max age go through
# `list.agg` instead of `list.eval`.
# --8<-- [start:list-aggregation]
result = df.select(
    pl.col("children")
    .list.eval(
        pl.element()
        .sort_by(pl.element().struct.field("age"), descending=True)
        .struct.field("name")
    )
    .alias("names_by_age"),
    pl.col("children")
    .list.agg(pl.element().struct.field("age").min())
    .alias("min_age"),
    pl.col("children")
    .list.agg(pl.element().struct.field("age").max())
    .alias("max_age"),
)
print(result)
# --8<-- [end:list-aggregation]

# Adds an "age_entropy" column computed with `entropy()` inside `list.agg`.
# --8<-- [start:list-entropy]
result = df.with_columns(
    pl.col("children")
    .list.agg(pl.element().struct.field("age").entropy())
    .alias("age_entropy"),
)
print(result)
# --8<-- [end:list-entropy]

# Ten stations with one integer column per day.
# --8<-- [start:weather_by_day]
weather_by_day = pl.DataFrame(
    {
        "station": [f"Station {idx}" for idx in range(1, 11)],
        "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    }
)
print(weather_by_day)
# --8<-- [end:weather_by_day]

# Gathers the day columns of each row into one list, then evaluates the
# reusable `rank_pct` expression (descending rank / element count, rounded to
# two decimals) on that list via `list.eval`.
# --8<-- [start:rank_pct]
rank_pct = (pl.element().rank(descending=True) / pl.element().count()).round(2)

result = weather_by_day.with_columns(
    # create the list of homogeneous data
    pl.concat_list(pl.all().exclude("station")).alias("all_temps")
).select(
    # select all columns except the intermediate list
    pl.all().exclude("all_temps"),
    # compute the rank by calling `list.eval`
    pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"),
)

print(result)
# --8<-- [end:rank_pct]

# Rebinds `df` with two `pl.Array` columns and applies several `arr`
# namespace operations to them (join, sort, max, sum, contains).
# --8<-- [start:array-overview]
df = pl.DataFrame(
    {
        "first_last": [
            ["Anne", "Adams"],
            ["Brandon", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        "fav_numbers": [
            [42, 0, 1],
            [2, 3, 5],
            [13, 21, 34],
            [73, 3, 7],
        ],
    },
    schema={
        "first_last": pl.Array(pl.String, 2),
        "fav_numbers": pl.Array(pl.Int32, 3),
    },
)

result = df.select(
    pl.col("first_last").arr.join(" ").alias("name"),
    pl.col("fav_numbers").arr.sort(),
    pl.col("fav_numbers").arr.max().alias("largest_fav"),
    pl.col("fav_numbers").arr.sum().alias("summed"),
    pl.col("fav_numbers").arr.contains(3).alias("likes_3"),
)
print(result)
# --8<-- [end:array-overview]