Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/polars-cloud/distributed.py
8336 views
1
"""
2
# --8<-- [start:setup]
3
import polars as pl
4
import polars_cloud as pc
5
6
# Scans over the three sample tables used by the query below. Each scan passes
# `request_payer` through `storage_options`.
# NOTE(review): presumably the sample bucket is requester-pays — confirm.
lineitem_sf100 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/lineitem/*.parquet",
    storage_options={"request_payer": "true"},
)
customer_sf100 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/customer/*.parquet",
    storage_options={"request_payer": "true"},
)
orders_sf100 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/orders/*.parquet",
    storage_options={"request_payer": "true"},
)
12
13
# --8<-- [end:setup]
14
15
# --8<-- [start:query]
16
def pdsh_q3(customer, lineitem, orders):
    """Build the per-order revenue query over customer, orders and lineitem.

    Keeps customers whose `c_mktsegment` is "BUILDING", joins them to their
    orders and those orders' line items, retains orders dated before
    1995-03-15 whose line items shipped after that date, and sums the
    discounted extended price per order.

    Args:
        customer: Frame providing `c_mktsegment` and `c_custkey`.
        lineitem: Frame providing `l_orderkey`, `l_shipdate`,
            `l_extendedprice` and `l_discount`.
        orders: Frame providing `o_custkey`, `o_orderkey`, `o_orderdate`
            and `o_shippriority`.

    Returns:
        A frame with columns `l_orderkey`, `revenue`, `o_orderdate` and
        `o_shippriority`, sorted by `revenue` descending and then by
        `o_orderdate` ascending.
    """
    cutoff = pl.date(1995, 3, 15)
    discounted_price = pl.col("l_extendedprice") * (1 - pl.col("l_discount"))

    building_customers = customer.filter(pl.col("c_mktsegment") == "BUILDING")

    # Customer -> orders -> line items.
    with_orders = building_customers.join(
        orders, left_on="c_custkey", right_on="o_custkey"
    )
    with_lineitems = with_orders.join(
        lineitem, left_on="o_orderkey", right_on="l_orderkey"
    )

    # Ordered before the cutoff, shipped after it.
    ordered_before = with_lineitems.filter(pl.col("o_orderdate") < cutoff)
    shipped_after = ordered_before.filter(pl.col("l_shipdate") > cutoff)

    with_revenue = shipped_after.with_columns(discounted_price.alias("revenue"))

    revenue_per_order = with_revenue.group_by(
        "o_orderkey", "o_orderdate", "o_shippriority"
    ).agg(pl.sum("revenue"))

    # Expose the order key under its line-item name and fix the column order.
    projected = revenue_per_order.select(
        pl.col("o_orderkey").alias("l_orderkey"),
        "revenue",
        "o_orderdate",
        "o_shippriority",
    )

    return projected.sort(by=["revenue", "o_orderdate"], descending=[True, False])
37
38
# --8<-- [end:query]
39
40
# --8<-- [start:context-run]
41
# Compute context the query is submitted to. "your-workspace" is a placeholder
# to replace with a real workspace name.
ctx = pc.ComputeContext(workspace="your-workspace", cpus=4, memory=4, cluster_size=5)

# The method chain is wrapped in parentheses so it can span several lines:
# a line that begins with `.` outside brackets is a SyntaxError in Python.
(
    pdsh_q3(customer_sf100, lineitem_sf100, orders_sf100)
    .remote(ctx)
    .distributed()
    .show()
)
47
48
# --8<-- [end:context-run]
49
"""
50
51