Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/polars-cloud/remote-query.py
6940 views
1
"""
2
import polars_cloud as pc
3
4
# --8<-- [start:local]
5
import polars as pl
6
7
# Set up scans of the three local Parquet tables used by the query below.
customer, lineitem, orders = (
    pl.scan_parquet("data/customer.parquet"),
    pl.scan_parquet("data/lineitem.parquet"),
    pl.scan_parquet("data/orders.parquet"),
)
10
11
def pdsh_q3(customer, lineitem, orders):
    """Build the revenue-per-order query (query 3, going by the function name).

    Args:
        customer: Frame with at least ``c_mktsegment`` and ``c_custkey``.
        lineitem: Frame with at least ``l_orderkey``, ``l_shipdate``,
            ``l_extendedprice`` and ``l_discount``.
        orders: Frame with at least ``o_custkey``, ``o_orderkey``,
            ``o_orderdate`` and ``o_shippriority``.

    Returns:
        The query with columns ``l_orderkey``, ``revenue``, ``o_orderdate``
        and ``o_shippriority``, sorted by revenue (descending) and then by
        order date (ascending). Execution is left to the caller, which calls
        ``.collect()`` or ``.remote(...)`` on the result.
    """
    cutoff = pl.date(1995, 3, 15)
    revenue = (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("revenue")

    # Restrict to the BUILDING market segment before joining.
    building_customers = customer.filter(pl.col("c_mktsegment") == "BUILDING")

    # customer -> orders -> lineitem
    joined = building_customers.join(
        orders, left_on="c_custkey", right_on="o_custkey"
    ).join(lineitem, left_on="o_orderkey", right_on="l_orderkey")

    # Orders placed before the cutoff whose items shipped after it.
    in_window = joined.filter(pl.col("o_orderdate") < cutoff).filter(
        pl.col("l_shipdate") > cutoff
    )

    # Sum the discounted price per order.
    per_order = (
        in_window.with_columns(revenue)
        .group_by("o_orderkey", "o_orderdate", "o_shippriority")
        .agg(pl.sum("revenue"))
    )

    projected = per_order.select(
        pl.col("o_orderkey").alias("l_orderkey"),
        "revenue",
        "o_orderdate",
        "o_shippriority",
    )
    return projected.sort(by=["revenue", "o_orderdate"], descending=[True, False])
32
33
34
# Run the query against the local Parquet scans and materialize the result.
pdsh_q3(customer, lineitem, orders).collect()
35
36
# --8<-- [end:local]
37
38
# --8<-- [start:context]
39
import polars_cloud as pc
40
41
ctx = pc.ComputeContext(
    # make sure to enter your own workspace name
    workspace="your-workspace",
    # Requested resources; units are not stated here -- see the
    # ComputeContext documentation (TODO confirm memory is in GB).
    memory=16,
    cpus=12,
)

# Use a larger dataset available on S3
lineitem_sf10 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/lineitem.parquet",
    storage_options={"request_payer": "true"},
)
customer_sf10 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/customer.parquet",
    storage_options={"request_payer": "true"},
)
orders_sf10 = pl.scan_parquet(
    "s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/orders.parquet",
    storage_options={"request_payer": "true"},
)

# Your query remains the same. Arguments must follow the
# pdsh_q3(customer, lineitem, orders) parameter order.
pdsh_q3(customer_sf10, lineitem_sf10, orders_sf10).remote(context=ctx).show()
58
59
# --8<-- [end:context]
60
61
# --8<-- [start:sink_parquet]
62
# Replace the S3 url with your own to run the query successfully
63
64
# Arguments follow pdsh_q3's (customer, lineitem, orders) parameter order.
(
    pdsh_q3(customer_sf10, lineitem_sf10, orders_sf10)
    .remote(context=ctx)
    .sink_parquet("s3://your-bucket/processed-data/")
)
65
# --8<-- [end:sink_parquet]
66
67
# --8<-- [start:show]
68
# Arguments follow pdsh_q3's (customer, lineitem, orders) parameter order.
pdsh_q3(customer_sf10, lineitem_sf10, orders_sf10).remote(context=ctx).show()
69
70
# --8<-- [end:show]
71
72
# --8<-- [start:await_scan]
73
# Arguments follow pdsh_q3's (customer, lineitem, orders) parameter order.
result = (
    pdsh_q3(customer_sf10, lineitem_sf10, orders_sf10)
    .remote(context=ctx)
    .await_and_scan()
)

# await_and_scan() hands back something that still needs .collect()
# before it can be printed.
print(result.collect())
76
# --8<-- [end:await_scan]
77
"""
78
79