Path: blob/main/docs/source/src/python/polars-cloud/remote-query.py
6940 views
"""1import polars_cloud as pc23# --8<-- [start:local]4import polars as pl56customer = pl.scan_parquet("data/customer.parquet")7lineitem = pl.scan_parquet("data/lineitem.parquet")8orders = pl.scan_parquet("data/orders.parquet")910def pdsh_q3(customer, lineitem, orders):1112return (13customer.filter(pl.col("c_mktsegment") == "BUILDING")14.join(orders, left_on="c_custkey", right_on="o_custkey")15.join(lineitem, left_on="o_orderkey", right_on="l_orderkey")16.filter(pl.col("o_orderdate") < pl.date(1995, 3, 15))17.filter(pl.col("l_shipdate") > pl.date(1995, 3, 15))18.with_columns(19(pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("revenue")20)21.group_by("o_orderkey", "o_orderdate", "o_shippriority")22.agg(pl.sum("revenue"))23.select(24pl.col("o_orderkey").alias("l_orderkey"),25"revenue",26"o_orderdate",27"o_shippriority",28)29.sort(by=["revenue", "o_orderdate"], descending=[True, False])30)313233pdsh_q3(customer, lineitem, orders).collect()3435# --8<-- [end:local]3637# --8<-- [start:context]38import polars_cloud as pc3940ctx = pc.ComputeContext(41# make sure to enter your own workspace name42workspace="your-workspace",43memory=16,44cpus=12,45)4647# Use a larger dataset available on S348lineitem_sf10 = pl.scan_parquet("s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/lineitem.parquet",49storage_options={"request_payer": "true"})50customer_sf10 = pl.scan_parquet("s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/customer.parquet",51storage_options={"request_payer": "true"})52orders_sf10 = pl.scan_parquet("s3://polars-cloud-samples-us-east-2-dev/pdsh/sf10/orders.parquet",53storage_options={"request_payer": "true"})5455# Your query remains the same56pdsh_q3(lineitem_sf10, customer_sf10, orders_sf10).remote(context=ctx).show()5758# --8<-- [end:context]5960# --8<-- [start:sink_parquet]61# Replace the S3 url with your own to run the query successfully6263pdsh_q3(lineitem_sf10, customer_sf10, orders_sf10).remote(context=ctx).sink_parquet("s3://your-bucket/processed-data/")64# --8<-- [end:sink_parquet]6566# --8<-- [start:show]67pdsh_q3(lineitem_sf10, customer_sf10, orders_sf10).remote(context=ctx).show()6869# --8<-- [end:show]7071# --8<-- [start:await_scan]72result = pdsh_q3(lineitem_sf10, customer_sf10, orders_sf10).remote(context=ctx).await_and_scan()7374print(result.collect())75# --8<-- [end:await_scan]76"""777879