CoCalc -- intro.py

GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/sql/intro.py
⁸³³⁸ views
1
# --8<-- [start:setup]
2
import polars as pl
3

4
# --8<-- [end:setup]
5

6
# --8<-- [start:context]
7
ctx = pl.SQLContext()
8
# --8<-- [end:context]
9

10
# --8<-- [start:register_context]
11
df = pl.DataFrame({"a": [1, 2, 3]})
12
lf = pl.LazyFrame({"b": [4, 5, 6]})
13

14
# Register all dataframes in the global namespace: registers both "df" and "lf"
15
ctx = pl.SQLContext(register_globals=True)
16

17
# Register an explicit mapping of identifier name to frame
18
ctx = pl.SQLContext(frames={"table_one": df, "table_two": lf})
19

20
# Register frames using kwargs; dataframe df as "df" and lazyframe lf as "lf"
21
ctx = pl.SQLContext(df=df, lf=lf)
22
# --8<-- [end:register_context]
23

24
# --8<-- [start:register_pandas]
25
import pandas as pd
26

27
df_pandas = pd.DataFrame({"c": [7, 8, 9]})
28
ctx = pl.SQLContext(df_pandas=pl.from_pandas(df_pandas))
29
# --8<-- [end:register_pandas]
30

31
# --8<-- [start:execute]
32
# For local files use scan_csv instead
33
pokemon = pl.read_csv(
34
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
35
)
36
with pl.SQLContext(register_globals=True, eager=True) as ctx:
37
    df_small = ctx.execute("SELECT * from pokemon LIMIT 5")
38
    print(df_small)
39
# --8<-- [end:execute]
40

41
# --8<-- [start:prepare_multiple_sources]
42
with open("docs/assets/data/products_categories.json", "w") as temp_file:
43
    json_data = """{"product_id": 1, "category": "Category 1"}
44
{"product_id": 2, "category": "Category 1"}
45
{"product_id": 3, "category": "Category 2"}
46
{"product_id": 4, "category": "Category 2"}
47
{"product_id": 5, "category": "Category 3"}"""
48

49
    temp_file.write(json_data)
50

51
with open("docs/assets/data/products_masterdata.csv", "w") as temp_file:
52
    csv_data = """product_id,product_name
53
1,Product A
54
2,Product B
55
3,Product C
56
4,Product D
57
5,Product E"""
58

59
    temp_file.write(csv_data)
60

61
sales_data = pd.DataFrame(
62
    {
63
        "product_id": [1, 2, 3, 4, 5],
64
        "sales": [100, 200, 150, 250, 300],
65
    }
66
)
67
# --8<-- [end:prepare_multiple_sources]
68

69
# --8<-- [start:execute_multiple_sources]
70
# Input data:
71
# products_masterdata.csv with schema {'product_id': Int64, 'product_name': String}
72
# products_categories.json with schema {'product_id': Int64, 'category': String}
73
# sales_data is a Pandas DataFrame with schema {'product_id': Int64, 'sales': Int64}
74

75
with pl.SQLContext(
76
    products_masterdata=pl.scan_csv("docs/assets/data/products_masterdata.csv"),
77
    products_categories=pl.scan_ndjson("docs/assets/data/products_categories.json"),
78
    sales_data=pl.from_pandas(sales_data),
79
    eager=True,
80
) as ctx:
81
    query = """
82
    SELECT
83
        product_id,
84
        product_name,
85
        category,
86
        sales
87
    FROM
88
        products_masterdata
89
    LEFT JOIN products_categories USING (product_id)
90
    LEFT JOIN sales_data USING (product_id)
91
    """
92
    print(ctx.execute(query))
93
# --8<-- [end:execute_multiple_sources]
94

95
# --8<-- [start:clean_multiple_sources]
96
import os
97

98
os.remove("docs/assets/data/products_categories.json")
99
os.remove("docs/assets/data/products_masterdata.csv")
100
# --8<-- [end:clean_multiple_sources]
101

102
Product

Resources

Company