"""
Title: Deep Learning for Customer Lifetime Value
Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/)
Date created: 2024/11/23
Last modified: 2024/11/27
Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value.
Accelerator: None
"""

"""
## Introduction

This example builds a hybrid deep learning architecture that combines Transformer encoders
and LSTM networks to predict customer purchase patterns and lifetime value from transaction
history. While many existing review articles focus on classic parametric models and
traditional machine learning algorithms, this implementation leverages recent advancements
in Transformer-based models for time series prediction. The approach handles
multi-granularity prediction across different temporal scales.
"""

"""
## Setting up Libraries for the Deep Learning Project
"""
import subprocess


def install_packages(packages):
    """
    Install a list of packages using pip.

    Args:
        packages (list): A list of package names to install.
    """
    for package in packages:
        subprocess.run(["pip", "install", package], check=True)


"""
## List of Packages to Install

1. ucimlrepo: For the purposes of this tutorial, we will be using
   the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail).
2. keras_hub: Access to the Transformer encoder layer.
"""

packages_to_install = ["ucimlrepo", "keras_hub"]

# Install the packages
install_packages(packages_to_install)

# Core data processing and numerical libraries
import os

os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import pandas as pd
from typing import Dict

# Visualization
import matplotlib.pyplot as plt

# Keras imports
from keras import layers
from keras import Model
from keras import ops
from keras_hub.layers import TransformerEncoder
from keras import regularizers

# UK Retail Dataset
from ucimlrepo import fetch_ucirepo

"""
## Preprocessing the UK Retail dataset
"""


def prepare_time_series_data(data):
    """
    Preprocess retail transaction data for deep learning.

    Args:
        data: Raw transaction data containing InvoiceDate, UnitPrice, etc.
    Returns:
        Processed DataFrame with calculated features
    """
    processed_data = data.copy()

    # Essential datetime handling for temporal ordering
    processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"])

    # Basic business constraints and calculations
    processed_data = processed_data[processed_data["UnitPrice"] > 0]
    processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"]
    processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0)

    # Handle outliers in Amount using statistical thresholds
    q1 = processed_data["Amount"].quantile(0.25)
    q3 = processed_data["Amount"].quantile(0.75)

    # Define bounds - using 1.5 IQR rule
    lower_bound = q1 - 1.5 * (q3 - q1)
    upper_bound = q3 + 1.5 * (q3 - q1)

    # Filter outliers
    processed_data = processed_data[
        (processed_data["Amount"] >= lower_bound)
        & (processed_data["Amount"] <= upper_bound)
    ]

    return processed_data


# Load Data

online_retail = fetch_ucirepo(id=352)
raw_data = online_retail.data.features
transformed_data = prepare_time_series_data(raw_data)
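
"""
As a quick sanity check (a minimal sketch; the exact numbers depend on the dataset
snapshot returned by `fetch_ucirepo`), we can inspect the processed transactions:
"""

print(transformed_data.shape)
print(transformed_data[["InvoiceDate", "CustomerID", "Quantity", "Amount"]].head())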


def prepare_data_for_modeling(
    df: pd.DataFrame,
    input_sequence_length: int = 6,
    output_sequence_length: int = 6,
) -> Dict:
    """
    Transform retail data into sequence-to-sequence format with separate
    temporal and trend components.
    """
    df = df.copy()

    # Daily aggregation
    daily_purchases = (
        df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")])
        .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"})
        .reset_index()
    )

    daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0)

    # Monthly resampling
    monthly_purchases = (
        daily_purchases.set_index("InvoiceDate")
        .groupby("CustomerID")
        .resample("M")
        .agg(
            {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"}
        )
        .reset_index()
    )

    # Add cyclical temporal features
    def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray:

        month = input_window["InvoiceDate"].dt.month
        month_sin = np.sin(2 * np.pi * month / 12)
        month_cos = np.cos(2 * np.pi * month / 12)
        is_quarter_start = (month % 3 == 1).astype(int)

        temporal_features = np.column_stack(
            [
                month,
                input_window["InvoiceDate"].dt.year,
                month_sin,
                month_cos,
                is_quarter_start,
            ]
        )
        return temporal_features

    # Prepare trend features with lagged values
    def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray:

        lagged_data = pd.DataFrame()
        for i in range(1, lag + 1):
            lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i)
            lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i)
            lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i)

        lagged_data = lagged_data.fillna(0)

        trend_features = np.column_stack(
            [
                input_window["Amount"].values,
                input_window["Quantity"].values,
                input_window["frequency"].values,
                lagged_data.values,
            ]
        )
        return trend_features

    sequence_containers = {
        "temporal_sequences": [],
        "trend_sequences": [],
        "static_features": [],
        "output_sequences": [],
    }

    # Process sequences for each customer
    for customer_id, customer_data in monthly_purchases.groupby("CustomerID"):
        customer_data = customer_data.sort_values("InvoiceDate")
        sequence_ranges = (
            len(customer_data) - input_sequence_length - output_sequence_length + 1
        )

        country = customer_data["Country"].iloc[0]

        for i in range(sequence_ranges):
            input_window = customer_data.iloc[i : i + input_sequence_length]
            output_window = customer_data.iloc[
                i
                + input_sequence_length : i
                + input_sequence_length
                + output_sequence_length
            ]

            if (
                len(input_window) == input_sequence_length
                and len(output_window) == output_sequence_length
            ):
                temporal_features = prepare_temporal_features(input_window)
                trend_features = prepare_trend_features(input_window)

                sequence_containers["temporal_sequences"].append(temporal_features)
                sequence_containers["trend_sequences"].append(trend_features)
                sequence_containers["static_features"].append(country)
                sequence_containers["output_sequences"].append(
                    output_window["Amount"].values
                )

    return {
        "temporal_sequences": (
            np.array(sequence_containers["temporal_sequences"], dtype=np.float32)
        ),
        "trend_sequences": (
            np.array(sequence_containers["trend_sequences"], dtype=np.float32)
        ),
        "static_features": np.array(sequence_containers["static_features"]),
        "output_sequences": (
            np.array(sequence_containers["output_sequences"], dtype=np.float32)
        ),
    }


# Transform the data into input and output sequences, returned as a dictionary
output = prepare_data_for_modeling(
    df=transformed_data, input_sequence_length=6, output_sequence_length=6
)
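
"""
The prepared arrays can be inspected before modeling (a minimal sanity check; the
sequence counts depend on the dataset snapshot and the chosen window lengths):
"""

for name, array in output.items():
    print(name, array.shape)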

"""
## Scaling and Splitting
"""


def robust_scale(data):
    """
    Min-max scaling to the [0, 1] range, used here instead of standardization
    because the standard deviation of the data is high.
    """
    data = np.array(data)
    data_min = np.min(data)
    data_max = np.max(data)
    scaled = (data - data_min) / (data_max - data_min)
    return scaled


def create_temporal_splits_with_scaling(
    prepared_data: Dict[str, np.ndarray],
    test_ratio: float = 0.2,
    val_ratio: float = 0.2,
):
    total_sequences = len(prepared_data["trend_sequences"])
    # Calculate split points
    test_size = int(total_sequences * test_ratio)
    val_size = int(total_sequences * val_ratio)
    train_size = total_sequences - (test_size + val_size)

    # Scale trend sequences
    trend_shape = prepared_data["trend_sequences"].shape
    scaled_trends = np.zeros_like(prepared_data["trend_sequences"])

    # Scale each feature independently
    for i in range(trend_shape[-1]):
        scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i])
    # Scale output sequences
    scaled_outputs = robust_scale(prepared_data["output_sequences"])

    # Create splits
    train_data = {
        "trend_sequences": scaled_trends[:train_size],
        "temporal_sequences": prepared_data["temporal_sequences"][:train_size],
        "static_features": prepared_data["static_features"][:train_size],
        "output_sequences": scaled_outputs[:train_size],
    }

    val_data = {
        "trend_sequences": scaled_trends[train_size : train_size + val_size],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size : train_size + val_size
        ],
        "static_features": prepared_data["static_features"][
            train_size : train_size + val_size
        ],
        "output_sequences": scaled_outputs[train_size : train_size + val_size],
    }

    test_data = {
        "trend_sequences": scaled_trends[train_size + val_size :],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size + val_size :
        ],
        "static_features": prepared_data["static_features"][train_size + val_size :],
        "output_sequences": scaled_outputs[train_size + val_size :],
    }

    return train_data, val_data, test_data


# Usage
train_data, val_data, test_data = create_temporal_splits_with_scaling(output)
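
# Quick check of the split sizes (counts depend on the dataset snapshot)
for name, split in [("train", train_data), ("val", val_data), ("test", test_data)]:
    print(name, split["trend_sequences"].shape[0], "sequences")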

"""
## Evaluation
"""


def calculate_metrics(y_true, y_pred):
    """
    Calculates RMSE, MAE and R²
    """
    # Convert inputs to float32
    y_true = ops.cast(y_true, dtype="float32")
    y_pred = ops.cast(y_pred, dtype="float32")

    # RMSE
    rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))

    # R² (coefficient of determination)
    ss_res = np.sum(np.square(y_true - y_pred))
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
    r2 = 1 - (ss_res / ss_tot)

    return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2}


def plot_lorenz_analysis(y_true, y_pred):
    """
    Plots Lorenz curves to show the distribution of high- and low-value users
    """
    # Convert to numpy arrays and flatten
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Sort values in descending order (for high-value users analysis)
    true_sorted = np.sort(-y_true)
    pred_sorted = np.sort(-y_pred)

    # Calculate cumulative sums
    true_cumsum = np.cumsum(true_sorted)
    pred_cumsum = np.cumsum(pred_sorted)

    # Normalize to percentages
    true_cumsum_pct = true_cumsum / true_cumsum[-1]
    pred_cumsum_pct = pred_cumsum / pred_cumsum[-1]

    # Generate percentiles for x-axis
    percentiles = np.linspace(0, 1, len(y_true))

    # Calculate Mutual Gini (area between curves)
    mutual_gini = np.abs(
        np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles)
    )

    # Create plot
    plt.figure(figsize=(10, 6))
    plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values")
    plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values")
    plt.xlabel("Cumulative % of Users (Descending Order)")
    plt.ylabel("Cumulative % of LTV")
    plt.title("Lorenz Curves: True vs Predicted Values")
    plt.legend()
    plt.grid(True)
    print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)")
    plt.show()

    return mutual_gini
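
"""
As a quick illustration of how `calculate_metrics` behaves (a toy example with
made-up numbers, not model output):
"""

toy_true = np.array([1.0, 2.0, 3.0, 4.0])
toy_pred = np.array([1.1, 1.9, 3.2, 3.9])
print(calculate_metrics(toy_true, toy_pred))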


"""
## Hybrid Transformer / LSTM model architecture

The hybrid nature of this model is particularly significant because it combines the
RNN's ability to handle sequential data with the Transformer's attention mechanisms
for capturing global patterns across countries and seasonality.
"""


def build_hybrid_model(
    input_sequence_length: int,
    output_sequence_length: int,
    num_countries: int,
    d_model: int = 8,
    num_heads: int = 4,
):

    keras.utils.set_random_seed(seed=42)

    # Inputs
    temporal_inputs = layers.Input(
        shape=(input_sequence_length, 5), name="temporal_inputs"
    )
    trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs")
    country_inputs = layers.Input(
        shape=(num_countries,), dtype="int32", name="country_inputs"
    )

    # Process country features
    country_embedding = layers.Embedding(
        input_dim=num_countries,
        output_dim=d_model,
        mask_zero=False,
        name="country_embedding",
    )(
        country_inputs
    )  # Output shape: (batch_size, num_countries, d_model)

    # Flatten the embedding output
    country_embedding = layers.Flatten(name="flatten_country_embedding")(
        country_embedding
    )

    # Repeat the country embedding across timesteps
    country_embedding_repeated = layers.RepeatVector(
        input_sequence_length, name="repeat_country_embedding"
    )(country_embedding)

    # Projection of temporal inputs to match Transformer dimensions
    temporal_projection = layers.Dense(
        d_model, activation="tanh", name="temporal_projection"
    )(temporal_inputs)

    # Combine all features
    combined_features = layers.Concatenate()(
        [temporal_projection, country_embedding_repeated]
    )

    transformer_output = combined_features
    for _ in range(3):
        transformer_output = TransformerEncoder(
            intermediate_dim=16, num_heads=num_heads
        )(transformer_output)

    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)

    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
        transformer_output
    )
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)
    # Concatenate flattened Transformer output with LSTM output
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
        [transformer_flattened, lstm_output]
    )
    # Repeat the merged features to match the output sequence length
    decoder_initial = layers.RepeatVector(
        output_sequence_length, name="repeat_merged_features"
    )(merged_features)

    decoder_lstm = layers.LSTM(
        units=64,
        return_sequences=True,
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    )(decoder_initial)

    # Output Dense layer
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
        decoder_lstm
    )

    model = Model(
        inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="mse",
        metrics=["mse"],
    )

    return model


# Create the hybrid model
model = build_hybrid_model(
    input_sequence_length=6,
    output_sequence_length=6,
    num_countries=len(np.unique(train_data["static_features"])) + 1,
    d_model=8,
    num_heads=4,
)
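
# Inspect the resulting architecture and parameter counts
model.summary()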

# Configure StringLookup
label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1)

# Adapt and encode
label_encoder.adapt(train_data["static_features"])

train_static_encoded = label_encoder(train_data["static_features"])
val_static_encoded = label_encoder(val_data["static_features"])
test_static_encoded = label_encoder(test_data["static_features"])

# Convert sequences with proper type casting
x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32)
x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32)
x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32)
x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32)
train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32)
val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32)
test_output = np.asarray(test_data["output_sequences"]).astype(np.float32)
# Training setup
keras.utils.set_random_seed(seed=42)

history = model.fit(
    [x_train_temporal, x_train_seq, train_static_encoded],
    train_outputs,
    validation_data=(
        [x_val_temporal, x_val_seq, val_static_encoded],
        val_data["output_sequences"].astype(np.float32),
    ),
    epochs=20,
    batch_size=30,
)

# Make predictions
predictions = model.predict(
    [
        test_data["temporal_sequences"].astype(np.float32),
        test_data["trend_sequences"].astype(np.float32),
        test_static_encoded,
    ]
)

# Remove the singleton feature dimension from the predictions
predictions = np.squeeze(predictions)

# Calculate basic metrics
hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions)
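
# Display the aggregate error metrics (exact values depend on the training run)
print(hybrid_metrics)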

# Plot Lorenz curves and get Mutual Gini
hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions)

"""
## Conclusion

LSTMs excel at sequence-to-sequence learning, as demonstrated by Sutskever, I., Vinyals, O.,
& Le, Q. V. (2014), *Sequence to Sequence Learning with Neural Networks*, and the hybrid
approach here builds on that foundation. The addition of attention mechanisms allows the
model to adaptively focus on relevant temporal and geographical patterns while retaining
the LSTM's inherent strengths in sequence learning. This combination has proven especially
effective for handling both periodic patterns and special events in time series forecasting,
as shown by Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021),
*Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting*.
"""