"""
Title: Deep Learning for Customer Lifetime Value
Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/)
Date created: 2024/11/23
Last modified: 2024/11/27
Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value.
Accelerator: None
"""

"""
## Introduction

This example builds a hybrid deep learning architecture that combines Transformer encoders
and LSTM networks to predict customer purchase patterns and lifetime value from transaction
history. While many existing review articles focus on classic parametric models and
traditional machine learning algorithms, this implementation leverages recent advancements
in Transformer-based models for time series prediction. The approach handles
multi-granularity prediction across different temporal scales.
"""

"""
## Setting up Libraries for the Deep Learning Project
"""
import subprocess


def install_packages(packages):
    """
    Install a list of packages using pip.

    Args:
        packages (list): A list of package names to install.
    """
    for package in packages:
        subprocess.run(["pip", "install", package], check=True)


"""
## List of Packages to Install

1. ucimlrepo: For the purposes of this tutorial, we will be using
   the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail).
2. keras_hub: Access to the Transformer encoder layer.
"""

packages_to_install = ["ucimlrepo", "keras_hub"]

# Install the packages
install_packages(packages_to_install)

# Core data processing and numerical libraries
import os

os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import pandas as pd
from typing import Dict

# Visualization
import matplotlib.pyplot as plt

# Keras imports
from keras import layers
from keras import Model
from keras import ops
from keras_hub.layers import TransformerEncoder
from keras import regularizers

# UK Retail Dataset
from ucimlrepo import fetch_ucirepo

"""
## Preprocessing the UK Retail dataset
"""


def prepare_time_series_data(data):
    """
    Preprocess retail transaction data for deep learning.

    Args:
        data: Raw transaction data containing InvoiceDate, UnitPrice, etc.
    Returns:
        Processed DataFrame with calculated features
    """
    processed_data = data.copy()

    # Essential datetime handling for temporal ordering
    processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"])

    # Basic business constraints and calculations
    processed_data = processed_data[processed_data["UnitPrice"] > 0]
    processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"]
    processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0)

    # Handle outliers in Amount using statistical thresholds
    q1 = processed_data["Amount"].quantile(0.25)
    q3 = processed_data["Amount"].quantile(0.75)

    # Define bounds - using 1.5 IQR rule
    lower_bound = q1 - 1.5 * (q3 - q1)
    upper_bound = q3 + 1.5 * (q3 - q1)

    # Filter outliers
    processed_data = processed_data[
        (processed_data["Amount"] >= lower_bound)
        & (processed_data["Amount"] <= upper_bound)
    ]

    return processed_data


# Load Data

online_retail = fetch_ucirepo(id=352)
raw_data = online_retail.data.features
transformed_data = prepare_time_series_data(raw_data)
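
"""
As a quick sanity check (a minimal sketch; the exact numbers depend on the dataset
snapshot returned by `fetch_ucirepo`), we can inspect the processed transactions:
"""

print(transformed_data.shape)
print(transformed_data[["InvoiceDate", "CustomerID", "Quantity", "Amount"]].head())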


def prepare_data_for_modeling(
    df: pd.DataFrame,
    input_sequence_length: int = 6,
    output_sequence_length: int = 6,
) -> Dict:
    """
    Transform retail data into sequence-to-sequence format with separate
    temporal and trend components.
    """
    df = df.copy()

    # Daily aggregation
    daily_purchases = (
        df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")])
        .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"})
        .reset_index()
    )

    daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0)

    # Monthly resampling
    monthly_purchases = (
        daily_purchases.set_index("InvoiceDate")
        .groupby("CustomerID")
        .resample("M")
        .agg(
            {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"}
        )
        .reset_index()
    )

    # Add cyclical temporal features
    def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray:

        month = input_window["InvoiceDate"].dt.month
        month_sin = np.sin(2 * np.pi * month / 12)
        month_cos = np.cos(2 * np.pi * month / 12)
        is_quarter_start = (month % 3 == 1).astype(int)

        temporal_features = np.column_stack(
            [
                month,
                input_window["InvoiceDate"].dt.year,
                month_sin,
                month_cos,
                is_quarter_start,
            ]
        )
        return temporal_features

    # Prepare trend features with lagged values
    def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray:

        lagged_data = pd.DataFrame()
        for i in range(1, lag + 1):
            lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i)
            lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i)
            lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i)

        lagged_data = lagged_data.fillna(0)

        trend_features = np.column_stack(
            [
                input_window["Amount"].values,
                input_window["Quantity"].values,
                input_window["frequency"].values,
                lagged_data.values,
            ]
        )
        return trend_features

    sequence_containers = {
        "temporal_sequences": [],
        "trend_sequences": [],
        "static_features": [],
        "output_sequences": [],
    }

    # Process sequences for each customer
    for customer_id, customer_data in monthly_purchases.groupby("CustomerID"):
        customer_data = customer_data.sort_values("InvoiceDate")
        sequence_ranges = (
            len(customer_data) - input_sequence_length - output_sequence_length + 1
        )

        country = customer_data["Country"].iloc[0]

        for i in range(sequence_ranges):
            input_window = customer_data.iloc[i : i + input_sequence_length]
            output_window = customer_data.iloc[
                i
                + input_sequence_length : i
                + input_sequence_length
                + output_sequence_length
            ]

            if (
                len(input_window) == input_sequence_length
                and len(output_window) == output_sequence_length
            ):
                temporal_features = prepare_temporal_features(input_window)
                trend_features = prepare_trend_features(input_window)

                sequence_containers["temporal_sequences"].append(temporal_features)
                sequence_containers["trend_sequences"].append(trend_features)
                sequence_containers["static_features"].append(country)
                sequence_containers["output_sequences"].append(
                    output_window["Amount"].values
                )

    return {
        "temporal_sequences": (
            np.array(sequence_containers["temporal_sequences"], dtype=np.float32)
        ),
        "trend_sequences": (
            np.array(sequence_containers["trend_sequences"], dtype=np.float32)
        ),
        "static_features": np.array(sequence_containers["static_features"]),
        "output_sequences": (
            np.array(sequence_containers["output_sequences"], dtype=np.float32)
        ),
    }


# Transform the data into input and output sequences, returned as a dictionary
output = prepare_data_for_modeling(
    df=transformed_data, input_sequence_length=6, output_sequence_length=6
)
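
"""
The prepared arrays can be inspected before modeling (a minimal sanity check; the
sequence counts depend on the dataset snapshot and the chosen window lengths):
"""

for name, array in output.items():
    print(name, array.shape)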

"""
## Scaling and Splitting
"""


def robust_scale(data):
    """
    Min-max scaling to the [0, 1] range, used here instead of standardization
    because the standard deviation of the data is high.
    """
    data = np.array(data)
    data_min = np.min(data)
    data_max = np.max(data)
    scaled = (data - data_min) / (data_max - data_min)
    return scaled


def create_temporal_splits_with_scaling(
    prepared_data: Dict[str, np.ndarray],
    test_ratio: float = 0.2,
    val_ratio: float = 0.2,
):
    total_sequences = len(prepared_data["trend_sequences"])
    # Calculate split points
    test_size = int(total_sequences * test_ratio)
    val_size = int(total_sequences * val_ratio)
    train_size = total_sequences - (test_size + val_size)

    # Scale trend sequences
    trend_shape = prepared_data["trend_sequences"].shape
    scaled_trends = np.zeros_like(prepared_data["trend_sequences"])

    # Scale each feature independently
    for i in range(trend_shape[-1]):
        scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i])
    # Scale output sequences
    scaled_outputs = robust_scale(prepared_data["output_sequences"])

    # Create splits
    train_data = {
        "trend_sequences": scaled_trends[:train_size],
        "temporal_sequences": prepared_data["temporal_sequences"][:train_size],
        "static_features": prepared_data["static_features"][:train_size],
        "output_sequences": scaled_outputs[:train_size],
    }

    val_data = {
        "trend_sequences": scaled_trends[train_size : train_size + val_size],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size : train_size + val_size
        ],
        "static_features": prepared_data["static_features"][
            train_size : train_size + val_size
        ],
        "output_sequences": scaled_outputs[train_size : train_size + val_size],
    }

    test_data = {
        "trend_sequences": scaled_trends[train_size + val_size :],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size + val_size :
        ],
        "static_features": prepared_data["static_features"][train_size + val_size :],
        "output_sequences": scaled_outputs[train_size + val_size :],
    }

    return train_data, val_data, test_data


# Usage
train_data, val_data, test_data = create_temporal_splits_with_scaling(output)
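
# Quick check of the split sizes (counts depend on the dataset snapshot)
for name, split in [("train", train_data), ("val", val_data), ("test", test_data)]:
    print(name, split["trend_sequences"].shape[0], "sequences")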

"""
## Evaluation
"""


def calculate_metrics(y_true, y_pred):
    """
    Calculates RMSE, MAE and R²
    """
    # Convert inputs to float32
    y_true = ops.cast(y_true, dtype="float32")
    y_pred = ops.cast(y_pred, dtype="float32")

    # RMSE
    rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))

    # R² (coefficient of determination)
    ss_res = np.sum(np.square(y_true - y_pred))
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
    r2 = 1 - (ss_res / ss_tot)

    return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2}


def plot_lorenz_analysis(y_true, y_pred):
    """
    Plots Lorenz curves to show the distribution of high- and low-value users
    """
    # Convert to numpy arrays and flatten
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Sort values in descending order (for high-value users analysis)
    true_sorted = np.sort(-y_true)
    pred_sorted = np.sort(-y_pred)

    # Calculate cumulative sums
    true_cumsum = np.cumsum(true_sorted)
    pred_cumsum = np.cumsum(pred_sorted)

    # Normalize to percentages
    true_cumsum_pct = true_cumsum / true_cumsum[-1]
    pred_cumsum_pct = pred_cumsum / pred_cumsum[-1]

    # Generate percentiles for x-axis
    percentiles = np.linspace(0, 1, len(y_true))

    # Calculate Mutual Gini (area between curves)
    mutual_gini = np.abs(
        np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles)
    )

    # Create plot
    plt.figure(figsize=(10, 6))
    plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values")
    plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values")
    plt.xlabel("Cumulative % of Users (Descending Order)")
    plt.ylabel("Cumulative % of LTV")
    plt.title("Lorenz Curves: True vs Predicted Values")
    plt.legend()
    plt.grid(True)
    print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)")
    plt.show()

    return mutual_gini
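
"""
As a quick illustration of how `calculate_metrics` behaves (a toy example with
made-up numbers, not model output):
"""

toy_true = np.array([1.0, 2.0, 3.0, 4.0])
toy_pred = np.array([1.1, 1.9, 3.2, 3.9])
print(calculate_metrics(toy_true, toy_pred))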


"""
## Hybrid Transformer / LSTM model architecture

The hybrid nature of this model is particularly significant because it combines the
RNN's ability to handle sequential data with the Transformer's attention mechanisms
for capturing global patterns across countries and seasonality.
"""


def build_hybrid_model(
    input_sequence_length: int,
    output_sequence_length: int,
    num_countries: int,
    d_model: int = 8,
    num_heads: int = 4,
):

    keras.utils.set_random_seed(seed=42)

    # Inputs
    temporal_inputs = layers.Input(
        shape=(input_sequence_length, 5), name="temporal_inputs"
    )
    trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs")
    country_inputs = layers.Input(
        shape=(num_countries,), dtype="int32", name="country_inputs"
    )

    # Process country features
    country_embedding = layers.Embedding(
        input_dim=num_countries,
        output_dim=d_model,
        mask_zero=False,
        name="country_embedding",
    )(
        country_inputs
    )  # Output shape: (batch_size, num_countries, d_model)

    # Flatten the embedding output
    country_embedding = layers.Flatten(name="flatten_country_embedding")(
        country_embedding
    )

    # Repeat the country embedding across timesteps
    country_embedding_repeated = layers.RepeatVector(
        input_sequence_length, name="repeat_country_embedding"
    )(country_embedding)

    # Projection of temporal inputs to match Transformer dimensions
    temporal_projection = layers.Dense(
        d_model, activation="tanh", name="temporal_projection"
    )(temporal_inputs)

    # Combine all features
    combined_features = layers.Concatenate()(
        [temporal_projection, country_embedding_repeated]
    )

    transformer_output = combined_features
    for _ in range(3):
        transformer_output = TransformerEncoder(
            intermediate_dim=16, num_heads=num_heads
        )(transformer_output)

    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)

    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
        transformer_output
    )
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)
    # Concatenate flattened Transformer output with LSTM output
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
        [transformer_flattened, lstm_output]
    )
    # Repeat the merged features to match the output sequence length
    decoder_initial = layers.RepeatVector(
        output_sequence_length, name="repeat_merged_features"
    )(merged_features)

    decoder_lstm = layers.LSTM(
        units=64,
        return_sequences=True,
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    )(decoder_initial)

    # Output Dense layer
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
        decoder_lstm
    )

    model = Model(
        inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="mse",
        metrics=["mse"],
    )

    return model


# Create the hybrid model
model = build_hybrid_model(
    input_sequence_length=6,
    output_sequence_length=6,
    num_countries=len(np.unique(train_data["static_features"])) + 1,
    d_model=8,
    num_heads=4,
)
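
# Inspect the resulting architecture and parameter counts
model.summary()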

# Configure StringLookup
label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1)

# Adapt and encode
label_encoder.adapt(train_data["static_features"])

train_static_encoded = label_encoder(train_data["static_features"])
val_static_encoded = label_encoder(val_data["static_features"])
test_static_encoded = label_encoder(test_data["static_features"])

# Convert sequences with proper type casting
x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32)
x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32)
x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32)
x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32)
train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32)
val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32)
test_output = np.asarray(test_data["output_sequences"]).astype(np.float32)
# Training setup
keras.utils.set_random_seed(seed=42)

history = model.fit(
    [x_train_temporal, x_train_seq, train_static_encoded],
    train_outputs,
    validation_data=(
        [x_val_temporal, x_val_seq, val_static_encoded],
        val_data["output_sequences"].astype(np.float32),
    ),
    epochs=20,
    batch_size=30,
)

# Make predictions
predictions = model.predict(
    [
        test_data["temporal_sequences"].astype(np.float32),
        test_data["trend_sequences"].astype(np.float32),
        test_static_encoded,
    ]
)

# Remove the singleton feature dimension from the predictions
predictions = np.squeeze(predictions)

# Calculate basic metrics
hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions)
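
# Display the aggregate error metrics (exact values depend on the training run)
print(hybrid_metrics)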

# Plot Lorenz curves and get Mutual Gini
hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions)

"""
## Conclusion

LSTMs excel at sequence-to-sequence learning, as demonstrated by Sutskever, I., Vinyals, O.,
& Le, Q. V. (2014), *Sequence to Sequence Learning with Neural Networks*, and the hybrid
approach here builds on that foundation. The addition of attention mechanisms allows the
model to adaptively focus on relevant temporal and geographical patterns while retaining
the LSTM's inherent strengths in sequence learning. This combination has proven especially
effective for handling both periodic patterns and special events in time series forecasting,
as shown by Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021),
*Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting*.
"""