"""1Title: Deep Learning for Customer Lifetime Value2Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/)3Date created: 2024/11/234Last modified: 2024/11/275Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value.6Accelerator: None7"""89"""10## Introduction1112A hybrid deep learning architecture combining Transformer encoders and LSTM networks13for predicting customer purchase patterns and lifetime value using transaction history.14While many existing review articles focus on classic parametric models and traditional machine learning algorithms15,this implementation leverages recent advancements in Transformer-based models for time series prediction.16The approach handles multi-granularity prediction across different temporal scales.1718"""1920"""21## Setting up Libraries for the Deep Learning Project22"""23import subprocess242526def install_packages(packages):27"""28Install a list of packages using pip.2930Args:31packages (list): A list of package names to install.32"""33for package in packages:34subprocess.run(["pip", "install", package], check=True)353637"""38## List of Packages to Install39401. uciml: For the purpose of the tutorial; we will be using41the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail)422. keras_hub: Access to the transformer encoder layer.43"""4445packages_to_install = ["ucimlrepo", "keras_hub"]4647# Install the packages48install_packages(packages_to_install)4950# Core data processing and numerical libraries51import os5253os.environ["KERAS_BACKEND"] = "jax"54import keras55import numpy as np56import pandas as pd57from typing import Dict5859# Visualization60import matplotlib.pyplot as plt6162# Keras imports63from keras import layers64from keras import Model65from keras import ops66from keras_hub.layers import TransformerEncoder67from keras import regularizers6869# UK Retail Dataset70from ucimlrepo import fetch_ucirepo7172"""73## Preprocessing the UK Retail dataset74"""757677def prepare_time_series_data(data):78"""79Preprocess retail transaction data for deep learning.8081Args:82data: Raw transaction data containing InvoiceDate, UnitPrice, etc.83Returns:84Processed DataFrame with calculated features85"""86processed_data = data.copy()8788# Essential datetime handling for temporal ordering89processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"])9091# Basic business constraints and calculations92processed_data = processed_data[processed_data["UnitPrice"] > 0]93processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"]94processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0)9596# Handle outliers in Amount using statistical thresholds97q1 = processed_data["Amount"].quantile(0.25)98q3 = processed_data["Amount"].quantile(0.75)99100# Define bounds - using 1.5 IQR rule101lower_bound = q1 - 1.5 * (q3 - q1)102upper_bound = q3 + 1.5 * (q3 - q1)103104# Filter outliers105processed_data = processed_data[106(processed_data["Amount"] >= lower_bound)107& (processed_data["Amount"] <= upper_bound)108]109110return processed_data111112113# Load Data114115online_retail = fetch_ucirepo(id=352)116raw_data = online_retail.data.features117transformed_data = prepare_time_series_data(raw_data)118119120def prepare_data_for_modeling(121df: pd.DataFrame,122input_sequence_length: int = 6,123output_sequence_length: int = 6,124) -> Dict:125"""126Transform retail data into sequence-to-sequence format with separate127temporal and trend 


def prepare_data_for_modeling(
    df: pd.DataFrame,
    input_sequence_length: int = 6,
    output_sequence_length: int = 6,
) -> Dict:
    """
    Transform retail data into sequence-to-sequence format with separate
    temporal and trend components.
    """
    df = df.copy()

    # Daily aggregation
    daily_purchases = (
        df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")])
        .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"})
        .reset_index()
    )

    daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0)

    # Monthly resampling
    monthly_purchases = (
        daily_purchases.set_index("InvoiceDate")
        .groupby("CustomerID")
        .resample("M")
        .agg(
            {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"}
        )
        .reset_index()
    )

    # Add cyclical temporal features
    def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray:
        month = input_window["InvoiceDate"].dt.month
        month_sin = np.sin(2 * np.pi * month / 12)
        month_cos = np.cos(2 * np.pi * month / 12)
        is_quarter_start = (month % 3 == 1).astype(int)

        temporal_features = np.column_stack(
            [
                month,
                input_window["InvoiceDate"].dt.year,
                month_sin,
                month_cos,
                is_quarter_start,
            ]
        )
        return temporal_features

    # Prepare trend features with lagged values
    def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray:
        lagged_data = pd.DataFrame()
        for i in range(1, lag + 1):
            lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i)
            lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i)
            lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i)

        lagged_data = lagged_data.fillna(0)

        trend_features = np.column_stack(
            [
                input_window["Amount"].values,
                input_window["Quantity"].values,
                input_window["frequency"].values,
                lagged_data.values,
            ]
        )
        return trend_features

    sequence_containers = {
        "temporal_sequences": [],
        "trend_sequences": [],
        "static_features": [],
        "output_sequences": [],
    }

    # Process sequences for each customer
    for customer_id, customer_data in monthly_purchases.groupby("CustomerID"):
        customer_data = customer_data.sort_values("InvoiceDate")
        sequence_ranges = (
            len(customer_data) - input_sequence_length - output_sequence_length + 1
        )

        country = customer_data["Country"].iloc[0]

        for i in range(sequence_ranges):
            input_window = customer_data.iloc[i : i + input_sequence_length]
            output_window = customer_data.iloc[
                i
                + input_sequence_length : i
                + input_sequence_length
                + output_sequence_length
            ]

            if (
                len(input_window) == input_sequence_length
                and len(output_window) == output_sequence_length
            ):
                temporal_features = prepare_temporal_features(input_window)
                trend_features = prepare_trend_features(input_window)

                sequence_containers["temporal_sequences"].append(temporal_features)
                sequence_containers["trend_sequences"].append(trend_features)
                sequence_containers["static_features"].append(country)
                sequence_containers["output_sequences"].append(
                    output_window["Amount"].values
                )

    return {
        "temporal_sequences": (
            np.array(sequence_containers["temporal_sequences"], dtype=np.float32)
        ),
        "trend_sequences": (
            np.array(sequence_containers["trend_sequences"], dtype=np.float32)
        ),
        "static_features": np.array(sequence_containers["static_features"]),
        "output_sequences": (
            np.array(sequence_containers["output_sequences"], dtype=np.float32)
        ),
    }


# Transform the data into a dictionary of input and output sequences
output = prepare_data_for_modeling(
    df=transformed_data, input_sequence_length=6, output_sequence_length=6
)
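
"""
The dictionary returned above is what the rest of the example consumes. As a rough,
optional check: each temporal sequence should carry 5 features, each trend sequence
12 features (3 raw columns plus 3 lags of each), and each output sequence 6 monthly
amounts. The first dimension depends on how many customers have at least 12 months
of purchase history.
"""

# Optional sanity check: confirm the sequence arrays line up before scaling
for key, value in output.items():
    print(key, value.shape)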

"""
## Scaling and Splitting
"""


def robust_scale(data):
    """
    Min-max scaling, used here instead of standardization because the
    standard deviation of the data is high.
    """
    data = np.array(data)
    data_min = np.min(data)
    data_max = np.max(data)
    scaled = (data - data_min) / (data_max - data_min)
    return scaled


def create_temporal_splits_with_scaling(
    prepared_data: Dict[str, np.ndarray],
    test_ratio: float = 0.2,
    val_ratio: float = 0.2,
):
    total_sequences = len(prepared_data["trend_sequences"])

    # Calculate split points
    test_size = int(total_sequences * test_ratio)
    val_size = int(total_sequences * val_ratio)
    train_size = total_sequences - (test_size + val_size)

    # Scale trend sequences
    trend_shape = prepared_data["trend_sequences"].shape
    scaled_trends = np.zeros_like(prepared_data["trend_sequences"])

    # Scale each feature independently
    for i in range(trend_shape[-1]):
        scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i])

    # Scale output sequences
    scaled_outputs = robust_scale(prepared_data["output_sequences"])

    # Create splits
    train_data = {
        "trend_sequences": scaled_trends[:train_size],
        "temporal_sequences": prepared_data["temporal_sequences"][:train_size],
        "static_features": prepared_data["static_features"][:train_size],
        "output_sequences": scaled_outputs[:train_size],
    }

    val_data = {
        "trend_sequences": scaled_trends[train_size : train_size + val_size],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size : train_size + val_size
        ],
        "static_features": prepared_data["static_features"][
            train_size : train_size + val_size
        ],
        "output_sequences": scaled_outputs[train_size : train_size + val_size],
    }

    test_data = {
        "trend_sequences": scaled_trends[train_size + val_size :],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size + val_size :
        ],
        "static_features": prepared_data["static_features"][train_size + val_size :],
        "output_sequences": scaled_outputs[train_size + val_size :],
    }

    return train_data, val_data, test_data


# Usage
train_data, val_data, test_data = create_temporal_splits_with_scaling(output)
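
"""
An optional check on the split: with the default ratios, the three subsets should
roughly follow a 60/20/20 split, and because the trend and output sequences were
min-max scaled over the full dataset, their values should all fall within [0, 1].
"""

# Optional sanity check: split sizes and scaled output ranges
for name, split in [("train", train_data), ("val", val_data), ("test", test_data)]:
    outputs = split["output_sequences"]
    print(name, len(outputs), float(outputs.min()), float(outputs.max()))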

"""
## Evaluation
"""


def calculate_metrics(y_true, y_pred):
    """
    Calculates RMSE, MAE and R².
    """
    # Convert inputs to "float32"
    y_true = ops.cast(y_true, dtype="float32")
    y_pred = ops.cast(y_pred, dtype="float32")

    # RMSE
    rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))

    # R² (coefficient of determination)
    ss_res = np.sum(np.square(y_true - y_pred))
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
    r2 = 1 - (ss_res / ss_tot)

    return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2}


def plot_lorenz_analysis(y_true, y_pred):
    """
    Plots Lorenz curves to show the distribution of high- and low-value users.
    """
    # Convert to numpy arrays and flatten
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Sort values in descending order (for high-value users analysis)
    true_sorted = np.sort(-y_true)
    pred_sorted = np.sort(-y_pred)

    # Calculate cumulative sums
    true_cumsum = np.cumsum(true_sorted)
    pred_cumsum = np.cumsum(pred_sorted)

    # Normalize to percentages
    true_cumsum_pct = true_cumsum / true_cumsum[-1]
    pred_cumsum_pct = pred_cumsum / pred_cumsum[-1]

    # Generate percentiles for x-axis
    percentiles = np.linspace(0, 1, len(y_true))

    # Calculate Mutual Gini (area between the two curves)
    mutual_gini = np.abs(
        np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles)
    )

    # Create plot
    plt.figure(figsize=(10, 6))
    plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values")
    plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values")
    plt.xlabel("Cumulative % of Users (Descending Order)")
    plt.ylabel("Cumulative % of LTV")
    plt.title("Lorenz Curves: True vs Predicted Values")
    plt.legend()
    plt.grid(True)
    print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)")
    plt.show()

    return mutual_gini
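
"""
As a small illustration (not part of the modeling pipeline), running
`calculate_metrics` on a toy prediction shows the shape of the returned dictionary:
a perfect prediction would give RMSE = MAE = 0 and R² = 1, and errors push RMSE/MAE
up and R² down.
"""

# Optional: exercise the metric helper on toy data
toy_true = np.array([0.0, 0.5, 1.0], dtype=np.float32)
toy_pred = np.array([0.1, 0.5, 0.9], dtype=np.float32)
print(calculate_metrics(toy_true, toy_pred))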

"""
## Hybrid Transformer / LSTM model architecture

The hybrid nature of this model is particularly significant because it combines the
LSTM's ability to handle sequential data with the Transformer's attention mechanisms
for capturing global patterns across countries and seasonality.
"""


def build_hybrid_model(
    input_sequence_length: int,
    output_sequence_length: int,
    num_countries: int,
    d_model: int = 8,
    num_heads: int = 4,
):
    keras.utils.set_random_seed(seed=42)

    # Inputs
    temporal_inputs = layers.Input(
        shape=(input_sequence_length, 5), name="temporal_inputs"
    )
    trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs")
    country_inputs = layers.Input(
        shape=(num_countries,), dtype="int32", name="country_inputs"
    )

    # Process country features
    country_embedding = layers.Embedding(
        input_dim=num_countries,
        output_dim=d_model,
        mask_zero=False,
        name="country_embedding",
    )(
        country_inputs
    )  # Output shape: (batch_size, num_countries, d_model)

    # Flatten the embedding output
    country_embedding = layers.Flatten(name="flatten_country_embedding")(
        country_embedding
    )

    # Repeat the country embedding across timesteps
    country_embedding_repeated = layers.RepeatVector(
        input_sequence_length, name="repeat_country_embedding"
    )(country_embedding)

    # Projection of temporal inputs to match Transformer dimensions
    temporal_projection = layers.Dense(
        d_model, activation="tanh", name="temporal_projection"
    )(temporal_inputs)

    # Combine all features
    combined_features = layers.Concatenate()(
        [temporal_projection, country_embedding_repeated]
    )

    transformer_output = combined_features
    for _ in range(3):
        transformer_output = TransformerEncoder(
            intermediate_dim=16, num_heads=num_heads
        )(transformer_output)

    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)

    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
        transformer_output
    )
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)

    # Concatenate flattened Transformer output with LSTM output
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
        [transformer_flattened, lstm_output]
    )

    # Repeat the merged features to match the output sequence length
    decoder_initial = layers.RepeatVector(
        output_sequence_length, name="repeat_merged_features"
    )(merged_features)

    decoder_lstm = layers.LSTM(
        units=64,
        return_sequences=True,
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    )(decoder_initial)

    # Output Dense layer
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
        decoder_lstm
    )

    model = Model(
        inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="mse",
        metrics=["mse"],
    )

    return model


# Create the hybrid model
model = build_hybrid_model(
    input_sequence_length=6,
    output_sequence_length=6,
    num_countries=len(np.unique(train_data["static_features"])) + 1,
    d_model=8,
    num_heads=4,
)

# Configure StringLookup
label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1)

# Adapt and encode
label_encoder.adapt(train_data["static_features"])

train_static_encoded = label_encoder(train_data["static_features"])
val_static_encoded = label_encoder(val_data["static_features"])
test_static_encoded = label_encoder(test_data["static_features"])

# Convert sequences with proper type casting
x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32)
x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32)
x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32)
x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32)
train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32)
val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32)
test_output = np.asarray(test_data["output_sequences"]).astype(np.float32)

# Training setup
keras.utils.set_random_seed(seed=42)

history = model.fit(
    [x_train_temporal, x_train_seq, train_static_encoded],
    train_outputs,
    validation_data=(
        [x_val_temporal, x_val_seq, val_static_encoded],
        val_outputs,
    ),
    epochs=20,
    batch_size=30,
)

# Make predictions
predictions = model.predict(
    [
        test_data["temporal_sequences"].astype(np.float32),
        test_data["trend_sequences"].astype(np.float32),
        test_static_encoded,
    ]
)

# Squeeze the singleton feature dimension from the predictions
predictions = np.squeeze(predictions)

# Calculate basic metrics
hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions)

# Plot Lorenz curves and get the Mutual Gini
hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions)

"""
## Conclusion

LSTMs excel at sequence-to-sequence learning, as demonstrated by Sutskever, Vinyals,
and Le (2014), Sequence to Sequence Learning with Neural Networks, and the hybrid
approach here builds on that foundation. The attention mechanism lets the model
adaptively focus on relevant temporal and geographical patterns while preserving the
LSTM's inherent strengths in sequence learning. This combination has proven especially
effective for handling both periodic patterns and special events in time series
forecasting, as shown by Zhou et al. (2021), Informer: Beyond Efficient Transformer
for Long Sequence Time-Series Forecasting.
"""