"""
2
Title: Classification with Neural Decision Forests
3
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
4
Date created: 2021/01/15
5
Last modified: 2021/01/15
6
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
7
Accelerator: GPU
8
"""
9
10
"""
11
## Introduction
12
13
This example provides an implementation of the
14
[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529)
15
model introduced by P. Kontschieder et al. for structured data classification.
16
It demonstrates how to build a stochastic and differentiable decision tree model,
17
train it end-to-end, and unify decision trees with deep representation learning.
18
19
## The dataset
20
21
This example uses the
22
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
23
provided by the
24
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
25
The task is binary classification
26
to predict whether a person is likely to be making over USD 50,000 a year.
27
28
The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features
29
and 9 categorical features.
30
"""
31
32
"""
33
## Setup
34
"""
35
36
import keras
37
from keras import layers
38
from keras.layers import StringLookup
39
from keras import ops
40
41
42
from tensorflow import data as tf_data
43
import numpy as np
44
import pandas as pd
45
46
import math
47
48
"""
49
## Prepare the data
50
"""
51
52
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

"""
84
Remove the first record (because it is not a valid data example) and a trailing
85
'dot' in the class labels.
86
"""
87
88
test_data = test_data[1:]
89
test_data.income_bracket = test_data.income_bracket.apply(
90
lambda value: value.replace(".", "")
91
)
92
93
"""
94
We store the training and test data splits locally as CSV files.
95
"""
96
97
train_data_file = "train_data.csv"
98
test_data_file = "test_data.csv"
99
100
train_data.to_csv(train_data_file, index=False, header=False)
101
test_data.to_csv(test_data_file, index=False, header=False)
102
103
"""
104
## Define dataset metadata
105
106
Here, we define the metadata of the dataset that will be useful for reading and parsing
107
and encoding input features.
108
"""
109
110
# A list of the numerical feature names.
111
NUMERIC_FEATURE_NAMES = [
112
"age",
113
"education_num",
114
"capital_gain",
115
"capital_loss",
116
"hours_per_week",
117
]
118
# A dictionary of the categorical features and their vocabulary.
119
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
120
"workclass": sorted(list(train_data["workclass"].unique())),
121
"education": sorted(list(train_data["education"].unique())),
122
"marital_status": sorted(list(train_data["marital_status"].unique())),
123
"occupation": sorted(list(train_data["occupation"].unique())),
124
"relationship": sorted(list(train_data["relationship"].unique())),
125
"race": sorted(list(train_data["race"].unique())),
126
"gender": sorted(list(train_data["gender"].unique())),
127
"native_country": sorted(list(train_data["native_country"].unique())),
128
}
129
# A list of the columns to ignore from the dataset.
130
IGNORE_COLUMN_NAMES = ["fnlwgt"]
131
# A list of the categorical feature names.
132
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
133
# A list of all the input features.
134
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
135
# A list of column default values for each feature.
136
COLUMN_DEFAULTS = [
137
[0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
138
for feature_name in CSV_HEADER
139
]
140
# The name of the target feature.
141
TARGET_FEATURE_NAME = "income_bracket"
142
# A list of the labels of the target features.
143
TARGET_LABELS = [" <=50K", " >50K"]
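
"""
Note that `COLUMN_DEFAULTS` lines up one-to-one with `CSV_HEADER`: numeric columns
(including the ignored numeric column `fnlwgt`) default to `0.0`, and every other
column defaults to the string `"NA"`. As a quick, optional sanity check of this
alignment:

```python
assert len(COLUMN_DEFAULTS) == len(CSV_HEADER)
print(dict(zip(CSV_HEADER, COLUMN_DEFAULTS)))  # e.g. "age" -> [0.0], "workclass" -> ["NA"]
```
"""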

"""
## Create `tf_data.Dataset` objects for training and validation

We create an input function to read and parse the file, and to convert features and labels
into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training and validation. We also preprocess the input by mapping the target label
to an index.
"""


target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


lookup_dict = {}
for feature_name in CATEGORICAL_FEATURE_NAMES:
    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
    # Create a lookup to convert string values to integer indices.
    # Since we are not using a mask token, nor expecting any out-of-vocabulary
    # (OOV) token, we set mask_token to None and num_oov_indices to 0.
    lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
    lookup_dict[feature_name] = lookup


def encode_categorical(batch_x, batch_y):
    for feature_name in CATEGORICAL_FEATURE_NAMES:
        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])

    return batch_x, batch_y

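"""
To make the lookup behavior concrete, here is a minimal sketch of how a `StringLookup`
layer configured this way maps strings to indices (the three-color vocabulary is
hypothetical, used only for illustration):

```python
demo_lookup = StringLookup(vocabulary=["blue", "green", "red"], mask_token=None, num_oov_indices=0)
print(demo_lookup(["red", "blue"]))  # tensor of indices: [2, 0]
```

With no mask token and no OOV slots, vocabulary entries map directly to the
indices 0, 1, 2, ... in order.
"""
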

def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = (
        tf_data.experimental.make_csv_dataset(
            csv_file_path,
            batch_size=batch_size,
            column_names=CSV_HEADER,
            column_defaults=COLUMN_DEFAULTS,
            label_name=TARGET_FEATURE_NAME,
            num_epochs=1,
            header=False,
            na_value="?",
            shuffle=shuffle,
        )
        .map(lambda features, target: (features, target_label_lookup(target)))
        .map(encode_categorical)
    )

    return dataset.cache()

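"""
If you want to inspect the pipeline output while debugging, a hedged sketch (not
required for training) is to pull a single parsed batch:

```python
for features, labels in get_dataset_from_csv(train_data_file, batch_size=4).take(1):
    print(sorted(features.keys()))
    print(labels)  # integer class indices produced by target_label_lookup
```
"""
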

"""
## Create model inputs
"""


def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype="float32"
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype="int32"
            )
    return inputs


"""
217
## Encode input features
218
"""
219
220
221
def encode_inputs(inputs):
222
encoded_features = []
223
for feature_name in inputs:
224
if feature_name in CATEGORICAL_FEATURE_NAMES:
225
vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
226
# Create a lookup to convert a string values to an integer indices.
227
# Since we are not using a mask token, nor expecting any out of vocabulary
228
# (oov) token, we set mask_token to None and num_oov_indices to 0.
229
value_index = inputs[feature_name]
230
embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
231
# Create an embedding layer with the specified dimensions.
232
embedding = layers.Embedding(
233
input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
234
)
235
# Convert the index values to embedding representations.
236
encoded_feature = embedding(value_index)
237
else:
238
# Use the numerical features as-is.
239
encoded_feature = inputs[feature_name]
240
if inputs[feature_name].shape[-1] is None:
241
encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
242
243
encoded_features.append(encoded_feature)
244
245
encoded_features = layers.concatenate(encoded_features)
246
return encoded_features
247
248
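"""
The embedding width above follows a common square-root heuristic (a rule of thumb,
not something mandated by the paper). A quick sketch of what it yields:

```python
for size in [2, 16, 42]:  # hypothetical vocabulary sizes
    print(size, "->", int(math.sqrt(size)))  # 2 -> 1, 16 -> 4, 42 -> 6
```
"""
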

"""
## Deep Neural Decision Tree

A neural decision tree model has two sets of weights to learn. The first set is `pi`,
which represents the probability distribution of the classes in the tree leaves.
The second set is the weights of the routing layer `decision_fn`, which represents the
probability of going to each leaf. The forward pass of the model works as follows:

1. The model expects input `features` as a single vector encoding all the features of an
instance in the batch. This vector can be generated from a Convolutional Neural Network (CNN)
applied to images, or from dense transformations applied to structured data features.
2. The model first applies a `used_features_mask` to randomly select a subset of input
features to use.
3. Then, the model computes the probabilities (`mu`) for the input instances to reach the
tree leaves by iteratively performing a *stochastic* routing throughout the tree levels.
4. Finally, the probabilities of reaching the leaves are weighted by the class probabilities
at the leaves to produce the final `outputs`. A numeric sketch of this routing follows.
"""


class NeuralDecisionTree(keras.Model):
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.depth = depth
        self.num_leaves = 2**depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indices = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = ops.convert_to_tensor(
            one_hot[sampled_feature_indices], dtype="float32"
        )

        # Initialize the weights of the classes in leaves.
        self.pi = self.add_weight(
            initializer="random_normal",
            shape=[self.num_leaves, self.num_classes],
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = ops.shape(features)[0]

        # Apply the feature mask to the input features.
        features = ops.matmul(
            features, ops.transpose(self.used_features_mask)
        )  # [batch_size, num_used_features]
        # Compute the routing probabilities.
        decisions = ops.expand_dims(
            self.decision_fn(features), axis=2
        )  # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )  # [batch_size, num_leaves, 2]

        mu = ops.ones([batch_size, 1, 1])

        begin_idx = 1
        end_idx = 2
        # Traverse the tree in breadth-first order.
        for level in range(self.depth):
            mu = ops.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2 ** level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = ops.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
        return outputs


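"""
A standalone smoke test of the tree (a hedged sketch; the shapes are what matter here,
since the weights are still random at this point):

```python
demo_tree = NeuralDecisionTree(depth=2, num_features=8, used_features_rate=1.0, num_classes=2)
print(demo_tree(ops.ones([4, 8])).shape)  # (4, 2): one class distribution per instance
```
"""
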
"""
336
## Deep Neural Decision Forest
337
338
The neural decision forest model consists of a set of neural decision trees that are
339
trained simultaneously. The output of the forest model is the average outputs of its trees.
340
"""
341
342
343
class NeuralDecisionForest(keras.Model):
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.ensemble = []
        self.num_classes = num_classes
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = ops.shape(inputs)[0]
        outputs = ops.zeros([batch_size, self.num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs


"""
368
Finally, let's set up the code that will train and evaluate the model.
369
"""
370
371
learning_rate = 0.01
372
batch_size = 265
373
num_epochs = 10
374
375
376
def run_experiment(model):
377
model.compile(
378
optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
379
loss=keras.losses.SparseCategoricalCrossentropy(),
380
metrics=[keras.metrics.SparseCategoricalAccuracy()],
381
)
382
383
print("Start training the model...")
384
train_dataset = get_dataset_from_csv(
385
train_data_file, shuffle=True, batch_size=batch_size
386
)
387
388
model.fit(train_dataset, epochs=num_epochs)
389
print("Model training finished")
390
391
print("Evaluating the model on the test data...")
392
test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
393
394
_, accuracy = model.evaluate(test_dataset)
395
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
396
397
398
"""
399
## Experiment 1: train a decision tree model
400
401
In this experiment, we train a single neural decision tree model
402
where we use all input features.
403
"""
404
405
num_trees = 10
406
depth = 10
407
used_features_rate = 1.0
408
num_classes = len(TARGET_LABELS)
409
410
411
def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tree_model = create_tree_model()
run_experiment(tree_model)


"""
429
## Experiment 2: train a forest model
430
431
In this experiment, we train a neural decision forest with `num_trees` trees
432
where each tree uses randomly selected 50% of the input features. You can control the number
433
of features to be used in each tree by setting the `used_features_rate` variable.
434
In addition, we set the depth to 5 instead of 10 compared to the previous experiment.
435
"""
436
437
num_trees = 25
438
depth = 5
439
used_features_rate = 0.5
440
441
442
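"""
To see what this rate means in practice, here is a sketch of the masking arithmetic
(the feature count of 64 is hypothetical; the real value depends on the embedding sizes):

```python
num_features = 64  # hypothetical width of the encoded feature vector
num_used_features = int(num_features * used_features_rate)  # 32 features per tree
```

Each tree draws its own random subset, so the trees decorrelate in the same spirit
as a classical random forest.
"""
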
def create_forest_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    forest_model = NeuralDecisionForest(
        num_trees, depth, num_features, used_features_rate, num_classes
    )

    outputs = forest_model(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


forest_model = create_forest_model()

run_experiment(forest_model)