CoCalc -- deep_neural_decision

GitHub Repository: keras-team/keras-io
Path: blob/master/examples/structured_data/deep_neural_decision_forests.py
⁸⁰⁷¹ views
1
"""
2
Title: Classification with Neural Decision Forests
3
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
4
Date created: 2021/01/15
5
Last modified: 2021/01/15
6
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
7
Accelerator: GPU
8
"""
9

10
"""
11
## Introduction
12

13
This example provides an implementation of the
14
[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529)
15
model introduced by P. Kontschieder et al. for structured data classification.
16
It demonstrates how to build a stochastic and differentiable decision tree model,
17
train it end-to-end, and unify decision trees with deep representation learning.
18

19
## The dataset
20

21
This example uses the
22
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
23
provided by the
24
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
25
The task is binary classification
26
to predict whether a person is likely to be making over USD 50,000 a year.
27

28
The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features
29
and 9 categorical features.
30
"""
31

32
"""
33
## Setup
34
"""
35

36
import keras
37
from keras import layers
38
from keras.layers import StringLookup
39
from keras import ops
40

41

42
from tensorflow import data as tf_data
43
import numpy as np
44
import pandas as pd
45

46
import math
47

48
"""
49
## Prepare the data
50
"""
51

52
CSV_HEADER = [
53
    "age",
54
    "workclass",
55
    "fnlwgt",
56
    "education",
57
    "education_num",
58
    "marital_status",
59
    "occupation",
60
    "relationship",
61
    "race",
62
    "gender",
63
    "capital_gain",
64
    "capital_loss",
65
    "hours_per_week",
66
    "native_country",
67
    "income_bracket",
68
]
69

70
train_data_url = (
71
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
72
)
73
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)
74

75
test_data_url = (
76
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
77
)
78
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
79

80
print(f"Train dataset shape: {train_data.shape}")
81
print(f"Test dataset shape: {test_data.shape}")
82

83
"""
84
Remove the first record (because it is not a valid data example) and a trailing
85
'dot' in the class labels.
86
"""
87

88
test_data = test_data[1:]
89
test_data.income_bracket = test_data.income_bracket.apply(
90
    lambda value: value.replace(".", "")
91
)
92

93
"""
94
We store the training and test data splits locally as CSV files.
95
"""
96

97
train_data_file = "train_data.csv"
98
test_data_file = "test_data.csv"
99

100
train_data.to_csv(train_data_file, index=False, header=False)
101
test_data.to_csv(test_data_file, index=False, header=False)
102

103
"""
104
## Define dataset metadata
105

106
Here, we define the metadata of the dataset that will be useful for reading and parsing
107
and encoding input features.
108
"""
109

110
# A list of the numerical feature names.
111
NUMERIC_FEATURE_NAMES = [
112
    "age",
113
    "education_num",
114
    "capital_gain",
115
    "capital_loss",
116
    "hours_per_week",
117
]
118
# A dictionary of the categorical features and their vocabulary.
119
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
120
    "workclass": sorted(list(train_data["workclass"].unique())),
121
    "education": sorted(list(train_data["education"].unique())),
122
    "marital_status": sorted(list(train_data["marital_status"].unique())),
123
    "occupation": sorted(list(train_data["occupation"].unique())),
124
    "relationship": sorted(list(train_data["relationship"].unique())),
125
    "race": sorted(list(train_data["race"].unique())),
126
    "gender": sorted(list(train_data["gender"].unique())),
127
    "native_country": sorted(list(train_data["native_country"].unique())),
128
}
129
# A list of the columns to ignore from the dataset.
130
IGNORE_COLUMN_NAMES = ["fnlwgt"]
131
# A list of the categorical feature names.
132
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
133
# A list of all the input features.
134
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
135
# A list of column default values for each feature.
136
COLUMN_DEFAULTS = [
137
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
138
    for feature_name in CSV_HEADER
139
]
140
# The name of the target feature.
141
TARGET_FEATURE_NAME = "income_bracket"
142
# A list of the labels of the target features.
143
TARGET_LABELS = [" <=50K", " >50K"]
144

145
"""
146
## Create `tf_data.Dataset` objects for training and validation
147

148
We create an input function to read and parse the file, and convert features and labels
149
into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets)
150
for training and validation. We also preprocess the input by mapping the target label
151
to an index.
152
"""
153

154

155
target_label_lookup = StringLookup(
156
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
157
)
158

159

160
lookup_dict = {}
161
for feature_name in CATEGORICAL_FEATURE_NAMES:
162
    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
163
    # Create a lookup to convert a string values to an integer indices.
164
    # Since we are not using a mask token, nor expecting any out of vocabulary
165
    # (oov) token, we set mask_token to None and num_oov_indices to 0.
166
    lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
167
    lookup_dict[feature_name] = lookup
168

169

170
def encode_categorical(batch_x, batch_y):
171
    for feature_name in CATEGORICAL_FEATURE_NAMES:
172
        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])
173

174
    return batch_x, batch_y
175

176

177
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
178
    dataset = (
179
        tf_data.experimental.make_csv_dataset(
180
            csv_file_path,
181
            batch_size=batch_size,
182
            column_names=CSV_HEADER,
183
            column_defaults=COLUMN_DEFAULTS,
184
            label_name=TARGET_FEATURE_NAME,
185
            num_epochs=1,
186
            header=False,
187
            na_value="?",
188
            shuffle=shuffle,
189
        )
190
        .map(lambda features, target: (features, target_label_lookup(target)))
191
        .map(encode_categorical)
192
    )
193

194
    return dataset.cache()
195

196

197
"""
198
## Create model inputs
199
"""
200

201

202
def create_model_inputs():
203
    inputs = {}
204
    for feature_name in FEATURE_NAMES:
205
        if feature_name in NUMERIC_FEATURE_NAMES:
206
            inputs[feature_name] = layers.Input(
207
                name=feature_name, shape=(), dtype="float32"
208
            )
209
        else:
210
            inputs[feature_name] = layers.Input(
211
                name=feature_name, shape=(), dtype="int32"
212
            )
213
    return inputs
214

215

216
"""
217
## Encode input features
218
"""
219

220

221
def encode_inputs(inputs):
222
    encoded_features = []
223
    for feature_name in inputs:
224
        if feature_name in CATEGORICAL_FEATURE_NAMES:
225
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
226
            # Create a lookup to convert a string values to an integer indices.
227
            # Since we are not using a mask token, nor expecting any out of vocabulary
228
            # (oov) token, we set mask_token to None and num_oov_indices to 0.
229
            value_index = inputs[feature_name]
230
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
231
            # Create an embedding layer with the specified dimensions.
232
            embedding = layers.Embedding(
233
                input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
234
            )
235
            # Convert the index values to embedding representations.
236
            encoded_feature = embedding(value_index)
237
        else:
238
            # Use the numerical features as-is.
239
            encoded_feature = inputs[feature_name]
240
            if inputs[feature_name].shape[-1] is None:
241
                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
242

243
        encoded_features.append(encoded_feature)
244

245
    encoded_features = layers.concatenate(encoded_features)
246
    return encoded_features
247

248

249
"""
250
## Deep Neural Decision Tree
251

252
A neural decision tree model has two sets of weights to learn. The first set is `pi`,
253
which represents the probability distribution of the classes in the tree leaves.
254
The second set is the weights of the routing layer `decision_fn`, which represents the probability
255
of going to each leave. The forward pass of the model works as follows:
256

257
1. The model expects input `features` as a single vector encoding all the features of an instance
258
in the batch. This vector can be generated from a Convolution Neural Network (CNN) applied to images
259
or dense transformations applied to structured data features.
260
2. The model first applies a `used_features_mask` to randomly select a subset of input features to use.
261
3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves
262
by iteratively performing a *stochastic* routing throughout the tree levels.
263
4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the
264
leaves to produce the final `outputs`.
265
"""
266

267

268
class NeuralDecisionTree(keras.Model):
269
    def __init__(self, depth, num_features, used_features_rate, num_classes):
270
        super().__init__()
271
        self.depth = depth
272
        self.num_leaves = 2**depth
273
        self.num_classes = num_classes
274

275
        # Create a mask for the randomly selected features.
276
        num_used_features = int(num_features * used_features_rate)
277
        one_hot = np.eye(num_features)
278
        sampled_feature_indices = np.random.choice(
279
            np.arange(num_features), num_used_features, replace=False
280
        )
281
        self.used_features_mask = ops.convert_to_tensor(
282
            one_hot[sampled_feature_indices], dtype="float32"
283
        )
284

285
        # Initialize the weights of the classes in leaves.
286
        self.pi = self.add_weight(
287
            initializer="random_normal",
288
            shape=[self.num_leaves, self.num_classes],
289
            dtype="float32",
290
            trainable=True,
291
        )
292

293
        # Initialize the stochastic routing layer.
294
        self.decision_fn = layers.Dense(
295
            units=self.num_leaves, activation="sigmoid", name="decision"
296
        )
297

298
    def call(self, features):
299
        batch_size = ops.shape(features)[0]
300

301
        # Apply the feature mask to the input features.
302
        features = ops.matmul(
303
            features, ops.transpose(self.used_features_mask)
304
        )  # [batch_size, num_used_features]
305
        # Compute the routing probabilities.
306
        decisions = ops.expand_dims(
307
            self.decision_fn(features), axis=2
308
        )  # [batch_size, num_leaves, 1]
309
        # Concatenate the routing probabilities with their complements.
310
        decisions = layers.concatenate(
311
            [decisions, 1 - decisions], axis=2
312
        )  # [batch_size, num_leaves, 2]
313

314
        mu = ops.ones([batch_size, 1, 1])
315

316
        begin_idx = 1
317
        end_idx = 2
318
        # Traverse the tree in breadth-first order.
319
        for level in range(self.depth):
320
            mu = ops.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
321
            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
322
            level_decisions = decisions[
323
                :, begin_idx:end_idx, :
324
            ]  # [batch_size, 2 ** level, 2]
325
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
326
            begin_idx = end_idx
327
            end_idx = begin_idx + 2 ** (level + 1)
328

329
        mu = ops.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
330
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
331
        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
332
        return outputs
333

334

335
"""
336
## Deep Neural Decision Forest
337

338
The neural decision forest model consists of a set of neural decision trees that are
339
trained simultaneously. The output of the forest model is the average outputs of its trees.
340
"""
341

342

343
class NeuralDecisionForest(keras.Model):
344
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
345
        super().__init__()
346
        self.ensemble = []
347
        # Initialize the ensemble by adding NeuralDecisionTree instances.
348
        # Each tree will have its own randomly selected input features to use.
349
        for _ in range(num_trees):
350
            self.ensemble.append(
351
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
352
            )
353

354
    def call(self, inputs):
355
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
356
        batch_size = ops.shape(inputs)[0]
357
        outputs = ops.zeros([batch_size, num_classes])
358

359
        # Aggregate the outputs of trees in the ensemble.
360
        for tree in self.ensemble:
361
            outputs += tree(inputs)
362
        # Divide the outputs by the ensemble size to get the average.
363
        outputs /= len(self.ensemble)
364
        return outputs
365

366

367
"""
368
Finally, let's set up the code that will train and evaluate the model.
369
"""
370

371
learning_rate = 0.01
372
batch_size = 265
373
num_epochs = 10
374

375

376
def run_experiment(model):
377
    model.compile(
378
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
379
        loss=keras.losses.SparseCategoricalCrossentropy(),
380
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
381
    )
382

383
    print("Start training the model...")
384
    train_dataset = get_dataset_from_csv(
385
        train_data_file, shuffle=True, batch_size=batch_size
386
    )
387

388
    model.fit(train_dataset, epochs=num_epochs)
389
    print("Model training finished")
390

391
    print("Evaluating the model on the test data...")
392
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
393

394
    _, accuracy = model.evaluate(test_dataset)
395
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
396

397

398
"""
399
## Experiment 1: train a decision tree model
400

401
In this experiment, we train a single neural decision tree model
402
where we use all input features.
403
"""
404

405
num_trees = 10
406
depth = 10
407
used_features_rate = 1.0
408
num_classes = len(TARGET_LABELS)
409

410

411
def create_tree_model():
412
    inputs = create_model_inputs()
413
    features = encode_inputs(inputs)
414
    features = layers.BatchNormalization()(features)
415
    num_features = features.shape[1]
416

417
    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
418

419
    outputs = tree(features)
420
    model = keras.Model(inputs=inputs, outputs=outputs)
421
    return model
422

423

424
tree_model = create_tree_model()
425
run_experiment(tree_model)
426

427

428
"""
429
## Experiment 2: train a forest model
430

431
In this experiment, we train a neural decision forest with `num_trees` trees
432
where each tree uses randomly selected 50% of the input features. You can control the number
433
of features to be used in each tree by setting the `used_features_rate` variable.
434
In addition, we set the depth to 5 instead of 10 compared to the previous experiment.
435
"""
436

437
num_trees = 25
438
depth = 5
439
used_features_rate = 0.5
440

441

442
def create_forest_model():
443
    inputs = create_model_inputs()
444
    features = encode_inputs(inputs)
445
    features = layers.BatchNormalization()(features)
446
    num_features = features.shape[1]
447

448
    forest_model = NeuralDecisionForest(
449
        num_trees, depth, num_features, used_features_rate, num_classes
450
    )
451

452
    outputs = forest_model(features)
453
    model = keras.Model(inputs=inputs, outputs=outputs)
454
    return model
455

456

457
forest_model = create_forest_model()
458

459
run_experiment(forest_model)
460

461
Product

Resources

Company