# aimacode/aima-python: learning.py
"""Learning from examples (Chapter 18)"""

import copy
import heapq
import random
from collections import defaultdict
from statistics import stdev

import numpy as np
from qpsolvers import solve_qp

from probabilistic_learning import NaiveBayesLearner
from utils import *


class DataSet:
    """
    A data set for a machine learning problem. It has the following fields:

    d.examples   A list of examples. Each one is a list of attribute values.
    d.attrs      A list of integers to index into an example, so example[attr]
                 gives a value. Normally the same as range(len(d.examples[0])).
    d.attr_names Optional list of mnemonic names for corresponding attrs.
    d.target     The attribute that a learning algorithm will try to predict.
                 By default the final attribute.
    d.inputs     The list of attrs without the target.
    d.values     A list of lists: each sublist is the set of possible
                 values for the corresponding attribute. If initially None,
                 it is computed from the known examples by self.set_problem.
                 If not None, an erroneous value raises ValueError.
    d.distance   A function from a pair of examples to a non-negative number.
                 Should be symmetric, etc. Defaults to mean_boolean_error
                 since that can handle any field types.
    d.name       Name of the data set (for output display only).
    d.source     URL or other source where the data came from.
    d.exclude    A list of attribute indexes to exclude from d.inputs. Elements
                 of this list can either be integers (attrs) or attr_names.

    Normally, you call the constructor and you're done; then you just
    access fields like d.examples and d.target and d.inputs.
    """

    def __init__(self, examples=None, attrs=None, attr_names=None, target=-1, inputs=None,
                 values=None, distance=mean_boolean_error, name='', source='', exclude=()):
        """
        Accepts any of DataSet's fields. Examples can also be a
        string or file from which to parse examples using parse_csv.
        Optional parameter: exclude, as documented in .set_problem().
        >>> DataSet(examples='1, 2, 3')
        <DataSet(): 1 examples, 3 attributes>
        """
        self.name = name
        self.source = source
        self.values = values
        self.distance = distance
        self.got_values_flag = bool(values)

        # initialize .examples from string or list or data directory
        if isinstance(examples, str):
            self.examples = parse_csv(examples)
        elif examples is None:
            self.examples = parse_csv(open_data(name + '.csv').read())
        else:
            self.examples = examples

        # attrs are the indices of examples, unless otherwise stated.
        if self.examples is not None and attrs is None:
            attrs = list(range(len(self.examples[0])))

        self.attrs = attrs

        # initialize .attr_names from string, list, or by default
        if isinstance(attr_names, str):
            self.attr_names = attr_names.split()
        else:
            self.attr_names = attr_names or attrs
        self.set_problem(target, inputs=inputs, exclude=exclude)

    def set_problem(self, target, inputs=None, exclude=()):
        """
        Set (or change) the target and/or inputs.
        This way, one DataSet can be used multiple ways. inputs, if specified,
        is a list of attributes, or specify exclude as a list of attributes
        to not use in inputs. Attributes can be -n .. n, or an attr_name.
        Also computes the list of possible values, if that wasn't done yet.
        """
        self.target = self.attr_num(target)
        exclude = list(map(self.attr_num, exclude))
        if inputs:
            self.inputs = remove_all(self.target, inputs)
        else:
            self.inputs = [a for a in self.attrs if a != self.target and a not in exclude]
        if not self.values:
            self.update_values()
        self.check_me()

    def check_me(self):
        """Check that my fields make sense."""
        assert len(self.attr_names) == len(self.attrs)
        assert self.target in self.attrs
        assert self.target not in self.inputs
        assert set(self.inputs).issubset(set(self.attrs))
        if self.got_values_flag:
            # only check if values are provided while initializing DataSet
            list(map(self.check_example, self.examples))

    def add_example(self, example):
        """Add an example to the list of examples, checking it first."""
        self.check_example(example)
        self.examples.append(example)

    def check_example(self, example):
        """Raise ValueError if example has any invalid values."""
        if self.values:
            for a in self.attrs:
                if example[a] not in self.values[a]:
                    raise ValueError('Bad value {} for attribute {} in {}'
                                     .format(example[a], self.attr_names[a], example))

    def attr_num(self, attr):
        """Returns the number used for attr, which can be a name, or -n .. n-1."""
        if isinstance(attr, str):
            return self.attr_names.index(attr)
        elif attr < 0:
            return len(self.attrs) + attr
        else:
            return attr

    def update_values(self):
        self.values = list(map(unique, zip(*self.examples)))

    def sanitize(self, example):
        """Return a copy of example, with non-input attributes replaced by None."""
        return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)]

    def classes_to_numbers(self, classes=None):
        """Converts class names to numbers."""
        if not classes:
            # if classes were not given, extract them from values
            classes = sorted(self.values[self.target])
        for item in self.examples:
            item[self.target] = classes.index(item[self.target])

    def remove_examples(self, value=''):
        """Remove examples that contain given value."""
        self.examples = [x for x in self.examples if value not in x]
        self.update_values()

    def split_values_by_classes(self):
        """Split values into buckets according to their class."""
        buckets = defaultdict(lambda: [])
        target_names = self.values[self.target]

        for v in self.examples:
            item = [a for a in v if a not in target_names]  # remove target from item
            buckets[v[self.target]].append(item)  # add item to bucket of its class

        return buckets

    def find_means_and_deviations(self):
        """
        Finds the per-class means and standard deviations of this dataset's features.
        means: a dictionary keyed by class/target value, holding a list of the
               means of the features for that class.
        deviations: a dictionary keyed by class/target value, holding a list of
                    the sample standard deviations of the features for that class.
        """
        target_names = self.values[self.target]
        feature_numbers = len(self.inputs)

        item_buckets = self.split_values_by_classes()

        means = defaultdict(lambda: [0] * feature_numbers)
        deviations = defaultdict(lambda: [0] * feature_numbers)

        for t in target_names:
            # find all the item feature values for item in class t
            features = [[] for _ in range(feature_numbers)]
            for item in item_buckets[t]:
                for i in range(feature_numbers):
                    features[i].append(item[i])

            # calculate means and deviations of the class
            for i in range(feature_numbers):
                means[t][i] = mean(features[i])
                deviations[t][i] = stdev(features[i])

        return means, deviations

    def __repr__(self):
        return '<DataSet({}): {:d} examples, {:d} attributes>'.format(self.name, len(self.examples), len(self.attrs))


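# A hedged usage sketch for DataSet. The zoo/iris/restaurant datasets built near
# the end of this module are constructed exactly this way; the inline-CSV variant
# below avoids needing a data file:
#
#     d = DataSet(examples='1, 2, 0\n2, 1, 1', attr_names='A B Class', target='Class')
#     d.examples            # -> [[1, 2, 0], [2, 1, 1]]
#     d.inputs              # -> [0, 1]
#     d.values[d.target]    # -> the observed class values, e.g. [0, 1]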
def parse_csv(input, delim=','):
    r"""
    Input is a string consisting of lines, each line has comma-delimited
    fields. Convert this into a list of lists. Blank lines are skipped.
    Fields that look like numbers are converted to numbers.
    The delim defaults to ',' but '\t' and None are also reasonable values.
    >>> parse_csv('1, 2, 3 \n 0, 2, na')
    [[1, 2, 3], [0, 2, 'na']]
    """
    lines = [line for line in input.splitlines() if line.strip()]
    return [list(map(num_or_str, line.split(delim))) for line in lines]


def err_ratio(predict, dataset, examples=None):
    """Return the proportion of the examples that are NOT correctly predicted."""
    examples = examples or dataset.examples
    if len(examples) == 0:
        return 0.0
    right = 0
    for example in examples:
        desired = example[dataset.target]
        output = predict(dataset.sanitize(example))
        if output == desired:
            right += 1
    return 1 - (right / len(examples))


def grade_learner(predict, tests):
    """
    Grades the given learner based on how many tests it passes.
    tests is a list with each element in the form: (values, output).
    """
    return mean(int(predict(X) == y) for X, y in tests)


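# Hedged usage sketch: err_ratio scores a predictor on a DataSet, while
# grade_learner scores it on hand-written (input, output) test pairs. The
# predictor and tests below are made up for illustration:
#
#     always_none = lambda example: None
#     grade_learner(always_none, [([1, 2], None), ([3, 4], 'No')])   # -> 0.5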
def train_test_split(dataset, start=None, end=None, test_split=None):
    """
    If 'start' and 'end' are given, the examples from index 'start' to 'end'
    become the test set and the rest become the training set.
    If 'test_split' is given, that fraction of the examples (taken from the
    end) becomes the test set and the rest becomes the training set.
    """
    examples = dataset.examples
    if test_split is None:
        train = examples[:start] + examples[end:]
        val = examples[start:end]
    else:
        total_size = len(examples)
        val_size = int(total_size * test_split)
        train_size = total_size - val_size
        train = examples[:train_size]
        val = examples[train_size:total_size]

    return train, val


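# Hedged usage sketch (iris is loaded near the end of this module):
#
#     train, val = train_test_split(iris, test_split=0.2)   # last 20% as validation
#     train, val = train_test_split(iris, 0, 30)            # examples 0..29 as validation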
def cross_validation_wrapper(learner, dataset, k=10, trials=1):
    """
    [Figure 18.8]
    Return the optimal value of size having minimum error on the validation set.
    errs_t: training errors, indexed by size
    errs_v: validation errors, indexed by size
    """
    errs_t, errs_v = [], []
    size = 1
    while True:
        errT, errV = cross_validation(learner, dataset, size, k, trials)
        errs_t.append(errT)
        errs_v.append(errV)
        # stop once the training error has converged (no longer changes with size)
        if len(errs_t) > 1 and np.isclose(errs_t[-1], errs_t[-2], rtol=1e-6):
            # pick the size whose validation error is smallest
            best_size = 1 + min(range(len(errs_v)), key=lambda i: errs_v[i])
            return learner(dataset, best_size)
        size += 1


def cross_validation(learner, dataset, size=None, k=10, trials=1):
    """
    Do k-fold cross-validation and return the mean errors.
    That is, keep out 1/k of the examples for testing on each of k runs.
    Shuffle the examples first; if trials > 1, average over several shuffles.
    Returns Training error, Validation error
    """
    k = k or len(dataset.examples)
    if trials > 1:
        trial_errT = 0
        trial_errV = 0
        for t in range(trials):
            # each trial is a single k-fold run over a fresh shuffle
            errT, errV = cross_validation(learner, dataset, size, k, trials=1)
            trial_errT += errT
            trial_errV += errV
        return trial_errT / trials, trial_errV / trials
    else:
        fold_errT = 0
        fold_errV = 0
        n = len(dataset.examples)
        examples = dataset.examples
        random.shuffle(dataset.examples)
        for fold in range(k):
            train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k))
            dataset.examples = train_data
            h = learner(dataset, size)
            fold_errT += err_ratio(h, dataset, train_data)
            fold_errV += err_ratio(h, dataset, val_data)
            # reverting back to original once test is completed
            dataset.examples = examples
        return fold_errT / k, fold_errV / k


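# Hedged usage sketch. cross_validation passes `size` through to the learner, so
# wrap learners that do not take a size argument (restaurant is built near the
# end of this module):
#
#     def tree_learner(dataset, size=None):
#         return DecisionTreeLearner(dataset)
#
#     errT, errV = cross_validation(tree_learner, restaurant, k=4)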
def leave_one_out(learner, dataset, size=None):
    """Leave one out cross-validation over the dataset."""
    return cross_validation(learner, dataset, size, len(dataset.examples))


def learning_curve(learner, dataset, trials=10, sizes=None):
    """Return [(size, mean validation error)] pairs for increasing training-set sizes."""
    if sizes is None:
        sizes = list(range(2, len(dataset.examples) - trials, 2))

    def score(learner, size):
        random.shuffle(dataset.examples)
        _, errV = cross_validation(learner, dataset, size, k=trials)
        return errV

    return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes]


def PluralityLearner(dataset):
    """
    A very dumb algorithm: always pick the result that was most popular
    in the training data. Makes a baseline for comparison.
    """
    most_popular = mode([e[dataset.target] for e in dataset.examples])

    def predict(example):
        """Always return same result: the most popular from the training set."""
        return most_popular

    return predict


class DecisionFork:
    """
    A fork of a decision tree holds an attribute to test, and a dict
    of branches, one for each of the attribute's values.
    """

    def __init__(self, attr, attr_name=None, default_child=None, branches=None):
        """Initialize by saying what attribute this node tests."""
        self.attr = attr
        self.attr_name = attr_name or attr
        self.default_child = default_child
        self.branches = branches or {}

    def __call__(self, example):
        """Given an example, classify it using the attribute and the branches."""
        attr_val = example[self.attr]
        if attr_val in self.branches:
            return self.branches[attr_val](example)
        else:
            # return default class when attribute is unknown
            return self.default_child(example)

    def add(self, val, subtree):
        """Add a branch. If self.attr = val, go to the given subtree."""
        self.branches[val] = subtree

    def display(self, indent=0):
        name = self.attr_name
        print('Test', name)
        for (val, subtree) in self.branches.items():
            print(' ' * 4 * indent, name, '=', val, '==>', end=' ')
            subtree.display(indent + 1)

    def __repr__(self):
        return 'DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attr_name, self.branches)


class DecisionLeaf:
    """A leaf of a decision tree holds just a result."""

    def __init__(self, result):
        self.result = result

    def __call__(self, example):
        return self.result

    def display(self, indent=0):
        # indent is accepted (and ignored) so DecisionFork.display can recurse into leaves
        print('RESULT =', self.result)

    def __repr__(self):
        return repr(self.result)


def DecisionTreeLearner(dataset):
    """[Figure 18.5]"""

    target, values = dataset.target, dataset.values

    def decision_tree_learning(examples, attrs, parent_examples=()):
        if len(examples) == 0:
            return plurality_value(parent_examples)
        if all_same_class(examples):
            return DecisionLeaf(examples[0][target])
        if len(attrs) == 0:
            return plurality_value(examples)
        A = choose_attribute(attrs, examples)
        tree = DecisionFork(A, dataset.attr_names[A], plurality_value(examples))
        for (v_k, exs) in split_by(A, examples):
            subtree = decision_tree_learning(exs, remove_all(A, attrs), examples)
            tree.add(v_k, subtree)
        return tree

    def plurality_value(examples):
        """
        Return the most popular target value for this set of examples.
        (If target is binary, this is the majority; otherwise plurality).
        """
        popular = argmax_random_tie(values[target], key=lambda v: count(target, v, examples))
        return DecisionLeaf(popular)

    def count(attr, val, examples):
        """Count the number of examples that have example[attr] = val."""
        return sum(e[attr] == val for e in examples)

    def all_same_class(examples):
        """Are all these examples in the same target class?"""
        class0 = examples[0][target]
        return all(e[target] == class0 for e in examples)

    def choose_attribute(attrs, examples):
        """Choose the attribute with the highest information gain."""
        return argmax_random_tie(attrs, key=lambda a: information_gain(a, examples))

    def information_gain(attr, examples):
        """Return the expected reduction in entropy from splitting by attr."""

        def I(examples):
            return information_content([count(target, v, examples) for v in values[target]])

        n = len(examples)
        remainder = sum((len(examples_i) / n) * I(examples_i) for (v, examples_i) in split_by(attr, examples))
        return I(examples) - remainder

    def split_by(attr, examples):
        """Return a list of (val, examples) pairs for each val of attr."""
        return [(v, [e for e in examples if e[attr] == v]) for v in values[attr]]

    return decision_tree_learning(dataset.examples, dataset.inputs)


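# Hedged usage sketch (restaurant is the Figure 18.3 DataSet built near the end
# of this module):
#
#     tree = DecisionTreeLearner(restaurant)
#     tree.display()                  # print the learned tree
#     tree(restaurant.examples[0])    # classify a single example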
def information_content(values):
    """Number of bits to represent the probability distribution in values."""
    probabilities = normalize(remove_all(0, values))
    return sum(-p * np.log2(p) for p in probabilities)


def DecisionListLearner(dataset):
    """
    [Figure 18.11]
    A decision list implemented as a list of (test, value) pairs.
    """

    def decision_list_learning(examples):
        if not examples:
            return [(True, False)]
        t, o, examples_t = find_examples(examples)
        if not t:
            raise Exception
        return [(t, o)] + decision_list_learning(examples - examples_t)

    def find_examples(examples):
        """
        Find a set of examples that all have the same outcome under
        some test. Return a tuple of the test, outcome, and examples.
        """
        raise NotImplementedError

    def passes(example, test):
        """Does the example pass the test?"""
        raise NotImplementedError

    def predict(example):
        """Predict the outcome for the first passing test."""
        for test, outcome in predict.decision_list:
            if passes(example, test):
                return outcome

    predict.decision_list = decision_list_learning(set(dataset.examples))

    return predict


def NearestNeighborLearner(dataset, k=1):
    """k-NearestNeighbor: the k nearest neighbors vote."""

    def predict(example):
        """Find the k closest items, and have them vote for the best."""
        best = heapq.nsmallest(k, ((dataset.distance(e, example), e) for e in dataset.examples))
        return mode(e[dataset.target] for (d, e) in best)

    return predict


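# Hedged usage sketch (iris is loaded near the end of this module; sanitize
# blanks out the target attribute, as err_ratio does before predicting):
#
#     knn = NearestNeighborLearner(iris, k=3)
#     knn(iris.sanitize(iris.examples[0]))   # -> predicted class label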
def LinearLearner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.3]
    Linear classifier with hard threshold.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)

    # X transpose: one column (a list over all examples) per input attribute
    X_col = [[example[i] for example in examples] for i in idx_i]

    # add a dummy column of 1s for the intercept weight
    ones = [1 for _ in range(len(examples))]
    X_col = [ones] + X_col

    # initialize random weights
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)

    for epoch in range(epochs):
        err = []
        # pass over all examples
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = np.dot(w, x)
            t = example[idx_t]
            err.append(t - y)

        # batch gradient descent update of the weights
        for i in range(len(w)):
            w[i] = w[i] + learning_rate * (np.dot(err, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return np.dot(w, x)

    return predict


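# Hedged usage sketch: the target must already be numeric, e.g. after
# classes_to_numbers (learning rate and epochs are illustrative):
#
#     iris.classes_to_numbers()
#     h = LinearLearner(iris, learning_rate=0.05, epochs=200)
#     h(iris.examples[0])   # -> a real-valued score for the first example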
def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.4]
    Linear classifier with logistic regression.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)

    # X transpose: one column (a list over all examples) per input attribute
    X_col = [[example[i] for example in examples] for i in idx_i]

    # add a dummy column of 1s for the intercept weight
    ones = [1 for _ in range(len(examples))]
    X_col = [ones] + X_col

    # initialize random weights
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)

    for epoch in range(epochs):
        err = []
        h = []
        # pass over all examples
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = sigmoid(np.dot(w, x))
            h.append(sigmoid_derivative(y))
            t = example[idx_t]
            err.append(t - y)

        # batch gradient descent update of the weights
        for i in range(len(w)):
            buffer = [x * y for x, y in zip(err, h)]
            w[i] = w[i] + learning_rate * (np.dot(buffer, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return sigmoid(np.dot(w, x))

    return predict


def NeuralNetLearner(dataset, hidden_layer_sizes=None, learning_rate=0.01, epochs=100, activation=sigmoid):
    """
    Layered feed-forward network.
    hidden_layer_sizes: List of number of hidden units per hidden layer
    learning_rate: Learning rate of gradient descent
    epochs: Number of passes over the dataset
    """

    if hidden_layer_sizes is None:
        hidden_layer_sizes = [3]
    i_units = len(dataset.inputs)
    o_units = len(dataset.values[dataset.target])

    # construct a network
    raw_net = network(i_units, hidden_layer_sizes, o_units, activation)
    learned_net = BackPropagationLearner(dataset, raw_net, learning_rate, epochs, activation)

    def predict(example):
        # input nodes
        i_nodes = learned_net[0]

        # activate input layer
        for v, n in zip(example, i_nodes):
            n.value = v

        # forward pass
        for layer in learned_net[1:]:
            for node in layer:
                inc = [n.value for n in node.inputs]
                in_val = dot_product(inc, node.weights)
                node.value = node.activation(in_val)

        # hypothesis
        o_nodes = learned_net[-1]
        prediction = find_max_node(o_nodes)
        return prediction

    return predict


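# Hedged usage sketch (iris is loaded near the end of this module;
# classes_to_numbers turns the target into a 0-based index, which
# init_examples then one-hot encodes):
#
#     iris.classes_to_numbers()
#     nn = NeuralNetLearner(iris, hidden_layer_sizes=[5], learning_rate=0.1, epochs=50)
#     nn(iris.sanitize(iris.examples[0]))   # -> index of the predicted class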
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid):
    """
    [Figure 18.23]
    The back-propagation algorithm for multilayer networks.
    """
    # initialise weights
    for layer in net:
        for node in layer:
            node.weights = random_weights(min_value=-0.5, max_value=0.5, num_weights=len(node.weights))

    examples = dataset.examples
    # As of now dataset.target gives an int instead of list,
    # Changing dataset class will have effect on all the learners.
    # Will be taken care of later.
    o_nodes = net[-1]
    i_nodes = net[0]
    o_units = len(o_nodes)
    idx_t = dataset.target
    idx_i = dataset.inputs
    n_layers = len(net)

    inputs, targets = init_examples(examples, idx_i, idx_t, o_units)

    for epoch in range(epochs):
        # iterate over each example
        for e in range(len(examples)):
            i_val = inputs[e]
            t_val = targets[e]

            # activate input layer
            for v, n in zip(i_val, i_nodes):
                n.value = v

            # forward pass
            for layer in net[1:]:
                for node in layer:
                    inc = [n.value for n in node.inputs]
                    in_val = dot_product(inc, node.weights)
                    node.value = node.activation(in_val)

            # initialize delta
            delta = [[] for _ in range(n_layers)]

            # compute outer layer delta

            # error for the MSE cost function
            err = [t_val[i] - o_nodes[i].value for i in range(o_units)]

            # calculate delta at output
            if node.activation == sigmoid:
                delta[-1] = [sigmoid_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif node.activation == relu:
                delta[-1] = [relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif node.activation == tanh:
                delta[-1] = [tanh_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif node.activation == elu:
                delta[-1] = [elu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif node.activation == leaky_relu:
                delta[-1] = [leaky_relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            else:
                raise ValueError("Activation function unknown.")

            # backward pass
            h_layers = n_layers - 2
            for i in range(h_layers, 0, -1):
                layer = net[i]
                h_units = len(layer)
                nx_layer = net[i + 1]

                # weights from each ith layer node to each i + 1th layer node
                w = [[node.weights[k] for node in nx_layer] for k in range(h_units)]

                if activation == sigmoid:
                    delta[i] = [sigmoid_derivative(layer[j].value) * dot_product(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == relu:
                    delta[i] = [relu_derivative(layer[j].value) * dot_product(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == tanh:
                    delta[i] = [tanh_derivative(layer[j].value) * dot_product(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == elu:
                    delta[i] = [elu_derivative(layer[j].value) * dot_product(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == leaky_relu:
                    delta[i] = [leaky_relu_derivative(layer[j].value) * dot_product(w[j], delta[i + 1])
                                for j in range(h_units)]
                else:
                    raise ValueError("Activation function unknown.")

            # update weights
            for i in range(1, n_layers):
                layer = net[i]
                inc = [node.value for node in net[i - 1]]
                units = len(layer)
                for j in range(units):
                    layer[j].weights = vector_add(layer[j].weights,
                                                  scalar_vector_product(learning_rate * delta[i][j], inc))

    return net


def PerceptronLearner(dataset, learning_rate=0.01, epochs=100):
    """Logistic Regression, NO hidden layer"""
    i_units = len(dataset.inputs)
    o_units = len(dataset.values[dataset.target])
    hidden_layer_sizes = []
    raw_net = network(i_units, hidden_layer_sizes, o_units)
    learned_net = BackPropagationLearner(dataset, raw_net, learning_rate, epochs)

    def predict(example):
        o_nodes = learned_net[1]

        # forward pass
        for node in o_nodes:
            in_val = dot_product(example, node.weights)
            node.value = node.activation(in_val)

        # hypothesis
        return find_max_node(o_nodes)

    return predict


class NNUnit:
    """
    Single unit of a multi-layer neural network.
    inputs: incoming connections
    weights: weights on the incoming connections
    """

    def __init__(self, activation=sigmoid, weights=None, inputs=None):
        self.weights = weights or []
        self.inputs = inputs or []
        self.value = None
        self.activation = activation


def network(input_units, hidden_layer_sizes, output_units, activation=sigmoid):
    """
    Create a directed acyclic network with the given number of layers.
    hidden_layer_sizes: list of the number of units in each hidden layer,
    excluding the input and output layers.
    """
    layers_sizes = [input_units] + hidden_layer_sizes + [output_units]

    net = [[NNUnit(activation) for _ in range(size)] for size in layers_sizes]
    n_layers = len(net)

    # make connection
    for i in range(1, n_layers):
        for n in net[i]:
            for k in net[i - 1]:
                n.inputs.append(k)
                n.weights.append(0)
    return net


def init_examples(examples, idx_i, idx_t, o_units):
    inputs, targets = {}, {}

    for i, e in enumerate(examples):
        # input values of e
        inputs[i] = [e[attr] for attr in idx_i]

        if o_units > 1:
            # one-hot representation of e's target
            t = [0 for _ in range(o_units)]
            t[e[idx_t]] = 1
            targets[i] = t
        else:
            # target value of e
            targets[i] = [e[idx_t]]

    return inputs, targets


def find_max_node(nodes):
    """Return the index of the node with the highest value."""
    return nodes.index(max(nodes, key=lambda node: node.value))


class SVC:
    """Support Vector Machine classifier, trained by solving the dual QP problem."""

    def __init__(self, kernel=linear_kernel, C=1.0, verbose=False):
        self.kernel = kernel
        self.C = C  # hyper-parameter
        self.sv_idx, self.sv, self.sv_y = np.zeros(0), np.zeros(0), np.zeros(0)
        self.alphas = np.zeros(0)
        self.w = None
        self.b = 0.0  # intercept
        self.verbose = verbose

    def fit(self, X, y):
        """
        Trains the model by solving a quadratic programming problem.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        """
        # In QP formulation (dual): m variables, 2m+1 constraints (1 equation, 2m inequations)
        self.solve_qp(X, y)
        sv = self.alphas > 1e-5
        self.sv_idx = np.arange(len(self.alphas))[sv]
        self.sv, self.sv_y, self.alphas = X[sv], y[sv], self.alphas[sv]

        if self.kernel == linear_kernel:
            self.w = np.dot(self.alphas * self.sv_y, self.sv)

        for n in range(len(self.alphas)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.alphas * self.sv_y * self.K[self.sv_idx[n], sv])
        self.b /= len(self.alphas)
        return self

    def solve_qp(self, X, y):
        """
        Solves a quadratic programming problem. In QP formulation (dual):
        m variables, 2m+1 constraints (1 equation, 2m inequations).
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        """
        m = len(y)  # m = n_samples
        self.K = self.kernel(X)  # gram matrix
        P = self.K * np.outer(y, y)
        q = -np.ones(m)
        lb = np.zeros(m)  # lower bounds
        ub = np.ones(m) * self.C  # upper bounds
        A = y.astype(np.float64)  # equality matrix
        b = np.zeros(1)  # equality vector
        self.alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
                               sym_proj=True, verbose=self.verbose)

    def predict_score(self, X):
        """Predicts the decision-function score for the given examples."""
        if self.w is None:
            return np.dot(self.alphas * self.sv_y, self.kernel(self.sv, X)) + self.b
        return np.dot(X, self.w) + self.b

    def predict(self, X):
        """Predicts the class (+1 or -1) of the given examples."""
        return np.sign(self.predict_score(X))


class SVR:
    """Epsilon-insensitive Support Vector Machine regressor, trained by solving the dual QP problem."""

    def __init__(self, kernel=linear_kernel, C=1.0, epsilon=0.1, verbose=False):
        self.kernel = kernel
        self.C = C  # hyper-parameter
        self.epsilon = epsilon  # epsilon insensitive loss value
        self.sv_idx, self.sv = np.zeros(0), np.zeros(0)
        self.alphas_p, self.alphas_n = np.zeros(0), np.zeros(0)
        self.w = None
        self.b = 0.0  # intercept
        self.verbose = verbose

    def fit(self, X, y):
        """
        Trains the model by solving a quadratic programming problem.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the target values
        """
        # In QP formulation (dual): m variables, 2m+1 constraints (1 equation, 2m inequations)
        self.solve_qp(X, y)

        sv = np.logical_or(self.alphas_p > 1e-5, self.alphas_n > 1e-5)
        self.sv_idx = np.arange(len(self.alphas_p))[sv]
        self.sv, sv_y = X[sv], y[sv]
        self.alphas_p, self.alphas_n = self.alphas_p[sv], self.alphas_n[sv]

        if self.kernel == linear_kernel:
            self.w = np.dot(self.alphas_p - self.alphas_n, self.sv)

        for n in range(len(self.alphas_p)):
            self.b += sv_y[n]
            self.b -= np.sum((self.alphas_p - self.alphas_n) * self.K[self.sv_idx[n], sv])
            self.b -= self.epsilon
        self.b /= len(self.alphas_p)

        return self

    def solve_qp(self, X, y):
        """
        Solves a quadratic programming problem. In QP formulation (dual):
        m variables, 2m+1 constraints (1 equation, 2m inequations).
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the target values
        """
        m = len(y)  # m = n_samples
        self.K = self.kernel(X)  # gram matrix
        P = np.vstack((np.hstack((self.K, -self.K)),  # alphas_p, alphas_n
                       np.hstack((-self.K, self.K))))  # alphas_n, alphas_p
        q = np.hstack((-y, y)) + self.epsilon
        lb = np.zeros(2 * m)  # lower bounds
        ub = np.ones(2 * m) * self.C  # upper bounds
        A = np.hstack((np.ones(m), -np.ones(m)))  # equality matrix
        b = np.zeros(1)  # equality vector
        alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
                          sym_proj=True, verbose=self.verbose)
        self.alphas_p = alphas[:m]
        self.alphas_n = alphas[m:]

    def predict(self, X):
        if self.kernel != linear_kernel:
            return np.dot(self.alphas_p - self.alphas_n, self.kernel(self.sv, X)) + self.b
        return np.dot(X, self.w) + self.b


class MultiClassLearner:
    """One-vs-rest ('ovr') or one-vs-one ('ovo') multi-class wrapper around a binary classifier."""

    def __init__(self, clf, decision_function='ovr'):
        self.clf = clf
        self.decision_function = decision_function
        self.n_class, self.classifiers = 0, []

    def fit(self, X, y):
        """
        Trains n_class or n_class * (n_class - 1) / 2 classifiers
        according to the training method, ovr or ovo respectively.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        :return: self, with the trained classifiers stored in self.classifiers
        """
        labels = np.unique(y)
        self.n_class = len(labels)
        if self.decision_function == 'ovr':  # one-vs-rest method
            for label in labels:
                y1 = np.array(y)
                y1[y1 != label] = -1.0
                y1[y1 == label] = 1.0
                self.clf.fit(X, y1)
                self.classifiers.append(copy.deepcopy(self.clf))
        elif self.decision_function == 'ovo':  # use one-vs-one method
            n_labels = len(labels)
            for i in range(n_labels):
                for j in range(i + 1, n_labels):
                    neg_id, pos_id = y == labels[i], y == labels[j]
                    X1, y1 = np.r_[X[neg_id], X[pos_id]], np.r_[y[neg_id], y[pos_id]]
                    y1[y1 == labels[i]] = -1.0
                    y1[y1 == labels[j]] = 1.0
                    self.clf.fit(X1, y1)
                    self.classifiers.append(copy.deepcopy(self.clf))
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
        return self

    def predict(self, X):
        """
        Predicts the class of the given examples according to the training method.
        """
        n_samples = len(X)
        if self.decision_function == 'ovr':  # one-vs-rest method
            assert len(self.classifiers) == self.n_class
            score = np.zeros((n_samples, self.n_class))
            for i in range(self.n_class):
                clf = self.classifiers[i]
                score[:, i] = clf.predict_score(X)
            return np.argmax(score, axis=1)
        elif self.decision_function == 'ovo':  # use one-vs-one method
            assert len(self.classifiers) == self.n_class * (self.n_class - 1) / 2
            vote = np.zeros((n_samples, self.n_class))
            clf_id = 0
            for i in range(self.n_class):
                for j in range(i + 1, self.n_class):
                    res = self.classifiers[clf_id].predict(X)
                    vote[res < 0, i] += 1.0  # negative sample: class i
                    vote[res > 0, j] += 1.0  # positive sample: class j
                    clf_id += 1
            return np.argmax(vote, axis=1)
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")


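# Hedged usage sketch: SVC/SVR/MultiClassLearner work on numpy arrays rather
# than DataSet objects, so unpack the examples first (illustrative; requires
# the cvxopt solver backend for qpsolvers):
#
#     iris.classes_to_numbers()
#     X = np.array([e[:iris.target] for e in iris.examples], dtype=float)
#     y = np.array([e[iris.target] for e in iris.examples], dtype=float)
#     clf = MultiClassLearner(SVC(kernel=linear_kernel, C=1.0)).fit(X, y)
#     clf.predict(X[:5])   # -> array of predicted class indices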
def EnsembleLearner(learners):
    """Given a list of learning algorithms, have them vote."""

    def train(dataset):
        predictors = [learner(dataset) for learner in learners]

        def predict(example):
            return mode(predictor(example) for predictor in predictors)

        return predict

    return train


def ada_boost(dataset, L, K):
    """[Figure 18.34]"""

    examples, target = dataset.examples, dataset.target
    n = len(examples)
    eps = 1 / (2 * n)
    w = [1 / n] * n
    h, z = [], []
    for k in range(K):
        h_k = L(dataset, w)
        h.append(h_k)
        error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example))
        # avoid divide-by-0 from either 0% or 100% error rates
        error = np.clip(error, eps, 1 - eps)
        for j, example in enumerate(examples):
            if example[target] == h_k(example):
                w[j] *= error / (1 - error)
        w = normalize(w)
        z.append(np.log((1 - error) / error))
    return weighted_majority(h, z)


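# Hedged usage sketch: ada_boost needs a learner that accepts example weights;
# WeightedLearner (defined below) adapts an unweighted learner for this:
#
#     boosted = ada_boost(restaurant, WeightedLearner(DecisionTreeLearner), K=5)
#     err_ratio(boosted, restaurant)   # training error of the boosted ensemble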
def weighted_majority(predictors, weights):
    """Return a predictor that takes a weighted vote."""

    def predict(example):
        return weighted_mode((predictor(example) for predictor in predictors), weights)

    return predict


def weighted_mode(values, weights):
    """
    Return the value with the greatest total weight.
    >>> weighted_mode('abbaa', [1, 2, 3, 1, 2])
    'b'
    """
    totals = defaultdict(int)
    for v, w in zip(values, weights):
        totals[v] += w
    return max(totals, key=totals.__getitem__)


def RandomForest(dataset, n=5):
    """An ensemble of Decision Trees trained using bagging and feature bagging."""

    def data_bagging(dataset, m=0):
        """Sample m examples with replacement"""
        n = len(dataset.examples)
        return weighted_sample_with_replacement(m or n, dataset.examples, [1] * n)

    def feature_bagging(dataset, p=0.7):
        """Feature bagging with probability p to retain an attribute"""
        inputs = [i for i in dataset.inputs if probability(p)]
        return inputs or dataset.inputs

    def predict(example):
        return mode(predictor(example) for predictor in predictors)

    predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), attrs=dataset.attrs,
                                              attr_names=dataset.attr_names, target=dataset.target,
                                              inputs=feature_bagging(dataset))) for _ in range(n)]

    return predict


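# Hedged usage sketch (restaurant is built near the end of this module):
#
#     forest = RandomForest(restaurant, n=5)
#     forest(restaurant.examples[0])   # majority vote of the five bagged trees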
def WeightedLearner(unweighted_learner):
    """
    [Page 749 footnote 14]
    Given a learner that takes just an unweighted dataset, return
    one that takes also a weight for each example.
    """

    def train(dataset, weights):
        return unweighted_learner(replicated_dataset(dataset, weights))

    return train


def replicated_dataset(dataset, weights, n=None):
    """Copy dataset, replicating each example in proportion to its weight."""
    n = n or len(dataset.examples)
    result = copy.copy(dataset)
    result.examples = weighted_replicate(dataset.examples, weights, n)
    return result


def weighted_replicate(seq, weights, n):
    """
    Return n selections from seq, with the count of each element of
    seq proportional to the corresponding weight (filling in fractions
    randomly).
    >>> weighted_replicate('ABC', [1, 2, 1], 4)
    ['A', 'B', 'B', 'C']
    """
    assert len(seq) == len(weights)
    weights = normalize(weights)
    wholes = [int(w * n) for w in weights]
    fractions = [(w * n) % 1 for w in weights]
    return (flatten([x] * nx for x, nx in zip(seq, wholes)) +
            weighted_sample_with_replacement(n - sum(wholes), seq, fractions))


# metrics

def accuracy_score(y_pred, y_true):
    assert y_pred.shape == y_true.shape
    return np.mean(np.equal(y_pred, y_true))


def r2_score(y_pred, y_true):
    assert y_pred.shape == y_true.shape
    return 1. - (np.sum(np.square(y_pred - y_true)) /  # sum of squared residuals
                 np.sum(np.square(y_true - np.mean(y_true))))  # total sum of squares


# datasets

orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum')

zoo = DataSet(name='zoo', target='type', exclude=['name'],
              attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone '
                         'breathes venomous fins legs tail domestic catsize type')

iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class')


def RestaurantDataSet(examples=None):
    """
    [Figure 18.3]
    Build a DataSet of Restaurant waiting examples.
    """
    return DataSet(name='restaurant', target='Wait', examples=examples,
                   attr_names='Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait')


restaurant = RestaurantDataSet()


def T(attr_name, branches):
    """Build a DecisionFork for the restaurant domain from a dict of branches."""
    branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child))
                for value, child in branches.items()}
    return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches)


"""
1166
[Figure 18.2]
1167
A decision tree for deciding whether to wait for a table at a hotel.
1168
"""
1169
1170
waiting_decision_tree = T('Patrons',
1171
{'None': 'No', 'Some': 'Yes',
1172
'Full': T('WaitEstimate',
1173
{'>60': 'No', '0-10': 'Yes',
1174
'30-60': T('Alternate',
1175
{'No': T('Reservation',
1176
{'Yes': 'Yes',
1177
'No': T('Bar', {'No': 'No',
1178
'Yes': 'Yes'})}),
1179
'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
1180
'10-30': T('Hungry',
1181
{'No': 'Yes',
1182
'Yes': T('Alternate',
1183
{'No': 'Yes',
1184
'Yes': T('Raining',
1185
{'No': 'No',
1186
'Yes': 'Yes'})})})})})
1187
1188
1189
def SyntheticRestaurant(n=20):
    """Generate a DataSet with n examples."""

    def gen():
        example = list(map(random.choice, restaurant.values))
        example[restaurant.target] = waiting_decision_tree(example)
        return example

    return RestaurantDataSet([gen() for _ in range(n)])


def Majority(k, n):
    """
    Return a DataSet with n k-bit examples of the majority problem:
    k random bits followed by a 1 if more than half the bits are 1, else 0.
    """
    examples = []
    for i in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        bits.append(int(sum(bits) > k / 2))
        examples.append(bits)
    return DataSet(name='majority', examples=examples)


def Parity(k, n, name='parity'):
    """
    Return a DataSet with n k-bit examples of the parity problem:
    k random bits followed by a 1 if an odd number of bits are 1, else 0.
    """
    examples = []
    for i in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        bits.append(sum(bits) % 2)
        examples.append(bits)
    return DataSet(name=name, examples=examples)


def Xor(n):
    """Return a DataSet with n examples of 2-input xor."""
    return Parity(2, n, name='xor')


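# Hedged usage sketch: the generated datasets above make quick smoke tests for
# the learners (sizes and hyper-parameters are illustrative):
#
#     majority = Majority(7, 100)
#     h = PerceptronLearner(majority, learning_rate=0.05, epochs=100)
#     h(majority.examples[0])   # -> 0 or 1, the index of the most active output node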
def ContinuousXor(n):
    """2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints."""
    examples = []
    for i in range(n):
        x, y = [random.uniform(0.0, 2.0) for _ in '12']
        examples.append([x, y, int(x) != int(y)])
    return DataSet(name='continuous xor', examples=examples)


def compare(algorithms=None, datasets=None, k=10, trials=1):
    """
    Compare various learners on various datasets using cross-validation.
    Print results as a table.
    """
    # default list of algorithms
    algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner]

    # default list of datasets
    datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20),
                            Majority(7, 100), Parity(7, 100), Xor(100)]

    print_table([[a.__name__.replace('Learner', '')] + [cross_validation(a, d, k=k, trials=trials) for d in datasets]
                 for a in algorithms], header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')