"""Learning from examples (Chapters 18)"""
import copy
from collections import defaultdict
from statistics import stdev
from qpsolvers import solve_qp
from deep_learning4e import Sigmoid
from probabilistic_learning import NaiveBayesLearner
from utils4e import *
class DataSet:
"""
A data set for a machine learning problem. It has the following fields:
d.examples A list of examples. Each one is a list of attribute values.
d.attrs A list of integers to index into an example, so example[attr]
gives a value. Normally the same as range(len(d.examples[0])).
d.attr_names Optional list of mnemonic names for corresponding attrs.
d.target The attribute that a learning algorithm will try to predict.
By default the final attribute.
d.inputs The list of attrs without the target.
d.values A list of lists: each sublist is the set of possible
values for the corresponding attribute. If initially None,
it is computed from the known examples by self.set_problem.
If not None, an erroneous value raises ValueError.
    d.distance   A function from a pair of examples to a non-negative number.
                 Should be symmetric. Defaults to mean_boolean_error
                 since that can handle any field types.
d.name Name of the data set (for output display only).
d.source URL or other source where the data came from.
d.exclude A list of attribute indexes to exclude from d.inputs. Elements
of this list can either be integers (attrs) or attr_names.
Normally, you call the constructor and you're done; then you just
access fields like d.examples and d.target and d.inputs.
"""
def __init__(self, examples=None, attrs=None, attr_names=None, target=-1, inputs=None,
values=None, distance=mean_boolean_error, name='', source='', exclude=()):
"""
Accepts any of DataSet's fields. Examples can also be a
string or file from which to parse examples using parse_csv.
Optional parameter: exclude, as documented in .set_problem().
>>> DataSet(examples='1, 2, 3')
<DataSet(): 1 examples, 3 attributes>
"""
self.name = name
self.source = source
self.values = values
self.distance = distance
self.got_values_flag = bool(values)
if isinstance(examples, str):
self.examples = parse_csv(examples)
elif examples is None:
self.examples = parse_csv(open_data(name + '.csv').read())
else:
self.examples = examples
if self.examples is not None and attrs is None:
attrs = list(range(len(self.examples[0])))
self.attrs = attrs
if isinstance(attr_names, str):
self.attr_names = attr_names.split()
else:
self.attr_names = attr_names or attrs
self.set_problem(target, inputs=inputs, exclude=exclude)
def set_problem(self, target, inputs=None, exclude=()):
"""
Set (or change) the target and/or inputs.
        This way, one DataSet can be used multiple ways. inputs, if specified,
        is a list of attributes; alternatively, specify exclude as a list of
        attributes to omit from inputs. Attributes can be -n .. n-1, or an attr_name.
Also computes the list of possible values, if that wasn't done yet.
"""
self.target = self.attr_num(target)
exclude = list(map(self.attr_num, exclude))
if inputs:
self.inputs = remove_all(self.target, inputs)
else:
self.inputs = [a for a in self.attrs if a != self.target and a not in exclude]
if not self.values:
self.update_values()
self.check_me()
def check_me(self):
"""Check that my fields make sense."""
assert len(self.attr_names) == len(self.attrs)
assert self.target in self.attrs
assert self.target not in self.inputs
assert set(self.inputs).issubset(set(self.attrs))
if self.got_values_flag:
list(map(self.check_example, self.examples))
def add_example(self, example):
"""Add an example to the list of examples, checking it first."""
self.check_example(example)
self.examples.append(example)
def check_example(self, example):
"""Raise ValueError if example has any invalid values."""
if self.values:
for a in self.attrs:
if example[a] not in self.values[a]:
raise ValueError('Bad value {} for attribute {} in {}'
.format(example[a], self.attr_names[a], example))
def attr_num(self, attr):
"""Returns the number used for attr, which can be a name, or -n .. n-1."""
if isinstance(attr, str):
return self.attr_names.index(attr)
elif attr < 0:
return len(self.attrs) + attr
else:
return attr
def update_values(self):
self.values = list(map(unique, zip(*self.examples)))
    def sanitize(self, example):
        """Return a copy of example, with non-input attributes replaced by None
        and the final attribute dropped (so this assumes the target is last)."""
        return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)][:-1]
def classes_to_numbers(self, classes=None):
"""Converts class names to numbers."""
if not classes:
classes = sorted(self.values[self.target])
for item in self.examples:
item[self.target] = classes.index(item[self.target])
def remove_examples(self, value=''):
"""Remove examples that contain given value."""
self.examples = [x for x in self.examples if value not in x]
self.update_values()
def split_values_by_classes(self):
"""Split values into buckets according to their class."""
buckets = defaultdict(lambda: [])
target_names = self.values[self.target]
for v in self.examples:
item = [a for a in v if a not in target_names]
buckets[v[self.target]].append(item)
return buckets
def find_means_and_deviations(self):
"""
        Finds the means and standard deviations of the dataset's input features,
        grouped by class.
means : a dictionary for each class/target. Holds a list of the means
of the features for the class.
deviations: a dictionary for each class/target. Holds a list of the sample
standard deviations of the features for the class.
"""
target_names = self.values[self.target]
feature_numbers = len(self.inputs)
item_buckets = self.split_values_by_classes()
means = defaultdict(lambda: [0] * feature_numbers)
deviations = defaultdict(lambda: [0] * feature_numbers)
for t in target_names:
features = [[] for _ in range(feature_numbers)]
for item in item_buckets[t]:
for i in range(feature_numbers):
features[i].append(item[i])
for i in range(feature_numbers):
means[t][i] = mean(features[i])
deviations[t][i] = stdev(features[i])
return means, deviations
def __repr__(self):
return '<DataSet({}): {:d} examples, {:d} attributes>'.format(self.name, len(self.examples), len(self.attrs))
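# A minimal usage sketch for DataSet (hedged: the attribute names and the tiny
# inline CSV below are purely illustrative):
#
#   d = DataSet(examples='1, 2, 3 \n 4, 5, 6', attr_names='a b c', target='c')
#   assert d.target == 2 and d.inputs == [0, 1]
#   d.values[2]   # -> the possible values of the target column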
def parse_csv(input, delim=','):
r"""
Input is a string consisting of lines, each line has comma-delimited
fields. Convert this into a list of lists. Blank lines are skipped.
Fields that look like numbers are converted to numbers.
The delim defaults to ',' but '\t' and None are also reasonable values.
>>> parse_csv('1, 2, 3 \n 0, 2, na')
[[1, 2, 3], [0, 2, 'na']]
"""
lines = [line for line in input.splitlines() if line.strip()]
return [list(map(num_or_str, line.split(delim))) for line in lines]
def err_ratio(learner, dataset, examples=None):
    """Return the proportion of the examples that are NOT correctly predicted."""
    examples = examples or dataset.examples
    if len(examples) == 0:
        return 0.0
    right = 0
    for example in examples:
        desired = example[dataset.target]
        output = learner.predict(dataset.sanitize(example))
        try:
            correct = np.allclose(output, desired)
        except TypeError:  # non-numeric values, e.g. class names
            correct = output == desired
        if correct:
            right += 1
    return 1 - (right / len(examples))
def grade_learner(learner, tests):
"""
Grades the given learner based on how many tests it passes.
tests is a list with each element in the form: (values, output).
"""
return mean(int(learner.predict(X) == y) for X, y in tests)
def train_test_split(dataset, start=None, end=None, test_split=None):
"""
    If 'start' and 'end' are given, the examples from index 'start' to 'end'
    become the validation set and the rest the training set.
    If 'test_split' is given instead, the last test_split fraction of the
    examples becomes the validation set and the rest the training set.
"""
examples = dataset.examples
if test_split is None:
train = examples[:start] + examples[end:]
val = examples[start:end]
else:
total_size = len(examples)
val_size = int(total_size * test_split)
train_size = total_size - val_size
train = examples[:train_size]
val = examples[train_size:total_size]
return train, val
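# For instance (a hedged sketch, assuming ten examples): train_test_split(d, 2, 4)
# returns examples 2-3 as the validation set and the remaining eight for training,
# while train_test_split(d, test_split=0.2) holds out the last two examples.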
def model_selection(learner, dataset, k=10, trials=1):
    """
    [Figure 18.8]
    Return the learner trained with the size that has the minimum error
    on the validation set.
    errs: a validation error array, indexed by size
    """
    errs = []
    size = 1
    while True:
        errs.append(cross_validation(learner, dataset, size, k, trials))
        # stop once the validation error has converged between successive sizes
        if len(errs) > 1 and np.isclose(errs[-1], errs[-2], rtol=1e-6):
            best_size = errs.index(min(errs)) + 1  # errs[i] holds the error for size i + 1
            return learner(dataset, best_size)
        size += 1
def cross_validation(learner, dataset, size=None, k=10, trials=1):
    """
    Do k-fold cross-validation and return the mean error.
    That is, keep out 1/k of the examples for testing on each of k runs.
    Shuffle the examples first; if trials > 1, average over several shuffles.
    """
    k = k or len(dataset.examples)
    if trials > 1:
        trial_errs = 0
        for t in range(trials):
            errs = cross_validation(learner, dataset, size, k, trials=1)
            trial_errs += errs
        return trial_errs / trials
    else:
        fold_errs = 0
        n = len(dataset.examples)
        examples = dataset.examples
        random.shuffle(dataset.examples)
        for fold in range(k):
            train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k))
            dataset.examples = train_data
            h = learner(dataset, size) if size is not None else learner(dataset)
            fold_errs += err_ratio(h, dataset, val_data)
            # revert to the full example list before the next fold
            dataset.examples = examples
        return fold_errs / k
def leave_one_out(learner, dataset, size=None):
"""Leave one out cross-validation over the dataset."""
return cross_validation(learner, dataset, size, len(dataset.examples))
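# A minimal usage sketch for the validation helpers (hedged; it relies on the
# `restaurant` DataSet constructed near the bottom of this module, so run it
# only once the module has fully loaded):
#
#   err = cross_validation(DecisionTreeLearner, restaurant, k=4)
#   assert 0.0 <= err <= 1.0
#   leave_one_out(DecisionTreeLearner, restaurant)   # k = number of examples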
def learning_curve(learner, dataset, trials=10, sizes=None):
    """Return [(size, mean error)] for training sets of increasing size."""
    if sizes is None:
        sizes = list(range(2, len(dataset.examples) - trials, 2))

    def score(learner, size):
        random.shuffle(dataset.examples)
        examples = dataset.examples
        # train on the first `size` shuffled examples, test on the rest
        dataset.examples = examples[:size]
        h = learner(dataset)
        err = err_ratio(h, dataset, examples[size:])
        dataset.examples = examples
        return err

    return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes]
class PluralityLearner:
"""
A very dumb algorithm: always pick the result that was most popular
in the training data. Makes a baseline for comparison.
"""
def __init__(self, dataset):
self.most_popular = mode([e[dataset.target] for e in dataset.examples])
def predict(self, example):
"""Always return same result: the most popular from the training set."""
return self.most_popular
class DecisionFork:
"""
A fork of a decision tree holds an attribute to test, and a dict
of branches, one for each of the attribute's values.
"""
def __init__(self, attr, attr_name=None, default_child=None, branches=None):
"""Initialize by saying what attribute this node tests."""
self.attr = attr
self.attr_name = attr_name or attr
self.default_child = default_child
self.branches = branches or {}
def __call__(self, example):
"""Given an example, classify it using the attribute and the branches."""
attr_val = example[self.attr]
if attr_val in self.branches:
return self.branches[attr_val](example)
else:
return self.default_child(example)
def add(self, val, subtree):
"""Add a branch. If self.attr = val, go to the given subtree."""
self.branches[val] = subtree
def display(self, indent=0):
name = self.attr_name
print('Test', name)
for (val, subtree) in self.branches.items():
print(' ' * 4 * indent, name, '=', val, '==>', end=' ')
subtree.display(indent + 1)
def __repr__(self):
return 'DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attr_name, self.branches)
class DecisionLeaf:
"""A leaf of a decision tree holds just a result."""
def __init__(self, result):
self.result = result
def __call__(self, example):
return self.result
def display(self):
print('RESULT =', self.result)
def __repr__(self):
return repr(self.result)
class DecisionTreeLearner:
"""[Figure 18.5]"""
def __init__(self, dataset):
self.dataset = dataset
self.tree = self.decision_tree_learning(dataset.examples, dataset.inputs)
def decision_tree_learning(self, examples, attrs, parent_examples=()):
if len(examples) == 0:
return self.plurality_value(parent_examples)
if self.all_same_class(examples):
return DecisionLeaf(examples[0][self.dataset.target])
if len(attrs) == 0:
return self.plurality_value(examples)
A = self.choose_attribute(attrs, examples)
tree = DecisionFork(A, self.dataset.attr_names[A], self.plurality_value(examples))
for (v_k, exs) in self.split_by(A, examples):
subtree = self.decision_tree_learning(exs, remove_all(A, attrs), examples)
tree.add(v_k, subtree)
return tree
def plurality_value(self, examples):
"""
Return the most popular target value for this set of examples.
(If target is binary, this is the majority; otherwise plurality).
"""
popular = argmax_random_tie(self.dataset.values[self.dataset.target],
key=lambda v: self.count(self.dataset.target, v, examples))
return DecisionLeaf(popular)
def count(self, attr, val, examples):
"""Count the number of examples that have example[attr] = val."""
return sum(e[attr] == val for e in examples)
def all_same_class(self, examples):
"""Are all these examples in the same target class?"""
class0 = examples[0][self.dataset.target]
return all(e[self.dataset.target] == class0 for e in examples)
def choose_attribute(self, attrs, examples):
"""Choose the attribute with the highest information gain."""
return argmax_random_tie(attrs, key=lambda a: self.information_gain(a, examples))
def information_gain(self, attr, examples):
"""Return the expected reduction in entropy from splitting by attr."""
def I(examples):
return information_content([self.count(self.dataset.target, v, examples)
for v in self.dataset.values[self.dataset.target]])
n = len(examples)
remainder = sum((len(examples_i) / n) * I(examples_i)
for (v, examples_i) in self.split_by(attr, examples))
return I(examples) - remainder
def split_by(self, attr, examples):
"""Return a list of (val, examples) pairs for each val of attr."""
return [(v, [e for e in examples if e[attr] == v]) for v in self.dataset.values[attr]]
def predict(self, x):
return self.tree(x)
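# A minimal usage sketch (hedged; `restaurant` is defined near the bottom of
# this module):
#
#   dt = DecisionTreeLearner(restaurant)
#   dt.tree.display()                    # pretty-print the induced tree
#   dt.predict(restaurant.examples[0])   # -> 'Yes' or 'No'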
def information_content(values):
"""Number of bits to represent the probability distribution in values."""
probabilities = normalize(remove_all(0, values))
return sum(-p * np.log2(p) for p in probabilities)
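# For example, information_content([4, 4]) is 1.0: a 50/50 split between two
# classes takes exactly one bit, while information_content([8, 0]) is 0.0.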
class DecisionListLearner:
"""
[Figure 18.11]
A decision list implemented as a list of (test, value) pairs.
"""
    def __init__(self, dataset):
        # note: examples must be hashable (e.g. tuples) for the set operations below
        self.decision_list = self.decision_list_learning(set(dataset.examples))
def decision_list_learning(self, examples):
if not examples:
return [(True, False)]
t, o, examples_t = self.find_examples(examples)
        if not t:
            raise Exception('find_examples failed to find a valid test')
return [(t, o)] + self.decision_list_learning(examples - examples_t)
def find_examples(self, examples):
"""
Find a set of examples that all have the same outcome under
some test. Return a tuple of the test, outcome, and examples.
"""
raise NotImplementedError
def passes(self, example, test):
"""Does the example pass the test?"""
raise NotImplementedError
    def predict(self, example):
        """Predict the outcome for the first passing test."""
        for test, outcome in self.decision_list:
            if self.passes(example, test):
                return outcome
class NearestNeighborLearner:
"""k-NearestNeighbor: the k nearest neighbors vote."""
def __init__(self, dataset, k=1):
self.dataset = dataset
self.k = k
def predict(self, example):
"""Find the k closest items, and have them vote for the best."""
best = heapq.nsmallest(self.k, ((self.dataset.distance(e, example), e) for e in self.dataset.examples))
return mode(e[self.dataset.target] for (d, e) in best)
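# A minimal usage sketch (hedged; `iris` is defined near the bottom of this
# module):
#
#   knn = NearestNeighborLearner(iris, k=3)
#   knn.predict(iris.examples[0])   # -> a class label, e.g. 'setosa'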
class SVC:
def __init__(self, kernel=linear_kernel, C=1.0, verbose=False):
self.kernel = kernel
self.C = C
self.sv_idx, self.sv, self.sv_y = np.zeros(0), np.zeros(0), np.zeros(0)
self.alphas = np.zeros(0)
self.w = None
self.b = 0.0
self.verbose = verbose
def fit(self, X, y):
"""
Trains the model by solving a quadratic programming problem.
:param X: array of size [n_samples, n_features] holding the training samples
:param y: array of size [n_samples] holding the class labels
"""
self.solve_qp(X, y)
sv = self.alphas > 1e-5
self.sv_idx = np.arange(len(self.alphas))[sv]
self.sv, self.sv_y, self.alphas = X[sv], y[sv], self.alphas[sv]
if self.kernel == linear_kernel:
self.w = np.dot(self.alphas * self.sv_y, self.sv)
for n in range(len(self.alphas)):
self.b += self.sv_y[n]
self.b -= np.sum(self.alphas * self.sv_y * self.K[self.sv_idx[n], sv])
self.b /= len(self.alphas)
return self
def solve_qp(self, X, y):
"""
Solves a quadratic programming problem. In QP formulation (dual):
        m variables, 2m+1 constraints (1 equality, 2m inequalities).
:param X: array of size [n_samples, n_features] holding the training samples
:param y: array of size [n_samples] holding the class labels
"""
m = len(y)
self.K = self.kernel(X)
P = self.K * np.outer(y, y)
q = -np.ones(m)
lb = np.zeros(m)
ub = np.ones(m) * self.C
A = y.astype(np.float64)
b = np.zeros(1)
self.alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
sym_proj=True, verbose=self.verbose)
    def predict_score(self, X):
        """Predict the decision-function score for the given examples."""
if self.w is None:
return np.dot(self.alphas * self.sv_y, self.kernel(self.sv, X)) + self.b
return np.dot(X, self.w) + self.b
def predict(self, X):
"""
Predicts the class of a given example.
"""
return np.sign(self.predict_score(X))
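# A minimal usage sketch (hedged; labels must be +/-1 floats for the dual
# formulation above, and the data here is a toy separable set):
#
#   X = np.array([[0., 0.], [0., 1.], [2., 2.], [2., 3.]])
#   y = np.array([-1., -1., 1., 1.])
#   svc = SVC(kernel=linear_kernel, C=10.0).fit(X, y)
#   svc.predict(np.array([[2., 2.5]]))   # expected: array([1.])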
class SVR:
def __init__(self, kernel=linear_kernel, C=1.0, epsilon=0.1, verbose=False):
self.kernel = kernel
self.C = C
self.epsilon = epsilon
self.sv_idx, self.sv = np.zeros(0), np.zeros(0)
self.alphas_p, self.alphas_n = np.zeros(0), np.zeros(0)
self.w = None
self.b = 0.0
self.verbose = verbose
def fit(self, X, y):
"""
Trains the model by solving a quadratic programming problem.
:param X: array of size [n_samples, n_features] holding the training samples
:param y: array of size [n_samples] holding the class labels
"""
self.solve_qp(X, y)
sv = np.logical_or(self.alphas_p > 1e-5, self.alphas_n > 1e-5)
self.sv_idx = np.arange(len(self.alphas_p))[sv]
self.sv, sv_y = X[sv], y[sv]
self.alphas_p, self.alphas_n = self.alphas_p[sv], self.alphas_n[sv]
if self.kernel == linear_kernel:
self.w = np.dot(self.alphas_p - self.alphas_n, self.sv)
for n in range(len(self.alphas_p)):
self.b += sv_y[n]
self.b -= np.sum((self.alphas_p - self.alphas_n) * self.K[self.sv_idx[n], sv])
self.b -= self.epsilon
self.b /= len(self.alphas_p)
return self
def solve_qp(self, X, y):
"""
        Solves a quadratic programming problem. In QP formulation (dual):
        2m variables, 4m+1 constraints (1 equality, 4m inequalities).
:param X: array of size [n_samples, n_features] holding the training samples
:param y: array of size [n_samples] holding the class labels
"""
m = len(y)
self.K = self.kernel(X)
P = np.vstack((np.hstack((self.K, -self.K)),
np.hstack((-self.K, self.K))))
q = np.hstack((-y, y)) + self.epsilon
lb = np.zeros(2 * m)
ub = np.ones(2 * m) * self.C
A = np.hstack((np.ones(m), -np.ones(m)))
b = np.zeros(1)
alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
sym_proj=True, verbose=self.verbose)
self.alphas_p = alphas[:m]
self.alphas_n = alphas[m:]
def predict(self, X):
if self.kernel != linear_kernel:
return np.dot(self.alphas_p - self.alphas_n, self.kernel(self.sv, X)) + self.b
return np.dot(X, self.w) + self.b
class MultiClassLearner:
def __init__(self, clf, decision_function='ovr'):
self.clf = clf
self.decision_function = decision_function
self.n_class, self.classifiers = 0, []
def fit(self, X, y):
"""
Trains n_class or n_class * (n_class - 1) / 2 classifiers
according to the training method, ovr or ovo respectively.
:param X: array of size [n_samples, n_features] holding the training samples
:param y: array of size [n_samples] holding the class labels
:return: array of classifiers
"""
labels = np.unique(y)
self.n_class = len(labels)
        if self.decision_function == 'ovr':
            for label in labels:
                # one-vs-rest: +1 for the current label, -1 for all other labels
                y1 = np.where(y == label, 1.0, -1.0)
                self.clf.fit(X, y1)
                self.classifiers.append(copy.deepcopy(self.clf))
        elif self.decision_function == 'ovo':
            n_labels = len(labels)
            for i in range(n_labels):
                for j in range(i + 1, n_labels):
                    # one-vs-one: -1 for class labels[i], +1 for class labels[j]
                    neg_id, pos_id = y == labels[i], y == labels[j]
                    X1 = np.r_[X[neg_id], X[pos_id]]
                    y1 = np.r_[-np.ones(np.sum(neg_id)), np.ones(np.sum(pos_id))]
                    self.clf.fit(X1, y1)
                    self.classifiers.append(copy.deepcopy(self.clf))
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
return self
def predict(self, X):
"""
Predicts the class of a given example according to the training method.
"""
n_samples = len(X)
if self.decision_function == 'ovr':
assert len(self.classifiers) == self.n_class
score = np.zeros((n_samples, self.n_class))
for i in range(self.n_class):
clf = self.classifiers[i]
score[:, i] = clf.predict_score(X)
return np.argmax(score, axis=1)
elif self.decision_function == 'ovo':
assert len(self.classifiers) == self.n_class * (self.n_class - 1) / 2
vote = np.zeros((n_samples, self.n_class))
clf_id = 0
for i in range(self.n_class):
for j in range(i + 1, self.n_class):
res = self.classifiers[clf_id].predict(X)
vote[res < 0, i] += 1.0
vote[res > 0, j] += 1.0
clf_id += 1
return np.argmax(vote, axis=1)
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
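# A minimal usage sketch (hedged; three toy separable clusters with numeric
# labels, wrapped around the SVC defined above):
#
#   X = np.array([[0., 0.], [.2, .1], [5., 5.], [5.2, 5.1], [0., 5.], [.1, 5.2]])
#   y = np.array([0., 0., 1., 1., 2., 2.])
#   clf = MultiClassLearner(SVC(kernel=linear_kernel, C=10.0), decision_function='ovr')
#   clf.fit(X, y).predict(X)   # -> indices into the sorted unique labels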
def LinearLearner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.3]
    Linear classifier with hard threshold.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)
    # column-major copy of the input matrix, with a leading column of 1s
    # for the bias weight
    X_col = [[example[i] for example in examples] for i in idx_i]
    ones = [1 for _ in range(num_examples)]
    X_col = [ones] + X_col
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)
    for epoch in range(epochs):
        err = []
        # pass over all examples, collecting the error vector
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = np.dot(w, x)
            t = example[idx_t]
            err.append(t - y)
        # batch gradient descent update on each weight
        for i in range(len(w)):
            w[i] = w[i] + learning_rate * (np.dot(err, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return np.dot(w, x)

    return predict
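# A minimal usage sketch (hedged; a tiny numeric dataset whose target is the
# last attribute):
#
#   data = DataSet(examples=[[1, 2], [2, 4], [3, 6]], attr_names='x y', target='y')
#   h = LinearLearner(data, learning_rate=0.05, epochs=500)
#   h([4])   # expected: roughly 8 for this y = 2x relation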
def LogisticLinearLearner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.4]
    Linear classifier with logistic regression.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)
    # column-major copy of the input matrix, with a leading column of 1s
    # for the bias weight
    X_col = [[example[i] for example in examples] for i in idx_i]
    ones = [1 for _ in range(num_examples)]
    X_col = [ones] + X_col
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)
    for epoch in range(epochs):
        err = []
        h = []
        # pass over all examples, collecting errors and sigmoid derivatives
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = Sigmoid()(np.dot(w, x))
            h.append(Sigmoid().derivative(y))
            t = example[idx_t]
            err.append(t - y)
        # batch gradient descent update on each weight
        for i in range(len(w)):
            buffer = [x * y for x, y in zip(err, h)]
            w[i] = w[i] + learning_rate * (np.dot(buffer, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return Sigmoid()(np.dot(w, x))

    return predict
class EnsembleLearner:
"""Given a list of learning algorithms, have them vote."""
def __init__(self, learners):
self.learners = learners
def train(self, dataset):
self.predictors = [learner(dataset) for learner in self.learners]
def predict(self, example):
return mode(predictor.predict(example) for predictor in self.predictors)
def ada_boost(dataset, L, K):
"""[Figure 18.34]"""
examples, target = dataset.examples, dataset.target
n = len(examples)
    eps = 1 / (2 * n)  # keeps the clipped error strictly inside (0, 1)
w = [1 / n] * n
h, z = [], []
for k in range(K):
h_k = L(dataset, w)
h.append(h_k)
        # example[:-1] strips the target, which is assumed to be the last attribute
        error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k.predict(example[:-1]))
error = np.clip(error, eps, 1 - eps)
for j, example in enumerate(examples):
if example[target] == h_k.predict(example[:-1]):
w[j] *= error / (1 - error)
w = normalize(w)
z.append(np.log((1 - error) / error))
return weighted_majority(h, z)
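# A minimal usage sketch (hedged; boosting the SVC defined above via
# WeightedLearner, on a toy dataset with +/-1 targets in the last column):
#
#   data = DataSet(examples=[[0., 0., -1], [1., 0., -1], [4., 4., 1], [5., 4., 1]],
#                  attr_names='f1 f2 cls', target='cls')
#   boosted = ada_boost(data, WeightedLearner(SVC(C=10.0)), K=3)
#   boosted.predict([4.5, 4.])   # expected: 1.0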
class weighted_majority:
"""Return a predictor that takes a weighted vote."""
def __init__(self, predictors, weights):
self.predictors = predictors
self.weights = weights
def predict(self, example):
return weighted_mode((predictor.predict(example) for predictor in self.predictors), self.weights)
def weighted_mode(values, weights):
"""
Return the value with the greatest total weight.
>>> weighted_mode('abbaa', [1, 2, 3, 1, 2])
'b'
"""
totals = defaultdict(int)
for v, w in zip(values, weights):
totals[v] += w
return max(totals, key=totals.__getitem__)
class RandomForest:
"""An ensemble of Decision Trees trained using bagging and feature bagging."""
def __init__(self, dataset, n=5):
self.dataset = dataset
self.n = n
self.predictors = [DecisionTreeLearner(DataSet(examples=self.data_bagging(), attrs=self.dataset.attrs,
attr_names=self.dataset.attr_names, target=self.dataset.target,
inputs=self.feature_bagging())) for _ in range(self.n)]
def data_bagging(self, m=0):
"""Sample m examples with replacement"""
n = len(self.dataset.examples)
return weighted_sample_with_replacement(m or n, self.dataset.examples, [1] * n)
def feature_bagging(self, p=0.7):
"""Feature bagging with probability p to retain an attribute"""
inputs = [i for i in self.dataset.inputs if probability(p)]
return inputs or self.dataset.inputs
def predict(self, example):
return mode(predictor.predict(example) for predictor in self.predictors)
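# A minimal usage sketch (hedged; `zoo` is defined near the bottom of this
# module):
#
#   rf = RandomForest(zoo, n=5)
#   rf.predict(zoo.examples[0])   # -> a class label, e.g. 'mammal'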
def WeightedLearner(unweighted_learner):
    """
    [Page 749 footnote 14]
    Given a learner that takes just an unweighted dataset, return
    one that also takes a weight for each example.
    """
    def train(dataset, weights):
        dataset = replicated_dataset(dataset, weights)
        # assumes the target is the last attribute
        n_samples, n_features = len(dataset.examples), dataset.target
        X, y = (np.array([x[:n_features] for x in dataset.examples]),
                np.array([x[n_features] for x in dataset.examples]))
        # deepcopy so successive boosting rounds keep distinct fitted models
        return copy.deepcopy(unweighted_learner.fit(X, y))
    return train
def replicated_dataset(dataset, weights, n=None):
"""Copy dataset, replicating each example in proportion to its weight."""
n = n or len(dataset.examples)
result = copy.copy(dataset)
result.examples = weighted_replicate(dataset.examples, weights, n)
return result
def weighted_replicate(seq, weights, n):
"""
Return n selections from seq, with the count of each element of
seq proportional to the corresponding weight (filling in fractions
randomly).
>>> weighted_replicate('ABC', [1, 2, 1], 4)
['A', 'B', 'B', 'C']
"""
assert len(seq) == len(weights)
weights = normalize(weights)
wholes = [int(w * n) for w in weights]
fractions = [(w * n) % 1 for w in weights]
return (flatten([x] * nx for x, nx in zip(seq, wholes)) +
weighted_sample_with_replacement(n - sum(wholes), seq, fractions))
def accuracy_score(y_pred, y_true):
assert y_pred.shape == y_true.shape
return np.mean(np.equal(y_pred, y_true))
def r2_score(y_pred, y_true):
assert y_pred.shape == y_true.shape
return 1. - (np.sum(np.square(y_pred - y_true)) /
np.sum(np.square(y_true - np.mean(y_true))))
orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum')
zoo = DataSet(name='zoo', target='type', exclude=['name'],
attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone '
'breathes venomous fins legs tail domestic catsize type')
iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class')
def RestaurantDataSet(examples=None):
"""
[Figure 18.3]
Build a DataSet of Restaurant waiting examples.
"""
return DataSet(name='restaurant', target='Wait', examples=examples,
attr_names='Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait')
restaurant = RestaurantDataSet()
def T(attr_name, branches):
    """Build a DecisionFork over attr_name, wrapping plain leaf values in DecisionLeaf."""
    branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child))
                for value, child in branches.items()}
    # default_child is `print`, so an example with an unmatched value is just echoed
    return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches)
"""
[Figure 18.2]
A decision tree for deciding whether to wait for a table at a hotel.
"""
waiting_decision_tree = T('Patrons',
{'None': 'No', 'Some': 'Yes',
'Full': T('WaitEstimate',
{'>60': 'No', '0-10': 'Yes',
'30-60': T('Alternate',
{'No': T('Reservation',
{'Yes': 'Yes',
'No': T('Bar', {'No': 'No',
'Yes': 'Yes'})}),
'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
'10-30': T('Hungry',
{'No': 'Yes',
'Yes': T('Alternate',
{'No': 'Yes',
'Yes': T('Raining',
{'No': 'No',
'Yes': 'Yes'})})})})})
def SyntheticRestaurant(n=20):
"""Generate a DataSet with n examples."""
def gen():
example = list(map(random.choice, restaurant.values))
example[restaurant.target] = waiting_decision_tree(example)
return example
return RestaurantDataSet([gen() for _ in range(n)])
def Majority(k, n):
"""
Return a DataSet with n k-bit examples of the majority problem:
k random bits followed by a 1 if more than half the bits are 1, else 0.
"""
examples = []
for i in range(n):
bits = [random.choice([0, 1]) for _ in range(k)]
bits.append(int(sum(bits) > k / 2))
examples.append(bits)
return DataSet(name='majority', examples=examples)
def Parity(k, n, name='parity'):
"""
Return a DataSet with n k-bit examples of the parity problem:
k random bits followed by a 1 if an odd number of bits are 1, else 0.
"""
examples = []
for i in range(n):
bits = [random.choice([0, 1]) for _ in range(k)]
bits.append(sum(bits) % 2)
examples.append(bits)
return DataSet(name=name, examples=examples)
def Xor(n):
"""Return a DataSet with n examples of 2-input xor."""
return Parity(2, n, name='xor')
def ContinuousXor(n):
"""2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints."""
examples = []
for i in range(n):
x, y = [random.uniform(0.0, 2.0) for _ in '12']
        examples.append([x, y, int(x) != int(y)])
return DataSet(name='continuous xor', examples=examples)
def compare(algorithms=None, datasets=None, k=10, trials=1):
"""
Compare various learners on various datasets using cross-validation.
Print results as a table.
"""
algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner]
datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20),
Majority(7, 100), Parity(7, 100), Xor(100)]
print_table([[a.__name__.replace('Learner', '')] + [cross_validation(a, d, k=k, trials=trials) for d in datasets]
for a in algorithms], header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')