# GitHub Repository: aimacode/aima-python
# Path: blob/master/learning4e.py
"""Learning from examples (Chapter 18)"""

import copy
from collections import defaultdict
from statistics import stdev

from qpsolvers import solve_qp

from deep_learning4e import Sigmoid
from probabilistic_learning import NaiveBayesLearner
from utils4e import *


class DataSet:
    """
    A data set for a machine learning problem. It has the following fields:

    d.examples   A list of examples. Each one is a list of attribute values.
    d.attrs      A list of integers to index into an example, so example[attr]
                 gives a value. Normally the same as range(len(d.examples[0])).
    d.attr_names Optional list of mnemonic names for corresponding attrs.
    d.target     The attribute that a learning algorithm will try to predict.
                 By default the final attribute.
    d.inputs     The list of attrs without the target.
    d.values     A list of lists: each sublist is the set of possible
                 values for the corresponding attribute. If initially None,
                 it is computed from the known examples by self.set_problem.
                 If not None, an erroneous value raises ValueError.
    d.distance   A function from a pair of examples to a non-negative number.
                 Should be symmetric, etc. Defaults to mean_boolean_error
                 since that can handle any field types.
    d.name       Name of the data set (for output display only).
    d.source     URL or other source where the data came from.
    d.exclude    A list of attribute indexes to exclude from d.inputs. Elements
                 of this list can either be integers (attrs) or attr_names.

    Normally, you call the constructor and you're done; then you just
    access fields like d.examples and d.target and d.inputs.
    """

    def __init__(self, examples=None, attrs=None, attr_names=None, target=-1, inputs=None,
                 values=None, distance=mean_boolean_error, name='', source='', exclude=()):
        """
        Accepts any of DataSet's fields. Examples can also be a
        string or file from which to parse examples using parse_csv.
        Optional parameter: exclude, as documented in .set_problem().
        >>> DataSet(examples='1, 2, 3')
        <DataSet(): 1 examples, 3 attributes>
        """
        self.name = name
        self.source = source
        self.values = values
        self.distance = distance
        self.got_values_flag = bool(values)

        # initialize .examples from string or list or data directory
        if isinstance(examples, str):
            self.examples = parse_csv(examples)
        elif examples is None:
            self.examples = parse_csv(open_data(name + '.csv').read())
        else:
            self.examples = examples

        # attrs are the indices of examples, unless otherwise stated.
        if self.examples is not None and attrs is None:
            attrs = list(range(len(self.examples[0])))

        self.attrs = attrs

        # initialize .attr_names from string, list, or by default
        if isinstance(attr_names, str):
            self.attr_names = attr_names.split()
        else:
            self.attr_names = attr_names or attrs
        self.set_problem(target, inputs=inputs, exclude=exclude)

    def set_problem(self, target, inputs=None, exclude=()):
        """
        Set (or change) the target and/or inputs.
        This way, one DataSet can be used multiple ways. inputs, if specified,
        is a list of attributes, or specify exclude as a list of attributes
        to not use in inputs. Attributes can be -n .. n, or an attr_name.
        Also computes the list of possible values, if that wasn't done yet.
        """
        self.target = self.attr_num(target)
        exclude = list(map(self.attr_num, exclude))
        if inputs:
            self.inputs = remove_all(self.target, inputs)
        else:
            self.inputs = [a for a in self.attrs if a != self.target and a not in exclude]
        if not self.values:
            self.update_values()
        self.check_me()

    def check_me(self):
        """Check that my fields make sense."""
        assert len(self.attr_names) == len(self.attrs)
        assert self.target in self.attrs
        assert self.target not in self.inputs
        assert set(self.inputs).issubset(set(self.attrs))
        if self.got_values_flag:
            # only check if values are provided while initializing DataSet
            list(map(self.check_example, self.examples))

    def add_example(self, example):
        """Add an example to the list of examples, checking it first."""
        self.check_example(example)
        self.examples.append(example)

    def check_example(self, example):
        """Raise ValueError if example has any invalid values."""
        if self.values:
            for a in self.attrs:
                if example[a] not in self.values[a]:
                    raise ValueError('Bad value {} for attribute {} in {}'
                                     .format(example[a], self.attr_names[a], example))

    def attr_num(self, attr):
        """Returns the number used for attr, which can be a name, or -n .. n-1."""
        if isinstance(attr, str):
            return self.attr_names.index(attr)
        elif attr < 0:
            return len(self.attrs) + attr
        else:
            return attr

    def update_values(self):
        self.values = list(map(unique, zip(*self.examples)))

    def sanitize(self, example):
        """Return a copy of example, with non-input attributes replaced by None."""
        return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)][:-1]

    def classes_to_numbers(self, classes=None):
        """Converts class names to numbers."""
        if not classes:
            # if classes were not given, extract them from values
            classes = sorted(self.values[self.target])
        for item in self.examples:
            item[self.target] = classes.index(item[self.target])

    def remove_examples(self, value=''):
        """Remove examples that contain given value."""
        self.examples = [x for x in self.examples if value not in x]
        self.update_values()

    def split_values_by_classes(self):
        """Split values into buckets according to their class."""
        buckets = defaultdict(lambda: [])
        target_names = self.values[self.target]

        for v in self.examples:
            item = [a for a in v if a not in target_names]  # remove target from item
            buckets[v[self.target]].append(item)  # add item to bucket of its class

        return buckets

    def find_means_and_deviations(self):
        """
        Finds the means and standard deviations of the dataset.
        means: a dictionary for each class/target. Holds a list of the means
               of the features for the class.
        deviations: a dictionary for each class/target. Holds a list of the sample
                    standard deviations of the features for the class.
        """
        target_names = self.values[self.target]
        feature_numbers = len(self.inputs)

        item_buckets = self.split_values_by_classes()

        means = defaultdict(lambda: [0] * feature_numbers)
        deviations = defaultdict(lambda: [0] * feature_numbers)

        for t in target_names:
            # find all the item feature values for item in class t
            features = [[] for _ in range(feature_numbers)]
            for item in item_buckets[t]:
                for i in range(feature_numbers):
                    features[i].append(item[i])

            # calculate means and deviations for the class
            for i in range(feature_numbers):
                means[t][i] = mean(features[i])
                deviations[t][i] = stdev(features[i])

        return means, deviations

    def __repr__(self):
        return '<DataSet({}): {:d} examples, {:d} attributes>'.format(self.name, len(self.examples), len(self.attrs))


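# Usage sketch: build a small DataSet from a CSV string and inspect the fields
# documented above. The XOR examples and attribute names are hypothetical.
def _dataset_demo():
    d = DataSet(examples='0, 0, 0\n0, 1, 1\n1, 0, 1\n1, 1, 0',
                attr_names='A B XOR', target='XOR')
    assert d.target == 2 and d.inputs == [0, 1]
    assert set(d.values[d.target]) == {0, 1}
    return d

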
def parse_csv(input, delim=','):
    r"""
    Input is a string consisting of lines, each line has comma-delimited
    fields. Convert this into a list of lists. Blank lines are skipped.
    Fields that look like numbers are converted to numbers.
    The delim defaults to ',' but '\t' and None are also reasonable values.
    >>> parse_csv('1, 2, 3 \n 0, 2, na')
    [[1, 2, 3], [0, 2, 'na']]
    """
    lines = [line for line in input.splitlines() if line.strip()]
    return [list(map(num_or_str, line.split(delim))) for line in lines]


def err_ratio(learner, dataset, examples=None):
    """Return the proportion of the examples that are NOT correctly predicted."""
    examples = examples or dataset.examples
    if len(examples) == 0:
        return 0.0
    right = 0
    for example in examples:
        desired = example[dataset.target]
        output = learner.predict(dataset.sanitize(example))
        try:
            correct = np.allclose(output, desired)
        except TypeError:  # non-numeric targets, e.g. class names
            correct = output == desired
        if correct:
            right += 1
    return 1 - (right / len(examples))


def grade_learner(learner, tests):
    """
    Grades the given learner based on how many tests it passes.
    tests is a list with each element in the form: (values, output).
    """
    return mean(int(learner.predict(X) == y) for X, y in tests)


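# Usage sketch: err_ratio measures the misclassification rate on a dataset,
# grade_learner the fraction of (input, output) tests passed. The toy dataset
# and tests are hypothetical; PluralityLearner is defined later in this module.
def _grading_demo():
    toy = DataSet(examples='0, 0, 0\n0, 1, 1\n1, 0, 1',
                  attr_names='A B Out', target='Out')
    learner = PluralityLearner(toy)  # always predicts the most common class, 1
    tests = [([0, 1], 1), ([1, 0], 1), ([1, 1], 0)]
    return grade_learner(learner, tests), err_ratio(learner, toy)

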
def train_test_split(dataset, start=None, end=None, test_split=None):
    """
    If 'start' and 'end' are given, the examples from index 'start' to 'end'
    become the test set and the rest the training set.
    If 'test_split' is given instead, the last test_split fraction of the
    examples becomes the test set and the rest the training set.
    """
    examples = dataset.examples
    if test_split is None:
        train = examples[:start] + examples[end:]
        val = examples[start:end]
    else:
        total_size = len(examples)
        val_size = int(total_size * test_split)
        train_size = total_size - val_size
        train = examples[:train_size]
        val = examples[train_size:total_size]

    return train, val


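# Usage sketch: the two calling conventions of train_test_split, shown on the
# restaurant dataset defined near the end of this module.
def _split_demo():
    train, val = train_test_split(restaurant, 0, 3)  # examples 0-2 are the test set
    assert len(train) + len(val) == len(restaurant.examples)
    return train_test_split(restaurant, test_split=0.2)  # last 20% is the test set

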
def model_selection(learner, dataset, k=10, trials=1):
    """
    [Figure 18.8]
    Return the learner trained with the value of size that has the
    minimum error on the validation set.
    errs: a validation error array, indexed by size
    """
    errs = []
    size = 1
    while True:
        err = cross_validation(learner, dataset, size, k, trials)
        # check for convergence provided errs is not empty
        if errs and np.isclose(errs[-1], err, rtol=1e-6):
            best_size = int(np.argmin(errs)) + 1  # errs[i] is the error for size i + 1
            return learner(dataset, best_size)
        errs.append(err)
        size += 1


def cross_validation(learner, dataset, size=None, k=10, trials=1):
    """
    Do k-fold cross-validation and return the mean error.
    That is, keep out 1/k of the examples for testing on each of k runs.
    Shuffle the examples first; if trials > 1, average over several shuffles.
    Returns the error on the training folds.
    """
    k = k or len(dataset.examples)
    if trials > 1:
        trial_errs = 0
        for t in range(trials):
            # recurse with trials=1 to average over several shuffles
            errs = cross_validation(learner, dataset, size, k, trials=1)
            trial_errs += errs
        return trial_errs / trials
    else:
        fold_errs = 0
        n = len(dataset.examples)
        examples = dataset.examples
        random.shuffle(dataset.examples)
        for fold in range(k):
            train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k))
            dataset.examples = train_data
            # learners built by model_selection take a size; plain learners do not
            h = learner(dataset, size) if size is not None else learner(dataset)
            fold_errs += err_ratio(h, dataset, train_data)
            # revert to the original examples once the fold is done
            dataset.examples = examples
        return fold_errs / k


def leave_one_out(learner, dataset, size=None):
    """Leave one out cross-validation over the dataset."""
    return cross_validation(learner, dataset, size, len(dataset.examples))


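# Usage sketch: 10-fold cross-validation of a decision tree on the restaurant
# dataset defined near the end of this module (size is unused by this learner).
def _cross_validation_demo():
    return cross_validation(DecisionTreeLearner, restaurant, k=10)

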
def learning_curve(learner, dataset, trials=10, sizes=None):
    if sizes is None:
        sizes = list(range(2, len(dataset.examples) - trials, 2))

    def score(learner, size):
        random.shuffle(dataset.examples)
        return cross_validation(learner, dataset, size, trials)

    return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes]


class PluralityLearner:
    """
    A very dumb algorithm: always pick the result that was most popular
    in the training data. Makes a baseline for comparison.
    """

    def __init__(self, dataset):
        self.most_popular = mode([e[dataset.target] for e in dataset.examples])

    def predict(self, example):
        """Always return same result: the most popular from the training set."""
        return self.most_popular


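# Usage sketch: the plurality baseline ignores its input and always predicts
# the most common class of the (hypothetical) training data below.
def _plurality_demo():
    toy = DataSet(examples='0, 1\n1, 1\n1, 0', attr_names='bit label')
    return PluralityLearner(toy).predict([0])  # the most common label, 1

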
class DecisionFork:
    """
    A fork of a decision tree holds an attribute to test, and a dict
    of branches, one for each of the attribute's values.
    """

    def __init__(self, attr, attr_name=None, default_child=None, branches=None):
        """Initialize by saying what attribute this node tests."""
        self.attr = attr
        self.attr_name = attr_name or attr
        self.default_child = default_child
        self.branches = branches or {}

    def __call__(self, example):
        """Given an example, classify it using the attribute and the branches."""
        attr_val = example[self.attr]
        if attr_val in self.branches:
            return self.branches[attr_val](example)
        else:
            # return default class when attribute is unknown
            return self.default_child(example)

    def add(self, val, subtree):
        """Add a branch. If self.attr = val, go to the given subtree."""
        self.branches[val] = subtree

    def display(self, indent=0):
        name = self.attr_name
        print('Test', name)
        for (val, subtree) in self.branches.items():
            print(' ' * 4 * indent, name, '=', val, '==>', end=' ')
            subtree.display(indent + 1)

    def __repr__(self):
        return 'DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attr_name, self.branches)


class DecisionLeaf:
    """A leaf of a decision tree holds just a result."""

    def __init__(self, result):
        self.result = result

    def __call__(self, example):
        return self.result

    def display(self, indent=0):
        # accept the indent passed down by DecisionFork.display
        print('RESULT =', self.result)

    def __repr__(self):
        return repr(self.result)


class DecisionTreeLearner:
    """[Figure 18.5]"""

    def __init__(self, dataset):
        self.dataset = dataset
        self.tree = self.decision_tree_learning(dataset.examples, dataset.inputs)

    def decision_tree_learning(self, examples, attrs, parent_examples=()):
        if len(examples) == 0:
            return self.plurality_value(parent_examples)
        if self.all_same_class(examples):
            return DecisionLeaf(examples[0][self.dataset.target])
        if len(attrs) == 0:
            return self.plurality_value(examples)
        A = self.choose_attribute(attrs, examples)
        tree = DecisionFork(A, self.dataset.attr_names[A], self.plurality_value(examples))
        for (v_k, exs) in self.split_by(A, examples):
            subtree = self.decision_tree_learning(exs, remove_all(A, attrs), examples)
            tree.add(v_k, subtree)
        return tree

    def plurality_value(self, examples):
        """
        Return the most popular target value for this set of examples.
        (If target is binary, this is the majority; otherwise plurality).
        """
        popular = argmax_random_tie(self.dataset.values[self.dataset.target],
                                    key=lambda v: self.count(self.dataset.target, v, examples))
        return DecisionLeaf(popular)

    def count(self, attr, val, examples):
        """Count the number of examples that have example[attr] = val."""
        return sum(e[attr] == val for e in examples)

    def all_same_class(self, examples):
        """Are all these examples in the same target class?"""
        class0 = examples[0][self.dataset.target]
        return all(e[self.dataset.target] == class0 for e in examples)

    def choose_attribute(self, attrs, examples):
        """Choose the attribute with the highest information gain."""
        return argmax_random_tie(attrs, key=lambda a: self.information_gain(a, examples))

    def information_gain(self, attr, examples):
        """Return the expected reduction in entropy from splitting by attr."""

        def I(examples):
            return information_content([self.count(self.dataset.target, v, examples)
                                        for v in self.dataset.values[self.dataset.target]])

        n = len(examples)
        remainder = sum((len(examples_i) / n) * I(examples_i)
                        for (v, examples_i) in self.split_by(attr, examples))
        return I(examples) - remainder

    def split_by(self, attr, examples):
        """Return a list of (val, examples) pairs for each val of attr."""
        return [(v, [e for e in examples if e[attr] == v]) for v in self.dataset.values[attr]]

    def predict(self, x):
        return self.tree(x)


def information_content(values):
    """Number of bits to represent the probability distribution in values."""
    probabilities = normalize(remove_all(0, values))
    return sum(-p * np.log2(p) for p in probabilities)


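# Usage sketch: learn a tree for the restaurant problem (the dataset is defined
# near the end of this module) and classify one sanitized training example.
def _decision_tree_demo():
    tree = DecisionTreeLearner(restaurant)
    return tree.predict(restaurant.sanitize(restaurant.examples[0]))

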
class DecisionListLearner:
    """
    [Figure 18.11]
    A decision list implemented as a list of (test, value) pairs.
    """

    def __init__(self, dataset):
        self.decision_list = self.decision_list_learning(set(dataset.examples))

    def decision_list_learning(self, examples):
        if not examples:
            return [(True, False)]
        t, o, examples_t = self.find_examples(examples)
        if not t:
            raise Exception
        return [(t, o)] + self.decision_list_learning(examples - examples_t)

    def find_examples(self, examples):
        """
        Find a set of examples that all have the same outcome under
        some test. Return a tuple of the test, outcome, and examples.
        """
        raise NotImplementedError

    def passes(self, example, test):
        """Does the example pass the test?"""
        raise NotImplementedError

    def predict(self, example):
        """Predict the outcome for the first passing test."""
        for test, outcome in self.decision_list:
            if self.passes(example, test):
                return outcome


class NearestNeighborLearner:
    """k-NearestNeighbor: the k nearest neighbors vote."""

    def __init__(self, dataset, k=1):
        self.dataset = dataset
        self.k = k

    def predict(self, example):
        """Find the k closest items, and have them vote for the best."""
        best = heapq.nsmallest(self.k, ((self.dataset.distance(e, example), e) for e in self.dataset.examples))
        return mode(e[self.dataset.target] for (d, e) in best)


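# Usage sketch: a 5-nearest-neighbour vote on iris. The default distance,
# mean_boolean_error, counts exact mismatches, so for continuous attributes a
# numeric distance such as euclidean_distance (from utils4e) is more sensible.
def _knn_demo():
    numeric_iris = DataSet(name='iris', target='class', distance=euclidean_distance,
                           attr_names='sepal-len sepal-width petal-len petal-width class')
    return NearestNeighborLearner(numeric_iris, k=5).predict([5.1, 3.0, 1.1, 0.1])

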
class SVC:

    def __init__(self, kernel=linear_kernel, C=1.0, verbose=False):
        self.kernel = kernel
        self.C = C  # hyper-parameter
        self.sv_idx, self.sv, self.sv_y = np.zeros(0), np.zeros(0), np.zeros(0)
        self.alphas = np.zeros(0)
        self.w = None
        self.b = 0.0  # intercept
        self.verbose = verbose

    def fit(self, X, y):
        """
        Trains the model by solving a quadratic programming problem.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        """
        # in QP formulation (dual): m variables, 2m+1 constraints (1 equation, 2m inequations)
        self.solve_qp(X, y)
        sv = self.alphas > 1e-5
        self.sv_idx = np.arange(len(self.alphas))[sv]
        self.sv, self.sv_y, self.alphas = X[sv], y[sv], self.alphas[sv]

        if self.kernel == linear_kernel:
            self.w = np.dot(self.alphas * self.sv_y, self.sv)

        for n in range(len(self.alphas)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.alphas * self.sv_y * self.K[self.sv_idx[n], sv])
        self.b /= len(self.alphas)
        return self

    def solve_qp(self, X, y):
        """
        Solves a quadratic programming problem. In QP formulation (dual):
        m variables, 2m+1 constraints (1 equation, 2m inequations).
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        """
        m = len(y)  # m = n_samples
        self.K = self.kernel(X)  # gram matrix
        P = self.K * np.outer(y, y)
        q = -np.ones(m)
        lb = np.zeros(m)  # lower bounds
        ub = np.ones(m) * self.C  # upper bounds
        A = y.astype(np.float64)  # equality matrix
        b = np.zeros(1)  # equality vector
        self.alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
                               sym_proj=True, verbose=self.verbose)

    def predict_score(self, X):
        """Predicts the score for a given example."""
        if self.w is None:
            return np.dot(self.alphas * self.sv_y, self.kernel(self.sv, X)) + self.b
        return np.dot(X, self.w) + self.b

    def predict(self, X):
        """Predicts the class of a given example."""
        return np.sign(self.predict_score(X))


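# Usage sketch: fit a linear SVC on a toy linearly separable problem with
# labels in {-1, +1}. Requires qpsolvers with the cvxopt backend installed.
def _svc_demo():
    X = np.array([[0., 0.], [1., 1.], [4., 0.], [5., 1.]])
    y = np.array([-1., -1., 1., 1.])
    svc = SVC(kernel=linear_kernel, C=1.0).fit(X, y)
    return svc.predict(np.array([[4.5, 0.5]]))  # expected: array([1.])

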
class SVR:

    def __init__(self, kernel=linear_kernel, C=1.0, epsilon=0.1, verbose=False):
        self.kernel = kernel
        self.C = C  # hyper-parameter
        self.epsilon = epsilon  # epsilon insensitive loss value
        self.sv_idx, self.sv = np.zeros(0), np.zeros(0)
        self.alphas_p, self.alphas_n = np.zeros(0), np.zeros(0)
        self.w = None
        self.b = 0.0  # intercept
        self.verbose = verbose

    def fit(self, X, y):
        """
        Trains the model by solving a quadratic programming problem.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the target values
        """
        # in QP formulation (dual): m variables, 2m+1 constraints (1 equation, 2m inequations)
        self.solve_qp(X, y)

        sv = np.logical_or(self.alphas_p > 1e-5, self.alphas_n > 1e-5)
        self.sv_idx = np.arange(len(self.alphas_p))[sv]
        self.sv, sv_y = X[sv], y[sv]
        self.alphas_p, self.alphas_n = self.alphas_p[sv], self.alphas_n[sv]

        if self.kernel == linear_kernel:
            self.w = np.dot(self.alphas_p - self.alphas_n, self.sv)

        for n in range(len(self.alphas_p)):
            self.b += sv_y[n]
            self.b -= np.sum((self.alphas_p - self.alphas_n) * self.K[self.sv_idx[n], sv])
            self.b -= self.epsilon
        self.b /= len(self.alphas_p)

        return self

    def solve_qp(self, X, y):
        """
        Solves a quadratic programming problem. In QP formulation (dual):
        m variables, 2m+1 constraints (1 equation, 2m inequations).
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the target values
        """
        m = len(y)  # m = n_samples
        self.K = self.kernel(X)  # gram matrix
        P = np.vstack((np.hstack((self.K, -self.K)),  # alphas_p, alphas_n
                       np.hstack((-self.K, self.K))))  # alphas_n, alphas_p
        q = np.hstack((-y, y)) + self.epsilon
        lb = np.zeros(2 * m)  # lower bounds
        ub = np.ones(2 * m) * self.C  # upper bounds
        A = np.hstack((np.ones(m), -np.ones(m)))  # equality matrix
        b = np.zeros(1)  # equality vector
        alphas = solve_qp(P, q, A=A, b=b, lb=lb, ub=ub, solver='cvxopt',
                          sym_proj=True, verbose=self.verbose)
        self.alphas_p = alphas[:m]
        self.alphas_n = alphas[m:]

    def predict(self, X):
        if self.kernel != linear_kernel:
            return np.dot(self.alphas_p - self.alphas_n, self.kernel(self.sv, X)) + self.b
        return np.dot(X, self.w) + self.b


class MultiClassLearner:

    def __init__(self, clf, decision_function='ovr'):
        self.clf = clf
        self.decision_function = decision_function
        self.n_class, self.classifiers = 0, []

    def fit(self, X, y):
        """
        Trains n_class or n_class * (n_class - 1) / 2 classifiers
        according to the training method, ovr or ovo respectively.
        :param X: array of size [n_samples, n_features] holding the training samples
        :param y: array of size [n_samples] holding the class labels
        :return: array of classifiers
        """
        labels = np.unique(y)
        self.n_class = len(labels)
        if self.decision_function == 'ovr':  # one-vs-rest method
            for label in labels:
                y1 = np.array(y)
                y1[y1 != label] = -1.0
                y1[y1 == label] = 1.0
                self.clf.fit(X, y1)
                self.classifiers.append(copy.deepcopy(self.clf))
        elif self.decision_function == 'ovo':  # one-vs-one method
            n_labels = len(labels)
            for i in range(n_labels):
                for j in range(i + 1, n_labels):
                    neg_id, pos_id = y == labels[i], y == labels[j]
                    X1, y1 = np.r_[X[neg_id], X[pos_id]], np.r_[y[neg_id], y[pos_id]]
                    y1[y1 == labels[i]] = -1.0
                    y1[y1 == labels[j]] = 1.0
                    self.clf.fit(X1, y1)
                    self.classifiers.append(copy.deepcopy(self.clf))
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
        return self

    def predict(self, X):
        """Predicts the class of a given example according to the training method."""
        n_samples = len(X)
        if self.decision_function == 'ovr':  # one-vs-rest method
            assert len(self.classifiers) == self.n_class
            score = np.zeros((n_samples, self.n_class))
            for i in range(self.n_class):
                clf = self.classifiers[i]
                score[:, i] = clf.predict_score(X)
            return np.argmax(score, axis=1)
        elif self.decision_function == 'ovo':  # one-vs-one method
            assert len(self.classifiers) == self.n_class * (self.n_class - 1) / 2
            vote = np.zeros((n_samples, self.n_class))
            clf_id = 0
            for i in range(self.n_class):
                for j in range(i + 1, self.n_class):
                    res = self.classifiers[clf_id].predict(X)
                    vote[res < 0, i] += 1.0  # negative sample: class i
                    vote[res > 0, j] += 1.0  # positive sample: class j
                    clf_id += 1
            return np.argmax(vote, axis=1)
        else:
            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")


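# Usage sketch: one-vs-rest multi-class classification built on the binary SVC
# above, on hypothetical 2-D clusters labelled 0, 1 and 2.
def _multiclass_demo():
    X = np.array([[0., 0.], [0.5, 0.5], [3., 3.], [3.5, 3.5], [6., 0.], [6.5, 0.5]])
    y = np.array([0., 0., 1., 1., 2., 2.])
    clf = MultiClassLearner(SVC(kernel=linear_kernel, C=10.0), decision_function='ovr')
    return clf.fit(X, y).predict(np.array([[3.2, 3.1]]))  # expected: array([1])

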
def LinearLearner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.3]
    Linear classifier with hard threshold.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)

    # X transpose
    X_col = [[x[i] for x in examples] for i in idx_i]  # vertical columns of X

    # add dummy
    ones = [1 for _ in range(len(examples))]
    X_col = [ones] + X_col

    # initialize random weights
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)

    for epoch in range(epochs):
        err = []
        # pass over all examples
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = np.dot(w, x)
            t = example[idx_t]
            err.append(t - y)

        # update weights
        for i in range(len(w)):
            w[i] = w[i] + learning_rate * (np.dot(err, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return np.dot(w, x)

    return predict


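# Usage sketch: fit the hard-threshold linear model to the numeric Majority
# dataset generated near the end of this module; the returned predictor gives
# a raw score, thresholded here at 0.5 since the targets are 0/1.
def _linear_demo():
    predict = LinearLearner(Majority(7, 100))
    return int(predict([1, 1, 1, 1, 0, 0, 0]) > 0.5)

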
def LogisticLinearLearner(dataset, learning_rate=0.01, epochs=100):
    """
    [Section 18.6.4]
    Linear classifier with logistic regression.
    """
    idx_i = dataset.inputs
    idx_t = dataset.target
    examples = dataset.examples
    num_examples = len(examples)

    # X transpose
    X_col = [[x[i] for x in examples] for i in idx_i]  # vertical columns of X

    # add dummy
    ones = [1 for _ in range(len(examples))]
    X_col = [ones] + X_col

    # initialize random weights
    num_weights = len(idx_i) + 1
    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)

    for epoch in range(epochs):
        err = []
        h = []
        # pass over all examples
        for example in examples:
            x = [1] + [example[i] for i in idx_i]
            y = Sigmoid()(np.dot(w, x))
            h.append(Sigmoid().derivative(y))
            t = example[idx_t]
            err.append(t - y)

        # update weights
        for i in range(len(w)):
            buffer = [x * y for x, y in zip(err, h)]
            w[i] = w[i] + learning_rate * (np.dot(buffer, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + [example[i] for i in idx_i]
        return Sigmoid()(np.dot(w, x))

    return predict


class EnsembleLearner:
    """Given a list of learning algorithms, have them vote."""

    def __init__(self, learners):
        self.learners = learners

    def train(self, dataset):
        self.predictors = [learner(dataset) for learner in self.learners]

    def predict(self, example):
        return mode(predictor.predict(example) for predictor in self.predictors)


def ada_boost(dataset, L, K):
    """[Figure 18.34]"""

    examples, target = dataset.examples, dataset.target
    n = len(examples)
    eps = 1 / (2 * n)
    w = [1 / n] * n
    h, z = [], []
    for k in range(K):
        h_k = L(dataset, w)
        h.append(h_k)
        error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k.predict(example[:-1]))
        # avoid divide-by-0 from either 0% or 100% error rates
        error = np.clip(error, eps, 1 - eps)
        for j, example in enumerate(examples):
            if example[target] == h_k.predict(example[:-1]):
                w[j] *= error / (1 - error)
        w = normalize(w)
        z.append(np.log((1 - error) / error))
    return weighted_majority(h, z)


class weighted_majority:
    """Return a predictor that takes a weighted vote."""

    def __init__(self, predictors, weights):
        self.predictors = predictors
        self.weights = weights

    def predict(self, example):
        return weighted_mode((predictor.predict(example) for predictor in self.predictors), self.weights)


def weighted_mode(values, weights):
    """
    Return the value with the greatest total weight.
    >>> weighted_mode('abbaa', [1, 2, 3, 1, 2])
    'b'
    """
    totals = defaultdict(int)
    for v, w in zip(values, weights):
        totals[v] += w
    return max(totals, key=totals.__getitem__)


class RandomForest:
    """An ensemble of Decision Trees trained using bagging and feature bagging."""

    def __init__(self, dataset, n=5):
        self.dataset = dataset
        self.n = n
        self.predictors = [DecisionTreeLearner(DataSet(examples=self.data_bagging(), attrs=self.dataset.attrs,
                                                       attr_names=self.dataset.attr_names, target=self.dataset.target,
                                                       inputs=self.feature_bagging())) for _ in range(self.n)]

    def data_bagging(self, m=0):
        """Sample m examples with replacement"""
        n = len(self.dataset.examples)
        return weighted_sample_with_replacement(m or n, self.dataset.examples, [1] * n)

    def feature_bagging(self, p=0.7):
        """Feature bagging with probability p to retain an attribute"""
        inputs = [i for i in self.dataset.inputs if probability(p)]
        return inputs or self.dataset.inputs

    def predict(self, example):
        return mode(predictor.predict(example) for predictor in self.predictors)


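# Usage sketch: a 5-tree forest trained on synthetic restaurant data (see
# SyntheticRestaurant below), then queried with one sanitized example.
def _random_forest_demo():
    rf = RandomForest(SyntheticRestaurant(100), n=5)
    return rf.predict(restaurant.sanitize(restaurant.examples[0]))

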
def WeightedLearner(unweighted_learner):
    """
    [Page 749 footnote 14]
    Given a learner that takes just an unweighted dataset, return
    one that takes also a weight for each example.
    """

    def train(dataset, weights):
        dataset = replicated_dataset(dataset, weights)
        n_samples, n_features = len(dataset.examples), dataset.target
        X, y = (np.array([x[:n_features] for x in dataset.examples]),
                np.array([x[n_features] for x in dataset.examples]))
        return unweighted_learner.fit(X, y)

    return train


def replicated_dataset(dataset, weights, n=None):
    """Copy dataset, replicating each example in proportion to its weight."""
    n = n or len(dataset.examples)
    result = copy.copy(dataset)
    result.examples = weighted_replicate(dataset.examples, weights, n)
    return result


def weighted_replicate(seq, weights, n):
    """
    Return n selections from seq, with the count of each element of
    seq proportional to the corresponding weight (filling in fractions
    randomly).
    >>> weighted_replicate('ABC', [1, 2, 1], 4)
    ['A', 'B', 'B', 'C']
    """
    assert len(seq) == len(weights)
    weights = normalize(weights)
    wholes = [int(w * n) for w in weights]
    fractions = [(w * n) % 1 for w in weights]
    return (flatten([x] * nx for x, nx in zip(seq, wholes)) +
            weighted_sample_with_replacement(n - sum(wholes), seq, fractions))


# metrics

def accuracy_score(y_pred, y_true):
    assert y_pred.shape == y_true.shape
    return np.mean(np.equal(y_pred, y_true))


def r2_score(y_pred, y_true):
    assert y_pred.shape == y_true.shape
    return 1. - (np.sum(np.square(y_pred - y_true)) /  # sum of squared residuals
                 np.sum(np.square(y_true - np.mean(y_true))))  # total sum of squares


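# Usage sketch: both metrics expect NumPy arrays of equal shape; the values
# below are hypothetical.
def _metrics_demo():
    y_true, y_pred = np.array([1., 0., 1., 1.]), np.array([1., 1., 1., 0.])
    return accuracy_score(y_pred, y_true), r2_score(y_pred, y_true)

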
# datasets

orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum')

zoo = DataSet(name='zoo', target='type', exclude=['name'],
              attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone '
                         'breathes venomous fins legs tail domestic catsize type')

iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class')


def RestaurantDataSet(examples=None):
    """
    [Figure 18.3]
    Build a DataSet of Restaurant waiting examples.
    """
    return DataSet(name='restaurant', target='Wait', examples=examples,
                   attr_names='Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait')


restaurant = RestaurantDataSet()


def T(attr_name, branches):
    branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child))
                for value, child in branches.items()}
    return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches)


"""
952
[Figure 18.2]
953
A decision tree for deciding whether to wait for a table at a hotel.
954
"""
955
956
waiting_decision_tree = T('Patrons',
                          {'None': 'No', 'Some': 'Yes',
                           'Full': T('WaitEstimate',
                                     {'>60': 'No', '0-10': 'Yes',
                                      '30-60': T('Alternate',
                                                 {'No': T('Reservation',
                                                          {'Yes': 'Yes',
                                                           'No': T('Bar', {'No': 'No',
                                                                           'Yes': 'Yes'})}),
                                                  'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
                                      '10-30': T('Hungry',
                                                 {'No': 'Yes',
                                                  'Yes': T('Alternate',
                                                           {'No': 'Yes',
                                                            'Yes': T('Raining',
                                                                     {'No': 'No',
                                                                      'Yes': 'Yes'})})})})})


def SyntheticRestaurant(n=20):
    """Generate a DataSet with n examples."""

    def gen():
        example = list(map(random.choice, restaurant.values))
        example[restaurant.target] = waiting_decision_tree(example)
        return example

    return RestaurantDataSet([gen() for _ in range(n)])


def Majority(k, n):
    """
    Return a DataSet with n k-bit examples of the majority problem:
    k random bits followed by a 1 if more than half the bits are 1, else 0.
    """
    examples = []
    for i in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        bits.append(int(sum(bits) > k / 2))
        examples.append(bits)
    return DataSet(name='majority', examples=examples)


def Parity(k, n, name='parity'):
    """
    Return a DataSet with n k-bit examples of the parity problem:
    k random bits followed by a 1 if an odd number of bits are 1, else 0.
    """
    examples = []
    for i in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        bits.append(sum(bits) % 2)
        examples.append(bits)
    return DataSet(name=name, examples=examples)


def Xor(n):
    """Return a DataSet with n examples of 2-input xor."""
    return Parity(2, n, name='xor')


def ContinuousXor(n):
    """2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints."""
    examples = []
    for i in range(n):
        x, y = [random.uniform(0.0, 2.0) for _ in '12']
        examples.append([x, y, int(x) != int(y)])
    return DataSet(name='continuous xor', examples=examples)


def compare(algorithms=None, datasets=None, k=10, trials=1):
    """
    Compare various learners on various datasets using cross-validation.
    Print results as a table.
    """
    # default list of algorithms
    algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner]

    # default list of datasets
    datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20),
                            Majority(7, 100), Parity(7, 100), Xor(100)]

    print_table([[a.__name__.replace('Learner', '')] + [cross_validation(a, d, k=k, trials=trials) for d in datasets]
                 for a in algorithms], header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')


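# Minimal entry point (a sketch, not in the upstream module): exercise a couple
# of the usage demos above when this file is run directly.
if __name__ == '__main__':
    print('decision tree on restaurant:', _decision_tree_demo())
    print('10-fold CV error on restaurant:', _cross_validation_demo())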