Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/thinkbayes2
Path: blob/master/scripts/variability.py
1901 views
1
"""This file contains code used in "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import numpy
12
import pickle
13
import numpy
14
import random
15
import scipy
16
17
import brfss
18
19
import thinkplot
20
import thinkbayes2
21
22
import matplotlib.pyplot as pyplot
23
24
25
NUM_SIGMAS = 1
26
27
class Height(thinkbayes2.Suite, thinkbayes2.Joint):
28
"""Hypotheses about parameters of the distribution of height."""
29
30
def __init__(self, mus, sigmas, label=None):
31
"""Makes a prior distribution for mu and sigma based on a sample.
32
33
mus: sequence of possible mus
34
sigmas: sequence of possible sigmas
35
label: string label for the Suite
36
"""
37
pairs = [(mu, sigma)
38
for mu in mus
39
for sigma in sigmas]
40
41
thinkbayes2.Suite.__init__(self, pairs, label=label)
42
43
def Likelihood(self, data, hypo):
44
"""Computes the likelihood of the data under the hypothesis.
45
46
Args:
47
hypo: tuple of hypothetical mu and sigma
48
data: float sample
49
50
Returns:
51
likelihood of the sample given mu and sigma
52
"""
53
x = data
54
mu, sigma = hypo
55
like = scipy.stats.norm.pdf(x, mu, sigma)
56
return like
57
58
def LogLikelihood(self, data, hypo):
59
"""Computes the log likelihood of the data under the hypothesis.
60
61
Args:
62
data: a list of values
63
hypo: tuple of hypothetical mu and sigma
64
65
Returns:
66
log likelihood of the sample given mu and sigma (unnormalized)
67
"""
68
x = data
69
mu, sigma = hypo
70
loglike = EvalNormalLogPdf(x, mu, sigma)
71
return loglike
72
73
def LogUpdateSetFast(self, data):
74
"""Updates the suite using a faster implementation.
75
76
Computes the sum of the log likelihoods directly.
77
78
Args:
79
data: sequence of values
80
"""
81
xs = tuple(data)
82
n = len(xs)
83
84
for hypo in self.Values():
85
mu, sigma = hypo
86
total = Summation(xs, mu)
87
loglike = -n * math.log(sigma) - total / 2 / sigma**2
88
self.Incr(hypo, loglike)
89
90
def LogUpdateSetMeanVar(self, data):
91
"""Updates the suite using ABC and mean/var.
92
93
Args:
94
data: sequence of values
95
"""
96
xs = data
97
n = len(xs)
98
99
m = numpy.mean(xs)
100
s = numpy.std(xs)
101
102
self.LogUpdateSetABC(n, m, s)
103
104
def LogUpdateSetMedianIPR(self, data):
105
"""Updates the suite using ABC and median/iqr.
106
107
Args:
108
data: sequence of values
109
"""
110
xs = data
111
n = len(xs)
112
113
# compute summary stats
114
median, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
115
print('median, s', median, s)
116
117
self.LogUpdateSetABC(n, median, s)
118
119
def LogUpdateSetABC(self, n, m, s):
120
"""Updates the suite using ABC.
121
122
n: sample size
123
m: estimated central tendency
124
s: estimated spread
125
"""
126
for hypo in sorted(self.Values()):
127
mu, sigma = hypo
128
129
# compute log likelihood of m, given hypo
130
stderr_m = sigma / math.sqrt(n)
131
loglike = EvalNormalLogPdf(m, mu, stderr_m)
132
133
#compute log likelihood of s, given hypo
134
stderr_s = sigma / math.sqrt(2 * (n-1))
135
loglike += EvalNormalLogPdf(s, sigma, stderr_s)
136
137
self.Incr(hypo, loglike)
138
139
140
def EvalNormalLogPdf(x, mu, sigma):
141
"""Computes the log PDF of x given mu and sigma.
142
143
x: float values
144
mu, sigma: paramemters of Normal
145
146
returns: float log-likelihood
147
"""
148
return scipy.stats.norm.logpdf(x, mu, sigma)
149
150
151
def FindPriorRanges(xs, num_points, num_stderrs=3.0, median_flag=False):
152
"""Find ranges for mu and sigma with non-negligible likelihood.
153
154
xs: sample
155
num_points: number of values in each dimension
156
num_stderrs: number of standard errors to include on either side
157
158
Returns: sequence of mus, sequence of sigmas
159
"""
160
def MakeRange(estimate, stderr):
161
"""Makes a linear range around the estimate.
162
163
estimate: central value
164
stderr: standard error of the estimate
165
166
returns: numpy array of float
167
"""
168
spread = stderr * num_stderrs
169
array = numpy.linspace(estimate-spread, estimate+spread, num_points)
170
return array
171
172
# estimate mean and stddev of xs
173
n = len(xs)
174
if median_flag:
175
m, s = MedianS(xs, num_sigmas=NUM_SIGMAS)
176
else:
177
m = numpy.mean(xs)
178
s = numpy.std(xs)
179
180
print('classical estimators', m, s)
181
182
# compute ranges for m and s
183
stderr_m = s / math.sqrt(n)
184
mus = MakeRange(m, stderr_m)
185
186
stderr_s = s / math.sqrt(2 * (n-1))
187
sigmas = MakeRange(s, stderr_s)
188
189
return mus, sigmas
190
191
192
def Summation(xs, mu, cache={}):
193
"""Computes the sum of (x-mu)**2 for x in t.
194
195
Caches previous results.
196
197
xs: tuple of values
198
mu: hypothetical mean
199
cache: cache of previous results
200
"""
201
try:
202
return cache[xs, mu]
203
except KeyError:
204
ds = [(x-mu)**2 for x in xs]
205
total = sum(ds)
206
cache[xs, mu] = total
207
return total
208
209
210
def CoefVariation(suite):
211
"""Computes the distribution of CV.
212
213
suite: Pmf that maps (x, y) to z
214
215
Returns: Pmf object for CV.
216
"""
217
pmf = thinkbayes2.Pmf()
218
for (m, s), p in suite.Items():
219
pmf.Incr(s/m, p)
220
return pmf
221
222
223
def PlotCdfs(d, labels):
224
"""Plot CDFs for each sequence in a dictionary.
225
226
Jitters the data and subtracts away the mean.
227
228
d: map from key to sequence of values
229
labels: map from key to string label
230
"""
231
thinkplot.Clf()
232
for key, xs in d.items():
233
mu = thinkbayes2.Mean(xs)
234
xs = thinkbayes2.Jitter(xs, 1.3)
235
xs = [x-mu for x in xs]
236
cdf = thinkbayes2.MakeCdfFromList(xs)
237
thinkplot.Cdf(cdf, label=labels[key])
238
thinkplot.Show()
239
240
241
def PlotPosterior(suite, pcolor=False, contour=True):
242
"""Makes a contour plot.
243
244
suite: Suite that maps (mu, sigma) to probability
245
"""
246
thinkplot.Clf()
247
thinkplot.Contour(suite.GetDict(), pcolor=pcolor, contour=contour)
248
249
thinkplot.Save(root='variability_posterior_%s' % suite.label,
250
title='Posterior joint distribution',
251
xlabel='Mean height (cm)',
252
ylabel='Stddev (cm)')
253
254
255
def PlotCoefVariation(suites):
256
"""Plot the posterior distributions for CV.
257
258
suites: map from label to Pmf of CVs.
259
"""
260
thinkplot.Clf()
261
thinkplot.PrePlot(num=2)
262
263
pmfs = {}
264
for label, suite in suites.items():
265
pmf = CoefVariation(suite)
266
print('CV posterior mean', pmf.Mean())
267
cdf = thinkbayes2.MakeCdfFromPmf(pmf, label)
268
thinkplot.Cdf(cdf)
269
270
pmfs[label] = pmf
271
272
thinkplot.Save(root='variability_cv',
273
xlabel='Coefficient of variation',
274
ylabel='Probability')
275
276
print('female bigger', thinkbayes2.PmfProbGreater(pmfs['female'],
277
pmfs['male']))
278
print('male bigger', thinkbayes2.PmfProbGreater(pmfs['male'],
279
pmfs['female']))
280
281
282
def PlotOutliers(samples):
283
"""Make CDFs showing the distribution of outliers."""
284
cdfs = []
285
for label, sample in samples.items():
286
outliers = [x for x in sample if x < 150]
287
288
cdf = thinkbayes2.MakeCdfFromList(outliers, label)
289
cdfs.append(cdf)
290
291
thinkplot.Clf()
292
thinkplot.Cdfs(cdfs)
293
thinkplot.Save(root='variability_cdfs',
294
title='CDF of height',
295
xlabel='Reported height (cm)',
296
ylabel='CDF')
297
298
299
def PlotMarginals(suite):
300
"""Plots marginal distributions from a joint distribution.
301
302
suite: joint distribution of mu and sigma.
303
"""
304
thinkplot.Clf()
305
306
pyplot.subplot(1, 2, 1)
307
pmf_m = suite.Marginal(0)
308
cdf_m = thinkbayes2.MakeCdfFromPmf(pmf_m)
309
thinkplot.Cdf(cdf_m)
310
311
pyplot.subplot(1, 2, 2)
312
pmf_s = suite.Marginal(1)
313
cdf_s = thinkbayes2.MakeCdfFromPmf(pmf_s)
314
thinkplot.Cdf(cdf_s)
315
316
thinkplot.Show()
317
318
319
def ReadHeights(nrows=None):
320
"""Read the BRFSS dataset, extract the heights and pickle them.
321
322
nrows: number of rows to read
323
"""
324
resp = brfss.ReadBrfss(nrows=nrows).dropna(subset=['sex', 'htm3'])
325
groups = resp.groupby('sex')
326
327
d = {}
328
for name, group in groups:
329
d[name] = group.htm3.values
330
331
return d
332
333
334
def UpdateSuite1(suite, xs):
335
"""Computes the posterior distibution of mu and sigma.
336
337
Computes untransformed likelihoods.
338
339
suite: Suite that maps from (mu, sigma) to prob
340
xs: sequence
341
"""
342
suite.UpdateSet(xs)
343
344
345
def UpdateSuite2(suite, xs):
346
"""Computes the posterior distibution of mu and sigma.
347
348
Computes log likelihoods.
349
350
suite: Suite that maps from (mu, sigma) to prob
351
xs: sequence
352
"""
353
suite.Log()
354
suite.LogUpdateSet(xs)
355
suite.Exp()
356
suite.Normalize()
357
358
359
def UpdateSuite3(suite, xs):
360
"""Computes the posterior distibution of mu and sigma.
361
362
Computes log likelihoods efficiently.
363
364
suite: Suite that maps from (mu, sigma) to prob
365
t: sequence
366
"""
367
suite.Log()
368
suite.LogUpdateSetFast(xs)
369
suite.Exp()
370
suite.Normalize()
371
372
373
def UpdateSuite4(suite, xs):
374
"""Computes the posterior distibution of mu and sigma.
375
376
Computes log likelihoods efficiently.
377
378
suite: Suite that maps from (mu, sigma) to prob
379
t: sequence
380
"""
381
suite.Log()
382
suite.LogUpdateSetMeanVar(xs)
383
suite.Exp()
384
suite.Normalize()
385
386
387
def UpdateSuite5(suite, xs):
388
"""Computes the posterior distibution of mu and sigma.
389
390
Computes log likelihoods efficiently.
391
392
suite: Suite that maps from (mu, sigma) to prob
393
t: sequence
394
"""
395
suite.Log()
396
suite.LogUpdateSetMedianIPR(xs)
397
suite.Exp()
398
suite.Normalize()
399
400
401
def MedianIPR(xs, p):
402
"""Computes the median and interpercentile range.
403
404
xs: sequence of values
405
p: range (0-1), 0.5 yields the interquartile range
406
407
returns: tuple of float (median, IPR)
408
"""
409
cdf = thinkbayes2.MakeCdfFromList(xs)
410
median = cdf.Percentile(50)
411
412
alpha = (1-p) / 2
413
ipr = cdf.Value(1-alpha) - cdf.Value(alpha)
414
return median, ipr
415
416
417
def MedianS(xs, num_sigmas):
418
"""Computes the median and an estimate of sigma.
419
420
Based on an interpercentile range (IPR).
421
422
factor: number of standard deviations spanned by the IPR
423
"""
424
half_p = thinkbayes2.StandardNormalCdf(num_sigmas) - 0.5
425
median, ipr = MedianIPR(xs, half_p * 2)
426
s = ipr / 2 / num_sigmas
427
428
return median, s
429
430
def Summarize(xs):
431
"""Prints summary statistics from a sequence of values.
432
433
xs: sequence of values
434
"""
435
# print smallest and largest
436
xs.sort()
437
print('smallest', xs[:10])
438
print('largest', xs[-10:])
439
440
# print median and interquartile range
441
cdf = thinkbayes2.MakeCdfFromList(xs)
442
print(cdf.Percentile(25), cdf.Percentile(50), cdf.Percentile(75))
443
444
445
def RunEstimate(update_func, num_points=31, median_flag=False):
446
"""Runs the whole analysis.
447
448
update_func: which of the update functions to use
449
num_points: number of points in the Suite (in each dimension)
450
"""
451
d = ReadHeights(nrows=None)
452
labels = {1:'male', 2:'female'}
453
454
# PlotCdfs(d, labels)
455
456
suites = {}
457
for key, xs in d.items():
458
label = labels[key]
459
print(label, len(xs))
460
Summarize(xs)
461
462
xs = thinkbayes2.Jitter(xs, 1.3)
463
464
mus, sigmas = FindPriorRanges(xs, num_points, median_flag=median_flag)
465
suite = Height(mus, sigmas, label)
466
suites[label] = suite
467
update_func(suite, xs)
468
print('MLE', suite.MaximumLikelihood())
469
470
PlotPosterior(suite)
471
472
pmf_m = suite.Marginal(0)
473
pmf_s = suite.Marginal(1)
474
print('marginal mu', pmf_m.Mean(), pmf_m.Var())
475
print('marginal sigma', pmf_s.Mean(), pmf_s.Var())
476
477
# PlotMarginals(suite)
478
479
PlotCoefVariation(suites)
480
481
482
def main():
483
random.seed(17)
484
485
func = UpdateSuite5
486
median_flag = (func == UpdateSuite5)
487
RunEstimate(func, median_flag=median_flag)
488
489
490
if __name__ == '__main__':
491
main()
492
493
494
""" Results:
495
496
UpdateSuite1 (100):
497
marginal mu 162.816901408 0.55779791443
498
marginal sigma 6.36966103214 0.277026082819
499
500
UpdateSuite2 (100):
501
marginal mu 162.816901408 0.55779791443
502
marginal sigma 6.36966103214 0.277026082819
503
504
UpdateSuite3 (100):
505
marginal mu 162.816901408 0.55779791443
506
marginal sigma 6.36966103214 0.277026082819
507
508
UpdateSuite4 (100):
509
marginal mu 162.816901408 0.547456009605
510
marginal sigma 6.30305516111 0.27544106054
511
512
UpdateSuite3 (1000):
513
marginal mu 163.722137405 0.0660294386397
514
marginal sigma 6.64453251495 0.0329935312671
515
516
UpdateSuite4 (1000):
517
marginal mu 163.722137405 0.0658920503302
518
marginal sigma 6.63692197049 0.0329689887609
519
520
UpdateSuite3 (all):
521
marginal mu 163.223475005 0.000203282582659
522
marginal sigma 7.26918836916 0.000101641131229
523
524
UpdateSuite4 (all):
525
marginal mu 163.223475004 0.000203281499857
526
marginal sigma 7.26916693422 0.000101640932082
527
528
UpdateSuite5 (all):
529
marginal mu 163.1805214 7.9399898468e-07
530
marginal sigma 7.29969524118 3.26257030869e-14
531
532
"""
533
534
535