Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Avatar for Support and Testing.
Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

8758 views
License: GPL3
1
"""This file contains code used in "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
12
import numpy as np
13
import pandas
14
15
import nsfg
16
import thinkplot
17
import thinkstats2
18
19
20
def ParetoMedian(xmin, alpha):
21
"""Computes the median of a Pareto distribution."""
22
return xmin * pow(2, 1/alpha)
23
24
25
def MakeExpoCdf():
26
"""Generates a plot of the exponential CDF."""
27
28
thinkplot.PrePlot(3)
29
for lam in [2.0, 1, 0.5]:
30
xs, ps = thinkstats2.RenderExpoCdf(lam, 0, 3.0, 50)
31
label = r'$\lambda=%g$' % lam
32
thinkplot.Plot(xs, ps, label=label)
33
34
thinkplot.Save(root='analytic_expo_cdf',
35
title='Exponential CDF',
36
xlabel='x',
37
ylabel='CDF')
38
39
40
def ReadBabyBoom(filename='babyboom.dat'):
41
"""Reads the babyboom data.
42
43
filename: string
44
45
returns: DataFrame
46
"""
47
var_info = [
48
('time', 1, 8, int),
49
('sex', 9, 16, int),
50
('weight_g', 17, 24, int),
51
('minutes', 25, 32, int),
52
]
53
columns = ['name', 'start', 'end', 'type']
54
variables = pandas.DataFrame(var_info, columns=columns)
55
variables.end += 1
56
dct = thinkstats2.FixedWidthVariables(variables, index_base=1)
57
58
df = dct.ReadFixedWidth(filename, skiprows=59)
59
return df
60
61
62
def MakeBabyBoom():
63
"""Plot CDF of interarrival time on log and linear scales.
64
"""
65
# compute the interarrival times
66
df = ReadBabyBoom()
67
diffs = df.minutes.diff()
68
cdf = thinkstats2.Cdf(diffs, label='actual')
69
70
thinkplot.PrePlot(cols=2)
71
thinkplot.Cdf(cdf)
72
thinkplot.Config(xlabel='minutes',
73
ylabel='CDF',
74
legend=False)
75
76
thinkplot.SubPlot(2)
77
thinkplot.Cdf(cdf, complement=True)
78
thinkplot.Config(xlabel='minutes',
79
ylabel='CCDF',
80
yscale='log',
81
legend=False)
82
83
thinkplot.Save(root='analytic_interarrivals',
84
legend=False)
85
86
87
def MakeParetoCdf():
88
"""Generates a plot of the Pareto CDF."""
89
xmin = 0.5
90
91
thinkplot.PrePlot(3)
92
for alpha in [2.0, 1.0, 0.5]:
93
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 10.0, n=100)
94
thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha)
95
96
thinkplot.Save(root='analytic_pareto_cdf',
97
title='Pareto CDF',
98
xlabel='x',
99
ylabel='CDF')
100
101
102
def MakeParetoCdf2():
103
"""Generates a plot of the CDF of height in Pareto World."""
104
xmin = 100
105
alpha = 1.7
106
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 1000.0, n=100)
107
thinkplot.Plot(xs, ps)
108
109
thinkplot.Save(root='analytic_pareto_height',
110
title='Pareto CDF',
111
xlabel='height (cm)',
112
ylabel='CDF',
113
legend=False)
114
115
116
def MakeNormalCdf():
117
"""Generates a plot of the normal CDF."""
118
119
thinkplot.PrePlot(3)
120
121
mus = [1.0, 2.0, 3.0]
122
sigmas = [0.5, 0.4, 0.3]
123
for mu, sigma in zip(mus, sigmas):
124
xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma,
125
low=-1.0, high=4.0)
126
label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)
127
thinkplot.Plot(xs, ps, label=label)
128
129
thinkplot.Save(root='analytic_normal_cdf',
130
title='Normal CDF',
131
xlabel='x',
132
ylabel='CDF',
133
loc=2)
134
135
136
def MakeNormalModel(weights):
137
"""Plot the CDF of birthweights with a normal model."""
138
139
# estimate parameters: trimming outliers yields a better fit
140
mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
141
print('Mean, Var', mu, var)
142
143
# plot the model
144
sigma = math.sqrt(var)
145
print('Sigma', sigma)
146
xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=12.5)
147
148
thinkplot.Plot(xs, ps, label='model', color='0.8')
149
150
# plot the data
151
cdf = thinkstats2.Cdf(weights, label='data')
152
153
thinkplot.PrePlot(1)
154
thinkplot.Cdf(cdf)
155
thinkplot.Save(root='analytic_birthwgt_model',
156
title='Birth weights',
157
xlabel='birth weight (lbs)',
158
ylabel='CDF')
159
160
161
def MakeExampleNormalPlot():
162
"""Generates a sample normal probability plot.
163
"""
164
n = 1000
165
thinkplot.PrePlot(3)
166
167
mus = [0, 1, 5]
168
sigmas = [1, 1, 2]
169
for mu, sigma in zip(mus, sigmas):
170
sample = np.random.normal(mu, sigma, n)
171
xs, ys = thinkstats2.NormalProbability(sample)
172
label = '$\mu=%d$, $\sigma=%d$' % (mu, sigma)
173
thinkplot.Plot(xs, ys, label=label)
174
175
thinkplot.Save(root='analytic_normal_prob_example',
176
title='Normal probability plot',
177
xlabel='standard normal sample',
178
ylabel='sample values')
179
180
181
def MakeNormalPlot(weights, term_weights):
182
"""Generates a normal probability plot of birth weights."""
183
184
mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
185
std = math.sqrt(var)
186
187
xs = [-4, 4]
188
fxs, fys = thinkstats2.FitLine(xs, mean, std)
189
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')
190
191
thinkplot.PrePlot(2)
192
xs, ys = thinkstats2.NormalProbability(weights)
193
thinkplot.Plot(xs, ys, label='all live')
194
195
xs, ys = thinkstats2.NormalProbability(term_weights)
196
thinkplot.Plot(xs, ys, label='full term')
197
thinkplot.Save(root='analytic_birthwgt_normal',
198
title='Normal probability plot',
199
xlabel='Standard deviations from mean',
200
ylabel='Birth weight (lbs)')
201
202
203
def main():
204
thinkstats2.RandomSeed(18)
205
MakeExampleNormalPlot()
206
207
# make the analytic CDFs
208
MakeExpoCdf()
209
MakeBabyBoom()
210
211
MakeParetoCdf()
212
MakeParetoCdf2()
213
MakeNormalCdf()
214
215
# test the distribution of birth weights for normality
216
preg = nsfg.ReadFemPreg()
217
full_term = preg[preg.prglngth >= 37]
218
219
weights = preg.totalwgt_lb.dropna()
220
term_weights = full_term.totalwgt_lb.dropna()
221
222
MakeNormalModel(weights)
223
MakeNormalPlot(weights, term_weights)
224
225
226
if __name__ == "__main__":
227
main()
228
229