Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
Avatar for Support and Testing.
Download

Think Stats, by Allen B. Downey. Think Stats is an introduction to probability and statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

8758 views
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2010 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import math
11
import sys
12
import pandas
13
import numpy as np
14
15
import thinkstats2
16
import thinkplot
17
18
19
def Summarize(df, column, title):
    """Prints a table of summary statistics for one column.

    One row is printed for all respondents, one for rows where
    df.sex == 1 (labelled 'male') and one for rows where df.sex == 2
    (labelled 'female').  Each row shows the group size, mean, variance,
    standard deviation and coefficient of variation (std / mean).

    df: DataFrame with a `sex` column and the requested column
    column: string name of the column to summarize
    title: string printed above the table
    """
    is_male = df.sex == 1
    is_female = df.sex == 2
    groups = (
        ('all', df[column]),
        ('male', df[is_male][column]),
        ('female', df[is_female][column]),
    )

    print(title)
    print('key\tn\tmean\tvar\tstd\tcv')
    for label, values in groups:
        mean = values.mean()
        var = values.var()
        # the standard deviation is derived from the variance, not recomputed
        std = math.sqrt(var)
        row = (label, len(values), mean, var, std, std / mean)
        print('%s\t%d\t%4.2f\t%4.2f\t%4.2f\t%4.4f' % row)
36
37
38
def CleanBrfssFrame(df):
    """Recodes BRFSS variables in place.

    Sentinel codes are replaced with NaN and weights are rescaled.  Each
    cleaned Series is assigned back to its column rather than modified
    through `df.col.replace(..., inplace=True)`: that chained in-place
    call operates on a temporary Series and does not update `df` when
    pandas Copy-on-Write is active.

    df: DataFrame with columns age, htm3, wtkg2 and wtyrago; modified in
        place, nothing is returned
    """
    nan = float('NaN')

    # clean age: codes 7 and 9 are treated as missing
    df['age'] = df.age.replace([7, 9], nan)

    # clean height: code 999 is treated as missing
    df['htm3'] = df.htm3.replace([999], nan)

    # clean weight: code 99999 is treated as missing; the remaining values
    # are divided by 100 (presumably stored in hundredths of a kg -- confirm
    # against the BRFSS codebook)
    df['wtkg2'] = df.wtkg2.replace([99999], nan) / 100.0

    # clean weight a year ago: codes 7777 and 9999 are treated as missing.
    # Values below 9000 are divided by 2.2 (presumably pounds to kg);
    # values of 9000 or more have the 9000 offset removed.  NaN fails the
    # `< 9000` test and stays NaN after the subtraction.
    wtyrago = df.wtyrago.replace([7777, 9999], nan)
    df['wtyrago'] = wtyrago.apply(lambda x: x/2.2 if x < 9000 else x-9000)
56
57
58
def ReadBrfss(filename='CDBRFS08.ASC.gz', compression='gzip', nrows=None):
    """Loads the BRFSS fixed-width file and returns a cleaned DataFrame.

    filename: string name of the data file
    compression: string compression scheme passed to the reader
    nrows: int number of rows to read, or None for all

    returns: DataFrame, after CleanBrfssFrame has been applied to it
    """
    # (name, start, end, type) for each field that is extracted
    field_specs = [
        ('age', 101, 102, int),
        ('sex', 143, 143, int),
        ('wtyrago', 127, 130, int),
        ('finalwt', 799, 808, int),
        ('wtkg2', 1254, 1258, int),
        ('htm3', 1251, 1253, int),
    ]
    variables = pandas.DataFrame(
        field_specs, columns=['name', 'start', 'end', 'type'])

    # presumably converts inclusive end positions to the exclusive form
    # expected by FixedWidthVariables -- confirm against thinkstats2
    variables.end += 1

    parser = thinkstats2.FixedWidthVariables(variables, index_base=1)
    frame = parser.ReadFixedWidth(
        filename, compression=compression, nrows=nrows)

    CleanBrfssFrame(frame)
    return frame
83
84
85
def MakeNormalModel(weights):
    """Plots the CDF of a sample on top of a fitted normal CDF.

    The model parameters come from thinkstats2.TrimmedMeanVar and the
    model curve spans four standard deviations either side of the mean.
    The sample size, mean and standard deviation are also printed.

    weights: sequence
    """
    sample_cdf = thinkstats2.Cdf(weights, label='weights')

    mu, variance = thinkstats2.TrimmedMeanVar(weights)
    sigma = math.sqrt(variance)
    print('n, mean, std', len(weights), mu, sigma)

    spread = 4 * sigma
    low = mu - spread
    high = mu + spread

    # draw the model first, as a wide grey line, then the data on top
    model_xs, model_ps = thinkstats2.RenderNormalCdf(mu, sigma, low, high)
    thinkplot.Plot(model_xs, model_ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(sample_cdf)
102
103
104
def MakeNormalPlot(weights):
    """Generates a normal probability plot of a sample of weights.

    A grey reference line is drawn using thinkstats2.FitLine with the
    trimmed mean and standard deviation (p=0.01) over the range -5 to 5,
    followed by the values returned by thinkstats2.NormalProbability.

    weights: sequence
    """
    mu, variance = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    sigma = math.sqrt(variance)

    # reference line for the fitted model
    z_range = [-5, 5]
    line_xs, line_ys = thinkstats2.FitLine(z_range, mu, sigma)
    thinkplot.Plot(line_xs, line_ys, color='0.8', label='model')

    # the sample itself
    sample_xs, sample_ys = thinkstats2.NormalProbability(weights)
    thinkplot.Plot(sample_xs, sample_ys, label='weights')
118
119
120
def MakeFigures(df):
    """Generates CDFs and normal probability plots of adult weight.

    Two figures are saved, each with two columns: the left one uses the
    weights as they are and the right one uses log10 of the weights.
    'brfss_weight' holds the CDFs with normal models and
    'brfss_weight_normal' holds the normal probability plots.

    df: DataFrame with a `wtkg2` column
    """
    linear = df.wtkg2.dropna()
    logged = np.log10(linear)

    # (plotting function, output root, left-panel labels, right-panel labels)
    figures = [
        (MakeNormalModel, 'brfss_weight',
         dict(xlabel='adult weight (kg)', ylabel='CDF'),
         dict(xlabel='adult weight (log10 kg)')),
        (MakeNormalPlot, 'brfss_weight_normal',
         dict(xlabel='z', ylabel='weights (kg)'),
         dict(xlabel='z', ylabel='weights (log10 kg)')),
    ]

    for plot_func, root, linear_labels, log_labels in figures:
        # left panel: linear scale
        thinkplot.PrePlot(cols=2)
        plot_func(linear)
        thinkplot.Config(**linear_labels)

        # right panel: log scale
        thinkplot.SubPlot(2)
        plot_func(logged)
        thinkplot.Config(**log_labels)

        thinkplot.Save(root=root)
146
147
148
def main(script, nrows=1000):
    """Reads the data, makes the figures, prints summaries and runs checks.

    The value-count checks only run when nrows is 1000, the sample size
    the expected counts were recorded for.

    script: string script name
    nrows: number of rows to read; converted with int(), so a string
           taken from the command line is accepted
    """
    thinkstats2.RandomSeed(17)

    nrows = int(nrows)
    df = ReadBrfss(nrows=nrows)
    MakeFigures(df)

    summaries = [
        ('htm3', 'Height (cm):'),
        ('wtkg2', 'Weight (kg):'),
        ('wtyrago', 'Weight year ago (kg):'),
    ]
    for column, heading in summaries:
        Summarize(df, column, heading)

    if nrows == 1000:
        # (column, value, expected number of occurrences)
        expected_counts = [
            ('age', 40, 28),
            ('sex', 2, 668),
            ('wtkg2', 90.91, 49),
            ('wtyrago', 160/2.2, 49),
            ('htm3', 163, 103),
            ('finalwt', 185.870345, 13),
        ]
        for column, value, count in expected_counts:
            assert(getattr(df, column).value_counts()[value] == count)
        print('%s: All tests passed.' % script)
171
172
173
if __name__ == '__main__':
    # forward the script name and any optional row count to main
    script_args = sys.argv
    main(*script_args)
175
176