Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/thinkbayes2
Path: blob/master/soln/brfss.ipynb
1901 views
Kernel: Python 3
import numpy as np import pandas as pd from scipy.stats import norm
!ls ../../data/LLCP2018.XPT
../../data/LLCP2018.XPT
filename = '../../data/LLCP2018.XPT' df = pd.read_sas(filename) df.head()
df['SEX1'].value_counts()
2.0 238911 1.0 197412 9.0 682 7.0 431 Name: SEX1, dtype: int64
male = df['SEX1'] == 1 male.sum()
197412
female = df['SEX1'] == 2 female.sum()
238911
df['HTM4'].describe()
count 420974.000000 mean 169.895887 std 10.721788 min 91.000000 25% 163.000000 50% 170.000000 75% 178.000000 max 241.000000 Name: HTM4, dtype: float64
height_male = df.loc[male, 'HTM4'] height_male += np.random.normal(0, 2, male.sum()) height_male.mean(), height_male.std()
(178.015133313527, 8.272961809579046)
height_male.std() / height_male.mean()
0.04647336243603734
height_female = df.loc[female, 'HTM4'] height_female += np.random.normal(0, 2, female.sum()) height_female.mean(), height_female.std()
(163.11558651312797, 7.753027176098179)
height_female.std() / height_female.mean()
0.047530878819322366
def estimate_std(series, num_sigmas): ps = norm.cdf([-num_sigmas/2, num_sigmas/2]) ipr = series.quantile(ps) std = np.diff(ipr) / num_sigmas return std
index = np.linspace(1, 6, 51) std_series_male = pd.Series(1.0, index=index) for num_sigmas in index: std_series_male[num_sigmas] = estimate_std(height_male, num_sigmas) std_series_male.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f51e555bf70>
Image in a Jupyter notebook
index = np.linspace(1, 5, 51) std_series_female = pd.Series(1.0, index=index) for num_sigmas in index: std_series_female[num_sigmas] = estimate_std(height_female, num_sigmas) std_series_female.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f51e56adfd0>
Image in a Jupyter notebook
ipr_female = height_female.quantile(ps) ipr_female
0.066807 152.117512 0.933193 174.271321 Name: HTM4, dtype: float64
np.diff(ipr_male) / 3
array([7.81866273])
np.diff(ipr_female) / 3
array([7.38460301])
from empiricaldist import Cdf cdf_male = Cdf.from_seq(height_male) cdf_male.plot() cdf_female = Cdf.from_seq(height_female) cdf_female.plot()
Image in a Jupyter notebook