Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/thinkbayes2
Path: blob/master/scripts/sat.py
1901 views
1
"""This file contains code used in "Think Bayes",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2012 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import csv
11
import math
12
import numpy
13
import sys
14
15
import matplotlib
16
import matplotlib.pyplot as pyplot
17
18
import thinkbayes2
19
import thinkplot
20
21
22
def ReadScale(filename='sat_scale.csv', col=2):
23
"""Reads a CSV file of SAT scales (maps from raw score to standard score).
24
25
Args:
26
filename: string filename
27
col: which column to start with (0=Reading, 2=Math, 4=Writing)
28
29
Returns: thinkbayes2.Interpolator object
30
"""
31
def ParseRange(s):
32
"""Parse a range of values in the form 123-456
33
34
s: string
35
"""
36
t = [int(x) for x in s.split('-')]
37
return 1.0 * sum(t) / len(t)
38
39
fp = open(filename)
40
reader = csv.reader(fp)
41
raws = []
42
scores = []
43
44
for t in reader:
45
try:
46
raw = int(t[col])
47
raws.append(raw)
48
score = ParseRange(t[col+1])
49
scores.append(score)
50
except ValueError:
51
pass
52
53
raws.sort()
54
scores.sort()
55
return thinkbayes2.Interpolator(raws, scores)
56
57
58
def ReadRanks(filename='sat_ranks.csv'):
59
"""Reads a CSV file of SAT scores.
60
61
Args:
62
filename: string filename
63
64
Returns:
65
list of (score, freq) pairs
66
"""
67
fp = open(filename)
68
reader = csv.reader(fp)
69
res = []
70
71
for t in reader:
72
try:
73
score = int(t[0])
74
freq = int(t[1])
75
res.append((score, freq))
76
except ValueError:
77
pass
78
79
return res
80
81
82
def DivideValues(pmf, denom):
83
"""Divides the values in a Pmf by denom.
84
85
Returns a new Pmf.
86
"""
87
new = thinkbayes2.Pmf()
88
denom = float(denom)
89
for val, prob in pmf.Items():
90
x = val / denom
91
new.Set(x, prob)
92
return new
93
94
95
class Exam(object):
96
"""Encapsulates information about an exam.
97
98
Contains the distribution of scaled scores and an
99
Interpolator that maps between scaled and raw scores.
100
"""
101
def __init__(self):
102
self.scale = ReadScale()
103
104
scores = ReadRanks()
105
score_pmf = thinkbayes2.Pmf(dict(scores))
106
107
self.raw = self.ReverseScale(score_pmf)
108
self.max_score = max(self.raw.Values())
109
self.prior = DivideValues(self.raw, denom=self.max_score)
110
111
center = -0.05
112
width = 1.8
113
self.difficulties = MakeDifficulties(center, width, self.max_score)
114
115
def CompareScores(self, a_score, b_score, constructor):
116
"""Computes posteriors for two test scores and the likelihood ratio.
117
118
a_score, b_score: scales SAT scores
119
constructor: function that instantiates an Sat or Sat2 object
120
"""
121
a_sat = constructor(self, a_score)
122
b_sat = constructor(self, b_score)
123
124
a_sat.PlotPosteriors(b_sat)
125
126
if constructor is Sat:
127
PlotJointDist(a_sat, b_sat)
128
129
top = TopLevel('AB')
130
top.Update((a_sat, b_sat))
131
top.Print()
132
133
ratio = top.Prob('A') / top.Prob('B')
134
135
print('Likelihood ratio', ratio)
136
137
posterior = ratio / (ratio + 1)
138
print('Posterior', posterior)
139
140
if constructor is Sat2:
141
ComparePosteriorPredictive(a_sat, b_sat)
142
143
def MakeRawScoreDist(self, efficacies):
144
"""Makes the distribution of raw scores for given difficulty.
145
146
efficacies: Pmf of efficacy
147
"""
148
pmfs = thinkbayes2.Pmf()
149
for efficacy, prob in efficacies.Items():
150
scores = self.PmfCorrect(efficacy)
151
pmfs.Set(scores, prob)
152
153
mix = thinkbayes2.MakeMixture(pmfs)
154
return mix
155
156
def CalibrateDifficulty(self):
157
"""Make a plot showing the model distribution of raw scores."""
158
thinkplot.Clf()
159
thinkplot.PrePlot(num=2)
160
161
cdf = thinkbayes2.Cdf(self.raw, label='data')
162
thinkplot.Cdf(cdf)
163
164
efficacies = thinkbayes2.MakeNormalPmf(0, 1.5, 3)
165
pmf = self.MakeRawScoreDist(efficacies)
166
cdf = thinkbayes2.Cdf(pmf, label='model')
167
thinkplot.Cdf(cdf)
168
169
thinkplot.Save(root='sat_calibrate',
170
xlabel='raw score',
171
ylabel='CDF',
172
formats=['pdf', 'eps'])
173
174
def PmfCorrect(self, efficacy):
175
"""Returns the PMF of number of correct responses.
176
177
efficacy: float
178
"""
179
pmf = PmfCorrect(efficacy, self.difficulties)
180
return pmf
181
182
def Lookup(self, raw):
183
"""Looks up a raw score and returns a scaled score."""
184
return self.scale.Lookup(raw)
185
186
def Reverse(self, score):
187
"""Looks up a scaled score and returns a raw score.
188
189
Since we ignore the penalty, negative scores round up to zero.
190
"""
191
raw = self.scale.Reverse(score)
192
return raw if raw > 0 else 0
193
194
def ReverseScale(self, pmf):
195
"""Applies the reverse scale to the values of a PMF.
196
197
Args:
198
pmf: Pmf object
199
scale: Interpolator object
200
201
Returns:
202
new Pmf
203
"""
204
new = thinkbayes2.Pmf()
205
for val, prob in pmf.Items():
206
raw = self.Reverse(val)
207
new.Incr(raw, prob)
208
return new
209
210
211
class Sat(thinkbayes2.Suite):
212
"""Represents the distribution of p_correct for a test-taker."""
213
214
def __init__(self, exam, score):
215
self.exam = exam
216
self.score = score
217
218
# start with the prior distribution
219
thinkbayes2.Suite.__init__(self, exam.prior)
220
221
# update based on an exam score
222
self.Update(score)
223
224
def Likelihood(self, data, hypo):
225
"""Computes the likelihood of a test score, given efficacy."""
226
p_correct = hypo
227
score = data
228
229
k = self.exam.Reverse(score)
230
n = self.exam.max_score
231
like = thinkbayes2.EvalBinomialPmf(k, n, p_correct)
232
return like
233
234
def PlotPosteriors(self, other):
235
"""Plots posterior distributions of efficacy.
236
237
self, other: Sat objects.
238
"""
239
thinkplot.Clf()
240
thinkplot.PrePlot(num=2)
241
242
cdf1 = thinkbayes2.Cdf(self, label='posterior %d' % self.score)
243
cdf2 = thinkbayes2.Cdf(other, label='posterior %d' % other.score)
244
245
thinkplot.Cdfs([cdf1, cdf2])
246
thinkplot.Save(xlabel='p_correct',
247
ylabel='CDF',
248
axis=[0.7, 1.0, 0.0, 1.0],
249
root='sat_posteriors_p_corr',
250
formats=['pdf', 'eps'])
251
252
253
class Sat2(thinkbayes2.Suite):
254
"""Represents the distribution of efficacy for a test-taker."""
255
256
def __init__(self, exam, score):
257
self.exam = exam
258
self.score = score
259
260
# start with the Normal prior
261
efficacies = thinkbayes2.MakeNormalPmf(0, 1.5, 3)
262
thinkbayes2.Suite.__init__(self, efficacies)
263
264
# update based on an exam score
265
self.Update(score)
266
267
def Likelihood(self, data, hypo):
268
"""Computes the likelihood of a test score, given efficacy."""
269
efficacy = hypo
270
score = data
271
raw = self.exam.Reverse(score)
272
273
pmf = self.exam.PmfCorrect(efficacy)
274
like = pmf.Prob(raw)
275
return like
276
277
def MakePredictiveDist(self):
278
"""Returns the distribution of raw scores expected on a re-test."""
279
raw_pmf = self.exam.MakeRawScoreDist(self)
280
return raw_pmf
281
282
def PlotPosteriors(self, other):
283
"""Plots posterior distributions of efficacy.
284
285
self, other: Sat objects.
286
"""
287
thinkplot.Clf()
288
thinkplot.PrePlot(num=2)
289
290
cdf1 = thinkbayes2.Cdf(self, label='posterior %d' % self.score)
291
cdf2 = thinkbayes2.Cdf(other, label='posterior %d' % other.score)
292
293
thinkplot.Cdfs([cdf1, cdf2])
294
thinkplot.Save(xlabel='efficacy',
295
ylabel='CDF',
296
axis=[0, 4.6, 0.0, 1.0],
297
root='sat_posteriors_eff',
298
formats=['pdf', 'eps'])
299
300
301
def PlotJointDist(pmf1, pmf2, thresh=0.8):
302
"""Plot the joint distribution of p_correct.
303
304
pmf1, pmf2: posterior distributions
305
thresh: lower bound of the range to be plotted
306
"""
307
def Clean(pmf):
308
"""Removes values below thresh."""
309
vals = [val for val in pmf.Values() if val < thresh]
310
[pmf.Remove(val) for val in vals]
311
312
Clean(pmf1)
313
Clean(pmf2)
314
pmf = thinkbayes2.MakeJoint(pmf1, pmf2)
315
316
thinkplot.Figure(figsize=(6, 6))
317
thinkplot.Contour(pmf, contour=False, pcolor=True)
318
319
thinkplot.Plot([thresh, 1.0], [thresh, 1.0],
320
color='gray', alpha=0.2, linewidth=4)
321
322
thinkplot.Save(root='sat_joint',
323
xlabel='p_correct Alice',
324
ylabel='p_correct Bob',
325
axis=[thresh, 1.0, thresh, 1.0],
326
formats=['pdf', 'eps'])
327
328
329
def ComparePosteriorPredictive(a_sat, b_sat):
330
"""Compares the predictive distributions of raw scores.
331
332
a_sat: posterior distribution
333
b_sat:
334
"""
335
a_pred = a_sat.MakePredictiveDist()
336
b_pred = b_sat.MakePredictiveDist()
337
338
#thinkplot.Clf()
339
#thinkplot.Pmfs([a_pred, b_pred])
340
#thinkplot.Show()
341
342
a_like = thinkbayes2.PmfProbGreater(a_pred, b_pred)
343
b_like = thinkbayes2.PmfProbLess(a_pred, b_pred)
344
c_like = thinkbayes2.PmfProbEqual(a_pred, b_pred)
345
346
print('Posterior predictive')
347
print('A', a_like)
348
print('B', b_like)
349
print('C', c_like)
350
351
352
def PlotPriorDist(pmf):
353
"""Plot the prior distribution of p_correct.
354
355
pmf: prior
356
"""
357
thinkplot.Clf()
358
thinkplot.PrePlot(num=1)
359
360
cdf1 = thinkbayes2.Cdf(pmf, label='prior')
361
362
thinkplot.Cdf(cdf1)
363
thinkplot.Save(root='sat_prior',
364
xlabel='p_correct',
365
ylabel='CDF',
366
formats=['pdf', 'eps'])
367
368
369
class TopLevel(thinkbayes2.Suite):
370
"""Evaluates the top-level hypotheses about Alice and Bob.
371
372
Uses the bottom-level posterior distribution about p_correct
373
(or efficacy).
374
"""
375
376
def Update(self, data):
377
a_sat, b_sat = data
378
379
a_like = thinkbayes2.PmfProbGreater(a_sat, b_sat)
380
b_like = thinkbayes2.PmfProbLess(a_sat, b_sat)
381
c_like = thinkbayes2.PmfProbEqual(a_sat, b_sat)
382
383
a_like += c_like / 2
384
b_like += c_like / 2
385
386
self.Mult('A', a_like)
387
self.Mult('B', b_like)
388
389
self.Normalize()
390
391
392
def ProbCorrect(efficacy, difficulty, a=1):
393
"""Returns the probability that a person gets a question right.
394
395
efficacy: personal ability to answer questions
396
difficulty: how hard the question is
397
398
Returns: float prob
399
"""
400
return 1 / (1 + math.exp(-a * (efficacy - difficulty)))
401
402
403
def BinaryPmf(p):
404
"""Makes a Pmf with values 1 and 0.
405
406
p: probability given to 1
407
408
Returns: Pmf object
409
"""
410
pmf = thinkbayes2.Pmf()
411
pmf.Set(1, p)
412
pmf.Set(0, 1-p)
413
return pmf
414
415
416
def PmfCorrect(efficacy, difficulties):
417
"""Computes the distribution of correct responses.
418
419
efficacy: personal ability to answer questions
420
difficulties: list of difficulties, one for each question
421
422
Returns: new Pmf object
423
"""
424
pmf0 = thinkbayes2.Pmf([0])
425
426
ps = [ProbCorrect(efficacy, difficulty) for difficulty in difficulties]
427
pmfs = [BinaryPmf(p) for p in ps]
428
dist = sum(pmfs, pmf0)
429
return dist
430
431
432
def MakeDifficulties(center, width, n):
433
"""Makes a list of n difficulties with a given center and width.
434
435
Returns: list of n floats between center-width and center+width
436
"""
437
low, high = center-width, center+width
438
return numpy.linspace(low, high, n)
439
440
441
def ProbCorrectTable():
442
"""Makes a table of p_correct for a range of efficacy and difficulty."""
443
efficacies = [3, 1.5, 0, -1.5, -3]
444
difficulties = [-1.85, -0.05, 1.75]
445
446
for eff in efficacies:
447
print('%0.2f & ' % eff, end=' ')
448
for diff in difficulties:
449
p = ProbCorrect(eff, diff)
450
print('%0.2f & ' % p, end=' ')
451
print(r'\\')
452
453
454
def main(script):
455
ProbCorrectTable()
456
457
exam = Exam()
458
459
PlotPriorDist(exam.prior)
460
exam.CalibrateDifficulty()
461
462
exam.CompareScores(780, 740, constructor=Sat)
463
464
exam.CompareScores(780, 740, constructor=Sat2)
465
466
467
if __name__ == '__main__':
468
main(*sys.argv)
469
470