Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sagelib
Path: blob/master/sage/stats/basic_stats.py
4045 views
1
"""
2
Basic Statistics
3
4
This file contains basic descriptive functions. Included are the mean,
5
median, mode, moving average, standard deviation, and the variance.
6
When calling a function on data, there are checks for functions already
7
defined for that data type.
8
9
The ``mean`` function returns the arithmetic mean (the sum of all the members
10
of a list, divided by the number of members). Further revisions may include
11
the geometric and harmonic mean. The ``median`` function returns the number
12
separating the higher half of a sample from the lower half. The ``mode``
13
returns the most common occuring member of a sample, plus the number of times
14
it occurs. If entries occur equally common, the smallest of a list of the most
15
common entries is returned. The ``moving_average`` is a finite impulse
16
response filter, creating a series of averages using a user-defined number of
17
subsets of the full data set. The ``std`` and the ``variance`` return a
18
measurement of how far data points tend to be from the arithmetic mean.
19
20
Functions are available in the namespace ``stats``, i.e. you can use them by
21
typing ``stats.mean``, ``stats.median``, etc.
22
23
REMARK: If all the data you are working with are floating point
24
numbers, you may find ``finance.TimeSeries`` helpful, since it is
25
extremely fast and offers many of the same descriptive statistics as
26
in the module.
27
28
AUTHOR:
29
30
- Andrew Hou (11/06/2009)
31
32
"""
33
######################################################################
34
# Copyright (C) 2009, Andrew Hou <[email protected]>
35
#
36
# Distributed under the terms of the GNU General Public License (GPL)
37
#
38
# The full text of the GPL is available at:
39
# http://www.gnu.org/licenses/
40
######################################################################
41
42
from sage.rings.integer_ring import ZZ
43
from sage.symbolic.constants import NaN
44
from sage.functions.other import sqrt
45
46
def mean(v):
47
"""
48
Return the mean of the elements of `v`.
49
50
We define the mean of the empty list to be the (symbolic) NaN,
51
following the convention of MATLAB, Scipy, and R.
52
53
INPUT:
54
55
- `v` -- a list of numbers
56
57
OUTPUT:
58
59
- a number
60
61
EXAMPLES::
62
63
sage: mean([pi, e])
64
1/2*pi + 1/2*e
65
sage: mean([])
66
NaN
67
sage: mean([I, sqrt(2), 3/5])
68
1/3*sqrt(2) + 1/3*I + 1/5
69
sage: mean([RIF(1.0103,1.0103), RIF(2)])
70
1.5051500000000000?
71
sage: mean(range(4))
72
3/2
73
sage: v = finance.TimeSeries([1..100])
74
sage: mean(v)
75
50.5
76
"""
77
if hasattr(v, 'mean'): return v.mean()
78
if len(v) == 0:
79
return NaN
80
s = sum(v)
81
if isinstance(s, (int,long)):
82
# python integers are stupid.
83
return s/ZZ(len(v))
84
return s/len(v)
85
86
def mode(v):
87
"""
88
Return the mode of `v`. The mode is the sorted list of the most
89
frequently occuring elements in `v`. If `n` is the most times
90
that any element occurs in `v`, then the mode is the sorted list
91
of elements of `v` that occur `n` times.
92
93
NOTE: The elements of `v` must be hashable and comparable.
94
95
INPUT:
96
97
- `v` -- a list
98
99
OUTPUT:
100
101
- a list
102
103
EXAMPLES::
104
105
sage: v = [1,2,4,1,6,2,6,7,1]
106
sage: mode(v)
107
[1]
108
sage: v.count(1)
109
3
110
sage: mode([])
111
[]
112
sage: mode([1,2,3,4,5])
113
[1, 2, 3, 4, 5]
114
sage: mode([3,1,2,1,2,3])
115
[1, 2, 3]
116
sage: mode(['sage', 4, I, 3/5, 'sage', pi])
117
['sage']
118
sage: class MyClass:
119
... def mode(self):
120
... return [1]
121
sage: stats.mode(MyClass())
122
[1]
123
"""
124
if hasattr(v, 'mode'): return v.mode()
125
from operator import itemgetter
126
127
freq = {}
128
for i in v:
129
if freq.has_key(i):
130
freq[i] += 1
131
else:
132
freq[i] = 1
133
134
s = sorted(freq.items(), key=itemgetter(1), reverse=True)
135
return [i[0] for i in s if i[1]==s[0][1]]
136
137
def std(v, bias=False):
138
"""
139
Returns the standard deviation of the elements of `v`
140
141
We define the standard deviation of the empty list to be NaN,
142
following the convention of MATLAB, Scipy, and R.
143
144
INPUT:
145
146
- `v` -- a list of numbers
147
148
- ``bias`` -- bool (default: False); if False, divide by
149
len(v) - 1 instead of len(v)
150
to give a less biased estimator (sample) for the
151
standard deviation.
152
153
OUTPUT:
154
155
- a number
156
157
EXAMPLES::
158
159
sage: std([1..6], bias=True)
160
1/2*sqrt(35/3)
161
sage: std([1..6], bias=False)
162
sqrt(7/2)
163
sage: std([e, pi])
164
sqrt(1/2)*sqrt((pi - e)^2)
165
sage: std([])
166
NaN
167
sage: std([I, sqrt(2), 3/5])
168
sqrt(1/450*(-5*sqrt(2) - 5*I + 6)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2 + 1/450*(-5*sqrt(2) + 10*I - 3)^2)
169
sage: std([RIF(1.0103, 1.0103), RIF(2)])
170
0.6998235813403261?
171
sage: import numpy
172
sage: x = numpy.array([1,2,3,4,5])
173
sage: std(x, bias=False)
174
1.5811388300841898
175
sage: x = finance.TimeSeries([1..100])
176
sage: std(x)
177
29.011491975882016
178
"""
179
180
# NOTE: in R bias = False by default, and in Scipy bias=True by
181
# default, and R is more popular.
182
183
if hasattr(v, 'standard_deviation'): return v.standard_deviation(bias=bias)
184
185
import numpy
186
187
x = 0
188
if isinstance(v, numpy.ndarray):
189
# accounts for numpy arrays
190
if bias == True:
191
return v.std()
192
elif bias == False:
193
return v.std(ddof=1)
194
195
if len(v) == 0:
196
# standard deviation of empty set defined as NaN
197
return NaN
198
199
return sqrt(variance(v, bias=bias))
200
201
def variance(v, bias=False):
202
"""
203
Returns the variance of the elements of `v`
204
205
We define the variance of the empty list to be NaN,
206
following the convention of MATLAB, Scipy, and R.
207
208
INPUT:
209
210
- `v` -- a list of numbers
211
212
- ``bias`` -- bool (default: False); if False, divide by
213
len(v) - 1 instead of len(v)
214
to give a less biased estimator (sample) for the
215
standard deviation.
216
217
OUTPUT:
218
219
- a number
220
221
222
EXAMPLES::
223
224
sage: variance([1..6])
225
7/2
226
sage: variance([1..6], bias=True)
227
35/12
228
sage: variance([e, pi])
229
1/2*(pi - e)^2
230
sage: variance([])
231
NaN
232
sage: variance([I, sqrt(2), 3/5])
233
1/450*(-5*sqrt(2) - 5*I + 6)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2 + 1/450*(-5*sqrt(2) + 10*I - 3)^2
234
sage: variance([RIF(1.0103, 1.0103), RIF(2)])
235
0.4897530450000000?
236
sage: import numpy
237
sage: x = numpy.array([1,2,3,4,5])
238
sage: variance(x, bias=False)
239
2.5
240
sage: x = finance.TimeSeries([1..100])
241
sage: variance(x)
242
841.6666666666666
243
sage: variance(x, bias=True)
244
833.25
245
sage: class MyClass:
246
... def variance(self, bias = False):
247
... return 1
248
sage: stats.variance(MyClass())
249
1
250
sage: class SillyPythonList:
251
... def __init__(self):
252
... self.__list = [2L,4L]
253
... def __len__(self):
254
... return len(self.__list)
255
... def __iter__(self):
256
... return self.__list.__iter__()
257
... def mean(self):
258
... return 3L
259
sage: R = SillyPythonList()
260
sage: variance(R)
261
2
262
sage: variance(R, bias=True)
263
1
264
265
266
TESTS:
267
268
The performance issue from #10019 is solved::
269
270
sage: variance([1] * 2^18)
271
0
272
"""
273
if hasattr(v, 'variance'): return v.variance(bias=bias)
274
import numpy
275
276
x = 0
277
if isinstance(v, numpy.ndarray):
278
# accounts for numpy arrays
279
if bias == True:
280
return v.var()
281
elif bias == False:
282
return v.var(ddof=1)
283
if len(v) == 0:
284
# variance of empty set defined as NaN
285
return NaN
286
287
mu = mean(v)
288
for vi in v:
289
x += (vi - mu)**2
290
if bias:
291
# population variance
292
if isinstance(x, (int,long)):
293
return x/ZZ(len(v))
294
return x/len(v)
295
else:
296
# sample variance
297
if isinstance(x, (int,long)):
298
return x/ZZ(len(v)-1)
299
return x/(len(v)-1)
300
301
302
def median(v):
303
"""
304
Return the median (middle value) of the elements of `v`
305
306
If `v` is empty, we define the median to be NaN, which is
307
consistent with NumPy (note that R returns NULL).
308
If `v` is comprised of strings, TypeError occurs.
309
For elements other than numbers, the median is a result of ``sorted()``.
310
311
INPUT:
312
313
- `v` -- a list
314
315
OUTPUT:
316
317
- median element of `v`
318
319
EXAMPLES::
320
321
sage: median([1,2,3,4,5])
322
3
323
sage: median([e, pi])
324
1/2*pi + 1/2*e
325
sage: median(['sage', 'linux', 'python'])
326
'python'
327
sage: median([])
328
NaN
329
sage: class MyClass:
330
... def median(self):
331
... return 1
332
sage: stats.median(MyClass())
333
1
334
"""
335
if hasattr(v, 'median'): return v.median()
336
337
if len(v) == 0:
338
# Median of empty set defined as NaN
339
return NaN
340
values = sorted(v)
341
if len(values) % 2 == 1:
342
return values[((len(values))+1)/2-1]
343
else:
344
lower = values[(len(values)+1)/2-1]
345
upper = values[len(values)/2]
346
return (lower + upper)/ZZ(2)
347
348
def moving_average(v, n):
349
"""
350
Provides the moving average of a list `v`
351
352
The moving average of a list is often used to smooth out noisy data.
353
354
If `v` is empty, we define the entries of the moving average to be NaN.
355
356
INPUT:
357
358
- `v` -- a list
359
360
- `n` -- the number of values used in computing each average.
361
362
OUTPUT:
363
364
- a list of length ``len(v)-n+1``, since we do not fabric any values
365
366
EXAMPLES::
367
368
sage: moving_average([1..10], 1)
369
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
370
sage: moving_average([1..10], 4)
371
[5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2]
372
sage: moving_average([], 1)
373
[]
374
sage: moving_average([pi, e, I, sqrt(2), 3/5], 2)
375
[1/2*pi + 1/2*e, 1/2*e + 1/2*I, 1/2*sqrt(2) + 1/2*I, 1/2*sqrt(2) + 3/10]
376
377
We check if the input is a time series, and if so use the
378
optimized ``simple_moving_average`` method, but with (slightly
379
different) meaning as defined above (the point is that the
380
``simple_moving_average`` on time series returns `n` values::
381
382
sage: a = finance.TimeSeries([1..10])
383
sage: stats.moving_average(a, 3)
384
[2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]
385
sage: stats.moving_average(list(a), 3)
386
[2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
387
388
"""
389
if len(v) == 0:
390
return v
391
from sage.finance.time_series import TimeSeries
392
if isinstance(v, TimeSeries):
393
return v.simple_moving_average(n)[n-1:]
394
n = int(n)
395
if n <= 0:
396
raise ValueError, "n must be positive"
397
nn = ZZ(n)
398
s = sum(v[:n])
399
ans = [s/nn]
400
for i in range(n,len(v)):
401
# add in the i-th value in v to our running sum,
402
# and remove the value n places back.
403
s += v[i] - v[i-n]
404
ans.append(s/nn)
405
return ans
406
407