CoCalc -- basic

GitHub Repository: sagemath/sagelib
Path: blob/master/sage/stats/basic_stats.py
⁴⁰⁴⁵ views
1
"""
2
Basic Statistics
3

4
This file contains basic descriptive functions. Included are the mean,
5
median, mode, moving average, standard deviation, and the variance. 
6
When calling a function on data, there are checks for functions already 
7
defined for that data type. 
8

9
The ``mean`` function returns the arithmetic mean (the sum of all the members
10
of a list, divided by the number of members). Further revisions may include
11
the geometric and harmonic mean. The ``median`` function returns the number
12
separating the higher half of a sample from the lower half. The ``mode`` 
13
returns the most common occuring member of a sample, plus the number of times
14
it occurs. If entries occur equally common, the smallest of a list of the most
15
common  entries is returned. The ``moving_average`` is a finite impulse
16
response filter, creating a series of averages using a user-defined number of
17
subsets of the full data set. The ``std`` and the ``variance`` return a
18
measurement of how far data points tend to be from the arithmetic mean. 
19

20
Functions are available in the namespace ``stats``, i.e. you can use them by 
21
typing ``stats.mean``, ``stats.median``, etc.
22

23
REMARK: If all the data you are working with are floating point
24
numbers, you may find ``finance.TimeSeries`` helpful, since it is
25
extremely fast and offers many of the same descriptive statistics as
26
in the module.  
27

28
AUTHOR:
29

30
    - Andrew Hou (11/06/2009) 
31
 
32
"""
33
######################################################################
34
#          Copyright (C) 2009, Andrew Hou <[email protected]>
35
#
36
#  Distributed under the terms of the GNU General Public License (GPL)
37
#
38
#            The full text of the GPL is available at:
39
#                  http://www.gnu.org/licenses/
40
######################################################################
41

42
from sage.rings.integer_ring import ZZ
43
from sage.symbolic.constants import NaN
44
from sage.functions.other import sqrt
45

46
def mean(v):
47
    """
48
    Return the mean of the elements of `v`.
49

50
    We define the mean of the empty list to be the (symbolic) NaN,
51
    following the convention of MATLAB, Scipy, and R.
52

53
    INPUT:
54

55
        - `v` -- a list of numbers
56

57
    OUTPUT:
58

59
        - a number
60

61
    EXAMPLES::
62

63
        sage: mean([pi, e])
64
        1/2*pi + 1/2*e
65
        sage: mean([])
66
        NaN
67
        sage: mean([I, sqrt(2), 3/5])
68
        1/3*sqrt(2) + 1/3*I + 1/5
69
        sage: mean([RIF(1.0103,1.0103), RIF(2)])
70
        1.5051500000000000?
71
        sage: mean(range(4))
72
        3/2
73
        sage: v = finance.TimeSeries([1..100])
74
        sage: mean(v)
75
        50.5
76
    """
77
    if hasattr(v, 'mean'): return v.mean()
78
    if len(v) == 0:
79
        return NaN
80
    s = sum(v)
81
    if isinstance(s, (int,long)):
82
        # python integers are stupid.
83
        return s/ZZ(len(v))
84
    return s/len(v)
85

86
def mode(v):
87
    """
88
    Return the mode of `v`.  The mode is the sorted list of the most
89
    frequently occuring elements in `v`.  If `n` is the most times
90
    that any element occurs in `v`, then the mode is the sorted list
91
    of elements of `v` that occur `n` times.
92
    
93
    NOTE: The elements of `v` must be hashable and comparable. 
94

95
    INPUT:
96

97
        - `v` -- a list
98

99
    OUTPUT:
100

101
        - a list
102

103
    EXAMPLES::
104

105
        sage: v = [1,2,4,1,6,2,6,7,1]
106
        sage: mode(v)
107
        [1]
108
        sage: v.count(1)
109
        3
110
        sage: mode([])
111
        []
112
        sage: mode([1,2,3,4,5])
113
        [1, 2, 3, 4, 5]
114
        sage: mode([3,1,2,1,2,3])
115
        [1, 2, 3]
116
        sage: mode(['sage', 4, I, 3/5, 'sage', pi])
117
        ['sage']
118
        sage: class MyClass:
119
        ...     def mode(self):
120
        ...         return [1]
121
        sage: stats.mode(MyClass())
122
        [1]
123
    """
124
    if hasattr(v, 'mode'): return v.mode()
125
    from operator import itemgetter
126
    
127
    freq = {}
128
    for i in v:
129
        if freq.has_key(i):
130
            freq[i] += 1
131
        else:
132
            freq[i] = 1
133

134
    s = sorted(freq.items(), key=itemgetter(1), reverse=True)
135
    return [i[0] for i in s if i[1]==s[0][1]]
136

137
def std(v, bias=False):
138
    """
139
    Returns the standard deviation of the elements of `v`
140

141
    We define the standard deviation of the empty list to be NaN,
142
    following the convention of MATLAB, Scipy, and R.
143

144
    INPUT:
145

146
        - `v` -- a list of numbers
147
        
148
        - ``bias`` -- bool (default: False); if False, divide by
149
                      len(v) - 1 instead of len(v)
150
                      to give a less biased estimator (sample) for the 
151
                      standard deviation. 
152

153
    OUTPUT:
154
        
155
        - a number
156

157
    EXAMPLES::
158

159
        sage: std([1..6], bias=True) 
160
        1/2*sqrt(35/3)
161
        sage: std([1..6], bias=False)
162
        sqrt(7/2)
163
        sage: std([e, pi])
164
        sqrt(1/2)*sqrt((pi - e)^2)
165
        sage: std([])
166
        NaN
167
        sage: std([I, sqrt(2), 3/5])
168
        sqrt(1/450*(-5*sqrt(2) - 5*I + 6)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2 + 1/450*(-5*sqrt(2) + 10*I - 3)^2)
169
        sage: std([RIF(1.0103, 1.0103), RIF(2)])
170
        0.6998235813403261?
171
        sage: import numpy
172
        sage: x = numpy.array([1,2,3,4,5])
173
        sage: std(x, bias=False)
174
        1.5811388300841898
175
        sage: x = finance.TimeSeries([1..100])
176
        sage: std(x)
177
        29.011491975882016
178
    """
179

180
    # NOTE: in R bias = False by default, and in Scipy bias=True by
181
    # default, and R is more popular.
182
    
183
    if hasattr(v, 'standard_deviation'): return v.standard_deviation(bias=bias)
184

185
    import numpy
186

187
    x = 0
188
    if isinstance(v, numpy.ndarray):
189
        # accounts for numpy arrays
190
        if bias == True:
191
            return v.std()
192
        elif bias == False:
193
            return v.std(ddof=1)
194

195
    if len(v) == 0:
196
        # standard deviation of empty set defined as NaN
197
        return NaN
198

199
    return sqrt(variance(v, bias=bias))
200

201
def variance(v, bias=False):
202
    """
203
    Returns the variance of the elements of `v`
204

205
    We define the variance of the empty list to be NaN,
206
    following the convention of MATLAB, Scipy, and R.
207

208
    INPUT:
209

210
        - `v` -- a list of numbers
211
        
212
        - ``bias`` -- bool (default: False); if False, divide by
213
                      len(v) - 1 instead of len(v)
214
                      to give a less biased estimator (sample) for the 
215
                      standard deviation. 
216

217
    OUTPUT:
218
        
219
        - a number
220

221

222
    EXAMPLES::
223

224
        sage: variance([1..6])
225
        7/2
226
        sage: variance([1..6], bias=True)
227
        35/12
228
        sage: variance([e, pi])                 
229
        1/2*(pi - e)^2
230
        sage: variance([])                      
231
        NaN
232
        sage: variance([I, sqrt(2), 3/5])       
233
        1/450*(-5*sqrt(2) - 5*I + 6)^2 + 1/450*(10*sqrt(2) - 5*I - 3)^2 + 1/450*(-5*sqrt(2) + 10*I - 3)^2
234
        sage: variance([RIF(1.0103, 1.0103), RIF(2)])
235
        0.4897530450000000?
236
        sage: import numpy
237
        sage: x = numpy.array([1,2,3,4,5])
238
        sage: variance(x, bias=False)
239
        2.5
240
        sage: x = finance.TimeSeries([1..100])
241
        sage: variance(x)
242
        841.6666666666666
243
        sage: variance(x, bias=True)
244
        833.25
245
        sage: class MyClass:
246
        ...     def variance(self, bias = False):
247
        ...        return 1    
248
        sage: stats.variance(MyClass())
249
        1
250
        sage: class SillyPythonList:
251
        ...     def __init__(self):
252
        ...         self.__list = [2L,4L]
253
        ...     def __len__(self):
254
        ...         return len(self.__list)
255
        ...     def __iter__(self):
256
        ...         return self.__list.__iter__()
257
        ...     def mean(self):
258
        ...         return 3L
259
        sage: R = SillyPythonList()
260
        sage: variance(R)
261
        2
262
        sage: variance(R, bias=True)
263
        1
264
        
265

266
    TESTS:
267

268
    The performance issue from #10019 is solved::
269

270
        sage: variance([1] * 2^18)
271
        0
272
    """
273
    if hasattr(v, 'variance'): return v.variance(bias=bias)
274
    import numpy
275

276
    x = 0
277
    if isinstance(v, numpy.ndarray):
278
        # accounts for numpy arrays
279
        if bias == True:
280
            return v.var()
281
        elif bias == False:
282
            return v.var(ddof=1)
283
    if len(v) == 0:
284
        # variance of empty set defined as NaN
285
        return NaN
286

287
    mu = mean(v)
288
    for vi in v:
289
        x += (vi - mu)**2
290
    if bias:
291
        # population variance
292
        if isinstance(x, (int,long)):
293
            return x/ZZ(len(v))
294
        return x/len(v)
295
    else:
296
        # sample variance
297
        if isinstance(x, (int,long)):
298
            return x/ZZ(len(v)-1)
299
        return x/(len(v)-1)
300

301

302
def median(v):
303
    """
304
    Return the median (middle value) of the elements of `v`
305

306
    If `v` is empty, we define the median to be NaN, which is
307
    consistent with NumPy (note that R returns NULL).
308
    If `v` is comprised of strings, TypeError occurs. 
309
    For elements other than numbers, the median is a result of ``sorted()``.
310

311
    INPUT:
312

313
        - `v` -- a list
314

315
    OUTPUT:
316

317
        - median element of `v`
318

319
    EXAMPLES::
320

321
        sage: median([1,2,3,4,5])
322
        3
323
        sage: median([e, pi])
324
        1/2*pi + 1/2*e
325
        sage: median(['sage', 'linux', 'python'])
326
        'python' 
327
        sage: median([])
328
        NaN
329
        sage: class MyClass:
330
        ...      def median(self):
331
        ...         return 1
332
        sage: stats.median(MyClass())
333
        1
334
    """
335
    if hasattr(v, 'median'): return v.median()
336

337
    if len(v) == 0:
338
        # Median of empty set defined as NaN
339
        return NaN
340
    values = sorted(v)
341
    if len(values) % 2 == 1:
342
        return values[((len(values))+1)/2-1]
343
    else:
344
        lower = values[(len(values)+1)/2-1]
345
        upper = values[len(values)/2]
346
        return (lower + upper)/ZZ(2)
347

348
def moving_average(v, n):
349
    """
350
    Provides the moving average of a list `v`
351

352
    The moving average of a list is often used to smooth out noisy data.
353

354
    If `v` is empty, we define the entries of the moving average to be NaN.
355

356
    INPUT:
357
 
358
        - `v` -- a list
359

360
        - `n` -- the number of values used in computing each average.
361

362
    OUTPUT:
363

364
        - a list of length ``len(v)-n+1``, since we do not fabric any values
365
 
366
    EXAMPLES::
367

368
        sage: moving_average([1..10], 1)
369
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
370
        sage: moving_average([1..10], 4)
371
        [5/2, 7/2, 9/2, 11/2, 13/2, 15/2, 17/2]
372
        sage: moving_average([], 1)
373
        []
374
        sage: moving_average([pi, e, I, sqrt(2), 3/5], 2)
375
        [1/2*pi + 1/2*e, 1/2*e + 1/2*I, 1/2*sqrt(2) + 1/2*I, 1/2*sqrt(2) + 3/10]
376

377
    We check if the input is a time series, and if so use the
378
    optimized ``simple_moving_average`` method, but with (slightly
379
    different) meaning as defined above (the point is that the
380
    ``simple_moving_average`` on time series returns `n` values::
381
    
382
        sage: a = finance.TimeSeries([1..10])
383
        sage: stats.moving_average(a, 3)
384
        [2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]
385
        sage: stats.moving_average(list(a), 3)
386
        [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
387
        
388
    """
389
    if len(v) == 0:
390
        return v
391
    from sage.finance.time_series import TimeSeries
392
    if isinstance(v, TimeSeries):
393
        return v.simple_moving_average(n)[n-1:]
394
    n = int(n)
395
    if n <= 0:
396
        raise ValueError, "n must be positive"
397
    nn = ZZ(n)
398
    s = sum(v[:n])
399
    ans = [s/nn]
400
    for i in range(n,len(v)):
401
        # add in the i-th value in v to our running sum,
402
        # and remove the value n places back.
403
        s += v[i] - v[i-n]
404
        ans.append(s/nn)
405
    return ans
406

407
Product

Resources

Company