Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
junzis
GitHub Repository: junzis/openap
Path: blob/master/openap/extra/statistics.py
592 views
1
"""Fit data using different statistical models."""
2
3
import scipy.stats
4
from matplotlib import pyplot as plt
5
6
import numpy as np
7
8
9
def fit(data, models):
10
if not isinstance(models, list):
11
if isinstance(models, str):
12
models = [models]
13
else:
14
raise RuntimeError("models must be string or list of strings.")
15
16
data = np.array(data)
17
data = data[np.isfinite(data)]
18
data = data[data > np.percentile(data, 0.025)]
19
data = data[data < np.percentile(data, 99.975)]
20
21
# split data in training and testing 50-50
22
train, test = data[0:-1:1], data[1:-1:1]
23
24
# construct the bound for the model fitting
25
dmin = min(data) - 1e-8
26
dmax = max(data) + 1e-8
27
dscale = dmax - dmin
28
29
result = dict()
30
for model in models:
31
# fit distribution and run kstest
32
dist = getattr(scipy.stats, model)
33
if model == "norm":
34
param_train = dist.fit(train)
35
elif model == "gamma":
36
param_train = dist.fit(train, floc=dmin)
37
else:
38
param_train = dist.fit(train, floc=dmin, fscale=dscale)
39
40
ks = scipy.stats.kstest(test, model, param_train)
41
error = ks[0] # D-stats
42
43
# recompute distribution based on all data
44
if model == "norm":
45
param = dist.fit(data)
46
elif model == "gamma":
47
param = dist.fit(data, floc=dmin)
48
else:
49
param = dist.fit(data, floc=dmin, fscale=dscale)
50
51
# construct the example PDF for ploting
52
ci = dist.interval(0.999, *param)
53
xmin = ci[0] - dscale * 0.05
54
xmax = ci[1] + dscale * 0.05
55
pdfx = np.linspace(xmin, xmax, 1000)
56
pdfy = dist.pdf(pdfx, *param)
57
58
result[model] = dict()
59
result[model]["param"] = param
60
result[model]["pdfx"] = pdfx
61
result[model]["pdfy"] = pdfy
62
result[model]["error"] = error
63
64
# print(model, param, error)
65
66
return result
67
68
69
def fitplot(data, model, **kwargs):
70
fitresults = fit(data, model)
71
72
if "bins" in kwargs:
73
bins = kwargs["bins"]
74
del kwargs["bins"]
75
else:
76
bins = 20
77
78
data = np.array(data)
79
data = data[np.isfinite(data)]
80
plt.hist(data, bins=bins, normed=True, color="gray", edgecolor="none", alpha=0.3)
81
plt.plot(
82
fitresults[model]["pdfx"], fitresults[model]["pdfy"], label=model, **kwargs
83
)
84
# plt.legend(loc='best')
85
plt.xlim([min(data), max(data)])
86
plt.ylabel("density (-)")
87
plt.grid()
88
return plt
89
90