GitHub Repository: RWTH-EBC/ebcpy
Path: blob/master/ebcpy/preprocessing.py
"""
This general overview may help you find the function you need:

- Remove duplicate rows by averaging the values
  (``build_average_on_duplicate_rows``)
- Convert any integer or float index into a datetime index
  (``convert_index_to_datetime_index``)
- Resample a given time-series on a given frequency
  (``clean_and_space_equally_time_series``)
- Apply a low-pass filter (``low_pass_filter``)
- Apply a moving average to flatten disturbances
  in your measured data (``moving_average``)
- Convert e.g. an electrical power signal into a binary
  control signal (on-off) based on a threshold (``create_on_off_signal``)
- Find the number of rows that contain no values at all (``number_lines_totally_na``)
- Split a data-set into training and test set for
  cross-validation (``cross_validation``)

All functions in the pre-processing module should have a doctest. We refer to the example
in each doctest for a better understanding of the function. If you don't understand
the behaviour or purpose of a function, please raise an issue.
"""
import warnings
import logging
from typing import Union, TYPE_CHECKING

from datetime import datetime
from scipy import signal
from sklearn import model_selection
from pandas.tseries.frequencies import to_offset
import numpy as np
import pandas as pd
import scipy.stats as st

if TYPE_CHECKING:
    from ebcpy import TimeSeriesData

logger = logging.getLogger(__name__)

def build_average_on_duplicate_rows(df: Union[pd.DataFrame, "TimeSeriesData"]) -> pd.DataFrame:
    """
    If the DataFrame has duplicate indexes, the average
    value of all rows sharing such an index is calculated and assigned
    to the first occurrence of that duplicate index. Therefore,
    the DataFrame should already be sorted before calling this
    function.

    :param pd.DataFrame df:
        DataFrame with the data to process
    :return: pd.DataFrame
        The processed DataFrame

    Example:

    >>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> print(df)
                         val
    idx
    2007-01-01 00:00:01    0
    2007-01-01 00:00:01    1
    2007-01-01 00:00:01    2
    2007-01-01 00:00:01    3
    2007-01-01 00:00:01    4
    >>> print(build_average_on_duplicate_rows(df))
                         val
    idx
    2007-01-01 00:00:01  2.0
    """
    # Find entries that share exactly the same timestamp
    double_ind = df.index[df.index.duplicated()].unique()
    # Calculate the mean value for each duplicated timestamp
    mean_values = []
    for item in double_ind:
        mean_values.append(df.loc[item].values.mean(axis=0))
    # Delete duplicate indices
    df_dropped = df[~df.index.duplicated(keep='first')].copy()

    # Set mean values in rows that were duplicates before
    for idx, values in zip(double_ind, mean_values):
        df_dropped.loc[idx] = values

    return df_dropped

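
# Illustrative sketch (not part of the original module): the averaging is done
# column-wise when several columns share a duplicated index value. The helper
# name below (_example_average_duplicates) is hypothetical and only used here.
def _example_average_duplicates():
    df = pd.DataFrame(
        {"idx": [0.0, 0.0, 1.0], "power": [10.0, 30.0, 5.0], "temp": [20.0, 22.0, 21.0]}
    ).set_index("idx")
    averaged = build_average_on_duplicate_rows(df)
    # The two rows at idx=0.0 collapse into one row holding the column-wise mean.
    assert list(averaged.loc[0.0]) == [20.0, 21.0]
    assert len(averaged) == 2
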
def convert_index_to_datetime_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        unit_of_index: str = "s",
        origin: datetime = datetime.now(),
        inplace: bool = False
) -> pd.DataFrame:
    """
    Converts the index of the given DataFrame to a
    pandas.core.indexes.datetimes.DatetimeIndex.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame with an index that is not yet a DatetimeIndex.
        Only numeric indexes are supported. Every value is
        interpreted with the given unit; the default unit is seconds.
    :param str unit_of_index: default 's'
        The unit of the given index. Used to convert to
        total_seconds later on.
    :param datetime.datetime origin:
        The reference datetime object for the first index.
        Default is the current system time.
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: df
        Copy of DataFrame with correct index for usage in this
        framework.

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(df)
         A    B    C    D
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0

    """
    # Check for the unit of the given index. Maybe one uses hour-based data.
    _unit_conversion_to_seconds = {"ms": 1e3,
                                   "s": 1,
                                   "min": 1 / 60,
                                   "h": 1 / 3600,
                                   "d": 1 / 86400}
    if unit_of_index not in _unit_conversion_to_seconds:
        raise ValueError("Given unit_of_index is not supported.")
    _unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)

    # Convert
    old_index = df.index.copy()
    # Check if already converted:
    if isinstance(old_index, pd.DatetimeIndex):
        return df
    # Convert strings to numeric values.
    old_index = pd.to_numeric(old_index)
    # Convert to seconds.
    old_index /= _unit_factor_to_seconds
    # Alter the index
    index = pd.to_datetime(old_index, unit="s", origin=origin)
    if inplace:
        df.index = index
        return None
    df_copy = df.copy()
    df_copy.index = index
    return df_copy

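
# Illustrative sketch (not part of the original module): with unit_of_index="h",
# every index value is interpreted as hours, so an index of 0, 1, 2 becomes
# timestamps spaced one hour apart. The helper name is hypothetical.
def _example_hour_based_index():
    df = pd.DataFrame({"T": [20.0, 21.0, 22.0]}, index=[0, 1, 2])
    df_time = convert_index_to_datetime_index(
        df, unit_of_index="h", origin=datetime(2007, 1, 1))
    assert str(df_time.index[1]) == "2007-01-01 01:00:00"
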
def convert_datetime_index_to_float_index(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        offset: float = 0,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Convert a datetime-based index to a FloatIndex (in seconds).
    Seconds are used as the standard unit because simulation software
    (e.g. Modelica) outputs data in seconds.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame to be converted to FloatIndex
    :param float offset:
        Offset in seconds
    :param bool inplace:
        If True, performs operation inplace and returns None.
    :return: pd.DataFrame df:
        DataFrame with correct index

    Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
    >>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
                           A    B    C    D
    2007-01-01 00:00:00  1.0  1.0  1.0  1.0
    2007-01-01 00:00:01  1.0  1.0  1.0  1.0
    2007-01-01 00:00:02  1.0  1.0  1.0  1.0
    >>> print(convert_datetime_index_to_float_index(df))
           A    B    C    D
    0.0  1.0  1.0  1.0  1.0
    1.0  1.0  1.0  1.0  1.0
    2.0  1.0  1.0  1.0  1.0
    """
    # Check correct input
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible")

    new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset
    if inplace:
        df.index = new_index
        return None
    df_copy = df.copy()
    df_copy.index = new_index
    return df_copy

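
# Illustrative sketch (not part of the original module): converting a datetime
# index back to a float index yields seconds relative to the first timestamp,
# optionally shifted by an offset. The helper name is hypothetical.
def _example_round_trip_index_conversion():
    df = pd.DataFrame({"T": [20.0, 21.0, 22.0]}, index=[0.0, 10.0, 20.0])
    df_time = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    df_float = convert_datetime_index_to_float_index(df_time, offset=100.0)
    # The float index restarts at the offset and keeps the 10 s spacing.
    assert list(df_float.index) == [100.0, 110.0, 120.0]
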
def time_based_weighted_mean(df: Union[pd.DataFrame, "TimeSeriesData"]) -> np.ndarray:
    """
    Calculate the weighted mean of each column, with weights based on the time
    index, which does not need to be equidistant.
    Further info:
    https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas

    :param pd.DataFrame df:
        A pandas DataFrame with DatetimeIndex.
    :return np.array:
        A numpy array containing the weighted mean of each column

    Example:

    >>> from datetime import datetime
    >>> import numpy as np
    >>> import pandas as pd
    >>> time_vec = [datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,0),
    ...             datetime(2007,1,1,0,5),
    ...             datetime(2007,1,1,0,7),
    ...             datetime(2007,1,1,0,10)]
    >>> df = pd.DataFrame({'A': [1,2,4,3,6], 'B': [11,12,14,13,16]}, index=time_vec)
    >>> print(time_based_weighted_mean(df=df))
    [ 3.55 13.55]
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")

    # Each sample is weighted with the sum of its adjacent time steps.
    time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
    weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
    # Create empty numpy array
    res = np.empty(len(df.columns))
    res[:] = np.nan
    for i, col_name in enumerate(df.columns):
        res[i] = np.average(df[col_name], weights=weights)
    return res

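
# Illustrative sketch (not part of the original module): weighting each sample
# with the sum of its adjacent time steps makes the result identical to the
# trapezoidal time integral divided by the total duration. The helper name is
# hypothetical.
def _example_weighted_mean_equals_trapezoid():
    index = pd.to_datetime(["2007-01-01 00:00", "2007-01-01 00:05",
                            "2007-01-01 00:07", "2007-01-01 00:10"])
    df = pd.DataFrame({"A": [1.0, 4.0, 3.0, 6.0]}, index=index)
    seconds = (df.index - df.index[0]).total_seconds()
    trapezoid_mean = np.trapz(df["A"].values, x=seconds) / (seconds[-1] - seconds[0])
    assert np.isclose(time_based_weighted_mean(df)[0], trapezoid_mean)
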
def clean_and_space_equally_time_series(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        desired_freq: str,
        confidence_warning: float = 0.95
) -> pd.DataFrame:
    """
    Function for cleaning the given DataFrame and interpolating it
    based on the given desired frequency. Linear interpolation
    is used.

    :param pd.DataFrame,TimeSeriesData df:
        Unclean DataFrame. Needs to have a pd.DateTimeIndex
    :param str desired_freq:
        Frequency to determine the number of elements in the processed DataFrame.
        Options are for example:
        - s: second-based
        - 5s: Every 5 seconds
        - 6min: Every 6 minutes
        This also works for h, d, m, y, ms etc.
    :param float confidence_warning:
        Value to check the confidence interval of input data without
        a defined frequency. If the desired frequency is outside of
        the resulting confidence interval, a warning is issued.
    :return: pd.DataFrame
        Cleaned and equally spaced data-frame

    Example:
    **Note:** The example uses random data. Try out different sampling
    frequencies. You will be warned if the sampling rate is too high or too low.

    >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
    ...                   columns=list('ABCD')).set_index("A").sort_index()
    >>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
    >>> clean_and_space_equally_time_series(df, "30s")
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(df["B"], label="Raw data")
    >>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
    >>> plt.plot(df["B"], label="Cleaned and spaced equally")
    >>> plt.legend()
    >>> plt.show()

    .. versionchanged:: 0.1.7
    """
    from ebcpy import TimeSeriesData

    # Convert indexes to datetime_index:
    if not isinstance(df.index, pd.DatetimeIndex):
        if isinstance(df, TimeSeriesData):
            raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
                            "Call to_datetime_index() to convert any index to "
                            "a DateTimeIndex")
        # Else
        raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
                        "Call convert_index_to_datetime_index() to convert any index to "
                        "a DateTimeIndex")
    # %% Check DataFrame for NANs
    # Create a pandas Series with number of invalid values for each column of df
    series_with_na = df.isnull().sum()
    for name in series_with_na.index:
        if series_with_na.loc[name] > 0:
            # Print only columns with invalid values
            logger.info("%s has the following number of invalid "
                        "values\n %s", name, series_with_na.loc[name])
    # Drop all rows where at least one NA exists
    df_temp = df.dropna(how='any')

    # Check if DataFrame still has non-numeric values:
    if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
        raise ValueError("Given DataFrame contains non-numeric values.")

    # Merge duplicate rows using mean.
    df_temp = build_average_on_duplicate_rows(df_temp)

    # Make user warning for two cases: Upsampling and data input without a freq:
    # Check if the frequency differs
    old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
        df_index=df_temp.index,
        verbose=True)
    if old_freq_std > 0:
        _ns_to_s = 1e9
        # Calculate confidence interval of the mean value of the old frequency
        cfd_int = st.t.interval(confidence_warning,
                                time_steps - 1,
                                loc=old_freq,
                                scale=old_freq_sem)
        # Convert to timedelta
        cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
        _td_freq = pd.to_timedelta(desired_freq)
        if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
            in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s  # From nanoseconds
            warnings.warn(f"Input data has no frequency, but the desired frequency "
                          f"{_td_freq.value / _ns_to_s} seconds is outside the given "
                          f"confidence interval {in_seconds} (in seconds). "
                          "Carefully check the result to see if you "
                          "introduced errors to the data.")

    # %% Re-sampling to new frequency with linear interpolation
    # Create new equally spaced DatetimeIndex. Last entry is always < df.index[-1]
    time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
    new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)

    # Check if the user is trying to upsample the data:
    if old_freq_std == 0:
        if new_freq > old_freq:
            warnings.warn("You are upsampling your data. This may be dangerous. "
                          "Carefully check the result to see if you introduced errors to the data.")

    # Create an empty data frame
    # If multi-columns are used, first get the old column index and make it empty:
    multi_cols = df_temp.columns
    if isinstance(multi_cols, pd.MultiIndex):
        empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
                                                      names=multi_cols.names)
        df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
    else:
        df_time_temp = pd.DataFrame(index=time_index)

    # Insert temporary time_index into df. fill_value = 0 can only be used
    # because all NaNs have been eliminated before
    df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
    del df_time_temp

    # Interpolate linearly according to time index
    df_temp.interpolate(method='time', axis=0, inplace=True)
    # Determine Timedelta between the current first index entry
    # in df and the first index entry that would be created
    # when applying df.resample() without loffset
    delta_time = df.index[0] - \
                 df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
    # Resample to equally spaced index.
    # All fields should already have a value. Thus NaNs and maybe +/- infs
    # should have been filtered beforehand.

    # Check if the given DataFrame was a TimeSeriesData object and if so, convert it as such
    from ebcpy import TimeSeriesData
    if isinstance(df_temp, TimeSeriesData):
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
        df_temp = TimeSeriesData(df_temp)
    else:
        df_temp = df_temp.resample(rule=desired_freq).first()
        df_temp.index = df_temp.index + to_offset(delta_time)
    del delta_time

    return df_temp

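
# Illustrative sketch (not part of the original module): build an irregularly
# sampled series with a duplicate timestamp and a NaN row, then clean and
# resample it onto an equally spaced 10 s grid. The helper name is hypothetical.
def _example_clean_and_space_equally():
    index = pd.to_datetime([0, 3, 3, 11, 25, 40], unit="s", origin=datetime(2007, 1, 1))
    df = pd.DataFrame({"T": [20.0, 21.0, 23.0, np.nan, 24.0, 26.0]}, index=index)
    df_clean = clean_and_space_equally_time_series(df, desired_freq="10s")
    # The result is equally spaced; NaN rows are dropped and duplicates averaged
    # before the linear interpolation onto the 10 s grid.
    assert (df_clean.index[1:] - df_clean.index[:-1] == pd.Timedelta("10s")).all()
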
def low_pass_filter(data: np.ndarray, crit_freq: float, filter_order: int) -> np.ndarray:
    """
    Create and apply a low-pass filter with the given order and critical frequency.

    :param numpy.ndarray data:
        For a DataFrame, e.g. df['a_col_name'].values
    :param float crit_freq:
        The critical frequency or frequencies (normalized to the Nyquist frequency).
    :param int filter_order:
        The order of the filter
    :return: numpy.ndarray

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> rand_series = np.random.rand(100)
    >>> plt.plot(rand_series, label="reference")
    >>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    _filter_order = int(filter_order)
    numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
                                           btype='low', analog=False, output='ba')
    output = signal.filtfilt(numerator, denominator, data)
    return output

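
# Illustrative sketch (not part of the original module): because the filter is
# digital and no sampling frequency is passed to scipy, crit_freq is normalized
# to the Nyquist frequency (1.0 == fs / 2). The helper and parameter names are
# hypothetical and show how to derive crit_freq from a physical cutoff.
def _example_low_pass_cutoff_in_hz(data: np.ndarray, cutoff_hz: float, fs_hz: float):
    # Normalize the desired cutoff (e.g. 0.05 Hz) by the Nyquist frequency.
    wn = cutoff_hz / (fs_hz / 2.0)
    return low_pass_filter(data, crit_freq=wn, filter_order=2)
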
def moving_average(data: np.ndarray, window: int) -> np.ndarray:
    """
    Creates a moving average of the input array.

    :param np.ndarray data:
        For a DataFrame, e.g. df['a_col_name'].values
    :param int window:
        Window size of the moving average
    :return: numpy.array
        Array of the same length as the input. The first and last points are
        extrapolated as constant values (hold first and last averaged point).

    Example:

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> series = np.sin(np.linspace(-30, 30, 1000))
    >>> plt.plot(series, label="reference")
    >>> plt.plot(moving_average(series, 10), label="window=10")
    >>> plt.plot(moving_average(series, 50), label="window=50")
    >>> plt.plot(moving_average(series, 100), label="window=100")
    >>> plt.legend()
    >>> plt.show()

    """
    if len(data.shape) > 1:  # Check if given data has multiple dimensions
        if data.shape[1] == 1:
            data = data[:, 0]  # Resize to 1D-Array
        else:
            raise ValueError("Given data has multiple dimensions. "
                             "Only one-dimensional arrays are supported in this function.")
    window = int(window)
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(data, weights, 'valid')
    # Create array with the first averaged value repeated window/2 times
    fill_start = np.full((int(np.floor(window / 2)), 1), sma[0])
    # Same with the last averaged value
    fill_end = np.full((int(np.ceil(window / 2)) - 1, 1), sma[-1])
    # Stack the arrays
    sma = np.concatenate((fill_start[:, 0], sma, fill_end[:, 0]), axis=0)
    return sma

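
# Illustrative sketch (not part of the original module): the padding at both
# ends keeps the output exactly as long as the input, so the smoothed signal
# can be plotted or stored next to the raw column. The helper name is hypothetical.
def _example_moving_average_keeps_length():
    noisy = np.sin(np.linspace(0, 10, 500)) + np.random.normal(0, 0.1, 500)
    smoothed = moving_average(noisy, window=25)
    assert smoothed.shape == noisy.shape
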
def create_on_off_signal(
        df: Union[pd.DataFrame, "TimeSeriesData"],
        col_names: list,
        threshold: Union[float, list],
        col_names_new: list,
        tags: Union[list, str] = "raw",
        new_tag: str = "converted_signal"
):
    """
    Create on and off signals based on the given threshold for all column names.

    :param pd.DataFrame,TimeSeriesData df:
        DataFrame with the data to process
    :param list col_names:
        Column names of variables to convert to signals
    :param float,list threshold:
        Threshold for all column-names (single float) or
        a list with specific thresholds for specific columns.
    :param list col_names_new:
        New names for the signal-columns
    :param str,list tags:
        If a 2-level DataFrame for TimeSeriesData is used, one has to
        specify the tag of the variables. Default is to use the "raw"
        tag set in the TimeSeriesData class. However, one can specify a list
        (a different tag for each variable), or one can pass a string
        (the same tag for all given variables).
    :param str new_tag:
        The tag the newly created variable will hold. This can be used to
        indicate where the signal was converted from.
    :return: pd.DataFrame
        Copy of DataFrame with the created signals added.

    Example:

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
    >>> df = create_on_off_signal(df, col_names=["P_el"],
    ...                           threshold=25, col_names_new=["Device On"])
    >>> plt.plot(df)
    >>> plt.show()
    """
    if len(col_names) != len(col_names_new):
        raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                         f"col_names_new: {len(col_names_new)}")
    if isinstance(threshold, list):
        if len(col_names) != len(threshold):
            raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
                             f"threshold: {len(threshold)}")
    else:
        threshold = [threshold for _ in enumerate(col_names)]
    # Do on_off signal creation for all desired columns
    df_copy = df.copy()
    if isinstance(df.columns, pd.MultiIndex):
        # Convert given tags to a list
        if isinstance(tags, str):
            tags = [tags for _ in enumerate(col_names)]

        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[
                df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0
    else:
        for i, _ in enumerate(col_names):
            # Create zero-array
            df_copy.loc[:, col_names_new[i]] = 0.0
            # Change all values to 1.0 according to threshold
            df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0
    return df_copy

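
# Illustrative sketch (not part of the original module): a list of thresholds
# applies a separate threshold to each column. The helper name is hypothetical.
def _example_per_column_thresholds():
    df = pd.DataFrame({"P_el": [0.0, 30.0, 80.0], "m_flow": [0.0, 0.02, 0.2]})
    df_sig = create_on_off_signal(
        df,
        col_names=["P_el", "m_flow"],
        threshold=[25.0, 0.1],  # one threshold per column
        col_names_new=["Heater On", "Pump On"],
    )
    assert list(df_sig["Heater On"]) == [0.0, 1.0, 1.0]
    assert list(df_sig["Pump On"]) == [0.0, 0.0, 1.0]
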
def number_lines_totally_na(df: Union[pd.DataFrame, "TimeSeriesData"]) -> int:
    """
    Returns the number of rows in the given dataframe
    that are filled with NaN-values.

    :param pd.DataFrame,TimeSeriesData df:
        Given dataframe to process
    :return: int
        Number of NaN-Rows.

    Example:

    >>> import numpy as np
    >>> import pandas as pd
    >>> dim = np.random.randint(100) + 10
    >>> nan_col = [np.NaN for i in range(dim)]
    >>> col = [i for i in range(dim)]
    >>> df_nan = pd.DataFrame({"col_1": nan_col, "col_2": nan_col})
    >>> df_normal = pd.DataFrame({"col_1": nan_col, "col_2": col})
    >>> print(number_lines_totally_na(df_nan) - dim)
    0
    >>> print(number_lines_totally_na(df_normal))
    0
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Input must be a pandas data frame')
    counter = 0
    for _, row in df.iterrows():
        # Check if the whole row is filled with NaNs.
        if all(row.isnull()):
            counter += 1
    return counter

def z_score(x: np.ndarray, limit=3) -> np.ndarray:
    """
    Calculate the z-score using the mean
    and standard deviation of the given data.

    :param np.array x:
        For a DataFrame, e.g. df['a_col_name'].values
    :param float limit: default 3
        Threshold for the absolute z-score above which a value is flagged
    :return: np.array:
        Indices of all values whose absolute z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    mean = np.mean(x)
    standard_deviation = np.std(x)
    z_score_value = (x - mean) / standard_deviation
    return np.where(np.abs(z_score_value) > limit)[0]

def modified_z_score(x: np.ndarray, limit: float = 3.5) -> np.ndarray:
    """
    Calculate the modified z-score using the median
    and median absolute deviation of the given data.

    :param np.array x:
        For a DataFrame, e.g. df['a_col_name'].values
    :param float limit: default 3.5
        Threshold for the absolute modified z-score above which a value is flagged
    :return: np.array:
        Indices of all values whose absolute modified z-score exceeds the limit

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = modified_z_score(normal_dis, limit=2)
    >>> values = normal_dis[res]

    """
    median = np.median(x)
    median_absolute_deviation = np.median(np.abs(x - median))
    z_score_mod = 0.6745 * (x - median) / median_absolute_deviation
    return np.where(np.abs(z_score_mod) > limit)[0]

def interquartile_range(x: np.ndarray) -> np.ndarray:
    """
    Calculate the interquartile range of the given array and
    return the indices of values outside the 1.5 * IQR fences.

    :param np.array x:
        For a DataFrame, e.g. df['a_col_name'].values
    :return: np.array:
        Indices of all values below quartile_1 - 1.5 * IQR or
        above quartile_3 + 1.5 * IQR

    Example:

    >>> import numpy as np
    >>> normal_dis = np.random.normal(0, 1, 1000)
    >>> res = interquartile_range(normal_dis)
    >>> values = normal_dis[res]

    """
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    iqr = quartile_3 - quartile_1
    lower = quartile_1 - (iqr * 1.5)
    upper = quartile_3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))[0]

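
# Illustrative sketch (not part of the original module): z_score, modified_z_score
# and interquartile_range all return *indices* of suspected outliers, so cleaning
# a column means deleting those indices. The helper name is hypothetical.
def _example_remove_outliers():
    values = np.concatenate([np.random.normal(0, 1, 500), np.array([25.0, -30.0])])
    outlier_idx = modified_z_score(values, limit=3.5)
    cleaned = np.delete(values, outlier_idx)
    # The two injected extreme values are removed; the bulk of the data remains.
    assert cleaned.max() < 25.0 and cleaned.min() > -30.0
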
def cross_validation(x, y, test_size=0.3):
    """
    Split the data set randomly with test_size
    (if test_size = 0.30, 70 % of the data is used for training).
    You can use this function for segmentation tasks.
    Time-series data should not be split with this function,
    as the results would not be coherent in time.

    :param x:
        Indexables with same length / shape[0] as y.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas DataFrames.
    :param list,np.ndarray,pd.DataFrame y:
        Indexables with same length / shape[0] as x.
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas DataFrames.
    :param float test_size:
        Value between 0 and 1 specifying what fraction of the data
        will be used for testing.
    :return: list
        Split data into 4 objects. The order is:
        x_train, x_test, y_train, y_test

    Example:

    >>> import numpy as np
    >>> x = np.random.rand(100)
    >>> y = np.random.rand(100)
    >>> ret = cross_validation(x, y)
    >>> len(ret)
    4
    """
    return model_selection.train_test_split(x, y, test_size=test_size)

def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
    """
    Function to get the mean and standard deviation of the index frequency.
    If the index is a DatetimeIndex, the time steps are converted from
    nanoseconds to seconds.
    Otherwise, the index values are assumed to be in seconds already.

    :param pd.Index df_index:
        Time index.
    :param bool verbose:
        Default false. If true, the standard error of the mean and the number
        of time steps are returned in addition to the mean value and the
        standard deviation.

    :returns:
        float: Mean value
        float: Standard deviation
    """
    if isinstance(df_index, pd.DatetimeIndex):
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9
    else:
        index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64)
    if verbose:
        return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s)
    else:
        return np.mean(index_in_s), np.std(index_in_s)
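
# Illustrative sketch (not part of the original module): for an equally spaced
# 10 s DatetimeIndex the mean step is 10.0 s with zero standard deviation; an
# irregular index yields a non-zero deviation, which is what
# clean_and_space_equally_time_series uses to decide whether to warn.
# The helper name is hypothetical.
def _example_index_frequency():
    regular = pd.date_range("2007-01-01", periods=5, freq="10s")
    mean_step, std_step = get_df_index_frequency_mean_and_std(regular)
    assert mean_step == 10.0 and std_step == 0.0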