"""
This general overview may help you find the function you need:
- Remove duplicate rows by averaging the values
(``build_average_on_duplicate_rows``)
- Convert any integer or float index into a datetime index
(``convert_index_to_datetime_index``)
- Resample a given time-series on a given frequency
(``clean_and_space_equally_time_series``)
- Apply a low-pass-filter (``low_pass_filter``)
- Apply a moving average to flatten disturbances
in your measured data (``moving_average``)
- Convert e.g. an electrical power signal into a binary
control signal (on-off) based on a threshold (``create_on_off_signal``)
- Find the number of lines without any values in it (``number_lines_totally_na``)
- Split a data-set into training and test set according to
cross-validation (``cross_validation``)
All functions in the pre-processing module have a doctest. Refer to the example
in each doctest for a better understanding of the function. If the behaviour
or the purpose of a function is unclear, please raise an issue.
"""
import warnings
import logging
from typing import Union, TYPE_CHECKING
from datetime import datetime
from scipy import signal
from sklearn import model_selection
from pandas.tseries.frequencies import to_offset
import numpy as np
import pandas as pd
import scipy.stats as st
if TYPE_CHECKING:
from ebcpy import TimeSeriesData
logger = logging.getLogger(__name__)
def build_average_on_duplicate_rows(df: Union[pd.DataFrame, "TimeSeriesData"]) -> pd.DataFrame:
"""
If the dataframe has duplicate indexes, the average
value of all rows sharing an index is assigned to
the first occurrence of that index. Therefore, the
DataFrame should already be sorted before calling this
function.
:param pd.DataFrame df:
DataFrame with the data to process
:return: pd.DataFrame
The processed DataFrame
Example:
>>> df = pd.DataFrame({"idx": np.ones(5), "val": np.arange(5)}).set_index("idx")
>>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
>>> print(df)
val
idx
2007-01-01 00:00:01 0
2007-01-01 00:00:01 1
2007-01-01 00:00:01 2
2007-01-01 00:00:01 3
2007-01-01 00:00:01 4
>>> print(build_average_on_duplicate_rows(df))
val
idx
2007-01-01 00:00:01 2.0
"""
double_ind = df.index[df.index.duplicated()].unique()
mean_values = []
for item in double_ind:
mean_values.append(df.loc[item].values.mean(axis=0))
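# Keep only the first occurrence of each duplicate and overwrite it with the group mean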
df_dropped = df[~df.index.duplicated(keep='first')].copy()
for idx, values in zip(double_ind, mean_values):
df_dropped.loc[idx] = values
return df_dropped
def convert_index_to_datetime_index(
df: Union[pd.DataFrame, "TimeSeriesData"],
unit_of_index: str = "s",
origin: datetime = datetime.now(),
inplace: bool = False
) -> pd.DataFrame:
"""
Converts the index of the given DataFrame to a
pandas.core.indexes.datetimes.DatetimeIndex.
:param pd.DataFrame,TimeSeriesData df:
DataFrame whose index is not yet a DatetimeIndex.
Only numeric indexes are supported. Every value
is interpreted with the given unit; the default
unit is seconds.
:param str unit_of_index: default 's'
The unit of the given index. Used to convert to
total_seconds later on.
:param datetime.datetime origin:
The reference datetime object for the first index.
Default is the current system time.
:param bool inplace:
If True, performs operation inplace and returns None.
:return: df
Copy of DataFrame with correct index for usage in this
framework.
Example:
>>> import pandas as pd
>>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
>>> print(df)
A B C D
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
>>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
A B C D
2007-01-01 00:00:00 1.0 1.0 1.0 1.0
2007-01-01 00:00:01 1.0 1.0 1.0 1.0
2007-01-01 00:00:02 1.0 1.0 1.0 1.0
"""
_unit_conversion_to_seconds = {"ms": 1e3,
"s": 1,
"min": 1 / 60,
"h": 1 / 3600,
"d": 1 / 86400}
if unit_of_index not in _unit_conversion_to_seconds:
raise ValueError("Given unit_of_index is not supported.")
_unit_factor_to_seconds = _unit_conversion_to_seconds.get(unit_of_index)
old_index = df.index.copy()
if isinstance(old_index, pd.DatetimeIndex):
# Already a DatetimeIndex: nothing to convert
return None if inplace else df
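# Normalize the numeric index to seconds, then let pandas build the DatetimeIndex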
old_index = pd.to_numeric(old_index)
old_index /= _unit_factor_to_seconds
index = pd.to_datetime(old_index, unit="s", origin=origin)
if inplace:
df.index = index
return None
df_copy = df.copy()
df_copy.index = index
return df_copy
def convert_datetime_index_to_float_index(
df: Union[pd.DataFrame, "TimeSeriesData"],
offset: float = 0,
inplace: bool = False
) -> pd.DataFrame:
"""
Convert a datetime-based index to FloatIndex (in seconds).
Seconds are used as the standard unit, as simulation
software (e.g. Modelica) outputs data in seconds.
:param pd.DataFrame,TimeSeriesData df:
DataFrame to be converted to FloatIndex
:param float offset:
Offset in seconds
:param bool inplace:
If True, performs operation inplace and returns None.
:return: pd.DataFrame df:
DataFrame with correct index
Example:
>>> import pandas as pd
>>> df = pd.DataFrame(np.ones([3, 4]), columns=list('ABCD'))
>>> print(convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1)))
A B C D
2007-01-01 00:00:00 1.0 1.0 1.0 1.0
2007-01-01 00:00:01 1.0 1.0 1.0 1.0
2007-01-01 00:00:02 1.0 1.0 1.0 1.0
>>> print(convert_datetime_index_to_float_index(df))
A B C D
0.0 1.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0 1.0
2.0 1.0 1.0 1.0 1.0
"""
if not isinstance(df.index, pd.DatetimeIndex):
raise IndexError("Given DataFrame has no DatetimeIndex, conversion not possible")
new_index = np.round(pd.to_timedelta(df.index - df.index[0]).total_seconds(), 4) + offset
if inplace:
df.index = new_index
return None
df_copy = df.copy()
df_copy.index = new_index
return df_copy
def time_based_weighted_mean(df: Union[pd.DataFrame, "TimeSeriesData"]) -> np.ndarray:
"""
Calculates the time-weighted mean over a time index that does not need to be equidistant.
Further info:
https://stackoverflow.com/questions/26343252/create-a-weighted-mean-for-a-irregular-timeseries-in-pandas
:param pd.DataFrame df:
A pandas DataFrame with DatetimeIndex.
:return np.array:
A numpy array containing weighted means of all columns
Example:
>>> from datetime import datetime
>>> import numpy as np
>>> import pandas as pd
>>> time_vec = [datetime(2007,1,1,0,0),
...             datetime(2007,1,1,0,0),
...             datetime(2007,1,1,0,5),
...             datetime(2007,1,1,0,7),
...             datetime(2007,1,1,0,10)]
>>> df = pd.DataFrame({'A': [1,2,4,3,6], 'B': [11,12,14,13,16]}, index=time_vec)
>>> print(time_based_weighted_mean(df=df))
[ 3.55 13.55]
"""
if not isinstance(df.index, pd.DatetimeIndex):
raise IndexError(f"df.index must be DatetimeIndex, but it is {type(df.index)}.")
time_delta = [(x - y).total_seconds() for x, y in zip(df.index[1:], df.index[:-1])]
weights = [x + y for x, y in zip([0] + time_delta, time_delta + [0])]
res = np.full(len(df.columns), np.nan)
for i, col_name in enumerate(df.columns):
res[i] = np.average(df[col_name], weights=weights)
return res
def clean_and_space_equally_time_series(
df: Union[pd.DataFrame, "TimeSeriesData"],
desired_freq: str,
confidence_warning: float = 0.95
) -> pd.DataFrame:
"""
Function for cleaning of the given dataFrame and interpolating
based on the given desired frequency. Linear interpolation
is used.
:param pd.DataFrame,TimeSeriesData df:
Unclean DataFrame. Needs to have a pd.DateTimeIndex
:param str desired_freq:
Frequency to determine number of elements in processed dataframe.
Options are for example:
- s: second-based
- 5s: Every 5 seconds
- 6min: Every 6 minutes
This also works for h, d, m, y, ms etc.
:param float confidence_warning:
Value to check the confidence interval of input data without
a defined frequency. If the desired frequency is outside of
the resulting confidence interval, a warning is issued.
:return: pd.DataFrame
Cleaned and equally spaced data-frame
Example:
**Note:** The example uses random data. Try out different sampling
frequencies. You will be warned if the sampling rate is too high or too low.
>>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)),
...                   columns=list('ABCD')).set_index("A").sort_index()
>>> df = convert_index_to_datetime_index(df, origin=datetime(2007, 1, 1))
>>> clean_and_space_equally_time_series(df, "30s")
>>> import matplotlib.pyplot as plt
>>> plt.plot(df["B"], label="Raw data")
>>> df = clean_and_space_equally_time_series(df.copy(), "1500ms")
>>> plt.plot(df["B"], label="Clead and spaced equally")
>>> plt.legend()
>>> plt.show()
.. versionchanged:: 0.1.7
"""
from ebcpy import TimeSeriesData
if not isinstance(df.index, pd.DatetimeIndex):
if isinstance(df, TimeSeriesData):
raise TypeError("TimeSeriesData needs a DateTimeIndex for executing this function. "
"Call to_datetime_index() to convert any index to "
"a DateTimeIndex")
raise TypeError("DataFrame needs a DateTimeIndex for executing this function. "
"Call convert_index_to_datetime_index() to convert any index to "
"a DateTimeIndex")
series_with_na = df.isnull().sum()
for name in series_with_na.index:
if series_with_na.loc[name] > 0:
logger.info("%s has following number of invalid "
"values\n %s", name, series_with_na.loc[name])
df_temp = df.dropna(how='any')
if not all(df_temp.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all())):
raise ValueError("Given DataFrame contains non-numeric values.")
df_temp = build_average_on_duplicate_rows(df_temp)
old_freq, old_freq_std, old_freq_sem, time_steps = get_df_index_frequency_mean_and_std(
df_index=df_temp.index,
verbose=True)
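# A non-zero std means the raw index is not equidistant: check whether the desired
# frequency lies inside the confidence interval of the observed time steps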
if old_freq_std > 0:
_ns_to_s = 1e9
cfd_int = st.t.interval(confidence_warning,
time_steps - 1,
loc=old_freq,
scale=old_freq_sem)
cfd_int = pd.to_timedelta((cfd_int[0] * _ns_to_s, cfd_int[1] * _ns_to_s))
_td_freq = pd.to_timedelta(desired_freq)
if (_td_freq < cfd_int[0]) or (_td_freq > cfd_int[1]):
in_seconds = np.array(cfd_int.values.tolist()) / _ns_to_s
warnings.warn(f"Input data has no frequency, but the desired frequency "
f"{_td_freq.value / _ns_to_s} seconds is outside the given "
f"confidence interval {in_seconds} (in seconds) "
"Carefully check the result to see if you "
"introduced errors to the data.")
time_index = pd.date_range(start=df.index[0], end=df.index[-1], freq=desired_freq)
new_freq, _ = get_df_index_frequency_mean_and_std(df_index=time_index)
if old_freq_std == 0:
# The mean time step is the sampling period: a smaller new step means upsampling
if new_freq < old_freq:
warnings.warn("You are upsampling your data. This may be dangerous. "
"Carefully check the result to see if you introduced errors to the data.")
multi_cols = df_temp.columns
if isinstance(multi_cols, pd.MultiIndex):
empty_multi_cols = pd.MultiIndex.from_product([[] for _ in range(multi_cols.nlevels)],
names=multi_cols.names)
df_time_temp = pd.DataFrame(index=time_index, columns=empty_multi_cols)
else:
df_time_temp = pd.DataFrame(index=time_index)
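# Merge the new equally spaced grid into the data (new rows become NaN),
# then fill them by time-based linear interpolation below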
df_temp = df_temp.radd(df_time_temp, axis='index', fill_value=0)
del df_time_temp
df_temp.interpolate(method='time', axis=0, inplace=True)
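# resample() snaps timestamps to fixed offsets; compute the shift needed
# so the first stamp matches the original start time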
delta_time = df.index[0] - \
df_temp.resample(rule=desired_freq).first().first(desired_freq).index[0]
# resample() returns a plain DataFrame, so remember whether to re-wrap as TimeSeriesData
is_tsd = isinstance(df_temp, TimeSeriesData)
df_temp = df_temp.resample(rule=desired_freq).first()
df_temp.index = df_temp.index + to_offset(delta_time)
if is_tsd:
df_temp = TimeSeriesData(df_temp)
del delta_time
return df_temp
def low_pass_filter(data: np.ndarray, crit_freq: float, filter_order: int) -> np.ndarray:
"""
Create a low pass filter with given order and frequency.
:param numpy.ndarray data:
For dataframe e.g. df['a_col_name'].values
:param float crit_freq:
The critical frequency, normalized between 0 and 1,
where 1 is the Nyquist frequency.
:param int filter_order:
The order of the filter
:return: numpy.ndarray
The filtered output signal
Example:
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> rand_series = np.random.rand(100)
>>> plt.plot(rand_series, label="reference")
>>> plt.plot(low_pass_filter(rand_series, 0.2, 2), label="filtered")
>>> plt.legend()
>>> plt.show()
"""
if len(data.shape) > 1:
if data.shape[1] == 1:
data = data[:, 0]
else:
raise ValueError("Given data has multiple dimensions. "
"Only one-dimensional arrays are supported in this function.")
_filter_order = int(filter_order)
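# Design a digital Butterworth low-pass filter and apply it forwards
# and backwards (filtfilt) for zero phase shift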
numerator, denominator = signal.butter(N=_filter_order, Wn=crit_freq,
btype='low', analog=False, output='ba')
output = signal.filtfilt(numerator, denominator, data)
return output
def moving_average(data: np.ndarray, window: int) -> np.ndarray:
"""
Creates a moving average of the input array.
:param np.ndarray data:
For dataframe e.g. df['a_col_name'].values
:param int window:
Window size of the moving average (number of samples)
:return: numpy.array
Array of the same shape as the input. The first and last
points are padded with the first and last moving-average
value (constant hold).
Example:
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> series = np.sin(np.linspace(-30, 30, 1000))
>>> plt.plot(series, label="reference")
>>> plt.plot(moving_average(series, 10), label="window=10")
>>> plt.plot(moving_average(series, 50), label="window=50")
>>> plt.plot(moving_average(series, 100), label="window=100")
>>> plt.legend()
>>> plt.show()
"""
if len(data.shape) > 1:
if data.shape[1] == 1:
data = data[:, 0]
else:
raise ValueError("Given data has multiple dimensions. "
"Only one-dimensional arrays are supported in this function.")
window = int(window)
weights = np.repeat(1.0, window) / window
sma = np.convolve(data, weights, 'valid')
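# 'valid' convolution drops window-1 samples; pad with the first/last
# average below to restore the input length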
fill_start = np.full(window // 2, sma[0])
fill_end = np.full((window + 1) // 2 - 1, sma[-1])
sma = np.concatenate((fill_start, sma, fill_end))
return sma
def create_on_off_signal(
df: Union[pd.DataFrame, "TimeSeriesData"],
col_names: list,
threshold: Union[float, list],
col_names_new: list,
tags: Union[list, str] = "raw",
new_tag: str = "converted_signal"
):
"""
Create on and off signals based on the given threshold for all column names.
:param pd.DataFrame,TimeSeriesData df:
DataFrame with the data to process
:param list col_names:
Column names of variables to convert to signals
:param float,list threshold:
Threshold for all column-names (single float) or
a list with specific thresholds for specific columns.
:param list col_names_new:
New name for the signal-column
:param str,list tags:
If a 2-level DataFrame (TimeSeriesData) is used, one has to
specify the tag of the variables. The default is to use the "raw"
tag set in the TimeSeriesData class. Alternatively, one can pass a list
(a different tag for each variable) or a string
(the same tag for all given variables).
:param str new_tag:
The tag the newly created variable will hold. This can be used to
indicate where the signal was converted from.
:return: pd.DataFrame
Copy of DataFrame with the created signals added.
Example:
>>> import matplotlib.pyplot as plt
>>> import numpy as np
>>> df = pd.DataFrame({"P_el": np.sin(np.linspace(-20, 20, 10000))*100})
>>> df = create_on_off_signal(df, col_names=["P_el"],
>>> threshold=25, col_names_new=["Device On"])
>>> plt.plot(df)
>>> plt.show()
"""
if len(col_names) != len(col_names_new):
raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
f"col_names_new: {len(col_names_new)}")
if isinstance(threshold, list):
if len(col_names) != len(threshold):
raise IndexError(f"Given lists differ in length. col_names: {len(col_names)}, "
f"threshold: {len(threshold)}")
else:
threshold = [threshold for _ in enumerate(col_names)]
df_copy = df.copy()
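# MultiIndex columns (TimeSeriesData) are addressed with (variable, tag) tuples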
if isinstance(df.columns, pd.MultiIndex):
if isinstance(tags, str):
tags = [tags for _ in enumerate(col_names)]
for i, _ in enumerate(col_names):
df_copy.loc[:, (col_names_new[i], new_tag)] = 0.0
df_copy.loc[
df_copy[col_names[i], tags[i]] >= threshold[i], (col_names_new[i], new_tag)] = 1.0
else:
for i, _ in enumerate(col_names):
df_copy.loc[:, col_names_new[i]] = 0.0
df_copy.loc[df_copy[col_names[i]] >= threshold[i], col_names_new[i]] = 1.0
return df_copy
def number_lines_totally_na(df: Union[pd.DataFrame, "TimeSeriesData"]) -> int:
"""
Returns the number of rows in the given dataframe
that are filled with NaN-values.
:param pd.DataFrame,TimeSeriesData df:
Given dataframe to process
:return: int
Number of NaN-Rows.
Example:
>>> import numpy as np
>>> import pandas as pd
>>> dim = np.random.randint(100) + 10
>>> nan_col = [np.nan for i in range(dim)]
>>> col = [i for i in range(dim)]
>>> df_nan = pd.DataFrame({"col_1":nan_col, "col_2":nan_col})
>>> df_normal = pd.DataFrame({"col_1":nan_col, "col_2":col})
>>> print(number_lines_totally_na(df_nan)-dim)
0
>>> print(number_lines_totally_na(df_normal))
0
"""
if not isinstance(df, pd.DataFrame):
raise TypeError('Input must be a pandas data frame')
# Count rows where every entry is NaN
return int(df.isnull().all(axis=1).sum())
def z_score(x: np.ndarray, limit: float = 3) -> np.ndarray:
"""
Calculate the z-score using the mean
and standard deviation of the given data.
:param np.array x:
For dataframe e.g. df['a_col_name'].values
:param float limit: default 3
Limit of the z-score: indices of values whose
absolute z-score exceeds this limit are returned
:return: np.array:
Indices of the outliers
Example:
>>> import numpy as np
>>> normal_dis = np.random.normal(0, 1, 1000)
>>> res = z_score(normal_dis, limit=2)
>>> values = normal_dis[res]
"""
mean = np.mean(x)
standard_deviation = np.std(x)
z_score_value = (x - mean) / standard_deviation
return np.where(np.abs(z_score_value) > limit)[0]
def modified_z_score(x: np.ndarray, limit: float = 3.5) -> np.ndarray:
"""
Calculate the modified z-score using the median
and median average deviation of the given data.
:param np.array x:
For dataframe e.g. df['a_col_name'].values
:param float limit: default 3.5
Limit of the modified z-score: indices of values whose
absolute modified z-score exceeds this limit are returned
:return: np.array:
Indices of the outliers
Example:
>>> import numpy as np
>>> normal_dis = np.random.normal(0, 1, 1000)
>>> res = modified_z_score(normal_dis, limit=2)
>>> values = normal_dis[res]
"""
median = np.median(x)
median_average_deviation = np.median(np.abs(x - median))
z_score_mod = 0.6745 * (x - median) / median_average_deviation
return np.where(np.abs(z_score_mod) > limit)[0]
def interquartile_range(x: np.ndarray) -> np.ndarray:
"""
Calculate the interquartile range of the given array and
return the indices of values outside 1.5 times the
interquartile range (Tukey's fences).
:param np.array x:
For dataframe e.g. df['a_col_name'].values
:return: np.array:
Indices of the outliers
Example:
>>> import numpy as np
>>> normal_dis = np.random.normal(0, 1, 1000)
>>> res = interquartile_range(normal_dis)
>>> values = normal_dis[res]
"""
quartile_1, quartile_3 = np.percentile(x, [25, 75])
iqr = quartile_3 - quartile_1
lower = quartile_1 - (iqr * 1.5)
upper = quartile_3 + (iqr * 1.5)
return np.where((x > upper) | (x < lower))[0]
def cross_validation(x, y, test_size=0.3):
"""
Split a data set randomly according to test_size
(if test_size = 0.30, 70 % of the data is used for training).
You can use this function for segmentation tasks.
Time-series data should not be split with this function,
as the resulting sets are not coherent in time.
:param x:
Indexables with same length / shape[0] as y.
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.
:param list,np.ndarray,pd.DataFrame y:
Indexables with same length / shape[0] as x.
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.
:param float test_size:
Value between 0 and 1 specifying what percentage of the data
will be used for testing.
:return: list
Split data into 4 objects. The order is:
x_train, x_test, y_train, y_test
Example:
>>> import numpy as np
>>> x = np.random.rand(100)
>>> y = np.random.rand(100)
>>> ret = cross_validation(x, y)
>>> len(ret)
4
"""
return model_selection.train_test_split(x, y, test_size=test_size)
def get_df_index_frequency_mean_and_std(df_index: pd.Index, verbose: bool = False):
"""
Function to get the mean and standard deviation of the index frequency.
If the index is a DatetimeIndex, the time steps are converted from
nanoseconds to seconds.
Otherwise, the values are assumed to already be in seconds.
:param pd.Index df_index:
Time index.
:param bool verbose:
Default false. If true, additional to the mean value and standard deviation,
the standard error of the mean and number of time steps are returned.
:returns:
float: Mean time step in seconds
float: Standard deviation of the time steps in seconds
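Example:
For an equidistant index, the standard deviation is zero:
>>> import pandas as pd
>>> index = pd.date_range(start="2007-01-01", periods=5, freq="10s")
>>> mean, std = get_df_index_frequency_mean_and_std(df_index=index)
>>> print(mean, std)
10.0 0.0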
"""
if isinstance(df_index, pd.DatetimeIndex):
index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64) * 1e-9
else:
index_in_s = df_index.to_series().diff().dropna().values.astype(np.float64)
if verbose:
return np.mean(index_in_s), np.std(index_in_s), st.sem(index_in_s), len(index_in_s)
else:
return np.mean(index_in_s), np.std(index_in_s)