Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
jantic
GitHub Repository: jantic/deoldify
Path: blob/master/fastai/tabular/transform.py
781 views
1
"Cleaning and feature engineering functions for structured data"
2
from ..torch_core import *
3
from pandas.api.types import is_numeric_dtype
4
from datetime import date, datetime
5
import calendar
6
7
__all__ = ['add_datepart', 'cont_cat_split', 'Categorify', 'FillMissing', 'FillStrategy', 'Normalize', 'TabularProc',
8
'add_elapsed_times', 'make_date', 'add_cyclic_datepart']
9
10
def make_date(df:DataFrame, date_field:str):
    "Make sure `df[field_name]` is of the right date type."
    field_dtype = df[date_field].dtype
    # tz-aware columns report a DatetimeTZDtype; treat them as datetime64 so we skip conversion.
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        # `infer_datetime_format` was deprecated (and made a permanent no-op) in pandas 2.0;
        # plain `to_datetime` infers the format automatically, so drop the kwarg.
        df[date_field] = pd.to_datetime(df[date_field])
17
18
def cyclic_dt_feat_names(time:bool=True, add_linear:bool=False)->List[str]:
    "Return feature names of date/time cycles as produced by `cyclic_dt_features`."
    cycles = 'weekday day_month month_year day_year'.split()
    if time:
        cycles += 'hour clock min sec'.split()
    # One (cos, sin) pair per cycle, in that order.
    names = [f'{cycle}_{trig}' for cycle in cycles for trig in ('cos', 'sin')]
    if add_linear:
        names.append('year_lin')
    return names
25
26
def cyclic_dt_features(d:Union[date,datetime], time:bool=True, add_linear:bool=False)->List[float]:
    "Calculate the cos and sin of date/time cycles."
    # Each cyclic quantity is mapped to a fraction of a full turn in [0, 1),
    # then encoded as (cos, sin) so that e.g. Dec 31 and Jan 1 end up close together.
    tt,fs = d.timetuple(), [np.cos, np.sin]
    day_year,days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]
    days_year = 366 if calendar.isleap(d.year) else 365
    # Fractions of: week, month, year-by-month, year-by-day. The -1 shifts 1-based values to 0-based.
    rs = d.weekday()/7, (d.day-1)/days_month, (d.month-1)/12, (day_year-1)/days_year
    feats = [f(r * 2 * np.pi) for r in rs for f in fs]
    # Time-of-day cycles only for true datetimes (a plain `date` carries no clock).
    if time and isinstance(d, datetime) and type(d) != date:
        # hour of day, hour on a 12h clock face, minute of hour, second of minute
        rs = tt.tm_hour/24, tt.tm_hour%12/12, tt.tm_min/60, tt.tm_sec/60
        feats += [f(r * 2 * np.pi) for r in rs for f in fs]
    if add_linear:
        # Linear (non-cyclic) year feature, e.g. ~2019.5 for mid-2019.
        # For a plain `date` the time branch above did not run, so rs[-1] is still the day-of-year fraction.
        if type(d) == date: feats.append(d.year + rs[-1])
        else:
            secs_in_year = (datetime(d.year+1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()
            feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))
    return feats
42
43
def add_cyclic_datepart(df:DataFrame, field_name:str, prefix:str=None, drop:bool=True, time:bool=False, add_linear:bool=False):
    "Helper function that adds trigonometric date/time features to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    dates = df[field_name]
    # Default column prefix: the field name with a trailing "date"/"Date" stripped.
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    feat_lists = dates.apply(partial(cyclic_dt_features, time=time, add_linear=add_linear))
    col_names = [prefix + name for name in cyclic_dt_feat_names(time, add_linear)]
    # Expand the per-row feature lists into one column per feature, keeping the original index.
    feats = pd.DataFrame(list(feat_lists), columns=col_names, index=feat_lists.index)
    for name in col_names:
        df[name] = feats[name]
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df
54
55
def add_datepart(df:DataFrame, field_name:str, prefix:str=None, drop:bool=True, time:bool=False):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    dates = df[field_name]
    # Default column prefix: the field name with a trailing "date"/"Date" stripped.
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    # NOTE(review): `Series.dt.week` was removed in pandas >= 2.0 (use `dt.isocalendar().week`);
    # this code targets the older pandas API — confirm against the pinned pandas version.
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr += ['Hour', 'Minute', 'Second']
    for name in attr:
        df[prefix + name] = getattr(dates.dt, name.lower())
    # Unix epoch seconds (datetime64[ns] -> int ns -> s).
    df[prefix + 'Elapsed'] = dates.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df
67
68
def _get_elapsed(df:DataFrame,field_names:Collection[str], date_field:str, base_field:str, prefix:str):
    "For each field, add a column of days elapsed since the last truthy occurrence, restarting at each `base_field` group."
    one_day = np.timedelta64(1, 'D')
    for field in field_names:
        elapsed = []
        last_date, last_base = np.datetime64(), None  # np.datetime64() with no args is NaT
        for base, flag, day in zip(df[base_field].values, df[field].values, df[date_field].values):
            if last_base is None or base != last_base:
                # New group: reset the "last seen" date to NaT.
                last_date, last_base = np.datetime64(), base
            if flag:
                last_date = day
            # NaT - anything stays NaT, which becomes float nan after the division.
            elapsed.append((day - last_date).astype('timedelta64[D]') / one_day)
        df[prefix + field] = elapsed
    return df
79
80
def add_elapsed_times(df:DataFrame, field_names:Collection[str], date_field:str, base_field:str):
    "Add columns with days before/after each event in `field_names` plus 7-day rolling event counts, grouped by `base_field`."
    field_names = listify(field_names)
    #Make sure date_field is a date and base_field a bool
    df[field_names] = df[field_names].astype('bool')
    make_date(df, date_field)

    work_df = df[field_names + [date_field, base_field]]
    # Days since the previous event (ascending date order within each group)...
    work_df = work_df.sort_values([base_field, date_field])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
    # ...and days until the next event (descending date order within each group).
    work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')

    # NaN means "no event seen yet in this direction"; encode as 0 like the originals.
    for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
        work_df[a] = work_df[a].fillna(0).astype(int)

    # Backward (_bw) and forward (_fw) 7-row rolling event counts per group.
    for ascending, suffix in zip([True, False], ['_bw', '_fw']):
        work_df = work_df.set_index(date_field)
        tmp = (work_df[[base_field] + field_names].sort_index(ascending=ascending)
               .groupby(base_field).rolling(7, min_periods=1).sum())
        # Positional `axis` for DataFrame.drop was removed in pandas 2.0; the keyword
        # form below behaves identically on older pandas as well.
        tmp.drop(base_field, axis=1, inplace=True)
        tmp.reset_index(inplace=True)
        work_df.reset_index(inplace=True)
        work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', suffix])
    work_df.drop(field_names, axis=1, inplace=True)
    return df.merge(work_df, 'left', [date_field, base_field])
105
106
def cont_cat_split(df, max_card=20, dep_var=None)->Tuple[List,List]:
    "Helper function that returns column names of cont and cat variables from given df."
    cont_names, cat_names = [], []
    for label in df:
        # The dependent variable belongs to neither group.
        if label == dep_var:
            continue
        col = df[label]
        # Continuous: any float column, or an int column with more than `max_card` distinct values.
        is_cont = col.dtype == float or (col.dtype == int and col.unique().shape[0] > max_card)
        (cont_names if is_cont else cat_names).append(label)
    return cont_names, cat_names
114
115
@dataclass
class TabularProc():
    "Base processor for tabular dataframes; subclasses implement the train/test transforms."
    cat_names:StrList
    cont_names:StrList

    def __call__(self, df:DataFrame, test:bool=False):
        "Dispatch `df` to the test- or train-time transform depending on `test`."
        if test:
            self.apply_test(df)
        else:
            self.apply_train(df)

    def apply_train(self, df:DataFrame):
        "Transform applied to the training set; must be overridden."
        raise NotImplementedError
    def apply_test(self, df:DataFrame):
        "Transform applied to the test set; defaults to the train-time transform."
        self.apply_train(df)
132
133
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    def apply_train(self, df:DataFrame):
        "Convert `self.cat_names` columns to ordered categoricals and remember their category sets."
        self.categories = {}
        for name in self.cat_names:
            df.loc[:,name] = df.loc[:,name].astype('category').cat.as_ordered()
            self.categories[name] = df[name].cat.categories

    def apply_test(self, df:DataFrame):
        "Re-encode `self.cat_names` columns with the category sets learned in `apply_train`."
        for name in self.cat_names:
            # Values unseen at train time become NaN, which is the desired behavior.
            df.loc[:,name] = pd.Categorical(df[name], categories=self.categories[name], ordered=True)
146
147
class FillStrategy(IntEnum):
    "Strategy used by `FillMissing` to pick a fill value for missing continuous data."
    MEDIAN   = 1
    COMMON   = 2
    CONSTANT = 3
148
149
@dataclass
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    # fill_strategy: how the filler is chosen (median / most common value / the constant `fill_val`)
    fill_strategy:FillStrategy=FillStrategy.MEDIAN
    # add_col: if True, also add a boolean `<name>_na` column flagging which rows were filled
    add_col:bool=True
    # fill_val: constant used when `fill_strategy` is CONSTANT
    fill_val:float=0.
    def apply_train(self, df:DataFrame):
        "Fill missing values in `self.cont_names` according to `self.fill_strategy`."
        self.na_dict = {}  # maps column name -> filler value; reused verbatim by `apply_test`
        for name in self.cont_names:
            if pd.isnull(df[name]).sum():
                if self.add_col:
                    df[name+'_na'] = pd.isnull(df[name])
                    # The new indicator column is categorical; register it only once.
                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')
                if self.fill_strategy == FillStrategy.MEDIAN: filler = df[name].median()
                elif self.fill_strategy == FillStrategy.CONSTANT: filler = self.fill_val
                else: filler = df[name].dropna().value_counts().idxmax()  # COMMON: modal value
                df[name] = df[name].fillna(filler)
                self.na_dict[name] = filler

    def apply_test(self, df:DataFrame):
        "Fill missing values in `self.cont_names` like in `apply_train`."
        for name in self.cont_names:
            if name in self.na_dict:
                if self.add_col:
                    df[name+'_na'] = pd.isnull(df[name])
                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')
                df[name] = df[name].fillna(self.na_dict[name])
            elif pd.isnull(df[name]).sum() != 0:
                # A column with no NAs at train time has no recorded filler, so a test-time
                # NA cannot be filled consistently: fail loudly rather than guess.
                raise Exception(f"""There are nan values in field {name} but there were none in the training set.
                Please fix those manually.""")
180
181
class Normalize(TabularProc):
    "Normalize the continuous variables."
    def apply_train(self, df:DataFrame):
        "Compute per-column mean/std over `self.cont_names` and standardize the columns in place."
        self.means,self.stds = {},{}
        for n in self.cont_names:
            assert is_numeric_dtype(df[n]), (f"""Cannot normalize '{n}' column as it isn't numerical.
Are you sure it doesn't belong in the categorical set of columns?""")
            mean, std = df[n].mean(), df[n].std()
            self.means[n], self.stds[n] = mean, std
            # 1e-7 guards against division by zero for constant columns.
            df[n] = (df[n] - mean) / (std + 1e-7)

    def apply_test(self, df:DataFrame):
        "Standardize `self.cont_names` with the statistics stored by `apply_train`."
        for n in self.cont_names:
            df[n] = (df[n] - self.means[n]) / (self.stds[n] + 1e-7)
196
197