GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/Data Cleaning & Preprocessing.ipynb
1904 views

Kernel: Python 3

groupby

In [ ]:

In [ ]:

In [24]:

import seaborn as sns

In [25]:

# Load seaborn's 'titanic' example dataset; this is the base frame that the
# later cells copy before filtering, dropping or imputing.
titanic_df = sns.load_dataset('titanic')

In [26]:

# Load seaborn's 'flights' example dataset.
# NOTE(review): not referenced by any cell visible here — presumably meant for the empty groupby section; confirm.
flights_df = sns.load_dataset('flights')

In [27]:

# Load seaborn's 'exercise' example dataset.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
exercise_df = sns.load_dataset('exercise')

In [28]:

# Load seaborn's 'planets' example dataset.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
planets_df = sns.load_dataset('planets')

data information

In [29]:

titanic_df.head()  # preview the first rows to see what the columns look like

Out[29]:

In [30]:

# Per-column non-null counts and dtypes. In the output below, age (714),
# embarked (889), deck (203) and embark_town (889) fall short of the 891 rows,
# i.e. those are the columns with missing values.
titanic_df.info()

Out[30]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

In [31]:

titanic_df.describe()  # summary statistics; by default pandas reports on the numeric columns only

Out[31]:

In [32]:

titanic_df.columns.tolist()  # column labels as a plain Python list

Out[32]:

['survived',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'class',
 'who',
 'adult_male',
 'deck',
 'embark_town',
 'alive',
 'alone']

handling null values

filtering

In [66]:

# Work on a copy so the filtering examples leave titanic_df untouched.
titanic_df_filter = titanic_df.copy()

In [72]:

# isna(): boolean frame that is True wherever a value is missing. Its result is
# not displayed, because a cell only shows its last expression.
titanic_df_filter.isna()
# notna(): the inverse mask — False wherever a value is missing. This is the
# output the cell displays.
titanic_df_filter.notna()

Out[72]:

In [69]:

# Boolean mask: True for the rows whose age is recorded.
has_age = titanic_df_filter['age'].notna()
# Keep only those rows and count them (714 of the 891 passengers).
len(titanic_df_filter.loc[has_age])

Out[69]:

714

dropping

In [36]:

# Work on a copy so dropping rows leaves titanic_df untouched.
titanic_df_drop = titanic_df.copy()

In [37]:

# Drop every row that has a missing value in at least one column
# (rebinding the name instead of mutating in place).
titanic_df_drop = titanic_df_drop.dropna()

In [38]:

# Rows left after dropping: only 182 of 891. The deck column alone is non-null
# in just 203 rows, so a blanket dropna() discards most of the data.
len(titanic_df_drop)

Out[38]:

182

imputing

You can fill in missing values with the mean, median, or mode of the column (for categorical data, only the mode is applicable).

In [39]:

# Work on a copy so mean imputation leaves titanic_df untouched.
titanic_df_mean = titanic_df.copy()

In [40]:

titanic_df_mean['age'].mean()  # mean of the recorded ages; this is the value used to fill the gaps

Out[40]:

29.69911764705882

In [41]:

# Compute the fill value and the missing-age mask first, then assign the
# mean only to the rows where age is missing.
mean_age = titanic_df_mean['age'].mean()
age_missing = titanic_df_mean['age'].isna()
titanic_df_mean.loc[age_missing, 'age'] = mean_age

In [42]:

titanic_df_mean.info()  # age now reports 891 non-null: every missing age was filled with the mean

Out[42]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            891 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

In [ ]:

In [46]:

# Work on a copy so median imputation leaves titanic_df untouched.
titanic_df_median = titanic_df.copy()

In [45]:

titanic_df_median['age'].median()  # median of the recorded ages; this is the value used to fill the gaps

Out[45]:

28.0

In [47]:

# Compute the fill value and the missing-age mask first, then assign the
# median only to the rows where age is missing.
median_age = titanic_df_median['age'].median()
age_missing = titanic_df_median['age'].isna()
titanic_df_median.loc[age_missing, 'age'] = median_age

In [49]:

titanic_df_median.info()  # age now reports 891 non-null: every missing age was filled with the median

Out[49]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            891 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

In [ ]:

In [50]:

# Work on a copy so mode imputation leaves titanic_df untouched.
titanic_df_mode = titanic_df.copy()

In [51]:

# mode() returns a Series rather than a scalar (see the indexed output below),
# so the value itself has to be pulled out before it can be used as a fill value.
titanic_df_mode['age'].mode()

Out[51]:

0    24.0
dtype: float64

In [52]:

# Fill the missing ages with the most frequent age.
# The original cell used .median() here (copied from the median example), so the
# "mode" frame was actually median-imputed. Series.mode() returns a Series
# (several values can tie), so take its first entry as the fill value.
most_common_age = titanic_df_mode['age'].mode().iloc[0]
titanic_df_mode.loc[titanic_df_mode['age'].isna(), 'age'] = most_common_age

In [53]:

titanic_df_mode.info()  # age now reports 891 non-null: every missing age was filled

Out[53]:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            891 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

In [ ]:

For next time: add a more sophisticated imputation method — fit a logistic regression (for categorical columns) or a linear regression (for continuous columns), or train a random forest, to predict the missing values.

In [ ]:

In [ ]:

data distribution

In [ ]:

In [ ]:

In [ ]:

In [ ]:

TO DO:

Transformers: CountVectorizer (use instead of `get_dummies`?)

Randomized search: like grid search, but it samples parameter combinations at random instead of trying them all

TFIDF

NLP

StandardScaler and imputer

SpaCy

In [2]:

# Set up spaCy for the NLP section.
import spacy
# NOTE(review): displacy is imported but not used in any cell visible here.
from spacy import displacy
print(spacy.__version__)  # record the spaCy version this notebook was run with

# Load the English pipeline via the 'en' shortcut name.
# NOTE(review): this works on the spaCy 2.0.11 shown in the output; newer spaCy
# releases expect a full model package name (e.g. 'en_core_web_sm') — confirm
# before upgrading.
nlp = spacy.load('en')

Out[2]:

2.0.11

groupby

data information

handling null values

filtering

dropping

imputing

You can fill in missing values with the mean, median, or mode of the column (for categorical data, only the mode is applicable).

For next time: add a more sophisticated imputation method — fit a logistic regression (for categorical columns) or a linear regression (for continuous columns), or train a random forest, to predict the missing values.

data distribution

TO DO:

Transformers: CountVectorizer (use instead of `get_dummies`?)

Randomized search: like grid search, but it samples parameter combinations at random instead of trying them all

TFIDF

NLP

StandardScaler and imputer

SpaCy

Product

Resources

Company