Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/data_science_is_software/src/features/build_features.py
2579 views
1
import numpy as np
2
import pandas as pd
3
4
def remove_invalid_data(path):
    """
    Load a water pumps csv and drop every row that contains invalid data.

    A value of 0 in `amount_tsh`, `longitude`, `installer` or
    `construction_year` is treated as invalid and converted to nan.
    Every row holding at least one nan in any column (whether converted
    here or already missing in the file) is then removed.

    Parameters
    ----------
    path : str, path object or file-like object
        Location of the csv; forwarded unchanged to `pd.read_csv`. The
        first column of the file is used as the index.

    Returns
    -------
    pd.DataFrame
        The loaded data without the rows that had invalid or missing
        values.
    """
    df = pd.read_csv(path, index_col=0)

    # Nested dictionaries, e.g., {'a': {'b': nan}}, are read as
    # follows: look in column 'a' for the value 'b' and replace it
    # with nan.
    invalid_values = {
        'amount_tsh': {0: np.nan},
        'longitude': {0: np.nan},
        # A text column is parsed as strings, so its zero shows up as
        # '0' and never compares equal to the integer 0; match both forms.
        'installer': {0: np.nan, '0': np.nan},
        'construction_year': {0: np.nan},
    }

    # drop rows with invalid values
    df = df.replace(invalid_values)
    return df.dropna(how='any')
25
26