Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/big_data/sparkml/get_data.py
2573 views
1
import os
2
import re
3
import requests
4
import pandas as pd
5
6
7
def main():
    """Download the UCI Adult dataset and cache it as ``adult.csv``.

    If ``adult.csv`` already exists in the current working directory the
    function returns immediately without touching the network. Otherwise the
    raw comma-separated text is fetched, split into individual fields,
    regrouped into rows of ``len(columns)`` fields, and written out as a CSV
    file with a header row.

    Raises
    ------
    requests.exceptions.RequestException
        If the download times out, the connection fails, or the server
        answers with an HTTP error status. Nothing is written in that case,
        so a later run will retry the download.
    """
    file_path = 'adult.csv'
    if os.path.isfile(file_path):
        # A cached copy is already present; nothing to do.
        return

    columns = [
        'age', 'workclass', 'fnlwgt', 'education',
        'education_num', 'marital_status', 'occupation',
        'relationship', 'race', 'sex', 'capital_gain',
        'capital_loss', 'hours_per_week', 'native_country', 'income']

    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the script forever.
    r = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an error page and caching it
    # as if it were the dataset (the cache is never refreshed once written).
    r.raise_for_status()

    # The source file has no header and separates fields with ", ". Turn the
    # whole payload into one flat stream of fields, then regroup it into
    # fixed-width records.
    raw_text = r.text.replace('\n', ',')
    splitted_text = re.split(r',\s*', raw_text)
    n_columns = len(columns)
    rows = [
        splitted_text[start:start + n_columns]
        for start in range(0, len(splitted_text), n_columns)
    ]

    # A short trailing chunk (e.g. left over from trailing newlines) ends up
    # with missing values in the frame and is removed by dropna.
    data = pd.DataFrame(rows, columns=columns).dropna(axis=0, how='any')
    data.to_csv(file_path, index=False)
28
29
30
# Only trigger the download when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
32
33