Path: blob/master/big_data/sparkml/get_data.py
2573 views
"""Download the UCI "adult" dataset and cache it locally as a CSV file."""
import os
import re
import requests
import pandas as pd

# Column names assigned to the downloaded data, in the order the fields appear.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education_num', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain',
    'capital_loss', 'hours_per_week', 'native_country', 'income']

DATA_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Seconds to wait on the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _chunks(input_list, n_chunk):
    """Yield successive slices of ``input_list`` holding ``n_chunk`` items each.

    The final slice is shorter when the list length is not a multiple of
    ``n_chunk``.
    """
    for start in range(0, len(input_list), n_chunk):
        yield input_list[start:start + n_chunk]


def _parse_rows(raw_text, columns):
    """Turn the raw comma-separated text into a DataFrame of string fields.

    Newlines are treated as field separators, the resulting flat token list
    is regrouped into rows of ``len(columns)`` fields, and every row that
    contains a missing value is dropped.
    """
    flat_text = raw_text.replace('\n', ',')
    # Fields are separated by a comma plus optional whitespace.
    tokens = re.split(r',\s*', flat_text)
    rows = list(_chunks(tokens, n_chunk=len(columns)))
    frame = pd.DataFrame(rows, columns=columns)
    # Removes rows with missing fields, e.g. an incomplete trailing chunk
    # produced by blank lines at the end of the download.
    return frame.dropna(axis=0, how='any')


def main(file_path='adult.csv', url=DATA_URL):
    """Fetch the dataset from ``url`` and write it to ``file_path`` as CSV.

    Nothing is downloaded when ``file_path`` already exists.

    Parameters
    ----------
    file_path : str
        Destination of the CSV file; also serves as the "already
        downloaded" marker.
    url : str
        Location of the raw comma-separated data.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``REQUEST_TIMEOUT`` seconds.
    """
    if os.path.isfile(file_path):
        return

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx so an error page is never parsed and cached
    # as if it were the dataset.
    response.raise_for_status()

    data = _parse_rows(response.text, COLUMNS)
    data.to_csv(file_path, index=False)


if __name__ == '__main__':
    main()