Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
YStrano
GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_12/coutries_of_the_world.ipynb
1904 views
Kernel: Python 3
import pandas as pd import numpy as np import os import seaborn as sns import matplotlib.pyplot as plt
# Load the dataset with pandas' default parsing options.
df = pd.read_csv('countries of the world.csv')
# Preview the first rows of the frame.
df.head()
# Inspect the dtype pandas inferred for every column.
df.dtypes
Country object Region object Population int64 Area (sq. mi.) int64 Pop. Density (per sq. mi.) object Coastline (coast/area ratio) object Net migration object Infant mortality (per 1000 births) object GDP ($ per capita) float64 Literacy (%) object Phones (per 1000) object Arable (%) object Crops (%) object Other (%) object Climate object Birthrate object Deathrate object Agriculture object Industry object Service object dtype: object
# Make the column labels easier to type by turning spaces and dots into
# underscores.  Both patterns are literal characters, so regex=False is passed
# explicitly: under regex semantics '.' matches *every* character, and the
# default value of `regex` differs between pandas versions.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)
df.columns
Index(['Country', 'Region', 'Population', 'Area_(sq__mi_)', 'Pop__Density_(per_sq__mi_)', 'Coastline_(coast/area_ratio)', 'Net_migration', 'Infant_mortality_(per_1000_births)', 'GDP_($_per_capita)', 'Literacy_(%)', 'Phones_(per_1000)', 'Arable_(%)', 'Crops_(%)', 'Other_(%)', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service'], dtype='object')
# Show the renamed column labels as a plain Python list.
list(df.columns)
['Country', 'Region', 'Population', 'Area_(sq__mi_)', 'Pop__Density_(per_sq__mi_)', 'Coastline_(coast/area_ratio)', 'Net_migration', 'Infant_mortality_(per_1000_births)', 'GDP_($_per_capita)', 'Literacy_(%)', 'Phones_(per_1000)', 'Arable_(%)', 'Crops_(%)', 'Other_(%)', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service']
# Look at the first population-density values of the renamed column.
df['Pop__Density_(per_sq__mi_)'].head()
0 48,0 1 124,6 2 13,8 3 290,4 4 152,1 Name: Pop__Density_(per_sq__mi_), dtype: object
df.head()

# Columns that hold numbers.  Most of them were read as text because the file
# writes decimals with a comma (e.g. "48,0"); GDP was already parsed as float.
numeric_columns = [
    'Pop__Density_(per_sq__mi_)',
    'Coastline_(coast/area_ratio)',
    'Net_migration',
    'Infant_mortality_(per_1000_births)',
    'GDP_($_per_capita)',
    'Literacy_(%)',
    'Phones_(per_1000)',
    'Arable_(%)',
    'Crops_(%)',
    'Other_(%)',
    'Climate',
    'Birthrate',
    'Deathrate',
    'Agriculture',
    'Industry',
    'Service',
]


def _decimal_comma_to_float(column):
    """Return *column* as floats, reading ',' as the decimal separator.

    Text columns get the comma swapped for a dot before the cast, so "48,0"
    becomes 48.0 (simply deleting the comma would give 480.0).  Columns that
    are already numeric are only cast, which keeps their values intact instead
    of pushing them through the string accessor.
    """
    if column.dtype == object:
        column = column.str.replace(',', '.', regex=False)
    return column.astype('float')


# Preview the conversion on two columns; the result is not stored.
df[['Pop__Density_(per_sq__mi_)', 'Coastline_(coast/area_ratio)']].apply(_decimal_comma_to_float)

# Convert column by column (the default axis) and write the result back.
df[numeric_columns] = df[numeric_columns].apply(_decimal_comma_to_float)
df.head()
# Read the file again, this time telling the parser that ',' is the decimal
# mark, then report how many values are missing in each column.
data = pd.read_csv('countries of the world.csv', decimal=',')

missing_per_column = data.isnull().sum()
print('number of missing data:')
print(missing_per_column)

# Summary statistics for every column, numeric or not.
data.describe(include='all')
number of missing data: Country 0 Region 0 Population 0 Area (sq. mi.) 0 Pop. Density (per sq. mi.) 0 Coastline (coast/area ratio) 0 Net migration 3 Infant mortality (per 1000 births) 3 GDP ($ per capita) 1 Literacy (%) 18 Phones (per 1000) 4 Arable (%) 2 Crops (%) 2 Other (%) 2 Climate 22 Birthrate 3 Deathrate 4 Agriculture 15 Industry 16 Service 15 dtype: int64
# Median of a few indicators within each region.
region_summary_columns = ['GDP ($ per capita)', 'Literacy (%)', 'Agriculture']
data.groupby('Region')[region_summary_columns].median()
# Fill every missing value with an estimate taken from the country's region:
# the largest of the region's most frequent values for 'Climate', and the
# regional median for every other column.
for col in data.columns.values:
    missing = data[col].isnull()
    if not missing.any():
        continue

    if col == 'Climate':
        guess_values = data.groupby('Region')['Climate'].apply(lambda x: x.mode().max())
    else:
        guess_values = data.groupby('Region')[col].median()

    for region, guess in guess_values.items():
        # Select rows and column in ONE .loc call.  Chained indexing
        # (data[col].loc[...] = ...) assigns through an intermediate object,
        # triggers SettingWithCopyWarning and may leave `data` unchanged.
        data.loc[missing & (data['Region'] == region), col] = guess
/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:194: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self._setitem_with_indexer(indexer, value)
# Bar chart of the 20 countries with the highest GDP per capita, with the
# mean over all countries appended as a final bar for comparison.
gdp_column = 'GDP ($ per capita)'

fig, ax = plt.subplots(figsize=(16, 6))

top_gdp_countries = data.sort_values(gdp_column, ascending=False).head(20)
mean = pd.DataFrame({
    'Country': ['World mean'],
    gdp_column: [data[gdp_column].mean()],
})
gdps = pd.concat(
    [top_gdp_countries[['Country', gdp_column]], mean],
    ignore_index=True,
)

sns.barplot(x='Country', y=gdp_column, data=gdps, palette='Set3')

# Keep the automatically chosen axis titles, but move them away from the
# ticks and enlarge them.
ax.set_xlabel(ax.get_xlabel(), labelpad=15)
ax.set_ylabel(ax.get_ylabel(), labelpad=30)
for axis in (ax.xaxis, ax.yaxis):
    axis.label.set_fontsize(16)

plt.xticks(rotation=90)
plt.show()
Image in a Jupyter notebook
# Annotated heatmap of the pairwise correlations between all columns from the
# third one onwards (the first two columns are skipped).
plt.figure(figsize=(16, 12))
correlations = data.iloc[:, 2:].corr()
sns.heatmap(data=correlations, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
Image in a Jupyter notebook