CoCalc -- coutries_of_the

GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_12/coutries_of_the_world.ipynb
¹⁹⁰⁴ views

Kernel: Python 3

In [68]:

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [45]:

df = pd.read_csv('countries of the world.csv')

In [46]:

df.head()

Out[46]:

In [47]:

df.dtypes

Out[47]:

Country                                object
Region                                 object
Population                              int64
Area (sq. mi.)                          int64
Pop. Density (per sq. mi.)             object
Coastline (coast/area ratio)           object
Net migration                          object
Infant mortality (per 1000 births)     object
GDP ($ per capita)                    float64
Literacy (%)                           object
Phones (per 1000)                      object
Arable (%)                             object
Crops (%)                              object
Other (%)                              object
Climate                                object
Birthrate                              object
Deathrate                              object
Agriculture                            object
Industry                               object
Service                                object
dtype: object

In [48]:

df.columns = df.columns.str.replace(' ','_')
df.columns = df.columns.str.replace('.','_')
#df.columns = df.columns.str.replace(',','_')

In [49]:

df.columns

Out[49]:

Index(['Country', 'Region', 'Population', 'Area_(sq__mi_)',
       'Pop__Density_(per_sq__mi_)', 'Coastline_(coast/area_ratio)',
       'Net_migration', 'Infant_mortality_(per_1000_births)',
       'GDP_($_per_capita)', 'Literacy_(%)', 'Phones_(per_1000)', 'Arable_(%)',
       'Crops_(%)', 'Other_(%)', 'Climate', 'Birthrate', 'Deathrate',
       'Agriculture', 'Industry', 'Service'],
      dtype='object')

In [50]:

list(df.columns)

Out[50]:

['Country',
 'Region',
 'Population',
 'Area_(sq__mi_)',
 'Pop__Density_(per_sq__mi_)',
 'Coastline_(coast/area_ratio)',
 'Net_migration',
 'Infant_mortality_(per_1000_births)',
 'GDP_($_per_capita)',
 'Literacy_(%)',
 'Phones_(per_1000)',
 'Arable_(%)',
 'Crops_(%)',
 'Other_(%)',
 'Climate',
 'Birthrate',
 'Deathrate',
 'Agriculture',
 'Industry',
 'Service']

In [51]:

df['Pop__Density_(per_sq__mi_)'].head()

Out[51]:

   48,0
  124,6
   13,8
  290,4
  152,1
Name: Pop__Density_(per_sq__mi_), dtype: object

In [52]:

df.head()

Out[52]:

In [55]:

df[['Pop__Density_(per_sq__mi_)','Coastline_(coast/area_ratio)']].apply(lambda x: x.str.replace(',','').astype('float'), axis=1)

Out[55]:

In [56]:

df[['Pop__Density_(per_sq__mi_)','Coastline_(coast/area_ratio)','Net_migration','Infant_mortality_(per_1000_births)','GDP_($_per_capita)','Literacy_(%)','Phones_(per_1000)','Arable_(%)','Crops_(%)','Other_(%)','Climate','Birthrate','Deathrate','Agriculture','Industry','Service']] = df[['Pop__Density_(per_sq__mi_)','Coastline_(coast/area_ratio)','Net_migration','Infant_mortality_(per_1000_births)','GDP_($_per_capita)','Literacy_(%)','Phones_(per_1000)','Arable_(%)','Crops_(%)','Other_(%)','Climate','Birthrate','Deathrate','Agriculture','Industry','Service']].apply(lambda x: x.str.replace(",", '').astype('float'), axis=1)

In [57]:

df.head()

Out[57]:

In [ ]:

In [ ]:

In [ ]:

In [64]:

data = pd.read_csv('countries of the world.csv',decimal=',')
print('number of missing data:')
print(data.isnull().sum())
data.describe(include='all')

Out[64]:

number of missing data:
Country                                0
Region                                 0
Population                             0
Area (sq. mi.)                         0
Pop. Density (per sq. mi.)             0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64

In [65]:

data.groupby('Region')[['GDP ($ per capita)','Literacy (%)','Agriculture']].median()

Out[65]:

In [66]:

for col in data.columns.values:
    if data[col].isnull().sum() == 0:
        continue
    if col == 'Climate':
        guess_values = data.groupby('Region')['Climate'].apply(lambda x: x.mode().max())
    else:
        guess_values = data.groupby('Region')[col].median()
    for region in data['Region'].unique():
        data[col].loc[(data[col].isnull())&(data['Region']==region)] = guess_values[region]

Out[66]:

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:194: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [69]:

fig, ax = plt.subplots(figsize=(16,6))
#ax = fig.add_subplot(111)
top_gdp_countries = data.sort_values('GDP ($ per capita)',ascending=False).head(20)
mean = pd.DataFrame({'Country':['World mean'], 'GDP ($ per capita)':[data['GDP ($ per capita)'].mean()]})
gdps = pd.concat([top_gdp_countries[['Country','GDP ($ per capita)']],mean],ignore_index=True)

sns.barplot(x='Country',y='GDP ($ per capita)',data=gdps, palette='Set3')
ax.set_xlabel(ax.get_xlabel(),labelpad=15)
ax.set_ylabel(ax.get_ylabel(),labelpad=30)
ax.xaxis.label.set_fontsize(16)
ax.yaxis.label.set_fontsize(16)
plt.xticks(rotation=90)
plt.show()

Out[69]:

In [70]:

plt.figure(figsize=(16,12))
sns.heatmap(data=data.iloc[:,2:].corr(),annot=True,fmt='.2f',cmap='coolwarm')
plt.show()

Out[70]:

In [ ]:

Product

Resources

Company