CoCalc -- 2020-10-22-190547.ipynb

⁶³²⁰ views
default

Kernel: Python 3 (system-wide)

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:

# This is how you read the csv file

with open("Chocolate-Consumption.csv", 'r') as file:
    consumption = pd.read_csv(file, names = ['Country', 'Consumption'])

In [3]:

# Show the first 3 lines of the file

consumption.head(3)

Out[3]:

In [4]:

# Show the last 3 lines of the file

consumption.tail(3)

Out[4]:

In [5]:

# Sort by the country name

consumption.sort_values(by = ['Country'])

Out[5]:

In [6]:

consumption.plot.bar("Country", "Consumption")
plt.show()

Out[6]:

In [7]:

with open("nobels.txt", "r") as file:
    prizes_list = file.readlines()

prizes_list

Out[7]:

['Rank\tEntity\tNobel\n',
 'laureates[1]\tPopulation\n',
 '(2018)[2]\tLaureates/\n',
 '10 million\n',
 '—\t Faroe Islands\t1\t49,489\t202.065\n',
 '1\t Saint Lucia\t2\t179,667\t111.317\n',
 '2\t Luxembourg\t2\t590,321\t33.880\n',
 '3\t  Switzerland\t28\t8,544,034\t32.771\n',
 '4\t Sweden\t30\t9,982,709\t30.052\n',
 '5\t Iceland\t1\t337,780\t29.605\n',
 '6\t Austria\t22\t8,751,820\t25.138\n',
 '7\t Denmark\t14\t5,754,356\t24.329\n',
 '8\t Norway\t13\t5,353,363\t24.284\n',
 '9\t United Kingdom\t133\t66,573,504\t19.429\n',
 '10\t East Timor\t2\t1,324,094\t15.105\n',
 '11\t Ireland\t7\t4,803,748\t14.572\n',
 '12\t Israel\t12\t8,452,841\t14.196\n',
 '13\t Hungary\t13\t9,688,847\t13.417\n',
 '13\t Germany\t108\t82,293,457\t13.245\n',
 '15\t United States\t383\t326,766,748\t11.721\n',
 '16\t Netherlands\t21\t17,084,459\t11.707\n',
 '17\t France\t70\t65,233,271\t10.664\n',
 '—\t European Union[3]\t378\t444,697,104\t8.005\n',
 '18\t Finland\t5\t5,542,517\t9.021\n',
 '19\t Belgium\t10\t11,498,519\t8.697\n',
 '20\t Cyprus\t1\t1,189,085\t8.410\n',
 '21\t Trinidad and Tobago\t1\t1,372,598\t7.285\n',
 '22\t Canada\t25\t36,953,765\t6.765\n',
 '23\t New Zealand\t3\t4,749,598\t6.316\n',
 '24\t Bosnia and Herzegovina\t2\t3,503,554\t5.708\n',
 '25\t Latvia\t1\t1,929,938\t5.182\n',
 '26\t Poland\t19\t38,104,832\t4.986\n',
 '27\t Australia\t12\t24,772,247\t4.844\n',
 '28\t Slovenia\t1\t2,081,260\t4.805\n',
 '29\t North Macedonia\t1\t2,085,051\t4.796\n',
 '30\t Czech Republic\t5\t10,625,250\t4.706\n',
 '31\t Liberia\t2\t4,853,516\t4.121\n',
 '32\t Lithuania\t1\t2,876,475\t3.476\n',
 '33\t Italy\t20\t59,290,969\t3.373\n',
 '—\t Tibet[4]\t1\t3,310,836\t3.020\n',
 '34\t Croatia\t1\t4,164,783\t2.401\n',
 '35\t Japan\t28\t127,185,332\t2.202\n',
 '36\t Belarus\t2\t9,452,113\t2.116\n',
 '37\t Romania\t4\t19,580,634\t2.043\n',
 '38\t Costa Rica\t1\t4,953,199\t2.019\n',
 '39\t Palestine\t1\t5,052,776\t1.979\n',
 '40\t Portugal\t2\t10,291,196\t1.943\n',
 '41\t Greece\t2\t11,142,161\t1.795\n',
 '42\t South Africa\t10\t57,398,421\t1.742\n',
 '43\t Spain\t8\t46,397,452\t1.724\n',
 '44\t Russia\t23\t143,964,709\t1.598\n',
 '45\t Bulgaria\t1\t7,036,848\t1.421\n',
 '—\t Hong Kong\t1\t7,428,887\t1.346\n',
 '—\t World[5]\t919\t7,632,819,325\t1.204\n',
 '46\t Guatemala\t2\t17,245,346\t1.160\n',
 '47\t Argentina\t5\t44,688,864\t1.119\n',
 '48\t Chile\t2\t18,197,209\t1.099\n',
 '49\t Azerbaijan\t1\t9,923,914\t1.008\n',
 '50\t Algeria\t2\t42,008,054\t0.476\n',
 '51\t Ukraine\t2\t44,009,214\t0.454\n',
 '52\t Taiwan\t1\t23,694,089\t0.422\n',
 '53\t Colombia\t2\t49,464,683\t0.404\n',
 '54\t Egypt\t4\t99,375,741\t0.403\n',
 '55\t South Korea\t2\t51,164,435\t0.391\n',
 '56\t Yemen\t1\t28,915,284\t0.346\n',
 '57\t Ghana\t1\t29,463,643\t0.339\n',
 '58\t Venezuela\t1\t32,381,221\t0.309\n',
 '59\t Peru\t1\t32,551,815\t0.307\n',
 '60\t Morocco\t1\t36,191,805\t0.276\n',
 '61\t Iraq\t1\t39,339,753\t0.254\n',
 '62\t Turkey\t2\t81,916,871\t0.244\n',
 '63\t Iran\t2\t82,011,735\t0.244\n',
 '64\t Mexico\t3\t130,759,074\t0.229\n',
 '65\t Kenya\t1\t50,950,879\t0.196\n',
 '66\t Myanmar\t1\t53,855,735\t0.186\n',
 '67\t DR Congo\t1\t84,004,989\t0.119\n',
 '68\t Vietnam\t1\t96,491,146\t0.104\n',
 '69\t Pakistan\t2\t200,813,818\t0.100\n',
 '70\t Ethiopia\t1\t109,224,410[6]\t0.092\n',
 '71\t India\t11\t1,354,051,854\t0.081\n',
 '72\t China\t9\t1,415,045,928\t0.064\n',
 '73\t Bangladesh\t1\t166,368,149\t0.060\n',
 '74\t Nigeria\t1\t195,875,237\t0.051']

In [8]:

print(*prizes_list)

Out[8]:

Rank	Entity	Nobel
 laureates[1]	Population
 (2018)[2]	Laureates/
million
 —	 Faroe Islands	1	49,489	202.065
Saint Lucia	2	179,667	111.317
Luxembourg	2	590,321	33.880
 Switzerland	28	8,544,034	32.771
Sweden	30	9,982,709	30.052
Iceland	1	337,780	29.605
Austria	22	8,751,820	25.138
Denmark	14	5,754,356	24.329
Norway	13	5,353,363	24.284
United Kingdom	133	66,573,504	19.429
East Timor	2	1,324,094	15.105
Ireland	7	4,803,748	14.572
Israel	12	8,452,841	14.196
Hungary	13	9,688,847	13.417
Germany	108	82,293,457	13.245
United States	383	326,766,748	11.721
Netherlands	21	17,084,459	11.707
France	70	65,233,271	10.664
 —	 European Union[3]	378	444,697,104	8.005
Finland	5	5,542,517	9.021
Belgium	10	11,498,519	8.697
Cyprus	1	1,189,085	8.410
Trinidad and Tobago	1	1,372,598	7.285
Canada	25	36,953,765	6.765
New Zealand	3	4,749,598	6.316
Bosnia and Herzegovina	2	3,503,554	5.708
Latvia	1	1,929,938	5.182
Poland	19	38,104,832	4.986
Australia	12	24,772,247	4.844
Slovenia	1	2,081,260	4.805
North Macedonia	1	2,085,051	4.796
Czech Republic	5	10,625,250	4.706
Liberia	2	4,853,516	4.121
Lithuania	1	2,876,475	3.476
Italy	20	59,290,969	3.373
 —	 Tibet[4]	1	3,310,836	3.020
Croatia	1	4,164,783	2.401
Japan	28	127,185,332	2.202
Belarus	2	9,452,113	2.116
Romania	4	19,580,634	2.043
Costa Rica	1	4,953,199	2.019
Palestine	1	5,052,776	1.979
Portugal	2	10,291,196	1.943
Greece	2	11,142,161	1.795
South Africa	10	57,398,421	1.742
Spain	8	46,397,452	1.724
Russia	23	143,964,709	1.598
Bulgaria	1	7,036,848	1.421
 —	 Hong Kong	1	7,428,887	1.346
 —	 World[5]	919	7,632,819,325	1.204
Guatemala	2	17,245,346	1.160
Argentina	5	44,688,864	1.119
Chile	2	18,197,209	1.099
Azerbaijan	1	9,923,914	1.008
Algeria	2	42,008,054	0.476
Ukraine	2	44,009,214	0.454
Taiwan	1	23,694,089	0.422
Colombia	2	49,464,683	0.404
Egypt	4	99,375,741	0.403
South Korea	2	51,164,435	0.391
Yemen	1	28,915,284	0.346
Ghana	1	29,463,643	0.339
Venezuela	1	32,381,221	0.309
Peru	1	32,551,815	0.307
Morocco	1	36,191,805	0.276
Iraq	1	39,339,753	0.254
Turkey	2	81,916,871	0.244
Iran	2	82,011,735	0.244
Mexico	3	130,759,074	0.229
Kenya	1	50,950,879	0.196
Myanmar	1	53,855,735	0.186
DR Congo	1	84,004,989	0.119
Vietnam	1	96,491,146	0.104
Pakistan	2	200,813,818	0.100
Ethiopia	1	109,224,410[6]	0.092
India	11	1,354,051,854	0.081
China	9	1,415,045,928	0.064
Bangladesh	1	166,368,149	0.060
Nigeria	1	195,875,237	0.051

In [9]:

# reassemble the header line - first remove newlines

for i in range(3):
    prizes_list[i] = prizes_list[i].rstrip()

prizes_list[:4]

Out[9]:

['Rank\tEntity\tNobel',
 'laureates[1]\tPopulation',
 '(2018)[2]\tLaureates/',
 '10 million\n']

In [10]:

# Then join it up (putting in a space where the newlines were)

prizes2 = [' '.join(prizes_list[:4])] + prizes_list[4:]
print(*prizes2)

Out[10]:

Rank	Entity	Nobel laureates[1]	Population (2018)[2]	Laureates/ 10 million
 —	 Faroe Islands	1	49,489	202.065
Saint Lucia	2	179,667	111.317
Luxembourg	2	590,321	33.880
 Switzerland	28	8,544,034	32.771
Sweden	30	9,982,709	30.052
Iceland	1	337,780	29.605
Austria	22	8,751,820	25.138
Denmark	14	5,754,356	24.329
Norway	13	5,353,363	24.284
United Kingdom	133	66,573,504	19.429
East Timor	2	1,324,094	15.105
Ireland	7	4,803,748	14.572
Israel	12	8,452,841	14.196
Hungary	13	9,688,847	13.417
Germany	108	82,293,457	13.245
United States	383	326,766,748	11.721
Netherlands	21	17,084,459	11.707
France	70	65,233,271	10.664
 —	 European Union[3]	378	444,697,104	8.005
Finland	5	5,542,517	9.021
Belgium	10	11,498,519	8.697
Cyprus	1	1,189,085	8.410
Trinidad and Tobago	1	1,372,598	7.285
Canada	25	36,953,765	6.765
New Zealand	3	4,749,598	6.316
Bosnia and Herzegovina	2	3,503,554	5.708
Latvia	1	1,929,938	5.182
Poland	19	38,104,832	4.986
Australia	12	24,772,247	4.844
Slovenia	1	2,081,260	4.805
North Macedonia	1	2,085,051	4.796
Czech Republic	5	10,625,250	4.706
Liberia	2	4,853,516	4.121
Lithuania	1	2,876,475	3.476
Italy	20	59,290,969	3.373
 —	 Tibet[4]	1	3,310,836	3.020
Croatia	1	4,164,783	2.401
Japan	28	127,185,332	2.202
Belarus	2	9,452,113	2.116
Romania	4	19,580,634	2.043
Costa Rica	1	4,953,199	2.019
Palestine	1	5,052,776	1.979
Portugal	2	10,291,196	1.943
Greece	2	11,142,161	1.795
South Africa	10	57,398,421	1.742
Spain	8	46,397,452	1.724
Russia	23	143,964,709	1.598
Bulgaria	1	7,036,848	1.421
 —	 Hong Kong	1	7,428,887	1.346
 —	 World[5]	919	7,632,819,325	1.204
Guatemala	2	17,245,346	1.160
Argentina	5	44,688,864	1.119
Chile	2	18,197,209	1.099
Azerbaijan	1	9,923,914	1.008
Algeria	2	42,008,054	0.476
Ukraine	2	44,009,214	0.454
Taiwan	1	23,694,089	0.422
Colombia	2	49,464,683	0.404
Egypt	4	99,375,741	0.403
South Korea	2	51,164,435	0.391
Yemen	1	28,915,284	0.346
Ghana	1	29,463,643	0.339
Venezuela	1	32,381,221	0.309
Peru	1	32,551,815	0.307
Morocco	1	36,191,805	0.276
Iraq	1	39,339,753	0.254
Turkey	2	81,916,871	0.244
Iran	2	82,011,735	0.244
Mexico	3	130,759,074	0.229
Kenya	1	50,950,879	0.196
Myanmar	1	53,855,735	0.186
DR Congo	1	84,004,989	0.119
Vietnam	1	96,491,146	0.104
Pakistan	2	200,813,818	0.100
Ethiopia	1	109,224,410[6]	0.092
India	11	1,354,051,854	0.081
China	9	1,415,045,928	0.064
Bangladesh	1	166,368,149	0.060
Nigeria	1	195,875,237	0.051

In [11]:

# Use regular expression to remove reference num
import re

prizes3 = [ re.sub('\[\d+\]', '', line) for line in prizes2 ]
print(*prizes3)

Out[11]:

Rank	Entity	Nobel laureates	Population (2018)	Laureates/ 10 million
 —	 Faroe Islands	1	49,489	202.065
Saint Lucia	2	179,667	111.317
Luxembourg	2	590,321	33.880
 Switzerland	28	8,544,034	32.771
Sweden	30	9,982,709	30.052
Iceland	1	337,780	29.605
Austria	22	8,751,820	25.138
Denmark	14	5,754,356	24.329
Norway	13	5,353,363	24.284
United Kingdom	133	66,573,504	19.429
East Timor	2	1,324,094	15.105
Ireland	7	4,803,748	14.572
Israel	12	8,452,841	14.196
Hungary	13	9,688,847	13.417
Germany	108	82,293,457	13.245
United States	383	326,766,748	11.721
Netherlands	21	17,084,459	11.707
France	70	65,233,271	10.664
 —	 European Union	378	444,697,104	8.005
Finland	5	5,542,517	9.021
Belgium	10	11,498,519	8.697
Cyprus	1	1,189,085	8.410
Trinidad and Tobago	1	1,372,598	7.285
Canada	25	36,953,765	6.765
New Zealand	3	4,749,598	6.316
Bosnia and Herzegovina	2	3,503,554	5.708
Latvia	1	1,929,938	5.182
Poland	19	38,104,832	4.986
Australia	12	24,772,247	4.844
Slovenia	1	2,081,260	4.805
North Macedonia	1	2,085,051	4.796
Czech Republic	5	10,625,250	4.706
Liberia	2	4,853,516	4.121
Lithuania	1	2,876,475	3.476
Italy	20	59,290,969	3.373
 —	 Tibet	1	3,310,836	3.020
Croatia	1	4,164,783	2.401
Japan	28	127,185,332	2.202
Belarus	2	9,452,113	2.116
Romania	4	19,580,634	2.043
Costa Rica	1	4,953,199	2.019
Palestine	1	5,052,776	1.979
Portugal	2	10,291,196	1.943
Greece	2	11,142,161	1.795
South Africa	10	57,398,421	1.742
Spain	8	46,397,452	1.724
Russia	23	143,964,709	1.598
Bulgaria	1	7,036,848	1.421
 —	 Hong Kong	1	7,428,887	1.346
 —	 World	919	7,632,819,325	1.204
Guatemala	2	17,245,346	1.160
Argentina	5	44,688,864	1.119
Chile	2	18,197,209	1.099
Azerbaijan	1	9,923,914	1.008
Algeria	2	42,008,054	0.476
Ukraine	2	44,009,214	0.454
Taiwan	1	23,694,089	0.422
Colombia	2	49,464,683	0.404
Egypt	4	99,375,741	0.403
South Korea	2	51,164,435	0.391
Yemen	1	28,915,284	0.346
Ghana	1	29,463,643	0.339
Venezuela	1	32,381,221	0.309
Peru	1	32,551,815	0.307
Morocco	1	36,191,805	0.276
Iraq	1	39,339,753	0.254
Turkey	2	81,916,871	0.244
Iran	2	82,011,735	0.244
Mexico	3	130,759,074	0.229
Kenya	1	50,950,879	0.196
Myanmar	1	53,855,735	0.186
DR Congo	1	84,004,989	0.119
Vietnam	1	96,491,146	0.104
Pakistan	2	200,813,818	0.100
Ethiopia	1	109,224,410	0.092
India	11	1,354,051,854	0.081
China	9	1,415,045,928	0.064
Bangladesh	1	166,368,149	0.060
Nigeria	1	195,875,237	0.051

In [12]:

# The first method to remove words

prizes4 = [line for line in prizes3 if not ('World' in line or 'Europe'in line)]
print(*prizes4)

Out[12]:

Rank	Entity	Nobel laureates	Population (2018)	Laureates/ 10 million
 —	 Faroe Islands	1	49,489	202.065
Saint Lucia	2	179,667	111.317
Luxembourg	2	590,321	33.880
 Switzerland	28	8,544,034	32.771
Sweden	30	9,982,709	30.052
Iceland	1	337,780	29.605
Austria	22	8,751,820	25.138
Denmark	14	5,754,356	24.329
Norway	13	5,353,363	24.284
United Kingdom	133	66,573,504	19.429
East Timor	2	1,324,094	15.105
Ireland	7	4,803,748	14.572
Israel	12	8,452,841	14.196
Hungary	13	9,688,847	13.417
Germany	108	82,293,457	13.245
United States	383	326,766,748	11.721
Netherlands	21	17,084,459	11.707
France	70	65,233,271	10.664
Finland	5	5,542,517	9.021
Belgium	10	11,498,519	8.697
Cyprus	1	1,189,085	8.410
Trinidad and Tobago	1	1,372,598	7.285
Canada	25	36,953,765	6.765
New Zealand	3	4,749,598	6.316
Bosnia and Herzegovina	2	3,503,554	5.708
Latvia	1	1,929,938	5.182
Poland	19	38,104,832	4.986
Australia	12	24,772,247	4.844
Slovenia	1	2,081,260	4.805
North Macedonia	1	2,085,051	4.796
Czech Republic	5	10,625,250	4.706
Liberia	2	4,853,516	4.121
Lithuania	1	2,876,475	3.476
Italy	20	59,290,969	3.373
 —	 Tibet	1	3,310,836	3.020
Croatia	1	4,164,783	2.401
Japan	28	127,185,332	2.202
Belarus	2	9,452,113	2.116
Romania	4	19,580,634	2.043
Costa Rica	1	4,953,199	2.019
Palestine	1	5,052,776	1.979
Portugal	2	10,291,196	1.943
Greece	2	11,142,161	1.795
South Africa	10	57,398,421	1.742
Spain	8	46,397,452	1.724
Russia	23	143,964,709	1.598
Bulgaria	1	7,036,848	1.421
 —	 Hong Kong	1	7,428,887	1.346
Guatemala	2	17,245,346	1.160
Argentina	5	44,688,864	1.119
Chile	2	18,197,209	1.099
Azerbaijan	1	9,923,914	1.008
Algeria	2	42,008,054	0.476
Ukraine	2	44,009,214	0.454
Taiwan	1	23,694,089	0.422
Colombia	2	49,464,683	0.404
Egypt	4	99,375,741	0.403
South Korea	2	51,164,435	0.391
Yemen	1	28,915,284	0.346
Ghana	1	29,463,643	0.339
Venezuela	1	32,381,221	0.309
Peru	1	32,551,815	0.307
Morocco	1	36,191,805	0.276
Iraq	1	39,339,753	0.254
Turkey	2	81,916,871	0.244
Iran	2	82,011,735	0.244
Mexico	3	130,759,074	0.229
Kenya	1	50,950,879	0.196
Myanmar	1	53,855,735	0.186
DR Congo	1	84,004,989	0.119
Vietnam	1	96,491,146	0.104
Pakistan	2	200,813,818	0.100
Ethiopia	1	109,224,410	0.092
India	11	1,354,051,854	0.081
China	9	1,415,045,928	0.064
Bangladesh	1	166,368,149	0.060
Nigeria	1	195,875,237	0.051

In [13]:

prizes4 = [line for line in prizes3 if not re.search('(World)|(Europe)', line)]
prizes4[0] = re.sub('/ ', '/', prizes4[0])
print(*prizes4)

Out[13]:

Rank	Entity	Nobel laureates	Population (2018)	Laureates/10 million
 —	 Faroe Islands	1	49,489	202.065
Saint Lucia	2	179,667	111.317
Luxembourg	2	590,321	33.880
 Switzerland	28	8,544,034	32.771
Sweden	30	9,982,709	30.052
Iceland	1	337,780	29.605
Austria	22	8,751,820	25.138
Denmark	14	5,754,356	24.329
Norway	13	5,353,363	24.284
United Kingdom	133	66,573,504	19.429
East Timor	2	1,324,094	15.105
Ireland	7	4,803,748	14.572
Israel	12	8,452,841	14.196
Hungary	13	9,688,847	13.417
Germany	108	82,293,457	13.245
United States	383	326,766,748	11.721
Netherlands	21	17,084,459	11.707
France	70	65,233,271	10.664
Finland	5	5,542,517	9.021
Belgium	10	11,498,519	8.697
Cyprus	1	1,189,085	8.410
Trinidad and Tobago	1	1,372,598	7.285
Canada	25	36,953,765	6.765
New Zealand	3	4,749,598	6.316
Bosnia and Herzegovina	2	3,503,554	5.708
Latvia	1	1,929,938	5.182
Poland	19	38,104,832	4.986
Australia	12	24,772,247	4.844
Slovenia	1	2,081,260	4.805
North Macedonia	1	2,085,051	4.796
Czech Republic	5	10,625,250	4.706
Liberia	2	4,853,516	4.121
Lithuania	1	2,876,475	3.476
Italy	20	59,290,969	3.373
 —	 Tibet	1	3,310,836	3.020
Croatia	1	4,164,783	2.401
Japan	28	127,185,332	2.202
Belarus	2	9,452,113	2.116
Romania	4	19,580,634	2.043
Costa Rica	1	4,953,199	2.019
Palestine	1	5,052,776	1.979
Portugal	2	10,291,196	1.943
Greece	2	11,142,161	1.795
South Africa	10	57,398,421	1.742
Spain	8	46,397,452	1.724
Russia	23	143,964,709	1.598
Bulgaria	1	7,036,848	1.421
 —	 Hong Kong	1	7,428,887	1.346
Guatemala	2	17,245,346	1.160
Argentina	5	44,688,864	1.119
Chile	2	18,197,209	1.099
Azerbaijan	1	9,923,914	1.008
Algeria	2	42,008,054	0.476
Ukraine	2	44,009,214	0.454
Taiwan	1	23,694,089	0.422
Colombia	2	49,464,683	0.404
Egypt	4	99,375,741	0.403
South Korea	2	51,164,435	0.391
Yemen	1	28,915,284	0.346
Ghana	1	29,463,643	0.339
Venezuela	1	32,381,221	0.309
Peru	1	32,551,815	0.307
Morocco	1	36,191,805	0.276
Iraq	1	39,339,753	0.254
Turkey	2	81,916,871	0.244
Iran	2	82,011,735	0.244
Mexico	3	130,759,074	0.229
Kenya	1	50,950,879	0.196
Myanmar	1	53,855,735	0.186
DR Congo	1	84,004,989	0.119
Vietnam	1	96,491,146	0.104
Pakistan	2	200,813,818	0.100
Ethiopia	1	109,224,410	0.092
India	11	1,354,051,854	0.081
China	9	1,415,045,928	0.064
Bangladesh	1	166,368,149	0.060
Nigeria	1	195,875,237	0.051

In [14]:

# Now we are going to construct the dataframe

# First line contains the headers
# The rest of the data
prizes = pd.DataFrame(columns = prizes4[0].split("\t"),
                      data = [row.split('\t') for row in prizes4[1:]])

# Defaults to displays the first 5 rows
prizes.head()

Out[14]:

In [15]:

# Get ("index, locate") item using only indices
prizes.iloc[4, 1]

Out[15]:

' Sweden'

In [16]:

# Get ("locate") item using category name
prizes.loc[4, "Entity"]

Out[16]:

' Sweden'

In [17]:

# Get all the data from that columns
prizes.loc[:, "Entity"]

Out[17]:

    Faroe Islands
      Saint Lucia
       Luxembourg
      Switzerland
           Sweden
           ...      
        Ethiopia
           India
           China
      Bangladesh
         Nigeria
Name: Entity, Length: 77, dtype: object

In [18]:

# Another way to get all data from Entity
prizes['Entity']

Out[18]:

    Faroe Islands
      Saint Lucia
       Luxembourg
      Switzerland
           Sweden
           ...      
        Ethiopia
           India
           China
      Bangladesh
         Nigeria
Name: Entity, Length: 77, dtype: object

In [19]:

prizes3a = pd.DataFrame(columns = prizes3[0].split("\t"),
                      data = [row.split('\t') for row in prizes3[1:]])

prizes3a[16:20]

Out[19]:

In [20]:

euro_mask = (prizes3a["Entity"] == " European Union")
euro_mask[16:20]

Out[20]:

  False
  False
   True
  False
Name: Entity, dtype: bool

In [21]:

prizes3b = prizes3a[euro_mask]
prizes3b

Out[21]:

In [22]:

prizes3c = prizes3a[np.logical_not(euro_mask)]
prizes3c[16:20]

Out[22]:

In [23]:

prizes.loc[4, "Entity"]

Out[23]:

' Sweden'

In [24]:

# Working on a single item

prizes.loc[4, "Entity"] = prizes.loc[4, "Entity"].strip()
prizes.loc[4, "Entity"]

Out[24]:

'Sweden'

In [25]:

# Working on the whole series

# Strip whitespace from every entity
prizes["Entity"] = prizes["Entity"].apply(str.strip)
prizes.loc[6, "Entity"]

# Now we get rid of the extra whitespace from Entity column

Out[25]:

'Austria'

In [26]:

# Now get rid of the extra whitespace from last column
prizes["Laureates/10 million\n"] = prizes["Laureates/10 million\n"].apply(str.strip)
prizes.loc[5, "Laureates/10 million\n"]

Out[26]:

'29.605'

In [27]:

prizes.head()

Out[27]:

In [28]:

prizes.columns[4]

Out[28]:

'Laureates/10 million\n'

In [29]:

prizes.rename(columns = {prizes.columns[4] : prizes.columns[4].strip()}, inplace=True)
prizes.columns[4]

Out[29]:

'Laureates/10 million'

In [30]:

prizes.head()

Out[30]:

In [31]:

prizes.dtypes

Out[31]:

Rank                    object
Entity                  object
Nobel laureates         object
Population (2018)       object
Laureates/10 million    object
dtype: object

In [32]:

# First we got a copy of that dataframe
prizesnums = prizes.copy()

prizesnums["Laureates/10 million"] = pd.to_numeric(prizesnums["Laureates/10 million"])

prizesnums.dtypes

Out[32]:

Rank                     object
Entity                   object
Nobel laureates          object
Population (2018)        object
Laureates/10 million    float64
dtype: object

In [33]:

prizes["Population (2018)"] = prizes["Population (2018)"].apply(str.replace, args=(',', ''))

prizes["Population (2018)"]

Out[33]:

        49489
       179667
       590321
      8544034
      9982709
         ...    
   109224410
  1354051854
  1415045928
   166368149
   195875237
Name: Population (2018), Length: 77, dtype: object

In [34]:

prizesnums["Population (2018)"] = pd.to_numeric(prizesnums["Population (2018)"])

prizesnums.dtypes

Out[34]:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "49,489"

During handling of the above exception, another exception occurred:
ValueError                                Traceback (most recent call last)
<ipython-input-34-d44213833c91> in <module>
----> 1 prizesnums["Population (2018)"] = pd.to_numeric(prizesnums["Population (2018)"])
      2 
      3 prizesnums.dtypes
/usr/local/lib/python3.8/dist-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
    150         coerce_numeric = errors not in ("ignore", "raise")
    151         try:
--> 152             values = lib.maybe_convert_numeric(
    153                 values, set(), coerce_numeric=coerce_numeric
    154             )
pandas/_libs/lib.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "49,489" at position 0

In [35]:

prizesnums = prizes.astype({"Nobel laureates" : np.int64,
                            "Population (2018)" : np.int64,
                            "Laureates/10 million" : np.float64})

prizesnums.dtypes

Out[35]:

Rank                     object
Entity                   object
Nobel laureates           int64
Population (2018)         int64
Laureates/10 million    float64
dtype: object

In [36]:

# By default it tries to plot all numerical columns

prizesnums.plot.bar()
plt.show()

Out[36]:

In [37]:

prizesnums.plot.bar("Entity", "Nobel laureates")
plt.show()

Out[37]:

In [38]:

prizesnums.sort_values("Nobel laureates", ascending=False, inplace=True)
prizesnums.plot.bar("Entity", "Nobel laureates")
plt.show()

Out[38]:

In [39]:

# Mask out the smaller one
prizesnums[prizesnums["Nobel laureates"]>2].plot.bar("Entity", "Nobel laureates")
plt.show()

Out[39]:

In [40]:

# Sorting the per capita value ---> the organge one

prizesnums.sort_values("Laureates/10 million", ascending=False, inplace=True)

# Compare two or more columns
prizesnums[prizesnums["Laureates/10 million"]>5].plot.bar("Entity",
                                                          ["Nobel laureates",
                                                           "Laureates/10 million"])
plt.show()

Out[40]:

In [41]:

prizesnums.plot.scatter("Population (2018)", "Nobel laureates")
plt.show()

Out[41]:

In [42]:

prizesnums[prizesnums["Population (2018)"]<300000000].plot.scatter("Population (2018)",
                                                                   "Nobel laureates")
plt.show()

Out[42]:

In [43]:

merged = consumption.merge(prizesnums, left_on="Country", right_on="Entity", validate="1:1")
merged

Out[43]:

In [44]:

print(prizesnums.shape)
print(consumption.shape)
print(merged.shape)

Out[44]:

(77, 5)
(14, 2)
(12, 7)

In [45]:

merged = consumption.merge(prizesnums, left_on="Country", right_on="Entity",
                           validate="1:1", how="outer", sort=True)
merged

Out[45]:

In [46]:

merged.head(40)

Out[46]:

In [47]:

merged.tail(40)

Out[47]:

In [48]:

consumption.replace("UK", "United Kingdom", inplace=True)
consumption

Out[48]:

In [49]:

prizesnums = prizesnums.append({"Entity" : "Estonia",
                                "Nobel laureates" : 0,
                                "Laureates/10 million" : 0}, ignore_index=True)
prizesnums

Out[49]:

In [50]:

prizesnums.dtypes

Out[50]:

Rank                     object
Entity                   object
Nobel laureates           int64
Population (2018)       float64
Laureates/10 million    float64
dtype: object

In [51]:

merged = consumption.merge(prizesnums, left_on="Country", right_on="Entity", validate="1:1")
merged

Out[51]:

In [52]:

merged.plot.scatter("Consumption", "Laureates/10 million", linewidth = 5)
plt.show()

Out[52]:

In [53]:

merged.corr()

Out[53]:

In [55]:

wo_estonia = merged[ merged["Entity"] != "Estonia"]
wo_estonia.plot.scatter("Consumption", "Laureates/10 million", linewidth=5)
plt.show()
wo_estonia.corr()

Out[55]:

In [59]:

from scipy.stats import linregress     # Also return r and p value

plt.figure(figsize = (10, 8))

(slope, intercept, r, p, _) = linregress(merged["Consumption"],
                                         merged["Laureates/10 million"])

plt.scatter(merged["Consumption"], merged["Laureates/10 million"], linewidth = 5)
plt.plot(merged["Consumption"], slope * merged["Consumption"] + intercept, "k",
         linewidth = 3, label = "Linear Model")

plt.xlabel("Chocolate Consumption (kg/year/person)")
plt.ylabel("Nobel Laureates per capita x 10^6")
plt.legend()

plt.xlim([0, 15])
plt.show()

print("r = ", round(r, 3))
print("p < 0.05 (", round(p, 3), ")")

Out[59]:

r =  0.534
p < 0.05 ( 0.049 )

In [60]:

slope

Out[60]:

1.9518222295023373

In [0]:

Product

Resources

Company