Path: blob/main/Trabajo_grupal/WG5/Grupo_3_Python
2714 views
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting pyreadstat\n",
" Downloading pyreadstat-1.1.9-cp38-cp38-win_amd64.whl (2.4 MB)\n",
" ---------------------------------------- 2.4/2.4 MB 9.5 MB/s eta 0:00:00\n",
"Requirement already satisfied: pandas>=1.2.0 in c:\\users\\aldai\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pyreadstat) (1.4.4)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\aldai\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=1.2.0->pyreadstat) (2022.2.1)\n",
"Requirement already satisfied: numpy>=1.18.5 in c:\\users\\aldai\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=1.2.0->pyreadstat) (1.23.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\aldai\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=1.2.0->pyreadstat) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\aldai\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from python-dateutil>=2.8.1->pandas>=1.2.0->pyreadstat) (1.16.0)\n",
"Installing collected packages: pyreadstat\n",
"Successfully installed pyreadstat-1.1.9\n"
]
}
],
"source": [
"# En caso no contar con el paquete pyreadstat, lo instalamos con el siguiente comando\n",
"!pip install pyreadstat"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#Importamos el paquete pyreadstat\n",
"import pyreadstat"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>MES</th>\n",
" <th>CONGLOME</th>\n",
" <th>VIVIENDA</th>\n",
" <th>HOGAR</th>\n",
" <th>CODPERSO</th>\n",
" <th>UBIGEO</th>\n",
" <th>DOMINIO</th>\n",
" <th>ESTRATO</th>\n",
" <th>P201P</th>\n",
" <th>P203</th>\n",
" <th>P203A</th>\n",
" <th>P203B</th>\n",
" <th>P204</th>\n",
" <th>P205</th>\n",
" <th>P206</th>\n",
" <th>P207</th>\n",
" <th>P208A</th>\n",
" <th>P208B</th>\n",
" <th>P209</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020</td>\n",
" <td>06</td>\n",
" <td>010108</td>\n",
" <td>136</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" <td>100111</td>\n",
" <td>5.0</td>\n",
" <td>2.0</td>\n",
" <td>20200101081361102</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>43.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020</td>\n",
" <td>06</td>\n",
" <td>020257</td>\n",
" <td>093</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" <td>250107</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>20200202570931104</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>17.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2020</td>\n",
" <td>06</td>\n",
" <td>015537</td>\n",
" <td>065</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" <td>040126</td>\n",
" <td>6.0</td>\n",
" <td>1.0</td>\n",
" <td>20200155370651104</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>9.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019</td>\n",
" <td>10</td>\n",
" <td>010614</td>\n",
" <td>043</td>\n",
" <td>11</td>\n",
" <td>07</td>\n",
" <td>130901</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>20190106140431107</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2020</td>\n",
" <td>06</td>\n",
" <td>009390</td>\n",
" <td>096</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" <td>230110</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>20180093900961104</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85030</th>\n",
" <td>2020</td>\n",
" <td>12</td>\n",
" <td>007337</td>\n",
" <td>004</td>\n",
" <td>11</td>\n",
" <td>01</td>\n",
" <td>140105</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>20200073370041101</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>65.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85031</th>\n",
" <td>2019</td>\n",
" <td>04</td>\n",
" <td>008226</td>\n",
" <td>084</td>\n",
" <td>13</td>\n",
" <td>02</td>\n",
" <td>160112</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>20190082260841302</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>89.0</td>\n",
" <td>NaN</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85032</th>\n",
" <td>2019</td>\n",
" <td>05</td>\n",
" <td>008965</td>\n",
" <td>060</td>\n",
" <td>11</td>\n",
" <td>03</td>\n",
" <td>210101</td>\n",
" <td>6.0</td>\n",
" <td>2.0</td>\n",
" <td>20190089650601103</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>20.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85033</th>\n",
" <td>2020</td>\n",
" <td>05</td>\n",
" <td>010311</td>\n",
" <td>022</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" <td>200501</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>20200103110221102</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>48.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85034</th>\n",
" <td>2020</td>\n",
" <td>11</td>\n",
" <td>018778</td>\n",
" <td>155</td>\n",
" <td>11</td>\n",
" <td>06</td>\n",
" <td>160404</td>\n",
" <td>7.0</td>\n",
" <td>7.0</td>\n",
" <td>20200187781551106</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>22.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>85035 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" year MES CONGLOME VIVIENDA HOGAR CODPERSO UBIGEO DOMINIO ESTRATO \\\n",
"0 2020 06 010108 136 11 02 100111 5.0 2.0 \n",
"1 2020 06 020257 093 11 04 250107 7.0 2.0 \n",
"2 2020 06 015537 065 11 04 040126 6.0 1.0 \n",
"3 2019 10 010614 043 11 07 130901 4.0 4.0 \n",
"4 2020 06 009390 096 11 04 230110 3.0 2.0 \n",
"... ... .. ... ... ... ... ... ... ... \n",
"85030 2020 12 007337 004 11 01 140105 1.0 2.0 \n",
"85031 2019 04 008226 084 13 02 160112 7.0 2.0 \n",
"85032 2019 05 008965 060 11 03 210101 6.0 2.0 \n",
"85033 2020 05 010311 022 11 02 200501 1.0 3.0 \n",
"85034 2020 11 018778 155 11 06 160404 7.0 7.0 \n",
"\n",
" P201P P203 P203A P203B P204 P205 P206 P207 P208A \\\n",
"0 20200101081361102 2.0 NaN NaN 1.0 2.0 NaN 2.0 43.0 \n",
"1 20200202570931104 3.0 NaN NaN 1.0 2.0 NaN 1.0 17.0 \n",
"2 20200155370651104 3.0 NaN NaN 1.0 2.0 NaN 2.0 9.0 \n",
"3 20190106140431107 7.0 2.0 3.0 2.0 NaN 2.0 2.0 4.0 \n",
"4 20180093900961104 0.0 NaN NaN NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... ... ... ... \n",
"85030 20200073370041101 1.0 1.0 1.0 1.0 2.0 NaN 1.0 65.0 \n",
"85031 20190082260841302 10.0 0.0 1.0 1.0 2.0 NaN 2.0 89.0 \n",
"85032 20190089650601103 3.0 1.0 3.0 1.0 2.0 NaN 1.0 20.0 \n",
"85033 20200103110221102 2.0 NaN NaN 1.0 2.0 NaN 2.0 48.0 \n",
"85034 20200187781551106 3.0 2.0 1.0 1.0 2.0 NaN 1.0 22.0 \n",
"\n",
" P208B P209 \n",
"0 NaN 2.0 \n",
"1 NaN 6.0 \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"... ... ... \n",
"85030 NaN 2.0 \n",
"85031 NaN 3.0 \n",
"85032 NaN 6.0 \n",
"85033 NaN 2.0 \n",
"85034 NaN 1.0 \n",
"\n",
"[85035 rows x 20 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Leemos el archivo .sav y guardamos el dataframe en la variable df\n",
"df, meta = pyreadstat.read_sav(\"../data_administrativa.sav\")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Mostrar las variables que presentan missing values (NA)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['P203A', 'P203B', 'P204', 'P205', 'P206', 'P207', 'P208A', 'P208B', 'P209']\n"
]
}
],
"source": [
"vars_with_na = [var for var in df.columns if df[var].isnull().sum() > 0]\n",
"print (vars_with_na)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Mostrar las etiquetas de dos variables (var labels) y las etiquetas de los valores en dos variables (value's labels)."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CODPERSO</th>\n",
" <th>UBIGEO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>02</td>\n",
" <td>100111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>04</td>\n",
" <td>250107</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CODPERSO UBIGEO\n",
"0 02 100111\n",
"1 04 250107"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[['CODPERSO', 'UBIGEO']][0:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Detectar personas que fueron entrevistadas en ambos años a partir de las variables: conglome, vivienda, hogar y codperso"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CONGLOME</th>\n",
" <th>VIVIENDA</th>\n",
" <th>HOGAR</th>\n",
" <th>CODPERSO</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2632</th>\n",
" <td>006571</td>\n",
" <td>174</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2724</th>\n",
" <td>005278</td>\n",
" <td>048</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2773</th>\n",
" <td>005190</td>\n",
" <td>037</td>\n",
" <td>11</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3135</th>\n",
" <td>006886</td>\n",
" <td>168</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3663</th>\n",
" <td>008611</td>\n",
" <td>048</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84940</th>\n",
" <td>008253</td>\n",
" <td>047</td>\n",
" <td>11</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84945</th>\n",
" <td>007982</td>\n",
" <td>008</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84975</th>\n",
" <td>007221</td>\n",
" <td>043</td>\n",
" <td>11</td>\n",
" <td>03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84982</th>\n",
" <td>008806</td>\n",
" <td>114</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85000</th>\n",
" <td>009338</td>\n",
" <td>017</td>\n",
" <td>11</td>\n",
" <td>03</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4135 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" CONGLOME VIVIENDA HOGAR CODPERSO\n",
"2632 006571 174 11 02\n",
"2724 005278 048 11 04\n",
"2773 005190 037 11 01\n",
"3135 006886 168 11 02\n",
"3663 008611 048 11 04\n",
"... ... ... ... ...\n",
"84940 008253 047 11 01\n",
"84945 007982 008 11 02\n",
"84975 007221 043 11 03\n",
"84982 008806 114 11 04\n",
"85000 009338 017 11 03\n",
"\n",
"[4135 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_filter = df[['CONGLOME', 'VIVIENDA', 'HOGAR', 'CODPERSO']]\n",
"df_duplicados = df_filter[df_filter.duplicated()]\n",
"df_duplicados"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Limpiamos la base de datos de las personas duplicadas\n",
"df = df.drop_duplicates(subset=['CONGLOME', 'VIVIENDA', 'HOGAR', 'CODPERSO'], keep='first')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Ordenar la base de datos a partir de las variables que identifican cada miembro y la variable año (year)."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>MES</th>\n",
" <th>CONGLOME</th>\n",
" <th>VIVIENDA</th>\n",
" <th>HOGAR</th>\n",
" <th>CODPERSO</th>\n",
" <th>UBIGEO</th>\n",
" <th>DOMINIO</th>\n",
" <th>ESTRATO</th>\n",
" <th>P201P</th>\n",
" <th>P203</th>\n",
" <th>P203A</th>\n",
" <th>P203B</th>\n",
" <th>P204</th>\n",
" <th>P205</th>\n",
" <th>P206</th>\n",
" <th>P207</th>\n",
" <th>P208A</th>\n",
" <th>P208B</th>\n",
" <th>P209</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>41445</th>\n",
" <td>2019</td>\n",
" <td>07</td>\n",
" <td>007513</td>\n",
" <td>033</td>\n",
" <td>11</td>\n",
" <td>05</td>\n",
" <td>150135</td>\n",
" <td>8.0</td>\n",
" <td>1.0</td>\n",
" <td>20190075130331105</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66886</th>\n",
" <td>2019</td>\n",
" <td>12</td>\n",
" <td>006717</td>\n",
" <td>038</td>\n",
" <td>11</td>\n",
" <td>02</td>\n",
" <td>110101</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>20170067170381102</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>43.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33971</th>\n",
" <td>2019</td>\n",
" <td>03</td>\n",
" <td>005250</td>\n",
" <td>169</td>\n",
" <td>11</td>\n",
" <td>01</td>\n",
" <td>020105</td>\n",
" <td>5.0</td>\n",
" <td>3.0</td>\n",
" <td>20190052501691101</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>44.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33972</th>\n",
" <td>2019</td>\n",
" <td>08</td>\n",
" <td>008425</td>\n",
" <td>012</td>\n",
" <td>11</td>\n",
" <td>05</td>\n",
" <td>170103</td>\n",
" <td>7.0</td>\n",
" <td>5.0</td>\n",
" <td>20190084250121105</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33973</th>\n",
" <td>2019</td>\n",
" <td>12</td>\n",
" <td>008847</td>\n",
" <td>084</td>\n",
" <td>11</td>\n",
" <td>01</td>\n",
" <td>200601</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>20190088470841101</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>48.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36959</th>\n",
" <td>2020</td>\n",
" <td>12</td>\n",
" <td>017723</td>\n",
" <td>018</td>\n",
" <td>11</td>\n",
" <td>05</td>\n",
" <td>140112</td>\n",
" <td>1.0</td>\n",
" <td>4.0</td>\n",
" <td>20200177230181105</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>32.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36960</th>\n",
" <td>2020</td>\n",
" <td>05</td>\n",
" <td>015946</td>\n",
" <td>495</td>\n",
" <td>11</td>\n",
" <td>04</td>\n",
" <td>050510</td>\n",
" <td>7.0</td>\n",
" <td>7.0</td>\n",
" <td>20200159464951104</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>11.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36962</th>\n",
" <td>2020</td>\n",
" <td>03</td>\n",
" <td>006530</td>\n",
" <td>009</td>\n",
" <td>11</td>\n",
" <td>05</td>\n",
" <td>100111</td>\n",
" <td>5.0</td>\n",
" <td>2.0</td>\n",
" <td>20200065300091105</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>32.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36910</th>\n",
" <td>2020</td>\n",
" <td>09</td>\n",
" <td>008503</td>\n",
" <td>045</td>\n",
" <td>11</td>\n",
" <td>06</td>\n",
" <td>180101</td>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>20180085030451103</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85034</th>\n",
" <td>2020</td>\n",
" <td>11</td>\n",
" <td>018778</td>\n",
" <td>155</td>\n",
" <td>11</td>\n",
" <td>06</td>\n",
" <td>160404</td>\n",
" <td>7.0</td>\n",
" <td>7.0</td>\n",
" <td>20200187781551106</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>22.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>80900 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" year MES CONGLOME VIVIENDA HOGAR CODPERSO UBIGEO DOMINIO ESTRATO \\\n",
"41445 2019 07 007513 033 11 05 150135 8.0 1.0 \n",
"66886 2019 12 006717 038 11 02 110101 2.0 2.0 \n",
"33971 2019 03 005250 169 11 01 020105 5.0 3.0 \n",
"33972 2019 08 008425 012 11 05 170103 7.0 5.0 \n",
"33973 2019 12 008847 084 11 01 200601 1.0 2.0 \n",
"... ... .. ... ... ... ... ... ... ... \n",
"36959 2020 12 017723 018 11 05 140112 1.0 4.0 \n",
"36960 2020 05 015946 495 11 04 050510 7.0 7.0 \n",
"36962 2020 03 006530 009 11 05 100111 5.0 2.0 \n",
"36910 2020 09 008503 045 11 06 180101 3.0 3.0 \n",
"85034 2020 11 018778 155 11 06 160404 7.0 7.0 \n",
"\n",
" P201P P203 P203A P203B P204 P205 P206 P207 P208A \\\n",
"41445 20190075130331105 7.0 2.0 3.0 1.0 2.0 NaN 2.0 0.0 \n",
"66886 20170067170381102 2.0 1.0 2.0 1.0 2.0 NaN 2.0 43.0 \n",
"33971 20190052501691101 1.0 1.0 1.0 1.0 2.0 NaN 1.0 44.0 \n",
"33972 20190084250121105 3.0 1.0 3.0 1.0 2.0 NaN 1.0 0.0 \n",
"33973 20190088470841101 1.0 1.0 1.0 1.0 2.0 NaN 1.0 48.0 \n",
"... ... ... ... ... ... ... ... ... ... \n",
"36959 20200177230181105 3.0 2.0 1.0 1.0 2.0 NaN 2.0 32.0 \n",
"36960 20200159464951104 3.0 NaN NaN 1.0 2.0 NaN 2.0 11.0 \n",
"36962 20200065300091105 3.0 2.0 1.0 1.0 2.0 NaN 1.0 32.0 \n",
"36910 20180085030451103 0.0 NaN NaN NaN NaN NaN NaN NaN \n",
"85034 20200187781551106 3.0 2.0 1.0 1.0 2.0 NaN 1.0 22.0 \n",
"\n",
" P208B P209 \n",
"41445 1.0 NaN \n",
"66886 NaN 2.0 \n",
"33971 NaN 2.0 \n",
"33972 4.0 NaN \n",
"33973 NaN 1.0 \n",
"... ... ... \n",
"36959 NaN 1.0 \n",
"36960 NaN NaN \n",
"36962 NaN 1.0 \n",
"36910 NaN NaN \n",
"85034 NaN 1.0 \n",
"\n",
"[80900 rows x 20 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_neat = df.sort_values('year')\n",
"df_neat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Crear una base de datos para cada año"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Filtramos la base de datos en función al año\n",
"df_2019 = df[df['year'] == '2019']\n",
"df_2020 = df[df['year'] == '2020']\n",
"\n",
"# Exportamos los dataframes generados\n",
"df_2019.to_csv('data_2019_(3).csv')\n",
"df_2020.to_csv('data_2020_(3).csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.0 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1529b77671c723740968c4c73aa794f3b3bf15c122a394072452cb98e28121b2"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}