Path: blob/main/Trabajo_grupal/WG6/Grupo_8_py.ipynb
2714 views
Kernel: Python 3 (ipykernel)
1) Merge Dataset
In [1]:
In [2]:
In [3]:
Out[3]:
dict_keys(['dominio', 'estrato', 'tipenc', 'result', 'panel', 'p22', 'p24a', 'p24b', 'p25_1', 'p25_2', 'p25_3', 'p25_4', 'p25_5', 'p101', 'p102', 'p103', 'p103a', 'p104b1', 'p104b2', 'p105a', 'p106a', 'p106b', 'p107b1', 'p107c11', 'p107c12', 'p107c13', 'p107c14', 'p107c16', 'p107c17', 'p107c18', 'p107c19', 'p107c110', 'p107b2', 'p107c21', 'p107c22', 'p107c23', 'p107c24', 'p107c26', 'p107c27', 'p107c28', 'p107c29', 'p107c210', 'p107b3', 'p107c31', 'p107c32', 'p107c33', 'p107c34', 'p107c36', 'p107c37', 'p107c38', 'p107c39', 'p107c310', 'p107b4', 'p107c41', 'p107c42', 'p107c43', 'p107c44', 'p107c46', 'p107c47', 'p107c48', 'p107c49', 'p107c410', 'p107e', 'p110', 'p110a1', 'p110c', 'p110d', 'p110e', 'p110f', 'p110g', 'p111a', 'p1121', 'p1123', 'p1124', 'p1125', 'p1126', 'p1127', 'p112a', 'p1131', 'p1132', 'p1133', 'p1135', 'p1136', 'p1139', 'p1137', 'p1138', 'p113a', 'p1141', 'p1142', 'p1143', 'p1144', 'p1145', 'p1171_01', 'p1171_02', 'p1171_03', 'p1171_04', 'p1171_05', 'p1171_06', 'p1171_07', 'p1171_08', 'p1171_09', 'p1171_10', 'p1171_11', 'p1171_12', 'p1171_13', 'p1171_14', 'p1171_15', 'p1171_16', 'p1175_01', 'p1175_02', 'p1175_03', 'p1175_04', 'p1175_05', 'p1175_06', 'p1175_07', 'p1175_08', 'p1175_09', 'p1175_10', 'p1175_11', 'p1175_12', 'p1175_13', 'p1175_14', 'p1175_15', 'p1175_16', 'p612i1', 'p612i2', 't110', 't111a', 'nbi1', 'nbi2', 'nbi3', 'nbi4', 'nbi5', 'ticuest01', 'tipocuestionario', 'tipoentrevista', 'rechazo_razones'])
In [5]:
In [6]:
Out[6]:
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\3416170526.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
merge_append = merge2020.append(merge2019, ignore_index = True) #ignore_index= True para que no haya conflictos de indexing
C:\Users\SABINA\anaconda3\lib\site-packages\pandas\io\stata.py:2491: InvalidColumnName:
Not all pandas column names were valid Stata variable names.
The following replacements have been made:
aÑo_x -> a_o_x
aÑo_y -> a_o_y
If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
warnings.warn(ws, InvalidColumnName)
In [7]:
Out[7]:
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\2700305781.py:3: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
merge_append["ingreso_mensual"] = merge_append["inghog1d"]/(12*merge_append["mieperho"])
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\2700305781.py:5: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
merge_append["gasto_mensual"] = merge_append["gashog2d"]/(12*merge_append["mieperho"])
In [8]:
Out[8]:
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:5: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
merge_append["ingreso_mensual_defl"] = merge_append["ingreso_mensual"] * merge_append["ld"]
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:12: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
merge_append['departamento'] = merge_append['ubigeo_x'].str[:2]
C:\Users\SABINA\AppData\Local\Temp\ipykernel_35548\1517560721.py:13: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
merge_append['departamento6'] = merge_append['ubigeo_x'].str[:2]+"0000"
In [9]:
Out[9]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [9], in <cell line: 3>()
1 #Merge entre merge_append y el deflactor temporal:
----> 3 merge_def_temporal_2020 = pd.merge(merge_append, deflactores_base2020_new ,
4 left_on = ["departamento", "aÑo_x"],
5 right_on = ["dpto","aniorec"],
6 how = "left",
7 validate = "1:1")
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:107
, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
90 @Substitution("\nleft : DataFrame or named Series")
91 @Appender(_merge_doc, indents=0)
92 def merge(
(...)
105 validate: str | None = None,
106 ) -> DataFrame:
--> 107 op = _MergeOperation(
108 left,
109 right,
110 how=how,
111 on=on,
112 left_on=left_on,
113 right_on=right_on,
114 left_index=left_index,
115 right_index=right_index,
116 sort=sort,
117 suffixes=suffixes,
118 copy=copy,
119 indicator=indicator,
120 validate=validate,
121 )
122 return op.get_result()
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:704
, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)
696 (
697 self.left_join_keys,
698 self.right_join_keys,
699 self.join_names,
700 ) = self._get_merge_keys()
702 # validate the merge keys dtypes. We may need to coerce
703 # to avoid incompatible dtypes
--> 704 self._maybe_coerce_merge_keys()
706 # If argument passed to validate,
707 # check if columns specified as unique
708 # are in fact unique.
709 if validate is not None:
File ~\anaconda3\lib\site-packages\pandas\core\reshape\merge.py:1257
, in _MergeOperation._maybe_coerce_merge_keys(self)
1251 # unless we are merging non-string-like with string-like
1252 elif (
1253 inferred_left in string_types and inferred_right not in string_types
1254 ) or (
1255 inferred_right in string_types and inferred_left not in string_types
1256 ):
-> 1257 raise ValueError(msg)
1259 # datetimelikes must match exactly
1260 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
ValueError: You are trying to merge on object and float32 columns. If you wish to proceed you should use pd.concat
In [10]:
Out[10]:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [10], in <cell line: 3>()
1 # Ingreso per capita mensual
----> 3 merge_append["ingreso_mensual_pc"] = merge_append["inghog1d"]/(12*merge_append["mieperho"]*merge_append["ld"]*merge_def_temporal_2020["i00"])
5 # Gasto per capita mensual
7 merge_append["gasto_mensual_pc"] = merge_append["gashog2d"]/(12*merge_append["mieperho"]*merge_append["ld"]*merge_def_temporal_2020["i00"])
NameError: name 'merge_def_temporal_2020' is not defined
2) Salario por hora del trabajador dependiente
In [11]:
Out[11]:
Input In [11]
convert_categoricals=False, iterator=True)
^
SyntaxError: invalid syntax
In [ ]:
3) Groupby
In [ ]:
In [ ]:
In [ ]: