Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Lab11/Scrapper_MEF.ipynb
2714 views
Kernel: Python 3 (ipykernel)

Web scraping: página web Consulta Amigable (MEF)

# import main packages
import re  # regex or regular expression
import time  # time library

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager  # install Chrome driver simulator
##############################################################################
############# NATIONAL, REGIONAL AND LOCAL GOVERNMENT (AGGREGATES) ###########

# Start Chrome through ChromeDriver (it can be downloaded from
# https://chromedriver.chromium.org/downloads). Before that, check which
# Chrome version is installed so the driver matches it.
# The driver path is wrapped in a Service object because passing it
# positionally (executable_path) is deprecated in Selenium 4.
wd = webdriver.Chrome(service=Service("chromedriver.exe"))

url = "http://apps5.mineco.gob.pe/transparencia/Navegador/default.aspx"
wd.get(url)
wd.maximize_window()
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2185176457.py:7: DeprecationWarning: executable_path has been deprecated, please pass in a Service object wd = webdriver.Chrome("chromedriver.exe")
# Year drop-down.
# This lookup is expected to raise an error when run as-is: the control's
# content sits inside another (embedded) web page, so that frame has to be
# selected before the element can be found.
idn = wd.find_element(By.ID, "ctl00_CPH1_DrpYear")
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\566541331.py:2: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead idn =driver.find_element_by_id("ctl00_CPH1_DrpYear")
--------------------------------------------------------------------------- NoSuchWindowException Traceback (most recent call last) Input In [6], in <cell line: 2>() 1 # bottom año ----> 2 idn =driver.find_element_by_id("ctl00_CPH1_DrpYear")
File ~\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py:478, in WebDriver.find_element_by_id(self, id_) 457 """Finds an element by id. 458 459 :Args: (...) 471 element = driver.find_element_by_id('foo') 472 """ 473 warnings.warn( 474 "find_element_by_* commands are deprecated. Please use find_element() instead", 475 DeprecationWarning, 476 stacklevel=2, 477 ) --> 478 return self.find_element(by=By.ID, value=id_)
File ~\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py:1251, in WebDriver.find_element(self, by, value) 1248 by = By.CSS_SELECTOR 1249 value = '[name="%s"]' % value -> 1251 return self.execute(Command.FIND_ELEMENT, { 1252 'using': by, 1253 'value': value})['value']
File ~\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py:430, in WebDriver.execute(self, driver_command, params) 428 response = self.command_executor.execute(driver_command, params) 429 if response: --> 430 self.error_handler.check_response(response) 431 response['value'] = self._unwrap_value( 432 response.get('value', None)) 433 return response
File ~\anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py:247, in ErrorHandler.check_response(self, response) 245 alert_text = value['alert'].get('text') 246 raise exception_class(message, screen, stacktrace, alert_text) # type: ignore[call-arg] # mypy is not smart enough here --> 247 raise exception_class(message, screen, stacktrace)
NoSuchWindowException: Message: no such window: target window already closed from unknown error: web view not found (Session info: chrome=107.0.5304.107) Stacktrace: Backtrace: Ordinal0 [0x00A9ACD3+2075859] Ordinal0 [0x00A2EE61+1633889] Ordinal0 [0x0092B7BD+571325] Ordinal0 [0x00912E1A+470554] Ordinal0 [0x0097AA0B+895499] Ordinal0 [0x0098AC96+961686] Ordinal0 [0x00977136+880950] Ordinal0 [0x0094FEFD+720637] Ordinal0 [0x00950F3F+724799] GetHandleVerifier [0x00D4EED2+2769538] GetHandleVerifier [0x00D40D95+2711877] GetHandleVerifier [0x00B2A03A+521194] GetHandleVerifier [0x00B28DA0+516432] Ordinal0 [0x00A3682C+1665068] Ordinal0 [0x00A3B128+1683752] Ordinal0 [0x00A3B215+1683989] Ordinal0 [0x00A46484+1729668] BaseThreadInitThunk [0x75A16939+25] RtlGetFullPathName_UEx [0x770E8FD2+1218] RtlGetFullPathName_UEx [0x770E8F9D+1165]
# Select the embedded frame, then choose the year option.
frame = wd.find_element(By.ID, "frame0")
wd.switch_to.frame(frame)
time.sleep(1.5)

# Pick year 2016 in the year drop-down.
periodo = Select(wd.find_element(By.ID, "ctl00_CPH1_DrpYear"))
periodo.select_by_value(str(2016))
time.sleep(2)

# The inner web page must be selected again after the year has been chosen.
frame = wd.find_element(By.ID, "frame0")
wd.switch_to.frame(frame)
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2771775919.py:3: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead frame = wd.find_element_by_id("frame0") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2771775919.py:10: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead periodo = Select(wd.find_element_by_id("ctl00_CPH1_DrpYear")) C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2771775919.py:18: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead frame = wd.find_element_by_id("frame0")
# Click the "government level" button.
niv_gob = wd.find_element(By.NAME, "ctl00$CPH1$BtnTipoGobierno")
niv_gob.click()
time.sleep(1.5)

# Choose a government level by clicking the first row (tr0) of the table.
# NOTE(review): this cell's original comment says the row is "local
# governments", while the download loop labels the same row "national" --
# confirm which level tr0 actually is on the page.
wd.find_element(By.XPATH, "//*[@id='tr0']/td[2]").click()
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2116795948.py:3: DeprecationWarning: find_element_by_name is deprecated. Please use find_element(by=By.NAME, value=name) instead niv_gob = wd.find_element_by_name("ctl00$CPH1$BtnTipoGobierno") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\2116795948.py:10: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead wd.find_element_by_xpath("//*[@id='tr0']/td[1]").click()
# Download loop: for every year and every selected department, drill down to
# the project listing and export it to Excel.
# The driver path goes through a Service object because the positional
# executable_path argument is deprecated in Selenium 4.
wd = webdriver.Chrome(service=Service("chromedriver.exe"))
wd.maximize_window()

# Row indices of the departments to download. In this case three departments
# were selected: Apurimac, Ayacucho and Cajamarca.
lista_regiones = [2, 4, 5]

for k in range(2015, 2021):  # years 2015 - 2020
    wd.get("http://apps5.mineco.gob.pe/transparencia/Navegador/default.aspx")
    print(k)

    # Choose the year in the drop-down, which lives inside the embedded frame.
    frame = wd.find_element(By.ID, "frame0")
    wd.switch_to.frame(frame)
    periodo = Select(wd.find_element(By.ID, "ctl00_CPH1_DrpYear"))
    periodo.select_by_value(str(k))

    # The frame is selected again after the year has been chosen, then the
    # government level that executes the projects is picked.
    # NOTE(review): the original comments call the tr0 row both "local" and
    # "national" government -- confirm which level it is on the page.
    frame = wd.find_element(By.ID, "frame0")
    wd.switch_to.frame(frame)
    niv_gob = wd.find_element(By.NAME, "ctl00$CPH1$BtnTipoGobierno")
    niv_gob.click()
    time.sleep(1.5)
    wd.find_element(By.XPATH, "//*[@id='tr0']/td[2]").click()
    time.sleep(1.5)

    # Open the "function" breakdown.
    wd.find_element(By.ID, "ctl00_CPH1_BtnFuncion").click()
    time.sleep(2)

    # Select the agriculture function (row chosen by alphabetical order).
    wd.find_element(By.XPATH, "//*[@id='tr6']/td[1]").click()
    time.sleep(1.5)

    # Open the "department" breakdown.
    dptos = wd.find_element(By.NAME, "ctl00$CPH1$BtnDepartamentoMeta")
    dptos.click()
    time.sleep(2)

    # Visit each selected department in turn.
    for i in lista_regiones:
        # Read the row label and print it to the console to track progress.
        region = wd.find_element(By.XPATH, f"//*[@id='tr{i}']/td[2]").text
        print(region)
        wd.find_element(By.ID, f"tr{i}").click()
        time.sleep(3)

        # Open the project listing.
        time.sleep(3)
        wd.find_element(By.ID, "ctl00_CPH1_BtnProdProy").click()

        # Download the Excel file.
        wd.find_element(By.ID, "ctl00_CPH1_lbtnExportar").click()
        time.sleep(3)

        # Go back so another department can be chosen.
        otro_dep = wd.find_element(
            By.XPATH, '//*[@id="ctl00_CPH1_RptHistory_ctl04_TD0"]'
        )
        otro_dep.click()
        time.sleep(3)
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:2: DeprecationWarning: executable_path has been deprecated, please pass in a Service object wd = webdriver.Chrome("chromedriver.exe")
2015
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:11: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead frame = wd.find_element_by_id("frame0") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:14: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead periodo = Select(wd.find_element_by_id("ctl00_CPH1_DrpYear")) C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:19: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead frame = wd.find_element_by_id("frame0") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:22: DeprecationWarning: find_element_by_name is deprecated. Please use find_element(by=By.NAME, value=name) instead niv_gob = wd.find_element_by_name("ctl00$CPH1$BtnTipoGobierno") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:27: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead gob_nac = wd.find_element_by_xpath("//*[@id='tr0']/td[2]").click() C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:33: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead wd.find_element_by_id("ctl00_CPH1_BtnFuncion").click() C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:39: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead funcion = wd.find_element_by_xpath("//*[@id='tr6']/td[1]").click() C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:46: DeprecationWarning: find_element_by_name is deprecated. Please use find_element(by=By.NAME, value=name) instead dptos = wd.find_element_by_name("ctl00$CPH1$BtnDepartamentoMeta") C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:57: DeprecationWarning: find_element_by_xpath is deprecated. 
Please use find_element(by=By.XPATH, value=xpath) instead region = wd.find_element_by_xpath("//*[@id='tr"+str(i)+"']/td[2]").text C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:60: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead wd.find_element_by_id("tr"+str(i)+"").click()
03: APURIMAC
C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:68: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead wd.find_element_by_id("ctl00_CPH1_BtnProdProy").click() C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:71: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead d_excel = wd.find_element_by_id("ctl00_CPH1_lbtnExportar").click() C:\Users\Roberto\AppData\Local\Temp\ipykernel_19520\3061119188.py:76: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead otro_dep = wd.find_element_by_xpath('//*[@id="ctl00_CPH1_RptHistory_ctl04_TD0"]')
05: AYACUCHO 06: CAJAMARCA 2016
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Input In [12], in <cell line: 5>() 31 # Seleccionando Función (en este caso: Agropecuaria) 33 wd.find_element_by_id("ctl00_CPH1_BtnFuncion").click() ---> 35 time.sleep(2) 37 # Seleccionar agropecuaria 39 funcion = wd.find_element_by_xpath("//*[@id='tr6']/td[1]").click() KeyboardInterrupt: