GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_12/python-notebooks-data-wrangling/Data-Extraction--Texas-Death-Row-Executions.ipynb
Kernel: Python 3
# Extracting death row executions
from bs4 import BeautifulSoup
from os.path import join
from os import makedirs
from urllib.parse import urljoin
import csv
import requests
import re

EXECUTED_URL = 'http://wgetsnaps.github.io/tdcj-state-tx-us--death_row/death_row/dr_executed_offenders.html'
EXECUTED_TABLE_HEADERS = ['inmate_info_url', 'last_words_url', 'last_name', 'first_name',
                          'tdcj_number', 'executed_age', 'executed_date', 'race', 'county']
# maps our CSV field names to the label text on each inmate's detail page
INMATE_FIELDS_TO_EXTRACT = {
    'birthdate': 'Date of Birth',
    'date_offense': 'Date of Offense',
    'date_received': 'Date Received',
    'gender': 'Gender',
}
FILE_HEADERS = EXECUTED_TABLE_HEADERS + list(INMATE_FIELDS_TO_EXTRACT.keys())

# set up the directory/filename
DATA_DIR = join('data', 'tx-death-penalty', 'extracted')
DEST_FILENAME = join(DATA_DIR, 'texas-executed.csv')
makedirs(DATA_DIR, exist_ok=True)
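Before parsing, a quick sanity check that the snapshot URL is still reachable (a sketch; `resp` is just a throwaway name here):

resp = requests.get(EXECUTED_URL)
resp.raise_for_status()  # raises if the request failed or the snapshot moved
print(resp.status_code, len(resp.content), 'bytes')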
executed_html = requests.get(EXECUTED_URL).text
executed_doc = BeautifulSoup(executed_html, 'lxml')
executed_rows = executed_doc.select('table.os tr')[1:]  # skip the first row of headers
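A quick look at the first parsed row confirms the selector and the column layout assumed throughout: execution number, two link columns, then last name, first name, TDCJ number, age, date, race, county.

print(len(executed_rows), 'data rows')
sample_cells = executed_rows[0].find_all('td')
print([td.text.strip() for td in sample_cells])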
wf = open(DEST_FILENAME, 'w', newline='')  # newline='' as the csv module recommends for writers
csvfile = csv.DictWriter(wf, fieldnames=FILE_HEADERS, restval="")
csvfile.writeheader()
for row in executed_rows:
    cols = row.find_all('td')[1:]  # skip the first column (execution number)
    # map the remaining cell text onto the column headers
    d = dict(zip(EXECUTED_TABLE_HEADERS, [td.text.strip() for td in cols]))
    # the two link columns need actual hrefs, resolved against the page URL
    d['inmate_info_url'] = urljoin(EXECUTED_URL, cols[0].find('a')['href'])
    d['last_words_url'] = urljoin(EXECUTED_URL, cols[1].find('a')['href'])
    # write to CSV
    csvfile.writerow(d)
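One fragile spot in the loop above: `cols[0].find('a')` returns None when a link cell has no anchor, and the `['href']` lookup then raises a TypeError. A defensive variant could route both link lookups through a small helper (a sketch; `link_or_blank` is hypothetical, and whether the snapshot actually contains such rows is an assumption):

def link_or_blank(td):
    # resolve the cell's link against the page URL, or return '' if the
    # cell has no <a> tag (hypothetical helper)
    a = td.find('a')
    return urljoin(EXECUTED_URL, a['href']) if a else ''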
print("Wrote", len(executed_rows), 'rows in:', DEST_FILENAME)
Wrote 531 rows in: data/tx-death-penalty/extracted/texas-executed.csv
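Reading the file back is a cheap way to verify the export; since `wf` is still open at this point, flush it first (a sketch, printing three arbitrary columns):

wf.flush()  # make sure buffered rows hit disk before reading back
with open(DEST_FILENAME) as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        if i == 3:
            break
        print(row['last_name'], row['executed_date'], row['county'])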
A second pass over the index table, this time parsed with lxml, follows each row's offender-information link to pull the detail fields before writing the row:

from lxml import html as htmlparser

EXECUTED_TDCJ_INDEX = 5  # position of the TDCJ number column, inferred from the layout used below

index_txt = requests.get(EXECUTED_URL).text
index_doc = htmlparser.fromstring(index_txt)

for tr in index_doc.xpath('//table[@class="os"]/tbody/tr[td]'):
    cols = [td.text_content().strip() for td in tr.xpath('td')]
    d = {'tdcj_number': cols[EXECUTED_TDCJ_INDEX]}
    d['last_name'] = cols[3]
    d['first_name'] = cols[4]
    d['executed_date'] = cols[7]  # must match the 'executed_date' field name in FILE_HEADERS
    d['race'] = cols[8]
    d['county'] = cols[9]
    # attempt to fetch data from the inmate info page
    hrefs = tr.xpath('td[2]/a/@href')
    inmate_url = urljoin(EXECUTED_URL, hrefs[0]) if hrefs else ''
    if '.html' in inmate_url:
        try:
            inmate_soup = BeautifulSoup(requests.get(inmate_url).text, 'lxml')
            # look up each detail field by its label text on the inmate page
            for field, label in INMATE_FIELDS_TO_EXTRACT.items():
                cell = inmate_soup.find('td', text=re.compile(label))
                if cell:
                    d[field] = cell.find_next_sibling('td').text.strip()
        except Exception as err:
            print(inmate_url, err)
    # finally, write the row
    csvfile.writerow(d)

wf.close()  # all rows written; close the CSV file
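The loop above issues one request per inmate page, so re-running the cell re-fetches everything. A small throttled, cached fetcher could be swapped in for the `requests.get(inmate_url)` call (a sketch; `fetch_politely` is a hypothetical helper and the 0.5s delay is an arbitrary choice):

import time

def fetch_politely(url, delay=0.5, _cache={}):
    # in-memory cache plus a courtesy delay between fresh requests;
    # the mutable default argument is used deliberately as the cache
    if url not in _cache:
        time.sleep(delay)
        _cache[url] = requests.get(url).text
    return _cache[url]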