GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_12/python-notebooks-data-wrangling/Data-Extraction--Texas-Death-Row-Executions.ipynb
Kernel: Python 3
# Extracting death row executions
from bs4 import BeautifulSoup
from os.path import join
from os import makedirs
from urllib.parse import urljoin
import csv
import requests
import re

EXECUTED_URL = 'http://wgetsnaps.github.io/tdcj-state-tx-us--death_row/death_row/dr_executed_offenders.html'
EXECUTED_TABLE_HEADERS = ['inmate_info_url', 'last_words_url', 'last_name', 'first_name',
                          'tdcj_number', 'executed_age', 'executed_date', 'race', 'county']
# maps our CSV field names to the label text on each inmate's detail page
INMATE_FIELDS_TO_EXTRACT = {
    'birthdate': 'Date of Birth',
    'date_offense': 'Date of Offense',
    'date_received': 'Date Received',
    'gender': 'Gender',
}
FILE_HEADERS = EXECUTED_TABLE_HEADERS + list(INMATE_FIELDS_TO_EXTRACT.keys())

# set up the directory/filename
DATA_DIR = join('data', 'tx-death-penalty', 'extracted')
DEST_FILENAME = join(DATA_DIR, 'texas-executed.csv')
makedirs(DATA_DIR, exist_ok=True)
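Before parsing, a quick sanity check that the snapshot URL is still reachable (a sketch; `resp` is just a throwaway name here):

resp = requests.get(EXECUTED_URL)
resp.raise_for_status()  # raises if the request failed or the snapshot moved
print(resp.status_code, len(resp.content), 'bytes')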
executed_html = requests.get(EXECUTED_URL).text
executed_doc = BeautifulSoup(executed_html, 'lxml')
executed_rows = executed_doc.select('table.os tr')[1:]  # skip the first row of headers
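A quick look at the first parsed row confirms the selector and the column layout assumed throughout: execution number, two link columns, then last name, first name, TDCJ number, age, date, race, county.

print(len(executed_rows), 'data rows')
sample_cells = executed_rows[0].find_all('td')
print([td.text.strip() for td in sample_cells])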
wf = open(DEST_FILENAME, 'w', newline='')  # newline='' as the csv module recommends for writers
csvfile = csv.DictWriter(wf, fieldnames=FILE_HEADERS, restval="")
csvfile.writeheader()
for row in executed_rows:
    cols = row.find_all('td')[1:]  # skip the first column (execution number)
    # map the remaining cell text onto the column headers
    d = dict(zip(EXECUTED_TABLE_HEADERS, [td.text.strip() for td in cols]))
    # the two link columns need actual hrefs, resolved against the page URL
    d['inmate_info_url'] = urljoin(EXECUTED_URL, cols[0].find('a')['href'])
    d['last_words_url'] = urljoin(EXECUTED_URL, cols[1].find('a')['href'])
    # write to CSV
    csvfile.writerow(d)
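One fragile spot in the loop above: `cols[0].find('a')` returns None when a link cell has no anchor, and the `['href']` lookup then raises a TypeError. A defensive variant could route both link lookups through a small helper (a sketch; `link_or_blank` is hypothetical, and whether the snapshot actually contains such rows is an assumption):

def link_or_blank(td):
    # resolve the cell's link against the page URL, or return '' if the
    # cell has no <a> tag (hypothetical helper)
    a = td.find('a')
    return urljoin(EXECUTED_URL, a['href']) if a else ''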
print("Wrote", len(executed_rows), 'rows in:', DEST_FILENAME)
Wrote 531 rows in: data/tx-death-penalty/extracted/texas-executed.csv
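Reading the file back is a cheap way to verify the export; since `wf` is still open at this point, flush it first (a sketch, printing three arbitrary columns):

wf.flush()  # make sure buffered rows hit disk before reading back
with open(DEST_FILENAME) as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        if i == 3:
            break
        print(row['last_name'], row['executed_date'], row['county'])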
A second pass over the index table, this time parsed with lxml, follows each row's offender-information link to pull the detail fields before writing the row:

from lxml import html as htmlparser

EXECUTED_TDCJ_INDEX = 5  # position of the TDCJ number column, inferred from the layout used below

index_txt = requests.get(EXECUTED_URL).text
index_doc = htmlparser.fromstring(index_txt)

for tr in index_doc.xpath('//table[@class="os"]/tbody/tr[td]'):
    cols = [td.text_content().strip() for td in tr.xpath('td')]
    d = {'tdcj_number': cols[EXECUTED_TDCJ_INDEX]}
    d['last_name'] = cols[3]
    d['first_name'] = cols[4]
    d['executed_date'] = cols[7]  # must match the 'executed_date' field name in FILE_HEADERS
    d['race'] = cols[8]
    d['county'] = cols[9]
    # attempt to fetch data from the inmate info page
    hrefs = tr.xpath('td[2]/a/@href')
    inmate_url = urljoin(EXECUTED_URL, hrefs[0]) if hrefs else ''
    if '.html' in inmate_url:
        try:
            inmate_soup = BeautifulSoup(requests.get(inmate_url).text, 'lxml')
            # look up each detail field by its label text on the inmate page
            for field, label in INMATE_FIELDS_TO_EXTRACT.items():
                cell = inmate_soup.find('td', text=re.compile(label))
                if cell:
                    d[field] = cell.find_next_sibling('td').text.strip()
        except Exception as err:
            print(inmate_url, err)
    # finally, write the row
    csvfile.writerow(d)

wf.close()  # all rows written; close the CSV file
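The loop above issues one request per inmate page, so re-running the cell re-fetches everything. A small throttled, cached fetcher could be swapped in for the `requests.get(inmate_url)` call (a sketch; `fetch_politely` is a hypothetical helper and the 0.5s delay is an arbitrary choice):

import time

def fetch_politely(url, delay=0.5, _cache={}):
    # in-memory cache plus a courtesy delay between fresh requests;
    # the mutable default argument is used deliberately as the cache
    if url not in _cache:
        time.sleep(delay)
        _cache[url] = requests.get(url).text
    return _cache[url]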