# This notebook...
import pandas as pd
import numpy as np
# Show every column when a dataframe is displayed.
pd.set_option('display.max_columns', None)

# Let's load the cardiologist file again
given_cardiologists = pd.read_csv("data/given_cardiologists_plus_national_downloadable_file.csv")

# CMS program participation columns to tabulate
cms_program_cols = [
    'Participating in eRx',
    'Participating in PQRS',
    'Participating in EHR',
    'Received PQRS Maintenance of Certification Program Incentive',
    'Participated in Million Hearts',
]

# NOTE: this replaces NaN with the string "NULL" in *every* column of the
# frame, in place -- not only in the program columns. Presumably done so that
# providers with missing flags still show up as their own group in the count
# below (groupby skips NaN keys by default) -- verify before relying on it.
given_cardiologists.replace(np.nan, "NULL", inplace=True)
given_cardiologists.groupby(cms_program_cols).size()

# Let's now look at the aggregate provider utilization data from CMS (source #3 in README.md)
all_provider_agg_util = pd.read_csv(
    "data/data.cms.gov_4a3h-46r6_CY2015-Aggregate-Utilization.csv",
    low_memory=False)

# and join the all_provider_agg_util data with the given_cardiologists data into a new dataframe
given_cardiologists_plus_agg_util = pd.merge(
    given_cardiologists, all_provider_agg_util,
    how='left',
    left_on='npi', right_on='National Provider Identifier',
    suffixes=('_ndf', '_agg'))

# write this merge to the file system for safe keeping
given_cardiologists_plus_agg_util.to_csv("data/given_cardiologists_plus_agg_util.csv")

# we may want to look at all cardiologists' aggregate information, so let's
# bring in the original source #1 dataset and join it with this one as well
all_providers = pd.read_csv(
    "data/data.medicare.gov_s63f-csi6_National-Downloadable_File.csv",
    low_memory=False)
all_cardiologists = all_providers[all_providers['Primary specialty'] == 'CARDIOVASCULAR DISEASE (CARDIOLOGY)']
all_cardiologists_in_ca = all_cardiologists[all_cardiologists['State'] == 'CA']
all_cardiologists_in_ca_plus_agg_util = pd.merge(
    all_cardiologists_in_ca, all_provider_agg_util,
    how='left',
    left_on='npi', right_on='National Provider Identifier',
    suffixes=('_ndf', '_agg'))

# So what data do we have to work with now?
given_cardiologists_plus_agg_util

# How many CA cardiologists can we compare these 8 with?
# print(...) with a single parenthesised argument prints the same text on
# Python 2 and is also valid on Python 3, unlike the bare print statement.
print("There are {0} cardiologists in CA that we will compare these {1} with".format(
    len(all_cardiologists_in_ca_plus_agg_util),
    len(given_cardiologists_plus_agg_util)))
# We will use plotly for interactive charting and visual analytics
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
# helper method for creating hover over tooltips/labels
def list_of_provider_labels(df):
    """Return one hover-label string per row of ``df``.

    Each label has the form ``"NPI: <npi><br />Name: <last>, <first>"`` and is
    built from the ``'npi'``, ``'Last Name'`` and ``'First Name'`` columns.
    Labels are returned in row order; an empty frame gives an empty list.

    Raises KeyError if any of the three columns is missing.
    """
    # Walk the three needed columns in lockstep instead of using iterrows(),
    # which builds a full Series for every row and may change value dtypes.
    return [
        "NPI: {0}<br />Name: {1}, {2}".format(npi, last_name, first_name)
        for npi, last_name, first_name
        in zip(df['npi'], df['Last Name'], df['First Name'])
    ]
# given one df, this produces a two-column layout: [boxplot | scatterplot]
## Example usage:
## fig = plotly_box_and_scatter_plot(given_cardiologists_plus_agg_util, 'Graduation year')
## py.iplot(fig)
def plotly_box_and_scatter_plot(df, column_to_graph):
    """Build a plotly figure with a box trace and a markers trace for one column.

    Args:
        df: dataframe holding ``column_to_graph`` plus the columns read by
            ``list_of_provider_labels`` (used for the per-point text).
        column_to_graph: name of the column to plot; also used as the box
            trace name and the figure title.

    Returns:
        A ``go.Figure`` containing both traces, with the legend hidden and
        hovermode set to 'closest'.
    """
    values = df[column_to_graph]

    box_trace = go.Box(name=column_to_graph, y=values, boxmean=True)

    # Every marker gets x = 0, so the points stack in a single vertical strip.
    points_trace = go.Scatter(
        x=np.full(len(df), 0),
        y=values.tolist(),
        mode='markers',
        text=list_of_provider_labels(df),
    )

    figure_layout = go.Layout(
        showlegend=False,
        hovermode='closest',
        title="{0}".format(column_to_graph),
    )
    return go.Figure(data=[box_trace, points_trace], layout=figure_layout)
# given two dfs, this produces a four-column layout: [given boxplot | given scatterplot | all boxplot | all scatterplot]
## Example usage:
## fig = plotly_compare_distribution_of_column(
## column_to_graph = 'Percent (%) of Beneficiaries Identified With Ischemic Heart Disease ',
## given_name = 'Nearby Cardios', given_df = given_cardiologists_plus_agg_util,
## all_name = 'All CA Cardios', all_df = all_cardiologists_in_ca_plus_agg_util,
## y_min = 0, y_max = 100 # these are optional!
## )
## py.iplot(fig)
def _box_and_scatter_traces(df, name, column_to_graph):
    """Return the (box, markers) trace pair for ``column_to_graph`` of ``df``."""
    box = go.Box(
        name=name,
        y=df[column_to_graph],
        boxmean=True
    )
    scatter = go.Scatter(
        name=name,
        x=np.full(len(df), 0),  # all markers share x = 0: one vertical strip
        y=df[column_to_graph].tolist(),
        mode='markers',
        text=list_of_provider_labels(df)
    )
    return box, scatter


def plotly_compare_distribution_of_column(column_to_graph, given_df, given_name, all_df, all_name, y_min=None, y_max=None):
    """Build a 1x4 subplot figure comparing one column across two dataframes.

    Subplot columns, left to right: given box, given markers, all box,
    all markers.

    Args:
        column_to_graph: column name present in both dataframes; also used as
            the figure title.
        given_df: first dataframe to plot.
        given_name: trace name / axis title for ``given_df``.
        all_df: second dataframe to plot.
        all_name: trace name / axis title for ``all_df``.
        y_min, y_max: optional fixed y-range. The range is applied to all four
            y axes only when BOTH are given; otherwise axes stay auto-ranged.

    Returns:
        The figure produced by ``tools.make_subplots`` with the four traces
        appended and the layout updated.
    """
    given_box, given_scatter = _box_and_scatter_traces(given_df, given_name, column_to_graph)
    all_box, all_scatter = _box_and_scatter_traces(all_df, all_name, column_to_graph)

    fig = tools.make_subplots(rows=1, cols=4)
    traces_in_column_order = (given_box, given_scatter, all_box, all_scatter)
    for subplot_col, trace in enumerate(traces_in_column_order, 1):
        fig.append_trace(trace, 1, subplot_col)

    fig['layout'].update(title=column_to_graph, showlegend=True, hovermode='closest')
    # The scatter subplots (columns 2 and 4) get a titled x axis with no ticks.
    fig['layout']['xaxis2'].update(title=given_name, type='category', ticks='', showticklabels=False)
    fig['layout']['xaxis4'].update(title=all_name, type='category', ticks='', showticklabels=False)

    # If both y_min and y_max are given, pin every y axis to that range.
    # Identity checks keep 0 a valid bound and avoid `!= None` comparisons.
    if y_min is not None and y_max is not None:
        y_axis_layout = dict(range=[y_min, y_max], autorange=False)
        for axis_number in range(1, 5):
            fig['layout']['yaxis{0}'.format(axis_number)].update(y_axis_layout)
    return fig
# `display` is called further down, so import it before its first use
# (the original imported it only after the calls).
from IPython.display import display

# Single-group view of the heart-failure percentage column.
fig = plotly_box_and_scatter_plot(given_cardiologists_plus_agg_util, 'Percent (%) of Beneficiaries Identified With Heart Failure ')
py.iplot(fig)

# Same column, given cardiologists vs. all CA cardiologists, auto-ranged axes.
fig = plotly_compare_distribution_of_column(
    column_to_graph = 'Percent (%) of Beneficiaries Identified With Heart Failure ',
    given_name = 'Nearby Cardios', given_df = given_cardiologists_plus_agg_util,
    all_name = 'All CA Cardios', all_df = all_cardiologists_in_ca_plus_agg_util)
py.iplot(fig)

# Two percentage columns with every y axis pinned to 0-100.
fig1 = plotly_compare_distribution_of_column(
    column_to_graph = 'Percent (%) of Beneficiaries Identified With Heart Failure ',
    given_name = 'Nearby Cardios', given_df = given_cardiologists_plus_agg_util,
    all_name = 'All CA Cardios', all_df = all_cardiologists_in_ca_plus_agg_util,
    y_min = 0, y_max = 100)
fig2 = plotly_compare_distribution_of_column(
    column_to_graph = 'Percent (%) of Beneficiaries Identified With Ischemic Heart Disease ',
    given_name = 'Nearby Cardios', given_df = given_cardiologists_plus_agg_util,
    all_name = 'All CA Cardios', all_df = all_cardiologists_in_ca_plus_agg_util,
    y_min = 0, y_max = 100)
display(py.iplot(fig1))
display(py.iplot(fig2))
# Chart several more columns, each comparing the given cardiologists with
# all CA cardiologists.
columns_to_chart = [
    'Number of HCPCS',
    'Number of Services',
    'Number of Medicare Beneficiaries',
    'Number of Medical Services',
    'Average Age of Beneficiaries',
    'Average HCC Risk Score of Beneficiaries ',
    'Number of Beneficiaries With Medicare & Medicaid Entitlement',
]

# Arguments that are identical for every chart in the loop.
comparison_kwargs = dict(
    given_df=given_cardiologists_plus_agg_util,
    given_name='Nearby Cardios',
    all_df=all_cardiologists_in_ca_plus_agg_util,
    all_name='All CA Cardios',
)

for column in columns_to_chart:
    fig = plotly_compare_distribution_of_column(column_to_graph=column, **comparison_kwargs)
    rendered_chart = py.iplot(fig)
    display(rendered_chart)