Path: blob/main/latex-templates/templates/data-science/visualization.tex
\documentclass[a4paper, 11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{siunitx}
\usepackage{float}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage[makestderr]{pythontex}

\title{Data Visualization: Principles and Practice}
\author{Computational Data Science}
\date{\today}

\begin{document}
\maketitle

\begin{abstract}
This document provides a practical survey of data visualization techniques: plot types suited to different data characteristics, color palette design with accessibility in mind, perceptual principles, and dashboard composition. We demonstrate best practices for communicating quantitative information visually.
\end{abstract}

\section{Introduction}
Effective data visualization transforms raw data into visual insights. The choice of visualization depends on:
\begin{itemize}
\item Data type (continuous, categorical, temporal)
\item Relationship being shown (comparison, distribution, composition, relationship)
\item Audience and communication goals
\item Accessibility requirements
\end{itemize}

\section{Computational Environment}
\begin{pycode}
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
np.random.seed(42)

def save_plot(filename, caption):
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    print(r'\begin{figure}[H]')
    print(r'\centering')
    print(r'\includegraphics[width=0.9\textwidth]{' + filename + '}')
    print(r'\caption{' + caption + '}')
    print(r'\end{figure}')
    plt.close()
\end{pycode}
\section{Basic Plot Types}
\subsection{Distribution Visualizations}
\begin{pycode}
# Generate sample data
n = 500
data_normal = np.random.normal(50, 10, n)
data_skewed = np.random.exponential(10, n) + 20
data_bimodal = np.concatenate([np.random.normal(30, 5, n//2),
                               np.random.normal(60, 8, n//2)])

fig, axes = plt.subplots(2, 3, figsize=(12, 8))

# Histogram
axes[0, 0].hist(data_normal, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
axes[0, 0].set_title('Histogram (Normal)')
axes[0, 0].set_xlabel('Value')
axes[0, 0].set_ylabel('Frequency')

# KDE plot
from scipy.stats import gaussian_kde
kde = gaussian_kde(data_skewed)
x_kde = np.linspace(min(data_skewed), max(data_skewed), 200)
axes[0, 1].fill_between(x_kde, kde(x_kde), alpha=0.7, color='coral')
axes[0, 1].plot(x_kde, kde(x_kde), 'darkred', linewidth=2)
axes[0, 1].set_title('KDE Plot (Skewed)')
axes[0, 1].set_xlabel('Value')
axes[0, 1].set_ylabel('Density')

# Box plot comparison
axes[0, 2].boxplot([data_normal, data_skewed, data_bimodal],
                   labels=['Normal', 'Skewed', 'Bimodal'])
axes[0, 2].set_title('Box Plots')
axes[0, 2].set_ylabel('Value')

# Violin plot
parts = axes[1, 0].violinplot([data_normal, data_skewed, data_bimodal],
                              positions=[1, 2, 3], showmeans=True, showmedians=True)
axes[1, 0].set_xticks([1, 2, 3])
axes[1, 0].set_xticklabels(['Normal', 'Skewed', 'Bimodal'])
axes[1, 0].set_title('Violin Plots')
axes[1, 0].set_ylabel('Value')

# ECDF plot
for data, label, color in [(data_normal, 'Normal', 'blue'),
                           (data_skewed, 'Skewed', 'red'),
                           (data_bimodal, 'Bimodal', 'green')]:
    sorted_data = np.sort(data)
    ecdf = np.arange(1, len(sorted_data)+1) / len(sorted_data)
    axes[1, 1].plot(sorted_data, ecdf, label=label, color=color, linewidth=1.5)
axes[1, 1].set_title('ECDF Comparison')
axes[1, 1].set_xlabel('Value')
axes[1, 1].set_ylabel('Cumulative Probability')
axes[1, 1].legend()

# Q-Q plot
stats.probplot(data_normal, dist="norm", plot=axes[1, 2])
axes[1, 2].set_title('Q-Q Plot (Normal)')

for ax in axes.flat:
    ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot('viz_distributions.pdf', 'Various visualization types for showing data distributions.')
\end{pycode}

\subsection{Relationship Visualizations}
\begin{pycode}
# Generate correlated data
n = 200
x = np.random.uniform(0, 100, n)
y_linear = 2 * x + 30 + np.random.normal(0, 15, n)
y_nonlinear = 0.01 * x**2 + np.random.normal(0, 5, n)

# Categorical data
categories = np.random.choice(['A', 'B', 'C', 'D'], n)
values = np.random.exponential(20, n) * (1 + 0.5 * (categories == 'A'))

fig, axes = plt.subplots(2, 3, figsize=(12, 8))

# Scatter plot with regression
axes[0, 0].scatter(x, y_linear, alpha=0.5, s=30, c='steelblue')
z = np.polyfit(x, y_linear, 1)
p = np.poly1d(z)
axes[0, 0].plot(x, p(x), 'r-', linewidth=2)
axes[0, 0].set_title('Scatter with Regression')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')

# Hexbin for dense data
x_dense = np.random.normal(50, 15, 5000)
y_dense = np.random.normal(50, 15, 5000)
hb = axes[0, 1].hexbin(x_dense, y_dense, gridsize=20, cmap='YlOrRd')
axes[0, 1].set_title('Hexbin Density')
axes[0, 1].set_xlabel('X')
axes[0, 1].set_ylabel('Y')
plt.colorbar(hb, ax=axes[0, 1])

# Bubble chart
sizes = np.random.uniform(20, 200, n)
colors = np.random.uniform(0, 1, n)
scatter = axes[0, 2].scatter(x, y_nonlinear, s=sizes, c=colors,
                             alpha=0.6, cmap='viridis')
axes[0, 2].set_title('Bubble Chart')
axes[0, 2].set_xlabel('X')
axes[0, 2].set_ylabel('Y')

# Strip plot
for i, cat in enumerate(['A', 'B', 'C', 'D']):
    mask = categories == cat
    axes[1, 0].scatter(np.full(np.sum(mask), i) + np.random.normal(0, 0.1, np.sum(mask)),
                       values[mask], alpha=0.5, s=20)
axes[1, 0].set_xticks(range(4))
axes[1, 0].set_xticklabels(['A', 'B', 'C', 'D'])
axes[1, 0].set_title('Strip Plot')
axes[1, 0].set_xlabel('Category')
axes[1, 0].set_ylabel('Value')

# Heatmap correlation matrix
corr_data = np.random.randn(50, 5)
corr_data[:, 1] = corr_data[:, 0] * 0.8 + corr_data[:, 1] * 0.2
corr_data[:, 3] = -corr_data[:, 2] * 0.6 + corr_data[:, 3] * 0.4
corr_matrix = np.corrcoef(corr_data.T)
im = axes[1, 1].imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1, 1].set_xticks(range(5))
axes[1, 1].set_yticks(range(5))
axes[1, 1].set_xticklabels(['V1', 'V2', 'V3', 'V4', 'V5'])
axes[1, 1].set_yticklabels(['V1', 'V2', 'V3', 'V4', 'V5'])
axes[1, 1].set_title('Correlation Heatmap')
plt.colorbar(im, ax=axes[1, 1])

# Parallel coordinates (simplified)
n_lines = 50
n_dims = 5
parallel_data = np.random.randn(n_lines, n_dims)
parallel_data[n_lines//2:] += 2  # Two groups
for i in range(n_lines//2):
    axes[1, 2].plot(range(n_dims), parallel_data[i], 'b-', alpha=0.3)
for i in range(n_lines//2, n_lines):
    axes[1, 2].plot(range(n_dims), parallel_data[i], 'r-', alpha=0.3)
axes[1, 2].set_xticks(range(n_dims))
axes[1, 2].set_xticklabels(['D1', 'D2', 'D3', 'D4', 'D5'])
axes[1, 2].set_title('Parallel Coordinates')

for ax in axes.flat:
    ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot('viz_relationships.pdf', 'Visualization types for showing relationships between variables.')
\end{pycode}
\section{Color Palettes and Accessibility}
\subsection{Color Palette Types}
\begin{pycode}
fig, axes = plt.subplots(3, 2, figsize=(12, 10))

# Sequential palette
sequential_colors = plt.cm.Blues(np.linspace(0.2, 1, 7))
for i, color in enumerate(sequential_colors):
    axes[0, 0].add_patch(plt.Rectangle((i, 0), 1, 1, color=color))
axes[0, 0].set_xlim(0, 7)
axes[0, 0].set_ylim(0, 1)
axes[0, 0].set_title('Sequential (Blues)')
axes[0, 0].axis('off')

# Diverging palette
diverging_colors = plt.cm.RdBu(np.linspace(0, 1, 9))
for i, color in enumerate(diverging_colors):
    axes[0, 1].add_patch(plt.Rectangle((i, 0), 1, 1, color=color))
axes[0, 1].set_xlim(0, 9)
axes[0, 1].set_ylim(0, 1)
axes[0, 1].set_title('Diverging (RdBu)')
axes[0, 1].axis('off')

# Qualitative palette
qualitative_colors = plt.cm.Set2(np.linspace(0, 1, 8))
for i, color in enumerate(qualitative_colors):
    axes[1, 0].add_patch(plt.Rectangle((i, 0), 1, 1, color=color))
axes[1, 0].set_xlim(0, 8)
axes[1, 0].set_ylim(0, 1)
axes[1, 0].set_title('Qualitative (Set2)')
axes[1, 0].axis('off')

# Perceptually uniform
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 8))
for i, color in enumerate(viridis_colors):
    axes[1, 1].add_patch(plt.Rectangle((i, 0), 1, 1, color=color))
axes[1, 1].set_xlim(0, 8)
axes[1, 1].set_ylim(0, 1)
axes[1, 1].set_title('Perceptually Uniform (Viridis)')
axes[1, 1].axis('off')

# Colorblind-safe example
cb_safe = ['#0072B2', '#E69F00', '#009E73', '#CC79A7', '#F0E442', '#56B4E9']
for i, color in enumerate(cb_safe):
    axes[2, 0].add_patch(plt.Rectangle((i, 0), 1, 1, color=color))
axes[2, 0].set_xlim(0, 6)
axes[2, 0].set_ylim(0, 1)
axes[2, 0].set_title('Colorblind-Safe Palette')
axes[2, 0].axis('off')

# Show usage example
x_data = np.arange(6)
y_data = [23, 45, 56, 78, 32, 67]
bars = axes[2, 1].bar(x_data, y_data, color=cb_safe)
axes[2, 1].set_title('Using Colorblind-Safe Colors')
axes[2, 1].set_xlabel('Category')
axes[2, 1].set_ylabel('Value')
axes[2, 1].grid(True, alpha=0.3)

plt.tight_layout()
save_plot('viz_palettes.pdf', 'Different color palette types for various visualization needs.')
\end{pycode}
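
The colorblind-safe swatch above uses hex values drawn from the Okabe-Ito palette. A convenient follow-up is to register such a palette as matplotlib's default property cycle, so that every subsequent plot picks up accessible colors without per-call color arguments. A minimal sketch follows; it is typeset but not executed here, since running it would also restyle the remaining figures in this document:

\begin{pyverbatim}
from cycler import cycler   # ships as a matplotlib dependency
import matplotlib.pyplot as plt

# Same colorblind-safe hex values as cb_safe above (Okabe-Ito subset)
cb_safe = ['#0072B2', '#E69F00', '#009E73', '#CC79A7', '#F0E442', '#56B4E9']

# Make the palette the default color cycle for all new axes
plt.rcParams['axes.prop_cycle'] = cycler(color=cb_safe)
\end{pyverbatim}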
\subsection{Accessibility Guidelines}
Key principles for accessible visualizations:
\begin{itemize}
\item Use colorblind-safe palettes (avoid red-green combinations)
\item Ensure sufficient contrast (WCAG 2.1 guidelines)
\item Include redundant encoding (shape, pattern, labels)
\item Provide alt-text descriptions
\item Use appropriate font sizes ($\geq 12$ pt)
\end{itemize}

\begin{pycode}
# Demonstrate accessible vs inaccessible design
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Problematic design
x = ['A', 'B', 'C', 'D']
y1 = [30, 45, 20, 55]
y2 = [25, 50, 35, 40]
bad_colors = ['red', 'green']
axes[0].bar(np.arange(4) - 0.2, y1, 0.4, color=bad_colors[0], label='Series 1')
axes[0].bar(np.arange(4) + 0.2, y2, 0.4, color=bad_colors[1], label='Series 2')
axes[0].set_xticks(range(4))
axes[0].set_xticklabels(x)
axes[0].set_title('Poor Accessibility\n(Red-Green, No Patterns)')
axes[0].legend()
axes[0].set_ylabel('Value')

# Accessible design
good_colors = ['#0072B2', '#E69F00']
bars1 = axes[1].bar(np.arange(4) - 0.2, y1, 0.4, color=good_colors[0],
                    label='Series 1', hatch='///', edgecolor='black')
bars2 = axes[1].bar(np.arange(4) + 0.2, y2, 0.4, color=good_colors[1],
                    label='Series 2', hatch='...', edgecolor='black')
axes[1].set_xticks(range(4))
axes[1].set_xticklabels(x)
axes[1].set_title('Good Accessibility\n(Colorblind-Safe, Patterns)')
axes[1].legend()
axes[1].set_ylabel('Value')

# Add data labels
for i, (v1, v2) in enumerate(zip(y1, y2)):
    axes[1].text(i - 0.2, v1 + 1, str(v1), ha='center', fontsize=9)
    axes[1].text(i + 0.2, v2 + 1, str(v2), ha='center', fontsize=9)

for ax in axes:
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
save_plot('viz_accessibility.pdf', 'Comparison of poor vs good accessibility in chart design.')
\end{pycode}
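
The contrast guideline above can also be checked numerically. The sketch below is a small helper written for this document (not part of matplotlib) that implements the WCAG~2.x relative-luminance and contrast-ratio formulas; WCAG AA requires a ratio of at least 4.5:1 for normal-size text.

\begin{pycode}
def rel_luminance(hex_color):
    # WCAG relative luminance of an sRGB color given as '#RRGGBB'
    rgb = [int(hex_color.lstrip('#')[i:i+2], 16) / 255 for i in (0, 2, 4)]
    lin = [c / 12.92 if c <= 0.03928 else ((c + 0.055) / 1.055) ** 2.4 for c in rgb]
    return 0.2126 * lin[0] + 0.7152 * lin[1] + 0.0722 * lin[2]

def contrast_ratio(fg, bg):
    # WCAG contrast ratio between two hex colors (order does not matter)
    l1, l2 = sorted([rel_luminance(fg), rel_luminance(bg)], reverse=True)
    return (l1 + 0.05) / (l2 + 0.05)

cr_blue_white = contrast_ratio('#0072B2', '#FFFFFF')
\end{pycode}

For example, the blue \texttt{\#0072B2} used above on a white background gives a ratio of about \py{round(cr_blue_white, 1)}:1, which clears the 4.5:1 threshold.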
\section{Dashboard Composition}
\begin{pycode}
# Create a comprehensive dashboard layout
fig = plt.figure(figsize=(14, 10))

# Define grid
gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

# KPI cards (top row)
kpi_axes = [fig.add_subplot(gs[0, i]) for i in range(4)]
kpis = [('Revenue', '1.2M', '+12pct', 'green'),
        ('Users', '45.2K', '+8pct', 'green'),
        ('Conversion', '3.4pct', '-2pct', 'red'),
        ('Satisfaction', '4.5/5', '+0.3', 'green')]

for ax, (title, value, change, color) in zip(kpi_axes, kpis):
    ax.text(0.5, 0.7, value, fontsize=20, ha='center', va='center', fontweight='bold')
    ax.text(0.5, 0.35, title, fontsize=10, ha='center', va='center', color='gray')
    ax.text(0.5, 0.15, change, fontsize=12, ha='center', va='center', color=color)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    ax.add_patch(plt.Rectangle((0.05, 0.05), 0.9, 0.9, fill=False,
                               edgecolor='lightgray', linewidth=2))

# Time series (middle left)
ax_ts = fig.add_subplot(gs[1, :2])
t = np.arange(30)
revenue = 100 + 5*t + 10*np.sin(t/3) + np.random.normal(0, 5, 30)
ax_ts.plot(t, revenue, 'b-', linewidth=2)
ax_ts.fill_between(t, revenue*0.9, revenue*1.1, alpha=0.2)
ax_ts.set_title('Revenue Trend (30 Days)')
ax_ts.set_xlabel('Day')
ax_ts.set_ylabel('Revenue (K)')
ax_ts.grid(True, alpha=0.3)

# Bar chart (middle right)
ax_bar = fig.add_subplot(gs[1, 2:])
products = ['Product A', 'Product B', 'Product C', 'Product D', 'Product E']
sales = [45, 62, 38, 71, 55]
colors = plt.cm.Blues(np.linspace(0.4, 0.9, 5))
bars = ax_bar.barh(products, sales, color=colors)
ax_bar.set_title('Sales by Product')
ax_bar.set_xlabel('Units Sold')
for bar, val in zip(bars, sales):
    ax_bar.text(val + 1, bar.get_y() + bar.get_height()/2, str(val),
                va='center', fontsize=9)
ax_bar.grid(True, alpha=0.3, axis='x')

# Pie chart (bottom left)
ax_pie = fig.add_subplot(gs[2, 0])
segments = ['Direct', 'Organic', 'Referral', 'Social']
sizes = [35, 30, 20, 15]
colors = ['#0072B2', '#E69F00', '#009E73', '#CC79A7']
# Escape the percent sign so the wedge labels survive text.usetex
ax_pie.pie(sizes, labels=segments, autopct=r'%1.0f\%%', colors=colors, startangle=90)
ax_pie.set_title('Traffic Sources')

# Scatter plot (bottom middle-left)
ax_scatter = fig.add_subplot(gs[2, 1])
spend = np.random.uniform(100, 1000, 50)
conversions = 0.05 * spend + np.random.normal(0, 10, 50)
ax_scatter.scatter(spend, conversions, alpha=0.6, c='steelblue', s=40)
ax_scatter.set_title('Spend vs Conversions')
ax_scatter.set_xlabel('Ad Spend')
ax_scatter.set_ylabel('Conversions')
ax_scatter.grid(True, alpha=0.3)

# Heatmap (bottom right)
ax_heat = fig.add_subplot(gs[2, 2:])
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
hours = ['6am', '9am', '12pm', '3pm', '6pm', '9pm']
activity = np.random.rand(6, 7) * 100
im = ax_heat.imshow(activity, cmap='YlOrRd', aspect='auto')
ax_heat.set_xticks(range(7))
ax_heat.set_yticks(range(6))
ax_heat.set_xticklabels(days)
ax_heat.set_yticklabels(hours)
ax_heat.set_title('Activity Heatmap')
plt.colorbar(im, ax=ax_heat, shrink=0.8)

plt.suptitle('Analytics Dashboard', fontsize=14, fontweight='bold', y=1.02)
save_plot('viz_dashboard.pdf', 'Example dashboard layout with multiple visualization components.')
\end{pycode}

\section{Chart Selection Guide}
\begin{table}[H]
\centering
\caption{Choosing the Right Visualization}
\begin{tabular}{lll}
\toprule
Purpose & Data Type & Recommended Charts \\
\midrule
Distribution & Continuous & Histogram, KDE, Box plot, Violin \\
Comparison & Categorical & Bar chart, Dot plot, Lollipop \\
Trend & Time series & Line chart, Area chart \\
Relationship & Two continuous & Scatter, Hexbin, Contour \\
Composition & Parts of whole & Pie, Stacked bar, Treemap \\
Correlation & Multiple variables & Heatmap, Parallel coordinates \\
\bottomrule
\end{tabular}
\end{table}
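
For scripted workflows, the same guidance can be mirrored as a small lookup helper. The sketch below simply restates the rows of the table above; the dictionary and function names are illustrative rather than any standard API:

\begin{pycode}
# Illustrative lookup mirroring the chart-selection table above
CHART_GUIDE = {
    'distribution': ['histogram', 'KDE', 'box plot', 'violin'],
    'comparison': ['bar chart', 'dot plot', 'lollipop'],
    'trend': ['line chart', 'area chart'],
    'relationship': ['scatter', 'hexbin', 'contour'],
    'composition': ['pie', 'stacked bar', 'treemap'],
    'correlation': ['heatmap', 'parallel coordinates'],
}

def recommend_charts(purpose):
    # Return recommended chart types for a purpose, or an empty list if unknown
    return CHART_GUIDE.get(purpose.lower(), [])
\end{pycode}

For instance, \py{', '.join(recommend_charts('trend'))} would be suggested for a time-series trend.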
\section{Advanced Techniques}
\begin{pycode}
# Small multiples and faceting
fig, axes = plt.subplots(2, 4, figsize=(14, 7))

# Generate data for 8 groups
np.random.seed(42)
for i, ax in enumerate(axes.flat):
    x = np.linspace(0, 10, 50)
    y = np.sin(x + i*0.5) * (1 + i*0.1) + np.random.normal(0, 0.2, 50)
    ax.plot(x, y, 'b-', linewidth=1.5)
    ax.fill_between(x, y - 0.3, y + 0.3, alpha=0.3)
    ax.set_title(f'Group {i+1}', fontsize=10)
    ax.set_xlim(0, 10)
    ax.set_ylim(-2.5, 2.5)
    ax.grid(True, alpha=0.3)
    if i >= 4:
        ax.set_xlabel('Time')
    if i % 4 == 0:
        ax.set_ylabel('Value')

plt.suptitle('Small Multiples: Comparing Patterns Across Groups', fontsize=12, y=1.02)
plt.tight_layout()
save_plot('viz_small_multiples.pdf', 'Small multiples technique for comparing patterns across groups.')
\end{pycode}

\section{Perceptual Principles}
\begin{pycode}
# Demonstrate perceptual principles
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Pre-attentive attributes: Color
ax = axes[0, 0]
np.random.seed(42)
x = np.random.uniform(0, 10, 50)
y = np.random.uniform(0, 10, 50)
colors = ['lightgray'] * 50
colors[23] = 'red'  # Target
ax.scatter(x, y, c=colors, s=100)
ax.set_title('Pre-attentive: Color Pop-out')
ax.axis('off')

# Pre-attentive attributes: Shape
ax = axes[0, 1]
x = np.random.uniform(0, 10, 50)
y = np.random.uniform(0, 10, 50)
for i in range(50):
    if i == 23:
        ax.plot(x[i], y[i], 's', markersize=10, color='steelblue')
    else:
        ax.plot(x[i], y[i], 'o', markersize=10, color='steelblue')
ax.set_title('Pre-attentive: Shape Pop-out')
ax.axis('off')

# Cleveland's hierarchy: Position vs Angle
ax = axes[1, 0]
values = [30, 25, 20, 15, 10]
ax.barh(range(5), values, color='steelblue')
ax.set_yticks(range(5))
ax.set_yticklabels(['A', 'B', 'C', 'D', 'E'])
ax.set_title('Position Encoding (Better)')
ax.set_xlabel('Value')

ax = axes[1, 1]
# Escape the percent sign so the wedge labels survive text.usetex
ax.pie(values, labels=['A', 'B', 'C', 'D', 'E'], autopct=r'%1.0f\%%')
ax.set_title('Angle Encoding (Worse)')

plt.tight_layout()
save_plot('viz_perception.pdf', 'Perceptual principles in data visualization.')
\end{pycode}

Cleveland's hierarchy ranks visual encodings by the accuracy with which readers decode them:
\begin{enumerate}
\item Position along a common scale
\item Position along non-aligned scales
\item Length, direction, angle
\item Area
\item Volume, curvature
\item Shading, color saturation
\end{enumerate}

\section{Summary Statistics}
\begin{pycode}
# Summary of visualization examples
n_plots = 7           # Total figure files generated
n_chart_types = 25    # Different chart types demonstrated
n_color_palettes = 6  # Color palettes shown
\end{pycode}

\begin{table}[H]
\centering
\caption{Visualization Guide Summary}
\begin{tabular}{lr}
\toprule
Metric & Count \\
\midrule
Total Figures & \py{n_plots} \\
Chart Types Demonstrated & \py{n_chart_types} \\
Color Palettes Shown & \py{n_color_palettes} \\
Accessibility Examples & 2 \\
Dashboard Components & 6 \\
\bottomrule
\end{tabular}
\end{table}

\section{Conclusion}
Effective data visualization requires understanding:
\begin{itemize}
\item Appropriate chart selection based on data type and purpose
\item Color palette design for both aesthetics and accessibility
\item Perceptual principles that affect interpretation accuracy
\item Dashboard composition for multi-faceted data communication
\item Small multiples and faceting for comparative analysis
\end{itemize}

The techniques demonstrated provide a foundation for creating clear, accessible, and informative visualizations that effectively communicate quantitative insights.

\end{document}