Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
UBC-DSCI
GitHub Repository: UBC-DSCI/dsci-100-assets
Path: blob/master/2020-spring/materials/worksheet_01/notebook.tex
2051 views
1
2
% Default to the notebook output style
3
4
5
6
7
% Inherit from the specified cell style.
8
9
10
11
12
13
\documentclass[11pt]{article}
14
15
16
17
\usepackage[T1]{fontenc}
18
% Nicer default font (+ math font) than Computer Modern for most use cases
19
\usepackage{mathpazo}
20
21
% Basic figure setup, for now with no caption control since it's done
22
% automatically by Pandoc (which extracts ![](path) syntax from Markdown).
23
\usepackage{graphicx}
24
% We will generate all images so they have a width \maxwidth. This means
25
% that they will get their normal width if they fit onto the page, but
26
% are scaled down if they would overflow the margins.
27
\makeatletter
28
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth
29
\else\Gin@nat@width\fi}
30
\makeatother
31
\let\Oldincludegraphics\includegraphics
32
% Set max figure width to be 80% of text width, for now hardcoded.
33
% Wrap \includegraphics so every image defaults to 80% of \maxwidth.
% The wrapper takes the usual optional key-value list (#1) and forwards it
% after the default width, so \includegraphics[width=...]{file} still works
% and caller-supplied keys override the default. #2 is the image file name.
\renewcommand{\includegraphics}[2][]{\Oldincludegraphics[width=.8\maxwidth,#1]{#2}}
34
% Ensure that by default, figures have no caption (until we provide a
35
% proper Figure object with a Caption API and a way to capture that
36
% in the conversion process - todo).
37
\usepackage{caption}
38
\DeclareCaptionLabelFormat{nolabel}{}
39
\captionsetup{labelformat=nolabel}
40
41
\usepackage{adjustbox} % Used to constrain images to a maximum size
42
\usepackage{xcolor} % Allow colors to be defined
43
\usepackage{enumerate} % Needed for markdown enumerations to work
44
\usepackage{geometry} % Used to adjust the document margins
45
\usepackage{amsmath} % Equations
46
\usepackage{amssymb} % Equations
47
\usepackage{textcomp} % defines textquotesingle
48
% Hack from http://tex.stackexchange.com/a/47451/13684:
49
\AtBeginDocument{%
50
\def\PYZsq{\textquotesingle}% Upright quotes in Pygmentized code
51
}
52
\usepackage{upquote} % Upright quotes for verbatim code
53
\usepackage{eurosym} % defines \euro
54
\usepackage[mathletters]{ucs} % Extended unicode (utf-8) support
55
\usepackage[utf8x]{inputenc} % Allow utf-8 characters in the tex document
56
\usepackage{fancyvrb} % verbatim replacement that allows latex
57
\usepackage{grffile} % extends the file name processing of package graphics
58
% to support a larger range
59
% The hyperref package gives us a pdf with properly built
60
% internal navigation ('pdf bookmarks' for the table of contents,
61
% internal cross-reference links, web links for URLs, etc.)
62
\usepackage{hyperref}
63
\usepackage{longtable} % longtable support required by pandoc >1.10
64
\usepackage{booktabs} % table support for pandoc > 1.12.2
65
\usepackage[inline]{enumitem} % IRkernel/repr support (it uses the enumerate* environment)
66
\usepackage[normalem]{ulem} % ulem is needed to support strikethroughs (\sout)
67
% normalem makes italics be italics, not underlines
68
69
70
71
72
% Colors for the hyperref package
73
\definecolor{urlcolor}{rgb}{0,.145,.698}
74
\definecolor{linkcolor}{rgb}{.71,0.21,0.01}
75
\definecolor{citecolor}{rgb}{.12,.54,.11}
76
77
% ANSI colors
78
\definecolor{ansi-black}{HTML}{3E424D}
79
\definecolor{ansi-black-intense}{HTML}{282C36}
80
\definecolor{ansi-red}{HTML}{E75C58}
81
\definecolor{ansi-red-intense}{HTML}{B22B31}
82
\definecolor{ansi-green}{HTML}{00A250}
83
\definecolor{ansi-green-intense}{HTML}{007427}
84
\definecolor{ansi-yellow}{HTML}{DDB62B}
85
\definecolor{ansi-yellow-intense}{HTML}{B27D12}
86
\definecolor{ansi-blue}{HTML}{208FFB}
87
\definecolor{ansi-blue-intense}{HTML}{0065CA}
88
\definecolor{ansi-magenta}{HTML}{D160C4}
89
\definecolor{ansi-magenta-intense}{HTML}{A03196}
90
\definecolor{ansi-cyan}{HTML}{60C6C8}
91
\definecolor{ansi-cyan-intense}{HTML}{258F8F}
92
\definecolor{ansi-white}{HTML}{C5C1B4}
93
\definecolor{ansi-white-intense}{HTML}{A1A6B2}
94
95
% commands and environments needed by pandoc snippets
96
% extracted from the output of `pandoc -s`
97
\providecommand{\tightlist}{%
98
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
99
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
100
% Add ',fontsize=\small' for more characters per line
101
\newenvironment{Shaded}{}{}
102
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
103
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}}
104
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
105
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
106
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
107
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
108
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
109
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}}
110
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}}
111
\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
112
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}}
113
\newcommand{\RegionMarkerTok}[1]{{#1}}
114
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
115
\newcommand{\NormalTok}[1]{{#1}}
116
117
% Additional commands for more recent versions of Pandoc
118
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.53,0.00,0.00}{{#1}}}
119
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
120
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
121
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.73,0.40,0.53}{{#1}}}
122
\newcommand{\ImportTok}[1]{{#1}}
123
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.73,0.13,0.13}{\textit{{#1}}}}
124
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
125
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
126
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.10,0.09,0.49}{{#1}}}
127
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
128
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.40,0.40,0.40}{{#1}}}
129
\newcommand{\BuiltInTok}[1]{{#1}}
130
\newcommand{\ExtensionTok}[1]{{#1}}
131
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.74,0.48,0.00}{{#1}}}
132
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.49,0.56,0.16}{{#1}}}
133
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
134
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
135
136
137
% Define a nice break command that doesn't care if a line doesn't already
138
% exist.
139
% \hspace*{\fill} puts (stretchable, non-discardable) material on the line
% first, so the following \\* always has a line to end, even when \br is
% used at the very start of a paragraph. The starred form \\* also forbids a
% page break after the line.
\def\br{\hspace*{\fill} \\* }
140
% MathJax compatibility definitions
141
\def\gt{>}
142
\def\lt{<}
143
% Document parameters
144
\title{worksheet\_01}
145
146
147
148
149
% Pygments definitions
150
151
\makeatletter
152
\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax%
153
\let\PY@ul=\relax \let\PY@tc=\relax%
154
\let\PY@bc=\relax \let\PY@ff=\relax}
155
% \PY@tok{<name>}: execute the style hook stored in the control sequence
% \PY@tok@<name>, which is built on the fly with \csname ... \endcsname.
% (\csname makes an undefined name equal to \relax, so an unknown <name>
% applies no styling.)
\def\PY@tok#1{\csname PY@tok@#1\endcsname}
156
\def\PY@toks#1+{\ifx\relax#1\empty\else%
157
\PY@tok{#1}\expandafter\PY@toks\fi}
158
\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{%
159
\PY@it{\PY@bf{\PY@ff{#1}}}}}}}
160
% \PY{<classes>}{<text>}: typeset <text> using the styles of the `+`-separated
% token classes listed in <classes>.
%   1. \PY@reset clears every style hook (\PY@it, \PY@bf, \PY@ul, \PY@tc,
%      \PY@bc, \PY@ff).
%   2. \PY@toks consumes the classes one `+`-delimited item at a time and runs
%      \PY@tok on each; the appended `+\relax+` is the end-of-list sentinel it
%      tests for.
%   3. \PY@do wraps <text> in the hooks that are now set.
\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}}
161
162
\expandafter\def\csname PY@tok@w\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}}
163
\expandafter\def\csname PY@tok@c\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
164
\expandafter\def\csname PY@tok@cp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.74,0.48,0.00}{##1}}}
165
\expandafter\def\csname PY@tok@k\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
166
\expandafter\def\csname PY@tok@kp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
167
\expandafter\def\csname PY@tok@kt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}}
168
\expandafter\def\csname PY@tok@o\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
169
\expandafter\def\csname PY@tok@ow\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
170
\expandafter\def\csname PY@tok@nb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
171
\expandafter\def\csname PY@tok@nf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
172
\expandafter\def\csname PY@tok@nc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
173
\expandafter\def\csname PY@tok@nn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
174
\expandafter\def\csname PY@tok@ne\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.82,0.25,0.23}{##1}}}
175
\expandafter\def\csname PY@tok@nv\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
176
\expandafter\def\csname PY@tok@no\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}}
177
\expandafter\def\csname PY@tok@nl\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.63,0.00}{##1}}}
178
\expandafter\def\csname PY@tok@ni\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.60,0.60,0.60}{##1}}}
179
\expandafter\def\csname PY@tok@na\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.49,0.56,0.16}{##1}}}
180
\expandafter\def\csname PY@tok@nt\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
181
\expandafter\def\csname PY@tok@nd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
182
\expandafter\def\csname PY@tok@s\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
183
\expandafter\def\csname PY@tok@sd\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
184
\expandafter\def\csname PY@tok@si\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
185
\expandafter\def\csname PY@tok@se\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.13}{##1}}}
186
\expandafter\def\csname PY@tok@sr\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.40,0.53}{##1}}}
187
\expandafter\def\csname PY@tok@ss\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
188
\expandafter\def\csname PY@tok@sx\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
189
\expandafter\def\csname PY@tok@m\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
190
\expandafter\def\csname PY@tok@gh\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
191
\expandafter\def\csname PY@tok@gu\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}}
192
\expandafter\def\csname PY@tok@gd\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}}
193
\expandafter\def\csname PY@tok@gi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.63,0.00}{##1}}}
194
\expandafter\def\csname PY@tok@gr\endcsname{\def\PY@tc##1{\textcolor[rgb]{1.00,0.00,0.00}{##1}}}
195
\expandafter\def\csname PY@tok@ge\endcsname{\let\PY@it=\textit}
196
\expandafter\def\csname PY@tok@gs\endcsname{\let\PY@bf=\textbf}
197
\expandafter\def\csname PY@tok@gp\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
198
\expandafter\def\csname PY@tok@go\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
199
\expandafter\def\csname PY@tok@gt\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.27,0.87}{##1}}}
200
\expandafter\def\csname PY@tok@err\endcsname{\def\PY@bc##1{\setlength{\fboxsep}{0pt}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}
201
\expandafter\def\csname PY@tok@kc\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
202
\expandafter\def\csname PY@tok@kd\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
203
\expandafter\def\csname PY@tok@kn\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
204
\expandafter\def\csname PY@tok@kr\endcsname{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
205
\expandafter\def\csname PY@tok@bp\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
206
\expandafter\def\csname PY@tok@fm\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
207
\expandafter\def\csname PY@tok@vc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
208
\expandafter\def\csname PY@tok@vg\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
209
\expandafter\def\csname PY@tok@vi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
210
\expandafter\def\csname PY@tok@vm\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
211
\expandafter\def\csname PY@tok@sa\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
212
\expandafter\def\csname PY@tok@sb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
213
\expandafter\def\csname PY@tok@sc\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
214
\expandafter\def\csname PY@tok@dl\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
215
\expandafter\def\csname PY@tok@s2\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
216
\expandafter\def\csname PY@tok@sh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
217
\expandafter\def\csname PY@tok@s1\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
218
\expandafter\def\csname PY@tok@mb\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
219
\expandafter\def\csname PY@tok@mf\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
220
\expandafter\def\csname PY@tok@mh\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
221
\expandafter\def\csname PY@tok@mi\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
222
\expandafter\def\csname PY@tok@il\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
223
\expandafter\def\csname PY@tok@mo\endcsname{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
224
\expandafter\def\csname PY@tok@ch\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
225
\expandafter\def\csname PY@tok@cm\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
226
\expandafter\def\csname PY@tok@cpf\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
227
\expandafter\def\csname PY@tok@c1\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
228
\expandafter\def\csname PY@tok@cs\endcsname{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.25,0.50,0.50}{##1}}}
229
230
\def\PYZbs{\char`\\}
231
\def\PYZus{\char`\_}
232
\def\PYZob{\char`\{}
233
\def\PYZcb{\char`\}}
234
\def\PYZca{\char`\^}
235
\def\PYZam{\char`\&}
236
\def\PYZlt{\char`\<}
237
\def\PYZgt{\char`\>}
238
\def\PYZsh{\char`\#}
239
\def\PYZpc{\char`\%}
240
\def\PYZdl{\char`\$}
241
\def\PYZhy{\char`\-}
242
\def\PYZsq{\char`\'}
243
\def\PYZdq{\char`\"}
244
\def\PYZti{\char`\~}
245
% for compatibility with earlier versions
246
\def\PYZat{@}
247
\def\PYZlb{[}
248
\def\PYZrb{]}
249
\makeatother
250
251
252
% Exact colors from NB
253
\definecolor{incolor}{rgb}{0.0, 0.0, 0.5}
254
\definecolor{outcolor}{rgb}{0.545, 0.0, 0.0}
255
256
257
258
259
% Prevent overflowing lines due to hard-to-break entities
260
\sloppy
261
% Setup hyperref package
262
\hypersetup{
263
breaklinks=true, % so long urls are correctly broken across lines
264
colorlinks=true,
265
urlcolor=urlcolor,
266
linkcolor=linkcolor,
267
citecolor=citecolor,
268
}
269
% Use 1in margins on every side (narrower than the LaTeX article defaults)
270
271
\geometry{verbose,tmargin=1in,bmargin=1in,lmargin=1in,rmargin=1in}
272
273
274
275
\begin{document}
276
277
278
\maketitle
279
280
281
282
283
\section{Worksheet 1: Introduction to Data
284
Science}\label{worksheet-1-introduction-to-data-science}
285
286
Welcome to DSCI 100: Introduction to Data Science!
287
288
Each week you will complete a lecture assignment like this one. Before
289
we get started, there are some administrative details.
290
291
You can't learn technical subjects without hands-on practice. The weekly
292
lecture worksheets and tutorials are an important part of the course.
293
The lecture worksheets will automatically be collected at the start of
294
the weekly tutorial. Conversely, the tutorial assignments will
295
automatically be collected at the start of the weekly lecture. This is
296
set up so that you are only working on one thing at a time. Attendance
297
in lectures and tutorials is required. There will be participatory
298
activities in both the lecture and tutorial to help support your
299
learning.
300
301
Collaborating on lecture worksheets and tutorial assignments is more
302
than okay -\/- it's encouraged! You should rarely be stuck for more than
303
a few minutes on questions in lecture or tutorial, so ask a neighbor, TA
304
or an instructor for help (explaining things is beneficial, too -\/- the
305
best way to solidify your knowledge of a subject is to explain it).
306
Please don't just share answers, though. Everyone must submit a copy of
307
their own work.
308
309
You can read more about
310
\href{https://github.com/UBC-DSCI/dsci-100/blob/master/policies.md}{course
311
policies} on the \href{https://github.com/UBC-DSCI/dsci-100}{course
312
website}.
313
314
\subsubsection{Lecture and Tutorial Learning
315
Goals:}\label{lecture-and-tutorial-learning-goals}
316
317
After completing this week's lecture and tutorial work, you will be able
318
to:
319
320
\begin{itemize}
321
\tightlist
322
\item
323
use a Jupyter notebook to execute provided R code
324
\item
325
edit code and markdown cells in a Jupyter notebook
326
\item
327
create new code and markdown cells in a Jupyter notebook
328
\item
329
load the \texttt{tidyverse} library into R
330
\item
331
create new variables and objects in R using the assignment symbol
332
\item
333
use the help and documentation tools in R
334
\item
335
match the names of the following functions from the \texttt{tidyverse}
336
library to their documentation descriptions:
337
338
\begin{itemize}
339
\tightlist
340
\item
341
\texttt{read\_csv}
342
\item
343
\texttt{select}
344
\item
345
\texttt{mutate}
346
\item
347
\texttt{filter}
348
\item
349
\texttt{ggplot}
350
\item
351
\texttt{aes}
352
\end{itemize}
353
\item
354
chain together two functions using the pipe operator,
355
\texttt{\%\textgreater{}\%}
356
\end{itemize}
357
358
In this first worksheet you will also learn how to test the answers you
359
write in this worksheet to assess if you answered questions correctly
360
before your assignment is collected.
361
362
This worksheet covers parts of
363
\href{https://ubc-dsci.github.io/introduction-to-data-science/chapter2.html}{Chapter
364
1} of the online textbook. You should read this chapter before
365
attempting this worksheet.
366
367
\section{1. Jupyter notebooks}\label{jupyter-notebooks}
368
369
This webpage is called a Jupyter notebook. A notebook is a place to
370
write programs and view their results.
371
372
\subsection{1.1. Text cells}\label{text-cells}
373
374
In a notebook, each rectangle containing text or code is called a
375
\emph{cell}.
376
377
Text cells (like this one) can be edited by double-clicking on them.
378
They're written in a simple format called
379
\href{http://daringfireball.net/projects/markdown/syntax}{Markdown} to
380
add formatting and section headings. You don't need to learn Markdown,
381
but you might want to.
382
383
After you edit a text cell, click the "run cell" button at the top that
384
looks like \textbar{} to confirm any changes. (Try not to delete the
385
instructions of the lab.)
386
387
\textbf{Question 1.1.1.} This paragraph is in its own text cell. Try
388
editing it so that this sentence is the last sentence in the paragraph,
389
and then click the "run cell" \textbar{} button. This sentence, for
390
example, should be deleted. So should this one.
391
392
\subsection{1.2. Code cells}\label{code-cells}
393
394
Other cells contain code in the R language. Running a code cell will
395
execute all of the code it contains.
396
397
To run the code in a cell, first click on that cell to activate it.
398
It'll be highlighted with a little green or blue rectangle. Next, either
399
press Run \textbar{} or hold down the \texttt{shift} key and press
400
\texttt{return} or \texttt{enter}.
401
402
Try running the next cell:
403
404
\begin{Verbatim}[commandchars=\\\{\}]
405
{\color{incolor}In [{\color{incolor}1}]:} \PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Hello, World!\PYZdq{}}\PY{p}{)}
406
\end{Verbatim}
407
408
409
\begin{Verbatim}[commandchars=\\\{\}]
410
[1] "Hello, World!"
411
412
\end{Verbatim}
413
414
The above code cell contains a single line of code, but cells can also
415
contain multiple lines of code. When you run a cell, the lines of code
416
are executed in the order in which they appear. Every \texttt{print}
417
expression prints a line. Run the next cell and notice the order of the
418
output.
419
420
\begin{Verbatim}[commandchars=\\\{\}]
421
{\color{incolor}In [{\color{incolor}2}]:} \PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{First this line is printed,\PYZdq{}}\PY{p}{)}
422
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{and then this one.\PYZdq{}}\PY{p}{)}
423
\end{Verbatim}
424
425
426
\begin{Verbatim}[commandchars=\\\{\}]
427
[1] "First this line is printed,"
428
[1] "and then this one."
429
430
\end{Verbatim}
431
432
\textbf{Question 1.2.1.} Change the cell above so that it prints out:
433
434
\begin{verbatim}
435
First this line is printed,
436
and then the next line,
437
and then this one.
438
\end{verbatim}
439
440
\emph{Hint:} If you're stuck for more than a few minutes, try talking to
441
a neighbor or a TA. That's a good idea for any worksheet or tutorial
442
problem.
443
444
\subsection{1.3. Writing Jupyter
445
notebooks}\label{writing-jupyter-notebooks}
446
447
You can use Jupyter notebooks for your own projects or documents. When
448
you make your own notebook, you'll need to create your own cells for
449
text and code.
450
451
To add a cell, click the + button in the menu bar. It'll start out as a
452
code cell. You can change it to a text cell by clicking inside it so
453
it's highlighted, clicking the drop-down box next to the restart ()
454
button in the menu bar, and choosing "Markdown".
455
456
\textbf{Question 1.3.1.} Add a code cell below this one. Write code in
457
it that prints out:
458
459
\begin{verbatim}
460
A whole new code cell!
461
\end{verbatim}
462
463
Run your cell to verify that it works.
464
465
\textbf{Question 1.3.2.} Add a text/Markdown cell below this one. Write
466
the text "A whole new Markdown cell" in it.
467
468
\subsection{1.4. Errors}\label{errors}
469
470
R is a language, and like natural human languages, it has rules. It
471
differs from natural language in two important ways: 1. The rules are
472
\emph{simple}. You can learn most of them in a few weeks and gain
473
reasonable proficiency with the language in a semester. 2. The rules are
474
\emph{rigid}. If you're proficient in a natural language, you can
475
understand a non-proficient speaker, glossing over small mistakes. A
476
computer running R code is not smart enough to do that.
477
478
Whenever you write code, you'll make mistakes (everyone who writes code
479
does, even your course instructor!). When you run a code cell that has
480
errors, R will sometimes produce error messages to tell you what you did
481
wrong.
482
483
Errors are okay; even experienced programmers make many errors. When you
484
make an error, you just have to find the source of the problem, fix it,
485
and move on.
486
487
We have made an error in the next cell. Run it and see what happens.
488
489
\begin{Verbatim}[commandchars=\\\{\}]
490
{\color{incolor}In [{\color{incolor}3}]:} \PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{This line is missing something.\PYZdq{}}
491
\end{Verbatim}
492
493
494
\begin{Verbatim}[commandchars=\\\{\}]
495
496
Error in parse(text = x, srcfile = src): <text>:2:0: unexpected end of input
497
1: print("This line is missing something."
498
\^{}
499
Traceback:
500
501
502
\end{Verbatim}
503
504
\begin{figure}
505
\centering
506
\includegraphics{images/ws1_error_image.png}
507
\caption{ws1\_error\_image.png}
508
\end{figure}
509
510
There's a lot of terminology in programming languages, but you don't
511
need to know it all in order to program effectively. If you see a
512
cryptic message like this, you can often get by without deciphering it.
513
(Of course, if you're frustrated, ask a neighbor or a TA for help.)
514
515
Try to fix the code above so that you can run the cell and see the
516
intended message instead of an error.
517
518
\subsection{1.5. The Kernel}\label{the-kernel}
519
520
The kernel is a program that executes the code inside your notebook and
521
outputs the results. In the top right of your window, you can see a
522
circle that indicates the status of your kernel. If the circle is empty
523
($\circ$), the kernel is idle and ready to execute code. If the circle is
524
filled in ($\bullet$), the kernel is busy running some code.
525
526
You may run into problems where your kernel is stuck for an excessive
527
amount of time, your notebook is very slow and unresponsive, or your
528
kernel loses its connection. If this happens, try the following steps:
529
1. At the top of your screen, click \textbf{Kernel}, then
530
\textbf{Interrupt}. 2. If that doesn't help, click \textbf{Kernel}, then
531
\textbf{Restart}. If you do this, you will have to run your code cells
532
from the start of your notebook up until where you paused your work. 3.
533
If that doesn't help, restart your server. First, save your work by
534
clicking \textbf{File} at the top left of your screen, then \textbf{Save
535
and Checkpoint}. Next, click \textbf{Control Panel} at the top right.
536
Choose \textbf{Stop My Server} to shut it down, then \textbf{My Server}
537
to start it back up. Then, navigate back to the notebook you were
538
working on.
539
540
\subsection{1.6. Submitting your work}\label{submitting-your-work}
541
542
All lecture worksheets and tutorial assignments in the course will be
543
distributed as notebooks like this one. You will complete your work in
544
this notebook and at the due date we will copy this notebook and grade
545
that copy. For lecture worksheets we will use a system called nbgrader
546
that checks your work. For tutorial assignments we will use a
547
combination of nbgrader and manual grading of your work.
548
549
\section{2. Numbers}\label{numbers}
550
551
Quantitative information arises everywhere in data science. In addition
552
to representing commands to print out lines, our R code can represent
553
numbers and methods of combining numbers. The expression \texttt{3.2500}
554
evaluates to the number 3.25. (Run the cell and see.)
555
556
\begin{Verbatim}[commandchars=\\\{\}]
557
{\color{incolor}In [{\color{incolor}4}]:} \PY{l+m}{3.2500}
558
\end{Verbatim}
559
560
561
3.25
562
563
564
Notice that we didn't have to print. When you run a notebook cell,
565
Jupyter helpfully prints out that value for you.
566
567
\begin{Verbatim}[commandchars=\\\{\}]
568
{\color{incolor}In [{\color{incolor}5}]:} \PY{l+m}{2}
569
\PY{l+m}{3}
570
\PY{l+m}{4}
571
\end{Verbatim}
572
573
574
2
575
576
577
3
578
579
580
4
581
582
583
Above, you should see that the three numbers (2, 3, and 4) are printed
584
out. In R, simply inputting numbers and running the cell will generate
585
all the numbers that you listed. Even though we don't need to use print,
586
we will continue to do so in several places in these worksheets so that we
587
are very clear with our intentions.
588
589
\subsection{2.1. Arithmetic}\label{arithmetic}
590
591
The line in the next cell subtracts. Its value is what you'd expect. Run
592
it.
593
594
\begin{Verbatim}[commandchars=\\\{\}]
595
{\color{incolor}In [{\color{incolor}6}]:} \PY{l+m}{2.0} \PY{o}{\PYZhy{}} \PY{l+m}{1.5}
596
\end{Verbatim}
597
598
599
0.5
600
601
602
Same with the cell below. Run it.
603
604
\begin{Verbatim}[commandchars=\\\{\}]
605
{\color{incolor}In [{\color{incolor}7}]:} \PY{l+m}{2} \PY{o}{*} \PY{l+m}{2}
606
\end{Verbatim}
607
608
609
4
610
611
612
Many basic arithmetic operations are built in to R.
613
\href{https://www.statmethods.net/management/operators.html}{This
614
webpage} describes all the arithmetic operators used in the course. You
615
can refer back to this webpage as you need throughout the term.
616
617
\section{3. Names}\label{names}
618
619
In natural language, we have terminology that lets us quickly reference
620
very complicated concepts. We don't say, "That's a large mammal with
621
brown fur and sharp teeth!" Instead, we just say, "Bear!"
622
623
Similarly, an effective strategy for writing code is to define names for
624
data as we compute it, like a lawyer would define terms for complex
625
ideas at the start of a legal document to simplify the rest of the
626
writing.
627
628
In R, we do this with \emph{objects}. An object has a name on the left
629
side of an \texttt{\textless{}-} sign and an expression to be evaluated
630
on the right.
631
632
\begin{Verbatim}[commandchars=\\\{\}]
633
{\color{incolor}In [{\color{incolor}8}]:} answer \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{3} \PY{o}{*} \PY{l+m}{2} \PY{o}{+} \PY{l+m}{4}
634
\end{Verbatim}
635
636
637
When you run that cell, R first evaluates the first line. It computes
638
the value of the expression \texttt{3\ *\ 2\ +\ 4}, which is the number
639
10. Then it gives that value the name \texttt{answer}. At that point,
640
the code in the cell is done running.
641
642
After you run that cell, the value 10 is bound to the name
643
\texttt{answer}:
644
645
\begin{Verbatim}[commandchars=\\\{\}]
646
{\color{incolor}In [{\color{incolor}9}]:} answer
647
\end{Verbatim}
648
649
650
10
651
652
653
We can name our objects anything we'd like. Above we called it
654
\texttt{answer}, but we could have named it \texttt{value},
655
\texttt{data} or anything else we desired. A good rule of thumb is to
656
name it something that has meaning to a human as it relates to what we
657
are trying to accomplish with our R code.
658
659
\textbf{Question 3.1.} Enter a new code cell. Try creating another
660
object using \texttt{\textless{}-\ 3\ *\ 2\ +\ 4} with a name different
661
from \texttt{answer}.
662
663
A common pattern in Jupyter notebooks is to assign a value to a name and
664
then immediately evaluate the name in the last line in the cell so that
665
the value is displayed as output.
666
667
\begin{Verbatim}[commandchars=\\\{\}]
668
{\color{incolor}In [{\color{incolor}10}]:} close\PYZus{}to\PYZus{}pi \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{355}\PY{o}{/}\PY{l+m}{113}
669
close\PYZus{}to\PYZus{}pi
670
\end{Verbatim}
671
672
673
3.14159292035398
674
675
676
Another common pattern is that a series of lines in a single cell will
677
build up a complex computation in stages, naming the intermediate
678
results.
679
680
\begin{Verbatim}[commandchars=\\\{\}]
681
{\color{incolor}In [{\color{incolor}11}]:} bimonthly\PYZus{}salary \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{840}
682
monthly\PYZus{}salary \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{2} \PY{o}{*} bimonthly\PYZus{}salary
683
number\PYZus{}of\PYZus{}months\PYZus{}in\PYZus{}a\PYZus{}year \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{12}
684
yearly\PYZus{}salary \PY{o}{\PYZlt{}\PYZhy{}} number\PYZus{}of\PYZus{}months\PYZus{}in\PYZus{}a\PYZus{}year \PY{o}{*} monthly\PYZus{}salary
685
\PY{k+kp}{print}\PY{p}{(}yearly\PYZus{}salary\PY{p}{)}
686
\end{Verbatim}
687
688
689
\begin{Verbatim}[commandchars=\\\{\}]
690
[1] 20160
691
692
\end{Verbatim}
693
694
Names in R can have letters (upper- and lower-case letters are both okay
695
and count as different letters), underscores, and numbers. The first
696
character can't be a number (otherwise a name might look like a number).
697
And names can't contain spaces, since spaces are used to separate pieces
698
of code from each other.
699
700
Other than those rules, what you name something doesn't matter \emph{to
701
R}. For example, the next cell does the same thing as the above cell,
702
except everything has a different name:
703
704
\begin{Verbatim}[commandchars=\\\{\}]
705
{\color{incolor}In [{\color{incolor}12}]:} a \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{840}
706
b \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{2} \PY{o}{*} a
707
\PY{k+kt}{c} \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{12}
708
d \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kt}{c} \PY{o}{*} b
709
\PY{k+kp}{print}\PY{p}{(}d\PY{p}{)}
710
\end{Verbatim}
711
712
713
\begin{Verbatim}[commandchars=\\\{\}]
714
[1] 20160
715
716
\end{Verbatim}
717
718
\textbf{However}, names are very important for making your code
719
\emph{readable} to yourself and others. The cell above is shorter, but
720
it's totally useless without an explanation of what it does.
721
722
There is also cultural style associated with different programming
723
languages. In the modern R style, object names should use only lowercase
724
letters, numbers, and \texttt{\_}. Underscores (\texttt{\_}) are
725
typically used to separate words within a name (\emph{e.g.},
726
\texttt{answer\_one}).
727
728
\subsection{3.1. Comments}\label{comments}
729
730
Below you see lines like this in code cells:
731
732
\begin{verbatim}
733
# Test cell; please do not change!
734
\end{verbatim}
735
736
That is called a \emph{comment}. It doesn't make anything happen in R; R
737
ignores anything on a line after a \#. Instead, it's there to
738
communicate something about the code to you, the human reader. Comments
739
are extremely useful and can help increase how readable our code is.
740
741
\textbf{Question 3.2.} Assign the name \texttt{seconds\_in\_an\_hour} to
742
the number of seconds in an hour. You should do this in two steps. In
743
the first you calculate the number of seconds in a minute and assign
744
that number the name \texttt{seconds\_in\_a\_minute}. Next you should
745
calculate the number of seconds in an hour and assign that number the
746
name \texttt{seconds\_in\_an\_hour.} \emph{hint - there are 60 seconds
747
in a minute and 60 minutes in an hour}
748
749
\begin{Verbatim}[commandchars=\\\{\}]
750
{\color{incolor}In [{\color{incolor}13}]:} \PY{c+c1}{\PYZsh{} Calculate the number of seconds in an hour.}
751
\PY{c+c1}{\PYZsh{} Assign your answer to seconds\PYZus{}in\PYZus{}an\PYZus{}hour}
752
753
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
754
seconds\PYZus{}in\PYZus{}a\PYZus{}minute \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{60}
755
seconds\PYZus{}in\PYZus{}an\PYZus{}hour \PY{o}{\PYZlt{}\PYZhy{}} seconds\PYZus{}in\PYZus{}a\PYZus{}minute \PY{o}{*} \PY{l+m}{60}
756
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
757
758
\PY{c+c1}{\PYZsh{} We\PYZsq{}ve put this line in this cell so that it will print}
759
\PY{c+c1}{\PYZsh{} the value you\PYZsq{}ve given to seconds\PYZus{}in\PYZus{}an\PYZus{}hour when you}
760
\PY{c+c1}{\PYZsh{} run it. You don\PYZsq{}t need to change this.}
761
\PY{k+kp}{print}\PY{p}{(}seconds\PYZus{}in\PYZus{}an\PYZus{}hour\PY{p}{)}
762
\end{Verbatim}
763
764
765
\begin{Verbatim}[commandchars=\\\{\}]
766
[1] 3600
767
768
\end{Verbatim}
769
770
\subsection{3.2. Checking your code}\label{checking-your-code}
771
772
Now that you know how to name things, you can start using the built-in
773
\emph{tests} to check whether your work is correct. To do this, you will
774
need to run the cell below to set things up. In future worksheets and
775
tutorial assignments you will see this cell at the very top of the
776
notebook:
777
778
\begin{Verbatim}[commandchars=\\\{\}]
779
{\color{incolor}In [{\color{incolor}14}]:} \PY{k+kn}{library}\PY{p}{(}testthat\PY{p}{)}
780
\PY{k+kn}{library}\PY{p}{(}digest\PY{p}{)}
781
\end{Verbatim}
782
783
784
Below is an example of a test cell for Question 3.2 above (assesses
785
whether you have assigned \texttt{seconds\_in\_an\_hour} correctly). If
786
you haven't, this test will tell you the correct answer. Try not to
787
change the contents of the test cells. Resist the urge to just copy it,
788
and instead try to adjust your expression. (Sometimes the tests will
789
give hints about what went wrong...)
790
791
\begin{Verbatim}[commandchars=\\\{\}]
792
{\color{incolor}In [{\color{incolor}15}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
793
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}seconds\PYZus{}in\PYZus{}a\PYZus{}minute\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{4bdb128c943f718f5b8f347bb4b7641b\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
794
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}seconds\PYZus{}in\PYZus{}an\PYZus{}hour\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{a69521e1dbffd4cd8f6ed869a4eba073\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
795
\PY{p}{\PYZcb{}}\PY{p}{)}
796
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
797
\end{Verbatim}
798
799
800
\begin{Verbatim}[commandchars=\\\{\}]
801
[1] "Success!"
802
803
\end{Verbatim}
804
805
For this first question we'll provide you with the solution:
806
807
\begin{Verbatim}[commandchars=\\\{\}]
808
{\color{incolor}In [{\color{incolor}16}]:} \PY{c+c1}{\PYZsh{} Calculate the number of seconds in an hour.}
809
810
\PY{c+c1}{\PYZsh{}SOLUTION:}
811
seconds\PYZus{}in\PYZus{}a\PYZus{}minute \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+m}{60}
812
seconds\PYZus{}in\PYZus{}an\PYZus{}hour \PY{o}{\PYZlt{}\PYZhy{}} seconds\PYZus{}in\PYZus{}a\PYZus{}minute \PY{o}{*} \PY{l+m}{60}
813
814
\PY{c+c1}{\PYZsh{} We\PYZsq{}ve put this line in this cell so that it will print}
815
\PY{c+c1}{\PYZsh{} the value you\PYZsq{}ve given to seconds\PYZus{}in\PYZus{}an\PYZus{}hour when you}
816
\PY{c+c1}{\PYZsh{} run it. You don\PYZsq{}t need to change this.}
817
\PY{k+kp}{print}\PY{p}{(}seconds\PYZus{}in\PYZus{}an\PYZus{}hour\PY{p}{)}
818
\end{Verbatim}
819
820
821
\begin{Verbatim}[commandchars=\\\{\}]
822
[1] 3600
823
824
\end{Verbatim}
825
826
\section{4. Calling functions}\label{calling-functions}
827
828
The most common way to combine or manipulate values in R is by calling
829
functions. R comes with many built-in functions that perform common
830
operations.
831
832
We used a function \texttt{print()} at the beginning of this notebook
833
when we printed text from a code cell. Here we'll demonstrate using
834
another function \texttt{toupper()} that converts text to uppercase:
835
836
\begin{Verbatim}[commandchars=\\\{\}]
837
{\color{incolor}In [{\color{incolor}17}]:} greeting \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kp}{toupper}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Why, hello there!\PYZdq{}}\PY{p}{)}
838
\PY{k+kp}{print}\PY{p}{(}greeting\PY{p}{)}
839
\end{Verbatim}
840
841
842
\begin{Verbatim}[commandchars=\\\{\}]
843
[1] "WHY, HELLO THERE!"
844
845
\end{Verbatim}
846
847
\textbf{Question 4.1.} Use the function \texttt{tolower} to change all
848
the words in the following movie title to lower case text: "The House
849
with a Clock in Its Walls" and assign the lower case text the name
850
\texttt{title}.
851
852
\begin{Verbatim}[commandchars=\\\{\}]
853
{\color{incolor}In [{\color{incolor}18}]:} \PY{c+c1}{\PYZsh{} Change movie title to lower case using tolower()}
854
\PY{c+c1}{\PYZsh{} Assign your answer to an object called: title }
855
856
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
857
title \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kp}{tolower}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{The House with a Clock in Its Walls\PYZdq{}}\PY{p}{)}
858
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
859
\PY{k+kp}{print}\PY{p}{(}title\PY{p}{)}
860
\end{Verbatim}
861
862
863
\begin{Verbatim}[commandchars=\\\{\}]
864
[1] "the house with a clock in its walls"
865
866
\end{Verbatim}
867
868
\begin{Verbatim}[commandchars=\\\{\}]
869
{\color{incolor}In [{\color{incolor}19}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
870
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}title\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{c76933115bc8095b2140c11556800725\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
871
\PY{p}{\PYZcb{}}\PY{p}{)}
872
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
873
\end{Verbatim}
874
875
876
\begin{Verbatim}[commandchars=\\\{\}]
877
[1] "Success!"
878
879
\end{Verbatim}
880
881
\subsection{4.1. Multiple arguments}\label{multiple-arguments}
882
883
Some functions take multiple arguments, separated by commas. For
884
example, the built-in \texttt{max} function returns the maximum argument
885
passed to it.
886
887
\begin{Verbatim}[commandchars=\\\{\}]
888
{\color{incolor}In [{\color{incolor}20}]:} biggest \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kp}{max}\PY{p}{(}\PY{l+m}{2}\PY{p}{,} \PY{l+m}{15}\PY{p}{,} \PY{l+m}{4}\PY{p}{,} \PY{l+m}{7}\PY{p}{)}
889
\PY{k+kp}{print}\PY{p}{(}biggest\PY{p}{)}
890
\end{Verbatim}
891
892
893
\begin{Verbatim}[commandchars=\\\{\}]
894
[1] 15
895
896
\end{Verbatim}
897
898
\textbf{Question 4.2.} Use the \texttt{min} function to find the minimum
899
value of the numbers in the cell above.
900
901
Assign the value to an object called \texttt{smallest}.
902
903
\begin{Verbatim}[commandchars=\\\{\}]
904
{\color{incolor}In [{\color{incolor}21}]:} \PY{c+c1}{\PYZsh{} Use min() to find the smallest value. }
905
\PY{c+c1}{\PYZsh{} Assign your answer to an object called: smallest}
906
907
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
908
smallest \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kp}{min}\PY{p}{(}\PY{l+m}{2}\PY{p}{,} \PY{l+m}{15}\PY{p}{,} \PY{l+m}{4}\PY{p}{,} \PY{l+m}{7}\PY{p}{)}
909
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
910
\PY{k+kp}{print}\PY{p}{(}smallest\PY{p}{)}
911
\end{Verbatim}
912
913
914
\begin{Verbatim}[commandchars=\\\{\}]
915
[1] 2
916
917
\end{Verbatim}
918
919
\begin{Verbatim}[commandchars=\\\{\}]
920
{\color{incolor}In [{\color{incolor}22}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
921
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}smallest\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{db8e490a925a60e62212cefc7674ca02\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
922
\PY{p}{\PYZcb{}}\PY{p}{)}
923
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
924
\end{Verbatim}
925
926
927
\begin{Verbatim}[commandchars=\\\{\}]
928
[1] "Success!"
929
930
\end{Verbatim}
931
932
\section{5. Packages}\label{packages}
933
934
R has many built-in functions, but we can also use functions that are
935
stored within packages created by other R users. We are going to use a
936
package, called \texttt{tidyverse}, to load, modify and plot data. This
937
package has already been installed for you. Later in the course you will
938
learn how to install packages so you are free to bring in other tools as
939
you need them for your data analysis.
940
941
To use the functions from a package you first need to load it using the
942
\texttt{library} function. This needs to be done once per notebook (and
943
a good rule of thumb is to do this at the very top of your notebook so
944
it is easy to see what packages your R code depends on).
945
946
\begin{Verbatim}[commandchars=\\\{\}]
947
{\color{incolor}In [{\color{incolor}23}]:} \PY{k+kn}{library}\PY{p}{(}tidyverse\PY{p}{)}
948
\end{Verbatim}
949
950
951
\begin{Verbatim}[commandchars=\\\{\}]
952
Attaching packages tidyverse 1.2.1
953
ggplot2 3.1.0 purrr 0.2.5
954
tibble 1.4.2 dplyr 0.7.7
955
tidyr 0.8.0 stringr 1.3.1
956
readr 1.1.1 forcats 0.3.0
957
Conflicts tidyverse\_conflicts()
958
dplyr::filter() masks stats::filter()
959
purrr::is\_null() masks testthat::is\_null()
960
dplyr::lag() masks stats::lag()
961
dplyr::matches() masks testthat::matches()
962
963
\end{Verbatim}
964
965
\textbf{Question 5.1.} Use the \texttt{library} function to load the
966
\texttt{rvest} R package.
967
968
We will use this package next week to scrape data from the web!
969
970
\begin{Verbatim}[commandchars=\\\{\}]
971
{\color{incolor}In [{\color{incolor}24}]:} \PY{c+c1}{\PYZsh{} Load the rvest package using the library function.}
972
973
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
974
\PY{k+kn}{library}\PY{p}{(}rvest\PY{p}{)}
975
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
976
\end{Verbatim}
977
978
979
\begin{Verbatim}[commandchars=\\\{\}]
980
Loading required package: xml2
981
982
Attaching package: ‘rvest’
983
984
The following object is masked from ‘package:purrr’:
985
986
pluck
987
988
The following object is masked from ‘package:readr’:
989
990
guess\_encoding
991
992
993
\end{Verbatim}
994
995
\begin{Verbatim}[commandchars=\\\{\}]
996
{\color{incolor}In [{\color{incolor}25}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect, the rvest package needs to be loaded\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
997
expect\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{package:rvest\PYZdq{}} \PY{o}{\PYZpc{}in\PYZpc{}} \PY{k+kp}{search}\PY{p}{(}\PY{p}{)} \PY{p}{,} is\PYZus{}true\PY{p}{(}\PY{p}{)}\PY{p}{)}
998
\PY{p}{\PYZcb{}}\PY{p}{)}
999
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1000
\end{Verbatim}
1001
1002
1003
\begin{Verbatim}[commandchars=\\\{\}]
1004
[1] "Success!"
1005
1006
\end{Verbatim}
1007
1008
\section{6. Looking for help}\label{looking-for-help}
1009
1010
\paragraph{Help Files}\label{help-files}
1011
1012
No one, not even experienced, professional programmers, remembers what every
1013
function does, nor do they remember every possible function
1014
argument/option. So both experienced and new programmers (like you!)
1015
need to look things up, A LOT! One of the most efficient places to look
1016
for help on how a function works is the R help files. Let's say we
1017
wanted to pull up the help file for the \texttt{max()} function. We can
1018
do this by typing a question mark in front of the function we want to
1019
know more about:
1020
1021
\begin{Verbatim}[commandchars=\\\{\}]
1022
{\color{incolor}In [{\color{incolor}26}]:} \PY{o}{?}\PY{k+kp}{max}
1023
\end{Verbatim}
1024
1025
1026
At the very top of the file, you will see the function itself and the
1027
package it is in (in this case, it is base). Next is a description of
1028
what the function does. You'll find that the most helpful sections on
1029
this page are ``Usage'', ``Arguments'' and ``Examples''.
1030
1031
\begin{itemize}
1032
\tightlist
1033
\item
1034
\textbf{Usage} gives you an idea of how you would use the function
1035
when coding---what the syntax would be and how the function itself is
1036
structured.
1037
\item
1038
\textbf{Arguments} tells you the different parts that can be added to
1039
the function to make it more simple or more complicated. Often the
1040
``Usage'' and ``Arguments'' sections don't provide you with step by
1041
step instructions, because there are so many different ways that a
1042
person can incorporate a function into their code. Instead, they
1043
provide users with a general understanding as to what the function
1044
could do and parts that could be added. At the end of the day, the
1045
user must interpret the help file and figure out how best to use the
1046
functions and which parts are most important to include for their
1047
particular task.
1048
\item
1049
The \textbf{Examples} section is often the most useful part of the
1050
help file as it shows how a function could be used with real data. It
1051
provides a skeleton code that the users can work off of.
1052
\end{itemize}
1053
1054
Beyond the R help files there are many resources that you can use to
1055
find help. \href{https://stackoverflow.com/}{Stack Overflow}, an online
1056
forum, is a great place to go and ask questions such as how to perform a
1057
complicated task in R or why a specific error message is popping up.
1058
Oftentimes, a previous user will have already asked your question of
1059
interest and received helpful advice from fellow R users.
1060
1061
\textbf{Question 6.1.} Use \texttt{?read\_csv} and read the
1062
\textbf{Description} section to answer the multiple choice question
1063
below. To answer the question assign the letter associated with the
1064
correct answer to a variable in the code cell below:
1065
1066
Which statement below is accurate?
1067
1068
A. \texttt{read\_csv2()} uses \texttt{;} for separators, instead of
1069
\texttt{,}
1070
1071
B. \texttt{read\_delim} is a special case of the \texttt{read\_csv}
1072
function.
1073
1074
C. These functions are useful for reading binary files, such as excel
1075
spreadsheets.
1076
1077
D. European countries commonly use \texttt{:} as the decimal separator.
1078
1079
\emph{Answer in the cell below using the uppercase letter associated
1080
with your answer. Place your answer between "" and assign the correct
1081
answer to an object called \texttt{answer}}
1082
1083
\begin{Verbatim}[commandchars=\\\{\}]
1084
{\color{incolor}In [{\color{incolor}27}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer}
1085
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1086
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1087
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1088
1089
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1090
answer \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{A\PYZdq{}}
1091
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1092
\PY{k+kp}{print}\PY{p}{(}answer\PY{p}{)}
1093
\end{Verbatim}
1094
1095
1096
\begin{Verbatim}[commandchars=\\\{\}]
1097
[1] "A"
1098
1099
\end{Verbatim}
1100
1101
\begin{Verbatim}[commandchars=\\\{\}]
1102
{\color{incolor}In [{\color{incolor}28}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1103
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{75f1160e72554f4270c809f041c7a776\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1104
1105
\PY{p}{\PYZcb{}}\PY{p}{)}
1106
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1107
\end{Verbatim}
1108
1109
1110
\begin{Verbatim}[commandchars=\\\{\}]
1111
[1] "Success!"
1112
1113
\end{Verbatim}
1114
1115
\section{7. Exercise}\label{exercise}
1116
1117
Now that we have learned a little about Jupyter notebooks and R, let's
1118
load a real dataset into R and explore it. As we do this we will learn
1119
more about key data loading, wrangling and visualization functions in R.
1120
1121
\subsubsection{Data about runners!}\label{data-about-runners}
1122
1123
Researchers Vickers and Vertosick performed
1124
\href{https://bmcsportsscimedrehabil.biomedcentral.com/articles/10.1186/s13102-016-0052-y}{a
1125
study in 2016} that aimed to identify what factors affect race
1126
performance of recreational runners so that they could build better
1127
models to predict 5 km, 10 km and marathon race times. Such models can
1128
help runners by suggesting changes they could make to modifiable
1129
factors, such as training, to help them improve race time. Unmodifiable
1130
factors in the model, such as age or sex, allow for fair comparisons to
1131
be made between different runners.
1132
1133
Vickers and Vertosick reasoned that their study is important because all
1134
previous research done to predict race times has focused on data from
1135
elite athletes. This biased data set means that the models generated
1136
from it do not necessarily do a good job predicting race times for
1137
recreational runners (whose data was not in the dataset that created the
1138
models). Additionally, previous research focused on reporting/measuring
1139
factors that require special expertise or equipment that are not freely
1140
available to recreational runners. This means that recreational runners
1141
may not be able to put their characteristics/measurements for these
1142
factors in the race time prediction models and so they will not be able
1143
to obtain an accurate prediction, or a prediction at all (in the case of
1144
some models).
1145
1146
To make a better model, Vickers and Vertosick performed a large survey.
1147
They put their survey on the news website
1148
\href{https://slate.com/}{Slate.com} attached to a news story about race
1149
time prediction. They were able to obtain 2,497 responses. The survey
1150
included questions that allowed them to collect a data set that
1151
included: - age, - sex, - body mass index (BMI), - whether they are an
1152
endurance runner or speed demon, - what type of shoes they wear, - what
1153
type of training they do, - race time for 2-3 races they completed in
1154
the last 6 months, - self-rated fitness for each race, - and race
1155
difficulty for each race.
1156
1157
Let's now use this data to explore a question we might be interested in
1158
- is there a relationship between 5 km race time and body mass index
1159
(BMI) for women runners (if there is, then it might be a useful factor
1160
to include in a race time prediction model for these runners). We will
1161
answer this question by visualizing the data as a scatter plot using R.
1162
To accomplish this, we will need to do the following things in R:
1163
1164
\begin{enumerate}
1165
\def\labelenumi{\arabic{enumi}.}
1166
\tightlist
1167
\item
1168
load the data set into R
1169
\item
1170
subset the data we are interested in visualizing from the loaded
1171
dataset
1172
\item
1173
create a new column to get the unit of time in minutes instead of
1174
seconds
1175
\item
1176
create a scatter plot using this modified data
1177
\end{enumerate}
1178
1179
\textbf{Question 7.1} Which of the following will you not find included
1180
in Vickers and Vertosick's data set?
1181
1182
A. age
1183
1184
B. body mass index
1185
1186
C. self-rated fitness for each race
1187
1188
D. what each runner ate before the race
1189
1190
\emph{Assign your answer to an object called \texttt{answer7.1}.}
1191
1192
\begin{Verbatim}[commandchars=\\\{\}]
1193
{\color{incolor}In [{\color{incolor}29}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.1}
1194
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1195
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1196
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1197
1198
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1199
answer7.1 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{D\PYZdq{}}
1200
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1201
\PY{k+kp}{print}\PY{p}{(}answer7.1\PY{p}{)}
1202
\end{Verbatim}
1203
1204
1205
\begin{Verbatim}[commandchars=\\\{\}]
1206
[1] "D"
1207
1208
\end{Verbatim}
1209
1210
\begin{Verbatim}[commandchars=\\\{\}]
1211
{\color{incolor}In [{\color{incolor}30}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1212
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer7.1\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{c1f86f7430df7ddb256980ea6a3b57a4\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1213
1214
\PY{p}{\PYZcb{}}\PY{p}{)}
1215
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1216
\end{Verbatim}
1217
1218
1219
\begin{Verbatim}[commandchars=\\\{\}]
1220
[1] "Success!"
1221
1222
\end{Verbatim}
1223
1224
\textbf{Question 7.2} True or False:
1225
1226
The researchers compiled this data so that they could build better
1227
models to predict marathon race times.
1228
1229
\emph{Assign your answer to an object called \texttt{answer7.2}.}
1230
1231
\begin{Verbatim}[commandchars=\\\{\}]
1232
{\color{incolor}In [{\color{incolor}31}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.2}
1233
\PY{c+c1}{\PYZsh{} Make sure the correct answer is written in lower\PYZhy{}case (true / false)}
1234
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1235
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1236
1237
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1238
answer7.2 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{true\PYZdq{}}
1239
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1240
\PY{k+kp}{print}\PY{p}{(}answer7.2\PY{p}{)}
1241
\end{Verbatim}
1242
1243
1244
\begin{Verbatim}[commandchars=\\\{\}]
1245
[1] "true"
1246
1247
\end{Verbatim}
1248
1249
\begin{Verbatim}[commandchars=\\\{\}]
1250
{\color{incolor}In [{\color{incolor}32}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1251
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer7.2\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{05ca18b596514af73f6880309a21b5dd\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1252
1253
\PY{p}{\PYZcb{}}\PY{p}{)}
1254
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1255
\end{Verbatim}
1256
1257
1258
\begin{Verbatim}[commandchars=\\\{\}]
1259
[1] "Success!"
1260
1261
\end{Verbatim}
1262
1263
\textbf{Question 7.3} What kind of graph will we be creating? Choose the
1264
correct answer from the options below.
1265
1266
A. Bar Graph
1267
1268
B. Pie Chart
1269
1270
C. Scatter Plot
1271
1272
D. Box Plot
1273
1274
\emph{Assign your answer to an object called \texttt{answer7.3}.}
1275
1276
\begin{Verbatim}[commandchars=\\\{\}]
1277
{\color{incolor}In [{\color{incolor}33}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.3}
1278
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1279
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1280
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1281
1282
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1283
answer7.3 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{C\PYZdq{}}
1284
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1285
\PY{k+kp}{print}\PY{p}{(}answer7.3\PY{p}{)}
1286
\end{Verbatim}
1287
1288
1289
\begin{Verbatim}[commandchars=\\\{\}]
1290
[1] "C"
1291
1292
\end{Verbatim}
1293
1294
\begin{Verbatim}[commandchars=\\\{\}]
1295
{\color{incolor}In [{\color{incolor}34}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1296
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer7.3\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{475bf9280aab63a82af60791302736f6\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1297
1298
\PY{p}{\PYZcb{}}\PY{p}{)}
1299
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1300
\end{Verbatim}
1301
1302
1303
\begin{Verbatim}[commandchars=\\\{\}]
1304
[1] "Success!"
1305
1306
\end{Verbatim}
1307
1308
Let's get started with our first step - loading the data set. The data
1309
set we are loading is called \texttt{marathon\_small.csv} and it
1310
contains a subset of the data from the study described above. The file
1311
is in the same directory/folder as the file for this notebook. It is a
1312
comma separated file (meaning the columns are separated by the
1313
\texttt{,} character). We often refer to these files as \texttt{.csv}'s.
1314
1315
\begin{verbatim}
1316
age,bmi,km5_time_seconds,km10_time_seconds,sex
1317
25.0,21.6221160888672,NA,2798,female
1318
41.0,23.905969619751,1210.0,NA,male
1319
25.0,21.6407279968262,994.0,NA,male
1320
35.0,23.5923233032227,1075.0,2135,male
1321
34.0,22.7064037322998,1186.0,NA,male
1322
45.0,42.0875434875488,3240.0,NA,female
1323
33.0,22.5182952880859,1292.0,NA,male
1324
58.0,25.2340793609619,NA,3420,male
1325
29.0,24.505407333374,1440.0,3240,male
1326
\end{verbatim}
1327
1328
We can use the \texttt{read\_csv} function to do this. Below is an
1329
example of reading a \texttt{.csv} file that is in the same
1330
directory/folder as the file for the notebook that would be reading it
1331
in:
1332
1333
\emph{Note - the quotes around the filename are important and you will
1334
get an error if you forget them.}
1335
1336
\textbf{Question 7.4} Use the \texttt{read\_csv()} function to load the
1337
data from the \texttt{marathon\_small.csv} file into R. Save the data to
1338
an object called \texttt{marathon\_small}. If you need additional help
1339
try \texttt{?read\_csv} and/or ask your neighbours or the Instructional
1340
team for help.
1341
1342
\begin{Verbatim}[commandchars=\\\{\}]
1343
{\color{incolor}In [{\color{incolor}35}]:} \PY{c+c1}{\PYZsh{} Load marathon\PYZus{}small.csv using read\PYZus{}csv and name it: marathon\PYZus{}small}
1344
\PY{k+kn}{library}\PY{p}{(}tidyverse\PY{p}{)}
1345
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1346
marathon\PYZus{}small \PY{o}{\PYZlt{}\PYZhy{}} read\PYZus{}csv\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{marathon\PYZus{}small.csv\PYZdq{}}\PY{p}{)}
1347
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1348
\PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}small\PY{p}{)}
1349
\end{Verbatim}
1350
1351
1352
\begin{Verbatim}[commandchars=\\\{\}]
1353
Parsed with column specification:
1354
cols(
1355
age = col\_double(),
1356
bmi = col\_double(),
1357
km5\_time\_seconds = col\_double(),
1358
km10\_time\_seconds = col\_integer(),
1359
sex = col\_character()
1360
)
1361
1362
\end{Verbatim}
1363
1364
\begin{tabular}{r|lllll}
1365
age & bmi & km5\_time\_seconds & km10\_time\_seconds & sex\\
1366
\hline
1367
25 & 21.62212 & NA & 2798 & female \\
1368
41 & 23.90597 & 1210 & NA & male \\
1369
25 & 21.64073 & 994 & NA & male \\
1370
35 & 23.59232 & 1075 & 2135 & male \\
1371
34 & 22.70640 & 1186 & NA & male \\
1372
45 & 42.08754 & 3240 & NA & female \\
1373
\end{tabular}
1374
1375
1376
1377
\begin{Verbatim}[commandchars=\\\{\}]
1378
{\color{incolor}In [{\color{incolor}36}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1379
expect\PYZus{}equal\PY{p}{(}\PY{k+kp}{nrow}\PY{p}{(}marathon\PYZus{}small\PY{p}{)}\PY{p}{,} \PY{l+m}{1833}\PY{p}{)}
1380
expect\PYZus{}equal\PY{p}{(}\PY{k+kp}{ncol}\PY{p}{(}marathon\PYZus{}small\PY{p}{)}\PY{p}{,} \PY{l+m}{5}\PY{p}{)}
1381
expect\PYZus{}equal\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}small\PY{o}{\PYZdl{}}age\PY{p}{)}\PY{p}{,} \PY{l+m}{66455.5}\PY{p}{)}
1382
expect\PYZus{}equal\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}small\PY{o}{\PYZdl{}}km5\PYZus{}time\PYZus{}seconds\PY{p}{,} na.rm \PY{o}{=} \PY{k+kc}{TRUE}\PY{p}{)}\PY{p}{,} \PY{l+m}{1944614.5}\PY{p}{)}
1383
\PY{p}{\PYZcb{}}\PY{p}{)}
1384
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1385
\end{Verbatim}
1386
1387
1388
\begin{Verbatim}[commandchars=\\\{\}]
1389
[1] "Success!"
1390
1391
\end{Verbatim}
1392
1393
The pink output under the code cell above tells you a bit about what
1394
happened when \texttt{read\_csv} read the data into R. It tells you that
1395
5 columns were created (names: age, bmi, km5\_time\_seconds,
1396
km10\_time\_seconds and sex) as well as the type of the data in those
1397
columns (\emph{e.g.}, number-type or text-type), specifically:
1398
1399
\begin{itemize}
1400
\tightlist
1401
\item
1402
\texttt{col\_double} means that the data in this column is a
1403
number-type, specifically real numbers (meaning that these values
1404
\emph{can contain decimals})
1405
\item
1406
\texttt{col\_integer} means that the data in this column is a
1407
number-type, specifically integers (whole numbers)
1408
\item
1409
\texttt{col\_character} means that the data in this column contains
1410
text (e.g., letters or words)
1411
\end{itemize}
1412
1413
\textbf{Question 7.5} From the list below, which is a valid way to store
1414
a data frame object read in from \texttt{read\_csv} to an object in R?
1415
1416
A. data -\textgreater{} read\_csv("example\_file.csv")
1417
1418
B. data \textless{}- read\_csv("example\_file.csv")
1419
1420
C. data \textless{}- read\_csv"example\_file.csv"
1421
1422
D. data \textless{}- read\_csv(example\_file.csv)
1423
1424
\emph{Answer in the cell below using the uppercase letter associated
1425
with your answer. Place your answer between "", assign the correct
1426
answer to an object called \texttt{answer7.5}}.
1427
1428
\begin{Verbatim}[commandchars=\\\{\}]
1429
{\color{incolor}In [{\color{incolor}37}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.5}
1430
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1431
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1432
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1433
1434
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION }
1435
answer7.5 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{B\PYZdq{}}
1436
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION }
1437
\PY{k+kp}{print}\PY{p}{(}answer7.5\PY{p}{)}
1438
\end{Verbatim}
1439
1440
1441
\begin{Verbatim}[commandchars=\\\{\}]
1442
[1] "B"
1443
1444
\end{Verbatim}
1445
1446
\begin{Verbatim}[commandchars=\\\{\}]
1447
{\color{incolor}In [{\color{incolor}38}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1448
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer7.5\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{3a5505c06543876fe45598b5e5e5195d\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1449
1450
\PY{p}{\PYZcb{}}\PY{p}{)}
1451
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1452
\end{Verbatim}
1453
1454
1455
\begin{Verbatim}[commandchars=\\\{\}]
1456
[1] "Success!"
1457
1458
\end{Verbatim}
1459
1460
\subsubsection{Data frames}\label{data-frames}
1461
1462
We can look at the structure of the data frame using the function
1463
\texttt{head()}.
1464
1465
\begin{Verbatim}[commandchars=\\\{\}]
1466
{\color{incolor}In [{\color{incolor}39}]:} \PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}small\PY{p}{)}
1467
\end{Verbatim}
1468
1469
1470
\begin{tabular}{r|lllll}
1471
age & bmi & km5\_time\_seconds & km10\_time\_seconds & sex\\
1472
\hline
1473
25 & 21.62212 & NA & 2798 & female \\
1474
41 & 23.90597 & 1210 & NA & male \\
1475
25 & 21.64073 & 994 & NA & male \\
1476
35 & 23.59232 & 1075 & 2135 & male \\
1477
34 & 22.70640 & 1186 & NA & male \\
1478
45 & 42.08754 & 3240 & NA & female \\
1479
\end{tabular}
1480
1481
1482
1483
\texttt{head()} returns the first 6 parts of a vector or data frame.
1484
1485
\begin{verbatim}
1486
age,bmi,km5_time_seconds,km10_time_seconds,sex
1487
25.0,21.6221160888672,NA,2798,female
1488
41.0,23.905969619751,1210.0,NA,male
1489
25.0,21.6407279968262,994.0,NA,male
1490
35.0,23.5923233032227,1075.0,2135,male
1491
34.0,22.7064037322998,1186.0,NA,male
1492
45.0,42.0875434875488,3240.0,NA,female
1493
33.0,22.5182952880859,1292.0,NA,male
1494
58.0,25.2340793609619,NA,3420,male
1495
29.0,24.505407333374,1440.0,3240,male
1496
\end{verbatim}
1497
1498
By default, the first row of a data set is always the \textbf{header}
1499
that \texttt{read\_csv} uses to label the columns. Therefore, the first
1500
row contains descriptive names while the rows below contain the actual
1501
data.
1502
1503
This only shows us a small portion of the data set. You can look at the
1504
entire data set by simply running a cell with \texttt{marathon\_small}
1505
(data frame name) written in it but that can be very long and
1506
unnecessary to look at.
1507
1508
\textbf{Question 7.6} To know how many rows there really are, use the
1509
function \texttt{nrow()}. Replace the \texttt{fail()} with your line of
1510
code. Assign the number of rows to the object \texttt{number\_rows}.
1511
1512
\begin{Verbatim}[commandchars=\\\{\}]
1513
{\color{incolor}In [{\color{incolor}40}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: number\PYZus{}rows}
1514
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1515
1516
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1517
number\PYZus{}rows \PY{o}{\PYZlt{}\PYZhy{}} \PY{k+kp}{nrow}\PY{p}{(}marathon\PYZus{}small\PY{p}{)}
1518
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1519
\PY{k+kp}{print}\PY{p}{(}number\PYZus{}rows\PY{p}{)}
1520
\end{Verbatim}
1521
1522
1523
\begin{Verbatim}[commandchars=\\\{\}]
1524
[1] 1833
1525
1526
\end{Verbatim}
1527
1528
\begin{Verbatim}[commandchars=\\\{\}]
1529
{\color{incolor}In [{\color{incolor}41}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1530
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}number\PYZus{}rows\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{58fac55045cec17cd9f4006f4b5ab349\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1531
1532
\PY{p}{\PYZcb{}}\PY{p}{)}
1533
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1534
\end{Verbatim}
1535
1536
1537
\begin{Verbatim}[commandchars=\\\{\}]
1538
[1] "Success!"
1539
1540
\end{Verbatim}
1541
1542
\subsubsection{Filter}\label{filter}
1543
1544
One of the most useful functions of \texttt{tidyverse} is
1545
\texttt{filter()}. With this function, it is possible to filter out
1546
specific observations based on their entries in one or more columns.
1547
1548
For example, if we had a data set (named \texttt{data}) that looked like
1549
this:
1550
1551
\begin{verbatim}
1552
colour size speed
1553
1 red 15 12.3
1554
2 blue 19 34.1
1555
3 blue 20 23.2
1556
4 red 22 21.9
1557
5 blue 12 33.6
1558
6 blue 23 28.8
1559
\end{verbatim}
1560
1561
we could use the first line of the code in the image below to filter for
1562
rows where the colour has the value of "blue". The second line of code
1563
below would let us filter for rows where the size has a value greater
1564
than 20.
1565
1566
1567
1568
\textbf{Question 7.7} Use the function \texttt{filter()} to subset your
1569
data frame \texttt{marathon\_small} so it only contains survey data from
1570
females. Assign your new filtered data frame to an object called
1571
\texttt{marathon\_filtered}. Replace the \texttt{fail()} with your line
1572
of code.
1573
1574
\begin{Verbatim}[commandchars=\\\{\}]
1575
{\color{incolor}In [{\color{incolor}42}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: marathon\PYZus{}filtered}
1576
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1577
1578
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1579
marathon\PYZus{}filtered \PY{o}{\PYZlt{}\PYZhy{}} filter\PY{p}{(}marathon\PYZus{}small\PY{p}{,} sex \PY{o}{==} \PY{l+s}{\PYZsq{}}\PY{l+s}{female\PYZsq{}}\PY{p}{)}
1580
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1581
1582
\PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}filtered\PY{p}{)}
1583
\end{Verbatim}
1584
1585
1586
\begin{tabular}{r|lllll}
1587
age & bmi & km5\_time\_seconds & km10\_time\_seconds & sex\\
1588
\hline
1589
25 & 21.62212 & NA & 2798 & female \\
1590
45 & 42.08754 & 3240 & NA & female \\
1591
36 & 25.40862 & 2115 & 4210 & female \\
1592
23 & 20.86986 & 1690 & NA & female \\
1593
34 & 23.58257 & 1603 & NA & female \\
1594
44 & 20.03506 & 1457 & NA & female \\
1595
\end{tabular}
1596
1597
1598
1599
\begin{Verbatim}[commandchars=\\\{\}]
1600
{\color{incolor}In [{\color{incolor}43}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1601
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{nrow}\PY{p}{(}marathon\PYZus{}filtered\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{22c7b9e96a1f1a8c4a13dc8b6586dc80\PYZsq{}}\PY{p}{)}
1602
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{ncol}\PY{p}{(}marathon\PYZus{}filtered\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{dd4ad37ee474732a009111e3456e7ed7\PYZsq{}}\PY{p}{)}
1603
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}filtered\PY{o}{\PYZdl{}}bmi\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{7cc4baefd16add414fe6a9e051a2f5f5\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1604
1605
\PY{p}{\PYZcb{}}\PY{p}{)}
1606
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1607
\end{Verbatim}
1608
1609
1610
\begin{Verbatim}[commandchars=\\\{\}]
1611
[1] "Success!"
1612
1613
\end{Verbatim}
1614
1615
\subsubsection{Select}\label{select}
1616
1617
The \texttt{select()} function allows you to zoom in and focus on
1618
specific parts of the data. It is particularly helpful when working with
1619
extremely large datasets. More specifically, the function allows you to
1620
separate one or more columns from your dataset and transfer them into
1621
their own data frame.
1622
1623
Remembering our example \texttt{data}:
1624
1625
\begin{verbatim}
1626
colour size speed
1627
1 red 15 12.3
1628
2 blue 19 34.1
1629
3 blue 20 23.2
1630
4 red 22 21.9
1631
5 blue 12 33.6
1632
6 blue 23 28.8
1633
\end{verbatim}
1634
1635
For example, we can use the function \texttt{select()} to choose columns
1636
of interest (here colour and size).
1637
1638
and we would get this smaller data set back:
1639
1640
\begin{verbatim}
1641
colour size
1642
1 red 15
1643
2 blue 19
1644
3 blue 20
1645
4 red 22
1646
5 blue 12
1647
6 blue 23
1648
\end{verbatim}
1649
1650
\textbf{Question 7.8} Use the function \texttt{select} to choose the
1651
columns \texttt{bmi} and \texttt{km5\_time\_seconds} from
1652
\texttt{marathon\_filtered}. Assign your new filtered data frame to an
1653
object called \texttt{marathon\_female}.
1654
1655
Replace the \texttt{fail()} with your line of code. \emph{Make sure you
1656
select first \texttt{bmi} and then \texttt{km5\_time\_seconds}}!
1657
1658
\begin{Verbatim}[commandchars=\\\{\}]
1659
{\color{incolor}In [{\color{incolor}44}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: marathon\PYZus{}female}
1660
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1661
1662
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1663
marathon\PYZus{}female \PY{o}{\PYZlt{}\PYZhy{}} select\PY{p}{(}marathon\PYZus{}filtered\PY{p}{,} bmi\PY{p}{,} km5\PYZus{}time\PYZus{}seconds\PY{p}{)}
1664
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1665
\PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}female\PY{p}{)}
1666
\end{Verbatim}
1667
1668
1669
\begin{tabular}{r|ll}
1670
bmi & km5\_time\_seconds\\
1671
\hline
1672
21.62212 & NA \\
1673
42.08754 & 3240 \\
1674
25.40862 & 2115 \\
1675
20.86986 & 1690 \\
1676
23.58257 & 1603 \\
1677
20.03506 & 1457 \\
1678
\end{tabular}
1679
1680
1681
1682
\begin{Verbatim}[commandchars=\\\{\}]
1683
{\color{incolor}In [{\color{incolor}45}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1684
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{nrow}\PY{p}{(}marathon\PYZus{}female\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{22c7b9e96a1f1a8c4a13dc8b6586dc80\PYZsq{}}\PY{p}{)}
1685
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{ncol}\PY{p}{(}marathon\PYZus{}female\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{c01f179e4b57ab8bd9de309e6d576c48\PYZsq{}}\PY{p}{)}
1686
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}female\PY{o}{\PYZdl{}}bmi\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{7cc4baefd16add414fe6a9e051a2f5f5\PYZsq{}}\PY{p}{)}
1687
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}female\PY{o}{\PYZdl{}}km5\PYZus{}time\PYZus{}seconds\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{9c9393e1464352cd4fbea94dfadfa02a\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1688
1689
\PY{p}{\PYZcb{}}\PY{p}{)}
1690
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1691
\end{Verbatim}
1692
1693
1694
\begin{Verbatim}[commandchars=\\\{\}]
1695
[1] "Success!"
1696
1697
\end{Verbatim}
1698
1699
\subsubsection{\texorpdfstring{Pipe Operators:
1700
\texttt{\%\textgreater{}\%}}{Pipe Operators: \%\textgreater{}\%}}\label{pipe-operators}
1701
1702
Pipe operators allow you to chain together different functions - it
1703
takes the output of one statement and makes it the input of the next
1704
statement. Having a chain of processing functions is known as a
1705
\emph{pipeline}.
1706
1707
For example, we can combine filter and select into one command:
1708
1709
\texttt{blue\_data\ \textless{}-\ filter(data,\ colour\ ==\ "blue")\ \%\textgreater{}\%\ select(colour,\ size)}
1710
1711
Since we want to specifically plot data of female participants, we need
1712
to first filter the sex column using the function: \texttt{filter()}.
1713
Below, you can see how this function as well as pipe operators
1714
(\texttt{\%\textgreater{}\%}) are used! Then we need to select the
1715
column variables that we wish to look at. Since we want to plot BMI
1716
against the time it took to run 5 km, we must select \texttt{bmi} and
1717
\texttt{km5\_time\_seconds} accordingly. For this, we need to use the
1718
function: \texttt{select()}.
1719
1720
The following cell shows you how we can chain together filter and select
1721
for the marathon dataframe.
1722
1723
\begin{Verbatim}[commandchars=\\\{\}]
1724
{\color{incolor}In [{\color{incolor}46}]:} \PY{c+c1}{\PYZsh{} Run this cell. }
1725
1726
marathon\PYZus{}female \PY{o}{\PYZlt{}\PYZhy{}} filter\PY{p}{(}marathon\PYZus{}small\PY{p}{,} sex \PY{o}{==} \PY{l+s}{\PYZsq{}}\PY{l+s}{female\PYZsq{}}\PY{p}{)} \PY{o}{\PYZpc{}\PYZgt{}\PYZpc{}} select\PY{p}{(}bmi\PY{p}{,} km5\PYZus{}time\PYZus{}seconds\PY{p}{)}
1727
\PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}female\PY{p}{)}
1728
\end{Verbatim}
1729
1730
1731
\begin{tabular}{r|ll}
1732
bmi & km5\_time\_seconds\\
1733
\hline
1734
21.62212 & NA \\
1735
42.08754 & 3240 \\
1736
25.40862 & 2115 \\
1737
20.86986 & 1690 \\
1738
23.58257 & 1603 \\
1739
20.03506 & 1457 \\
1740
\end{tabular}
1741
1742
1743
1744
\textbf{Question 7.9} Why do we \textbf{only} write marathon\_small
1745
(original data frame) for the function: filter()?
1746
1747
A. Because select does not require the original data frame as an
1748
argument.
1749
1750
B. Because the pipe operator uses the data frame in the first line as
1751
the data frame for all subsequent lines.
1752
1753
C. Because the pipe operator uses the output of the first function as
1754
the input of the second function.
1755
1756
\emph{Answer in the cell below using the uppercase letter associated
1757
with your answer. Place your answer between "", assign the correct
1758
answer to an object called \texttt{answer7.9}}.
1759
1760
\begin{Verbatim}[commandchars=\\\{\}]
1761
{\color{incolor}In [{\color{incolor}47}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.9}
1762
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1763
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1764
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1765
1766
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1767
answer7.9 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{C\PYZdq{}}
1768
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1769
\end{Verbatim}
1770
1771
1772
\begin{Verbatim}[commandchars=\\\{\}]
1773
{\color{incolor}In [{\color{incolor}48}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1774
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}answer7.9\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{475bf9280aab63a82af60791302736f6\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1775
1776
\PY{p}{\PYZcb{}}\PY{p}{)}
1777
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1778
\end{Verbatim}
1779
1780
1781
\begin{Verbatim}[commandchars=\\\{\}]
1782
[1] "Success!"
1783
1784
\end{Verbatim}
1785
1786
\textbf{Question 7.10} What are the units of the time taken to complete
1787
a run of 5 km?
1788
1789
\emph{Hint: scroll up and look at the introduction to this exercise.}
1790
1791
\begin{Verbatim}[commandchars=\\\{\}]
1792
{\color{incolor}In [{\color{incolor}49}]:} \PY{c+c1}{\PYZsh{} Write you answer in lower case. Place your answer between \PYZdq{}\PYZdq{}}
1793
\PY{c+c1}{\PYZsh{} Assign your answer for Question 7.10 to an object called: answer7.10}
1794
1795
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1796
answer7.10 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{seconds\PYZdq{}}
1797
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1798
\end{Verbatim}
1799
1800
1801
\begin{Verbatim}[commandchars=\\\{\}]
1802
{\color{incolor}In [{\color{incolor}50}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1803
expect\PYZus{}match\PY{p}{(}digest\PY{p}{(}answer7.10\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZdq{}}\PY{l+s}{a9cf135185e7fe4ae642c8dcb228cd2d\PYZdq{}}\PY{p}{)}
1804
\PY{p}{\PYZcb{}}\PY{p}{)}
1805
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1806
\end{Verbatim}
1807
1808
1809
\begin{Verbatim}[commandchars=\\\{\}]
1810
[1] "Success!"
1811
1812
\end{Verbatim}
1813
1814
\textbf{Question 7.11} What are the units for time (e.g., seconds,
1815
minutes, hours) that we would like to use when plotting BMI against time
1816
taken to run 5 km? \emph{Hint: scroll up and look at the introduction to
1817
this exercise.}
1818
1819
\begin{Verbatim}[commandchars=\\\{\}]
1820
{\color{incolor}In [{\color{incolor}51}]:} \PY{c+c1}{\PYZsh{} Write you answer in lower case. Place your answer between \PYZdq{}\PYZdq{}}
1821
\PY{c+c1}{\PYZsh{} Assign your answer for Question 7.11 to an object called: answer7.11}
1822
1823
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1824
answer7.11 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{minutes\PYZdq{}}
1825
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1826
\end{Verbatim}
1827
1828
1829
\begin{Verbatim}[commandchars=\\\{\}]
1830
{\color{incolor}In [{\color{incolor}52}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1831
expect\PYZus{}match\PY{p}{(}digest\PY{p}{(}answer7.11\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZdq{}}\PY{l+s}{edf7faf67d063030eba4ec85c6f7cc55\PYZdq{}}\PY{p}{)}
1832
\PY{p}{\PYZcb{}}\PY{p}{)}
1833
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1834
\end{Verbatim}
1835
1836
1837
\begin{Verbatim}[commandchars=\\\{\}]
1838
[1] "Success!"
1839
1840
\end{Verbatim}
1841
1842
\subsubsection{Mutate}\label{mutate}
1843
1844
The function \texttt{mutate()} is used to add columns to an existing
1845
dataset where the new column is usually a function of one or more of the
1846
existing columns.
1847
1848
\textbf{Question 7.12}
1849
1850
Add a new column to our marathon\_female dataset called
1851
\texttt{km5\_time\_minutes} that is equal to
1852
\texttt{km5\_time\_seconds/60.}
1853
1854
\begin{Verbatim}[commandchars=\\\{\}]
1855
{\color{incolor}In [{\color{incolor}53}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: marathon\PYZus{}minutes}
1856
\PY{c+c1}{\PYZsh{} Replace the fail() with your line of code.}
1857
1858
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1859
marathon\PYZus{}minutes \PY{o}{\PYZlt{}\PYZhy{}} mutate\PY{p}{(}marathon\PYZus{}female\PY{p}{,} km5\PYZus{}time\PYZus{}minutes \PY{o}{=} km5\PYZus{}time\PYZus{}seconds\PY{o}{/}\PY{l+m}{60}\PY{p}{)}
1860
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1861
\PY{k+kp}{head}\PY{p}{(}marathon\PYZus{}minutes\PY{p}{)}
1862
\end{Verbatim}
1863
1864
1865
\begin{tabular}{r|lll}
1866
bmi & km5\_time\_seconds & km5\_time\_minutes\\
1867
\hline
1868
21.62212 & NA & NA\\
1869
42.08754 & 3240 & 54.00000\\
1870
25.40862 & 2115 & 35.25000\\
1871
20.86986 & 1690 & 28.16667\\
1872
23.58257 & 1603 & 26.71667\\
1873
20.03506 & 1457 & 24.28333\\
1874
\end{tabular}
1875
1876
1877
1878
\begin{Verbatim}[commandchars=\\\{\}]
1879
{\color{incolor}In [{\color{incolor}54}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1880
expect\PYZus{}equal\PY{p}{(}digest\PY{p}{(}\PY{k+kp}{sum}\PY{p}{(}marathon\PYZus{}minutes\PY{o}{\PYZdl{}}km5\PYZus{}time\PYZus{}minutes\PY{p}{)}\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{9c9393e1464352cd4fbea94dfadfa02a\PYZsq{}}\PY{p}{)} \PY{c+c1}{\PYZsh{} we hid the answer to the test here so you can\PYZsq{}t see it, but we can still run the test}
1881
1882
\PY{p}{\PYZcb{}}\PY{p}{)}
1883
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1884
\end{Verbatim}
1885
1886
1887
\begin{Verbatim}[commandchars=\\\{\}]
1888
[1] "Success!"
1889
1890
\end{Verbatim}
1891
1892
\subsubsection{Graphing}\label{graphing}
1893
1894
\texttt{ggplot} is a function that works using layers of code. Every
1895
time you want to see something new added to your plot, you must add a
1896
new layer with each layer being separated by the ``+'' symbol. The first
1897
function we use in this line of code is the \texttt{ggplot} function.
1898
Here, we indicate the arguments that apply to all layers of the plot.
1899
The second function we use is \texttt{geom\_point()}. This function
1900
indicates that we wish to produce a scatterplot and the way we wish to
1901
display the data within this scatterplot.
1902
1903
Let's plot a scatterplot with the \texttt{bmi} on the x axis and
1904
\texttt{km5\_time\_minutes} on the y axis.
1905
1906
\begin{figure}
1907
\centering
1908
\includegraphics{images/ws1_ggplot_female.png}
1909
\caption{ws1\_ggplot\_female.png}
1910
\end{figure}
1911
1912
\begin{Verbatim}[commandchars=\\\{\}]
1913
{\color{incolor}In [{\color{incolor}55}]:} \PY{c+c1}{\PYZsh{} code to set\PYZhy{}up plot size}
1914
\PY{k+kn}{library}\PY{p}{(}repr\PY{p}{)}
1915
\PY{k+kp}{options}\PY{p}{(}repr.plot.width\PY{o}{=}\PY{l+m}{4}\PY{p}{,} repr.plot.height\PY{o}{=}\PY{l+m}{3}\PY{p}{)}
1916
\end{Verbatim}
1917
1918
1919
\begin{Verbatim}[commandchars=\\\{\}]
1920
{\color{incolor}In [{\color{incolor}56}]:} \PY{c+c1}{\PYZsh{} Run this cell to create a scatterplot of BMI against the time it took to run 5 kilometers. }
1921
ggplot\PY{p}{(}data \PY{o}{=} marathon\PYZus{}minutes\PY{p}{,} aes\PY{p}{(}x \PY{o}{=} bmi\PY{p}{,} y \PY{o}{=} km5\PYZus{}time\PYZus{}minutes\PY{p}{)}\PY{p}{)} \PY{o}{+} geom\PYZus{}point\PY{p}{(}\PY{p}{)}
1922
\end{Verbatim}
1923
1924
1925
\begin{Verbatim}[commandchars=\\\{\}]
1926
Warning message:
1927
“Removed 160 rows containing missing values (geom\_point).”
1928
\end{Verbatim}
1929
1930
1931
1932
\begin{center}
1933
\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{output_121_2.png}
1934
\end{center}
1935
{ \hspace*{\fill} \\}
1936
1937
\textbf{Question 7.13} Looking at the graph above, choose a statement
1938
below that best reflects what we see.
1939
1940
A. There may be a positive trend/relationship between 5 km run time and
1941
body mass index; as the value for body mass index increases, so does
1942
the time it takes to run 5 km.
1943
1944
B. There may be a negative trend/relationship between 5 km run time and
1945
body mass index; as the value for body mass index increases, the
1946
time it takes to run 5 km decreases.
1947
1948
C. There appears to be no trend/relationship between 5 km run time and
1949
body mass index; as the value for body mass index increases we see
1950
neither an increase nor a decrease in the time it takes to run 5 km.
1951
1952
\emph{Assign your answer to an object called \texttt{answer7.13}}.
1953
1954
\begin{Verbatim}[commandchars=\\\{\}]
1955
{\color{incolor}In [{\color{incolor}57}]:} \PY{c+c1}{\PYZsh{} Assign your answer to an object called: answer7.13}
1956
\PY{c+c1}{\PYZsh{} Make sure the correct answer is an uppercase letter. }
1957
\PY{c+c1}{\PYZsh{} Surround your answer with quotation marks.}
1958
\PY{c+c1}{\PYZsh{} Replace the fail() with your answer. }
1959
1960
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} BEGIN SOLUTION}
1961
answer7.13 \PY{o}{\PYZlt{}\PYZhy{}} \PY{l+s}{\PYZdq{}}\PY{l+s}{A\PYZdq{}}
1962
\PY{c+c1}{\PYZsh{}\PYZsh{}\PYZsh{} END SOLUTION}
1963
\end{Verbatim}
1964
1965
1966
\begin{Verbatim}[commandchars=\\\{\}]
1967
{\color{incolor}In [{\color{incolor}58}]:} test\PYZus{}that\PY{p}{(}\PY{l+s}{\PYZsq{}}\PY{l+s}{Solution is incorrect\PYZsq{}}\PY{p}{,} \PY{p}{\PYZob{}}
1968
expect\PYZus{}match\PY{p}{(}digest\PY{p}{(}answer7.13\PY{p}{)}\PY{p}{,} \PY{l+s}{\PYZsq{}}\PY{l+s}{75f1160e72554f4270c809f041c7a776\PYZsq{}}\PY{p}{)}
1969
\PY{p}{\PYZcb{}}\PY{p}{)}
1970
\PY{k+kp}{print}\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Success!\PYZdq{}}\PY{p}{)}
1971
\end{Verbatim}
1972
1973
1974
\begin{Verbatim}[commandchars=\\\{\}]
1975
[1] "Success!"
1976
1977
\end{Verbatim}
1978
1979
The code we listed above for graphics barely scratches the surface of
1980
what ggplot, and R as a whole, are capable of. Not only are there far
1981
more choices about the kinds of plots available, but there are many,
1982
many options for customizing the look and feel of each graph. You can
1983
choose the font, the font size, the colors, the style of the axes, etc.
1984
1985
Let's dig a little deeper into just a couple of options that you can add
1986
to any of your graphs to make them look a little better. For example,
1987
you can change the text of the x-axis label or the y-axis label by using
1988
\texttt{xlab("")} or \texttt{ylab("")}. Let's do that for the
1989
scatterplot to make the labels easier to read.
1990
1991
\begin{Verbatim}[commandchars=\\\{\}]
1992
{\color{incolor}In [{\color{incolor}59}]:} \PY{c+c1}{\PYZsh{} Run this cell. }
1993
\PY{c+c1}{\PYZsh{} You can replace the axes with whatever you wish to label. }
1994
\PY{c+c1}{\PYZsh{} After running the cell once, try changing the axes to something else. }
1995
1996
ggplot\PY{p}{(}data \PY{o}{=} marathon\PYZus{}minutes\PY{p}{,} aes\PY{p}{(}x \PY{o}{=} bmi\PY{p}{,} y \PY{o}{=} km5\PYZus{}time\PYZus{}minutes\PY{p}{)}\PY{p}{)} \PY{o}{+} geom\PYZus{}point\PY{p}{(}\PY{p}{)} \PY{o}{+}
1997
xlab\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{Body Mass Index\PYZdq{}}\PY{p}{)} \PY{o}{+} ylab\PY{p}{(}\PY{l+s}{\PYZdq{}}\PY{l+s}{5km run time (minutes)\PYZdq{}}\PY{p}{)}
1998
\end{Verbatim}
1999
2000
2001
\begin{Verbatim}[commandchars=\\\{\}]
2002
Warning message:
2003
“Removed 160 rows containing missing values (geom\_point).”
2004
\end{Verbatim}
2005
2006
2007
2008
\begin{center}
2009
\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{output_126_2.png}
2010
\end{center}
2011
{ \hspace*{\fill} \\}
2012
2013
\subsection{Attributions}\label{attributions}
2014
2015
\begin{itemize}
2016
\tightlist
2017
\item
2018
UC Berkeley \href{https://github.com/data-8/data8assets}{Data 8 Public
2019
Materials}
2020
\end{itemize}
2021
2022
2023
% Add a bibliography block to the postdoc
2024
2025
2026
2027
\end{document}
2028
2029