Path: blob/master/projects/project_1/Project-1 - my notes w' different method process.ipynb
1904 views
Project 1
In this first project you will implement a few python functions and generally familiarize yourself with the pandas library.
Please refer to the numpy-and-pandas.ipynb notebook in lesson2, the for loop section of the python-controlflow.ipynb notebook in the python_foundations folder, and the pandas documentation here for assistance.
I have written the numerical answers you are looking for below - please show me the code you used to generate those answers.
Note! You will need to look within that documentation/ use other search results on the internet to complete this assignment!
Question 1: Multiples of Three and Five
If we list all of the natural numbers below 10 that are multiples of 3 or 5, we get 3, 5, 6, and 9. The sum of these multiples is 23. Find the sum of all the multiples of 3 or 5 below 1,000.
Answer: 233,168
Note: you may find yourself with the answer 266,333! Think carefully about what is going on with this question and what may be driving the difference between your answer and the correct value! A hint can be found in the control flow notebook.
Using elif (instead of a second, separate if) solves the issue of counting a number that is a multiple of both 3 and 5 twice.
Question 2: Pandas Intro
2.1 Load the Citibike-Feb-24 dataset into memory and assign it to the variable "df"
The data are in /data/citibike_feb2014.csv
Use pd.read_csv function. Please refer to the documentation here if you are having trouble.
2.2 How many rows and how many columns are there in the dataset?
A: 224,736 rows, 15 columns
2.3 Please print out the first five rows of the dataset
2.4 What is the average trip duration? (In seconds)
A: 874.5198 seconds
2.5 What is the total trip duration in this entire dataset in hours?
A: 54593.3567 hours
2.6 What is the most popular start station? And how many rides started at that station in Feb 2014?
Note, the pandas cookbook may come in handy for this (look at chapter 1 & 2): https://pandas.pydata.org/pandas-docs/stable/tutorials.html
Note: all of the rides in this dataset are from February, so no filtering by month is needed.
A: Station id: 293, Number of rides: 2920
2.7 What percentage of the total riders are of usertype "Subscriber"?
A: 97.0112
What is the average age (in 2014) of the riders in this dataset?
Note, this requires creating a new column and then taking the difference between 2014 and the rider's birth year, then taking the average!
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-137-1b2a498ce611> in <module>()
----> 1 2014 - df['birth year'][0]
TypeError: unsupported operand type(s) for -: 'int' and 'str'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-144-ff839553694e> in <module>()
----> 1 df['birth year'].astype(int)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
116 else:
117 kwargs[new_arg_name] = new_arg_value
--> 118 return func(*args, **kwargs)
119 return wrapper
120 return _deprecate_kwarg
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4002 # else, only a single dtype is given
4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004 **kwargs)
4005 return self._constructor(new_data).__finalize__(self)
4006
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3460
3461 def astype(self, dtype, **kwargs):
-> 3462 return self.apply('astype', dtype=dtype, **kwargs)
3463
3464 def convert(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3327
3328 kwargs['mgr'] = self
-> 3329 applied = getattr(b, f)(**kwargs)
3330 result_blocks = _extend_blocks(applied, result_blocks)
3331
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
543 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544 **kwargs)
545
546 def _astype(self, dtype, copy=False, errors='raise', values=None,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
623
624 # _astype_nansafe works fine with 1-d only
--> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
626 values = values.reshape(self.shape)
627
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
690 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
691 # work around NumPy brokenness, #1987
--> 692 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
693
694 if dtype.name in ("datetime64", "timedelta64"):
pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src/util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: '\\N'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-126-009a67367ef8> in <module>()
----> 1 df['birth year'].astype(str).astype(int)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
116 else:
117 kwargs[new_arg_name] = new_arg_value
--> 118 return func(*args, **kwargs)
119 return wrapper
120 return _deprecate_kwarg
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
4002 # else, only a single dtype is given
4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004 **kwargs)
4005 return self._constructor(new_data).__finalize__(self)
4006
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs)
3460
3461 def astype(self, dtype, **kwargs):
-> 3462 return self.apply('astype', dtype=dtype, **kwargs)
3463
3464 def convert(self, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3327
3328 kwargs['mgr'] = self
-> 3329 applied = getattr(b, f)(**kwargs)
3330 result_blocks = _extend_blocks(applied, result_blocks)
3331
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs)
542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
543 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544 **kwargs)
545
546 def _astype(self, dtype, copy=False, errors='raise', values=None,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
623
624 # _astype_nansafe works fine with 1-d only
--> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
626 values = values.reshape(self.shape)
627
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy)
690 elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
691 # work around NumPy brokenness, #1987
--> 692 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
693
694 if dtype.name in ("datetime64", "timedelta64"):
pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()
pandas/_libs/src/util.pxd in util.set_value_at_unsafe()
ValueError: invalid literal for int() with base 10: '\\N'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "\N"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-100-cd37727500f1> in <module>()
----> 1 df['birth_year'] = pd.to_numeric(df['birth year'])
/anaconda3/lib/python3.6/site-packages/pandas/core/tools/numeric.py in to_numeric(arg, errors, downcast)
131 coerce_numeric = False if errors in ('ignore', 'raise') else True
132 values = lib.maybe_convert_numeric(values, set(),
--> 133 coerce_numeric=coerce_numeric)
134
135 except Exception:
pandas/_libs/src/inference.pyx in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "\N" at position 31
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
675 try:
--> 676 result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
677 except TypeError:
/anaconda3/lib/python3.6/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
203 if use_numexpr:
--> 204 return _evaluate(op, op_str, a, b, **eval_kwargs)
205 return _evaluate_standard(op, op_str, a, b)
/anaconda3/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
118 if result is None:
--> 119 result = _evaluate_standard(op, op_str, a, b)
120
/anaconda3/lib/python3.6/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
63 with np.errstate(all='ignore'):
---> 64 return op(a, b)
65
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x, y)
98 default_axis=default_axis, reversed=True),
---> 99 rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-'),
100 default_axis=default_axis, reversed=True),
TypeError: unsupported operand type(s) for -: 'int' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
699 with np.errstate(all='ignore'):
--> 700 return na_op(lvalues, rvalues)
701 except Exception:
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
685 mask = notna(x)
--> 686 result[mask] = op(x[mask], y)
687 else:
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x, y)
98 default_axis=default_axis, reversed=True),
---> 99 rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-'),
100 default_axis=default_axis, reversed=True),
TypeError: unsupported operand type(s) for -: 'int' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-99-008ed3355f8f> in <module>()
1 x = 2014
----> 2 df['diff_year'] = x - df['birth year']
3 df['diff_year'].mean()
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(left, right, name, na_op)
737 lvalues = lvalues.values
738
--> 739 result = wrap_results(safe_na_op(lvalues, rvalues))
740 return construct_result(
741 left,
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in safe_na_op(lvalues, rvalues)
708 if is_object_dtype(lvalues):
709 return libalgos.arrmap_object(lvalues,
--> 710 lambda x: op(x, rvalues))
711 raise
712
pandas/_libs/algos_common_helper.pxi in pandas._libs.algos.arrmap_object()
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x)
708 if is_object_dtype(lvalues):
709 return libalgos.arrmap_object(lvalues,
--> 710 lambda x: op(x, rvalues))
711 raise
712
/anaconda3/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x, y)
97 rmul=arith_method(operator.mul, names('rmul'), op('*'),
98 default_axis=default_axis, reversed=True),
---> 99 rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-'),
100 default_axis=default_axis, reversed=True),
101 rtruediv=arith_method(lambda x, y: operator.truediv(y, x),
TypeError: unsupported operand type(s) for -: 'int' and 'str'
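Adding a \ escapes the backslash, which resolves the special-character issue with the string "\N" (the placeholder used for missing birth years). The problem with replacing that placeholder with 0 is that the zeros are going to distort the average result - this analysis is correct, and that's why we set these values to "not a number" (NaN) and not 0.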