Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

GEP475GROUPINEEDANAP

Views: 1461
Kernel: Python 3
Future question for Dr. Soto

Should I concatinate the dataframes on integer indices or Date-Time indices? I dont think it will make a difference.. not sure which one is easier

import pandas as pd import numpy as np
df1 = pd.read_csv('NetAtmo_2016.csv', parse_dates = True,) df1.describe()
Timestamp Temperature Humidity CO2 Noise Pressure
count 9.014300e+04 90143.000000 90143.000000 90137.000000 90132.000000 90143.000000
mean 1.469613e+09 22.766724 51.291637 550.204799 38.964730 1011.348933
std 7.900773e+06 1.592268 6.929620 318.321732 7.100703 4.217541
min 1.455917e+09 17.900000 27.000000 201.000000 35.000000 995.000000
25% 1.462765e+09 21.700000 49.000000 354.000000 36.000000 1008.300000
50% 1.469657e+09 22.900000 52.000000 416.000000 36.000000 1011.000000
75% 1.476459e+09 23.800000 55.000000 639.000000 38.000000 1014.100000
max 1.483257e+09 28.500000 76.000000 2777.000000 79.000000 1027.500000
new_index1 = pd.Series(range(1,90144))
df1['Numbered_index'] = new_index1
df1.set_index('Numbered_index', inplace = True) df1.head()
Timestamp Timezone : America/Los_Angeles Temperature Humidity CO2 Noise Pressure
Numbered_index
1 1455917199 2/19/16 13:26 18.8 76 NaN NaN 1015.7
2 1455917255 2/19/16 13:27 19.2 75 718.0 NaN 1015.7
3 1455917257 2/19/16 13:27 19.9 73 NaN NaN 1015.7
4 1455917513 2/19/16 13:31 20.3 73 337.0 44.0 1015.8
5 1455917814 2/19/16 13:36 21.2 70 332.0 47.0 1015.7
df1.drop(df1.columns[[0,2,3,5,6]], axis =1, inplace = True)
df1.head(1)
Timezone : America/Los_Angeles CO2
Numbered_index
1 2/19/16 13:26 NaN
df2 = pd.read_csv('NetAtmo_2017.csv', parse_dates = True)
new_index2 = pd.Series(range(90144, 100992))
df2['numbered_index'] = new_index2
df2.set_index('numbered_index', inplace = True)
df2.drop(df2.columns[[0,2,3,5,6]], axis =1, inplace = True)
df2.head()
Time CO2
numbered_index
90144 1/1/17 0:00 482
90145 1/1/17 0:05 491
90146 1/1/17 0:11 480
90147 1/1/17 0:16 486
90148 1/1/17 0:21 490
df1.head()
Timezone : America/Los_Angeles CO2
Numbered_index
1 2/19/16 13:26 NaN
2 2/19/16 13:27 718.0
3 2/19/16 13:27 NaN
4 2/19/16 13:31 337.0
5 2/19/16 13:36 332.0
df1 = df1.rename(columns = {'Timezone : America/Los_Angeles':'Time'}) df1.head()
Time CO2
Numbered_index
1 2/19/16 13:26 NaN
2 2/19/16 13:27 718.0
3 2/19/16 13:27 NaN
4 2/19/16 13:31 337.0
5 2/19/16 13:36 332.0
df2.tail()
Time CO2
numbered_index
100987 2/12/17 18:27 484
100988 2/12/17 18:32 486
100989 2/12/17 18:37 469
100990 2/12/17 18:42 485
100991 2/12/17 18:47 480
df3 = pd.concat([df1,df2]) df3.head()
Time CO2
1 2/19/16 13:26 NaN
2 2/19/16 13:27 718.0
3 2/19/16 13:27 NaN
4 2/19/16 13:31 337.0
5 2/19/16 13:36 332.0
df3.tail()
Time CO2
100987 2/12/17 18:27 484.0
100988 2/12/17 18:32 486.0
100989 2/12/17 18:37 469.0
100990 2/12/17 18:42 485.0
100991 2/12/17 18:47 480.0
df3.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc30f81eb00>
Image in a Jupyter notebook
#df3.set_index('Time', inplace = True) #df3.head()
#df3.plot()
#df3.plot.hist()
df3.dtypes
Time object CO2 float64 dtype: object
df3.head()
Time CO2
1 2/19/16 13:26 NaN
2 2/19/16 13:27 718.0
3 2/19/16 13:27 NaN
4 2/19/16 13:31 337.0
5 2/19/16 13:36 332.0
df3.isnull().head()
Time CO2
1 False True
2 False False
3 False True
4 False False
5 False False
df3['Time'] = pd.to_datetime(df3.Time)
df3.head()
Time CO2
1 2016-02-19 13:26:00 NaN
2 2016-02-19 13:27:00 718.0
3 2016-02-19 13:27:00 NaN
4 2016-02-19 13:31:00 337.0
5 2016-02-19 13:36:00 332.0
df3.Time.dt.weekday_name.head()
1 Friday 2 Friday 3 Friday 4 Friday 5 Friday Name: Time, dtype: object
#isolating the seonc day Firstday = pd.to_datetime('2/20/2016 23:59:59')
df3.loc[df3.Time <= Firstday, :].tail()
Time CO2
411 2016-02-20 23:39:00 400.0
412 2016-02-20 23:44:00 419.0
413 2016-02-20 23:49:00 407.0
414 2016-02-20 23:54:00 417.0
415 2016-02-20 23:59:00 417.0
#almost a full year of data! (df3.Time.max() - df3.Time.min())
Timedelta('359 days 05:21:00')
df3['Day'] = df3.Time.dt.weekday_name df3.head()
Time CO2 Day
1 2016-02-19 13:26:00 NaN Friday
2 2016-02-19 13:27:00 718.0 Friday
3 2016-02-19 13:27:00 NaN Friday
4 2016-02-19 13:31:00 337.0 Friday
5 2016-02-19 13:36:00 332.0 Friday
# so many questions df3.Day.value_counts()
Saturday 14598 Tuesday 14589 Sunday 14539 Monday 14488 Wednesday 14472 Friday 14438 Thursday 13867 Name: Day, dtype: int64
df3.Day.value_counts().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc30f0ed1d0>
Image in a Jupyter notebook

Switching to df2 because it is still note recognized by datetime

df2.head()
Time CO2
numbered_index
90144 1/1/17 0:00 482
90145 1/1/17 0:05 491
90146 1/1/17 0:11 480
90147 1/1/17 0:16 486
90148 1/1/17 0:21 490
#df3['Time2'].head()
df3.head()
Time CO2 Day
1 2016-02-19 13:26:00 NaN Friday
2 2016-02-19 13:27:00 718.0 Friday
3 2016-02-19 13:27:00 NaN Friday
4 2016-02-19 13:31:00 337.0 Friday
5 2016-02-19 13:36:00 332.0 Friday
df3['Time2'] = df3.Time.shift(-1)
df3.head()
Time CO2 Day Time2
1 2016-02-19 13:26:00 NaN Friday 2016-02-19 13:27:00
2 2016-02-19 13:27:00 718.0 Friday 2016-02-19 13:27:00
3 2016-02-19 13:27:00 NaN Friday 2016-02-19 13:31:00
4 2016-02-19 13:31:00 337.0 Friday 2016-02-19 13:36:00
5 2016-02-19 13:36:00 332.0 Friday 2016-02-19 13:41:00
df3['TimeDel'] = df3.Time2 - df3.Time df3.head()
Time CO2 Day Time2 TimeDel
1 2016-02-19 13:26:00 NaN Friday 2016-02-19 13:27:00 00:01:00
2 2016-02-19 13:27:00 718.0 Friday 2016-02-19 13:27:00 00:00:00
3 2016-02-19 13:27:00 NaN Friday 2016-02-19 13:31:00 00:04:00
4 2016-02-19 13:31:00 337.0 Friday 2016-02-19 13:36:00 00:05:00
5 2016-02-19 13:36:00 332.0 Friday 2016-02-19 13:41:00 00:05:00
df3.TimeDel.dt.seconds.head()
1 60.0 2 0.0 3 240.0 4 300.0 5 300.0 Name: TimeDel, dtype: float64
df3.dtypes
Time datetime64[ns] CO2 float64 Day object Time2 datetime64[ns] TimeDel timedelta64[ns] dtype: object
df3['TimeDel'] = df3.TimeDel / np.timedelta64(1, 's')
df3.dtypes
Time datetime64[ns] CO2 float64 Day object Time2 datetime64[ns] TimeDel float64 dtype: object
df3['CO2_over_TimeDiff'] = (df3.CO2 / df3.TimeDel)
# number of "not a number" in each column df3.isnull().sum()
Time 0 CO2 6 Day 0 Time2 1 TimeDel 1 CO2_over_TimeDiff 7 dtype: int64
df3[df3.CO2.isnull()]
Time CO2 Day Time2 TimeDel CO2_over_TimeDiff
1 2016-02-19 13:26:00 NaN Friday 2016-02-19 13:27:00 60.0 NaN
3 2016-02-19 13:27:00 NaN Friday 2016-02-19 13:31:00 240.0 NaN
2911 2016-02-29 17:03:00 NaN Monday 2016-02-29 17:05:00 120.0 NaN
32931 2016-06-14 05:09:00 NaN Tuesday 2016-06-14 05:10:00 60.0 NaN
48678 2016-08-09 05:21:00 NaN Tuesday 2016-08-09 05:22:00 60.0 NaN
72565 2016-10-31 15:40:00 NaN Monday 2016-10-31 15:44:00 240.0 NaN
df3.shape
(100991, 6)
# dropping rows that have "any" missing values df3.dropna(how='any', inplace = True)
df3.shape
(100984, 6)
df3.head()
Time CO2 Day Time2 TimeDel CO2_over_TimeDiff
2 2016-02-19 13:27:00 718.0 Friday 2016-02-19 13:27:00 0.0 inf
4 2016-02-19 13:31:00 337.0 Friday 2016-02-19 13:36:00 300.0 1.123333
5 2016-02-19 13:36:00 332.0 Friday 2016-02-19 13:41:00 300.0 1.106667
6 2016-02-19 13:41:00 328.0 Friday 2016-02-19 13:46:00 300.0 1.093333
7 2016-02-19 13:46:00 307.0 Friday 2016-02-19 13:51:00 300.0 1.023333
df3.describe()
CO2 TimeDel CO2_over_TimeDiff
count 100984.000000 100984.000000 1.009840e+05
mean 547.647548 307.336608 inf
std 304.512740 1369.027580 NaN
min 201.000000 -3300.000000 -3.500000e-01
25% 362.000000 300.000000 1.200000e+00
50% 438.000000 300.000000 1.453333e+00
75% 615.000000 300.000000 2.040000e+00
max 2777.000000 424320.000000 inf
df3.CO2_over_TimeDiff.head()
2 inf 4 1.123333 5 1.106667 6 1.093333 7 1.023333 Name: CO2_over_TimeDiff, dtype: float64