Kernel: Python 3
In [1]:
# Core numerical / tabular libraries and the plotting backend.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's global generator so the random examples are repeatable.
np.random.seed(12345)

# Default size for every figure drawn in this notebook.
plt.rc("figure", figsize=(10, 6))

# Keep the current row limit so the last cell can put it back.
PREVIOUS_MAX_ROWS = pd.get_option("display.max_rows")

# Compact display settings for pandas and NumPy output.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)
np.set_printoptions(precision=4, suppress=True)
In [2]:
import numpy as np import pandas as pd
In [3]:
from datetime import datetime

# Snapshot of the current local date and time.
now = datetime.now()
now
# Year, month and day of the snapshot, shown together as a tuple.
(now.year, now.month, now.day)
In [4]:
# Subtracting one datetime from another produces a timedelta.
delta = datetime(year=2011, month=1, day=7) - datetime(
    year=2008, month=6, day=24, hour=8, minute=15
)
delta
# The difference is exposed as whole days plus leftover seconds.
delta.days
delta.seconds
In [5]:
from datetime import timedelta

start = datetime(2011, 1, 7)
# Move forward by 12 days, then backward by twice that amount.
start + timedelta(days=12)
start - 2 * timedelta(days=12)
In [6]:
stamp = datetime(2011, 1, 3)

# Default string form, followed by an explicitly formatted date.
str(stamp)
stamp.strftime("%Y-%m-%d")
In [7]:
# Parse one ISO-style date string.
value = "2011-01-03"
datetime.strptime(value, "%Y-%m-%d")

# Parse a list of month/day/year strings, one element at a time.
datestrs = ["7/6/2011", "8/6/2011"]
[datetime.strptime(text, "%m/%d/%Y") for text in datestrs]
In [8]:
datestrs = [
    "2011-07-06 12:00:00",
    "2011-08-06 00:00:00",
]
# Let pandas parse the whole list in one call.
pd.to_datetime(datestrs)
In [9]:
# Append a missing entry to the strings and parse the result.
idx = pd.to_datetime([*datestrs, None])
idx
idx[2]
pd.isna(idx)
In [10]:
# Six irregularly spaced days in January 2011.
dates = [
    datetime(2011, 1, day)
    for day in (2, 5, 7, 8, 10, 12)
]
ts = pd.Series(np.random.standard_normal(6), index=dates)
ts
In [11]:
# Inspect the index object that pandas built for `ts`.
ts.index
In [12]:
# Add `ts` to a slice holding every second element of itself.
ts + ts[::2]
In [13]:
# Inspect the dtype of the index of `ts`.
ts.index.dtype
In [14]:
# Pull the first label out of the index and display it.
stamp = ts.index[0]
stamp
In [15]:
# Use the third index label as a lookup key into the series.
stamp = ts.index[2]
ts[stamp]
In [16]:
# Look up a value in `ts` using a date string as the key.
ts["2011-01-10"]
In [17]:
# 1000 consecutive daily observations starting on 2000-01-01.
longer_ts = pd.Series(
    np.random.standard_normal(1000),
    index=pd.date_range(start="2000-01-01", periods=1000),
)
longer_ts
# Select with a year-only string.
longer_ts["2001"]
In [18]:
# Select from `longer_ts` using a year-month string.
longer_ts["2001-05"]
In [19]:
# Slice from a starting datetime onward ...
ts[datetime(2011, 1, 7):]
# ... and between two datetime bounds.
ts[datetime(2011, 1, 7):datetime(2011, 1, 10)]
In [20]:
ts
# Slice using date strings as the two bounds.
ts["2011-01-06":"2011-01-11"]
In [21]:
# Truncate `ts` so that nothing after 2011-01-09 remains.
ts.truncate(after="2011-01-09")
In [22]:
# 100 weekly dates on the "W-WED" frequency.
dates = pd.date_range(start="2000-01-01", periods=100, freq="W-WED")
long_df = pd.DataFrame(
    np.random.standard_normal((100, 4)),
    index=dates,
    columns=["Colorado", "Texas", "New York", "Ohio"],
)
# Row selection by year-month string through .loc.
long_df.loc["2001-05"]
In [23]:
# An index in which 2000-01-02 appears three times.
dates = pd.DatetimeIndex(
    ["2000-01-01"] + ["2000-01-02"] * 3 + ["2000-01-03"]
)
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts
In [24]:
# Check whether the index labels of `dup_ts` are all distinct.
dup_ts.index.is_unique
In [25]:
dup_ts["2000-01-03"]  # not duplicated
dup_ts["2000-01-02"]  # duplicated
In [26]:
# Group on the (only) index level so repeated timestamps are aggregated.
grouped = dup_ts.groupby(level=0)
grouped.mean()
grouped.count()
In [27]:
ts
# Build a daily resampler object; no aggregation is applied in this cell.
resampler = ts.resample("D")
resampler
In [28]:
# Range between two endpoint dates, with no explicit frequency given.
index = pd.date_range(start="2012-04-01", end="2012-06-01")
index
In [29]:
# Twenty periods counted forward from a start date ...
pd.date_range(start="2012-04-01", periods=20)
# ... and twenty periods counted back from an end date.
pd.date_range(end="2012-06-01", periods=20)
In [30]:
# Business-month-end dates between the two endpoints.
# The offset object is passed instead of the "BM" string alias: newer pandas
# deprecates that alias (renamed "BME"), while the BMonthEnd offset itself is
# accepted by old and new releases alike.
pd.date_range("2000-01-01", "2000-12-01", freq=pd.offsets.BMonthEnd())
In [31]:
# Five periods starting from a timestamp that carries a time of day.
pd.date_range("2012-05-02 12:56:31", periods=5)
In [32]:
# Same call as the previous cell, but with normalize=True.
pd.date_range("2012-05-02 12:56:31", periods=5, normalize=True)
In [33]:
from pandas.tseries.offsets import Hour, Minute

# An Hour offset built with no arguments.
hour = Hour()
hour
In [34]:
# Passing an integer makes the offset a multiple of the base unit.
four_hours = Hour(n=4)
four_hours
In [35]:
# Timestamps every four hours between the two endpoints.
# Lowercase "h" is used (as in the "1h30min" string elsewhere in this
# notebook); the uppercase "H" alias is deprecated by newer pandas releases.
pd.date_range("2000-01-01", "2000-01-03 23:59", freq="4h")
In [36]:
# Combine an hour offset and a minute offset by addition.
Hour(2) + Minute(30)
In [37]:
# Ten timestamps spaced by a compound frequency string.
pd.date_range("2000-01-01", periods=10, freq="1h30min")
In [38]:
# "WOM-3FRI" is the week-of-month frequency string used for this range.
monthly_dates = pd.date_range(
    start="2012-01-01", end="2012-09-01", freq="WOM-3FRI"
)
list(monthly_dates)
In [39]:
# Four month-end observations.
# The MonthEnd offset object replaces the "M" string alias, which newer
# pandas deprecates for datetime ranges (renamed "ME"); the object form is
# accepted by old and new releases alike.
ts = pd.Series(
    np.random.standard_normal(4),
    index=pd.date_range("2000-01-01", periods=4, freq=pd.offsets.MonthEnd()),
)
ts
# Shift the values forward / backward while the index stays put.
ts.shift(2)
ts.shift(-2)
In [40]:
# Shift the index (not the values) forward by two month ends.
# The MonthEnd offset object is used rather than the "M" alias, which newer
# pandas deprecates for datetime data.
ts.shift(2, freq=pd.offsets.MonthEnd())
In [41]:
# Shift the index by three days ...
ts.shift(3, freq="D")
# ... and by 90 minutes. "min" matches the "5min" strings used elsewhere in
# this notebook; the "T" alias is deprecated by newer pandas releases.
ts.shift(1, freq="90min")
In [42]:
from pandas.tseries.offsets import Day, MonthEnd

now = datetime(2011, 11, 17)
# Add a three-day offset to a plain datetime.
now + Day(3)
In [43]:
# Add one month-end offset, then two, to `now`.
now + MonthEnd()
now + MonthEnd(2)
In [44]:
offset = MonthEnd()
# Roll `now` forward, then backward, with the month-end offset.
offset.rollforward(now)
offset.rollback(now)
In [45]:
# Twenty observations spaced four days apart.
ts = pd.Series(
    np.random.standard_normal(20),
    index=pd.date_range(start="2000-01-15", periods=20, freq="4D"),
)
ts
# Group by the result of MonthEnd().rollforward applied to each index label.
ts.groupby(MonthEnd().rollforward).mean()
In [46]:
# Monthly mean via resample.
# The MonthEnd offset object is passed instead of the "M" alias, which newer
# pandas deprecates for datetime data (renamed "ME").
ts.resample(pd.offsets.MonthEnd()).mean()
In [47]:
import pytz

# Last five entries of pytz's list of common time zone names.
pytz.common_timezones[-5:]
In [48]:
# Look up a time zone object by name.
tz = pytz.timezone("America/New_York")
tz
In [49]:
# Six timestamps at 09:30, created without a time zone argument.
dates = pd.date_range(start="2012-03-09 09:30", periods=6)
ts = pd.Series(
    np.random.standard_normal(len(dates)),
    index=dates,
)
ts
In [50]:
# Show the time zone attached to the index of `ts`.
# NOTE(review): print() is presumably deliberate so that a None value is
# still shown in the cell output — confirm intent.
print(ts.index.tz)
In [51]:
# Build a date range with the time zone set to UTC at creation time.
pd.date_range("2012-03-09 09:30", periods=10, tz="UTC")
In [52]:
ts
# Attach the UTC time zone to the series.
ts_utc = ts.tz_localize("UTC")
ts_utc
ts_utc.index
In [53]:
# Convert the UTC-localized series to New York time.
ts_utc.tz_convert("America/New_York")
In [54]:
# Localize `ts` to New York time, then convert to two other zones.
ts_eastern = ts.tz_localize("America/New_York")
ts_eastern.tz_convert("UTC")
ts_eastern.tz_convert("Europe/Berlin")
In [55]:
# tz_localize is called on the index itself here, not on the series.
ts.index.tz_localize("Asia/Shanghai")
In [56]:
# Localize a single timestamp to UTC, then convert it to New York time.
stamp = pd.Timestamp("2011-03-12 04:00")
stamp_utc = stamp.tz_localize("utc")
stamp_utc.tz_convert("America/New_York")
In [57]:
# A time zone can also be supplied when the Timestamp is constructed.
stamp_moscow = pd.Timestamp("2011-03-12 04:00", tz="Europe/Moscow")
stamp_moscow
In [58]:
# Compare the `.value` attribute before and after a time zone conversion.
stamp_utc.value
stamp_utc.tz_convert("America/New_York").value
In [59]:
# Add one hour to a US/Eastern timestamp of 01:30 on 2012-03-11.
stamp = pd.Timestamp("2012-03-11 01:30", tz="US/Eastern")
stamp
stamp + Hour()
In [60]:
# Add two hours to a US/Eastern timestamp of 00:30 on 2012-11-04.
stamp = pd.Timestamp("2012-11-04 00:30", tz="US/Eastern")
stamp
stamp + Hour(2)
In [61]:
# Ten dates on the "B" frequency, each at 09:30.
dates = pd.date_range(start="2012-03-07 09:30", periods=10, freq="B")
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)
ts

# Two overlapping pieces of the series, each in a different time zone.
ts1 = ts[:7].tz_localize("Europe/London")
ts2 = ts1[2:].tz_convert("Europe/Moscow")

# Add the two pieces and inspect the index of the sum.
result = ts1 + ts2
result.index
In [62]:
# An annual period built with the "A-DEC" frequency string.
# NOTE(review): newer pandas releases deprecate the "A" aliases in favour of
# "Y" — confirm the target pandas version before changing the string.
p = pd.Period("2011", freq="A-DEC")
p
In [63]:
# Integer arithmetic on a Period.
p + 5
p - 2
In [64]:
# Subtract `p` from another period built with the same frequency string.
pd.Period("2014", freq="A-DEC") - p
In [65]:
# Monthly periods between the two endpoints.
periods = pd.period_range(start="2000-01-01", end="2000-06-30", freq="M")
periods
In [66]:
# Six random values indexed by `periods`.
pd.Series(np.random.standard_normal(6), index=periods)
In [67]:
# Build a quarterly PeriodIndex from strings.
values = [
    "2001Q3",
    "2002Q2",
    "2003Q1",
]
index = pd.PeriodIndex(values, freq="Q-DEC")
index
In [68]:
# Convert an annual period to monthly frequency.
# NOTE(review): newer pandas releases deprecate the "A" aliases in favour of
# "Y" — confirm the target pandas version before changing the string.
p = pd.Period("2011", freq="A-DEC")
p
# The `how` argument selects which end of the year is used.
p.asfreq("M", how="start")
p.asfreq("M", how="end")
# Same conversion with `how` left at its default.
p.asfreq("M")
In [69]:
# Same conversion, using an "A-JUN" annual frequency this time.
p = pd.Period("2011", freq="A-JUN")
p
p.asfreq("M", how="start")
p.asfreq("M", how="end")
In [70]:
# Convert a monthly period to the "A-JUN" annual frequency.
p = pd.Period("Aug-2011", freq="M")
p.asfreq("A-JUN")
In [71]:
# A series indexed by annual periods, 2006 through 2009.
periods = pd.period_range(start="2006", end="2009", freq="A-DEC")
ts = pd.Series(
    np.random.standard_normal(len(periods)),
    index=periods,
)
ts
# Re-index at monthly frequency, using the start of each year.
ts.asfreq("M", how="start")
In [72]:
# Convert `ts` to the "B" frequency, using the end of each period.
ts.asfreq("B", how="end")
In [73]:
# A quarterly period built with the "Q-JAN" frequency string.
p = pd.Period("2012Q4", freq="Q-JAN")
p
In [74]:
# First and last day covered by the quarter `p`.
p.asfreq("D", how="start")
p.asfreq("D", how="end")
In [75]:
# Take the last "B" period of `p`, step back one, convert to the start of
# that period at minute resolution, then add 16 * 60 minutes.
# "min" is used instead of the "T" alias, which newer pandas deprecates; it
# matches the "5min" strings used elsewhere in this notebook.
p4pm = (p.asfreq("B", how="end") - 1).asfreq("min", how="start") + 16 * 60
p4pm
p4pm.to_timestamp()
In [76]:
# Quarterly periods on the "Q-JAN" frequency, with a running counter as data.
periods = pd.period_range("2011Q3", "2012Q4", freq="Q-JAN")
ts = pd.Series(np.arange(len(periods)), index=periods)
ts

# For every quarter: last "B" period, minus one, start of that period at
# hourly resolution, plus 16 hours. Lowercase "h" is used because the
# uppercase "H" alias is deprecated by newer pandas releases.
new_periods = (periods.asfreq("B", how="end") - 1).asfreq("h", how="start") + 16

# Replace the period index with the corresponding timestamps.
ts.index = new_periods.to_timestamp()
ts
In [77]:
# Three month-end timestamps.
# The MonthEnd offset object replaces the "M" string alias, which newer
# pandas deprecates for datetime ranges (renamed "ME").
dates = pd.date_range("2000-01-01", periods=3, freq=pd.offsets.MonthEnd())
ts = pd.Series(np.random.standard_normal(3), index=dates)
ts
# Convert the timestamp index to periods; no frequency is passed explicitly.
pts = ts.to_period()
pts
In [78]:
# Six consecutive days starting on 2000-01-29.
dates = pd.date_range(start="2000-01-29", periods=6)
ts2 = pd.Series(np.random.standard_normal(6), index=dates)
ts2
# Convert the daily timestamps to monthly periods.
ts2.to_period("M")
In [79]:
# Round trip: timestamps -> periods -> timestamps at the end of each period.
pts = ts2.to_period()
pts
pts.to_timestamp(how="end")
In [80]:
# Load the CSV file and preview it.
data = pd.read_csv("examples/macrodata.csv")
data.head(5)
# The two columns used to build a PeriodIndex in the next cell.
data["year"]
data["quarter"]
In [81]:
# Combine the year and quarter columns into a quarterly PeriodIndex.
# NOTE(review): newer pandas releases deprecate passing year=/quarter= to the
# PeriodIndex constructor (PeriodIndex.from_fields is the replacement) —
# confirm the target pandas version before changing this call.
index = pd.PeriodIndex(
    year=data["year"],
    quarter=data["quarter"],
    freq="Q-DEC",
)
index
# Install the new index on `data`, then show one column.
data.index = index
data["infl"]
In [82]:
# 100 consecutive daily observations.
dates = pd.date_range("2000-01-01", periods=100)
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)
ts

# Monthly means labelled by month-end timestamp. The MonthEnd offset object
# replaces the "M" alias, which newer pandas deprecates for datetime data.
ts.resample(pd.offsets.MonthEnd()).mean()

# The same means labelled by monthly period. The period labels come from
# .to_period("M") because resample's kind= argument is deprecated in newer
# pandas releases.
ts.resample(pd.offsets.MonthEnd()).mean().to_period("M")
In [83]:
# Twelve one-minute timestamps with a running counter as data.
# "min" is used instead of the "T" alias, which newer pandas deprecates; it
# matches the "5min" strings used by the resample calls in this notebook.
dates = pd.date_range("2000-01-01", periods=12, freq="min")
ts = pd.Series(np.arange(len(dates)), index=dates)
ts
In [84]:
# Sum `ts` in five-minute bins, with default bin edges and labels.
ts.resample("5min").sum()
In [85]:
# Same aggregation, with closed="right" passed explicitly.
ts.resample("5min", closed="right").sum()
In [86]:
# Same aggregation, with both closed="right" and label="right".
ts.resample("5min", closed="right", label="right").sum()
In [87]:
from pandas.tseries.frequencies import to_offset

result = ts.resample("5min", closed="right", label="right").sum()
# Move every bin label back by one second.
result.index = result.index + to_offset("-1s")
result
In [88]:
# A random permutation of 0..len(dates)-1 on the existing `dates` index.
ts = pd.Series(
    np.random.permutation(np.arange(len(dates))),
    index=dates,
)
# Open / high / low / close of each five-minute bin.
ts.resample("5min").ohlc()
In [89]:
# Two weekly ("W-WED") rows with four columns of random data.
frame = pd.DataFrame(
    np.random.standard_normal((2, 4)),
    index=pd.date_range(start="2000-01-01", periods=2, freq="W-WED"),
    columns=["Colorado", "Texas", "New York", "Ohio"],
)
frame
In [90]:
# Resample the weekly frame to daily frequency via asfreq(), no aggregation.
df_daily = frame.resample("D").asfreq()
df_daily
In [91]:
# Resample to daily frequency, forward-filling the new rows.
frame.resample("D").ffill()
In [92]:
# Same forward fill, with limit=2 passed to ffill.
frame.resample("D").ffill(limit=2)
In [93]:
# Resample onto the "W-THU" frequency with forward fill.
frame.resample("W-THU").ffill()
In [94]:
# Two years of monthly periods with four columns of random data.
frame = pd.DataFrame(
    np.random.standard_normal((24, 4)),
    index=pd.period_range(start="1-2000", end="12-2001", freq="M"),
    columns=["Colorado", "Texas", "New York", "Ohio"],
)
frame.head()

# Annual means of the monthly data.
# NOTE(review): newer pandas releases deprecate the "A" aliases in favour of
# "Y" — confirm the target pandas version before changing the string.
annual_frame = frame.resample("A-DEC").mean()
annual_frame
In [95]:
# Q-DEC: Quarterly, year ending in December
annual_frame.resample("Q-DEC").ffill()
annual_frame.resample("Q-DEC", convention="end").asfreq()
In [96]:
# Resample onto the "Q-MAR" quarterly frequency with forward fill.
annual_frame.resample("Q-MAR").ffill()
In [97]:
# N one-minute timestamps stored as a regular column, plus a counter column.
N = 15
times = pd.date_range(start="2017-05-20 00:00", periods=N, freq="1min")
df = pd.DataFrame(
    {
        "time": times,
        "value": np.arange(N),
    }
)
df
In [98]:
# Index `df` by its time column and count rows per five-minute bin.
df.set_index("time").resample("5min").count()
In [99]:
# Every timestamp repeated three times, with the keys "a", "b", "c" tiled.
df2 = pd.DataFrame(
    {
        "time": times.repeat(3),
        "key": np.tile(["a", "b", "c"], N),
        "value": np.arange(N * 3.),
    }
)
df2.head(7)
In [100]:
# Grouper with a five-minute frequency, used as a groupby key in the next cell.
time_key = pd.Grouper(freq="5min")
In [101]:
# Index by time, group by key and five-minute bin, then sum each group.
resampled = df2.set_index("time").groupby(["key", time_key]).sum()
resampled
# Turn the group keys back into ordinary columns.
resampled.reset_index()
In [102]:
# Load the CSV, using the first column as a parsed date index.
close_px_all = pd.read_csv(
    "examples/stock_px.csv",
    index_col=0,
    parse_dates=True,
)
# Keep three columns and resample them to the "B" frequency with forward fill.
close_px = close_px_all[["AAPL", "MSFT", "XOM"]].resample("B").ffill()
In [103]:
# Plot the AAPL column, then its 250-observation rolling mean.
close_px["AAPL"].plot()
close_px["AAPL"].rolling(250).mean().plot()
In [104]:
plt.figure()

# Rolling standard deviation of the AAPL percent changes over 250
# observations, with min_periods=10.
std250 = (
    close_px["AAPL"]
    .pct_change()
    .rolling(250, min_periods=10)
    .std()
)
std250[5:12]
std250.plot()
In [105]:
# Expanding-window mean of `std250`.
expanding_mean = std250.expanding().mean()
In [106]:
# Open a new matplotlib figure.
plt.figure()
In [107]:
# Switch matplotlib to the grayscale style sheet.
plt.style.use("grayscale")
# 60-observation rolling mean of every column, plotted with logy=True.
close_px.rolling(60).mean().plot(logy=True)
In [108]:
# Rolling mean with the window given as the offset string "20D".
close_px.rolling("20D").mean()
In [109]:
# Open a new matplotlib figure.
plt.figure()
In [110]:
# AAPL values for 2006 through 2007.
aapl_px = close_px["AAPL"]["2006":"2007"]

# A simple moving average and an exponentially weighted mean of the slice.
ma30 = aapl_px.rolling(30, min_periods=20).mean()
ewma30 = aapl_px.ewm(span=30).mean()

# Draw the three series, then add the legend.
aapl_px.plot(style="k-", label="Price")
ma30.plot(style="k--", label="Simple Moving Avg")
ewma30.plot(style="k-", label="EW MA")
plt.legend()
In [111]:
# Open a new matplotlib figure.
plt.figure()
In [112]:
# Percent changes of the SPX column (taken from the full table) ...
spx_px = close_px_all["SPX"]
spx_rets = spx_px.pct_change()

# ... and of the three columns held in `close_px`.
returns = close_px.pct_change()
In [113]:
# Rolling correlation between the AAPL returns and `spx_rets`.
corr = (
    returns["AAPL"]
    .rolling(125, min_periods=100)
    .corr(spx_rets)
)
corr.plot()
In [114]:
# Open a new matplotlib figure.
plt.figure()
In [115]:
# Same rolling correlation, computed for the whole `returns` frame at once.
corr = (
    returns
    .rolling(125, min_periods=100)
    .corr(spx_rets)
)
corr.plot()
In [116]:
# Open a new matplotlib figure.
plt.figure()
In [117]:
from scipy.stats import percentileofscore


def score_at_2percent(x):
    """Return the percentile rank of the value 0.02 within the window `x`."""
    return percentileofscore(x, 0.02)


# Apply the function to every 250-observation window of the AAPL returns.
result = returns["AAPL"].rolling(250).apply(score_at_2percent)
result.plot()
In [118]:
In [119]:
# Put back the row-display limit that was saved in PREVIOUS_MAX_ROWS.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS