
See 10 Minutes to pandas

This is really "a few hours", not 10 minutes. Carefully working through it is a good way to get a solid foundation in using pandas.

This tutorial involves two other Python packages which we haven't discussed much yet:

  • numpy, for efficient manipulation of numerical arrays (a la MATLAB);

  • matplotlib, for plotting.

We'll say more about these a bit later on.
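As a quick taste of both packages, here is a minimal, self-contained sketch (not part of the original worksheet; the file name sine.png is just an illustrative choice): numpy builds and transforms an array in one vectorized step, and matplotlib draws it.

import numpy as np
import matplotlib.pyplot as plt

# a numpy array of 100 evenly spaced points, and a vectorized computation on it
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x)

# a basic matplotlib line plot, written to disk
plt.plot(x, y)
plt.xlabel('x')
plt.ylabel('sin(x)')
plt.savefig('sine.png')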

%auto
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%default_mode python  # avoid Sage data types!
%typeset_mode True
pi^e
Error in lines 1-1
Traceback (most recent call last):
  File "/projects/sage/sage-7.5/local/lib/python2.7/site-packages/smc_sagews/sage_server.py", line 982, in execute
    exec compile(block+'\n', '', 'single') in namespace, locals
  File "", line 1, in <module>
  File "sage/structure/element.pyx", line 919, in sage.structure.element.Element.__xor__ (/projects/sage/sage-7.5/src/build/cythonized/sage/structure/element.c:8766)
    raise RuntimeError("Use ** for exponentiation, not '^', which means xor\n"+\
RuntimeError: Use ** for exponentiation, not '^', which means xor in Python, and has the wrong precedence.
pi**e
π^e
7/3
2
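Both results above are plain-Python behaviour rather than pandas: with %default_mode python, '^' is bitwise XOR instead of exponentiation (Sage's symbolic pi refuses it with the error shown), and dividing two Python 2 ints truncates, which is why 7/3 evaluates to 2. A small sketch of the contrast:

5 ^ 3      # bitwise XOR of 101 and 011 -> 6
5 ** 3     # exponentiation -> 125
7 / 3      # Python 2 integer division -> 2
7.0 / 3    # float division -> 2.333...
from __future__ import division   # opt in to true division (place at the top of a module); then 7 / 3 -> 2.333...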
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
pd.date_range('20160227', periods=6)
DatetimeIndex(['2016-02-27', '2016-02-28', '2016-02-29', '2016-03-01', '2016-03-02', '2016-03-03'], dtype='datetime64[ns]', freq='D')
pd.date_range('20170227', periods=6)
DatetimeIndex(['2017-02-27', '2017-02-28', '2017-03-01', '2017-03-02', '2017-03-03', '2017-03-04'], dtype='datetime64[ns]', freq='D')
dates = pd.date_range('20130101', periods=6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'], dtype='datetime64[ns]', freq='D')
np.random.randn(6)
[ 0.15338633 -1.1123412 -0.89944875 0.50109747 -0.71681434 2.2980212 ]
print list('ABCD')
['A', 'B', 'C', 'D']
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
A B C D
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834
2013-01-02 -0.015716 2.987745 1.594794 0.494024
2013-01-03 0.351800 -0.763146 0.693784 1.229273
2013-01-04 -0.306839 -0.032079 -1.697872 1.017509
2013-01-05 0.453458 -0.630529 0.287576 -0.499642
2013-01-06 -0.704985 -0.958764 -1.450636 1.888210
type(df)
<class 'pandas.core.frame.DataFrame'>
a = np.array([1,2,3,4**5000]); a
[1 2 3 19950631168807583848837421626835850838234968318861924548520089498529438830221946631919961684036194597899331129423209124271556491349413781117593785932096323957855730046793794526765246551266059895520550086918193311542508608460618104685509074866089624888090489894838009253941633257850621568309473902556912388065225096643874441046759871626985453222868538161694315775629640762836880760732228535091641476183956381458969463899410840960536267821064621427333394036525565649530603142680234969400335934316651459297773279665775606172582031407994198179607378245683762280037302885487251900834464581454650557929601414833921615734588139257095379769119277800826957735674444123062018757836325502728323789270710373802866393031428133241401624195671690574061419654342324638801248856147305207431992259611796250130992860241708340807605932320161268492288496255841312844061536738951487114256315111089745514203313820202931640957596464756010405845841566072044962867016515061920631004186422275908670900574606417856951911456055068251250406007519842261898059237118054444788072906395242548339221982707404473162376760846613033778706039803413197133493654622700563169937455508241780972810983291314403571877524768509857276937926433221599399876886660808368837838027643282775172273657572744784112294389733810861607423253291974813120197604178281965697475898164531258434135959862784130128185406283476649088690521047580882615823961985770122407044330583075869039319604603404973156583208672105913300903752823415539745394397715257455290510212310947321610753474825740775273986348298498340756937955646638621874569499279016572103701364433135817214311791398222983845847334440270964182851005072927748364550578634501100852987812389473928699540834346158807043959118985815145779177143619698728131459483783202081474982171858011389071228250905826817436220577475921417653715687725614904582904992461028630081535583308130101987675856234343538955409175623400844887526162643568648833519463720377293240094456246923254350400678027273837755376406726898636241037491410966718557050759098100246789880178271925953381282421954028302759408448955014676668389697996886241636313376393903373455801407636741877711055384225739499110186468219696581651485130494222369947714763069155468217682876200362777257723781365331611196811280792669481887201298643660768551639860534602297871557517947385246369446923087894265948217008051120322365496288169035739121368338393591756418733850510970271613915439590991598154654417336311656936031122249937969999226781732358023111862644575299135758175008199839236284615249881088960232244362173771618086357015468484058622329792853875623486556440536962622018963571028812361567512543338303270029097668650568557157505516727518899194129711337690149916181315171544007728650573189557450920330185304847113818315407324053319038462084036421763703911550639789000742853672196280903477974533320468368795868580237952218629120080742819551317948157624448298518461509704888027274721574688131594750409732115080498190455803416826949787141316063210686391511681774304792596709376L]
a.dtype
object
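The object dtype above is the price of including 4**5000: it does not fit in a 64-bit integer, so numpy stores each element as a full Python object and loses fast vectorized arithmetic. A small sketch of the contrast (variable names here are illustrative, not from the worksheet):

small = np.array([1, 2, 3, 4])
small.dtype          # a fixed-width machine integer type, typically int64 on a 64-bit platform

big = np.array([1, 2, 3, 4**5000])
big.dtype            # object -- each element is an arbitrary-precision Python int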
df2 = pd.DataFrame({ 'A' : 1.,
                     "G" : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([1,2,3,4],dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : ['foo', 'bar', 'foo', 'bar'] })
df2
A C D E F G
0 1.0 1.0 1 test foo 2013-01-02
1 1.0 1.0 2 train bar 2013-01-02
2 1.0 1.0 3 test foo 2013-01-02
3 1.0 1.0 4 train bar 2013-01-02
df2.dtypes
A           float64
C           float32
D             int32
E          category
F            object
G    datetime64[ns]
dtype: object
df2.head(2)
A C D E F G
0 1.0 1.0 1 test foo 2013-01-02
1 1.0 1.0 2 train bar 2013-01-02
df.tail()
A B C D
2013-01-02 -0.635897 -0.567356 0.187084 0.303563
2013-01-03 0.756620 -0.551832 0.223833 -0.427561
2013-01-04 -0.280803 0.323984 -0.216256 -1.372340
2013-01-05 0.221348 0.949058 3.231426 0.450020
2013-01-06 0.709880 -0.259171 0.160104 2.354785
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.319048 -0.128651 0.536371 0.117809
std 0.678916 0.639421 1.342300 1.279411
min -0.635897 -0.666590 -0.367967 -1.372340
25% -0.155266 -0.563475 -0.122166 -0.558098
50% 0.465614 -0.405501 0.173594 -0.061999
75% 0.744935 0.178195 0.214646 0.413406
max 1.143139 0.949058 3.231426 2.354785
df[1]              # raises KeyError: 1 is not a column label
df[df.index[1]]    # learning curve is steep; for row access use .loc / .iloc (see the sketch below)
df[2:4]
A B C D
2013-01-03 0.756620 -0.551832 0.223833 -0.427561
2013-01-04 -0.280803 0.323984 -0.216256 -1.372340
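To summarize the indexing rules that trip people up here, a minimal self-contained sketch (built on a fresh toy DataFrame named toy, which is not part of the worksheet above): a single label in square brackets means a column, a slice means rows, and .loc / .iloc are the explicit forms.

import numpy as np
import pandas as pd

idx = pd.date_range('20130101', periods=6)
toy = pd.DataFrame(np.random.randn(6, 4), index=idx, columns=list('ABCD'))

toy['A']                      # a single label selects a COLUMN (so toy[1] raises KeyError)
toy[2:4]                      # an integer slice selects ROWS by position
toy['20130102':'20130104']    # a label slice selects rows by label (endpoints inclusive)
toy.loc[idx[1]]               # explicit label-based row selection
toy.iloc[1]                   # explicit position-based row selection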
df[:3]
A B C D
2013-01-01 1.143139 -0.666590 -0.367967 -0.601610
2013-01-02 -0.635897 -0.567356 0.187084 0.303563
2013-01-03 0.756620 -0.551832 0.223833 -0.427561
df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
df.values
[[ 1.09140566  2.27443272 -0.0519728  -1.46711534]
 [ 0.81694772 -0.92729441  0.9847454  -1.04465081]
 [ 1.76427825 -0.53356165 -0.62657329 -1.76661059]
 [ 0.76790378 -0.85793508  2.28827014  2.3107576 ]
 [ 0.27506969  0.7425217   2.74225875 -1.30026817]
 [-1.48271814  0.3790422   1.35226067  0.17327937]]
df
A B C D
2013-01-01 0.203499 -3.007550 0.863493 0.212264
2013-01-02 0.172677 0.242367 1.393439 0.442605
2013-01-03 2.022955 0.398982 -2.152061 -1.282658
2013-01-04 -0.521096 2.033860 0.196687 1.718198
2013-01-05 0.140910 0.579546 0.002477 0.782339
2013-01-06 -0.975994 -1.138417 1.484711 -0.831980
df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.173825 -0.148535 0.298124 0.173461
std 1.022374 1.726231 1.343759 1.091978
min -0.975994 -3.007550 -2.152061 -1.282658
25% -0.355594 -0.793221 0.051029 -0.570919
50% 0.156794 0.320674 0.530090 0.327434
75% 0.195793 0.534405 1.260952 0.697406
max 2.022955 2.033860 1.484711 1.718198
df2
A C D E F G
0 1.0 1.0 1 test foo 2013-01-02
1 1.0 1.0 2 train bar 2013-01-02
2 1.0 1.0 3 test foo 2013-01-02
3 1.0 1.0 4 train bar 2013-01-02
df2.describe()
A C D
count 4.0 4.0 4.000000
mean 1.0 1.0 2.500000
std 0.0 0.0 1.290994
min 1.0 1.0 1.000000
25% 1.0 1.0 1.750000
50% 1.0 1.0 2.500000
75% 1.0 1.0 3.250000
max 1.0 1.0 4.000000
df2.T
0 1 2 3
A 1 1 1 1
C 1 1 1 1
D 1 2 3 4
E test train test train
F foo foo foo foo
Jake's column 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
df.T
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A 1.091406 0.816948 1.764278 0.767904 0.275070 -1.482718
B 2.274433 -0.927294 -0.533562 -0.857935 0.742522 0.379042
C -0.051973 0.984745 -0.626573 2.288270 2.742259 1.352261
D -1.467115 -1.044651 -1.766611 2.310758 -1.300268 0.173279
df
A B C D
2013-01-01 0.203499 -3.007550 0.863493 0.212264
2013-01-02 0.172677 0.242367 1.393439 0.442605
2013-01-03 2.022955 0.398982 -2.152061 -1.282658
2013-01-04 -0.521096 2.033860 0.196687 1.718198
2013-01-05 0.140910 0.579546 0.002477 0.782339
2013-01-06 -0.975994 -1.138417 1.484711 -0.831980
df['D']
2013-01-01    0.212264
2013-01-02    0.442605
2013-01-03   -1.282658
2013-01-04    1.718198
2013-01-05    0.782339
2013-01-06   -0.831980
Freq: D, Name: D, dtype: float64
df2
A C D E F Jake's column
0 1.0 1.0 1 test foo 2013-01-02
1 1.0 1.0 2 train foo 2013-01-02
2 1.0 1.0 3 test foo 2013-01-02
3 1.0 1.0 4 train foo 2013-01-02
df2["E"]
0     test
1    train
2     test
3    train
Name: E, dtype: category
Categories (2, object): [test, train]
df2.E
0     test
1    train
2     test
3    train
Name: E, dtype: category
Categories (2, object): [test, train]
df2["G"]
0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: G, dtype: datetime64[ns]
df.A == df["A"]
2013-01-01    True
2013-01-02    True
2013-01-03    True
2013-01-04    True
2013-01-05    True
2013-01-06    True
Freq: D, Name: A, dtype: bool
df[0:3]
A B C D
2013-01-01 1.091406 2.274433 -0.051973 -1.467115
2013-01-02 0.816948 -0.927294 0.984745 -1.044651
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611
del df['A']; print df
B C D
2013-01-01 -0.666590 -0.367967 -0.601610
2013-01-02 -0.567356 0.187084 0.303563
2013-01-03 -0.551832 0.223833 -0.427561
2013-01-04 0.323984 -0.216256 -1.372340
2013-01-05 0.949058 3.231426 0.450020
2013-01-06 -0.259171 0.160104 2.354785
df.drop(df.index[2:4])
B C D
2013-01-01 -0.666590 -0.367967 -0.601610
2013-01-02 -0.567356 0.187084 0.303563
2013-01-05 0.949058 3.231426 0.450020
2013-01-06 -0.259171 0.160104 2.354785
df['20130102':'20130104']
A B C D
2013-01-02 0.816948 -0.927294 0.984745 -1.044651
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611
2013-01-04 0.767904 -0.857935 2.288270 2.310758
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'], dtype='datetime64[ns]', freq='D')
df.loc[dates[0]]
A    0.505320
B   -0.213372
C   -0.720336
D   -0.237834
Name: 2013-01-01 00:00:00, dtype: float64
df
B C D
2013-01-01 -0.666590 -0.367967 -0.601610
2013-01-02 -0.567356 0.187084 0.303563
2013-01-03 -0.551832 0.223833 -0.427561
2013-01-04 0.323984 -0.216256 -1.372340
2013-01-05 0.949058 3.231426 0.450020
2013-01-06 -0.259171 0.160104 2.354785
df[df.A + df.B > 0]
A B C D
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834
2013-01-02 -0.015716 2.987745 1.594794 0.494024
df.loc[:,['A','B']]
A B
2013-01-01 1.091406 2.274433
2013-01-02 0.816948 -0.927294
2013-01-03 1.764278 -0.533562
2013-01-04 0.767904 -0.857935
2013-01-05 0.275070 0.742522
2013-01-06 -1.482718 0.379042
df.loc['20130102':'20130104',['A','B']]
A B
2013-01-02 0.816948 -0.927294
2013-01-03 1.764278 -0.533562
2013-01-04 0.767904 -0.857935
df.loc['20130102',['A','B']]
A    0.816948
B   -0.927294
Name: 2013-01-02 00:00:00, dtype: float64
df.loc[dates[0],'A']
1.09140566006
df.at[dates[0],'A']
1.09140566006
df.iloc[3]
A    0.767904
B   -0.857935
C    2.288270
D    2.310758
Name: 2013-01-04 00:00:00, dtype: float64
df.iloc[3:5,0:2]
A B
2013-01-04 0.767904 -0.857935
2013-01-05 0.275070 0.742522
df.iloc[[1,2,4],[0,2]]
A C
2013-01-02 0.816948 0.984745
2013-01-03 1.764278 -0.626573
2013-01-05 0.275070 2.742259
df.iloc[1:3,:]
A B C D
2013-01-02 0.816948 -0.927294 0.984745 -1.044651
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611
df.iloc[:,1:3]
B C
2013-01-01 2.274433 -0.051973
2013-01-02 -0.927294 0.984745
2013-01-03 -0.533562 -0.626573
2013-01-04 -0.857935 2.288270
2013-01-05 0.742522 2.742259
2013-01-06 0.379042 1.352261
df.iloc[1,1]
-0.927294411026
df.iat[1,1]
-0.927294411026
df[df.A > 0]
A B C D
2013-01-01 1.091406 2.274433 -0.051973 -1.467115
2013-01-02 0.816948 -0.927294 0.984745 -1.044651
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611
2013-01-04 0.767904 -0.857935 2.288270 2.310758
2013-01-05 0.275070 0.742522 2.742259 -1.300268
df[df > 0]
A B C D
2013-01-01 1.091406 2.274433 NaN NaN
2013-01-02 0.816948 NaN 0.984745 NaN
2013-01-03 1.764278 NaN NaN NaN
2013-01-04 0.767904 NaN 2.288270 2.310758
2013-01-05 0.275070 0.742522 2.742259 NaN
2013-01-06 NaN 0.379042 1.352261 0.173279
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2
A B C D E
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834 one
2013-01-02 -0.015716 2.987745 1.594794 0.494024 one
2013-01-03 0.351800 -0.763146 0.693784 1.229273 two
2013-01-04 -0.306839 -0.032079 -1.697872 1.017509 three
2013-01-05 0.453458 -0.630529 0.287576 -0.499642 four
2013-01-06 -0.704985 -0.958764 -1.450636 1.888210 three
df2[df2['E'].isin(['one','four'])]
A B C D E
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834 one
2013-01-02 -0.015716 2.987745 1.594794 0.494024 one
2013-01-05 0.453458 -0.630529 0.287576 -0.499642 four
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
df['F'] = s1
df
A B C D F
2013-01-01 1.091406 2.274433 -0.051973 -1.467115 NaN
2013-01-02 0.816948 -0.927294 0.984745 -1.044651 1.0
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611 2.0
2013-01-04 0.767904 -0.857935 2.288270 2.310758 3.0
2013-01-05 0.275070 0.742522 2.742259 -1.300268 4.0
2013-01-06 -1.482718 0.379042 1.352261 0.173279 5.0
df.at[dates[0],'A'] = 0
df
A B C D F
2013-01-01 0.000000 2.274433 -0.051973 -1.467115 NaN
2013-01-02 0.816948 -0.927294 0.984745 -1.044651 1.0
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611 2.0
2013-01-04 0.767904 -0.857935 2.288270 2.310758 3.0
2013-01-05 0.275070 0.742522 2.742259 -1.300268 4.0
2013-01-06 -1.482718 0.379042 1.352261 0.173279 5.0
df.iat[0,1] = 0
df
A B C D F
2013-01-01 0.000000 0.000000 -0.051973 -1.467115 NaN
2013-01-02 0.816948 -0.927294 0.984745 -1.044651 1.0
2013-01-03 1.764278 -0.533562 -0.626573 -1.766611 2.0
2013-01-04 0.767904 -0.857935 2.288270 2.310758 3.0
2013-01-05 0.275070 0.742522 2.742259 -1.300268 4.0
2013-01-06 -1.482718 0.379042 1.352261 0.173279 5.0
df.loc[:,'D'] = np.array([5] * len(df))
df
A B C D F
2013-01-01 0.000000 0.000000 -0.051973 5 NaN
2013-01-02 0.816948 -0.927294 0.984745 5 1.0
2013-01-03 1.764278 -0.533562 -0.626573 5 2.0
2013-01-04 0.767904 -0.857935 2.288270 5 3.0
2013-01-05 0.275070 0.742522 2.742259 5 4.0
2013-01-06 -1.482718 0.379042 1.352261 5 5.0
df2 = df.copy()
df2[df2 > 0] = -df2
df2
A B C D F
2013-01-01 0.000000 0.000000 -0.051973 -5 NaN
2013-01-02 -0.816948 -0.927294 -0.984745 -5 -1.0
2013-01-03 -1.764278 -0.533562 -0.626573 -5 -2.0
2013-01-04 -0.767904 -0.857935 -2.288270 -5 -3.0
2013-01-05 -0.275070 -0.742522 -2.742259 -5 -4.0
2013-01-06 -1.482718 -0.379042 -1.352261 -5 -5.0
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1
A B C D F E
2013-01-01 0.000000 0.000000 -0.051973 5 NaN 1.0
2013-01-02 0.816948 -0.927294 0.984745 5 1.0 1.0
2013-01-03 1.764278 -0.533562 -0.626573 5 2.0 NaN
2013-01-04 0.767904 -0.857935 2.288270 5 3.0 NaN
df1.dropna(how='any')
A B C D F E
2013-01-02 0.816948 -0.927294 0.984745 5 1.0 1.0
df1.fillna(value=5)
A B C D F E
2013-01-01 0.000000 0.000000 -0.051973 5 5.0 1.0
2013-01-02 0.816948 -0.927294 0.984745 5 1.0 1.0
2013-01-03 1.764278 -0.533562 -0.626573 5 2.0 5.0
2013-01-04 0.767904 -0.857935 2.288270 5 3.0 5.0
pd.isnull(df1)
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
df
A B C D F
2013-01-01 0.000000 0.000000 -0.051973 5 NaN
2013-01-02 0.816948 -0.927294 0.984745 5 1.0
2013-01-03 1.764278 -0.533562 -0.626573 5 2.0
2013-01-04 0.767904 -0.857935 2.288270 5 3.0
2013-01-05 0.275070 0.742522 2.742259 5 4.0
2013-01-06 -1.482718 0.379042 1.352261 5 5.0
df.mean()
A    0.356914
B   -0.199538
C    1.114831
D    5.000000
F    3.000000
dtype: float64
df.mean(1)
2013-01-01    1.237007
2013-01-02    1.374880
2013-01-03    1.520829
2013-01-04    2.039648
2013-01-05    2.551970
2013-01-06    2.049717
Freq: D, dtype: float64
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
df.sub(s, axis='index')
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 0.764278 -1.533562 -1.626573 4.0 1.0
2013-01-04 -2.232096 -3.857935 -0.711730 2.0 0.0
2013-01-05 -4.724930 -4.257478 -2.257741 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN
df
A B C D
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834
2013-01-02 -0.015716 2.987745 1.594794 0.494024
2013-01-03 0.351800 -0.763146 0.693784 1.229273
2013-01-04 -0.306839 -0.032079 -1.697872 1.017509
2013-01-05 0.453458 -0.630529 0.287576 -0.499642
2013-01-06 -0.704985 -0.958764 -1.450636 1.888210
df.apply(np.cumsum)
A B C D
2013-01-01 0.505320 -0.213372 -0.720336 -0.237834
2013-01-02 0.489604 2.774373 0.874458 0.256190
2013-01-03 0.841405 2.011227 1.568242 1.485463
2013-01-04 0.534565 1.979147 -0.129630 2.502972
2013-01-05 0.988023 1.348619 0.157946 2.003330
2013-01-06 0.283038 0.389855 -1.292690 3.891541
df.apply(np.cumsum, axis=1)
A B C D F
2013-01-01 0.000000 0.000000 -0.051973 4.948027 NaN
2013-01-02 0.816948 -0.110347 0.874399 5.874399 6.874399
2013-01-03 1.764278 1.230717 0.604143 5.604143 7.604143
2013-01-04 0.767904 -0.090031 2.198239 7.198239 10.198239
2013-01-05 0.275070 1.017591 3.759850 8.759850 12.759850
2013-01-06 -1.482718 -1.103676 0.248585 5.248585 10.248585
df.apply(lambda x: x.max() - x.min())
A    3.246996
B    1.669816
C    3.368832
D    0.000000
F    4.000000
dtype: float64
s = pd.Series(np.random.randint(0, 7, size=10))
s
0    2
1    6
2    4
3    3
4    1
5    3
6    6
7    4
8    1
9    4
dtype: int64
s.value_counts()
3    2
2    2
0    2
6    1
5    1
4    1
1    1
dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
df = pd.DataFrame(np.random.randn(10, 4))
df
0 1 2 3
0 0.489649 -0.930106 0.534682 0.583658
1 0.201224 -0.103676 1.076049 -0.378711
2 1.734129 1.270388 0.182100 1.275143
3 0.756130 -1.469688 1.139847 -1.056633
4 -0.210804 -1.576685 0.379073 -0.357527
5 -0.762810 -1.107889 1.040216 -0.601039
6 1.619109 -0.869445 -0.655173 -1.418691
7 -1.061251 -0.346968 -0.057042 -1.518861
8 1.135685 1.133475 0.196636 0.291553
9 -0.726244 -0.842093 -0.265447 -0.634778
df[:3]
0 1 2 3
0 0.489649 -0.930106 0.534682 0.583658
1 0.201224 -0.103676 1.076049 -0.378711
2 1.734129 1.270388 0.182100 1.275143
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
0 1 2 3
0 0.489649 -0.930106 0.534682 0.583658
1 0.201224 -0.103676 1.076049 -0.378711
2 1.734129 1.270388 0.182100 1.275143
3 0.756130 -1.469688 1.139847 -1.056633
4 -0.210804 -1.576685 0.379073 -0.357527
5 -0.762810 -1.107889 1.040216 -0.601039
6 1.619109 -0.869445 -0.655173 -1.418691
7 -1.061251 -0.346968 -0.057042 -1.518861
8 1.135685 1.133475 0.196636 0.291553
9 -0.726244 -0.842093 -0.265447 -0.634778
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
print 'left='
left
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print 'right='
right
pd.merge(left, right, on='key')  # cartesian product of sets
left=
key lval
0 foo 1
1 foo 2
right=
key rval
0 foo 4
1 foo 5
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
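The four-row result above is the "cartesian product" the comment refers to: both 'foo' rows on the left match both 'foo' rows on the right. With unique keys the merge behaves like an ordinary join; a minimal sketch (left2/right2 are illustrative names, not from the worksheet):

import pandas as pd

left2 = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right2 = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
pd.merge(left2, right2, on='key')   # one row per key: (foo, 1, 4) and (bar, 2, 5)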
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
A B C D
0 0.357406 1.244947 -1.505490 0.647047
1 0.965902 -0.817161 -1.021037 1.973119
2 -0.592771 -0.712344 -1.239662 0.023419
3 -0.050144 -0.425577 -0.450232 0.117308
4 0.405984 0.388842 1.020623 0.377212
5 0.429241 0.414187 0.216448 0.099873
6 1.308360 -0.131163 -0.290972 -1.351830
7 -0.258822 1.826776 0.124504 1.157286
s = df.iloc[3]
s
A   -0.050144
B   -0.425577
C   -0.450232
D    0.117308
Name: 3, dtype: float64
df.append(s, ignore_index=True)
A B C D
0 0.357406 1.244947 -1.505490 0.647047
1 0.965902 -0.817161 -1.021037 1.973119
2 -0.592771 -0.712344 -1.239662 0.023419
3 -0.050144 -0.425577 -0.450232 0.117308
4 0.405984 0.388842 1.020623 0.377212
5 0.429241 0.414187 0.216448 0.099873
6 1.308360 -0.131163 -0.290972 -1.351830
7 -0.258822 1.826776 0.124504 1.157286
8 -0.050144 -0.425577 -0.450232 0.117308
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
A B C D
0 foo one -2.077682 0.331603
1 bar one -0.071400 0.219208
2 foo two 1.242458 1.252414
3 bar three 0.963360 -1.377083
4 foo two -0.469703 1.170327
5 bar two 0.413790 0.358047
6 foo one -0.611831 0.359070
7 foo three 0.018343 0.790031
df.groupby('A').sum()
C D
A
bar 1.305749 -0.799828
foo -1.898415 3.903445
df.groupby(['A','B']).sum()
C D
A B
bar one -0.071400 0.219208
three 0.963360 -1.377083
two 0.413790 0.358047
foo one -2.689514 0.690673
three 0.018343 0.790031
two 0.772755 2.422741
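Both groupby calls above are instances of split-apply-combine with sum as the aggregation. As a hedged sketch (not from the original worksheet), continuing with the df just built, other aggregations plug in the same way:

df.groupby('A')['C'].agg(['sum', 'mean', 'count'])   # several aggregations of column C per group of A
df.groupby(['A','B']).size()                         # how many rows fall in each (A, B) group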
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
A B
first second
bar one -0.083946 -0.246541
two -0.208254 -2.175975
baz one 0.658479 1.221688
two -0.399217 0.366461
stacked = df2.stack()
stacked
first  second
bar    one     A   -0.083946
               B   -0.246541
       two     A   -0.208254
               B   -2.175975
baz    one     A    0.658479
               B    1.221688
       two     A   -0.399217
               B    0.366461
dtype: float64
stacked.unstack()
A B
first second
bar one -0.083946 -0.246541
two -0.208254 -2.175975
baz one 0.658479 1.221688
two -0.399217 0.366461
stacked.unstack(1)
second one two
first
bar A -0.083946 -0.208254
B -0.246541 -2.175975
baz A 0.658479 -0.399217
B 1.221688 0.366461
stacked.unstack(0)
first bar baz
second
one A -0.083946 0.658479
B -0.246541 1.221688
two A -0.208254 -0.399217
B -2.175975 0.366461
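A note on the arguments: the integer passed to unstack picks which index level to pivot into columns, and because the levels were named 'first' and 'second' when the MultiIndex was built, the same calls can be written by level name (a small illustrative sketch, continuing with stacked from above):

stacked.unstack('second')   # equivalent to stacked.unstack(1)
stacked.unstack('first')    # equivalent to stacked.unstack(0)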
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
df
A B C D E
0 one A foo 0.924358 -1.318103
1 one B foo -0.013813 0.135119
2 two C foo -1.007086 0.391596
3 three A bar 1.693627 0.139658
4 one B bar -0.015281 -0.184786
5 one C bar 2.120000 -0.841796
6 two A foo -2.785113 0.612259
7 three B foo -0.325130 -0.930560
8 one C foo -0.019052 1.161260
9 one A bar 0.573002 -2.131227
10 two B bar -0.027490 0.884262
11 three C bar 0.258147 -0.192678
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
C bar foo
A B
one A 0.573002 0.924358
B -0.015281 -0.013813
C 2.120000 -0.019052
three A 1.693627 NaN
B NaN -0.325130
C 0.258147 NaN
two A NaN -2.785113
B -0.027490 NaN
C NaN -1.007086
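The NaN entries mark (A, B, C) combinations with no rows, and the default aggregation is the mean. Both are configurable; a minimal sketch, continuing with the df above and using pivot_table's aggfunc and fill_value keywords:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
               aggfunc=np.sum, fill_value=0)   # sum instead of mean, zeros instead of NaN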
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
2012-01-01    24920
Freq: 5T, dtype: int64
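Only one row comes back because the whole 100-second series falls inside a single 5-minute bin. Resampling the same ts at a finer frequency (30 seconds here is just an illustrative choice) produces several bins:

ts.resample('30S').sum()   # four 30-second bins covering the 100 seconds of data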
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-01-01 00:00:00+00:00 106 2012-01-01 00:00:01+00:00 32 2012-01-01 00:00:02+00:00 281 2012-01-01 00:00:03+00:00 276 2012-01-01 00:00:04+00:00 448 2012-01-01 00:00:05+00:00 207 2012-01-01 00:00:06+00:00 433 2012-01-01 00:00:07+00:00 0 2012-01-01 00:00:08+00:00 81 2012-01-01 00:00:09+00:00 408 2012-01-01 00:00:10+00:00 475 2012-01-01 00:00:11+00:00 37 2012-01-01 00:00:12+00:00 80 2012-01-01 00:00:13+00:00 272 2012-01-01 00:00:14+00:00 496 2012-01-01 00:00:15+00:00 191 2012-01-01 00:00:16+00:00 121 2012-01-01 00:00:17+00:00 476 2012-01-01 00:00:18+00:00 142 2012-01-01 00:00:19+00:00 255 2012-01-01 00:00:20+00:00 492 2012-01-01 00:00:21+00:00 104 2012-01-01 00:00:22+00:00 31 2012-01-01 00:00:23+00:00 128 2012-01-01 00:00:24+00:00 413 2012-01-01 00:00:25+00:00 178 2012-01-01 00:00:26+00:00 293 2012-01-01 00:00:27+00:00 273 2012-01-01 00:00:28+00:00 393 2012-01-01 00:00:29+00:00 284 ... 2012-01-01 00:01:10+00:00 17 2012-01-01 00:01:11+00:00 408 2012-01-01 00:01:12+00:00 459 2012-01-01 00:01:13+00:00 329 2012-01-01 00:01:14+00:00 255 2012-01-01 00:01:15+00:00 187 2012-01-01 00:01:16+00:00 277 2012-01-01 00:01:17+00:00 97 2012-01-01 00:01:18+00:00 352 2012-01-01 00:01:19+00:00 163 2012-01-01 00:01:20+00:00 132 2012-01-01 00:01:21+00:00 348 2012-01-01 00:01:22+00:00 130 2012-01-01 00:01:23+00:00 405 2012-01-01 00:01:24+00:00 359 2012-01-01 00:01:25+00:00 308 2012-01-01 00:01:26+00:00 412 2012-01-01 00:01:27+00:00 485 2012-01-01 00:01:28+00:00 430 2012-01-01 00:01:29+00:00 85 2012-01-01 00:01:30+00:00 108 2012-01-01 00:01:31+00:00 462 2012-01-01 00:01:32+00:00 61 2012-01-01 00:01:33+00:00 206 2012-01-01 00:01:34+00:00 383 2012-01-01 00:01:35+00:00 284 2012-01-01 00:01:36+00:00 401 2012-01-01 00:01:37+00:00 318 2012-01-01 00:01:38+00:00 410 2012-01-01 00:01:39+00:00 314 Freq: S, dtype: int64
ts_utc.tz_convert('US/Eastern')
2011-12-31 19:00:00-05:00 106 2011-12-31 19:00:01-05:00 32 2011-12-31 19:00:02-05:00 281 2011-12-31 19:00:03-05:00 276 2011-12-31 19:00:04-05:00 448 2011-12-31 19:00:05-05:00 207 2011-12-31 19:00:06-05:00 433 2011-12-31 19:00:07-05:00 0 2011-12-31 19:00:08-05:00 81 2011-12-31 19:00:09-05:00 408 2011-12-31 19:00:10-05:00 475 2011-12-31 19:00:11-05:00 37 2011-12-31 19:00:12-05:00 80 2011-12-31 19:00:13-05:00 272 2011-12-31 19:00:14-05:00 496 2011-12-31 19:00:15-05:00 191 2011-12-31 19:00:16-05:00 121 2011-12-31 19:00:17-05:00 476 2011-12-31 19:00:18-05:00 142 2011-12-31 19:00:19-05:00 255 2011-12-31 19:00:20-05:00 492 2011-12-31 19:00:21-05:00 104 2011-12-31 19:00:22-05:00 31 2011-12-31 19:00:23-05:00 128 2011-12-31 19:00:24-05:00 413 2011-12-31 19:00:25-05:00 178 2011-12-31 19:00:26-05:00 293 2011-12-31 19:00:27-05:00 273 2011-12-31 19:00:28-05:00 393 2011-12-31 19:00:29-05:00 284 ... 2011-12-31 19:01:10-05:00 17 2011-12-31 19:01:11-05:00 408 2011-12-31 19:01:12-05:00 459 2011-12-31 19:01:13-05:00 329 2011-12-31 19:01:14-05:00 255 2011-12-31 19:01:15-05:00 187 2011-12-31 19:01:16-05:00 277 2011-12-31 19:01:17-05:00 97 2011-12-31 19:01:18-05:00 352 2011-12-31 19:01:19-05:00 163 2011-12-31 19:01:20-05:00 132 2011-12-31 19:01:21-05:00 348 2011-12-31 19:01:22-05:00 130 2011-12-31 19:01:23-05:00 405 2011-12-31 19:01:24-05:00 359 2011-12-31 19:01:25-05:00 308 2011-12-31 19:01:26-05:00 412 2011-12-31 19:01:27-05:00 485 2011-12-31 19:01:28-05:00 430 2011-12-31 19:01:29-05:00 85 2011-12-31 19:01:30-05:00 108 2011-12-31 19:01:31-05:00 462 2011-12-31 19:01:32-05:00 61 2011-12-31 19:01:33-05:00 206 2011-12-31 19:01:34-05:00 383 2011-12-31 19:01:35-05:00 284 2011-12-31 19:01:36-05:00 401 2011-12-31 19:01:37-05:00 318 2011-12-31 19:01:38-05:00 410 2011-12-31 19:01:39-05:00 314 Freq: S, dtype: int64
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
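In a CoCalc/Sage worksheet the plot above renders automatically; in a plain Python script you would hold onto the matplotlib objects and show or save the figure explicitly. A minimal self-contained sketch under that assumption (the file name ts_cumsum.png is illustrative):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)).cumsum()
ax = ts.plot()                     # Series.plot returns a matplotlib Axes object
ax.set_title('cumulative sum of a random walk')
plt.savefig('ts_cumsum.png')       # or plt.show() when running interactively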