Kernel: Unknown Kernel
In [ ]:
# IPython magic: load matplotlib (inline plots) and the numpy/pylab names
# into the notebook namespace.
%pylab inline
In [ ]:
# Pre-filtering applied to the raw log before this notebook ran:
#   grep -v '"::1", "", "service",' s.log > s2.log
#   mv s2.log s.log
import json
from datetime import datetime


def parseline(line):
    """Parse one access-log line into [timestamp, server, ip, url, type].

    The line is "<f0> <f1> <f2> <server> <date> <time,ms> [json-tail",
    where everything after the first '[' is a JSON array whose opening
    bracket was consumed by the split.

    Raises ValueError / IndexError / json.JSONDecodeError on malformed
    input; the caller counts those as errors.
    """
    head, tail = line.split('[', 1)
    fields = head.split()
    # Fields 4 and 5 are the date and the time (with ,milliseconds).
    stamp = datetime.strptime(fields[4] + fields[5], "%Y-%m-%d%H:%M:%S,%f")
    data = json.loads('[' + tail)  # restore the bracket consumed by split()
    return [stamp, fields[3], data[1], data[2], data[3]]


lines = []
i = 0
errors = 0
skipped = 0

import gc
gc.disable()  # bulk list appends are noticeably faster with the cyclic GC off
with open('s.log') as f:
    for s in f:
        s = s.rstrip()
        i += 1
        if i % 100000 == 0:
            print('processing ', i, 'lines')
        if s[-2:] != '"]':
            # Ignore lines that don't end correctly (truncated JSON tail).
            skipped += 1
            # BUG FIX: the original fell through and parsed the malformed
            # line anyway; skip it as the comment above intends.
            continue
        try:
            lines.append(parseline(s))
        except Exception:
            errors += 1
gc.enable()
print("Errors: ", errors)
print("Skipped: ", skipped)
print("Processed: ", i)
In [ ]:
# How many log lines were parsed successfully.
len(lines)
In [ ]:
# Inspect the first parsed record: [timestamp, server, ip, url, type].
lines[0]
In [ ]:
import pandas

# Build a timestamp-indexed DataFrame from the parsed records and sort it
# chronologically.
# Fixes: DataFrame.from_items was removed in pandas 0.25 and DataFrame.sort()
# in pandas 0.20 -- use the plain constructor with an explicit index, and
# sort_index() for the time ordering.
d = pandas.DataFrame(
    [rec[1:] for rec in lines],
    index=[rec[0] for rec in lines],
    columns=["server", "ip", "url", "type"],
).sort_index()
d
In [ ]:
from datetime import datetime

# Slice out every request from Dec 10, 2013 using label-based slicing on the
# DatetimeIndex.
# Fix: DataFrame.ix was removed in pandas 1.0; .loc is the label-based
# replacement.
dec10 = datetime(2013, 12, 10)
dec11 = datetime(2013, 12, 11)
d_dec10 = d.loc[dec10:dec11]
In [ ]:
# The 10 busiest client IPs by request count.
# Fixes: DataFrame.sort() was removed (use sort_values), and after
# groupby('ip') the 'ip' label is the index, not a sortable column --
# size() gives the per-IP row count directly as a Series.
d.groupby('ip').size().sort_values(ascending=False)[:10]
In [ ]:
# Request volume per front-end server (non-null count in each column).
d.groupby('server').count()
In [ ]:
# Request volume per request type.
d.groupby('type').count()
In [ ]:
# Requests whose URL is on any ups.edu host.
# Regex fixes: dots escaped (unescaped '.' matches any character, so the
# original also matched e.g. "Xups.edu"), scheme collapsed to the
# non-capturing 'https?' (a capturing group makes str.contains emit a
# UserWarning and match-group semantics), and a raw string used.
d[d.url.str.contains(r'^https?://[^.]*\.ups\.edu')]
In [ ]:
# Requests NOT served from a sagemath.org host, then the 500 most-hit of
# those URLs.
# Fixes: regex dots escaped and capturing group removed (same issues as the
# ups.edu cell: unescaped '.' matches any char, groups trigger a
# str.contains warning); the removed DataFrame.sort() and the
# count()/take() column juggling are replaced by groupby(...).size(), which
# yields the per-URL hit count as a single sortable Series.
notsagecell = d[~d.url.str.contains(r'^https?://[^.]*\.sagemath\.org')]
print(len(notsagecell))
c = notsagecell.groupby('url').size().sort_values(ascending=False)
print(len(c))
c[:500]
In [ ]:
# The 100 most-requested URLs overall.
# Fixes: DataFrame.sort() was removed, and after groupby('url') the 'url'
# label is the index, not a column; size() returns the per-URL request
# count as one Series, so the take([0], axis=1) column trick is unneeded.
d.groupby('url').size().sort_values(ascending=False)[:100]
In [ ]:
# Requests per calendar day, summary statistics, and a kernel-density plot
# of the daily volume.
# Fix: resample(..., how='count') was removed in pandas 0.18; the modern
# form is the deferred-method call .resample('1D').count().
daily = d['type'].resample('1D').count()
print(daily.describe())
daily.plot(kind='kde')
In [ ]:
# Time series of the daily request counts computed above.
daily.plot()
In [ ]:
# Show up to 500 rows when displaying a DataFrame (default is far lower),
# so the top-500 URL listing above renders in full.
pandas.set_option('display.max_rows', 500)
In [ ]: