Kernel: Unknown Kernel
In [ ]:
# IPython magic: load matplotlib (inline plots) and the numpy/pylab names
# into the notebook namespace.
%pylab inline
In [ ]:
# Pre-filtering applied to the raw log before this notebook ran:
#   grep -v '"::1", "", "service",' s.log > s2.log
#   mv s2.log s.log
import json
from datetime import datetime


def parseline(line):
    """Parse one access-log line into [timestamp, server, ip, url, type].

    The line is "<f0> <f1> <f2> <server> <date> <time,ms> [json-tail",
    where everything after the first '[' is a JSON array whose opening
    bracket was consumed by the split.

    Raises ValueError / IndexError / json.JSONDecodeError on malformed
    input; the caller counts those as errors.
    """
    head, tail = line.split('[', 1)
    fields = head.split()
    # Fields 4 and 5 are the date and the time (with ,milliseconds).
    stamp = datetime.strptime(fields[4] + fields[5], "%Y-%m-%d%H:%M:%S,%f")
    data = json.loads('[' + tail)  # restore the bracket consumed by split()
    return [stamp, fields[3], data[1], data[2], data[3]]


lines = []
i = 0
errors = 0
skipped = 0

import gc
gc.disable()  # bulk list appends are noticeably faster with the cyclic GC off
with open('s.log') as f:
    for s in f:
        s = s.rstrip()
        i += 1
        if i % 100000 == 0:
            print('processing ', i, 'lines')
        if s[-2:] != '"]':
            # Ignore lines that don't end correctly (truncated JSON tail).
            skipped += 1
            # BUG FIX: the original fell through and parsed the malformed
            # line anyway; skip it as the comment above intends.
            continue
        try:
            lines.append(parseline(s))
        except Exception:
            errors += 1
gc.enable()
print("Errors: ", errors)
print("Skipped: ", skipped)
print("Processed: ", i)
In [ ]:
# How many log lines were parsed successfully.
len(lines)
In [ ]:
# Inspect the first parsed record: [timestamp, server, ip, url, type].
lines[0]
In [ ]:
import pandas

# Build a timestamp-indexed DataFrame from the parsed records and sort it
# chronologically.
# Fixes: DataFrame.from_items was removed in pandas 0.25 and DataFrame.sort()
# in pandas 0.20 -- use the plain constructor with an explicit index, and
# sort_index() for the time ordering.
d = pandas.DataFrame(
    [rec[1:] for rec in lines],
    index=[rec[0] for rec in lines],
    columns=["server", "ip", "url", "type"],
).sort_index()
d
In [ ]:
from datetime import datetime

# Slice out every request from Dec 10, 2013 using label-based slicing on the
# DatetimeIndex.
# Fix: DataFrame.ix was removed in pandas 1.0; .loc is the label-based
# replacement.
dec10 = datetime(2013, 12, 10)
dec11 = datetime(2013, 12, 11)
d_dec10 = d.loc[dec10:dec11]
In [ ]:
# The 10 busiest client IPs by request count.
# Fixes: DataFrame.sort() was removed (use sort_values), and after
# groupby('ip') the 'ip' label is the index, not a sortable column --
# size() gives the per-IP row count directly as a Series.
d.groupby('ip').size().sort_values(ascending=False)[:10]
In [ ]:
# Request volume per front-end server (non-null count in each column).
d.groupby('server').count()
In [ ]:
# Request volume per request type.
d.groupby('type').count()
In [ ]:
# Requests whose URL is on any ups.edu host.
# Regex fixes: dots escaped (unescaped '.' matches any character, so the
# original also matched e.g. "Xups.edu"), scheme collapsed to the
# non-capturing 'https?' (a capturing group makes str.contains emit a
# UserWarning and match-group semantics), and a raw string used.
d[d.url.str.contains(r'^https?://[^.]*\.ups\.edu')]
In [ ]:
# Requests NOT served from a sagemath.org host, then the 500 most-hit of
# those URLs.
# Fixes: regex dots escaped and capturing group removed (same issues as the
# ups.edu cell: unescaped '.' matches any char, groups trigger a
# str.contains warning); the removed DataFrame.sort() and the
# count()/take() column juggling are replaced by groupby(...).size(), which
# yields the per-URL hit count as a single sortable Series.
notsagecell = d[~d.url.str.contains(r'^https?://[^.]*\.sagemath\.org')]
print(len(notsagecell))
c = notsagecell.groupby('url').size().sort_values(ascending=False)
print(len(c))
c[:500]
In [ ]:
# The 100 most-requested URLs overall.
# Fixes: DataFrame.sort() was removed, and after groupby('url') the 'url'
# label is the index, not a column; size() returns the per-URL request
# count as one Series, so the take([0], axis=1) column trick is unneeded.
d.groupby('url').size().sort_values(ascending=False)[:100]
In [ ]:
# Requests per calendar day, summary statistics, and a kernel-density plot
# of the daily volume.
# Fix: resample(..., how='count') was removed in pandas 0.18; the modern
# form is the deferred-method call .resample('1D').count().
daily = d['type'].resample('1D').count()
print(daily.describe())
daily.plot(kind='kde')
In [ ]:
# Time series of the daily request counts computed above.
daily.plot()
In [ ]:
# Show up to 500 rows when displaying a DataFrame (default is far lower),
# so the top-500 URL listing above renders in full.
pandas.set_option('display.max_rows', 500)
In [ ]: