Contact
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Sign Up · Sign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/stats-active-users.py
Views: 275
1
#!/usr/bin/env python3
2
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
# License: AGPLv3 s.t. "Commons Clause" – read LICENSE.md for details
4
5
import sys, os
6
from os.path import abspath, dirname, join
7
file_dir = abspath(dirname(__file__))
8
sys.path.insert(0, file_dir)
9
10
from smc_rethinkdb import *
11
from pprint import pprint
12
from datetime import datetime, timedelta
13
from pytz import utc
14
from collections import defaultdict
15
from time import time
16
import numpy as np
17
18
def _period_keys(stamp):
    """Return the (day, week label, month label) buckets for *stamp*.

    The week label is "<ISO year>-<ISO week, zero padded>", so days around
    New Year are filed under the ISO year, which may differ from the
    calendar year.  The month label is "<year>-<month, zero padded>".
    """
    iso_year, iso_week, _ = stamp.isocalendar()
    return (
        stamp.date(),
        "{0}-{1:02d}".format(iso_year, iso_week),
        "{0.year}-{0.month:02d}".format(stamp),
    )


def collect_active_users(events, total=0, report_every=5.):
    """Group the account ids found in *events* by day, week and month.

    Args:
        events: iterable of mappings, each with a "time" entry (a datetime)
            and an "account_id" entry.
        total: expected number of events; only used for progress output.
            Progress lines are printed only when it is larger than 1001.
        report_every: minimum number of seconds between two progress lines.

    Returns:
        A ``(day, week, month)`` tuple of dicts mapping each period to the
        set of account ids seen in that period.
    """
    day = defaultdict(set)
    week = defaultdict(set)
    month = defaultdict(set)

    # Only look at the clock on roughly every 0.1 % of the records, so the
    # progress report does not slow down the main loop.
    step = total // 1000 if total > 1001 else 0
    started = None
    next_report = None

    for idx, ev in enumerate(events):
        if started is None:
            # Start the clock when the first record arrives, so the time
            # spent waiting for the query itself is not part of the ETA.
            started = next_report = time()
        if step and idx % step == 0:
            current = time()
            if current >= next_report:
                # The first 1000 records are too few for a stable estimate
                # (and idx == 0 would divide by zero).
                if idx > 1000:
                    eta = (total - idx) * (current - started) / idx
                else:
                    eta = float("nan")
                print('{:5.2f} % eta: {:5.2f} s'.format(100. * idx / total,
                                                        eta))
                next_report = current + report_every

        account = ev["account_id"]
        on_day, in_week, in_month = _period_keys(ev["time"])
        day[on_day].add(account)
        week[in_week].add(account)
        month[in_month].add(account)

    return day, week, month


def write_report(path, column, title, buckets):
    """Print and save the number of distinct users per period.

    Writes a ';'-separated file to *path* with the header
    "<column>;active" followed by one "<period>;<count>" line per key of
    *buckets*, sorted by key.  The same counts are echoed to stdout below
    *title*.
    """
    with open(path, 'w') as out:
        print(title)
        out.write("{};active\n".format(column))
        for period, users in sorted(buckets.items()):
            count = len(users)
            print("{}: {}".format(period, count))
            out.write("{};{}\n".format(period, count))


def main():
    """Count daily, weekly and monthly active users and write CSV reports."""
    # Upper end of the time window.  r.maxval is presumably RethinkDB's
    # "no upper bound" marker, re-exported by smc_rethinkdb -- TODO confirm.
    # For a fixed end use e.g. datetime(2016, 3, 1).replace(tzinfo=utc).
    now = r.maxval

    # Lower end of the time window.
    # Alternative kept from the original script: ago = time_past(24 * 50)
    ago = datetime(2015, 1, 1).replace(tzinfo=utc)
    # round down to midnight
    ago = ago.replace(hour=0, minute=0, second=0, microsecond=0)

    # ATTN: central_log doesn't record all sign_in events, probably due to
    # some missing case for passports, hence it is incomplete.
    # INFO: file_access_log is used instead; it also reflects more truthfully
    # which users were really "active".
    q = file_access_log.between(ago, now, index='time')
    # To restrict the report to a single account:
    # q = q.filter({"account_id": "<account uuid>"})

    # NOTE: the total is taken before the has_fields() filter below, so the
    # progress output may stop short of 100 % if records lack an account_id.
    total = q.count().run()
    print("Hold tight. Going through {} records for you …".format(total))

    q = q.has_fields('account_id').pluck("time", "account_id")
    day, week, month = collect_active_users(q.run(), total)

    write_report("active-users-1d.csv", "day", "Daily active users", day)
    print("")
    write_report("active-users-1w.csv", "week", "Weekly active users", week)
    print("")
    write_report("active-users-1m.csv", "month", "Monthly active users",
                 month)


if __name__ == "__main__":
    main()
83
84