Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/stats-activity.py
Views: 275
1
#!/usr/bin/env python3
2
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
# License: AGPLv3 s.t. "Commons Clause" – read LICENSE.md for details
4
5
# this script scans the table recording file edits and tabulates the most active users and projects.
6
# it counts the active projects and active users in 10-minute bins.
7
# that is much more accurate than counting activities in bulk, and it allows summing up the discrete event bins.
8
9
# Privacy reminder, emitted before anything else in the script runs.
print("CONFIDENTIAL: don't share the generated data publicly. "
      "It is solely used to improve the service!")
12
13
import sys, os
14
d = os.path.abspath(os.path.dirname(__file__))
15
sys.path.insert(0, d)
16
17
from smc_rethinkdb import r, accounts, file_access_log, secs2hms
18
from pprint import pprint
19
from datetime import datetime, timedelta
20
from pytz import utc
21
from queue import Queue
22
from threading import Thread
23
import socket
24
import numpy as np
25
from collections import Counter, defaultdict
26
import itertools as it
27
28
# map account and project IDs to integers.
29
#pid_count = it.count()
30
#anon_pid = defaultdict(lambda : next(pid_count))
31
#aid_count = it.count()
32
#anon_aid = defaultdict(lambda : next(aid_count))
33
34
# Length of the reporting window in days: first command-line argument,
# defaulting to one week.
DAYS_AGO = int(sys.argv[1]) if len(sys.argv) >= 2 else 7
if DAYS_AGO < 1:
    # An empty or inverted time window is meaningless for this report.
    raise SystemExit("DAYS_AGO must be a positive number of days")

# Timezone-aware UTC bounds of the window.  datetime.now(utc) gives the same
# aware value as the deprecated datetime.utcnow().replace(tzinfo=utc).
now = datetime.now(utc)
ago = now - timedelta(days=DAYS_AGO)
37
38
# round down to midnight
39
# ago = ago.replace(hour = 0, minute = 0, second = 0, microsecond = 0)
40
41
# Access-log rows with a 'time' between `ago` and `now`, each joined to its
# account document via account_id.  Every result row carries the log entry
# under "left" and the account under "right", trimmed to the listed fields.
q = (
    file_access_log
    .between(ago, now, index="time")
    .eq_join(r.row["account_id"], accounts)
    .pluck({
        "left": ["time", "account_id", "project_id"],
        "right": ["first_name", "last_name", "email_address"],
    })
)
45
46
# account_id -> printable "First Last <email>" label, filled while scanning
users = {}

# start of a 10-minute bin (epoch seconds) -> set of ids seen in that bin
users_bins, projs_bins = defaultdict(set), defaultdict(set)

# id -> total number of 10-minute bins in which it was active
users_tot, projs_tot = Counter(), Counter()
55
56
# Scan every joined log row once: remember a printable label per account and
# mark the account and the project as active in the row's 10-minute bin.
# For a quick dry run on a sample, wrap the cursor: it.islice(q.run(), 1000)
for res in q.run():
    # what: {'project_id': 'xxx-xxx-xxx', 'account_id': 'xxx-xxx-xxx', 'time': ' ... ' }
    what = res["left"]
    # who: {'first_name': 'xxx', 'email_address': 'xxx', 'last_name': 'xxx'}
    who = res["right"]
    aid = what["account_id"]
    pid = what["project_id"]

    if aid not in users:
        # Any of the three account fields may be absent from the joined row;
        # fall back to a placeholder instead of raising KeyError.
        users[aid] = "{} {} <{}>".format(
            who.get("first_name", ""),
            who.get("last_name", ""),
            who.get("email_address", "None"),
        )

    # Start of the 10-minute bin containing the access time, as epoch seconds.
    t = what["time"]
    bin_start = t.replace(minute=t.minute - t.minute % 10,
                          second=0,
                          microsecond=0)
    ts = int(bin_start.timestamp())
    users_bins[ts].add(aid)
    projs_bins[ts].add(pid)
80
81
# print(projs)
82
83
# Per-bin report: for users and then projects, print how many distinct ids
# were active in each 10-minute bin (in chronological order) and, on the way,
# count for every id the number of bins it appears in.
for name, bins, tot in (("users", users_bins, users_tot),
                        ("projects", projs_bins, projs_tot)):
    print()
    print("{} Bins".format(name.title()))
    for ts, ids in sorted(bins.items()):
        # Render the bin start explicitly in UTC; a bare fromtimestamp(ts)
        # would use the local zone of whatever machine runs the script.
        label = datetime.fromtimestamp(ts, tz=utc).isoformat()
        print("{} → {}".format(label, len(ids)))
        # ids is a set, so each id gains exactly one count per bin
        tot.update(ids)
91
92
# Ranking of the 30 accounts that were active in the most 10-minute bins,
# followed by the accumulated activity of all accounts.
secs_per_bin = 60 * 10

print("Top Users")
for aid, nb in users_tot.most_common(30):
    # nb: number of 10min intervals in which this account was active
    print("{:>9}s {} ({})".format(secs2hms(secs_per_bin * nb), users[aid],
                                  aid))

# Total over *all* accounts, not only the 30 listed above, so the figure
# matches the user count printed next to it.
sum_user_total = secs_per_bin * sum(users_tot.values())

print()
# Accumulated user time relative to the wall-clock length of the window,
# both in minutes; a zero-length window yields 0 instead of dividing by zero.
window_minutes = DAYS_AGO * 24 * 60
ratio = (sum_user_total / 60.) / window_minutes if window_minutes else 0.0
print("Sum of all {} users' activity: {} (ratio: 1:{:.2f})".format(
    len(users_tot), secs2hms(sum_user_total), ratio))
104
105