Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/scripts/stats-active-users.py
Views: 687
#!/usr/bin/env python3
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
# License: MS-RSL – see LICENSE.md for details
"""Count distinct active accounts per day, ISO week and calendar month.

The script walks every ``file_access_log`` row whose ``time`` index lies
between ``ago`` and ``now``, keeps the rows that carry an ``account_id``,
and collects the distinct account ids seen in each day, ISO week and month.
The resulting counts are printed and written to three semicolon-separated
files in the current working directory:

* ``active-users-1d.csv`` (``day;active``)
* ``active-users-1w.csv`` (``week;active``)
* ``active-users-1m.csv`` (``month;active``)
"""

import sys, os
from os.path import abspath, dirname, join

# Make modules that live next to this script importable (smc_rethinkdb below).
file_dir = abspath(dirname(__file__))
sys.path.insert(0, file_dir)

from smc_rethinkdb import *
from pprint import pprint
from datetime import datetime, timedelta
from pytz import utc
from collections import defaultdict
from time import time
import numpy as np

now = r.maxval
# ago = time_past(24 * 50)

ago = datetime(2015, 1, 1).replace(tzinfo=utc)
# now = datetime(2016, 3, 1).replace(tzinfo = utc)

# round down to midnight
ago = ago.replace(hour=0, minute=0, second=0, microsecond=0)

# ATTN: central_log doesn't record all sign_in events, probably due to some missing case for passports. hence that's incomplete
# q = central_log.between(ago, now, index='time')#.filter({'event' : 'successful_sign_in', 'value' : {'email_address' : "[email protected]"}})
# q = q.has_fields({'value' : 'email_address'}).filter({'value' : {'email_address' : "[email protected]"}})

# INFO: using file_access_log instead, which has the benefit to be more truthy regarding really "active" users
q = file_access_log.between(ago, now, index='time')
# q = q.filter({"account_id" : '3c40513b-7e7c-450c-aa13-bf4f3411cf33'}) # only [email protected]

# NOTE(review): `total` is counted before the has_fields('account_id') filter
# below, so the loop may see fewer rows than `total` and the progress
# percentage may stop short of 100 %.
total = q.count().run()
print("Hold tight. Going through {} records for you …".format(total))

q = q.has_fields('account_id').pluck("time", "account_id")

day = defaultdict(set)
week = defaultdict(set)
month = defaultdict(set)

# Progress is considered every 0.1 % of `total`, and only for runs with more
# than 1001 records; 0 disables progress output. Computed once, not per row.
progress_step = int(total * .001) if total > 1001 else 0

started = None  # wall-clock time at which the first row arrived
next_report = None  # earliest wall-clock time for the next progress line

for idx, ev in enumerate(q.run()):
    if started is None:
        started = next_report = time()

    if progress_step and idx % progress_step == 0:
        current = time()
        if next_report < current:
            # Extrapolate from the time elapsed up to *now*. The throttle
            # deadline must not be used here: it is the previous report time
            # plus 5 s and lags behind whenever a step takes longer than that.
            if idx > 1000:
                eta = (total - idx) * (current - started) / idx
            else:
                eta = np.nan  # too few rows for a meaningful estimate
            print('{:5.2f} % eta: {:5.2f} s'.format(100. * idx / total, eta))
            next_report = current + 5.  # at most one line every 5 seconds

    account_id = ev["account_id"]
    dt = ev["time"]
    day[dt.date()].add(account_id)
    # isocalendar() yields (ISO year, ISO week, weekday); only the first two
    # are consumed by the format string.
    week["{0}-{1:02d}".format(*dt.isocalendar())].add(account_id)
    month["{0.year}-{0.month:02d}".format(dt)].add(account_id)


def _write_report(path, title, column, buckets, leading_blank=True):
    """Print and save the number of distinct accounts in each bucket.

    path: CSV file to (over)write.
    title: heading printed to stdout before the rows.
    column: name of the first CSV column; the second is always ``active``.
    buckets: mapping of bucket key to the set of account ids seen in it.
    leading_blank: print an empty line before the heading.
    """
    with open(path, 'w') as out:
        if leading_blank:
            print("")
        print(title)
        out.write("{};active\n".format(column))
        for key, accounts in sorted(buckets.items()):
            active = len(accounts)
            print("{}: {}".format(key, active))
            out.write("{};{}\n".format(key, active))


_write_report("active-users-1d.csv", "Daily active users", "day", day,
              leading_blank=False)
_write_report("active-users-1w.csv", "Weekly active users", "week", week)
_write_report("active-users-1m.csv", "Monthly active users", "month", month)