import sys, os
from os.path import abspath, dirname, join
# Put the directory containing this script at the front of the module search
# path so that the imports below are resolved against it first
# (presumably this is where smc_rethinkdb lives -- confirm).
file_dir = abspath(dirname(__file__))
sys.path.insert(0, file_dir)
from smc_rethinkdb import *
from pprint import pprint
from datetime import datetime, timedelta
from pytz import utc
from collections import defaultdict
from time import time
import numpy as np
# Query window on the 'time' index: from midnight, 1 Jan 2015 (UTC) up to
# r.maxval (presumably RethinkDB's "no upper bound" marker; `r` comes from
# the smc_rethinkdb star import -- confirm).
ago = datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc)
now = r.maxval
q = file_access_log.between(ago, now, index='time')

# NOTE: `total` is counted *before* the has_fields filter below, so it also
# includes records that carry no account_id.
total = q.count().run()
print("Hold tight. Going through {} records for you …".format(total))

# Only records that have an account_id are of interest, and only two of
# their fields are needed downstream.
q = q.has_fields('account_id')
q = q.pluck("time", "account_id")
# Distinct account ids seen per calendar day, per ISO week ("YYYY-WW") and
# per calendar month ("YYYY-MM").  Sets de-duplicate repeated accesses by the
# same account within one period.
day = defaultdict(set)
week = defaultdict(set)
month = defaultdict(set)

# t0: wall-clock time at which the first record arrived.
# t:  earliest wall-clock time at which the next progress line may be printed.
t0 = None
for idx, ev in enumerate(q.run()):
    if t0 is None:
        t0 = t = time()

    # Progress is considered every 0.1 % of `total` records and additionally
    # throttled to at most one line per 5 seconds.  The `total > 1001` guard
    # keeps int(total * .001) >= 1, so the modulo never divides by zero.
    if total > 1001 and idx % int(total * .001) == 0:
        current = time()
        if t < current:
            # The ETA must be based on the time actually elapsed so far.  The
            # previous code used `t - t0`, but `t` is the throttle deadline
            # set at the last report, which made the first estimate ~0 and
            # all later ones too small.
            # `idx > 1000` avoids dividing by zero and suppresses estimates
            # based on too few records.
            # NOTE(review): `idx` only counts records the query yields, while
            # `total` is defined outside this block -- if `total` was counted
            # on an unfiltered query the percentage never reaches 100.
            eta = (total - idx) * (current - t0) / idx if idx > 1000 else np.nan
            print('{:5.2f} % eta: {:5.2f} s'.format(100. * idx / total, eta))
            t = current + 5.

    # `account_id` instead of `id`, so the builtin id() is not shadowed.
    account_id = ev["account_id"]
    dt = ev["time"]

    day[dt.date()].add(account_id)

    # isocalendar() yields (ISO year, ISO week, ISO weekday); only the first
    # two are used for the key.
    w = "{0}-{1:02d}".format(*dt.isocalendar())
    week[w].add(account_id)

    m = "{0.year}-{0.month:02d}".format(dt)
    month[m].add(account_id)
# Emit one report per aggregation period.  Each report is echoed to stdout
# and written as a semicolon-separated CSV file in the current working
# directory: a header row followed by "<period>;<number of distinct accounts>".
reports = (
    ("active-users-1d.csv", "Daily active users", "day;active\n", day),
    ("active-users-1w.csv", "Weekly active users", "week;active\n", week),
    ("active-users-1m.csv", "Monthly active users", "month;active\n", month),
)
for position, (path, title, header, buckets) in enumerate(reports):
    with open(path, 'w') as out:
        # A blank line separates each report from the previous one on stdout;
        # the first report has nothing before it to separate from.
        if position > 0:
            print("")
        print(title)
        out.write(header)
        # Period keys are unique, so ordering by key alone gives the same
        # sequence as sorting the (key, value) pairs.
        for k in sorted(buckets):
            v = buckets[k]
            print("{}: {}".format(k, len(v)))
            out.write("{};{}\n".format(k, len(v)))