Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/kernel/acct.c
26243 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* linux/kernel/acct.c
4
*
5
* BSD Process Accounting for Linux
6
*
7
* Author: Marco van Wieringen <[email protected]>
8
*
9
* Some code based on ideas and code from:
10
* Thomas K. Dyas <[email protected]>
11
*
12
* This file implements BSD-style process accounting. Whenever any
13
* process exits, an accounting record of type "struct acct" is
14
* written to the file specified with the acct() system call. It is
15
* up to user-level programs to do useful things with the accounting
16
* log. The kernel just provides the raw accounting information.
17
*
18
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
19
*
20
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
21
* the file happened to be read-only. 2) If the accounting was suspended
22
* due to the lack of space it happily allowed to reopen it and completely
23
* lost the old acct_file. 3/10/98, Al Viro.
24
*
25
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
26
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
27
*
28
* Fixed a nasty interaction with sys_umount(). If the accounting
29
* was suspended we failed to stop it on umount(). Messy.
30
* Another one: remount to readonly didn't stop accounting.
31
* Question: what should we do if we have CAP_SYS_ADMIN but not
32
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
33
* unless we are messing with the root. In that case we are getting a
34
* real mess with do_remount_sb(). 9/11/98, AV.
35
*
36
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
37
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
38
* one race (and leak) in BSD implementation.
39
* OK, that's better. ANOTHER race and leak in BSD variant. There always
40
* is one more bug... 10/11/98, AV.
41
*
42
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
43
* ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
44
* a struct file opened for write. Fixed. 2/6/2000, AV.
45
*/
46
47
#include <linux/mm.h>
48
#include <linux/slab.h>
49
#include <linux/acct.h>
50
#include <linux/capability.h>
51
#include <linux/file.h>
52
#include <linux/tty.h>
53
#include <linux/security.h>
54
#include <linux/vfs.h>
55
#include <linux/jiffies.h>
56
#include <linux/times.h>
57
#include <linux/syscalls.h>
58
#include <linux/mount.h>
59
#include <linux/uaccess.h>
60
#include <linux/sched/cputime.h>
61
62
#include <asm/div64.h>
63
#include <linux/pid_namespace.h>
64
#include <linux/fs_pin.h>
65
66
/*
67
* These constants control the amount of freespace that suspend and
68
* resume the process accounting system, and the time delay between
69
* each check.
70
* Turned into sysctl-controllable parameters. AV, 12/11/98
71
*/
72
73
/*
 * Slot order matters: the whole array is handed out as one integer vector,
 * and the macros below give each slot a readable name.
 *   [0] resume threshold, [1] suspend threshold, [2] check interval.
 */
static int acct_parm[3] = {
	[0] = 4,
	[1] = 2,
	[2] = 30,
};
#define RESUME		(acct_parm[0])	/* >foo% free space - resume */
#define SUSPEND		(acct_parm[1])	/* <foo% free space - suspend */
#define ACCT_TIMEOUT	(acct_parm[2])	/* foo second timeout between checks */
77
78
#ifdef CONFIG_SYSCTL
/*
 * Single sysctl entry named "acct": the whole acct_parm[] array, handled
 * as a vector of ints by proc_dointvec.
 */
static const struct ctl_table kern_acct_table[] = {
	{
		.procname	= "acct",
		.data		= &acct_parm,
		.maxlen		= sizeof(acct_parm),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

/* Register the table above under "kernel"; invoked as a late initcall. */
static __init int kernel_acct_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_acct_table);
	return 0;
}
late_initcall(kernel_acct_sysctls_init);
#endif /* CONFIG_SYSCTL */
96
97
/*
98
* External references and all of the globals.
99
*/
100
101
struct bsd_acct_struct {
102
struct fs_pin pin;
103
atomic_long_t count;
104
struct rcu_head rcu;
105
struct mutex lock;
106
bool active;
107
bool check_space;
108
unsigned long needcheck;
109
struct file *file;
110
struct pid_namespace *ns;
111
struct work_struct work;
112
struct completion done;
113
acct_t ac;
114
};
115
116
static void fill_ac(struct bsd_acct_struct *acct);
117
static void acct_write_process(struct bsd_acct_struct *acct);
118
119
/*
120
* Check the amount of free space and suspend/resume accordingly.
121
*/
122
static bool check_free_space(struct bsd_acct_struct *acct)
123
{
124
struct kstatfs sbuf;
125
126
if (!acct->check_space)
127
return acct->active;
128
129
/* May block */
130
if (vfs_statfs(&acct->file->f_path, &sbuf))
131
return acct->active;
132
133
if (acct->active) {
134
u64 suspend = sbuf.f_blocks * SUSPEND;
135
do_div(suspend, 100);
136
if (sbuf.f_bavail <= suspend) {
137
acct->active = false;
138
pr_info("Process accounting paused\n");
139
}
140
} else {
141
u64 resume = sbuf.f_blocks * RESUME;
142
do_div(resume, 100);
143
if (sbuf.f_bavail >= resume) {
144
acct->active = true;
145
pr_info("Process accounting resumed\n");
146
}
147
}
148
149
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
150
return acct->active;
151
}
152
153
static void acct_put(struct bsd_acct_struct *p)
154
{
155
if (atomic_long_dec_and_test(&p->count))
156
kfree_rcu(p, rcu);
157
}
158
159
static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
160
{
161
return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
162
}
163
164
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
165
{
166
struct bsd_acct_struct *res;
167
again:
168
smp_rmb();
169
rcu_read_lock();
170
res = to_acct(READ_ONCE(ns->bacct));
171
if (!res) {
172
rcu_read_unlock();
173
return NULL;
174
}
175
if (!atomic_long_inc_not_zero(&res->count)) {
176
rcu_read_unlock();
177
cpu_relax();
178
goto again;
179
}
180
rcu_read_unlock();
181
mutex_lock(&res->lock);
182
if (res != to_acct(READ_ONCE(ns->bacct))) {
183
mutex_unlock(&res->lock);
184
acct_put(res);
185
goto again;
186
}
187
return res;
188
}
189
190
static void acct_pin_kill(struct fs_pin *pin)
191
{
192
struct bsd_acct_struct *acct = to_acct(pin);
193
mutex_lock(&acct->lock);
194
/*
195
* Fill the accounting struct with the exiting task's info
196
* before punting to the workqueue.
197
*/
198
fill_ac(acct);
199
schedule_work(&acct->work);
200
wait_for_completion(&acct->done);
201
cmpxchg(&acct->ns->bacct, pin, NULL);
202
mutex_unlock(&acct->lock);
203
pin_remove(pin);
204
acct_put(acct);
205
}
206
207
static void close_work(struct work_struct *work)
208
{
209
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
210
struct file *file = acct->file;
211
212
/* We were fired by acct_pin_kill() which holds acct->lock. */
213
acct_write_process(acct);
214
if (file->f_op->flush)
215
file->f_op->flush(file, NULL);
216
__fput_sync(file);
217
complete(&acct->done);
218
}
219
220
static int acct_on(struct filename *pathname)
221
{
222
struct file *file;
223
struct vfsmount *mnt, *internal;
224
struct pid_namespace *ns = task_active_pid_ns(current);
225
struct bsd_acct_struct *acct;
226
struct fs_pin *old;
227
int err;
228
229
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
230
if (!acct)
231
return -ENOMEM;
232
233
/* Difference from BSD - they don't do O_APPEND */
234
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
235
if (IS_ERR(file)) {
236
kfree(acct);
237
return PTR_ERR(file);
238
}
239
240
if (!S_ISREG(file_inode(file)->i_mode)) {
241
kfree(acct);
242
filp_close(file, NULL);
243
return -EACCES;
244
}
245
246
/* Exclude kernel kernel internal filesystems. */
247
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
248
kfree(acct);
249
filp_close(file, NULL);
250
return -EINVAL;
251
}
252
253
/* Exclude procfs and sysfs. */
254
if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
255
kfree(acct);
256
filp_close(file, NULL);
257
return -EINVAL;
258
}
259
260
if (!(file->f_mode & FMODE_CAN_WRITE)) {
261
kfree(acct);
262
filp_close(file, NULL);
263
return -EIO;
264
}
265
internal = mnt_clone_internal(&file->f_path);
266
if (IS_ERR(internal)) {
267
kfree(acct);
268
filp_close(file, NULL);
269
return PTR_ERR(internal);
270
}
271
err = mnt_get_write_access(internal);
272
if (err) {
273
mntput(internal);
274
kfree(acct);
275
filp_close(file, NULL);
276
return err;
277
}
278
mnt = file->f_path.mnt;
279
file->f_path.mnt = internal;
280
281
atomic_long_set(&acct->count, 1);
282
init_fs_pin(&acct->pin, acct_pin_kill);
283
acct->file = file;
284
acct->needcheck = jiffies;
285
acct->ns = ns;
286
mutex_init(&acct->lock);
287
INIT_WORK(&acct->work, close_work);
288
init_completion(&acct->done);
289
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
290
pin_insert(&acct->pin, mnt);
291
292
rcu_read_lock();
293
old = xchg(&ns->bacct, &acct->pin);
294
mutex_unlock(&acct->lock);
295
pin_kill(old);
296
mnt_put_write_access(mnt);
297
mntput(mnt);
298
return 0;
299
}
300
301
static DEFINE_MUTEX(acct_on_mutex);
302
303
/**
304
* sys_acct - enable/disable process accounting
305
* @name: file name for accounting records or NULL to shutdown accounting
306
*
307
* sys_acct() is the only system call needed to implement process
308
* accounting. It takes the name of the file where accounting records
309
* should be written. If the filename is NULL, accounting will be
310
* shutdown.
311
*
312
* Returns: 0 for success or negative errno values for failure.
313
*/
314
SYSCALL_DEFINE1(acct, const char __user *, name)
315
{
316
int error = 0;
317
318
if (!capable(CAP_SYS_PACCT))
319
return -EPERM;
320
321
if (name) {
322
struct filename *tmp = getname(name);
323
324
if (IS_ERR(tmp))
325
return PTR_ERR(tmp);
326
mutex_lock(&acct_on_mutex);
327
error = acct_on(tmp);
328
mutex_unlock(&acct_on_mutex);
329
putname(tmp);
330
} else {
331
rcu_read_lock();
332
pin_kill(task_active_pid_ns(current)->bacct);
333
}
334
335
return error;
336
}
337
338
void acct_exit_ns(struct pid_namespace *ns)
339
{
340
rcu_read_lock();
341
pin_kill(ns->bacct);
342
}
343
344
/*
345
* encode an u64 into a comp_t
346
*
347
* This routine has been adopted from the encode_comp_t() function in
348
* the kern_acct.c file of the FreeBSD operating system. The encoding
349
* is a 13-bit fraction with a 3-bit (base 8) exponent.
350
*/
351
352
#define MANTSIZE 13 /* 13 bit mantissa. */
353
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
354
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
355
356
static comp_t encode_comp_t(u64 value)
357
{
358
int exp, rnd;
359
360
exp = rnd = 0;
361
while (value > MAXFRACT) {
362
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
363
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
364
exp++;
365
}
366
367
/*
368
* If we need to round up, do it (and handle overflow correctly).
369
*/
370
if (rnd && (++value > MAXFRACT)) {
371
value >>= EXPSIZE;
372
exp++;
373
}
374
375
if (exp > (((comp_t) ~0U) >> MANTSIZE))
376
return (comp_t) ~0U;
377
/*
378
* Clean it up and polish it off.
379
*/
380
exp <<= MANTSIZE; /* Shift the exponent into place */
381
exp += value; /* and add on the mantissa. */
382
return exp;
383
}
384
385
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
 * encode an u64 into a comp2_t (24 bits)
 *
 * Format: 5 bit base 2 exponent, 20 bits mantissa.
 * The leading bit of the mantissa is not stored, but implied for
 * non-zero exponents.
 * Largest encodable value is 50 bits.
 */

#define MANTSIZE2	20				/* 20 bit mantissa. */
#define EXPSIZE2	5				/* 5 bit base 2 exponent. */
#define MAXFRACT2	((1ul << MANTSIZE2) - 1)	/* Maximum fractional value. */
#define MAXEXP2		((1 << EXPSIZE2) - 1)		/* Maximum exponent. */

static comp2_t encode_comp2_t(u64 value)
{
	/* The exponent starts at 1 once the implied leading bit is in use. */
	int exp = (value > (MAXFRACT2>>1));
	int rnd = 0;

	/* Halve until the value fits, remembering the last bit dropped. */
	for (; value > MAXFRACT2; exp++) {
		rnd = value & 1;
		value >>= 1;
	}

	/* Round up if asked to; rounding may overflow the mantissa again. */
	if (rnd) {
		value++;
		if (value > MAXFRACT2) {
			value >>= 1;
			exp++;
		}
	}

	/* Overflow. Return largest representable number instead. */
	if (exp > MAXEXP2)
		return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;

	/* Strip the implied leading bit and place the exponent above it. */
	return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
#elif ACCT_VERSION == 3
/*
 * encode an u64 into a 32 bit IEEE float
 *
 * The value is shifted left until bit 63 is set; the 23 bits below that
 * leading bit become the mantissa and the remaining low bits are
 * discarded. The exponent starts at 190 (127 + 63) and drops by one for
 * every shift. Zero encodes as 0.
 */
static u32 encode_float(u64 value)
{
	unsigned exp = 190;
	unsigned mantissa;

	if (!value)
		return 0;

	/* Normalize: stop as soon as the top bit is set. */
	for (; (s64)value > 0; exp--)
		value <<= 1;

	mantissa = (u32)(value >> 40) & 0x7fffffu;
	return mantissa | (exp << 23);
}
#endif
446
447
/*
448
* Write an accounting entry for an exiting process
449
*
450
* The acct_process() call is the workhorse of the process
451
* accounting system. The struct acct is built here and then written
452
* into the accounting file. This function should only be called from
453
* do_exit() or when switching to a different output file.
454
*/
455
456
static void fill_ac(struct bsd_acct_struct *acct)
457
{
458
struct pacct_struct *pacct = &current->signal->pacct;
459
struct file *file = acct->file;
460
acct_t *ac = &acct->ac;
461
u64 elapsed, run_time;
462
time64_t btime;
463
struct tty_struct *tty;
464
465
lockdep_assert_held(&acct->lock);
466
467
if (time_is_after_jiffies(acct->needcheck)) {
468
acct->check_space = false;
469
470
/* Don't fill in @ac if nothing will be written. */
471
if (!acct->active)
472
return;
473
} else {
474
acct->check_space = true;
475
}
476
477
/*
478
* Fill the accounting struct with the needed info as recorded
479
* by the different kernel functions.
480
*/
481
memset(ac, 0, sizeof(acct_t));
482
483
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484
strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
485
486
/* calculate run_time in nsec*/
487
run_time = ktime_get_ns();
488
run_time -= current->group_leader->start_time;
489
/* convert nsec -> AHZ */
490
elapsed = nsec_to_AHZ(run_time);
491
#if ACCT_VERSION == 3
492
ac->ac_etime = encode_float(elapsed);
493
#else
494
ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
495
(unsigned long) elapsed : (unsigned long) -1l);
496
#endif
497
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
498
{
499
/* new enlarged etime field */
500
comp2_t etime = encode_comp2_t(elapsed);
501
502
ac->ac_etime_hi = etime >> 16;
503
ac->ac_etime_lo = (u16) etime;
504
}
505
#endif
506
do_div(elapsed, AHZ);
507
btime = ktime_get_real_seconds() - elapsed;
508
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
509
#if ACCT_VERSION == 2
510
ac->ac_ahz = AHZ;
511
#endif
512
513
spin_lock_irq(&current->sighand->siglock);
514
tty = current->signal->tty; /* Safe as we hold the siglock */
515
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
516
ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
517
ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
518
ac->ac_flag = pacct->ac_flag;
519
ac->ac_mem = encode_comp_t(pacct->ac_mem);
520
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
521
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
522
ac->ac_exitcode = pacct->ac_exitcode;
523
spin_unlock_irq(&current->sighand->siglock);
524
525
/* we really need to bite the bullet and change layout */
526
ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
527
ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
528
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
529
/* backward-compatible 16 bit fields */
530
ac->ac_uid16 = ac->ac_uid;
531
ac->ac_gid16 = ac->ac_gid;
532
#elif ACCT_VERSION == 3
533
{
534
struct pid_namespace *ns = acct->ns;
535
536
ac->ac_pid = task_tgid_nr_ns(current, ns);
537
rcu_read_lock();
538
ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
539
rcu_read_unlock();
540
}
541
#endif
542
}
543
544
static void acct_write_process(struct bsd_acct_struct *acct)
545
{
546
struct file *file = acct->file;
547
const struct cred *cred;
548
acct_t *ac = &acct->ac;
549
550
/* Perform file operations on behalf of whoever enabled accounting */
551
cred = override_creds(file->f_cred);
552
553
/*
554
* First check to see if there is enough free_space to continue
555
* the process accounting system. Then get freeze protection. If
556
* the fs is frozen, just skip the write as we could deadlock
557
* the system otherwise.
558
*/
559
if (check_free_space(acct) && file_start_write_trylock(file)) {
560
/* it's been opened O_APPEND, so position is irrelevant */
561
loff_t pos = 0;
562
__kernel_write(file, ac, sizeof(acct_t), &pos);
563
file_end_write(file);
564
}
565
566
revert_creds(cred);
567
}
568
569
static void do_acct_process(struct bsd_acct_struct *acct)
570
{
571
unsigned long flim;
572
573
/* Accounting records are not subject to resource limits. */
574
flim = rlimit(RLIMIT_FSIZE);
575
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
576
fill_ac(acct);
577
acct_write_process(acct);
578
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
579
}
580
581
/**
582
* acct_collect - collect accounting information into pacct_struct
583
* @exitcode: task exit code
584
* @group_dead: not 0, if this thread is the last one in the process.
585
*/
586
void acct_collect(long exitcode, int group_dead)
587
{
588
struct pacct_struct *pacct = &current->signal->pacct;
589
u64 utime, stime;
590
unsigned long vsize = 0;
591
592
if (group_dead && current->mm) {
593
struct mm_struct *mm = current->mm;
594
VMA_ITERATOR(vmi, mm, 0);
595
struct vm_area_struct *vma;
596
597
mmap_read_lock(mm);
598
for_each_vma(vmi, vma)
599
vsize += vma->vm_end - vma->vm_start;
600
mmap_read_unlock(mm);
601
}
602
603
spin_lock_irq(&current->sighand->siglock);
604
if (group_dead)
605
pacct->ac_mem = vsize / 1024;
606
if (thread_group_leader(current)) {
607
pacct->ac_exitcode = exitcode;
608
if (current->flags & PF_FORKNOEXEC)
609
pacct->ac_flag |= AFORK;
610
}
611
if (current->flags & PF_SUPERPRIV)
612
pacct->ac_flag |= ASU;
613
if (current->flags & PF_DUMPCORE)
614
pacct->ac_flag |= ACORE;
615
if (current->flags & PF_SIGNALED)
616
pacct->ac_flag |= AXSIG;
617
618
task_cputime(current, &utime, &stime);
619
pacct->ac_utime += utime;
620
pacct->ac_stime += stime;
621
pacct->ac_minflt += current->min_flt;
622
pacct->ac_majflt += current->maj_flt;
623
spin_unlock_irq(&current->sighand->siglock);
624
}
625
626
static void slow_acct_process(struct pid_namespace *ns)
627
{
628
for ( ; ns; ns = ns->parent) {
629
struct bsd_acct_struct *acct = acct_get(ns);
630
if (acct) {
631
do_acct_process(acct);
632
mutex_unlock(&acct->lock);
633
acct_put(acct);
634
}
635
}
636
}
637
638
/**
639
* acct_process - handles process accounting for an exiting task
640
*/
641
void acct_process(void)
642
{
643
struct pid_namespace *ns;
644
645
/*
646
* This loop is safe lockless, since current is still
647
* alive and holds its namespace, which in turn holds
648
* its parent.
649
*/
650
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
651
if (ns->bacct)
652
break;
653
}
654
if (unlikely(ns))
655
slow_acct_process(ns);
656
}
657
658