Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/ipc/mqueue.c
48939 views
1
/*
 * POSIX message queues filesystem for Linux.
 *
 * Copyright (C) 2003,2004  Krzysztof Benedyczak    ([email protected])
 *                          Michal Wronski          ([email protected])
 *
 * Spinlocks:               Mohamed Abbas           ([email protected])
 * Lockless receive & send, fd based notify:
 *                          Manfred Spraul          ([email protected])
 *
 * Audit:                   George Wilson           ([email protected])
 *
 * This file is released under the GPL.
 */
15
16
#include <linux/capability.h>
17
#include <linux/init.h>
18
#include <linux/pagemap.h>
19
#include <linux/file.h>
20
#include <linux/mount.h>
21
#include <linux/fs_context.h>
22
#include <linux/namei.h>
23
#include <linux/sysctl.h>
24
#include <linux/poll.h>
25
#include <linux/mqueue.h>
26
#include <linux/msg.h>
27
#include <linux/skbuff.h>
28
#include <linux/vmalloc.h>
29
#include <linux/netlink.h>
30
#include <linux/syscalls.h>
31
#include <linux/audit.h>
32
#include <linux/signal.h>
33
#include <linux/mutex.h>
34
#include <linux/nsproxy.h>
35
#include <linux/pid.h>
36
#include <linux/ipc_namespace.h>
37
#include <linux/user_namespace.h>
38
#include <linux/slab.h>
39
#include <linux/sched/wake_q.h>
40
#include <linux/sched/signal.h>
41
#include <linux/sched/user.h>
42
43
#include <net/sock.h>
44
#include "util.h"
45
46
struct mqueue_fs_context {
47
struct ipc_namespace *ipc_ns;
48
bool newns; /* Set if newly created ipc namespace */
49
};
50
51
/* Superblock magic stored in sb->s_magic by mqueue_fill_super() */
#define MQUEUE_MAGIC	0x19800202
/* i_size accounted on the directory inode per entry */
#define DIRENT_SIZE	20
/* i_size of a queue file; also the buffer size used by mqueue_read_file() */
#define FILENT_SIZE	80

/* Indices into mqueue_inode_info::e_wait_q[] */
#define SEND		0
#define RECV		1

/* Values of ext_wait_queue::state */
#define STATE_NONE	0
#define STATE_READY	1
60
61
struct posix_msg_tree_node {
62
struct rb_node rb_node;
63
struct list_head msg_list;
64
int priority;
65
};
66
67
/*
 * Locking:
 *
 * Accesses to a message queue are synchronized by acquiring info->lock.
 *
 * There are two notable exceptions:
 * - The actual wakeup of a sleeping task is performed using the wake_q
 *   framework. info->lock is already released when wake_up_q is called.
 * - The exit codepaths after sleeping check ext_wait_queue->state without
 *   any locks. If it is STATE_READY, then the syscall is completed without
 *   acquiring info->lock.
 *
 * MQ_BARRIER:
 * To achieve proper release/acquire memory barrier pairing, the state is set to
 * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
 * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
 *
 * This prevents the following races:
 *
 * 1) With the simple wake_q_add(), the task could be gone already before
 *    the increase of the reference happens
 *
 *    Thread A                              Thread B
 *    WRITE_ONCE(wait.state, STATE_NONE);
 *    schedule_hrtimeout()
 *                                          wake_q_add(A)
 *                                          if (cmpxchg()) // success
 *                                             ->state = STATE_READY (reordered)
 *    <timeout returns>
 *    if (wait.state == STATE_READY) return;
 *    sysret to user space
 *    sys_exit()
 *                                          get_task_struct() // UaF
 *
 *    Solution: Use wake_q_add_safe() and perform the get_task_struct() before
 *    the smp_store_release() that does ->state = STATE_READY.
 *
 * 2) Without proper _release/_acquire barriers, the woken up task
 *    could read stale data
 *
 *    Thread A                              Thread B
 *    do_mq_timedreceive
 *    WRITE_ONCE(wait.state, STATE_NONE);
 *    schedule_hrtimeout()
 *                                          state = STATE_READY;
 *    <timeout returns>
 *    if (wait.state == STATE_READY) return;
 *    msg_ptr = wait.msg; // Access to stale data!
 *                                          receiver->msg = message; (reordered)
 *
 *    Solution: use _release and _acquire barriers.
 *
 * 3) There is intentionally no barrier when setting current->state
 *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
 *    release memory barrier, and the wakeup is triggered when holding
 *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
 *    acquire memory barrier.
 */
126
127
struct ext_wait_queue { /* queue of sleeping tasks */
128
struct task_struct *task;
129
struct list_head list;
130
struct msg_msg *msg; /* ptr of loaded message */
131
int state; /* one of STATE_* values */
132
};
133
134
struct mqueue_inode_info {
135
spinlock_t lock;
136
struct inode vfs_inode;
137
wait_queue_head_t wait_q;
138
139
struct rb_root msg_tree;
140
struct rb_node *msg_tree_rightmost;
141
struct posix_msg_tree_node *node_cache;
142
struct mq_attr attr;
143
144
struct sigevent notify;
145
struct pid *notify_owner;
146
u32 notify_self_exec_id;
147
struct user_namespace *notify_user_ns;
148
struct ucounts *ucounts; /* user who created, for accounting */
149
struct sock *notify_sock;
150
struct sk_buff *notify_cookie;
151
152
/* for tasks waiting for free space and messages, respectively */
153
struct ext_wait_queue e_wait_q[2];
154
155
unsigned long qsize; /* size of queue in memory (sum of all msgs) */
156
};
157
158
static struct file_system_type mqueue_fs_type;
159
static const struct inode_operations mqueue_dir_inode_operations;
160
static const struct file_operations mqueue_file_operations;
161
static const struct super_operations mqueue_super_ops;
162
static const struct fs_context_operations mqueue_fs_context_ops;
163
static void remove_notification(struct mqueue_inode_info *info);
164
165
static struct kmem_cache *mqueue_inode_cachep;
166
167
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
168
{
169
return container_of(inode, struct mqueue_inode_info, vfs_inode);
170
}
171
172
/*
173
* This routine should be called with the mq_lock held.
174
*/
175
static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
176
{
177
return get_ipc_ns(inode->i_sb->s_fs_info);
178
}
179
180
static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
181
{
182
struct ipc_namespace *ns;
183
184
spin_lock(&mq_lock);
185
ns = __get_ns_from_inode(inode);
186
spin_unlock(&mq_lock);
187
return ns;
188
}
189
190
/* Auxiliary functions to manipulate messages' list */
191
static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
192
{
193
struct rb_node **p, *parent = NULL;
194
struct posix_msg_tree_node *leaf;
195
bool rightmost = true;
196
197
p = &info->msg_tree.rb_node;
198
while (*p) {
199
parent = *p;
200
leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
201
202
if (likely(leaf->priority == msg->m_type))
203
goto insert_msg;
204
else if (msg->m_type < leaf->priority) {
205
p = &(*p)->rb_left;
206
rightmost = false;
207
} else
208
p = &(*p)->rb_right;
209
}
210
if (info->node_cache) {
211
leaf = info->node_cache;
212
info->node_cache = NULL;
213
} else {
214
leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
215
if (!leaf)
216
return -ENOMEM;
217
INIT_LIST_HEAD(&leaf->msg_list);
218
}
219
leaf->priority = msg->m_type;
220
221
if (rightmost)
222
info->msg_tree_rightmost = &leaf->rb_node;
223
224
rb_link_node(&leaf->rb_node, parent, p);
225
rb_insert_color(&leaf->rb_node, &info->msg_tree);
226
insert_msg:
227
info->attr.mq_curmsgs++;
228
info->qsize += msg->m_ts;
229
list_add_tail(&msg->m_list, &leaf->msg_list);
230
return 0;
231
}
232
233
static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
234
struct mqueue_inode_info *info)
235
{
236
struct rb_node *node = &leaf->rb_node;
237
238
if (info->msg_tree_rightmost == node)
239
info->msg_tree_rightmost = rb_prev(node);
240
241
rb_erase(node, &info->msg_tree);
242
if (info->node_cache)
243
kfree(leaf);
244
else
245
info->node_cache = leaf;
246
}
247
248
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
249
{
250
struct rb_node *parent = NULL;
251
struct posix_msg_tree_node *leaf;
252
struct msg_msg *msg;
253
254
try_again:
255
/*
256
* During insert, low priorities go to the left and high to the
257
* right. On receive, we want the highest priorities first, so
258
* walk all the way to the right.
259
*/
260
parent = info->msg_tree_rightmost;
261
if (!parent) {
262
if (info->attr.mq_curmsgs) {
263
pr_warn_once("Inconsistency in POSIX message queue, "
264
"no tree element, but supposedly messages "
265
"should exist!\n");
266
info->attr.mq_curmsgs = 0;
267
}
268
return NULL;
269
}
270
leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
271
if (unlikely(list_empty(&leaf->msg_list))) {
272
pr_warn_once("Inconsistency in POSIX message queue, "
273
"empty leaf node but we haven't implemented "
274
"lazy leaf delete!\n");
275
msg_tree_erase(leaf, info);
276
goto try_again;
277
} else {
278
msg = list_first_entry(&leaf->msg_list,
279
struct msg_msg, m_list);
280
list_del(&msg->m_list);
281
if (list_empty(&leaf->msg_list)) {
282
msg_tree_erase(leaf, info);
283
}
284
}
285
info->attr.mq_curmsgs--;
286
info->qsize -= msg->m_ts;
287
return msg;
288
}
289
290
/*
 * Allocate and initialize an inode on @sb.
 *
 * For regular files (a message queue) the mqueue_inode_info is set up, the
 * queue limits are taken from @attr (or from the namespace defaults when
 * @attr is NULL), validated, and the worst-case memory footprint is charged
 * against the creator's RLIMIT_MSGQUEUE. For directories only the inode
 * and file operations are installed.
 *
 * Returns the inode, or an ERR_PTR():
 *   -ENOMEM    new_inode() failed
 *   -EINVAL    mq_maxmsg/mq_msgsize non-positive or above the allowed maximum
 *   -EOVERFLOW the queue size computation would overflow
 *   -EMFILE    the RLIMIT_MSGQUEUE charge failed
 */
static struct inode *mqueue_get_inode(struct super_block *sb,
		struct ipc_namespace *ipc_ns, umode_t mode,
		struct mq_attr *attr)
{
	struct inode *inode;
	int ret = -ENOMEM;

	inode = new_inode(sb);
	if (!inode)
		goto err;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	simple_inode_init_ts(inode);

	if (S_ISREG(mode)) {
		struct mqueue_inode_info *info;
		unsigned long mq_bytes, mq_treesize;

		inode->i_fop = &mqueue_file_operations;
		inode->i_size = FILENT_SIZE;
		/* mqueue specific info */
		info = MQUEUE_I(inode);
		spin_lock_init(&info->lock);
		init_waitqueue_head(&info->wait_q);
		INIT_LIST_HEAD(&info->e_wait_q[0].list);
		INIT_LIST_HEAD(&info->e_wait_q[1].list);
		info->notify_owner = NULL;
		info->notify_user_ns = NULL;
		info->qsize = 0;
		info->ucounts = NULL;	/* set when all is ok */
		info->msg_tree = RB_ROOT;
		info->msg_tree_rightmost = NULL;
		info->node_cache = NULL;
		memset(&info->attr, 0, sizeof(info->attr));
		info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
					   ipc_ns->mq_msg_default);
		info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
					    ipc_ns->mq_msgsize_default);
		if (attr) {
			info->attr.mq_maxmsg = attr->mq_maxmsg;
			info->attr.mq_msgsize = attr->mq_msgsize;
		}
		/*
		 * We used to allocate a static array of pointers and account
		 * the size of that array as well as one msg_msg struct per
		 * possible message into the queue size. That's no longer
		 * accurate as the queue is now an rbtree and will grow and
		 * shrink depending on usage patterns.  We can, however, still
		 * account one msg_msg struct per message, but the nodes are
		 * allocated depending on priority usage, and most programs
		 * only use one, or a handful, of priorities.  However, since
		 * this is pinned memory, we need to assume worst case, so
		 * that means the min(mq_maxmsg, max_priorities) * struct
		 * posix_msg_tree_node.
		 */

		ret = -EINVAL;
		if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0)
			goto out_inode;
		/* CAP_SYS_RESOURCE lifts the namespace limits up to the hard caps */
		if (capable(CAP_SYS_RESOURCE)) {
			if (info->attr.mq_maxmsg > HARD_MSGMAX ||
			    info->attr.mq_msgsize > HARD_MSGSIZEMAX)
				goto out_inode;
		} else {
			if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max ||
			    info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)
				goto out_inode;
		}
		ret = -EOVERFLOW;
		/* check for overflow */
		if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)
			goto out_inode;
		mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
			min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
			sizeof(struct posix_msg_tree_node);
		mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
		if (mq_bytes + mq_treesize < mq_bytes)
			goto out_inode;
		mq_bytes += mq_treesize;
		info->ucounts = get_ucounts(current_ucounts());
		if (info->ucounts) {
			long msgqueue;

			spin_lock(&mq_lock);
			msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
			if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
				dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
				spin_unlock(&mq_lock);
				put_ucounts(info->ucounts);
				/*
				 * The charge was undone above; with ucounts
				 * back to NULL the info->ucounts check in
				 * mqueue_evict_inode() skips a second uncharge.
				 */
				info->ucounts = NULL;
				ret = -EMFILE;
				goto out_inode;
			}
			spin_unlock(&mq_lock);
		}
	} else if (S_ISDIR(mode)) {
		inc_nlink(inode);
		/* Some things misbehave if size == 0 on a directory */
		inode->i_size = 2 * DIRENT_SIZE;
		inode->i_op = &mqueue_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
	}

	return inode;
out_inode:
	iput(inode);
err:
	return ERR_PTR(ret);
}
403
404
static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)
405
{
406
struct inode *inode;
407
struct ipc_namespace *ns = sb->s_fs_info;
408
409
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
410
sb->s_blocksize = PAGE_SIZE;
411
sb->s_blocksize_bits = PAGE_SHIFT;
412
sb->s_magic = MQUEUE_MAGIC;
413
sb->s_op = &mqueue_super_ops;
414
sb->s_d_flags = DCACHE_DONTCACHE;
415
416
inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
417
if (IS_ERR(inode))
418
return PTR_ERR(inode);
419
420
sb->s_root = d_make_root(inode);
421
if (!sb->s_root)
422
return -ENOMEM;
423
return 0;
424
}
425
426
static int mqueue_get_tree(struct fs_context *fc)
427
{
428
struct mqueue_fs_context *ctx = fc->fs_private;
429
430
/*
431
* With a newly created ipc namespace, we don't need to do a search
432
* for an ipc namespace match, but we still need to set s_fs_info.
433
*/
434
if (ctx->newns) {
435
fc->s_fs_info = ctx->ipc_ns;
436
return get_tree_nodev(fc, mqueue_fill_super);
437
}
438
return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);
439
}
440
441
static void mqueue_fs_context_free(struct fs_context *fc)
442
{
443
struct mqueue_fs_context *ctx = fc->fs_private;
444
445
put_ipc_ns(ctx->ipc_ns);
446
kfree(ctx);
447
}
448
449
static int mqueue_init_fs_context(struct fs_context *fc)
450
{
451
struct mqueue_fs_context *ctx;
452
453
ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL);
454
if (!ctx)
455
return -ENOMEM;
456
457
ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
458
put_user_ns(fc->user_ns);
459
fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
460
fc->fs_private = ctx;
461
fc->ops = &mqueue_fs_context_ops;
462
return 0;
463
}
464
465
/*
466
* mq_init_ns() is currently the only caller of mq_create_mount().
467
* So the ns parameter is always a newly created ipc namespace.
468
*/
469
static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
470
{
471
struct mqueue_fs_context *ctx;
472
struct fs_context *fc;
473
struct vfsmount *mnt;
474
475
fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT);
476
if (IS_ERR(fc))
477
return ERR_CAST(fc);
478
479
ctx = fc->fs_private;
480
ctx->newns = true;
481
put_ipc_ns(ctx->ipc_ns);
482
ctx->ipc_ns = get_ipc_ns(ns);
483
put_user_ns(fc->user_ns);
484
fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
485
486
mnt = fc_mount_longterm(fc);
487
put_fs_context(fc);
488
return mnt;
489
}
490
491
static void init_once(void *foo)
492
{
493
struct mqueue_inode_info *p = foo;
494
495
inode_init_once(&p->vfs_inode);
496
}
497
498
static struct inode *mqueue_alloc_inode(struct super_block *sb)
499
{
500
struct mqueue_inode_info *ei;
501
502
ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL);
503
if (!ei)
504
return NULL;
505
return &ei->vfs_inode;
506
}
507
508
static void mqueue_free_inode(struct inode *inode)
509
{
510
kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
511
}
512
513
static void mqueue_evict_inode(struct inode *inode)
514
{
515
struct mqueue_inode_info *info;
516
struct ipc_namespace *ipc_ns;
517
struct msg_msg *msg, *nmsg;
518
LIST_HEAD(tmp_msg);
519
520
clear_inode(inode);
521
522
if (S_ISDIR(inode->i_mode))
523
return;
524
525
ipc_ns = get_ns_from_inode(inode);
526
info = MQUEUE_I(inode);
527
spin_lock(&info->lock);
528
while ((msg = msg_get(info)) != NULL)
529
list_add_tail(&msg->m_list, &tmp_msg);
530
kfree(info->node_cache);
531
spin_unlock(&info->lock);
532
533
list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
534
list_del(&msg->m_list);
535
free_msg(msg);
536
}
537
538
if (info->ucounts) {
539
unsigned long mq_bytes, mq_treesize;
540
541
/* Total amount of bytes accounted for the mqueue */
542
mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
543
min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
544
sizeof(struct posix_msg_tree_node);
545
546
mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
547
info->attr.mq_msgsize);
548
549
spin_lock(&mq_lock);
550
dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
551
/*
552
* get_ns_from_inode() ensures that the
553
* (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
554
* to which we now hold a reference, or it is NULL.
555
* We can't put it here under mq_lock, though.
556
*/
557
if (ipc_ns)
558
ipc_ns->mq_queues_count--;
559
spin_unlock(&mq_lock);
560
put_ucounts(info->ucounts);
561
info->ucounts = NULL;
562
}
563
if (ipc_ns)
564
put_ipc_ns(ipc_ns);
565
}
566
567
/*
 * Create a new queue for @dentry with the attributes in @arg
 * (a struct mq_attr pointer, or NULL for the namespace defaults).
 *
 * The namespace's queue count is reserved up front under mq_lock and
 * rolled back if inode creation fails.
 *
 * Returns 0 on success, -EACCES if the directory has no ipc namespace,
 * -ENOSPC if the namespace queue limit is reached (and the caller lacks
 * CAP_SYS_RESOURCE), or the error from mqueue_get_inode().
 */
static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct inode *inode;
	struct mq_attr *attr = arg;
	int error;
	struct ipc_namespace *ipc_ns;

	spin_lock(&mq_lock);
	ipc_ns = __get_ns_from_inode(dir);
	if (!ipc_ns) {
		error = -EACCES;
		goto out_unlock;
	}

	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
	    !capable(CAP_SYS_RESOURCE)) {
		error = -ENOSPC;
		goto out_unlock;
	}
	ipc_ns->mq_queues_count++;
	spin_unlock(&mq_lock);

	inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		/* retake mq_lock: undo the reservation, out_unlock drops it */
		spin_lock(&mq_lock);
		ipc_ns->mq_queues_count--;
		goto out_unlock;
	}

	put_ipc_ns(ipc_ns);
	dir->i_size += DIRENT_SIZE;
	simple_inode_init_ts(dir);

	d_make_persistent(dentry, inode);
	return 0;
out_unlock:
	spin_unlock(&mq_lock);
	if (ipc_ns)
		put_ipc_ns(ipc_ns);
	return error;
}
610
611
/*
 * Create a queue with default attributes. Thin wrapper around
 * mqueue_create_attr(); @idmap, @dir and @excl are not used.
 */
static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir,
			 struct dentry *dentry, umode_t mode, bool excl)
{
	return mqueue_create_attr(dentry, mode, NULL);
}
616
617
static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
618
{
619
dir->i_size -= DIRENT_SIZE;
620
return simple_unlink(dir, dentry);
621
}
622
623
/*
624
* This is routine for system read from queue file.
625
* To avoid mess with doing here some sort of mq_receive we allow
626
* to read only queue size & notification info (the only values
627
* that are interesting from user point of view and aren't accessible
628
* through std routines)
629
*/
630
static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
631
size_t count, loff_t *off)
632
{
633
struct inode *inode = file_inode(filp);
634
struct mqueue_inode_info *info = MQUEUE_I(inode);
635
char buffer[FILENT_SIZE];
636
ssize_t ret;
637
638
spin_lock(&info->lock);
639
snprintf(buffer, sizeof(buffer),
640
"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
641
info->qsize,
642
info->notify_owner ? info->notify.sigev_notify : 0,
643
(info->notify_owner &&
644
info->notify.sigev_notify == SIGEV_SIGNAL) ?
645
info->notify.sigev_signo : 0,
646
pid_vnr(info->notify_owner));
647
spin_unlock(&info->lock);
648
buffer[sizeof(buffer)-1] = '\0';
649
650
ret = simple_read_from_buffer(u_data, count, off, buffer,
651
strlen(buffer));
652
if (ret <= 0)
653
return ret;
654
655
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
656
return ret;
657
}
658
659
/*
 * Flush handler: if the calling thread group is the one registered for
 * notification on this queue, drop that registration. Always returns 0.
 */
static int mqueue_flush_file(struct file *filp, fl_owner_t id)
{
	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));

	spin_lock(&info->lock);
	if (task_tgid(current) == info->notify_owner)
		remove_notification(info);

	spin_unlock(&info->lock);
	return 0;
}
670
671
static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
672
{
673
struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
674
__poll_t retval = 0;
675
676
poll_wait(filp, &info->wait_q, poll_tab);
677
678
spin_lock(&info->lock);
679
if (info->attr.mq_curmsgs)
680
retval = EPOLLIN | EPOLLRDNORM;
681
682
if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
683
retval |= EPOLLOUT | EPOLLWRNORM;
684
spin_unlock(&info->lock);
685
686
return retval;
687
}
688
689
/* Adds current to info->e_wait_q[sr] before element with smaller prio */
690
static void wq_add(struct mqueue_inode_info *info, int sr,
691
struct ext_wait_queue *ewp)
692
{
693
struct ext_wait_queue *walk;
694
695
list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
696
if (walk->task->prio <= current->prio) {
697
list_add_tail(&ewp->list, &walk->list);
698
return;
699
}
700
}
701
list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
702
}
703
704
/*
705
* Puts current task to sleep. Caller must hold queue lock. After return
706
* lock isn't held.
707
* sr: SEND or RECV
708
*/
709
static int wq_sleep(struct mqueue_inode_info *info, int sr,
710
ktime_t *timeout, struct ext_wait_queue *ewp)
711
__releases(&info->lock)
712
{
713
int retval;
714
signed long time;
715
716
wq_add(info, sr, ewp);
717
718
for (;;) {
719
/* memory barrier not required, we hold info->lock */
720
__set_current_state(TASK_INTERRUPTIBLE);
721
722
spin_unlock(&info->lock);
723
time = schedule_hrtimeout_range_clock(timeout, 0,
724
HRTIMER_MODE_ABS, CLOCK_REALTIME);
725
726
if (READ_ONCE(ewp->state) == STATE_READY) {
727
/* see MQ_BARRIER for purpose/pairing */
728
smp_acquire__after_ctrl_dep();
729
retval = 0;
730
goto out;
731
}
732
spin_lock(&info->lock);
733
734
/* we hold info->lock, so no memory barrier required */
735
if (READ_ONCE(ewp->state) == STATE_READY) {
736
retval = 0;
737
goto out_unlock;
738
}
739
if (signal_pending(current)) {
740
retval = -ERESTARTSYS;
741
break;
742
}
743
if (time == 0) {
744
retval = -ETIMEDOUT;
745
break;
746
}
747
}
748
list_del(&ewp->list);
749
out_unlock:
750
spin_unlock(&info->lock);
751
out:
752
return retval;
753
}
754
755
/*
756
* Returns waiting task that should be serviced first or NULL if none exists
757
*/
758
static struct ext_wait_queue *wq_get_first_waiter(
759
struct mqueue_inode_info *info, int sr)
760
{
761
struct list_head *ptr;
762
763
ptr = info->e_wait_q[sr].list.prev;
764
if (ptr == &info->e_wait_q[sr].list)
765
return NULL;
766
return list_entry(ptr, struct ext_wait_queue, list);
767
}
768
769
770
static inline void set_cookie(struct sk_buff *skb, char code)
771
{
772
((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
773
}
774
775
/*
776
* The next function is only to split too long sys_mq_timedsend
777
*/
778
static void __do_notify(struct mqueue_inode_info *info)
779
{
780
/* notification
781
* invoked when there is registered process and there isn't process
782
* waiting synchronously for message AND state of queue changed from
783
* empty to not empty. Here we are sure that no one is waiting
784
* synchronously. */
785
if (info->notify_owner &&
786
info->attr.mq_curmsgs == 1) {
787
switch (info->notify.sigev_notify) {
788
case SIGEV_NONE:
789
break;
790
case SIGEV_SIGNAL: {
791
struct kernel_siginfo sig_i;
792
struct task_struct *task;
793
794
/* do_mq_notify() accepts sigev_signo == 0, why?? */
795
if (!info->notify.sigev_signo)
796
break;
797
798
clear_siginfo(&sig_i);
799
sig_i.si_signo = info->notify.sigev_signo;
800
sig_i.si_errno = 0;
801
sig_i.si_code = SI_MESGQ;
802
sig_i.si_value = info->notify.sigev_value;
803
rcu_read_lock();
804
/* map current pid/uid into info->owner's namespaces */
805
sig_i.si_pid = task_tgid_nr_ns(current,
806
ns_of_pid(info->notify_owner));
807
sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
808
current_uid());
809
/*
810
* We can't use kill_pid_info(), this signal should
811
* bypass check_kill_permission(). It is from kernel
812
* but si_fromuser() can't know this.
813
* We do check the self_exec_id, to avoid sending
814
* signals to programs that don't expect them.
815
*/
816
task = pid_task(info->notify_owner, PIDTYPE_TGID);
817
if (task && task->self_exec_id ==
818
info->notify_self_exec_id) {
819
do_send_sig_info(info->notify.sigev_signo,
820
&sig_i, task, PIDTYPE_TGID);
821
}
822
rcu_read_unlock();
823
break;
824
}
825
case SIGEV_THREAD:
826
set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
827
netlink_sendskb(info->notify_sock, info->notify_cookie);
828
break;
829
}
830
/* after notification unregisters process */
831
put_pid(info->notify_owner);
832
put_user_ns(info->notify_user_ns);
833
info->notify_owner = NULL;
834
info->notify_user_ns = NULL;
835
}
836
wake_up(&info->wait_q);
837
}
838
839
/*
 * Copy an absolute timeout from user space into @ts and validate it.
 *
 * Returns 0 on success, -EFAULT if the copy fails, or -EINVAL if the
 * value does not pass timespec64_valid().
 */
static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
			   struct timespec64 *ts)
{
	if (get_timespec64(ts, u_abs_timeout))
		return -EFAULT;
	if (!timespec64_valid(ts))
		return -EINVAL;
	return 0;
}
848
849
static void remove_notification(struct mqueue_inode_info *info)
850
{
851
if (info->notify_owner != NULL &&
852
info->notify.sigev_notify == SIGEV_THREAD) {
853
set_cookie(info->notify_cookie, NOTIFY_REMOVED);
854
netlink_sendskb(info->notify_sock, info->notify_cookie);
855
}
856
put_pid(info->notify_owner);
857
put_user_ns(info->notify_user_ns);
858
info->notify_owner = NULL;
859
info->notify_user_ns = NULL;
860
}
861
862
static int prepare_open(struct dentry *dentry, int oflag, int ro,
863
umode_t mode, struct filename *name,
864
struct mq_attr *attr)
865
{
866
static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
867
MAY_READ | MAY_WRITE };
868
int acc;
869
870
if (d_really_is_negative(dentry)) {
871
if (!(oflag & O_CREAT))
872
return -ENOENT;
873
if (ro)
874
return ro;
875
audit_inode_parent_hidden(name, dentry->d_parent);
876
return vfs_mkobj(dentry, mode & ~current_umask(),
877
mqueue_create_attr, attr);
878
}
879
/* it already existed */
880
audit_inode(name, dentry, 0);
881
if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
882
return -EEXIST;
883
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
884
return -EINVAL;
885
acc = oflag2acc[oflag & O_ACCMODE];
886
return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);
887
}
888
889
static struct file *mqueue_file_open(struct filename *name,
890
struct vfsmount *mnt, int oflag, int ro,
891
umode_t mode, struct mq_attr *attr)
892
{
893
struct dentry *dentry;
894
struct file *file;
895
int ret;
896
897
dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name));
898
if (IS_ERR(dentry))
899
return ERR_CAST(dentry);
900
901
ret = prepare_open(dentry, oflag, ro, mode, name, attr);
902
file = ERR_PTR(ret);
903
if (!ret) {
904
const struct path path = { .mnt = mnt, .dentry = dentry };
905
file = dentry_open(&path, oflag, current_cred());
906
}
907
908
end_creating(dentry);
909
return file;
910
}
911
912
/*
 * Common implementation of mq_open(): open (and possibly create) the
 * queue named by the user string @u_name in the caller's ipc namespace
 * and install it as an O_CLOEXEC file descriptor.
 *
 * @attr is the kernel copy of the user's mq_attr, or NULL for defaults.
 *
 * Returns the new descriptor, or a negative errno.
 */
static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
		      struct mq_attr *attr)
{
	/* released via putname() when it goes out of scope */
	struct filename *name __free(putname) = NULL;
	struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
	int fd, ro;

	audit_mq_open(oflag, mode, attr);

	name = getname(u_name);
	if (IS_ERR(name))
		return PTR_ERR(name);

	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
	fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr));
	if (!ro)
		mnt_drop_write(mnt);
	return fd;
}
931
932
/*
 * mq_open() system call entry: copy the optional attribute block from
 * user space and hand over to do_mq_open().
 *
 * Returns -EFAULT if @u_attr is set but cannot be copied, otherwise the
 * result of do_mq_open().
 */
SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
		struct mq_attr __user *, u_attr)
{
	struct mq_attr attr;
	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
		return -EFAULT;

	return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);
}
941
942
/*
 * mq_unlink() system call entry: remove the queue named by @u_name from
 * the caller's ipc namespace mount.
 *
 * Returns 0 on success or a negative errno from getname(),
 * mnt_want_write(), the lookup, or vfs_unlink().
 */
SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
{
	int err;
	struct filename *name;
	struct dentry *dentry;
	struct inode *inode;
	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
	struct vfsmount *mnt = ipc_ns->mq_mnt;

	name = getname(u_name);
	if (IS_ERR(name))
		return PTR_ERR(name);

	audit_inode_parent_hidden(name, mnt->mnt_root);
	err = mnt_want_write(mnt);
	if (err)
		goto out_name;
	dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name));
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out_drop_write;
	}

	inode = d_inode(dentry);
	/* hold the inode across the unlink; released by iput() below */
	ihold(inode);
	err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root),
			 dentry, NULL);
	end_removing(dentry);
	iput(inode);

out_drop_write:
	mnt_drop_write(mnt);
out_name:
	putname(name);

	return err;
}
979
980
/* Pipelined send and receive functions.
981
*
982
* If a receiver finds no waiting message, then it registers itself in the
983
* list of waiting receivers. A sender checks that list before adding the new
984
* message into the message array. If there is a waiting receiver, then it
985
* bypasses the message array and directly hands the message over to the
986
* receiver. The receiver accepts the message and returns without grabbing the
987
* queue spinlock:
988
*
989
* - Set pointer to message.
990
* - Queue the receiver task for later wakeup (without the info->lock).
991
* - Update its state to STATE_READY. Now the receiver can continue.
992
* - Wake up the process after the lock is dropped. Should the process wake up
993
* before this wakeup (due to a timeout or a signal) it will either see
994
* STATE_READY and continue or acquire the lock to check the state again.
995
*
996
* The same algorithm is used for senders.
997
*/
998
999
static inline void __pipelined_op(struct wake_q_head *wake_q,
1000
struct mqueue_inode_info *info,
1001
struct ext_wait_queue *this)
1002
{
1003
struct task_struct *task;
1004
1005
list_del(&this->list);
1006
task = get_task_struct(this->task);
1007
1008
/* see MQ_BARRIER for purpose/pairing */
1009
smp_store_release(&this->state, STATE_READY);
1010
wake_q_add_safe(wake_q, task);
1011
}
1012
1013
/* pipelined_send() - send a message directly to the task waiting in
1014
* sys_mq_timedreceive() (without inserting message into a queue).
1015
*/
1016
static inline void pipelined_send(struct wake_q_head *wake_q,
1017
struct mqueue_inode_info *info,
1018
struct msg_msg *message,
1019
struct ext_wait_queue *receiver)
1020
{
1021
receiver->msg = message;
1022
__pipelined_op(wake_q, info, receiver);
1023
}
1024
1025
/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
1026
* gets its message and put to the queue (we have one free place for sure). */
1027
static inline void pipelined_receive(struct wake_q_head *wake_q,
1028
struct mqueue_inode_info *info)
1029
{
1030
struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
1031
1032
if (!sender) {
1033
/* for poll */
1034
wake_up_interruptible(&info->wait_q);
1035
return;
1036
}
1037
if (msg_insert(sender->msg, info))
1038
return;
1039
1040
__pipelined_op(wake_q, info, sender);
1041
}
1042
1043
/*
 * do_mq_timedsend() - shared back end of the mq_timedsend system calls.
 * @mqdes:     message queue descriptor
 * @u_msg_ptr: user buffer holding the payload
 * @msg_len:   payload length; must not exceed the queue's mq_msgsize
 * @msg_prio:  priority; must be below MQ_PRIO_MAX
 * @ts:        optional timeout handed to wq_sleep(), NULL when absent
 *
 * Under info->lock the message is either handed directly to a waiting
 * receiver or inserted into the queue. When the queue is full, O_NONBLOCK
 * descriptors get -EAGAIN and everyone else sleeps in wq_sleep().
 *
 * Returns 0 on success or a negative errno. Once the message has been
 * loaded, any non-zero result frees it before returning.
 */
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
		size_t msg_len, unsigned int msg_prio,
		struct timespec64 *ts)
{
	struct posix_msg_tree_node *spare_node = NULL;
	struct mqueue_inode_info *info;
	struct ext_wait_queue *receiver;
	struct ext_wait_queue self;
	struct msg_msg *msg;
	struct inode *inode;
	ktime_t deadline, *timeout = NULL;
	int ret = 0;
	DEFINE_WAKE_Q(wake_q);

	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
		return -EINVAL;

	if (ts) {
		deadline = timespec64_to_ktime(*ts);
		timeout = &deadline;
	}

	audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);

	CLASS(fd, f)(mqdes);
	if (fd_empty(f))
		return -EBADF;

	/* Only descriptors that really refer to a message queue qualify. */
	inode = file_inode(fd_file(f));
	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
		return -EBADF;
	info = MQUEUE_I(inode);
	audit_file(fd_file(f));

	if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(msg_len > info->attr.mq_msgsize))
		return -EMSGSIZE;

	/*
	 * Allocate the message up front, before touching any existing
	 * queue state.
	 */
	msg = load_msg(u_msg_ptr, msg_len);
	if (IS_ERR(msg))
		return PTR_ERR(msg);
	msg->m_ts = msg_len;
	msg->m_type = msg_prio;

	/*
	 * msg_insert really wants us to have a valid, spare node struct so
	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
	 * fall back to that if necessary.
	 */
	if (!info->node_cache)
		spare_node = kmalloc(sizeof(*spare_node), GFP_KERNEL);

	spin_lock(&info->lock);

	if (!info->node_cache && spare_node) {
		/* Park the speculative allocation in the cache. */
		INIT_LIST_HEAD(&spare_node->msg_list);
		info->node_cache = spare_node;
		spare_node = NULL;
	} else {
		/* Cache was already populated (or the allocation failed). */
		kfree(spare_node);
	}

	if (info->attr.mq_curmsgs != info->attr.mq_maxmsg) {
		/* There is room: prefer a direct hand-off to a sleeper. */
		receiver = wq_get_first_waiter(info, RECV);
		if (!receiver) {
			/* adds message to the queue */
			ret = msg_insert(msg, info);
			if (ret)
				goto out_unlock;
			__do_notify(info);
		} else {
			pipelined_send(&wake_q, info, msg, receiver);
		}
		simple_inode_init_ts(inode);
	} else if (fd_file(f)->f_flags & O_NONBLOCK) {
		/* Queue full and the caller refuses to block. */
		ret = -EAGAIN;
	} else {
		self.task = current;
		self.msg = (void *) msg;

		/* memory barrier not required, we hold info->lock */
		WRITE_ONCE(self.state, STATE_NONE);
		ret = wq_sleep(info, SEND, timeout, &self);
		/*
		 * wq_sleep must be called with info->lock held, and
		 * returns with the lock released
		 */
		goto out_free;
	}
out_unlock:
	spin_unlock(&info->lock);
	wake_up_q(&wake_q);
out_free:
	if (ret)
		free_msg(msg);
	return ret;
}
1147
1148
/*
 * do_mq_timedreceive() - shared back end of the mq_timedreceive system calls.
 * @mqdes:      message queue descriptor
 * @u_msg_ptr:  user buffer that receives the payload
 * @msg_len:    size of @u_msg_ptr; must be at least the queue's mq_msgsize
 * @u_msg_prio: optional user pointer that receives the message's m_type
 * @ts:         optional timeout handed to wq_sleep(), NULL when absent
 *
 * Takes a message off the queue, or, when the queue is empty, either fails
 * with -EAGAIN (O_NONBLOCK) or sleeps in wq_sleep() until a message is
 * handed over. The message is then copied to user space and freed.
 *
 * Returns the message size (m_ts) on success or a negative errno.
 */
static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
		size_t msg_len, unsigned int __user *u_msg_prio,
		struct timespec64 *ts)
{
	struct posix_msg_tree_node *spare_node = NULL;
	struct mqueue_inode_info *info;
	struct ext_wait_queue self;
	struct msg_msg *msg;
	struct inode *inode;
	ktime_t deadline, *timeout = NULL;
	ssize_t ret;

	if (ts) {
		deadline = timespec64_to_ktime(*ts);
		timeout = &deadline;
	}

	audit_mq_sendrecv(mqdes, msg_len, 0, ts);

	CLASS(fd, f)(mqdes);
	if (fd_empty(f))
		return -EBADF;

	/* Only descriptors that really refer to a message queue qualify. */
	inode = file_inode(fd_file(f));
	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
		return -EBADF;
	info = MQUEUE_I(inode);
	audit_file(fd_file(f));

	if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))
		return -EBADF;

	/* The user buffer must be able to hold the largest possible message. */
	if (unlikely(msg_len < info->attr.mq_msgsize))
		return -EMSGSIZE;

	/*
	 * msg_insert really wants us to have a valid, spare node struct so
	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
	 * fall back to that if necessary.
	 */
	if (!info->node_cache)
		spare_node = kmalloc(sizeof(*spare_node), GFP_KERNEL);

	spin_lock(&info->lock);

	if (!info->node_cache && spare_node) {
		/* Park the speculative allocation in the cache. */
		INIT_LIST_HEAD(&spare_node->msg_list);
		info->node_cache = spare_node;
	} else {
		/* Cache was already populated (or the allocation failed). */
		kfree(spare_node);
	}

	if (info->attr.mq_curmsgs != 0) {
		DEFINE_WAKE_Q(wake_q);

		msg = msg_get(info);

		simple_inode_init_ts(inode);

		/* A slot just became free; let a blocked sender fill it. */
		pipelined_receive(&wake_q, info);
		spin_unlock(&info->lock);
		wake_up_q(&wake_q);
		ret = 0;
	} else if (fd_file(f)->f_flags & O_NONBLOCK) {
		/* Nothing queued and the caller refuses to block. */
		spin_unlock(&info->lock);
		ret = -EAGAIN;
	} else {
		self.task = current;

		/* memory barrier not required, we hold info->lock */
		WRITE_ONCE(self.state, STATE_NONE);
		ret = wq_sleep(info, RECV, timeout, &self);
		msg = self.msg;
	}

	if (ret != 0)
		return ret;

	/* Success: report the size, copy priority and payload, then free. */
	ret = msg->m_ts;

	if (u_msg_prio && put_user(msg->m_type, u_msg_prio))
		ret = -EFAULT;
	else if (store_msg(u_msg_ptr, msg, msg->m_ts))
		ret = -EFAULT;

	free_msg(msg);
	return ret;
}
1238
1239
/*
 * mq_timedsend() system call entry point.
 *
 * When a timeout pointer is supplied it is run through prepare_timeout()
 * first and any error from that is returned as-is; the actual work is done
 * by do_mq_timedsend().
 */
SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
		size_t, msg_len, unsigned int, msg_prio,
		const struct __kernel_timespec __user *, u_abs_timeout)
{
	struct timespec64 ts;
	int res;

	if (!u_abs_timeout)
		return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio,
				       NULL);

	res = prepare_timeout(u_abs_timeout, &ts);
	if (res)
		return res;

	return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, &ts);
}
1252
1253
/*
 * mq_timedreceive() system call entry point.
 *
 * When a timeout pointer is supplied it is run through prepare_timeout()
 * first and any error from that is returned as-is; the actual work is done
 * by do_mq_timedreceive().
 */
SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
		size_t, msg_len, unsigned int __user *, u_msg_prio,
		const struct __kernel_timespec __user *, u_abs_timeout)
{
	struct timespec64 ts;
	int res;

	if (!u_abs_timeout)
		return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
					  u_msg_prio, NULL);

	res = prepare_timeout(u_abs_timeout, &ts);
	if (res)
		return res;

	return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, &ts);
}
1266
1267
/*
1268
* Notes: the case when user wants us to deregister (with NULL as pointer)
1269
* and he isn't currently owner of notification, will be silently discarded.
1270
* It isn't explicitly defined in the POSIX.
1271
*/
1272
static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
1273
{
1274
int ret;
1275
struct sock *sock;
1276
struct inode *inode;
1277
struct mqueue_inode_info *info;
1278
struct sk_buff *nc;
1279
1280
audit_mq_notify(mqdes, notification);
1281
1282
nc = NULL;
1283
sock = NULL;
1284
if (notification != NULL) {
1285
if (unlikely(notification->sigev_notify != SIGEV_NONE &&
1286
notification->sigev_notify != SIGEV_SIGNAL &&
1287
notification->sigev_notify != SIGEV_THREAD))
1288
return -EINVAL;
1289
if (notification->sigev_notify == SIGEV_SIGNAL &&
1290
!valid_signal(notification->sigev_signo)) {
1291
return -EINVAL;
1292
}
1293
if (notification->sigev_notify == SIGEV_THREAD) {
1294
long timeo;
1295
1296
/* create the notify skb */
1297
nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
1298
if (!nc)
1299
return -ENOMEM;
1300
1301
if (copy_from_user(nc->data,
1302
notification->sigev_value.sival_ptr,
1303
NOTIFY_COOKIE_LEN)) {
1304
kfree_skb(nc);
1305
return -EFAULT;
1306
}
1307
1308
/* TODO: add a header? */
1309
skb_put(nc, NOTIFY_COOKIE_LEN);
1310
/* and attach it to the socket */
1311
retry:
1312
sock = netlink_getsockbyfd(notification->sigev_signo);
1313
if (IS_ERR(sock)) {
1314
kfree_skb(nc);
1315
return PTR_ERR(sock);
1316
}
1317
1318
timeo = MAX_SCHEDULE_TIMEOUT;
1319
ret = netlink_attachskb(sock, nc, &timeo, NULL);
1320
if (ret == 1)
1321
goto retry;
1322
if (ret)
1323
return ret;
1324
}
1325
}
1326
1327
CLASS(fd, f)(mqdes);
1328
if (fd_empty(f)) {
1329
ret = -EBADF;
1330
goto out;
1331
}
1332
1333
inode = file_inode(fd_file(f));
1334
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
1335
ret = -EBADF;
1336
goto out;
1337
}
1338
info = MQUEUE_I(inode);
1339
1340
ret = 0;
1341
spin_lock(&info->lock);
1342
if (notification == NULL) {
1343
if (info->notify_owner == task_tgid(current)) {
1344
remove_notification(info);
1345
inode_set_atime_to_ts(inode,
1346
inode_set_ctime_current(inode));
1347
}
1348
} else if (info->notify_owner != NULL) {
1349
ret = -EBUSY;
1350
} else {
1351
switch (notification->sigev_notify) {
1352
case SIGEV_NONE:
1353
info->notify.sigev_notify = SIGEV_NONE;
1354
break;
1355
case SIGEV_THREAD:
1356
info->notify_sock = sock;
1357
info->notify_cookie = nc;
1358
sock = NULL;
1359
nc = NULL;
1360
info->notify.sigev_notify = SIGEV_THREAD;
1361
break;
1362
case SIGEV_SIGNAL:
1363
info->notify.sigev_signo = notification->sigev_signo;
1364
info->notify.sigev_value = notification->sigev_value;
1365
info->notify.sigev_notify = SIGEV_SIGNAL;
1366
info->notify_self_exec_id = current->self_exec_id;
1367
break;
1368
}
1369
1370
info->notify_owner = get_pid(task_tgid(current));
1371
info->notify_user_ns = get_user_ns(current_user_ns());
1372
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
1373
}
1374
spin_unlock(&info->lock);
1375
out:
1376
if (sock)
1377
netlink_detachskb(sock, nc);
1378
return ret;
1379
}
1380
1381
SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1382
const struct sigevent __user *, u_notification)
1383
{
1384
struct sigevent n, *p = NULL;
1385
if (u_notification) {
1386
if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))
1387
return -EFAULT;
1388
p = &n;
1389
}
1390
return do_mq_notify(mqdes, p);
1391
}
1392
1393
static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
1394
{
1395
struct inode *inode;
1396
struct mqueue_inode_info *info;
1397
1398
if (new && (new->mq_flags & (~O_NONBLOCK)))
1399
return -EINVAL;
1400
1401
CLASS(fd, f)(mqdes);
1402
if (fd_empty(f))
1403
return -EBADF;
1404
1405
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
1406
return -EBADF;
1407
1408
inode = file_inode(fd_file(f));
1409
info = MQUEUE_I(inode);
1410
1411
spin_lock(&info->lock);
1412
1413
if (old) {
1414
*old = info->attr;
1415
old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;
1416
}
1417
if (new) {
1418
audit_mq_getsetattr(mqdes, new);
1419
spin_lock(&fd_file(f)->f_lock);
1420
if (new->mq_flags & O_NONBLOCK)
1421
fd_file(f)->f_flags |= O_NONBLOCK;
1422
else
1423
fd_file(f)->f_flags &= ~O_NONBLOCK;
1424
spin_unlock(&fd_file(f)->f_lock);
1425
1426
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
1427
}
1428
1429
spin_unlock(&info->lock);
1430
return 0;
1431
}
1432
1433
SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1434
const struct mq_attr __user *, u_mqstat,
1435
struct mq_attr __user *, u_omqstat)
1436
{
1437
int ret;
1438
struct mq_attr mqstat, omqstat;
1439
struct mq_attr *new = NULL, *old = NULL;
1440
1441
if (u_mqstat) {
1442
new = &mqstat;
1443
if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))
1444
return -EFAULT;
1445
}
1446
if (u_omqstat)
1447
old = &omqstat;
1448
1449
ret = do_mq_getsetattr(mqdes, new, old);
1450
if (ret || !old)
1451
return ret;
1452
1453
if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))
1454
return -EFAULT;
1455
return 0;
1456
}
1457
1458
#ifdef CONFIG_COMPAT
1459
1460
struct compat_mq_attr {
1461
compat_long_t mq_flags; /* message queue flags */
1462
compat_long_t mq_maxmsg; /* maximum number of messages */
1463
compat_long_t mq_msgsize; /* maximum message size */
1464
compat_long_t mq_curmsgs; /* number of messages currently queued */
1465
compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
1466
};
1467
1468
static inline int get_compat_mq_attr(struct mq_attr *attr,
1469
const struct compat_mq_attr __user *uattr)
1470
{
1471
struct compat_mq_attr v;
1472
1473
if (copy_from_user(&v, uattr, sizeof(*uattr)))
1474
return -EFAULT;
1475
1476
memset(attr, 0, sizeof(*attr));
1477
attr->mq_flags = v.mq_flags;
1478
attr->mq_maxmsg = v.mq_maxmsg;
1479
attr->mq_msgsize = v.mq_msgsize;
1480
attr->mq_curmsgs = v.mq_curmsgs;
1481
return 0;
1482
}
1483
1484
static inline int put_compat_mq_attr(const struct mq_attr *attr,
1485
struct compat_mq_attr __user *uattr)
1486
{
1487
struct compat_mq_attr v;
1488
1489
memset(&v, 0, sizeof(v));
1490
v.mq_flags = attr->mq_flags;
1491
v.mq_maxmsg = attr->mq_maxmsg;
1492
v.mq_msgsize = attr->mq_msgsize;
1493
v.mq_curmsgs = attr->mq_curmsgs;
1494
if (copy_to_user(uattr, &v, sizeof(*uattr)))
1495
return -EFAULT;
1496
return 0;
1497
}
1498
1499
/*
 * Compat mq_open() entry point.
 *
 * The attribute block is only read from user space when one was supplied
 * and O_CREAT is set; in every other case do_mq_open() is called with NULL
 * attributes.
 */
COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
		       int, oflag, compat_mode_t, mode,
		       struct compat_mq_attr __user *, u_attr)
{
	struct mq_attr attr;

	if (!u_attr || !(oflag & O_CREAT))
		return do_mq_open(u_name, oflag, mode, NULL);

	if (get_compat_mq_attr(&attr, u_attr))
		return -EFAULT;

	return do_mq_open(u_name, oflag, mode, &attr);
}
1511
1512
COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1513
const struct compat_sigevent __user *, u_notification)
1514
{
1515
struct sigevent n, *p = NULL;
1516
if (u_notification) {
1517
if (get_compat_sigevent(&n, u_notification))
1518
return -EFAULT;
1519
if (n.sigev_notify == SIGEV_THREAD)
1520
n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
1521
p = &n;
1522
}
1523
return do_mq_notify(mqdes, p);
1524
}
1525
1526
COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1527
const struct compat_mq_attr __user *, u_mqstat,
1528
struct compat_mq_attr __user *, u_omqstat)
1529
{
1530
int ret;
1531
struct mq_attr mqstat, omqstat;
1532
struct mq_attr *new = NULL, *old = NULL;
1533
1534
if (u_mqstat) {
1535
new = &mqstat;
1536
if (get_compat_mq_attr(new, u_mqstat))
1537
return -EFAULT;
1538
}
1539
if (u_omqstat)
1540
old = &omqstat;
1541
1542
ret = do_mq_getsetattr(mqdes, new, old);
1543
if (ret || !old)
1544
return ret;
1545
1546
if (put_compat_mq_attr(old, u_omqstat))
1547
return -EFAULT;
1548
return 0;
1549
}
1550
#endif
1551
1552
#ifdef CONFIG_COMPAT_32BIT_TIME
1553
/*
 * compat_prepare_timeout() - fetch an old_timespec32 from user space into
 * @ts and validate it.
 *
 * Returns 0 on success, -EFAULT if the value cannot be read and -EINVAL if
 * timespec64_valid() rejects it.
 */
static int compat_prepare_timeout(const struct old_timespec32 __user *p,
				  struct timespec64 *ts)
{
	if (get_old_timespec32(ts, p))
		return -EFAULT;

	return timespec64_valid(ts) ? 0 : -EINVAL;
}
1562
1563
SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,
1564
const char __user *, u_msg_ptr,
1565
unsigned int, msg_len, unsigned int, msg_prio,
1566
const struct old_timespec32 __user *, u_abs_timeout)
1567
{
1568
struct timespec64 ts, *p = NULL;
1569
if (u_abs_timeout) {
1570
int res = compat_prepare_timeout(u_abs_timeout, &ts);
1571
if (res)
1572
return res;
1573
p = &ts;
1574
}
1575
return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
1576
}
1577
1578
SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,
1579
char __user *, u_msg_ptr,
1580
unsigned int, msg_len, unsigned int __user *, u_msg_prio,
1581
const struct old_timespec32 __user *, u_abs_timeout)
1582
{
1583
struct timespec64 ts, *p = NULL;
1584
if (u_abs_timeout) {
1585
int res = compat_prepare_timeout(u_abs_timeout, &ts);
1586
if (res)
1587
return res;
1588
p = &ts;
1589
}
1590
return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
1591
}
1592
#endif
1593
1594
static const struct inode_operations mqueue_dir_inode_operations = {
1595
.lookup = simple_lookup,
1596
.create = mqueue_create,
1597
.unlink = mqueue_unlink,
1598
};
1599
1600
static const struct file_operations mqueue_file_operations = {
1601
.flush = mqueue_flush_file,
1602
.poll = mqueue_poll_file,
1603
.read = mqueue_read_file,
1604
.llseek = default_llseek,
1605
};
1606
1607
static const struct super_operations mqueue_super_ops = {
1608
.alloc_inode = mqueue_alloc_inode,
1609
.free_inode = mqueue_free_inode,
1610
.evict_inode = mqueue_evict_inode,
1611
.statfs = simple_statfs,
1612
};
1613
1614
static const struct fs_context_operations mqueue_fs_context_ops = {
1615
.free = mqueue_fs_context_free,
1616
.get_tree = mqueue_get_tree,
1617
};
1618
1619
static struct file_system_type mqueue_fs_type = {
1620
.name = "mqueue",
1621
.init_fs_context = mqueue_init_fs_context,
1622
.kill_sb = kill_anon_super,
1623
.fs_flags = FS_USERNS_MOUNT,
1624
};
1625
1626
int mq_init_ns(struct ipc_namespace *ns)
1627
{
1628
struct vfsmount *m;
1629
1630
ns->mq_queues_count = 0;
1631
ns->mq_queues_max = DFLT_QUEUESMAX;
1632
ns->mq_msg_max = DFLT_MSGMAX;
1633
ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
1634
ns->mq_msg_default = DFLT_MSG;
1635
ns->mq_msgsize_default = DFLT_MSGSIZE;
1636
1637
m = mq_create_mount(ns);
1638
if (IS_ERR(m))
1639
return PTR_ERR(m);
1640
ns->mq_mnt = m;
1641
return 0;
1642
}
1643
1644
void mq_clear_sbinfo(struct ipc_namespace *ns)
1645
{
1646
ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1647
}
1648
1649
/*
 * init_mqueue_fs() - boot-time initialisation of the mqueue filesystem.
 *
 * Steps, in order: create the inode cache, register the sysctls for
 * init_ipc_ns, register the filesystem type, initialise mq_lock and set up
 * init_ipc_ns. A failure unwinds the earlier steps in reverse order.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init init_mqueue_fs(void)
{
	int err;

	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
				sizeof(struct mqueue_inode_info), 0,
				SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
	if (!mqueue_inode_cachep)
		return -ENOMEM;

	if (!setup_mq_sysctls(&init_ipc_ns)) {
		pr_warn("sysctl registration failed\n");
		err = -ENOMEM;
		goto destroy_cache;
	}

	err = register_filesystem(&mqueue_fs_type);
	if (err)
		goto retire_sysctls;

	spin_lock_init(&mq_lock);

	err = mq_init_ns(&init_ipc_ns);
	if (err)
		goto unregister_fs;

	return 0;

unregister_fs:
	unregister_filesystem(&mqueue_fs_type);
retire_sysctls:
	retire_mq_sysctls(&init_ipc_ns);
destroy_cache:
	kmem_cache_destroy(mqueue_inode_cachep);
	return err;
}

device_initcall(init_mqueue_fs);
1687
1688