GitHub Repository: torvalds/linux
Path: blob/master/ipc/mqueue.c
/*
 * POSIX message queues filesystem for Linux.
 *
 * Copyright (C) 2003,2004 Krzysztof Benedyczak ([email protected])
 *                         Michal Wronski ([email protected])
 *
 * Spinlocks: Mohamed Abbas ([email protected])
 * Lockless receive & send, fd based notify:
 *                         Manfred Spraul ([email protected])
 *
 * Audit: George Wilson ([email protected])
 *
 * This file is released under the GPL.
 */

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/sysctl.h>
#include <linux/poll.h>
#include <linux/mqueue.h>
#include <linux/msg.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/netlink.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/signal.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
#include <linux/pid.h>
#include <linux/ipc_namespace.h>
#include <linux/user_namespace.h>
#include <linux/slab.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/sched/user.h>

#include <net/sock.h>
#include "util.h"

struct mqueue_fs_context {
47
struct ipc_namespace *ipc_ns;
48
bool newns; /* Set if newly created ipc namespace */
49
};
50
51
#define MQUEUE_MAGIC 0x19800202
52
#define DIRENT_SIZE 20
53
#define FILENT_SIZE 80
54
55
#define SEND 0
56
#define RECV 1
57
58
#define STATE_NONE 0
59
#define STATE_READY 1
60
61
struct posix_msg_tree_node {
62
struct rb_node rb_node;
63
struct list_head msg_list;
64
int priority;
65
};
66
67
/*
 * Locking:
 *
 * Accesses to a message queue are synchronized by acquiring info->lock.
 *
 * There are two notable exceptions:
 * - The actual wakeup of a sleeping task is performed using the wake_q
 *   framework. info->lock is already released when wake_up_q is called.
 * - The exit codepaths after sleeping check ext_wait_queue->state without
 *   any locks. If it is STATE_READY, then the syscall is completed without
 *   acquiring info->lock.
 *
 * MQ_BARRIER:
 * To achieve proper release/acquire memory barrier pairing, the state is set to
 * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
 * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
 *
 * This prevents the following races:
 *
 * 1) With the simple wake_q_add(), the task could be gone already before
 *    the increase of the reference happens
 * Thread A
 *				Thread B
 * WRITE_ONCE(wait.state, STATE_NONE);
 * schedule_hrtimeout()
 *				wake_q_add(A)
 *				if (cmpxchg()) // success
 *				   ->state = STATE_READY (reordered)
 * <timeout returns>
 * if (wait.state == STATE_READY) return;
 * sysret to user space
 * sys_exit()
 *				get_task_struct() // UaF
 *
 * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
 * the smp_store_release() that does ->state = STATE_READY.
 *
 * 2) Without proper _release/_acquire barriers, the woken up task
 *    could read stale data
 *
 * Thread A
 *				Thread B
 * do_mq_timedreceive
 * WRITE_ONCE(wait.state, STATE_NONE);
 * schedule_hrtimeout()
 *				state = STATE_READY;
 * <timeout returns>
 * if (wait.state == STATE_READY) return;
 * msg_ptr = wait.msg;		// Access to stale data!
 *				receiver->msg = message; (reordered)
 *
 * Solution: use _release and _acquire barriers.
 *
 * 3) There is intentionally no barrier when setting current->state
 *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
 *    release memory barrier, and the wakeup is triggered when holding
 *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
 *    acquire memory barrier.
 */

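/*
 * Minimal sketch of the pairing described above (illustrative only; it
 * condenses pipelined_send(), __pipelined_op() and wq_sleep() below):
 * the waker publishes ->state with a release store, and the sleeper
 * re-checks it with READ_ONCE() plus a control-dependency acquire
 * barrier before touching data written by the waker.
 *
 *	// waker, info->lock held:
 *	receiver->msg = message;
 *	smp_store_release(&receiver->state, STATE_READY);
 *
 *	// sleeper, after schedule_hrtimeout(), no lock held:
 *	if (READ_ONCE(wait.state) == STATE_READY) {
 *		smp_acquire__after_ctrl_dep();
 *		msg_ptr = wait.msg;	// cannot be stale
 *	}
 */
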
struct ext_wait_queue { /* queue of sleeping tasks */
128
struct task_struct *task;
129
struct list_head list;
130
struct msg_msg *msg; /* ptr of loaded message */
131
int state; /* one of STATE_* values */
132
};
133
134
struct mqueue_inode_info {
135
spinlock_t lock;
136
struct inode vfs_inode;
137
wait_queue_head_t wait_q;
138
139
struct rb_root msg_tree;
140
struct rb_node *msg_tree_rightmost;
141
struct posix_msg_tree_node *node_cache;
142
struct mq_attr attr;
143
144
struct sigevent notify;
145
struct pid *notify_owner;
146
u32 notify_self_exec_id;
147
struct user_namespace *notify_user_ns;
148
struct ucounts *ucounts; /* user who created, for accounting */
149
struct sock *notify_sock;
150
struct sk_buff *notify_cookie;
151
152
/* for tasks waiting for free space and messages, respectively */
153
struct ext_wait_queue e_wait_q[2];
154
155
unsigned long qsize; /* size of queue in memory (sum of all msgs) */
156
};
157
158
static struct file_system_type mqueue_fs_type;
159
static const struct inode_operations mqueue_dir_inode_operations;
160
static const struct file_operations mqueue_file_operations;
161
static const struct super_operations mqueue_super_ops;
162
static const struct fs_context_operations mqueue_fs_context_ops;
163
static void remove_notification(struct mqueue_inode_info *info);
164
165
static struct kmem_cache *mqueue_inode_cachep;
166
167
static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
168
{
169
return container_of(inode, struct mqueue_inode_info, vfs_inode);
170
}
171
172
/*
173
* This routine should be called with the mq_lock held.
174
*/
175
static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
176
{
177
return get_ipc_ns(inode->i_sb->s_fs_info);
178
}
179
180
static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
181
{
182
struct ipc_namespace *ns;
183
184
spin_lock(&mq_lock);
185
ns = __get_ns_from_inode(inode);
186
spin_unlock(&mq_lock);
187
return ns;
188
}
189
190
/* Auxiliary functions to manipulate messages' list */
191
static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
192
{
193
struct rb_node **p, *parent = NULL;
194
struct posix_msg_tree_node *leaf;
195
bool rightmost = true;
196
197
p = &info->msg_tree.rb_node;
198
while (*p) {
199
parent = *p;
200
leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
201
202
if (likely(leaf->priority == msg->m_type))
203
goto insert_msg;
204
else if (msg->m_type < leaf->priority) {
205
p = &(*p)->rb_left;
206
rightmost = false;
207
} else
208
p = &(*p)->rb_right;
209
}
210
if (info->node_cache) {
211
leaf = info->node_cache;
212
info->node_cache = NULL;
213
} else {
214
leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
215
if (!leaf)
216
return -ENOMEM;
217
INIT_LIST_HEAD(&leaf->msg_list);
218
}
219
leaf->priority = msg->m_type;
220
221
if (rightmost)
222
info->msg_tree_rightmost = &leaf->rb_node;
223
224
rb_link_node(&leaf->rb_node, parent, p);
225
rb_insert_color(&leaf->rb_node, &info->msg_tree);
226
insert_msg:
227
info->attr.mq_curmsgs++;
228
info->qsize += msg->m_ts;
229
list_add_tail(&msg->m_list, &leaf->msg_list);
230
return 0;
231
}
232
233
static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
234
struct mqueue_inode_info *info)
235
{
236
struct rb_node *node = &leaf->rb_node;
237
238
if (info->msg_tree_rightmost == node)
239
info->msg_tree_rightmost = rb_prev(node);
240
241
rb_erase(node, &info->msg_tree);
242
if (info->node_cache)
243
kfree(leaf);
244
else
245
info->node_cache = leaf;
246
}
247
248
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
249
{
250
struct rb_node *parent = NULL;
251
struct posix_msg_tree_node *leaf;
252
struct msg_msg *msg;
253
254
try_again:
255
/*
256
* During insert, low priorities go to the left and high to the
257
* right. On receive, we want the highest priorities first, so
258
* walk all the way to the right.
259
*/
260
parent = info->msg_tree_rightmost;
261
if (!parent) {
262
if (info->attr.mq_curmsgs) {
263
pr_warn_once("Inconsistency in POSIX message queue, "
264
"no tree element, but supposedly messages "
265
"should exist!\n");
266
info->attr.mq_curmsgs = 0;
267
}
268
return NULL;
269
}
270
leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
271
if (unlikely(list_empty(&leaf->msg_list))) {
272
pr_warn_once("Inconsistency in POSIX message queue, "
273
"empty leaf node but we haven't implemented "
274
"lazy leaf delete!\n");
275
msg_tree_erase(leaf, info);
276
goto try_again;
277
} else {
278
msg = list_first_entry(&leaf->msg_list,
279
struct msg_msg, m_list);
280
list_del(&msg->m_list);
281
if (list_empty(&leaf->msg_list)) {
282
msg_tree_erase(leaf, info);
283
}
284
}
285
info->attr.mq_curmsgs--;
286
info->qsize -= msg->m_ts;
287
return msg;
288
}
289
290
static struct inode *mqueue_get_inode(struct super_block *sb,
291
struct ipc_namespace *ipc_ns, umode_t mode,
292
struct mq_attr *attr)
293
{
294
struct inode *inode;
295
int ret = -ENOMEM;
296
297
inode = new_inode(sb);
298
if (!inode)
299
goto err;
300
301
inode->i_ino = get_next_ino();
302
inode->i_mode = mode;
303
inode->i_uid = current_fsuid();
304
inode->i_gid = current_fsgid();
305
simple_inode_init_ts(inode);
306
307
if (S_ISREG(mode)) {
308
struct mqueue_inode_info *info;
309
unsigned long mq_bytes, mq_treesize;
310
311
inode->i_fop = &mqueue_file_operations;
312
inode->i_size = FILENT_SIZE;
313
/* mqueue specific info */
314
info = MQUEUE_I(inode);
315
spin_lock_init(&info->lock);
316
init_waitqueue_head(&info->wait_q);
317
INIT_LIST_HEAD(&info->e_wait_q[0].list);
318
INIT_LIST_HEAD(&info->e_wait_q[1].list);
319
info->notify_owner = NULL;
320
info->notify_user_ns = NULL;
321
info->qsize = 0;
322
info->ucounts = NULL; /* set when all is ok */
323
info->msg_tree = RB_ROOT;
324
info->msg_tree_rightmost = NULL;
325
info->node_cache = NULL;
326
memset(&info->attr, 0, sizeof(info->attr));
327
info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
328
ipc_ns->mq_msg_default);
329
info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
330
ipc_ns->mq_msgsize_default);
331
if (attr) {
332
info->attr.mq_maxmsg = attr->mq_maxmsg;
333
info->attr.mq_msgsize = attr->mq_msgsize;
334
}
335
/*
336
* We used to allocate a static array of pointers and account
337
* the size of that array as well as one msg_msg struct per
338
* possible message into the queue size. That's no longer
339
* accurate as the queue is now an rbtree and will grow and
340
* shrink depending on usage patterns. We can, however, still
341
* account one msg_msg struct per message, but the nodes are
342
* allocated depending on priority usage, and most programs
343
* only use one, or a handful, of priorities. However, since
344
* this is pinned memory, we need to assume worst case, so
345
* that means the min(mq_maxmsg, max_priorities) * struct
346
* posix_msg_tree_node.
347
*/
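/*
 * Worked example (illustrative only; struct sizes are arch dependent):
 * for a queue created with mq_maxmsg = 10 and mq_msgsize = 8192, the
 * pinned-memory charge computed below comes out to
 *
 *	mq_treesize = 10 * sizeof(struct msg_msg)
 *		    + min(10, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node);
 *	mq_bytes    = 10 * 8192 + mq_treesize;	// ~80 KiB of data plus bookkeeping
 */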
348
349
ret = -EINVAL;
350
if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0)
351
goto out_inode;
352
if (capable(CAP_SYS_RESOURCE)) {
353
if (info->attr.mq_maxmsg > HARD_MSGMAX ||
354
info->attr.mq_msgsize > HARD_MSGSIZEMAX)
355
goto out_inode;
356
} else {
357
if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max ||
358
info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)
359
goto out_inode;
360
}
361
ret = -EOVERFLOW;
362
/* check for overflow */
363
if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)
364
goto out_inode;
365
mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
366
min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
367
sizeof(struct posix_msg_tree_node);
368
mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
369
if (mq_bytes + mq_treesize < mq_bytes)
370
goto out_inode;
371
mq_bytes += mq_treesize;
372
info->ucounts = get_ucounts(current_ucounts());
373
if (info->ucounts) {
374
long msgqueue;
375
376
spin_lock(&mq_lock);
377
msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
378
if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
379
dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
380
spin_unlock(&mq_lock);
381
put_ucounts(info->ucounts);
382
info->ucounts = NULL;
383
/* mqueue_evict_inode() releases the queued messages */
384
ret = -EMFILE;
385
goto out_inode;
386
}
387
spin_unlock(&mq_lock);
388
}
389
} else if (S_ISDIR(mode)) {
390
inc_nlink(inode);
391
/* Some things misbehave if size == 0 on a directory */
392
inode->i_size = 2 * DIRENT_SIZE;
393
inode->i_op = &mqueue_dir_inode_operations;
394
inode->i_fop = &simple_dir_operations;
395
}
396
397
return inode;
398
out_inode:
399
iput(inode);
400
err:
401
return ERR_PTR(ret);
402
}
403
404
static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)
405
{
406
struct inode *inode;
407
struct ipc_namespace *ns = sb->s_fs_info;
408
409
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
410
sb->s_blocksize = PAGE_SIZE;
411
sb->s_blocksize_bits = PAGE_SHIFT;
412
sb->s_magic = MQUEUE_MAGIC;
413
sb->s_op = &mqueue_super_ops;
414
sb->s_d_flags = DCACHE_DONTCACHE;
415
416
inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
417
if (IS_ERR(inode))
418
return PTR_ERR(inode);
419
420
sb->s_root = d_make_root(inode);
421
if (!sb->s_root)
422
return -ENOMEM;
423
return 0;
424
}
425
426
static int mqueue_get_tree(struct fs_context *fc)
427
{
428
struct mqueue_fs_context *ctx = fc->fs_private;
429
430
/*
431
* With a newly created ipc namespace, we don't need to do a search
432
* for an ipc namespace match, but we still need to set s_fs_info.
433
*/
434
if (ctx->newns) {
435
fc->s_fs_info = ctx->ipc_ns;
436
return get_tree_nodev(fc, mqueue_fill_super);
437
}
438
return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);
439
}
440
441
static void mqueue_fs_context_free(struct fs_context *fc)
442
{
443
struct mqueue_fs_context *ctx = fc->fs_private;
444
445
put_ipc_ns(ctx->ipc_ns);
446
kfree(ctx);
447
}
448
449
static int mqueue_init_fs_context(struct fs_context *fc)
450
{
451
struct mqueue_fs_context *ctx;
452
453
ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL);
454
if (!ctx)
455
return -ENOMEM;
456
457
ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
458
put_user_ns(fc->user_ns);
459
fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
460
fc->fs_private = ctx;
461
fc->ops = &mqueue_fs_context_ops;
462
return 0;
463
}
464
465
/*
466
* mq_init_ns() is currently the only caller of mq_create_mount().
467
* So the ns parameter is always a newly created ipc namespace.
468
*/
469
static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
470
{
471
struct mqueue_fs_context *ctx;
472
struct fs_context *fc;
473
struct vfsmount *mnt;
474
475
fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT);
476
if (IS_ERR(fc))
477
return ERR_CAST(fc);
478
479
ctx = fc->fs_private;
480
ctx->newns = true;
481
put_ipc_ns(ctx->ipc_ns);
482
ctx->ipc_ns = get_ipc_ns(ns);
483
put_user_ns(fc->user_ns);
484
fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
485
486
mnt = fc_mount_longterm(fc);
487
put_fs_context(fc);
488
return mnt;
489
}
490
491
static void init_once(void *foo)
492
{
493
struct mqueue_inode_info *p = foo;
494
495
inode_init_once(&p->vfs_inode);
496
}
497
498
static struct inode *mqueue_alloc_inode(struct super_block *sb)
499
{
500
struct mqueue_inode_info *ei;
501
502
ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL);
503
if (!ei)
504
return NULL;
505
return &ei->vfs_inode;
506
}
507
508
static void mqueue_free_inode(struct inode *inode)
509
{
510
kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
511
}
512
513
static void mqueue_evict_inode(struct inode *inode)
514
{
515
struct mqueue_inode_info *info;
516
struct ipc_namespace *ipc_ns;
517
struct msg_msg *msg, *nmsg;
518
LIST_HEAD(tmp_msg);
519
520
clear_inode(inode);
521
522
if (S_ISDIR(inode->i_mode))
523
return;
524
525
ipc_ns = get_ns_from_inode(inode);
526
info = MQUEUE_I(inode);
527
spin_lock(&info->lock);
528
while ((msg = msg_get(info)) != NULL)
529
list_add_tail(&msg->m_list, &tmp_msg);
530
kfree(info->node_cache);
531
spin_unlock(&info->lock);
532
533
list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
534
list_del(&msg->m_list);
535
free_msg(msg);
536
}
537
538
if (info->ucounts) {
539
unsigned long mq_bytes, mq_treesize;
540
541
/* Total amount of bytes accounted for the mqueue */
542
mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
543
min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
544
sizeof(struct posix_msg_tree_node);
545
546
mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
547
info->attr.mq_msgsize);
548
549
spin_lock(&mq_lock);
550
dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
551
/*
552
* get_ns_from_inode() ensures that the
553
* (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
554
* to which we now hold a reference, or it is NULL.
555
* We can't put it here under mq_lock, though.
556
*/
557
if (ipc_ns)
558
ipc_ns->mq_queues_count--;
559
spin_unlock(&mq_lock);
560
put_ucounts(info->ucounts);
561
info->ucounts = NULL;
562
}
563
if (ipc_ns)
564
put_ipc_ns(ipc_ns);
565
}
566
567
static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
568
{
569
struct inode *dir = dentry->d_parent->d_inode;
570
struct inode *inode;
571
struct mq_attr *attr = arg;
572
int error;
573
struct ipc_namespace *ipc_ns;
574
575
spin_lock(&mq_lock);
576
ipc_ns = __get_ns_from_inode(dir);
577
if (!ipc_ns) {
578
error = -EACCES;
579
goto out_unlock;
580
}
581
582
if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
583
!capable(CAP_SYS_RESOURCE)) {
584
error = -ENOSPC;
585
goto out_unlock;
586
}
587
ipc_ns->mq_queues_count++;
588
spin_unlock(&mq_lock);
589
590
inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
591
if (IS_ERR(inode)) {
592
error = PTR_ERR(inode);
593
spin_lock(&mq_lock);
594
ipc_ns->mq_queues_count--;
595
goto out_unlock;
596
}
597
598
put_ipc_ns(ipc_ns);
599
dir->i_size += DIRENT_SIZE;
600
simple_inode_init_ts(dir);
601
602
d_instantiate(dentry, inode);
603
dget(dentry);
604
return 0;
605
out_unlock:
606
spin_unlock(&mq_lock);
607
if (ipc_ns)
608
put_ipc_ns(ipc_ns);
609
return error;
610
}
611
612
static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir,
613
struct dentry *dentry, umode_t mode, bool excl)
614
{
615
return mqueue_create_attr(dentry, mode, NULL);
616
}
617
618
static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
619
{
620
struct inode *inode = d_inode(dentry);
621
622
simple_inode_init_ts(dir);
623
dir->i_size -= DIRENT_SIZE;
624
drop_nlink(inode);
625
dput(dentry);
626
return 0;
627
}
628
629
/*
 * This routine handles a read() on a queue file.
 * To avoid turning this into a variant of mq_receive(), only the queue
 * size and notification info are exposed (the only values that are
 * interesting from the user's point of view and that aren't accessible
 * through the standard routines).
 */
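/*
 * Illustrative sketch (not part of this file; assumes the mqueue
 * filesystem is mounted at /dev/mqueue and a queue named "/demo"
 * exists): a plain read() on the queue file returns the status line
 * that mqueue_read_file() formats below.
 *
 *	char buf[80];
 *	int fd = open("/dev/mqueue/demo", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf) - 1);
 *	buf[n > 0 ? n : 0] = '\0';
 *	// e.g. "QSIZE:0          NOTIFY:0     SIGNO:0     NOTIFY_PID:0"
 *	close(fd);
 */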
static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
637
size_t count, loff_t *off)
638
{
639
struct inode *inode = file_inode(filp);
640
struct mqueue_inode_info *info = MQUEUE_I(inode);
641
char buffer[FILENT_SIZE];
642
ssize_t ret;
643
644
spin_lock(&info->lock);
645
snprintf(buffer, sizeof(buffer),
646
"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
647
info->qsize,
648
info->notify_owner ? info->notify.sigev_notify : 0,
649
(info->notify_owner &&
650
info->notify.sigev_notify == SIGEV_SIGNAL) ?
651
info->notify.sigev_signo : 0,
652
pid_vnr(info->notify_owner));
653
spin_unlock(&info->lock);
654
buffer[sizeof(buffer)-1] = '\0';
655
656
ret = simple_read_from_buffer(u_data, count, off, buffer,
657
strlen(buffer));
658
if (ret <= 0)
659
return ret;
660
661
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
662
return ret;
663
}
664
665
static int mqueue_flush_file(struct file *filp, fl_owner_t id)
666
{
667
struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
668
669
spin_lock(&info->lock);
670
if (task_tgid(current) == info->notify_owner)
671
remove_notification(info);
672
673
spin_unlock(&info->lock);
674
return 0;
675
}
676
677
static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
678
{
679
struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
680
__poll_t retval = 0;
681
682
poll_wait(filp, &info->wait_q, poll_tab);
683
684
spin_lock(&info->lock);
685
if (info->attr.mq_curmsgs)
686
retval = EPOLLIN | EPOLLRDNORM;
687
688
if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
689
retval |= EPOLLOUT | EPOLLWRNORM;
690
spin_unlock(&info->lock);
691
692
return retval;
693
}
694
695
/* Adds current to info->e_wait_q[sr] before element with smaller prio */
696
static void wq_add(struct mqueue_inode_info *info, int sr,
697
struct ext_wait_queue *ewp)
698
{
699
struct ext_wait_queue *walk;
700
701
list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
702
if (walk->task->prio <= current->prio) {
703
list_add_tail(&ewp->list, &walk->list);
704
return;
705
}
706
}
707
list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
708
}
709
710
/*
 * Puts the current task to sleep. The caller must hold the queue lock;
 * the lock is no longer held when this returns.
 * sr: SEND or RECV
 */
715
static int wq_sleep(struct mqueue_inode_info *info, int sr,
716
ktime_t *timeout, struct ext_wait_queue *ewp)
717
__releases(&info->lock)
718
{
719
int retval;
720
signed long time;
721
722
wq_add(info, sr, ewp);
723
724
for (;;) {
725
/* memory barrier not required, we hold info->lock */
726
__set_current_state(TASK_INTERRUPTIBLE);
727
728
spin_unlock(&info->lock);
729
time = schedule_hrtimeout_range_clock(timeout, 0,
730
HRTIMER_MODE_ABS, CLOCK_REALTIME);
731
732
if (READ_ONCE(ewp->state) == STATE_READY) {
733
/* see MQ_BARRIER for purpose/pairing */
734
smp_acquire__after_ctrl_dep();
735
retval = 0;
736
goto out;
737
}
738
spin_lock(&info->lock);
739
740
/* we hold info->lock, so no memory barrier required */
741
if (READ_ONCE(ewp->state) == STATE_READY) {
742
retval = 0;
743
goto out_unlock;
744
}
745
if (signal_pending(current)) {
746
retval = -ERESTARTSYS;
747
break;
748
}
749
if (time == 0) {
750
retval = -ETIMEDOUT;
751
break;
752
}
753
}
754
list_del(&ewp->list);
755
out_unlock:
756
spin_unlock(&info->lock);
757
out:
758
return retval;
759
}
760
761
/*
762
* Returns waiting task that should be serviced first or NULL if none exists
763
*/
764
static struct ext_wait_queue *wq_get_first_waiter(
765
struct mqueue_inode_info *info, int sr)
766
{
767
struct list_head *ptr;
768
769
ptr = info->e_wait_q[sr].list.prev;
770
if (ptr == &info->e_wait_q[sr].list)
771
return NULL;
772
return list_entry(ptr, struct ext_wait_queue, list);
773
}
774
775
776
static inline void set_cookie(struct sk_buff *skb, char code)
777
{
778
((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
779
}
780
781
/*
 * The next function exists only to keep sys_mq_timedsend() from growing
 * too long.
 */
static void __do_notify(struct mqueue_inode_info *info)
{
/* Notification is invoked when a process has registered for it, there is
 * no process waiting synchronously for a message, AND the queue has
 * changed from empty to non-empty. At this point we know that no one is
 * waiting synchronously. */
791
if (info->notify_owner &&
792
info->attr.mq_curmsgs == 1) {
793
switch (info->notify.sigev_notify) {
794
case SIGEV_NONE:
795
break;
796
case SIGEV_SIGNAL: {
797
struct kernel_siginfo sig_i;
798
struct task_struct *task;
799
800
/* do_mq_notify() accepts sigev_signo == 0, why?? */
801
if (!info->notify.sigev_signo)
802
break;
803
804
clear_siginfo(&sig_i);
805
sig_i.si_signo = info->notify.sigev_signo;
806
sig_i.si_errno = 0;
807
sig_i.si_code = SI_MESGQ;
808
sig_i.si_value = info->notify.sigev_value;
809
rcu_read_lock();
810
/* map current pid/uid into info->owner's namespaces */
811
sig_i.si_pid = task_tgid_nr_ns(current,
812
ns_of_pid(info->notify_owner));
813
sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
814
current_uid());
815
/*
816
* We can't use kill_pid_info(), this signal should
817
* bypass check_kill_permission(). It is from kernel
818
* but si_fromuser() can't know this.
819
* We do check the self_exec_id, to avoid sending
820
* signals to programs that don't expect them.
821
*/
822
task = pid_task(info->notify_owner, PIDTYPE_TGID);
823
if (task && task->self_exec_id ==
824
info->notify_self_exec_id) {
825
do_send_sig_info(info->notify.sigev_signo,
826
&sig_i, task, PIDTYPE_TGID);
827
}
828
rcu_read_unlock();
829
break;
830
}
831
case SIGEV_THREAD:
832
set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
833
netlink_sendskb(info->notify_sock, info->notify_cookie);
834
break;
835
}
836
/* after notification unregisters process */
837
put_pid(info->notify_owner);
838
put_user_ns(info->notify_user_ns);
839
info->notify_owner = NULL;
840
info->notify_user_ns = NULL;
841
}
842
wake_up(&info->wait_q);
843
}
844
845
static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
846
struct timespec64 *ts)
847
{
848
if (get_timespec64(ts, u_abs_timeout))
849
return -EFAULT;
850
if (!timespec64_valid(ts))
851
return -EINVAL;
852
return 0;
853
}
854
855
static void remove_notification(struct mqueue_inode_info *info)
856
{
857
if (info->notify_owner != NULL &&
858
info->notify.sigev_notify == SIGEV_THREAD) {
859
set_cookie(info->notify_cookie, NOTIFY_REMOVED);
860
netlink_sendskb(info->notify_sock, info->notify_cookie);
861
}
862
put_pid(info->notify_owner);
863
put_user_ns(info->notify_user_ns);
864
info->notify_owner = NULL;
865
info->notify_user_ns = NULL;
866
}
867
868
static int prepare_open(struct dentry *dentry, int oflag, int ro,
869
umode_t mode, struct filename *name,
870
struct mq_attr *attr)
871
{
872
static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
873
MAY_READ | MAY_WRITE };
874
int acc;
875
876
if (d_really_is_negative(dentry)) {
877
if (!(oflag & O_CREAT))
878
return -ENOENT;
879
if (ro)
880
return ro;
881
audit_inode_parent_hidden(name, dentry->d_parent);
882
return vfs_mkobj(dentry, mode & ~current_umask(),
883
mqueue_create_attr, attr);
884
}
885
/* it already existed */
886
audit_inode(name, dentry, 0);
887
if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
888
return -EEXIST;
889
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
890
return -EINVAL;
891
acc = oflag2acc[oflag & O_ACCMODE];
892
return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);
893
}
894
895
static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
896
struct mq_attr *attr)
897
{
898
struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
899
struct dentry *root = mnt->mnt_root;
900
struct filename *name;
901
struct path path;
902
int fd, error;
903
int ro;
904
905
audit_mq_open(oflag, mode, attr);
906
907
name = getname(u_name);
908
if (IS_ERR(name))
909
return PTR_ERR(name);
910
911
fd = get_unused_fd_flags(O_CLOEXEC);
912
if (fd < 0)
913
goto out_putname;
914
915
ro = mnt_want_write(mnt); /* we'll drop it in any case */
916
inode_lock(d_inode(root));
917
path.dentry = lookup_noperm(&QSTR(name->name), root);
918
if (IS_ERR(path.dentry)) {
919
error = PTR_ERR(path.dentry);
920
goto out_putfd;
921
}
922
path.mnt = mntget(mnt);
923
error = prepare_open(path.dentry, oflag, ro, mode, name, attr);
924
if (!error) {
925
struct file *file = dentry_open(&path, oflag, current_cred());
926
if (!IS_ERR(file))
927
fd_install(fd, file);
928
else
929
error = PTR_ERR(file);
930
}
931
path_put(&path);
932
out_putfd:
933
if (error) {
934
put_unused_fd(fd);
935
fd = error;
936
}
937
inode_unlock(d_inode(root));
938
if (!ro)
939
mnt_drop_write(mnt);
940
out_putname:
941
putname(name);
942
return fd;
943
}
944
945
SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
946
struct mq_attr __user *, u_attr)
947
{
948
struct mq_attr attr;
949
if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
950
return -EFAULT;
951
952
return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);
953
}
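/*
 * Illustrative userspace sketch (not part of this file; the queue name,
 * mode and attributes are arbitrary): the libc mq_open() wrapper ends up
 * in the syscall above, and with O_CREAT the queue inode is created via
 * mqueue_create_attr().
 *
 *	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
 *	mqd_t q = mq_open("/demo", O_CREAT | O_RDWR, 0600, &attr);
 *	if (q == (mqd_t)-1)
 *		perror("mq_open");
 */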
954
955
SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
956
{
957
int err;
958
struct filename *name;
959
struct dentry *dentry;
960
struct inode *inode = NULL;
961
struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
962
struct vfsmount *mnt = ipc_ns->mq_mnt;
963
964
name = getname(u_name);
965
if (IS_ERR(name))
966
return PTR_ERR(name);
967
968
audit_inode_parent_hidden(name, mnt->mnt_root);
969
err = mnt_want_write(mnt);
970
if (err)
971
goto out_name;
972
inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
973
dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root);
974
if (IS_ERR(dentry)) {
975
err = PTR_ERR(dentry);
976
goto out_unlock;
977
}
978
979
inode = d_inode(dentry);
980
if (!inode) {
981
err = -ENOENT;
982
} else {
983
ihold(inode);
984
err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent),
985
dentry, NULL);
986
}
987
dput(dentry);
988
989
out_unlock:
990
inode_unlock(d_inode(mnt->mnt_root));
991
iput(inode);
992
mnt_drop_write(mnt);
993
out_name:
994
putname(name);
995
996
return err;
997
}
998
999
/* Pipelined send and receive functions.
 *
 * If a receiver finds no waiting message, then it registers itself in the
 * list of waiting receivers. A sender checks that list before adding the new
 * message into the message queue. If there is a waiting receiver, then it
 * bypasses the message queue and directly hands the message over to the
 * receiver. The receiver accepts the message and returns without grabbing the
 * queue spinlock:
 *
 * - Set pointer to message.
 * - Queue the receiver task for later wakeup (without the info->lock).
 * - Update its state to STATE_READY. Now the receiver can continue.
 * - Wake up the process after the lock is dropped. Should the process wake up
 *   before this wakeup (due to a timeout or a signal) it will either see
 *   STATE_READY and continue or acquire the lock to check the state again.
 *
 * The same algorithm is used for senders.
 */
1017
1018
static inline void __pipelined_op(struct wake_q_head *wake_q,
1019
struct mqueue_inode_info *info,
1020
struct ext_wait_queue *this)
1021
{
1022
struct task_struct *task;
1023
1024
list_del(&this->list);
1025
task = get_task_struct(this->task);
1026
1027
/* see MQ_BARRIER for purpose/pairing */
1028
smp_store_release(&this->state, STATE_READY);
1029
wake_q_add_safe(wake_q, task);
1030
}
1031
1032
/* pipelined_send() - send a message directly to the task waiting in
 * sys_mq_timedreceive() (without inserting the message into the queue).
 */
1035
static inline void pipelined_send(struct wake_q_head *wake_q,
1036
struct mqueue_inode_info *info,
1037
struct msg_msg *message,
1038
struct ext_wait_queue *receiver)
1039
{
1040
receiver->msg = message;
1041
__pipelined_op(wake_q, info, receiver);
1042
}
1043
1044
/* pipelined_receive() - if a task is waiting in sys_mq_timedsend(), take
 * its message and insert it into the queue (there is guaranteed to be room
 * for it). */
1046
static inline void pipelined_receive(struct wake_q_head *wake_q,
1047
struct mqueue_inode_info *info)
1048
{
1049
struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
1050
1051
if (!sender) {
1052
/* for poll */
1053
wake_up_interruptible(&info->wait_q);
1054
return;
1055
}
1056
if (msg_insert(sender->msg, info))
1057
return;
1058
1059
__pipelined_op(wake_q, info, sender);
1060
}
1061
1062
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
1063
size_t msg_len, unsigned int msg_prio,
1064
struct timespec64 *ts)
1065
{
1066
struct inode *inode;
1067
struct ext_wait_queue wait;
1068
struct ext_wait_queue *receiver;
1069
struct msg_msg *msg_ptr;
1070
struct mqueue_inode_info *info;
1071
ktime_t expires, *timeout = NULL;
1072
struct posix_msg_tree_node *new_leaf = NULL;
1073
int ret = 0;
1074
DEFINE_WAKE_Q(wake_q);
1075
1076
if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
1077
return -EINVAL;
1078
1079
if (ts) {
1080
expires = timespec64_to_ktime(*ts);
1081
timeout = &expires;
1082
}
1083
1084
audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
1085
1086
CLASS(fd, f)(mqdes);
1087
if (fd_empty(f))
1088
return -EBADF;
1089
1090
inode = file_inode(fd_file(f));
1091
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
1092
return -EBADF;
1093
info = MQUEUE_I(inode);
1094
audit_file(fd_file(f));
1095
1096
if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))
1097
return -EBADF;
1098
1099
if (unlikely(msg_len > info->attr.mq_msgsize))
1100
return -EMSGSIZE;
1101
1102
/* First try to allocate memory, before doing anything with
1103
* existing queues. */
1104
msg_ptr = load_msg(u_msg_ptr, msg_len);
1105
if (IS_ERR(msg_ptr))
1106
return PTR_ERR(msg_ptr);
1107
msg_ptr->m_ts = msg_len;
1108
msg_ptr->m_type = msg_prio;
1109
1110
/*
1111
* msg_insert really wants us to have a valid, spare node struct so
1112
* it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
1113
* fall back to that if necessary.
1114
*/
1115
if (!info->node_cache)
1116
new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
1117
1118
spin_lock(&info->lock);
1119
1120
if (!info->node_cache && new_leaf) {
1121
/* Save our speculative allocation into the cache */
1122
INIT_LIST_HEAD(&new_leaf->msg_list);
1123
info->node_cache = new_leaf;
1124
new_leaf = NULL;
1125
} else {
1126
kfree(new_leaf);
1127
}
1128
1129
if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
1130
if (fd_file(f)->f_flags & O_NONBLOCK) {
1131
ret = -EAGAIN;
1132
} else {
1133
wait.task = current;
1134
wait.msg = (void *) msg_ptr;
1135
1136
/* memory barrier not required, we hold info->lock */
1137
WRITE_ONCE(wait.state, STATE_NONE);
1138
ret = wq_sleep(info, SEND, timeout, &wait);
1139
/*
1140
* wq_sleep must be called with info->lock held, and
1141
* returns with the lock released
1142
*/
1143
goto out_free;
1144
}
1145
} else {
1146
receiver = wq_get_first_waiter(info, RECV);
1147
if (receiver) {
1148
pipelined_send(&wake_q, info, msg_ptr, receiver);
1149
} else {
1150
/* adds message to the queue */
1151
ret = msg_insert(msg_ptr, info);
1152
if (ret)
1153
goto out_unlock;
1154
__do_notify(info);
1155
}
1156
simple_inode_init_ts(inode);
1157
}
1158
out_unlock:
1159
spin_unlock(&info->lock);
1160
wake_up_q(&wake_q);
1161
out_free:
1162
if (ret)
1163
free_msg(msg_ptr);
1164
return ret;
1165
}
1166
1167
static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
1168
size_t msg_len, unsigned int __user *u_msg_prio,
1169
struct timespec64 *ts)
1170
{
1171
ssize_t ret;
1172
struct msg_msg *msg_ptr;
1173
struct inode *inode;
1174
struct mqueue_inode_info *info;
1175
struct ext_wait_queue wait;
1176
ktime_t expires, *timeout = NULL;
1177
struct posix_msg_tree_node *new_leaf = NULL;
1178
1179
if (ts) {
1180
expires = timespec64_to_ktime(*ts);
1181
timeout = &expires;
1182
}
1183
1184
audit_mq_sendrecv(mqdes, msg_len, 0, ts);
1185
1186
CLASS(fd, f)(mqdes);
1187
if (fd_empty(f))
1188
return -EBADF;
1189
1190
inode = file_inode(fd_file(f));
1191
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
1192
return -EBADF;
1193
info = MQUEUE_I(inode);
1194
audit_file(fd_file(f));
1195
1196
if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))
1197
return -EBADF;
1198
1199
/* checks if buffer is big enough */
1200
if (unlikely(msg_len < info->attr.mq_msgsize))
1201
return -EMSGSIZE;
1202
1203
/*
1204
* msg_insert really wants us to have a valid, spare node struct so
1205
* it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
1206
* fall back to that if necessary.
1207
*/
1208
if (!info->node_cache)
1209
new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
1210
1211
spin_lock(&info->lock);
1212
1213
if (!info->node_cache && new_leaf) {
1214
/* Save our speculative allocation into the cache */
1215
INIT_LIST_HEAD(&new_leaf->msg_list);
1216
info->node_cache = new_leaf;
1217
} else {
1218
kfree(new_leaf);
1219
}
1220
1221
if (info->attr.mq_curmsgs == 0) {
1222
if (fd_file(f)->f_flags & O_NONBLOCK) {
1223
spin_unlock(&info->lock);
1224
ret = -EAGAIN;
1225
} else {
1226
wait.task = current;
1227
1228
/* memory barrier not required, we hold info->lock */
1229
WRITE_ONCE(wait.state, STATE_NONE);
1230
ret = wq_sleep(info, RECV, timeout, &wait);
1231
msg_ptr = wait.msg;
1232
}
1233
} else {
1234
DEFINE_WAKE_Q(wake_q);
1235
1236
msg_ptr = msg_get(info);
1237
1238
simple_inode_init_ts(inode);
1239
1240
/* There is now free space in queue. */
1241
pipelined_receive(&wake_q, info);
1242
spin_unlock(&info->lock);
1243
wake_up_q(&wake_q);
1244
ret = 0;
1245
}
1246
if (ret == 0) {
1247
ret = msg_ptr->m_ts;
1248
1249
if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
1250
store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
1251
ret = -EFAULT;
1252
}
1253
free_msg(msg_ptr);
1254
}
1255
return ret;
1256
}
1257
1258
SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
1259
size_t, msg_len, unsigned int, msg_prio,
1260
const struct __kernel_timespec __user *, u_abs_timeout)
1261
{
1262
struct timespec64 ts, *p = NULL;
1263
if (u_abs_timeout) {
1264
int res = prepare_timeout(u_abs_timeout, &ts);
1265
if (res)
1266
return res;
1267
p = &ts;
1268
}
1269
return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
1270
}
1271
1272
SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1273
size_t, msg_len, unsigned int __user *, u_msg_prio,
1274
const struct __kernel_timespec __user *, u_abs_timeout)
1275
{
1276
struct timespec64 ts, *p = NULL;
1277
if (u_abs_timeout) {
1278
int res = prepare_timeout(u_abs_timeout, &ts);
1279
if (res)
1280
return res;
1281
p = &ts;
1282
}
1283
return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
1284
}
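/*
 * Illustrative userspace sketch (not part of this file; q is a descriptor
 * from mq_open(), and the priority and timeout are arbitrary; the receive
 * buffer must be at least mq_msgsize bytes, here assumed to be 128): the
 * libc wrappers end up in the two syscalls above.
 *
 *	char buf[128];
 *	unsigned int prio;
 *	struct timespec abs = { .tv_sec = time(NULL) + 5 };
 *
 *	mq_timedsend(q, "ping", 5, 1, &abs);
 *	mq_timedreceive(q, buf, sizeof(buf), &prio, &abs);
 */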
1285
1286
/*
 * Notes: if the user asks us to deregister (by passing a NULL pointer)
 * but is not the current owner of the notification, the request is
 * silently discarded. POSIX does not explicitly define this case.
 */
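/*
 * Illustrative userspace sketch (not part of this file; q is a descriptor
 * from mq_open() and the signal choice is arbitrary): registering for
 * notification with SIGEV_SIGNAL, which is handled by do_mq_notify()
 * below and delivered by __do_notify() when the queue goes from empty to
 * non-empty.
 *
 *	struct sigevent sev = {
 *		.sigev_notify = SIGEV_SIGNAL,
 *		.sigev_signo  = SIGUSR1,
 *	};
 *	if (mq_notify(q, &sev) == -1)
 *		perror("mq_notify");
 */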
1291
static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
1292
{
1293
int ret;
1294
struct sock *sock;
1295
struct inode *inode;
1296
struct mqueue_inode_info *info;
1297
struct sk_buff *nc;
1298
1299
audit_mq_notify(mqdes, notification);
1300
1301
nc = NULL;
1302
sock = NULL;
1303
if (notification != NULL) {
1304
if (unlikely(notification->sigev_notify != SIGEV_NONE &&
1305
notification->sigev_notify != SIGEV_SIGNAL &&
1306
notification->sigev_notify != SIGEV_THREAD))
1307
return -EINVAL;
1308
if (notification->sigev_notify == SIGEV_SIGNAL &&
1309
!valid_signal(notification->sigev_signo)) {
1310
return -EINVAL;
1311
}
1312
if (notification->sigev_notify == SIGEV_THREAD) {
1313
long timeo;
1314
1315
/* create the notify skb */
1316
nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
1317
if (!nc)
1318
return -ENOMEM;
1319
1320
if (copy_from_user(nc->data,
1321
notification->sigev_value.sival_ptr,
1322
NOTIFY_COOKIE_LEN)) {
1323
kfree_skb(nc);
1324
return -EFAULT;
1325
}
1326
1327
/* TODO: add a header? */
1328
skb_put(nc, NOTIFY_COOKIE_LEN);
1329
/* and attach it to the socket */
1330
retry:
1331
sock = netlink_getsockbyfd(notification->sigev_signo);
1332
if (IS_ERR(sock)) {
1333
kfree_skb(nc);
1334
return PTR_ERR(sock);
1335
}
1336
1337
timeo = MAX_SCHEDULE_TIMEOUT;
1338
ret = netlink_attachskb(sock, nc, &timeo, NULL);
1339
if (ret == 1)
1340
goto retry;
1341
if (ret)
1342
return ret;
1343
}
1344
}
1345
1346
CLASS(fd, f)(mqdes);
1347
if (fd_empty(f)) {
1348
ret = -EBADF;
1349
goto out;
1350
}
1351
1352
inode = file_inode(fd_file(f));
1353
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
1354
ret = -EBADF;
1355
goto out;
1356
}
1357
info = MQUEUE_I(inode);
1358
1359
ret = 0;
1360
spin_lock(&info->lock);
1361
if (notification == NULL) {
1362
if (info->notify_owner == task_tgid(current)) {
1363
remove_notification(info);
1364
inode_set_atime_to_ts(inode,
1365
inode_set_ctime_current(inode));
1366
}
1367
} else if (info->notify_owner != NULL) {
1368
ret = -EBUSY;
1369
} else {
1370
switch (notification->sigev_notify) {
1371
case SIGEV_NONE:
1372
info->notify.sigev_notify = SIGEV_NONE;
1373
break;
1374
case SIGEV_THREAD:
1375
info->notify_sock = sock;
1376
info->notify_cookie = nc;
1377
sock = NULL;
1378
nc = NULL;
1379
info->notify.sigev_notify = SIGEV_THREAD;
1380
break;
1381
case SIGEV_SIGNAL:
1382
info->notify.sigev_signo = notification->sigev_signo;
1383
info->notify.sigev_value = notification->sigev_value;
1384
info->notify.sigev_notify = SIGEV_SIGNAL;
1385
info->notify_self_exec_id = current->self_exec_id;
1386
break;
1387
}
1388
1389
info->notify_owner = get_pid(task_tgid(current));
1390
info->notify_user_ns = get_user_ns(current_user_ns());
1391
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
1392
}
1393
spin_unlock(&info->lock);
1394
out:
1395
if (sock)
1396
netlink_detachskb(sock, nc);
1397
return ret;
1398
}
1399
1400
SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1401
const struct sigevent __user *, u_notification)
1402
{
1403
struct sigevent n, *p = NULL;
1404
if (u_notification) {
1405
if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))
1406
return -EFAULT;
1407
p = &n;
1408
}
1409
return do_mq_notify(mqdes, p);
1410
}
1411
1412
static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
1413
{
1414
struct inode *inode;
1415
struct mqueue_inode_info *info;
1416
1417
if (new && (new->mq_flags & (~O_NONBLOCK)))
1418
return -EINVAL;
1419
1420
CLASS(fd, f)(mqdes);
1421
if (fd_empty(f))
1422
return -EBADF;
1423
1424
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
1425
return -EBADF;
1426
1427
inode = file_inode(fd_file(f));
1428
info = MQUEUE_I(inode);
1429
1430
spin_lock(&info->lock);
1431
1432
if (old) {
1433
*old = info->attr;
1434
old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;
1435
}
1436
if (new) {
1437
audit_mq_getsetattr(mqdes, new);
1438
spin_lock(&fd_file(f)->f_lock);
1439
if (new->mq_flags & O_NONBLOCK)
1440
fd_file(f)->f_flags |= O_NONBLOCK;
1441
else
1442
fd_file(f)->f_flags &= ~O_NONBLOCK;
1443
spin_unlock(&fd_file(f)->f_lock);
1444
1445
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
1446
}
1447
1448
spin_unlock(&info->lock);
1449
return 0;
1450
}
1451
1452
SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1453
const struct mq_attr __user *, u_mqstat,
1454
struct mq_attr __user *, u_omqstat)
1455
{
1456
int ret;
1457
struct mq_attr mqstat, omqstat;
1458
struct mq_attr *new = NULL, *old = NULL;
1459
1460
if (u_mqstat) {
1461
new = &mqstat;
1462
if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))
1463
return -EFAULT;
1464
}
1465
if (u_omqstat)
1466
old = &omqstat;
1467
1468
ret = do_mq_getsetattr(mqdes, new, old);
1469
if (ret || !old)
1470
return ret;
1471
1472
if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))
1473
return -EFAULT;
1474
return 0;
1475
}
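/*
 * Illustrative userspace sketch (not part of this file; q is a descriptor
 * from mq_open()): toggling O_NONBLOCK via mq_setattr(), which reaches
 * do_mq_getsetattr() above. Only the O_NONBLOCK bit of mq_flags is
 * honoured on the set side; the previous attributes are returned through
 * the last argument.
 *
 *	struct mq_attr old, new = { .mq_flags = O_NONBLOCK };
 *	mq_setattr(q, &new, &old);
 */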
1476
1477
#ifdef CONFIG_COMPAT
1478
1479
struct compat_mq_attr {
1480
compat_long_t mq_flags; /* message queue flags */
1481
compat_long_t mq_maxmsg; /* maximum number of messages */
1482
compat_long_t mq_msgsize; /* maximum message size */
1483
compat_long_t mq_curmsgs; /* number of messages currently queued */
1484
compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
1485
};
1486
1487
static inline int get_compat_mq_attr(struct mq_attr *attr,
1488
const struct compat_mq_attr __user *uattr)
1489
{
1490
struct compat_mq_attr v;
1491
1492
if (copy_from_user(&v, uattr, sizeof(*uattr)))
1493
return -EFAULT;
1494
1495
memset(attr, 0, sizeof(*attr));
1496
attr->mq_flags = v.mq_flags;
1497
attr->mq_maxmsg = v.mq_maxmsg;
1498
attr->mq_msgsize = v.mq_msgsize;
1499
attr->mq_curmsgs = v.mq_curmsgs;
1500
return 0;
1501
}
1502
1503
static inline int put_compat_mq_attr(const struct mq_attr *attr,
1504
struct compat_mq_attr __user *uattr)
1505
{
1506
struct compat_mq_attr v;
1507
1508
memset(&v, 0, sizeof(v));
1509
v.mq_flags = attr->mq_flags;
1510
v.mq_maxmsg = attr->mq_maxmsg;
1511
v.mq_msgsize = attr->mq_msgsize;
1512
v.mq_curmsgs = attr->mq_curmsgs;
1513
if (copy_to_user(uattr, &v, sizeof(*uattr)))
1514
return -EFAULT;
1515
return 0;
1516
}
1517
1518
COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
1519
int, oflag, compat_mode_t, mode,
1520
struct compat_mq_attr __user *, u_attr)
1521
{
1522
struct mq_attr attr, *p = NULL;
1523
if (u_attr && oflag & O_CREAT) {
1524
p = &attr;
1525
if (get_compat_mq_attr(&attr, u_attr))
1526
return -EFAULT;
1527
}
1528
return do_mq_open(u_name, oflag, mode, p);
1529
}
1530
1531
COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1532
const struct compat_sigevent __user *, u_notification)
1533
{
1534
struct sigevent n, *p = NULL;
1535
if (u_notification) {
1536
if (get_compat_sigevent(&n, u_notification))
1537
return -EFAULT;
1538
if (n.sigev_notify == SIGEV_THREAD)
1539
n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
1540
p = &n;
1541
}
1542
return do_mq_notify(mqdes, p);
1543
}
1544
1545
COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1546
const struct compat_mq_attr __user *, u_mqstat,
1547
struct compat_mq_attr __user *, u_omqstat)
1548
{
1549
int ret;
1550
struct mq_attr mqstat, omqstat;
1551
struct mq_attr *new = NULL, *old = NULL;
1552
1553
if (u_mqstat) {
1554
new = &mqstat;
1555
if (get_compat_mq_attr(new, u_mqstat))
1556
return -EFAULT;
1557
}
1558
if (u_omqstat)
1559
old = &omqstat;
1560
1561
ret = do_mq_getsetattr(mqdes, new, old);
1562
if (ret || !old)
1563
return ret;
1564
1565
if (put_compat_mq_attr(old, u_omqstat))
1566
return -EFAULT;
1567
return 0;
1568
}
1569
#endif
1570
1571
#ifdef CONFIG_COMPAT_32BIT_TIME
1572
static int compat_prepare_timeout(const struct old_timespec32 __user *p,
1573
struct timespec64 *ts)
1574
{
1575
if (get_old_timespec32(ts, p))
1576
return -EFAULT;
1577
if (!timespec64_valid(ts))
1578
return -EINVAL;
1579
return 0;
1580
}
1581
1582
SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,
1583
const char __user *, u_msg_ptr,
1584
unsigned int, msg_len, unsigned int, msg_prio,
1585
const struct old_timespec32 __user *, u_abs_timeout)
1586
{
1587
struct timespec64 ts, *p = NULL;
1588
if (u_abs_timeout) {
1589
int res = compat_prepare_timeout(u_abs_timeout, &ts);
1590
if (res)
1591
return res;
1592
p = &ts;
1593
}
1594
return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
1595
}
1596
1597
SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,
1598
char __user *, u_msg_ptr,
1599
unsigned int, msg_len, unsigned int __user *, u_msg_prio,
1600
const struct old_timespec32 __user *, u_abs_timeout)
1601
{
1602
struct timespec64 ts, *p = NULL;
1603
if (u_abs_timeout) {
1604
int res = compat_prepare_timeout(u_abs_timeout, &ts);
1605
if (res)
1606
return res;
1607
p = &ts;
1608
}
1609
return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
1610
}
1611
#endif
1612
1613
static const struct inode_operations mqueue_dir_inode_operations = {
1614
.lookup = simple_lookup,
1615
.create = mqueue_create,
1616
.unlink = mqueue_unlink,
1617
};
1618
1619
static const struct file_operations mqueue_file_operations = {
1620
.flush = mqueue_flush_file,
1621
.poll = mqueue_poll_file,
1622
.read = mqueue_read_file,
1623
.llseek = default_llseek,
1624
};
1625
1626
static const struct super_operations mqueue_super_ops = {
1627
.alloc_inode = mqueue_alloc_inode,
1628
.free_inode = mqueue_free_inode,
1629
.evict_inode = mqueue_evict_inode,
1630
.statfs = simple_statfs,
1631
};
1632
1633
static const struct fs_context_operations mqueue_fs_context_ops = {
1634
.free = mqueue_fs_context_free,
1635
.get_tree = mqueue_get_tree,
1636
};
1637
1638
static struct file_system_type mqueue_fs_type = {
1639
.name = "mqueue",
1640
.init_fs_context = mqueue_init_fs_context,
1641
.kill_sb = kill_litter_super,
1642
.fs_flags = FS_USERNS_MOUNT,
1643
};
1644
1645
int mq_init_ns(struct ipc_namespace *ns)
1646
{
1647
struct vfsmount *m;
1648
1649
ns->mq_queues_count = 0;
1650
ns->mq_queues_max = DFLT_QUEUESMAX;
1651
ns->mq_msg_max = DFLT_MSGMAX;
1652
ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
1653
ns->mq_msg_default = DFLT_MSG;
1654
ns->mq_msgsize_default = DFLT_MSGSIZE;
1655
1656
m = mq_create_mount(ns);
1657
if (IS_ERR(m))
1658
return PTR_ERR(m);
1659
ns->mq_mnt = m;
1660
return 0;
1661
}
1662
1663
void mq_clear_sbinfo(struct ipc_namespace *ns)
1664
{
1665
ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1666
}
1667
1668
static int __init init_mqueue_fs(void)
1669
{
1670
int error;
1671
1672
mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
1673
sizeof(struct mqueue_inode_info), 0,
1674
SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
1675
if (mqueue_inode_cachep == NULL)
1676
return -ENOMEM;
1677
1678
if (!setup_mq_sysctls(&init_ipc_ns)) {
1679
pr_warn("sysctl registration failed\n");
1680
error = -ENOMEM;
1681
goto out_kmem;
1682
}
1683
1684
error = register_filesystem(&mqueue_fs_type);
1685
if (error)
1686
goto out_sysctl;
1687
1688
spin_lock_init(&mq_lock);
1689
1690
error = mq_init_ns(&init_ipc_ns);
1691
if (error)
1692
goto out_filesystem;
1693
1694
return 0;
1695
1696
out_filesystem:
1697
unregister_filesystem(&mqueue_fs_type);
1698
out_sysctl:
1699
retire_mq_sysctls(&init_ipc_ns);
1700
out_kmem:
1701
kmem_cache_destroy(mqueue_inode_cachep);
1702
return error;
1703
}
1704
1705
device_initcall(init_mqueue_fs);
1706
1707