Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/ceph/locks.c
26283 views
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/ceph/ceph_debug.h>
3
4
#include <linux/file.h>
5
#include <linux/namei.h>
6
#include <linux/random.h>
7
8
#include "super.h"
9
#include "mds_client.h"
10
#include <linux/filelock.h>
11
#include <linux/ceph/pagelist.h>
12
13
/*
 * Value XORed into lock-owner pointers by secure_addr(); filled with random
 * bytes by ceph_flock_init().  Presumably this keeps raw kernel addresses
 * out of the owner ids sent to the MDS -- confirm.
 */
static u64 lock_secret;

/*
 * Defined further down; handed to ceph_mdsc_wait_request() when a lock
 * request is allowed to block.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req);
16
17
static inline u64 secure_addr(void *addr)
18
{
19
u64 v = lock_secret ^ (u64)(unsigned long)addr;
20
/*
21
* Set the most significant bit, so that MDS knows the 'owner'
22
* is sufficient to identify the owner of lock. (old code uses
23
* both 'owner' and 'pid')
24
*/
25
v |= (1ULL << 63);
26
return v;
27
}
28
29
/*
 * Seed lock_secret with random bytes.  secure_addr() mixes this value into
 * every owner id it produces.
 */
void __init ceph_flock_init(void)
{
	get_random_bytes(&lock_secret, sizeof(lock_secret));
}
33
34
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
35
{
36
struct inode *inode = file_inode(dst->c.flc_file);
37
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
38
dst->fl_u.ceph.inode = igrab(inode);
39
}
40
41
/*
42
* Do not use the 'fl->fl_file' in release function, which
43
* is possibly already released by another thread.
44
*/
45
static void ceph_fl_release_lock(struct file_lock *fl)
46
{
47
struct inode *inode = fl->fl_u.ceph.inode;
48
struct ceph_inode_info *ci;
49
50
/*
51
* If inode is NULL it should be a request file_lock,
52
* nothing we can do.
53
*/
54
if (!inode)
55
return;
56
57
ci = ceph_inode(inode);
58
if (atomic_dec_and_test(&ci->i_filelock_ref)) {
59
/* clear error when all locks are released */
60
spin_lock(&ci->i_ceph_lock);
61
ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
62
spin_unlock(&ci->i_ceph_lock);
63
}
64
fl->fl_u.ceph.inode = NULL;
65
iput(inode);
66
}
67
68
static const struct file_lock_operations ceph_fl_lock_ops = {
69
.fl_copy_lock = ceph_fl_copy_lock,
70
.fl_release_private = ceph_fl_release_lock,
71
};
72
73
/*
74
* Implement fcntl and flock locking functions.
75
*/
76
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
77
int cmd, u8 wait, struct file_lock *fl)
78
{
79
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
80
struct ceph_client *cl = mdsc->fsc->client;
81
struct ceph_mds_request *req;
82
int err;
83
u64 length = 0;
84
u64 owner;
85
86
if (operation == CEPH_MDS_OP_SETFILELOCK) {
87
/*
88
* increasing i_filelock_ref closes race window between
89
* handling request reply and adding file_lock struct to
90
* inode. Otherwise, auth caps may get trimmed in the
91
* window. Caller function will decrease the counter.
92
*/
93
fl->fl_ops = &ceph_fl_lock_ops;
94
fl->fl_ops->fl_copy_lock(fl, NULL);
95
}
96
97
if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
98
wait = 0;
99
100
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
101
if (IS_ERR(req))
102
return PTR_ERR(req);
103
req->r_inode = inode;
104
ihold(inode);
105
req->r_num_caps = 1;
106
107
/* mds requires start and length rather than start and end */
108
if (LLONG_MAX == fl->fl_end)
109
length = 0;
110
else
111
length = fl->fl_end - fl->fl_start + 1;
112
113
owner = secure_addr(fl->c.flc_owner);
114
115
doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
116
"start: %llu, length: %llu, wait: %d, type: %d\n",
117
(int)lock_type, (int)operation, owner,
118
(u64) fl->c.flc_pid,
119
fl->fl_start, length, wait, fl->c.flc_type);
120
121
req->r_args.filelock_change.rule = lock_type;
122
req->r_args.filelock_change.type = cmd;
123
req->r_args.filelock_change.owner = cpu_to_le64(owner);
124
req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
125
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
126
req->r_args.filelock_change.length = cpu_to_le64(length);
127
req->r_args.filelock_change.wait = wait;
128
129
err = ceph_mdsc_submit_request(mdsc, inode, req);
130
if (!err)
131
err = ceph_mdsc_wait_request(mdsc, req, wait ?
132
ceph_lock_wait_for_completion : NULL);
133
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
134
fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
135
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
136
fl->c.flc_type = F_RDLCK;
137
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
138
fl->c.flc_type = F_WRLCK;
139
else
140
fl->c.flc_type = F_UNLCK;
141
142
fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
143
length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
144
le64_to_cpu(req->r_reply_info.filelock_reply->length);
145
if (length >= 1)
146
fl->fl_end = length -1;
147
else
148
fl->fl_end = 0;
149
150
}
151
ceph_mdsc_put_request(req);
152
doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
153
"length: %llu, wait: %d, type: %d, err code %d\n",
154
(int)lock_type, (int)operation, (u64) fl->c.flc_pid,
155
fl->fl_start, length, wait, fl->c.flc_type, err);
156
return err;
157
}
158
159
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
160
struct ceph_mds_request *req)
161
{
162
struct ceph_client *cl = mdsc->fsc->client;
163
struct ceph_mds_request *intr_req;
164
struct inode *inode = req->r_inode;
165
int err, lock_type;
166
167
BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
168
if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
169
lock_type = CEPH_LOCK_FCNTL_INTR;
170
else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
171
lock_type = CEPH_LOCK_FLOCK_INTR;
172
else
173
BUG_ON(1);
174
BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
175
176
err = wait_for_completion_interruptible(&req->r_completion);
177
if (!err)
178
return 0;
179
180
doutc(cl, "request %llu was interrupted\n", req->r_tid);
181
182
mutex_lock(&mdsc->mutex);
183
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
184
err = 0;
185
} else {
186
/*
187
* ensure we aren't running concurrently with
188
* ceph_fill_trace or ceph_readdir_prepopulate, which
189
* rely on locks (dir mutex) held by our caller.
190
*/
191
mutex_lock(&req->r_fill_mutex);
192
req->r_err = err;
193
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
194
mutex_unlock(&req->r_fill_mutex);
195
196
if (!req->r_session) {
197
// haven't sent the request
198
err = 0;
199
}
200
}
201
mutex_unlock(&mdsc->mutex);
202
if (!err)
203
return 0;
204
205
intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
206
USE_AUTH_MDS);
207
if (IS_ERR(intr_req))
208
return PTR_ERR(intr_req);
209
210
intr_req->r_inode = inode;
211
ihold(inode);
212
intr_req->r_num_caps = 1;
213
214
intr_req->r_args.filelock_change = req->r_args.filelock_change;
215
intr_req->r_args.filelock_change.rule = lock_type;
216
intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
217
218
err = ceph_mdsc_do_request(mdsc, inode, intr_req);
219
ceph_mdsc_put_request(intr_req);
220
221
if (err && err != -ERESTARTSYS)
222
return err;
223
224
wait_for_completion_killable(&req->r_safe_completion);
225
return 0;
226
}
227
228
static int try_unlock_file(struct file *file, struct file_lock *fl)
229
{
230
int err;
231
unsigned int orig_flags = fl->c.flc_flags;
232
fl->c.flc_flags |= FL_EXISTS;
233
err = locks_lock_file_wait(file, fl);
234
fl->c.flc_flags = orig_flags;
235
if (err == -ENOENT) {
236
if (!(orig_flags & FL_EXISTS))
237
err = 0;
238
return err;
239
}
240
return 1;
241
}
242
243
/*
244
* Attempt to set an fcntl lock.
245
* For now, this just goes away to the server. Later it may be more awesome.
246
*/
247
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
248
{
249
struct inode *inode = file_inode(file);
250
struct ceph_inode_info *ci = ceph_inode(inode);
251
struct ceph_client *cl = ceph_inode_to_client(inode);
252
int err = 0;
253
u16 op = CEPH_MDS_OP_SETFILELOCK;
254
u8 wait = 0;
255
u8 lock_cmd;
256
257
if (!(fl->c.flc_flags & FL_POSIX))
258
return -ENOLCK;
259
260
if (ceph_inode_is_shutdown(inode))
261
return -ESTALE;
262
263
doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
264
265
/* set wait bit as appropriate, then make command as Ceph expects it*/
266
if (IS_GETLK(cmd))
267
op = CEPH_MDS_OP_GETFILELOCK;
268
else if (IS_SETLKW(cmd))
269
wait = 1;
270
271
spin_lock(&ci->i_ceph_lock);
272
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
273
err = -EIO;
274
}
275
spin_unlock(&ci->i_ceph_lock);
276
if (err < 0) {
277
if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
278
posix_lock_file(file, fl, NULL);
279
return err;
280
}
281
282
if (lock_is_read(fl))
283
lock_cmd = CEPH_LOCK_SHARED;
284
else if (lock_is_write(fl))
285
lock_cmd = CEPH_LOCK_EXCL;
286
else
287
lock_cmd = CEPH_LOCK_UNLOCK;
288
289
if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
290
err = try_unlock_file(file, fl);
291
if (err <= 0)
292
return err;
293
}
294
295
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
296
if (!err) {
297
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
298
doutc(cl, "locking locally\n");
299
err = posix_lock_file(file, fl, NULL);
300
if (err) {
301
/* undo! This should only happen if
302
* the kernel detects local
303
* deadlock. */
304
ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
305
CEPH_LOCK_UNLOCK, 0, fl);
306
doutc(cl, "got %d on posix_lock_file, undid lock\n",
307
err);
308
}
309
}
310
}
311
return err;
312
}
313
314
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
315
{
316
struct inode *inode = file_inode(file);
317
struct ceph_inode_info *ci = ceph_inode(inode);
318
struct ceph_client *cl = ceph_inode_to_client(inode);
319
int err = 0;
320
u8 wait = 0;
321
u8 lock_cmd;
322
323
if (!(fl->c.flc_flags & FL_FLOCK))
324
return -ENOLCK;
325
326
if (ceph_inode_is_shutdown(inode))
327
return -ESTALE;
328
329
doutc(cl, "fl_file: %p\n", fl->c.flc_file);
330
331
spin_lock(&ci->i_ceph_lock);
332
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
333
err = -EIO;
334
}
335
spin_unlock(&ci->i_ceph_lock);
336
if (err < 0) {
337
if (lock_is_unlock(fl))
338
locks_lock_file_wait(file, fl);
339
return err;
340
}
341
342
if (IS_SETLKW(cmd))
343
wait = 1;
344
345
if (lock_is_read(fl))
346
lock_cmd = CEPH_LOCK_SHARED;
347
else if (lock_is_write(fl))
348
lock_cmd = CEPH_LOCK_EXCL;
349
else
350
lock_cmd = CEPH_LOCK_UNLOCK;
351
352
if (lock_is_unlock(fl)) {
353
err = try_unlock_file(file, fl);
354
if (err <= 0)
355
return err;
356
}
357
358
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
359
inode, lock_cmd, wait, fl);
360
if (!err && F_UNLCK != fl->c.flc_type) {
361
err = locks_lock_file_wait(file, fl);
362
if (err) {
363
ceph_lock_message(CEPH_LOCK_FLOCK,
364
CEPH_MDS_OP_SETFILELOCK,
365
inode, CEPH_LOCK_UNLOCK, 0, fl);
366
doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
367
err);
368
}
369
}
370
return err;
371
}
372
373
/*
374
* Fills in the passed counter variables, so you can prepare pagelist metadata
375
* before calling ceph_encode_locks.
376
*/
377
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
378
{
379
struct ceph_client *cl = ceph_inode_to_client(inode);
380
struct file_lock *lock;
381
struct file_lock_context *ctx;
382
383
*fcntl_count = 0;
384
*flock_count = 0;
385
386
ctx = locks_inode_context(inode);
387
if (ctx) {
388
spin_lock(&ctx->flc_lock);
389
for_each_file_lock(lock, &ctx->flc_posix)
390
++(*fcntl_count);
391
for_each_file_lock(lock, &ctx->flc_flock)
392
++(*flock_count);
393
spin_unlock(&ctx->flc_lock);
394
}
395
doutc(cl, "counted %d flock locks and %d fcntl locks\n",
396
*flock_count, *fcntl_count);
397
}
398
399
/*
400
* Given a pointer to a lock, convert it to a ceph filelock
401
*/
402
static int lock_to_ceph_filelock(struct inode *inode,
403
struct file_lock *lock,
404
struct ceph_filelock *cephlock)
405
{
406
struct ceph_client *cl = ceph_inode_to_client(inode);
407
int err = 0;
408
409
cephlock->start = cpu_to_le64(lock->fl_start);
410
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
411
cephlock->client = cpu_to_le64(0);
412
cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
413
cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
414
415
switch (lock->c.flc_type) {
416
case F_RDLCK:
417
cephlock->type = CEPH_LOCK_SHARED;
418
break;
419
case F_WRLCK:
420
cephlock->type = CEPH_LOCK_EXCL;
421
break;
422
case F_UNLCK:
423
cephlock->type = CEPH_LOCK_UNLOCK;
424
break;
425
default:
426
doutc(cl, "Have unknown lock type %d\n",
427
lock->c.flc_type);
428
err = -EINVAL;
429
}
430
431
return err;
432
}
433
434
/*
435
* Encode the flock and fcntl locks for the given inode into the ceph_filelock
436
* array. Must be called with inode->i_lock already held.
437
* If we encounter more of a specific lock type than expected, return -ENOSPC.
438
*/
439
int ceph_encode_locks_to_buffer(struct inode *inode,
440
struct ceph_filelock *flocks,
441
int num_fcntl_locks, int num_flock_locks)
442
{
443
struct file_lock *lock;
444
struct file_lock_context *ctx = locks_inode_context(inode);
445
struct ceph_client *cl = ceph_inode_to_client(inode);
446
int err = 0;
447
int seen_fcntl = 0;
448
int seen_flock = 0;
449
int l = 0;
450
451
doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
452
num_fcntl_locks);
453
454
if (!ctx)
455
return 0;
456
457
spin_lock(&ctx->flc_lock);
458
for_each_file_lock(lock, &ctx->flc_posix) {
459
++seen_fcntl;
460
if (seen_fcntl > num_fcntl_locks) {
461
err = -ENOSPC;
462
goto fail;
463
}
464
err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
465
if (err)
466
goto fail;
467
++l;
468
}
469
for_each_file_lock(lock, &ctx->flc_flock) {
470
++seen_flock;
471
if (seen_flock > num_flock_locks) {
472
err = -ENOSPC;
473
goto fail;
474
}
475
err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
476
if (err)
477
goto fail;
478
++l;
479
}
480
fail:
481
spin_unlock(&ctx->flc_lock);
482
return err;
483
}
484
485
/*
486
* Copy the encoded flock and fcntl locks into the pagelist.
487
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
488
* sequential flock locks.
489
* Returns zero on success.
490
*/
491
int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
492
struct ceph_pagelist *pagelist,
493
int num_fcntl_locks, int num_flock_locks)
494
{
495
int err = 0;
496
__le32 nlocks;
497
498
nlocks = cpu_to_le32(num_fcntl_locks);
499
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
500
if (err)
501
goto out_fail;
502
503
if (num_fcntl_locks > 0) {
504
err = ceph_pagelist_append(pagelist, flocks,
505
num_fcntl_locks * sizeof(*flocks));
506
if (err)
507
goto out_fail;
508
}
509
510
nlocks = cpu_to_le32(num_flock_locks);
511
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
512
if (err)
513
goto out_fail;
514
515
if (num_flock_locks > 0) {
516
err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
517
num_flock_locks * sizeof(*flocks));
518
}
519
out_fail:
520
return err;
521
}
522
523