Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/fs/ceph/inode.c
49410 views
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/ceph/ceph_debug.h>
3
4
#include <linux/module.h>
5
#include <linux/fs.h>
6
#include <linux/slab.h>
7
#include <linux/string.h>
8
#include <linux/uaccess.h>
9
#include <linux/kernel.h>
10
#include <linux/writeback.h>
11
#include <linux/vmalloc.h>
12
#include <linux/xattr.h>
13
#include <linux/posix_acl.h>
14
#include <linux/random.h>
15
#include <linux/sort.h>
16
#include <linux/iversion.h>
17
#include <linux/fscrypt.h>
18
19
#include "super.h"
20
#include "mds_client.h"
21
#include "cache.h"
22
#include "crypto.h"
23
#include <linux/ceph/decode.h>
24
25
/*
 * Ceph inode operations
 *
 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
 * setattr, etc.), xattr helpers, and helpers for assimilating
 * metadata returned by the MDS into our cache.
 *
 * Also define helpers for doing asynchronous writeback, invalidation,
 * and truncation for the benefit of those who can't afford to block
 * (typically because they are in the message handler path).
 */
36
37
static const struct inode_operations ceph_symlink_iops;
38
static const struct inode_operations ceph_encrypted_symlink_iops;
39
40
static void ceph_inode_work(struct work_struct *work);
41
42
/*
43
* find or create an inode, given the ceph ino number
44
*/
45
static int ceph_set_ino_cb(struct inode *inode, void *data)
46
{
47
struct ceph_inode_info *ci = ceph_inode(inode);
48
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
49
50
ci->i_vino = *(struct ceph_vino *)data;
51
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
52
inode_set_iversion_raw(inode, 0);
53
percpu_counter_inc(&mdsc->metric.total_inodes);
54
55
return 0;
56
}
57
58
/*
59
* Check if the parent inode matches the vino from directory reply info
60
*/
61
static inline bool ceph_vino_matches_parent(struct inode *parent,
62
struct ceph_vino vino)
63
{
64
return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
65
}
66
67
/*
68
* Validate that the directory inode referenced by @req->r_parent matches the
69
* inode number and snapshot id contained in the reply's directory record. If
70
* they do not match – which can theoretically happen if the parent dentry was
71
* moved between the time the request was issued and the reply arrived – fall
72
* back to looking up the correct inode in the inode cache.
73
*
74
* A reference is *always* returned. Callers that receive a different inode
75
* than the original @parent are responsible for dropping the extra reference
76
* once the reply has been processed.
77
*/
78
static struct inode *ceph_get_reply_dir(struct super_block *sb,
79
struct inode *parent,
80
struct ceph_mds_reply_info_parsed *rinfo)
81
{
82
struct ceph_vino vino;
83
84
if (unlikely(!rinfo->diri.in))
85
return parent; /* nothing to compare against */
86
87
/* If we didn't have a cached parent inode to begin with, just bail out. */
88
if (!parent)
89
return NULL;
90
91
vino.ino = le64_to_cpu(rinfo->diri.in->ino);
92
vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
93
94
if (likely(ceph_vino_matches_parent(parent, vino)))
95
return parent; /* matches – use the original reference */
96
97
/* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
98
WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
99
ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
100
101
return ceph_get_inode(sb, vino, NULL);
102
}
103
104
/**
105
* ceph_new_inode - allocate a new inode in advance of an expected create
106
* @dir: parent directory for new inode
107
* @dentry: dentry that may eventually point to new inode
108
* @mode: mode of new inode
109
* @as_ctx: pointer to inherited security context
110
*
111
* Allocate a new inode in advance of an operation to create a new inode.
112
* This allocates the inode and sets up the acl_sec_ctx with appropriate
113
* info for the new inode.
114
*
115
* Returns a pointer to the new inode or an ERR_PTR.
116
*/
117
struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
118
umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
119
{
120
int err;
121
struct inode *inode;
122
123
inode = new_inode(dir->i_sb);
124
if (!inode)
125
return ERR_PTR(-ENOMEM);
126
127
inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
128
129
if (!S_ISLNK(*mode)) {
130
err = ceph_pre_init_acls(dir, mode, as_ctx);
131
if (err < 0)
132
goto out_err;
133
}
134
135
inode_state_assign_raw(inode, 0);
136
inode->i_mode = *mode;
137
138
err = ceph_security_init_secctx(dentry, *mode, as_ctx);
139
if (err < 0)
140
goto out_err;
141
142
/*
143
* We'll skip setting fscrypt context for snapshots, leaving that for
144
* the handle_reply().
145
*/
146
if (ceph_snap(dir) != CEPH_SNAPDIR) {
147
err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
148
if (err)
149
goto out_err;
150
}
151
152
return inode;
153
out_err:
154
iput(inode);
155
return ERR_PTR(err);
156
}
157
158
void ceph_as_ctx_to_req(struct ceph_mds_request *req,
159
struct ceph_acl_sec_ctx *as_ctx)
160
{
161
if (as_ctx->pagelist) {
162
req->r_pagelist = as_ctx->pagelist;
163
as_ctx->pagelist = NULL;
164
}
165
ceph_fscrypt_as_ctx_to_req(req, as_ctx);
166
}
167
168
/**
169
* ceph_get_inode - find or create/hash a new inode
170
* @sb: superblock to search and allocate in
171
* @vino: vino to search for
172
* @newino: optional new inode to insert if one isn't found (may be NULL)
173
*
174
* Search for or insert a new inode into the hash for the given vino, and
175
* return a reference to it. If new is non-NULL, its reference is consumed.
176
*/
177
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
178
struct inode *newino)
179
{
180
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
181
struct ceph_client *cl = mdsc->fsc->client;
182
struct inode *inode;
183
184
if (ceph_vino_is_reserved(vino))
185
return ERR_PTR(-EREMOTEIO);
186
187
if (newino) {
188
inode = inode_insert5(newino, (unsigned long)vino.ino,
189
ceph_ino_compare, ceph_set_ino_cb, &vino);
190
if (inode != newino)
191
iput(newino);
192
} else {
193
inode = iget5_locked(sb, (unsigned long)vino.ino,
194
ceph_ino_compare, ceph_set_ino_cb, &vino);
195
}
196
197
if (!inode) {
198
doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
199
return ERR_PTR(-ENOMEM);
200
}
201
202
doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
203
ceph_present_inode(inode), ceph_vinop(inode), inode,
204
!!(inode_state_read_once(inode) & I_NEW));
205
return inode;
206
}
207
208
/*
209
* get/construct snapdir inode for a given directory
210
*/
211
struct inode *ceph_get_snapdir(struct inode *parent)
212
{
213
struct ceph_client *cl = ceph_inode_to_client(parent);
214
struct ceph_vino vino = {
215
.ino = ceph_ino(parent),
216
.snap = CEPH_SNAPDIR,
217
};
218
struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
219
struct ceph_inode_info *ci = ceph_inode(inode);
220
int ret = -ENOTDIR;
221
222
if (IS_ERR(inode))
223
return inode;
224
225
if (!S_ISDIR(parent->i_mode)) {
226
pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
227
parent->i_mode);
228
goto err;
229
}
230
231
if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
232
pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
233
inode->i_mode);
234
goto err;
235
}
236
237
inode->i_mode = parent->i_mode;
238
inode->i_uid = parent->i_uid;
239
inode->i_gid = parent->i_gid;
240
inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
241
inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
242
inode_set_atime_to_ts(inode, inode_get_atime(parent));
243
ci->i_rbytes = 0;
244
ci->i_btime = ceph_inode(parent)->i_btime;
245
246
#ifdef CONFIG_FS_ENCRYPTION
247
/* if encrypted, just borrow fscrypt_auth from parent */
248
if (IS_ENCRYPTED(parent)) {
249
struct ceph_inode_info *pci = ceph_inode(parent);
250
251
ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
252
pci->fscrypt_auth_len,
253
GFP_KERNEL);
254
if (ci->fscrypt_auth) {
255
inode->i_flags |= S_ENCRYPTED;
256
ci->fscrypt_auth_len = pci->fscrypt_auth_len;
257
} else {
258
doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
259
ret = -ENOMEM;
260
goto err;
261
}
262
}
263
#endif
264
if (inode_state_read_once(inode) & I_NEW) {
265
inode->i_op = &ceph_snapdir_iops;
266
inode->i_fop = &ceph_snapdir_fops;
267
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
268
unlock_new_inode(inode);
269
}
270
271
return inode;
272
err:
273
if ((inode_state_read_once(inode) & I_NEW))
274
discard_new_inode(inode);
275
else
276
iput(inode);
277
return ERR_PTR(ret);
278
}
279
280
const struct inode_operations ceph_file_iops = {
281
.permission = ceph_permission,
282
.setattr = ceph_setattr,
283
.getattr = ceph_getattr,
284
.listxattr = ceph_listxattr,
285
.get_inode_acl = ceph_get_acl,
286
.set_acl = ceph_set_acl,
287
};
288
289
290
/*
 * We use a 'frag tree' to keep track of the MDS's directory fragments
 * for a given inode (usually there is just a single fragment).  We
 * need to know when a child frag is delegated to a new MDS, or when
 * it is flagged as replicated, so we can direct our requests
 * accordingly.
 */
297
298
/*
299
* find/create a frag in the tree
300
*/
301
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
302
u32 f)
303
{
304
struct inode *inode = &ci->netfs.inode;
305
struct ceph_client *cl = ceph_inode_to_client(inode);
306
struct rb_node **p;
307
struct rb_node *parent = NULL;
308
struct ceph_inode_frag *frag;
309
int c;
310
311
p = &ci->i_fragtree.rb_node;
312
while (*p) {
313
parent = *p;
314
frag = rb_entry(parent, struct ceph_inode_frag, node);
315
c = ceph_frag_compare(f, frag->frag);
316
if (c < 0)
317
p = &(*p)->rb_left;
318
else if (c > 0)
319
p = &(*p)->rb_right;
320
else
321
return frag;
322
}
323
324
frag = kmalloc(sizeof(*frag), GFP_NOFS);
325
if (!frag)
326
return ERR_PTR(-ENOMEM);
327
328
frag->frag = f;
329
frag->split_by = 0;
330
frag->mds = -1;
331
frag->ndist = 0;
332
333
rb_link_node(&frag->node, parent, p);
334
rb_insert_color(&frag->node, &ci->i_fragtree);
335
336
doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
337
return frag;
338
}
339
340
/*
341
* find a specific frag @f
342
*/
343
struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
344
{
345
struct rb_node *n = ci->i_fragtree.rb_node;
346
347
while (n) {
348
struct ceph_inode_frag *frag =
349
rb_entry(n, struct ceph_inode_frag, node);
350
int c = ceph_frag_compare(f, frag->frag);
351
if (c < 0)
352
n = n->rb_left;
353
else if (c > 0)
354
n = n->rb_right;
355
else
356
return frag;
357
}
358
return NULL;
359
}
360
361
/*
362
* Choose frag containing the given value @v. If @pfrag is
363
* specified, copy the frag delegation info to the caller if
364
* it is present.
365
*/
366
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
367
struct ceph_inode_frag *pfrag, int *found)
368
{
369
struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
370
u32 t = ceph_frag_make(0, 0);
371
struct ceph_inode_frag *frag;
372
unsigned nway, i;
373
u32 n;
374
375
if (found)
376
*found = 0;
377
378
while (1) {
379
WARN_ON(!ceph_frag_contains_value(t, v));
380
frag = __ceph_find_frag(ci, t);
381
if (!frag)
382
break; /* t is a leaf */
383
if (frag->split_by == 0) {
384
if (pfrag)
385
memcpy(pfrag, frag, sizeof(*pfrag));
386
if (found)
387
*found = 1;
388
break;
389
}
390
391
/* choose child */
392
nway = 1 << frag->split_by;
393
doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
394
frag->split_by, nway);
395
for (i = 0; i < nway; i++) {
396
n = ceph_frag_make_child(t, frag->split_by, i);
397
if (ceph_frag_contains_value(n, v)) {
398
t = n;
399
break;
400
}
401
}
402
BUG_ON(i == nway);
403
}
404
doutc(cl, "frag(%x) = %x\n", v, t);
405
406
return t;
407
}
408
409
/*
 * Locked wrapper around __ceph_choose_frag(): performs the lookup with
 * ci->i_fragtree_mutex held and returns its result.
 */
u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
		     struct ceph_inode_frag *pfrag, int *found)
{
	u32 chosen;

	mutex_lock(&ci->i_fragtree_mutex);
	chosen = __ceph_choose_frag(ci, v, pfrag, found);
	mutex_unlock(&ci->i_fragtree_mutex);

	return chosen;
}
418
419
/*
420
* Process dirfrag (delegation) info from the mds. Include leaf
421
* fragment in tree ONLY if ndist > 0. Otherwise, only
422
* branches/splits are included in i_fragtree)
423
*/
424
static int ceph_fill_dirfrag(struct inode *inode,
425
struct ceph_mds_reply_dirfrag *dirinfo)
426
{
427
struct ceph_inode_info *ci = ceph_inode(inode);
428
struct ceph_client *cl = ceph_inode_to_client(inode);
429
struct ceph_inode_frag *frag;
430
u32 id = le32_to_cpu(dirinfo->frag);
431
int mds = le32_to_cpu(dirinfo->auth);
432
int ndist = le32_to_cpu(dirinfo->ndist);
433
int diri_auth = -1;
434
int i;
435
int err = 0;
436
437
spin_lock(&ci->i_ceph_lock);
438
if (ci->i_auth_cap)
439
diri_auth = ci->i_auth_cap->mds;
440
spin_unlock(&ci->i_ceph_lock);
441
442
if (mds == -1) /* CDIR_AUTH_PARENT */
443
mds = diri_auth;
444
445
mutex_lock(&ci->i_fragtree_mutex);
446
if (ndist == 0 && mds == diri_auth) {
447
/* no delegation info needed. */
448
frag = __ceph_find_frag(ci, id);
449
if (!frag)
450
goto out;
451
if (frag->split_by == 0) {
452
/* tree leaf, remove */
453
doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
454
inode, ceph_vinop(inode), id);
455
rb_erase(&frag->node, &ci->i_fragtree);
456
kfree(frag);
457
} else {
458
/* tree branch, keep and clear */
459
doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
460
inode, ceph_vinop(inode), id);
461
frag->mds = -1;
462
frag->ndist = 0;
463
}
464
goto out;
465
}
466
467
468
/* find/add this frag to store mds delegation info */
469
frag = __get_or_create_frag(ci, id);
470
if (IS_ERR(frag)) {
471
/* this is not the end of the world; we can continue
472
with bad/inaccurate delegation info */
473
pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
474
inode, ceph_vinop(inode),
475
le32_to_cpu(dirinfo->frag));
476
err = -ENOMEM;
477
goto out;
478
}
479
480
frag->mds = mds;
481
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
482
for (i = 0; i < frag->ndist; i++)
483
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
484
doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
485
ceph_vinop(inode), frag->frag, frag->ndist);
486
487
out:
488
mutex_unlock(&ci->i_fragtree_mutex);
489
return err;
490
}
491
492
static int frag_tree_split_cmp(const void *l, const void *r)
493
{
494
struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
495
struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
496
return ceph_frag_compare(le32_to_cpu(ls->frag),
497
le32_to_cpu(rs->frag));
498
}
499
500
/*
 * Is frag id @f a direct child of the split described by @frag?
 *
 * With a NULL @frag only the root frag qualifies.  Otherwise @f must have
 * exactly frag->split_by more bits than frag->frag, and its value must lie
 * within frag->frag.
 */
static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
{
	u32 child_bits;

	if (!frag)
		return f == ceph_frag_make(0, 0);

	child_bits = ceph_frag_bits(frag->frag) + frag->split_by;
	if (ceph_frag_bits(f) != child_bits)
		return false;

	return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
}
508
509
static int ceph_fill_fragtree(struct inode *inode,
510
struct ceph_frag_tree_head *fragtree,
511
struct ceph_mds_reply_dirfrag *dirinfo)
512
{
513
struct ceph_client *cl = ceph_inode_to_client(inode);
514
struct ceph_inode_info *ci = ceph_inode(inode);
515
struct ceph_inode_frag *frag, *prev_frag = NULL;
516
struct rb_node *rb_node;
517
unsigned i, split_by, nsplits;
518
u32 id;
519
bool update = false;
520
521
mutex_lock(&ci->i_fragtree_mutex);
522
nsplits = le32_to_cpu(fragtree->nsplits);
523
if (nsplits != ci->i_fragtree_nsplits) {
524
update = true;
525
} else if (nsplits) {
526
i = get_random_u32_below(nsplits);
527
id = le32_to_cpu(fragtree->splits[i].frag);
528
if (!__ceph_find_frag(ci, id))
529
update = true;
530
} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
531
rb_node = rb_first(&ci->i_fragtree);
532
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
533
if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
534
update = true;
535
}
536
if (!update && dirinfo) {
537
id = le32_to_cpu(dirinfo->frag);
538
if (id != __ceph_choose_frag(ci, id, NULL, NULL))
539
update = true;
540
}
541
if (!update)
542
goto out_unlock;
543
544
if (nsplits > 1) {
545
sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
546
frag_tree_split_cmp, NULL);
547
}
548
549
doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
550
rb_node = rb_first(&ci->i_fragtree);
551
for (i = 0; i < nsplits; i++) {
552
id = le32_to_cpu(fragtree->splits[i].frag);
553
split_by = le32_to_cpu(fragtree->splits[i].by);
554
if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
555
pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
556
"frag %x split by %d\n", inode,
557
ceph_vinop(inode), i, nsplits, id, split_by);
558
continue;
559
}
560
frag = NULL;
561
while (rb_node) {
562
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
563
if (ceph_frag_compare(frag->frag, id) >= 0) {
564
if (frag->frag != id)
565
frag = NULL;
566
else
567
rb_node = rb_next(rb_node);
568
break;
569
}
570
rb_node = rb_next(rb_node);
571
/* delete stale split/leaf node */
572
if (frag->split_by > 0 ||
573
!is_frag_child(frag->frag, prev_frag)) {
574
rb_erase(&frag->node, &ci->i_fragtree);
575
if (frag->split_by > 0)
576
ci->i_fragtree_nsplits--;
577
kfree(frag);
578
}
579
frag = NULL;
580
}
581
if (!frag) {
582
frag = __get_or_create_frag(ci, id);
583
if (IS_ERR(frag))
584
continue;
585
}
586
if (frag->split_by == 0)
587
ci->i_fragtree_nsplits++;
588
frag->split_by = split_by;
589
doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
590
prev_frag = frag;
591
}
592
while (rb_node) {
593
frag = rb_entry(rb_node, struct ceph_inode_frag, node);
594
rb_node = rb_next(rb_node);
595
/* delete stale split/leaf node */
596
if (frag->split_by > 0 ||
597
!is_frag_child(frag->frag, prev_frag)) {
598
rb_erase(&frag->node, &ci->i_fragtree);
599
if (frag->split_by > 0)
600
ci->i_fragtree_nsplits--;
601
kfree(frag);
602
}
603
}
604
out_unlock:
605
mutex_unlock(&ci->i_fragtree_mutex);
606
return 0;
607
}
608
609
/*
610
* initialize a newly allocated inode.
611
*/
612
struct inode *ceph_alloc_inode(struct super_block *sb)
613
{
614
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
615
struct ceph_inode_info *ci;
616
int i;
617
618
ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
619
if (!ci)
620
return NULL;
621
622
doutc(fsc->client, "%p\n", &ci->netfs.inode);
623
624
/* Set parameters for the netfs library */
625
netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
626
627
spin_lock_init(&ci->i_ceph_lock);
628
629
ci->i_version = 0;
630
ci->i_inline_version = 0;
631
ci->i_time_warp_seq = 0;
632
ci->i_ceph_flags = 0;
633
atomic64_set(&ci->i_ordered_count, 1);
634
atomic64_set(&ci->i_release_count, 1);
635
atomic64_set(&ci->i_complete_seq[0], 0);
636
atomic64_set(&ci->i_complete_seq[1], 0);
637
ci->i_symlink = NULL;
638
639
ci->i_max_bytes = 0;
640
ci->i_max_files = 0;
641
642
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
643
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
644
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
645
646
ci->i_fragtree = RB_ROOT;
647
mutex_init(&ci->i_fragtree_mutex);
648
649
ci->i_xattrs.blob = NULL;
650
ci->i_xattrs.prealloc_blob = NULL;
651
ci->i_xattrs.dirty = false;
652
ci->i_xattrs.index = RB_ROOT;
653
ci->i_xattrs.count = 0;
654
ci->i_xattrs.names_size = 0;
655
ci->i_xattrs.vals_size = 0;
656
ci->i_xattrs.version = 0;
657
ci->i_xattrs.index_version = 0;
658
659
ci->i_caps = RB_ROOT;
660
ci->i_auth_cap = NULL;
661
ci->i_dirty_caps = 0;
662
ci->i_flushing_caps = 0;
663
INIT_LIST_HEAD(&ci->i_dirty_item);
664
INIT_LIST_HEAD(&ci->i_flushing_item);
665
ci->i_prealloc_cap_flush = NULL;
666
INIT_LIST_HEAD(&ci->i_cap_flush_list);
667
init_waitqueue_head(&ci->i_cap_wq);
668
ci->i_hold_caps_max = 0;
669
INIT_LIST_HEAD(&ci->i_cap_delay_list);
670
INIT_LIST_HEAD(&ci->i_cap_snaps);
671
ci->i_head_snapc = NULL;
672
ci->i_snap_caps = 0;
673
674
ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
675
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
676
ci->i_nr_by_mode[i] = 0;
677
678
mutex_init(&ci->i_truncate_mutex);
679
ci->i_truncate_seq = 0;
680
ci->i_truncate_size = 0;
681
ci->i_truncate_pending = 0;
682
ci->i_truncate_pagecache_size = 0;
683
684
ci->i_max_size = 0;
685
ci->i_reported_size = 0;
686
ci->i_wanted_max_size = 0;
687
ci->i_requested_max_size = 0;
688
689
ci->i_pin_ref = 0;
690
ci->i_rd_ref = 0;
691
ci->i_rdcache_ref = 0;
692
ci->i_wr_ref = 0;
693
ci->i_wb_ref = 0;
694
ci->i_fx_ref = 0;
695
ci->i_wrbuffer_ref = 0;
696
ci->i_wrbuffer_ref_head = 0;
697
atomic_set(&ci->i_filelock_ref, 0);
698
atomic_set(&ci->i_shared_gen, 1);
699
ci->i_rdcache_gen = 0;
700
ci->i_rdcache_revoking = 0;
701
702
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
703
INIT_LIST_HEAD(&ci->i_unsafe_iops);
704
spin_lock_init(&ci->i_unsafe_lock);
705
706
ci->i_snap_realm = NULL;
707
INIT_LIST_HEAD(&ci->i_snap_realm_item);
708
INIT_LIST_HEAD(&ci->i_snap_flush_item);
709
710
INIT_WORK(&ci->i_work, ceph_inode_work);
711
ci->i_work_mask = 0;
712
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
713
#ifdef CONFIG_FS_ENCRYPTION
714
ci->i_crypt_info = NULL;
715
ci->fscrypt_auth = NULL;
716
ci->fscrypt_auth_len = 0;
717
#endif
718
return &ci->netfs.inode;
719
}
720
721
void ceph_free_inode(struct inode *inode)
722
{
723
struct ceph_inode_info *ci = ceph_inode(inode);
724
725
kfree(ci->i_symlink);
726
#ifdef CONFIG_FS_ENCRYPTION
727
kfree(ci->fscrypt_auth);
728
#endif
729
fscrypt_free_inode(inode);
730
kmem_cache_free(ceph_inode_cachep, ci);
731
}
732
733
void ceph_evict_inode(struct inode *inode)
734
{
735
struct ceph_inode_info *ci = ceph_inode(inode);
736
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
737
struct ceph_client *cl = ceph_inode_to_client(inode);
738
struct ceph_inode_frag *frag;
739
struct rb_node *n;
740
741
doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
742
743
percpu_counter_dec(&mdsc->metric.total_inodes);
744
745
netfs_wait_for_outstanding_io(inode);
746
truncate_inode_pages_final(&inode->i_data);
747
if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
748
ceph_fscache_unuse_cookie(inode, true);
749
clear_inode(inode);
750
751
ceph_fscache_unregister_inode_cookie(ci);
752
fscrypt_put_encryption_info(inode);
753
754
__ceph_remove_caps(ci);
755
756
if (__ceph_has_quota(ci, QUOTA_GET_ANY))
757
ceph_adjust_quota_realms_count(inode, false);
758
759
/*
760
* we may still have a snap_realm reference if there are stray
761
* caps in i_snap_caps.
762
*/
763
if (ci->i_snap_realm) {
764
if (ceph_snap(inode) == CEPH_NOSNAP) {
765
doutc(cl, " dropping residual ref to snap realm %p\n",
766
ci->i_snap_realm);
767
ceph_change_snap_realm(inode, NULL);
768
} else {
769
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
770
ci->i_snap_realm = NULL;
771
}
772
}
773
774
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
775
frag = rb_entry(n, struct ceph_inode_frag, node);
776
rb_erase(n, &ci->i_fragtree);
777
kfree(frag);
778
}
779
ci->i_fragtree_nsplits = 0;
780
781
__ceph_destroy_xattrs(ci);
782
if (ci->i_xattrs.blob)
783
ceph_buffer_put(ci->i_xattrs.blob);
784
if (ci->i_xattrs.prealloc_blob)
785
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
786
787
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
788
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
789
}
790
791
/*
 * Number of 512-byte blocks needed to hold @size bytes, rounding up.
 */
static inline blkcnt_t calc_inode_blocks(u64 size)
{
	const u64 round_up = (1 << 9) - 1;

	return (size + round_up) >> 9;
}
795
796
/*
797
* Helpers to fill in size, ctime, mtime, and atime. We have to be
798
* careful because either the client or MDS may have more up to date
799
* info, depending on which capabilities are held, and whether
800
* time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
801
* and size are monotonically increasing, except when utimes() or
802
* truncate() increments the corresponding _seq values.)
803
*/
804
int ceph_fill_file_size(struct inode *inode, int issued,
805
u32 truncate_seq, u64 truncate_size, u64 size)
806
{
807
struct ceph_client *cl = ceph_inode_to_client(inode);
808
struct ceph_inode_info *ci = ceph_inode(inode);
809
int queue_trunc = 0;
810
loff_t isize = i_size_read(inode);
811
812
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
813
(truncate_seq == ci->i_truncate_seq && size > isize)) {
814
doutc(cl, "size %lld -> %llu\n", isize, size);
815
if (size > 0 && S_ISDIR(inode->i_mode)) {
816
pr_err_client(cl, "non-zero size for directory\n");
817
size = 0;
818
}
819
i_size_write(inode, size);
820
inode->i_blocks = calc_inode_blocks(size);
821
/*
822
* If we're expanding, then we should be able to just update
823
* the existing cookie.
824
*/
825
if (size > isize)
826
ceph_fscache_update(inode);
827
ci->i_reported_size = size;
828
if (truncate_seq != ci->i_truncate_seq) {
829
doutc(cl, "truncate_seq %u -> %u\n",
830
ci->i_truncate_seq, truncate_seq);
831
ci->i_truncate_seq = truncate_seq;
832
833
/* the MDS should have revoked these caps */
834
WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
835
CEPH_CAP_FILE_LAZYIO));
836
/*
837
* If we hold relevant caps, or in the case where we're
838
* not the only client referencing this file and we
839
* don't hold those caps, then we need to check whether
840
* the file is either opened or mmaped
841
*/
842
if ((issued & (CEPH_CAP_FILE_CACHE|
843
CEPH_CAP_FILE_BUFFER)) ||
844
mapping_mapped(inode->i_mapping) ||
845
__ceph_is_file_opened(ci)) {
846
ci->i_truncate_pending++;
847
queue_trunc = 1;
848
}
849
}
850
}
851
852
/*
853
* It's possible that the new sizes of the two consecutive
854
* size truncations will be in the same fscrypt last block,
855
* and we need to truncate the corresponding page caches
856
* anyway.
857
*/
858
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
859
doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
860
ci->i_truncate_size, truncate_size,
861
!!IS_ENCRYPTED(inode));
862
863
ci->i_truncate_size = truncate_size;
864
865
if (IS_ENCRYPTED(inode)) {
866
doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
867
ci->i_truncate_pagecache_size, size);
868
ci->i_truncate_pagecache_size = size;
869
} else {
870
ci->i_truncate_pagecache_size = truncate_size;
871
}
872
}
873
return queue_trunc;
874
}
875
876
void ceph_fill_file_time(struct inode *inode, int issued,
877
u64 time_warp_seq, struct timespec64 *ctime,
878
struct timespec64 *mtime, struct timespec64 *atime)
879
{
880
struct ceph_client *cl = ceph_inode_to_client(inode);
881
struct ceph_inode_info *ci = ceph_inode(inode);
882
struct timespec64 iatime = inode_get_atime(inode);
883
struct timespec64 ictime = inode_get_ctime(inode);
884
struct timespec64 imtime = inode_get_mtime(inode);
885
int warn = 0;
886
887
if (issued & (CEPH_CAP_FILE_EXCL|
888
CEPH_CAP_FILE_WR|
889
CEPH_CAP_FILE_BUFFER|
890
CEPH_CAP_AUTH_EXCL|
891
CEPH_CAP_XATTR_EXCL)) {
892
if (ci->i_version == 0 ||
893
timespec64_compare(ctime, &ictime) > 0) {
894
doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
895
inode_set_ctime_to_ts(inode, *ctime);
896
}
897
if (ci->i_version == 0 ||
898
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
899
/* the MDS did a utimes() */
900
doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
901
ci->i_time_warp_seq, (int)time_warp_seq);
902
903
inode_set_mtime_to_ts(inode, *mtime);
904
inode_set_atime_to_ts(inode, *atime);
905
ci->i_time_warp_seq = time_warp_seq;
906
} else if (time_warp_seq == ci->i_time_warp_seq) {
907
/* nobody did utimes(); take the max */
908
if (timespec64_compare(mtime, &imtime) > 0) {
909
doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
910
inode_set_mtime_to_ts(inode, *mtime);
911
}
912
if (timespec64_compare(atime, &iatime) > 0) {
913
doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
914
inode_set_atime_to_ts(inode, *atime);
915
}
916
} else if (issued & CEPH_CAP_FILE_EXCL) {
917
/* we did a utimes(); ignore mds values */
918
} else {
919
warn = 1;
920
}
921
} else {
922
/* we have no write|excl caps; whatever the MDS says is true */
923
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
924
inode_set_ctime_to_ts(inode, *ctime);
925
inode_set_mtime_to_ts(inode, *mtime);
926
inode_set_atime_to_ts(inode, *atime);
927
ci->i_time_warp_seq = time_warp_seq;
928
} else {
929
warn = 1;
930
}
931
}
932
if (warn) /* time_warp_seq shouldn't go backwards */
933
doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
934
time_warp_seq, ci->i_time_warp_seq);
935
}
936
937
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
938
static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
939
const char *encsym,
940
int enclen, u8 **decsym)
941
{
942
struct ceph_client *cl = mdsc->fsc->client;
943
int declen;
944
u8 *sym;
945
946
sym = kmalloc(enclen + 1, GFP_NOFS);
947
if (!sym)
948
return -ENOMEM;
949
950
declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP);
951
if (declen < 0) {
952
pr_err_client(cl,
953
"can't decode symlink (%d). Content: %.*s\n",
954
declen, enclen, encsym);
955
kfree(sym);
956
return -EIO;
957
}
958
sym[declen + 1] = '\0';
959
*decsym = sym;
960
return declen;
961
}
962
#else
963
static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
964
const char *encsym,
965
int symlen, u8 **decsym)
966
{
967
return -EOPNOTSUPP;
968
}
969
#endif
970
971
/*
972
* Populate an inode based on info from mds. May be called on new or
973
* existing inodes.
974
*/
975
int ceph_fill_inode(struct inode *inode, struct page *locked_page,
976
struct ceph_mds_reply_info_in *iinfo,
977
struct ceph_mds_reply_dirfrag *dirinfo,
978
struct ceph_mds_session *session, int cap_fmode,
979
struct ceph_cap_reservation *caps_reservation)
980
{
981
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
982
struct ceph_client *cl = mdsc->fsc->client;
983
struct ceph_mds_reply_inode *info = iinfo->in;
984
struct ceph_inode_info *ci = ceph_inode(inode);
985
int issued, new_issued, info_caps;
986
struct timespec64 mtime, atime, ctime;
987
struct ceph_buffer *xattr_blob = NULL;
988
struct ceph_buffer *old_blob = NULL;
989
struct ceph_string *pool_ns = NULL;
990
struct ceph_cap *new_cap = NULL;
991
int err = 0;
992
bool wake = false;
993
bool queue_trunc = false;
994
bool new_version = false;
995
bool fill_inline = false;
996
umode_t mode = le32_to_cpu(info->mode);
997
dev_t rdev = le32_to_cpu(info->rdev);
998
999
lockdep_assert_held(&mdsc->snap_rwsem);
1000
1001
doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
1002
le64_to_cpu(info->version), ci->i_version);
1003
1004
/* Once I_NEW is cleared, we can't change type or dev numbers */
1005
if (inode_state_read_once(inode) & I_NEW) {
1006
inode->i_mode = mode;
1007
} else {
1008
if (inode_wrong_type(inode, mode)) {
1009
pr_warn_once_client(cl,
1010
"inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
1011
ceph_vinop(inode), inode->i_mode, mode);
1012
return -ESTALE;
1013
}
1014
1015
if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
1016
pr_warn_once_client(cl,
1017
"dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
1018
ceph_vinop(inode), MAJOR(inode->i_rdev),
1019
MINOR(inode->i_rdev), MAJOR(rdev),
1020
MINOR(rdev));
1021
return -ESTALE;
1022
}
1023
}
1024
1025
info_caps = le32_to_cpu(info->cap.caps);
1026
1027
/* prealloc new cap struct */
1028
if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
1029
new_cap = ceph_get_cap(mdsc, caps_reservation);
1030
if (!new_cap)
1031
return -ENOMEM;
1032
}
1033
1034
/*
1035
* prealloc xattr data, if it looks like we'll need it. only
1036
* if len > 4 (meaning there are actually xattrs; the first 4
1037
* bytes are the xattr count).
1038
*/
1039
if (iinfo->xattr_len > 4) {
1040
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
1041
if (!xattr_blob)
1042
pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
1043
iinfo->xattr_len);
1044
}
1045
1046
if (iinfo->pool_ns_len > 0)
1047
pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
1048
iinfo->pool_ns_len);
1049
1050
if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
1051
ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
1052
1053
spin_lock(&ci->i_ceph_lock);
1054
1055
/*
1056
* provided version will be odd if inode value is projected,
1057
* even if stable. skip the update if we have newer stable
1058
* info (ours>=theirs, e.g. due to racing mds replies), unless
1059
* we are getting projected (unstable) info (in which case the
1060
* version is odd, and we want ours>theirs).
1061
* us them
1062
* 2 2 skip
1063
* 3 2 skip
1064
* 3 3 update
1065
*/
1066
if (ci->i_version == 0 ||
1067
((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1068
le64_to_cpu(info->version) > (ci->i_version & ~1)))
1069
new_version = true;
1070
1071
/* Update change_attribute */
1072
inode_set_max_iversion_raw(inode, iinfo->change_attr);
1073
1074
__ceph_caps_issued(ci, &issued);
1075
issued |= __ceph_caps_dirty(ci);
1076
new_issued = ~issued & info_caps;
1077
1078
__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
1079
1080
#ifdef CONFIG_FS_ENCRYPTION
1081
if (iinfo->fscrypt_auth_len &&
1082
((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
1083
kfree(ci->fscrypt_auth);
1084
ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
1085
ci->fscrypt_auth = iinfo->fscrypt_auth;
1086
iinfo->fscrypt_auth = NULL;
1087
iinfo->fscrypt_auth_len = 0;
1088
inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
1089
}
1090
#endif
1091
1092
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
1093
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
1094
inode->i_mode = mode;
1095
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
1096
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
1097
doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
1098
ceph_vinop(inode), inode->i_mode,
1099
from_kuid(&init_user_ns, inode->i_uid),
1100
from_kgid(&init_user_ns, inode->i_gid));
1101
ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
1102
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
1103
}
1104
1105
/* directories have fl_stripe_unit set to zero */
1106
if (IS_ENCRYPTED(inode))
1107
inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
1108
else if (le32_to_cpu(info->layout.fl_stripe_unit))
1109
inode->i_blkbits =
1110
fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
1111
else
1112
inode->i_blkbits = CEPH_BLOCK_SHIFT;
1113
1114
if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
1115
(issued & CEPH_CAP_LINK_EXCL) == 0)
1116
set_nlink(inode, le32_to_cpu(info->nlink));
1117
1118
if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
1119
/* be careful with mtime, atime, size */
1120
ceph_decode_timespec64(&atime, &info->atime);
1121
ceph_decode_timespec64(&mtime, &info->mtime);
1122
ceph_decode_timespec64(&ctime, &info->ctime);
1123
ceph_fill_file_time(inode, issued,
1124
le32_to_cpu(info->time_warp_seq),
1125
&ctime, &mtime, &atime);
1126
}
1127
1128
if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
1129
ci->i_files = le64_to_cpu(info->files);
1130
ci->i_subdirs = le64_to_cpu(info->subdirs);
1131
}
1132
1133
if (new_version ||
1134
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
1135
u64 size = le64_to_cpu(info->size);
1136
s64 old_pool = ci->i_layout.pool_id;
1137
struct ceph_string *old_ns;
1138
1139
ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
1140
old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
1141
lockdep_is_held(&ci->i_ceph_lock));
1142
rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
1143
1144
if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
1145
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
1146
1147
pool_ns = old_ns;
1148
1149
if (IS_ENCRYPTED(inode) && size &&
1150
iinfo->fscrypt_file_len == sizeof(__le64)) {
1151
u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
1152
1153
if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
1154
size = fsize;
1155
} else {
1156
pr_warn_client(cl,
1157
"fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
1158
info->size, size);
1159
}
1160
}
1161
1162
queue_trunc = ceph_fill_file_size(inode, issued,
1163
le32_to_cpu(info->truncate_seq),
1164
le64_to_cpu(info->truncate_size),
1165
size);
1166
/* only update max_size on auth cap */
1167
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1168
ci->i_max_size != le64_to_cpu(info->max_size)) {
1169
doutc(cl, "max_size %lld -> %llu\n",
1170
ci->i_max_size, le64_to_cpu(info->max_size));
1171
ci->i_max_size = le64_to_cpu(info->max_size);
1172
}
1173
}
1174
1175
/* layout and rstat are not tracked by capability, update them if
1176
* the inode info is from auth mds */
1177
if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
1178
if (S_ISDIR(inode->i_mode)) {
1179
ci->i_dir_layout = iinfo->dir_layout;
1180
ci->i_rbytes = le64_to_cpu(info->rbytes);
1181
ci->i_rfiles = le64_to_cpu(info->rfiles);
1182
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
1183
ci->i_dir_pin = iinfo->dir_pin;
1184
ci->i_rsnaps = iinfo->rsnaps;
1185
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
1186
}
1187
}
1188
1189
/* xattrs */
1190
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
1191
if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
1192
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
1193
if (ci->i_xattrs.blob)
1194
old_blob = ci->i_xattrs.blob;
1195
ci->i_xattrs.blob = xattr_blob;
1196
if (xattr_blob)
1197
memcpy(ci->i_xattrs.blob->vec.iov_base,
1198
iinfo->xattr_data, iinfo->xattr_len);
1199
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
1200
ceph_forget_all_cached_acls(inode);
1201
ceph_security_invalidate_secctx(inode);
1202
xattr_blob = NULL;
1203
}
1204
1205
/* finally update i_version */
1206
if (le64_to_cpu(info->version) > ci->i_version)
1207
ci->i_version = le64_to_cpu(info->version);
1208
1209
inode->i_mapping->a_ops = &ceph_aops;
1210
1211
switch (inode->i_mode & S_IFMT) {
1212
case S_IFIFO:
1213
case S_IFBLK:
1214
case S_IFCHR:
1215
case S_IFSOCK:
1216
inode->i_blkbits = PAGE_SHIFT;
1217
init_special_inode(inode, inode->i_mode, rdev);
1218
inode->i_op = &ceph_file_iops;
1219
break;
1220
case S_IFREG:
1221
inode->i_op = &ceph_file_iops;
1222
inode->i_fop = &ceph_file_fops;
1223
break;
1224
case S_IFLNK:
1225
if (!ci->i_symlink) {
1226
u32 symlen = iinfo->symlink_len;
1227
char *sym;
1228
1229
spin_unlock(&ci->i_ceph_lock);
1230
1231
if (IS_ENCRYPTED(inode)) {
1232
if (symlen != i_size_read(inode))
1233
pr_err_client(cl,
1234
"%p %llx.%llx BAD symlink size %lld\n",
1235
inode, ceph_vinop(inode),
1236
i_size_read(inode));
1237
1238
err = decode_encrypted_symlink(mdsc, iinfo->symlink,
1239
symlen, (u8 **)&sym);
1240
if (err < 0) {
1241
pr_err_client(cl,
1242
"decoding encrypted symlink failed: %d\n",
1243
err);
1244
goto out;
1245
}
1246
symlen = err;
1247
i_size_write(inode, symlen);
1248
inode->i_blocks = calc_inode_blocks(symlen);
1249
} else {
1250
if (symlen != i_size_read(inode)) {
1251
pr_err_client(cl,
1252
"%p %llx.%llx BAD symlink size %lld\n",
1253
inode, ceph_vinop(inode),
1254
i_size_read(inode));
1255
i_size_write(inode, symlen);
1256
inode->i_blocks = calc_inode_blocks(symlen);
1257
}
1258
1259
err = -ENOMEM;
1260
sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
1261
if (!sym)
1262
goto out;
1263
}
1264
1265
spin_lock(&ci->i_ceph_lock);
1266
if (!ci->i_symlink)
1267
ci->i_symlink = sym;
1268
else
1269
kfree(sym); /* lost a race */
1270
}
1271
1272
if (IS_ENCRYPTED(inode)) {
1273
/*
1274
* Encrypted symlinks need to be decrypted before we can
1275
* cache their targets in i_link. Don't touch it here.
1276
*/
1277
inode->i_op = &ceph_encrypted_symlink_iops;
1278
} else {
1279
inode->i_link = ci->i_symlink;
1280
inode->i_op = &ceph_symlink_iops;
1281
}
1282
break;
1283
case S_IFDIR:
1284
inode->i_op = &ceph_dir_iops;
1285
inode->i_fop = &ceph_dir_fops;
1286
break;
1287
default:
1288
pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
1289
ceph_vinop(inode), inode->i_mode);
1290
}
1291
1292
/* were we issued a capability? */
1293
if (info_caps) {
1294
if (ceph_snap(inode) == CEPH_NOSNAP) {
1295
ceph_add_cap(inode, session,
1296
le64_to_cpu(info->cap.cap_id),
1297
info_caps,
1298
le32_to_cpu(info->cap.wanted),
1299
le32_to_cpu(info->cap.seq),
1300
le32_to_cpu(info->cap.mseq),
1301
le64_to_cpu(info->cap.realm),
1302
info->cap.flags, &new_cap);
1303
1304
/* set dir completion flag? */
1305
if (S_ISDIR(inode->i_mode) &&
1306
ci->i_files == 0 && ci->i_subdirs == 0 &&
1307
(info_caps & CEPH_CAP_FILE_SHARED) &&
1308
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
1309
!__ceph_dir_is_complete(ci)) {
1310
doutc(cl, " marking %p complete (empty)\n",
1311
inode);
1312
i_size_write(inode, 0);
1313
__ceph_dir_set_complete(ci,
1314
atomic64_read(&ci->i_release_count),
1315
atomic64_read(&ci->i_ordered_count));
1316
}
1317
1318
wake = true;
1319
} else {
1320
doutc(cl, " %p got snap_caps %s\n", inode,
1321
ceph_cap_string(info_caps));
1322
ci->i_snap_caps |= info_caps;
1323
}
1324
}
1325
1326
if (iinfo->inline_version > 0 &&
1327
iinfo->inline_version >= ci->i_inline_version) {
1328
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1329
ci->i_inline_version = iinfo->inline_version;
1330
if (ceph_has_inline_data(ci) &&
1331
(locked_page || (info_caps & cache_caps)))
1332
fill_inline = true;
1333
}
1334
1335
if (cap_fmode >= 0) {
1336
if (!info_caps)
1337
pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
1338
ceph_vinop(inode));
1339
__ceph_touch_fmode(ci, mdsc, cap_fmode);
1340
}
1341
1342
spin_unlock(&ci->i_ceph_lock);
1343
1344
ceph_fscache_register_inode_cookie(inode);
1345
1346
if (fill_inline)
1347
ceph_fill_inline_data(inode, locked_page,
1348
iinfo->inline_data, iinfo->inline_len);
1349
1350
if (wake)
1351
wake_up_all(&ci->i_cap_wq);
1352
1353
/* queue truncate if we saw i_size decrease */
1354
if (queue_trunc)
1355
ceph_queue_vmtruncate(inode);
1356
1357
/* populate frag tree */
1358
if (S_ISDIR(inode->i_mode))
1359
ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
1360
1361
/* update delegation info? */
1362
if (dirinfo)
1363
ceph_fill_dirfrag(inode, dirinfo);
1364
1365
err = 0;
1366
out:
1367
if (new_cap)
1368
ceph_put_cap(mdsc, new_cap);
1369
ceph_buffer_put(old_blob);
1370
ceph_buffer_put(xattr_blob);
1371
ceph_put_string(pool_ns);
1372
return err;
1373
}
1374
1375
/*
1376
* caller should hold session s_mutex and dentry->d_lock.
1377
*/
1378
static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
1379
struct ceph_mds_reply_lease *lease,
1380
struct ceph_mds_session *session,
1381
unsigned long from_time,
1382
struct ceph_mds_session **old_lease_session)
1383
{
1384
struct ceph_client *cl = ceph_inode_to_client(dir);
1385
struct ceph_dentry_info *di = ceph_dentry(dentry);
1386
unsigned mask = le16_to_cpu(lease->mask);
1387
long unsigned duration = le32_to_cpu(lease->duration_ms);
1388
long unsigned ttl = from_time + (duration * HZ) / 1000;
1389
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
1390
1391
doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
1392
1393
/* only track leases on regular dentries */
1394
if (ceph_snap(dir) != CEPH_NOSNAP)
1395
return;
1396
1397
if (mask & CEPH_LEASE_PRIMARY_LINK)
1398
di->flags |= CEPH_DENTRY_PRIMARY_LINK;
1399
else
1400
di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1401
1402
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
1403
if (!(mask & CEPH_LEASE_VALID)) {
1404
__ceph_dentry_dir_lease_touch(di);
1405
return;
1406
}
1407
1408
if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
1409
time_before(ttl, di->time))
1410
return; /* we already have a newer lease. */
1411
1412
if (di->lease_session && di->lease_session != session) {
1413
*old_lease_session = di->lease_session;
1414
di->lease_session = NULL;
1415
}
1416
1417
if (!di->lease_session)
1418
di->lease_session = ceph_get_mds_session(session);
1419
di->lease_gen = atomic_read(&session->s_cap_gen);
1420
di->lease_seq = le32_to_cpu(lease->seq);
1421
di->lease_renew_after = half_ttl;
1422
di->lease_renew_from = 0;
1423
di->time = ttl;
1424
1425
__ceph_dentry_lease_touch(di);
1426
}
1427
1428
static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
1429
struct ceph_mds_reply_lease *lease,
1430
struct ceph_mds_session *session,
1431
unsigned long from_time)
1432
{
1433
struct ceph_mds_session *old_lease_session = NULL;
1434
spin_lock(&dentry->d_lock);
1435
__update_dentry_lease(dir, dentry, lease, session, from_time,
1436
&old_lease_session);
1437
spin_unlock(&dentry->d_lock);
1438
ceph_put_mds_session(old_lease_session);
1439
}
1440
1441
/*
1442
* update dentry lease without having parent inode locked
1443
*/
1444
static void update_dentry_lease_careful(struct dentry *dentry,
1445
struct ceph_mds_reply_lease *lease,
1446
struct ceph_mds_session *session,
1447
unsigned long from_time,
1448
char *dname, u32 dname_len,
1449
struct ceph_vino *pdvino,
1450
struct ceph_vino *ptvino)
1451
1452
{
1453
struct inode *dir;
1454
struct ceph_mds_session *old_lease_session = NULL;
1455
1456
spin_lock(&dentry->d_lock);
1457
/* make sure dentry's name matches target */
1458
if (dentry->d_name.len != dname_len ||
1459
memcmp(dentry->d_name.name, dname, dname_len))
1460
goto out_unlock;
1461
1462
dir = d_inode(dentry->d_parent);
1463
/* make sure parent matches dvino */
1464
if (!ceph_ino_compare(dir, pdvino))
1465
goto out_unlock;
1466
1467
/* make sure dentry's inode matches target. NULL ptvino means that
1468
* we expect a negative dentry */
1469
if (ptvino) {
1470
if (d_really_is_negative(dentry))
1471
goto out_unlock;
1472
if (!ceph_ino_compare(d_inode(dentry), ptvino))
1473
goto out_unlock;
1474
} else {
1475
if (d_really_is_positive(dentry))
1476
goto out_unlock;
1477
}
1478
1479
__update_dentry_lease(dir, dentry, lease, session,
1480
from_time, &old_lease_session);
1481
out_unlock:
1482
spin_unlock(&dentry->d_lock);
1483
ceph_put_mds_session(old_lease_session);
1484
}
1485
1486
/*
1487
* splice a dentry to an inode.
1488
* caller must hold directory i_rwsem for this to be safe.
1489
*/
1490
static int splice_dentry(struct dentry **pdn, struct inode *in)
1491
{
1492
struct ceph_client *cl = ceph_inode_to_client(in);
1493
struct dentry *dn = *pdn;
1494
struct dentry *realdn;
1495
1496
BUG_ON(d_inode(dn));
1497
1498
if (S_ISDIR(in->i_mode)) {
1499
/* If inode is directory, d_splice_alias() below will remove
1500
* 'realdn' from its origin parent. We need to ensure that
1501
* origin parent's readdir cache will not reference 'realdn'
1502
*/
1503
realdn = d_find_any_alias(in);
1504
if (realdn) {
1505
struct ceph_dentry_info *di = ceph_dentry(realdn);
1506
spin_lock(&realdn->d_lock);
1507
1508
realdn->d_op->d_prune(realdn);
1509
1510
di->time = jiffies;
1511
di->lease_shared_gen = 0;
1512
di->offset = 0;
1513
1514
spin_unlock(&realdn->d_lock);
1515
dput(realdn);
1516
}
1517
}
1518
1519
/* dn must be unhashed */
1520
if (!d_unhashed(dn))
1521
d_drop(dn);
1522
realdn = d_splice_alias(in, dn);
1523
if (IS_ERR(realdn)) {
1524
pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
1525
PTR_ERR(realdn), dn, in, ceph_vinop(in));
1526
return PTR_ERR(realdn);
1527
}
1528
1529
if (realdn) {
1530
doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
1531
dn, d_count(dn), realdn, d_count(realdn),
1532
d_inode(realdn), ceph_vinop(d_inode(realdn)));
1533
dput(dn);
1534
*pdn = realdn;
1535
} else {
1536
BUG_ON(!ceph_dentry(dn));
1537
doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
1538
d_inode(dn), ceph_vinop(d_inode(dn)));
1539
}
1540
return 0;
1541
}
1542
1543
/*
1544
* Incorporate results into the local cache. This is either just
1545
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
1546
* after a lookup).
1547
*
1548
* A reply may contain
1549
* a directory inode along with a dentry.
1550
* and/or a target inode
1551
*
1552
* Called with snap_rwsem (read).
1553
*/
1554
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
1555
{
1556
struct ceph_mds_session *session = req->r_session;
1557
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1558
struct inode *in = NULL;
1559
struct ceph_vino tvino, dvino;
1560
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1561
struct ceph_client *cl = fsc->client;
1562
struct inode *parent_dir = NULL;
1563
int err = 0;
1564
1565
doutc(cl, "%p is_dentry %d is_target %d\n", req,
1566
rinfo->head->is_dentry, rinfo->head->is_target);
1567
1568
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
1569
doutc(cl, "reply is empty!\n");
1570
if (rinfo->head->result == 0 && req->r_parent)
1571
ceph_invalidate_dir_request(req);
1572
return 0;
1573
}
1574
1575
if (rinfo->head->is_dentry) {
1576
/*
1577
* r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
1578
* so we need to get the correct inode
1579
*/
1580
parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
1581
if (unlikely(IS_ERR(parent_dir))) {
1582
err = PTR_ERR(parent_dir);
1583
goto done;
1584
}
1585
if (parent_dir) {
1586
err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
1587
rinfo->dirfrag, session, -1,
1588
&req->r_caps_reservation);
1589
if (err < 0)
1590
goto done;
1591
} else {
1592
WARN_ON_ONCE(1);
1593
}
1594
1595
if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1596
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1597
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1598
bool is_nokey = false;
1599
struct qstr dname;
1600
struct dentry *dn, *parent;
1601
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
1602
struct ceph_fname fname = { .dir = parent_dir,
1603
.name = rinfo->dname,
1604
.ctext = rinfo->altname,
1605
.name_len = rinfo->dname_len,
1606
.ctext_len = rinfo->altname_len };
1607
1608
BUG_ON(!rinfo->head->is_target);
1609
BUG_ON(req->r_dentry);
1610
1611
parent = d_find_any_alias(parent_dir);
1612
BUG_ON(!parent);
1613
1614
err = ceph_fname_alloc_buffer(parent_dir, &oname);
1615
if (err < 0) {
1616
dput(parent);
1617
goto done;
1618
}
1619
1620
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
1621
if (err < 0) {
1622
dput(parent);
1623
ceph_fname_free_buffer(parent_dir, &oname);
1624
goto done;
1625
}
1626
dname.name = oname.name;
1627
dname.len = oname.len;
1628
dname.hash = full_name_hash(parent, dname.name, dname.len);
1629
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1630
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1631
retry_lookup:
1632
dn = d_lookup(parent, &dname);
1633
doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
1634
parent, dname.len, dname.name, dn);
1635
1636
if (!dn) {
1637
dn = d_alloc(parent, &dname);
1638
doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
1639
dname.len, dname.name, dn);
1640
if (!dn) {
1641
dput(parent);
1642
ceph_fname_free_buffer(parent_dir, &oname);
1643
err = -ENOMEM;
1644
goto done;
1645
}
1646
if (is_nokey) {
1647
spin_lock(&dn->d_lock);
1648
dn->d_flags |= DCACHE_NOKEY_NAME;
1649
spin_unlock(&dn->d_lock);
1650
}
1651
err = 0;
1652
} else if (d_really_is_positive(dn) &&
1653
(ceph_ino(d_inode(dn)) != tvino.ino ||
1654
ceph_snap(d_inode(dn)) != tvino.snap)) {
1655
doutc(cl, " dn %p points to wrong inode %p\n",
1656
dn, d_inode(dn));
1657
ceph_dir_clear_ordered(parent_dir);
1658
d_delete(dn);
1659
dput(dn);
1660
goto retry_lookup;
1661
}
1662
ceph_fname_free_buffer(parent_dir, &oname);
1663
1664
req->r_dentry = dn;
1665
dput(parent);
1666
}
1667
}
1668
1669
if (rinfo->head->is_target) {
1670
/* Should be filled in by handle_reply */
1671
BUG_ON(!req->r_target_inode);
1672
1673
in = req->r_target_inode;
1674
err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
1675
NULL, session,
1676
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1677
!test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
1678
rinfo->head->result == 0) ? req->r_fmode : -1,
1679
&req->r_caps_reservation);
1680
if (err < 0) {
1681
pr_err_client(cl, "badness %p %llx.%llx\n", in,
1682
ceph_vinop(in));
1683
req->r_target_inode = NULL;
1684
if (inode_state_read_once(in) & I_NEW)
1685
discard_new_inode(in);
1686
else
1687
iput(in);
1688
goto done;
1689
}
1690
if (inode_state_read_once(in) & I_NEW)
1691
unlock_new_inode(in);
1692
}
1693
1694
/*
1695
* ignore null lease/binding on snapdir ENOENT, or else we
1696
* will have trouble splicing in the virtual snapdir later
1697
*/
1698
if (rinfo->head->is_dentry &&
1699
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1700
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1701
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
1702
fsc->mount_options->snapdir_name,
1703
req->r_dentry->d_name.len))) {
1704
/*
1705
* lookup link rename : null -> possibly existing inode
1706
* mknod symlink mkdir : null -> new inode
1707
* unlink : linked -> null
1708
*/
1709
struct inode *dir = req->r_parent;
1710
struct dentry *dn = req->r_dentry;
1711
bool have_dir_cap, have_lease;
1712
1713
BUG_ON(!dn);
1714
BUG_ON(!dir);
1715
BUG_ON(d_inode(dn->d_parent) != dir);
1716
1717
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1718
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1719
1720
BUG_ON(ceph_ino(dir) != dvino.ino);
1721
BUG_ON(ceph_snap(dir) != dvino.snap);
1722
1723
/* do we have a lease on the whole dir? */
1724
have_dir_cap =
1725
(le32_to_cpu(rinfo->diri.in->cap.caps) &
1726
CEPH_CAP_FILE_SHARED);
1727
1728
/* do we have a dn lease? */
1729
have_lease = have_dir_cap ||
1730
le32_to_cpu(rinfo->dlease->duration_ms);
1731
if (!have_lease)
1732
doutc(cl, "no dentry lease or dir cap\n");
1733
1734
/* rename? */
1735
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1736
struct inode *olddir = req->r_old_dentry_dir;
1737
BUG_ON(!olddir);
1738
1739
doutc(cl, " src %p '%pd' dst %p '%pd'\n",
1740
req->r_old_dentry, req->r_old_dentry, dn, dn);
1741
doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
1742
1743
/* d_move screws up sibling dentries' offsets */
1744
ceph_dir_clear_ordered(dir);
1745
ceph_dir_clear_ordered(olddir);
1746
1747
d_move(req->r_old_dentry, dn);
1748
doutc(cl, " src %p '%pd' dst %p '%pd'\n",
1749
req->r_old_dentry, req->r_old_dentry, dn, dn);
1750
1751
/* ensure target dentry is invalidated, despite
1752
rehashing bug in vfs_rename_dir */
1753
ceph_invalidate_dentry_lease(dn);
1754
1755
doutc(cl, "dn %p gets new offset %lld\n",
1756
req->r_old_dentry,
1757
ceph_dentry(req->r_old_dentry)->offset);
1758
1759
/* swap r_dentry and r_old_dentry in case that
1760
* splice_dentry() gets called later. This is safe
1761
* because no other place will use them */
1762
req->r_dentry = req->r_old_dentry;
1763
req->r_old_dentry = dn;
1764
dn = req->r_dentry;
1765
}
1766
1767
/* null dentry? */
1768
if (!rinfo->head->is_target) {
1769
doutc(cl, "null dentry\n");
1770
if (d_really_is_positive(dn)) {
1771
doutc(cl, "d_delete %p\n", dn);
1772
ceph_dir_clear_ordered(dir);
1773
d_delete(dn);
1774
} else if (have_lease) {
1775
if (d_unhashed(dn))
1776
d_add(dn, NULL);
1777
}
1778
1779
if (!d_unhashed(dn) && have_lease)
1780
update_dentry_lease(dir, dn,
1781
rinfo->dlease, session,
1782
req->r_request_started);
1783
goto done;
1784
}
1785
1786
if (unlikely(!in)) {
1787
err = -EINVAL;
1788
goto done;
1789
}
1790
1791
/* attach proper inode */
1792
if (d_really_is_negative(dn)) {
1793
ceph_dir_clear_ordered(dir);
1794
ihold(in);
1795
err = splice_dentry(&req->r_dentry, in);
1796
if (err < 0)
1797
goto done;
1798
dn = req->r_dentry; /* may have spliced */
1799
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
1800
doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
1801
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
1802
ceph_vinop(in));
1803
d_invalidate(dn);
1804
have_lease = false;
1805
}
1806
1807
if (have_lease) {
1808
update_dentry_lease(dir, dn,
1809
rinfo->dlease, session,
1810
req->r_request_started);
1811
}
1812
doutc(cl, " final dn %p\n", dn);
1813
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1814
req->r_op == CEPH_MDS_OP_MKSNAP) &&
1815
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1816
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1817
struct inode *dir = req->r_parent;
1818
1819
/* fill out a snapdir LOOKUPSNAP dentry */
1820
BUG_ON(!dir);
1821
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1822
BUG_ON(!req->r_dentry);
1823
doutc(cl, " linking snapped dir %p to dn %p\n", in,
1824
req->r_dentry);
1825
ceph_dir_clear_ordered(dir);
1826
1827
if (unlikely(!in)) {
1828
err = -EINVAL;
1829
goto done;
1830
}
1831
1832
ihold(in);
1833
err = splice_dentry(&req->r_dentry, in);
1834
if (err < 0)
1835
goto done;
1836
} else if (rinfo->head->is_dentry && req->r_dentry) {
1837
/* parent inode is not locked, be careful */
1838
struct ceph_vino *ptvino = NULL;
1839
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1840
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1841
if (rinfo->head->is_target) {
1842
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1843
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1844
ptvino = &tvino;
1845
}
1846
update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
1847
session, req->r_request_started,
1848
rinfo->dname, rinfo->dname_len,
1849
&dvino, ptvino);
1850
}
1851
done:
1852
/* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
1853
if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
1854
iput(parent_dir);
1855
doutc(cl, "done err=%d\n", err);
1856
return err;
1857
}
1858
1859
/*
1860
* Prepopulate our cache with readdir results, leases, etc.
1861
*/
1862
static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1863
struct ceph_mds_session *session)
1864
{
1865
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1866
struct ceph_client *cl = session->s_mdsc->fsc->client;
1867
int i, err = 0;
1868
1869
for (i = 0; i < rinfo->dir_nr; i++) {
1870
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1871
struct ceph_vino vino;
1872
struct inode *in;
1873
int rc;
1874
1875
vino.ino = le64_to_cpu(rde->inode.in->ino);
1876
vino.snap = le64_to_cpu(rde->inode.in->snapid);
1877
1878
in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
1879
if (IS_ERR(in)) {
1880
err = PTR_ERR(in);
1881
doutc(cl, "badness got %d\n", err);
1882
continue;
1883
}
1884
rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
1885
-1, &req->r_caps_reservation);
1886
if (rc < 0) {
1887
pr_err_client(cl, "inode badness on %p got %d\n", in,
1888
rc);
1889
err = rc;
1890
if (inode_state_read_once(in) & I_NEW) {
1891
ihold(in);
1892
discard_new_inode(in);
1893
}
1894
} else if (inode_state_read_once(in) & I_NEW) {
1895
unlock_new_inode(in);
1896
}
1897
1898
iput(in);
1899
}
1900
1901
return err;
1902
}
1903
1904
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
1905
{
1906
if (ctl->folio) {
1907
folio_release_kmap(ctl->folio, ctl->dentries);
1908
ctl->folio = NULL;
1909
}
1910
}
1911
1912
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
1913
struct ceph_readdir_cache_control *ctl,
1914
struct ceph_mds_request *req)
1915
{
1916
struct ceph_client *cl = ceph_inode_to_client(dir);
1917
struct ceph_inode_info *ci = ceph_inode(dir);
1918
unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
1919
unsigned idx = ctl->index % nsize;
1920
pgoff_t pgoff = ctl->index / nsize;
1921
1922
if (!ctl->folio || pgoff != ctl->folio->index) {
1923
ceph_readdir_cache_release(ctl);
1924
fgf_t fgf = FGP_LOCK;
1925
1926
if (idx == 0)
1927
fgf |= FGP_ACCESSED | FGP_CREAT;
1928
1929
ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
1930
fgf, mapping_gfp_mask(&dir->i_data));
1931
if (IS_ERR(ctl->folio)) {
1932
int err = PTR_ERR(ctl->folio);
1933
1934
ctl->folio = NULL;
1935
ctl->index = -1;
1936
return idx == 0 ? err : 0;
1937
}
1938
/* reading/filling the cache are serialized by
1939
* i_rwsem, no need to use folio lock */
1940
folio_unlock(ctl->folio);
1941
ctl->dentries = kmap_local_folio(ctl->folio, 0);
1942
if (idx == 0)
1943
memset(ctl->dentries, 0, PAGE_SIZE);
1944
}
1945
1946
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
1947
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
1948
doutc(cl, "dn %p idx %d\n", dn, ctl->index);
1949
ctl->dentries[idx] = dn;
1950
ctl->index++;
1951
} else {
1952
doutc(cl, "disable readdir cache\n");
1953
ctl->index = -1;
1954
}
1955
return 0;
1956
}
1957
1958
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1959
struct ceph_mds_session *session)
1960
{
1961
struct dentry *parent = req->r_dentry;
1962
struct inode *inode = d_inode(parent);
1963
struct ceph_inode_info *ci = ceph_inode(inode);
1964
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1965
struct ceph_client *cl = session->s_mdsc->fsc->client;
1966
struct qstr dname;
1967
struct dentry *dn;
1968
struct inode *in;
1969
int err = 0, skipped = 0, ret, i;
1970
u32 frag = le32_to_cpu(req->r_args.readdir.frag);
1971
u32 last_hash = 0;
1972
u32 fpos_offset;
1973
struct ceph_readdir_cache_control cache_ctl = {};
1974
1975
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
1976
return readdir_prepopulate_inodes_only(req, session);
1977
1978
if (rinfo->hash_order) {
1979
if (req->r_path2) {
1980
last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1981
req->r_path2,
1982
strlen(req->r_path2));
1983
last_hash = ceph_frag_value(last_hash);
1984
} else if (rinfo->offset_hash) {
1985
/* mds understands offset_hash */
1986
WARN_ON_ONCE(req->r_readdir_offset != 2);
1987
last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
1988
}
1989
}
1990
1991
if (rinfo->dir_dir &&
1992
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1993
doutc(cl, "got new frag %x -> %x\n", frag,
1994
le32_to_cpu(rinfo->dir_dir->frag));
1995
frag = le32_to_cpu(rinfo->dir_dir->frag);
1996
if (!rinfo->hash_order)
1997
req->r_readdir_offset = 2;
1998
}
1999
2000
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
2001
doutc(cl, "%d items under SNAPDIR dn %p\n",
2002
rinfo->dir_nr, parent);
2003
} else {
2004
doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
2005
if (rinfo->dir_dir)
2006
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
2007
2008
if (ceph_frag_is_leftmost(frag) &&
2009
req->r_readdir_offset == 2 &&
2010
!(rinfo->hash_order && last_hash)) {
2011
/* note dir version at start of readdir so we can
2012
* tell if any dentries get dropped */
2013
req->r_dir_release_cnt =
2014
atomic64_read(&ci->i_release_count);
2015
req->r_dir_ordered_cnt =
2016
atomic64_read(&ci->i_ordered_count);
2017
req->r_readdir_cache_idx = 0;
2018
}
2019
}
2020
2021
cache_ctl.index = req->r_readdir_cache_idx;
2022
fpos_offset = req->r_readdir_offset;
2023
2024
/* FIXME: release caps/leases if error occurs */
2025
for (i = 0; i < rinfo->dir_nr; i++) {
2026
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
2027
struct ceph_vino tvino;
2028
2029
dname.name = rde->name;
2030
dname.len = rde->name_len;
2031
dname.hash = full_name_hash(parent, dname.name, dname.len);
2032
2033
tvino.ino = le64_to_cpu(rde->inode.in->ino);
2034
tvino.snap = le64_to_cpu(rde->inode.in->snapid);
2035
2036
if (rinfo->hash_order) {
2037
u32 hash = ceph_frag_value(rde->raw_hash);
2038
if (hash != last_hash)
2039
fpos_offset = 2;
2040
last_hash = hash;
2041
rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
2042
} else {
2043
rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
2044
}
2045
2046
retry_lookup:
2047
dn = d_lookup(parent, &dname);
2048
doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
2049
parent, dname.len, dname.name, dn);
2050
2051
if (!dn) {
2052
dn = d_alloc(parent, &dname);
2053
doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
2054
dname.len, dname.name, dn);
2055
if (!dn) {
2056
doutc(cl, "d_alloc badness\n");
2057
err = -ENOMEM;
2058
goto out;
2059
}
2060
if (rde->is_nokey) {
2061
spin_lock(&dn->d_lock);
2062
dn->d_flags |= DCACHE_NOKEY_NAME;
2063
spin_unlock(&dn->d_lock);
2064
}
2065
} else if (d_really_is_positive(dn) &&
2066
(ceph_ino(d_inode(dn)) != tvino.ino ||
2067
ceph_snap(d_inode(dn)) != tvino.snap)) {
2068
struct ceph_dentry_info *di = ceph_dentry(dn);
2069
doutc(cl, " dn %p points to wrong inode %p\n",
2070
dn, d_inode(dn));
2071
2072
spin_lock(&dn->d_lock);
2073
if (di->offset > 0 &&
2074
di->lease_shared_gen ==
2075
atomic_read(&ci->i_shared_gen)) {
2076
__ceph_dir_clear_ordered(ci);
2077
di->offset = 0;
2078
}
2079
spin_unlock(&dn->d_lock);
2080
2081
d_delete(dn);
2082
dput(dn);
2083
goto retry_lookup;
2084
}
2085
2086
/* inode */
2087
if (d_really_is_positive(dn)) {
2088
in = d_inode(dn);
2089
} else {
2090
in = ceph_get_inode(parent->d_sb, tvino, NULL);
2091
if (IS_ERR(in)) {
2092
doutc(cl, "new_inode badness\n");
2093
d_drop(dn);
2094
dput(dn);
2095
err = PTR_ERR(in);
2096
goto out;
2097
}
2098
}
2099
2100
ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
2101
-1, &req->r_caps_reservation);
2102
if (ret < 0) {
2103
pr_err_client(cl, "badness on %p %llx.%llx\n", in,
2104
ceph_vinop(in));
2105
if (d_really_is_negative(dn)) {
2106
if (inode_state_read_once(in) & I_NEW) {
2107
ihold(in);
2108
discard_new_inode(in);
2109
}
2110
iput(in);
2111
}
2112
d_drop(dn);
2113
err = ret;
2114
goto next_item;
2115
}
2116
if (inode_state_read_once(in) & I_NEW)
2117
unlock_new_inode(in);
2118
2119
if (d_really_is_negative(dn)) {
2120
if (ceph_security_xattr_deadlock(in)) {
2121
doutc(cl, " skip splicing dn %p to inode %p"
2122
" (security xattr deadlock)\n", dn, in);
2123
iput(in);
2124
skipped++;
2125
goto next_item;
2126
}
2127
2128
err = splice_dentry(&dn, in);
2129
if (err < 0)
2130
goto next_item;
2131
}
2132
2133
ceph_dentry(dn)->offset = rde->offset;
2134
2135
update_dentry_lease(d_inode(parent), dn,
2136
rde->lease, req->r_session,
2137
req->r_request_started);
2138
2139
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
2140
ret = fill_readdir_cache(d_inode(parent), dn,
2141
&cache_ctl, req);
2142
if (ret < 0)
2143
err = ret;
2144
}
2145
next_item:
2146
dput(dn);
2147
}
2148
out:
2149
if (err == 0 && skipped == 0) {
2150
set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
2151
req->r_readdir_cache_idx = cache_ctl.index;
2152
}
2153
ceph_readdir_cache_release(&cache_ctl);
2154
doutc(cl, "done\n");
2155
return err;
2156
}
2157
2158
/*
 * Install @size as the in-core size of @inode.
 *
 * With i_ceph_lock held, writes the new i_size, calls
 * ceph_fscache_update() and recomputes i_blocks from the new size.
 *
 * Returns the verdict of __ceph_should_report_size(), evaluated after
 * the new size has been stored and before the lock is dropped.
 */
bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	bool report;

	spin_lock(&ci->i_ceph_lock);

	doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
	i_size_write(inode, size);
	ceph_fscache_update(inode);
	inode->i_blocks = calc_inode_blocks(size);
	report = __ceph_should_report_size(ci);

	spin_unlock(&ci->i_ceph_lock);

	return report;
}
2176
2177
void ceph_queue_inode_work(struct inode *inode, int work_bit)
2178
{
2179
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2180
struct ceph_client *cl = fsc->client;
2181
struct ceph_inode_info *ci = ceph_inode(inode);
2182
set_bit(work_bit, &ci->i_work_mask);
2183
2184
ihold(inode);
2185
if (queue_work(fsc->inode_wq, &ci->i_work)) {
2186
doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
2187
ceph_vinop(inode), ci->i_work_mask);
2188
} else {
2189
doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
2190
inode, ceph_vinop(inode), ci->i_work_mask);
2191
iput(inode);
2192
}
2193
}
2194
2195
static void ceph_do_invalidate_pages(struct inode *inode)
2196
{
2197
struct ceph_client *cl = ceph_inode_to_client(inode);
2198
struct ceph_inode_info *ci = ceph_inode(inode);
2199
u32 orig_gen;
2200
int check = 0;
2201
2202
ceph_fscache_invalidate(inode, false);
2203
2204
mutex_lock(&ci->i_truncate_mutex);
2205
2206
if (ceph_inode_is_shutdown(inode)) {
2207
pr_warn_ratelimited_client(cl,
2208
"%p %llx.%llx is shut down\n", inode,
2209
ceph_vinop(inode));
2210
mapping_set_error(inode->i_mapping, -EIO);
2211
truncate_pagecache(inode, 0);
2212
mutex_unlock(&ci->i_truncate_mutex);
2213
goto out;
2214
}
2215
2216
spin_lock(&ci->i_ceph_lock);
2217
doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
2218
ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
2219
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2220
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2221
check = 1;
2222
spin_unlock(&ci->i_ceph_lock);
2223
mutex_unlock(&ci->i_truncate_mutex);
2224
goto out;
2225
}
2226
orig_gen = ci->i_rdcache_gen;
2227
spin_unlock(&ci->i_ceph_lock);
2228
2229
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
2230
pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
2231
ceph_vinop(inode));
2232
}
2233
2234
spin_lock(&ci->i_ceph_lock);
2235
if (orig_gen == ci->i_rdcache_gen &&
2236
orig_gen == ci->i_rdcache_revoking) {
2237
doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
2238
ceph_vinop(inode), ci->i_rdcache_gen);
2239
ci->i_rdcache_revoking--;
2240
check = 1;
2241
} else {
2242
doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
2243
inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
2244
ci->i_rdcache_revoking);
2245
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2246
check = 1;
2247
}
2248
spin_unlock(&ci->i_ceph_lock);
2249
mutex_unlock(&ci->i_truncate_mutex);
2250
out:
2251
if (check)
2252
ceph_check_caps(ci, 0);
2253
}
2254
2255
/*
2256
* Make sure any pending truncation is applied before doing anything
2257
* that may depend on it.
2258
*/
2259
void __ceph_do_pending_vmtruncate(struct inode *inode)
2260
{
2261
struct ceph_client *cl = ceph_inode_to_client(inode);
2262
struct ceph_inode_info *ci = ceph_inode(inode);
2263
u64 to;
2264
int wrbuffer_refs, finish = 0;
2265
2266
mutex_lock(&ci->i_truncate_mutex);
2267
retry:
2268
spin_lock(&ci->i_ceph_lock);
2269
if (ci->i_truncate_pending == 0) {
2270
doutc(cl, "%p %llx.%llx none pending\n", inode,
2271
ceph_vinop(inode));
2272
spin_unlock(&ci->i_ceph_lock);
2273
mutex_unlock(&ci->i_truncate_mutex);
2274
return;
2275
}
2276
2277
/*
2278
* make sure any dirty snapped pages are flushed before we
2279
* possibly truncate them.. so write AND block!
2280
*/
2281
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
2282
spin_unlock(&ci->i_ceph_lock);
2283
doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
2284
ceph_vinop(inode));
2285
filemap_write_and_wait_range(&inode->i_data, 0,
2286
inode->i_sb->s_maxbytes);
2287
goto retry;
2288
}
2289
2290
/* there should be no reader or writer */
2291
WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
2292
2293
to = ci->i_truncate_pagecache_size;
2294
wrbuffer_refs = ci->i_wrbuffer_ref;
2295
doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
2296
ci->i_truncate_pending, to);
2297
spin_unlock(&ci->i_ceph_lock);
2298
2299
ceph_fscache_resize(inode, to);
2300
truncate_pagecache(inode, to);
2301
2302
spin_lock(&ci->i_ceph_lock);
2303
if (to == ci->i_truncate_pagecache_size) {
2304
ci->i_truncate_pending = 0;
2305
finish = 1;
2306
}
2307
spin_unlock(&ci->i_ceph_lock);
2308
if (!finish)
2309
goto retry;
2310
2311
mutex_unlock(&ci->i_truncate_mutex);
2312
2313
if (wrbuffer_refs == 0)
2314
ceph_check_caps(ci, 0);
2315
2316
wake_up_all(&ci->i_cap_wq);
2317
}
2318
2319
static void ceph_inode_work(struct work_struct *work)
2320
{
2321
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
2322
i_work);
2323
struct inode *inode = &ci->netfs.inode;
2324
struct ceph_client *cl = ceph_inode_to_client(inode);
2325
2326
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
2327
doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
2328
filemap_fdatawrite(&inode->i_data);
2329
}
2330
if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
2331
ceph_do_invalidate_pages(inode);
2332
2333
if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
2334
__ceph_do_pending_vmtruncate(inode);
2335
2336
if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
2337
ceph_check_caps(ci, 0);
2338
2339
if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
2340
ceph_flush_snaps(ci, NULL);
2341
2342
iput(inode);
2343
}
2344
2345
static const char *ceph_encrypted_get_link(struct dentry *dentry,
2346
struct inode *inode,
2347
struct delayed_call *done)
2348
{
2349
struct ceph_inode_info *ci = ceph_inode(inode);
2350
2351
if (!dentry)
2352
return ERR_PTR(-ECHILD);
2353
2354
return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
2355
done);
2356
}
2357
2358
/*
 * ->getattr for ceph_encrypted_symlink_iops.
 *
 * Runs the regular ceph_getattr() first; on success the result is then
 * passed through fscrypt_symlink_getattr().  A ceph_getattr() error is
 * returned unchanged.
 */
static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
					  const struct path *path,
					  struct kstat *stat, u32 request_mask,
					  unsigned int query_flags)
{
	int ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);

	return ret ? ret : fscrypt_symlink_getattr(path, stat);
}
2370
2371
/*
2372
* symlinks
2373
*/
2374
static const struct inode_operations ceph_symlink_iops = {
2375
.get_link = simple_get_link,
2376
.setattr = ceph_setattr,
2377
.getattr = ceph_getattr,
2378
.listxattr = ceph_listxattr,
2379
};
2380
2381
static const struct inode_operations ceph_encrypted_symlink_iops = {
2382
.get_link = ceph_encrypted_get_link,
2383
.setattr = ceph_setattr,
2384
.getattr = ceph_encrypted_symlink_getattr,
2385
.listxattr = ceph_listxattr,
2386
};
2387
2388
/*
2389
* Transfer the encrypted last block to the MDS and the MDS
2390
* will help update it when truncating a smaller size.
2391
*
2392
* We don't support a PAGE_SIZE that is smaller than the
2393
* CEPH_FSCRYPT_BLOCK_SIZE.
2394
*/
2395
static int fill_fscrypt_truncate(struct inode *inode,
2396
struct ceph_mds_request *req,
2397
struct iattr *attr)
2398
{
2399
struct ceph_client *cl = ceph_inode_to_client(inode);
2400
struct ceph_inode_info *ci = ceph_inode(inode);
2401
int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
2402
loff_t pos, orig_pos = round_down(attr->ia_size,
2403
CEPH_FSCRYPT_BLOCK_SIZE);
2404
u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
2405
struct ceph_pagelist *pagelist = NULL;
2406
struct kvec iov = {0};
2407
struct iov_iter iter;
2408
struct page *page = NULL;
2409
struct ceph_fscrypt_truncate_size_header header;
2410
int retry_op = 0;
2411
int len = CEPH_FSCRYPT_BLOCK_SIZE;
2412
loff_t i_size = i_size_read(inode);
2413
int got, ret, issued;
2414
u64 objver;
2415
2416
ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
2417
if (ret < 0)
2418
return ret;
2419
2420
issued = __ceph_caps_issued(ci, NULL);
2421
2422
doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
2423
i_size, attr->ia_size, ceph_cap_string(got),
2424
ceph_cap_string(issued));
2425
2426
/* Try to writeback the dirty pagecaches */
2427
if (issued & (CEPH_CAP_FILE_BUFFER)) {
2428
loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
2429
2430
ret = filemap_write_and_wait_range(inode->i_mapping,
2431
orig_pos, lend);
2432
if (ret < 0)
2433
goto out;
2434
}
2435
2436
page = __page_cache_alloc(GFP_KERNEL);
2437
if (page == NULL) {
2438
ret = -ENOMEM;
2439
goto out;
2440
}
2441
2442
pagelist = ceph_pagelist_alloc(GFP_KERNEL);
2443
if (!pagelist) {
2444
ret = -ENOMEM;
2445
goto out;
2446
}
2447
2448
iov.iov_base = kmap_local_page(page);
2449
iov.iov_len = len;
2450
iov_iter_kvec(&iter, READ, &iov, 1, len);
2451
2452
pos = orig_pos;
2453
ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
2454
if (ret < 0)
2455
goto out;
2456
2457
/* Insert the header first */
2458
header.ver = 1;
2459
header.compat = 1;
2460
header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
2461
2462
/*
2463
* Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
2464
* because in MDS it may need this to do the truncate.
2465
*/
2466
header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
2467
2468
/*
2469
* If we hit a hole here, we should just skip filling
2470
* the fscrypt for the request, because once the fscrypt
2471
* is enabled, the file will be split into many blocks
2472
* with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
2473
* has a hole, the hole size should be multiple of block
2474
* size.
2475
*
2476
* If the Rados object doesn't exist, it will be set to 0.
2477
*/
2478
if (!objver) {
2479
doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
2480
2481
header.data_len = cpu_to_le32(8 + 8 + 4);
2482
header.file_offset = 0;
2483
ret = 0;
2484
} else {
2485
header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
2486
header.file_offset = cpu_to_le64(orig_pos);
2487
2488
doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
2489
CEPH_FSCRYPT_BLOCK_SIZE);
2490
2491
/* truncate and zero out the extra contents for the last block */
2492
memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
2493
2494
/* encrypt the last block */
2495
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
2496
CEPH_FSCRYPT_BLOCK_SIZE,
2497
0, block);
2498
if (ret)
2499
goto out;
2500
}
2501
2502
/* Insert the header */
2503
ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
2504
if (ret)
2505
goto out;
2506
2507
if (header.block_size) {
2508
/* Append the last block contents to pagelist */
2509
ret = ceph_pagelist_append(pagelist, iov.iov_base,
2510
CEPH_FSCRYPT_BLOCK_SIZE);
2511
if (ret)
2512
goto out;
2513
}
2514
req->r_pagelist = pagelist;
2515
out:
2516
doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
2517
ceph_vinop(inode), ceph_cap_string(got));
2518
ceph_put_cap_refs(ci, got);
2519
if (iov.iov_base)
2520
kunmap_local(iov.iov_base);
2521
if (page)
2522
__free_pages(page, 0);
2523
if (ret && pagelist)
2524
ceph_pagelist_release(pagelist);
2525
return ret;
2526
}
2527
2528
int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
2529
struct iattr *attr, struct ceph_iattr *cia)
2530
{
2531
struct ceph_inode_info *ci = ceph_inode(inode);
2532
unsigned int ia_valid = attr->ia_valid;
2533
struct ceph_mds_request *req;
2534
struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
2535
struct ceph_client *cl = ceph_inode_to_client(inode);
2536
struct ceph_cap_flush *prealloc_cf;
2537
loff_t isize = i_size_read(inode);
2538
int issued;
2539
int release = 0, dirtied = 0;
2540
int mask = 0;
2541
int err = 0;
2542
int inode_dirty_flags = 0;
2543
bool lock_snap_rwsem = false;
2544
bool fill_fscrypt;
2545
int truncate_retry = 20; /* The RMW will take around 50ms */
2546
struct dentry *dentry;
2547
char *path;
2548
bool do_sync = false;
2549
2550
dentry = d_find_alias(inode);
2551
if (!dentry) {
2552
do_sync = true;
2553
} else {
2554
struct ceph_path_info path_info;
2555
path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
2556
if (IS_ERR(path)) {
2557
do_sync = true;
2558
err = 0;
2559
} else {
2560
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
2561
}
2562
ceph_mdsc_free_path_info(&path_info);
2563
dput(dentry);
2564
2565
/* For none EACCES cases will let the MDS do the mds auth check */
2566
if (err == -EACCES) {
2567
return err;
2568
} else if (err < 0) {
2569
do_sync = true;
2570
err = 0;
2571
}
2572
}
2573
2574
retry:
2575
prealloc_cf = ceph_alloc_cap_flush();
2576
if (!prealloc_cf)
2577
return -ENOMEM;
2578
2579
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
2580
USE_AUTH_MDS);
2581
if (IS_ERR(req)) {
2582
ceph_free_cap_flush(prealloc_cf);
2583
return PTR_ERR(req);
2584
}
2585
2586
fill_fscrypt = false;
2587
spin_lock(&ci->i_ceph_lock);
2588
issued = __ceph_caps_issued(ci, NULL);
2589
2590
if (!ci->i_head_snapc &&
2591
(issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
2592
lock_snap_rwsem = true;
2593
if (!down_read_trylock(&mdsc->snap_rwsem)) {
2594
spin_unlock(&ci->i_ceph_lock);
2595
down_read(&mdsc->snap_rwsem);
2596
spin_lock(&ci->i_ceph_lock);
2597
issued = __ceph_caps_issued(ci, NULL);
2598
}
2599
}
2600
2601
doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
2602
ceph_cap_string(issued));
2603
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
2604
if (cia && cia->fscrypt_auth) {
2605
u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
2606
2607
if (len > sizeof(*cia->fscrypt_auth)) {
2608
err = -EINVAL;
2609
spin_unlock(&ci->i_ceph_lock);
2610
goto out;
2611
}
2612
2613
doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
2614
ceph_vinop(inode), ci->fscrypt_auth_len, len);
2615
2616
/* It should never be re-set once set */
2617
WARN_ON_ONCE(ci->fscrypt_auth);
2618
2619
if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2620
dirtied |= CEPH_CAP_AUTH_EXCL;
2621
kfree(ci->fscrypt_auth);
2622
ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
2623
ci->fscrypt_auth_len = len;
2624
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
2625
ci->fscrypt_auth_len != len ||
2626
memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
2627
req->r_fscrypt_auth = cia->fscrypt_auth;
2628
mask |= CEPH_SETATTR_FSCRYPT_AUTH;
2629
release |= CEPH_CAP_AUTH_SHARED;
2630
}
2631
cia->fscrypt_auth = NULL;
2632
}
2633
#else
2634
if (cia && cia->fscrypt_auth) {
2635
err = -EINVAL;
2636
spin_unlock(&ci->i_ceph_lock);
2637
goto out;
2638
}
2639
#endif /* CONFIG_FS_ENCRYPTION */
2640
2641
if (ia_valid & ATTR_UID) {
2642
kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
2643
2644
doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
2645
ceph_vinop(inode),
2646
from_kuid(&init_user_ns, inode->i_uid),
2647
from_kuid(&init_user_ns, attr->ia_uid));
2648
if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2649
inode->i_uid = fsuid;
2650
dirtied |= CEPH_CAP_AUTH_EXCL;
2651
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
2652
!uid_eq(fsuid, inode->i_uid)) {
2653
req->r_args.setattr.uid = cpu_to_le32(
2654
from_kuid(&init_user_ns, fsuid));
2655
mask |= CEPH_SETATTR_UID;
2656
release |= CEPH_CAP_AUTH_SHARED;
2657
}
2658
}
2659
if (ia_valid & ATTR_GID) {
2660
kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
2661
2662
doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
2663
ceph_vinop(inode),
2664
from_kgid(&init_user_ns, inode->i_gid),
2665
from_kgid(&init_user_ns, attr->ia_gid));
2666
if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2667
inode->i_gid = fsgid;
2668
dirtied |= CEPH_CAP_AUTH_EXCL;
2669
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
2670
!gid_eq(fsgid, inode->i_gid)) {
2671
req->r_args.setattr.gid = cpu_to_le32(
2672
from_kgid(&init_user_ns, fsgid));
2673
mask |= CEPH_SETATTR_GID;
2674
release |= CEPH_CAP_AUTH_SHARED;
2675
}
2676
}
2677
if (ia_valid & ATTR_MODE) {
2678
doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
2679
ceph_vinop(inode), inode->i_mode, attr->ia_mode);
2680
if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
2681
inode->i_mode = attr->ia_mode;
2682
dirtied |= CEPH_CAP_AUTH_EXCL;
2683
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
2684
attr->ia_mode != inode->i_mode) {
2685
inode->i_mode = attr->ia_mode;
2686
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
2687
mask |= CEPH_SETATTR_MODE;
2688
release |= CEPH_CAP_AUTH_SHARED;
2689
}
2690
}
2691
2692
if (ia_valid & ATTR_ATIME) {
2693
struct timespec64 atime = inode_get_atime(inode);
2694
2695
doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
2696
inode, ceph_vinop(inode), &atime, &attr->ia_atime);
2697
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
2698
ci->i_time_warp_seq++;
2699
inode_set_atime_to_ts(inode, attr->ia_atime);
2700
dirtied |= CEPH_CAP_FILE_EXCL;
2701
} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
2702
timespec64_compare(&atime,
2703
&attr->ia_atime) < 0) {
2704
inode_set_atime_to_ts(inode, attr->ia_atime);
2705
dirtied |= CEPH_CAP_FILE_WR;
2706
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2707
!timespec64_equal(&atime, &attr->ia_atime)) {
2708
ceph_encode_timespec64(&req->r_args.setattr.atime,
2709
&attr->ia_atime);
2710
mask |= CEPH_SETATTR_ATIME;
2711
release |= CEPH_CAP_FILE_SHARED |
2712
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2713
}
2714
}
2715
if (ia_valid & ATTR_SIZE) {
2716
doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
2717
ceph_vinop(inode), isize, attr->ia_size);
2718
/*
2719
* Only when the new size is smaller and not aligned to
2720
* CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
2721
*/
2722
if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
2723
(attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
2724
mask |= CEPH_SETATTR_SIZE;
2725
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2726
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2727
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
2728
mask |= CEPH_SETATTR_FSCRYPT_FILE;
2729
req->r_args.setattr.size =
2730
cpu_to_le64(round_up(attr->ia_size,
2731
CEPH_FSCRYPT_BLOCK_SIZE));
2732
req->r_args.setattr.old_size =
2733
cpu_to_le64(round_up(isize,
2734
CEPH_FSCRYPT_BLOCK_SIZE));
2735
req->r_fscrypt_file = attr->ia_size;
2736
fill_fscrypt = true;
2737
} else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
2738
if (attr->ia_size > isize) {
2739
i_size_write(inode, attr->ia_size);
2740
inode->i_blocks = calc_inode_blocks(attr->ia_size);
2741
ci->i_reported_size = attr->ia_size;
2742
dirtied |= CEPH_CAP_FILE_EXCL;
2743
ia_valid |= ATTR_MTIME;
2744
}
2745
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2746
attr->ia_size != isize) {
2747
mask |= CEPH_SETATTR_SIZE;
2748
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2749
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2750
if (IS_ENCRYPTED(inode) && attr->ia_size) {
2751
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
2752
mask |= CEPH_SETATTR_FSCRYPT_FILE;
2753
req->r_args.setattr.size =
2754
cpu_to_le64(round_up(attr->ia_size,
2755
CEPH_FSCRYPT_BLOCK_SIZE));
2756
req->r_args.setattr.old_size =
2757
cpu_to_le64(round_up(isize,
2758
CEPH_FSCRYPT_BLOCK_SIZE));
2759
req->r_fscrypt_file = attr->ia_size;
2760
} else {
2761
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2762
req->r_args.setattr.old_size = cpu_to_le64(isize);
2763
req->r_fscrypt_file = 0;
2764
}
2765
}
2766
}
2767
if (ia_valid & ATTR_MTIME) {
2768
struct timespec64 mtime = inode_get_mtime(inode);
2769
2770
doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
2771
inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
2772
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
2773
ci->i_time_warp_seq++;
2774
inode_set_mtime_to_ts(inode, attr->ia_mtime);
2775
dirtied |= CEPH_CAP_FILE_EXCL;
2776
} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
2777
timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
2778
inode_set_mtime_to_ts(inode, attr->ia_mtime);
2779
dirtied |= CEPH_CAP_FILE_WR;
2780
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2781
!timespec64_equal(&mtime, &attr->ia_mtime)) {
2782
ceph_encode_timespec64(&req->r_args.setattr.mtime,
2783
&attr->ia_mtime);
2784
mask |= CEPH_SETATTR_MTIME;
2785
release |= CEPH_CAP_FILE_SHARED |
2786
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2787
}
2788
}
2789
2790
/* these do nothing */
2791
if (ia_valid & ATTR_CTIME) {
2792
struct timespec64 ictime = inode_get_ctime(inode);
2793
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
2794
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
2795
doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
2796
inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
2797
only ? "ctime only" : "ignored");
2798
if (only) {
2799
/*
2800
* if kernel wants to dirty ctime but nothing else,
2801
* we need to choose a cap to dirty under, or do
2802
* a almost-no-op setattr
2803
*/
2804
if (issued & CEPH_CAP_AUTH_EXCL)
2805
dirtied |= CEPH_CAP_AUTH_EXCL;
2806
else if (issued & CEPH_CAP_FILE_EXCL)
2807
dirtied |= CEPH_CAP_FILE_EXCL;
2808
else if (issued & CEPH_CAP_XATTR_EXCL)
2809
dirtied |= CEPH_CAP_XATTR_EXCL;
2810
else
2811
mask |= CEPH_SETATTR_CTIME;
2812
}
2813
}
2814
if (ia_valid & ATTR_FILE)
2815
doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
2816
ceph_vinop(inode));
2817
2818
if (dirtied) {
2819
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
2820
&prealloc_cf);
2821
inode_set_ctime_to_ts(inode, attr->ia_ctime);
2822
inode_inc_iversion_raw(inode);
2823
}
2824
2825
release &= issued;
2826
spin_unlock(&ci->i_ceph_lock);
2827
if (lock_snap_rwsem) {
2828
up_read(&mdsc->snap_rwsem);
2829
lock_snap_rwsem = false;
2830
}
2831
2832
if (inode_dirty_flags)
2833
__mark_inode_dirty(inode, inode_dirty_flags);
2834
2835
if (mask) {
2836
req->r_inode = inode;
2837
ihold(inode);
2838
req->r_inode_drop = release;
2839
req->r_args.setattr.mask = cpu_to_le32(mask);
2840
req->r_num_caps = 1;
2841
req->r_stamp = attr->ia_ctime;
2842
if (fill_fscrypt) {
2843
err = fill_fscrypt_truncate(inode, req, attr);
2844
if (err)
2845
goto out;
2846
}
2847
2848
/*
2849
* The truncate request will return -EAGAIN when the
2850
* last block has been updated just before the MDS
2851
* successfully gets the xlock for the FILE lock. To
2852
* avoid corrupting the file contents we need to retry
2853
* it.
2854
*/
2855
err = ceph_mdsc_do_request(mdsc, NULL, req);
2856
if (err == -EAGAIN && truncate_retry--) {
2857
doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
2858
inode, ceph_vinop(inode), err,
2859
ceph_cap_string(dirtied), mask);
2860
ceph_mdsc_put_request(req);
2861
ceph_free_cap_flush(prealloc_cf);
2862
goto retry;
2863
}
2864
}
2865
out:
2866
doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
2867
ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
2868
2869
ceph_mdsc_put_request(req);
2870
ceph_free_cap_flush(prealloc_cf);
2871
2872
if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
2873
__ceph_do_pending_vmtruncate(inode);
2874
2875
return err;
2876
}
2877
2878
/*
2879
* setattr
2880
*/
2881
int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
2882
struct iattr *attr)
2883
{
2884
struct inode *inode = d_inode(dentry);
2885
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
2886
int err;
2887
2888
if (ceph_snap(inode) != CEPH_NOSNAP)
2889
return -EROFS;
2890
2891
if (ceph_inode_is_shutdown(inode))
2892
return -ESTALE;
2893
2894
err = fscrypt_prepare_setattr(dentry, attr);
2895
if (err)
2896
return err;
2897
2898
err = setattr_prepare(idmap, dentry, attr);
2899
if (err != 0)
2900
return err;
2901
2902
if ((attr->ia_valid & ATTR_SIZE) &&
2903
attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
2904
return -EFBIG;
2905
2906
if ((attr->ia_valid & ATTR_SIZE) &&
2907
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
2908
return -EDQUOT;
2909
2910
err = __ceph_setattr(idmap, inode, attr, NULL);
2911
2912
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
2913
err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
2914
2915
return err;
2916
}
2917
2918
int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
2919
{
2920
int issued = ceph_caps_issued(ceph_inode(inode));
2921
2922
/*
2923
* If any 'x' caps is issued we can just choose the auth MDS
2924
* instead of the random replica MDSes. Because only when the
2925
* Locker is in LOCK_EXEC state will the loner client could
2926
* get the 'x' caps. And if we send the getattr requests to
2927
* any replica MDS it must auth pin and tries to rdlock from
2928
* the auth MDS, and then the auth MDS need to do the Locker
2929
* state transition to LOCK_SYNC. And after that the lock state
2930
* will change back.
2931
*
2932
* This cost much when doing the Locker state transition and
2933
* usually will need to revoke caps from clients.
2934
*
2935
* And for the 'Xs' caps for getxattr we will also choose the
2936
* auth MDS, because the MDS side code is buggy due to setxattr
2937
* won't notify the replica MDSes when the values changed and
2938
* the replica MDS will return the old values. Though we will
2939
* fix it in MDS code, but this still makes sense for old ceph.
2940
*/
2941
if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
2942
|| (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
2943
return USE_AUTH_MDS;
2944
else
2945
return USE_ANY_MDS;
2946
}
2947
2948
/*
2949
* Verify that we have a lease on the given mask. If not,
2950
* do a getattr against an mds.
2951
*/
2952
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
2953
int mask, bool force)
2954
{
2955
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
2956
struct ceph_client *cl = fsc->client;
2957
struct ceph_mds_client *mdsc = fsc->mdsc;
2958
struct ceph_mds_request *req;
2959
int mode;
2960
int err;
2961
2962
if (ceph_snap(inode) == CEPH_SNAPDIR) {
2963
doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
2964
ceph_vinop(inode));
2965
return 0;
2966
}
2967
2968
doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
2969
ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
2970
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
2971
return 0;
2972
2973
mode = ceph_try_to_choose_auth_mds(inode, mask);
2974
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
2975
if (IS_ERR(req))
2976
return PTR_ERR(req);
2977
req->r_inode = inode;
2978
ihold(inode);
2979
req->r_num_caps = 1;
2980
req->r_args.getattr.mask = cpu_to_le32(mask);
2981
req->r_locked_page = locked_page;
2982
err = ceph_mdsc_do_request(mdsc, NULL, req);
2983
if (locked_page && err == 0) {
2984
u64 inline_version = req->r_reply_info.targeti.inline_version;
2985
if (inline_version == 0) {
2986
/* the reply is supposed to contain inline data */
2987
err = -EINVAL;
2988
} else if (inline_version == CEPH_INLINE_NONE ||
2989
inline_version == 1) {
2990
err = -ENODATA;
2991
} else {
2992
err = req->r_reply_info.targeti.inline_len;
2993
}
2994
}
2995
ceph_mdsc_put_request(req);
2996
doutc(cl, "result=%d\n", err);
2997
return err;
2998
}
2999
3000
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
3001
size_t size)
3002
{
3003
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
3004
struct ceph_client *cl = fsc->client;
3005
struct ceph_mds_client *mdsc = fsc->mdsc;
3006
struct ceph_mds_request *req;
3007
int mode = USE_AUTH_MDS;
3008
int err;
3009
char *xattr_value;
3010
size_t xattr_value_len;
3011
3012
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
3013
if (IS_ERR(req)) {
3014
err = -ENOMEM;
3015
goto out;
3016
}
3017
3018
req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
3019
req->r_path2 = kstrdup(name, GFP_NOFS);
3020
if (!req->r_path2) {
3021
err = -ENOMEM;
3022
goto put;
3023
}
3024
3025
ihold(inode);
3026
req->r_inode = inode;
3027
err = ceph_mdsc_do_request(mdsc, NULL, req);
3028
if (err < 0)
3029
goto put;
3030
3031
xattr_value = req->r_reply_info.xattr_info.xattr_value;
3032
xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
3033
3034
doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
3035
3036
err = (int)xattr_value_len;
3037
if (size == 0)
3038
goto put;
3039
3040
if (xattr_value_len > size) {
3041
err = -ERANGE;
3042
goto put;
3043
}
3044
3045
memcpy(value, xattr_value, xattr_value_len);
3046
put:
3047
ceph_mdsc_put_request(req);
3048
out:
3049
doutc(cl, "result=%d\n", err);
3050
return err;
3051
}
3052
3053
3054
/*
3055
* Check inode permissions. We verify we have a valid value for
3056
* the AUTH cap, then call the generic handler.
3057
*/
3058
int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
3059
int mask)
3060
{
3061
int err;
3062
3063
if (mask & MAY_NOT_BLOCK)
3064
return -ECHILD;
3065
3066
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
3067
3068
if (!err)
3069
err = generic_permission(idmap, inode, mask);
3070
return err;
3071
}
3072
3073
/* Craft a mask of needed caps given a set of requested statx attrs. */
3074
static int statx_to_caps(u32 want, umode_t mode)
3075
{
3076
int mask = 0;
3077
3078
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
3079
mask |= CEPH_CAP_AUTH_SHARED;
3080
3081
if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
3082
/*
3083
* The link count for directories depends on inode->i_subdirs,
3084
* and that is only updated when Fs caps are held.
3085
*/
3086
if (S_ISDIR(mode))
3087
mask |= CEPH_CAP_FILE_SHARED;
3088
else
3089
mask |= CEPH_CAP_LINK_SHARED;
3090
}
3091
3092
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
3093
mask |= CEPH_CAP_FILE_SHARED;
3094
3095
if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
3096
mask |= CEPH_CAP_XATTR_SHARED;
3097
3098
return mask;
3099
}
3100
3101
/*
3102
* Get all the attributes. If we have sufficient caps for the requested attrs,
3103
* then we can avoid talking to the MDS at all.
3104
*/
3105
int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
3106
struct kstat *stat, u32 request_mask, unsigned int flags)
3107
{
3108
struct inode *inode = d_inode(path->dentry);
3109
struct super_block *sb = inode->i_sb;
3110
struct ceph_inode_info *ci = ceph_inode(inode);
3111
u32 valid_mask = STATX_BASIC_STATS;
3112
int err = 0;
3113
3114
if (ceph_inode_is_shutdown(inode))
3115
return -ESTALE;
3116
3117
/* Skip the getattr altogether if we're asked not to sync */
3118
if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
3119
err = ceph_do_getattr(inode,
3120
statx_to_caps(request_mask, inode->i_mode),
3121
flags & AT_STATX_FORCE_SYNC);
3122
if (err)
3123
return err;
3124
}
3125
3126
generic_fillattr(idmap, request_mask, inode, stat);
3127
stat->ino = ceph_present_inode(inode);
3128
3129
/*
3130
* btime on newly-allocated inodes is 0, so if this is still set to
3131
* that, then assume that it's not valid.
3132
*/
3133
if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
3134
stat->btime = ci->i_btime;
3135
valid_mask |= STATX_BTIME;
3136
}
3137
3138
if (request_mask & STATX_CHANGE_COOKIE) {
3139
stat->change_cookie = inode_peek_iversion_raw(inode);
3140
valid_mask |= STATX_CHANGE_COOKIE;
3141
}
3142
3143
if (ceph_snap(inode) == CEPH_NOSNAP)
3144
stat->dev = sb->s_dev;
3145
else
3146
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
3147
3148
if (S_ISDIR(inode->i_mode)) {
3149
if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
3150
stat->size = ci->i_rbytes;
3151
} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
3152
struct ceph_inode_info *pci;
3153
struct ceph_snap_realm *realm;
3154
struct inode *parent;
3155
3156
parent = ceph_lookup_inode(sb, ceph_ino(inode));
3157
if (IS_ERR(parent))
3158
return PTR_ERR(parent);
3159
3160
pci = ceph_inode(parent);
3161
spin_lock(&pci->i_ceph_lock);
3162
realm = pci->i_snap_realm;
3163
if (realm)
3164
stat->size = realm->num_snaps;
3165
else
3166
stat->size = 0;
3167
spin_unlock(&pci->i_ceph_lock);
3168
iput(parent);
3169
} else {
3170
stat->size = ci->i_files + ci->i_subdirs;
3171
}
3172
stat->blocks = 0;
3173
stat->blksize = 65536;
3174
/*
3175
* Some applications rely on the number of st_nlink
3176
* value on directories to be either 0 (if unlinked)
3177
* or 2 + number of subdirectories.
3178
*/
3179
if (stat->nlink == 1)
3180
/* '.' + '..' + subdirs */
3181
stat->nlink = 1 + 1 + ci->i_subdirs;
3182
}
3183
3184
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
3185
if (IS_ENCRYPTED(inode))
3186
stat->attributes |= STATX_ATTR_ENCRYPTED;
3187
stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
3188
STATX_ATTR_ENCRYPTED);
3189
3190
stat->result_mask = request_mask & valid_mask;
3191
return err;
3192
}
3193
3194
void ceph_inode_shutdown(struct inode *inode)
3195
{
3196
struct ceph_inode_info *ci = ceph_inode(inode);
3197
struct rb_node *p;
3198
int iputs = 0;
3199
bool invalidate = false;
3200
3201
spin_lock(&ci->i_ceph_lock);
3202
ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
3203
p = rb_first(&ci->i_caps);
3204
while (p) {
3205
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
3206
3207
p = rb_next(p);
3208
iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
3209
}
3210
spin_unlock(&ci->i_ceph_lock);
3211
3212
if (invalidate)
3213
ceph_queue_invalidate(inode);
3214
while (iputs--)
3215
iput(inode);
3216
}
3217
3218