GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/ceph/inode.c
1
#include <linux/ceph/ceph_debug.h>
2
3
#include <linux/module.h>
4
#include <linux/fs.h>
5
#include <linux/slab.h>
6
#include <linux/string.h>
7
#include <linux/uaccess.h>
8
#include <linux/kernel.h>
9
#include <linux/namei.h>
10
#include <linux/writeback.h>
11
#include <linux/vmalloc.h>
12
#include <linux/pagevec.h>
13
14
#include "super.h"
15
#include "mds_client.h"
16
#include <linux/ceph/decode.h>
17
18
/*
19
* Ceph inode operations
20
*
21
* Implement basic inode helpers (get, alloc) and inode ops (getattr,
22
* setattr, etc.), xattr helpers, and helpers for assimilating
23
* metadata returned by the MDS into our cache.
24
*
25
* Also define helpers for doing asynchronous writeback, invalidation,
26
* and truncation for the benefit of those who can't afford to block
27
* (typically because they are in the message handler path).
28
*/
29
30
static const struct inode_operations ceph_symlink_iops;
31
32
static void ceph_invalidate_work(struct work_struct *work);
33
static void ceph_writeback_work(struct work_struct *work);
34
static void ceph_vmtruncate_work(struct work_struct *work);
35
36
/*
37
* find or create an inode, given the ceph ino number
38
*/
39
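/* iget5_locked() 'set' callback: stash the ceph vino and derive i_ino */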
static int ceph_set_ino_cb(struct inode *inode, void *data)
40
{
41
ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
42
inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
43
return 0;
44
}
45
46
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
47
{
48
struct inode *inode;
49
ino_t t = ceph_vino_to_ino(vino);
50
51
inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
52
if (inode == NULL)
53
return ERR_PTR(-ENOMEM);
54
if (inode->i_state & I_NEW) {
55
dout("get_inode created new inode %p %llx.%llx ino %llx\n",
56
inode, ceph_vinop(inode), (u64)inode->i_ino);
57
unlock_new_inode(inode);
58
}
59
60
dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
61
vino.snap, inode);
62
return inode;
63
}
64
65
/*
66
* get/construct snapdir inode for a given directory
67
*/
68
struct inode *ceph_get_snapdir(struct inode *parent)
69
{
70
struct ceph_vino vino = {
71
.ino = ceph_ino(parent),
72
.snap = CEPH_SNAPDIR,
73
};
74
struct inode *inode = ceph_get_inode(parent->i_sb, vino);
75
struct ceph_inode_info *ci = ceph_inode(inode);
76
77
BUG_ON(!S_ISDIR(parent->i_mode));
78
if (IS_ERR(inode))
79
return inode;
80
inode->i_mode = parent->i_mode;
81
inode->i_uid = parent->i_uid;
82
inode->i_gid = parent->i_gid;
83
inode->i_op = &ceph_dir_iops;
84
inode->i_fop = &ceph_dir_fops;
85
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
86
ci->i_rbytes = 0;
87
return inode;
88
}
89
90
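/* inode operations for regular files (also reused for special inodes) */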
const struct inode_operations ceph_file_iops = {
91
.permission = ceph_permission,
92
.setattr = ceph_setattr,
93
.getattr = ceph_getattr,
94
.setxattr = ceph_setxattr,
95
.getxattr = ceph_getxattr,
96
.listxattr = ceph_listxattr,
97
.removexattr = ceph_removexattr,
98
};
99
100
101
/*
102
* We use a 'frag tree' to keep track of the MDS's directory fragments
103
* for a given inode (usually there is just a single fragment). We
104
* need to know when a child frag is delegated to a new MDS, or when
105
* it is flagged as replicated, so we can direct our requests
106
* accordingly.
107
*/
108
109
/*
110
* find/create a frag in the tree
111
*/
112
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
113
u32 f)
114
{
115
struct rb_node **p;
116
struct rb_node *parent = NULL;
117
struct ceph_inode_frag *frag;
118
int c;
119
120
p = &ci->i_fragtree.rb_node;
121
while (*p) {
122
parent = *p;
123
frag = rb_entry(parent, struct ceph_inode_frag, node);
124
c = ceph_frag_compare(f, frag->frag);
125
if (c < 0)
126
p = &(*p)->rb_left;
127
else if (c > 0)
128
p = &(*p)->rb_right;
129
else
130
return frag;
131
}
132
133
frag = kmalloc(sizeof(*frag), GFP_NOFS);
134
if (!frag) {
135
pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
136
"frag %x\n", &ci->vfs_inode,
137
ceph_vinop(&ci->vfs_inode), f);
138
return ERR_PTR(-ENOMEM);
139
}
140
frag->frag = f;
141
frag->split_by = 0;
142
frag->mds = -1;
143
frag->ndist = 0;
144
145
rb_link_node(&frag->node, parent, p);
146
rb_insert_color(&frag->node, &ci->i_fragtree);
147
148
dout("get_or_create_frag added %llx.%llx frag %x\n",
149
ceph_vinop(&ci->vfs_inode), f);
150
return frag;
151
}
152
153
/*
154
* find a specific frag @f
155
*/
156
struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
157
{
158
struct rb_node *n = ci->i_fragtree.rb_node;
159
160
while (n) {
161
struct ceph_inode_frag *frag =
162
rb_entry(n, struct ceph_inode_frag, node);
163
int c = ceph_frag_compare(f, frag->frag);
164
if (c < 0)
165
n = n->rb_left;
166
else if (c > 0)
167
n = n->rb_right;
168
else
169
return frag;
170
}
171
return NULL;
172
}
173
174
/*
175
* Choose frag containing the given value @v. If @pfrag is
176
* specified, copy the frag delegation info to the caller if
177
* it is present.
178
*/
179
u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
180
struct ceph_inode_frag *pfrag,
181
int *found)
182
{
183
u32 t = ceph_frag_make(0, 0);
184
struct ceph_inode_frag *frag;
185
unsigned nway, i;
186
u32 n;
187
188
if (found)
189
*found = 0;
190
191
mutex_lock(&ci->i_fragtree_mutex);
192
while (1) {
193
WARN_ON(!ceph_frag_contains_value(t, v));
194
frag = __ceph_find_frag(ci, t);
195
if (!frag)
196
break; /* t is a leaf */
197
if (frag->split_by == 0) {
198
if (pfrag)
199
memcpy(pfrag, frag, sizeof(*pfrag));
200
if (found)
201
*found = 1;
202
break;
203
}
204
205
/* choose child */
206
nway = 1 << frag->split_by;
207
dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
208
frag->split_by, nway);
209
for (i = 0; i < nway; i++) {
210
n = ceph_frag_make_child(t, frag->split_by, i);
211
if (ceph_frag_contains_value(n, v)) {
212
t = n;
213
break;
214
}
215
}
216
BUG_ON(i == nway);
217
}
218
dout("choose_frag(%x) = %x\n", v, t);
219
220
mutex_unlock(&ci->i_fragtree_mutex);
221
return t;
222
}
223
224
/*
225
* Process dirfrag (delegation) info from the mds. Include leaf
226
* fragment in tree ONLY if ndist > 0. Otherwise, only
227
* branches/splits are included in i_fragtree.
228
*/
229
static int ceph_fill_dirfrag(struct inode *inode,
230
struct ceph_mds_reply_dirfrag *dirinfo)
231
{
232
struct ceph_inode_info *ci = ceph_inode(inode);
233
struct ceph_inode_frag *frag;
234
u32 id = le32_to_cpu(dirinfo->frag);
235
int mds = le32_to_cpu(dirinfo->auth);
236
int ndist = le32_to_cpu(dirinfo->ndist);
237
int i;
238
int err = 0;
239
240
mutex_lock(&ci->i_fragtree_mutex);
241
if (ndist == 0) {
242
/* no delegation info needed. */
243
frag = __ceph_find_frag(ci, id);
244
if (!frag)
245
goto out;
246
if (frag->split_by == 0) {
247
/* tree leaf, remove */
248
dout("fill_dirfrag removed %llx.%llx frag %x"
249
" (no ref)\n", ceph_vinop(inode), id);
250
rb_erase(&frag->node, &ci->i_fragtree);
251
kfree(frag);
252
} else {
253
/* tree branch, keep and clear */
254
dout("fill_dirfrag cleared %llx.%llx frag %x"
255
" referral\n", ceph_vinop(inode), id);
256
frag->mds = -1;
257
frag->ndist = 0;
258
}
259
goto out;
260
}
261
262
263
/* find/add this frag to store mds delegation info */
264
frag = __get_or_create_frag(ci, id);
265
if (IS_ERR(frag)) {
266
/* this is not the end of the world; we can continue
267
with bad/inaccurate delegation info */
268
pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
269
ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
270
err = -ENOMEM;
271
goto out;
272
}
273
274
frag->mds = mds;
275
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
276
for (i = 0; i < frag->ndist; i++)
277
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
278
dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
279
ceph_vinop(inode), frag->frag, frag->ndist);
280
281
out:
282
mutex_unlock(&ci->i_fragtree_mutex);
283
return err;
284
}
285
286
287
/*
288
* initialize a newly allocated inode.
289
*/
290
struct inode *ceph_alloc_inode(struct super_block *sb)
291
{
292
struct ceph_inode_info *ci;
293
int i;
294
295
ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
296
if (!ci)
297
return NULL;
298
299
dout("alloc_inode %p\n", &ci->vfs_inode);
300
301
ci->i_version = 0;
302
ci->i_time_warp_seq = 0;
303
ci->i_ceph_flags = 0;
304
ci->i_release_count = 0;
305
ci->i_symlink = NULL;
306
307
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
308
309
ci->i_fragtree = RB_ROOT;
310
mutex_init(&ci->i_fragtree_mutex);
311
312
ci->i_xattrs.blob = NULL;
313
ci->i_xattrs.prealloc_blob = NULL;
314
ci->i_xattrs.dirty = false;
315
ci->i_xattrs.index = RB_ROOT;
316
ci->i_xattrs.count = 0;
317
ci->i_xattrs.names_size = 0;
318
ci->i_xattrs.vals_size = 0;
319
ci->i_xattrs.version = 0;
320
ci->i_xattrs.index_version = 0;
321
322
ci->i_caps = RB_ROOT;
323
ci->i_auth_cap = NULL;
324
ci->i_dirty_caps = 0;
325
ci->i_flushing_caps = 0;
326
INIT_LIST_HEAD(&ci->i_dirty_item);
327
INIT_LIST_HEAD(&ci->i_flushing_item);
328
ci->i_cap_flush_seq = 0;
329
ci->i_cap_flush_last_tid = 0;
330
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
331
init_waitqueue_head(&ci->i_cap_wq);
332
ci->i_hold_caps_min = 0;
333
ci->i_hold_caps_max = 0;
334
INIT_LIST_HEAD(&ci->i_cap_delay_list);
335
ci->i_cap_exporting_mds = 0;
336
ci->i_cap_exporting_mseq = 0;
337
ci->i_cap_exporting_issued = 0;
338
INIT_LIST_HEAD(&ci->i_cap_snaps);
339
ci->i_head_snapc = NULL;
340
ci->i_snap_caps = 0;
341
342
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
343
ci->i_nr_by_mode[i] = 0;
344
345
ci->i_truncate_seq = 0;
346
ci->i_truncate_size = 0;
347
ci->i_truncate_pending = 0;
348
349
ci->i_max_size = 0;
350
ci->i_reported_size = 0;
351
ci->i_wanted_max_size = 0;
352
ci->i_requested_max_size = 0;
353
354
ci->i_pin_ref = 0;
355
ci->i_rd_ref = 0;
356
ci->i_rdcache_ref = 0;
357
ci->i_wr_ref = 0;
358
ci->i_wb_ref = 0;
359
ci->i_wrbuffer_ref = 0;
360
ci->i_wrbuffer_ref_head = 0;
361
ci->i_shared_gen = 0;
362
ci->i_rdcache_gen = 0;
363
ci->i_rdcache_revoking = 0;
364
365
INIT_LIST_HEAD(&ci->i_unsafe_writes);
366
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
367
spin_lock_init(&ci->i_unsafe_lock);
368
369
ci->i_snap_realm = NULL;
370
INIT_LIST_HEAD(&ci->i_snap_realm_item);
371
INIT_LIST_HEAD(&ci->i_snap_flush_item);
372
373
INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
374
INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
375
376
INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
377
378
return &ci->vfs_inode;
379
}
380
381
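/* RCU callback: free the ceph inode once it is safe to do so */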
static void ceph_i_callback(struct rcu_head *head)
382
{
383
struct inode *inode = container_of(head, struct inode, i_rcu);
384
struct ceph_inode_info *ci = ceph_inode(inode);
385
386
INIT_LIST_HEAD(&inode->i_dentry);
387
kmem_cache_free(ceph_inode_cachep, ci);
388
}
389
390
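/* release caps, snap realm ref, frag tree and xattrs, then free via RCU */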
void ceph_destroy_inode(struct inode *inode)
391
{
392
struct ceph_inode_info *ci = ceph_inode(inode);
393
struct ceph_inode_frag *frag;
394
struct rb_node *n;
395
396
dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
397
398
ceph_queue_caps_release(inode);
399
400
/*
401
* we may still have a snap_realm reference if there are stray
402
* caps in i_cap_exporting_issued or i_snap_caps.
403
*/
404
if (ci->i_snap_realm) {
405
struct ceph_mds_client *mdsc =
406
ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
407
struct ceph_snap_realm *realm = ci->i_snap_realm;
408
409
dout(" dropping residual ref to snap realm %p\n", realm);
410
spin_lock(&realm->inodes_with_caps_lock);
411
list_del_init(&ci->i_snap_realm_item);
412
spin_unlock(&realm->inodes_with_caps_lock);
413
ceph_put_snap_realm(mdsc, realm);
414
}
415
416
kfree(ci->i_symlink);
417
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
418
frag = rb_entry(n, struct ceph_inode_frag, node);
419
rb_erase(n, &ci->i_fragtree);
420
kfree(frag);
421
}
422
423
__ceph_destroy_xattrs(ci);
424
if (ci->i_xattrs.blob)
425
ceph_buffer_put(ci->i_xattrs.blob);
426
if (ci->i_xattrs.prealloc_blob)
427
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
428
429
call_rcu(&inode->i_rcu, ceph_i_callback);
430
}
431
432
433
/*
434
* Helpers to fill in size, ctime, mtime, and atime. We have to be
435
* careful because either the client or MDS may have more up to date
436
* info, depending on which capabilities are held, and whether
437
* time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
438
* and size are monotonically increasing, except when utimes() or
439
* truncate() increments the corresponding _seq values.)
440
*/
441
int ceph_fill_file_size(struct inode *inode, int issued,
442
u32 truncate_seq, u64 truncate_size, u64 size)
443
{
444
struct ceph_inode_info *ci = ceph_inode(inode);
445
int queue_trunc = 0;
446
447
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
448
(truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
449
dout("size %lld -> %llu\n", inode->i_size, size);
450
inode->i_size = size;
451
inode->i_blocks = (size + (1<<9) - 1) >> 9;
452
ci->i_reported_size = size;
453
if (truncate_seq != ci->i_truncate_seq) {
454
dout("truncate_seq %u -> %u\n",
455
ci->i_truncate_seq, truncate_seq);
456
ci->i_truncate_seq = truncate_seq;
457
/*
458
* If we hold relevant caps, or in the case where we're
459
* not the only client referencing this file and we
460
* don't hold those caps, then we need to check whether
461
* the file is either opened or mmapped
462
*/
463
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
464
CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
465
CEPH_CAP_FILE_EXCL|
466
CEPH_CAP_FILE_LAZYIO)) ||
467
mapping_mapped(inode->i_mapping) ||
468
__ceph_caps_file_wanted(ci)) {
469
ci->i_truncate_pending++;
470
queue_trunc = 1;
471
}
472
}
473
}
474
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
475
ci->i_truncate_size != truncate_size) {
476
dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
477
truncate_size);
478
ci->i_truncate_size = truncate_size;
479
}
480
return queue_trunc;
481
}
482
483
void ceph_fill_file_time(struct inode *inode, int issued,
484
u64 time_warp_seq, struct timespec *ctime,
485
struct timespec *mtime, struct timespec *atime)
486
{
487
struct ceph_inode_info *ci = ceph_inode(inode);
488
int warn = 0;
489
490
if (issued & (CEPH_CAP_FILE_EXCL|
491
CEPH_CAP_FILE_WR|
492
CEPH_CAP_FILE_BUFFER|
493
CEPH_CAP_AUTH_EXCL|
494
CEPH_CAP_XATTR_EXCL)) {
495
if (timespec_compare(ctime, &inode->i_ctime) > 0) {
496
dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
497
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
498
ctime->tv_sec, ctime->tv_nsec);
499
inode->i_ctime = *ctime;
500
}
501
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
502
/* the MDS did a utimes() */
503
dout("mtime %ld.%09ld -> %ld.%09ld "
504
"tw %d -> %d\n",
505
inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
506
mtime->tv_sec, mtime->tv_nsec,
507
ci->i_time_warp_seq, (int)time_warp_seq);
508
509
inode->i_mtime = *mtime;
510
inode->i_atime = *atime;
511
ci->i_time_warp_seq = time_warp_seq;
512
} else if (time_warp_seq == ci->i_time_warp_seq) {
513
/* nobody did utimes(); take the max */
514
if (timespec_compare(mtime, &inode->i_mtime) > 0) {
515
dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
516
inode->i_mtime.tv_sec,
517
inode->i_mtime.tv_nsec,
518
mtime->tv_sec, mtime->tv_nsec);
519
inode->i_mtime = *mtime;
520
}
521
if (timespec_compare(atime, &inode->i_atime) > 0) {
522
dout("atime %ld.%09ld -> %ld.%09ld inc\n",
523
inode->i_atime.tv_sec,
524
inode->i_atime.tv_nsec,
525
atime->tv_sec, atime->tv_nsec);
526
inode->i_atime = *atime;
527
}
528
} else if (issued & CEPH_CAP_FILE_EXCL) {
529
/* we did a utimes(); ignore mds values */
530
} else {
531
warn = 1;
532
}
533
} else {
534
/* we have no write|excl caps; whatever the MDS says is true */
535
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
536
inode->i_ctime = *ctime;
537
inode->i_mtime = *mtime;
538
inode->i_atime = *atime;
539
ci->i_time_warp_seq = time_warp_seq;
540
} else {
541
warn = 1;
542
}
543
}
544
if (warn) /* time_warp_seq shouldn't go backwards */
545
dout("%p mds time_warp_seq %llu < %u\n",
546
inode, time_warp_seq, ci->i_time_warp_seq);
547
}
548
549
/*
550
* Populate an inode based on info from mds. May be called on new or
551
* existing inodes.
552
*/
553
static int fill_inode(struct inode *inode,
554
struct ceph_mds_reply_info_in *iinfo,
555
struct ceph_mds_reply_dirfrag *dirinfo,
556
struct ceph_mds_session *session,
557
unsigned long ttl_from, int cap_fmode,
558
struct ceph_cap_reservation *caps_reservation)
559
{
560
struct ceph_mds_reply_inode *info = iinfo->in;
561
struct ceph_inode_info *ci = ceph_inode(inode);
562
int i;
563
int issued, implemented;
564
struct timespec mtime, atime, ctime;
565
u32 nsplits;
566
struct ceph_buffer *xattr_blob = NULL;
567
int err = 0;
568
int queue_trunc = 0;
569
570
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
571
inode, ceph_vinop(inode), le64_to_cpu(info->version),
572
ci->i_version);
573
574
/*
575
* prealloc xattr data, if it looks like we'll need it. only
576
* if len > 4 (meaning there are actually xattrs; the first 4
577
* bytes are the xattr count).
578
*/
579
if (iinfo->xattr_len > 4) {
580
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
581
if (!xattr_blob)
582
pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
583
iinfo->xattr_len);
584
}
585
586
spin_lock(&inode->i_lock);
587
588
/*
589
* provided version will be odd if the inode value is projected, or
590
* an even number if it is stable. skip the update if we have newer stable
591
* info (ours>=theirs, e.g. due to racing mds replies), unless
592
* we are getting projected (unstable) info (in which case the
593
* version is odd, and we want ours>theirs).
594
* us them
595
* 2 2 skip
596
* 3 2 skip
597
* 3 3 update
598
*/
599
if (le64_to_cpu(info->version) > 0 &&
600
(ci->i_version & ~1) >= le64_to_cpu(info->version))
601
goto no_change;
602
603
issued = __ceph_caps_issued(ci, &implemented);
604
issued |= implemented | __ceph_caps_dirty(ci);
605
606
/* update inode */
607
ci->i_version = le64_to_cpu(info->version);
608
inode->i_version++;
609
inode->i_rdev = le32_to_cpu(info->rdev);
610
611
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
612
inode->i_mode = le32_to_cpu(info->mode);
613
inode->i_uid = le32_to_cpu(info->uid);
614
inode->i_gid = le32_to_cpu(info->gid);
615
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
616
inode->i_uid, inode->i_gid);
617
}
618
619
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
620
inode->i_nlink = le32_to_cpu(info->nlink);
621
622
/* be careful with mtime, atime, size */
623
ceph_decode_timespec(&atime, &info->atime);
624
ceph_decode_timespec(&mtime, &info->mtime);
625
ceph_decode_timespec(&ctime, &info->ctime);
626
queue_trunc = ceph_fill_file_size(inode, issued,
627
le32_to_cpu(info->truncate_seq),
628
le64_to_cpu(info->truncate_size),
629
le64_to_cpu(info->size));
630
ceph_fill_file_time(inode, issued,
631
le32_to_cpu(info->time_warp_seq),
632
&ctime, &mtime, &atime);
633
634
/* only update max_size on auth cap */
635
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
636
ci->i_max_size != le64_to_cpu(info->max_size)) {
637
dout("max_size %lld -> %llu\n", ci->i_max_size,
638
le64_to_cpu(info->max_size));
639
ci->i_max_size = le64_to_cpu(info->max_size);
640
}
641
642
ci->i_layout = info->layout;
643
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
644
645
/* xattrs */
646
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
647
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
648
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
649
if (ci->i_xattrs.blob)
650
ceph_buffer_put(ci->i_xattrs.blob);
651
ci->i_xattrs.blob = xattr_blob;
652
if (xattr_blob)
653
memcpy(ci->i_xattrs.blob->vec.iov_base,
654
iinfo->xattr_data, iinfo->xattr_len);
655
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
656
xattr_blob = NULL;
657
}
658
659
inode->i_mapping->a_ops = &ceph_aops;
660
inode->i_mapping->backing_dev_info =
661
&ceph_sb_to_client(inode->i_sb)->backing_dev_info;
662
663
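/* finish initializing the inode based on its file type */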
switch (inode->i_mode & S_IFMT) {
664
case S_IFIFO:
665
case S_IFBLK:
666
case S_IFCHR:
667
case S_IFSOCK:
668
init_special_inode(inode, inode->i_mode, inode->i_rdev);
669
inode->i_op = &ceph_file_iops;
670
break;
671
case S_IFREG:
672
inode->i_op = &ceph_file_iops;
673
inode->i_fop = &ceph_file_fops;
674
break;
675
case S_IFLNK:
676
inode->i_op = &ceph_symlink_iops;
677
if (!ci->i_symlink) {
678
int symlen = iinfo->symlink_len;
679
char *sym;
680
681
BUG_ON(symlen != inode->i_size);
682
spin_unlock(&inode->i_lock);
683
684
err = -ENOMEM;
685
sym = kmalloc(symlen+1, GFP_NOFS);
686
if (!sym)
687
goto out;
688
memcpy(sym, iinfo->symlink, symlen);
689
sym[symlen] = 0;
690
691
spin_lock(&inode->i_lock);
692
if (!ci->i_symlink)
693
ci->i_symlink = sym;
694
else
695
kfree(sym); /* lost a race */
696
}
697
break;
698
case S_IFDIR:
699
inode->i_op = &ceph_dir_iops;
700
inode->i_fop = &ceph_dir_fops;
701
702
ci->i_dir_layout = iinfo->dir_layout;
703
704
ci->i_files = le64_to_cpu(info->files);
705
ci->i_subdirs = le64_to_cpu(info->subdirs);
706
ci->i_rbytes = le64_to_cpu(info->rbytes);
707
ci->i_rfiles = le64_to_cpu(info->rfiles);
708
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
709
ceph_decode_timespec(&ci->i_rctime, &info->rctime);
710
711
/* set dir completion flag? */
712
if (ci->i_files == 0 && ci->i_subdirs == 0 &&
713
ceph_snap(inode) == CEPH_NOSNAP &&
714
(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
715
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
716
(ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
717
dout(" marking %p complete (empty)\n", inode);
718
/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
719
ci->i_max_offset = 2;
720
}
721
break;
722
default:
723
pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
724
ceph_vinop(inode), inode->i_mode);
725
}
726
727
no_change:
728
spin_unlock(&inode->i_lock);
729
730
/* queue truncate if we saw i_size decrease */
731
if (queue_trunc)
732
ceph_queue_vmtruncate(inode);
733
734
/* populate frag tree */
735
/* FIXME: move me up, if/when version reflects fragtree changes */
736
nsplits = le32_to_cpu(info->fragtree.nsplits);
737
mutex_lock(&ci->i_fragtree_mutex);
738
for (i = 0; i < nsplits; i++) {
739
u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
740
struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
741
742
if (IS_ERR(frag))
743
continue;
744
frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
745
dout(" frag %x split by %d\n", frag->frag, frag->split_by);
746
}
747
mutex_unlock(&ci->i_fragtree_mutex);
748
749
/* were we issued a capability? */
750
if (info->cap.caps) {
751
if (ceph_snap(inode) == CEPH_NOSNAP) {
752
ceph_add_cap(inode, session,
753
le64_to_cpu(info->cap.cap_id),
754
cap_fmode,
755
le32_to_cpu(info->cap.caps),
756
le32_to_cpu(info->cap.wanted),
757
le32_to_cpu(info->cap.seq),
758
le32_to_cpu(info->cap.mseq),
759
le64_to_cpu(info->cap.realm),
760
info->cap.flags,
761
caps_reservation);
762
} else {
763
spin_lock(&inode->i_lock);
764
dout(" %p got snap_caps %s\n", inode,
765
ceph_cap_string(le32_to_cpu(info->cap.caps)));
766
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
767
if (cap_fmode >= 0)
768
__ceph_get_fmode(ci, cap_fmode);
769
spin_unlock(&inode->i_lock);
770
}
771
} else if (cap_fmode >= 0) {
772
pr_warning("mds issued no caps on %llx.%llx\n",
773
ceph_vinop(inode));
774
__ceph_get_fmode(ci, cap_fmode);
775
}
776
777
/* update delegation info? */
778
if (dirinfo)
779
ceph_fill_dirfrag(inode, dirinfo);
780
781
err = 0;
782
783
out:
784
if (xattr_blob)
785
ceph_buffer_put(xattr_blob);
786
return err;
787
}
788
789
/*
790
* caller should hold session s_mutex.
791
*/
792
static void update_dentry_lease(struct dentry *dentry,
793
struct ceph_mds_reply_lease *lease,
794
struct ceph_mds_session *session,
795
unsigned long from_time)
796
{
797
struct ceph_dentry_info *di = ceph_dentry(dentry);
798
long unsigned duration = le32_to_cpu(lease->duration_ms);
799
long unsigned ttl = from_time + (duration * HZ) / 1000;
800
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
801
struct inode *dir;
802
803
/* only track leases on regular dentries */
804
if (dentry->d_op != &ceph_dentry_ops)
805
return;
806
807
spin_lock(&dentry->d_lock);
808
dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
809
dentry, le16_to_cpu(lease->mask), duration, ttl);
810
811
/* make lease_rdcache_gen match directory */
812
dir = dentry->d_parent->d_inode;
813
di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
814
815
if (lease->mask == 0)
816
goto out_unlock;
817
818
if (di->lease_gen == session->s_cap_gen &&
819
time_before(ttl, dentry->d_time))
820
goto out_unlock; /* we already have a newer lease. */
821
822
if (di->lease_session && di->lease_session != session)
823
goto out_unlock;
824
825
ceph_dentry_lru_touch(dentry);
826
827
if (!di->lease_session)
828
di->lease_session = ceph_get_mds_session(session);
829
di->lease_gen = session->s_cap_gen;
830
di->lease_seq = le32_to_cpu(lease->seq);
831
di->lease_renew_after = half_ttl;
832
di->lease_renew_from = 0;
833
dentry->d_time = ttl;
834
out_unlock:
835
spin_unlock(&dentry->d_lock);
836
return;
837
}
838
839
/*
840
* Set dentry's directory position based on the current dir's max, and
841
* order it in d_subdirs, so that dcache_readdir behaves.
842
*/
843
static void ceph_set_dentry_offset(struct dentry *dn)
844
{
845
struct dentry *dir = dn->d_parent;
846
struct inode *inode = dn->d_parent->d_inode;
847
struct ceph_dentry_info *di;
848
849
BUG_ON(!inode);
850
851
di = ceph_dentry(dn);
852
853
spin_lock(&inode->i_lock);
854
if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
855
spin_unlock(&inode->i_lock);
856
return;
857
}
858
di->offset = ceph_inode(inode)->i_max_offset++;
859
spin_unlock(&inode->i_lock);
860
861
spin_lock(&dir->d_lock);
862
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
863
list_move(&dn->d_u.d_child, &dir->d_subdirs);
864
dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
865
dn->d_u.d_child.prev, dn->d_u.d_child.next);
866
spin_unlock(&dn->d_lock);
867
spin_unlock(&dir->d_lock);
868
}
869
870
/*
871
* splice a dentry to an inode.
872
* caller must hold directory i_mutex for this to be safe.
873
*
874
* we will only rehash the resulting dentry if @prehash is
875
* true; @prehash will be set to false (for the benefit of
876
* the caller) if we fail.
877
*/
878
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
879
bool *prehash, bool set_offset)
880
{
881
struct dentry *realdn;
882
883
BUG_ON(dn->d_inode);
884
885
/* dn must be unhashed */
886
if (!d_unhashed(dn))
887
d_drop(dn);
888
realdn = d_materialise_unique(dn, in);
889
if (IS_ERR(realdn)) {
890
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
891
PTR_ERR(realdn), dn, in, ceph_vinop(in));
892
if (prehash)
893
*prehash = false; /* don't rehash on error */
894
dn = realdn; /* note realdn contains the error */
895
goto out;
896
} else if (realdn) {
897
dout("dn %p (%d) spliced with %p (%d) "
898
"inode %p ino %llx.%llx\n",
899
dn, dn->d_count,
900
realdn, realdn->d_count,
901
realdn->d_inode, ceph_vinop(realdn->d_inode));
902
dput(dn);
903
dn = realdn;
904
} else {
905
BUG_ON(!ceph_dentry(dn));
906
dout("dn %p attached to %p ino %llx.%llx\n",
907
dn, dn->d_inode, ceph_vinop(dn->d_inode));
908
}
909
if ((!prehash || *prehash) && d_unhashed(dn))
910
d_rehash(dn);
911
if (set_offset)
912
ceph_set_dentry_offset(dn);
913
out:
914
return dn;
915
}
916
917
/*
918
* Incorporate results into the local cache. This is either just
919
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
920
* after a lookup).
921
*
922
* A reply may contain:
923
* a directory inode along with a dentry.
924
* and/or a target inode
925
*
926
* Called with snap_rwsem (read).
927
*/
928
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
929
struct ceph_mds_session *session)
930
{
931
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
932
struct inode *in = NULL;
933
struct ceph_mds_reply_inode *ininfo;
934
struct ceph_vino vino;
935
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
936
int i = 0;
937
int err = 0;
938
939
dout("fill_trace %p is_dentry %d is_target %d\n", req,
940
rinfo->head->is_dentry, rinfo->head->is_target);
941
942
#if 0
943
/*
944
* Debugging hook:
945
*
946
* If we resend completed ops to a recovering mds, we get no
947
* trace. Since that is very rare, pretend this is the case
948
* to ensure the 'no trace' handlers in the callers behave.
949
*
950
* Fill in inodes unconditionally to avoid breaking cap
951
* invariants.
952
*/
953
if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
954
pr_info("fill_trace faking empty trace on %lld %s\n",
955
req->r_tid, ceph_mds_op_name(rinfo->head->op));
956
if (rinfo->head->is_dentry) {
957
rinfo->head->is_dentry = 0;
958
err = fill_inode(req->r_locked_dir,
959
&rinfo->diri, rinfo->dirfrag,
960
session, req->r_request_started, -1);
961
}
962
if (rinfo->head->is_target) {
963
rinfo->head->is_target = 0;
964
ininfo = rinfo->targeti.in;
965
vino.ino = le64_to_cpu(ininfo->ino);
966
vino.snap = le64_to_cpu(ininfo->snapid);
967
in = ceph_get_inode(sb, vino);
968
err = fill_inode(in, &rinfo->targeti, NULL,
969
session, req->r_request_started,
970
req->r_fmode);
971
iput(in);
972
}
973
}
974
#endif
975
976
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
977
dout("fill_trace reply is empty!\n");
978
if (rinfo->head->result == 0 && req->r_locked_dir)
979
ceph_invalidate_dir_request(req);
980
return 0;
981
}
982
983
if (rinfo->head->is_dentry) {
984
struct inode *dir = req->r_locked_dir;
985
986
err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
987
session, req->r_request_started, -1,
988
&req->r_caps_reservation);
989
if (err < 0)
990
return err;
991
}
992
993
/*
994
* ignore null lease/binding on snapdir ENOENT, or else we
995
* will have trouble splicing in the virtual snapdir later
996
*/
997
if (rinfo->head->is_dentry && !req->r_aborted &&
998
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
999
fsc->mount_options->snapdir_name,
1000
req->r_dentry->d_name.len))) {
1001
/*
1002
* lookup link rename : null -> possibly existing inode
1003
* mknod symlink mkdir : null -> new inode
1004
* unlink : linked -> null
1005
*/
1006
struct inode *dir = req->r_locked_dir;
1007
struct dentry *dn = req->r_dentry;
1008
bool have_dir_cap, have_lease;
1009
1010
BUG_ON(!dn);
1011
BUG_ON(!dir);
1012
BUG_ON(dn->d_parent->d_inode != dir);
1013
BUG_ON(ceph_ino(dir) !=
1014
le64_to_cpu(rinfo->diri.in->ino));
1015
BUG_ON(ceph_snap(dir) !=
1016
le64_to_cpu(rinfo->diri.in->snapid));
1017
1018
/* do we have a lease on the whole dir? */
1019
have_dir_cap =
1020
(le32_to_cpu(rinfo->diri.in->cap.caps) &
1021
CEPH_CAP_FILE_SHARED);
1022
1023
/* do we have a dn lease? */
1024
have_lease = have_dir_cap ||
1025
(le16_to_cpu(rinfo->dlease->mask) &
1026
CEPH_LOCK_DN);
1027
1028
if (!have_lease)
1029
dout("fill_trace no dentry lease or dir cap\n");
1030
1031
/* rename? */
1032
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1033
dout(" src %p '%.*s' dst %p '%.*s'\n",
1034
req->r_old_dentry,
1035
req->r_old_dentry->d_name.len,
1036
req->r_old_dentry->d_name.name,
1037
dn, dn->d_name.len, dn->d_name.name);
1038
dout("fill_trace doing d_move %p -> %p\n",
1039
req->r_old_dentry, dn);
1040
1041
d_move(req->r_old_dentry, dn);
1042
dout(" src %p '%.*s' dst %p '%.*s'\n",
1043
req->r_old_dentry,
1044
req->r_old_dentry->d_name.len,
1045
req->r_old_dentry->d_name.name,
1046
dn, dn->d_name.len, dn->d_name.name);
1047
1048
/* ensure target dentry is invalidated, despite
1049
rehashing bug in vfs_rename_dir */
1050
ceph_invalidate_dentry_lease(dn);
1051
1052
/*
1053
* d_move() puts the renamed dentry at the end of
1054
* d_subdirs. We need to assign it an appropriate
1055
* directory offset so we can behave when holding
1056
* I_COMPLETE.
1057
*/
1058
ceph_set_dentry_offset(req->r_old_dentry);
1059
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1060
ceph_dentry(req->r_old_dentry)->offset);
1061
1062
dn = req->r_old_dentry; /* use old_dentry */
1063
in = dn->d_inode;
1064
}
1065
1066
/* null dentry? */
1067
if (!rinfo->head->is_target) {
1068
dout("fill_trace null dentry\n");
1069
if (dn->d_inode) {
1070
dout("d_delete %p\n", dn);
1071
d_delete(dn);
1072
} else {
1073
dout("d_instantiate %p NULL\n", dn);
1074
d_instantiate(dn, NULL);
1075
if (have_lease && d_unhashed(dn))
1076
d_rehash(dn);
1077
update_dentry_lease(dn, rinfo->dlease,
1078
session,
1079
req->r_request_started);
1080
}
1081
goto done;
1082
}
1083
1084
/* attach proper inode */
1085
ininfo = rinfo->targeti.in;
1086
vino.ino = le64_to_cpu(ininfo->ino);
1087
vino.snap = le64_to_cpu(ininfo->snapid);
1088
in = dn->d_inode;
1089
if (!in) {
1090
in = ceph_get_inode(sb, vino);
1091
if (IS_ERR(in)) {
1092
pr_err("fill_trace bad get_inode "
1093
"%llx.%llx\n", vino.ino, vino.snap);
1094
err = PTR_ERR(in);
1095
d_delete(dn);
1096
goto done;
1097
}
1098
dn = splice_dentry(dn, in, &have_lease, true);
1099
if (IS_ERR(dn)) {
1100
err = PTR_ERR(dn);
1101
goto done;
1102
}
1103
req->r_dentry = dn; /* may have spliced */
1104
ihold(in);
1105
} else if (ceph_ino(in) == vino.ino &&
1106
ceph_snap(in) == vino.snap) {
1107
ihold(in);
1108
} else {
1109
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1110
dn, in, ceph_ino(in), ceph_snap(in),
1111
vino.ino, vino.snap);
1112
have_lease = false;
1113
in = NULL;
1114
}
1115
1116
if (have_lease)
1117
update_dentry_lease(dn, rinfo->dlease, session,
1118
req->r_request_started);
1119
dout(" final dn %p\n", dn);
1120
i++;
1121
} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1122
req->r_op == CEPH_MDS_OP_MKSNAP) {
1123
struct dentry *dn = req->r_dentry;
1124
1125
/* fill out a snapdir LOOKUPSNAP dentry */
1126
BUG_ON(!dn);
1127
BUG_ON(!req->r_locked_dir);
1128
BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1129
ininfo = rinfo->targeti.in;
1130
vino.ino = le64_to_cpu(ininfo->ino);
1131
vino.snap = le64_to_cpu(ininfo->snapid);
1132
in = ceph_get_inode(sb, vino);
1133
if (IS_ERR(in)) {
1134
pr_err("fill_inode get_inode badness %llx.%llx\n",
1135
vino.ino, vino.snap);
1136
err = PTR_ERR(in);
1137
d_delete(dn);
1138
goto done;
1139
}
1140
dout(" linking snapped dir %p to dn %p\n", in, dn);
1141
dn = splice_dentry(dn, in, NULL, true);
1142
if (IS_ERR(dn)) {
1143
err = PTR_ERR(dn);
1144
goto done;
1145
}
1146
req->r_dentry = dn; /* may have spliced */
1147
ihold(in);
1148
rinfo->head->is_dentry = 1; /* fool notrace handlers */
1149
}
1150
1151
if (rinfo->head->is_target) {
1152
vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1153
vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1154
1155
if (in == NULL || ceph_ino(in) != vino.ino ||
1156
ceph_snap(in) != vino.snap) {
1157
in = ceph_get_inode(sb, vino);
1158
if (IS_ERR(in)) {
1159
err = PTR_ERR(in);
1160
goto done;
1161
}
1162
}
1163
req->r_target_inode = in;
1164
1165
err = fill_inode(in,
1166
&rinfo->targeti, NULL,
1167
session, req->r_request_started,
1168
(le32_to_cpu(rinfo->head->result) == 0) ?
1169
req->r_fmode : -1,
1170
&req->r_caps_reservation);
1171
if (err < 0) {
1172
pr_err("fill_inode badness %p %llx.%llx\n",
1173
in, ceph_vinop(in));
1174
goto done;
1175
}
1176
}
1177
1178
done:
1179
dout("fill_trace done err=%d\n", err);
1180
return err;
1181
}
1182
1183
/*
1184
* Prepopulate our cache with readdir results, leases, etc.
1185
*/
1186
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1187
struct ceph_mds_session *session)
1188
{
1189
struct dentry *parent = req->r_dentry;
1190
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1191
struct qstr dname;
1192
struct dentry *dn;
1193
struct inode *in;
1194
int err = 0, i;
1195
struct inode *snapdir = NULL;
1196
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1197
u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1198
struct ceph_dentry_info *di;
1199
1200
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1201
snapdir = ceph_get_snapdir(parent->d_inode);
1202
parent = d_find_alias(snapdir);
1203
dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1204
rinfo->dir_nr, parent);
1205
} else {
1206
dout("readdir_prepopulate %d items under dn %p\n",
1207
rinfo->dir_nr, parent);
1208
if (rinfo->dir_dir)
1209
ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1210
}
1211
1212
for (i = 0; i < rinfo->dir_nr; i++) {
1213
struct ceph_vino vino;
1214
1215
dname.name = rinfo->dir_dname[i];
1216
dname.len = rinfo->dir_dname_len[i];
1217
dname.hash = full_name_hash(dname.name, dname.len);
1218
1219
vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1220
vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1221
1222
retry_lookup:
1223
dn = d_lookup(parent, &dname);
1224
dout("d_lookup on parent=%p name=%.*s got %p\n",
1225
parent, dname.len, dname.name, dn);
1226
1227
if (!dn) {
1228
dn = d_alloc(parent, &dname);
1229
dout("d_alloc %p '%.*s' = %p\n", parent,
1230
dname.len, dname.name, dn);
1231
if (dn == NULL) {
1232
dout("d_alloc badness\n");
1233
err = -ENOMEM;
1234
goto out;
1235
}
1236
err = ceph_init_dentry(dn);
1237
if (err < 0) {
1238
dput(dn);
1239
goto out;
1240
}
1241
} else if (dn->d_inode &&
1242
(ceph_ino(dn->d_inode) != vino.ino ||
1243
ceph_snap(dn->d_inode) != vino.snap)) {
1244
dout(" dn %p points to wrong inode %p\n",
1245
dn, dn->d_inode);
1246
d_delete(dn);
1247
dput(dn);
1248
goto retry_lookup;
1249
} else {
1250
/* reorder parent's d_subdirs */
1251
spin_lock(&parent->d_lock);
1252
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
1253
list_move(&dn->d_u.d_child, &parent->d_subdirs);
1254
spin_unlock(&dn->d_lock);
1255
spin_unlock(&parent->d_lock);
1256
}
1257
1258
di = dn->d_fsdata;
1259
di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1260
1261
/* inode */
1262
if (dn->d_inode) {
1263
in = dn->d_inode;
1264
} else {
1265
in = ceph_get_inode(parent->d_sb, vino);
1266
if (IS_ERR(in)) {
1267
dout("new_inode badness\n");
1268
d_delete(dn);
1269
dput(dn);
1270
err = PTR_ERR(in);
1271
goto out;
1272
}
1273
dn = splice_dentry(dn, in, NULL, false);
1274
if (IS_ERR(dn))
1275
dn = NULL;
1276
}
1277
1278
if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1279
req->r_request_started, -1,
1280
&req->r_caps_reservation) < 0) {
1281
pr_err("fill_inode badness on %p\n", in);
1282
goto next_item;
1283
}
1284
if (dn)
1285
update_dentry_lease(dn, rinfo->dir_dlease[i],
1286
req->r_session,
1287
req->r_request_started);
1288
next_item:
1289
if (dn)
1290
dput(dn);
1291
}
1292
req->r_did_prepopulate = true;
1293
1294
out:
1295
if (snapdir) {
1296
iput(snapdir);
1297
dput(parent);
1298
}
1299
dout("readdir_prepopulate done\n");
1300
return err;
1301
}
1302
1303
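/* set i_size locally; return nonzero if we should report the new size to the MDS */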
int ceph_inode_set_size(struct inode *inode, loff_t size)
1304
{
1305
struct ceph_inode_info *ci = ceph_inode(inode);
1306
int ret = 0;
1307
1308
spin_lock(&inode->i_lock);
1309
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1310
inode->i_size = size;
1311
inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1312
1313
/* tell the MDS if we are approaching max_size */
1314
if ((size << 1) >= ci->i_max_size &&
1315
(ci->i_reported_size << 1) < ci->i_max_size)
1316
ret = 1;
1317
1318
spin_unlock(&inode->i_lock);
1319
return ret;
1320
}
1321
1322
/*
1323
* Write back inode data in a worker thread. (This can't be done
1324
* in the message handler context.)
1325
*/
1326
void ceph_queue_writeback(struct inode *inode)
1327
{
1328
if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1329
&ceph_inode(inode)->i_wb_work)) {
1330
dout("ceph_queue_writeback %p\n", inode);
1331
ihold(inode);
1332
} else {
1333
dout("ceph_queue_writeback %p failed\n", inode);
1334
}
1335
}
1336
1337
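/* worker: write back dirty pages for the inode, then drop the ref taken when queued */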
static void ceph_writeback_work(struct work_struct *work)
1338
{
1339
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1340
i_wb_work);
1341
struct inode *inode = &ci->vfs_inode;
1342
1343
dout("writeback %p\n", inode);
1344
filemap_fdatawrite(&inode->i_data);
1345
iput(inode);
1346
}
1347
1348
/*
1349
* queue an async invalidation
1350
*/
1351
void ceph_queue_invalidate(struct inode *inode)
1352
{
1353
if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1354
&ceph_inode(inode)->i_pg_inv_work)) {
1355
dout("ceph_queue_invalidate %p\n", inode);
1356
ihold(inode);
1357
} else {
1358
dout("ceph_queue_invalidate %p failed\n", inode);
1359
}
1360
}
1361
1362
/*
1363
* invalidate any pages that are not dirty or under writeback. this
1364
* includes pages that are clean and mapped.
1365
*/
1366
static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1367
{
1368
struct pagevec pvec;
1369
pgoff_t next = 0;
1370
int i;
1371
1372
pagevec_init(&pvec, 0);
1373
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1374
for (i = 0; i < pagevec_count(&pvec); i++) {
1375
struct page *page = pvec.pages[i];
1376
pgoff_t index;
1377
int skip_page =
1378
(PageDirty(page) || PageWriteback(page));
1379
1380
if (!skip_page)
1381
skip_page = !trylock_page(page);
1382
1383
/*
1384
* We really shouldn't be looking at the ->index of an
1385
* unlocked page. But we're not allowed to lock these
1386
* pages. So we rely upon nobody altering the ->index
1387
* of this (pinned-by-us) page.
1388
*/
1389
index = page->index;
1390
if (index > next)
1391
next = index;
1392
next++;
1393
1394
if (skip_page)
1395
continue;
1396
1397
generic_error_remove_page(mapping, page);
1398
unlock_page(page);
1399
}
1400
pagevec_release(&pvec);
1401
cond_resched();
1402
}
1403
}
1404
1405
/*
1406
* Invalidate inode pages in a worker thread. (This can't be done
1407
* in the message handler context.)
1408
*/
1409
static void ceph_invalidate_work(struct work_struct *work)
1410
{
1411
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1412
i_pg_inv_work);
1413
struct inode *inode = &ci->vfs_inode;
1414
u32 orig_gen;
1415
int check = 0;
1416
1417
spin_lock(&inode->i_lock);
1418
dout("invalidate_pages %p gen %d revoking %d\n", inode,
1419
ci->i_rdcache_gen, ci->i_rdcache_revoking);
1420
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1421
/* nevermind! */
1422
spin_unlock(&inode->i_lock);
1423
goto out;
1424
}
1425
orig_gen = ci->i_rdcache_gen;
1426
spin_unlock(&inode->i_lock);
1427
1428
ceph_invalidate_nondirty_pages(inode->i_mapping);
1429
1430
spin_lock(&inode->i_lock);
1431
if (orig_gen == ci->i_rdcache_gen &&
1432
orig_gen == ci->i_rdcache_revoking) {
1433
dout("invalidate_pages %p gen %d successful\n", inode,
1434
ci->i_rdcache_gen);
1435
ci->i_rdcache_revoking--;
1436
check = 1;
1437
} else {
1438
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1439
inode, orig_gen, ci->i_rdcache_gen,
1440
ci->i_rdcache_revoking);
1441
}
1442
spin_unlock(&inode->i_lock);
1443
1444
if (check)
1445
ceph_check_caps(ci, 0, NULL);
1446
out:
1447
iput(inode);
1448
}
1449
1450
1451
/*
1452
* called by trunc_wq; take i_mutex ourselves
1453
*
1454
* We also truncate in a separate thread.
1455
*/
1456
static void ceph_vmtruncate_work(struct work_struct *work)
1457
{
1458
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1459
i_vmtruncate_work);
1460
struct inode *inode = &ci->vfs_inode;
1461
1462
dout("vmtruncate_work %p\n", inode);
1463
mutex_lock(&inode->i_mutex);
1464
__ceph_do_pending_vmtruncate(inode);
1465
mutex_unlock(&inode->i_mutex);
1466
iput(inode);
1467
}
1468
1469
/*
1470
* Queue an async vmtruncate. If we fail to queue work, we will handle
1471
* the truncation the next time we call __ceph_do_pending_vmtruncate.
1472
*/
1473
void ceph_queue_vmtruncate(struct inode *inode)
1474
{
1475
struct ceph_inode_info *ci = ceph_inode(inode);
1476
1477
if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1478
&ci->i_vmtruncate_work)) {
1479
dout("ceph_queue_vmtruncate %p\n", inode);
1480
ihold(inode);
1481
} else {
1482
dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1483
inode, ci->i_truncate_pending);
1484
}
1485
}
1486
1487
/*
1488
* called with i_mutex held.
1489
*
1490
* Make sure any pending truncation is applied before doing anything
1491
* that may depend on it.
1492
*/
1493
void __ceph_do_pending_vmtruncate(struct inode *inode)
1494
{
1495
struct ceph_inode_info *ci = ceph_inode(inode);
1496
u64 to;
1497
int wrbuffer_refs, wake = 0;
1498
1499
retry:
1500
spin_lock(&inode->i_lock);
1501
if (ci->i_truncate_pending == 0) {
1502
dout("__do_pending_vmtruncate %p none pending\n", inode);
1503
spin_unlock(&inode->i_lock);
1504
return;
1505
}
1506
1507
/*
1508
* make sure any dirty snapped pages are flushed before we
1509
* possibly truncate them.. so write AND block!
1510
*/
1511
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1512
dout("__do_pending_vmtruncate %p flushing snaps first\n",
1513
inode);
1514
spin_unlock(&inode->i_lock);
1515
filemap_write_and_wait_range(&inode->i_data, 0,
1516
inode->i_sb->s_maxbytes);
1517
goto retry;
1518
}
1519
1520
to = ci->i_truncate_size;
1521
wrbuffer_refs = ci->i_wrbuffer_ref;
1522
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1523
ci->i_truncate_pending, to);
1524
spin_unlock(&inode->i_lock);
1525
1526
truncate_inode_pages(inode->i_mapping, to);
1527
1528
spin_lock(&inode->i_lock);
1529
ci->i_truncate_pending--;
1530
if (ci->i_truncate_pending == 0)
1531
wake = 1;
1532
spin_unlock(&inode->i_lock);
1533
1534
if (wrbuffer_refs == 0)
1535
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1536
if (wake)
1537
wake_up_all(&ci->i_cap_wq);
1538
}
1539
1540
1541
/*
1542
* symlinks
1543
*/
1544
static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1545
{
1546
struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1547
nd_set_link(nd, ci->i_symlink);
1548
return NULL;
1549
}
1550
1551
static const struct inode_operations ceph_symlink_iops = {
1552
.readlink = generic_readlink,
1553
.follow_link = ceph_sym_follow_link,
1554
};
1555
1556
/*
1557
* setattr
1558
*/
1559
int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1560
{
1561
struct inode *inode = dentry->d_inode;
1562
struct ceph_inode_info *ci = ceph_inode(inode);
1563
struct inode *parent_inode = dentry->d_parent->d_inode;
1564
const unsigned int ia_valid = attr->ia_valid;
1565
struct ceph_mds_request *req;
1566
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1567
int issued;
1568
int release = 0, dirtied = 0;
1569
int mask = 0;
1570
int err = 0;
1571
int inode_dirty_flags = 0;
1572
1573
if (ceph_snap(inode) != CEPH_NOSNAP)
1574
return -EROFS;
1575
1576
__ceph_do_pending_vmtruncate(inode);
1577
1578
err = inode_change_ok(inode, attr);
1579
if (err != 0)
1580
return err;
1581
1582
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1583
USE_AUTH_MDS);
1584
if (IS_ERR(req))
1585
return PTR_ERR(req);
1586
1587
spin_lock(&inode->i_lock);
1588
issued = __ceph_caps_issued(ci, NULL);
1589
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1590
1591
if (ia_valid & ATTR_UID) {
1592
dout("setattr %p uid %d -> %d\n", inode,
1593
inode->i_uid, attr->ia_uid);
1594
if (issued & CEPH_CAP_AUTH_EXCL) {
1595
inode->i_uid = attr->ia_uid;
1596
dirtied |= CEPH_CAP_AUTH_EXCL;
1597
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1598
attr->ia_uid != inode->i_uid) {
1599
req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1600
mask |= CEPH_SETATTR_UID;
1601
release |= CEPH_CAP_AUTH_SHARED;
1602
}
1603
}
1604
if (ia_valid & ATTR_GID) {
1605
dout("setattr %p gid %d -> %d\n", inode,
1606
inode->i_gid, attr->ia_gid);
1607
if (issued & CEPH_CAP_AUTH_EXCL) {
1608
inode->i_gid = attr->ia_gid;
1609
dirtied |= CEPH_CAP_AUTH_EXCL;
1610
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1611
attr->ia_gid != inode->i_gid) {
1612
req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1613
mask |= CEPH_SETATTR_GID;
1614
release |= CEPH_CAP_AUTH_SHARED;
1615
}
1616
}
1617
if (ia_valid & ATTR_MODE) {
1618
dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1619
attr->ia_mode);
1620
if (issued & CEPH_CAP_AUTH_EXCL) {
1621
inode->i_mode = attr->ia_mode;
1622
dirtied |= CEPH_CAP_AUTH_EXCL;
1623
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1624
attr->ia_mode != inode->i_mode) {
1625
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1626
mask |= CEPH_SETATTR_MODE;
1627
release |= CEPH_CAP_AUTH_SHARED;
1628
}
1629
}
1630
1631
if (ia_valid & ATTR_ATIME) {
1632
dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1633
inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1634
attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1635
if (issued & CEPH_CAP_FILE_EXCL) {
1636
ci->i_time_warp_seq++;
1637
inode->i_atime = attr->ia_atime;
1638
dirtied |= CEPH_CAP_FILE_EXCL;
1639
} else if ((issued & CEPH_CAP_FILE_WR) &&
1640
timespec_compare(&inode->i_atime,
1641
&attr->ia_atime) < 0) {
1642
inode->i_atime = attr->ia_atime;
1643
dirtied |= CEPH_CAP_FILE_WR;
1644
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1645
!timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1646
ceph_encode_timespec(&req->r_args.setattr.atime,
1647
&attr->ia_atime);
1648
mask |= CEPH_SETATTR_ATIME;
1649
release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1650
CEPH_CAP_FILE_WR;
1651
}
1652
}
1653
if (ia_valid & ATTR_MTIME) {
1654
dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1655
inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1656
attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1657
if (issued & CEPH_CAP_FILE_EXCL) {
1658
ci->i_time_warp_seq++;
1659
inode->i_mtime = attr->ia_mtime;
1660
dirtied |= CEPH_CAP_FILE_EXCL;
1661
} else if ((issued & CEPH_CAP_FILE_WR) &&
1662
timespec_compare(&inode->i_mtime,
1663
&attr->ia_mtime) < 0) {
1664
inode->i_mtime = attr->ia_mtime;
1665
dirtied |= CEPH_CAP_FILE_WR;
1666
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1667
!timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1668
ceph_encode_timespec(&req->r_args.setattr.mtime,
1669
&attr->ia_mtime);
1670
mask |= CEPH_SETATTR_MTIME;
1671
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1672
CEPH_CAP_FILE_WR;
1673
}
1674
}
1675
if (ia_valid & ATTR_SIZE) {
1676
dout("setattr %p size %lld -> %lld\n", inode,
1677
inode->i_size, attr->ia_size);
1678
if (attr->ia_size > inode->i_sb->s_maxbytes) {
1679
err = -EINVAL;
1680
goto out;
1681
}
1682
if ((issued & CEPH_CAP_FILE_EXCL) &&
1683
attr->ia_size > inode->i_size) {
1684
inode->i_size = attr->ia_size;
1685
inode->i_blocks =
1686
(attr->ia_size + (1 << 9) - 1) >> 9;
1687
inode->i_ctime = attr->ia_ctime;
1688
ci->i_reported_size = attr->ia_size;
1689
dirtied |= CEPH_CAP_FILE_EXCL;
1690
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1691
attr->ia_size != inode->i_size) {
1692
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1693
req->r_args.setattr.old_size =
1694
cpu_to_le64(inode->i_size);
1695
mask |= CEPH_SETATTR_SIZE;
1696
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1697
CEPH_CAP_FILE_WR;
1698
}
1699
}
1700
1701
/* these do nothing */
1702
if (ia_valid & ATTR_CTIME) {
1703
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1704
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1705
dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1706
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1707
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1708
only ? "ctime only" : "ignored");
1709
inode->i_ctime = attr->ia_ctime;
1710
if (only) {
1711
/*
1712
* if kernel wants to dirty ctime but nothing else,
1713
* we need to choose a cap to dirty under, or do
1714
* an almost-no-op setattr
1715
*/
1716
if (issued & CEPH_CAP_AUTH_EXCL)
1717
dirtied |= CEPH_CAP_AUTH_EXCL;
1718
else if (issued & CEPH_CAP_FILE_EXCL)
1719
dirtied |= CEPH_CAP_FILE_EXCL;
1720
else if (issued & CEPH_CAP_XATTR_EXCL)
1721
dirtied |= CEPH_CAP_XATTR_EXCL;
1722
else
1723
mask |= CEPH_SETATTR_CTIME;
1724
}
1725
}
1726
if (ia_valid & ATTR_FILE)
1727
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1728
1729
if (dirtied) {
1730
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
1731
inode->i_ctime = CURRENT_TIME;
1732
}
1733
1734
release &= issued;
1735
spin_unlock(&inode->i_lock);
1736
1737
if (inode_dirty_flags)
1738
__mark_inode_dirty(inode, inode_dirty_flags);
1739
1740
if (mask) {
1741
req->r_inode = inode;
1742
ihold(inode);
1743
req->r_inode_drop = release;
1744
req->r_args.setattr.mask = cpu_to_le32(mask);
1745
req->r_num_caps = 1;
1746
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1747
}
1748
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1749
ceph_cap_string(dirtied), mask);
1750
1751
ceph_mdsc_put_request(req);
1752
__ceph_do_pending_vmtruncate(inode);
1753
return err;
1754
out:
1755
spin_unlock(&inode->i_lock);
1756
ceph_mdsc_put_request(req);
1757
return err;
1758
}
1759
1760
/*
1761
* Verify that we have a lease on the given mask. If not,
1762
* do a getattr against an mds.
1763
*/
1764
int ceph_do_getattr(struct inode *inode, int mask)
1765
{
1766
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1767
struct ceph_mds_client *mdsc = fsc->mdsc;
1768
struct ceph_mds_request *req;
1769
int err;
1770
1771
if (ceph_snap(inode) == CEPH_SNAPDIR) {
1772
dout("do_getattr inode %p SNAPDIR\n", inode);
1773
return 0;
1774
}
1775
1776
dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1777
if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1778
return 0;
1779
1780
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1781
if (IS_ERR(req))
1782
return PTR_ERR(req);
1783
req->r_inode = inode;
1784
ihold(inode);
1785
req->r_num_caps = 1;
1786
req->r_args.getattr.mask = cpu_to_le32(mask);
1787
err = ceph_mdsc_do_request(mdsc, NULL, req);
1788
ceph_mdsc_put_request(req);
1789
dout("do_getattr result=%d\n", err);
1790
return err;
1791
}
1792
1793
1794
/*
1795
* Check inode permissions. We verify we have a valid value for
1796
* the AUTH cap, then call the generic handler.
1797
*/
1798
int ceph_permission(struct inode *inode, int mask, unsigned int flags)
1799
{
1800
int err;
1801
1802
if (flags & IPERM_FLAG_RCU)
1803
return -ECHILD;
1804
1805
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1806
1807
if (!err)
1808
err = generic_permission(inode, mask, flags, NULL);
1809
return err;
1810
}
1811
1812
/*
1813
* Get all attributes. Hopefully someday we'll have a statlite()
1814
* and can limit the fields we require to be accurate.
1815
*/
1816
int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1817
struct kstat *stat)
1818
{
1819
struct inode *inode = dentry->d_inode;
1820
struct ceph_inode_info *ci = ceph_inode(inode);
1821
int err;
1822
1823
err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1824
if (!err) {
1825
generic_fillattr(inode, stat);
1826
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1827
if (ceph_snap(inode) != CEPH_NOSNAP)
1828
stat->dev = ceph_snap(inode);
1829
else
1830
stat->dev = 0;
1831
if (S_ISDIR(inode->i_mode)) {
1832
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1833
RBYTES))
1834
stat->size = ci->i_rbytes;
1835
else
1836
stat->size = ci->i_files + ci->i_subdirs;
1837
stat->blocks = 0;
1838
stat->blksize = 65536;
1839
}
1840
}
1841
return err;
1842
}
1843
1844