GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/ceph/file.c
#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/writeback.h>

#include "super.h"
#include "mds_client.h"

/*
 * Ceph file operations
 *
 * Implement basic open/close functionality, and implement
 * read/write.
 *
 * We implement three modes of file I/O:
 *  - buffered uses the generic_file_aio_{read,write} helpers
 *
 *  - synchronous is used when there is multi-client read/write
 *    sharing, avoids the page cache, and synchronously waits for an
 *    ack from the OSD.
 *
 *  - direct io takes the variant of the sync path that references
 *    user pages directly.
 *
 * fsync() flushes and waits on dirty pages, but just queues metadata
 * for writeback: since the MDS can recover size and mtime there is no
 * need to wait for MDS acknowledgement.
 */
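
/*
 * For example, the choice among these modes is visible in
 * ceph_aio_read()/ceph_aio_write() below: the sync path is taken when
 * the needed FILE_CACHE/FILE_BUFFER caps are not held, when the file
 * was opened O_DIRECT, or when the mount is MS_SYNCHRONOUS; otherwise
 * I/O goes through the generic buffered helpers.
 */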

/*
 * Prepare an open request.  Preallocate ceph_cap to avoid an
 * inopportune ENOMEM later.
 */
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int want_auth = USE_ANY_MDS;
        int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;

        if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
                want_auth = USE_AUTH_MDS;

        req = ceph_mdsc_create_request(mdsc, op, want_auth);
        if (IS_ERR(req))
                goto out;
        req->r_fmode = ceph_flags_to_mode(flags);
        req->r_args.open.flags = cpu_to_le32(flags);
        req->r_args.open.mode = cpu_to_le32(create_mode);
        req->r_args.open.preferred = cpu_to_le32(-1);
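        /* -1 presumably means "no preferred placement"; the preferred
         * field was later dropped from the protocol entirely */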
out:
        return req;
}

/*
 * initialize private struct file data.
 * if we fail, clean up by dropping fmode reference on the ceph_inode
 */
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
        struct ceph_file_info *cf;
        int ret = 0;

        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
        case S_IFDIR:
                dout("init_file %p %p 0%o (regular)\n", inode, file,
                     inode->i_mode);
                cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
                if (cf == NULL) {
                        ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                        return -ENOMEM;
                }
                cf->fmode = fmode;
                cf->next_offset = 2;
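                /* readdir offsets 0 and 1 are taken by "." and "..",
                 * so plain directory entries start at offset 2 */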
                file->private_data = cf;
                BUG_ON(inode->i_fop->release != ceph_release);
                break;

        case S_IFLNK:
                dout("init_file %p %p 0%o (symlink)\n", inode, file,
                     inode->i_mode);
                ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                break;

        default:
                dout("init_file %p %p 0%o (special)\n", inode, file,
                     inode->i_mode);
                /*
                 * we need to drop the open ref now, since we don't
                 * have .release set to ceph_release.
                 */
                ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                BUG_ON(inode->i_fop->release == ceph_release);

                /* call the proper open fop */
                ret = inode->i_fop->open(inode, file);
        }
        return ret;
}

/*
 * If the filp already has private_data, that means the file was
 * already opened by intent during lookup, and we do nothing.
 *
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.
 */
int ceph_open(struct inode *inode, struct file *file)
{
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_file_info *cf = file->private_data;
        struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
        int err;
        int flags, fmode, wanted;

        if (cf) {
                dout("open file %p is already opened\n", file);
                return 0;
        }

        /* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
        flags = file->f_flags & ~(O_CREAT|O_EXCL);
        if (S_ISDIR(inode->i_mode))
                flags = O_DIRECTORY;  /* mds likes to know */

        dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
             ceph_vinop(inode), file, flags, file->f_flags);
        fmode = ceph_flags_to_mode(flags);
        wanted = ceph_caps_for_mode(fmode);

        /* snapped files are read-only */
        if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
                return -EROFS;

        /* trivially open snapdir */
        if (ceph_snap(inode) == CEPH_SNAPDIR) {
                spin_lock(&inode->i_lock);
                __ceph_get_fmode(ci, fmode);
                spin_unlock(&inode->i_lock);
                return ceph_init_file(inode, file, fmode);
        }

        /*
         * No need to block if we have caps on the auth MDS (for
         * write) or any MDS (for read).  Update wanted set
         * asynchronously.
         */
        spin_lock(&inode->i_lock);
        if (__ceph_is_any_real_caps(ci) &&
            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
                int mds_wanted = __ceph_caps_mds_wanted(ci);
                int issued = __ceph_caps_issued(ci, NULL);

                dout("open %p fmode %d want %s issued %s using existing\n",
                     inode, fmode, ceph_cap_string(wanted),
                     ceph_cap_string(issued));
                __ceph_get_fmode(ci, fmode);
                spin_unlock(&inode->i_lock);

                /* adjust wanted? */
                if ((issued & wanted) != wanted &&
                    (mds_wanted & wanted) != wanted &&
                    ceph_snap(inode) != CEPH_SNAPDIR)
                        ceph_check_caps(ci, 0, NULL);

                return ceph_init_file(inode, file, fmode);
        } else if (ceph_snap(inode) != CEPH_NOSNAP &&
                   (ci->i_snap_caps & wanted) == wanted) {
                __ceph_get_fmode(ci, fmode);
                spin_unlock(&inode->i_lock);
                return ceph_init_file(inode, file, fmode);
        }
        spin_unlock(&inode->i_lock);

        dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
        req = prepare_open_request(inode->i_sb, flags, 0);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
        req->r_inode = inode;
        ihold(inode);
        req->r_num_caps = 1;
        err = ceph_mdsc_do_request(mdsc, parent_inode, req);
        if (!err)
                err = ceph_init_file(inode, file, req->r_fmode);
        ceph_mdsc_put_request(req);
        dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
        return err;
}


/*
 * Do a lookup + open with a single request.
 *
 * If this succeeds, but some subsequent check in the vfs
 * may_open() fails, the struct *file gets cleaned up (i.e.
 * ceph_release gets called).  So fear not!
 */
/*
 * flags
 *  path_lookup_open   -> LOOKUP_OPEN
 *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
 */
struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                struct nameidata *nd, int mode,
                                int locked_dir)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct file *file = nd->intent.open.file;
        struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
        struct ceph_mds_request *req;
        int err;
        int flags = nd->intent.open.flags - 1;  /* silly vfs! */
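        /*
         * "silly vfs": the VFS stores the intent open flags biased by
         * +1 so that the O_ACCMODE values 0/1/2 become distinct
         * read/write bits; subtracting 1 recovers the original O_* flags.
         */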

        dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
             dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);

        /* do the open */
        req = prepare_open_request(dir->i_sb, flags, mode);
        if (IS_ERR(req))
                return ERR_CAST(req);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        if (flags & O_CREAT) {
                req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
                req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        }
        req->r_locked_dir = dir;  /* caller holds dir->i_mutex */
        err = ceph_mdsc_do_request(mdsc, parent_inode, req);
        dentry = ceph_finish_lookup(req, dentry, err);
        if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
        if (!err)
                err = ceph_init_file(req->r_dentry->d_inode, file,
                                     req->r_fmode);
        ceph_mdsc_put_request(req);
        dout("ceph_lookup_open result=%p\n", dentry);
        return dentry;
}

int ceph_release(struct inode *inode, struct file *file)
{
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_file_info *cf = file->private_data;

        dout("release inode %p file %p\n", inode, file);
        ceph_put_fmode(ci, cf->fmode);
        if (cf->last_readdir)
                ceph_mdsc_put_request(cf->last_readdir);
        kfree(cf->last_name);
        kfree(cf->dir_info);
        dput(cf->dentry);
        kmem_cache_free(ceph_file_cachep, cf);

        /* wake up anyone waiting for caps on this inode */
        wake_up_all(&ci->i_cap_wq);
        return 0;
}

/*
 * Read a range of bytes striped over one or more objects.  Iterate over
 * objects we stripe over.  (That's not atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 */
static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
                        int *checkeof, bool o_direct,
                        unsigned long buf_align)
{
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 pos, this_len;
        int io_align, page_align;
        int left, pages_left;
        int read;
        struct page **page_pos;
        int ret;
        bool hit_stripe, was_short;

        /*
         * we may need to do multiple reads.  not atomic, unfortunately.
         */
        pos = off;
        left = len;
        page_pos = pages;
        pages_left = num_pages;
        read = 0;
        io_align = off & ~PAGE_MASK;

more:
        if (o_direct)
                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
        else
                page_align = pos & ~PAGE_MASK;
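        /*
         * Worked example of the O_DIRECT case (assuming 4 KB pages):
         * off = 5000 gives io_align = 904; a user buffer ending in
         * 0x100 gives buf_align = 0x100.  On the first pass pos == off,
         * so page_align = (5000 - 904 + 0x100) & 4095 = 0x100, i.e. data
         * lands at the same in-page offset as in the user buffer.
         */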
        this_len = left;
        ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                  &ci->i_layout, pos, &this_len,
                                  ci->i_truncate_seq,
                                  ci->i_truncate_size,
                                  page_pos, pages_left, page_align);
        if (ret == -ENOENT)
                ret = 0;
        hit_stripe = this_len < left;
        was_short = ret >= 0 && ret < this_len;
        dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
             ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");

        if (ret > 0) {
                int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;

                if (read < pos - off) {
                        dout(" zero gap %llu to %llu\n", off + read, pos);
                        ceph_zero_page_vector_range(page_align + read,
                                                    pos - off - read, pages);
                }
                pos += ret;
                read = pos - off;
                left -= ret;
                page_pos += didpages;
                pages_left -= didpages;

                /* hit stripe? */
                if (left && hit_stripe)
                        goto more;
        }

        if (was_short) {
                /* did we bounce off eof? */
                if (pos + left > inode->i_size)
                        *checkeof = 1;

                /* zero trailing bytes (inside i_size) */
                if (left > 0 && pos < inode->i_size) {
                        if (pos + left > inode->i_size)
                                left = inode->i_size - pos;

                        dout("zero tail %d\n", left);
                        ceph_zero_page_vector_range(page_align + read, left,
                                                    pages);
                        read += left;
                }
        }

        if (ret >= 0)
                ret = read;
        dout("striped_read returns %d\n", ret);
        return ret;
}

/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.
 */
static ssize_t ceph_sync_read(struct file *file, char __user *data,
                              unsigned len, loff_t *poff, int *checkeof)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct page **pages;
        u64 off = *poff;
        int num_pages, ret;

        dout("sync_read on file %p %llu~%u %s\n", file, off, len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

        if (file->f_flags & O_DIRECT) {
                num_pages = calc_pages_for((unsigned long)data, len);
                pages = ceph_get_direct_page_vector(data, num_pages, true);
        } else {
                num_pages = calc_pages_for(off, len);
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        /*
         * flush any page cache pages in this range.  this
         * will make concurrent normal and sync io slow,
         * but it will at least behave sensibly when they are
         * in sequence.
         */
        ret = filemap_write_and_wait(inode->i_mapping);
        if (ret < 0)
                goto done;

        ret = striped_read(inode, off, len, pages, num_pages, checkeof,
                           file->f_flags & O_DIRECT,
                           (unsigned long)data & ~PAGE_MASK);

        if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
                ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
        if (ret >= 0)
                *poff = off + ret;

done:
        if (file->f_flags & O_DIRECT)
                ceph_put_page_vector(pages, num_pages, true);
        else
                ceph_release_page_vector(pages, num_pages);
        dout("sync_read result %d\n", ret);
        return ret;
}

/*
 * Write commit callback, called if we requested both an ACK and
 * ONDISK commit reply from the OSD.
 */
static void sync_write_commit(struct ceph_osd_request *req,
                              struct ceph_msg *msg)
{
        struct ceph_inode_info *ci = ceph_inode(req->r_inode);

        dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
        spin_lock(&ci->i_unsafe_lock);
        list_del_init(&req->r_unsafe_item);
        spin_unlock(&ci->i_unsafe_lock);
        ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
}

/*
 * Synchronous write, straight from __user pointer or user pages (if
 * O_DIRECT).
 *
 * If write spans object boundary, just do multiple writes.  (For a
 * correct atomic write, we should e.g. take write locks on all
 * objects, rollback on failure, etc.)
 */
static ssize_t ceph_sync_write(struct file *file, const char __user *data,
                               size_t left, loff_t *offset)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req;
        struct page **pages;
        int num_pages;
        long long unsigned pos;
        u64 len;
        int written = 0;
        int flags;
        int do_sync = 0;
        int check_caps = 0;
        int page_align, io_align;
        unsigned long buf_align;
        int ret;
        struct timespec mtime = CURRENT_TIME;

        if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
                return -EROFS;

        dout("sync_write on file %p %lld~%u %s\n", file, *offset,
             (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

        if (file->f_flags & O_APPEND)
                pos = i_size_read(inode);
        else
                pos = *offset;

        ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
        if (ret < 0)
                return ret;

        ret = invalidate_inode_pages2_range(inode->i_mapping,
                                            pos >> PAGE_CACHE_SHIFT,
                                            (pos + left) >> PAGE_CACHE_SHIFT);
        if (ret < 0)
                dout("invalidate_inode_pages2_range returned %d\n", ret);

        flags = CEPH_OSD_FLAG_ORDERSNAP |
                CEPH_OSD_FLAG_ONDISK |
                CEPH_OSD_FLAG_WRITE;
        if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
                flags |= CEPH_OSD_FLAG_ACK;
        else
                do_sync = 1;
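        /*
         * With FLAG_ACK we can get an early in-memory ack followed by a
         * separate on-disk commit; sync_write_commit() above handles
         * that second reply.  For O_SYNC/O_DIRECT we instead wait for
         * the single ondisk reply.
         */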

        /*
         * we may need to do multiple writes here if we span an object
         * boundary.  this isn't atomic, unfortunately.  :(
         */
more:
        io_align = pos & ~PAGE_MASK;
        buf_align = (unsigned long)data & ~PAGE_MASK;
        len = left;
        if (file->f_flags & O_DIRECT) {
                /* write from beginning of first page, regardless of
                   io alignment */
                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
                num_pages = calc_pages_for((unsigned long)data, len);
        } else {
                page_align = pos & ~PAGE_MASK;
                num_pages = calc_pages_for(pos, len);
        }
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), pos, &len,
                                    CEPH_OSD_OP_WRITE, flags,
                                    ci->i_snap_realm->cached_context,
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    &mtime, false, 2, page_align);
        if (!req)
                return -ENOMEM;

        if (file->f_flags & O_DIRECT) {
                pages = ceph_get_direct_page_vector(data, num_pages, false);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
                }

                /*
                 * throw out any page cache pages in this range.  this
                 * may block.
                 */
                truncate_inode_pages_range(inode->i_mapping, pos,
                                           (pos+len) | (PAGE_CACHE_SIZE-1));
        } else {
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
                }
                ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
                if (ret < 0) {
                        ceph_release_page_vector(pages, num_pages);
                        goto out;
                }

                if ((file->f_flags & O_SYNC) == 0) {
                        /* get a second commit callback */
                        req->r_safe_callback = sync_write_commit;
                        req->r_own_pages = 1;
                }
        }
        req->r_pages = pages;
        req->r_num_pages = num_pages;
        req->r_inode = inode;

        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!ret) {
                if (req->r_safe_callback) {
                        /*
                         * Add to inode unsafe list only after we
                         * start_request so that a tid has been assigned.
                         */
                        spin_lock(&ci->i_unsafe_lock);
                        list_add_tail(&req->r_unsafe_item,
                                      &ci->i_unsafe_writes);
                        spin_unlock(&ci->i_unsafe_lock);
                        ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
                }

                ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
                if (ret < 0 && req->r_safe_callback) {
                        spin_lock(&ci->i_unsafe_lock);
                        list_del_init(&req->r_unsafe_item);
                        spin_unlock(&ci->i_unsafe_lock);
                        ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
                }
        }

        if (file->f_flags & O_DIRECT)
                ceph_put_page_vector(pages, num_pages, false);
        else if (file->f_flags & O_SYNC)
                ceph_release_page_vector(pages, num_pages);

out:
        ceph_osdc_put_request(req);
        if (ret == 0) {
                pos += len;
                written += len;
                left -= len;
                data += written;
                if (left)
                        goto more;

                ret = written;
                *offset = pos;
                if (pos > i_size_read(inode))
                        check_caps = ceph_inode_set_size(inode, pos);
                if (check_caps)
                        ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
                                        NULL);
        }
        return ret;
}

/*
 * Wrap generic_file_aio_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * Hmm, the sync read case isn't actually async... should it be?
 */
static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
                             unsigned long nr_segs, loff_t pos)
{
        struct file *filp = iocb->ki_filp;
        struct ceph_file_info *fi = filp->private_data;
        loff_t *ppos = &iocb->ki_pos;
        size_t len = iov->iov_len;
        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        void __user *base = iov->iov_base;
        ssize_t ret;
        int want, got = 0;
        int checkeof = 0, read = 0;

        dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
             inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
        __ceph_do_pending_vmtruncate(inode);
        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
        ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
        if (ret < 0)
                goto out;
        dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
             inode, ceph_vinop(inode), pos, (unsigned)len,
             ceph_cap_string(got));

        if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_filp->f_flags & O_DIRECT) ||
            (inode->i_sb->s_flags & MS_SYNCHRONOUS))
                /* hmm, this isn't really async... */
                ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
        else
                ret = generic_file_aio_read(iocb, iov, nr_segs, pos);

out:
        dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
             inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
        ceph_put_cap_refs(ci, got);

        if (checkeof && ret >= 0) {
                int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);

                /* hit EOF or hole? */
                if (statret == 0 && *ppos < inode->i_size) {
                        dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n",
                             *ppos, inode->i_size);
                        read += ret;
                        base += ret;
                        len -= ret;
                        checkeof = 0;
                        goto again;
                }
        }
        if (ret >= 0)
                ret += read;

        return ret;
}

/*
 * Take cap references to avoid releasing caps to MDS mid-write.
 *
 * If we are synchronous, and write with an old snap context, the OSD
 * may return EOLDSNAPC.  In that case, retry the write.. _after_
 * dropping our cap refs and allowing the pending snap to logically
 * complete _before_ this write occurs.
 *
 * If we are near ENOSPC, write synchronously.
 */
static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc =
                &ceph_sb_to_client(inode->i_sb)->client->osdc;
        loff_t endoff = pos + iov->iov_len;
        int want, got = 0;
        int ret, err;

        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;

retry_snap:
        if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
                return -ENOSPC;
        __ceph_do_pending_vmtruncate(inode);
        dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
             inode->i_size);
        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_BUFFER;
        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
        if (ret < 0)
                goto out;

        dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
             ceph_cap_string(got));

        if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_filp->f_flags & O_DIRECT) ||
            (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
                ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
                                      &iocb->ki_pos);
        } else {
                ret = generic_file_aio_write(iocb, iov, nr_segs, pos);

                if ((ret >= 0 || ret == -EIOCBQUEUED) &&
                    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
                     || ceph_osdmap_flag(osdc->osdmap,
                                         CEPH_OSDMAP_NEARFULL))) {
                        err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
                        if (err < 0)
                                ret = err;
                }
        }
        if (ret >= 0) {
                int dirty;
                spin_lock(&inode->i_lock);
                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
                spin_unlock(&inode->i_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
        }

out:
        dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
             ceph_cap_string(got));
        ceph_put_cap_refs(ci, got);

        if (ret == -EOLDSNAPC) {
                dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
                     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
                goto retry_snap;
        }

        return ret;
}

/*
 * llseek.  be sure to verify file size on SEEK_END.
 */
static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
{
        struct inode *inode = file->f_mapping->host;
        int ret;

        mutex_lock(&inode->i_mutex);
        __ceph_do_pending_vmtruncate(inode);
        switch (origin) {
        case SEEK_END:
                ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
                if (ret < 0) {
                        offset = ret;
                        goto out;
                }
                offset += inode->i_size;
                break;
        case SEEK_CUR:
                /*
                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
                 * position-querying operation.  Avoid rewriting the "same"
                 * f_pos value back to the file because a concurrent read(),
                 * write() or lseek() might have altered it
                 */
                if (offset == 0) {
                        offset = file->f_pos;
                        goto out;
                }
                offset += file->f_pos;
                break;
        }

        if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
                offset = -EINVAL;
                goto out;
        }

        /* Special lock needed here? */
        if (offset != file->f_pos) {
                file->f_pos = offset;
                file->f_version = 0;
        }

out:
        mutex_unlock(&inode->i_mutex);
        return offset;
}

const struct file_operations ceph_file_fops = {
        .open = ceph_open,
        .release = ceph_release,
        .llseek = ceph_llseek,
        .read = do_sync_read,
        .write = do_sync_write,
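        /* plain read(2)/write(2) funnel through the aio entry points */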
        .aio_read = ceph_aio_read,
        .aio_write = ceph_aio_write,
        .mmap = ceph_mmap,
        .fsync = ceph_fsync,
        .lock = ceph_lock,
        .flock = ceph_flock,
        .splice_read = generic_file_splice_read,
        .splice_write = generic_file_splice_write,
        .unlocked_ioctl = ceph_ioctl,
        .compat_ioctl = ceph_ioctl,
};