CoCalc -- rbd.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/block/rbd.c
¹⁷³¹⁶ views
1
/*
2
   rbd.c -- Export ceph rados objects as a Linux block device
3

4

5
   based on drivers/block/osdblk.c:
6

7
   Copyright 2009 Red Hat, Inc.
8

9
   This program is free software; you can redistribute it and/or modify
10
   it under the terms of the GNU General Public License as published by
11
   the Free Software Foundation.
12

13
   This program is distributed in the hope that it will be useful,
14
   but WITHOUT ANY WARRANTY; without even the implied warranty of
15
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
   GNU General Public License for more details.
17

18
   You should have received a copy of the GNU General Public License
19
   along with this program; see the file COPYING.  If not, write to
20
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21

22

23

24
   For usage instructions, please refer to:
25

26
                 Documentation/ABI/testing/sysfs-bus-rbd
27

28
 */
29

30
#include <linux/ceph/libceph.h>
31
#include <linux/ceph/osd_client.h>
32
#include <linux/ceph/mon_client.h>
33
#include <linux/ceph/decode.h>
34
#include <linux/parser.h>
35

36
#include <linux/kernel.h>
37
#include <linux/device.h>
38
#include <linux/module.h>
39
#include <linux/fs.h>
40
#include <linux/blkdev.h>
41

42
#include "rbd_types.h"
43

44
#define DRV_NAME "rbd"
45
#define DRV_NAME_LONG "rbd (rados block device)"
46

47
#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
48

49
#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
50
#define RBD_MAX_POOL_NAME_LEN	64
51
#define RBD_MAX_SNAP_NAME_LEN	32
52
#define RBD_MAX_OPT_LEN		1024
53

54
#define RBD_SNAP_HEAD_NAME	"-"
55

56
#define DEV_NAME_LEN		32
57

58
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59

60
/*
61
 * block device image metadata (in-memory version)
62
 */
63
struct rbd_image_header {
64
	u64 image_size;
65
	char block_name[32];
66
	__u8 obj_order;
67
	__u8 crypt_type;
68
	__u8 comp_type;
69
	struct rw_semaphore snap_rwsem;
70
	struct ceph_snap_context *snapc;
71
	size_t snap_names_len;
72
	u64 snap_seq;
73
	u32 total_snaps;
74

75
	char *snap_names;
76
	u64 *snap_sizes;
77

78
	u64 obj_version;
79
};
80

81
/*
 * rbd specific options, filled in by parse_rbd_opts_token().
 */
struct rbd_options {
	int notify_timeout;	/* RBD_NOTIFY_TIMEOUT_DEFAULT unless overridden */
};
84

85
/*
86
 * an instance of the client.  multiple devices may share a client.
87
 */
88
struct rbd_client {
89
	struct ceph_client	*client;
90
	struct rbd_options	*rbd_opts;
91
	struct kref		kref;
92
	struct list_head	node;
93
};
94

95
struct rbd_req_coll;
96

97
/*
98
 * a single io request
99
 */
100
struct rbd_request {
101
	struct request		*rq;		/* blk layer request */
102
	struct bio		*bio;		/* cloned bio */
103
	struct page		**pages;	/* list of used pages */
104
	u64			len;
105
	int			coll_index;
106
	struct rbd_req_coll	*coll;
107
};
108

109
struct rbd_req_status {
110
	int done;
111
	int rc;
112
	u64 bytes;
113
};
114

115
/*
116
 * a collection of requests
117
 */
118
struct rbd_req_coll {
119
	int			total;
120
	int			num_done;
121
	struct kref		kref;
122
	struct rbd_req_status	status[0];
123
};
124

125
struct rbd_snap {
126
	struct	device		dev;
127
	const char		*name;
128
	size_t			size;
129
	struct list_head	node;
130
	u64			id;
131
};
132

133
/*
134
 * a single device
135
 */
136
struct rbd_device {
137
	int			id;		/* blkdev unique id */
138

139
	int			major;		/* blkdev assigned major */
140
	struct gendisk		*disk;		/* blkdev's gendisk and rq */
141
	struct request_queue	*q;
142

143
	struct ceph_client	*client;
144
	struct rbd_client	*rbd_client;
145

146
	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147

148
	spinlock_t		lock;		/* queue lock */
149

150
	struct rbd_image_header	header;
151
	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152
	int			obj_len;
153
	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154
	char			pool_name[RBD_MAX_POOL_NAME_LEN];
155
	int			poolid;
156

157
	struct ceph_osd_event   *watch_event;
158
	struct ceph_osd_request *watch_request;
159

160
	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
161
	u32 cur_snap;	/* index+1 of current snapshot within snap context
162
			   0 - for the head */
163
	int read_only;
164

165
	struct list_head	node;
166

167
	/* list of snapshots */
168
	struct list_head	snaps;
169

170
	/* sysfs related */
171
	struct device		dev;
172
};
173

174
static struct bus_type rbd_bus_type = {
175
	.name		= "rbd",
176
};
177

178
/*
 * Statically initialised, like the mutex and the list heads below, so the
 * lock is valid from load time and does not rely on a runtime
 * spin_lock_init() elsewhere (a later spin_lock_init() stays harmless).
 */
static DEFINE_SPINLOCK(node_lock);	/* protects client get/put */

static DEFINE_MUTEX(ctl_mutex);		/* Serialize open/close/setup/teardown */
static LIST_HEAD(rbd_dev_list);		/* devices */
static LIST_HEAD(rbd_client_list);	/* clients */
183

184
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185
static void rbd_dev_release(struct device *dev);
186
static ssize_t rbd_snap_rollback(struct device *dev,
187
				 struct device_attribute *attr,
188
				 const char *buf,
189
				 size_t size);
190
static ssize_t rbd_snap_add(struct device *dev,
191
			    struct device_attribute *attr,
192
			    const char *buf,
193
			    size_t count);
194
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
195
				  struct rbd_snap *snap);;
196

197

198
static struct rbd_device *dev_to_rbd(struct device *dev)
199
{
200
	return container_of(dev, struct rbd_device, dev);
201
}
202

203
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
204
{
205
	return get_device(&rbd_dev->dev);
206
}
207

208
static void rbd_put_dev(struct rbd_device *rbd_dev)
209
{
210
	put_device(&rbd_dev->dev);
211
}
212

213
/* Forward declaration; rbd_watch_cb() calls this with ctl_mutex held. */
static int __rbd_update_snaps(struct rbd_device *rbd_dev);
214

215
/*
 * Block device open.
 *
 * Propagates the mapping's read-only state to @bdev and refuses write
 * opens of a read-only mapping with -EROFS.  A device reference is taken
 * only when the open succeeds; it is dropped again in rbd_release().
 * Taking it before the -EROFS check would leak it on the failure path,
 * since this function would then return an error without putting it.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	struct rbd_device *rbd_dev = disk->private_data;

	set_device_ro(bdev, rbd_dev->read_only);

	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);

	return 0;
}
229

230
/*
 * Block device release: drop the device reference taken in rbd_open().
 * Always returns 0.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	rbd_put_dev(disk->private_data);
	return 0;
}
238

239
static const struct block_device_operations rbd_bd_ops = {
240
	.owner			= THIS_MODULE,
241
	.open			= rbd_open,
242
	.release		= rbd_release,
243
};
244

245
/*
246
 * Initialize an rbd client instance.
247
 * We own *opt.
248
 */
249
static struct rbd_client *rbd_client_create(struct ceph_options *opt,
250
					    struct rbd_options *rbd_opts)
251
{
252
	struct rbd_client *rbdc;
253
	int ret = -ENOMEM;
254

255
	dout("rbd_client_create\n");
256
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
257
	if (!rbdc)
258
		goto out_opt;
259

260
	kref_init(&rbdc->kref);
261
	INIT_LIST_HEAD(&rbdc->node);
262

263
	rbdc->client = ceph_create_client(opt, rbdc);
264
	if (IS_ERR(rbdc->client))
265
		goto out_rbdc;
266
	opt = NULL; /* Now rbdc->client is responsible for opt */
267

268
	ret = ceph_open_session(rbdc->client);
269
	if (ret < 0)
270
		goto out_err;
271

272
	rbdc->rbd_opts = rbd_opts;
273

274
	spin_lock(&node_lock);
275
	list_add_tail(&rbdc->node, &rbd_client_list);
276
	spin_unlock(&node_lock);
277

278
	dout("rbd_client_create created %p\n", rbdc);
279
	return rbdc;
280

281
out_err:
282
	ceph_destroy_client(rbdc->client);
283
out_rbdc:
284
	kfree(rbdc);
285
out_opt:
286
	if (opt)
287
		ceph_destroy_options(opt);
288
	return ERR_PTR(ret);
289
}
290

291
/*
292
 * Find a ceph client with specific addr and configuration.
293
 */
294
static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
295
{
296
	struct rbd_client *client_node;
297

298
	if (opt->flags & CEPH_OPT_NOSHARE)
299
		return NULL;
300

301
	list_for_each_entry(client_node, &rbd_client_list, node)
302
		if (ceph_compare_options(opt, client_node->client) == 0)
303
			return client_node;
304
	return NULL;
305
}
306

307
/*
 * mount options
 *
 * Tokens below Opt_last_int take an int argument, tokens between
 * Opt_last_int and Opt_last_string take a string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,		/* int args above */
	Opt_last_string,	/* string args above */
};
317

318
static match_table_t rbdopt_tokens = {
319
	{Opt_notify_timeout, "notify_timeout=%d"},
320
	/* int args above */
321
	/* string args above */
322
	{-1, NULL}
323
};
324

325
static int parse_rbd_opts_token(char *c, void *private)
326
{
327
	struct rbd_options *rbdopt = private;
328
	substring_t argstr[MAX_OPT_ARGS];
329
	int token, intval, ret;
330

331
	token = match_token((char *)c, rbdopt_tokens, argstr);
332
	if (token < 0)
333
		return -EINVAL;
334

335
	if (token < Opt_last_int) {
336
		ret = match_int(&argstr[0], &intval);
337
		if (ret < 0) {
338
			pr_err("bad mount option arg (not int) "
339
			       "at '%s'\n", c);
340
			return ret;
341
		}
342
		dout("got int token %d val %d\n", token, intval);
343
	} else if (token > Opt_last_int && token < Opt_last_string) {
344
		dout("got string token %d val %s\n", token,
345
		     argstr[0].from);
346
	} else {
347
		dout("got token %d\n", token);
348
	}
349

350
	switch (token) {
351
	case Opt_notify_timeout:
352
		rbdopt->notify_timeout = intval;
353
		break;
354
	default:
355
		BUG_ON(token);
356
	}
357
	return 0;
358
}
359

360
/*
361
 * Get a ceph client with specific addr and configuration, if one does
362
 * not exist create it.
363
 */
364
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
365
			  char *options)
366
{
367
	struct rbd_client *rbdc;
368
	struct ceph_options *opt;
369
	int ret;
370
	struct rbd_options *rbd_opts;
371

372
	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
373
	if (!rbd_opts)
374
		return -ENOMEM;
375

376
	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
377

378
	ret = ceph_parse_options(&opt, options, mon_addr,
379
				 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
380
	if (ret < 0)
381
		goto done_err;
382

383
	spin_lock(&node_lock);
384
	rbdc = __rbd_client_find(opt);
385
	if (rbdc) {
386
		ceph_destroy_options(opt);
387

388
		/* using an existing client */
389
		kref_get(&rbdc->kref);
390
		rbd_dev->rbd_client = rbdc;
391
		rbd_dev->client = rbdc->client;
392
		spin_unlock(&node_lock);
393
		return 0;
394
	}
395
	spin_unlock(&node_lock);
396

397
	rbdc = rbd_client_create(opt, rbd_opts);
398
	if (IS_ERR(rbdc)) {
399
		ret = PTR_ERR(rbdc);
400
		goto done_err;
401
	}
402

403
	rbd_dev->rbd_client = rbdc;
404
	rbd_dev->client = rbdc->client;
405
	return 0;
406
done_err:
407
	kfree(rbd_opts);
408
	return ret;
409
}
410

411
/*
412
 * Destroy ceph client
413
 */
414
static void rbd_client_release(struct kref *kref)
415
{
416
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
417

418
	dout("rbd_release_client %p\n", rbdc);
419
	spin_lock(&node_lock);
420
	list_del(&rbdc->node);
421
	spin_unlock(&node_lock);
422

423
	ceph_destroy_client(rbdc->client);
424
	kfree(rbdc->rbd_opts);
425
	kfree(rbdc);
426
}
427

428
/*
429
 * Drop reference to ceph client node. If it's not referenced anymore, release
430
 * it.
431
 */
432
static void rbd_put_client(struct rbd_device *rbd_dev)
433
{
434
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
435
	rbd_dev->rbd_client = NULL;
436
	rbd_dev->client = NULL;
437
}
438

439
/*
440
 * Destroy requests collection
441
 */
442
static void rbd_coll_release(struct kref *kref)
443
{
444
	struct rbd_req_coll *coll =
445
		container_of(kref, struct rbd_req_coll, kref);
446

447
	dout("rbd_coll_release %p\n", coll);
448
	kfree(coll);
449
}
450

451
/*
452
 * Create a new header structure, translate header format from the on-disk
453
 * header.
454
 */
455
static int rbd_header_from_disk(struct rbd_image_header *header,
456
				 struct rbd_image_header_ondisk *ondisk,
457
				 int allocated_snaps,
458
				 gfp_t gfp_flags)
459
{
460
	int i;
461
	u32 snap_count = le32_to_cpu(ondisk->snap_count);
462
	int ret = -ENOMEM;
463

464
	init_rwsem(&header->snap_rwsem);
465
	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
466
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
467
				snap_count *
468
				 sizeof(struct rbd_image_snap_ondisk),
469
				gfp_flags);
470
	if (!header->snapc)
471
		return -ENOMEM;
472
	if (snap_count) {
473
		header->snap_names = kmalloc(header->snap_names_len,
474
					     GFP_KERNEL);
475
		if (!header->snap_names)
476
			goto err_snapc;
477
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
478
					     GFP_KERNEL);
479
		if (!header->snap_sizes)
480
			goto err_names;
481
	} else {
482
		header->snap_names = NULL;
483
		header->snap_sizes = NULL;
484
	}
485
	memcpy(header->block_name, ondisk->block_name,
486
	       sizeof(ondisk->block_name));
487

488
	header->image_size = le64_to_cpu(ondisk->image_size);
489
	header->obj_order = ondisk->options.order;
490
	header->crypt_type = ondisk->options.crypt_type;
491
	header->comp_type = ondisk->options.comp_type;
492

493
	atomic_set(&header->snapc->nref, 1);
494
	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
495
	header->snapc->num_snaps = snap_count;
496
	header->total_snaps = snap_count;
497

498
	if (snap_count &&
499
	    allocated_snaps == snap_count) {
500
		for (i = 0; i < snap_count; i++) {
501
			header->snapc->snaps[i] =
502
				le64_to_cpu(ondisk->snaps[i].id);
503
			header->snap_sizes[i] =
504
				le64_to_cpu(ondisk->snaps[i].image_size);
505
		}
506

507
		/* copy snapshot names */
508
		memcpy(header->snap_names, &ondisk->snaps[i],
509
			header->snap_names_len);
510
	}
511

512
	return 0;
513

514
err_names:
515
	kfree(header->snap_names);
516
err_snapc:
517
	kfree(header->snapc);
518
	return ret;
519
}
520

521
static int snap_index(struct rbd_image_header *header, int snap_num)
522
{
523
	return header->total_snaps - snap_num;
524
}
525

526
static u64 cur_snap_id(struct rbd_device *rbd_dev)
527
{
528
	struct rbd_image_header *header = &rbd_dev->header;
529

530
	if (!rbd_dev->cur_snap)
531
		return 0;
532

533
	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
534
}
535

536
/*
 * Look up a snapshot by name.
 *
 * Walks the back-to-back NUL-terminated names in header->snap_names.
 * On a match the snapshot id and size are stored through @seq and @size
 * (each may be NULL) and the snapshot's index is returned.  Returns
 * -ENOENT, leaving @seq and @size untouched, when no name matches.
 */
static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
			u64 *seq, u64 *size)
{
	char *name = header->snap_names;
	int idx;

	for (idx = 0; idx < header->total_snaps; idx++) {
		if (strcmp(snap_name, name) != 0) {
			/* step over this name and its terminator */
			name += strlen(name) + 1;
			continue;
		}

		if (seq)
			*seq = header->snapc->snaps[idx];
		if (size)
			*size = header->snap_sizes[idx];
		return idx;
	}

	return -ENOENT;
}
556

557
static int rbd_header_set_snap(struct rbd_device *dev,
558
			       const char *snap_name,
559
			       u64 *size)
560
{
561
	struct rbd_image_header *header = &dev->header;
562
	struct ceph_snap_context *snapc = header->snapc;
563
	int ret = -ENOENT;
564

565
	down_write(&header->snap_rwsem);
566

567
	if (!snap_name ||
568
	    !*snap_name ||
569
	    strcmp(snap_name, "-") == 0 ||
570
	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
571
		if (header->total_snaps)
572
			snapc->seq = header->snap_seq;
573
		else
574
			snapc->seq = 0;
575
		dev->cur_snap = 0;
576
		dev->read_only = 0;
577
		if (size)
578
			*size = header->image_size;
579
	} else {
580
		ret = snap_by_name(header, snap_name, &snapc->seq, size);
581
		if (ret < 0)
582
			goto done;
583

584
		dev->cur_snap = header->total_snaps - ret;
585
		dev->read_only = 1;
586
	}
587

588
	ret = 0;
589
done:
590
	up_write(&header->snap_rwsem);
591
	return ret;
592
}
593

594
static void rbd_header_free(struct rbd_image_header *header)
595
{
596
	kfree(header->snapc);
597
	kfree(header->snap_names);
598
	kfree(header->snap_sizes);
599
}
600

601
/*
602
 * get the actual striped segment name, offset and length
603
 */
604
static u64 rbd_get_segment(struct rbd_image_header *header,
605
			   const char *block_name,
606
			   u64 ofs, u64 len,
607
			   char *seg_name, u64 *segofs)
608
{
609
	u64 seg = ofs >> header->obj_order;
610

611
	if (seg_name)
612
		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
613
			 "%s.%012llx", block_name, seg);
614

615
	ofs = ofs & ((1 << header->obj_order) - 1);
616
	len = min_t(u64, len, (1 << header->obj_order) - ofs);
617

618
	if (segofs)
619
		*segofs = ofs;
620

621
	return len;
622
}
623

624
/*
 * Number of segments touched by the extent [@ofs, @ofs + @len): the
 * segment of the last byte minus the segment of the first byte, plus one.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 first = ofs >> header->obj_order;
	u64 last = (ofs + len - 1) >> header->obj_order;

	return last - first + 1;
}
631

632
/*
633
 * bio helpers
634
 */
635

636
static void bio_chain_put(struct bio *chain)
637
{
638
	struct bio *tmp;
639

640
	while (chain) {
641
		tmp = chain;
642
		chain = chain->bi_next;
643
		bio_put(tmp);
644
	}
645
}
646

647
/*
648
 * zeros a bio chain, starting at specific offset
649
 */
650
static void zero_bio_chain(struct bio *chain, int start_ofs)
651
{
652
	struct bio_vec *bv;
653
	unsigned long flags;
654
	void *buf;
655
	int i;
656
	int pos = 0;
657

658
	while (chain) {
659
		bio_for_each_segment(bv, chain, i) {
660
			if (pos + bv->bv_len > start_ofs) {
661
				int remainder = max(start_ofs - pos, 0);
662
				buf = bvec_kmap_irq(bv, &flags);
663
				memset(buf + remainder, 0,
664
				       bv->bv_len - remainder);
665
				bvec_kunmap_irq(buf, &flags);
666
			}
667
			pos += bv->bv_len;
668
		}
669

670
		chain = chain->bi_next;
671
	}
672
}
673

674
/*
675
 * bio_chain_clone - clone a chain of bios up to a certain length.
676
 * might return a bio_pair that will need to be released.
677
 */
678
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
679
				   struct bio_pair **bp,
680
				   int len, gfp_t gfpmask)
681
{
682
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
683
	int total = 0;
684

685
	if (*bp) {
686
		bio_pair_release(*bp);
687
		*bp = NULL;
688
	}
689

690
	while (old_chain && (total < len)) {
691
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
692
		if (!tmp)
693
			goto err_out;
694

695
		if (total + old_chain->bi_size > len) {
696
			struct bio_pair *bp;
697

698
			/*
699
			 * this split can only happen with a single paged bio,
700
			 * split_bio will BUG_ON if this is not the case
701
			 */
702
			dout("bio_chain_clone split! total=%d remaining=%d"
703
			     "bi_size=%d\n",
704
			     (int)total, (int)len-total,
705
			     (int)old_chain->bi_size);
706

707
			/* split the bio. We'll release it either in the next
708
			   call, or it will have to be released outside */
709
			bp = bio_split(old_chain, (len - total) / 512ULL);
710
			if (!bp)
711
				goto err_out;
712

713
			__bio_clone(tmp, &bp->bio1);
714

715
			*next = &bp->bio2;
716
		} else {
717
			__bio_clone(tmp, old_chain);
718
			*next = old_chain->bi_next;
719
		}
720

721
		tmp->bi_bdev = NULL;
722
		gfpmask &= ~__GFP_WAIT;
723
		tmp->bi_next = NULL;
724

725
		if (!new_chain) {
726
			new_chain = tail = tmp;
727
		} else {
728
			tail->bi_next = tmp;
729
			tail = tmp;
730
		}
731
		old_chain = old_chain->bi_next;
732

733
		total += tmp->bi_size;
734
	}
735

736
	BUG_ON(total < len);
737

738
	if (tail)
739
		tail->bi_next = NULL;
740

741
	*old = old_chain;
742

743
	return new_chain;
744

745
err_out:
746
	dout("bio_chain_clone with err\n");
747
	bio_chain_put(new_chain);
748
	return NULL;
749
}
750

751
/*
752
 * helpers for osd request op vectors.
753
 */
754
static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
755
			    int num_ops,
756
			    int opcode,
757
			    u32 payload_len)
758
{
759
	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
760
		       GFP_NOIO);
761
	if (!*ops)
762
		return -ENOMEM;
763
	(*ops)[0].op = opcode;
764
	/*
765
	 * op extent offset and length will be set later on
766
	 * in calc_raw_layout()
767
	 */
768
	(*ops)[0].payload_len = payload_len;
769
	return 0;
770
}
771

772
/* Free an op vector allocated by rbd_create_rw_ops(); NULL is accepted. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	struct ceph_osd_req_op *vec = ops;

	kfree(vec);
}
776

777
/*
 * Record the completion of member @index of a request collection and
 * report to the block layer whatever can now be reported in order.
 *
 * Without a request (@rq == NULL) nothing happens.  Without a collection
 * the request is ended directly with @ret and @len.  Otherwise the
 * member's status is stored and, under the queue lock, every completed
 * member starting at coll->num_done is ended; each one reported drops a
 * reference on the collection.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int first, last, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);

	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;

	/* find the run of completed members that is next in line */
	first = coll->num_done;
	last = first;
	while (last < coll->total && coll->status[last].done)
		last++;

	i = first;
	while (i < last) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* may free coll once the last reference is gone */
		kref_put(&coll->kref, rbd_coll_release);
		i++;
	}

	spin_unlock_irq(q->queue_lock);
}
814

815
/*
 * Complete an rbd_request: forwards its block request, collection and
 * collection index to rbd_coll_end_req_index().
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	struct rbd_req_coll *coll = req->coll;
	int index = req->coll_index;

	rbd_coll_end_req_index(req->rq, coll, index, ret, len);
}
820

821
/*
822
 * Send ceph osd request
823
 */
824
static int rbd_do_request(struct request *rq,
825
			  struct rbd_device *dev,
826
			  struct ceph_snap_context *snapc,
827
			  u64 snapid,
828
			  const char *obj, u64 ofs, u64 len,
829
			  struct bio *bio,
830
			  struct page **pages,
831
			  int num_pages,
832
			  int flags,
833
			  struct ceph_osd_req_op *ops,
834
			  int num_reply,
835
			  struct rbd_req_coll *coll,
836
			  int coll_index,
837
			  void (*rbd_cb)(struct ceph_osd_request *req,
838
					 struct ceph_msg *msg),
839
			  struct ceph_osd_request **linger_req,
840
			  u64 *ver)
841
{
842
	struct ceph_osd_request *req;
843
	struct ceph_file_layout *layout;
844
	int ret;
845
	u64 bno;
846
	struct timespec mtime = CURRENT_TIME;
847
	struct rbd_request *req_data;
848
	struct ceph_osd_request_head *reqhead;
849
	struct rbd_image_header *header = &dev->header;
850

851
	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
852
	if (!req_data) {
853
		if (coll)
854
			rbd_coll_end_req_index(rq, coll, coll_index,
855
					       -ENOMEM, len);
856
		return -ENOMEM;
857
	}
858

859
	if (coll) {
860
		req_data->coll = coll;
861
		req_data->coll_index = coll_index;
862
	}
863

864
	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
865

866
	down_read(&header->snap_rwsem);
867

868
	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
869
				      snapc,
870
				      ops,
871
				      false,
872
				      GFP_NOIO, pages, bio);
873
	if (!req) {
874
		up_read(&header->snap_rwsem);
875
		ret = -ENOMEM;
876
		goto done_pages;
877
	}
878

879
	req->r_callback = rbd_cb;
880

881
	req_data->rq = rq;
882
	req_data->bio = bio;
883
	req_data->pages = pages;
884
	req_data->len = len;
885

886
	req->r_priv = req_data;
887

888
	reqhead = req->r_request->front.iov_base;
889
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
890

891
	strncpy(req->r_oid, obj, sizeof(req->r_oid));
892
	req->r_oid_len = strlen(req->r_oid);
893

894
	layout = &req->r_file_layout;
895
	memset(layout, 0, sizeof(*layout));
896
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
897
	layout->fl_stripe_count = cpu_to_le32(1);
898
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
899
	layout->fl_pg_preferred = cpu_to_le32(-1);
900
	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
901
	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
902
			     ofs, &len, &bno, req, ops);
903

904
	ceph_osdc_build_request(req, ofs, &len,
905
				ops,
906
				snapc,
907
				&mtime,
908
				req->r_oid, req->r_oid_len);
909
	up_read(&header->snap_rwsem);
910

911
	if (linger_req) {
912
		ceph_osdc_set_request_linger(&dev->client->osdc, req);
913
		*linger_req = req;
914
	}
915

916
	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
917
	if (ret < 0)
918
		goto done_err;
919

920
	if (!rbd_cb) {
921
		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
922
		if (ver)
923
			*ver = le64_to_cpu(req->r_reassert_version.version);
924
		dout("reassert_ver=%lld\n",
925
		     le64_to_cpu(req->r_reassert_version.version));
926
		ceph_osdc_put_request(req);
927
	}
928
	return ret;
929

930
done_err:
931
	bio_chain_put(req_data->bio);
932
	ceph_osdc_put_request(req);
933
done_pages:
934
	rbd_coll_end_req(req_data, ret, len);
935
	kfree(req_data);
936
	return ret;
937
}
938

939
/*
940
 * Ceph osd op callback
941
 */
942
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
943
{
944
	struct rbd_request *req_data = req->r_priv;
945
	struct ceph_osd_reply_head *replyhead;
946
	struct ceph_osd_op *op;
947
	__s32 rc;
948
	u64 bytes;
949
	int read_op;
950

951
	/* parse reply */
952
	replyhead = msg->front.iov_base;
953
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
954
	op = (void *)(replyhead + 1);
955
	rc = le32_to_cpu(replyhead->result);
956
	bytes = le64_to_cpu(op->extent.length);
957
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
958

959
	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
960

961
	if (rc == -ENOENT && read_op) {
962
		zero_bio_chain(req_data->bio, 0);
963
		rc = 0;
964
	} else if (rc == 0 && read_op && bytes < req_data->len) {
965
		zero_bio_chain(req_data->bio, bytes);
966
		bytes = req_data->len;
967
	}
968

969
	rbd_coll_end_req(req_data, rc, bytes);
970

971
	if (req_data->bio)
972
		bio_chain_put(req_data->bio);
973

974
	ceph_osdc_put_request(req);
975
	kfree(req_data);
976
}
977

978
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
979
{
980
	ceph_osdc_put_request(req);
981
}
982

983
/*
984
 * Do a synchronous ceph osd operation
985
 */
986
static int rbd_req_sync_op(struct rbd_device *dev,
987
			   struct ceph_snap_context *snapc,
988
			   u64 snapid,
989
			   int opcode,
990
			   int flags,
991
			   struct ceph_osd_req_op *orig_ops,
992
			   int num_reply,
993
			   const char *obj,
994
			   u64 ofs, u64 len,
995
			   char *buf,
996
			   struct ceph_osd_request **linger_req,
997
			   u64 *ver)
998
{
999
	int ret;
1000
	struct page **pages;
1001
	int num_pages;
1002
	struct ceph_osd_req_op *ops = orig_ops;
1003
	u32 payload_len;
1004

1005
	num_pages = calc_pages_for(ofs , len);
1006
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1007
	if (IS_ERR(pages))
1008
		return PTR_ERR(pages);
1009

1010
	if (!orig_ops) {
1011
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1012
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1013
		if (ret < 0)
1014
			goto done;
1015

1016
		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1017
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1018
			if (ret < 0)
1019
				goto done_ops;
1020
		}
1021
	}
1022

1023
	ret = rbd_do_request(NULL, dev, snapc, snapid,
1024
			  obj, ofs, len, NULL,
1025
			  pages, num_pages,
1026
			  flags,
1027
			  ops,
1028
			  2,
1029
			  NULL, 0,
1030
			  NULL,
1031
			  linger_req, ver);
1032
	if (ret < 0)
1033
		goto done_ops;
1034

1035
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1036
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1037

1038
done_ops:
1039
	if (!orig_ops)
1040
		rbd_destroy_ops(ops);
1041
done:
1042
	ceph_release_page_vector(pages, num_pages);
1043
	return ret;
1044
}
1045

1046
/*
1047
 * Do an asynchronous ceph osd operation
1048
 */
1049
static int rbd_do_op(struct request *rq,
1050
		     struct rbd_device *rbd_dev ,
1051
		     struct ceph_snap_context *snapc,
1052
		     u64 snapid,
1053
		     int opcode, int flags, int num_reply,
1054
		     u64 ofs, u64 len,
1055
		     struct bio *bio,
1056
		     struct rbd_req_coll *coll,
1057
		     int coll_index)
1058
{
1059
	char *seg_name;
1060
	u64 seg_ofs;
1061
	u64 seg_len;
1062
	int ret;
1063
	struct ceph_osd_req_op *ops;
1064
	u32 payload_len;
1065

1066
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1067
	if (!seg_name)
1068
		return -ENOMEM;
1069

1070
	seg_len = rbd_get_segment(&rbd_dev->header,
1071
				  rbd_dev->header.block_name,
1072
				  ofs, len,
1073
				  seg_name, &seg_ofs);
1074

1075
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1076

1077
	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1078
	if (ret < 0)
1079
		goto done;
1080

1081
	/* we've taken care of segment sizes earlier when we
1082
	   cloned the bios. We should never have a segment
1083
	   truncated at this point */
1084
	BUG_ON(seg_len < len);
1085

1086
	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1087
			     seg_name, seg_ofs, seg_len,
1088
			     bio,
1089
			     NULL, 0,
1090
			     flags,
1091
			     ops,
1092
			     num_reply,
1093
			     coll, coll_index,
1094
			     rbd_req_cb, 0, NULL);
1095

1096
	rbd_destroy_ops(ops);
1097
done:
1098
	kfree(seg_name);
1099
	return ret;
1100
}
1101

1102
/*
1103
 * Request async osd write
1104
 */
1105
static int rbd_req_write(struct request *rq,
1106
			 struct rbd_device *rbd_dev,
1107
			 struct ceph_snap_context *snapc,
1108
			 u64 ofs, u64 len,
1109
			 struct bio *bio,
1110
			 struct rbd_req_coll *coll,
1111
			 int coll_index)
1112
{
1113
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1114
			 CEPH_OSD_OP_WRITE,
1115
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1116
			 2,
1117
			 ofs, len, bio, coll, coll_index);
1118
}
1119

1120
/*
1121
 * Request async osd read
1122
 */
1123
static int rbd_req_read(struct request *rq,
1124
			 struct rbd_device *rbd_dev,
1125
			 u64 snapid,
1126
			 u64 ofs, u64 len,
1127
			 struct bio *bio,
1128
			 struct rbd_req_coll *coll,
1129
			 int coll_index)
1130
{
1131
	return rbd_do_op(rq, rbd_dev, NULL,
1132
			 (snapid ? snapid : CEPH_NOSNAP),
1133
			 CEPH_OSD_OP_READ,
1134
			 CEPH_OSD_FLAG_READ,
1135
			 2,
1136
			 ofs, len, bio, coll, coll_index);
1137
}
1138

1139
/*
1140
 * Request sync osd read
1141
 */
1142
static int rbd_req_sync_read(struct rbd_device *dev,
1143
			  struct ceph_snap_context *snapc,
1144
			  u64 snapid,
1145
			  const char *obj,
1146
			  u64 ofs, u64 len,
1147
			  char *buf,
1148
			  u64 *ver)
1149
{
1150
	return rbd_req_sync_op(dev, NULL,
1151
			       (snapid ? snapid : CEPH_NOSNAP),
1152
			       CEPH_OSD_OP_READ,
1153
			       CEPH_OSD_FLAG_READ,
1154
			       NULL,
1155
			       1, obj, ofs, len, buf, NULL, ver);
1156
}
1157

1158
/*
1159
 * Request sync osd watch
1160
 */
1161
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
1162
				   u64 ver,
1163
				   u64 notify_id,
1164
				   const char *obj)
1165
{
1166
	struct ceph_osd_req_op *ops;
1167
	struct page **pages = NULL;
1168
	int ret;
1169

1170
	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1171
	if (ret < 0)
1172
		return ret;
1173

1174
	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
1175
	ops[0].watch.cookie = notify_id;
1176
	ops[0].watch.flag = 0;
1177

1178
	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
1179
			  obj, 0, 0, NULL,
1180
			  pages, 0,
1181
			  CEPH_OSD_FLAG_READ,
1182
			  ops,
1183
			  1,
1184
			  NULL, 0,
1185
			  rbd_simple_req_cb, 0, NULL);
1186

1187
	rbd_destroy_ops(ops);
1188
	return ret;
1189
}
1190

1191
/*
 * Watch event callback, registered by rbd_req_sync_watch() with the
 * rbd device as @data.
 *
 * Refreshes the snapshot state under ctl_mutex (a failure is only
 * logged) and then acknowledges the notification on the header object.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = data;
	int err;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", rbd_dev->obj_md_name,
		notify_id, (int)opcode);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	err = __rbd_update_snaps(rbd_dev);
	mutex_unlock(&ctl_mutex);

	if (err)
		pr_warning(DRV_NAME "%d got notification but failed to update"
			   " snaps: %d\n", rbd_dev->major, err);

	/* acknowledge even when the refresh failed */
	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->obj_md_name);
}
1210

1211
/*
1212
 * Request sync osd watch
1213
 */
1214
static int rbd_req_sync_watch(struct rbd_device *dev,
1215
			      const char *obj,
1216
			      u64 ver)
1217
{
1218
	struct ceph_osd_req_op *ops;
1219
	struct ceph_osd_client *osdc = &dev->client->osdc;
1220

1221
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
1222
	if (ret < 0)
1223
		return ret;
1224

1225
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1226
				     (void *)dev, &dev->watch_event);
1227
	if (ret < 0)
1228
		goto fail;
1229

1230
	ops[0].watch.ver = cpu_to_le64(ver);
1231
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
1232
	ops[0].watch.flag = 1;
1233

1234
	ret = rbd_req_sync_op(dev, NULL,
1235
			      CEPH_NOSNAP,
1236
			      0,
1237
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1238
			      ops,
1239
			      1, obj, 0, 0, NULL,
1240
			      &dev->watch_request, NULL);
1241

1242
	if (ret < 0)
1243
		goto fail_event;
1244

1245
	rbd_destroy_ops(ops);
1246
	return 0;
1247

1248
fail_event:
1249
	ceph_osdc_cancel_event(dev->watch_event);
1250
	dev->watch_event = NULL;
1251
fail:
1252
	rbd_destroy_ops(ops);
1253
	return ret;
1254
}
1255

1256
/* context passed as the callback data of a notify event */
struct rbd_notify_info {
	struct rbd_device *dev;		/* device issuing the notify */
};
1259

1260
/*
 * Callback of the event created by rbd_req_sync_notify(); it only logs.
 *
 * The event is registered with a struct rbd_notify_info as its data, not
 * with the rbd_device itself, so the device has to be reached through
 * info->dev.  Casting data straight to struct rbd_device * would read
 * obj_md_name from unrelated memory.
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_notify_info *info = data;
	struct rbd_device *dev;

	if (!info || !info->dev)
		return;
	dev = info->dev;

	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
		notify_id, (int)opcode);
}
1269

1270
/*
1271
 * Request sync osd notify
1272
 */
1273
static int rbd_req_sync_notify(struct rbd_device *dev,
1274
		          const char *obj)
1275
{
1276
	struct ceph_osd_req_op *ops;
1277
	struct ceph_osd_client *osdc = &dev->client->osdc;
1278
	struct ceph_osd_event *event;
1279
	struct rbd_notify_info info;
1280
	int payload_len = sizeof(u32) + sizeof(u32);
1281
	int ret;
1282

1283
	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
1284
	if (ret < 0)
1285
		return ret;
1286

1287
	info.dev = dev;
1288

1289
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1290
				     (void *)&info, &event);
1291
	if (ret < 0)
1292
		goto fail;
1293

1294
	ops[0].watch.ver = 1;
1295
	ops[0].watch.flag = 1;
1296
	ops[0].watch.cookie = event->cookie;
1297
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1298
	ops[0].watch.timeout = 12;
1299

1300
	ret = rbd_req_sync_op(dev, NULL,
1301
			       CEPH_NOSNAP,
1302
			       0,
1303
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1304
			       ops,
1305
			       1, obj, 0, 0, NULL, NULL, NULL);
1306
	if (ret < 0)
1307
		goto fail_event;
1308

1309
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1310
	dout("ceph_osdc_wait_event returned %d\n", ret);
1311
	rbd_destroy_ops(ops);
1312
	return 0;
1313

1314
fail_event:
1315
	ceph_osdc_cancel_event(event);
1316
fail:
1317
	rbd_destroy_ops(ops);
1318
	return ret;
1319
}
1320

1321
/*
1322
 * Request sync osd rollback
1323
 */
1324
static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
1325
				     u64 snapid,
1326
				     const char *obj)
1327
{
1328
	struct ceph_osd_req_op *ops;
1329
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
1330
	if (ret < 0)
1331
		return ret;
1332

1333
	ops[0].snap.snapid = snapid;
1334

1335
	ret = rbd_req_sync_op(dev, NULL,
1336
			       CEPH_NOSNAP,
1337
			       0,
1338
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1339
			       ops,
1340
			       1, obj, 0, 0, NULL, NULL, NULL);
1341

1342
	rbd_destroy_ops(ops);
1343

1344
	return ret;
1345
}
1346

1347
/*
1348
 * Request sync osd read
1349
 */
1350
static int rbd_req_sync_exec(struct rbd_device *dev,
1351
			     const char *obj,
1352
			     const char *cls,
1353
			     const char *method,
1354
			     const char *data,
1355
			     int len,
1356
			     u64 *ver)
1357
{
1358
	struct ceph_osd_req_op *ops;
1359
	int cls_len = strlen(cls);
1360
	int method_len = strlen(method);
1361
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1362
				    cls_len + method_len + len);
1363
	if (ret < 0)
1364
		return ret;
1365

1366
	ops[0].cls.class_name = cls;
1367
	ops[0].cls.class_len = (__u8)cls_len;
1368
	ops[0].cls.method_name = method;
1369
	ops[0].cls.method_len = (__u8)method_len;
1370
	ops[0].cls.argc = 0;
1371
	ops[0].cls.indata = data;
1372
	ops[0].cls.indata_len = len;
1373

1374
	ret = rbd_req_sync_op(dev, NULL,
1375
			       CEPH_NOSNAP,
1376
			       0,
1377
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1378
			       ops,
1379
			       1, obj, 0, 0, NULL, NULL, ver);
1380

1381
	rbd_destroy_ops(ops);
1382

1383
	dout("cls_exec returned %d\n", ret);
1384
	return ret;
1385
}
1386

1387
static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1388
{
1389
	struct rbd_req_coll *coll =
1390
			kzalloc(sizeof(struct rbd_req_coll) +
1391
			        sizeof(struct rbd_req_status) * num_reqs,
1392
				GFP_ATOMIC);
1393

1394
	if (!coll)
1395
		return NULL;
1396
	coll->total = num_reqs;
1397
	kref_init(&coll->kref);
1398
	return coll;
1399
}
1400

1401
/*
1402
 * block device queue callback
1403
 */
1404
static void rbd_rq_fn(struct request_queue *q)
1405
{
1406
	struct rbd_device *rbd_dev = q->queuedata;
1407
	struct request *rq;
1408
	struct bio_pair *bp = NULL;
1409

1410
	rq = blk_fetch_request(q);
1411

1412
	while (1) {
1413
		struct bio *bio;
1414
		struct bio *rq_bio, *next_bio = NULL;
1415
		bool do_write;
1416
		int size, op_size = 0;
1417
		u64 ofs;
1418
		int num_segs, cur_seg = 0;
1419
		struct rbd_req_coll *coll;
1420

1421
		/* peek at request from block layer */
1422
		if (!rq)
1423
			break;
1424

1425
		dout("fetched request\n");
1426

1427
		/* filter out block requests we don't understand */
1428
		if ((rq->cmd_type != REQ_TYPE_FS)) {
1429
			__blk_end_request_all(rq, 0);
1430
			goto next;
1431
		}
1432

1433
		/* deduce our operation (read, write) */
1434
		do_write = (rq_data_dir(rq) == WRITE);
1435

1436
		size = blk_rq_bytes(rq);
1437
		ofs = blk_rq_pos(rq) * 512ULL;
1438
		rq_bio = rq->bio;
1439
		if (do_write && rbd_dev->read_only) {
1440
			__blk_end_request_all(rq, -EROFS);
1441
			goto next;
1442
		}
1443

1444
		spin_unlock_irq(q->queue_lock);
1445

1446
		dout("%s 0x%x bytes at 0x%llx\n",
1447
		     do_write ? "write" : "read",
1448
		     size, blk_rq_pos(rq) * 512ULL);
1449

1450
		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1451
		coll = rbd_alloc_coll(num_segs);
1452
		if (!coll) {
1453
			spin_lock_irq(q->queue_lock);
1454
			__blk_end_request_all(rq, -ENOMEM);
1455
			goto next;
1456
		}
1457

1458
		do {
1459
			/* a bio clone to be passed down to OSD req */
1460
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1461
			op_size = rbd_get_segment(&rbd_dev->header,
1462
						  rbd_dev->header.block_name,
1463
						  ofs, size,
1464
						  NULL, NULL);
1465
			kref_get(&coll->kref);
1466
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1467
					      op_size, GFP_ATOMIC);
1468
			if (!bio) {
1469
				rbd_coll_end_req_index(rq, coll, cur_seg,
1470
						       -ENOMEM, op_size);
1471
				goto next_seg;
1472
			}
1473

1474

1475
			/* init OSD command: write or read */
1476
			if (do_write)
1477
				rbd_req_write(rq, rbd_dev,
1478
					      rbd_dev->header.snapc,
1479
					      ofs,
1480
					      op_size, bio,
1481
					      coll, cur_seg);
1482
			else
1483
				rbd_req_read(rq, rbd_dev,
1484
					     cur_snap_id(rbd_dev),
1485
					     ofs,
1486
					     op_size, bio,
1487
					     coll, cur_seg);
1488

1489
next_seg:
1490
			size -= op_size;
1491
			ofs += op_size;
1492

1493
			cur_seg++;
1494
			rq_bio = next_bio;
1495
		} while (size > 0);
1496
		kref_put(&coll->kref, rbd_coll_release);
1497

1498
		if (bp)
1499
			bio_pair_release(bp);
1500
		spin_lock_irq(q->queue_lock);
1501
next:
1502
		rq = blk_fetch_request(q);
1503
	}
1504
}
1505

1506
/*
1507
 * a queue callback. Makes sure that we don't create a bio that spans across
1508
 * multiple osd objects. One exception would be with a single page bios,
1509
 * which we handle later at bio_chain_clone
1510
 */
1511
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1512
			  struct bio_vec *bvec)
1513
{
1514
	struct rbd_device *rbd_dev = q->queuedata;
1515
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1516
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1517
	unsigned int bio_sectors = bmd->bi_size >> 9;
1518
	int max;
1519

1520
	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1521
				 + bio_sectors)) << 9;
1522
	if (max < 0)
1523
		max = 0; /* bio_add cannot handle a negative return */
1524
	if (max <= bvec->bv_len && bio_sectors == 0)
1525
		return bvec->bv_len;
1526
	return max;
1527
}
1528

1529
static void rbd_free_disk(struct rbd_device *rbd_dev)
1530
{
1531
	struct gendisk *disk = rbd_dev->disk;
1532

1533
	if (!disk)
1534
		return;
1535

1536
	rbd_header_free(&rbd_dev->header);
1537

1538
	if (disk->flags & GENHD_FL_UP)
1539
		del_gendisk(disk);
1540
	if (disk->queue)
1541
		blk_cleanup_queue(disk->queue);
1542
	put_disk(disk);
1543
}
1544

1545
/*
1546
 * reload the ondisk the header 
1547
 */
1548
static int rbd_read_header(struct rbd_device *rbd_dev,
1549
			   struct rbd_image_header *header)
1550
{
1551
	ssize_t rc;
1552
	struct rbd_image_header_ondisk *dh;
1553
	int snap_count = 0;
1554
	u64 snap_names_len = 0;
1555
	u64 ver;
1556

1557
	while (1) {
1558
		int len = sizeof(*dh) +
1559
			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
1560
			  snap_names_len;
1561

1562
		rc = -ENOMEM;
1563
		dh = kmalloc(len, GFP_KERNEL);
1564
		if (!dh)
1565
			return -ENOMEM;
1566

1567
		rc = rbd_req_sync_read(rbd_dev,
1568
				       NULL, CEPH_NOSNAP,
1569
				       rbd_dev->obj_md_name,
1570
				       0, len,
1571
				       (char *)dh, &ver);
1572
		if (rc < 0)
1573
			goto out_dh;
1574

1575
		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
1576
		if (rc < 0)
1577
			goto out_dh;
1578

1579
		if (snap_count != header->total_snaps) {
1580
			snap_count = header->total_snaps;
1581
			snap_names_len = header->snap_names_len;
1582
			rbd_header_free(header);
1583
			kfree(dh);
1584
			continue;
1585
		}
1586
		break;
1587
	}
1588
	header->obj_version = ver;
1589

1590
out_dh:
1591
	kfree(dh);
1592
	return rc;
1593
}
1594

1595
/*
1596
 * create a snapshot
1597
 */
1598
static int rbd_header_add_snap(struct rbd_device *dev,
1599
			       const char *snap_name,
1600
			       gfp_t gfp_flags)
1601
{
1602
	int name_len = strlen(snap_name);
1603
	u64 new_snapid;
1604
	int ret;
1605
	void *data, *p, *e;
1606
	u64 ver;
1607

1608
	/* we should create a snapshot only if we're pointing at the head */
1609
	if (dev->cur_snap)
1610
		return -EINVAL;
1611

1612
	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1613
				      &new_snapid);
1614
	dout("created snapid=%lld\n", new_snapid);
1615
	if (ret < 0)
1616
		return ret;
1617

1618
	data = kmalloc(name_len + 16, gfp_flags);
1619
	if (!data)
1620
		return -ENOMEM;
1621

1622
	p = data;
1623
	e = data + name_len + 16;
1624

1625
	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1626
	ceph_encode_64_safe(&p, e, new_snapid, bad);
1627

1628
	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1629
				data, p - data, &ver);
1630

1631
	kfree(data);
1632

1633
	if (ret < 0)
1634
		return ret;
1635

1636
	dev->header.snapc->seq =  new_snapid;
1637

1638
	return 0;
1639
bad:
1640
	return -ERANGE;
1641
}
1642

1643
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1644
{
1645
	struct rbd_snap *snap;
1646

1647
	while (!list_empty(&rbd_dev->snaps)) {
1648
		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1649
		__rbd_remove_snap_dev(rbd_dev, snap);
1650
	}
1651
}
1652

1653
/*
1654
 * only read the first part of the ondisk header, without the snaps info
1655
 */
1656
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
1657
{
1658
	int ret;
1659
	struct rbd_image_header h;
1660
	u64 snap_seq;
1661
	int follow_seq = 0;
1662

1663
	ret = rbd_read_header(rbd_dev, &h);
1664
	if (ret < 0)
1665
		return ret;
1666

1667
	/* resized? */
1668
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);
1669

1670
	down_write(&rbd_dev->header.snap_rwsem);
1671

1672
	snap_seq = rbd_dev->header.snapc->seq;
1673
	if (rbd_dev->header.total_snaps &&
1674
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
1675
		/* pointing at the head, will need to follow that
1676
		   if head moves */
1677
		follow_seq = 1;
1678

1679
	kfree(rbd_dev->header.snapc);
1680
	kfree(rbd_dev->header.snap_names);
1681
	kfree(rbd_dev->header.snap_sizes);
1682

1683
	rbd_dev->header.total_snaps = h.total_snaps;
1684
	rbd_dev->header.snapc = h.snapc;
1685
	rbd_dev->header.snap_names = h.snap_names;
1686
	rbd_dev->header.snap_names_len = h.snap_names_len;
1687
	rbd_dev->header.snap_sizes = h.snap_sizes;
1688
	if (follow_seq)
1689
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
1690
	else
1691
		rbd_dev->header.snapc->seq = snap_seq;
1692

1693
	ret = __rbd_init_snaps_header(rbd_dev);
1694

1695
	up_write(&rbd_dev->header.snap_rwsem);
1696

1697
	return ret;
1698
}
1699

1700
static int rbd_init_disk(struct rbd_device *rbd_dev)
1701
{
1702
	struct gendisk *disk;
1703
	struct request_queue *q;
1704
	int rc;
1705
	u64 total_size = 0;
1706

1707
	/* contact OSD, request size info about the object being mapped */
1708
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1709
	if (rc)
1710
		return rc;
1711

1712
	/* no need to lock here, as rbd_dev is not registered yet */
1713
	rc = __rbd_init_snaps_header(rbd_dev);
1714
	if (rc)
1715
		return rc;
1716

1717
	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1718
	if (rc)
1719
		return rc;
1720

1721
	/* create gendisk info */
1722
	rc = -ENOMEM;
1723
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1724
	if (!disk)
1725
		goto out;
1726

1727
	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
1728
		 rbd_dev->id);
1729
	disk->major = rbd_dev->major;
1730
	disk->first_minor = 0;
1731
	disk->fops = &rbd_bd_ops;
1732
	disk->private_data = rbd_dev;
1733

1734
	/* init rq */
1735
	rc = -ENOMEM;
1736
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1737
	if (!q)
1738
		goto out_disk;
1739
	blk_queue_merge_bvec(q, rbd_merge_bvec);
1740
	disk->queue = q;
1741

1742
	q->queuedata = rbd_dev;
1743

1744
	rbd_dev->disk = disk;
1745
	rbd_dev->q = q;
1746

1747
	/* finally, announce the disk to the world */
1748
	set_capacity(disk, total_size / 512ULL);
1749
	add_disk(disk);
1750

1751
	pr_info("%s: added with size 0x%llx\n",
1752
		disk->disk_name, (unsigned long long)total_size);
1753
	return 0;
1754

1755
out_disk:
1756
	put_disk(disk);
1757
out:
1758
	return rc;
1759
}
1760

1761
/*
1762
  sysfs
1763
*/
1764

1765
static ssize_t rbd_size_show(struct device *dev,
1766
			     struct device_attribute *attr, char *buf)
1767
{
1768
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1769

1770
	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1771
}
1772

1773
static ssize_t rbd_major_show(struct device *dev,
1774
			      struct device_attribute *attr, char *buf)
1775
{
1776
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1777

1778
	return sprintf(buf, "%d\n", rbd_dev->major);
1779
}
1780

1781
static ssize_t rbd_client_id_show(struct device *dev,
1782
				  struct device_attribute *attr, char *buf)
1783
{
1784
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1785

1786
	return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
1787
}
1788

1789
static ssize_t rbd_pool_show(struct device *dev,
1790
			     struct device_attribute *attr, char *buf)
1791
{
1792
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1793

1794
	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1795
}
1796

1797
static ssize_t rbd_name_show(struct device *dev,
1798
			     struct device_attribute *attr, char *buf)
1799
{
1800
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1801

1802
	return sprintf(buf, "%s\n", rbd_dev->obj);
1803
}
1804

1805
static ssize_t rbd_snap_show(struct device *dev,
1806
			     struct device_attribute *attr,
1807
			     char *buf)
1808
{
1809
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1810

1811
	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1812
}
1813

1814
static ssize_t rbd_image_refresh(struct device *dev,
1815
				 struct device_attribute *attr,
1816
				 const char *buf,
1817
				 size_t size)
1818
{
1819
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1820
	int rc;
1821
	int ret = size;
1822

1823
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1824

1825
	rc = __rbd_update_snaps(rbd_dev);
1826
	if (rc < 0)
1827
		ret = rc;
1828

1829
	mutex_unlock(&ctl_mutex);
1830
	return ret;
1831
}
1832

1833
/* read-only device attributes */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

/* write-only (owner) control attributes */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
1842

1843
static struct attribute *rbd_attrs[] = {
1844
	&dev_attr_size.attr,
1845
	&dev_attr_major.attr,
1846
	&dev_attr_client_id.attr,
1847
	&dev_attr_pool.attr,
1848
	&dev_attr_name.attr,
1849
	&dev_attr_current_snap.attr,
1850
	&dev_attr_refresh.attr,
1851
	&dev_attr_create_snap.attr,
1852
	&dev_attr_rollback_snap.attr,
1853
	NULL
1854
};
1855

1856
static struct attribute_group rbd_attr_group = {
1857
	.attrs = rbd_attrs,
1858
};
1859

1860
static const struct attribute_group *rbd_attr_groups[] = {
1861
	&rbd_attr_group,
1862
	NULL
1863
};
1864

1865
/* release hook of rbd_device_type; intentionally does nothing */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to free here */
}
1868

1869
static struct device_type rbd_device_type = {
1870
	.name		= "rbd",
1871
	.groups		= rbd_attr_groups,
1872
	.release	= rbd_sysfs_dev_release,
1873
};
1874

1875

1876
/*
1877
  sysfs - snapshots
1878
*/
1879

1880
static ssize_t rbd_snap_size_show(struct device *dev,
1881
				  struct device_attribute *attr,
1882
				  char *buf)
1883
{
1884
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1885

1886
	return sprintf(buf, "%lld\n", (long long)snap->size);
1887
}
1888

1889
static ssize_t rbd_snap_id_show(struct device *dev,
1890
				struct device_attribute *attr,
1891
				char *buf)
1892
{
1893
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1894

1895
	return sprintf(buf, "%lld\n", (long long)snap->id);
1896
}
1897

1898
/* read-only per-snapshot attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1900

1901
static struct attribute *rbd_snap_attrs[] = {
1902
	&dev_attr_snap_size.attr,
1903
	&dev_attr_snap_id.attr,
1904
	NULL,
1905
};
1906

1907
static struct attribute_group rbd_snap_attr_group = {
1908
	.attrs = rbd_snap_attrs,
1909
};
1910

1911
static void rbd_snap_dev_release(struct device *dev)
1912
{
1913
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1914
	kfree(snap->name);
1915
	kfree(snap);
1916
}
1917

1918
static const struct attribute_group *rbd_snap_attr_groups[] = {
1919
	&rbd_snap_attr_group,
1920
	NULL
1921
};
1922

1923
static struct device_type rbd_snap_device_type = {
1924
	.groups		= rbd_snap_attr_groups,
1925
	.release	= rbd_snap_dev_release,
1926
};
1927

1928
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1929
				  struct rbd_snap *snap)
1930
{
1931
	list_del(&snap->node);
1932
	device_unregister(&snap->dev);
1933
}
1934

1935
static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1936
				  struct rbd_snap *snap,
1937
				  struct device *parent)
1938
{
1939
	struct device *dev = &snap->dev;
1940
	int ret;
1941

1942
	dev->type = &rbd_snap_device_type;
1943
	dev->parent = parent;
1944
	dev->release = rbd_snap_dev_release;
1945
	dev_set_name(dev, "snap_%s", snap->name);
1946
	ret = device_register(dev);
1947

1948
	return ret;
1949
}
1950

1951
static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1952
			      int i, const char *name,
1953
			      struct rbd_snap **snapp)
1954
{
1955
	int ret;
1956
	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1957
	if (!snap)
1958
		return -ENOMEM;
1959
	snap->name = kstrdup(name, GFP_KERNEL);
1960
	snap->size = rbd_dev->header.snap_sizes[i];
1961
	snap->id = rbd_dev->header.snapc->snaps[i];
1962
	if (device_is_registered(&rbd_dev->dev)) {
1963
		ret = rbd_register_snap_dev(rbd_dev, snap,
1964
					     &rbd_dev->dev);
1965
		if (ret < 0)
1966
			goto err;
1967
	}
1968
	*snapp = snap;
1969
	return 0;
1970
err:
1971
	kfree(snap->name);
1972
	kfree(snap);
1973
	return ret;
1974
}
1975

1976
/*
 * search for the previous snap in a null delimited string list
 *
 * name points at the start of an entry (or just past the final
 * terminator) of the list beginning at start.  Returns the start of the
 * entry preceding name, or NULL when fewer than two bytes lie between
 * start and name.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cur;

	if (name < start + 2)
		return NULL;

	/* step over the previous terminator, then walk back to the next one */
	for (cur = name - 2; *cur != '\0'; cur--) {
		if (cur == start)
			return start;
	}

	return cur + 1;
}
1992

1993
/*
1994
 * compare the old list of snapshots that we have to what's in the header
1995
 * and update it accordingly. Note that the header holds the snapshots
1996
 * in a reverse order (from newest to oldest) and we need to go from
1997
 * older to new so that we don't get a duplicate snap name when
1998
 * doing the process (e.g., removed snapshot and recreated a new
1999
 * one with the same name.
2000
 */
2001
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2002
{
2003
	const char *name, *first_name;
2004
	int i = rbd_dev->header.total_snaps;
2005
	struct rbd_snap *snap, *old_snap = NULL;
2006
	int ret;
2007
	struct list_head *p, *n;
2008

2009
	first_name = rbd_dev->header.snap_names;
2010
	name = first_name + rbd_dev->header.snap_names_len;
2011

2012
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2013
		u64 cur_id;
2014

2015
		old_snap = list_entry(p, struct rbd_snap, node);
2016

2017
		if (i)
2018
			cur_id = rbd_dev->header.snapc->snaps[i - 1];
2019

2020
		if (!i || old_snap->id < cur_id) {
2021
			/* old_snap->id was skipped, thus was removed */
2022
			__rbd_remove_snap_dev(rbd_dev, old_snap);
2023
			continue;
2024
		}
2025
		if (old_snap->id == cur_id) {
2026
			/* we have this snapshot already */
2027
			i--;
2028
			name = rbd_prev_snap_name(name, first_name);
2029
			continue;
2030
		}
2031
		for (; i > 0;
2032
		     i--, name = rbd_prev_snap_name(name, first_name)) {
2033
			if (!name) {
2034
				WARN_ON(1);
2035
				return -EINVAL;
2036
			}
2037
			cur_id = rbd_dev->header.snapc->snaps[i];
2038
			/* snapshot removal? handle it above */
2039
			if (cur_id >= old_snap->id)
2040
				break;
2041
			/* a new snapshot */
2042
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2043
			if (ret < 0)
2044
				return ret;
2045

2046
			/* note that we add it backward so using n and not p */
2047
			list_add(&snap->node, n);
2048
			p = &snap->node;
2049
		}
2050
	}
2051
	/* we're done going over the old snap list, just add what's left */
2052
	for (; i > 0; i--) {
2053
		name = rbd_prev_snap_name(name, first_name);
2054
		if (!name) {
2055
			WARN_ON(1);
2056
			return -EINVAL;
2057
		}
2058
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2059
		if (ret < 0)
2060
			return ret;
2061
		list_add(&snap->node, &rbd_dev->snaps);
2062
	}
2063

2064
	return 0;
2065
}
2066

2067

2068
/* release hook of the static root device; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
	/* rbd_root_dev is statically allocated */
}
2071

2072
static struct device rbd_root_dev = {
2073
	.init_name =    "rbd",
2074
	.release =      rbd_root_dev_release,
2075
};
2076

2077
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2078
{
2079
	int ret = -ENOMEM;
2080
	struct device *dev;
2081
	struct rbd_snap *snap;
2082

2083
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2084
	dev = &rbd_dev->dev;
2085

2086
	dev->bus = &rbd_bus_type;
2087
	dev->type = &rbd_device_type;
2088
	dev->parent = &rbd_root_dev;
2089
	dev->release = rbd_dev_release;
2090
	dev_set_name(dev, "%d", rbd_dev->id);
2091
	ret = device_register(dev);
2092
	if (ret < 0)
2093
		goto done_free;
2094

2095
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2096
		ret = rbd_register_snap_dev(rbd_dev, snap,
2097
					     &rbd_dev->dev);
2098
		if (ret < 0)
2099
			break;
2100
	}
2101

2102
	mutex_unlock(&ctl_mutex);
2103
	return 0;
2104
done_free:
2105
	mutex_unlock(&ctl_mutex);
2106
	return ret;
2107
}
2108

2109
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2110
{
2111
	device_unregister(&rbd_dev->dev);
2112
}
2113

2114
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2115
{
2116
	int ret, rc;
2117

2118
	do {
2119
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2120
					 rbd_dev->header.obj_version);
2121
		if (ret == -ERANGE) {
2122
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2123
			rc = __rbd_update_snaps(rbd_dev);
2124
			mutex_unlock(&ctl_mutex);
2125
			if (rc < 0)
2126
				return rc;
2127
		}
2128
	} while (ret == -ERANGE);
2129

2130
	return ret;
2131
}
2132

2133
static ssize_t rbd_add(struct bus_type *bus,
2134
		       const char *buf,
2135
		       size_t count)
2136
{
2137
	struct ceph_osd_client *osdc;
2138
	struct rbd_device *rbd_dev;
2139
	ssize_t rc = -ENOMEM;
2140
	int irc, new_id = 0;
2141
	struct list_head *tmp;
2142
	char *mon_dev_name;
2143
	char *options;
2144

2145
	if (!try_module_get(THIS_MODULE))
2146
		return -ENODEV;
2147

2148
	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2149
	if (!mon_dev_name)
2150
		goto err_out_mod;
2151

2152
	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2153
	if (!options)
2154
		goto err_mon_dev;
2155

2156
	/* new rbd_device object */
2157
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2158
	if (!rbd_dev)
2159
		goto err_out_opt;
2160

2161
	/* static rbd_device initialization */
2162
	spin_lock_init(&rbd_dev->lock);
2163
	INIT_LIST_HEAD(&rbd_dev->node);
2164
	INIT_LIST_HEAD(&rbd_dev->snaps);
2165

2166
	/* generate unique id: find highest unique id, add one */
2167
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2168

2169
	list_for_each(tmp, &rbd_dev_list) {
2170
		struct rbd_device *rbd_dev;
2171

2172
		rbd_dev = list_entry(tmp, struct rbd_device, node);
2173
		if (rbd_dev->id >= new_id)
2174
			new_id = rbd_dev->id + 1;
2175
	}
2176

2177
	rbd_dev->id = new_id;
2178

2179
	/* add to global list */
2180
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2181

2182
	/* parse add command */
2183
	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2184
		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
2185
		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2186
		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2187
		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2188
		   mon_dev_name, options, rbd_dev->pool_name,
2189
		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
2190
		rc = -EINVAL;
2191
		goto err_out_slot;
2192
	}
2193

2194
	if (rbd_dev->snap_name[0] == 0)
2195
		rbd_dev->snap_name[0] = '-';
2196

2197
	rbd_dev->obj_len = strlen(rbd_dev->obj);
2198
	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2199
		 rbd_dev->obj, RBD_SUFFIX);
2200

2201
	/* initialize rest of new object */
2202
	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2203
	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2204
	if (rc < 0)
2205
		goto err_out_slot;
2206

2207
	mutex_unlock(&ctl_mutex);
2208

2209
	/* pick the pool */
2210
	osdc = &rbd_dev->client->osdc;
2211
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2212
	if (rc < 0)
2213
		goto err_out_client;
2214
	rbd_dev->poolid = rc;
2215

2216
	/* register our block device */
2217
	irc = register_blkdev(0, rbd_dev->name);
2218
	if (irc < 0) {
2219
		rc = irc;
2220
		goto err_out_client;
2221
	}
2222
	rbd_dev->major = irc;
2223

2224
	rc = rbd_bus_add_dev(rbd_dev);
2225
	if (rc)
2226
		goto err_out_blkdev;
2227

2228
	/* set up and announce blkdev mapping */
2229
	rc = rbd_init_disk(rbd_dev);
2230
	if (rc)
2231
		goto err_out_bus;
2232

2233
	rc = rbd_init_watch_dev(rbd_dev);
2234
	if (rc)
2235
		goto err_out_bus;
2236

2237
	return count;
2238

2239
err_out_bus:
2240
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2241
	list_del_init(&rbd_dev->node);
2242
	mutex_unlock(&ctl_mutex);
2243

2244
	/* this will also clean up rest of rbd_dev stuff */
2245

2246
	rbd_bus_del_dev(rbd_dev);
2247
	kfree(options);
2248
	kfree(mon_dev_name);
2249
	return rc;
2250

2251
err_out_blkdev:
2252
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2253
err_out_client:
2254
	rbd_put_client(rbd_dev);
2255
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2256
err_out_slot:
2257
	list_del_init(&rbd_dev->node);
2258
	mutex_unlock(&ctl_mutex);
2259

2260
	kfree(rbd_dev);
2261
err_out_opt:
2262
	kfree(options);
2263
err_mon_dev:
2264
	kfree(mon_dev_name);
2265
err_out_mod:
2266
	dout("Error adding device %s\n", buf);
2267
	module_put(THIS_MODULE);
2268
	return rc;
2269
}
2270

2271
static struct rbd_device *__rbd_get_dev(unsigned long id)
2272
{
2273
	struct list_head *tmp;
2274
	struct rbd_device *rbd_dev;
2275

2276
	list_for_each(tmp, &rbd_dev_list) {
2277
		rbd_dev = list_entry(tmp, struct rbd_device, node);
2278
		if (rbd_dev->id == id)
2279
			return rbd_dev;
2280
	}
2281
	return NULL;
2282
}
2283

2284
static void rbd_dev_release(struct device *dev)
2285
{
2286
	struct rbd_device *rbd_dev =
2287
			container_of(dev, struct rbd_device, dev);
2288

2289
	if (rbd_dev->watch_request)
2290
		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
2291
						    rbd_dev->watch_request);
2292
	if (rbd_dev->watch_event)
2293
		ceph_osdc_cancel_event(rbd_dev->watch_event);
2294

2295
	rbd_put_client(rbd_dev);
2296

2297
	/* clean up and free blkdev */
2298
	rbd_free_disk(rbd_dev);
2299
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2300
	kfree(rbd_dev);
2301

2302
	/* release module ref */
2303
	module_put(THIS_MODULE);
2304
}
2305

2306
static ssize_t rbd_remove(struct bus_type *bus,
2307
			  const char *buf,
2308
			  size_t count)
2309
{
2310
	struct rbd_device *rbd_dev = NULL;
2311
	int target_id, rc;
2312
	unsigned long ul;
2313
	int ret = count;
2314

2315
	rc = strict_strtoul(buf, 10, &ul);
2316
	if (rc)
2317
		return rc;
2318

2319
	/* convert to int; abort if we lost anything in the conversion */
2320
	target_id = (int) ul;
2321
	if (target_id != ul)
2322
		return -EINVAL;
2323

2324
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2325

2326
	rbd_dev = __rbd_get_dev(target_id);
2327
	if (!rbd_dev) {
2328
		ret = -ENOENT;
2329
		goto done;
2330
	}
2331

2332
	list_del_init(&rbd_dev->node);
2333

2334
	__rbd_remove_all_snaps(rbd_dev);
2335
	rbd_bus_del_dev(rbd_dev);
2336

2337
done:
2338
	mutex_unlock(&ctl_mutex);
2339
	return ret;
2340
}
2341

2342
static ssize_t rbd_snap_add(struct device *dev,
2343
			    struct device_attribute *attr,
2344
			    const char *buf,
2345
			    size_t count)
2346
{
2347
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
2348
	int ret;
2349
	char *name = kmalloc(count + 1, GFP_KERNEL);
2350
	if (!name)
2351
		return -ENOMEM;
2352

2353
	snprintf(name, count, "%s", buf);
2354

2355
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2356

2357
	ret = rbd_header_add_snap(rbd_dev,
2358
				  name, GFP_KERNEL);
2359
	if (ret < 0)
2360
		goto err_unlock;
2361

2362
	ret = __rbd_update_snaps(rbd_dev);
2363
	if (ret < 0)
2364
		goto err_unlock;
2365

2366
	/* shouldn't hold ctl_mutex when notifying.. notify might
2367
	   trigger a watch callback that would need to get that mutex */
2368
	mutex_unlock(&ctl_mutex);
2369

2370
	/* make a best effort, don't error if failed */
2371
	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2372

2373
	ret = count;
2374
	kfree(name);
2375
	return ret;
2376

2377
err_unlock:
2378
	mutex_unlock(&ctl_mutex);
2379
	kfree(name);
2380
	return ret;
2381
}
2382

2383
static ssize_t rbd_snap_rollback(struct device *dev,
2384
				 struct device_attribute *attr,
2385
				 const char *buf,
2386
				 size_t count)
2387
{
2388
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
2389
	int ret;
2390
	u64 snapid;
2391
	u64 cur_ofs;
2392
	char *seg_name = NULL;
2393
	char *snap_name = kmalloc(count + 1, GFP_KERNEL);
2394
	ret = -ENOMEM;
2395
	if (!snap_name)
2396
		return ret;
2397

2398
	/* parse snaps add command */
2399
	snprintf(snap_name, count, "%s", buf);
2400
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
2401
	if (!seg_name)
2402
		goto done;
2403

2404
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2405

2406
	ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
2407
	if (ret < 0)
2408
		goto done_unlock;
2409

2410
	dout("snapid=%lld\n", snapid);
2411

2412
	cur_ofs = 0;
2413
	while (cur_ofs < rbd_dev->header.image_size) {
2414
		cur_ofs += rbd_get_segment(&rbd_dev->header,
2415
					   rbd_dev->obj,
2416
					   cur_ofs, (u64)-1,
2417
					   seg_name, NULL);
2418
		dout("seg_name=%s\n", seg_name);
2419

2420
		ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
2421
		if (ret < 0)
2422
			pr_warning("could not roll back obj %s err=%d\n",
2423
				   seg_name, ret);
2424
	}
2425

2426
	ret = __rbd_update_snaps(rbd_dev);
2427
	if (ret < 0)
2428
		goto done_unlock;
2429

2430
	ret = count;
2431

2432
done_unlock:
2433
	mutex_unlock(&ctl_mutex);
2434
done:
2435
	kfree(seg_name);
2436
	kfree(snap_name);
2437

2438
	return ret;
2439
}
2440

2441
static struct bus_attribute rbd_bus_attrs[] = {
2442
	__ATTR(add, S_IWUSR, NULL, rbd_add),
2443
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
2444
	__ATTR_NULL
2445
};
2446

2447
/*
2448
 * create control files in sysfs
2449
 * /sys/bus/rbd/...
2450
 */
2451
static int rbd_sysfs_init(void)
2452
{
2453
	int ret;
2454

2455
	rbd_bus_type.bus_attrs = rbd_bus_attrs;
2456

2457
	ret = bus_register(&rbd_bus_type);
2458
	 if (ret < 0)
2459
		return ret;
2460

2461
	ret = device_register(&rbd_root_dev);
2462

2463
	return ret;
2464
}
2465

2466
static void rbd_sysfs_cleanup(void)
2467
{
2468
	device_unregister(&rbd_root_dev);
2469
	bus_unregister(&rbd_bus_type);
2470
}
2471

2472
/*
 * Module entry point.
 *
 * Initialises node_lock, then sets up the sysfs control files.
 * Returns 0 on success or the error from rbd_sysfs_init().
 */
int __init rbd_init(void)
{
	int rc;

	/*
	 * Initialise the lock before the sysfs files are published: once
	 * rbd_sysfs_init() returns, the add/remove handlers can be invoked
	 * from user space and must never see an uninitialised lock.
	 */
	spin_lock_init(&node_lock);

	rc = rbd_sysfs_init();
	if (rc)
		return rc;

	pr_info("loaded " DRV_NAME_LONG "\n");
	return 0;
}
2483

2484
/*
 * Module exit point: removes the sysfs root device and the rbd bus.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2488

2489
module_init(rbd_init);
2490
module_exit(rbd_exit);
2491

2492
MODULE_AUTHOR("Sage Weil <[email protected]>");
2493
MODULE_AUTHOR("Yehuda Sadeh <[email protected]>");
2494
MODULE_DESCRIPTION("rados block device");
2495

2496
/* following authorship retained from original osdblk.c */
2497
MODULE_AUTHOR("Jeff Garzik <[email protected]>");
2498

2499
MODULE_LICENSE("GPL");
2500

2501
Product

Resources

Company