GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/ceph/osdmap.c
#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/div64.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>

char *ceph_osdmap_state_str(char *str, int len, int state)
{
	int flag = 0;

	if (!len)
		goto done;

	*str = '\0';
	if (state) {
		if (state & CEPH_OSD_EXISTS) {
			snprintf(str, len, "exists");
			flag = 1;
		}
		if (state & CEPH_OSD_UP) {
			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
				 "up");
			flag = 1;
		}
	} else {
		snprintf(str, len, "doesn't exist");
	}
done:
	return str;
}

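/*
 * Editorial note: the second snprintf() in ceph_osdmap_state_str()
 * passes str as both the destination and a "%s" source.  The kernel's
 * vsnprintf() happens to process arguments left to right, but the
 * overlap is formally undefined behavior, and later kernels rewrote
 * this helper to append at an offset instead.
 */
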
/* maps */

static int calc_bits_of(unsigned t)
{
	int b = 0;
	while (t) {
		t = t >> 1;
		b++;
	}
	return b;
}

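/*
 * Example (editorial, not in the original source): calc_bits_of()
 * returns the position of the highest set bit plus one, i.e. the
 * number of bits needed to represent t.  calc_bits_of(6) walks
 * 6 -> 3 -> 1 -> 0 and returns 3; calc_bits_of(0) returns 0.
 */
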
/*
 * the foo_mask is the smallest value 2^n-1 that is >= foo.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
	pi->pgp_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
	pi->lpg_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
	pi->lpgp_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
}

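/*
 * Worked example (editorial): for pg_num = 12, calc_bits_of(11) = 4,
 * so pg_num_mask = (1 << 4) - 1 = 15, the smallest 2^n-1 >= 12.  The
 * masks let ceph_stable_mod() (used in calc_pg_raw() below) reduce a
 * placement seed modulo a non-power-of-two pg count without a divide.
 */
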
/*
 * decode crush map
 */
static int crush_decode_uniform_bucket(void **p, void *end,
				       struct crush_bucket_uniform *b)
{
	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
	b->item_weight = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}

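/*
 * Editorial note: the decode helpers follow a common pattern from
 * include/linux/ceph/decode.h -- ceph_decode_need(p, end, n, bad)
 * jumps to the "bad" label unless at least n bytes remain between
 * *p and end, and ceph_decode_32(p) then reads a le32 and advances
 * *p.  Roughly:
 *
 *	if (end - *p < n)
 *		goto bad;                 // ceph_decode_need
 *	v = le32_to_cpu(*(__le32 *)*p);
 *	*p += sizeof(__le32);             // ceph_decode_32
 */
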
static int crush_decode_list_bucket(void **p, void *end,
				    struct crush_bucket_list *b)
{
	int j;
	dout("crush_decode_list_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->sum_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->sum_weights[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}

static int crush_decode_tree_bucket(void **p, void *end,
				    struct crush_bucket_tree *b)
{
	int j;
	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
	ceph_decode_32_safe(p, end, b->num_nodes, bad);
	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
	if (b->node_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
	for (j = 0; j < b->num_nodes; j++)
		b->node_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}

static int crush_decode_straw_bucket(void **p, void *end,
				     struct crush_bucket_straw *b)
{
	int j;
	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->straws == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->straws[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}

static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err = -EINVAL;
	int i, j;
	void **p = &pbyval;
	void *start = pbyval;
	u32 magic;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
	if (c->device_parents == NULL)
		goto badmem;
	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
	if (c->bucket_parents == NULL)
		goto badmem;

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		default:
			err = -EINVAL;
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;
		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
		if (b->perm == NULL)
			goto badmem;
		b->perm_n = 0;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				 (struct crush_bucket_straw *)b);
			if (err < 0)
				goto bad;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		err = -EINVAL;
		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = c->rules[i] = kmalloc(sizeof(*r) +
					  yes*sizeof(struct crush_rule_step),
					  GFP_NOFS);
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	/* ignore trailing name maps. */

	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
bad:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);
}

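/*
 * Editorial summary of the encoding consumed by crush_decode() above:
 * a le32 magic (CRUSH_MAGIC), then le32 max_buckets, max_rules and
 * max_devices; then max_buckets bucket records (le32 alg, 0 meaning
 * "no bucket", followed by the common header fields and a per-alg
 * payload); then max_rules rule records (a presence flag, a step
 * count, a 4-byte mask, and the steps).  Trailing type/name maps are
 * ignored by the kernel decoder.
 */
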
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds)
 */
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
	u64 a = *(u64 *)&l;
	u64 b = *(u64 *)&r;

	if (a < b)
		return -1;
	if (a > b)
		return 1;
	return 0;
}

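/*
 * Editorial note: pgid_cmp() type-puns the 8-byte packed struct
 * ceph_pg into a u64 and compares the raw bits.  The resulting order
 * is arbitrary (it depends on field layout and the on-wire le16/le32
 * values) but consistent, which is all an rbtree key needs.
 */
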
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
			       struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_mapping *pg = NULL;
	int c;

	while (*p) {
		parent = *p;
		pg = rb_entry(parent, struct ceph_pg_mapping, node);
		c = pgid_cmp(new->pgid, pg->pgid);
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}

static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
						   struct ceph_pg pgid)
{
	struct rb_node *n = root->rb_node;
	struct ceph_pg_mapping *pg;
	int c;

	while (n) {
		pg = rb_entry(n, struct ceph_pg_mapping, node);
		c = pgid_cmp(pgid, pg->pgid);
		if (c < 0)
			n = n->rb_left;
		else if (c > 0)
			n = n->rb_right;
		else
			return pg;
	}
	return NULL;
}

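/*
 * Editorial note: __insert_pg_mapping()/__lookup_pg_mapping() are the
 * stock <linux/rbtree.h> idiom -- walk the rb_node pointers with the
 * key comparison, then rb_link_node() + rb_insert_color() to splice
 * in a new node.  By the usual kernel convention, the "__" prefix
 * marks helpers that expect the caller to serialize access to the
 * tree.
 */
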
/*
 * rbtree of pg pool info
 */
static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_pool_info *pi = NULL;

	while (*p) {
		parent = *p;
		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
		if (new->id < pi->id)
			p = &(*p)->rb_left;
		else if (new->id > pi->id)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}

static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
{
	struct ceph_pg_pool_info *pi;
	struct rb_node *n = root->rb_node;

	while (n) {
		pi = rb_entry(n, struct ceph_pg_pool_info, node);
		if (id < pi->id)
			n = n->rb_left;
		else if (id > pi->id)
			n = n->rb_right;
		else
			return pi;
	}
	return NULL;
}

int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
	struct rb_node *rbp;

	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rbp, struct ceph_pg_pool_info, node);
		if (pi->name && strcmp(pi->name, name) == 0)
			return pi->id;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);

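/*
 * Usage sketch (editorial): a caller resolves a pool name to an id
 * once and caches it, e.g.
 *
 *	int poolid = ceph_pg_poolid_by_name(map, "data");
 *	if (poolid < 0)
 *		return poolid;  /* -ENOENT if no such pool */
 *
 * The lookup walks the whole pool rbtree linearly, so it is not meant
 * for per-I/O use.
 */
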
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	rb_erase(&pi->node, root);
	kfree(pi->name);
	kfree(pi);
}

static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
	unsigned n, m;

	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
	calc_pg_masks(pi);

	/* num_snaps * snap_info_t */
	n = le32_to_cpu(pi->v.num_snaps);
	while (n--) {
		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
				 sizeof(struct ceph_timespec), bad);
		*p += sizeof(u64) +       /* key */
			1 + sizeof(u64) + /* u8, snapid */
			sizeof(struct ceph_timespec);
		m = ceph_decode_32(p);    /* snap name */
		*p += m;
	}

	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
	return 0;

bad:
	return -EINVAL;
}

static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
	struct ceph_pg_pool_info *pi;
	u32 num, len, pool;

	ceph_decode_32_safe(p, end, num, bad);
	dout(" %d pool names\n", num);
	while (num--) {
		ceph_decode_32_safe(p, end, pool, bad);
		ceph_decode_32_safe(p, end, len, bad);
		dout(" pool %d len %d\n", pool, len);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi) {
			kfree(pi->name);
			pi->name = kmalloc(len + 1, GFP_NOFS);
			if (pi->name) {
				memcpy(pi->name, *p, len);
				pi->name[len] = '\0';
				dout(" name is %s\n", pi->name);
			}
		}
		*p += len;
	}
	return 0;

bad:
	return -EINVAL;
}

/*
 * osd map
 */
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);
	if (map->crush)
		crush_destroy(map->crush);
	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->pg_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->pg_temp);
		kfree(pg);
	}
	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rb_first(&map->pg_pools),
				 struct ceph_pg_pool_info, node);
		__remove_pg_pool(&map->pg_pools, pi);
	}
	kfree(map->osd_state);
	kfree(map->osd_weight);
	kfree(map->osd_addr);
	kfree(map);
}

/*
 * adjust max osd value. reallocate arrays.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
	u8 *state;
	struct ceph_entity_addr *addr;
	u32 *weight;

	state = kcalloc(max, sizeof(*state), GFP_NOFS);
	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
	if (state == NULL || addr == NULL || weight == NULL) {
		kfree(state);
		kfree(addr);
		kfree(weight);
		return -ENOMEM;
	}

	/* copy old? */
	if (map->osd_state) {
		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
		kfree(map->osd_state);
		kfree(map->osd_addr);
		kfree(map->osd_weight);
	}

	map->osd_state = state;
	map->osd_weight = weight;
	map->osd_addr = addr;
	map->max_osd = max;
	return 0;
}

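/*
 * Editorial note: osdmap_set_max_osd() allocates all three new arrays
 * before touching the map, so a failed kcalloc() leaves the old state
 * intact.  The copy path uses the old map->max_osd as the memcpy()
 * length, so it appears to assume the arrays only ever grow.
 */
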
/*
 * decode a full map.
 */
struct ceph_osdmap *osdmap_decode(void **p, void *end)
{
	struct ceph_osdmap *map;
	u16 version;
	u32 len, max, i;
	u8 ev;
	int err = -EINVAL;
	void *start = *p;
	struct ceph_pg_pool_info *pi;

	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	map = kzalloc(sizeof(*map), GFP_NOFS);
	if (map == NULL)
		return ERR_PTR(-ENOMEM);
	map->pg_temp = RB_ROOT;

	ceph_decode_16_safe(p, end, version, bad);
	if (version > CEPH_OSDMAP_VERSION) {
		pr_warning("got unknown v %d > %d of osdmap\n", version,
			   CEPH_OSDMAP_VERSION);
		goto bad;
	}

	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
	map->epoch = ceph_decode_32(p);
	ceph_decode_copy(p, &map->created, sizeof(map->created));
	ceph_decode_copy(p, &map->modified, sizeof(map->modified));

	ceph_decode_32_safe(p, end, max, bad);
	while (max--) {
		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
		pi = kzalloc(sizeof(*pi), GFP_NOFS);
		if (!pi)
			goto bad;
		pi->id = ceph_decode_32(p);
		ev = ceph_decode_8(p); /* encoding version */
		if (ev > CEPH_PG_POOL_VERSION) {
			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
				   ev, CEPH_PG_POOL_VERSION);
			kfree(pi);
			goto bad;
		}
		err = __decode_pool(p, end, pi);
		if (err < 0) {
			kfree(pi);
			goto bad;
		}
		__insert_pg_pool(&map->pg_pools, pi);
	}

	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
		goto bad;

	ceph_decode_32_safe(p, end, map->pool_max, bad);

	ceph_decode_32_safe(p, end, map->flags, bad);

	max = ceph_decode_32(p);

	/* (re)alloc osd arrays */
	err = osdmap_set_max_osd(map, max);
	if (err < 0)
		goto bad;
	dout("osdmap_decode max_osd = %d\n", map->max_osd);

	/* osds */
	err = -EINVAL;
	ceph_decode_need(p, end, 3*sizeof(u32) +
			 map->max_osd*(1 + sizeof(*map->osd_weight) +
				       sizeof(*map->osd_addr)), bad);
	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_state, map->max_osd);

	*p += 4; /* skip length field (should match max) */
	for (i = 0; i < map->max_osd; i++)
		map->osd_weight[i] = ceph_decode_32(p);

	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
	for (i = 0; i < map->max_osd; i++)
		ceph_decode_addr(&map->osd_addr[i]);

	/* pg_temp */
	ceph_decode_32_safe(p, end, len, bad);
	for (i = 0; i < len; i++) {
		int n, j;
		struct ceph_pg pgid;
		struct ceph_pg_mapping *pg;

		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
		ceph_decode_copy(p, &pgid, sizeof(pgid));
		n = ceph_decode_32(p);
		ceph_decode_need(p, end, n * sizeof(u32), bad);
		err = -ENOMEM;
		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
		if (!pg)
			goto bad;
		pg->pgid = pgid;
		pg->len = n;
		for (j = 0; j < n; j++)
			pg->osds[j] = ceph_decode_32(p);

		err = __insert_pg_mapping(pg, &map->pg_temp);
		if (err)
			goto bad;
		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
	}

	/* crush */
	ceph_decode_32_safe(p, end, len, bad);
	dout("osdmap_decode crush len %d from off 0x%x\n", len,
	     (int)(*p - start));
	ceph_decode_need(p, end, len, bad);
	map->crush = crush_decode(*p, end);
	*p += len;
	if (IS_ERR(map->crush)) {
		err = PTR_ERR(map->crush);
		map->crush = NULL;
		goto bad;
	}

	/* ignore the rest of the map */
	*p = end;

	dout("osdmap_decode done %p %p\n", *p, end);
	return map;

bad:
	dout("osdmap_decode fail\n");
	ceph_osdmap_destroy(map);
	return ERR_PTR(err);
}

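/*
 * Editorial summary: a full osdmap encodes, in order, a version (u16),
 * fsid, epoch, created/modified timestamps, the pool table (plus pool
 * names from encoding v5 on), pool_max, flags, the osd state/weight/
 * addr arrays, the pg_temp table, and finally the embedded crush map;
 * osdmap_decode() above ignores anything after that.
 */
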
/*
 * decode and apply an incremental map update.
 */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map,
					     struct ceph_messenger *msgr)
{
	struct crush_map *newcrush = NULL;
	struct ceph_fsid fsid;
	u32 epoch = 0;
	struct ceph_timespec modified;
	u32 len, pool;
	__s32 new_pool_max, new_flags, max;
	void *start = *p;
	int err = -EINVAL;
	u16 version;
	struct rb_node *rbp;

	ceph_decode_16_safe(p, end, version, bad);
	if (version > CEPH_OSDMAP_INC_VERSION) {
		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
			   CEPH_OSDMAP_INC_VERSION);
		goto bad;
	}

	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
			 bad);
	ceph_decode_copy(p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(p);
	BUG_ON(epoch != map->epoch+1);
	ceph_decode_copy(p, &modified, sizeof(modified));
	new_pool_max = ceph_decode_32(p);
	new_flags = ceph_decode_32(p);

	/* full map? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental full map len %d, %p to %p\n",
		     len, *p, end);
		return osdmap_decode(p, min(*p+len, end));
	}

	/* new crush? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental new crush map len %d, %p to %p\n",
		     len, *p, end);
		newcrush = crush_decode(*p, min(*p+len, end));
		if (IS_ERR(newcrush))
			return ERR_CAST(newcrush);
		*p += len;
	}

	/* new flags? */
	if (new_flags >= 0)
		map->flags = new_flags;
	if (new_pool_max >= 0)
		map->pool_max = new_pool_max;

	ceph_decode_need(p, end, 5*sizeof(u32), bad);

	/* new max? */
	max = ceph_decode_32(p);
	if (max >= 0) {
		err = osdmap_set_max_osd(map, max);
		if (err < 0)
			goto bad;
	}

	map->epoch++;
	map->modified = modified;
	if (newcrush) {
		if (map->crush)
			crush_destroy(map->crush);
		map->crush = newcrush;
		newcrush = NULL;
	}

	/* new_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		__u8 ev;
		struct ceph_pg_pool_info *pi;

		ceph_decode_32_safe(p, end, pool, bad);
		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
		ev = ceph_decode_8(p);  /* encoding version */
		if (ev > CEPH_PG_POOL_VERSION) {
			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
				   ev, CEPH_PG_POOL_VERSION);
			goto bad;
		}
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (!pi) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi) {
				err = -ENOMEM;
				goto bad;
			}
			pi->id = pool;
			__insert_pg_pool(&map->pg_pools, pi);
		}
		err = __decode_pool(p, end, pi);
		if (err < 0)
			goto bad;
	}
	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
		goto bad;

	/* old_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_32_safe(p, end, pool, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi)
			__remove_pg_pool(&map->pg_pools, pi);
	}

	/* new_up */
	err = -EINVAL;
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		struct ceph_entity_addr addr;
		ceph_decode_32_safe(p, end, osd, bad);
		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
		ceph_decode_addr(&addr);
		pr_info("osd%d up\n", osd);
		BUG_ON(osd >= map->max_osd);
		map->osd_state[osd] |= CEPH_OSD_UP;
		map->osd_addr[osd] = addr;
	}

	/* new_state */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		u8 xorstate;
		ceph_decode_32_safe(p, end, osd, bad);
		xorstate = **(u8 **)p;
		(*p)++;  /* clean flag */
		if (xorstate == 0)
			xorstate = CEPH_OSD_UP;
		if (xorstate & CEPH_OSD_UP)
			pr_info("osd%d down\n", osd);
		if (osd < map->max_osd)
			map->osd_state[osd] ^= xorstate;
	}

	/* new_weight */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd, off;
		ceph_decode_need(p, end, sizeof(u32)*2, bad);
		osd = ceph_decode_32(p);
		off = ceph_decode_32(p);
		pr_info("osd%d weight 0x%x %s\n", osd, off,
		     off == CEPH_OSD_IN ? "(in)" :
		     (off == CEPH_OSD_OUT ? "(out)" : ""));
		if (osd < map->max_osd)
			map->osd_weight[osd] = off;
	}

	/* new_pg_temp */
	rbp = rb_first(&map->pg_temp);
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_mapping *pg;
		int j;
		struct ceph_pg pgid;
		u32 pglen;
		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
		ceph_decode_copy(p, &pgid, sizeof(pgid));
		pglen = ceph_decode_32(p);

		/* remove any? */
		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
						node)->pgid, pgid) <= 0) {
			struct ceph_pg_mapping *cur =
				rb_entry(rbp, struct ceph_pg_mapping, node);

			rbp = rb_next(rbp);
			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
			rb_erase(&cur->node, &map->pg_temp);
			kfree(cur);
		}

		if (pglen) {
			/* insert */
			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
			if (!pg) {
				err = -ENOMEM;
				goto bad;
			}
			pg->pgid = pgid;
			pg->len = pglen;
			for (j = 0; j < pglen; j++)
				pg->osds[j] = ceph_decode_32(p);
			err = __insert_pg_mapping(pg, &map->pg_temp);
			if (err) {
				kfree(pg);
				goto bad;
			}
			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
			     pglen);
		}
	}
	while (rbp) {
		struct ceph_pg_mapping *cur =
			rb_entry(rbp, struct ceph_pg_mapping, node);

		rbp = rb_next(rbp);
		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
		rb_erase(&cur->node, &map->pg_temp);
		kfree(cur);
	}

	/* ignore the rest */
	*p = end;
	return map;

bad:
	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
	       epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	if (newcrush)
		crush_destroy(newcrush);
	return ERR_PTR(err);
}

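/*
 * Editorial note on the new_state section above: the decoded u8 is
 * XORed into osd_state, so one entry can either set or clear flags;
 * an encoded 0 is treated as "toggle CEPH_OSD_UP", which is how a
 * plain "osd went down" event is expressed on the wire.
 */
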
/*
 * calculate file layout from given offset, length.
 * fill in correct oid, logical length, and object extent
 * offset, length.
 *
 * for now, we write only a single su, until we can
 * pass a stride back to the caller.
 */
void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
				   u64 off, u64 *plen,
				   u64 *ono,
				   u64 *oxoff, u64 *oxlen)
{
	u32 osize = le32_to_cpu(layout->fl_object_size);
	u32 su = le32_to_cpu(layout->fl_stripe_unit);
	u32 sc = le32_to_cpu(layout->fl_stripe_count);
	u32 bl, stripeno, stripepos, objsetno;
	u32 su_per_object;
	u64 t, su_offset;

	dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
	     osize, su);
	su_per_object = osize / su;
	dout("osize %u / su %u = su_per_object %u\n", osize, su,
	     su_per_object);

	BUG_ON((su & ~PAGE_MASK) != 0);
	/* bl = *off / su; */
	t = off;
	do_div(t, su);
	bl = t;
	dout("off %llu / su %u = bl %u\n", off, su, bl);

	stripeno = bl / sc;
	stripepos = bl % sc;
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);

	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
	t = off;
	su_offset = do_div(t, su);
	*oxoff = su_offset + (stripeno % su_per_object) * su;

	/*
	 * Calculate the length of the extent being written to the selected
	 * object. This is the minimum of the full length requested (plen) or
	 * the remainder of the current stripe being written to.
	 */
	*oxlen = min_t(u64, *plen, su - su_offset);
	*plen = *oxlen;

	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
}
EXPORT_SYMBOL(ceph_calc_file_object_mapping);

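/*
 * Worked example (editorial): with su = 4096, sc = 3 and osize = 8192
 * (so su_per_object = 2), a file offset of 20480 gives bl = 5,
 * stripeno = 1, stripepos = 2, objsetno = 0, hence *ono = 2.  The
 * intra-object offset is su_offset + (stripeno % su_per_object) * su
 * = 0 + 1 * 4096, and *oxlen is capped at su - su_offset = 4096, so a
 * longer write is clipped to the current stripe unit as the comment
 * above describes.
 */
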
/*
 * calculate an object layout (i.e. pgid) from an oid,
 * file_layout, and osdmap
 */
int ceph_calc_object_layout(struct ceph_object_layout *ol,
			    const char *oid,
			    struct ceph_file_layout *fl,
			    struct ceph_osdmap *osdmap)
{
	unsigned num, num_mask;
	struct ceph_pg pgid;
	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
	int poolid = le32_to_cpu(fl->fl_pg_pool);
	struct ceph_pg_pool_info *pool;
	unsigned ps;

	BUG_ON(!osdmap);

	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
	if (!pool)
		return -EIO;
	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
	if (preferred >= 0) {
		ps += preferred;
		num = le32_to_cpu(pool->v.lpg_num);
		num_mask = pool->lpg_num_mask;
	} else {
		num = le32_to_cpu(pool->v.pg_num);
		num_mask = pool->pg_num_mask;
	}

	pgid.ps = cpu_to_le16(ps);
	pgid.preferred = cpu_to_le16(preferred);
	pgid.pool = fl->fl_pg_pool;
	if (preferred >= 0)
		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
		     (int)preferred);
	else
		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);

	ol->ol_pgid = pgid;
	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
	return 0;
}
EXPORT_SYMBOL(ceph_calc_object_layout);

/*
 * Calculate raw osd vector for the given pgid. Return pointer to osd
 * array, or NULL on failure.
 */
static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *osds, int *num)
{
	struct ceph_pg_mapping *pg;
	struct ceph_pg_pool_info *pool;
	int ruleno;
	unsigned poolid, ps, pps;
	int preferred;

	/* pg_temp? */
	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		*num = pg->len;
		return pg->osds;
	}

	/* crush */
	poolid = le32_to_cpu(pgid.pool);
	ps = le16_to_cpu(pgid.ps);
	preferred = (s16)le16_to_cpu(pgid.preferred);

	/* don't forcefeed bad device ids to crush */
	if (preferred >= osdmap->max_osd ||
	    preferred >= osdmap->crush->max_devices)
		preferred = -1;

	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
	if (!pool)
		return NULL;
	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
				 pool->v.type, pool->v.size);
	if (ruleno < 0) {
		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
		       poolid, pool->v.crush_ruleset, pool->v.type,
		       pool->v.size);
		return NULL;
	}

	if (preferred >= 0)
		pps = ceph_stable_mod(ps,
				      le32_to_cpu(pool->v.lpgp_num),
				      pool->lpgp_num_mask);
	else
		pps = ceph_stable_mod(ps,
				      le32_to_cpu(pool->v.pgp_num),
				      pool->pgp_num_mask);
	pps += poolid;
	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
			     min_t(int, pool->v.size, *num),
			     preferred, osdmap->osd_weight);
	return osds;
}

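/*
 * Editorial note: ceph_stable_mod(x, b, bmask), from
 * include/linux/ceph/osdmap.h, reduces x modulo a possibly
 * non-power-of-two b using the precomputed 2^n-1 mask: roughly, it
 * returns x & bmask when that lands below b, else x & (bmask >> 1).
 * Unlike a plain x % b, the result is "stable" in that growing b
 * toward the next power of two only remaps part of the keyspace,
 * which keeps pg movement small when pgp_num is raised.
 */
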
/*
 * Return acting set for given pgid.
 */
int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *acting)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, o, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	o = 0;
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			acting[o++] = osds[i];
	return o;
}

/*
 * Return primary osd for given pgid, or -1 if none.
 */
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			return osds[i];
	return -1;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);

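/*
 * Usage sketch (editorial, not part of the original file): a caller
 * that needs the replica set for a placement group would do roughly
 *
 *	int osds[CEPH_PG_MAX_SIZE];
 *	int n = ceph_calc_pg_acting(osdmap, pgid, osds);
 *	// n <= 0: no up osds; otherwise osds[0] is the acting primary
 *
 * ceph_calc_pg_primary() is the shortcut when only the primary osd id
 * is needed.
 */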