GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/md/dm-log-userspace-base.c

/*
 * Copyright (C) 2006-2009 Red Hat, Inc.
 *
 * This file is released under the LGPL.
 */

#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>

#include "dm-log-userspace-transfer.h"

#define DM_LOG_USERSPACE_VSN "1.1.0"

struct flush_entry {
	int type;
	region_t region;
	struct list_head list;
};

/*
 * This limit on the number of mark and clear requests is, to a degree,
 * arbitrary. However, there is some basis for the choice in the limits
 * imposed on the size of data payload by dm-log-userspace-transfer.c:
 * dm_consult_userspace().
 */
#define MAX_FLUSH_GROUP_COUNT 32
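
/*
 * Note: at this limit, one grouped transfer carries
 * MAX_FLUSH_GROUP_COUNT * sizeof(uint64_t) = 256 bytes of region data;
 * see flush_by_group() below.
 */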

struct log_c {
	struct dm_target *ti;
	uint32_t region_size;
	region_t region_count;
	uint64_t luid;
	char uuid[DM_UUID_LEN];

	char *usr_argv_str;
	uint32_t usr_argc;

	/*
	 * in_sync_hint gets set when doing is_remote_recovering. It
	 * represents the first region that needs recovery. IOW, the
	 * first zero bit of sync_bits. This can be useful to limit
	 * traffic for calls like is_remote_recovering and get_resync_work,
	 * but take care in its use for anything else.
	 */
	uint64_t in_sync_hint;

	/*
	 * Mark and clear requests are held until a flush is issued
	 * so that we can group, and thereby limit, the amount of
	 * network traffic between kernel and userspace. The 'flush_lock'
	 * is used to protect these lists.
	 */
	spinlock_t flush_lock;
	struct list_head mark_list;
	struct list_head clear_list;
};

static mempool_t *flush_entry_pool;

static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
{
	return kmalloc(sizeof(struct flush_entry), gfp_mask);
}

static void flush_entry_free(void *element, void *pool_data)
{
	kfree(element);
}

static int userspace_do_request(struct log_c *lc, const char *uuid,
				int request_type, char *data, size_t data_size,
				char *rdata, size_t *rdata_size)
{
	int r;

	/*
	 * If the server isn't there, -ESRCH is returned,
	 * and we must keep trying until the server is
	 * restored.
	 */
retry:
	r = dm_consult_userspace(uuid, lc->luid, request_type, data,
				 data_size, rdata, rdata_size);

	if (r != -ESRCH)
		return r;

	DMERR("Userspace log server not found.");
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(2*HZ);
		DMWARN("Attempting to contact userspace log server...");
		r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
					 lc->usr_argv_str,
					 strlen(lc->usr_argv_str) + 1,
					 NULL, NULL);
		if (!r)
			break;
	}
	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
	r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
				 0, NULL, NULL);
	if (!r)
		goto retry;

	DMERR("Error trying to resume userspace log: %d", r);

	return -ESRCH;
}
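
/*
 * In short, the recovery path above is: dm_consult_userspace() returns
 * -ESRCH; poll every two seconds until a DM_ULOG_CTR request built from
 * the saved constructor string succeeds; re-issue DM_ULOG_RESUME; then
 * retry the original request.
 */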

static int build_constructor_string(struct dm_target *ti,
				    unsigned argc, char **argv,
				    char **ctr_str)
{
	int i, str_size;
	char *str = NULL;

	*ctr_str = NULL;

	for (i = 0, str_size = 0; i < argc; i++)
		str_size += strlen(argv[i]) + 1; /* +1 for space between args */

	str_size += 20; /* Max number of chars in a printed u64 number */

	str = kzalloc(str_size, GFP_KERNEL);
	if (!str) {
		DMWARN("Unable to allocate memory for constructor string");
		return -ENOMEM;
	}

	str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
	for (i = 0; i < argc; i++)
		str_size += sprintf(str + str_size, " %s", argv[i]);

	*ctr_str = str;
	return str_size;
}
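
/*
 * For example (argument values illustrative only): with ti->len == 409600
 * and argv == { "clustered_disk", "2", "/dev/vg/log", "1024" }, the
 * string built above would be
 *
 *	"409600 clustered_disk 2 /dev/vg/log 1024"
 *
 * and the returned str_size would be its length, 40.
 */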

/*
 * userspace_ctr
 *
 * argv contains:
 *	<UUID> <other args>
 * Where 'other args' are the log arguments specific to the userspace
 * implementation. An example might be:
 *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
 *
 * So, this module will strip off the <UUID> for identification purposes
 * when communicating with userspace about a log; but will pass on everything
 * else.
 */
static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
			 unsigned argc, char **argv)
{
	int r = 0;
	int str_size;
	char *ctr_str = NULL;
	struct log_c *lc = NULL;
	uint64_t rdata;
	size_t rdata_size = sizeof(rdata);

	if (argc < 3) {
		DMWARN("Too few arguments to userspace dirty log");
		return -EINVAL;
	}

	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
	if (!lc) {
		DMWARN("Unable to allocate userspace log context.");
		return -ENOMEM;
	}

	/* The ptr value is sufficient for local unique id */
	lc->luid = (unsigned long)lc;

	lc->ti = ti;

	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
		DMWARN("UUID argument too long.");
		kfree(lc);
		return -EINVAL;
	}

	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
	spin_lock_init(&lc->flush_lock);
	INIT_LIST_HEAD(&lc->mark_list);
	INIT_LIST_HEAD(&lc->clear_list);

	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
	if (str_size < 0) {
		kfree(lc);
		return str_size;
	}

	/* Send table string */
	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
				 ctr_str, str_size, NULL, NULL);

	if (r < 0) {
		if (r == -ESRCH)
			DMERR("Userspace log server not found");
		else
			DMERR("Userspace log server failed to create log");
		goto out;
	}

	/* Since the region size does not change, get it now */
	rdata_size = sizeof(rdata);
	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
				 NULL, 0, (char *)&rdata, &rdata_size);

	if (r) {
		DMERR("Failed to get region size of dirty log");
		goto out;
	}

	lc->region_size = (uint32_t)rdata;
	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);

out:
	if (r) {
		kfree(lc);
		kfree(ctr_str);
	} else {
		lc->usr_argv_str = ctr_str;
		lc->usr_argc = argc;
		log->context = lc;
	}

	return r;
}

static void userspace_dtr(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	(void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
				    NULL, 0,
				    NULL, NULL);

	kfree(lc->usr_argv_str);
	kfree(lc);
}

static int userspace_presuspend(struct dm_dirty_log *log)
{
	int r;
	struct log_c *lc = log->context;

	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
				 NULL, 0,
				 NULL, NULL);

	return r;
}

static int userspace_postsuspend(struct dm_dirty_log *log)
{
	int r;
	struct log_c *lc = log->context;

	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
				 NULL, 0,
				 NULL, NULL);

	return r;
}

static int userspace_resume(struct dm_dirty_log *log)
{
	int r;
	struct log_c *lc = log->context;

	lc->in_sync_hint = 0;
	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
				 NULL, 0,
				 NULL, NULL);

	return r;
}

static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
{
	struct log_c *lc = log->context;

	return lc->region_size;
}

/*
 * userspace_is_clean
 *
 * Check whether a region is clean. If there is any sort of
 * failure when consulting the server, we return not clean.
 *
 * Returns: 1 if clean, 0 otherwise
 */
static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
{
	int r;
	uint64_t region64 = (uint64_t)region;
	int64_t is_clean;
	size_t rdata_size;
	struct log_c *lc = log->context;

	rdata_size = sizeof(is_clean);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
				 (char *)&region64, sizeof(region64),
				 (char *)&is_clean, &rdata_size);

	return (r) ? 0 : (int)is_clean;
}

/*
 * userspace_in_sync
 *
 * Check if the region is in-sync. If there is any sort
 * of failure when consulting the server, we assume that
 * the region is not in sync.
 *
 * If 'can_block' is not set, -EWOULDBLOCK is returned immediately.
 *
 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
 */
static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
			     int can_block)
{
	int r;
	uint64_t region64 = region;
	int64_t in_sync;
	size_t rdata_size;
	struct log_c *lc = log->context;

	/*
	 * We can never respond directly - even if in_sync_hint is
	 * set. This is because another machine could see a device
	 * failure and mark the region out-of-sync. If we don't go
	 * to userspace to ask, we might think the region is in-sync
	 * and allow a read to pick up data that is stale. (This is
	 * very unlikely if a device actually fails; but it is very
	 * likely if a connection to one device from one machine fails.)
	 *
	 * There still might be a problem if the mirror caches the region
	 * state as in-sync... but then this call would not be made. So,
	 * that is a mirror problem.
	 */
	if (!can_block)
		return -EWOULDBLOCK;

	rdata_size = sizeof(in_sync);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
				 (char *)&region64, sizeof(region64),
				 (char *)&in_sync, &rdata_size);
	return (r) ? 0 : (int)in_sync;
}

static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
{
	int r = 0;
	struct flush_entry *fe;

	list_for_each_entry(fe, flush_list, list) {
		r = userspace_do_request(lc, lc->uuid, fe->type,
					 (char *)&fe->region,
					 sizeof(fe->region),
					 NULL, NULL);
		if (r)
			break;
	}

	return r;
}

static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
{
	int r = 0;
	int count;
	uint32_t type = 0;
	struct flush_entry *fe, *tmp_fe;
	LIST_HEAD(tmp_list);
	uint64_t group[MAX_FLUSH_GROUP_COUNT];

	/*
	 * Process the requests in groups
	 */
	while (!list_empty(flush_list)) {
		count = 0;

		list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
			group[count] = fe->region;
			count++;

			list_del(&fe->list);
			list_add(&fe->list, &tmp_list);

			type = fe->type;
			if (count >= MAX_FLUSH_GROUP_COUNT)
				break;
		}

		r = userspace_do_request(lc, lc->uuid, type,
					 (char *)(group),
					 count * sizeof(uint64_t),
					 NULL, NULL);
		if (r) {
			/* Group send failed. Attempt one-by-one. */
			list_splice_init(&tmp_list, flush_list);
			r = flush_one_by_one(lc, flush_list);
			break;
		}
	}

	/*
	 * Must collect the flush_entry structures that were successfully
	 * processed as a group so that they will be freed by the caller.
	 */
	list_splice_init(&tmp_list, flush_list);

	return r;
}
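
/*
 * For example, flushing 100 queued mark requests takes four
 * DM_ULOG_MARK_REGION transfers above (32 + 32 + 32 + 4 regions)
 * rather than 100 one-by-one transfers; userspace_flush() below then
 * commits them with a single DM_ULOG_FLUSH.
 */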

/*
 * userspace_flush
 *
 * This function is ok to block.
 * The flush happens in two stages. First, it sends all
 * clear/mark requests that are on the list. Then it
 * tells the server to commit them. This gives the
 * server a chance to optimise the commit, instead of
 * doing it for every request.
 *
 * Additionally, we could implement another thread that
 * sends the requests up to the server - reducing the
 * load on flush. Then the flush would have less in
 * the list and be responsible for the finishing commit.
 *
 * Returns: 0 on success, < 0 on failure
 */
static int userspace_flush(struct dm_dirty_log *log)
{
	int r = 0;
	unsigned long flags;
	struct log_c *lc = log->context;
	LIST_HEAD(mark_list);
	LIST_HEAD(clear_list);
	struct flush_entry *fe, *tmp_fe;

	spin_lock_irqsave(&lc->flush_lock, flags);
	list_splice_init(&lc->mark_list, &mark_list);
	list_splice_init(&lc->clear_list, &clear_list);
	spin_unlock_irqrestore(&lc->flush_lock, flags);

	if (list_empty(&mark_list) && list_empty(&clear_list))
		return 0;

	r = flush_by_group(lc, &mark_list);
	if (r)
		goto fail;

	r = flush_by_group(lc, &clear_list);
	if (r)
		goto fail;

	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
				 NULL, 0, NULL, NULL);

fail:
	/*
	 * We can safely remove these entries, even on failure.
	 * Calling code will receive an error and will know that
	 * the log facility has failed.
	 */
	list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
		list_del(&fe->list);
		mempool_free(fe, flush_entry_pool);
	}
	list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
		list_del(&fe->list);
		mempool_free(fe, flush_entry_pool);
	}

	if (r)
		dm_table_event(lc->ti->table);

	return r;
}

/*
 * userspace_mark_region
 *
 * This function should avoid blocking unless absolutely required.
 * (Memory allocation is valid for blocking.)
 */
static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
{
	unsigned long flags;
	struct log_c *lc = log->context;
	struct flush_entry *fe;

	/* Wait for an allocation, but _never_ fail */
	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
	BUG_ON(!fe);

	spin_lock_irqsave(&lc->flush_lock, flags);
	fe->type = DM_ULOG_MARK_REGION;
	fe->region = region;
	list_add(&fe->list, &lc->mark_list);
	spin_unlock_irqrestore(&lc->flush_lock, flags);
}

/*
 * userspace_clear_region
 *
 * This function must not block.
 * So, the alloc can't block. In the worst case, it is ok to
 * fail. It would simply mean we can't clear the region.
 * Failure does nothing to the current sync context, but it does
 * mean the region will be re-sync'ed on the next reload of the
 * mirror even though it is in-sync.
 */
static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
{
	unsigned long flags;
	struct log_c *lc = log->context;
	struct flush_entry *fe;

	/*
	 * If we fail to allocate, we skip the clearing of
	 * the region. This doesn't hurt us in any way, except
	 * to cause the region to be resync'ed when the
	 * device is activated next time.
	 */
	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
	if (!fe) {
		DMERR("Failed to allocate memory to clear region.");
		return;
	}

	spin_lock_irqsave(&lc->flush_lock, flags);
	fe->type = DM_ULOG_CLEAR_REGION;
	fe->region = region;
	list_add(&fe->list, &lc->clear_list);
	spin_unlock_irqrestore(&lc->flush_lock, flags);
}

/*
 * userspace_get_resync_work
 *
 * Get a region that needs recovery. It is valid to return
 * an error for this function.
 *
 * Returns: 1 if region filled, 0 if no work, <0 on error
 */
static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
{
	int r;
	size_t rdata_size;
	struct log_c *lc = log->context;
	struct {
		int64_t i; /* 64-bit for mixed-arch compatibility */
		region_t r;
	} pkg;

	if (lc->in_sync_hint >= lc->region_count)
		return 0;

	rdata_size = sizeof(pkg);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
				 NULL, 0,
				 (char *)&pkg, &rdata_size);

	*region = pkg.r;
	return (r) ? r : (int)pkg.i;
}

/*
 * userspace_set_region_sync
 *
 * Set the sync status of a given region. This function
 * must not fail.
 */
static void userspace_set_region_sync(struct dm_dirty_log *log,
				      region_t region, int in_sync)
{
	int r;
	struct log_c *lc = log->context;
	struct {
		region_t r;
		int64_t i;
	} pkg;

	pkg.r = region;
	pkg.i = (int64_t)in_sync;

	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
				 (char *)&pkg, sizeof(pkg),
				 NULL, NULL);

	/*
	 * It would be nice to be able to report failures.
	 * However, it is easy enough to detect and resolve.
	 */
}

/*
 * userspace_get_sync_count
 *
 * If there is any sort of failure when consulting the server,
 * we assume that the sync count is zero.
 *
 * Returns: sync count on success, 0 on failure
 */
static region_t userspace_get_sync_count(struct dm_dirty_log *log)
{
	int r;
	size_t rdata_size;
	uint64_t sync_count;
	struct log_c *lc = log->context;

	rdata_size = sizeof(sync_count);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
				 NULL, 0,
				 (char *)&sync_count, &rdata_size);

	if (r)
		return 0;

	if (sync_count >= lc->region_count)
		lc->in_sync_hint = lc->region_count;

	return (region_t)sync_count;
}

/*
 * userspace_status
 *
 * Returns: amount of space consumed
 */
static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
			    char *result, unsigned maxlen)
{
	int r = 0;
	char *table_args;
	size_t sz = (size_t)maxlen;
	struct log_c *lc = log->context;

	switch (status_type) {
	case STATUSTYPE_INFO:
		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
					 NULL, 0,
					 result, &sz);

		if (r) {
			sz = 0;
			DMEMIT("%s 1 COM_FAILURE", log->type->name);
		}
		break;
	case STATUSTYPE_TABLE:
		sz = 0;
		table_args = strchr(lc->usr_argv_str, ' ');
		BUG_ON(!table_args); /* There will always be a ' ' */
		table_args++;

		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
		       lc->uuid, table_args);
		break;
	}
	return (r) ? 0 : (int)sz;
}
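
/*
 * A STATUSTYPE_TABLE line emitted above has the form (argument values
 * illustrative only):
 *
 *	userspace 5 <UUID> clustered_disk 2 /dev/vg/log 1024
 *
 * i.e. the log type name, the saved argument count, the UUID, and the
 * original constructor arguments that followed the UUID.
 */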

/*
 * userspace_is_remote_recovering
 *
 * Returns: 1 if region recovering, 0 otherwise
 */
static int userspace_is_remote_recovering(struct dm_dirty_log *log,
					  region_t region)
{
	int r;
	uint64_t region64 = region;
	struct log_c *lc = log->context;
	static unsigned long long limit;
	struct {
		int64_t is_recovering;
		uint64_t in_sync_hint;
	} pkg;
	size_t rdata_size = sizeof(pkg);

	/*
	 * Once the mirror has been reported to be in-sync,
	 * it will never again ask for recovery work. So,
	 * we can safely say there is not a remote machine
	 * recovering if the device is in-sync. (in_sync_hint
	 * must be reset at resume time.)
	 */
	if (region < lc->in_sync_hint)
		return 0;
	else if (jiffies < limit)
		return 1;

	limit = jiffies + (HZ / 4);
	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
				 (char *)&region64, sizeof(region64),
				 (char *)&pkg, &rdata_size);
	if (r)
		return 1;

	lc->in_sync_hint = pkg.in_sync_hint;

	return (int)pkg.is_recovering;
}
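
/*
 * Note on the throttle above: after each query, 'limit' suppresses
 * further round trips for HZ / 4 jiffies (a quarter of a second),
 * during which the region is conservatively reported as recovering.
 */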

static struct dm_dirty_log_type _userspace_type = {
	.name = "userspace",
	.module = THIS_MODULE,
	.ctr = userspace_ctr,
	.dtr = userspace_dtr,
	.presuspend = userspace_presuspend,
	.postsuspend = userspace_postsuspend,
	.resume = userspace_resume,
	.get_region_size = userspace_get_region_size,
	.is_clean = userspace_is_clean,
	.in_sync = userspace_in_sync,
	.flush = userspace_flush,
	.mark_region = userspace_mark_region,
	.clear_region = userspace_clear_region,
	.get_resync_work = userspace_get_resync_work,
	.set_region_sync = userspace_set_region_sync,
	.get_sync_count = userspace_get_sync_count,
	.status = userspace_status,
	.is_remote_recovering = userspace_is_remote_recovering,
};

static int __init userspace_dirty_log_init(void)
{
	int r = 0;

	flush_entry_pool = mempool_create(100, flush_entry_alloc,
					  flush_entry_free, NULL);

	if (!flush_entry_pool) {
		DMWARN("Unable to create flush_entry_pool: No memory.");
		return -ENOMEM;
	}

	r = dm_ulog_tfr_init();
	if (r) {
		DMWARN("Unable to initialize userspace log communications");
		mempool_destroy(flush_entry_pool);
		return r;
	}

	r = dm_dirty_log_type_register(&_userspace_type);
	if (r) {
		DMWARN("Couldn't register userspace dirty log type");
		dm_ulog_tfr_exit();
		mempool_destroy(flush_entry_pool);
		return r;
	}

	DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
	return 0;
}

static void __exit userspace_dirty_log_exit(void)
{
	dm_dirty_log_type_unregister(&_userspace_type);
	dm_ulog_tfr_exit();
	mempool_destroy(flush_entry_pool);

	DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
}

module_init(userspace_dirty_log_init);
module_exit(userspace_dirty_log_exit);

MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
MODULE_AUTHOR("Jonathan Brassow <[email protected]>");
MODULE_LICENSE("GPL");