GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/block/drbd/drbd_bitmap.c
/*
   drbd_bitmap.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/drbd.h>
#include <linux/slab.h>
#include <asm/kmap_types.h>

#include "drbd_int.h"

/* OPAQUE outside this file!
 * interface defined in drbd_int.h
 *
 * convention:
 * function name drbd_bm_... => used elsewhere, "public".
 * function name      bm_... => internal to implementation, "private".
 */

/*
 * LIMITATIONS:
 * We want to support >= peta byte of backend storage, while for now still using
 * a granularity of one bit per 4KiB of storage.
 * 1 << 50              bytes backend storage (1 PiB)
 * 1 << (50 - 12)       bits needed
 *      38 --> we need u64 to index and count bits
 * 1 << (38 - 3)        bitmap bytes needed
 *      35 --> we still need u64 to index and count bytes
 *             (that's 32 GiB of bitmap for 1 PiB storage)
 * 1 << (35 - 2)        32bit longs needed
 *      33 --> we'd even need u64 to index and count 32bit long words.
 * 1 << (35 - 3)        64bit longs needed
 *      32 --> we could get away with a 32bit unsigned int to index and count
 *      64bit long words, but I rather stay with unsigned long for now.
 *      We probably should neither count nor point to bytes or long words
 *      directly, but either by bitnumber, or by page index and offset.
 * 1 << (35 - 12)
 *      22 --> we need that many 4KiB pages of bitmap.
 *      1 << (22 + 3) --> on a 64bit arch,
 *      we need 32 MiB to store the array of page pointers.
 *
 * Because I'm lazy, and because the resulting patch was too large, too ugly
 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
 * (1 << 32) bits * 4k storage.
 *
 * bitmap storage and IO:
 * Bitmap is stored little endian on disk, and is kept little endian in
 * core memory. Currently we still hold the full bitmap in core as long
 * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
 * seems excessive.
 *
 * We plan to reduce the amount of in-core bitmap pages by paging them in
 * and out against their on-disk location as necessary, but need to make
 * sure we don't cause too much meta data IO, and must not deadlock in
 * tight memory situations. This needs some more work.
 */

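/*
 * Worked example of the math above (illustrative, not from the original
 * source): a 1 TiB (1 << 40 byte) backend at one bit per 4KiB needs
 * 1 << 28 bits == 32 MiB of bitmap, i.e. 8192 pages of 4KiB, plus
 * 64 KiB for the bm_pages pointer array on a 64bit arch.
 */
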
/*
 * NOTE
 *  Access to the *bm_pages is protected by bm_lock.
 *  It is safe to read the other members within the lock.
 *
 *  drbd_bm_set_bits is called from bio_endio callbacks,
 *  We may be called with irq already disabled,
 *  so we need spin_lock_irqsave().
 *  And we need the kmap_atomic.
 */
struct drbd_bitmap {
        struct page **bm_pages;
        spinlock_t bm_lock;

        /* see LIMITATIONS: above */

        unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
        unsigned long bm_bits;
        size_t bm_words;
        size_t bm_number_of_pages;
        sector_t bm_dev_capacity;
        struct mutex bm_change; /* serializes resize operations */

        wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */

        enum bm_flag bm_flags;

        /* debugging aid, in case we are still racy somewhere */
        char *bm_why;
        struct task_struct *bm_task;
};

#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
{
        struct drbd_bitmap *b = mdev->bitmap;
        if (!__ratelimit(&drbd_ratelimit_state))
                return;
        dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
                current == mdev->receiver.task ? "receiver" :
                current == mdev->asender.task  ? "asender"  :
                current == mdev->worker.task   ? "worker"   : current->comm,
                func, b->bm_why ?: "?",
                b->bm_task == mdev->receiver.task ? "receiver" :
                b->bm_task == mdev->asender.task  ? "asender"  :
                b->bm_task == mdev->worker.task   ? "worker"   : "?");
}

void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
{
        struct drbd_bitmap *b = mdev->bitmap;
        int trylock_failed;

        if (!b) {
                dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
                return;
        }

        trylock_failed = !mutex_trylock(&b->bm_change);

        if (trylock_failed) {
                dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
                         current == mdev->receiver.task ? "receiver" :
                         current == mdev->asender.task  ? "asender"  :
                         current == mdev->worker.task   ? "worker"   : current->comm,
                         why, b->bm_why ?: "?",
                         b->bm_task == mdev->receiver.task ? "receiver" :
                         b->bm_task == mdev->asender.task  ? "asender"  :
                         b->bm_task == mdev->worker.task   ? "worker"   : "?");
                mutex_lock(&b->bm_change);
        }
        if (BM_LOCKED_MASK & b->bm_flags)
                dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
        b->bm_flags |= flags & BM_LOCKED_MASK;

        b->bm_why  = why;
        b->bm_task = current;
}

void drbd_bm_unlock(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        if (!b) {
                dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
                return;
        }

        if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
                dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");

        b->bm_flags &= ~BM_LOCKED_MASK;
        b->bm_why  = NULL;
        b->bm_task = NULL;
        mutex_unlock(&b->bm_change);
}

/* we store some "meta" info about our pages in page->private */
/* at a granularity of 4k storage per bitmap bit:
 * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
 *  1<<38 bits,
 *  1<<23 4k bitmap pages.
 * Use 24 bits as page index, covers 2 peta byte storage
 * at a granularity of 4k per bit.
 * Used to report the failed page idx on io error from the endio handlers.
 */
#define BM_PAGE_IDX_MASK        ((1UL<<24)-1)
/* this page is currently read in, or written back */
#define BM_PAGE_IO_LOCK         31
/* if there has been an IO error for this page */
#define BM_PAGE_IO_ERROR        30
/* this is to be able to intelligently skip disk IO,
 * set if bits have been set since last IO. */
#define BM_PAGE_NEED_WRITEOUT   29
/* to mark for lazy writeout once syncer cleared all clearable bits,
 * set if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT   28

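/*
 * For orientation, the resulting layout of page->private (derived from
 * the defines above; this summary is not part of the original source):
 *
 *   bit 31       BM_PAGE_IO_LOCK        page currently under IO
 *   bit 30       BM_PAGE_IO_ERROR       last IO on this page failed
 *   bit 29       BM_PAGE_NEED_WRITEOUT  bits set since last writeout
 *   bit 28       BM_PAGE_LAZY_WRITEOUT  bits cleared since last writeout
 *   bits 0..23   BM_PAGE_IDX_MASK       index of this page in bm_pages[]
 */
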
/* store_page_idx uses non-atomic assignment. It is only used directly after
 * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
 * requires it all to be atomic as well. */
static void bm_store_page_idx(struct page *page, unsigned long idx)
{
        BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
        page_private(page) |= idx;
}

static unsigned long bm_page_to_idx(struct page *page)
{
        return page_private(page) & BM_PAGE_IDX_MASK;
}

/* As it is very unlikely that the same page is under IO from more than one
 * context, we can get away with a bit per page and one wait queue per bitmap.
 */
static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
{
        struct drbd_bitmap *b = mdev->bitmap;
        void *addr = &page_private(b->bm_pages[page_nr]);
        wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
}

static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
{
        struct drbd_bitmap *b = mdev->bitmap;
        void *addr = &page_private(b->bm_pages[page_nr]);
        clear_bit(BM_PAGE_IO_LOCK, addr);
        smp_mb__after_clear_bit();
        wake_up(&mdev->bitmap->bm_io_wait);
}

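/*
 * Summary (not in the original source): the two helpers above open-code a
 * sleeping bit lock; BM_PAGE_IO_LOCK in page->private is the lock bit, and
 * the bitmap-wide bm_io_wait queue is shared by all pages, which is cheap
 * because a page is almost never under IO from two contexts at once.
 */
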
/* set _before_ submit_io, so it may be reset due to being changed
 * while this page is in flight... will get submitted later again */
static void bm_set_page_unchanged(struct page *page)
{
        /* use cmpxchg? */
        clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
        clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}

static void bm_set_page_need_writeout(struct page *page)
{
        set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}

static int bm_test_page_unchanged(struct page *page)
{
        volatile const unsigned long *addr = &page_private(page);
        return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
}

static void bm_set_page_io_err(struct page *page)
{
        set_bit(BM_PAGE_IO_ERROR, &page_private(page));
}

static void bm_clear_page_io_err(struct page *page)
{
        clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
}

static void bm_set_page_lazy_writeout(struct page *page)
{
        set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}

static int bm_test_page_lazy_writeout(struct page *page)
{
        return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
}

/* on a 32bit box, this would allow for exactly (2<<38) bits. */
static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
{
        /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
        unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
        BUG_ON(page_nr >= b->bm_number_of_pages);
        return page_nr;
}

static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
{
        /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
        unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
        BUG_ON(page_nr >= b->bm_number_of_pages);
        return page_nr;
}

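/*
 * Worked example for the two index helpers above (illustrative, assuming
 * PAGE_SHIFT == 12 and 64bit longs, i.e. LN2_BPL == 6): bit number 100000
 * lives in page 100000 >> 15 == 3; the same bit is in long word
 * 100000 / 64 == 1562, and word 1562 maps to page 1562 >> 9 == 3 as well.
 */
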
static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
{
        struct page *page = b->bm_pages[idx];
        return (unsigned long *) kmap_atomic(page, km);
}

static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
        return __bm_map_pidx(b, idx, KM_IRQ1);
}

static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
{
        kunmap_atomic(p_addr, km);
}

static void bm_unmap(unsigned long *p_addr)
{
        __bm_unmap(p_addr, KM_IRQ1);
}

/* long word offset of _bitmap_ sector */
#define S2W(s)  ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
/* word offset from start of bitmap to word number _in_page_
 * modulo longs per page
#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
 so do it explicitly:
 */
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))

/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))

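/*
 * Illustration (not in the original source): with 4KiB pages and 64bit
 * longs, LWPP == 512, so MLPP(1000) == 1000 & 511 == 488, i.e. word 1000
 * of the bitmap is word 488 within its page.
 */
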
/*
 * actually most functions herein should take a struct drbd_bitmap*, not a
 * struct drbd_conf*, but for the debug macros I like to have the mdev around
 * to be able to report device-specific messages.
 */

static void bm_free_pages(struct page **pages, unsigned long number)
{
        unsigned long i;
        if (!pages)
                return;

        for (i = 0; i < number; i++) {
                if (!pages[i]) {
                        printk(KERN_ALERT "drbd: bm_free_pages tried to free "
                               "a NULL pointer; i=%lu n=%lu\n",
                               i, number);
                        continue;
                }
                __free_page(pages[i]);
                pages[i] = NULL;
        }
}

static void bm_vk_free(void *ptr, int v)
{
        if (v)
                vfree(ptr);
        else
                kfree(ptr);
}

/*
 * "have" and "want" are NUMBER OF PAGES.
 */
static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
        struct page **old_pages = b->bm_pages;
        struct page **new_pages, *page;
        unsigned int i, bytes, vmalloced = 0;
        unsigned long have = b->bm_number_of_pages;

        BUG_ON(have == 0 && old_pages != NULL);
        BUG_ON(have != 0 && old_pages == NULL);

        if (have == want)
                return old_pages;

        /* Trying kmalloc first, falling back to vmalloc.
         * GFP_KERNEL is ok, as this is done when a lower level disk is
         * "attached" to the drbd.  Context is receiver thread or cqueue
         * thread.  As we have no disk yet, we are not in the IO path,
         * not even the IO path of the peer. */
        bytes = sizeof(struct page *)*want;
        new_pages = kmalloc(bytes, GFP_KERNEL);
        if (!new_pages) {
                new_pages = vmalloc(bytes);
                if (!new_pages)
                        return NULL;
                vmalloced = 1;
        }

        memset(new_pages, 0, bytes);
        if (want >= have) {
                for (i = 0; i < have; i++)
                        new_pages[i] = old_pages[i];
                for (; i < want; i++) {
                        page = alloc_page(GFP_HIGHUSER);
                        if (!page) {
                                bm_free_pages(new_pages + have, i - have);
                                bm_vk_free(new_pages, vmalloced);
                                return NULL;
                        }
                        /* we want to know which page it is
                         * from the endio handlers */
                        bm_store_page_idx(page, i);
                        new_pages[i] = page;
                }
        } else {
                for (i = 0; i < want; i++)
                        new_pages[i] = old_pages[i];
                /* NOT HERE, we are outside the spinlock!
                bm_free_pages(old_pages + want, have - want);
                */
        }

        if (vmalloced)
                b->bm_flags |= BM_P_VMALLOCED;
        else
                b->bm_flags &= ~BM_P_VMALLOCED;

        return new_pages;
}

/*
 * called on driver init only. TODO call when a device is created.
 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
 */
int drbd_bm_init(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        WARN_ON(b != NULL);
        b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
        if (!b)
                return -ENOMEM;
        spin_lock_init(&b->bm_lock);
        mutex_init(&b->bm_change);
        init_waitqueue_head(&b->bm_io_wait);

        mdev->bitmap = b;

        return 0;
}

sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
        ERR_IF(!mdev->bitmap) return 0;
        return mdev->bitmap->bm_dev_capacity;
}

/* called on driver unload. TODO: call when a device is destroyed.
 */
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
        ERR_IF (!mdev->bitmap) return;
        bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
        bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
        kfree(mdev->bitmap);
        mdev->bitmap = NULL;
}

/*
 * since (b->bm_bits % BITS_PER_LONG) might not be 0,
 * this masks out the remaining (surplus) bits.
 * Returns the number of bits cleared.
 */
#define BITS_PER_PAGE           (1UL << (PAGE_SHIFT + 3))
#define BITS_PER_PAGE_MASK      (BITS_PER_PAGE - 1)
#define BITS_PER_LONG_MASK      (BITS_PER_LONG - 1)
static int bm_clear_surplus(struct drbd_bitmap *b)
{
        unsigned long mask;
        unsigned long *p_addr, *bm;
        int tmp;
        int cleared = 0;

        /* number of bits modulo bits per page */
        tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
        /* mask the used bits of the word containing the last bit */
        mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
        /* bitmap is always stored little endian,
         * on disk and in core memory alike */
        mask = cpu_to_lel(mask);

        p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
        bm = p_addr + (tmp/BITS_PER_LONG);
        if (mask) {
                /* If mask != 0, we are not exactly aligned, so bm now points
                 * to the long containing the last bit.
                 * If mask == 0, bm already points to the word immediately
                 * after the last (long word aligned) bit. */
                cleared = hweight_long(*bm & ~mask);
                *bm &= mask;
                bm++;
        }

        if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
                /* on a 32bit arch, we may need to zero out
                 * a padding long to align with a 64bit remote */
                cleared += hweight_long(*bm);
                *bm = 0;
        }
        bm_unmap(p_addr);
        return cleared;
}

static void bm_set_surplus(struct drbd_bitmap *b)
{
        unsigned long mask;
        unsigned long *p_addr, *bm;
        int tmp;

        /* number of bits modulo bits per page */
        tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
        /* mask the used bits of the word containing the last bit */
        mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;
        /* bitmap is always stored little endian,
         * on disk and in core memory alike */
        mask = cpu_to_lel(mask);

        p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
        bm = p_addr + (tmp/BITS_PER_LONG);
        if (mask) {
                /* If mask != 0, we are not exactly aligned, so bm now points
                 * to the long containing the last bit.
                 * If mask == 0, bm already points to the word immediately
                 * after the last (long word aligned) bit. */
                *bm |= ~mask;
                bm++;
        }

        if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
                /* on a 32bit arch, we may need to fill (set to all ones)
                 * a padding long to align with a 64bit remote */
                *bm = ~0UL;
        }
        bm_unmap(p_addr);
}

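/*
 * Example of the surplus masking above (illustrative, assuming 64bit
 * longs): if bm_bits % 64 == 3, then mask == (1UL << 3) - 1 == 0x7, so
 * bm_clear_surplus() keeps only the three valid bits of the last word,
 * while bm_set_surplus() sets everything above them via *bm |= ~mask.
 */
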
/* you better not modify the bitmap while this is running,
 * or its results will be stale */
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
        unsigned long *p_addr;
        unsigned long bits = 0;
        unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) - 1;
        int idx, i, last_word;

        /* all but last page */
        for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
                p_addr = __bm_map_pidx(b, idx, KM_USER0);
                for (i = 0; i < LWPP; i++)
                        bits += hweight_long(p_addr[i]);
                __bm_unmap(p_addr, KM_USER0);
                cond_resched();
        }
        /* last (or only) page */
        last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
        p_addr = __bm_map_pidx(b, idx, KM_USER0);
        for (i = 0; i < last_word; i++)
                bits += hweight_long(p_addr[i]);
        p_addr[last_word] &= cpu_to_lel(mask);
        bits += hweight_long(p_addr[last_word]);
        /* 32bit arch, may have an unused padding long */
        if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
                p_addr[last_word+1] = 0;
        __bm_unmap(p_addr, KM_USER0);
        return bits;
}

/* offset and len in long words. */
static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
{
        unsigned long *p_addr, *bm;
        unsigned int idx;
        size_t do_now, end;

        end = offset + len;

        if (end > b->bm_words) {
                printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
                return;
        }

        while (offset < end) {
                do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
                idx = bm_word_to_page_idx(b, offset);
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                if (bm + do_now > p_addr + LWPP) {
                        printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
                               p_addr, bm, (int)do_now);
                } else
                        memset(bm, c, do_now * sizeof(long));
                bm_unmap(p_addr);
                bm_set_page_need_writeout(b->bm_pages[idx]);
                offset += do_now;
        }
}

/*
 * make sure the bitmap has enough room for the attached storage,
 * if necessary, resize.
 * called whenever we may have changed the device size.
 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 * In case this is actually a resize, we copy the old bitmap into the new one.
 * Otherwise, the bitmap is initialized to all bits set.
 */
int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long bits, words, owords, obits;
        unsigned long want, have, onpages; /* number of pages */
        struct page **npages, **opages = NULL;
        int err = 0, growing;
        int opages_vmalloced;

        ERR_IF(!b) return -ENOMEM;

        drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);

        dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
                 (unsigned long long)capacity);

        if (capacity == b->bm_dev_capacity)
                goto out;

        opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);

        if (capacity == 0) {
                spin_lock_irq(&b->bm_lock);
                opages = b->bm_pages;
                onpages = b->bm_number_of_pages;
                owords = b->bm_words;
                b->bm_pages = NULL;
                b->bm_number_of_pages =
                b->bm_set =
                b->bm_bits =
                b->bm_words =
                b->bm_dev_capacity = 0;
                spin_unlock_irq(&b->bm_lock);
                bm_free_pages(opages, onpages);
                bm_vk_free(opages, opages_vmalloced);
                goto out;
        }
        bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));

        /* if we would use
           words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
           a 32bit host could present the wrong number of words
           to a 64bit host.
        */
        words = ALIGN(bits, 64) >> LN2_BPL;

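        /*
         * Worked example (illustrative, not from the original source): for
         * bits == 96, aligning to BITS_PER_LONG would give 96 bits on a
         * 32bit host (3 words) but 128 bits on a 64bit host (2 longs);
         * ALIGN(bits, 64) >> LN2_BPL yields the same 128 bits on both
         * (4 words resp. 2 longs).
         */
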
        if (get_ldev(mdev)) {
                u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
                put_ldev(mdev);
                if (bits > bits_on_disk) {
                        dev_info(DEV, "bits = %lu\n", bits);
                        dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
                        err = -ENOSPC;
                        goto out;
                }
        }

        want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
        have = b->bm_number_of_pages;
        if (want == have) {
                D_ASSERT(b->bm_pages != NULL);
                npages = b->bm_pages;
        } else {
                if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
                        npages = NULL;
                else
                        npages = bm_realloc_pages(b, want);
        }

        if (!npages) {
                err = -ENOMEM;
                goto out;
        }

        spin_lock_irq(&b->bm_lock);
        opages = b->bm_pages;
        owords = b->bm_words;
        obits  = b->bm_bits;

        growing = bits > obits;
        if (opages && growing && set_new_bits)
                bm_set_surplus(b);

        b->bm_pages = npages;
        b->bm_number_of_pages = want;
        b->bm_bits  = bits;
        b->bm_words = words;
        b->bm_dev_capacity = capacity;

        if (growing) {
                if (set_new_bits) {
                        bm_memset(b, owords, 0xff, words-owords);
                        b->bm_set += bits - obits;
                } else
                        bm_memset(b, owords, 0x00, words-owords);
        }

        if (want < have) {
                /* implicit: (opages != NULL) && (opages != npages) */
                bm_free_pages(opages + want, have - want);
        }

        (void)bm_clear_surplus(b);

        spin_unlock_irq(&b->bm_lock);
        if (opages != npages)
                bm_vk_free(opages, opages_vmalloced);
        if (!growing)
                b->bm_set = bm_count_bits(b);
        dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);

out:
        drbd_bm_unlock(mdev);
        return err;
}

/* inherently racy:
 * if not protected by other means, return value may be out of date when
 * leaving this function...
 * we still need to lock it, since it is important that this returns
 * bm_set == 0 precisely.
 *
 * maybe bm_set should be atomic_t ?
 */
unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long s;
        unsigned long flags;

        ERR_IF(!b) return 0;
        ERR_IF(!b->bm_pages) return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        s = b->bm_set;
        spin_unlock_irqrestore(&b->bm_lock, flags);

        return s;
}

unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
{
        unsigned long s;
        /* if I don't have a disk, I don't know about out-of-sync status */
        if (!get_ldev_if_state(mdev, D_NEGOTIATING))
                return 0;
        s = _drbd_bm_total_weight(mdev);
        put_ldev(mdev);
        return s;
}

size_t drbd_bm_words(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        ERR_IF(!b) return 0;
        ERR_IF(!b->bm_pages) return 0;

        return b->bm_words;
}

unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        ERR_IF(!b) return 0;

        return b->bm_bits;
}

/* merge number words from buffer into the bitmap starting at offset.
 * buffer[i] is expected to be little endian unsigned long.
 * bitmap must be locked by drbd_bm_lock.
 * currently only used from receive_bitmap.
 */
void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
                       unsigned long *buffer)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr, *bm;
        unsigned long word, bits;
        unsigned int idx;
        size_t end, do_now;

        end = offset + number;

        ERR_IF(!b) return;
        ERR_IF(!b->bm_pages) return;
        if (number == 0)
                return;
        WARN_ON(offset >= b->bm_words);
        WARN_ON(end    >  b->bm_words);

        spin_lock_irq(&b->bm_lock);
        while (offset < end) {
                do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
                idx = bm_word_to_page_idx(b, offset);
                p_addr = bm_map_pidx(b, idx);
                bm = p_addr + MLPP(offset);
                offset += do_now;
                while (do_now--) {
                        bits = hweight_long(*bm);
                        word = *bm | *buffer++;
                        *bm++ = word;
                        b->bm_set += hweight_long(word) - bits;
                }
                bm_unmap(p_addr);
                bm_set_page_need_writeout(b->bm_pages[idx]);
        }
        /* with 32bit <-> 64bit cross-platform connect
         * this is only correct for current usage,
         * where we _know_ that we are 64 bit aligned,
         * and know that this function is used in this way, too...
         */
        if (end == b->bm_words)
                b->bm_set -= bm_clear_surplus(b);
        spin_unlock_irq(&b->bm_lock);
}

/* copy number words from the bitmap starting at offset into the buffer.
 * buffer[i] will be little endian unsigned long.
 */
void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
                     unsigned long *buffer)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr, *bm;
        size_t end, do_now;

        end = offset + number;

        ERR_IF(!b) return;
        ERR_IF(!b->bm_pages) return;

        spin_lock_irq(&b->bm_lock);
        if ((offset >= b->bm_words) ||
            (end    >  b->bm_words) ||
            (number <= 0))
                dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
                        (unsigned long) offset,
                        (unsigned long) number,
                        (unsigned long) b->bm_words);
        else {
                while (offset < end) {
                        do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
                        p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
                        bm = p_addr + MLPP(offset);
                        offset += do_now;
                        while (do_now--)
                                *buffer++ = *bm++;
                        bm_unmap(p_addr);
                }
        }
        spin_unlock_irq(&b->bm_lock);
}

/* set all bits in the bitmap */
void drbd_bm_set_all(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        ERR_IF(!b) return;
        ERR_IF(!b->bm_pages) return;

        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0xff, b->bm_words);
        (void)bm_clear_surplus(b);
        b->bm_set = b->bm_bits;
        spin_unlock_irq(&b->bm_lock);
}

/* clear all bits in the bitmap */
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
        struct drbd_bitmap *b = mdev->bitmap;
        ERR_IF(!b) return;
        ERR_IF(!b->bm_pages) return;

        spin_lock_irq(&b->bm_lock);
        bm_memset(b, 0, 0, b->bm_words);
        b->bm_set = 0;
        spin_unlock_irq(&b->bm_lock);
}

struct bm_aio_ctx {
        struct drbd_conf *mdev;
        atomic_t in_flight;
        struct completion done;
        unsigned flags;
#define BM_AIO_COPY_PAGES       1
        int error;
};

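/*
 * Lifecycle sketch for bm_aio_ctx (summary, not part of the original
 * source): the submitter initializes in_flight to 1 so "done" cannot
 * complete while bios are still being issued; each submitted bio takes a
 * reference, each bm_async_io_complete() drops one, and the submitter
 * drops its own at the end, waiting on "done" only if bios remain in
 * flight.
 */
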
/* bv_page may be a copy, or may be the original */
static void bm_async_io_complete(struct bio *bio, int error)
{
        struct bm_aio_ctx *ctx = bio->bi_private;
        struct drbd_conf *mdev = ctx->mdev;
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
        int uptodate = bio_flagged(bio, BIO_UPTODATE);

        /* strange behavior of some lower level drivers...
         * fail the request by clearing the uptodate flag,
         * but do not return any error?!
         * do we want to WARN() on this? */
        if (!error && !uptodate)
                error = -EIO;

        if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
            !bm_test_page_unchanged(b->bm_pages[idx]))
                dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);

        if (error) {
                /* ctx error will hold the completed-last non-zero error code,
                 * in case error codes differ. */
                ctx->error = error;
                bm_set_page_io_err(b->bm_pages[idx]);
                /* Not identical to on disk version of it.
                 * Is BM_PAGE_IO_ERROR enough? */
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
                                error, idx);
        } else {
                bm_clear_page_io_err(b->bm_pages[idx]);
                dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
        }

        bm_page_unlock_io(mdev, idx);

        /* FIXME give back to page pool */
        if (ctx->flags & BM_AIO_COPY_PAGES)
                put_page(bio->bi_io_vec[0].bv_page);

        bio_put(bio);

        if (atomic_dec_and_test(&ctx->in_flight))
                complete(&ctx->done);
}

static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
{
        /* we are process context. we always get a bio */
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);
        struct drbd_conf *mdev = ctx->mdev;
        struct drbd_bitmap *b = mdev->bitmap;
        struct page *page;
        unsigned int len;

        sector_t on_disk_sector =
                mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
        on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);

        /* this might happen with very small
         * flexible external meta data device,
         * or with PAGE_SIZE > 4k */
        len = min_t(unsigned int, PAGE_SIZE,
                    (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);

        /* serialize IO on this page */
        bm_page_lock_io(mdev, page_nr);
        /* before memcpy and submit,
         * so it can be redirtied any time */
        bm_set_page_unchanged(b->bm_pages[page_nr]);

        if (ctx->flags & BM_AIO_COPY_PAGES) {
                /* FIXME alloc_page is good enough for now, but actually needs
                 * to use pre-allocated page pool */
                void *src, *dest;
                page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
                dest = kmap_atomic(page, KM_USER0);
                src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
                memcpy(dest, src, PAGE_SIZE);
                kunmap_atomic(src, KM_USER1);
                kunmap_atomic(dest, KM_USER0);
                bm_store_page_idx(page, page_nr);
        } else
                page = b->bm_pages[page_nr];

        bio->bi_bdev = mdev->ldev->md_bdev;
        bio->bi_sector = on_disk_sector;
        bio_add_page(bio, page, len, 0);
        bio->bi_private = ctx;
        bio->bi_end_io = bm_async_io_complete;

        if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
                bio->bi_rw |= rw;
                bio_endio(bio, -EIO);
        } else {
                submit_bio(rw, bio);
                /* this should not count as user activity and cause the
                 * resync to throttle -- see drbd_rs_should_slow_down(). */
                atomic_add(len >> 9, &mdev->rs_sect_ev);
        }
}

/*
 * bm_rw: read/write the whole bitmap from/to its on disk location.
 */
static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
{
        struct bm_aio_ctx ctx = {
                .mdev = mdev,
                .in_flight = ATOMIC_INIT(1),
                .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
                .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
        };
        struct drbd_bitmap *b = mdev->bitmap;
        int num_pages, i, count = 0;
        unsigned long now;
        char ppb[10];
        int err = 0;

        /*
         * We are protected against bitmap disappearing/resizing by holding an
         * ldev reference (caller must have called get_ldev()).
         * For read/write, we are protected against changes to the bitmap by
         * the bitmap lock (see drbd_bitmap_io).
         * For lazy writeout, we don't care for ongoing changes to the bitmap,
         * as we submit copies of pages anyways.
         */
        if (!ctx.flags)
                WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));

        num_pages = b->bm_number_of_pages;

        now = jiffies;

        /* let the layers below us try to merge these bios... */
        for (i = 0; i < num_pages; i++) {
                /* ignore completely unchanged pages */
                if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                        break;
                if (rw & WRITE) {
                        if (bm_test_page_unchanged(b->bm_pages[i])) {
                                dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
                                continue;
                        }
                        /* during lazy writeout,
                         * ignore those pages not marked for lazy writeout. */
                        if (lazy_writeout_upper_idx &&
                            !bm_test_page_lazy_writeout(b->bm_pages[i])) {
                                dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
                                continue;
                        }
                }
                atomic_inc(&ctx.in_flight);
                bm_page_io_async(&ctx, i, rw);
                ++count;
                cond_resched();
        }

        /*
         * We initialize ctx.in_flight to one to make sure bm_async_io_complete
         * will not complete() early, and decrement / test it here.  If there
         * are still some bios in flight, we need to wait for them here.
         */
        if (!atomic_dec_and_test(&ctx.in_flight))
                wait_for_completion(&ctx.done);
        dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
                 rw == WRITE ? "WRITE" : "READ",
                 count, jiffies - now);

        if (ctx.error) {
                dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
                drbd_chk_io_error(mdev, 1, true);
                err = -EIO; /* ctx.error ? */
        }

        now = jiffies;
        if (rw == WRITE) {
                drbd_md_flush(mdev);
        } else /* rw == READ */ {
                b->bm_set = bm_count_bits(b);
                dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
                         jiffies - now);
        }
        now = b->bm_set;

        dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
                 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);

        return err;
}

/**
 * drbd_bm_read() - Read the whole bitmap from its on disk location.
 * @mdev:       DRBD device.
 */
int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
{
        return bm_rw(mdev, READ, 0);
}

/**
 * drbd_bm_write() - Write the whole bitmap to its on disk location.
 * @mdev:       DRBD device.
 *
 * Will only write pages that have changed since last IO.
 */
int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
{
        return bm_rw(mdev, WRITE, 0);
}

/**
 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
 * @mdev:       DRBD device.
 * @upper_idx:  0: write all changed pages; +ve: page index to stop scanning for changed pages
 */
int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
{
        return bm_rw(mdev, WRITE, upper_idx);
}

/**
 * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
 * @mdev:       DRBD device.
 * @idx:        bitmap page index
 *
 * We don't want to special case on logical_block_size of the backend device,
 * so we submit PAGE_SIZE aligned pieces.
 * Note that on "most" systems, PAGE_SIZE is 4k.
 *
 * In case this becomes an issue on systems with larger PAGE_SIZE,
 * we may want to change this again to write 4k aligned 4k pieces.
 */
int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
{
        struct bm_aio_ctx ctx = {
                .mdev = mdev,
                .in_flight = ATOMIC_INIT(1),
                .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
                .flags = BM_AIO_COPY_PAGES,
        };

        if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
                dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
                return 0;
        }

        bm_page_io_async(&ctx, idx, WRITE_SYNC);
        wait_for_completion(&ctx.done);

        if (ctx.error)
                drbd_chk_io_error(mdev, 1, true);
        /* that should force detach, so the in memory bitmap will be
         * gone in a moment as well. */

        mdev->bm_writ_cnt++;
        return ctx.error;
}

/* NOTE
 * find_first_bit returns int, we return unsigned long.
 * For this to work on 32bit arch with bitnumbers > (1<<32),
 * we'd need to return u64, and get a whole lot of other places
 * fixed where we still use unsigned long.
 *
 * this returns a bit number, NOT a sector!
 */
static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
        const int find_zero_bit, const enum km_type km)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr;
        unsigned long bit_offset;
        unsigned i;

        if (bm_fo > b->bm_bits) {
                dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
                bm_fo = DRBD_END_OF_BITMAP;
        } else {
                while (bm_fo < b->bm_bits) {
                        /* bit offset of the first bit in the page */
                        bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
                        p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);

                        if (find_zero_bit)
                                i = find_next_zero_bit_le(p_addr,
                                        PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
                        else
                                i = find_next_bit_le(p_addr,
                                        PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);

                        __bm_unmap(p_addr, km);
                        if (i < PAGE_SIZE*8) {
                                bm_fo = bit_offset + i;
                                if (bm_fo >= b->bm_bits)
                                        break;
                                goto found;
                        }
                        bm_fo = bit_offset + PAGE_SIZE*8;
                }
                bm_fo = DRBD_END_OF_BITMAP;
        }
found:
        return bm_fo;
}

static unsigned long bm_find_next(struct drbd_conf *mdev,
        unsigned long bm_fo, const int find_zero_bit)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long i = DRBD_END_OF_BITMAP;

        ERR_IF(!b) return i;
        ERR_IF(!b->bm_pages) return i;

        spin_lock_irq(&b->bm_lock);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(mdev);

        i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);

        spin_unlock_irq(&b->bm_lock);
        return i;
}

unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
        return bm_find_next(mdev, bm_fo, 0);
}

#if 0
/* not yet needed for anything. */
unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
        return bm_find_next(mdev, bm_fo, 1);
}
#endif

/* does not spin_lock_irqsave.
 * you must take drbd_bm_lock() first */
unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
        /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
        return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
}

unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
        /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
        return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
}

/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector.
 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
 * Must hold bitmap lock already. */
static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
        unsigned long e, int val)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr = NULL;
        unsigned long bitnr;
        unsigned int last_page_nr = -1U;
        int c = 0;
        int changed_total = 0;

        if (e >= b->bm_bits) {
                dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
                        s, e, b->bm_bits);
                e = b->bm_bits ? b->bm_bits - 1 : 0;
        }
        for (bitnr = s; bitnr <= e; bitnr++) {
                unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
                if (page_nr != last_page_nr) {
                        if (p_addr)
                                __bm_unmap(p_addr, KM_IRQ1);
                        if (c < 0)
                                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
                        else if (c > 0)
                                bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
                        changed_total += c;
                        c = 0;
                        p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
                        last_page_nr = page_nr;
                }
                if (val)
                        c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
                else
                        c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
        }
        if (p_addr)
                __bm_unmap(p_addr, KM_IRQ1);
        if (c < 0)
                bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
        else if (c > 0)
                bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
        changed_total += c;
        b->bm_set += changed_total;
        return changed_total;
}

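/*
 * Note on the writeout flags above (summary, not original text): pages
 * where bits were set (c > 0) are marked BM_PAGE_NEED_WRITEOUT so the new
 * out-of-sync info reaches stable storage promptly, while pages where
 * bits were only cleared (c < 0) are merely marked BM_PAGE_LAZY_WRITEOUT
 * for a later lazy flush by the syncer.
 */
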
/* returns number of bits actually changed.
 * for val != 0, we change 0 -> 1, return code positive
 * for val == 0, we change 1 -> 0, return code negative
 * wants bitnr, not sector */
static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
        const unsigned long e, int val)
{
        unsigned long flags;
        struct drbd_bitmap *b = mdev->bitmap;
        int c = 0;

        ERR_IF(!b) return 1;
        ERR_IF(!b->bm_pages) return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
                bm_print_lock_info(mdev);

        c = __bm_change_bits_to(mdev, s, e, val);

        spin_unlock_irqrestore(&b->bm_lock, flags);
        return c;
}

/* returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
        return bm_change_bits_to(mdev, s, e, 1);
}

/* returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
        return -bm_change_bits_to(mdev, s, e, 0);
}

/* sets all bits in full words,
 * from first_word up to, but not including, last_word */
static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
                int page_nr, int first_word, int last_word)
{
        int i;
        int bits;
        unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
        for (i = first_word; i < last_word; i++) {
                bits = hweight_long(paddr[i]);
                paddr[i] = ~0UL;
                b->bm_set += BITS_PER_LONG - bits;
        }
        kunmap_atomic(paddr, KM_IRQ1);
}

/* Same thing as drbd_bm_set_bits,
 * but more efficient for a large bit range.
 * You must first drbd_bm_lock().
 * Can be called to set the whole bitmap in one go.
 * Sets bits from s to e _inclusive_. */
void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
        /* First set_bit from the first bit (s)
         * up to the next long boundary (sl),
         * then assign full words up to the last long boundary (el),
         * then set_bit up to and including the last bit (e).
         *
         * Do not use memset, because we must account for changes,
         * so we need to loop over the words with hweight() anyways.
         */
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long sl = ALIGN(s, BITS_PER_LONG);
        unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
        int first_page;
        int last_page;
        int page_nr;
        int first_word;
        int last_word;

        if (e - s <= 3*BITS_PER_LONG) {
                /* don't bother; el and sl may even be wrong. */
                spin_lock_irq(&b->bm_lock);
                __bm_change_bits_to(mdev, s, e, 1);
                spin_unlock_irq(&b->bm_lock);
                return;
        }

        /* difference is large enough that we can trust sl and el */

        spin_lock_irq(&b->bm_lock);

        /* bits filling the current long */
        if (sl)
                __bm_change_bits_to(mdev, s, sl-1, 1);

        first_page = sl >> (3 + PAGE_SHIFT);
        last_page = el >> (3 + PAGE_SHIFT);

        /* MLPP: modulo longs per page */
        /* LWPP: long words per page */
        first_word = MLPP(sl >> LN2_BPL);
        last_word = LWPP;

        /* first and full pages, unless first page == last page */
        for (page_nr = first_page; page_nr < last_page; page_nr++) {
                bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
                spin_unlock_irq(&b->bm_lock);
                cond_resched();
                first_word = 0;
                spin_lock_irq(&b->bm_lock);
        }

        /* last page (respectively only page, for first page == last page) */
        last_word = MLPP(el >> LN2_BPL);
        bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);

        /* possibly trailing bits.
         * example: (e & 63) == 63, el will be e+1.
         * if that even was the very last bit,
         * it would trigger an assert in __bm_change_bits_to()
         */
        if (el <= e)
                __bm_change_bits_to(mdev, el, e, 1);
        spin_unlock_irq(&b->bm_lock);
}

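/*
 * Worked example for _drbd_bm_set_bits() (illustrative, 64bit longs): for
 * s == 70 and e == 300, sl == 128 and el == 256; bits 70..127 and
 * 256..300 are set bit by bit via __bm_change_bits_to(), while bits
 * 128..255 are filled word-wise by bm_set_full_words_within_one_page().
 */
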
/* returns bit state
 * wants bitnr, NOT sector.
 * inherently racy... area needs to be locked by means of {al,rs}_lru
 *  1 ... bit set
 *  0 ... bit not set
 * -1 ... first out of bounds access, stop testing for bits!
 */
int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
{
        unsigned long flags;
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr;
        int i;

        ERR_IF(!b) return 0;
        ERR_IF(!b->bm_pages) return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(mdev);
        if (bitnr < b->bm_bits) {
                p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
                i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
                bm_unmap(p_addr);
        } else if (bitnr == b->bm_bits) {
                i = -1;
        } else { /* (bitnr > b->bm_bits) */
                dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
                i = 0;
        }

        spin_unlock_irqrestore(&b->bm_lock, flags);
        return i;
}

/* returns number of bits set in the range [s, e] */
int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
        unsigned long flags;
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr = NULL;
        unsigned long bitnr;
        unsigned int page_nr = -1U;
        int c = 0;

        /* If this is called without a bitmap, that is a bug.  But just to be
         * robust in case we screwed up elsewhere, in that case pretend there
         * was one dirty bit in the requested area, so we won't try to do a
         * local read there (no bitmap probably implies no disk) */
        ERR_IF(!b) return 1;
        ERR_IF(!b->bm_pages) return 1;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(mdev);
        for (bitnr = s; bitnr <= e; bitnr++) {
                unsigned int idx = bm_bit_to_page_idx(b, bitnr);
                if (page_nr != idx) {
                        page_nr = idx;
                        if (p_addr)
                                bm_unmap(p_addr);
                        p_addr = bm_map_pidx(b, idx);
                }
                ERR_IF (bitnr >= b->bm_bits) {
                        dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
                } else {
                        c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
                }
        }
        if (p_addr)
                bm_unmap(p_addr);
        spin_unlock_irqrestore(&b->bm_lock, flags);
        return c;
}

/* inherently racy...
 * return value may be already out-of-date when this function returns.
 * but the general usage is that this is only used during a cstate when bits
 * are only cleared, not set, and we typically only care for the case when
 * the return value is zero, or we already "locked" this "bitmap extent" by
 * other means.
 *
 * enr is bm-extent number, since we chose to name one sector (512 bytes)
 * worth of the bitmap a "bitmap extent".
 *
 * TODO
 * I think since we use it like a reference count, we should use the real
 * reference count of some bitmap extent element from some lru instead...
 */
int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
{
        struct drbd_bitmap *b = mdev->bitmap;
        int count, s, e;
        unsigned long flags;
        unsigned long *p_addr, *bm;

        ERR_IF(!b) return 0;
        ERR_IF(!b->bm_pages) return 0;

        spin_lock_irqsave(&b->bm_lock, flags);
        if (BM_DONT_TEST & b->bm_flags)
                bm_print_lock_info(mdev);

        s = S2W(enr);
        e = min((size_t)S2W(enr+1), b->bm_words);
        count = 0;
        if (s < b->bm_words) {
                int n = e-s;
                p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
                bm = p_addr + MLPP(s);
                while (n--)
                        count += hweight_long(*bm++);
                bm_unmap(p_addr);
        } else {
                dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
        }
        spin_unlock_irqrestore(&b->bm_lock, flags);
        return count;
}

/* Set all bits covered by the AL-extent al_enr.
 * Returns number of bits changed. */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long *p_addr, *bm;
        unsigned long weight;
        unsigned long s, e;
        int count, i, do_now;
        ERR_IF(!b) return 0;
        ERR_IF(!b->bm_pages) return 0;

        spin_lock_irq(&b->bm_lock);
        if (BM_DONT_SET & b->bm_flags)
                bm_print_lock_info(mdev);
        weight = b->bm_set;

        s = al_enr * BM_WORDS_PER_AL_EXT;
        e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
        /* assert that s and e are on the same page */
        D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
              ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
        count = 0;
        if (s < b->bm_words) {
                i = do_now = e-s;
                p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
                bm = p_addr + MLPP(s);
                while (i--) {
                        count += hweight_long(*bm);
                        *bm = -1UL;
                        bm++;
                }
                bm_unmap(p_addr);
                b->bm_set += do_now*BITS_PER_LONG - count;
                if (e == b->bm_words)
                        b->bm_set -= bm_clear_surplus(b);
        } else {
                dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
        }
        weight = b->bm_set - weight;
        spin_unlock_irq(&b->bm_lock);
        return weight;
}