Path: blob/master/drivers/block/drbd/drbd_bitmap.c
15180 views
/*1drbd_bitmap.c23This file is part of DRBD by Philipp Reisner and Lars Ellenberg.45Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.6Copyright (C) 2004-2008, Philipp Reisner <[email protected]>.7Copyright (C) 2004-2008, Lars Ellenberg <[email protected]>.89drbd is free software; you can redistribute it and/or modify10it under the terms of the GNU General Public License as published by11the Free Software Foundation; either version 2, or (at your option)12any later version.1314drbd is distributed in the hope that it will be useful,15but WITHOUT ANY WARRANTY; without even the implied warranty of16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the17GNU General Public License for more details.1819You should have received a copy of the GNU General Public License20along with drbd; see the file COPYING. If not, write to21the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.22*/2324#include <linux/bitops.h>25#include <linux/vmalloc.h>26#include <linux/string.h>27#include <linux/drbd.h>28#include <linux/slab.h>29#include <asm/kmap_types.h>3031#include "drbd_int.h"323334/* OPAQUE outside this file!35* interface defined in drbd_int.h3637* convention:38* function name drbd_bm_... => used elsewhere, "public".39* function name bm_... 
=> internal to implementation, "private".
 */


/*
 * LIMITATIONS:
 * We want to support >= peta byte of backend storage, while for now still using
 * a granularity of one bit per 4KiB of storage.
 * 1 << 50		bytes backend storage (1 PiB)
 * 1 << (50 - 12)	bits needed
 *	38 --> we need u64 to index and count bits
 * 1 << (38 - 3)	bitmap bytes needed
 *	35 --> we still need u64 to index and count bytes
 *			(that's 32 GiB of bitmap for 1 PiB storage)
 * 1 << (35 - 2)	32bit longs needed
 *	33 --> we'd even need u64 to index and count 32bit long words.
 * 1 << (35 - 3)	64bit longs needed
 *	32 --> we could get away with a 32bit unsigned int to index and count
 *	64bit long words, but I rather stay with unsigned long for now.
 *	We probably should neither count nor point to bytes or long words
 *	directly, but either by bitnumber, or by page index and offset.
 * 1 << (35 - 12)
 *	23 --> we need that many 4KiB pages of bitmap.
 *	1 << (23 + 3) --> on a 64bit arch,
 *	we need 64 MiB to store the array of page pointers.
 *
 * Because I'm lazy, and because the resulting patch was too large, too ugly
 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
 * (1 << 32) bits * 4k storage.
 *

 * bitmap storage and IO:
 *	Bitmap is stored little endian on disk, and is kept little endian in
 *	core memory. Currently we still hold the full bitmap in core as long
 *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
 *	seems excessive.
 *
 *	We plan to reduce the amount of in-core bitmap pages by paging them in
 *	and out against their on-disk location as necessary, but need to make
 *	sure we don't cause too much meta data IO, and must not deadlock in
 *	tight memory situations. 
This needs some more work.80*/8182/*83* NOTE84* Access to the *bm_pages is protected by bm_lock.85* It is safe to read the other members within the lock.86*87* drbd_bm_set_bits is called from bio_endio callbacks,88* We may be called with irq already disabled,89* so we need spin_lock_irqsave().90* And we need the kmap_atomic.91*/92struct drbd_bitmap {93struct page **bm_pages;94spinlock_t bm_lock;9596/* see LIMITATIONS: above */9798unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */99unsigned long bm_bits;100size_t bm_words;101size_t bm_number_of_pages;102sector_t bm_dev_capacity;103struct mutex bm_change; /* serializes resize operations */104105wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */106107enum bm_flag bm_flags;108109/* debugging aid, in case we are still racy somewhere */110char *bm_why;111struct task_struct *bm_task;112};113114#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)115static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)116{117struct drbd_bitmap *b = mdev->bitmap;118if (!__ratelimit(&drbd_ratelimit_state))119return;120dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",121current == mdev->receiver.task ? "receiver" :122current == mdev->asender.task ? "asender" :123current == mdev->worker.task ? "worker" : current->comm,124func, b->bm_why ?: "?",125b->bm_task == mdev->receiver.task ? "receiver" :126b->bm_task == mdev->asender.task ? "asender" :127b->bm_task == mdev->worker.task ? "worker" : "?");128}129130void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)131{132struct drbd_bitmap *b = mdev->bitmap;133int trylock_failed;134135if (!b) {136dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");137return;138}139140trylock_failed = !mutex_trylock(&b->bm_change);141142if (trylock_failed) {143dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",144current == mdev->receiver.task ? "receiver" :145current == mdev->asender.task ? 
"asender" :146current == mdev->worker.task ? "worker" : current->comm,147why, b->bm_why ?: "?",148b->bm_task == mdev->receiver.task ? "receiver" :149b->bm_task == mdev->asender.task ? "asender" :150b->bm_task == mdev->worker.task ? "worker" : "?");151mutex_lock(&b->bm_change);152}153if (BM_LOCKED_MASK & b->bm_flags)154dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");155b->bm_flags |= flags & BM_LOCKED_MASK;156157b->bm_why = why;158b->bm_task = current;159}160161void drbd_bm_unlock(struct drbd_conf *mdev)162{163struct drbd_bitmap *b = mdev->bitmap;164if (!b) {165dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");166return;167}168169if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))170dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");171172b->bm_flags &= ~BM_LOCKED_MASK;173b->bm_why = NULL;174b->bm_task = NULL;175mutex_unlock(&b->bm_change);176}177178/* we store some "meta" info about our pages in page->private */179/* at a granularity of 4k storage per bitmap bit:180* one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks181* 1<<38 bits,182* 1<<23 4k bitmap pages.183* Use 24 bits as page index, covers 2 peta byte storage184* at a granularity of 4k per bit.185* Used to report the failed page idx on io error from the endio handlers.186*/187#define BM_PAGE_IDX_MASK ((1UL<<24)-1)188/* this page is currently read in, or written back */189#define BM_PAGE_IO_LOCK 31190/* if there has been an IO error for this page */191#define BM_PAGE_IO_ERROR 30192/* this is to be able to intelligently skip disk IO,193* set if bits have been set since last IO. */194#define BM_PAGE_NEED_WRITEOUT 29195/* to mark for lazy writeout once syncer cleared all clearable bits,196* we if bits have been cleared since last IO. */197#define BM_PAGE_LAZY_WRITEOUT 28198199/* store_page_idx uses non-atomic assignment. It is only used directly after200* allocating the page. 
All other bm_set_page_* and bm_clear_page_* need to201* use atomic bit manipulation, as set_out_of_sync (and therefore bitmap202* changes) may happen from various contexts, and wait_on_bit/wake_up_bit203* requires it all to be atomic as well. */204static void bm_store_page_idx(struct page *page, unsigned long idx)205{206BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));207page_private(page) |= idx;208}209210static unsigned long bm_page_to_idx(struct page *page)211{212return page_private(page) & BM_PAGE_IDX_MASK;213}214215/* As is very unlikely that the same page is under IO from more than one216* context, we can get away with a bit per page and one wait queue per bitmap.217*/218static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)219{220struct drbd_bitmap *b = mdev->bitmap;221void *addr = &page_private(b->bm_pages[page_nr]);222wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));223}224225static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)226{227struct drbd_bitmap *b = mdev->bitmap;228void *addr = &page_private(b->bm_pages[page_nr]);229clear_bit(BM_PAGE_IO_LOCK, addr);230smp_mb__after_clear_bit();231wake_up(&mdev->bitmap->bm_io_wait);232}233234/* set _before_ submit_io, so it may be reset due to being changed235* while this page is in flight... will get submitted later again */236static void bm_set_page_unchanged(struct page *page)237{238/* use cmpxchg? 
*/239clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));240clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));241}242243static void bm_set_page_need_writeout(struct page *page)244{245set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));246}247248static int bm_test_page_unchanged(struct page *page)249{250volatile const unsigned long *addr = &page_private(page);251return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;252}253254static void bm_set_page_io_err(struct page *page)255{256set_bit(BM_PAGE_IO_ERROR, &page_private(page));257}258259static void bm_clear_page_io_err(struct page *page)260{261clear_bit(BM_PAGE_IO_ERROR, &page_private(page));262}263264static void bm_set_page_lazy_writeout(struct page *page)265{266set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));267}268269static int bm_test_page_lazy_writeout(struct page *page)270{271return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));272}273274/* on a 32bit box, this would allow for exactly (2<<38) bits. 
*/275static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)276{277/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */278unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);279BUG_ON(page_nr >= b->bm_number_of_pages);280return page_nr;281}282283static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)284{285/* page_nr = (bitnr/8) >> PAGE_SHIFT; */286unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);287BUG_ON(page_nr >= b->bm_number_of_pages);288return page_nr;289}290291static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)292{293struct page *page = b->bm_pages[idx];294return (unsigned long *) kmap_atomic(page, km);295}296297static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)298{299return __bm_map_pidx(b, idx, KM_IRQ1);300}301302static void __bm_unmap(unsigned long *p_addr, const enum km_type km)303{304kunmap_atomic(p_addr, km);305};306307static void bm_unmap(unsigned long *p_addr)308{309return __bm_unmap(p_addr, KM_IRQ1);310}311312/* long word offset of _bitmap_ sector */313#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))314/* word offset from start of bitmap to word number _in_page_315* modulo longs per page316#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))317hm, well, Philipp thinks gcc might not optimize the % into & (... 
- 1)318so do it explicitly:319*/320#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))321322/* Long words per page */323#define LWPP (PAGE_SIZE/sizeof(long))324325/*326* actually most functions herein should take a struct drbd_bitmap*, not a327* struct drbd_conf*, but for the debug macros I like to have the mdev around328* to be able to report device specific.329*/330331332static void bm_free_pages(struct page **pages, unsigned long number)333{334unsigned long i;335if (!pages)336return;337338for (i = 0; i < number; i++) {339if (!pages[i]) {340printk(KERN_ALERT "drbd: bm_free_pages tried to free "341"a NULL pointer; i=%lu n=%lu\n",342i, number);343continue;344}345__free_page(pages[i]);346pages[i] = NULL;347}348}349350static void bm_vk_free(void *ptr, int v)351{352if (v)353vfree(ptr);354else355kfree(ptr);356}357358/*359* "have" and "want" are NUMBER OF PAGES.360*/361static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)362{363struct page **old_pages = b->bm_pages;364struct page **new_pages, *page;365unsigned int i, bytes, vmalloced = 0;366unsigned long have = b->bm_number_of_pages;367368BUG_ON(have == 0 && old_pages != NULL);369BUG_ON(have != 0 && old_pages == NULL);370371if (have == want)372return old_pages;373374/* Trying kmalloc first, falling back to vmalloc.375* GFP_KERNEL is ok, as this is done when a lower level disk is376* "attached" to the drbd. Context is receiver thread or cqueue377* thread. As we have no disk yet, we are not in the IO path,378* not even the IO path of the peer. 
*/379bytes = sizeof(struct page *)*want;380new_pages = kmalloc(bytes, GFP_KERNEL);381if (!new_pages) {382new_pages = vmalloc(bytes);383if (!new_pages)384return NULL;385vmalloced = 1;386}387388memset(new_pages, 0, bytes);389if (want >= have) {390for (i = 0; i < have; i++)391new_pages[i] = old_pages[i];392for (; i < want; i++) {393page = alloc_page(GFP_HIGHUSER);394if (!page) {395bm_free_pages(new_pages + have, i - have);396bm_vk_free(new_pages, vmalloced);397return NULL;398}399/* we want to know which page it is400* from the endio handlers */401bm_store_page_idx(page, i);402new_pages[i] = page;403}404} else {405for (i = 0; i < want; i++)406new_pages[i] = old_pages[i];407/* NOT HERE, we are outside the spinlock!408bm_free_pages(old_pages + want, have - want);409*/410}411412if (vmalloced)413b->bm_flags |= BM_P_VMALLOCED;414else415b->bm_flags &= ~BM_P_VMALLOCED;416417return new_pages;418}419420/*421* called on driver init only. TODO call when a device is created.422* allocates the drbd_bitmap, and stores it in mdev->bitmap.423*/424int drbd_bm_init(struct drbd_conf *mdev)425{426struct drbd_bitmap *b = mdev->bitmap;427WARN_ON(b != NULL);428b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);429if (!b)430return -ENOMEM;431spin_lock_init(&b->bm_lock);432mutex_init(&b->bm_change);433init_waitqueue_head(&b->bm_io_wait);434435mdev->bitmap = b;436437return 0;438}439440sector_t drbd_bm_capacity(struct drbd_conf *mdev)441{442ERR_IF(!mdev->bitmap) return 0;443return mdev->bitmap->bm_dev_capacity;444}445446/* called on driver unload. 
TODO: call when a device is destroyed.447*/448void drbd_bm_cleanup(struct drbd_conf *mdev)449{450ERR_IF (!mdev->bitmap) return;451bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);452bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));453kfree(mdev->bitmap);454mdev->bitmap = NULL;455}456457/*458* since (b->bm_bits % BITS_PER_LONG) != 0,459* this masks out the remaining bits.460* Returns the number of bits cleared.461*/462#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))463#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)464#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)465static int bm_clear_surplus(struct drbd_bitmap *b)466{467unsigned long mask;468unsigned long *p_addr, *bm;469int tmp;470int cleared = 0;471472/* number of bits modulo bits per page */473tmp = (b->bm_bits & BITS_PER_PAGE_MASK);474/* mask the used bits of the word containing the last bit */475mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;476/* bitmap is always stored little endian,477* on disk and in core memory alike */478mask = cpu_to_lel(mask);479480p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);481bm = p_addr + (tmp/BITS_PER_LONG);482if (mask) {483/* If mask != 0, we are not exactly aligned, so bm now points484* to the long containing the last bit.485* If mask == 0, bm already points to the word immediately486* after the last (long word aligned) bit. 
*/487cleared = hweight_long(*bm & ~mask);488*bm &= mask;489bm++;490}491492if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {493/* on a 32bit arch, we may need to zero out494* a padding long to align with a 64bit remote */495cleared += hweight_long(*bm);496*bm = 0;497}498bm_unmap(p_addr);499return cleared;500}501502static void bm_set_surplus(struct drbd_bitmap *b)503{504unsigned long mask;505unsigned long *p_addr, *bm;506int tmp;507508/* number of bits modulo bits per page */509tmp = (b->bm_bits & BITS_PER_PAGE_MASK);510/* mask the used bits of the word containing the last bit */511mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;512/* bitmap is always stored little endian,513* on disk and in core memory alike */514mask = cpu_to_lel(mask);515516p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);517bm = p_addr + (tmp/BITS_PER_LONG);518if (mask) {519/* If mask != 0, we are not exactly aligned, so bm now points520* to the long containing the last bit.521* If mask == 0, bm already points to the word immediately522* after the last (long word aligned) bit. 
*/523*bm |= ~mask;524bm++;525}526527if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {528/* on a 32bit arch, we may need to zero out529* a padding long to align with a 64bit remote */530*bm = ~0UL;531}532bm_unmap(p_addr);533}534535/* you better not modify the bitmap while this is running,536* or its results will be stale */537static unsigned long bm_count_bits(struct drbd_bitmap *b)538{539unsigned long *p_addr;540unsigned long bits = 0;541unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;542int idx, i, last_word;543544/* all but last page */545for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {546p_addr = __bm_map_pidx(b, idx, KM_USER0);547for (i = 0; i < LWPP; i++)548bits += hweight_long(p_addr[i]);549__bm_unmap(p_addr, KM_USER0);550cond_resched();551}552/* last (or only) page */553last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;554p_addr = __bm_map_pidx(b, idx, KM_USER0);555for (i = 0; i < last_word; i++)556bits += hweight_long(p_addr[i]);557p_addr[last_word] &= cpu_to_lel(mask);558bits += hweight_long(p_addr[last_word]);559/* 32bit arch, may have an unused padding long */560if (BITS_PER_LONG == 32 && (last_word & 1) == 0)561p_addr[last_word+1] = 0;562__bm_unmap(p_addr, KM_USER0);563return bits;564}565566/* offset and len in long words.*/567static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)568{569unsigned long *p_addr, *bm;570unsigned int idx;571size_t do_now, end;572573end = offset + len;574575if (end > b->bm_words) {576printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");577return;578}579580while (offset < end) {581do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;582idx = bm_word_to_page_idx(b, offset);583p_addr = bm_map_pidx(b, idx);584bm = p_addr + MLPP(offset);585if (bm+do_now > p_addr + LWPP) {586printk(KERN_ALERT "drbd: BUG BUG BUG! 
p_addr:%p bm:%p do_now:%d\n",587p_addr, bm, (int)do_now);588} else589memset(bm, c, do_now * sizeof(long));590bm_unmap(p_addr);591bm_set_page_need_writeout(b->bm_pages[idx]);592offset += do_now;593}594}595596/*597* make sure the bitmap has enough room for the attached storage,598* if necessary, resize.599* called whenever we may have changed the device size.600* returns -ENOMEM if we could not allocate enough memory, 0 on success.601* In case this is actually a resize, we copy the old bitmap into the new one.602* Otherwise, the bitmap is initialized to all bits set.603*/604int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)605{606struct drbd_bitmap *b = mdev->bitmap;607unsigned long bits, words, owords, obits;608unsigned long want, have, onpages; /* number of pages */609struct page **npages, **opages = NULL;610int err = 0, growing;611int opages_vmalloced;612613ERR_IF(!b) return -ENOMEM;614615drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);616617dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",618(unsigned long long)capacity);619620if (capacity == b->bm_dev_capacity)621goto out;622623opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);624625if (capacity == 0) {626spin_lock_irq(&b->bm_lock);627opages = b->bm_pages;628onpages = b->bm_number_of_pages;629owords = b->bm_words;630b->bm_pages = NULL;631b->bm_number_of_pages =632b->bm_set =633b->bm_bits =634b->bm_words =635b->bm_dev_capacity = 0;636spin_unlock_irq(&b->bm_lock);637bm_free_pages(opages, onpages);638bm_vk_free(opages, opages_vmalloced);639goto out;640}641bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));642643/* if we would use644words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;645a 32bit host could present the wrong number of words646to a 64bit host.647*/648words = ALIGN(bits, 64) >> LN2_BPL;649650if (get_ldev(mdev)) {651u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;652put_ldev(mdev);653if (bits > bits_on_disk) {654dev_info(DEV, "bits = %lu\n", 
bits);655dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);656err = -ENOSPC;657goto out;658}659}660661want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;662have = b->bm_number_of_pages;663if (want == have) {664D_ASSERT(b->bm_pages != NULL);665npages = b->bm_pages;666} else {667if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))668npages = NULL;669else670npages = bm_realloc_pages(b, want);671}672673if (!npages) {674err = -ENOMEM;675goto out;676}677678spin_lock_irq(&b->bm_lock);679opages = b->bm_pages;680owords = b->bm_words;681obits = b->bm_bits;682683growing = bits > obits;684if (opages && growing && set_new_bits)685bm_set_surplus(b);686687b->bm_pages = npages;688b->bm_number_of_pages = want;689b->bm_bits = bits;690b->bm_words = words;691b->bm_dev_capacity = capacity;692693if (growing) {694if (set_new_bits) {695bm_memset(b, owords, 0xff, words-owords);696b->bm_set += bits - obits;697} else698bm_memset(b, owords, 0x00, words-owords);699700}701702if (want < have) {703/* implicit: (opages != NULL) && (opages != npages) */704bm_free_pages(opages + want, have - want);705}706707(void)bm_clear_surplus(b);708709spin_unlock_irq(&b->bm_lock);710if (opages != npages)711bm_vk_free(opages, opages_vmalloced);712if (!growing)713b->bm_set = bm_count_bits(b);714dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);715716out:717drbd_bm_unlock(mdev);718return err;719}720721/* inherently racy:722* if not protected by other means, return value may be out of date when723* leaving this function...724* we still need to lock it, since it is important that this returns725* bm_set == 0 precisely.726*727* maybe bm_set should be atomic_t ?728*/729unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)730{731struct drbd_bitmap *b = mdev->bitmap;732unsigned long s;733unsigned long flags;734735ERR_IF(!b) return 0;736ERR_IF(!b->bm_pages) return 0;737738spin_lock_irqsave(&b->bm_lock, flags);739s = b->bm_set;740spin_unlock_irqrestore(&b->bm_lock, 
flags);741742return s;743}744745unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)746{747unsigned long s;748/* if I don't have a disk, I don't know about out-of-sync status */749if (!get_ldev_if_state(mdev, D_NEGOTIATING))750return 0;751s = _drbd_bm_total_weight(mdev);752put_ldev(mdev);753return s;754}755756size_t drbd_bm_words(struct drbd_conf *mdev)757{758struct drbd_bitmap *b = mdev->bitmap;759ERR_IF(!b) return 0;760ERR_IF(!b->bm_pages) return 0;761762return b->bm_words;763}764765unsigned long drbd_bm_bits(struct drbd_conf *mdev)766{767struct drbd_bitmap *b = mdev->bitmap;768ERR_IF(!b) return 0;769770return b->bm_bits;771}772773/* merge number words from buffer into the bitmap starting at offset.774* buffer[i] is expected to be little endian unsigned long.775* bitmap must be locked by drbd_bm_lock.776* currently only used from receive_bitmap.777*/778void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,779unsigned long *buffer)780{781struct drbd_bitmap *b = mdev->bitmap;782unsigned long *p_addr, *bm;783unsigned long word, bits;784unsigned int idx;785size_t end, do_now;786787end = offset + number;788789ERR_IF(!b) return;790ERR_IF(!b->bm_pages) return;791if (number == 0)792return;793WARN_ON(offset >= b->bm_words);794WARN_ON(end > b->bm_words);795796spin_lock_irq(&b->bm_lock);797while (offset < end) {798do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;799idx = bm_word_to_page_idx(b, offset);800p_addr = bm_map_pidx(b, idx);801bm = p_addr + MLPP(offset);802offset += do_now;803while (do_now--) {804bits = hweight_long(*bm);805word = *bm | *buffer++;806*bm++ = word;807b->bm_set += hweight_long(word) - bits;808}809bm_unmap(p_addr);810bm_set_page_need_writeout(b->bm_pages[idx]);811}812/* with 32bit <-> 64bit cross-platform connect813* this is only correct for current usage,814* where we _know_ that we are 64 bit aligned,815* and know that this function is used in this way, too...816*/817if (end == b->bm_words)818b->bm_set -= 
bm_clear_surplus(b);819spin_unlock_irq(&b->bm_lock);820}821822/* copy number words from the bitmap starting at offset into the buffer.823* buffer[i] will be little endian unsigned long.824*/825void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,826unsigned long *buffer)827{828struct drbd_bitmap *b = mdev->bitmap;829unsigned long *p_addr, *bm;830size_t end, do_now;831832end = offset + number;833834ERR_IF(!b) return;835ERR_IF(!b->bm_pages) return;836837spin_lock_irq(&b->bm_lock);838if ((offset >= b->bm_words) ||839(end > b->bm_words) ||840(number <= 0))841dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",842(unsigned long) offset,843(unsigned long) number,844(unsigned long) b->bm_words);845else {846while (offset < end) {847do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;848p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));849bm = p_addr + MLPP(offset);850offset += do_now;851while (do_now--)852*buffer++ = *bm++;853bm_unmap(p_addr);854}855}856spin_unlock_irq(&b->bm_lock);857}858859/* set all bits in the bitmap */860void drbd_bm_set_all(struct drbd_conf *mdev)861{862struct drbd_bitmap *b = mdev->bitmap;863ERR_IF(!b) return;864ERR_IF(!b->bm_pages) return;865866spin_lock_irq(&b->bm_lock);867bm_memset(b, 0, 0xff, b->bm_words);868(void)bm_clear_surplus(b);869b->bm_set = b->bm_bits;870spin_unlock_irq(&b->bm_lock);871}872873/* clear all bits in the bitmap */874void drbd_bm_clear_all(struct drbd_conf *mdev)875{876struct drbd_bitmap *b = mdev->bitmap;877ERR_IF(!b) return;878ERR_IF(!b->bm_pages) return;879880spin_lock_irq(&b->bm_lock);881bm_memset(b, 0, 0, b->bm_words);882b->bm_set = 0;883spin_unlock_irq(&b->bm_lock);884}885886struct bm_aio_ctx {887struct drbd_conf *mdev;888atomic_t in_flight;889struct completion done;890unsigned flags;891#define BM_AIO_COPY_PAGES 1892int error;893};894895/* bv_page may be a copy, or may be the original */896static void bm_async_io_complete(struct bio *bio, int error)897{898struct bm_aio_ctx *ctx = 
bio->bi_private;899struct drbd_conf *mdev = ctx->mdev;900struct drbd_bitmap *b = mdev->bitmap;901unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);902int uptodate = bio_flagged(bio, BIO_UPTODATE);903904905/* strange behavior of some lower level drivers...906* fail the request by clearing the uptodate flag,907* but do not return any error?!908* do we want to WARN() on this? */909if (!error && !uptodate)910error = -EIO;911912if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&913!bm_test_page_unchanged(b->bm_pages[idx]))914dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);915916if (error) {917/* ctx error will hold the completed-last non-zero error code,918* in case error codes differ. */919ctx->error = error;920bm_set_page_io_err(b->bm_pages[idx]);921/* Not identical to on disk version of it.922* Is BM_PAGE_IO_ERROR enough? */923if (__ratelimit(&drbd_ratelimit_state))924dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",925error, idx);926} else {927bm_clear_page_io_err(b->bm_pages[idx]);928dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);929}930931bm_page_unlock_io(mdev, idx);932933/* FIXME give back to page pool */934if (ctx->flags & BM_AIO_COPY_PAGES)935put_page(bio->bi_io_vec[0].bv_page);936937bio_put(bio);938939if (atomic_dec_and_test(&ctx->in_flight))940complete(&ctx->done);941}942943static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)944{945/* we are process context. 
we always get a bio */946struct bio *bio = bio_alloc(GFP_KERNEL, 1);947struct drbd_conf *mdev = ctx->mdev;948struct drbd_bitmap *b = mdev->bitmap;949struct page *page;950unsigned int len;951952sector_t on_disk_sector =953mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;954on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);955956/* this might happen with very small957* flexible external meta data device,958* or with PAGE_SIZE > 4k */959len = min_t(unsigned int, PAGE_SIZE,960(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);961962/* serialize IO on this page */963bm_page_lock_io(mdev, page_nr);964/* before memcpy and submit,965* so it can be redirtied any time */966bm_set_page_unchanged(b->bm_pages[page_nr]);967968if (ctx->flags & BM_AIO_COPY_PAGES) {969/* FIXME alloc_page is good enough for now, but actually needs970* to use pre-allocated page pool */971void *src, *dest;972page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);973dest = kmap_atomic(page, KM_USER0);974src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);975memcpy(dest, src, PAGE_SIZE);976kunmap_atomic(src, KM_USER1);977kunmap_atomic(dest, KM_USER0);978bm_store_page_idx(page, page_nr);979} else980page = b->bm_pages[page_nr];981982bio->bi_bdev = mdev->ldev->md_bdev;983bio->bi_sector = on_disk_sector;984bio_add_page(bio, page, len, 0);985bio->bi_private = ctx;986bio->bi_end_io = bm_async_io_complete;987988if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {989bio->bi_rw |= rw;990bio_endio(bio, -EIO);991} else {992submit_bio(rw, bio);993/* this should not count as user activity and cause the994* resync to throttle -- see drbd_rs_should_slow_down(). 
*/995atomic_add(len >> 9, &mdev->rs_sect_ev);996}997}998999/*1000* bm_rw: read/write the whole bitmap from/to its on disk location.1001*/1002static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)1003{1004struct bm_aio_ctx ctx = {1005.mdev = mdev,1006.in_flight = ATOMIC_INIT(1),1007.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),1008.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,1009};1010struct drbd_bitmap *b = mdev->bitmap;1011int num_pages, i, count = 0;1012unsigned long now;1013char ppb[10];1014int err = 0;10151016/*1017* We are protected against bitmap disappearing/resizing by holding an1018* ldev reference (caller must have called get_ldev()).1019* For read/write, we are protected against changes to the bitmap by1020* the bitmap lock (see drbd_bitmap_io).1021* For lazy writeout, we don't care for ongoing changes to the bitmap,1022* as we submit copies of pages anyways.1023*/1024if (!ctx.flags)1025WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));10261027num_pages = b->bm_number_of_pages;10281029now = jiffies;10301031/* let the layers below us try to merge these bios... */1032for (i = 0; i < num_pages; i++) {1033/* ignore completely unchanged pages */1034if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)1035break;1036if (rw & WRITE) {1037if (bm_test_page_unchanged(b->bm_pages[i])) {1038dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);1039continue;1040}1041/* during lazy writeout,1042* ignore those pages not marked for lazy writeout. */1043if (lazy_writeout_upper_idx &&1044!bm_test_page_lazy_writeout(b->bm_pages[i])) {1045dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);1046continue;1047}1048}1049atomic_inc(&ctx.in_flight);1050bm_page_io_async(&ctx, i, rw);1051++count;1052cond_resched();1053}10541055/*1056* We initialize ctx.in_flight to one to make sure bm_async_io_complete1057* will not complete() early, and decrement / test it here. 
If there1058* are still some bios in flight, we need to wait for them here.1059*/1060if (!atomic_dec_and_test(&ctx.in_flight))1061wait_for_completion(&ctx.done);1062dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",1063rw == WRITE ? "WRITE" : "READ",1064count, jiffies - now);10651066if (ctx.error) {1067dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");1068drbd_chk_io_error(mdev, 1, true);1069err = -EIO; /* ctx.error ? */1070}10711072now = jiffies;1073if (rw == WRITE) {1074drbd_md_flush(mdev);1075} else /* rw == READ */ {1076b->bm_set = bm_count_bits(b);1077dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",1078jiffies - now);1079}1080now = b->bm_set;10811082dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",1083ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);10841085return err;1086}10871088/**1089* drbd_bm_read() - Read the whole bitmap from its on disk location.1090* @mdev: DRBD device.1091*/1092int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)1093{1094return bm_rw(mdev, READ, 0);1095}10961097/**1098* drbd_bm_write() - Write the whole bitmap to its on disk location.1099* @mdev: DRBD device.1100*1101* Will only write pages that have changed since last IO.1102*/1103int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)1104{1105return bm_rw(mdev, WRITE, 0);1106}11071108/**1109* drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.1110* @mdev: DRBD device.1111* @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages1112*/1113int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)1114{1115return bm_rw(mdev, WRITE, upper_idx);1116}111711181119/**1120* drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap1121* @mdev: DRBD device.1122* @idx: bitmap page index1123*1124* We don't want to special case on logical_block_size of the backend device,1125* so we submit PAGE_SIZE aligned pieces.1126* 
Note that on "most" systems, PAGE_SIZE is 4k.1127*1128* In case this becomes an issue on systems with larger PAGE_SIZE,1129* we may want to change this again to write 4k aligned 4k pieces.1130*/1131int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)1132{1133struct bm_aio_ctx ctx = {1134.mdev = mdev,1135.in_flight = ATOMIC_INIT(1),1136.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),1137.flags = BM_AIO_COPY_PAGES,1138};11391140if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {1141dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);1142return 0;1143}11441145bm_page_io_async(&ctx, idx, WRITE_SYNC);1146wait_for_completion(&ctx.done);11471148if (ctx.error)1149drbd_chk_io_error(mdev, 1, true);1150/* that should force detach, so the in memory bitmap will be1151* gone in a moment as well. */11521153mdev->bm_writ_cnt++;1154return ctx.error;1155}11561157/* NOTE1158* find_first_bit returns int, we return unsigned long.1159* For this to work on 32bit arch with bitnumbers > (1<<32),1160* we'd need to return u64, and get a whole lot of other places1161* fixed where we still use unsigned long.1162*1163* this returns a bit number, NOT a sector!1164*/1165static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,1166const int find_zero_bit, const enum km_type km)1167{1168struct drbd_bitmap *b = mdev->bitmap;1169unsigned long *p_addr;1170unsigned long bit_offset;1171unsigned i;117211731174if (bm_fo > b->bm_bits) {1175dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);1176bm_fo = DRBD_END_OF_BITMAP;1177} else {1178while (bm_fo < b->bm_bits) {1179/* bit offset of the first bit in the page */1180bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;1181p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);11821183if (find_zero_bit)1184i = find_next_zero_bit_le(p_addr,1185PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);1186else1187i = find_next_bit_le(p_addr,1188PAGE_SIZE*8, bm_fo & 
BITS_PER_PAGE_MASK);11891190__bm_unmap(p_addr, km);1191if (i < PAGE_SIZE*8) {1192bm_fo = bit_offset + i;1193if (bm_fo >= b->bm_bits)1194break;1195goto found;1196}1197bm_fo = bit_offset + PAGE_SIZE*8;1198}1199bm_fo = DRBD_END_OF_BITMAP;1200}1201found:1202return bm_fo;1203}12041205static unsigned long bm_find_next(struct drbd_conf *mdev,1206unsigned long bm_fo, const int find_zero_bit)1207{1208struct drbd_bitmap *b = mdev->bitmap;1209unsigned long i = DRBD_END_OF_BITMAP;12101211ERR_IF(!b) return i;1212ERR_IF(!b->bm_pages) return i;12131214spin_lock_irq(&b->bm_lock);1215if (BM_DONT_TEST & b->bm_flags)1216bm_print_lock_info(mdev);12171218i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);12191220spin_unlock_irq(&b->bm_lock);1221return i;1222}12231224unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)1225{1226return bm_find_next(mdev, bm_fo, 0);1227}12281229#if 01230/* not yet needed for anything. */1231unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)1232{1233return bm_find_next(mdev, bm_fo, 1);1234}1235#endif12361237/* does not spin_lock_irqsave.1238* you must take drbd_bm_lock() first */1239unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)1240{1241/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */1242return __bm_find_next(mdev, bm_fo, 0, KM_USER1);1243}12441245unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)1246{1247/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */1248return __bm_find_next(mdev, bm_fo, 1, KM_USER1);1249}12501251/* returns number of bits actually changed.1252* for val != 0, we change 0 -> 1, return code positive1253* for val == 0, we change 1 -> 0, return code negative1254* wants bitnr, not sector.1255* expected to be called for only a few bits (e - s about BITS_PER_LONG).1256* Must hold bitmap lock already. 
*/1257static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,1258unsigned long e, int val)1259{1260struct drbd_bitmap *b = mdev->bitmap;1261unsigned long *p_addr = NULL;1262unsigned long bitnr;1263unsigned int last_page_nr = -1U;1264int c = 0;1265int changed_total = 0;12661267if (e >= b->bm_bits) {1268dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",1269s, e, b->bm_bits);1270e = b->bm_bits ? b->bm_bits -1 : 0;1271}1272for (bitnr = s; bitnr <= e; bitnr++) {1273unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);1274if (page_nr != last_page_nr) {1275if (p_addr)1276__bm_unmap(p_addr, KM_IRQ1);1277if (c < 0)1278bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);1279else if (c > 0)1280bm_set_page_need_writeout(b->bm_pages[last_page_nr]);1281changed_total += c;1282c = 0;1283p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);1284last_page_nr = page_nr;1285}1286if (val)1287c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));1288else1289c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));1290}1291if (p_addr)1292__bm_unmap(p_addr, KM_IRQ1);1293if (c < 0)1294bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);1295else if (c > 0)1296bm_set_page_need_writeout(b->bm_pages[last_page_nr]);1297changed_total += c;1298b->bm_set += changed_total;1299return changed_total;1300}13011302/* returns number of bits actually changed.1303* for val != 0, we change 0 -> 1, return code positive1304* for val == 0, we change 1 -> 0, return code negative1305* wants bitnr, not sector */1306static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,1307const unsigned long e, int val)1308{1309unsigned long flags;1310struct drbd_bitmap *b = mdev->bitmap;1311int c = 0;13121313ERR_IF(!b) return 1;1314ERR_IF(!b->bm_pages) return 0;13151316spin_lock_irqsave(&b->bm_lock, flags);1317if ((val ? 
BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)1318bm_print_lock_info(mdev);13191320c = __bm_change_bits_to(mdev, s, e, val);13211322spin_unlock_irqrestore(&b->bm_lock, flags);1323return c;1324}13251326/* returns number of bits changed 0 -> 1 */1327int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)1328{1329return bm_change_bits_to(mdev, s, e, 1);1330}13311332/* returns number of bits changed 1 -> 0 */1333int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)1334{1335return -bm_change_bits_to(mdev, s, e, 0);1336}13371338/* sets all bits in full words,1339* from first_word up to, but not including, last_word */1340static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,1341int page_nr, int first_word, int last_word)1342{1343int i;1344int bits;1345unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);1346for (i = first_word; i < last_word; i++) {1347bits = hweight_long(paddr[i]);1348paddr[i] = ~0UL;1349b->bm_set += BITS_PER_LONG - bits;1350}1351kunmap_atomic(paddr, KM_IRQ1);1352}13531354/* Same thing as drbd_bm_set_bits,1355* but more efficient for a large bit range.1356* You must first drbd_bm_lock().1357* Can be called to set the whole bitmap in one go.1358* Sets bits from s to e _inclusive_. 
*/1359void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)1360{1361/* First set_bit from the first bit (s)1362* up to the next long boundary (sl),1363* then assign full words up to the last long boundary (el),1364* then set_bit up to and including the last bit (e).1365*1366* Do not use memset, because we must account for changes,1367* so we need to loop over the words with hweight() anyways.1368*/1369struct drbd_bitmap *b = mdev->bitmap;1370unsigned long sl = ALIGN(s,BITS_PER_LONG);1371unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);1372int first_page;1373int last_page;1374int page_nr;1375int first_word;1376int last_word;13771378if (e - s <= 3*BITS_PER_LONG) {1379/* don't bother; el and sl may even be wrong. */1380spin_lock_irq(&b->bm_lock);1381__bm_change_bits_to(mdev, s, e, 1);1382spin_unlock_irq(&b->bm_lock);1383return;1384}13851386/* difference is large enough that we can trust sl and el */13871388spin_lock_irq(&b->bm_lock);13891390/* bits filling the current long */1391if (sl)1392__bm_change_bits_to(mdev, s, sl-1, 1);13931394first_page = sl >> (3 + PAGE_SHIFT);1395last_page = el >> (3 + PAGE_SHIFT);13961397/* MLPP: modulo longs per page */1398/* LWPP: long words per page */1399first_word = MLPP(sl >> LN2_BPL);1400last_word = LWPP;14011402/* first and full pages, unless first page == last page */1403for (page_nr = first_page; page_nr < last_page; page_nr++) {1404bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);1405spin_unlock_irq(&b->bm_lock);1406cond_resched();1407first_word = 0;1408spin_lock_irq(&b->bm_lock);1409}14101411/* last page (respectively only page, for first page == last page) */1412last_word = MLPP(el >> LN2_BPL);1413bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);14141415/* possibly trailing bits.1416* example: (e & 63) == 63, el will be e+1.1417* if that even was the very last bit,1418* it would trigger an assert in 
__bm_change_bits_to()1419*/1420if (el <= e)1421__bm_change_bits_to(mdev, el, e, 1);1422spin_unlock_irq(&b->bm_lock);1423}14241425/* returns bit state1426* wants bitnr, NOT sector.1427* inherently racy... area needs to be locked by means of {al,rs}_lru1428* 1 ... bit set1429* 0 ... bit not set1430* -1 ... first out of bounds access, stop testing for bits!1431*/1432int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)1433{1434unsigned long flags;1435struct drbd_bitmap *b = mdev->bitmap;1436unsigned long *p_addr;1437int i;14381439ERR_IF(!b) return 0;1440ERR_IF(!b->bm_pages) return 0;14411442spin_lock_irqsave(&b->bm_lock, flags);1443if (BM_DONT_TEST & b->bm_flags)1444bm_print_lock_info(mdev);1445if (bitnr < b->bm_bits) {1446p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));1447i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;1448bm_unmap(p_addr);1449} else if (bitnr == b->bm_bits) {1450i = -1;1451} else { /* (bitnr > b->bm_bits) */1452dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);1453i = 0;1454}14551456spin_unlock_irqrestore(&b->bm_lock, flags);1457return i;1458}14591460/* returns number of bits set in the range [s, e] */1461int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)1462{1463unsigned long flags;1464struct drbd_bitmap *b = mdev->bitmap;1465unsigned long *p_addr = NULL;1466unsigned long bitnr;1467unsigned int page_nr = -1U;1468int c = 0;14691470/* If this is called without a bitmap, that is a bug. 
But just to be1471* robust in case we screwed up elsewhere, in that case pretend there1472* was one dirty bit in the requested area, so we won't try to do a1473* local read there (no bitmap probably implies no disk) */1474ERR_IF(!b) return 1;1475ERR_IF(!b->bm_pages) return 1;14761477spin_lock_irqsave(&b->bm_lock, flags);1478if (BM_DONT_TEST & b->bm_flags)1479bm_print_lock_info(mdev);1480for (bitnr = s; bitnr <= e; bitnr++) {1481unsigned int idx = bm_bit_to_page_idx(b, bitnr);1482if (page_nr != idx) {1483page_nr = idx;1484if (p_addr)1485bm_unmap(p_addr);1486p_addr = bm_map_pidx(b, idx);1487}1488ERR_IF (bitnr >= b->bm_bits) {1489dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);1490} else {1491c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));1492}1493}1494if (p_addr)1495bm_unmap(p_addr);1496spin_unlock_irqrestore(&b->bm_lock, flags);1497return c;1498}149915001501/* inherently racy...1502* return value may be already out-of-date when this function returns.1503* but the general usage is that this is only use during a cstate when bits are1504* only cleared, not set, and typically only care for the case when the return1505* value is zero, or we already "locked" this "bitmap extent" by other means.1506*1507* enr is bm-extent number, since we chose to name one sector (512 bytes)1508* worth of the bitmap a "bitmap extent".1509*1510* TODO1511* I think since we use it like a reference count, we should use the real1512* reference count of some bitmap extent element from some lru instead...1513*1514*/1515int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)1516{1517struct drbd_bitmap *b = mdev->bitmap;1518int count, s, e;1519unsigned long flags;1520unsigned long *p_addr, *bm;15211522ERR_IF(!b) return 0;1523ERR_IF(!b->bm_pages) return 0;15241525spin_lock_irqsave(&b->bm_lock, flags);1526if (BM_DONT_TEST & b->bm_flags)1527bm_print_lock_info(mdev);15281529s = S2W(enr);1530e = min((size_t)S2W(enr+1), b->bm_words);1531count = 0;1532if (s < 
b->bm_words) {1533int n = e-s;1534p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));1535bm = p_addr + MLPP(s);1536while (n--)1537count += hweight_long(*bm++);1538bm_unmap(p_addr);1539} else {1540dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);1541}1542spin_unlock_irqrestore(&b->bm_lock, flags);1543return count;1544}15451546/* Set all bits covered by the AL-extent al_enr.1547* Returns number of bits changed. */1548unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)1549{1550struct drbd_bitmap *b = mdev->bitmap;1551unsigned long *p_addr, *bm;1552unsigned long weight;1553unsigned long s, e;1554int count, i, do_now;1555ERR_IF(!b) return 0;1556ERR_IF(!b->bm_pages) return 0;15571558spin_lock_irq(&b->bm_lock);1559if (BM_DONT_SET & b->bm_flags)1560bm_print_lock_info(mdev);1561weight = b->bm_set;15621563s = al_enr * BM_WORDS_PER_AL_EXT;1564e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);1565/* assert that s and e are on the same page */1566D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)1567== s >> (PAGE_SHIFT - LN2_BPL + 3));1568count = 0;1569if (s < b->bm_words) {1570i = do_now = e-s;1571p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));1572bm = p_addr + MLPP(s);1573while (i--) {1574count += hweight_long(*bm);1575*bm = -1UL;1576bm++;1577}1578bm_unmap(p_addr);1579b->bm_set += do_now*BITS_PER_LONG - count;1580if (e == b->bm_words)1581b->bm_set -= bm_clear_surplus(b);1582} else {1583dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);1584}1585weight = b->bm_set - weight;1586spin_unlock_irq(&b->bm_lock);1587return weight;1588}158915901591