/* drivers/block/drbd/drbd_actlog.c */
/*1drbd_actlog.c23This file is part of DRBD by Philipp Reisner and Lars Ellenberg.45Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.6Copyright (C) 2003-2008, Philipp Reisner <[email protected]>.7Copyright (C) 2003-2008, Lars Ellenberg <[email protected]>.89drbd is free software; you can redistribute it and/or modify10it under the terms of the GNU General Public License as published by11the Free Software Foundation; either version 2, or (at your option)12any later version.1314drbd is distributed in the hope that it will be useful,15but WITHOUT ANY WARRANTY; without even the implied warranty of16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the17GNU General Public License for more details.1819You should have received a copy of the GNU General Public License20along with drbd; see the file COPYING. If not, write to21the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.2223*/2425#include <linux/slab.h>26#include <linux/drbd.h>27#include "drbd_int.h"28#include "drbd_wrappers.h"2930/* We maintain a trivial checksum in our on disk activity log.31* With that we can ensure correct operation even when the storage32* device might do a partial (last) sector write while losing power.33*/34struct __packed al_transaction {35u32 magic;36u32 tr_number;37struct __packed {38u32 pos;39u32 extent; } updates[1 + AL_EXTENTS_PT];40u32 xor_sum;41};4243struct update_odbm_work {44struct drbd_work w;45unsigned int enr;46};4748struct update_al_work {49struct drbd_work w;50struct lc_element *al_ext;51struct completion event;52unsigned int enr;53/* if old_enr != LC_FREE, write corresponding bitmap sector, too */54unsigned int old_enr;55};5657struct drbd_atodb_wait {58atomic_t count;59struct completion io_done;60struct drbd_conf *mdev;61int error;62};636465int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);6667static int _drbd_md_sync_page_io(struct drbd_conf *mdev,68struct drbd_backing_dev *bdev,69struct page *page, sector_t 
sector,70int rw, int size)71{72struct bio *bio;73struct drbd_md_io md_io;74int ok;7576md_io.mdev = mdev;77init_completion(&md_io.event);78md_io.error = 0;7980if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))81rw |= REQ_FUA | REQ_FLUSH;82rw |= REQ_SYNC;8384bio = bio_alloc(GFP_NOIO, 1);85bio->bi_bdev = bdev->md_bdev;86bio->bi_sector = sector;87ok = (bio_add_page(bio, page, size, 0) == size);88if (!ok)89goto out;90bio->bi_private = &md_io;91bio->bi_end_io = drbd_md_io_complete;92bio->bi_rw = rw;9394if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))95bio_endio(bio, -EIO);96else97submit_bio(rw, bio);98wait_for_completion(&md_io.event);99ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;100101out:102bio_put(bio);103return ok;104}105106int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,107sector_t sector, int rw)108{109int logical_block_size, mask, ok;110int offset = 0;111struct page *iop = mdev->md_io_page;112113D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));114115BUG_ON(!bdev->md_bdev);116117logical_block_size = bdev_logical_block_size(bdev->md_bdev);118if (logical_block_size == 0)119logical_block_size = MD_SECTOR_SIZE;120121/* in case logical_block_size != 512 [ s390 only? 
] */122if (logical_block_size != MD_SECTOR_SIZE) {123mask = (logical_block_size / MD_SECTOR_SIZE) - 1;124D_ASSERT(mask == 1 || mask == 3 || mask == 7);125D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);126offset = sector & mask;127sector = sector & ~mask;128iop = mdev->md_io_tmpp;129130if (rw & WRITE) {131/* these are GFP_KERNEL pages, pre-allocated132* on device initialization */133void *p = page_address(mdev->md_io_page);134void *hp = page_address(mdev->md_io_tmpp);135136ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,137READ, logical_block_size);138139if (unlikely(!ok)) {140dev_err(DEV, "drbd_md_sync_page_io(,%llus,"141"READ [logical_block_size!=512]) failed!\n",142(unsigned long long)sector);143return 0;144}145146memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);147}148}149150if (sector < drbd_md_first_sector(bdev) ||151sector > drbd_md_last_sector(bdev))152dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",153current->comm, current->pid, __func__,154(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");155156ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);157if (unlikely(!ok)) {158dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",159(unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ");160return 0;161}162163if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {164void *p = page_address(mdev->md_io_page);165void *hp = page_address(mdev->md_io_tmpp);166167memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);168}169170return ok;171}172173static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)174{175struct lc_element *al_ext;176struct lc_element *tmp;177unsigned long al_flags = 0;178int wake;179180spin_lock_irq(&mdev->al_lock);181tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);182if (unlikely(tmp != NULL)) {183struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);184if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {185wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);186spin_unlock_irq(&mdev->al_lock);187if (wake)188wake_up(&mdev->al_wait);189return NULL;190}191}192al_ext = lc_get(mdev->act_log, enr);193al_flags = mdev->act_log->flags;194spin_unlock_irq(&mdev->al_lock);195196/*197if (!al_ext) {198if (al_flags & LC_STARVING)199dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");200if (al_flags & LC_DIRTY)201dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");202}203*/204205return al_ext;206}207208void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)209{210unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));211struct lc_element *al_ext;212struct update_al_work al_work;213214D_ASSERT(atomic_read(&mdev->local_cnt) > 0);215216wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));217218if (al_ext->lc_number != enr) {219/* drbd_al_write_transaction(mdev,al_ext,enr);220* recurses into generic_make_request(), which221* disallows recursion, bios being serialized on the222* current->bio_tail list now.223* we have to delegate updates to the activity log224* to the worker thread. 
*/225init_completion(&al_work.event);226al_work.al_ext = al_ext;227al_work.enr = enr;228al_work.old_enr = al_ext->lc_number;229al_work.w.cb = w_al_write_transaction;230drbd_queue_work_front(&mdev->data.work, &al_work.w);231wait_for_completion(&al_work.event);232233mdev->al_writ_cnt++;234235spin_lock_irq(&mdev->al_lock);236lc_changed(mdev->act_log, al_ext);237spin_unlock_irq(&mdev->al_lock);238wake_up(&mdev->al_wait);239}240}241242void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)243{244unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));245struct lc_element *extent;246unsigned long flags;247248spin_lock_irqsave(&mdev->al_lock, flags);249250extent = lc_find(mdev->act_log, enr);251252if (!extent) {253spin_unlock_irqrestore(&mdev->al_lock, flags);254dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);255return;256}257258if (lc_put(mdev->act_log, extent) == 0)259wake_up(&mdev->al_wait);260261spin_unlock_irqrestore(&mdev->al_lock, flags);262}263264#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)265/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT266* are still coupled, or assume too much about their relation.267* Code below will not work if this is violated.268* Will be cleaned up with some followup patch.269*/270# error FIXME271#endif272273static unsigned int al_extent_to_bm_page(unsigned int al_enr)274{275return al_enr >>276/* bit to page */277((PAGE_SHIFT + 3) -278/* al extent number to bit */279(AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));280}281282static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)283{284return rs_enr >>285/* bit to page */286((PAGE_SHIFT + 3) -287/* al extent number to bit */288(BM_EXT_SHIFT - BM_BLOCK_SHIFT));289}290291int292w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)293{294struct update_al_work *aw = container_of(w, struct update_al_work, w);295struct lc_element *updated = aw->al_ext;296const unsigned int new_enr = aw->enr;297const unsigned int evicted = 
aw->old_enr;298struct al_transaction *buffer;299sector_t sector;300int i, n, mx;301unsigned int extent_nr;302u32 xor_sum = 0;303304if (!get_ldev(mdev)) {305dev_err(DEV,306"disk is %s, cannot start al transaction (-%d +%d)\n",307drbd_disk_str(mdev->state.disk), evicted, new_enr);308complete(&((struct update_al_work *)w)->event);309return 1;310}311/* do we have to do a bitmap write, first?312* TODO reduce maximum latency:313* submit both bios, then wait for both,314* instead of doing two synchronous sector writes.315* For now, we must not write the transaction,316* if we cannot write out the bitmap of the evicted extent. */317if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)318drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));319320/* The bitmap write may have failed, causing a state change. */321if (mdev->state.disk < D_INCONSISTENT) {322dev_err(DEV,323"disk is %s, cannot write al transaction (-%d +%d)\n",324drbd_disk_str(mdev->state.disk), evicted, new_enr);325complete(&((struct update_al_work *)w)->event);326put_ldev(mdev);327return 1;328}329330mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... 
*/331buffer = (struct al_transaction *)page_address(mdev->md_io_page);332333buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);334buffer->tr_number = cpu_to_be32(mdev->al_tr_number);335336n = lc_index_of(mdev->act_log, updated);337338buffer->updates[0].pos = cpu_to_be32(n);339buffer->updates[0].extent = cpu_to_be32(new_enr);340341xor_sum ^= new_enr;342343mx = min_t(int, AL_EXTENTS_PT,344mdev->act_log->nr_elements - mdev->al_tr_cycle);345for (i = 0; i < mx; i++) {346unsigned idx = mdev->al_tr_cycle + i;347extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;348buffer->updates[i+1].pos = cpu_to_be32(idx);349buffer->updates[i+1].extent = cpu_to_be32(extent_nr);350xor_sum ^= extent_nr;351}352for (; i < AL_EXTENTS_PT; i++) {353buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);354buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);355xor_sum ^= LC_FREE;356}357mdev->al_tr_cycle += AL_EXTENTS_PT;358if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)359mdev->al_tr_cycle = 0;360361buffer->xor_sum = cpu_to_be32(xor_sum);362363sector = mdev->ldev->md.md_offset364+ mdev->ldev->md.al_offset + mdev->al_tr_pos;365366if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))367drbd_chk_io_error(mdev, 1, true);368369if (++mdev->al_tr_pos >370div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))371mdev->al_tr_pos = 0;372373D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);374mdev->al_tr_number++;375376mutex_unlock(&mdev->md_io_mutex);377378complete(&((struct update_al_work *)w)->event);379put_ldev(mdev);380381return 1;382}383384/**385* drbd_al_read_tr() - Read a single transaction from the on disk activity log386* @mdev: DRBD device.387* @bdev: Block device to read form.388* @b: pointer to an al_transaction.389* @index: On disk slot of the transaction to read.390*391* Returns -1 on IO error, 0 on checksum error and 1 upon success.392*/393static int drbd_al_read_tr(struct drbd_conf *mdev,394struct drbd_backing_dev *bdev,395struct al_transaction *b,396int 
index)397{398sector_t sector;399int rv, i;400u32 xor_sum = 0;401402sector = bdev->md.md_offset + bdev->md.al_offset + index;403404/* Dont process error normally,405* as this is done before disk is attached! */406if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))407return -1;408409rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);410411for (i = 0; i < AL_EXTENTS_PT + 1; i++)412xor_sum ^= be32_to_cpu(b->updates[i].extent);413rv &= (xor_sum == be32_to_cpu(b->xor_sum));414415return rv;416}417418/**419* drbd_al_read_log() - Restores the activity log from its on disk representation.420* @mdev: DRBD device.421* @bdev: Block device to read form.422*423* Returns 1 on success, returns 0 when reading the log failed due to IO errors.424*/425int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)426{427struct al_transaction *buffer;428int i;429int rv;430int mx;431int active_extents = 0;432int transactions = 0;433int found_valid = 0;434int from = 0;435int to = 0;436u32 from_tnr = 0;437u32 to_tnr = 0;438u32 cnr;439440mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);441442/* lock out all other meta data io for now,443* and make sure the page is mapped.444*/445mutex_lock(&mdev->md_io_mutex);446buffer = page_address(mdev->md_io_page);447448/* Find the valid transaction in the log */449for (i = 0; i <= mx; i++) {450rv = drbd_al_read_tr(mdev, bdev, buffer, i);451if (rv == 0)452continue;453if (rv == -1) {454mutex_unlock(&mdev->md_io_mutex);455return 0;456}457cnr = be32_to_cpu(buffer->tr_number);458459if (++found_valid == 1) {460from = i;461to = i;462from_tnr = cnr;463to_tnr = cnr;464continue;465}466if ((int)cnr - (int)from_tnr < 0) {467D_ASSERT(from_tnr - cnr + i - from == mx+1);468from = i;469from_tnr = cnr;470}471if ((int)cnr - (int)to_tnr > 0) {472D_ASSERT(cnr - to_tnr == i - to);473to = i;474to_tnr = cnr;475}476}477478if (!found_valid) {479dev_warn(DEV, "No usable activity log found.\n");480mutex_unlock(&mdev->md_io_mutex);481return 1;482}483484/* Read the 
valid transactions.485* dev_info(DEV, "Reading from %d to %d.\n",from,to); */486i = from;487while (1) {488int j, pos;489unsigned int extent_nr;490unsigned int trn;491492rv = drbd_al_read_tr(mdev, bdev, buffer, i);493ERR_IF(rv == 0) goto cancel;494if (rv == -1) {495mutex_unlock(&mdev->md_io_mutex);496return 0;497}498499trn = be32_to_cpu(buffer->tr_number);500501spin_lock_irq(&mdev->al_lock);502503/* This loop runs backwards because in the cyclic504elements there might be an old version of the505updated element (in slot 0). So the element in slot 0506can overwrite old versions. */507for (j = AL_EXTENTS_PT; j >= 0; j--) {508pos = be32_to_cpu(buffer->updates[j].pos);509extent_nr = be32_to_cpu(buffer->updates[j].extent);510511if (extent_nr == LC_FREE)512continue;513514lc_set(mdev->act_log, extent_nr, pos);515active_extents++;516}517spin_unlock_irq(&mdev->al_lock);518519transactions++;520521cancel:522if (i == to)523break;524i++;525if (i > mx)526i = 0;527}528529mdev->al_tr_number = to_tnr+1;530mdev->al_tr_pos = to;531if (++mdev->al_tr_pos >532div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))533mdev->al_tr_pos = 0;534535/* ok, we are done with it */536mutex_unlock(&mdev->md_io_mutex);537538dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",539transactions, active_extents);540541return 1;542}543544/**545* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents546* @mdev: DRBD device.547*/548void drbd_al_apply_to_bm(struct drbd_conf *mdev)549{550unsigned int enr;551unsigned long add = 0;552char ppb[10];553int i, tmp;554555wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));556557for (i = 0; i < mdev->act_log->nr_elements; i++) {558enr = lc_element_by_index(mdev->act_log, i)->lc_number;559if (enr == LC_FREE)560continue;561tmp = drbd_bm_ALe_set_all(mdev, enr);562dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);563add += 
tmp;564}565566lc_unlock(mdev->act_log);567wake_up(&mdev->al_wait);568569dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",570ppsize(ppb, Bit2KB(add)));571}572573static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)574{575int rv;576577spin_lock_irq(&mdev->al_lock);578rv = (al_ext->refcnt == 0);579if (likely(rv))580lc_del(mdev->act_log, al_ext);581spin_unlock_irq(&mdev->al_lock);582583return rv;584}585586/**587* drbd_al_shrink() - Removes all active extents form the activity log588* @mdev: DRBD device.589*590* Removes all active extents form the activity log, waiting until591* the reference count of each entry dropped to 0 first, of course.592*593* You need to lock mdev->act_log with lc_try_lock() / lc_unlock()594*/595void drbd_al_shrink(struct drbd_conf *mdev)596{597struct lc_element *al_ext;598int i;599600D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));601602for (i = 0; i < mdev->act_log->nr_elements; i++) {603al_ext = lc_element_by_index(mdev->act_log, i);604if (al_ext->lc_number == LC_FREE)605continue;606wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));607}608609wake_up(&mdev->al_wait);610}611612static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)613{614struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);615616if (!get_ldev(mdev)) {617if (__ratelimit(&drbd_ratelimit_state))618dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");619kfree(udw);620return 1;621}622623drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));624put_ldev(mdev);625626kfree(udw);627628if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {629switch (mdev->state.conn) {630case C_SYNC_SOURCE: case C_SYNC_TARGET:631case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:632drbd_resync_finished(mdev);633default:634/* nothing to do */635break;636}637}638drbd_bcast_sync_progress(mdev);639640return 1;641}642643644/* ATTENTION. 
The AL's extents are 4MB each, while the extents in the645* resync LRU-cache are 16MB each.646* The caller of this function has to hold an get_ldev() reference.647*648* TODO will be obsoleted once we have a caching lru of the on disk bitmap649*/650static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,651int count, int success)652{653struct lc_element *e;654struct update_odbm_work *udw;655656unsigned int enr;657658D_ASSERT(atomic_read(&mdev->local_cnt));659660/* I simply assume that a sector/size pair never crosses661* a 16 MB extent border. (Currently this is true...) */662enr = BM_SECT_TO_EXT(sector);663664e = lc_get(mdev->resync, enr);665if (e) {666struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);667if (ext->lce.lc_number == enr) {668if (success)669ext->rs_left -= count;670else671ext->rs_failed += count;672if (ext->rs_left < ext->rs_failed) {673dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "674"rs_failed=%d count=%d\n",675(unsigned long long)sector,676ext->lce.lc_number, ext->rs_left,677ext->rs_failed, count);678dump_stack();679680lc_put(mdev->resync, &ext->lce);681drbd_force_state(mdev, NS(conn, C_DISCONNECTING));682return;683}684} else {685/* Normally this element should be in the cache,686* since drbd_rs_begin_io() pulled it already in.687*688* But maybe an application write finished, and we set689* something outside the resync lru_cache in sync.690*/691int rs_left = drbd_bm_e_weight(mdev, enr);692if (ext->flags != 0) {693dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"694" -> %d[%u;00]\n",695ext->lce.lc_number, ext->rs_left,696ext->flags, enr, rs_left);697ext->flags = 0;698}699if (ext->rs_failed) {700dev_warn(DEV, "Kicking resync_lru element enr=%u "701"out with rs_failed=%d\n",702ext->lce.lc_number, ext->rs_failed);703}704ext->rs_left = rs_left;705ext->rs_failed = success ? 0 : count;706lc_changed(mdev->resync, &ext->lce);707}708lc_put(mdev->resync, &ext->lce);709/* no race, we are within the al_lock! 
*/710711if (ext->rs_left == ext->rs_failed) {712ext->rs_failed = 0;713714udw = kmalloc(sizeof(*udw), GFP_ATOMIC);715if (udw) {716udw->enr = ext->lce.lc_number;717udw->w.cb = w_update_odbm;718drbd_queue_work_front(&mdev->data.work, &udw->w);719} else {720dev_warn(DEV, "Could not kmalloc an udw\n");721}722}723} else {724dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",725mdev->resync_locked,726mdev->resync->nr_elements,727mdev->resync->flags);728}729}730731void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)732{733unsigned long now = jiffies;734unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];735int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;736if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {737if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&738mdev->state.conn != C_PAUSED_SYNC_T &&739mdev->state.conn != C_PAUSED_SYNC_S) {740mdev->rs_mark_time[next] = now;741mdev->rs_mark_left[next] = still_to_go;742mdev->rs_last_mark = next;743}744}745}746747/* clear the bit corresponding to the piece of storage in question:748* size byte of data starting from sector. 
Only clear a bits of the affected749* one ore more _aligned_ BM_BLOCK_SIZE blocks.750*751* called by worker on C_SYNC_TARGET and receiver on SyncSource.752*753*/754void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,755const char *file, const unsigned int line)756{757/* Is called from worker and receiver context _only_ */758unsigned long sbnr, ebnr, lbnr;759unsigned long count = 0;760sector_t esector, nr_sectors;761int wake_up = 0;762unsigned long flags;763764if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {765dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",766(unsigned long long)sector, size);767return;768}769nr_sectors = drbd_get_capacity(mdev->this_bdev);770esector = sector + (size >> 9) - 1;771772ERR_IF(sector >= nr_sectors) return;773ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);774775lbnr = BM_SECT_TO_BIT(nr_sectors-1);776777/* we clear it (in sync).778* round up start sector, round down end sector. we make sure we only779* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */780if (unlikely(esector < BM_SECT_PER_BIT-1))781return;782if (unlikely(esector == (nr_sectors-1)))783ebnr = lbnr;784else785ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));786sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);787788if (sbnr > ebnr)789return;790791/*792* ok, (capacity & 7) != 0 sometimes, but who cares...793* we count rs_{total,left} in bits, not sectors.794*/795count = drbd_bm_clear_bits(mdev, sbnr, ebnr);796if (count && get_ldev(mdev)) {797drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));798spin_lock_irqsave(&mdev->al_lock, flags);799drbd_try_clear_on_disk_bm(mdev, sector, count, true);800spin_unlock_irqrestore(&mdev->al_lock, flags);801802/* just wake_up unconditional now, various lc_chaged(),803* lc_put() in drbd_try_clear_on_disk_bm(). 
*/804wake_up = 1;805put_ldev(mdev);806}807if (wake_up)808wake_up(&mdev->al_wait);809}810811/*812* this is intended to set one request worth of data out of sync.813* affects at least 1 bit,814* and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.815*816* called by tl_clear and drbd_send_dblock (==drbd_make_request).817* so this can be _any_ process.818*/819int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,820const char *file, const unsigned int line)821{822unsigned long sbnr, ebnr, lbnr, flags;823sector_t esector, nr_sectors;824unsigned int enr, count = 0;825struct lc_element *e;826827if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {828dev_err(DEV, "sector: %llus, size: %d\n",829(unsigned long long)sector, size);830return 0;831}832833if (!get_ldev(mdev))834return 0; /* no disk, no metadata, no bitmap to set bits in */835836nr_sectors = drbd_get_capacity(mdev->this_bdev);837esector = sector + (size >> 9) - 1;838839ERR_IF(sector >= nr_sectors)840goto out;841ERR_IF(esector >= nr_sectors)842esector = (nr_sectors-1);843844lbnr = BM_SECT_TO_BIT(nr_sectors-1);845846/* we set it out of sync,847* we do not need to round anything here */848sbnr = BM_SECT_TO_BIT(sector);849ebnr = BM_SECT_TO_BIT(esector);850851/* ok, (capacity & 7) != 0 sometimes, but who cares...852* we count rs_{total,left} in bits, not sectors. 
*/853spin_lock_irqsave(&mdev->al_lock, flags);854count = drbd_bm_set_bits(mdev, sbnr, ebnr);855856enr = BM_SECT_TO_EXT(sector);857e = lc_find(mdev->resync, enr);858if (e)859lc_entry(e, struct bm_extent, lce)->rs_left += count;860spin_unlock_irqrestore(&mdev->al_lock, flags);861862out:863put_ldev(mdev);864865return count;866}867868static869struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)870{871struct lc_element *e;872struct bm_extent *bm_ext;873int wakeup = 0;874unsigned long rs_flags;875876spin_lock_irq(&mdev->al_lock);877if (mdev->resync_locked > mdev->resync->nr_elements/2) {878spin_unlock_irq(&mdev->al_lock);879return NULL;880}881e = lc_get(mdev->resync, enr);882bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;883if (bm_ext) {884if (bm_ext->lce.lc_number != enr) {885bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);886bm_ext->rs_failed = 0;887lc_changed(mdev->resync, &bm_ext->lce);888wakeup = 1;889}890if (bm_ext->lce.refcnt == 1)891mdev->resync_locked++;892set_bit(BME_NO_WRITES, &bm_ext->flags);893}894rs_flags = mdev->resync->flags;895spin_unlock_irq(&mdev->al_lock);896if (wakeup)897wake_up(&mdev->al_wait);898899if (!bm_ext) {900if (rs_flags & LC_STARVING)901dev_warn(DEV, "Have to wait for element"902" (resync LRU too small?)\n");903BUG_ON(rs_flags & LC_DIRTY);904}905906return bm_ext;907}908909static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)910{911struct lc_element *al_ext;912int rv = 0;913914spin_lock_irq(&mdev->al_lock);915if (unlikely(enr == mdev->act_log->new_number))916rv = 1;917else {918al_ext = lc_find(mdev->act_log, enr);919if (al_ext) {920if (al_ext->refcnt)921rv = 1;922}923}924spin_unlock_irq(&mdev->al_lock);925926/*927if (unlikely(rv)) {928dev_info(DEV, "Delaying sync read until app's write is done\n");929}930*/931return rv;932}933934/**935* drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED936* @mdev: DRBD device.937* @sector: The sector number.938*939* This functions sleeps on 
al_wait. Returns 0 on success, -EINTR if interrupted.940*/941int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)942{943unsigned int enr = BM_SECT_TO_EXT(sector);944struct bm_extent *bm_ext;945int i, sig;946int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.947200 times -> 20 seconds. */948949retry:950sig = wait_event_interruptible(mdev->al_wait,951(bm_ext = _bme_get(mdev, enr)));952if (sig)953return -EINTR;954955if (test_bit(BME_LOCKED, &bm_ext->flags))956return 0;957958for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {959sig = wait_event_interruptible(mdev->al_wait,960!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||961test_bit(BME_PRIORITY, &bm_ext->flags));962963if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {964spin_lock_irq(&mdev->al_lock);965if (lc_put(mdev->resync, &bm_ext->lce) == 0) {966bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */967mdev->resync_locked--;968wake_up(&mdev->al_wait);969}970spin_unlock_irq(&mdev->al_lock);971if (sig)972return -EINTR;973if (schedule_timeout_interruptible(HZ/10))974return -EINTR;975if (sa && --sa == 0)976dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec."977"Resync stalled?\n");978goto retry;979}980}981set_bit(BME_LOCKED, &bm_ext->flags);982return 0;983}984985/**986* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep987* @mdev: DRBD device.988* @sector: The sector number.989*990* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then991* tries to set it to BME_LOCKED. 
Returns 0 upon success, and -EAGAIN992* if there is still application IO going on in this area.993*/994int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)995{996unsigned int enr = BM_SECT_TO_EXT(sector);997const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;998struct lc_element *e;999struct bm_extent *bm_ext;1000int i;10011002spin_lock_irq(&mdev->al_lock);1003if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {1004/* in case you have very heavy scattered io, it may1005* stall the syncer undefined if we give up the ref count1006* when we try again and requeue.1007*1008* if we don't give up the refcount, but the next time1009* we are scheduled this extent has been "synced" by new1010* application writes, we'd miss the lc_put on the1011* extent we keep the refcount on.1012* so we remembered which extent we had to try again, and1013* if the next requested one is something else, we do1014* the lc_put here...1015* we also have to wake_up1016*/1017e = lc_find(mdev->resync, mdev->resync_wenr);1018bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1019if (bm_ext) {1020D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1021D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));1022clear_bit(BME_NO_WRITES, &bm_ext->flags);1023mdev->resync_wenr = LC_FREE;1024if (lc_put(mdev->resync, &bm_ext->lce) == 0)1025mdev->resync_locked--;1026wake_up(&mdev->al_wait);1027} else {1028dev_alert(DEV, "LOGIC BUG\n");1029}1030}1031/* TRY. */1032e = lc_try_get(mdev->resync, enr);1033bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1034if (bm_ext) {1035if (test_bit(BME_LOCKED, &bm_ext->flags))1036goto proceed;1037if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {1038mdev->resync_locked++;1039} else {1040/* we did set the BME_NO_WRITES,1041* but then could not set BME_LOCKED,1042* so we tried again.1043* drop the extra reference. */1044bm_ext->lce.refcnt--;1045D_ASSERT(bm_ext->lce.refcnt > 0);1046}1047goto check_al;1048} else {1049/* do we rather want to try later? 
*/1050if (mdev->resync_locked > mdev->resync->nr_elements-3)1051goto try_again;1052/* Do or do not. There is no try. -- Yoda */1053e = lc_get(mdev->resync, enr);1054bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1055if (!bm_ext) {1056const unsigned long rs_flags = mdev->resync->flags;1057if (rs_flags & LC_STARVING)1058dev_warn(DEV, "Have to wait for element"1059" (resync LRU too small?)\n");1060BUG_ON(rs_flags & LC_DIRTY);1061goto try_again;1062}1063if (bm_ext->lce.lc_number != enr) {1064bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);1065bm_ext->rs_failed = 0;1066lc_changed(mdev->resync, &bm_ext->lce);1067wake_up(&mdev->al_wait);1068D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);1069}1070set_bit(BME_NO_WRITES, &bm_ext->flags);1071D_ASSERT(bm_ext->lce.refcnt == 1);1072mdev->resync_locked++;1073goto check_al;1074}1075check_al:1076for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {1077if (unlikely(al_enr+i == mdev->act_log->new_number))1078goto try_again;1079if (lc_is_used(mdev->act_log, al_enr+i))1080goto try_again;1081}1082set_bit(BME_LOCKED, &bm_ext->flags);1083proceed:1084mdev->resync_wenr = LC_FREE;1085spin_unlock_irq(&mdev->al_lock);1086return 0;10871088try_again:1089if (bm_ext)1090mdev->resync_wenr = enr;1091spin_unlock_irq(&mdev->al_lock);1092return -EAGAIN;1093}10941095void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)1096{1097unsigned int enr = BM_SECT_TO_EXT(sector);1098struct lc_element *e;1099struct bm_extent *bm_ext;1100unsigned long flags;11011102spin_lock_irqsave(&mdev->al_lock, flags);1103e = lc_find(mdev->resync, enr);1104bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL;1105if (!bm_ext) {1106spin_unlock_irqrestore(&mdev->al_lock, flags);1107if (__ratelimit(&drbd_ratelimit_state))1108dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");1109return;1110}11111112if (bm_ext->lce.refcnt == 0) {1113spin_unlock_irqrestore(&mdev->al_lock, flags);1114dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "1115"but refcnt is 0!?\n",1116(unsigned long long)sector, enr);1117return;1118}11191120if (lc_put(mdev->resync, &bm_ext->lce) == 0) {1121bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */1122mdev->resync_locked--;1123wake_up(&mdev->al_wait);1124}11251126spin_unlock_irqrestore(&mdev->al_lock, flags);1127}11281129/**1130* drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)1131* @mdev: DRBD device.1132*/1133void drbd_rs_cancel_all(struct drbd_conf *mdev)1134{1135spin_lock_irq(&mdev->al_lock);11361137if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */1138lc_reset(mdev->resync);1139put_ldev(mdev);1140}1141mdev->resync_locked = 0;1142mdev->resync_wenr = LC_FREE;1143spin_unlock_irq(&mdev->al_lock);1144wake_up(&mdev->al_wait);1145}11461147/**1148* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU1149* @mdev: DRBD device.1150*1151* Returns 0 upon success, -EAGAIN if at least one reference count was1152* not zero.1153*/1154int drbd_rs_del_all(struct drbd_conf *mdev)1155{1156struct lc_element *e;1157struct bm_extent *bm_ext;1158int i;11591160spin_lock_irq(&mdev->al_lock);11611162if (get_ldev_if_state(mdev, D_FAILED)) {1163/* ok, ->resync is there. 
*/1164for (i = 0; i < mdev->resync->nr_elements; i++) {1165e = lc_element_by_index(mdev->resync, i);1166bm_ext = lc_entry(e, struct bm_extent, lce);1167if (bm_ext->lce.lc_number == LC_FREE)1168continue;1169if (bm_ext->lce.lc_number == mdev->resync_wenr) {1170dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"1171" got 'synced' by application io\n",1172mdev->resync_wenr);1173D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1174D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));1175clear_bit(BME_NO_WRITES, &bm_ext->flags);1176mdev->resync_wenr = LC_FREE;1177lc_put(mdev->resync, &bm_ext->lce);1178}1179if (bm_ext->lce.refcnt != 0) {1180dev_info(DEV, "Retrying drbd_rs_del_all() later. "1181"refcnt=%d\n", bm_ext->lce.refcnt);1182put_ldev(mdev);1183spin_unlock_irq(&mdev->al_lock);1184return -EAGAIN;1185}1186D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1187D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));1188lc_del(mdev->resync, &bm_ext->lce);1189}1190D_ASSERT(mdev->resync->used == 0);1191put_ldev(mdev);1192}1193spin_unlock_irq(&mdev->al_lock);11941195return 0;1196}11971198/**1199* drbd_rs_failed_io() - Record information on a failure to resync the specified blocks1200* @mdev: DRBD device.1201* @sector: The sector number.1202* @size: Size of failed IO operation, in byte.1203*/1204void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)1205{1206/* Is called from worker and receiver context _only_ */1207unsigned long sbnr, ebnr, lbnr;1208unsigned long count;1209sector_t esector, nr_sectors;1210int wake_up = 0;12111212if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {1213dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",1214(unsigned long long)sector, size);1215return;1216}1217nr_sectors = drbd_get_capacity(mdev->this_bdev);1218esector = sector + (size >> 9) - 1;12191220ERR_IF(sector >= nr_sectors) return;1221ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);12221223lbnr = 
BM_SECT_TO_BIT(nr_sectors-1);12241225/*1226* round up start sector, round down end sector. we make sure we only1227* handle full, aligned, BM_BLOCK_SIZE (4K) blocks */1228if (unlikely(esector < BM_SECT_PER_BIT-1))1229return;1230if (unlikely(esector == (nr_sectors-1)))1231ebnr = lbnr;1232else1233ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));1234sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);12351236if (sbnr > ebnr)1237return;12381239/*1240* ok, (capacity & 7) != 0 sometimes, but who cares...1241* we count rs_{total,left} in bits, not sectors.1242*/1243spin_lock_irq(&mdev->al_lock);1244count = drbd_bm_count_bits(mdev, sbnr, ebnr);1245if (count) {1246mdev->rs_failed += count;12471248if (get_ldev(mdev)) {1249drbd_try_clear_on_disk_bm(mdev, sector, count, false);1250put_ldev(mdev);1251}12521253/* just wake_up unconditional now, various lc_chaged(),1254* lc_put() in drbd_try_clear_on_disk_bm(). */1255wake_up = 1;1256}1257spin_unlock_irq(&mdev->al_lock);1258if (wake_up)1259wake_up(&mdev->al_wait);1260}126112621263