Path: drivers/block/drbd/drbd_worker.c
/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <[email protected]>.
   Copyright (C) 2002-2008, Lars Ellenberg <[email protected]>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"

static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
static int w_make_resync_request(struct drbd_conf *mdev,
				 struct drbd_work *w, int cancel);



/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_endio_pri (defined here)
 *   drbd_endio_sec (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;

	md_io = (struct drbd_md_io *)bio->bi_private;
	md_io->error = error;

	complete(&md_io->event);
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;

	D_ASSERT(e->block_id != ID_VACANT);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->read_cnt += e->size >> 9;
	list_del(&e->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	drbd_queue_work(&mdev->data.work, &e->w);
	put_ldev(mdev);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;
	sector_t e_sector;
	int do_wake;
	int is_syncer_req;
	int do_al_complete_io;

	D_ASSERT(e->block_id != ID_VACANT);

	/* after we moved e to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	e_sector = e->sector;
	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
	is_syncer_req = is_syncer_block_id(e->block_id);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->writ_cnt += e->size >> 9;
	list_del(&e->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&e->w.list, &mdev->done_ee);

	/* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
	 * neither did we wake possibly waiting conflicting requests.
	 * done from "drbd_process_done_ee" within the appropriate w.cb
	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */

	do_wake = is_syncer_req
		? list_empty(&mdev->sync_ee)
		: list_empty(&mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (is_syncer_req)
		drbd_rs_complete_io(mdev, e_sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, e_sector);

	wake_asender(mdev);
	put_ldev(mdev);
}

/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_endio_sec(struct bio *bio, int error)
{
	struct drbd_epoch_entry *e = bio->bi_private;
	struct drbd_conf *mdev = e->mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error && __ratelimit(&drbd_ratelimit_state))
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)e->sector);
	if (!error && !uptodate) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
					(unsigned long long)e->sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &e->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&e->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(e);
		else
			drbd_endio_read_sec_final(e);
	}
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_endio_pri(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->mdev;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? write_completed_with_error
			: (bio_rw(bio) == READ)
			  ? read_completed_with_error
			  : read_ahead_completed_with_error;
	} else
		what = completed_ok;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	/* not req_mod(), we need irqsave here! */
	spin_lock_irqsave(&mdev->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (m.bio)
		complete_master_bio(mdev, &m);
}

int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	/* We should not detach for read io-error,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, read_retry_remote_canceled);
		spin_unlock_irq(&mdev->req_lock);
		return 1;
	}
	spin_unlock_irq(&mdev->req_lock);

	return w_send_read_req(mdev, w, 0);
}
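/* Compute a digest with the given crypto_hash transform:
 * drbd_csum_ee() hashes the page chain of an epoch entry (all full pages,
 * then the possibly only partially used last page), drbd_csum_bio() hashes
 * the data referenced by a bio, segment by segment.
 * The caller provides a buffer of at least crypto_hash_digestsize(tfm). */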
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = e->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = e->size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

/* TODO merge common code with w_e_end_ov_req */
int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);

	if (unlikely(cancel))
		goto out;

	if (likely((e->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->csums_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		sector_t sector = e->sector;
		unsigned int size = e->size;
		drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
		/* Free e and pages before send.
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
		 * drbd_pp_alloc due to pp_in_use > max_buffers. */
		drbd_free_ee(mdev, e);
		e = NULL;
		inc_rs_pending(mdev);
		ok = drbd_send_drequest_csum(mdev, sector, size,
					     digest, digest_size,
					     P_CSUM_RS_REQUEST);
		kfree(digest);
	} else {
		dev_err(DEV, "kmalloc() of digest failed.\n");
		ok = 0;
	}

out:
	if (e)
		drbd_free_ee(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return ok;
}

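/* read_for_csum() - submit a local read for checksum based resync.
 * Allocates an epoch entry (with GFP_TRY, so an allocation failure just
 * defers the request to a later run) and submits the read; on completion the
 * w_e_send_csum callback sends a P_CSUM_RS_REQUEST to the peer.
 * Returns 0 on success, -EIO without a local disk, -EAGAIN to retry later. */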
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_epoch_entry *e;

	if (!get_ldev(mdev))
		return -EIO;

	if (drbd_rs_should_slow_down(mdev, sector))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
	if (!e)
		goto defer;

	e->w.cb = w_e_send_csum;
	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	atomic_add(size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(&mdev->req_lock);
	list_del(&e->w.list);
	spin_unlock_irq(&mdev->req_lock);

	drbd_free_ee(mdev, e);
defer:
	put_ldev(mdev);
	return -EAGAIN;
}

int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	switch (mdev->state.conn) {
	case C_VERIFY_S:
		w_make_ov_request(mdev, w, cancel);
		break;
	case C_SYNC_TARGET:
		w_make_resync_request(mdev, w, cancel);
		break;
	}

	return 1;
}

void resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	if (list_empty(&mdev->resync_work.list))
		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}

static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}

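/* drbd_rs_controller() - dynamic resync speed controller.
 * Called once per SLEEP_TIME interval from the resync work; based on the
 * number of sectors that came back since the last call it computes how many
 * sectors to request next, steering towards c_fill_target (sectors in flight)
 * or, if that is unset, towards c_delay_target, and plans corrections ahead
 * through the rs_plan_s fifo.  The result is capped at c_max_rate. */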
static int drbd_rs_controller(struct drbd_conf *mdev)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */

	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - mdev->rs_planed;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(&mdev->rs_plan_s, cps);
	mdev->rs_planed += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
	mdev->rs_planed -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
	int number;
	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = mdev->sync_conf.rate;
		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}

	/* ignore the amount of pending requests, the resync controller should
	 * throttle down to incoming reply rate soon enough anyways. */
	return number;
}

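/* w_make_resync_request() - issue the next batch of resync requests.
 * Walks the out-of-sync bitmap starting at bm_resync_fo, merges adjacent
 * dirty bits into requests of up to max_bio_size, and either reads the data
 * locally for checksum based resync (csums_tfm set, protocol >= 89) or sends
 * a P_RS_DATA_REQUEST to the peer.  Re-arms the resync timer when it has to
 * stop early (congestion, throttling, allocation failure). */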
static int w_make_resync_request(struct drbd_conf *mdev,
				 struct drbd_work *w, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, queued, sndbuf;
	int i = 0;

	if (unlikely(cancel))
		return 1;

	if (mdev->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(mdev);
		return 1;
	}

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		return 1;
	}

	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
	number = drbd_rs_number_requests(mdev);
	if (number == 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->data.mutex);
		if (mdev->data.socket) {
			queued = mdev->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			put_ldev(mdev);
			return 1;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(mdev);
				return 0;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			inc_rs_pending(mdev);
			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
					       sector, size, ID_SYNCER)) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return 0;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(mdev);
		return 1;
	}

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 1;
}

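/* w_make_ov_request() - issue the next batch of online verify requests,
 * starting at ov_position, throttled by the same controller as resync. */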
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	number = drbd_rs_number_requests(mdev);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity) {
			return 1;
		}

		size = BM_BLOCK_SIZE;

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (!drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}


void start_resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
}

int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
		dev_warn(DEV, "w_start_resync later...\n");
		mdev->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&mdev->start_resync_timer);
		return 1;
	}

	drbd_start_resync(mdev, C_SYNC_SOURCE);
	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
	return 1;
}

int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);
	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}

static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);

	drbd_resync_finished(mdev);

	return 1;
}

static void ping_peer(struct drbd_conf *mdev)
{
	clear_bit(GOT_PING_ACK, &mdev->flags);
	request_ping(mdev);
	wait_event(mdev->misc_wait,
		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}

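/* drbd_resync_finished() - clean up after a finished resync or online verify:
 * report the result, update disk/pdsk state and the UUID set, and trigger the
 * out-of-sync / after-resync-target handlers where appropriate. */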
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now). Retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->data.work, w);
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	ping_peer(mdev);

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(mdev, 0UL);
			drbd_print_uuids(mdev, "updated UUIDs");
			if (mdev->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	if (verify_done)
		mdev->ov_start_sector = 0;

	drbd_md_sync(mdev);

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}

/* helper */
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
	if (drbd_ee_has_active_page(e)) {
		/* This might happen if sendpage() has not finished */
		int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
		spin_lock_irq(&mdev->req_lock);
		list_add_tail(&e->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_ee(mdev, e);
}

/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}

/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	if (mdev->state.conn == C_AHEAD) {
		ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
	} else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			ok = 1;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, e->sector, e->size);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}

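/* w_e_end_csum_rs_req() - Worker callback for a P_CSUM_RS_REQUEST:
 * recompute the checksum over the local data and compare it with the digest
 * received from the peer; answer with P_RS_IS_IN_SYNC if they match,
 * otherwise send the full block as P_RS_DATA_REPLY. */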
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, e->sector, e->size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
		} else {
			inc_rs_pending(mdev);
			e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
			kfree(di);
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return ok;
}

/* TODO merge common code with w_e_send_csum */
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	sector_t sector = e->sector;
	unsigned int size = e->size;
	int digest_size;
	void *digest;
	int ok = 1;

	if (unlikely(cancel))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (!digest) {
		ok = 0;	/* terminate the connection in case the allocation failed */
		goto out;
	}

	if (likely(!(e->flags & EE_WAS_ERROR)))
		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
	else
		memset(digest, 0, digest_size);

	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_pp_alloc due to pp_in_use > max_buffers. */
	drbd_free_ee(mdev, e);
	e = NULL;
	inc_rs_pending(mdev);
	ok = drbd_send_drequest_csum(mdev, sector, size,
				     digest, digest_size,
				     P_OV_REPLY);
	if (!ok)
		dec_rs_pending(mdev);
	kfree(digest);

out:
	if (e)
		drbd_free_ee(mdev, e);
	dec_unacked(mdev);
	return ok;
}

void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
}

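/* w_e_end_ov_reply() - Worker callback for a P_OV_REPLY during online verify:
 * compare the peer's digest with a locally computed one, mark mismatching
 * blocks out of sync, and report the per-block result with P_OV_RESULT. */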
int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	void *digest;
	sector_t sector = e->sector;
	unsigned int size = e->size;
	int digest_size;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_pp_alloc due to pp_in_use > max_buffers. */
	drbd_free_ee(mdev, e);
	if (!eq)
		drbd_ov_oos_found(mdev, sector, size);
	else
		ov_oos_print(mdev);

	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	dec_unacked(mdev);

	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

	if (mdev->ov_left == 0) {
		ov_oos_print(mdev);
		drbd_resync_finished(mdev);
	}

	return ok;
}

int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
	complete(&b->done);
	return 1;
}

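/* w_send_barrier() - Worker callback to send a P_BARRIER packet,
 * closing the current write epoch (unless tl_clear raced with us). */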
int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct p_barrier *p = &mdev->data.sbuf.barrier;
	int ok = 1;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->req_lock);
	if (cancel)
		return 1;

	if (!drbd_get_data_sock(mdev))
		return 0;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch. */
	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
				(struct p_header80 *)p, sizeof(*p), 0);
	drbd_put_data_sock(mdev);

	return ok;
}

int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	if (cancel)
		return 1;
	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
}

int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_oos(mdev, req);
	req_mod(req, oos_handed_to_network);

	return ok;
}

/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_dblock(mdev, req);
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
				(unsigned long)req);

	if (!ok) {
		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
		 * so this is probably redundant */
		if (mdev->state.conn >= C_CONNECTED)
			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
	}
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}

int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(mdev, req->sector);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it can not deadlock, since this is
	   only used when unfreezing IOs.  All the extents of the requests
	   that made it into the TL are already active */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 1;
}

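/* _drbd_may_sync_now() - check the sync-after dependency chain.
 * Returns 0 if some device we depend on (directly or transitively) is
 * currently resyncing or has one of its sync pause flags set, 1 otherwise.
 * The sync-after evaluation is serialized by global_state_lock (see the
 * comment at its definition). */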
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;

	while (1) {
		if (odev->sync_conf.after == -1)
			return 1;
		odev = minor_to_mdev(odev->sync_conf.after);
		ERR_IF(!odev) return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}

/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}

	return rv;
}

/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
	return rv;
}

void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}

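/* sync_after_error() - validate a new sync-after dependency.
 * Returns NO_ERROR if o_minor is -1 or a valid minor that does not create a
 * dependency cycle back to mdev, ERR_SYNC_AFTER for an invalid minor, and
 * ERR_SYNC_AFTER_CYCLE if following the chain would loop. */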
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_SYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_SYNC_AFTER_CYCLE;

		/* dependency chain ends here, no cycles. */
		if (odev->sync_conf.after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(odev->sync_conf.after);
	}
}

int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
	int changes;
	int retcode;

	write_lock_irq(&global_state_lock);
	retcode = sync_after_error(mdev, na);
	if (retcode == NO_ERROR) {
		mdev->sync_conf.after = na;
		do {
			changes  = _drbd_pause_after(mdev);
			changes |= _drbd_resume_next(mdev);
		} while (changes);
	}
	write_unlock_irq(&global_state_lock);
	return retcode;
}

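/* drbd_rs_controller_reset() - reset the dynamic resync controller state:
 * clear the in-flight and sector-event counters and the rs_plan_s fifo. */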
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
	atomic_set(&mdev->rs_sect_in, 0);
	atomic_set(&mdev->rs_sect_ev, 0);
	mdev->rs_in_flight = 0;
	mdev->rs_planed = 0;
	spin_lock(&mdev->peer_seq_lock);
	fifo_set(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
}

/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	if (mdev->state.conn < C_AHEAD) {
		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
		drbd_rs_cancel_all(mdev);
		/* This should be done when we abort the resync. We definitely do not
		   want to have this for connections going back and forth between
		   Ahead/Behind and SyncSource/SyncTarget */
	}

	if (side == C_SYNC_TARGET) {
		/* Since application IO was locked out during C_WF_BITMAP_T and
		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
		   we check that we might make the data inconsistent. */
		r = drbd_khelper(mdev, "before-resync-target");
		r = (r >> 8) & 0xff;
		if (r > 0) {
			dev_info(DEV, "before-resync-target handler returned %d, "
			     "dropping connection.\n", r);
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
			return;
		}
	} else /* C_SYNC_SOURCE */ {
		r = drbd_khelper(mdev, "before-resync-source");
		r = (r >> 8) & 0xff;
		if (r > 0) {
			if (r == 3) {
				dev_info(DEV, "before-resync-source handler returned %d, "
					 "ignoring. Old userland tools?", r);
			} else {
				dev_info(DEV, "before-resync-source handler returned %d, "
					 "dropping connection.\n", r);
				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
				return;
			}
		}
	}

	drbd_state_lock(mdev);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		drbd_state_unlock(mdev);
		return;
	}

	write_lock_irq(&global_state_lock);
	ns = mdev->state;

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = mdev->state;

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);
		if (side == C_SYNC_TARGET)
			mdev->bm_resync_fo = 0;

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how many bits to resync.  We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
		if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
			drbd_gen_and_send_sync_uuid(mdev);

		if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
			if (side == C_SYNC_SOURCE)
				schedule_timeout_interruptible(
					mdev->net_conf->ping_int * HZ +
					mdev->net_conf->ping_timeo*HZ/9);
			drbd_resync_finished(mdev);
		}

		drbd_rs_controller_reset(mdev);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
	put_ldev(mdev);
	drbd_state_unlock(mdev);
}

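/* drbd_worker() - main loop of the per-device worker thread.
 * Dequeues work items from mdev->data.work and runs their callbacks; while
 * waiting for new work the data socket is uncorked, and corked again after
 * waking up.  On exit, all remaining work is run with cancel set, the
 * receiver thread is stopped, and the device is cleaned up. */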
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct drbd_work *w = NULL;
	LIST_HEAD(work_list);
	int intr = 0, i;

	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));

	while (get_t_state(thi) == Running) {
		drbd_thread_current_set_cpu(mdev);

		if (down_trylock(&mdev->data.work.s)) {
			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_uncork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);

			intr = down_interruptible(&mdev->data.work.s);

			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket  && !mdev->net_conf->no_cork)
				drbd_tcp_cork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);
		}

		if (intr) {
			D_ASSERT(intr == -EINTR);
			flush_signals(current);
			ERR_IF (get_t_state(thi) == Running)
				continue;
			break;
		}

		if (get_t_state(thi) != Running)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this...   */

		w = NULL;
		spin_lock_irq(&mdev->data.work.q_lock);
		ERR_IF(list_empty(&mdev->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			spin_unlock_irq(&mdev->data.work.q_lock);
			continue;
		}
		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
\n"); */1678if (mdev->state.conn >= C_CONNECTED)1679drbd_force_state(mdev,1680NS(conn, C_NETWORK_FAILURE));1681}1682}1683D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));1684D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));16851686spin_lock_irq(&mdev->data.work.q_lock);1687i = 0;1688while (!list_empty(&mdev->data.work.q)) {1689list_splice_init(&mdev->data.work.q, &work_list);1690spin_unlock_irq(&mdev->data.work.q_lock);16911692while (!list_empty(&work_list)) {1693w = list_entry(work_list.next, struct drbd_work, list);1694list_del_init(&w->list);1695w->cb(mdev, w, 1);1696i++; /* dead debugging code */1697}16981699spin_lock_irq(&mdev->data.work.q_lock);1700}1701sema_init(&mdev->data.work.s, 0);1702/* DANGEROUS race: if someone did queue his work within the spinlock,1703* but up() ed outside the spinlock, we could get an up() on the1704* semaphore without corresponding list entry.1705* So don't do that.1706*/1707spin_unlock_irq(&mdev->data.work.q_lock);17081709D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);1710/* _drbd_set_state only uses stop_nowait.1711* wait here for the Exiting receiver. */1712drbd_thread_stop(&mdev->receiver);1713drbd_mdev_cleanup(mdev);17141715dev_info(DEV, "worker terminated\n");17161717clear_bit(DEVICE_DYING, &mdev->flags);1718clear_bit(CONFIG_PENDING, &mdev->flags);1719wake_up(&mdev->state_wait);17201721return 0;1722}172317241725