/* drivers/block/drbd/drbd_actlog.c */
/*1drbd_actlog.c23This file is part of DRBD by Philipp Reisner and Lars Ellenberg.45Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.6Copyright (C) 2003-2008, Philipp Reisner <[email protected]>.7Copyright (C) 2003-2008, Lars Ellenberg <[email protected]>.89drbd is free software; you can redistribute it and/or modify10it under the terms of the GNU General Public License as published by11the Free Software Foundation; either version 2, or (at your option)12any later version.1314drbd is distributed in the hope that it will be useful,15but WITHOUT ANY WARRANTY; without even the implied warranty of16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the17GNU General Public License for more details.1819You should have received a copy of the GNU General Public License20along with drbd; see the file COPYING. If not, write to21the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.2223*/2425#include <linux/slab.h>26#include <linux/drbd.h>27#include "drbd_int.h"28#include "drbd_wrappers.h"2930/* We maintain a trivial checksum in our on disk activity log.31* With that we can ensure correct operation even when the storage32* device might do a partial (last) sector write while losing power.33*/34struct __packed al_transaction {35u32 magic;36u32 tr_number;37struct __packed {38u32 pos;39u32 extent; } updates[1 + AL_EXTENTS_PT];40u32 xor_sum;41};4243struct update_odbm_work {44struct drbd_work w;45unsigned int enr;46};4748struct update_al_work {49struct drbd_work w;50struct lc_element *al_ext;51struct completion event;52unsigned int enr;53/* if old_enr != LC_FREE, write corresponding bitmap sector, too */54unsigned int old_enr;55};5657struct drbd_atodb_wait {58atomic_t count;59struct completion io_done;60struct drbd_conf *mdev;61int error;62};636465int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);6667static int _drbd_md_sync_page_io(struct drbd_conf *mdev,68struct drbd_backing_dev *bdev,69struct page *page, sector_t 
sector,70int rw, int size)71{72struct bio *bio;73struct drbd_md_io md_io;74int ok;7576md_io.mdev = mdev;77init_completion(&md_io.event);78md_io.error = 0;7980if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))81rw |= REQ_FUA | REQ_FLUSH;82rw |= REQ_SYNC;8384bio = bio_alloc(GFP_NOIO, 1);85bio->bi_bdev = bdev->md_bdev;86bio->bi_sector = sector;87ok = (bio_add_page(bio, page, size, 0) == size);88if (!ok)89goto out;90bio->bi_private = &md_io;91bio->bi_end_io = drbd_md_io_complete;92bio->bi_rw = rw;9394if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))95bio_endio(bio, -EIO);96else97submit_bio(rw, bio);98wait_for_completion(&md_io.event);99ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;100101out:102bio_put(bio);103return ok;104}105106int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,107sector_t sector, int rw)108{109int logical_block_size, mask, ok;110int offset = 0;111struct page *iop = mdev->md_io_page;112113D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));114115BUG_ON(!bdev->md_bdev);116117logical_block_size = bdev_logical_block_size(bdev->md_bdev);118if (logical_block_size == 0)119logical_block_size = MD_SECTOR_SIZE;120121/* in case logical_block_size != 512 [ s390 only? 
] */122if (logical_block_size != MD_SECTOR_SIZE) {123mask = (logical_block_size / MD_SECTOR_SIZE) - 1;124D_ASSERT(mask == 1 || mask == 3 || mask == 7);125D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);126offset = sector & mask;127sector = sector & ~mask;128iop = mdev->md_io_tmpp;129130if (rw & WRITE) {131/* these are GFP_KERNEL pages, pre-allocated132* on device initialization */133void *p = page_address(mdev->md_io_page);134void *hp = page_address(mdev->md_io_tmpp);135136ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,137READ, logical_block_size);138139if (unlikely(!ok)) {140dev_err(DEV, "drbd_md_sync_page_io(,%llus,"141"READ [logical_block_size!=512]) failed!\n",142(unsigned long long)sector);143return 0;144}145146memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);147}148}149150if (sector < drbd_md_first_sector(bdev) ||151sector > drbd_md_last_sector(bdev))152dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",153current->comm, current->pid, __func__,154(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");155156ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);157if (unlikely(!ok)) {158dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",159(unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ");160return 0;161}162163if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {164void *p = page_address(mdev->md_io_page);165void *hp = page_address(mdev->md_io_tmpp);166167memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);168}169170return ok;171}172173static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)174{175struct lc_element *al_ext;176struct lc_element *tmp;177unsigned long al_flags = 0;178int wake;179180spin_lock_irq(&mdev->al_lock);181tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);182if (unlikely(tmp != NULL)) {183struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);184if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {185wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);186spin_unlock_irq(&mdev->al_lock);187if (wake)188wake_up(&mdev->al_wait);189return NULL;190}191}192al_ext = lc_get(mdev->act_log, enr);193al_flags = mdev->act_log->flags;194spin_unlock_irq(&mdev->al_lock);195196/*197if (!al_ext) {198if (al_flags & LC_STARVING)199dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");200if (al_flags & LC_DIRTY)201dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");202}203*/204205return al_ext;206}207208void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)209{210unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));211struct lc_element *al_ext;212struct update_al_work al_work;213214D_ASSERT(atomic_read(&mdev->local_cnt) > 0);215216wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));217218if (al_ext->lc_number != enr) {219/* drbd_al_write_transaction(mdev,al_ext,enr);220* recurses into generic_make_request(), which221* disallows recursion, bios being serialized on the222* current->bio_tail list now.223* we have to delegate updates to the activity log224* to the worker thread. 
*/225init_completion(&al_work.event);226al_work.al_ext = al_ext;227al_work.enr = enr;228al_work.old_enr = al_ext->lc_number;229al_work.w.cb = w_al_write_transaction;230drbd_queue_work_front(&mdev->data.work, &al_work.w);231wait_for_completion(&al_work.event);232233mdev->al_writ_cnt++;234235spin_lock_irq(&mdev->al_lock);236lc_changed(mdev->act_log, al_ext);237spin_unlock_irq(&mdev->al_lock);238wake_up(&mdev->al_wait);239}240}241242void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)243{244unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));245struct lc_element *extent;246unsigned long flags;247248spin_lock_irqsave(&mdev->al_lock, flags);249250extent = lc_find(mdev->act_log, enr);251252if (!extent) {253spin_unlock_irqrestore(&mdev->al_lock, flags);254dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);255return;256}257258if (lc_put(mdev->act_log, extent) == 0)259wake_up(&mdev->al_wait);260261spin_unlock_irqrestore(&mdev->al_lock, flags);262}263264#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)265/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT266* are still coupled, or assume too much about their relation.267* Code below will not work if this is violated.268* Will be cleaned up with some followup patch.269*/270# error FIXME271#endif272273static unsigned int al_extent_to_bm_page(unsigned int al_enr)274{275return al_enr >>276/* bit to page */277((PAGE_SHIFT + 3) -278/* al extent number to bit */279(AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));280}281282static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)283{284return rs_enr >>285/* bit to page */286((PAGE_SHIFT + 3) -287/* al extent number to bit */288(BM_EXT_SHIFT - BM_BLOCK_SHIFT));289}290291int292w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)293{294struct update_al_work *aw = container_of(w, struct update_al_work, w);295struct lc_element *updated = aw->al_ext;296const unsigned int new_enr = aw->enr;297const unsigned int evicted = 
aw->old_enr;298struct al_transaction *buffer;299sector_t sector;300int i, n, mx;301unsigned int extent_nr;302u32 xor_sum = 0;303304if (!get_ldev(mdev)) {305dev_err(DEV,306"disk is %s, cannot start al transaction (-%d +%d)\n",307drbd_disk_str(mdev->state.disk), evicted, new_enr);308complete(&((struct update_al_work *)w)->event);309return 1;310}311/* do we have to do a bitmap write, first?312* TODO reduce maximum latency:313* submit both bios, then wait for both,314* instead of doing two synchronous sector writes.315* For now, we must not write the transaction,316* if we cannot write out the bitmap of the evicted extent. */317if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)318drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));319320/* The bitmap write may have failed, causing a state change. */321if (mdev->state.disk < D_INCONSISTENT) {322dev_err(DEV,323"disk is %s, cannot write al transaction (-%d +%d)\n",324drbd_disk_str(mdev->state.disk), evicted, new_enr);325complete(&((struct update_al_work *)w)->event);326put_ldev(mdev);327return 1;328}329330mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... 
*/331buffer = (struct al_transaction *)page_address(mdev->md_io_page);332333buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);334buffer->tr_number = cpu_to_be32(mdev->al_tr_number);335336n = lc_index_of(mdev->act_log, updated);337338buffer->updates[0].pos = cpu_to_be32(n);339buffer->updates[0].extent = cpu_to_be32(new_enr);340341xor_sum ^= new_enr;342343mx = min_t(int, AL_EXTENTS_PT,344mdev->act_log->nr_elements - mdev->al_tr_cycle);345for (i = 0; i < mx; i++) {346unsigned idx = mdev->al_tr_cycle + i;347extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;348buffer->updates[i+1].pos = cpu_to_be32(idx);349buffer->updates[i+1].extent = cpu_to_be32(extent_nr);350xor_sum ^= extent_nr;351}352for (; i < AL_EXTENTS_PT; i++) {353buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);354buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);355xor_sum ^= LC_FREE;356}357mdev->al_tr_cycle += AL_EXTENTS_PT;358if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)359mdev->al_tr_cycle = 0;360361buffer->xor_sum = cpu_to_be32(xor_sum);362363sector = mdev->ldev->md.md_offset364+ mdev->ldev->md.al_offset + mdev->al_tr_pos;365366if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))367drbd_chk_io_error(mdev, 1, true);368369if (++mdev->al_tr_pos >370div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))371mdev->al_tr_pos = 0;372373D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);374mdev->al_tr_number++;375376mutex_unlock(&mdev->md_io_mutex);377378complete(&((struct update_al_work *)w)->event);379put_ldev(mdev);380381return 1;382}383384/**385* drbd_al_read_tr() - Read a single transaction from the on disk activity log386* @mdev: DRBD device.387* @bdev: Block device to read form.388* @b: pointer to an al_transaction.389* @index: On disk slot of the transaction to read.390*391* Returns -1 on IO error, 0 on checksum error and 1 upon success.392*/393static int drbd_al_read_tr(struct drbd_conf *mdev,394struct drbd_backing_dev *bdev,395struct al_transaction *b,396int 
index)397{398sector_t sector;399int rv, i;400u32 xor_sum = 0;401402sector = bdev->md.md_offset + bdev->md.al_offset + index;403404/* Dont process error normally,405* as this is done before disk is attached! */406if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))407return -1;408409rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);410411for (i = 0; i < AL_EXTENTS_PT + 1; i++)412xor_sum ^= be32_to_cpu(b->updates[i].extent);413rv &= (xor_sum == be32_to_cpu(b->xor_sum));414415return rv;416}417418/**419* drbd_al_read_log() - Restores the activity log from its on disk representation.420* @mdev: DRBD device.421* @bdev: Block device to read form.422*423* Returns 1 on success, returns 0 when reading the log failed due to IO errors.424*/425int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)426{427struct al_transaction *buffer;428int i;429int rv;430int mx;431int active_extents = 0;432int transactions = 0;433int found_valid = 0;434int from = 0;435int to = 0;436u32 from_tnr = 0;437u32 to_tnr = 0;438u32 cnr;439440mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);441442/* lock out all other meta data io for now,443* and make sure the page is mapped.444*/445mutex_lock(&mdev->md_io_mutex);446buffer = page_address(mdev->md_io_page);447448/* Find the valid transaction in the log */449for (i = 0; i <= mx; i++) {450rv = drbd_al_read_tr(mdev, bdev, buffer, i);451if (rv == 0)452continue;453if (rv == -1) {454mutex_unlock(&mdev->md_io_mutex);455return 0;456}457cnr = be32_to_cpu(buffer->tr_number);458459if (++found_valid == 1) {460from = i;461to = i;462from_tnr = cnr;463to_tnr = cnr;464continue;465}466if ((int)cnr - (int)from_tnr < 0) {467D_ASSERT(from_tnr - cnr + i - from == mx+1);468from = i;469from_tnr = cnr;470}471if ((int)cnr - (int)to_tnr > 0) {472D_ASSERT(cnr - to_tnr == i - to);473to = i;474to_tnr = cnr;475}476}477478if (!found_valid) {479dev_warn(DEV, "No usable activity log found.\n");480mutex_unlock(&mdev->md_io_mutex);481return 1;482}483484/* Read the 
valid transactions.485* dev_info(DEV, "Reading from %d to %d.\n",from,to); */486i = from;487while (1) {488int j, pos;489unsigned int extent_nr;490unsigned int trn;491492rv = drbd_al_read_tr(mdev, bdev, buffer, i);493ERR_IF(rv == 0) goto cancel;494if (rv == -1) {495mutex_unlock(&mdev->md_io_mutex);496return 0;497}498499trn = be32_to_cpu(buffer->tr_number);500501spin_lock_irq(&mdev->al_lock);502503/* This loop runs backwards because in the cyclic504elements there might be an old version of the505updated element (in slot 0). So the element in slot 0506can overwrite old versions. */507for (j = AL_EXTENTS_PT; j >= 0; j--) {508pos = be32_to_cpu(buffer->updates[j].pos);509extent_nr = be32_to_cpu(buffer->updates[j].extent);510511if (extent_nr == LC_FREE)512continue;513514lc_set(mdev->act_log, extent_nr, pos);515active_extents++;516}517spin_unlock_irq(&mdev->al_lock);518519transactions++;520521cancel:522if (i == to)523break;524i++;525if (i > mx)526i = 0;527}528529mdev->al_tr_number = to_tnr+1;530mdev->al_tr_pos = to;531if (++mdev->al_tr_pos >532div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))533mdev->al_tr_pos = 0;534535/* ok, we are done with it */536mutex_unlock(&mdev->md_io_mutex);537538dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",539transactions, active_extents);540541return 1;542}543544/**545* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents546* @mdev: DRBD device.547*/548void drbd_al_apply_to_bm(struct drbd_conf *mdev)549{550unsigned int enr;551unsigned long add = 0;552char ppb[10];553int i, tmp;554555wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));556557for (i = 0; i < mdev->act_log->nr_elements; i++) {558enr = lc_element_by_index(mdev->act_log, i)->lc_number;559if (enr == LC_FREE)560continue;561tmp = drbd_bm_ALe_set_all(mdev, enr);562dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);563add += 
tmp;564}565566lc_unlock(mdev->act_log);567wake_up(&mdev->al_wait);568569dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",570ppsize(ppb, Bit2KB(add)));571}572573static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)574{575int rv;576577spin_lock_irq(&mdev->al_lock);578rv = (al_ext->refcnt == 0);579if (likely(rv))580lc_del(mdev->act_log, al_ext);581spin_unlock_irq(&mdev->al_lock);582583return rv;584}585586/**587* drbd_al_shrink() - Removes all active extents form the activity log588* @mdev: DRBD device.589*590* Removes all active extents form the activity log, waiting until591* the reference count of each entry dropped to 0 first, of course.592*593* You need to lock mdev->act_log with lc_try_lock() / lc_unlock()594*/595void drbd_al_shrink(struct drbd_conf *mdev)596{597struct lc_element *al_ext;598int i;599600D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));601602for (i = 0; i < mdev->act_log->nr_elements; i++) {603al_ext = lc_element_by_index(mdev->act_log, i);604if (al_ext->lc_number == LC_FREE)605continue;606wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));607}608609wake_up(&mdev->al_wait);610}611612static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)613{614struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);615616if (!get_ldev(mdev)) {617if (__ratelimit(&drbd_ratelimit_state))618dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");619kfree(udw);620return 1;621}622623drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));624put_ldev(mdev);625626kfree(udw);627628if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {629switch (mdev->state.conn) {630case C_SYNC_SOURCE: case C_SYNC_TARGET:631case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:632drbd_resync_finished(mdev);633default:634/* nothing to do */635break;636}637}638drbd_bcast_sync_progress(mdev);639640return 1;641}642643644/* ATTENTION. 
The AL's extents are 4MB each, while the extents in the645* resync LRU-cache are 16MB each.646* The caller of this function has to hold an get_ldev() reference.647*648* TODO will be obsoleted once we have a caching lru of the on disk bitmap649*/650static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,651int count, int success)652{653struct lc_element *e;654struct update_odbm_work *udw;655656unsigned int enr;657658D_ASSERT(atomic_read(&mdev->local_cnt));659660/* I simply assume that a sector/size pair never crosses661* a 16 MB extent border. (Currently this is true...) */662enr = BM_SECT_TO_EXT(sector);663664e = lc_get(mdev->resync, enr);665if (e) {666struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);667if (ext->lce.lc_number == enr) {668if (success)669ext->rs_left -= count;670else671ext->rs_failed += count;672if (ext->rs_left < ext->rs_failed) {673dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "674"rs_failed=%d count=%d\n",675(unsigned long long)sector,676ext->lce.lc_number, ext->rs_left,677ext->rs_failed, count);678dump_stack();679680lc_put(mdev->resync, &ext->lce);681drbd_force_state(mdev, NS(conn, C_DISCONNECTING));682return;683}684} else {685/* Normally this element should be in the cache,686* since drbd_rs_begin_io() pulled it already in.687*688* But maybe an application write finished, and we set689* something outside the resync lru_cache in sync.690*/691int rs_left = drbd_bm_e_weight(mdev, enr);692if (ext->flags != 0) {693dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"694" -> %d[%u;00]\n",695ext->lce.lc_number, ext->rs_left,696ext->flags, enr, rs_left);697ext->flags = 0;698}699if (ext->rs_failed) {700dev_warn(DEV, "Kicking resync_lru element enr=%u "701"out with rs_failed=%d\n",702ext->lce.lc_number, ext->rs_failed);703}704ext->rs_left = rs_left;705ext->rs_failed = success ? 0 : count;706lc_changed(mdev->resync, &ext->lce);707}708lc_put(mdev->resync, &ext->lce);709/* no race, we are within the al_lock! 
*/710711if (ext->rs_left == ext->rs_failed) {712ext->rs_failed = 0;713714udw = kmalloc(sizeof(*udw), GFP_ATOMIC);715if (udw) {716udw->enr = ext->lce.lc_number;717udw->w.cb = w_update_odbm;718drbd_queue_work_front(&mdev->data.work, &udw->w);719} else {720dev_warn(DEV, "Could not kmalloc an udw\n");721}722}723} else {724dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",725mdev->resync_locked,726mdev->resync->nr_elements,727mdev->resync->flags);728}729}730731void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)732{733unsigned long now = jiffies;734unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];735int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;736if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {737if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&738mdev->state.conn != C_PAUSED_SYNC_T &&739mdev->state.conn != C_PAUSED_SYNC_S) {740mdev->rs_mark_time[next] = now;741mdev->rs_mark_left[next] = still_to_go;742mdev->rs_last_mark = next;743}744}745}746747/* clear the bit corresponding to the piece of storage in question:748* size byte of data starting from sector. 
Only clear a bits of the affected749* one ore more _aligned_ BM_BLOCK_SIZE blocks.750*751* called by worker on C_SYNC_TARGET and receiver on SyncSource.752*753*/754void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,755const char *file, const unsigned int line)756{757/* Is called from worker and receiver context _only_ */758unsigned long sbnr, ebnr, lbnr;759unsigned long count = 0;760sector_t esector, nr_sectors;761int wake_up = 0;762unsigned long flags;763764if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {765dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",766(unsigned long long)sector, size);767return;768}769nr_sectors = drbd_get_capacity(mdev->this_bdev);770esector = sector + (size >> 9) - 1;771772ERR_IF(sector >= nr_sectors) return;773ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);774775lbnr = BM_SECT_TO_BIT(nr_sectors-1);776777/* we clear it (in sync).778* round up start sector, round down end sector. we make sure we only779* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */780if (unlikely(esector < BM_SECT_PER_BIT-1))781return;782if (unlikely(esector == (nr_sectors-1)))783ebnr = lbnr;784else785ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));786sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);787788if (sbnr > ebnr)789return;790791/*792* ok, (capacity & 7) != 0 sometimes, but who cares...793* we count rs_{total,left} in bits, not sectors.794*/795count = drbd_bm_clear_bits(mdev, sbnr, ebnr);796if (count && get_ldev(mdev)) {797drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));798spin_lock_irqsave(&mdev->al_lock, flags);799drbd_try_clear_on_disk_bm(mdev, sector, count, true);800spin_unlock_irqrestore(&mdev->al_lock, flags);801802/* just wake_up unconditional now, various lc_chaged(),803* lc_put() in drbd_try_clear_on_disk_bm(). 
*/804wake_up = 1;805put_ldev(mdev);806}807if (wake_up)808wake_up(&mdev->al_wait);809}810811/*812* this is intended to set one request worth of data out of sync.813* affects at least 1 bit,814* and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.815*816* called by tl_clear and drbd_send_dblock (==drbd_make_request).817* so this can be _any_ process.818*/819int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,820const char *file, const unsigned int line)821{822unsigned long sbnr, ebnr, lbnr, flags;823sector_t esector, nr_sectors;824unsigned int enr, count = 0;825struct lc_element *e;826827if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {828dev_err(DEV, "sector: %llus, size: %d\n",829(unsigned long long)sector, size);830return 0;831}832833if (!get_ldev(mdev))834return 0; /* no disk, no metadata, no bitmap to set bits in */835836nr_sectors = drbd_get_capacity(mdev->this_bdev);837esector = sector + (size >> 9) - 1;838839ERR_IF(sector >= nr_sectors)840goto out;841ERR_IF(esector >= nr_sectors)842esector = (nr_sectors-1);843844lbnr = BM_SECT_TO_BIT(nr_sectors-1);845846/* we set it out of sync,847* we do not need to round anything here */848sbnr = BM_SECT_TO_BIT(sector);849ebnr = BM_SECT_TO_BIT(esector);850851/* ok, (capacity & 7) != 0 sometimes, but who cares...852* we count rs_{total,left} in bits, not sectors. 
*/853spin_lock_irqsave(&mdev->al_lock, flags);854count = drbd_bm_set_bits(mdev, sbnr, ebnr);855856enr = BM_SECT_TO_EXT(sector);857e = lc_find(mdev->resync, enr);858if (e)859lc_entry(e, struct bm_extent, lce)->rs_left += count;860spin_unlock_irqrestore(&mdev->al_lock, flags);861862out:863put_ldev(mdev);864865return count;866}867868static869struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)870{871struct lc_element *e;872struct bm_extent *bm_ext;873int wakeup = 0;874unsigned long rs_flags;875876spin_lock_irq(&mdev->al_lock);877if (mdev->resync_locked > mdev->resync->nr_elements/2) {878spin_unlock_irq(&mdev->al_lock);879return NULL;880}881e = lc_get(mdev->resync, enr);882bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;883if (bm_ext) {884if (bm_ext->lce.lc_number != enr) {885bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);886bm_ext->rs_failed = 0;887lc_changed(mdev->resync, &bm_ext->lce);888wakeup = 1;889}890if (bm_ext->lce.refcnt == 1)891mdev->resync_locked++;892set_bit(BME_NO_WRITES, &bm_ext->flags);893}894rs_flags = mdev->resync->flags;895spin_unlock_irq(&mdev->al_lock);896if (wakeup)897wake_up(&mdev->al_wait);898899if (!bm_ext) {900if (rs_flags & LC_STARVING)901dev_warn(DEV, "Have to wait for element"902" (resync LRU too small?)\n");903BUG_ON(rs_flags & LC_DIRTY);904}905906return bm_ext;907}908909static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)910{911struct lc_element *al_ext;912int rv = 0;913914spin_lock_irq(&mdev->al_lock);915if (unlikely(enr == mdev->act_log->new_number))916rv = 1;917else {918al_ext = lc_find(mdev->act_log, enr);919if (al_ext) {920if (al_ext->refcnt)921rv = 1;922}923}924spin_unlock_irq(&mdev->al_lock);925926/*927if (unlikely(rv)) {928dev_info(DEV, "Delaying sync read until app's write is done\n");929}930*/931return rv;932}933934/**935* drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED936* @mdev: DRBD device.937* @sector: The sector number.938*939* This functions sleeps on 
al_wait. Returns 0 on success, -EINTR if interrupted.940*/941int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)942{943unsigned int enr = BM_SECT_TO_EXT(sector);944struct bm_extent *bm_ext;945int i, sig;946int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.947200 times -> 20 seconds. */948949retry:950sig = wait_event_interruptible(mdev->al_wait,951(bm_ext = _bme_get(mdev, enr)));952if (sig)953return -EINTR;954955if (test_bit(BME_LOCKED, &bm_ext->flags))956return 0;957958for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {959sig = wait_event_interruptible(mdev->al_wait,960!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||961test_bit(BME_PRIORITY, &bm_ext->flags));962963if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {964spin_lock_irq(&mdev->al_lock);965if (lc_put(mdev->resync, &bm_ext->lce) == 0) {966bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */967mdev->resync_locked--;968wake_up(&mdev->al_wait);969}970spin_unlock_irq(&mdev->al_lock);971if (sig)972return -EINTR;973if (schedule_timeout_interruptible(HZ/10))974return -EINTR;975if (sa && --sa == 0)976dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec."977"Resync stalled?\n");978goto retry;979}980}981set_bit(BME_LOCKED, &bm_ext->flags);982return 0;983}984985/**986* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep987* @mdev: DRBD device.988* @sector: The sector number.989*990* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then991* tries to set it to BME_LOCKED. 
Returns 0 upon success, and -EAGAIN992* if there is still application IO going on in this area.993*/994int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)995{996unsigned int enr = BM_SECT_TO_EXT(sector);997const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;998struct lc_element *e;999struct bm_extent *bm_ext;1000int i;10011002spin_lock_irq(&mdev->al_lock);1003if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {1004/* in case you have very heavy scattered io, it may1005* stall the syncer undefined if we give up the ref count1006* when we try again and requeue.1007*1008* if we don't give up the refcount, but the next time1009* we are scheduled this extent has been "synced" by new1010* application writes, we'd miss the lc_put on the1011* extent we keep the refcount on.1012* so we remembered which extent we had to try again, and1013* if the next requested one is something else, we do1014* the lc_put here...1015* we also have to wake_up1016*/1017e = lc_find(mdev->resync, mdev->resync_wenr);1018bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1019if (bm_ext) {1020D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1021D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));1022clear_bit(BME_NO_WRITES, &bm_ext->flags);1023mdev->resync_wenr = LC_FREE;1024if (lc_put(mdev->resync, &bm_ext->lce) == 0)1025mdev->resync_locked--;1026wake_up(&mdev->al_wait);1027} else {1028dev_alert(DEV, "LOGIC BUG\n");1029}1030}1031/* TRY. */1032e = lc_try_get(mdev->resync, enr);1033bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1034if (bm_ext) {1035if (test_bit(BME_LOCKED, &bm_ext->flags))1036goto proceed;1037if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {1038mdev->resync_locked++;1039} else {1040/* we did set the BME_NO_WRITES,1041* but then could not set BME_LOCKED,1042* so we tried again.1043* drop the extra reference. */1044bm_ext->lce.refcnt--;1045D_ASSERT(bm_ext->lce.refcnt > 0);1046}1047goto check_al;1048} else {1049/* do we rather want to try later? 
*/1050if (mdev->resync_locked > mdev->resync->nr_elements-3)1051goto try_again;1052/* Do or do not. There is no try. -- Yoda */1053e = lc_get(mdev->resync, enr);1054bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;1055if (!bm_ext) {1056const unsigned long rs_flags = mdev->resync->flags;1057if (rs_flags & LC_STARVING)1058dev_warn(DEV, "Have to wait for element"1059" (resync LRU too small?)\n");1060BUG_ON(rs_flags & LC_DIRTY);1061goto try_again;1062}1063if (bm_ext->lce.lc_number != enr) {1064bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);1065bm_ext->rs_failed = 0;1066lc_changed(mdev->resync, &bm_ext->lce);1067wake_up(&mdev->al_wait);1068D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);1069}1070set_bit(BME_NO_WRITES, &bm_ext->flags);1071D_ASSERT(bm_ext->lce.refcnt == 1);1072mdev->resync_locked++;1073goto check_al;1074}1075check_al:1076for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {1077if (unlikely(al_enr+i == mdev->act_log->new_number))1078goto try_again;1079if (lc_is_used(mdev->act_log, al_enr+i))1080goto try_again;1081}1082set_bit(BME_LOCKED, &bm_ext->flags);1083proceed:1084mdev->resync_wenr = LC_FREE;1085spin_unlock_irq(&mdev->al_lock);1086return 0;10871088try_again:1089if (bm_ext)1090mdev->resync_wenr = enr;1091spin_unlock_irq(&mdev->al_lock);1092return -EAGAIN;1093}10941095void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)1096{1097unsigned int enr = BM_SECT_TO_EXT(sector);1098struct lc_element *e;1099struct bm_extent *bm_ext;1100unsigned long flags;11011102spin_lock_irqsave(&mdev->al_lock, flags);1103e = lc_find(mdev->resync, enr);1104bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL;1105if (!bm_ext) {1106spin_unlock_irqrestore(&mdev->al_lock, flags);1107if (__ratelimit(&drbd_ratelimit_state))1108dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");1109return;1110}11111112if (bm_ext->lce.refcnt == 0) {1113spin_unlock_irqrestore(&mdev->al_lock, flags);1114dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "1115"but refcnt is 0!?\n",1116(unsigned long long)sector, enr);1117return;1118}11191120if (lc_put(mdev->resync, &bm_ext->lce) == 0) {1121bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */1122mdev->resync_locked--;1123wake_up(&mdev->al_wait);1124}11251126spin_unlock_irqrestore(&mdev->al_lock, flags);1127}11281129/**1130* drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)1131* @mdev: DRBD device.1132*/1133void drbd_rs_cancel_all(struct drbd_conf *mdev)1134{1135spin_lock_irq(&mdev->al_lock);11361137if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */1138lc_reset(mdev->resync);1139put_ldev(mdev);1140}1141mdev->resync_locked = 0;1142mdev->resync_wenr = LC_FREE;1143spin_unlock_irq(&mdev->al_lock);1144wake_up(&mdev->al_wait);1145}11461147/**1148* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU1149* @mdev: DRBD device.1150*1151* Returns 0 upon success, -EAGAIN if at least one reference count was1152* not zero.1153*/1154int drbd_rs_del_all(struct drbd_conf *mdev)1155{1156struct lc_element *e;1157struct bm_extent *bm_ext;1158int i;11591160spin_lock_irq(&mdev->al_lock);11611162if (get_ldev_if_state(mdev, D_FAILED)) {1163/* ok, ->resync is there. 
*/1164for (i = 0; i < mdev->resync->nr_elements; i++) {1165e = lc_element_by_index(mdev->resync, i);1166bm_ext = lc_entry(e, struct bm_extent, lce);1167if (bm_ext->lce.lc_number == LC_FREE)1168continue;1169if (bm_ext->lce.lc_number == mdev->resync_wenr) {1170dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"1171" got 'synced' by application io\n",1172mdev->resync_wenr);1173D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1174D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));1175clear_bit(BME_NO_WRITES, &bm_ext->flags);1176mdev->resync_wenr = LC_FREE;1177lc_put(mdev->resync, &bm_ext->lce);1178}1179if (bm_ext->lce.refcnt != 0) {1180dev_info(DEV, "Retrying drbd_rs_del_all() later. "1181"refcnt=%d\n", bm_ext->lce.refcnt);1182put_ldev(mdev);1183spin_unlock_irq(&mdev->al_lock);1184return -EAGAIN;1185}1186D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));1187D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));1188lc_del(mdev->resync, &bm_ext->lce);1189}1190D_ASSERT(mdev->resync->used == 0);1191put_ldev(mdev);1192}1193spin_unlock_irq(&mdev->al_lock);11941195return 0;1196}11971198/**1199* drbd_rs_failed_io() - Record information on a failure to resync the specified blocks1200* @mdev: DRBD device.1201* @sector: The sector number.1202* @size: Size of failed IO operation, in byte.1203*/1204void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)1205{1206/* Is called from worker and receiver context _only_ */1207unsigned long sbnr, ebnr, lbnr;1208unsigned long count;1209sector_t esector, nr_sectors;1210int wake_up = 0;12111212if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {1213dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",1214(unsigned long long)sector, size);1215return;1216}1217nr_sectors = drbd_get_capacity(mdev->this_bdev);1218esector = sector + (size >> 9) - 1;12191220ERR_IF(sector >= nr_sectors) return;1221ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);12221223lbnr = 
BM_SECT_TO_BIT(nr_sectors-1);12241225/*1226* round up start sector, round down end sector. we make sure we only1227* handle full, aligned, BM_BLOCK_SIZE (4K) blocks */1228if (unlikely(esector < BM_SECT_PER_BIT-1))1229return;1230if (unlikely(esector == (nr_sectors-1)))1231ebnr = lbnr;1232else1233ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));1234sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);12351236if (sbnr > ebnr)1237return;12381239/*1240* ok, (capacity & 7) != 0 sometimes, but who cares...1241* we count rs_{total,left} in bits, not sectors.1242*/1243spin_lock_irq(&mdev->al_lock);1244count = drbd_bm_count_bits(mdev, sbnr, ebnr);1245if (count) {1246mdev->rs_failed += count;12471248if (get_ldev(mdev)) {1249drbd_try_clear_on_disk_bm(mdev, sector, count, false);1250put_ldev(mdev);1251}12521253/* just wake_up unconditional now, various lc_chaged(),1254* lc_put() in drbd_try_clear_on_disk_bm(). */1255wake_up = 1;1256}1257spin_unlock_irq(&mdev->al_lock);1258if (wake_up)1259wake_up(&mdev->al_wait);1260}126112621263