CoCalc -- md.h

GitHub Repository: awilliam/linux-vfio
Path: blob/master/drivers/md/md.h
¹⁷³⁵⁵ views
1
/*
   md.h : kernel internal structure of the Linux MD driver
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
14

15
#ifndef _MD_MD_H
16
#define _MD_MD_H
17

18
#include <linux/blkdev.h>
19
#include <linux/kobject.h>
20
#include <linux/list.h>
21
#include <linux/mm.h>
22
#include <linux/mutex.h>
23
#include <linux/timer.h>
24
#include <linux/wait.h>
25
#include <linux/workqueue.h>
26

27
/* All-ones sector_t value: the bitwise complement of zero cast to sector_t. */
#define MaxSector (~(sector_t)0)

/* Short typedef names for the two core MD structures. */
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
31

32
/*
33
 * MD's 'extended' device
34
 */
35
struct mdk_rdev_s
36
{
37
	struct list_head same_set;	/* RAID devices within the same set */
38

39
	sector_t sectors;		/* Device size (in 512bytes sectors) */
40
	mddev_t *mddev;			/* RAID array if running */
41
	int last_events;		/* IO event timestamp */
42

43
	/*
44
	 * If meta_bdev is non-NULL, it means that a separate device is
45
	 * being used to store the metadata (superblock/bitmap) which
46
	 * would otherwise be contained on the same device as the data (bdev).
47
	 */
48
	struct block_device *meta_bdev;
49
	struct block_device *bdev;	/* block device handle */
50

51
	struct page	*sb_page;
52
	int		sb_loaded;
53
	__u64		sb_events;
54
	sector_t	data_offset;	/* start of data in array */
55
	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
56
	int		sb_size;	/* bytes in the superblock */
57
	int		preferred_minor;	/* autorun support */
58

59
	struct kobject	kobj;
60

61
	/* A device can be in one of three states based on two flags:
62
	 * Not working:   faulty==1 in_sync==0
63
	 * Fully working: faulty==0 in_sync==1
64
	 * Working, but not
65
	 * in sync with array
66
	 *                faulty==0 in_sync==0
67
	 *
68
	 * It can never have faulty==1, in_sync==1
69
	 * This reduces the burden of testing multiple flags in many cases
70
	 */
71

72
	unsigned long	flags;
73
#define	Faulty		1		/* device is known to have a fault */
74
#define	In_sync		2		/* device is in_sync with rest of array */
75
#define	WriteMostly	4		/* Avoid reading if at all possible */
76
#define	AutoDetected	7		/* added by auto-detect */
77
#define Blocked		8		/* An error occurred on an externally
78
					 * managed array, don't allow writes
79
					 * until it is cleared */
80
	wait_queue_head_t blocked_wait;
81

82
	int desc_nr;			/* descriptor index in the superblock */
83
	int raid_disk;			/* role of device in array */
84
	int new_raid_disk;		/* role that the device will have in
85
					 * the array after a level-change completes.
86
					 */
87
	int saved_raid_disk;		/* role that device used to have in the
88
					 * array and could again if we did a partial
89
					 * resync from the bitmap
90
					 */
91
	sector_t	recovery_offset;/* If this device has been partially
92
					 * recovered, this is where we were
93
					 * up to.
94
					 */
95

96
	atomic_t	nr_pending;	/* number of pending requests.
97
					 * only maintained for arrays that
98
					 * support hot removal
99
					 */
100
	atomic_t	read_errors;	/* number of consecutive read errors that
101
					 * we have tried to ignore.
102
					 */
103
	struct timespec last_read_error;	/* monotonic time since our
104
						 * last read error
105
						 */
106
	atomic_t	corrected_errors; /* number of corrected read errors,
107
					   * for reporting to userspace and storing
108
					   * in superblock.
109
					   */
110
	struct work_struct del_work;	/* used for delayed sysfs removal */
111

112
	struct sysfs_dirent *sysfs_state; /* handle for 'state'
113
					   * sysfs entry */
114
};
115

116
struct mddev_s
117
{
118
	void				*private;
119
	struct mdk_personality		*pers;
120
	dev_t				unit;
121
	int				md_minor;
122
	struct list_head 		disks;
123
	unsigned long			flags;
124
#define MD_CHANGE_DEVS	0	/* Some device status has changed */
125
#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
126
#define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
127
#define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
128

129
	int				suspended;
130
	atomic_t			active_io;
131
	int				ro;
132
	int				sysfs_active; /* set when sysfs deletes
133
						       * are happening, so run/
134
						       * takeover/stop are not safe
135
						       */
136
	int				ready; /* See when safe to pass 
137
						* IO requests down */
138
	struct gendisk			*gendisk;
139

140
	struct kobject			kobj;
141
	int				hold_active;
142
#define	UNTIL_IOCTL	1
143
#define	UNTIL_STOP	2
144

145
	/* Superblock information */
146
	int				major_version,
147
					minor_version,
148
					patch_version;
149
	int				persistent;
150
	int 				external;	/* metadata is
151
							 * managed externally */
152
	char				metadata_type[17]; /* externally set*/
153
	int				chunk_sectors;
154
	time_t				ctime, utime;
155
	int				level, layout;
156
	char				clevel[16];
157
	int				raid_disks;
158
	int				max_disks;
159
	sector_t			dev_sectors; 	/* used size of
160
							 * component devices */
161
	sector_t			array_sectors; /* exported array size */
162
	int				external_size; /* size managed
163
							* externally */
164
	__u64				events;
165
	/* If the last 'event' was simply a clean->dirty transition, and
166
	 * we didn't write it to the spares, then it is safe and simple
167
	 * to just decrement the event count on a dirty->clean transition.
168
	 * So we record that possibility here.
169
	 */
170
	int				can_decrease_events;
171

172
	char				uuid[16];
173

174
	/* If the array is being reshaped, we need to record the
175
	 * new shape and an indication of where we are up to.
176
	 * This is written to the superblock.
177
	 * If reshape_position is MaxSector, then no reshape is happening (yet).
178
	 */
179
	sector_t			reshape_position;
180
	int				delta_disks, new_level, new_layout;
181
	int				new_chunk_sectors;
182

183
	atomic_t			plug_cnt;	/* If device is expecting
184
							 * more bios soon.
185
							 */
186
	struct mdk_thread_s		*thread;	/* management thread */
187
	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
188
	sector_t			curr_resync;	/* last block scheduled */
189
	/* As resync requests can complete out of order, we cannot easily track
190
	 * how much resync has been completed.  So we occasionally pause until
191
	 * everything completes, then set curr_resync_completed to curr_resync.
192
	 * As such it may be well behind the real resync mark, but it is a value
193
	 * we are certain of.
194
	 */
195
	sector_t			curr_resync_completed;
196
	unsigned long			resync_mark;	/* a recent timestamp */
197
	sector_t			resync_mark_cnt;/* blocks written at resync_mark */
198
	sector_t			curr_mark_cnt; /* blocks scheduled now */
199

200
	sector_t			resync_max_sectors; /* may be set by personality */
201

202
	sector_t			resync_mismatches; /* count of sectors where
203
							    * parity/replica mismatch found
204
							    */
205

206
	/* allow user-space to request suspension of IO to regions of the array */
207
	sector_t			suspend_lo;
208
	sector_t			suspend_hi;
209
	/* if zero, use the system-wide default */
210
	int				sync_speed_min;
211
	int				sync_speed_max;
212

213
	/* resync even though the same disks are shared among md-devices */
214
	int				parallel_resync;
215

216
	int				ok_start_degraded;
217
	/* recovery/resync flags 
218
	 * NEEDED:   we might need to start a resync/recover
219
	 * RUNNING:  a thread is running, or about to be started
220
	 * SYNC:     actually doing a resync, not a recovery
221
	 * RECOVER:  doing recovery, or need to try it.
222
	 * INTR:     resync needs to be aborted for some reason
223
	 * DONE:     thread is done and is waiting to be reaped
224
	 * REQUEST:  user-space has requested a sync (used with SYNC)
225
	 * CHECK:    user-space request for check-only, no repair
226
	 * RESHAPE:  A reshape is happening
227
	 *
228
	 * If neither SYNC or RESHAPE are set, then it is a recovery.
229
	 */
230
#define	MD_RECOVERY_RUNNING	0
231
#define	MD_RECOVERY_SYNC	1
232
#define	MD_RECOVERY_RECOVER	2
233
#define	MD_RECOVERY_INTR	3
234
#define	MD_RECOVERY_DONE	4
235
#define	MD_RECOVERY_NEEDED	5
236
#define	MD_RECOVERY_REQUESTED	6
237
#define	MD_RECOVERY_CHECK	7
238
#define MD_RECOVERY_RESHAPE	8
239
#define	MD_RECOVERY_FROZEN	9
240

241
	unsigned long			recovery;
242
	int				recovery_disabled; /* if we detect that recovery
243
							    * will always fail, set this
244
							    * so we don't loop trying */
245

246
	int				in_sync;	/* know to not need resync */
247
	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
248
	 * that we are never stopping an array while it is open.
249
	 * 'reconfig_mutex' protects all other reconfiguration.
250
	 * These locks are separate due to conflicting interactions
251
	 * with bdev->bd_mutex.
252
	 * Lock ordering is:
253
	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
254
	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
255
	 */
256
	struct mutex			open_mutex;
257
	struct mutex			reconfig_mutex;
258
	atomic_t			active;		/* general refcount */
259
	atomic_t			openers;	/* number of active opens */
260

261
	int				changed;	/* True if we might need to
262
							 * reread partition info */
263
	int				degraded;	/* whether md should consider
264
							 * adding a spare
265
							 */
266

267
	atomic_t			recovery_active; /* blocks scheduled, but not written */
268
	wait_queue_head_t		recovery_wait;
269
	sector_t			recovery_cp;
270
	sector_t			resync_min;	/* user requested sync
271
							 * starts here */
272
	sector_t			resync_max;	/* resync should pause
273
							 * when it gets here */
274

275
	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
276
							 * file in sysfs.
277
							 */
278
	struct sysfs_dirent		*sysfs_action;  /* handle for 'sync_action' */
279

280
	struct work_struct del_work;	/* used for delayed sysfs removal */
281

282
	spinlock_t			write_lock;
283
	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
284
	atomic_t			pending_writes;	/* number of active superblock writes */
285

286
	unsigned int			safemode;	/* if set, update "clean" superblock
287
							 * when no writes pending.
288
							 */ 
289
	unsigned int			safemode_delay;
290
	struct timer_list		safemode_timer;
291
	atomic_t			writes_pending; 
292
	struct request_queue		*queue;	/* for plugging ... */
293

294
	struct bitmap                   *bitmap; /* the bitmap for the device */
295
	struct {
296
		struct file		*file; /* the bitmap file */
297
		loff_t			offset; /* offset from superblock of
298
						 * start of bitmap. May be
299
						 * negative, but not '0'
300
						 * For external metadata, offset
301
						 * from start of device. 
302
						 */
303
		loff_t			default_offset; /* this is the offset to use when
304
							 * hot-adding a bitmap.  It should
305
							 * eventually be settable by sysfs.
306
							 */
307
		/* When md is serving under dm, it might use a
308
		 * dirty_log to store the bits.
309
		 */
310
		struct dm_dirty_log *log;
311

312
		struct mutex		mutex;
313
		unsigned long		chunksize;
314
		unsigned long		daemon_sleep; /* how many jiffies between updates? */
315
		unsigned long		max_write_behind; /* write-behind mode */
316
		int			external;
317
	} bitmap_info;
318

319
	atomic_t 			max_corr_read_errors; /* max read retries */
320
	struct list_head		all_mddevs;
321

322
	struct attribute_group		*to_remove;
323

324
	struct bio_set			*bio_set;
325

326
	/* Generic flush handling.
327
	 * The last to finish preflush schedules a worker to submit
328
	 * the rest of the request (without the REQ_FLUSH flag).
329
	 */
330
	struct bio *flush_bio;
331
	atomic_t flush_pending;
332
	struct work_struct flush_work;
333
	struct work_struct event_work;	/* used by dm to report failure event */
334
	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
335
};
336

337

338
/*
 * Drop one pending-request reference on @rdev.
 *
 * The Faulty bit is sampled *before* the decrement.  If the decrement
 * brings nr_pending to zero and the device was Faulty at that earlier
 * point, MD_RECOVERY_NEEDED is set in @mddev->recovery.
 * NOTE(review): the early sampling presumably avoids touching rdev after
 * the last reference is gone - confirm against the callers.
 */
static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
	int faulty = test_bit(Faulty, &rdev->flags);

	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
344

345
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
346
{
347
        atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
348
}
349

350
struct mdk_personality
351
{
352
	char *name;
353
	int level;
354
	struct list_head list;
355
	struct module *owner;
356
	int (*make_request)(mddev_t *mddev, struct bio *bio);
357
	int (*run)(mddev_t *mddev);
358
	int (*stop)(mddev_t *mddev);
359
	void (*status)(struct seq_file *seq, mddev_t *mddev);
360
	/* error_handler must set ->faulty and clear ->in_sync
361
	 * if appropriate, and should abort recovery if needed 
362
	 */
363
	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
364
	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
365
	int (*hot_remove_disk) (mddev_t *mddev, int number);
366
	int (*spare_active) (mddev_t *mddev);
367
	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
368
	int (*resize) (mddev_t *mddev, sector_t sectors);
369
	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
370
	int (*check_reshape) (mddev_t *mddev);
371
	int (*start_reshape) (mddev_t *mddev);
372
	void (*finish_reshape) (mddev_t *mddev);
373
	/* quiesce moves between quiescence states
374
	 * 0 - fully active
375
	 * 1 - no new requests allowed
376
	 * others - reserved
377
	 */
378
	void (*quiesce) (mddev_t *mddev, int state);
379
	/* takeover is used to transition an array from one
380
	 * personality to another.  The new personality must be able
381
	 * to handle the data in the current layout.
382
	 * e.g. 2drive raid1 -> 2drive raid5
383
	 *      ndrive raid5 -> degraded n+1drive raid6 with special layout
384
	 * If the takeover succeeds, a new 'private' structure is returned.
385
	 * This needs to be installed and then ->run used to activate the
386
	 * array.
387
	 */
388
	void *(*takeover) (mddev_t *mddev);
389
};
390

391

392
struct md_sysfs_entry {
393
	struct attribute attr;
394
	ssize_t (*show)(mddev_t *, char *);
395
	ssize_t (*store)(mddev_t *, const char *, size_t);
396
};
397
extern struct attribute_group md_bitmap_group;
398

399
static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
400
{
401
	if (sd)
402
		return sysfs_get_dirent(sd, NULL, name);
403
	return sd;
404
}
405
/* NULL-tolerant wrapper: calls sysfs_notify_dirent() only when @sd is set. */
static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
{
	if (sd)
		sysfs_notify_dirent(sd);
}
410

411
/*
 * Name to use for @mddev: the gendisk's disk_name when a gendisk is
 * attached, otherwise the placeholder string "mdX".
 */
static inline char *mdname(mddev_t *mddev)
{
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}
415

416
/*
 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
 */
#define rdev_for_each_list(rdev, tmp, head)				\
	list_for_each_entry_safe(rdev, tmp, head, same_set)
422

423
/*
 * iterates through the 'same array disks' ringlist, i.e. the rdevs
 * linked on (mddev)->disks through their same_set member.  Uses the
 * _safe list iterator with 'tmp' as its scratch cursor.
 */
#define rdev_for_each(rdev, tmp, mddev)				\
	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
428

429
/* Same walk over (mddev)->disks, using the RCU list iterator. */
#define rdev_for_each_rcu(rdev, mddev)				\
	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
431

432
typedef struct mdk_thread_s {
433
	void			(*run) (mddev_t *mddev);
434
	mddev_t			*mddev;
435
	wait_queue_head_t	wqueue;
436
	unsigned long           flags;
437
	struct task_struct	*tsk;
438
	unsigned long		timeout;
439
} mdk_thread_t;
440

441
#define THREAD_WAKEUP  0
442

443
/*
 * Sleep uninterruptibly on @wq until @condition becomes true.
 *
 * Must be entered with @lock held via spin_lock_irq(); @condition is
 * always evaluated with the lock held, and the lock is held again when
 * the macro finishes.  On each pass where @condition is false the lock
 * is dropped, @cmd is executed, schedule() is called, and the lock is
 * re-taken.  @condition may be evaluated many times, so it must not have
 * side effects.
 */
#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
do {									\
	wait_queue_t __wait;						\
	init_waitqueue_entry(&__wait, current);				\
									\
	add_wait_queue(&(wq), &__wait);					\
	for (;;) {							\
		set_current_state(TASK_UNINTERRUPTIBLE);		\
		if (condition)						\
			break;						\
		spin_unlock_irq(&(lock));				\
		cmd;							\
		schedule();						\
		spin_lock_irq(&(lock));					\
	}								\
	__set_current_state(TASK_RUNNING);				\
	remove_wait_queue(&(wq), &__wait);				\
} while (0)
461

462
/*
 * Fast-path wrapper: if @condition already holds, do nothing; otherwise
 * fall through to __wait_event_lock_irq() with the same arguments.
 * The 'break' leaves the enclosing do { } while (0).
 */
#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
do {									\
	if (condition)	 						\
		break;							\
	__wait_event_lock_irq(wq, condition, lock, cmd);		\
} while (0)
468

469
/* NULL-tolerant put_page(): does nothing when @p is NULL. */
static inline void safe_put_page(struct page *p)
{
	if (p)
		put_page(p);
}
473

474
extern int register_md_personality(struct mdk_personality *p);
475
extern int unregister_md_personality(struct mdk_personality *p);
476
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
477
				mddev_t *mddev, const char *name);
478
extern void md_unregister_thread(mdk_thread_t *thread);
479
extern void md_wakeup_thread(mdk_thread_t *thread);
480
extern void md_check_recovery(mddev_t *mddev);
481
extern void md_write_start(mddev_t *mddev, struct bio *bi);
482
extern void md_write_end(mddev_t *mddev);
483
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
484
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
485

486
extern int mddev_congested(mddev_t *mddev, int bits);
487
extern void md_flush_request(mddev_t *mddev, struct bio *bio);
488
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
489
			   sector_t sector, int size, struct page *page);
490
extern void md_super_wait(mddev_t *mddev);
491
extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 
492
			struct page *page, int rw, bool metadata_op);
493
extern void md_do_sync(mddev_t *mddev);
494
extern void md_new_event(mddev_t *mddev);
495
extern int md_allow_write(mddev_t *mddev);
496
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
497
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
498
extern int md_check_no_bitmap(mddev_t *mddev);
499
extern int md_integrity_register(mddev_t *mddev);
500
extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
501
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
502
extern void restore_bitmap_write_access(struct file *file);
503

504
extern void mddev_init(mddev_t *mddev);
505
extern int md_run(mddev_t *mddev);
506
extern void md_stop(mddev_t *mddev);
507
extern void md_stop_writes(mddev_t *mddev);
508
extern void md_rdev_init(mdk_rdev_t *rdev);
509

510
extern void mddev_suspend(mddev_t *mddev);
511
extern void mddev_resume(mddev_t *mddev);
512
extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
513
				   mddev_t *mddev);
514
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
515
				   mddev_t *mddev);
516
extern int mddev_check_plugged(mddev_t *mddev);
517
#endif /* _MD_MD_H */
518

519
Product

Resources

Company