// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells ([email protected])
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 * (1) On first contact with a server (such as if it has just been rebooted),
 *     the server sends us a CB.InitCallBackState* request.
 *
 * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *     calls, the server maintains a time-limited per-vnode promise that it
 *     will send us a CB.CallBack request if a third party alters the vnodes
 *     accessed.
 *
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
 *
 * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *     calls, each server maintains a time-limited per-volume promise that it
 *     will send us a CB.CallBack request if the RO volume is updated to a
 *     snapshot of the RW volume ("vos release").  This is an atomic event
 *     that cuts over all instances of the RO volume across multiple servers
 *     simultaneously.
 *
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
 *
 *     Note also that each server maintains an independent time limit on an
 *     independent callback.
 *
 * (4) Certain RPC calls include a volume information record "VolSync" in
 *     their reply.  This contains a creation date for the volume that should
 *     remain unchanged for a RW volume (but will be changed if the volume is
 *     restored from backup) or will be bumped to the time of snapshotting
 *     when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents
 *	of a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of
 *	a vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *     the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *     volume and the volume's server record.
 *
 * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *     callback break on all the volumes that have been using that server
 *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *     vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *     force reentry to the filesystem for revalidation.
 *
 * (4) When entering the filesystem, we call afs_validate() to check the
 *     validity of a vnode.  This first checks to see if ->cb_v_check and
 *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *     exclusively and perform an FS.FetchStatus on the vnode.
 *
 *     After checking the volume, we check the vnode.  If there's a mismatch
 *     between the volume counters and the vnode's mirrors of those counters,
 *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *     parsed:
 *
 *     (A) If the Creation timestamp has changed on a RW volume or regressed
 *	   on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	   RO volume, we assume "vos release" happened and try to increment
 *	   ->cb_ro_snapshot.
 *
 *     (B) If the Update timestamp has regressed, we try to increment
 *	   ->cb_scrub.
 *
 *     Note that in both of these cases, we only do the increment if we can
 *     cmpxchg the value of the timestamp from the value we noted before the
 *     op.  This tries to prevent parallel ops from fighting one another.
 *
 *     volume->cb_v_check is then set to ->cb_v_break.
 *
 * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *     parsed and used to set the promise in ->cb_expires_at for the vnode,
 *     the volume and the volume's server record.
 *
 * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *     the vnode.
 */

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
	time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
		trace = afs_vnode_invalid_trace_cb_v_break;
	else if (cb_expires_at == AFS_NO_CB_PROMISE)
		trace = afs_vnode_invalid_trace_no_cb_promise;
	else if (cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_expired;
	else if (volume->cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_vol_expired;
	else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
		trace = afs_vnode_invalid_trace_cb_ro_snapshot;
	else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
		trace = afs_vnode_invalid_trace_cb_scrub;
	else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		trace = afs_vnode_invalid_trace_zap_data;
	else
		return true;
	trace_afs_vnode_invalid(vnode, trace);
	return false;
}

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;
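
	/* The server isn't currently marked excluded, but the volume creation
	 * time has changed, so force a refresh of the volume's server list
	 * from the VL server and then recheck the exclusion flag.
	 */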
	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume; we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}
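
	/* Treat the change as the volume having been cut over to a new
	 * snapshot (eg. "vos release"): note it in ->cb_ro_snapshot and adopt
	 * the new creation time.
	 */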
advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
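
	/* The cheap checks failed, so take the vnode's validate lock and do a
	 * full revalidation, fetching the status from the server if needed.
	 */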
	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}