// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells ([email protected])
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 * (1) On first contact with a server (such as if it has just been rebooted),
 *     the server sends us a CB.InitCallBackState* request.
 *
 * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *     calls, the server maintains a time-limited per-vnode promise that it
 *     will send us a CB.CallBack request if a third party alters the vnodes
 *     accessed.
 *
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
 *
 * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *     calls, each server maintains a time-limited per-volume promise that it
 *     will send us a CB.CallBack request if the RO volume is updated to a
 *     snapshot of the RW volume ("vos release").  This is an atomic event
 *     that cuts over all instances of the RO volume across multiple servers
 *     simultaneously.
 *
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
 *
 *     Note also that each server maintains an independent time limit on an
 *     independent callback.
 *
 * (4) Certain RPC calls include a volume information record "VolSync" in
 *     their reply.  This contains a creation date for the volume that should
 *     remain unchanged for a RW volume (but will be changed if the volume is
 *     restored from backup) or will be bumped to the time of snapshotting
 *     when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents
 *	of a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of
 *	a vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *     the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *     volume and the volume's server record.
 *
 * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *     callback break on all the volumes that have been using that server
 *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *     vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *     force reentry to the filesystem for revalidation.
 *
 * (4) When entering the filesystem, we call afs_validate() to check the
 *     validity of a vnode.  This first checks to see if ->cb_v_check and
 *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *     exclusively and perform an FS.FetchStatus on the vnode.
 *
 *     After checking the volume, we check the vnode.  If there's a mismatch
 *     between the volume counters and the vnode's mirrors of those counters,
 *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *     parsed:
 *
 *     (A) If the Creation timestamp has changed on a RW volume or regressed
 *	   on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	   RO volume, we assume "vos release" happened and try to increment
 *	   ->cb_ro_snapshot.
 *
 *     (B) If the Update timestamp has regressed, we try to increment
 *	   ->cb_scrub.
 *
 *     Note that in both of these cases, we only do the increment if we can
 *     cmpxchg the value of the timestamp from the value we noted before the
 *     op.  This tries to prevent parallel ops from fighting one another.
 *
 *     volume->cb_v_check is then set to ->cb_v_break.
 *
 * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *     parsed and used to set the promise in ->cb_expires_at for the vnode,
 *     the volume and the volume's server record.
 *
 * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *     the vnode.
 */

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
	time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
		trace = afs_vnode_invalid_trace_cb_v_break;
	else if (cb_expires_at == AFS_NO_CB_PROMISE)
		trace = afs_vnode_invalid_trace_no_cb_promise;
	else if (cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_expired;
	else if (volume->cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_vol_expired;
	else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
		trace = afs_vnode_invalid_trace_cb_ro_snapshot;
	else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
		trace = afs_vnode_invalid_trace_cb_scrub;
	else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		trace = afs_vnode_invalid_trace_zap_data;
	else
		return true;
	trace_afs_vnode_invalid(vnode, trace);
	return false;
}

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;
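
	/* The server isn't currently marked excluded, but the volume creation
	 * time has changed, so force a refresh of the volume's server list
	 * from the VL server and then recheck the exclusion flag.
	 */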
	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume; we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}
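
	/* Treat the change as the volume having been cut over to a new
	 * snapshot (eg. "vos release"): note it in ->cb_ro_snapshot and adopt
	 * the new creation time.
	 */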
advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
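
	/* The cheap checks failed, so take the vnode's validate lock and do a
	 * full revalidation, fetching the status from the server if needed.
	 */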
	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}