// SPDX-License-Identifier: GPL-2.0
/* kernel/locking/rwsem.c: R/W semaphores, public implementation
 *
 * Written by David Howells ([email protected]).
 * Derived from asm-i386/semaphore.h
 *
 * Writer lock-stealing by Alex Shi <[email protected]>
 * and Michel Lespinasse <[email protected]>
 *
 * Optimistic spinning by Tim Chen <[email protected]>
 * and Davidlohr Bueso <[email protected]>. Based on mutexes.
 *
 * Rwsem count bit fields re-definition and rwsem rearchitecture by
 * Waiman Long <[email protected]> and
 * Peter Zijlstra <[email protected]>.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hung_task.h>
#include <trace/events/lock.h>

#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"

/*
 * The least significant 2 bits of the owner value have the following
 * meanings when set.
 *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
 *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
 *
 * When the rwsem is reader-owned and a spinning writer has timed out,
 * the nonspinnable bit will be set to disable optimistic spinning.
 *
 * When a writer acquires a rwsem, it puts its task_struct pointer
 * into the owner field. It is cleared after an unlock.
 *
 * When a reader acquires a rwsem, it will also put its task_struct
 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
 * On unlock, the owner field will largely be left untouched. So
 * for a free or reader-owned rwsem, the owner value may contain
 * information about the last reader that acquired the rwsem.
 *
 * That information may be helpful in debugging cases where the system
 * seems to hang on a reader-owned rwsem, especially if only one reader
 * is involved. Ideally we would like to track all the readers that own
 * a rwsem, but the overhead is simply too big.
 *
 * A fast path reader optimistic lock stealing is supported when the rwsem
 * was previously owned by a writer and the following conditions are met:
 *  - rwsem is not currently writer owned
 *  - the handoff isn't set.
 */
#define RWSEM_READER_OWNED (1UL << 0)
#define RWSEM_NONSPINNABLE (1UL << 1)
#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
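
/*
 * Illustrative sketch (not part of the kernel build): how the owner word
 * packs a task_struct pointer together with the two flag bits above. The
 * alignment of task_struct guarantees that the low bits of the pointer
 * are free to hold flags, so both fit in one atomic long:
 *
 *	unsigned long owner = atomic_long_read(&sem->owner);
 *	struct task_struct *task =
 *		(struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
 *	bool reader_owned = owner & RWSEM_READER_OWNED;
 *	bool nonspinnable = owner & RWSEM_NONSPINNABLE;
 *
 * This mirrors what rwsem_owner_flags() further down does and is shown
 * here only to make the encoding explicit.
 */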

#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
        if (!debug_locks_silent && \
            WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
                #c, atomic_long_read(&(sem)->count), \
                (unsigned long) sem->magic, \
                atomic_long_read(&(sem)->owner), (long)current, \
                list_empty(&(sem)->wait_list) ? "" : "not ")) \
                        debug_locks_off(); \
        } while (0)
#else
# define DEBUG_RWSEMS_WARN_ON(c, sem)
#endif

/*
 * On 64-bit architectures, the bit definitions of the count are:
 *
 * Bit 0 - writer locked bit
 * Bit 1 - waiters present bit
 * Bit 2 - lock handoff bit
 * Bits 3-7 - reserved
 * Bits 8-62 - 55-bit reader count
 * Bit 63 - read fail bit
 *
 * On 32-bit architectures, the bit definitions of the count are:
 *
 * Bit 0 - writer locked bit
 * Bit 1 - waiters present bit
 * Bit 2 - lock handoff bit
 * Bits 3-7 - reserved
 * Bits 8-30 - 23-bit reader count
 * Bit 31 - read fail bit
 *
 * It is not likely that the most significant bit (read fail bit) will ever
 * be set. This guard bit is still checked anyway in the down_read() fastpath
 * just in case we need to use up more of the reader bits for other purposes
 * in the future.
 *
 * atomic_long_fetch_add() is used to obtain the reader lock, whereas
 * atomic_long_cmpxchg() will be used to obtain the writer lock.
 *
 * There are three places where the lock handoff bit may be set or cleared.
 * 1) rwsem_mark_wake() for readers -- set, clear
 * 2) rwsem_try_write_lock() for writers -- set, clear
 * 3) rwsem_del_waiter() -- clear
 *
 * For all the above cases, wait_lock will be held. A writer must also
 * be the first one in the wait_list to be eligible for setting the handoff
 * bit. So concurrent setting/clearing of handoff bit is not possible.
 */
#define RWSEM_WRITER_LOCKED (1UL << 0)
#define RWSEM_FLAG_WAITERS (1UL << 1)
#define RWSEM_FLAG_HANDOFF (1UL << 2)
#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))

#define RWSEM_READER_SHIFT 8
#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
                                RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
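
/*
 * Illustrative sketch (not part of the kernel build): decomposing a count
 * value with the masks above. For example, a count of 0x203 has the writer
 * locked bit and the waiters bit set plus a reader count of 2
 * (0x200 >> RWSEM_READER_SHIFT), typically readers that are about to back
 * out after losing the race:
 *
 *	long count   = atomic_long_read(&sem->count);
 *	bool writer  = count & RWSEM_WRITER_MASK;
 *	bool waiters = count & RWSEM_FLAG_WAITERS;
 *	bool handoff = count & RWSEM_FLAG_HANDOFF;
 *	int  readers = count >> RWSEM_READER_SHIFT;
 *
 * A reader fastpath acquisition is an unconditional addition of
 * RWSEM_READER_BIAS followed by a check of the RWSEM_READ_FAILED_MASK bits
 * in the returned value; see rwsem_read_trylock() below.
 */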

/*
 * All writes to owner are protected by WRITE_ONCE() to make sure that
 * store tearing can't happen as optimistic spinners may read and use
 * the owner value concurrently without lock. Reads from owner, however,
 * may not need READ_ONCE() as long as the pointer value is only used
 * for comparison and isn't being dereferenced.
 *
 * Both rwsem_{set,clear}_owner() functions should be in the same
 * preempt disable section as the atomic op that changes sem->count.
 */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
        lockdep_assert_preemption_disabled();
        atomic_long_set(&sem->owner, (long)current);
}

static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
        lockdep_assert_preemption_disabled();
        atomic_long_set(&sem->owner, 0);
}

/*
 * Test the flags in the owner field.
 */
static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
{
        return atomic_long_read(&sem->owner) & flags;
}

/*
 * The task_struct pointer of the last owning reader will be left in
 * the owner field.
 *
 * Note that the owner value just indicates the task has owned the rwsem
 * previously; it may not be the real owner or one of the real owners
 * anymore when that field is examined, so take it with a grain of salt.
 *
 * The reader non-spinnable bit is preserved.
 */
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
                                            struct task_struct *owner)
{
        unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
                (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);

        atomic_long_set(&sem->owner, val);
}

static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
{
        __rwsem_set_reader_owned(sem, current);
}

#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
/*
 * Return just the real task structure pointer of the owner
 */
struct task_struct *rwsem_owner(struct rw_semaphore *sem)
{
        return (struct task_struct *)
                (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
}

/*
 * Return true if the rwsem is owned by a reader.
 */
bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
        /*
         * Check the count to see if it is write-locked.
         */
        long count = atomic_long_read(&sem->count);

        if (count & RWSEM_WRITER_MASK)
                return false;
        return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
}

/*
 * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
 * this makes sure that the owner field of a reader-owned rwsem either
 * points to a real reader-owner or gets cleared. The only exception is
 * when the unlock is done by up_read_non_owner().
 */
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
        unsigned long val = atomic_long_read(&sem->owner);

        while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
                if (atomic_long_try_cmpxchg(&sem->owner, &val,
                                            val & RWSEM_OWNER_FLAGS_MASK))
                        return;
        }
}
#else
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
}
#endif

/*
 * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
 * remains set. Otherwise, the operation will be aborted.
 */
static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
{
        unsigned long owner = atomic_long_read(&sem->owner);

        do {
                if (!(owner & RWSEM_READER_OWNED))
                        break;
                if (owner & RWSEM_NONSPINNABLE)
                        break;
        } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
                                          owner | RWSEM_NONSPINNABLE));
}

static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
{
        *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);

        if (WARN_ON_ONCE(*cntp < 0))
                rwsem_set_nonspinnable(sem);

        if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
                rwsem_set_reader_owned(sem);
                return true;
        }

        return false;
}

static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
        long tmp = RWSEM_UNLOCKED_VALUE;

        if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
                rwsem_set_owner(sem);
                return true;
        }

        return false;
}

/*
 * Return the real task structure pointer of the owner and the embedded
 * flags in the owner. pflags must be non-NULL.
 */
static inline struct task_struct *
rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
{
        unsigned long owner = atomic_long_read(&sem->owner);

        *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
        return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
}

/*
 * Guide to the rw_semaphore's count field.
 *
 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
 * by a writer.
 *
 * The lock is owned by readers when
 * (1) the RWSEM_WRITER_LOCKED bit isn't set in count,
 * (2) some of the reader bits are set in count, and
 * (3) the owner field has the RWSEM_READER_OWNED bit set.
 *
 * Having some reader bits set is not enough to guarantee a reader-owned
 * lock as the readers may be in the process of backing out from the count
 * and a writer has just released the lock. So another writer may steal
 * the lock immediately after that.
 */

/*
 * Initialize an rwsem:
 */
void __init_rwsem(struct rw_semaphore *sem, const char *name,
                  struct lock_class_key *key)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /*
         * Make sure we are not reinitializing a held semaphore:
         */
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
#endif
#ifdef CONFIG_DEBUG_RWSEMS
        sem->magic = sem;
#endif
        atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
        raw_spin_lock_init(&sem->wait_lock);
        INIT_LIST_HEAD(&sem->wait_list);
        atomic_long_set(&sem->owner, 0L);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        osq_lock_init(&sem->osq);
#endif
}
EXPORT_SYMBOL(__init_rwsem);
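
/*
 * Illustrative usage sketch (not part of the kernel build): the public
 * rwsem API as seen by callers. "my_sem" is a made-up name used only for
 * this example.
 *
 *	static DECLARE_RWSEM(my_sem);		// static initialization
 *	// or, for an embedded/dynamically allocated semaphore:
 *	//	init_rwsem(&my_sem);
 *
 *	down_read(&my_sem);			// shared (reader) lock
 *	... read the protected data ...
 *	up_read(&my_sem);
 *
 *	down_write(&my_sem);			// exclusive (writer) lock
 *	... modify the protected data ...
 *	up_write(&my_sem);
 *
 * DECLARE_RWSEM() and init_rwsem() come from <linux/rwsem.h> and end up
 * calling __init_rwsem() above.
 */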

enum rwsem_waiter_type {
        RWSEM_WAITING_FOR_WRITE,
        RWSEM_WAITING_FOR_READ
};

struct rwsem_waiter {
        struct list_head list;
        struct task_struct *task;
        enum rwsem_waiter_type type;
        unsigned long timeout;
        bool handoff_set;
};
#define rwsem_first_waiter(sem) \
        list_first_entry(&sem->wait_list, struct rwsem_waiter, list)

enum rwsem_wake_type {
        RWSEM_WAKE_ANY,         /* Wake whatever's at head of wait list */
        RWSEM_WAKE_READERS,     /* Wake readers only */
        RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
};

/*
 * The typical HZ value is either 250 or 1000. So set the minimum waiting
 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
 * queue before initiating the handoff protocol.
 */
#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
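
/*
 * Worked example of the timeout above: DIV_ROUND_UP(HZ, 250) evaluates to
 * 1 jiffy for HZ=250 (4ms), 4 jiffies for HZ=1000 (also 4ms) and 1 jiffy
 * for HZ=100 (10ms, the "1 jiffy if that is higher than 4ms" case).
 */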
359
360
/*
361
* Magic number to batch-wakeup waiting readers, even when writers are
362
* also present in the queue. This both limits the amount of work the
363
* waking thread must do and also prevents any potential counter overflow,
364
* however unlikely.
365
*/
366
#define MAX_READERS_WAKEUP 0x100
367
368
static inline void
369
rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
370
{
371
lockdep_assert_held(&sem->wait_lock);
372
list_add_tail(&waiter->list, &sem->wait_list);
373
/* caller will set RWSEM_FLAG_WAITERS */
374
}
375
376
/*
377
* Remove a waiter from the wait_list and clear flags.
378
*
379
* Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
380
* this function. Modify with care.
381
*
382
* Return: true if wait_list isn't empty and false otherwise
383
*/
384
static inline bool
385
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
386
{
387
lockdep_assert_held(&sem->wait_lock);
388
list_del(&waiter->list);
389
if (likely(!list_empty(&sem->wait_list)))
390
return true;
391
392
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
393
return false;
394
}
395
396
/*
 * handle the lock release when processes blocked on it can now run
 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
 *   have been set.
 * - there must be someone on the queue
 * - the wait_lock must be held by the caller
 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
 *   to actually wake up the blocked task(s) and drop the reference count,
 *   preferably when the wait_lock is released
 * - woken process blocks are discarded from the list after having their
 *   task zeroed
 * - writers are only marked woken if downgrading is false
 *
 * Implies rwsem_del_waiter() for all woken readers.
 */
410
static void rwsem_mark_wake(struct rw_semaphore *sem,
411
enum rwsem_wake_type wake_type,
412
struct wake_q_head *wake_q)
413
{
414
struct rwsem_waiter *waiter, *tmp;
415
long oldcount, woken = 0, adjustment = 0;
416
struct list_head wlist;
417
418
lockdep_assert_held(&sem->wait_lock);
419
420
/*
421
* Take a peek at the queue head waiter such that we can determine
422
* the wakeup(s) to perform.
423
*/
424
waiter = rwsem_first_waiter(sem);
425
426
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
427
if (wake_type == RWSEM_WAKE_ANY) {
428
/*
 * Mark writer at the front of the queue for wakeup.
 * Until the task is actually awoken later by the
 * caller, other writers are able to steal it.
 * Readers, on the other hand, will block as they
 * will notice the queued writer.
 */
435
wake_q_add(wake_q, waiter->task);
436
lockevent_inc(rwsem_wake_writer);
437
}
438
439
return;
440
}
441
442
/*
443
* No reader wakeup if there are too many of them already.
444
*/
445
if (unlikely(atomic_long_read(&sem->count) < 0))
446
return;
447
448
/*
449
* Writers might steal the lock before we grant it to the next reader.
450
* We prefer to do the first reader grant before counting readers
451
* so we can bail out early if a writer stole the lock.
452
*/
453
if (wake_type != RWSEM_WAKE_READ_OWNED) {
454
struct task_struct *owner;
455
456
adjustment = RWSEM_READER_BIAS;
457
oldcount = atomic_long_fetch_add(adjustment, &sem->count);
458
if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
459
/*
460
* When we've been waiting "too" long (for writers
461
* to give up the lock), request a HANDOFF to
462
* force the issue.
463
*/
464
if (time_after(jiffies, waiter->timeout)) {
465
if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
466
adjustment -= RWSEM_FLAG_HANDOFF;
467
lockevent_inc(rwsem_rlock_handoff);
468
}
469
waiter->handoff_set = true;
470
}
471
472
atomic_long_add(-adjustment, &sem->count);
473
return;
474
}
475
/*
476
* Set it to reader-owned to give spinners an early
477
* indication that readers now have the lock.
478
* The reader nonspinnable bit seen at slowpath entry of
479
* the reader is copied over.
480
*/
481
owner = waiter->task;
482
__rwsem_set_reader_owned(sem, owner);
483
}
484
485
/*
 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
 * queue. We know that at least one reader will be woken, as we
 * accounted for that above. Note we increment the 'active part' of
 * the count by the number of readers before waking any processes up.
 *
 * This is an adaptation of the phase-fair R/W locks where at the
 * reader phase (first waiter is a reader), all readers are eligible
 * to acquire the lock at the same time irrespective of their order
 * in the queue. The writers acquire the lock according to their
 * order in the queue.
 *
 * We have to do wakeup in 2 passes to prevent the possibility that
 * the reader count may be decremented before it is incremented. This
 * is because the to-be-woken waiter may not have slept yet. So it
 * may see waiter->task cleared, finish its critical section and
 * do an unlock before the reader count increment.
 *
 * 1) Collect the read-waiters in a separate list, count them and
 *    fully increment the reader count in rwsem.
 * 2) For each waiter in the new list, clear waiter->task and
 *    put them into wake_q to be woken up later.
 */
508
INIT_LIST_HEAD(&wlist);
509
list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
510
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
511
continue;
512
513
woken++;
514
list_move_tail(&waiter->list, &wlist);
515
516
/*
517
* Limit # of readers that can be woken up per wakeup call.
518
*/
519
if (unlikely(woken >= MAX_READERS_WAKEUP))
520
break;
521
}
522
523
adjustment = woken * RWSEM_READER_BIAS - adjustment;
524
lockevent_cond_inc(rwsem_wake_reader, woken);
525
526
oldcount = atomic_long_read(&sem->count);
527
if (list_empty(&sem->wait_list)) {
528
/*
529
* Combined with list_move_tail() above, this implies
530
* rwsem_del_waiter().
531
*/
532
adjustment -= RWSEM_FLAG_WAITERS;
533
if (oldcount & RWSEM_FLAG_HANDOFF)
534
adjustment -= RWSEM_FLAG_HANDOFF;
535
} else if (woken) {
536
/*
537
* When we've woken a reader, we no longer need to force
538
* writers to give up the lock and we can clear HANDOFF.
539
*/
540
if (oldcount & RWSEM_FLAG_HANDOFF)
541
adjustment -= RWSEM_FLAG_HANDOFF;
542
}
543
544
if (adjustment)
545
atomic_long_add(adjustment, &sem->count);
546
547
/* 2nd pass */
548
list_for_each_entry_safe(waiter, tmp, &wlist, list) {
549
struct task_struct *tsk;
550
551
tsk = waiter->task;
552
get_task_struct(tsk);
553
554
/*
555
* Ensure calling get_task_struct() before setting the reader
556
* waiter to nil such that rwsem_down_read_slowpath() cannot
557
* race with do_exit() by always holding a reference count
558
* to the task to wakeup.
559
*/
560
smp_store_release(&waiter->task, NULL);
561
/*
562
* Ensure issuing the wakeup (either by us or someone else)
563
* after setting the reader waiter to nil.
564
*/
565
wake_q_add_safe(wake_q, tsk);
566
}
567
}
568
569
/*
 * Remove a waiter and try to wake up other waiters in the wait queue.
 * This function is called from the out_nolock path of both the reader and
 * writer slowpaths with wait_lock held. It releases the wait_lock and
 * optionally wakes up waiters before it returns.
 */
575
static inline void
576
rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
577
struct wake_q_head *wake_q)
578
__releases(&sem->wait_lock)
579
{
580
bool first = rwsem_first_waiter(sem) == waiter;
581
582
wake_q_init(wake_q);
583
584
/*
585
* If the wait_list isn't empty and the waiter to be deleted is
586
* the first waiter, we wake up the remaining waiters as they may
587
* be eligible to acquire or spin on the lock.
588
*/
589
if (rwsem_del_waiter(sem, waiter) && first)
590
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
591
raw_spin_unlock_irq(&sem->wait_lock);
592
if (!wake_q_empty(wake_q))
593
wake_up_q(wake_q);
594
}
595
596
/*
597
* This function must be called with the sem->wait_lock held to prevent
598
* race conditions between checking the rwsem wait list and setting the
599
* sem->count accordingly.
600
*
601
* Implies rwsem_del_waiter() on success.
602
*/
603
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
604
struct rwsem_waiter *waiter)
605
{
606
struct rwsem_waiter *first = rwsem_first_waiter(sem);
607
long count, new;
608
609
lockdep_assert_held(&sem->wait_lock);
610
611
count = atomic_long_read(&sem->count);
612
do {
613
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
614
615
if (has_handoff) {
616
/*
 * Honor the handoff bit and yield only when the first
 * waiter is the one that set it. Otherwise, we
 * still try to acquire the rwsem.
 */
621
if (first->handoff_set && (waiter != first))
622
return false;
623
}
624
625
new = count;
626
627
if (count & RWSEM_LOCK_MASK) {
628
/*
 * A waiter (first or not) can set the handoff bit
 * if it is an RT task or has waited in the wait
 * queue for too long.
 */
633
if (has_handoff || (!rt_or_dl_task(waiter->task) &&
634
!time_after(jiffies, waiter->timeout)))
635
return false;
636
637
new |= RWSEM_FLAG_HANDOFF;
638
} else {
639
new |= RWSEM_WRITER_LOCKED;
640
new &= ~RWSEM_FLAG_HANDOFF;
641
642
if (list_is_singular(&sem->wait_list))
643
new &= ~RWSEM_FLAG_WAITERS;
644
}
645
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
646
647
/*
648
* We have either acquired the lock with handoff bit cleared or set
649
* the handoff bit. Only the first waiter can have its handoff_set
650
* set here to enable optimistic spinning in slowpath loop.
651
*/
652
if (new & RWSEM_FLAG_HANDOFF) {
653
first->handoff_set = true;
654
lockevent_inc(rwsem_wlock_handoff);
655
return false;
656
}
657
658
/*
659
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
660
* success.
661
*/
662
list_del(&waiter->list);
663
rwsem_set_owner(sem);
664
return true;
665
}
666
667
/*
668
* The rwsem_spin_on_owner() function returns the following 4 values
669
* depending on the lock owner state.
670
* OWNER_NULL : owner is currently NULL
671
* OWNER_WRITER: when owner changes and is a writer
672
* OWNER_READER: when owner changes and the new owner may be a reader.
673
* OWNER_NONSPINNABLE:
674
* when optimistic spinning has to stop because either the
675
* owner stops running, is unknown, or its timeslice has
676
* been used up.
677
*/
678
enum owner_state {
679
OWNER_NULL = 1 << 0,
680
OWNER_WRITER = 1 << 1,
681
OWNER_READER = 1 << 2,
682
OWNER_NONSPINNABLE = 1 << 3,
683
};
684
685
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
686
/*
687
* Try to acquire write lock before the writer has been put on wait queue.
688
*/
689
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
690
{
691
long count = atomic_long_read(&sem->count);
692
693
while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
694
if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
695
count | RWSEM_WRITER_LOCKED)) {
696
rwsem_set_owner(sem);
697
lockevent_inc(rwsem_opt_lock);
698
return true;
699
}
700
}
701
return false;
702
}
703
704
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
705
{
706
struct task_struct *owner;
707
unsigned long flags;
708
bool ret = true;
709
710
if (need_resched()) {
711
lockevent_inc(rwsem_opt_fail);
712
return false;
713
}
714
715
/*
 * Disabled preemption is equivalent to an RCU read-side critical
 * section, thus the task_struct structure won't go away.
 */
719
owner = rwsem_owner_flags(sem, &flags);
720
/*
721
* Don't check the read-owner as the entry may be stale.
722
*/
723
if ((flags & RWSEM_NONSPINNABLE) ||
724
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
725
ret = false;
726
727
lockevent_cond_inc(rwsem_opt_fail, !ret);
728
return ret;
729
}
730
731
static inline enum owner_state
732
rwsem_owner_state(struct task_struct *owner, unsigned long flags)
733
{
734
if (flags & RWSEM_NONSPINNABLE)
735
return OWNER_NONSPINNABLE;
736
737
if (flags & RWSEM_READER_OWNED)
738
return OWNER_READER;
739
740
return owner ? OWNER_WRITER : OWNER_NULL;
741
}
742
743
static noinline enum owner_state
744
rwsem_spin_on_owner(struct rw_semaphore *sem)
745
{
746
struct task_struct *new, *owner;
747
unsigned long flags, new_flags;
748
enum owner_state state;
749
750
lockdep_assert_preemption_disabled();
751
752
owner = rwsem_owner_flags(sem, &flags);
753
state = rwsem_owner_state(owner, flags);
754
if (state != OWNER_WRITER)
755
return state;
756
757
for (;;) {
758
/*
 * When a waiting writer sets the handoff flag, it may spin
 * on the owner as well. Once that writer acquires the lock,
 * we can spin on it. So we don't need to quit even when the
 * handoff bit is set.
 */
764
new = rwsem_owner_flags(sem, &new_flags);
765
if ((new != owner) || (new_flags != flags)) {
766
state = rwsem_owner_state(new, new_flags);
767
break;
768
}
769
770
/*
 * Ensure we emit the owner->on_cpu dereference _after_
 * checking that sem->owner still matches owner. If that
 * fails, owner might point to free()d memory. If it still
 * matches, our spinning context has already disabled
 * preemption, which is equivalent to an RCU read-side
 * critical section and ensures the memory stays valid.
 */
778
barrier();
779
780
if (need_resched() || !owner_on_cpu(owner)) {
781
state = OWNER_NONSPINNABLE;
782
break;
783
}
784
785
cpu_relax();
786
}
787
788
return state;
789
}
790
791
/*
 * Calculate reader-owned rwsem spinning threshold for writer
 *
 * The more readers own the rwsem, the longer it will take for them to
 * wind down and free the rwsem. So the empirical formula used to
 * determine the actual spinning time limit here is:
 *
 *   Spinning threshold = (10 + nr_readers/2)us
 *
 * The limit is capped to a maximum of 25us (30 readers). This is just
 * a heuristic and is subject to change in the future.
 */
803
static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
804
{
805
long count = atomic_long_read(&sem->count);
806
int readers = count >> RWSEM_READER_SHIFT;
807
u64 delta;
808
809
if (readers > 30)
810
readers = 30;
811
delta = (20 + readers) * NSEC_PER_USEC / 2;
812
813
return sched_clock() + delta;
814
}
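/*
 * Worked example of the heuristic above, delta = (20 + readers) * NSEC_PER_USEC / 2:
 * 0 readers give a 10us budget, 10 readers give 15us and the 30-reader cap
 * gives the 25us maximum quoted in the comment.
 */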
815
816
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
817
{
818
bool taken = false;
819
int prev_owner_state = OWNER_NULL;
820
int loop = 0;
821
u64 rspin_threshold = 0;
822
823
/* sem->wait_lock should not be held when doing optimistic spinning */
824
if (!osq_lock(&sem->osq))
825
goto done;
826
827
/*
828
* Optimistically spin on the owner field and attempt to acquire the
829
* lock whenever the owner changes. Spinning will be stopped when:
830
* 1) the owning writer isn't running; or
831
* 2) readers own the lock and spinning time has exceeded limit.
832
*/
833
for (;;) {
834
enum owner_state owner_state;
835
836
owner_state = rwsem_spin_on_owner(sem);
837
if (owner_state == OWNER_NONSPINNABLE)
838
break;
839
840
/*
841
* Try to acquire the lock
842
*/
843
taken = rwsem_try_write_lock_unqueued(sem);
844
845
if (taken)
846
break;
847
848
/*
849
* Time-based reader-owned rwsem optimistic spinning
850
*/
851
if (owner_state == OWNER_READER) {
852
/*
853
* Re-initialize rspin_threshold every time when
854
* the owner state changes from non-reader to reader.
855
* This allows a writer to steal the lock in between
856
* 2 reader phases and have the threshold reset at
857
* the beginning of the 2nd reader phase.
858
*/
859
if (prev_owner_state != OWNER_READER) {
860
if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
861
break;
862
rspin_threshold = rwsem_rspin_threshold(sem);
863
loop = 0;
864
}
865
866
/*
867
* Check time threshold once every 16 iterations to
868
* avoid calling sched_clock() too frequently so
869
* as to reduce the average latency between the times
870
* when the lock becomes free and when the spinner
871
* is ready to do a trylock.
872
*/
873
else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
874
rwsem_set_nonspinnable(sem);
875
lockevent_inc(rwsem_opt_nospin);
876
break;
877
}
878
}
879
880
/*
 * An RT task cannot do optimistic spinning if it cannot
 * be sure the lock holder is running or live-lock may
 * happen if the current task and the lock holder happen
 * to run on the same CPU. However, aborting optimistic
 * spinning while a NULL owner is detected may miss some
 * opportunity where spinning can continue without causing
 * a problem.
 *
 * There are 2 possible cases where an RT task may be able
 * to continue spinning.
 *
 * 1) The lock owner is in the process of releasing the
 *    lock, sem->owner is cleared but the lock has not
 *    been released yet.
 * 2) The lock was free and owner cleared, but another
 *    task just comes in and acquires the lock before
 *    we try to get it. The new owner may be a spinnable
 *    writer.
 *
 * To take advantage of the two scenarios listed above, the RT
 * task is made to retry one more time to see if it can
 * acquire the lock or continue spinning on the new owning
 * writer. Of course, if the time lag is long enough or the
 * new owner is not a writer or spinnable, the RT task will
 * quit spinning.
 *
 * If the owner is a writer, the need_resched() check is
 * done inside rwsem_spin_on_owner(). If the owner is not
 * a writer, the need_resched() check needs to be done here.
 */
911
if (owner_state != OWNER_WRITER) {
912
if (need_resched())
913
break;
914
if (rt_or_dl_task(current) &&
915
(prev_owner_state != OWNER_WRITER))
916
break;
917
}
918
prev_owner_state = owner_state;
919
920
/*
921
* The cpu_relax() call is a compiler barrier which forces
922
* everything in this loop to be re-loaded. We don't need
923
* memory barriers as we'll eventually observe the right
924
* values at the cost of a few extra spins.
925
*/
926
cpu_relax();
927
}
928
osq_unlock(&sem->osq);
929
done:
930
lockevent_cond_inc(rwsem_opt_fail, !taken);
931
return taken;
932
}
933
934
/*
935
* Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
936
* only be called when the reader count reaches 0.
937
*/
938
static inline void clear_nonspinnable(struct rw_semaphore *sem)
939
{
940
if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
941
atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
942
}
943
944
#else
945
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
946
{
947
return false;
948
}
949
950
static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
951
{
952
return false;
953
}
954
955
static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
956
957
static inline enum owner_state
958
rwsem_spin_on_owner(struct rw_semaphore *sem)
959
{
960
return OWNER_NONSPINNABLE;
961
}
962
#endif
963
964
/*
 * Prepare to wake up waiter(s) in the wait queue by putting them into the
 * given wake_q if the rwsem lock owner isn't a writer. If the rwsem is
 * likely reader-owned, wake up the read lock waiters at the front of the
 * queue; otherwise, wake up whichever waiter is at the front.
 *
 * This is called from both the reader and writer slow paths.
 */
972
static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
973
struct wake_q_head *wake_q)
974
{
975
enum rwsem_wake_type wake_type;
976
977
if (count & RWSEM_WRITER_MASK)
978
return;
979
980
if (count & RWSEM_READER_MASK) {
981
wake_type = RWSEM_WAKE_READERS;
982
} else {
983
wake_type = RWSEM_WAKE_ANY;
984
clear_nonspinnable(sem);
985
}
986
rwsem_mark_wake(sem, wake_type, wake_q);
987
}
988
989
/*
990
* Wait for the read lock to be granted
991
*/
992
static struct rw_semaphore __sched *
993
rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
994
{
995
long adjustment = -RWSEM_READER_BIAS;
996
long rcnt = (count >> RWSEM_READER_SHIFT);
997
struct rwsem_waiter waiter;
998
DEFINE_WAKE_Q(wake_q);
999
1000
/*
1001
* To prevent a constant stream of readers from starving a sleeping
1002
* writer, don't attempt optimistic lock stealing if the lock is
1003
* very likely owned by readers.
1004
*/
1005
if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
1006
(rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
1007
goto queue;
1008
1009
/*
1010
* Reader optimistic lock stealing.
1011
*/
1012
if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
1013
rwsem_set_reader_owned(sem);
1014
lockevent_inc(rwsem_rlock_steal);
1015
1016
/*
1017
* Wake up other readers in the wait queue if it is
1018
* the first reader.
1019
*/
1020
if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
1021
raw_spin_lock_irq(&sem->wait_lock);
1022
if (!list_empty(&sem->wait_list))
1023
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1024
&wake_q);
1025
raw_spin_unlock_irq(&sem->wait_lock);
1026
wake_up_q(&wake_q);
1027
}
1028
return sem;
1029
}
1030
1031
queue:
1032
waiter.task = current;
1033
waiter.type = RWSEM_WAITING_FOR_READ;
1034
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1035
waiter.handoff_set = false;
1036
1037
raw_spin_lock_irq(&sem->wait_lock);
1038
if (list_empty(&sem->wait_list)) {
1039
/*
1040
* In case the wait queue is empty and the lock isn't owned
1041
* by a writer, this reader can exit the slowpath and return
1042
* immediately as its RWSEM_READER_BIAS has already been set
1043
* in the count.
1044
*/
1045
if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
1046
/* Provide lock ACQUIRE */
1047
smp_acquire__after_ctrl_dep();
1048
raw_spin_unlock_irq(&sem->wait_lock);
1049
rwsem_set_reader_owned(sem);
1050
lockevent_inc(rwsem_rlock_fast);
1051
return sem;
1052
}
1053
adjustment += RWSEM_FLAG_WAITERS;
1054
}
1055
rwsem_add_waiter(sem, &waiter);
1056
1057
/* we're now waiting on the lock, but no longer actively locking */
1058
count = atomic_long_add_return(adjustment, &sem->count);
1059
1060
rwsem_cond_wake_waiter(sem, count, &wake_q);
1061
raw_spin_unlock_irq(&sem->wait_lock);
1062
1063
if (!wake_q_empty(&wake_q))
1064
wake_up_q(&wake_q);
1065
1066
trace_contention_begin(sem, LCB_F_READ);
1067
set_current_state(state);
1068
1069
if (state == TASK_UNINTERRUPTIBLE)
1070
hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
1071
1072
/* wait to be given the lock */
1073
for (;;) {
1074
if (!smp_load_acquire(&waiter.task)) {
1075
/* Matches rwsem_mark_wake()'s smp_store_release(). */
1076
break;
1077
}
1078
if (signal_pending_state(state, current)) {
1079
raw_spin_lock_irq(&sem->wait_lock);
1080
if (waiter.task)
1081
goto out_nolock;
1082
raw_spin_unlock_irq(&sem->wait_lock);
1083
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
1084
break;
1085
}
1086
schedule_preempt_disabled();
1087
lockevent_inc(rwsem_sleep_reader);
1088
set_current_state(state);
1089
}
1090
1091
if (state == TASK_UNINTERRUPTIBLE)
1092
hung_task_clear_blocker();
1093
1094
__set_current_state(TASK_RUNNING);
1095
lockevent_inc(rwsem_rlock);
1096
trace_contention_end(sem, 0);
1097
return sem;
1098
1099
out_nolock:
1100
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1101
__set_current_state(TASK_RUNNING);
1102
lockevent_inc(rwsem_rlock_fail);
1103
trace_contention_end(sem, -EINTR);
1104
return ERR_PTR(-EINTR);
1105
}
1106
1107
/*
1108
* Wait until we successfully acquire the write lock
1109
*/
1110
static struct rw_semaphore __sched *
1111
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1112
{
1113
struct rwsem_waiter waiter;
1114
DEFINE_WAKE_Q(wake_q);
1115
1116
/* do optimistic spinning and steal lock if possible */
1117
if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1118
/* rwsem_optimistic_spin() implies ACQUIRE on success */
1119
return sem;
1120
}
1121
1122
/*
1123
* Optimistic spinning failed, proceed to the slowpath
1124
* and block until we can acquire the sem.
1125
*/
1126
waiter.task = current;
1127
waiter.type = RWSEM_WAITING_FOR_WRITE;
1128
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1129
waiter.handoff_set = false;
1130
1131
raw_spin_lock_irq(&sem->wait_lock);
1132
rwsem_add_waiter(sem, &waiter);
1133
1134
/* we're now waiting on the lock */
1135
if (rwsem_first_waiter(sem) != &waiter) {
1136
rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
1137
&wake_q);
1138
if (!wake_q_empty(&wake_q)) {
1139
/*
1140
* We want to minimize wait_lock hold time especially
1141
* when a large number of readers are to be woken up.
1142
*/
1143
raw_spin_unlock_irq(&sem->wait_lock);
1144
wake_up_q(&wake_q);
1145
raw_spin_lock_irq(&sem->wait_lock);
1146
}
1147
} else {
1148
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1149
}
1150
1151
/* wait until we successfully acquire the lock */
1152
set_current_state(state);
1153
trace_contention_begin(sem, LCB_F_WRITE);
1154
1155
if (state == TASK_UNINTERRUPTIBLE)
1156
hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
1157
1158
for (;;) {
1159
if (rwsem_try_write_lock(sem, &waiter)) {
1160
/* rwsem_try_write_lock() implies ACQUIRE on success */
1161
break;
1162
}
1163
1164
raw_spin_unlock_irq(&sem->wait_lock);
1165
1166
if (signal_pending_state(state, current))
1167
goto out_nolock;
1168
1169
/*
 * After setting the handoff bit and failing to acquire
 * the lock, attempt to spin on the owner to accelerate the
 * lock transfer. If the previous owner is an on-cpu writer
 * and it has just released the lock, OWNER_NULL will be
 * returned. In this case, we attempt to acquire the lock
 * again without sleeping.
 */
1177
if (waiter.handoff_set) {
1178
enum owner_state owner_state;
1179
1180
owner_state = rwsem_spin_on_owner(sem);
1181
if (owner_state == OWNER_NULL)
1182
goto trylock_again;
1183
}
1184
1185
schedule_preempt_disabled();
1186
lockevent_inc(rwsem_sleep_writer);
1187
set_current_state(state);
1188
trylock_again:
1189
raw_spin_lock_irq(&sem->wait_lock);
1190
}
1191
1192
if (state == TASK_UNINTERRUPTIBLE)
1193
hung_task_clear_blocker();
1194
1195
__set_current_state(TASK_RUNNING);
1196
raw_spin_unlock_irq(&sem->wait_lock);
1197
lockevent_inc(rwsem_wlock);
1198
trace_contention_end(sem, 0);
1199
return sem;
1200
1201
out_nolock:
1202
__set_current_state(TASK_RUNNING);
1203
raw_spin_lock_irq(&sem->wait_lock);
1204
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1205
lockevent_inc(rwsem_wlock_fail);
1206
trace_contention_end(sem, -EINTR);
1207
return ERR_PTR(-EINTR);
1208
}
1209
1210
/*
1211
* handle waking up a waiter on the semaphore
1212
* - up_read/up_write has decremented the active part of count if we come here
1213
*/
1214
static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
1215
{
1216
unsigned long flags;
1217
DEFINE_WAKE_Q(wake_q);
1218
1219
raw_spin_lock_irqsave(&sem->wait_lock, flags);
1220
1221
if (!list_empty(&sem->wait_list))
1222
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1223
1224
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1225
wake_up_q(&wake_q);
1226
1227
return sem;
1228
}
1229
1230
/*
1231
* downgrade a write lock into a read lock
1232
* - caller incremented waiting part of count and discovered it still negative
1233
* - just wake up any readers at the front of the queue
1234
*/
1235
static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1236
{
1237
unsigned long flags;
1238
DEFINE_WAKE_Q(wake_q);
1239
1240
raw_spin_lock_irqsave(&sem->wait_lock, flags);
1241
1242
if (!list_empty(&sem->wait_list))
1243
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1244
1245
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1246
wake_up_q(&wake_q);
1247
1248
return sem;
1249
}
1250
1251
/*
1252
* lock for reading
1253
*/
1254
static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
1255
{
1256
int ret = 0;
1257
long count;
1258
1259
preempt_disable();
1260
if (!rwsem_read_trylock(sem, &count)) {
1261
if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
1262
ret = -EINTR;
1263
goto out;
1264
}
1265
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1266
}
1267
out:
1268
preempt_enable();
1269
return ret;
1270
}
1271
1272
static __always_inline void __down_read(struct rw_semaphore *sem)
1273
{
1274
__down_read_common(sem, TASK_UNINTERRUPTIBLE);
1275
}
1276
1277
static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
1278
{
1279
return __down_read_common(sem, TASK_INTERRUPTIBLE);
1280
}
1281
1282
static __always_inline int __down_read_killable(struct rw_semaphore *sem)
1283
{
1284
return __down_read_common(sem, TASK_KILLABLE);
1285
}
1286
1287
static inline int __down_read_trylock(struct rw_semaphore *sem)
1288
{
1289
int ret = 0;
1290
long tmp;
1291
1292
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1293
1294
preempt_disable();
1295
tmp = atomic_long_read(&sem->count);
1296
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
1297
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1298
tmp + RWSEM_READER_BIAS)) {
1299
rwsem_set_reader_owned(sem);
1300
ret = 1;
1301
break;
1302
}
1303
}
1304
preempt_enable();
1305
return ret;
1306
}
1307
1308
/*
1309
* lock for writing
1310
*/
1311
static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
1312
{
1313
int ret = 0;
1314
1315
preempt_disable();
1316
if (unlikely(!rwsem_write_trylock(sem))) {
1317
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
1318
ret = -EINTR;
1319
}
1320
preempt_enable();
1321
return ret;
1322
}
1323
1324
static __always_inline void __down_write(struct rw_semaphore *sem)
1325
{
1326
__down_write_common(sem, TASK_UNINTERRUPTIBLE);
1327
}
1328
1329
static __always_inline int __down_write_killable(struct rw_semaphore *sem)
1330
{
1331
return __down_write_common(sem, TASK_KILLABLE);
1332
}
1333
1334
static inline int __down_write_trylock(struct rw_semaphore *sem)
1335
{
1336
int ret;
1337
1338
preempt_disable();
1339
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1340
ret = rwsem_write_trylock(sem);
1341
preempt_enable();
1342
1343
return ret;
1344
}
1345
1346
/*
1347
* unlock after reading
1348
*/
1349
static inline void __up_read(struct rw_semaphore *sem)
1350
{
1351
long tmp;
1352
1353
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1354
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1355
1356
preempt_disable();
1357
rwsem_clear_reader_owned(sem);
1358
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1359
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1360
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1361
RWSEM_FLAG_WAITERS)) {
1362
clear_nonspinnable(sem);
1363
rwsem_wake(sem);
1364
}
1365
preempt_enable();
1366
}
1367
1368
/*
1369
* unlock after writing
1370
*/
1371
static inline void __up_write(struct rw_semaphore *sem)
1372
{
1373
long tmp;
1374
1375
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1376
/*
1377
* sem->owner may differ from current if the ownership is transferred
1378
* to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
1379
*/
1380
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1381
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1382
1383
preempt_disable();
1384
rwsem_clear_owner(sem);
1385
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1386
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1387
rwsem_wake(sem);
1388
preempt_enable();
1389
}
1390
1391
/*
1392
* downgrade write lock to read lock
1393
*/
1394
static inline void __downgrade_write(struct rw_semaphore *sem)
1395
{
1396
long tmp;
1397
1398
/*
1399
* When downgrading from exclusive to shared ownership,
1400
* anything inside the write-locked region cannot leak
1401
* into the read side. In contrast, anything in the
1402
* read-locked region is ok to be re-ordered into the
1403
* write side. As such, rely on RELEASE semantics.
1404
*/
1405
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1406
preempt_disable();
1407
tmp = atomic_long_fetch_add_release(
1408
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1409
rwsem_set_reader_owned(sem);
1410
if (tmp & RWSEM_FLAG_WAITERS)
1411
rwsem_downgrade_wake(sem);
1412
preempt_enable();
1413
}
1414
1415
#else /* !CONFIG_PREEMPT_RT */
1416
1417
#define RT_MUTEX_BUILD_MUTEX
1418
#include "rtmutex.c"
1419
1420
#define rwbase_set_and_save_current_state(state) \
1421
set_current_state(state)
1422
1423
#define rwbase_restore_current_state() \
1424
__set_current_state(TASK_RUNNING)
1425
1426
#define rwbase_rtmutex_lock_state(rtm, state) \
1427
__rt_mutex_lock(rtm, state)
1428
1429
#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \
1430
__rt_mutex_slowlock_locked(rtm, NULL, state, wq)
1431
1432
#define rwbase_rtmutex_unlock(rtm) \
1433
__rt_mutex_unlock(rtm)
1434
1435
#define rwbase_rtmutex_trylock(rtm) \
1436
__rt_mutex_trylock(rtm)
1437
1438
#define rwbase_signal_pending_state(state, current) \
1439
signal_pending_state(state, current)
1440
1441
#define rwbase_pre_schedule() \
1442
rt_mutex_pre_schedule()
1443
1444
#define rwbase_schedule() \
1445
rt_mutex_schedule()
1446
1447
#define rwbase_post_schedule() \
1448
rt_mutex_post_schedule()
1449
1450
#include "rwbase_rt.c"
1451
1452
void __init_rwsem(struct rw_semaphore *sem, const char *name,
1453
struct lock_class_key *key)
1454
{
1455
init_rwbase_rt(&(sem)->rwbase);
1456
1457
#ifdef CONFIG_DEBUG_LOCK_ALLOC
1458
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
1459
lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
1460
#endif
1461
}
1462
EXPORT_SYMBOL(__init_rwsem);
1463
1464
static inline void __down_read(struct rw_semaphore *sem)
1465
{
1466
rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1467
}
1468
1469
static inline int __down_read_interruptible(struct rw_semaphore *sem)
1470
{
1471
return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
1472
}
1473
1474
static inline int __down_read_killable(struct rw_semaphore *sem)
1475
{
1476
return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
1477
}
1478
1479
static inline int __down_read_trylock(struct rw_semaphore *sem)
1480
{
1481
return rwbase_read_trylock(&sem->rwbase);
1482
}
1483
1484
static inline void __up_read(struct rw_semaphore *sem)
1485
{
1486
rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
1487
}
1488
1489
static inline void __sched __down_write(struct rw_semaphore *sem)
1490
{
1491
rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1492
}
1493
1494
static inline int __sched __down_write_killable(struct rw_semaphore *sem)
1495
{
1496
return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
1497
}
1498
1499
static inline int __down_write_trylock(struct rw_semaphore *sem)
1500
{
1501
return rwbase_write_trylock(&sem->rwbase);
1502
}
1503
1504
static inline void __up_write(struct rw_semaphore *sem)
1505
{
1506
rwbase_write_unlock(&sem->rwbase);
1507
}
1508
1509
static inline void __downgrade_write(struct rw_semaphore *sem)
1510
{
1511
rwbase_write_downgrade(&sem->rwbase);
1512
}
1513
1514
/* Debug stubs for the common API */
1515
#define DEBUG_RWSEMS_WARN_ON(c, sem)
1516
1517
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
1518
struct task_struct *owner)
1519
{
1520
}
1521
1522
static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1523
{
1524
int count = atomic_read(&sem->rwbase.readers);
1525
1526
return count < 0 && count != READER_BIAS;
1527
}
1528
1529
#endif /* CONFIG_PREEMPT_RT */
1530
1531
/*
1532
* lock for reading
1533
*/
1534
void __sched down_read(struct rw_semaphore *sem)
1535
{
1536
might_sleep();
1537
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1538
1539
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1540
}
1541
EXPORT_SYMBOL(down_read);
1542
1543
int __sched down_read_interruptible(struct rw_semaphore *sem)
1544
{
1545
might_sleep();
1546
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1547
1548
if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
1549
rwsem_release(&sem->dep_map, _RET_IP_);
1550
return -EINTR;
1551
}
1552
1553
return 0;
1554
}
1555
EXPORT_SYMBOL(down_read_interruptible);
1556
1557
int __sched down_read_killable(struct rw_semaphore *sem)
1558
{
1559
might_sleep();
1560
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1561
1562
if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1563
rwsem_release(&sem->dep_map, _RET_IP_);
1564
return -EINTR;
1565
}
1566
1567
return 0;
1568
}
1569
EXPORT_SYMBOL(down_read_killable);
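/*
 * Illustrative usage sketch (not part of the kernel build): the killable
 * and interruptible variants return -EINTR when a (fatal) signal ends the
 * wait, so the caller must check the return value. "my_sem" is a made-up
 * name used only for this example.
 *
 *	if (down_read_killable(&my_sem))
 *		return -EINTR;
 *	... read-side critical section ...
 *	up_read(&my_sem);
 *
 * down_read_interruptible() and down_write_killable() follow the same
 * pattern.
 */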
1570
1571
/*
1572
* trylock for reading -- returns 1 if successful, 0 if contention
1573
*/
1574
int down_read_trylock(struct rw_semaphore *sem)
1575
{
1576
int ret = __down_read_trylock(sem);
1577
1578
if (ret == 1)
1579
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1580
return ret;
1581
}
1582
EXPORT_SYMBOL(down_read_trylock);
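/*
 * Illustrative usage sketch (not part of the kernel build): the trylocks
 * return 1 on success and 0 on contention (the opposite sense of the
 * killable variants, which return 0 on success). "my_sem" is a made-up
 * name used only for this example.
 *
 *	if (down_read_trylock(&my_sem)) {
 *		... read-side critical section ...
 *		up_read(&my_sem);
 *	} else {
 *		... lock is contended: defer the work or take a slow path ...
 *	}
 */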
1583
1584
/*
1585
* lock for writing
1586
*/
1587
void __sched down_write(struct rw_semaphore *sem)
1588
{
1589
might_sleep();
1590
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1591
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1592
}
1593
EXPORT_SYMBOL(down_write);
1594
1595
/*
1596
* lock for writing
1597
*/
1598
int __sched down_write_killable(struct rw_semaphore *sem)
1599
{
1600
might_sleep();
1601
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1602
1603
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1604
__down_write_killable)) {
1605
rwsem_release(&sem->dep_map, _RET_IP_);
1606
return -EINTR;
1607
}
1608
1609
return 0;
1610
}
1611
EXPORT_SYMBOL(down_write_killable);
1612
1613
/*
1614
* trylock for writing -- returns 1 if successful, 0 if contention
1615
*/
1616
int down_write_trylock(struct rw_semaphore *sem)
1617
{
1618
int ret = __down_write_trylock(sem);
1619
1620
if (ret == 1)
1621
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1622
1623
return ret;
1624
}
1625
EXPORT_SYMBOL(down_write_trylock);
1626
1627
/*
1628
* release a read lock
1629
*/
1630
void up_read(struct rw_semaphore *sem)
1631
{
1632
rwsem_release(&sem->dep_map, _RET_IP_);
1633
__up_read(sem);
1634
}
1635
EXPORT_SYMBOL(up_read);
1636
1637
/*
1638
* release a write lock
1639
*/
1640
void up_write(struct rw_semaphore *sem)
1641
{
1642
rwsem_release(&sem->dep_map, _RET_IP_);
1643
__up_write(sem);
1644
}
1645
EXPORT_SYMBOL(up_write);
1646
1647
/*
1648
* downgrade write lock to read lock
1649
*/
1650
void downgrade_write(struct rw_semaphore *sem)
1651
{
1652
lock_downgrade(&sem->dep_map, _RET_IP_);
1653
__downgrade_write(sem);
1654
}
1655
EXPORT_SYMBOL(downgrade_write);
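/*
 * Illustrative usage sketch (not part of the kernel build): a writer that
 * has finished updating but still needs a stable view of the data can
 * downgrade without ever dropping the lock, which lets queued readers in
 * early. "my_sem" is a made-up name used only for this example.
 *
 *	down_write(&my_sem);
 *	... modify the protected data ...
 *	downgrade_write(&my_sem);	// now held for read
 *	... keep reading the now-stable data ...
 *	up_read(&my_sem);
 *
 * There is deliberately no reverse (read-to-write upgrade) operation, as
 * two readers upgrading at the same time would deadlock.
 */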
1656
1657
#ifdef CONFIG_DEBUG_LOCK_ALLOC
1658
1659
void down_read_nested(struct rw_semaphore *sem, int subclass)
1660
{
1661
might_sleep();
1662
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1663
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1664
}
1665
EXPORT_SYMBOL(down_read_nested);
1666
1667
int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
1668
{
1669
might_sleep();
1670
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1671
1672
if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1673
rwsem_release(&sem->dep_map, _RET_IP_);
1674
return -EINTR;
1675
}
1676
1677
return 0;
1678
}
1679
EXPORT_SYMBOL(down_read_killable_nested);
1680
1681
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1682
{
1683
might_sleep();
1684
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1685
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1686
}
1687
EXPORT_SYMBOL(_down_write_nest_lock);
1688
1689
void down_read_non_owner(struct rw_semaphore *sem)
1690
{
1691
might_sleep();
1692
__down_read(sem);
1693
/*
 * The owner value for a reader-owned lock is mostly for debugging
 * purposes only and is not critical to the correct functioning of
 * the rwsem. So it is perfectly fine to set it in a preempt-enabled
 * context here.
 */
1699
__rwsem_set_reader_owned(sem, NULL);
1700
}
1701
EXPORT_SYMBOL(down_read_non_owner);
1702
1703
void down_write_nested(struct rw_semaphore *sem, int subclass)
1704
{
1705
might_sleep();
1706
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1707
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1708
}
1709
EXPORT_SYMBOL(down_write_nested);
1710
1711
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1712
{
1713
might_sleep();
1714
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1715
1716
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1717
__down_write_killable)) {
1718
rwsem_release(&sem->dep_map, _RET_IP_);
1719
return -EINTR;
1720
}
1721
1722
return 0;
1723
}
1724
EXPORT_SYMBOL(down_write_killable_nested);
1725
1726
void up_read_non_owner(struct rw_semaphore *sem)
1727
{
1728
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1729
__up_read(sem);
1730
}
1731
EXPORT_SYMBOL(up_read_non_owner);
1732
1733
#endif
1734
1735