GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_map.c
1
/*-
2
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3
*
4
* Copyright (c) 1991, 1993
5
* The Regents of the University of California. All rights reserved.
6
*
7
* This code is derived from software contributed to Berkeley by
8
* The Mach Operating System project at Carnegie-Mellon University.
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. Neither the name of the University nor the names of its contributors
19
* may be used to endorse or promote products derived from this software
20
* without specific prior written permission.
21
*
22
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
* SUCH DAMAGE.
33
*
34
*
35
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
36
* All rights reserved.
37
*
38
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
39
*
40
* Permission to use, copy, modify and distribute this software and
41
* its documentation is hereby granted, provided that both the copyright
42
* notice and this permission notice appear in all copies of the
43
* software, derivative works or modified versions, and any portions
44
* thereof, and that both notices appear in supporting documentation.
45
*
46
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49
*
50
* Carnegie Mellon requests users of this software to return to
51
*
52
* Software Distribution Coordinator or [email protected]
53
* School of Computer Science
54
* Carnegie Mellon University
55
* Pittsburgh PA 15213-3890
56
*
57
* any improvements or extensions that they make and grant Carnegie the
58
* rights to redistribute these changes.
59
*/

/*
 *	Virtual memory mapping module.
 */

#include <sys/param.h>
66
#include <sys/systm.h>
67
#include <sys/elf.h>
68
#include <sys/kernel.h>
69
#include <sys/ktr.h>
70
#include <sys/lock.h>
71
#include <sys/mutex.h>
72
#include <sys/proc.h>
73
#include <sys/vmmeter.h>
74
#include <sys/mman.h>
75
#include <sys/vnode.h>
76
#include <sys/racct.h>
77
#include <sys/resourcevar.h>
78
#include <sys/rwlock.h>
79
#include <sys/file.h>
80
#include <sys/sysctl.h>
81
#include <sys/sysent.h>
82
#include <sys/shm.h>
83
84
#include <vm/vm.h>
85
#include <vm/vm_param.h>
86
#include <vm/pmap.h>
87
#include <vm/vm_map.h>
88
#include <vm/vm_page.h>
89
#include <vm/vm_pageout.h>
90
#include <vm/vm_object.h>
91
#include <vm/vm_pager.h>
92
#include <vm/vm_radix.h>
93
#include <vm/vm_kern.h>
94
#include <vm/vm_extern.h>
95
#include <vm/vnode_pager.h>
96
#include <vm/swap_pager.h>
97
#include <vm/uma.h>

/*
 *	Virtual memory maps provide for the mapping, protection,
 *	and sharing of virtual memory objects.  In addition,
 *	this module provides for an efficient virtual copy of
 *	memory from one map to another.
 *
 *	Synchronization is required prior to most operations.
 *
 *	Maps consist of an ordered doubly-linked list of simple
 *	entries; a self-adjusting binary search tree of these
 *	entries is used to speed up lookups.
 *
 *	Since portions of maps are specified by start/end addresses,
 *	which may not align with existing map entries, all
 *	routines merely "clip" entries to these start/end values.
 *	[That is, an entry is split into two, bordering at a
 *	start or end value.]  Note that these clippings may not
 *	always be necessary (as the two resulting entries are then
 *	not changed); however, the clipping is done for convenience.
 *
 *	As mentioned above, virtual copy operations are performed
 *	by copying VM object references from one map to
 *	another, and then marking both regions as copy-on-write.
 */

static struct mtx map_sleep_mtx;
125
static uma_zone_t mapentzone;
126
static uma_zone_t kmapentzone;
127
static uma_zone_t vmspace_zone;
128
static int vmspace_zinit(void *mem, int size, int flags);
129
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
130
vm_offset_t max);
131
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
132
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
133
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
134
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
135
vm_map_entry_t gap_entry);
136
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
137
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
138
#ifdef INVARIANTS
139
static void vmspace_zdtor(void *mem, int size, void *arg);
140
#endif
141
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
142
vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
143
int cow);
144
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
145
vm_offset_t failed_addr);
146
147
#define CONTAINS_BITS(set, bits) ((~(set) & (bits)) == 0)
148
149
#define ENTRY_CHARGED(e) ((e)->cred != NULL || \
150
((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
151
!((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
152
153
/*
154
* PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
155
* stable.
156
*/
157
#define PROC_VMSPACE_LOCK(p) do { } while (0)
158
#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
159
160
/*
161
* VM_MAP_RANGE_CHECK: [ internal use only ]
162
*
163
* Asserts that the starting and ending region
164
* addresses fall within the valid range of the map.
165
*/
166
#define VM_MAP_RANGE_CHECK(map, start, end) \
167
{ \
168
if (start < vm_map_min(map)) \
169
start = vm_map_min(map); \
170
if (end > vm_map_max(map)) \
171
end = vm_map_max(map); \
172
if (start > end) \
173
start = end; \
174
}
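
/*
 * Editor's note (illustrative only, not part of the original source): for
 * a hypothetical map whose valid range is [0x1000, 0x8000), a caller
 * passing start = 0 and end = 0x9000 has both values clamped to that
 * range; if the clamped start still exceeds the clamped end, the range
 * collapses to the empty range [end, end).
 */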
175
176
#ifndef UMA_USE_DMAP
177
178
/*
179
* Allocate a new slab for kernel map entries. The kernel map may be locked or
180
* unlocked, depending on whether the request is coming from the kernel map or a
181
* submap. This function allocates a virtual address range directly from the
182
* kernel map instead of the kmem_* layer to avoid recursion on the kernel map
183
* lock and also to avoid triggering allocator recursion in the vmem boundary
184
* tag allocator.
185
*/
186
static void *
187
kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
188
int wait)
189
{
190
vm_offset_t addr;
191
int error, locked;
192
193
*pflag = UMA_SLAB_PRIV;
194
195
if (!(locked = vm_map_locked(kernel_map)))
196
vm_map_lock(kernel_map);
197
addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
198
if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
199
panic("%s: kernel map is exhausted", __func__);
200
error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
201
VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
202
if (error != KERN_SUCCESS)
203
panic("%s: vm_map_insert() failed: %d", __func__, error);
204
if (!locked)
205
vm_map_unlock(kernel_map);
206
error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
207
M_USE_RESERVE | (wait & M_ZERO));
208
if (error == KERN_SUCCESS) {
209
return ((void *)addr);
210
} else {
211
if (!locked)
212
vm_map_lock(kernel_map);
213
vm_map_delete(kernel_map, addr, bytes);
214
if (!locked)
215
vm_map_unlock(kernel_map);
216
return (NULL);
217
}
218
}
219
220
static void
221
kmapent_free(void *item, vm_size_t size, uint8_t pflag)
222
{
223
vm_offset_t addr;
224
int error __diagused;
225
226
if ((pflag & UMA_SLAB_PRIV) == 0)
227
/* XXX leaked */
228
return;
229
230
addr = (vm_offset_t)item;
231
kmem_unback(kernel_object, addr, size);
232
error = vm_map_remove(kernel_map, addr, addr + size);
233
KASSERT(error == KERN_SUCCESS,
234
("%s: vm_map_remove failed: %d", __func__, error));
235
}
236
237
/*
238
* The worst-case upper bound on the number of kernel map entries that may be
239
* created before the zone must be replenished in _vm_map_unlock().
240
*/
241
#define KMAPENT_RESERVE 1
242
243
#endif /* !UMA_USE_DMAP */
244
245
/*
246
* vm_map_startup:
247
*
248
* Initialize the vm_map module. Must be called before any other vm_map
249
* routines.
250
*
251
* User map and entry structures are allocated from the general purpose
252
* memory pool. Kernel maps are statically defined. Kernel map entries
253
* require special handling to avoid recursion; see the comments above
254
* kmapent_alloc() and in vm_map_entry_create().
255
*/
256
void
257
vm_map_startup(void)
258
{
259
mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
260
261
/*
262
* Disable the use of per-CPU buckets: map entry allocation is
263
* serialized by the kernel map lock.
264
*/
265
kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
266
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
267
UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
268
#ifndef UMA_USE_DMAP
269
/* Reserve an extra map entry for use when replenishing the reserve. */
270
uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
271
uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
272
uma_zone_set_allocf(kmapentzone, kmapent_alloc);
273
uma_zone_set_freef(kmapentzone, kmapent_free);
274
#endif
275
276
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
277
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
278
vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
279
#ifdef INVARIANTS
280
vmspace_zdtor,
281
#else
282
NULL,
283
#endif
284
vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
285
}
286
287
static int
288
vmspace_zinit(void *mem, int size, int flags)
289
{
290
struct vmspace *vm;
291
vm_map_t map;
292
293
vm = (struct vmspace *)mem;
294
map = &vm->vm_map;
295
296
memset(map, 0, sizeof(*map)); /* set MAP_SYSTEM_MAP to false */
297
sx_init(&map->lock, "vm map (user)");
298
PMAP_LOCK_INIT(vmspace_pmap(vm));
299
return (0);
300
}
301
302
#ifdef INVARIANTS
303
static void
304
vmspace_zdtor(void *mem, int size, void *arg)
305
{
306
struct vmspace *vm;
307
308
vm = (struct vmspace *)mem;
309
KASSERT(vm->vm_map.nentries == 0,
310
("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
311
KASSERT(vm->vm_map.size == 0,
312
("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
313
}
314
#endif /* INVARIANTS */
315
316
/*
317
* Allocate a vmspace structure, including a vm_map and pmap,
318
* and initialize those structures. The refcnt is set to 1.
319
*/
320
struct vmspace *
321
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
322
{
323
struct vmspace *vm;
324
325
vm = uma_zalloc(vmspace_zone, M_WAITOK);
326
KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
327
if (!pinit(vmspace_pmap(vm))) {
328
uma_zfree(vmspace_zone, vm);
329
return (NULL);
330
}
331
CTR1(KTR_VM, "vmspace_alloc: %p", vm);
332
_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
333
refcount_init(&vm->vm_refcnt, 1);
334
vm->vm_shm = NULL;
335
vm->vm_swrss = 0;
336
vm->vm_tsize = 0;
337
vm->vm_dsize = 0;
338
vm->vm_ssize = 0;
339
vm->vm_taddr = 0;
340
vm->vm_daddr = 0;
341
vm->vm_maxsaddr = 0;
342
return (vm);
343
}
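
/*
 * Editor's sketch (illustrative only, not part of the original source):
 * a typical caller creates a user vmspace covering the ABI's user address
 * range and passes the machine-dependent pmap init routine, e.g.:
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_alloc(minuser, maxuser, pmap_pinit);
 *	if (vm == NULL)
 *		return (ENOMEM);
 *
 * Here "minuser" and "maxuser" are placeholders for the user VA bounds.
 * The returned vmspace carries a single reference, eventually dropped
 * with vmspace_free().
 */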
344
345
#ifdef RACCT
346
static void
347
vmspace_container_reset(struct proc *p)
348
{
349
350
PROC_LOCK(p);
351
racct_set(p, RACCT_DATA, 0);
352
racct_set(p, RACCT_STACK, 0);
353
racct_set(p, RACCT_RSS, 0);
354
racct_set(p, RACCT_MEMLOCK, 0);
355
racct_set(p, RACCT_VMEM, 0);
356
PROC_UNLOCK(p);
357
}
358
#endif
359
360
static inline void
361
vmspace_dofree(struct vmspace *vm)
362
{
363
364
CTR1(KTR_VM, "vmspace_free: %p", vm);
365
366
/*
367
* Make sure any SysV shm is freed, it might not have been in
368
* exit1().
369
*/
370
shmexit(vm);
371
372
/*
373
* Lock the map, to wait out all other references to it.
374
* Delete all of the mappings and pages they hold, then call
375
* the pmap module to reclaim anything left.
376
*/
377
(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
378
vm_map_max(&vm->vm_map));
379
380
pmap_release(vmspace_pmap(vm));
381
vm->vm_map.pmap = NULL;
382
uma_zfree(vmspace_zone, vm);
383
}
384
385
void
386
vmspace_free(struct vmspace *vm)
387
{
388
389
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
390
"vmspace_free() called");
391
392
if (refcount_release(&vm->vm_refcnt))
393
vmspace_dofree(vm);
394
}
395
396
void
397
vmspace_exitfree(struct proc *p)
398
{
399
struct vmspace *vm;
400
401
PROC_VMSPACE_LOCK(p);
402
vm = p->p_vmspace;
403
p->p_vmspace = NULL;
404
PROC_VMSPACE_UNLOCK(p);
405
KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
406
vmspace_free(vm);
407
}
408
409
void
410
vmspace_exit(struct thread *td)
411
{
412
struct vmspace *vm;
413
struct proc *p;
414
bool released;
415
416
p = td->td_proc;
417
vm = p->p_vmspace;
418
419
/*
420
* Prepare to release the vmspace reference. The thread that releases
421
* the last reference is responsible for tearing down the vmspace.
422
* However, threads not releasing the final reference must switch to the
423
* kernel's vmspace0 before the decrement so that the subsequent pmap
424
* deactivation does not modify a freed vmspace.
425
*/
426
refcount_acquire(&vmspace0.vm_refcnt);
427
if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
428
if (p->p_vmspace != &vmspace0) {
429
PROC_VMSPACE_LOCK(p);
430
p->p_vmspace = &vmspace0;
431
PROC_VMSPACE_UNLOCK(p);
432
pmap_activate(td);
433
}
434
released = refcount_release(&vm->vm_refcnt);
435
}
436
if (released) {
437
/*
438
* pmap_remove_pages() expects the pmap to be active, so switch
439
* back first if necessary.
440
*/
441
if (p->p_vmspace != vm) {
442
PROC_VMSPACE_LOCK(p);
443
p->p_vmspace = vm;
444
PROC_VMSPACE_UNLOCK(p);
445
pmap_activate(td);
446
}
447
pmap_remove_pages(vmspace_pmap(vm));
448
PROC_VMSPACE_LOCK(p);
449
p->p_vmspace = &vmspace0;
450
PROC_VMSPACE_UNLOCK(p);
451
pmap_activate(td);
452
vmspace_dofree(vm);
453
}
454
#ifdef RACCT
455
if (racct_enable)
456
vmspace_container_reset(p);
457
#endif
458
}
459
460
/* Acquire reference to vmspace owned by another process. */
461
462
struct vmspace *
463
vmspace_acquire_ref(struct proc *p)
464
{
465
struct vmspace *vm;
466
467
PROC_VMSPACE_LOCK(p);
468
vm = p->p_vmspace;
469
if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
470
PROC_VMSPACE_UNLOCK(p);
471
return (NULL);
472
}
473
if (vm != p->p_vmspace) {
474
PROC_VMSPACE_UNLOCK(p);
475
vmspace_free(vm);
476
return (NULL);
477
}
478
PROC_VMSPACE_UNLOCK(p);
479
return (vm);
480
}
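
/*
 * Editor's note (illustrative only, not part of the original source): the
 * second load of p->p_vmspace above guards against the process installing
 * a new vmspace (for example during execve()) after the reference was
 * taken; the stale reference is released and NULL is returned.  A typical
 * caller:
 *
 *	vm = vmspace_acquire_ref(p);
 *	if (vm == NULL)
 *		return (ESRCH);
 *	... inspect vm->vm_map ...
 *	vmspace_free(vm);
 */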
481
482
/*
483
* Switch between vmspaces in an AIO kernel process.
484
*
485
* The new vmspace is either the vmspace of a user process obtained
486
* from an active AIO request or the initial vmspace of the AIO kernel
487
* process (when it is idling). Because user processes will block to
488
* drain any active AIO requests before proceeding in exit() or
489
* execve(), the reference count for vmspaces from AIO requests can
490
* never be 0. Similarly, AIO kernel processes hold an extra
491
* reference on their initial vmspace for the life of the process. As
492
* a result, the 'newvm' vmspace always has a non-zero reference
493
* count. This permits an additional reference on 'newvm' to be
494
* acquired via a simple atomic increment rather than the loop in
495
* vmspace_acquire_ref() above.
496
*/
497
void
498
vmspace_switch_aio(struct vmspace *newvm)
499
{
500
struct vmspace *oldvm;
501
502
/* XXX: Need some way to assert that this is an aio daemon. */
503
504
KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
505
("vmspace_switch_aio: newvm unreferenced"));
506
507
oldvm = curproc->p_vmspace;
508
if (oldvm == newvm)
509
return;
510
511
/*
512
* Point to the new address space and refer to it.
513
*/
514
curproc->p_vmspace = newvm;
515
refcount_acquire(&newvm->vm_refcnt);
516
517
/* Activate the new mapping. */
518
pmap_activate(curthread);
519
520
vmspace_free(oldvm);
521
}
522
523
void
524
_vm_map_lock(vm_map_t map, const char *file, int line)
525
{
526
527
if (vm_map_is_system(map))
528
mtx_lock_flags_(&map->system_mtx, 0, file, line);
529
else
530
sx_xlock_(&map->lock, file, line);
531
map->timestamp++;
532
}
533
534
void
535
vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
536
{
537
vm_object_t object;
538
struct vnode *vp;
539
bool vp_held;
540
541
if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
542
return;
543
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
544
("Submap with execs"));
545
object = entry->object.vm_object;
546
KASSERT(object != NULL, ("No object for text, entry %p", entry));
547
if ((object->flags & OBJ_ANON) != 0)
548
object = object->handle;
549
else
550
KASSERT(object->backing_object == NULL,
551
("non-anon object %p shadows", object));
552
KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
553
entry, entry->object.vm_object));
554
555
/*
556
* Mostly, we do not lock the backing object. It is
557
* referenced by the entry we are processing, so it cannot go
558
* away.
559
*/
560
vm_pager_getvp(object, &vp, &vp_held);
561
if (vp != NULL) {
562
if (add) {
563
VOP_SET_TEXT_CHECKED(vp);
564
} else {
565
vn_lock(vp, LK_SHARED | LK_RETRY);
566
VOP_UNSET_TEXT_CHECKED(vp);
567
VOP_UNLOCK(vp);
568
}
569
if (vp_held)
570
vdrop(vp);
571
}
572
}
573
574
/*
 * Use a different name for this vm_map_entry field when its use
 * is not consistent with its use as part of an ordered search tree.
 */
578
#define defer_next right
579
580
static void
581
vm_map_process_deferred(void)
582
{
583
struct thread *td;
584
vm_map_entry_t entry, next;
585
vm_object_t object;
586
587
td = curthread;
588
entry = td->td_map_def_user;
589
td->td_map_def_user = NULL;
590
while (entry != NULL) {
591
next = entry->defer_next;
592
MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
593
MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
594
MAP_ENTRY_VN_EXEC));
595
if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
596
/*
597
* Decrement the object's writemappings and
598
* possibly the vnode's v_writecount.
599
*/
600
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
601
("Submap with writecount"));
602
object = entry->object.vm_object;
603
KASSERT(object != NULL, ("No object for writecount"));
604
vm_pager_release_writecount(object, entry->start,
605
entry->end);
606
}
607
vm_map_entry_set_vnode_text(entry, false);
608
vm_map_entry_deallocate(entry, FALSE);
609
entry = next;
610
}
611
}
612
613
#ifdef INVARIANTS
614
static void
615
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
616
{
617
618
if (vm_map_is_system(map))
619
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
620
else
621
sx_assert_(&map->lock, SA_XLOCKED, file, line);
622
}
623
624
#define VM_MAP_ASSERT_LOCKED(map) \
625
_vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
626
627
enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
628
#ifdef DIAGNOSTIC
629
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
630
#else
631
static int enable_vmmap_check = VMMAP_CHECK_NONE;
632
#endif
633
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
634
&enable_vmmap_check, 0, "Enable vm map consistency checking");
635
636
static void _vm_map_assert_consistent(vm_map_t map, int check);
637
638
#define VM_MAP_ASSERT_CONSISTENT(map) \
639
_vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
640
#ifdef DIAGNOSTIC
641
#define VM_MAP_UNLOCK_CONSISTENT(map) do { \
642
if (map->nupdates > map->nentries) { \
643
_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
644
map->nupdates = 0; \
645
} \
646
} while (0)
647
#else
648
#define VM_MAP_UNLOCK_CONSISTENT(map)
649
#endif
650
#else
651
#define VM_MAP_ASSERT_LOCKED(map)
652
#define VM_MAP_ASSERT_CONSISTENT(map)
653
#define VM_MAP_UNLOCK_CONSISTENT(map)
654
#endif /* INVARIANTS */
655
656
void
657
_vm_map_unlock(vm_map_t map, const char *file, int line)
658
{
659
660
VM_MAP_UNLOCK_CONSISTENT(map);
661
if (vm_map_is_system(map)) {
662
#ifndef UMA_USE_DMAP
663
if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
664
uma_prealloc(kmapentzone, 1);
665
map->flags &= ~MAP_REPLENISH;
666
}
667
#endif
668
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
669
} else {
670
sx_xunlock_(&map->lock, file, line);
671
vm_map_process_deferred();
672
}
673
}
674
675
void
676
_vm_map_lock_read(vm_map_t map, const char *file, int line)
677
{
678
679
if (vm_map_is_system(map))
680
mtx_lock_flags_(&map->system_mtx, 0, file, line);
681
else
682
sx_slock_(&map->lock, file, line);
683
}
684
685
void
686
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
687
{
688
689
if (vm_map_is_system(map)) {
690
KASSERT((map->flags & MAP_REPLENISH) == 0,
691
("%s: MAP_REPLENISH leaked", __func__));
692
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
693
} else {
694
sx_sunlock_(&map->lock, file, line);
695
vm_map_process_deferred();
696
}
697
}
698
699
int
700
_vm_map_trylock(vm_map_t map, const char *file, int line)
701
{
702
int error;
703
704
error = vm_map_is_system(map) ?
705
!mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
706
!sx_try_xlock_(&map->lock, file, line);
707
if (error == 0)
708
map->timestamp++;
709
return (error == 0);
710
}
711
712
int
713
_vm_map_trylock_read(vm_map_t map, const char *file, int line)
714
{
715
int error;
716
717
error = vm_map_is_system(map) ?
718
!mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
719
!sx_try_slock_(&map->lock, file, line);
720
return (error == 0);
721
}
722
723
/*
724
* _vm_map_lock_upgrade: [ internal use only ]
725
*
726
* Tries to upgrade a read (shared) lock on the specified map to a write
727
* (exclusive) lock. Returns the value "0" if the upgrade succeeds and a
728
* non-zero value if the upgrade fails. If the upgrade fails, the map is
729
* returned without a read or write lock held.
730
*
731
* Requires that the map be read locked.
732
*/
733
int
734
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
735
{
736
unsigned int last_timestamp;
737
738
if (vm_map_is_system(map)) {
739
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
740
} else {
741
if (!sx_try_upgrade_(&map->lock, file, line)) {
742
last_timestamp = map->timestamp;
743
sx_sunlock_(&map->lock, file, line);
744
vm_map_process_deferred();
745
/*
746
* If the map's timestamp does not change while the
747
* map is unlocked, then the upgrade succeeds.
748
*/
749
sx_xlock_(&map->lock, file, line);
750
if (last_timestamp != map->timestamp) {
751
sx_xunlock_(&map->lock, file, line);
752
return (1);
753
}
754
}
755
}
756
map->timestamp++;
757
return (0);
758
}
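
/*
 * Editor's sketch (illustrative only, not part of the original source):
 * because a failed upgrade leaves the map unlocked, callers typically
 * treat failure as "start over", e.g.:
 *
 *	vm_map_lock_read(map);
 *	...
 *	if (vm_map_lock_upgrade(map) != 0) {
 *		(the lock was lost and the map may have changed; retry)
 *	}
 *	... the map is now write locked ...
 *
 * vm_map_lock_read() and vm_map_lock_upgrade() are the wrapper macros
 * declared in vm_map.h around the functions defined here.
 */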
759
760
void
761
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
762
{
763
764
if (vm_map_is_system(map)) {
765
KASSERT((map->flags & MAP_REPLENISH) == 0,
766
("%s: MAP_REPLENISH leaked", __func__));
767
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
768
} else {
769
VM_MAP_UNLOCK_CONSISTENT(map);
770
sx_downgrade_(&map->lock, file, line);
771
}
772
}
773
774
/*
775
* vm_map_locked:
776
*
777
* Returns a non-zero value if the caller holds a write (exclusive) lock
778
* on the specified map and the value "0" otherwise.
779
*/
780
int
781
vm_map_locked(vm_map_t map)
782
{
783
784
if (vm_map_is_system(map))
785
return (mtx_owned(&map->system_mtx));
786
return (sx_xlocked(&map->lock));
787
}
788
789
/*
790
* _vm_map_unlock_and_wait:
791
*
792
* Atomically releases the lock on the specified map and puts the calling
793
* thread to sleep. The calling thread will remain asleep until either
794
* vm_map_wakeup() is performed on the map or the specified timeout is
795
* exceeded.
796
*
797
* WARNING! This function does not perform deferred deallocations of
798
* objects and map entries. Therefore, the calling thread is expected to
799
* reacquire the map lock after reawakening and later perform an ordinary
800
* unlock operation, such as vm_map_unlock(), before completing its
801
* operation on the map.
802
*/
803
int
804
_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
805
{
806
807
VM_MAP_UNLOCK_CONSISTENT(map);
808
mtx_lock(&map_sleep_mtx);
809
if (vm_map_is_system(map)) {
810
KASSERT((map->flags & MAP_REPLENISH) == 0,
811
("%s: MAP_REPLENISH leaked", __func__));
812
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
813
} else {
814
sx_xunlock_(&map->lock, file, line);
815
}
816
return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
817
timo));
818
}
819
820
/*
821
* vm_map_wakeup:
822
*
823
* Awaken any threads that have slept on the map using
824
* vm_map_unlock_and_wait().
825
*/
826
void
827
vm_map_wakeup(vm_map_t map)
828
{
829
830
/*
831
* Acquire and release map_sleep_mtx to prevent a wakeup()
832
* from being performed (and lost) between the map unlock
833
* and the msleep() in _vm_map_unlock_and_wait().
834
*/
835
mtx_lock(&map_sleep_mtx);
836
mtx_unlock(&map_sleep_mtx);
837
wakeup(&map->root);
838
}
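
/*
 * Editor's sketch (illustrative only, not part of the original source):
 * vm_map_unlock_and_wait() and vm_map_wakeup() are typically paired as a
 * retry loop, roughly:
 *
 *	while (condition_holds(map)) {
 *		(void)vm_map_unlock_and_wait(map, 0);
 *		vm_map_lock(map);
 *		(the map was unlocked, so any cached entries must be
 *		 looked up again)
 *	}
 *
 * "condition_holds" is a placeholder for whatever state the waiter is
 * waiting on; the thread that changes that state calls vm_map_wakeup(map).
 */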
839
840
void
841
vm_map_busy(vm_map_t map)
842
{
843
844
VM_MAP_ASSERT_LOCKED(map);
845
map->busy++;
846
}
847
848
void
849
vm_map_unbusy(vm_map_t map)
850
{
851
852
VM_MAP_ASSERT_LOCKED(map);
853
KASSERT(map->busy, ("vm_map_unbusy: not busy"));
854
if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
855
vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
856
wakeup(&map->busy);
857
}
858
}
859
860
void
861
vm_map_wait_busy(vm_map_t map)
862
{
863
864
VM_MAP_ASSERT_LOCKED(map);
865
while (map->busy) {
866
vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
867
if (vm_map_is_system(map))
868
msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
869
else
870
sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
871
}
872
map->timestamp++;
873
}
874
875
long
876
vmspace_resident_count(struct vmspace *vmspace)
877
{
878
return pmap_resident_count(vmspace_pmap(vmspace));
879
}
880
881
/*
882
* Initialize an existing vm_map structure
883
* such as that in the vmspace structure.
884
*/
885
static void
886
_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
887
{
888
889
map->header.eflags = MAP_ENTRY_HEADER;
890
map->pmap = pmap;
891
map->header.end = min;
892
map->header.start = max;
893
map->flags = 0;
894
map->header.left = map->header.right = &map->header;
895
map->root = NULL;
896
map->timestamp = 0;
897
map->busy = 0;
898
map->anon_loc = 0;
899
#ifdef DIAGNOSTIC
900
map->nupdates = 0;
901
#endif
902
}
903
904
void
905
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
906
{
907
_vm_map_init(map, pmap, min, max);
908
sx_init(&map->lock, "vm map (user)");
909
}
910
911
void
912
vm_map_init_system(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
913
{
914
_vm_map_init(map, pmap, min, max);
915
vm_map_modflags(map, MAP_SYSTEM_MAP, 0);
916
mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF |
917
MTX_DUPOK);
918
}
919
920
/*
921
* vm_map_entry_dispose: [ internal use only ]
922
*
923
* Inverse of vm_map_entry_create.
924
*/
925
static void
926
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
927
{
928
uma_zfree(vm_map_is_system(map) ? kmapentzone : mapentzone, entry);
929
}
930
931
/*
932
* vm_map_entry_create: [ internal use only ]
933
*
934
* Allocates a VM map entry for insertion.
935
* No entry fields are filled in.
936
*/
937
static vm_map_entry_t
938
vm_map_entry_create(vm_map_t map)
939
{
940
vm_map_entry_t new_entry;
941
942
#ifndef UMA_USE_DMAP
943
if (map == kernel_map) {
944
VM_MAP_ASSERT_LOCKED(map);
945
946
/*
947
* A new slab of kernel map entries cannot be allocated at this
948
* point because the kernel map has not yet been updated to
949
* reflect the caller's request. Therefore, we allocate a new
950
* map entry, dipping into the reserve if necessary, and set a
951
* flag indicating that the reserve must be replenished before
952
* the map is unlocked.
953
*/
954
new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
955
if (new_entry == NULL) {
956
new_entry = uma_zalloc(kmapentzone,
957
M_NOWAIT | M_NOVM | M_USE_RESERVE);
958
kernel_map->flags |= MAP_REPLENISH;
959
}
960
} else
961
#endif
962
if (vm_map_is_system(map)) {
963
new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
964
} else {
965
new_entry = uma_zalloc(mapentzone, M_WAITOK);
966
}
967
KASSERT(new_entry != NULL,
968
("vm_map_entry_create: kernel resources exhausted"));
969
return (new_entry);
970
}
971
972
/*
973
* vm_map_entry_set_behavior:
974
*
975
* Set the expected access behavior, either normal, random, or
976
* sequential.
977
*/
978
static inline void
979
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
980
{
981
entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
982
(behavior & MAP_ENTRY_BEHAV_MASK);
983
}
984
985
/*
986
* vm_map_entry_max_free_{left,right}:
987
*
988
* Compute the size of the largest free gap between two entries,
989
* one the root of a tree and the other the ancestor of that root
990
* that is the least or greatest ancestor found on the search path.
991
*/
992
static inline vm_size_t
993
vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
994
{
995
996
return (root->left != left_ancestor ?
997
root->left->max_free : root->start - left_ancestor->end);
998
}
999
1000
static inline vm_size_t
1001
vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1002
{
1003
1004
return (root->right != right_ancestor ?
1005
root->right->max_free : right_ancestor->start - root->end);
1006
}
1007
1008
/*
1009
* vm_map_entry_{pred,succ}:
1010
*
1011
* Find the {predecessor, successor} of the entry by taking one step
1012
* in the appropriate direction and backtracking as much as necessary.
1013
* vm_map_entry_succ is defined in vm_map.h.
1014
*/
1015
static inline vm_map_entry_t
1016
vm_map_entry_pred(vm_map_entry_t entry)
1017
{
1018
vm_map_entry_t prior;
1019
1020
prior = entry->left;
1021
if (prior->right->start < entry->start) {
1022
do
1023
prior = prior->right;
1024
while (prior->right != entry);
1025
}
1026
return (prior);
1027
}
1028
1029
static inline vm_size_t
1030
vm_size_max(vm_size_t a, vm_size_t b)
1031
{
1032
1033
return (a > b ? a : b);
1034
}
1035
1036
#define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
1037
vm_map_entry_t z; \
1038
vm_size_t max_free; \
1039
\
1040
/* \
1041
* Infer root->right->max_free == root->max_free when \
1042
* y->max_free < root->max_free || root->max_free == 0. \
1043
* Otherwise, look right to find it. \
1044
*/ \
1045
y = root->left; \
1046
max_free = root->max_free; \
1047
KASSERT(max_free == vm_size_max( \
1048
vm_map_entry_max_free_left(root, llist), \
1049
vm_map_entry_max_free_right(root, rlist)), \
1050
("%s: max_free invariant fails", __func__)); \
1051
if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
1052
max_free = vm_map_entry_max_free_right(root, rlist); \
1053
if (y != llist && (test)) { \
1054
/* Rotate right and make y root. */ \
1055
z = y->right; \
1056
if (z != root) { \
1057
root->left = z; \
1058
y->right = root; \
1059
if (max_free < y->max_free) \
1060
root->max_free = max_free = \
1061
vm_size_max(max_free, z->max_free); \
1062
} else if (max_free < y->max_free) \
1063
root->max_free = max_free = \
1064
vm_size_max(max_free, root->start - y->end);\
1065
root = y; \
1066
y = root->left; \
1067
} \
1068
/* Copy right->max_free. Put root on rlist. */ \
1069
root->max_free = max_free; \
1070
KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
1071
("%s: max_free not copied from right", __func__)); \
1072
root->left = rlist; \
1073
rlist = root; \
1074
root = y != llist ? y : NULL; \
1075
} while (0)
1076
1077
#define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
1078
vm_map_entry_t z; \
1079
vm_size_t max_free; \
1080
\
1081
/* \
1082
* Infer root->left->max_free == root->max_free when \
1083
* y->max_free < root->max_free || root->max_free == 0. \
1084
* Otherwise, look left to find it. \
1085
*/ \
1086
y = root->right; \
1087
max_free = root->max_free; \
1088
KASSERT(max_free == vm_size_max( \
1089
vm_map_entry_max_free_left(root, llist), \
1090
vm_map_entry_max_free_right(root, rlist)), \
1091
("%s: max_free invariant fails", __func__)); \
1092
if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
1093
max_free = vm_map_entry_max_free_left(root, llist); \
1094
if (y != rlist && (test)) { \
1095
/* Rotate left and make y root. */ \
1096
z = y->left; \
1097
if (z != root) { \
1098
root->right = z; \
1099
y->left = root; \
1100
if (max_free < y->max_free) \
1101
root->max_free = max_free = \
1102
vm_size_max(max_free, z->max_free); \
1103
} else if (max_free < y->max_free) \
1104
root->max_free = max_free = \
1105
vm_size_max(max_free, y->start - root->end);\
1106
root = y; \
1107
y = root->right; \
1108
} \
1109
/* Copy left->max_free. Put root on llist. */ \
1110
root->max_free = max_free; \
1111
KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
1112
("%s: max_free not copied from left", __func__)); \
1113
root->right = llist; \
1114
llist = root; \
1115
root = y != rlist ? y : NULL; \
1116
} while (0)
1117
1118
/*
1119
* Walk down the tree until we find addr or a gap where addr would go, breaking
1120
* off left and right subtrees of nodes less than, or greater than addr. Treat
1121
* subtrees with root->max_free < length as empty trees. llist and rlist are
1122
* the two sides in reverse order (bottom-up), with llist linked by the right
1123
* pointer and rlist linked by the left pointer in the vm_map_entry, and both
1124
* lists terminated by &map->header. This function, and the subsequent call to
1125
* vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1126
* values in &map->header.
1127
*/
1128
static __always_inline vm_map_entry_t
1129
vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1130
vm_map_entry_t *llist, vm_map_entry_t *rlist)
1131
{
1132
vm_map_entry_t left, right, root, y;
1133
1134
left = right = &map->header;
1135
root = map->root;
1136
while (root != NULL && root->max_free >= length) {
1137
KASSERT(left->end <= root->start &&
1138
root->end <= right->start,
1139
("%s: root not within tree bounds", __func__));
1140
if (addr < root->start) {
1141
SPLAY_LEFT_STEP(root, y, left, right,
1142
y->max_free >= length && addr < y->start);
1143
} else if (addr >= root->end) {
1144
SPLAY_RIGHT_STEP(root, y, left, right,
1145
y->max_free >= length && addr >= y->end);
1146
} else
1147
break;
1148
}
1149
*llist = left;
1150
*rlist = right;
1151
return (root);
1152
}
1153
1154
static __always_inline void
1155
vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1156
{
1157
vm_map_entry_t hi, right, y;
1158
1159
right = *rlist;
1160
hi = root->right == right ? NULL : root->right;
1161
if (hi == NULL)
1162
return;
1163
do
1164
SPLAY_LEFT_STEP(hi, y, root, right, true);
1165
while (hi != NULL);
1166
*rlist = right;
1167
}
1168
1169
static __always_inline void
1170
vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1171
{
1172
vm_map_entry_t left, lo, y;
1173
1174
left = *llist;
1175
lo = root->left == left ? NULL : root->left;
1176
if (lo == NULL)
1177
return;
1178
do
1179
SPLAY_RIGHT_STEP(lo, y, left, root, true);
1180
while (lo != NULL);
1181
*llist = left;
1182
}
1183
1184
static inline void
1185
vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1186
{
1187
vm_map_entry_t tmp;
1188
1189
tmp = *b;
1190
*b = *a;
1191
*a = tmp;
1192
}
1193
1194
/*
1195
* Walk back up the two spines, flip the pointers and set max_free. The
1196
* subtrees of the root go at the bottom of llist and rlist.
1197
*/
1198
static vm_size_t
1199
vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1200
vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1201
{
1202
do {
1203
/*
1204
* The max_free values of the children of llist are in
1205
* llist->max_free and max_free. Update with the
1206
* max value.
1207
*/
1208
llist->max_free = max_free =
1209
vm_size_max(llist->max_free, max_free);
1210
vm_map_entry_swap(&llist->right, &tail);
1211
vm_map_entry_swap(&tail, &llist);
1212
} while (llist != header);
1213
root->left = tail;
1214
return (max_free);
1215
}
1216
1217
/*
1218
* When llist is known to be the predecessor of root.
1219
*/
1220
static inline vm_size_t
1221
vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1222
vm_map_entry_t llist)
1223
{
1224
vm_size_t max_free;
1225
1226
max_free = root->start - llist->end;
1227
if (llist != header) {
1228
max_free = vm_map_splay_merge_left_walk(header, root,
1229
root, max_free, llist);
1230
} else {
1231
root->left = header;
1232
header->right = root;
1233
}
1234
return (max_free);
1235
}
1236
1237
/*
1238
* When llist may or may not be the predecessor of root.
1239
*/
1240
static inline vm_size_t
1241
vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1242
vm_map_entry_t llist)
1243
{
1244
vm_size_t max_free;
1245
1246
max_free = vm_map_entry_max_free_left(root, llist);
1247
if (llist != header) {
1248
max_free = vm_map_splay_merge_left_walk(header, root,
1249
root->left == llist ? root : root->left,
1250
max_free, llist);
1251
}
1252
return (max_free);
1253
}
1254
1255
static vm_size_t
1256
vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1257
vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1258
{
1259
do {
1260
/*
1261
* The max_free values of the children of rlist are in
1262
* rlist->max_free and max_free. Update with the
1263
* max value.
1264
*/
1265
rlist->max_free = max_free =
1266
vm_size_max(rlist->max_free, max_free);
1267
vm_map_entry_swap(&rlist->left, &tail);
1268
vm_map_entry_swap(&tail, &rlist);
1269
} while (rlist != header);
1270
root->right = tail;
1271
return (max_free);
1272
}
1273
1274
/*
 * When rlist is known to be the successor of root.
 */
1277
static inline vm_size_t
1278
vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1279
vm_map_entry_t rlist)
1280
{
1281
vm_size_t max_free;
1282
1283
max_free = rlist->start - root->end;
1284
if (rlist != header) {
1285
max_free = vm_map_splay_merge_right_walk(header, root,
1286
root, max_free, rlist);
1287
} else {
1288
root->right = header;
1289
header->left = root;
1290
}
1291
return (max_free);
1292
}
1293
1294
/*
 * When rlist may or may not be the successor of root.
 */
1297
static inline vm_size_t
1298
vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1299
vm_map_entry_t rlist)
1300
{
1301
vm_size_t max_free;
1302
1303
max_free = vm_map_entry_max_free_right(root, rlist);
1304
if (rlist != header) {
1305
max_free = vm_map_splay_merge_right_walk(header, root,
1306
root->right == rlist ? root : root->right,
1307
max_free, rlist);
1308
}
1309
return (max_free);
1310
}
1311
1312
/*
1313
* vm_map_splay:
1314
*
1315
* The Sleator and Tarjan top-down splay algorithm with the
1316
* following variation. Max_free must be computed bottom-up, so
1317
* on the downward pass, maintain the left and right spines in
1318
* reverse order. Then, make a second pass up each side to fix
1319
* the pointers and compute max_free. The time bound is O(log n)
1320
* amortized.
1321
*
1322
* The tree is threaded, which means that there are no null pointers.
1323
* When a node has no left child, its left pointer points to its
1324
 * predecessor, which is the last ancestor on the search path from the root
1325
* where the search branched right. Likewise, when a node has no right
1326
* child, its right pointer points to its successor. The map header node
1327
* is the predecessor of the first map entry, and the successor of the
1328
* last.
1329
*
1330
* The new root is the vm_map_entry containing "addr", or else an
1331
* adjacent entry (lower if possible) if addr is not in the tree.
1332
*
1333
* The map must be locked, and leaves it so.
1334
*
1335
* Returns: the new root.
1336
*/
1337
static vm_map_entry_t
1338
vm_map_splay(vm_map_t map, vm_offset_t addr)
1339
{
1340
vm_map_entry_t header, llist, rlist, root;
1341
vm_size_t max_free_left, max_free_right;
1342
1343
header = &map->header;
1344
root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1345
if (root != NULL) {
1346
max_free_left = vm_map_splay_merge_left(header, root, llist);
1347
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1348
} else if (llist != header) {
1349
/*
1350
* Recover the greatest node in the left
1351
* subtree and make it the root.
1352
*/
1353
root = llist;
1354
llist = root->right;
1355
max_free_left = vm_map_splay_merge_left(header, root, llist);
1356
max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1357
} else if (rlist != header) {
1358
/*
1359
* Recover the least node in the right
1360
* subtree and make it the root.
1361
*/
1362
root = rlist;
1363
rlist = root->left;
1364
max_free_left = vm_map_splay_merge_pred(header, root, llist);
1365
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1366
} else {
1367
/* There is no root. */
1368
return (NULL);
1369
}
1370
root->max_free = vm_size_max(max_free_left, max_free_right);
1371
map->root = root;
1372
VM_MAP_ASSERT_CONSISTENT(map);
1373
return (root);
1374
}
1375
1376
/*
1377
* vm_map_entry_{un,}link:
1378
*
1379
* Insert/remove entries from maps. On linking, if new entry clips
1380
* existing entry, trim existing entry to avoid overlap, and manage
1381
* offsets. On unlinking, merge disappearing entry with neighbor, if
1382
* called for, and manage offsets. Callers should not modify fields in
1383
* entries already mapped.
1384
*/
1385
static void
1386
vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1387
{
1388
vm_map_entry_t header, llist, rlist, root;
1389
vm_size_t max_free_left, max_free_right;
1390
1391
CTR3(KTR_VM,
1392
"vm_map_entry_link: map %p, nentries %d, entry %p", map,
1393
map->nentries, entry);
1394
VM_MAP_ASSERT_LOCKED(map);
1395
map->nentries++;
1396
header = &map->header;
1397
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1398
if (root == NULL) {
1399
/*
1400
* The new entry does not overlap any existing entry in the
1401
* map, so it becomes the new root of the map tree.
1402
*/
1403
max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1404
max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1405
} else if (entry->start == root->start) {
1406
/*
1407
* The new entry is a clone of root, with only the end field
1408
* changed. The root entry will be shrunk to abut the new
1409
* entry, and will be the right child of the new root entry in
1410
* the modified map.
1411
*/
1412
KASSERT(entry->end < root->end,
1413
("%s: clip_start not within entry", __func__));
1414
vm_map_splay_findprev(root, &llist);
1415
if ((root->eflags & MAP_ENTRY_STACK_GAP) == 0)
1416
root->offset += entry->end - root->start;
1417
root->start = entry->end;
1418
max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1419
max_free_right = root->max_free = vm_size_max(
1420
vm_map_splay_merge_pred(entry, root, entry),
1421
vm_map_splay_merge_right(header, root, rlist));
1422
} else {
1423
/*
1424
* The new entry is a clone of root, with only the start field
1425
* changed. The root entry will be shrunk to abut the new
1426
* entry, and will be the left child of the new root entry in
1427
* the modified map.
1428
*/
1429
KASSERT(entry->end == root->end,
1430
("%s: clip_start not within entry", __func__));
1431
vm_map_splay_findnext(root, &rlist);
1432
if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
1433
entry->offset += entry->start - root->start;
1434
root->end = entry->start;
1435
max_free_left = root->max_free = vm_size_max(
1436
vm_map_splay_merge_left(header, root, llist),
1437
vm_map_splay_merge_succ(entry, root, entry));
1438
max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1439
}
1440
entry->max_free = vm_size_max(max_free_left, max_free_right);
1441
map->root = entry;
1442
VM_MAP_ASSERT_CONSISTENT(map);
1443
}
1444
1445
enum unlink_merge_type {
1446
UNLINK_MERGE_NONE,
1447
UNLINK_MERGE_NEXT
1448
};
1449
1450
static void
1451
vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1452
enum unlink_merge_type op)
1453
{
1454
vm_map_entry_t header, llist, rlist, root;
1455
vm_size_t max_free_left, max_free_right;
1456
1457
VM_MAP_ASSERT_LOCKED(map);
1458
header = &map->header;
1459
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1460
KASSERT(root != NULL,
1461
("vm_map_entry_unlink: unlink object not mapped"));
1462
1463
vm_map_splay_findprev(root, &llist);
1464
vm_map_splay_findnext(root, &rlist);
1465
if (op == UNLINK_MERGE_NEXT) {
1466
rlist->start = root->start;
1467
MPASS((rlist->eflags & MAP_ENTRY_STACK_GAP) == 0);
1468
rlist->offset = root->offset;
1469
}
1470
if (llist != header) {
1471
root = llist;
1472
llist = root->right;
1473
max_free_left = vm_map_splay_merge_left(header, root, llist);
1474
max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1475
} else if (rlist != header) {
1476
root = rlist;
1477
rlist = root->left;
1478
max_free_left = vm_map_splay_merge_pred(header, root, llist);
1479
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1480
} else {
1481
header->left = header->right = header;
1482
root = NULL;
1483
}
1484
if (root != NULL)
1485
root->max_free = vm_size_max(max_free_left, max_free_right);
1486
map->root = root;
1487
VM_MAP_ASSERT_CONSISTENT(map);
1488
map->nentries--;
1489
CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1490
map->nentries, entry);
1491
}
1492
1493
/*
1494
* vm_map_entry_resize:
1495
*
1496
* Resize a vm_map_entry, recompute the amount of free space that
1497
* follows it and propagate that value up the tree.
1498
*
1499
* The map must be locked, and leaves it so.
1500
*/
1501
static void
1502
vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1503
{
1504
vm_map_entry_t header, llist, rlist, root;
1505
1506
VM_MAP_ASSERT_LOCKED(map);
1507
header = &map->header;
1508
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1509
KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1510
vm_map_splay_findnext(root, &rlist);
1511
entry->end += grow_amount;
1512
root->max_free = vm_size_max(
1513
vm_map_splay_merge_left(header, root, llist),
1514
vm_map_splay_merge_succ(header, root, rlist));
1515
map->root = root;
1516
VM_MAP_ASSERT_CONSISTENT(map);
1517
CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1518
__func__, map, map->nentries, entry);
1519
}
1520
1521
/*
1522
* vm_map_lookup_entry: [ internal use only ]
1523
*
1524
* Finds the map entry containing (or
1525
* immediately preceding) the specified address
1526
* in the given map; the entry is returned
1527
* in the "entry" parameter. The boolean
1528
* result indicates whether the address is
1529
* actually contained in the map.
1530
*/
1531
boolean_t
1532
vm_map_lookup_entry(
1533
vm_map_t map,
1534
vm_offset_t address,
1535
vm_map_entry_t *entry) /* OUT */
1536
{
1537
vm_map_entry_t cur, header, lbound, ubound;
1538
boolean_t locked;
1539
1540
/*
1541
* If the map is empty, then the map entry immediately preceding
1542
* "address" is the map's header.
1543
*/
1544
header = &map->header;
1545
cur = map->root;
1546
if (cur == NULL) {
1547
*entry = header;
1548
return (FALSE);
1549
}
1550
if (address >= cur->start && cur->end > address) {
1551
*entry = cur;
1552
return (TRUE);
1553
}
1554
if ((locked = vm_map_locked(map)) ||
1555
sx_try_upgrade(&map->lock)) {
1556
/*
1557
* Splay requires a write lock on the map. However, it only
1558
* restructures the binary search tree; it does not otherwise
1559
* change the map. Thus, the map's timestamp need not change
1560
* on a temporary upgrade.
1561
*/
1562
cur = vm_map_splay(map, address);
1563
if (!locked) {
1564
VM_MAP_UNLOCK_CONSISTENT(map);
1565
sx_downgrade(&map->lock);
1566
}
1567
1568
/*
1569
* If "address" is contained within a map entry, the new root
1570
* is that map entry. Otherwise, the new root is a map entry
1571
* immediately before or after "address".
1572
*/
1573
if (address < cur->start) {
1574
*entry = header;
1575
return (FALSE);
1576
}
1577
*entry = cur;
1578
return (address < cur->end);
1579
}
1580
/*
1581
* Since the map is only locked for read access, perform a
1582
* standard binary search tree lookup for "address".
1583
*/
1584
lbound = ubound = header;
1585
for (;;) {
1586
if (address < cur->start) {
1587
ubound = cur;
1588
cur = cur->left;
1589
if (cur == lbound)
1590
break;
1591
} else if (cur->end <= address) {
1592
lbound = cur;
1593
cur = cur->right;
1594
if (cur == ubound)
1595
break;
1596
} else {
1597
*entry = cur;
1598
return (TRUE);
1599
}
1600
}
1601
*entry = lbound;
1602
return (FALSE);
1603
}
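
/*
 * Editor's sketch (illustrative only, not part of the original source):
 * a typical use under the map lock:
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock(map);
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		(addr lies within [entry->start, entry->end))
 *	} else {
 *		(entry is the closest preceding entry, possibly the
 *		 header; vm_map_entry_succ(entry) starts above addr)
 *	}
 *	vm_map_unlock(map);
 */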
1604
1605
/*
1606
* vm_map_insert1() is identical to vm_map_insert() except that it
1607
* returns the newly inserted map entry in '*res'. In case the new
1608
* entry is coalesced with a neighbor or an existing entry was
1609
* resized, that entry is returned. In any case, the returned entry
1610
* covers the specified address range.
1611
*/
1612
static int
1613
vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1614
vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow,
1615
vm_map_entry_t *res)
1616
{
1617
vm_map_entry_t new_entry, next_entry, prev_entry;
1618
struct ucred *cred;
1619
vm_eflags_t protoeflags;
1620
vm_inherit_t inheritance;
1621
u_long bdry;
1622
u_int bidx;
1623
int cflags;
1624
1625
VM_MAP_ASSERT_LOCKED(map);
1626
KASSERT(object != kernel_object ||
1627
(cow & MAP_COPY_ON_WRITE) == 0,
1628
("vm_map_insert: kernel object and COW"));
1629
KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
1630
(cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
1631
("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
1632
object, cow));
1633
KASSERT((prot & ~max) == 0,
1634
("prot %#x is not subset of max_prot %#x", prot, max));
1635
1636
/*
1637
* Check that the start and end points are not bogus.
1638
*/
1639
if (start == end || !vm_map_range_valid(map, start, end))
1640
return (KERN_INVALID_ADDRESS);
1641
1642
if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
1643
VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
1644
return (KERN_PROTECTION_FAILURE);
1645
1646
/*
1647
* Find the entry prior to the proposed starting address; if it's part
1648
* of an existing entry, this range is bogus.
1649
*/
1650
if (vm_map_lookup_entry(map, start, &prev_entry))
1651
return (KERN_NO_SPACE);
1652
1653
/*
1654
* Assert that the next entry doesn't overlap the end point.
1655
*/
1656
next_entry = vm_map_entry_succ(prev_entry);
1657
if (next_entry->start < end)
1658
return (KERN_NO_SPACE);
1659
1660
if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1661
max != VM_PROT_NONE))
1662
return (KERN_INVALID_ARGUMENT);
1663
1664
protoeflags = 0;
1665
if (cow & MAP_COPY_ON_WRITE)
1666
protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1667
if (cow & MAP_NOFAULT)
1668
protoeflags |= MAP_ENTRY_NOFAULT;
1669
if (cow & MAP_DISABLE_SYNCER)
1670
protoeflags |= MAP_ENTRY_NOSYNC;
1671
if (cow & MAP_DISABLE_COREDUMP)
1672
protoeflags |= MAP_ENTRY_NOCOREDUMP;
1673
if (cow & MAP_STACK_AREA)
1674
protoeflags |= MAP_ENTRY_GROWS_DOWN;
1675
if (cow & MAP_WRITECOUNT)
1676
protoeflags |= MAP_ENTRY_WRITECNT;
1677
if (cow & MAP_VN_EXEC)
1678
protoeflags |= MAP_ENTRY_VN_EXEC;
1679
if ((cow & MAP_CREATE_GUARD) != 0)
1680
protoeflags |= MAP_ENTRY_GUARD;
1681
if ((cow & MAP_CREATE_STACK_GAP) != 0)
1682
protoeflags |= MAP_ENTRY_STACK_GAP;
1683
if (cow & MAP_INHERIT_SHARE)
1684
inheritance = VM_INHERIT_SHARE;
1685
else
1686
inheritance = VM_INHERIT_DEFAULT;
1687
if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
1688
/* This magically ignores index 0, for usual page size. */
1689
bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
1690
MAP_SPLIT_BOUNDARY_SHIFT;
1691
if (bidx >= MAXPAGESIZES)
1692
return (KERN_INVALID_ARGUMENT);
1693
bdry = pagesizes[bidx] - 1;
1694
if ((start & bdry) != 0 || (end & bdry) != 0)
1695
return (KERN_INVALID_ARGUMENT);
1696
protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
1697
}
1698
1699
cred = NULL;
1700
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) {
1701
cflags = OBJCO_NO_CHARGE;
1702
} else {
1703
cflags = 0;
1704
if ((cow & MAP_ACC_CHARGED) != 0 ||
1705
((prot & VM_PROT_WRITE) != 0 &&
1706
		    ((protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
		    object == NULL))) {
			if ((cow & MAP_ACC_CHARGED) == 0) {
				if (!swap_reserve(end - start))
					return (KERN_RESOURCE_SHORTAGE);

				/*
				 * Only inform vm_object_coalesce()
				 * that the object was charged if
				 * there is no need for CoW, so the
				 * swap amount reserved is applicable
				 * to the prev_entry->object.
				 */
				if ((protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)
					cflags |= OBJCO_CHARGED;
			}
			KASSERT(object == NULL ||
			    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
			    object->cred == NULL,
			    ("overcommit: vm_map_insert o %p", object));
			cred = curthread->td_ucred;
		}
	}

	/* Expand the kernel pmap, if necessary. */
	if (map == kernel_map && end > kernel_vm_end) {
		int rv;

		rv = pmap_growkernel(end);
		if (rv != KERN_SUCCESS)
			return (rv);
	}
	if (object != NULL) {
		/*
		 * OBJ_ONEMAPPING must be cleared unless this mapping
		 * is trivially proven to be the only mapping for any
		 * of the object's pages. (Object granularity
		 * reference counting is insufficient to recognize
		 * aliases with precision.)
		 */
		if ((object->flags & OBJ_ANON) != 0) {
			VM_OBJECT_WLOCK(object);
			if (object->ref_count > 1 || object->shadow_count != 0)
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
			VM_OBJECT_WUNLOCK(object);
		}
	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
	    protoeflags &&
	    (cow & (MAP_STACK_AREA | MAP_VN_EXEC)) == 0 &&
	    prev_entry->end == start && (prev_entry->cred == cred ||
	    (prev_entry->object.vm_object != NULL &&
	    prev_entry->object.vm_object->cred == cred)) &&
	    vm_object_coalesce(prev_entry->object.vm_object,
	    prev_entry->offset,
	    (vm_size_t)(prev_entry->end - prev_entry->start),
	    (vm_size_t)(end - prev_entry->end), cflags)) {
		/*
		 * We were able to extend the object. Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if (prev_entry->inheritance == inheritance &&
		    prev_entry->protection == prot &&
		    prev_entry->max_protection == max &&
		    prev_entry->wired_count == 0) {
			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
			    0, ("prev_entry %p has incoherent wiring",
			    prev_entry));
			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
				map->size += end - prev_entry->end;
			vm_map_entry_resize(map, prev_entry,
			    end - prev_entry->end);
			*res = vm_map_try_merge_entries(map, prev_entry,
			    next_entry);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry. We
		 * must bump the ref count on the extended object to
		 * account for it. object may be NULL.
		 */
		object = prev_entry->object.vm_object;
		offset = prev_entry->offset +
		    (prev_entry->end - prev_entry->start);
		vm_object_reference(object);
		if (cred != NULL && object != NULL && object->cred != NULL &&
		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			/* Object already accounts for this uid. */
			cred = NULL;
		}
	}
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, new_entry);
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list. Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL. Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_try_merge_entries(map, prev_entry, new_entry);
	*res = vm_map_try_merge_entries(map, new_entry, next_entry);

	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}

/*
 * vm_map_insert:
 *
 * Inserts the given VM object into the target map at the
 * specified address range.
 *
 * Requires that the map be locked, and leaves it so.
 *
 * If object is non-NULL, ref count must be bumped by caller
 * prior to making call to account for the new entry.
 */
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t res;

	return (vm_map_insert1(map, object, offset, start, end, prot, max,
	    cow, &res));
}

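/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows how a caller might use vm_map_insert() as documented above:
 * the object reference is bumped before the call, the map is locked
 * across it, and the extra reference is dropped again if the insertion
 * fails.  The helper name and the fixed [start, end) range are
 * hypothetical.
 */
#if 0
static int
example_insert_object(vm_map_t map, vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	int rv;

	/* Account for the reference that the new map entry will own. */
	if (object != NULL)
		vm_object_reference(object);

	vm_map_lock(map);
	rv = vm_map_insert(map, object, 0, start, end,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
	vm_map_unlock(map);

	/* On failure the caller still owns the extra reference. */
	if (rv != KERN_SUCCESS && object != NULL)
		vm_object_deallocate(object);
	return (rv);
}
#endif
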
/*
 * vm_map_findspace:
 *
 * Find the first fit (lowest VM address) for "length" free bytes
 * beginning at address >= start in the given map.
 *
 * In a vm_map_entry, "max_free" is the maximum amount of
 * contiguous free space between an entry in its subtree and a
 * neighbor of that entry. This allows finding a free region in
 * one path down the tree, so the search is O(log n) amortized
 * with splay trees.
 *
 * The map must be locked, and is left locked.
 *
 * Returns: starting address if sufficient space,
 * vm_map_max(map)-length+1 if insufficient space.
 */
vm_offset_t
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
{
	vm_map_entry_t header, llist, rlist, root, y;
	vm_size_t left_length, max_free_left, max_free_right;
	vm_offset_t gap_end;

	VM_MAP_ASSERT_LOCKED(map);

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	start = MAX(start, vm_map_min(map));
	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
		return (vm_map_max(map) - length + 1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL)
		return (start);

	/*
	 * After splay_split, if start is within an entry, push it to the start
	 * of the following gap. If rlist is at the end of the gap containing
	 * start, save the end of that gap in gap_end to see if the gap is big
	 * enough; otherwise set gap_end to start, to skip gap-checking and move
	 * directly to a search of the right subtree.
	 */
	header = &map->header;
	root = vm_map_splay_split(map, start, length, &llist, &rlist);
	gap_end = rlist->start;
	if (root != NULL) {
		start = root->end;
		if (root->right != rlist)
			gap_end = start;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else if (rlist != header) {
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	}
	root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	if (length <= gap_end - start)
		return (start);

	/* With max_free, can immediately tell if no solution. */
	if (root->right == header || length > root->right->max_free)
		return (vm_map_max(map) - length + 1);

	/*
	 * Splay for the least large-enough gap in the right subtree.
	 */
	llist = rlist = header;
	for (left_length = 0;;
	    left_length = vm_map_entry_max_free_left(root, llist)) {
		if (length <= left_length)
			SPLAY_LEFT_STEP(root, y, llist, rlist,
			    length <= vm_map_entry_max_free_left(y, llist));
		else
			SPLAY_RIGHT_STEP(root, y, llist, rlist,
			    length > vm_map_entry_max_free_left(y, root));
		if (root == NULL)
			break;
	}
	root = llist;
	llist = root->right;
	max_free_left = vm_map_splay_merge_left(header, root, llist);
	if (rlist == header) {
		root->max_free = vm_size_max(max_free_left,
		    vm_map_splay_merge_succ(header, root, rlist));
	} else {
		y = rlist;
		rlist = y->left;
		y->max_free = vm_size_max(
		    vm_map_splay_merge_pred(root, y, root),
		    vm_map_splay_merge_right(header, y, rlist));
		root->max_free = vm_size_max(max_free_left, y->max_free);
	}
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	return (root->end);
}

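/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It demonstrates the return convention documented above: on failure
 * vm_map_findspace() returns vm_map_max(map) - length + 1, so callers
 * check whether the returned range would run past the map maximum rather
 * than testing for a NULL-like value.  The helper name is hypothetical;
 * the map must be locked.
 */
#if 0
static bool
example_have_space(vm_map_t map, vm_offset_t start, vm_size_t length,
    vm_offset_t *addrp)
{
	vm_offset_t addr;

	addr = vm_map_findspace(map, start, length);
	if (addr + length > vm_map_max(map))
		return (false);	/* insufficient space */
	*addrp = addr;
	return (true);
}
#endif
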
int
vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_size_t length, vm_prot_t prot,
    vm_prot_t max, int cow)
{
	vm_offset_t end;
	int result;

	end = start + length;
	KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
	    ("vm_map_fixed: non-NULL backing object for stack"));
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if ((cow & MAP_CHECK_EXCL) == 0) {
		result = vm_map_delete(map, start, end);
		if (result != KERN_SUCCESS)
			goto out;
	}
	if ((cow & MAP_STACK_AREA) != 0) {
		result = vm_map_stack_locked(map, start, length, sgrowsiz,
		    prot, max, cow);
	} else {
		result = vm_map_insert(map, object, offset, start, end,
		    prot, max, cow);
	}
out:
	vm_map_unlock(map);
	return (result);
}

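/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * vm_map_fixed() above either replaces whatever is mapped at the given
 * range or, with MAP_CHECK_EXCL, fails rather than delete an existing
 * mapping.  The wrapper below only names that choice; the helper name
 * and the "exclusive" parameter are hypothetical, and "object" is
 * assumed to have been referenced by the caller, as for vm_map_insert().
 */
#if 0
static int
example_map_fixed(vm_map_t map, vm_object_t object, vm_offset_t start,
    vm_size_t length, bool exclusive)
{
	int cow;

	cow = exclusive ? MAP_CHECK_EXCL : 0;
	return (vm_map_fixed(map, object, 0, start, length,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, cow));
}
#endif
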
#if VM_NRESERVLEVEL <= 1
static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
#elif VM_NRESERVLEVEL == 2
static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10};
static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4};
#else
#error "Unsupported VM_NRESERVLEVEL"
#endif

static int cluster_anon = 1;
SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
    &cluster_anon, 0,
    "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");

static bool
clustering_anon_allowed(vm_offset_t addr, int cow)
{

	switch (cluster_anon) {
	case 0:
		return (false);
	case 1:
		return (addr == 0 || (cow & MAP_NO_HINT) != 0);
	case 2:
	default:
		return (true);
	}
}

static long aslr_restarts;
SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
    &aslr_restarts, 0,
    "Number of aslr failures");

/*
 * Searches for the specified amount of free space in the given map with the
 * specified alignment. Performs an address-ordered, first-fit search from
 * the given address "*addr", with an optional upper bound "max_addr". If the
 * parameter "alignment" is zero, then the alignment is computed from the
 * given (object, offset) pair so as to enable the greatest possible use of
 * superpage mappings. Returns KERN_SUCCESS and the address of the free space
 * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
 *
 * The map must be locked. Initially, there must be at least "length" bytes
 * of free space at the given address.
 */
static int
vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
    vm_offset_t alignment)
{
	vm_offset_t aligned_addr, free_addr;

	VM_MAP_ASSERT_LOCKED(map);
	free_addr = *addr;
	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
	    ("caller failed to provide space %#jx at address %p",
	    (uintmax_t)length, (void *)free_addr));
	for (;;) {
		/*
		 * At the start of every iteration, the free space at address
		 * "*addr" is at least "length" bytes.
		 */
		if (alignment == 0)
			pmap_align_superpage(object, offset, addr, length);
		else
			*addr = roundup2(*addr, alignment);
		aligned_addr = *addr;
		if (aligned_addr == free_addr) {
			/*
			 * Alignment did not change "*addr", so "*addr" must
			 * still provide sufficient free space.
			 */
			return (KERN_SUCCESS);
		}

		/*
		 * Test for address wrap on "*addr". A wrapped "*addr" could
		 * be a valid address, in which case vm_map_findspace() cannot
		 * be relied upon to fail.
		 */
		if (aligned_addr < free_addr)
			return (KERN_NO_SPACE);
		*addr = vm_map_findspace(map, aligned_addr, length);
		if (*addr + length > vm_map_max(map) ||
		    (max_addr != 0 && *addr + length > max_addr))
			return (KERN_NO_SPACE);
		free_addr = *addr;
		if (free_addr == aligned_addr) {
			/*
			 * If a successful call to vm_map_findspace() did not
			 * change "*addr", then "*addr" must still be aligned
			 * and provide sufficient free space.
			 */
			return (KERN_SUCCESS);
		}
	}
}

int
vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
    vm_offset_t max_addr, vm_offset_t alignment)
{
	/* XXXKIB ASLR eh ? */
	*addr = vm_map_findspace(map, *addr, length);
	if (*addr + length > vm_map_max(map) ||
	    (max_addr != 0 && *addr + length > max_addr))
		return (KERN_NO_SPACE);
	return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
	    alignment));
}

/*
 * vm_map_find finds an unallocated region in the target address
 * map with the given length. The search is defined to be
 * first-fit from the specified address; the region found is
 * returned in the same parameter.
 *
 * If object is non-NULL, ref count must be bumped by caller
 * prior to making call to account for the new entry.
 */
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr,			/* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	int rv;

	vm_map_lock(map);
	rv = vm_map_find_locked(map, object, offset, addr, length, max_addr,
	    find_space, prot, max, cow);
	vm_map_unlock(map);
	return (rv);
}

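/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows a minimal vm_map_find() call and the find_space encoding used
 * by vm_map_find_locked() below: a zero low byte with log2(alignment) in
 * the upper bits requests an explicitly aligned mapping, while
 * VMFS_ANY_SPACE takes the first fit.  The helper name and the 2 MB
 * alignment are hypothetical, and "object" is assumed to have been
 * referenced by the caller.
 */
#if 0
static int
example_find_anywhere(vm_map_t map, vm_object_t object, vm_size_t length,
    vm_offset_t *addr, bool superpage_aligned)
{
	int find_space;

	/* log2(2 MB) == 21, stored in bits 8 and up (see find_space >> 8). */
	find_space = superpage_aligned ? (21 << 8) : VMFS_ANY_SPACE;
	return (vm_map_find(map, object, 0, addr, length, 0 /* max_addr */,
	    find_space, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}
#endif
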
int
vm_map_find_locked(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr,			/* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_offset_t alignment, curr_min_addr, min_addr;
	int gap, pidx, rv, try;
	bool cluster, en_aslr, update_anon;

	KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
	    ("non-NULL backing object for stack"));
	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
	    (cow & MAP_STACK_AREA) == 0));
	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
	    (object->flags & OBJ_COLORED) == 0))
		find_space = VMFS_ANY_SPACE;
	if (find_space >> 8 != 0) {
		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
		alignment = (vm_offset_t)1 << (find_space >> 8);
	} else
		alignment = 0;
	en_aslr = (map->flags & MAP_ASLR) != 0;
	update_anon = cluster = clustering_anon_allowed(*addr, cow) &&
	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
	    find_space != VMFS_NO_SPACE && object == NULL &&
	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_AREA)) == 0 &&
	    prot != PROT_NONE;
	curr_min_addr = min_addr = *addr;
	if (en_aslr && min_addr == 0 && !cluster &&
	    find_space != VMFS_NO_SPACE &&
	    (map->flags & MAP_ASLR_IGNSTART) != 0)
		curr_min_addr = min_addr = vm_map_min(map);
	try = 0;
	if (cluster) {
		curr_min_addr = map->anon_loc;
		if (curr_min_addr == 0)
			cluster = false;
	}
	if (find_space != VMFS_NO_SPACE) {
		KASSERT(find_space == VMFS_ANY_SPACE ||
		    find_space == VMFS_OPTIMAL_SPACE ||
		    find_space == VMFS_SUPER_SPACE ||
		    alignment != 0, ("unexpected VMFS flag"));
again:
		/*
		 * When creating an anonymous mapping, try clustering
		 * with an existing anonymous mapping first.
		 *
		 * We make up to two attempts to find address space
		 * for a given find_space value. The first attempt may
		 * apply randomization or may cluster with an existing
		 * anonymous mapping. If this first attempt fails,
		 * perform a first-fit search of the available address
		 * space.
		 *
		 * If all tries failed, and find_space is
		 * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE and
		 * again enable clustering and randomization.
		 */
		try++;
		MPASS(try <= 2);

		if (try == 2) {
			/*
			 * Second try: we failed either to find a
			 * suitable region for randomizing the
			 * allocation, or to cluster with an existing
			 * mapping. Retry with free run.
			 */
			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
			    vm_map_min(map) : min_addr;
			atomic_add_long(&aslr_restarts, 1);
		}

		if (try == 1 && en_aslr && !cluster) {
			/*
			 * Find space for allocation, including
			 * gap needed for later randomization.
			 */
			pidx = 0;
#if VM_NRESERVLEVEL > 0
			if ((find_space == VMFS_SUPER_SPACE ||
			    find_space == VMFS_OPTIMAL_SPACE) &&
			    pagesizes[VM_NRESERVLEVEL] != 0) {
				/*
				 * Do not pointlessly increase the space that
				 * is requested from vm_map_findspace().
				 * pmap_align_superpage() will only change a
				 * mapping's alignment if that mapping is at
				 * least a superpage in size.
				 */
				pidx = VM_NRESERVLEVEL;
				while (pidx > 0 && length < pagesizes[pidx])
					pidx--;
			}
#endif
			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
			*addr = vm_map_findspace(map, curr_min_addr,
			    length + gap * pagesizes[pidx]);
			if (*addr + length + gap * pagesizes[pidx] >
			    vm_map_max(map))
				goto again;
			/* And randomize the start address. */
			*addr += (arc4random() % gap) * pagesizes[pidx];
			if (max_addr != 0 && *addr + length > max_addr)
				goto again;
		} else {
			*addr = vm_map_findspace(map, curr_min_addr, length);
			if (*addr + length > vm_map_max(map) ||
			    (max_addr != 0 && *addr + length > max_addr)) {
				if (cluster) {
					cluster = false;
					MPASS(try == 1);
					goto again;
				}
				return (KERN_NO_SPACE);
			}
		}

		if (find_space != VMFS_ANY_SPACE &&
		    (rv = vm_map_alignspace(map, object, offset, addr, length,
		    max_addr, alignment)) != KERN_SUCCESS) {
			if (find_space == VMFS_OPTIMAL_SPACE) {
				find_space = VMFS_ANY_SPACE;
				curr_min_addr = min_addr;
				cluster = update_anon;
				try = 0;
				goto again;
			}
			return (rv);
		}
	} else if ((cow & MAP_REMAP) != 0) {
		if (!vm_map_range_valid(map, *addr, *addr + length))
			return (KERN_INVALID_ADDRESS);
		rv = vm_map_delete(map, *addr, *addr + length);
		if (rv != KERN_SUCCESS)
			return (rv);
	}
	if ((cow & MAP_STACK_AREA) != 0) {
		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
		    max, cow);
	} else {
		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
		    prot, max, cow);
	}

	/*
	 * Update the starting address for clustered anonymous memory mappings
	 * if a starting address was not previously defined or an ASLR restart
	 * placed an anonymous memory mapping at a lower address.
	 */
	if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 ||
	    *addr < map->anon_loc))
		map->anon_loc = *addr;
	return (rv);
}

/*
 * vm_map_find_min() is a variant of vm_map_find() that takes an
 * additional parameter ("default_addr") and treats the given address
 * ("*addr") differently. Specifically, it treats "*addr" as a hint
 * and not as the minimum address where the mapping is created.
 *
 * This function works in two phases. First, it tries to
 * allocate above the hint. If that fails and the hint is
 * greater than "default_addr", it performs a second pass, replacing
 * the hint with "default_addr" as the minimum address for the
 * allocation.
 */
int
vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t length, vm_offset_t default_addr,
    vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
    int cow)
{
	vm_offset_t hint;
	int rv;

	hint = *addr;
	if (hint == 0) {
		cow |= MAP_NO_HINT;
		*addr = hint = default_addr;
	}
	for (;;) {
		rv = vm_map_find(map, object, offset, addr, length, max_addr,
		    find_space, prot, max, cow);
		if (rv == KERN_SUCCESS || default_addr >= hint)
			return (rv);
		*addr = hint = default_addr;
	}
}

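/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It restates the two-phase behavior documented above: the first
 * vm_map_find() pass starts at the caller's hint, and only if that fails
 * (and the hint lies above default_addr) does the search restart from
 * default_addr.  The helper name and the choice of vm_map_min(map) as
 * the fallback are hypothetical.
 */
#if 0
static int
example_find_with_hint(vm_map_t map, vm_size_t length, vm_offset_t hint,
    vm_offset_t *addr)
{
	*addr = hint;		/* pass 1 starts here ... */
	return (vm_map_find_min(map, NULL, 0, addr, length,
	    vm_map_min(map),	/* ... pass 2 falls back to this minimum */
	    0, VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}
#endif
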
/*
 * A map entry with any of the following flags set must not be merged with
 * another entry.
 */
#define	MAP_ENTRY_NOMERGE_MASK	(MAP_ENTRY_GROWS_DOWN | \
    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC | \
    MAP_ENTRY_STACK_GAP)

static bool
vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
{

	KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
	    (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
	    ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
	    prev, entry));
	return (prev->end == entry->start &&
	    prev->object.vm_object == entry->object.vm_object &&
	    (prev->object.vm_object == NULL ||
	    prev->offset + (prev->end - prev->start) == entry->offset) &&
	    prev->eflags == entry->eflags &&
	    prev->protection == entry->protection &&
	    prev->max_protection == entry->max_protection &&
	    prev->inheritance == entry->inheritance &&
	    prev->wired_count == entry->wired_count &&
	    prev->cred == entry->cred);
}

static void
vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
{

	/*
	 * If the backing object is a vnode object, vm_object_deallocate()
	 * calls vrele(). However, vrele() does not lock the vnode because
	 * the vnode has additional references. Thus, the map lock can be
	 * kept without causing a lock-order reversal with the vnode lock.
	 *
	 * Since we count the number of virtual page mappings in
	 * object->un_pager.vnp.writemappings, the writemappings value
	 * should not be adjusted when the entry is disposed of.
	 */
	if (entry->object.vm_object != NULL)
		vm_object_deallocate(entry->object.vm_object);
	if (entry->cred != NULL)
		crfree(entry->cred);
	vm_map_entry_dispose(map, entry);
}

/*
 * vm_map_try_merge_entries:
 *
 * Compare two map entries that represent consecutive ranges. If
 * the entries can be merged, expand the range of the second to
 * cover the range of the first and delete the first. Then return
 * the map entry that includes the first range.
 *
 * The map must be locked.
 */
vm_map_entry_t
vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
    vm_map_entry_t entry)
{

	VM_MAP_ASSERT_LOCKED(map);
	if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
	    vm_map_mergeable_neighbors(prev_entry, entry)) {
		vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
		vm_map_merged_neighbor_dispose(map, prev_entry);
		return (entry);
	}
	return (prev_entry);
}
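
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows the calling pattern used throughout this file after an entry
 * has been created or resized: try to merge with the predecessor first,
 * then with the successor, keeping the entry pointer returned by the
 * second call.  The variable names mirror those used above and are
 * otherwise hypothetical.
 */
#if 0
	/* With the map locked and "new_entry" freshly linked: */
	vm_map_try_merge_entries(map, prev_entry, new_entry);
	new_entry = vm_map_try_merge_entries(map, new_entry, next_entry);
#endif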
2415
2416
/*
2417
* vm_map_entry_back:
2418
*
2419
* Allocate an object to back a map entry.
2420
*/
2421
static inline void
2422
vm_map_entry_back(vm_map_entry_t entry)
2423
{
2424
vm_object_t object;
2425
2426
KASSERT(entry->object.vm_object == NULL,
2427
("map entry %p has backing object", entry));
2428
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2429
("map entry %p is a submap", entry));
2430
object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2431
entry->cred);
2432
entry->object.vm_object = object;
2433
entry->offset = 0;
2434
entry->cred = NULL;
2435
}
2436
2437
/*
2438
* vm_map_entry_charge_object
2439
*
2440
* If there is no object backing this entry, create one. Otherwise, if
2441
* the entry has cred, give it to the backing object.
2442
*/
2443
static inline void
2444
vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2445
{
2446
vm_object_t object;
2447
2448
VM_MAP_ASSERT_LOCKED(map);
2449
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2450
("map entry %p is a submap", entry));
2451
object = entry->object.vm_object;
2452
if (object == NULL && !vm_map_is_system(map) &&
2453
(entry->eflags & MAP_ENTRY_GUARD) == 0)
2454
vm_map_entry_back(entry);
2455
else if (object != NULL &&
2456
((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2457
entry->cred != NULL) {
2458
VM_OBJECT_WLOCK(object);
2459
KASSERT(object->cred == NULL,
2460
("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2461
object->cred = entry->cred;
2462
if (entry->end - entry->start < ptoa(object->size)) {
2463
swap_reserve_force_by_cred(ptoa(object->size) -
2464
entry->end + entry->start, object->cred);
2465
}
2466
VM_OBJECT_WUNLOCK(entry->object.vm_object);
2467
entry->cred = NULL;
2468
}
2469
}
2470
2471
/*
2472
* vm_map_entry_clone
2473
*
2474
* Create a duplicate map entry for clipping.
2475
*/
2476
static vm_map_entry_t
2477
vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2478
{
2479
vm_map_entry_t new_entry;
2480
2481
VM_MAP_ASSERT_LOCKED(map);
2482
2483
/*
2484
* Create a backing object now, if none exists, so that more individual
2485
* objects won't be created after the map entry is split.
2486
*/
2487
vm_map_entry_charge_object(map, entry);
2488
2489
/* Clone the entry. */
2490
new_entry = vm_map_entry_create(map);
2491
*new_entry = *entry;
2492
if (new_entry->cred != NULL)
2493
crhold(entry->cred);
2494
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2495
vm_object_reference(new_entry->object.vm_object);
2496
vm_map_entry_set_vnode_text(new_entry, true);
2497
/*
2498
* The object->un_pager.vnp.writemappings for the object of
2499
* MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2500
* virtual pages are re-distributed among the clipped entries,
2501
* so the sum is left the same.
2502
*/
2503
}
2504
return (new_entry);
2505
}
2506
2507
/*
2508
* vm_map_clip_start: [ internal use only ]
2509
*
2510
* Asserts that the given entry begins at or after
2511
* the specified address; if necessary,
2512
* it splits the entry into two.
2513
*/
2514
static int
2515
vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2516
{
2517
vm_map_entry_t new_entry;
2518
int bdry_idx;
2519
2520
if (!vm_map_is_system(map))
2521
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2522
"%s: map %p entry %p start 0x%jx", __func__, map, entry,
2523
(uintmax_t)startaddr);
2524
2525
if (startaddr <= entry->start)
2526
return (KERN_SUCCESS);
2527
2528
VM_MAP_ASSERT_LOCKED(map);
2529
KASSERT(entry->end > startaddr && entry->start < startaddr,
2530
("%s: invalid clip of entry %p", __func__, entry));
2531
2532
bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2533
if (bdry_idx != 0) {
2534
if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2535
return (KERN_INVALID_ARGUMENT);
2536
}
2537
2538
new_entry = vm_map_entry_clone(map, entry);
2539
2540
/*
2541
* Split off the front portion. Insert the new entry BEFORE this one,
2542
* so that this entry has the specified starting address.
2543
*/
2544
new_entry->end = startaddr;
2545
vm_map_entry_link(map, new_entry);
2546
return (KERN_SUCCESS);
2547
}
2548
2549
/*
2550
* vm_map_lookup_clip_start:
2551
*
2552
* Find the entry at or just after 'start', and clip it if 'start' is in
2553
* the interior of the entry. Return entry after 'start', and in
2554
* prev_entry set the entry before 'start'.
2555
*/
2556
static int
2557
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2558
vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2559
{
2560
vm_map_entry_t entry;
2561
int rv;
2562
2563
if (!vm_map_is_system(map))
2564
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2565
"%s: map %p start 0x%jx prev %p", __func__, map,
2566
(uintmax_t)start, prev_entry);
2567
2568
if (vm_map_lookup_entry(map, start, prev_entry)) {
2569
entry = *prev_entry;
2570
rv = vm_map_clip_start(map, entry, start);
2571
if (rv != KERN_SUCCESS)
2572
return (rv);
2573
*prev_entry = vm_map_entry_pred(entry);
2574
} else
2575
entry = vm_map_entry_succ(*prev_entry);
2576
*res_entry = entry;
2577
return (KERN_SUCCESS);
2578
}
2579
2580
/*
2581
* vm_map_clip_end: [ internal use only ]
2582
*
2583
* Asserts that the given entry ends at or before
2584
* the specified address; if necessary,
2585
* it splits the entry into two.
2586
*/
2587
static int
2588
vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2589
{
2590
vm_map_entry_t new_entry;
2591
int bdry_idx;
2592
2593
if (!vm_map_is_system(map))
2594
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2595
"%s: map %p entry %p end 0x%jx", __func__, map, entry,
2596
(uintmax_t)endaddr);
2597
2598
if (endaddr >= entry->end)
2599
return (KERN_SUCCESS);
2600
2601
VM_MAP_ASSERT_LOCKED(map);
2602
KASSERT(entry->start < endaddr && entry->end > endaddr,
2603
("%s: invalid clip of entry %p", __func__, entry));
2604
2605
bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2606
if (bdry_idx != 0) {
2607
if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2608
return (KERN_INVALID_ARGUMENT);
2609
}
2610
2611
new_entry = vm_map_entry_clone(map, entry);
2612
2613
/*
2614
* Split off the back portion. Insert the new entry AFTER this one,
2615
* so that this entry has the specified ending address.
2616
*/
2617
new_entry->start = endaddr;
2618
vm_map_entry_link(map, new_entry);
2619
2620
return (KERN_SUCCESS);
2621
}
2622
2623
/*
 * vm_map_submap: [ kernel use only ]
 *
 * Mark the given range as handled by a subordinate map.
 *
 * This range must have been created with vm_map_find,
 * and no other operations may have been performed on this
 * range prior to calling vm_map_submap.
 *
 * Only a limited number of operations can be performed
 * within this range after calling vm_map_submap:
 * vm_fault
 * [Don't try vm_map_copy!]
 *
 * To remove a submapping, one must first remove the
 * range from the superior map, and then destroy the
 * submap (if desired). [Better yet, don't try it.]
 */
2641
int
2642
vm_map_submap(
2643
vm_map_t map,
2644
vm_offset_t start,
2645
vm_offset_t end,
2646
vm_map_t submap)
2647
{
2648
vm_map_entry_t entry;
2649
int result;
2650
2651
result = KERN_INVALID_ARGUMENT;
2652
2653
vm_map_lock(submap);
2654
submap->flags |= MAP_IS_SUB_MAP;
2655
vm_map_unlock(submap);
2656
2657
vm_map_lock(map);
2658
VM_MAP_RANGE_CHECK(map, start, end);
2659
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2660
(entry->eflags & MAP_ENTRY_COW) == 0 &&
2661
entry->object.vm_object == NULL) {
2662
result = vm_map_clip_start(map, entry, start);
2663
if (result != KERN_SUCCESS)
2664
goto unlock;
2665
result = vm_map_clip_end(map, entry, end);
2666
if (result != KERN_SUCCESS)
2667
goto unlock;
2668
entry->object.sub_map = submap;
2669
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2670
result = KERN_SUCCESS;
2671
}
2672
unlock:
2673
vm_map_unlock(map);
2674
2675
if (result != KERN_SUCCESS) {
2676
vm_map_lock(submap);
2677
submap->flags &= ~MAP_IS_SUB_MAP;
2678
vm_map_unlock(submap);
2679
}
2680
return (result);
2681
}
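
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows the order of operations implied by the comment above: the
 * range is first created in the parent map with vm_map_find(), and only
 * then marked as handled by the subordinate map.  The helper name is
 * hypothetical and error unwinding is omitted for brevity.
 */
#if 0
static int
example_attach_submap(vm_map_t parent, vm_map_t submap, vm_offset_t *addr,
    vm_size_t length)
{
	int rv;

	rv = vm_map_find(parent, NULL, 0, addr, length, 0, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (rv != KERN_SUCCESS)
		return (rv);
	return (vm_map_submap(parent, *addr, *addr + length, submap));
}
#endif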
2682
2683
/*
2684
* The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2685
*/
2686
#define MAX_INIT_PT 96
2687
2688
/*
2689
* vm_map_pmap_enter:
2690
*
2691
* Preload the specified map's pmap with mappings to the specified
2692
* object's memory-resident pages. No further physical pages are
2693
* allocated, and no further virtual pages are retrieved from secondary
2694
* storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
2695
* limited number of page mappings are created at the low-end of the
2696
* specified address range. (For this purpose, a superpage mapping
2697
* counts as one page mapping.) Otherwise, all resident pages within
2698
* the specified address range are mapped.
2699
*/
2700
static void
2701
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2702
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2703
{
2704
struct pctrie_iter pages;
2705
vm_offset_t start;
2706
vm_page_t p, p_start;
2707
vm_pindex_t jump, mask, psize, threshold, tmpidx;
2708
int psind;
2709
2710
if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2711
return;
2712
if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2713
VM_OBJECT_WLOCK(object);
2714
if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2715
pmap_object_init_pt(map->pmap, addr, object, pindex,
2716
size);
2717
VM_OBJECT_WUNLOCK(object);
2718
return;
2719
}
2720
VM_OBJECT_LOCK_DOWNGRADE(object);
2721
} else
2722
VM_OBJECT_RLOCK(object);
2723
2724
psize = atop(size);
2725
if (psize + pindex > object->size) {
2726
if (pindex >= object->size) {
2727
VM_OBJECT_RUNLOCK(object);
2728
return;
2729
}
2730
psize = object->size - pindex;
2731
}
2732
2733
start = 0;
2734
p_start = NULL;
2735
threshold = MAX_INIT_PT;
2736
2737
vm_page_iter_limit_init(&pages, object, pindex + psize);
2738
for (p = vm_radix_iter_lookup_ge(&pages, pindex); p != NULL;
2739
p = vm_radix_iter_jump(&pages, jump)) {
2740
/*
2741
* don't allow an madvise to blow away our really
2742
* free pages allocating pv entries.
2743
*/
2744
tmpidx = p->pindex - pindex;
2745
if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2746
vm_page_count_severe()) ||
2747
((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2748
tmpidx >= threshold)) {
2749
psize = tmpidx;
2750
break;
2751
}
2752
jump = 1;
2753
if (vm_page_all_valid(p)) {
2754
if (p_start == NULL) {
2755
start = addr + ptoa(tmpidx);
2756
p_start = p;
2757
}
2758
/* Jump ahead if a superpage mapping is possible. */
2759
for (psind = p->psind; psind > 0; psind--) {
2760
if (((addr + ptoa(tmpidx)) &
2761
(pagesizes[psind] - 1)) == 0) {
2762
mask = atop(pagesizes[psind]) - 1;
2763
if (tmpidx + mask < psize &&
2764
vm_page_ps_test(p, psind,
2765
PS_ALL_VALID, NULL)) {
2766
jump += mask;
2767
threshold += mask;
2768
break;
2769
}
2770
}
2771
}
2772
} else if (p_start != NULL) {
2773
pmap_enter_object(map->pmap, start, addr +
2774
ptoa(tmpidx), p_start, prot);
2775
p_start = NULL;
2776
}
2777
}
2778
if (p_start != NULL)
2779
pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2780
p_start, prot);
2781
VM_OBJECT_RUNLOCK(object);
2782
}
2783
2784
static void
2785
vm_map_protect_guard(vm_map_entry_t entry, vm_prot_t new_prot,
2786
vm_prot_t new_maxprot, int flags)
2787
{
2788
vm_prot_t old_prot;
2789
2790
MPASS((entry->eflags & MAP_ENTRY_GUARD) != 0);
2791
if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
2792
return;
2793
2794
old_prot = PROT_EXTRACT(entry->offset);
2795
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2796
entry->offset = PROT_MAX(new_maxprot) |
2797
(new_maxprot & old_prot);
2798
}
2799
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0) {
2800
entry->offset = new_prot | PROT_MAX(
2801
PROT_MAX_EXTRACT(entry->offset));
2802
}
2803
}
2804
2805
/*
2806
* vm_map_protect:
2807
*
2808
* Sets the protection and/or the maximum protection of the
2809
* specified address region in the target map.
2810
*/
2811
int
2812
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2813
vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2814
{
2815
vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2816
vm_object_t obj;
2817
struct ucred *cred;
2818
vm_offset_t orig_start;
2819
vm_prot_t check_prot, max_prot, old_prot;
2820
int rv;
2821
2822
if (start == end)
2823
return (KERN_SUCCESS);
2824
2825
if (CONTAINS_BITS(flags, VM_MAP_PROTECT_SET_PROT |
2826
VM_MAP_PROTECT_SET_MAXPROT) &&
2827
!CONTAINS_BITS(new_maxprot, new_prot))
2828
return (KERN_OUT_OF_BOUNDS);
2829
2830
orig_start = start;
2831
again:
2832
in_tran = NULL;
2833
start = orig_start;
2834
vm_map_lock(map);
2835
2836
if ((map->flags & MAP_WXORX) != 0 &&
2837
(flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2838
CONTAINS_BITS(new_prot, VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2839
vm_map_unlock(map);
2840
return (KERN_PROTECTION_FAILURE);
2841
}
2842
2843
/*
2844
* Ensure that we are not concurrently wiring pages. vm_map_wire() may
2845
* need to fault pages into the map and will drop the map lock while
2846
* doing so, and the VM object may end up in an inconsistent state if we
2847
* update the protection on the map entry in between faults.
2848
*/
2849
vm_map_wait_busy(map);
2850
2851
VM_MAP_RANGE_CHECK(map, start, end);
2852
2853
if (!vm_map_lookup_entry(map, start, &first_entry))
2854
first_entry = vm_map_entry_succ(first_entry);
2855
2856
if ((flags & VM_MAP_PROTECT_GROWSDOWN) != 0 &&
2857
(first_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
2858
/*
2859
* Handle Linux's PROT_GROWSDOWN flag.
2860
* It means that protection is applied down to the
2861
* whole stack, including the specified range of the
2862
* mapped region, and the grow down region (AKA
2863
* guard).
2864
*/
2865
while (!CONTAINS_BITS(first_entry->eflags,
2866
MAP_ENTRY_GUARD | MAP_ENTRY_STACK_GAP) &&
2867
first_entry != vm_map_entry_first(map))
2868
first_entry = vm_map_entry_pred(first_entry);
2869
start = first_entry->start;
2870
}
2871
2872
/*
2873
* Make a first pass to check for protection violations.
2874
*/
2875
check_prot = 0;
2876
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2877
check_prot |= new_prot;
2878
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0)
2879
check_prot |= new_maxprot;
2880
for (entry = first_entry; entry->start < end;
2881
entry = vm_map_entry_succ(entry)) {
2882
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2883
vm_map_unlock(map);
2884
return (KERN_INVALID_ARGUMENT);
2885
}
2886
if ((entry->eflags & (MAP_ENTRY_GUARD |
2887
MAP_ENTRY_STACK_GAP)) == MAP_ENTRY_GUARD)
2888
continue;
2889
max_prot = (entry->eflags & MAP_ENTRY_STACK_GAP) != 0 ?
2890
PROT_MAX_EXTRACT(entry->offset) : entry->max_protection;
2891
if (!CONTAINS_BITS(max_prot, check_prot)) {
2892
vm_map_unlock(map);
2893
return (KERN_PROTECTION_FAILURE);
2894
}
2895
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2896
in_tran = entry;
2897
}
2898
2899
/*
2900
* Postpone the operation until all in-transition map entries have
2901
* stabilized. An in-transition entry might already have its pages
2902
* wired and wired_count incremented, but not yet have its
2903
* MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call
2904
* vm_fault_copy_entry() in the final loop below.
2905
*/
2906
if (in_tran != NULL) {
2907
in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2908
vm_map_unlock_and_wait(map, 0);
2909
goto again;
2910
}
2911
2912
/*
2913
* Before changing the protections, try to reserve swap space for any
2914
* private (i.e., copy-on-write) mappings that are transitioning from
2915
* read-only to read/write access. If a reservation fails, break out
2916
* of this loop early and let the next loop simplify the entries, since
2917
* some may now be mergeable.
2918
*/
2919
rv = vm_map_clip_start(map, first_entry, start);
2920
if (rv != KERN_SUCCESS) {
2921
vm_map_unlock(map);
2922
return (rv);
2923
}
2924
for (entry = first_entry; entry->start < end;
2925
entry = vm_map_entry_succ(entry)) {
2926
rv = vm_map_clip_end(map, entry, end);
2927
if (rv != KERN_SUCCESS) {
2928
vm_map_unlock(map);
2929
return (rv);
2930
}
2931
2932
if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2933
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2934
ENTRY_CHARGED(entry) ||
2935
(entry->eflags & MAP_ENTRY_GUARD) != 0)
2936
continue;
2937
2938
cred = curthread->td_ucred;
2939
obj = entry->object.vm_object;
2940
2941
if (obj == NULL ||
2942
(entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2943
if (!swap_reserve(entry->end - entry->start)) {
2944
rv = KERN_RESOURCE_SHORTAGE;
2945
end = entry->end;
2946
break;
2947
}
2948
crhold(cred);
2949
entry->cred = cred;
2950
continue;
2951
}
2952
2953
VM_OBJECT_WLOCK(obj);
2954
if ((obj->flags & OBJ_SWAP) == 0) {
2955
VM_OBJECT_WUNLOCK(obj);
2956
continue;
2957
}
2958
2959
/*
2960
* Charge for the whole object allocation now, since
2961
* we cannot distinguish between non-charged and
2962
* charged clipped mapping of the same object later.
2963
*/
2964
KASSERT(obj->cred == NULL,
2965
("vm_map_protect: object %p overcharged (entry %p)",
2966
obj, entry));
2967
if (!swap_reserve(ptoa(obj->size))) {
2968
VM_OBJECT_WUNLOCK(obj);
2969
rv = KERN_RESOURCE_SHORTAGE;
2970
end = entry->end;
2971
break;
2972
}
2973
2974
crhold(cred);
2975
obj->cred = cred;
2976
VM_OBJECT_WUNLOCK(obj);
2977
}
2978
2979
/*
2980
* If enough swap space was available, go back and fix up protections.
2981
* Otherwise, just simplify entries, since some may have been modified.
2982
* [Note that clipping is not necessary the second time.]
2983
*/
2984
for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2985
entry->start < end;
2986
vm_map_try_merge_entries(map, prev_entry, entry),
2987
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2988
if (rv != KERN_SUCCESS)
2989
continue;
2990
2991
if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2992
vm_map_protect_guard(entry, new_prot, new_maxprot,
2993
flags);
2994
continue;
2995
}
2996
2997
old_prot = entry->protection;
2998
2999
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
3000
entry->max_protection = new_maxprot;
3001
entry->protection = new_maxprot & old_prot;
3002
}
3003
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
3004
entry->protection = new_prot;
3005
3006
/*
3007
* For user wired map entries, the normal lazy evaluation of
3008
* write access upgrades through soft page faults is
3009
* undesirable. Instead, immediately copy any pages that are
3010
* copy-on-write and enable write access in the physical map.
3011
*/
3012
if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
3013
(entry->protection & VM_PROT_WRITE) != 0 &&
3014
(old_prot & VM_PROT_WRITE) == 0)
3015
vm_fault_copy_entry(map, map, entry, entry, NULL);
3016
3017
/*
3018
* When restricting access, update the physical map. Worry
3019
* about copy-on-write here.
3020
*/
3021
if ((old_prot & ~entry->protection) != 0) {
3022
#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
3023
VM_PROT_ALL)
3024
pmap_protect(map->pmap, entry->start,
3025
entry->end,
3026
entry->protection & MASK(entry));
3027
#undef MASK
3028
}
3029
}
3030
vm_map_try_merge_entries(map, prev_entry, entry);
3031
vm_map_unlock(map);
3032
return (rv);
3033
}
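
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows the flag split used by vm_map_protect() above: the effective
 * protection and the maximum protection are set independently, so a
 * caller that only wants to change the effective protection passes just
 * VM_MAP_PROTECT_SET_PROT, and the new_maxprot argument is then unused.
 * The helper name is hypothetical.
 */
#if 0
static int
example_make_readonly(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	return (vm_map_protect(map, start, end, VM_PROT_READ,
	    VM_PROT_NONE /* ignored without VM_MAP_PROTECT_SET_MAXPROT */,
	    VM_MAP_PROTECT_SET_PROT));
}
#endif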
3034
3035
/*
 * vm_map_madvise:
 *
 * This routine traverses a process's map, handling the madvise
 * system call. Advisories are classified as either those affecting
 * the vm_map_entry structure or those affecting the underlying
 * objects.
 */
3043
int
3044
vm_map_madvise(
3045
vm_map_t map,
3046
vm_offset_t start,
3047
vm_offset_t end,
3048
int behav)
3049
{
3050
vm_map_entry_t entry, prev_entry;
3051
int rv;
3052
bool modify_map;
3053
3054
/*
3055
* Some madvise calls directly modify the vm_map_entry, in which case
3056
* we need to use an exclusive lock on the map and we need to perform
3057
* various clipping operations. Otherwise we only need a read-lock
3058
* on the map.
3059
*/
3060
switch(behav) {
3061
case MADV_NORMAL:
3062
case MADV_SEQUENTIAL:
3063
case MADV_RANDOM:
3064
case MADV_NOSYNC:
3065
case MADV_AUTOSYNC:
3066
case MADV_NOCORE:
3067
case MADV_CORE:
3068
if (start == end)
3069
return (0);
3070
modify_map = true;
3071
vm_map_lock(map);
3072
break;
3073
case MADV_WILLNEED:
3074
case MADV_DONTNEED:
3075
case MADV_FREE:
3076
if (start == end)
3077
return (0);
3078
modify_map = false;
3079
vm_map_lock_read(map);
3080
break;
3081
default:
3082
return (EINVAL);
3083
}
3084
3085
/*
3086
* Locate starting entry and clip if necessary.
3087
*/
3088
VM_MAP_RANGE_CHECK(map, start, end);
3089
3090
if (modify_map) {
3091
/*
3092
* madvise behaviors that are implemented in the vm_map_entry.
3093
*
3094
* We clip the vm_map_entry so that behavioral changes are
3095
* limited to the specified address range.
3096
*/
3097
rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3098
if (rv != KERN_SUCCESS) {
3099
vm_map_unlock(map);
3100
return (vm_mmap_to_errno(rv));
3101
}
3102
3103
for (; entry->start < end; prev_entry = entry,
3104
entry = vm_map_entry_succ(entry)) {
3105
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3106
continue;
3107
3108
rv = vm_map_clip_end(map, entry, end);
3109
if (rv != KERN_SUCCESS) {
3110
vm_map_unlock(map);
3111
return (vm_mmap_to_errno(rv));
3112
}
3113
3114
switch (behav) {
3115
case MADV_NORMAL:
3116
vm_map_entry_set_behavior(entry,
3117
MAP_ENTRY_BEHAV_NORMAL);
3118
break;
3119
case MADV_SEQUENTIAL:
3120
vm_map_entry_set_behavior(entry,
3121
MAP_ENTRY_BEHAV_SEQUENTIAL);
3122
break;
3123
case MADV_RANDOM:
3124
vm_map_entry_set_behavior(entry,
3125
MAP_ENTRY_BEHAV_RANDOM);
3126
break;
3127
case MADV_NOSYNC:
3128
entry->eflags |= MAP_ENTRY_NOSYNC;
3129
break;
3130
case MADV_AUTOSYNC:
3131
entry->eflags &= ~MAP_ENTRY_NOSYNC;
3132
break;
3133
case MADV_NOCORE:
3134
entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3135
break;
3136
case MADV_CORE:
3137
entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3138
break;
3139
default:
3140
break;
3141
}
3142
vm_map_try_merge_entries(map, prev_entry, entry);
3143
}
3144
vm_map_try_merge_entries(map, prev_entry, entry);
3145
vm_map_unlock(map);
3146
} else {
3147
vm_pindex_t pstart, pend;
3148
3149
/*
3150
* madvise behaviors that are implemented in the underlying
3151
* vm_object.
3152
*
3153
* Since we don't clip the vm_map_entry, we have to clip
3154
* the vm_object pindex and count.
3155
*/
3156
if (!vm_map_lookup_entry(map, start, &entry))
3157
entry = vm_map_entry_succ(entry);
3158
for (; entry->start < end;
3159
entry = vm_map_entry_succ(entry)) {
3160
vm_offset_t useEnd, useStart;
3161
3162
if ((entry->eflags & (MAP_ENTRY_IS_SUB_MAP |
3163
MAP_ENTRY_GUARD)) != 0)
3164
continue;
3165
3166
/*
3167
* MADV_FREE would otherwise rewind time to
3168
* the creation of the shadow object. Because
3169
* we hold the VM map read-locked, neither the
3170
* entry's object nor the presence of a
3171
* backing object can change.
3172
*/
3173
if (behav == MADV_FREE &&
3174
entry->object.vm_object != NULL &&
3175
entry->object.vm_object->backing_object != NULL)
3176
continue;
3177
3178
pstart = OFF_TO_IDX(entry->offset);
3179
pend = pstart + atop(entry->end - entry->start);
3180
useStart = entry->start;
3181
useEnd = entry->end;
3182
3183
if (entry->start < start) {
3184
pstart += atop(start - entry->start);
3185
useStart = start;
3186
}
3187
if (entry->end > end) {
3188
pend -= atop(entry->end - end);
3189
useEnd = end;
3190
}
3191
3192
if (pstart >= pend)
3193
continue;
3194
3195
/*
3196
* Perform the pmap_advise() before clearing
3197
* PGA_REFERENCED in vm_page_advise(). Otherwise, a
3198
* concurrent pmap operation, such as pmap_remove(),
3199
* could clear a reference in the pmap and set
3200
* PGA_REFERENCED on the page before the pmap_advise()
3201
* had completed. Consequently, the page would appear
3202
* referenced based upon an old reference that
3203
* occurred before this pmap_advise() ran.
3204
*/
3205
if (behav == MADV_DONTNEED || behav == MADV_FREE)
3206
pmap_advise(map->pmap, useStart, useEnd,
3207
behav);
3208
3209
vm_object_madvise(entry->object.vm_object, pstart,
3210
pend, behav);
3211
3212
/*
3213
* Pre-populate paging structures in the
3214
* WILLNEED case. For wired entries, the
3215
* paging structures are already populated.
3216
*/
3217
if (behav == MADV_WILLNEED &&
3218
entry->wired_count == 0) {
3219
vm_map_pmap_enter(map,
3220
useStart,
3221
entry->protection,
3222
entry->object.vm_object,
3223
pstart,
3224
ptoa(pend - pstart),
3225
MAP_PREFAULT_MADVISE
3226
);
3227
}
3228
}
3229
vm_map_unlock_read(map);
3230
}
3231
return (0);
3232
}
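
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It shows the two classes of advice handled above: MADV_NOSYNC is
 * applied to the map entries under an exclusive map lock, while
 * MADV_WILLNEED is forwarded to the backing objects under a read lock.
 * Note that vm_map_madvise() returns 0 or an errno value (EINVAL),
 * unlike most functions in this file, which return KERN_* codes.
 */
#if 0
	/* Entry-level advice (exclusive-lock path): */
	(void)vm_map_madvise(map, start, end, MADV_NOSYNC);

	/* Object-level advice (read-lock path); prefaults resident pages: */
	(void)vm_map_madvise(map, start, end, MADV_WILLNEED);
#endif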
3233
3234
/*
3235
* vm_map_inherit:
3236
*
3237
* Sets the inheritance of the specified address
3238
* range in the target map. Inheritance
3239
* affects how the map will be shared with
3240
* child maps at the time of vmspace_fork.
3241
*/
3242
int
3243
vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3244
vm_inherit_t new_inheritance)
3245
{
3246
vm_map_entry_t entry, lentry, prev_entry, start_entry;
3247
int rv;
3248
3249
switch (new_inheritance) {
3250
case VM_INHERIT_NONE:
3251
case VM_INHERIT_COPY:
3252
case VM_INHERIT_SHARE:
3253
case VM_INHERIT_ZERO:
3254
break;
3255
default:
3256
return (KERN_INVALID_ARGUMENT);
3257
}
3258
if (start == end)
3259
return (KERN_SUCCESS);
3260
vm_map_lock(map);
3261
VM_MAP_RANGE_CHECK(map, start, end);
3262
rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3263
if (rv != KERN_SUCCESS)
3264
goto unlock;
3265
if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3266
rv = vm_map_clip_end(map, lentry, end);
3267
if (rv != KERN_SUCCESS)
3268
goto unlock;
3269
}
3270
if (new_inheritance == VM_INHERIT_COPY) {
3271
for (entry = start_entry; entry->start < end;
3272
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3273
if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3274
!= 0) {
3275
rv = KERN_INVALID_ARGUMENT;
3276
goto unlock;
3277
}
3278
}
3279
}
3280
for (entry = start_entry; entry->start < end; prev_entry = entry,
3281
entry = vm_map_entry_succ(entry)) {
3282
KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3283
entry, (uintmax_t)entry->end, (uintmax_t)end));
3284
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3285
new_inheritance != VM_INHERIT_ZERO)
3286
entry->inheritance = new_inheritance;
3287
vm_map_try_merge_entries(map, prev_entry, entry);
3288
}
3289
vm_map_try_merge_entries(map, prev_entry, entry);
3290
unlock:
3291
vm_map_unlock(map);
3292
return (rv);
3293
}
3294
3295
/*
3296
* vm_map_entry_in_transition:
3297
*
3298
* Release the map lock, and sleep until the entry is no longer in
3299
* transition. Awake and acquire the map lock. If the map changed while
3300
* another held the lock, lookup a possibly-changed entry at or after the
3301
* 'start' position of the old entry.
3302
*/
3303
static vm_map_entry_t
3304
vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3305
vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3306
{
3307
vm_map_entry_t entry;
3308
vm_offset_t start;
3309
u_int last_timestamp;
3310
3311
VM_MAP_ASSERT_LOCKED(map);
3312
	KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
	    ("not in-transition map entry %p", in_entry));
3314
/*
3315
* We have not yet clipped the entry.
3316
*/
3317
start = MAX(in_start, in_entry->start);
3318
in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3319
last_timestamp = map->timestamp;
3320
if (vm_map_unlock_and_wait(map, 0)) {
3321
/*
3322
* Allow interruption of user wiring/unwiring?
3323
*/
3324
}
3325
vm_map_lock(map);
3326
if (last_timestamp + 1 == map->timestamp)
3327
return (in_entry);
3328
3329
/*
3330
* Look again for the entry because the map was modified while it was
3331
* unlocked. Specifically, the entry may have been clipped, merged, or
3332
* deleted.
3333
*/
3334
if (!vm_map_lookup_entry(map, start, &entry)) {
3335
if (!holes_ok) {
3336
*io_end = start;
3337
return (NULL);
3338
}
3339
entry = vm_map_entry_succ(entry);
3340
}
3341
return (entry);
3342
}
3343
3344
/*
3345
* vm_map_unwire:
3346
*
3347
* Implements both kernel and user unwiring.
3348
*/
3349
int
3350
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3351
int flags)
3352
{
3353
vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3354
int rv;
3355
bool holes_ok, need_wakeup, user_unwire;
3356
3357
if (start == end)
3358
return (KERN_SUCCESS);
3359
holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3360
user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3361
vm_map_lock(map);
3362
VM_MAP_RANGE_CHECK(map, start, end);
3363
if (!vm_map_lookup_entry(map, start, &first_entry)) {
3364
if (holes_ok)
3365
first_entry = vm_map_entry_succ(first_entry);
3366
else {
3367
vm_map_unlock(map);
3368
return (KERN_INVALID_ADDRESS);
3369
}
3370
}
3371
rv = KERN_SUCCESS;
3372
for (entry = first_entry; entry->start < end; entry = next_entry) {
3373
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3374
/*
3375
* We have not yet clipped the entry.
3376
*/
3377
next_entry = vm_map_entry_in_transition(map, start,
3378
&end, holes_ok, entry);
3379
if (next_entry == NULL) {
3380
if (entry == first_entry) {
3381
vm_map_unlock(map);
3382
return (KERN_INVALID_ADDRESS);
3383
}
3384
rv = KERN_INVALID_ADDRESS;
3385
break;
3386
}
3387
first_entry = (entry == first_entry) ?
3388
next_entry : NULL;
3389
continue;
3390
}
3391
rv = vm_map_clip_start(map, entry, start);
3392
if (rv != KERN_SUCCESS)
3393
break;
3394
rv = vm_map_clip_end(map, entry, end);
3395
if (rv != KERN_SUCCESS)
3396
break;
3397
3398
/*
3399
* Mark the entry in case the map lock is released. (See
3400
* above.)
3401
*/
3402
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3403
entry->wiring_thread == NULL,
3404
("owned map entry %p", entry));
3405
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3406
entry->wiring_thread = curthread;
3407
next_entry = vm_map_entry_succ(entry);
3408
/*
3409
* Check the map for holes in the specified region.
3410
* If holes_ok, skip this check.
3411
*/
3412
if (!holes_ok &&
3413
entry->end < end && next_entry->start > entry->end) {
3414
end = entry->end;
3415
rv = KERN_INVALID_ADDRESS;
3416
break;
3417
}
3418
/*
3419
* If system unwiring, require that the entry is system wired.
3420
*/
3421
if (!user_unwire &&
3422
vm_map_entry_system_wired_count(entry) == 0) {
3423
end = entry->end;
3424
rv = KERN_INVALID_ARGUMENT;
3425
break;
3426
}
3427
}
3428
need_wakeup = false;
3429
if (first_entry == NULL &&
3430
!vm_map_lookup_entry(map, start, &first_entry)) {
3431
KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3432
prev_entry = first_entry;
3433
entry = vm_map_entry_succ(first_entry);
3434
} else {
3435
prev_entry = vm_map_entry_pred(first_entry);
3436
entry = first_entry;
3437
}
3438
for (; entry->start < end;
3439
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3440
		/*
		 * If holes_ok was specified, an empty
		 * space in the unwired region could have been mapped
		 * while the map lock was dropped for draining
		 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
		 * could be simultaneously wiring this new mapping
		 * entry. Detect these cases and skip any entries
		 * not marked as in transition by us.
		 */
3449
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3450
entry->wiring_thread != curthread) {
3451
KASSERT(holes_ok,
3452
("vm_map_unwire: !HOLESOK and new/changed entry"));
3453
continue;
3454
}
3455
3456
if (rv == KERN_SUCCESS && (!user_unwire ||
3457
(entry->eflags & MAP_ENTRY_USER_WIRED))) {
3458
if (entry->wired_count == 1)
3459
vm_map_entry_unwire(map, entry);
3460
else
3461
entry->wired_count--;
3462
if (user_unwire)
3463
entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3464
}
3465
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3466
("vm_map_unwire: in-transition flag missing %p", entry));
3467
KASSERT(entry->wiring_thread == curthread,
3468
("vm_map_unwire: alien wire %p", entry));
3469
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3470
entry->wiring_thread = NULL;
3471
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3472
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3473
need_wakeup = true;
3474
}
3475
vm_map_try_merge_entries(map, prev_entry, entry);
3476
}
3477
vm_map_try_merge_entries(map, prev_entry, entry);
3478
vm_map_unlock(map);
3479
if (need_wakeup)
3480
vm_map_wakeup(map);
3481
return (rv);
3482
}
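
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * It pairs vm_map_wire() with the vm_map_unwire() implementation above,
 * using the flag conventions that appear in this file: user wiring is
 * requested with VM_MAP_WIRE_USER, and VM_MAP_WIRE_HOLESOK tolerates
 * unmapped gaps in the range.  The helper name is hypothetical.
 */
#if 0
static int
example_wire_then_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int flags, rv;

	flags = VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK;
	rv = vm_map_wire(map, start, end, flags);
	if (rv != KERN_SUCCESS)
		return (rv);
	return (vm_map_unwire(map, start, end, flags));
}
#endif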
3483
3484
static void
3485
vm_map_wire_user_count_sub(u_long npages)
3486
{
3487
3488
atomic_subtract_long(&vm_user_wire_count, npages);
3489
}
3490
3491
static bool
3492
vm_map_wire_user_count_add(u_long npages)
3493
{
3494
u_long wired;
3495
3496
wired = vm_user_wire_count;
3497
do {
3498
if (npages + wired > vm_page_max_user_wired)
3499
return (false);
3500
} while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3501
npages + wired));
3502
3503
return (true);
3504
}
3505
3506
/*
3507
* vm_map_wire_entry_failure:
3508
*
3509
* Handle a wiring failure on the given entry.
3510
*
3511
* The map should be locked.
3512
*/
3513
static void
3514
vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3515
vm_offset_t failed_addr)
3516
{
3517
3518
VM_MAP_ASSERT_LOCKED(map);
3519
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3520
entry->wired_count == 1,
3521
("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3522
KASSERT(failed_addr < entry->end,
3523
("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3524
3525
/*
3526
* If any pages at the start of this entry were successfully wired,
3527
* then unwire them.
3528
*/
3529
if (failed_addr > entry->start) {
3530
pmap_unwire(map->pmap, entry->start, failed_addr);
3531
vm_object_unwire(entry->object.vm_object, entry->offset,
3532
failed_addr - entry->start, PQ_ACTIVE);
3533
}
3534
3535
/*
3536
* Assign an out-of-range value to represent the failure to wire this
3537
* entry.
3538
*/
3539
entry->wired_count = -1;
3540
}
3541
3542
int
3543
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3544
{
3545
int rv;
3546
3547
vm_map_lock(map);
3548
rv = vm_map_wire_locked(map, start, end, flags);
3549
vm_map_unlock(map);
3550
return (rv);
3551
}
3552
3553
/*
3554
* vm_map_wire_locked:
3555
*
3556
* Implements both kernel and user wiring. Returns with the map locked,
3557
* the map lock may be dropped.
3558
*/
3559
int
3560
vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3561
{
3562
vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3563
vm_offset_t faddr, saved_end, saved_start;
3564
u_long incr, npages;
3565
u_int bidx, last_timestamp;
3566
int rv;
3567
bool holes_ok, need_wakeup, user_wire;
3568
vm_prot_t prot;
3569
3570
VM_MAP_ASSERT_LOCKED(map);
3571
3572
if (start == end)
3573
return (KERN_SUCCESS);
3574
prot = 0;
3575
if (flags & VM_MAP_WIRE_WRITE)
3576
prot |= VM_PROT_WRITE;
3577
holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3578
user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3579
VM_MAP_RANGE_CHECK(map, start, end);
3580
if (!vm_map_lookup_entry(map, start, &first_entry)) {
3581
if (holes_ok)
3582
first_entry = vm_map_entry_succ(first_entry);
3583
else
3584
return (KERN_INVALID_ADDRESS);
3585
}
3586
for (entry = first_entry; entry->start < end; entry = next_entry) {
3587
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3588
/*
3589
* We have not yet clipped the entry.
3590
*/
3591
next_entry = vm_map_entry_in_transition(map, start,
3592
&end, holes_ok, entry);
3593
if (next_entry == NULL) {
3594
if (entry == first_entry)
3595
return (KERN_INVALID_ADDRESS);
3596
rv = KERN_INVALID_ADDRESS;
3597
goto done;
3598
}
3599
first_entry = (entry == first_entry) ?
3600
next_entry : NULL;
3601
continue;
3602
}
3603
rv = vm_map_clip_start(map, entry, start);
3604
if (rv != KERN_SUCCESS)
3605
goto done;
3606
rv = vm_map_clip_end(map, entry, end);
3607
if (rv != KERN_SUCCESS)
3608
goto done;
3609
3610
/*
3611
* Mark the entry in case the map lock is released. (See
3612
* above.)
3613
*/
3614
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3615
entry->wiring_thread == NULL,
3616
("owned map entry %p", entry));
3617
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3618
entry->wiring_thread = curthread;
3619
if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3620
|| (entry->protection & prot) != prot) {
3621
entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3622
if (!holes_ok) {
3623
end = entry->end;
3624
rv = KERN_INVALID_ADDRESS;
3625
goto done;
3626
}
3627
} else if (entry->wired_count == 0) {
3628
entry->wired_count++;
3629
3630
npages = atop(entry->end - entry->start);
3631
if (user_wire && !vm_map_wire_user_count_add(npages)) {
3632
vm_map_wire_entry_failure(map, entry,
3633
entry->start);
3634
end = entry->end;
3635
rv = KERN_RESOURCE_SHORTAGE;
3636
goto done;
3637
}
3638
3639
/*
3640
* Release the map lock, relying on the in-transition
3641
* mark. Mark the map busy for fork.
3642
*/
3643
saved_start = entry->start;
3644
saved_end = entry->end;
3645
last_timestamp = map->timestamp;
3646
bidx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3647
incr = pagesizes[bidx];
3648
vm_map_busy(map);
3649
vm_map_unlock(map);
3650
3651
for (faddr = saved_start; faddr < saved_end;
3652
faddr += incr) {
3653
/*
3654
* Simulate a fault to get the page and enter
3655
* it into the physical map.
3656
*/
3657
rv = vm_fault(map, faddr, VM_PROT_NONE,
3658
VM_FAULT_WIRE, NULL);
3659
if (rv != KERN_SUCCESS)
3660
break;
3661
}
3662
vm_map_lock(map);
3663
vm_map_unbusy(map);
3664
if (last_timestamp + 1 != map->timestamp) {
3665
/*
3666
* Look again for the entry because the map was
3667
* modified while it was unlocked. The entry
3668
* may have been clipped, but NOT merged or
3669
* deleted.
3670
*/
3671
if (!vm_map_lookup_entry(map, saved_start,
3672
&next_entry))
3673
KASSERT(false,
3674
("vm_map_wire: lookup failed"));
3675
first_entry = (entry == first_entry) ?
3676
next_entry : NULL;
3677
for (entry = next_entry; entry->end < saved_end;
3678
entry = vm_map_entry_succ(entry)) {
3679
/*
3680
* In case of failure, handle entries
3681
* that were not fully wired here;
3682
* fully wired entries are handled
3683
* later.
3684
*/
3685
if (rv != KERN_SUCCESS &&
3686
faddr < entry->end)
3687
vm_map_wire_entry_failure(map,
3688
entry, faddr);
3689
}
3690
}
3691
if (rv != KERN_SUCCESS) {
3692
vm_map_wire_entry_failure(map, entry, faddr);
3693
if (user_wire)
3694
vm_map_wire_user_count_sub(npages);
3695
end = entry->end;
3696
goto done;
3697
}
3698
} else if (!user_wire ||
3699
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3700
entry->wired_count++;
3701
}
3702
/*
3703
* Check the map for holes in the specified region.
3704
* If holes_ok was specified, skip this check.
3705
*/
3706
next_entry = vm_map_entry_succ(entry);
3707
if (!holes_ok &&
3708
entry->end < end && next_entry->start > entry->end) {
3709
end = entry->end;
3710
rv = KERN_INVALID_ADDRESS;
3711
goto done;
3712
}
3713
}
3714
rv = KERN_SUCCESS;
3715
done:
3716
need_wakeup = false;
3717
if (first_entry == NULL &&
3718
!vm_map_lookup_entry(map, start, &first_entry)) {
3719
KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3720
prev_entry = first_entry;
3721
entry = vm_map_entry_succ(first_entry);
3722
} else {
3723
prev_entry = vm_map_entry_pred(first_entry);
3724
entry = first_entry;
3725
}
3726
for (; entry->start < end;
3727
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3728
/*
3729
* If holes_ok was specified, an empty
3730
* space in the unwired region could have been mapped
3731
* while the map lock was dropped for faulting in the
3732
* pages or draining MAP_ENTRY_IN_TRANSITION.
3733
* Moreover, another thread could be simultaneously
3734
* wiring this new mapping entry. Detect these cases
3735
* and skip any entries marked as in transition by a thread other than us.
3736
*
3737
* Another way to get an entry not marked with
3738
* MAP_ENTRY_IN_TRANSITION is after failed clipping,
3739
* which set rv to KERN_INVALID_ARGUMENT.
3740
*/
3741
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3742
entry->wiring_thread != curthread) {
3743
KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3744
("vm_map_wire: !HOLESOK and new/changed entry"));
3745
continue;
3746
}
3747
3748
if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3749
/* do nothing */
3750
} else if (rv == KERN_SUCCESS) {
3751
if (user_wire)
3752
entry->eflags |= MAP_ENTRY_USER_WIRED;
3753
} else if (entry->wired_count == -1) {
3754
/*
3755
* Wiring failed on this entry. Thus, unwiring is
3756
* unnecessary.
3757
*/
3758
entry->wired_count = 0;
3759
} else if (!user_wire ||
3760
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3761
/*
3762
* Undo the wiring. Wiring succeeded on this entry
3763
* but failed on a later entry.
3764
*/
3765
if (entry->wired_count == 1) {
3766
vm_map_entry_unwire(map, entry);
3767
if (user_wire)
3768
vm_map_wire_user_count_sub(
3769
atop(entry->end - entry->start));
3770
} else
3771
entry->wired_count--;
3772
}
3773
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3774
("vm_map_wire: in-transition flag missing %p", entry));
3775
KASSERT(entry->wiring_thread == curthread,
3776
("vm_map_wire: alien wire %p", entry));
3777
entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3778
MAP_ENTRY_WIRE_SKIPPED);
3779
entry->wiring_thread = NULL;
3780
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3781
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3782
need_wakeup = true;
3783
}
3784
vm_map_try_merge_entries(map, prev_entry, entry);
3785
}
3786
vm_map_try_merge_entries(map, prev_entry, entry);
3787
if (need_wakeup)
3788
vm_map_wakeup(map);
3789
return (rv);
3790
}
3791
3792
/*
3793
* vm_map_sync
3794
*
3795
* Push any dirty cached pages in the address range to their pager.
3796
* If syncio is TRUE, dirty pages are written synchronously.
3797
* If invalidate is TRUE, any cached pages are freed as well.
3798
*
3799
* If the size of the region from start to end is zero, we are
3800
* supposed to flush all modified pages within the region containing
3801
* start. Unfortunately, a region can be split or coalesced with
3802
* neighboring regions, making it difficult to determine what the
3803
* original region was. Therefore, we approximate this requirement by
3804
* flushing the current region containing start.
3805
*
3806
* Returns an error if any part of the specified range is not mapped.
3807
*/
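/*
 * Illustrative sketch of a hypothetical caller: an msync(2)-style flush
 * of a page-aligned range, written back synchronously, corresponds
 * roughly to
 *
 *	rv = vm_map_sync(map, trunc_page(addr), round_page(addr + len),
 *	    TRUE, FALSE);
 *
 * Passing invalidate == TRUE additionally frees the cached pages, and
 * is rejected with KERN_INVALID_ARGUMENT for user-wired entries by the
 * first pass below.
 */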
3808
int
3809
vm_map_sync(
3810
vm_map_t map,
3811
vm_offset_t start,
3812
vm_offset_t end,
3813
boolean_t syncio,
3814
boolean_t invalidate)
3815
{
3816
vm_map_entry_t entry, first_entry, next_entry;
3817
vm_size_t size;
3818
vm_object_t object;
3819
vm_ooffset_t offset;
3820
unsigned int last_timestamp;
3821
int bdry_idx;
3822
boolean_t failed;
3823
3824
vm_map_lock_read(map);
3825
VM_MAP_RANGE_CHECK(map, start, end);
3826
if (!vm_map_lookup_entry(map, start, &first_entry)) {
3827
vm_map_unlock_read(map);
3828
return (KERN_INVALID_ADDRESS);
3829
} else if (start == end) {
3830
start = first_entry->start;
3831
end = first_entry->end;
3832
}
3833
3834
/*
3835
* Make a first pass to check for user-wired memory, holes,
3836
* and partial invalidation of largepage mappings.
3837
*/
3838
for (entry = first_entry; entry->start < end; entry = next_entry) {
3839
if (invalidate) {
3840
if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3841
vm_map_unlock_read(map);
3842
return (KERN_INVALID_ARGUMENT);
3843
}
3844
bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3845
if (bdry_idx != 0 &&
3846
((start & (pagesizes[bdry_idx] - 1)) != 0 ||
3847
(end & (pagesizes[bdry_idx] - 1)) != 0)) {
3848
vm_map_unlock_read(map);
3849
return (KERN_INVALID_ARGUMENT);
3850
}
3851
}
3852
next_entry = vm_map_entry_succ(entry);
3853
if (end > entry->end &&
3854
entry->end != next_entry->start) {
3855
vm_map_unlock_read(map);
3856
return (KERN_INVALID_ADDRESS);
3857
}
3858
}
3859
3860
if (invalidate)
3861
pmap_remove(map->pmap, start, end);
3862
failed = FALSE;
3863
3864
/*
3865
* Make a second pass, cleaning/uncaching pages from the indicated
3866
* objects as we go.
3867
*/
3868
for (entry = first_entry; entry->start < end;) {
3869
offset = entry->offset + (start - entry->start);
3870
size = (end <= entry->end ? end : entry->end) - start;
3871
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3872
vm_map_t smap;
3873
vm_map_entry_t tentry;
3874
vm_size_t tsize;
3875
3876
smap = entry->object.sub_map;
3877
vm_map_lock_read(smap);
3878
(void) vm_map_lookup_entry(smap, offset, &tentry);
3879
tsize = tentry->end - offset;
3880
if (tsize < size)
3881
size = tsize;
3882
object = tentry->object.vm_object;
3883
offset = tentry->offset + (offset - tentry->start);
3884
vm_map_unlock_read(smap);
3885
} else {
3886
object = entry->object.vm_object;
3887
}
3888
vm_object_reference(object);
3889
last_timestamp = map->timestamp;
3890
vm_map_unlock_read(map);
3891
if (!vm_object_sync(object, offset, size, syncio, invalidate))
3892
failed = TRUE;
3893
start += size;
3894
vm_object_deallocate(object);
3895
vm_map_lock_read(map);
3896
if (last_timestamp == map->timestamp ||
3897
!vm_map_lookup_entry(map, start, &entry))
3898
entry = vm_map_entry_succ(entry);
3899
}
3900
3901
vm_map_unlock_read(map);
3902
return (failed ? KERN_FAILURE : KERN_SUCCESS);
3903
}
3904
3905
/*
3906
* vm_map_entry_unwire: [ internal use only ]
3907
*
3908
* Make the region specified by this entry pageable.
3909
*
3910
* The map in question should be locked.
3911
* [This is the reason for this routine's existence.]
3912
*/
3913
static void
3914
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3915
{
3916
vm_size_t size;
3917
3918
VM_MAP_ASSERT_LOCKED(map);
3919
KASSERT(entry->wired_count > 0,
3920
("vm_map_entry_unwire: entry %p isn't wired", entry));
3921
3922
size = entry->end - entry->start;
3923
if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3924
vm_map_wire_user_count_sub(atop(size));
3925
pmap_unwire(map->pmap, entry->start, entry->end);
3926
vm_object_unwire(entry->object.vm_object, entry->offset, size,
3927
PQ_ACTIVE);
3928
entry->wired_count = 0;
3929
}
3930
3931
static void
3932
vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3933
{
3934
3935
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3936
vm_object_deallocate(entry->object.vm_object);
3937
uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3938
}
3939
3940
/*
3941
* vm_map_entry_delete: [ internal use only ]
3942
*
3943
* Deallocate the given entry from the target map.
3944
*/
3945
static void
3946
vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3947
{
3948
vm_object_t object;
3949
vm_pindex_t offidxstart, offidxend, oldsize;
3950
vm_size_t size;
3951
3952
vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3953
object = entry->object.vm_object;
3954
3955
if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3956
MPASS(entry->cred == NULL);
3957
MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3958
MPASS(object == NULL);
3959
vm_map_entry_deallocate(entry, vm_map_is_system(map));
3960
return;
3961
}
3962
3963
size = entry->end - entry->start;
3964
map->size -= size;
3965
3966
if (entry->cred != NULL) {
3967
swap_release_by_cred(size, entry->cred);
3968
crfree(entry->cred);
3969
}
3970
3971
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3972
entry->object.vm_object = NULL;
3973
} else if ((object->flags & OBJ_ANON) != 0 ||
3974
object == kernel_object) {
3975
KASSERT(entry->cred == NULL || object->cred == NULL ||
3976
(entry->eflags & MAP_ENTRY_NEEDS_COPY),
3977
("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3978
offidxstart = OFF_TO_IDX(entry->offset);
3979
offidxend = offidxstart + atop(size);
3980
VM_OBJECT_WLOCK(object);
3981
if (object->ref_count != 1 &&
3982
((object->flags & OBJ_ONEMAPPING) != 0 ||
3983
object == kernel_object)) {
3984
vm_object_collapse(object);
3985
3986
/*
3987
* The option OBJPR_NOTMAPPED can be passed here
3988
* because vm_map_delete() already performed
3989
* pmap_remove() on the only mapping to this range
3990
* of pages.
3991
*/
3992
vm_object_page_remove(object, offidxstart, offidxend,
3993
OBJPR_NOTMAPPED);
3994
if (offidxend >= object->size &&
3995
offidxstart < object->size) {
3996
oldsize = object->size;
3997
object->size = offidxstart;
3998
if (object->cred != NULL) {
3999
swap_release_by_cred(ptoa(oldsize -
4000
object->size), object->cred);
4001
}
4002
}
4003
}
4004
VM_OBJECT_WUNLOCK(object);
4005
}
4006
if (vm_map_is_system(map))
4007
vm_map_entry_deallocate(entry, TRUE);
4008
else {
4009
entry->defer_next = curthread->td_map_def_user;
4010
curthread->td_map_def_user = entry;
4011
}
4012
}
4013
4014
/*
4015
* vm_map_delete: [ internal use only ]
4016
*
4017
* Deallocates the given address range from the target
4018
* map.
4019
*/
4020
int
4021
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
4022
{
4023
vm_map_entry_t entry, next_entry, scratch_entry;
4024
int rv;
4025
4026
VM_MAP_ASSERT_LOCKED(map);
4027
4028
if (start == end)
4029
return (KERN_SUCCESS);
4030
4031
/*
4032
* Find the start of the region, and clip it.
4033
* Step through all entries in this region.
4034
*/
4035
rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
4036
if (rv != KERN_SUCCESS)
4037
return (rv);
4038
for (; entry->start < end; entry = next_entry) {
4039
/*
4040
* Wait for wiring or unwiring of an entry to complete.
4041
* Also wait for any system wirings to disappear on
4042
* user maps.
4043
*/
4044
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
4045
(vm_map_pmap(map) != kernel_pmap &&
4046
vm_map_entry_system_wired_count(entry) != 0)) {
4047
unsigned int last_timestamp;
4048
vm_offset_t saved_start;
4049
4050
saved_start = entry->start;
4051
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4052
last_timestamp = map->timestamp;
4053
(void) vm_map_unlock_and_wait(map, 0);
4054
vm_map_lock(map);
4055
if (last_timestamp + 1 != map->timestamp) {
4056
/*
4057
* Look again for the entry because the map was
4058
* modified while it was unlocked.
4059
* Specifically, the entry may have been
4060
* clipped, merged, or deleted.
4061
*/
4062
rv = vm_map_lookup_clip_start(map, saved_start,
4063
&next_entry, &scratch_entry);
4064
if (rv != KERN_SUCCESS)
4065
break;
4066
} else
4067
next_entry = entry;
4068
continue;
4069
}
4070
4071
/* XXXKIB or delete to the upper superpage boundary ? */
4072
rv = vm_map_clip_end(map, entry, end);
4073
if (rv != KERN_SUCCESS)
4074
break;
4075
next_entry = vm_map_entry_succ(entry);
4076
4077
/*
4078
* Unwire before removing addresses from the pmap; otherwise,
4079
* unwiring will put the entries back in the pmap.
4080
*/
4081
if (entry->wired_count != 0)
4082
vm_map_entry_unwire(map, entry);
4083
4084
/*
4085
* Remove mappings for the pages, but only if the
4086
* mappings could exist. For instance, it does not
4087
* make sense to call pmap_remove() for guard entries.
4088
*/
4089
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
4090
entry->object.vm_object != NULL)
4091
pmap_map_delete(map->pmap, entry->start, entry->end);
4092
4093
/*
4094
* Delete the entry only after removing all pmap
4095
* entries pointing to its pages. (Otherwise, its
4096
* page frames may be reallocated, and any modify bits
4097
* will be set in the wrong object!)
4098
*/
4099
vm_map_entry_delete(map, entry);
4100
}
4101
return (rv);
4102
}
4103
4104
/*
4105
* vm_map_remove:
4106
*
4107
* Remove the given address range from the target map.
4108
* This is the exported form of vm_map_delete.
4109
*/
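/*
 * Illustrative sketch of a hypothetical caller: tearing down a
 * page-aligned range of a process map reduces to
 *
 *	rv = vm_map_remove(&p->p_vmspace->vm_map, trunc_page(addr),
 *	    round_page(addr + len));
 *
 * The range check and locking happen inside vm_map_remove() itself,
 * unlike the internal vm_map_delete(), which requires the map lock.
 */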
4110
int
4111
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
4112
{
4113
int result;
4114
4115
vm_map_lock(map);
4116
VM_MAP_RANGE_CHECK(map, start, end);
4117
result = vm_map_delete(map, start, end);
4118
vm_map_unlock(map);
4119
return (result);
4120
}
4121
4122
/*
4123
* vm_map_check_protection:
4124
*
4125
* Assert that the target map allows the specified privilege on the
4126
* entire address region given. The entire region must be allocated.
4127
*
4128
* WARNING! This code does not and should not check whether the
4129
* contents of the region are accessible.  For example, a smaller file
4130
* might be mapped into a larger address space.
4131
*
4132
* NOTE! This code is also called by munmap().
4133
*
4134
* The map must be locked. A read lock is sufficient.
4135
*/
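/*
 * Illustrative sketch of a hypothetical caller verifying that an entire
 * range is readable while holding a read lock:
 *
 *	vm_map_lock_read(map);
 *	ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
 *	vm_map_unlock_read(map);
 */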
4136
boolean_t
4137
vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
4138
vm_prot_t protection)
4139
{
4140
vm_map_entry_t entry;
4141
vm_map_entry_t tmp_entry;
4142
4143
if (!vm_map_lookup_entry(map, start, &tmp_entry))
4144
return (FALSE);
4145
entry = tmp_entry;
4146
4147
while (start < end) {
4148
/*
4149
* No holes allowed!
4150
*/
4151
if (start < entry->start)
4152
return (FALSE);
4153
/*
4154
* Check protection associated with entry.
4155
*/
4156
if ((entry->protection & protection) != protection)
4157
return (FALSE);
4158
/* go to next entry */
4159
start = entry->end;
4160
entry = vm_map_entry_succ(entry);
4161
}
4162
return (TRUE);
4163
}
4164
4165
/*
4166
*
4167
* vm_map_copy_swap_object:
4168
*
4169
* Copies a swap-backed object from an existing map entry to a
4170
* new one. Carries forward the swap charge. May change the
4171
* src object on return.
4172
*/
4173
static void
4174
vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
4175
vm_offset_t size, vm_ooffset_t *fork_charge)
4176
{
4177
vm_object_t src_object;
4178
struct ucred *cred;
4179
int charged;
4180
4181
src_object = src_entry->object.vm_object;
4182
charged = ENTRY_CHARGED(src_entry);
4183
if ((src_object->flags & OBJ_ANON) != 0) {
4184
VM_OBJECT_WLOCK(src_object);
4185
vm_object_collapse(src_object);
4186
if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
4187
vm_object_split(src_entry);
4188
src_object = src_entry->object.vm_object;
4189
}
4190
vm_object_reference_locked(src_object);
4191
vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
4192
VM_OBJECT_WUNLOCK(src_object);
4193
} else
4194
vm_object_reference(src_object);
4195
if (src_entry->cred != NULL &&
4196
!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4197
KASSERT(src_object->cred == NULL,
4198
("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
4199
src_object));
4200
src_object->cred = src_entry->cred;
4201
*fork_charge += ptoa(src_object->size) - size;
4202
}
4203
dst_entry->object.vm_object = src_object;
4204
if (charged) {
4205
cred = curthread->td_ucred;
4206
crhold(cred);
4207
dst_entry->cred = cred;
4208
*fork_charge += size;
4209
if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4210
crhold(cred);
4211
src_entry->cred = cred;
4212
*fork_charge += size;
4213
}
4214
}
4215
}
4216
4217
/*
4218
* vm_map_copy_entry:
4219
*
4220
* Copies the contents of the source entry to the destination
4221
* entry. The entries *must* be aligned properly.
4222
*/
4223
static void
4224
vm_map_copy_entry(
4225
vm_map_t src_map,
4226
vm_map_t dst_map,
4227
vm_map_entry_t src_entry,
4228
vm_map_entry_t dst_entry,
4229
vm_ooffset_t *fork_charge)
4230
{
4231
vm_object_t src_object;
4232
vm_map_entry_t fake_entry;
4233
vm_offset_t size;
4234
4235
VM_MAP_ASSERT_LOCKED(dst_map);
4236
4237
if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
4238
return;
4239
4240
if (src_entry->wired_count == 0 ||
4241
(src_entry->protection & VM_PROT_WRITE) == 0) {
4242
/*
4243
* If the source entry is marked needs_copy, it is already
4244
* write-protected.
4245
*/
4246
if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
4247
(src_entry->protection & VM_PROT_WRITE) != 0) {
4248
pmap_protect(src_map->pmap,
4249
src_entry->start,
4250
src_entry->end,
4251
src_entry->protection & ~VM_PROT_WRITE);
4252
}
4253
4254
/*
4255
* Make a copy of the object.
4256
*/
4257
size = src_entry->end - src_entry->start;
4258
if ((src_object = src_entry->object.vm_object) != NULL) {
4259
if ((src_object->flags & OBJ_SWAP) != 0) {
4260
vm_map_copy_swap_object(src_entry, dst_entry,
4261
size, fork_charge);
4262
/* May have split/collapsed, reload obj. */
4263
src_object = src_entry->object.vm_object;
4264
} else {
4265
vm_object_reference(src_object);
4266
dst_entry->object.vm_object = src_object;
4267
}
4268
src_entry->eflags |= MAP_ENTRY_COW |
4269
MAP_ENTRY_NEEDS_COPY;
4270
dst_entry->eflags |= MAP_ENTRY_COW |
4271
MAP_ENTRY_NEEDS_COPY;
4272
dst_entry->offset = src_entry->offset;
4273
if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
4274
/*
4275
* MAP_ENTRY_WRITECNT cannot
4276
* indicate write reference from
4277
* src_entry, since the entry is
4278
* marked as needs copy. Allocate a
4279
* fake entry that is used to
4280
* decrement object->un_pager writecount
4281
* at the appropriate time. Attach
4282
* fake_entry to the deferred list.
4283
*/
4284
fake_entry = vm_map_entry_create(dst_map);
4285
fake_entry->eflags = MAP_ENTRY_WRITECNT;
4286
src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
4287
vm_object_reference(src_object);
4288
fake_entry->object.vm_object = src_object;
4289
fake_entry->start = src_entry->start;
4290
fake_entry->end = src_entry->end;
4291
fake_entry->defer_next =
4292
curthread->td_map_def_user;
4293
curthread->td_map_def_user = fake_entry;
4294
}
4295
4296
pmap_copy(dst_map->pmap, src_map->pmap,
4297
dst_entry->start, dst_entry->end - dst_entry->start,
4298
src_entry->start);
4299
} else {
4300
dst_entry->object.vm_object = NULL;
4301
if ((dst_entry->eflags & MAP_ENTRY_GUARD) == 0)
4302
dst_entry->offset = 0;
4303
if (src_entry->cred != NULL) {
4304
dst_entry->cred = curthread->td_ucred;
4305
crhold(dst_entry->cred);
4306
*fork_charge += size;
4307
}
4308
}
4309
} else {
4310
/*
4311
* We don't want to make writeable wired pages copy-on-write.
4312
* Immediately copy these pages into the new map by simulating
4313
* page faults. The new pages are pageable.
4314
*/
4315
vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
4316
fork_charge);
4317
}
4318
}
4319
4320
/*
4321
* vmspace_map_entry_forked:
4322
* Update the newly-forked vmspace each time a map entry is inherited
4323
* or copied. The values for vm_dsize and vm_tsize are approximate
4324
* (and mostly-obsolete ideas in the face of mmap(2) et al.)
4325
*/
4326
static void
4327
vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
4328
vm_map_entry_t entry)
4329
{
4330
vm_size_t entrysize;
4331
vm_offset_t newend;
4332
4333
if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
4334
return;
4335
entrysize = entry->end - entry->start;
4336
vm2->vm_map.size += entrysize;
4337
if ((entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
4338
vm2->vm_ssize += btoc(entrysize);
4339
} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
4340
entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
4341
newend = MIN(entry->end,
4342
(vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
4343
vm2->vm_dsize += btoc(newend - entry->start);
4344
} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
4345
entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
4346
newend = MIN(entry->end,
4347
(vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
4348
vm2->vm_tsize += btoc(newend - entry->start);
4349
}
4350
}
4351
4352
/*
4353
* vmspace_fork:
4354
* Create a new process vmspace structure and vm_map
4355
* based on those of an existing process. The new map
4356
* is based on the old map, according to the inheritance
4357
* values on the regions in that map.
4358
*
4359
* XXX It might be worth coalescing the entries added to the new vmspace.
4360
*
4361
* The source map must not be locked.
4362
*/
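/*
 * Illustrative sketch of a hypothetical fork(2)-style caller, mirroring
 * vmspace_unshare() below: the accumulated charge is reserved against
 * the parent's credentials after the copy (p1 names the parent here):
 *
 *	fork_charge = 0;
 *	vm2 = vmspace_fork(p1->p_vmspace, &fork_charge);
 *	if (vm2 == NULL)
 *		return (ENOMEM);
 *	if (!swap_reserve_by_cred(fork_charge, p1->p_ucred)) {
 *		vmspace_free(vm2);
 *		return (ENOMEM);
 *	}
 */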
4363
struct vmspace *
4364
vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
4365
{
4366
struct vmspace *vm2;
4367
vm_map_t new_map, old_map;
4368
vm_map_entry_t new_entry, old_entry;
4369
vm_object_t object;
4370
int error, locked __diagused;
4371
vm_inherit_t inh;
4372
4373
old_map = &vm1->vm_map;
4374
/* Copy immutable fields of vm1 to vm2. */
4375
vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4376
pmap_pinit);
4377
if (vm2 == NULL)
4378
return (NULL);
4379
4380
vm2->vm_taddr = vm1->vm_taddr;
4381
vm2->vm_daddr = vm1->vm_daddr;
4382
vm2->vm_maxsaddr = vm1->vm_maxsaddr;
4383
vm2->vm_stacktop = vm1->vm_stacktop;
4384
vm2->vm_shp_base = vm1->vm_shp_base;
4385
vm_map_lock(old_map);
4386
if (old_map->busy)
4387
vm_map_wait_busy(old_map);
4388
new_map = &vm2->vm_map;
4389
locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
4390
KASSERT(locked, ("vmspace_fork: lock failed"));
4391
4392
error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
4393
if (error != 0) {
4394
sx_xunlock(&old_map->lock);
4395
sx_xunlock(&new_map->lock);
4396
vm_map_process_deferred();
4397
vmspace_free(vm2);
4398
return (NULL);
4399
}
4400
4401
new_map->anon_loc = old_map->anon_loc;
4402
new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
4403
MAP_ASLR_STACK | MAP_WXORX);
4404
4405
VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
4406
if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
4407
panic("vm_map_fork: encountered a submap");
4408
4409
inh = old_entry->inheritance;
4410
if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4411
inh != VM_INHERIT_NONE)
4412
inh = VM_INHERIT_COPY;
4413
4414
switch (inh) {
4415
case VM_INHERIT_NONE:
4416
break;
4417
4418
case VM_INHERIT_SHARE:
4419
/*
4420
* Clone the entry, creating the shared object if
4421
* necessary.
4422
*/
4423
object = old_entry->object.vm_object;
4424
if (object == NULL) {
4425
vm_map_entry_back(old_entry);
4426
object = old_entry->object.vm_object;
4427
}
4428
4429
/*
4430
* Add the reference before calling vm_object_shadow
4431
* to insure that a shadow object is created.
4432
*/
4433
vm_object_reference(object);
4434
if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4435
vm_object_shadow(&old_entry->object.vm_object,
4436
&old_entry->offset,
4437
old_entry->end - old_entry->start,
4438
old_entry->cred,
4439
/* Transfer the second reference too. */
4440
true);
4441
old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4442
old_entry->cred = NULL;
4443
4444
/*
4445
* As in vm_map_merged_neighbor_dispose(),
4446
* the vnode lock will not be acquired in
4447
* this call to vm_object_deallocate().
4448
*/
4449
vm_object_deallocate(object);
4450
object = old_entry->object.vm_object;
4451
} else {
4452
VM_OBJECT_WLOCK(object);
4453
vm_object_clear_flag(object, OBJ_ONEMAPPING);
4454
if (old_entry->cred != NULL) {
4455
KASSERT(object->cred == NULL,
4456
("vmspace_fork both cred"));
4457
object->cred = old_entry->cred;
4458
*fork_charge += old_entry->end -
4459
old_entry->start;
4460
old_entry->cred = NULL;
4461
}
4462
4463
/*
4464
* Assert the correct state of the vnode
4465
* v_writecount while the object is locked, to
4466
* not relock it later for the assertion
4467
* correctness.
4468
*/
4469
if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4470
object->type == OBJT_VNODE) {
4471
KASSERT(((struct vnode *)object->
4472
handle)->v_writecount > 0,
4473
("vmspace_fork: v_writecount %p",
4474
object));
4475
KASSERT(object->un_pager.vnp.
4476
writemappings > 0,
4477
("vmspace_fork: vnp.writecount %p",
4478
object));
4479
}
4480
VM_OBJECT_WUNLOCK(object);
4481
}
4482
4483
/*
4484
* Clone the entry, referencing the shared object.
4485
*/
4486
new_entry = vm_map_entry_create(new_map);
4487
*new_entry = *old_entry;
4488
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4489
MAP_ENTRY_IN_TRANSITION);
4490
new_entry->wiring_thread = NULL;
4491
new_entry->wired_count = 0;
4492
if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4493
vm_pager_update_writecount(object,
4494
new_entry->start, new_entry->end);
4495
}
4496
vm_map_entry_set_vnode_text(new_entry, true);
4497
4498
/*
4499
* Insert the entry into the new map -- we know we're
4500
* inserting at the end of the new map.
4501
*/
4502
vm_map_entry_link(new_map, new_entry);
4503
vmspace_map_entry_forked(vm1, vm2, new_entry);
4504
4505
/*
4506
* Update the physical map
4507
*/
4508
pmap_copy(new_map->pmap, old_map->pmap,
4509
new_entry->start,
4510
(old_entry->end - old_entry->start),
4511
old_entry->start);
4512
break;
4513
4514
case VM_INHERIT_COPY:
4515
/*
4516
* Clone the entry and link into the map.
4517
*/
4518
new_entry = vm_map_entry_create(new_map);
4519
*new_entry = *old_entry;
4520
/*
4521
* Copied entry is COW over the old object.
4522
*/
4523
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4524
MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4525
new_entry->wiring_thread = NULL;
4526
new_entry->wired_count = 0;
4527
new_entry->object.vm_object = NULL;
4528
new_entry->cred = NULL;
4529
vm_map_entry_link(new_map, new_entry);
4530
vmspace_map_entry_forked(vm1, vm2, new_entry);
4531
vm_map_copy_entry(old_map, new_map, old_entry,
4532
new_entry, fork_charge);
4533
vm_map_entry_set_vnode_text(new_entry, true);
4534
break;
4535
4536
case VM_INHERIT_ZERO:
4537
/*
4538
* Create a new anonymous mapping entry modelled from
4539
* the old one.
4540
*/
4541
new_entry = vm_map_entry_create(new_map);
4542
memset(new_entry, 0, sizeof(*new_entry));
4543
4544
new_entry->start = old_entry->start;
4545
new_entry->end = old_entry->end;
4546
new_entry->eflags = old_entry->eflags &
4547
~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4548
MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
4549
MAP_ENTRY_SPLIT_BOUNDARY_MASK);
4550
new_entry->protection = old_entry->protection;
4551
new_entry->max_protection = old_entry->max_protection;
4552
new_entry->inheritance = VM_INHERIT_ZERO;
4553
4554
vm_map_entry_link(new_map, new_entry);
4555
vmspace_map_entry_forked(vm1, vm2, new_entry);
4556
4557
new_entry->cred = curthread->td_ucred;
4558
crhold(new_entry->cred);
4559
*fork_charge += (new_entry->end - new_entry->start);
4560
4561
break;
4562
}
4563
}
4564
/*
4565
* Use inlined vm_map_unlock() to postpone handling the deferred
4566
* map entries, which cannot be done until both old_map and
4567
* new_map locks are released.
4568
*/
4569
sx_xunlock(&old_map->lock);
4570
sx_xunlock(&new_map->lock);
4571
vm_map_process_deferred();
4572
4573
return (vm2);
4574
}
4575
4576
/*
4577
* Create a process's stack for exec_new_vmspace(). This function is never
4578
* asked to wire the newly created stack.
4579
*/
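/*
 * Illustrative sketch, with "stacktop" and "maxssize" as hypothetical
 * locals: reserving a downward-growing stack of at most maxssize bytes
 * ending at stacktop would look roughly like
 *
 *	rv = vm_map_stack(map, stacktop - maxssize, maxssize,
 *	    VM_PROT_ALL, VM_PROT_ALL, MAP_STACK_AREA);
 *
 * Only the initial sgrowsiz portion is mapped immediately; the rest is
 * left as a guarded growth area handled by vm_map_growstack().
 */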
4580
int
4581
vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4582
vm_prot_t prot, vm_prot_t max, int cow)
4583
{
4584
vm_size_t growsize, init_ssize;
4585
rlim_t vmemlim;
4586
int rv;
4587
4588
MPASS((map->flags & MAP_WIREFUTURE) == 0);
4589
growsize = sgrowsiz;
4590
init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4591
vm_map_lock(map);
4592
vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4593
/* If we would blow our VMEM resource limit, no go */
4594
if (map->size + init_ssize > vmemlim) {
4595
rv = KERN_NO_SPACE;
4596
goto out;
4597
}
4598
rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4599
max, cow);
4600
out:
4601
vm_map_unlock(map);
4602
return (rv);
4603
}
4604
4605
static int stack_guard_page = 1;
4606
SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4607
&stack_guard_page, 0,
4608
"Specifies the number of guard pages for a stack that grows");
4609
4610
static int
4611
vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4612
vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4613
{
4614
vm_map_entry_t gap_entry, new_entry, prev_entry;
4615
vm_offset_t bot, gap_bot, gap_top, top;
4616
vm_size_t init_ssize, sgp;
4617
int rv;
4618
4619
KASSERT((cow & MAP_STACK_AREA) != 0,
4620
("New mapping is not a stack"));
4621
4622
if (max_ssize == 0 ||
4623
!vm_map_range_valid(map, addrbos, addrbos + max_ssize))
4624
return (KERN_INVALID_ADDRESS);
4625
sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4626
(curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4627
(vm_size_t)stack_guard_page * PAGE_SIZE;
4628
if (sgp >= max_ssize)
4629
return (KERN_INVALID_ARGUMENT);
4630
4631
init_ssize = growsize;
4632
if (max_ssize < init_ssize + sgp)
4633
init_ssize = max_ssize - sgp;
4634
4635
/* If addr is already mapped, no go */
4636
if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4637
return (KERN_NO_SPACE);
4638
4639
/*
4640
* If we can't accommodate max_ssize in the current mapping, no go.
4641
*/
4642
if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4643
return (KERN_NO_SPACE);
4644
4645
/*
4646
* We initially map a stack of only init_ssize, at the top of
4647
* the range. We will grow as needed later.
4648
*
4649
* Note: we would normally expect prot and max to be VM_PROT_ALL,
4650
* and cow to be 0. Possibly we should eliminate these as input
4651
* parameters, and just pass these values here in the insert call.
4652
*/
4653
bot = addrbos + max_ssize - init_ssize;
4654
top = bot + init_ssize;
4655
gap_bot = addrbos;
4656
gap_top = bot;
4657
rv = vm_map_insert1(map, NULL, 0, bot, top, prot, max, cow,
4658
&new_entry);
4659
if (rv != KERN_SUCCESS)
4660
return (rv);
4661
KASSERT(new_entry->end == top || new_entry->start == bot,
4662
("Bad entry start/end for new stack entry"));
4663
KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4664
("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4665
if (gap_bot == gap_top)
4666
return (KERN_SUCCESS);
4667
rv = vm_map_insert1(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4668
VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
4669
&gap_entry);
4670
if (rv == KERN_SUCCESS) {
4671
KASSERT((gap_entry->eflags & MAP_ENTRY_GUARD) != 0,
4672
("entry %p not gap %#x", gap_entry, gap_entry->eflags));
4673
KASSERT((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0,
4674
("entry %p not stack gap %#x", gap_entry,
4675
gap_entry->eflags));
4676
4677
/*
4678
* Gap can never successfully handle a fault, so
4679
* read-ahead logic is never used for it. Re-use
4680
* next_read of the gap entry to store
4681
* stack_guard_page for vm_map_growstack().
4682
* Similarly, since a gap cannot have a backing object,
4683
* store the original stack protections in the
4684
* object offset.
4685
*/
4686
gap_entry->next_read = sgp;
4687
gap_entry->offset = prot | PROT_MAX(max);
4688
} else {
4689
(void)vm_map_delete(map, bot, top);
4690
}
4691
return (rv);
4692
}
4693
4694
/*
4695
* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
4696
* successfully grow the stack.
4697
*/
4698
static int
4699
vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4700
{
4701
vm_map_entry_t stack_entry;
4702
struct proc *p;
4703
struct vmspace *vm;
4704
vm_offset_t gap_end, gap_start, grow_start;
4705
vm_size_t grow_amount, guard, max_grow, sgp;
4706
vm_prot_t prot, max;
4707
rlim_t lmemlim, stacklim, vmemlim;
4708
int rv, rv1 __diagused;
4709
bool gap_deleted, is_procstack;
4710
#ifdef notyet
4711
uint64_t limit;
4712
#endif
4713
#ifdef RACCT
4714
int error __diagused;
4715
#endif
4716
4717
p = curproc;
4718
vm = p->p_vmspace;
4719
4720
/*
4721
* Disallow stack growth when the access is performed by a
4722
* debugger or AIO daemon. The reason is that the wrong
4723
* resource limits are applied.
4724
*/
4725
if (p != initproc && (map != &p->p_vmspace->vm_map ||
4726
p->p_textvp == NULL))
4727
return (KERN_FAILURE);
4728
4729
MPASS(!vm_map_is_system(map));
4730
4731
lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4732
stacklim = lim_cur(curthread, RLIMIT_STACK);
4733
vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4734
retry:
4735
/* If addr is not in a hole for a stack grow area, no need to grow. */
4736
if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4737
return (KERN_FAILURE);
4738
if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4739
return (KERN_SUCCESS);
4740
if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0) {
4741
stack_entry = vm_map_entry_succ(gap_entry);
4742
if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4743
stack_entry->start != gap_entry->end)
4744
return (KERN_FAILURE);
4745
grow_amount = round_page(stack_entry->start - addr);
4746
} else {
4747
return (KERN_FAILURE);
4748
}
4749
guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4750
(curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4751
gap_entry->next_read;
4752
max_grow = gap_entry->end - gap_entry->start;
4753
if (guard > max_grow)
4754
return (KERN_NO_SPACE);
4755
max_grow -= guard;
4756
if (grow_amount > max_grow)
4757
return (KERN_NO_SPACE);
4758
4759
/*
4760
* If this is the main process stack, see if we're over the stack
4761
* limit.
4762
*/
4763
is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4764
addr < (vm_offset_t)vm->vm_stacktop;
4765
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4766
return (KERN_NO_SPACE);
4767
4768
#ifdef RACCT
4769
if (racct_enable) {
4770
PROC_LOCK(p);
4771
if (is_procstack && racct_set(p, RACCT_STACK,
4772
ctob(vm->vm_ssize) + grow_amount)) {
4773
PROC_UNLOCK(p);
4774
return (KERN_NO_SPACE);
4775
}
4776
PROC_UNLOCK(p);
4777
}
4778
#endif
4779
4780
grow_amount = roundup(grow_amount, sgrowsiz);
4781
if (grow_amount > max_grow)
4782
grow_amount = max_grow;
4783
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4784
grow_amount = trunc_page((vm_size_t)stacklim) -
4785
ctob(vm->vm_ssize);
4786
}
4787
4788
#ifdef notyet
4789
PROC_LOCK(p);
4790
limit = racct_get_available(p, RACCT_STACK);
4791
PROC_UNLOCK(p);
4792
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4793
grow_amount = limit - ctob(vm->vm_ssize);
4794
#endif
4795
4796
if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4797
if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4798
rv = KERN_NO_SPACE;
4799
goto out;
4800
}
4801
#ifdef RACCT
4802
if (racct_enable) {
4803
PROC_LOCK(p);
4804
if (racct_set(p, RACCT_MEMLOCK,
4805
ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4806
PROC_UNLOCK(p);
4807
rv = KERN_NO_SPACE;
4808
goto out;
4809
}
4810
PROC_UNLOCK(p);
4811
}
4812
#endif
4813
}
4814
4815
/* If we would blow our VMEM resource limit, no go */
4816
if (map->size + grow_amount > vmemlim) {
4817
rv = KERN_NO_SPACE;
4818
goto out;
4819
}
4820
#ifdef RACCT
4821
if (racct_enable) {
4822
PROC_LOCK(p);
4823
if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4824
PROC_UNLOCK(p);
4825
rv = KERN_NO_SPACE;
4826
goto out;
4827
}
4828
PROC_UNLOCK(p);
4829
}
4830
#endif
4831
4832
if (vm_map_lock_upgrade(map)) {
4833
gap_entry = NULL;
4834
vm_map_lock_read(map);
4835
goto retry;
4836
}
4837
4838
/*
4839
* The gap_entry "offset" field is overloaded. See
4840
* vm_map_stack_locked().
4841
*/
4842
prot = PROT_EXTRACT(gap_entry->offset);
4843
max = PROT_MAX_EXTRACT(gap_entry->offset);
4844
sgp = gap_entry->next_read;
4845
4846
grow_start = gap_entry->end - grow_amount;
4847
if (gap_entry->start + grow_amount == gap_entry->end) {
4848
gap_start = gap_entry->start;
4849
gap_end = gap_entry->end;
4850
vm_map_entry_delete(map, gap_entry);
4851
gap_deleted = true;
4852
} else {
4853
MPASS(gap_entry->start < gap_entry->end - grow_amount);
4854
vm_map_entry_resize(map, gap_entry, -grow_amount);
4855
gap_deleted = false;
4856
}
4857
rv = vm_map_insert(map, NULL, 0, grow_start,
4858
grow_start + grow_amount, prot, max, MAP_STACK_AREA);
4859
if (rv != KERN_SUCCESS) {
4860
if (gap_deleted) {
4861
rv1 = vm_map_insert1(map, NULL, 0, gap_start,
4862
gap_end, VM_PROT_NONE, VM_PROT_NONE,
4863
MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
4864
&gap_entry);
4865
MPASS(rv1 == KERN_SUCCESS);
4866
gap_entry->next_read = sgp;
4867
gap_entry->offset = prot | PROT_MAX(max);
4868
} else {
4869
vm_map_entry_resize(map, gap_entry,
4870
grow_amount);
4871
}
4872
}
4873
if (rv == KERN_SUCCESS && is_procstack)
4874
vm->vm_ssize += btoc(grow_amount);
4875
4876
/*
4877
* Heed the MAP_WIREFUTURE flag if it was set for this process.
4878
*/
4879
if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4880
rv = vm_map_wire_locked(map, grow_start,
4881
grow_start + grow_amount,
4882
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4883
}
4884
vm_map_lock_downgrade(map);
4885
4886
out:
4887
#ifdef RACCT
4888
if (racct_enable && rv != KERN_SUCCESS) {
4889
PROC_LOCK(p);
4890
error = racct_set(p, RACCT_VMEM, map->size);
4891
KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4892
if (!old_mlock) {
4893
error = racct_set(p, RACCT_MEMLOCK,
4894
ptoa(pmap_wired_count(map->pmap)));
4895
KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4896
}
4897
error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4898
KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4899
PROC_UNLOCK(p);
4900
}
4901
#endif
4902
4903
return (rv);
4904
}
4905
4906
/*
4907
* Unshare the specified VM space for exec. If other processes are
4908
* mapped to it, then create a new one.  The new vmspace starts out empty.
4909
*/
4910
int
4911
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4912
{
4913
struct vmspace *oldvmspace = p->p_vmspace;
4914
struct vmspace *newvmspace;
4915
4916
KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4917
("vmspace_exec recursed"));
4918
newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4919
if (newvmspace == NULL)
4920
return (ENOMEM);
4921
newvmspace->vm_swrss = oldvmspace->vm_swrss;
4922
/*
4923
* This code is written like this for prototype purposes. The
4924
* goal is to avoid running down the vmspace here, and instead let the
4925
* other processes that are still using the vmspace finally
4926
* run it down.  Even though there is little or no chance of blocking
4927
* here, it is a good idea to keep this form for future mods.
4928
*/
4929
PROC_VMSPACE_LOCK(p);
4930
p->p_vmspace = newvmspace;
4931
PROC_VMSPACE_UNLOCK(p);
4932
if (p == curthread->td_proc)
4933
pmap_activate(curthread);
4934
curthread->td_pflags |= TDP_EXECVMSPC;
4935
return (0);
4936
}
4937
4938
/*
4939
* Unshare the specified VM space for forcing COW. This
4940
* is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4941
*/
4942
int
4943
vmspace_unshare(struct proc *p)
4944
{
4945
struct vmspace *oldvmspace = p->p_vmspace;
4946
struct vmspace *newvmspace;
4947
vm_ooffset_t fork_charge;
4948
4949
/*
4950
* The caller is responsible for ensuring that the reference count
4951
* cannot concurrently transition 1 -> 2.
4952
*/
4953
if (refcount_load(&oldvmspace->vm_refcnt) == 1)
4954
return (0);
4955
fork_charge = 0;
4956
newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4957
if (newvmspace == NULL)
4958
return (ENOMEM);
4959
if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4960
/*
4961
* The swap reservation failed. The accounting from
4962
* the entries of the copied newvmspace will be
4963
* subtracted in vmspace_free(), so force the
4964
* reservation there.
4965
*/
4966
swap_reserve_force_by_cred(fork_charge, p->p_ucred);
4967
vmspace_free(newvmspace);
4968
return (ENOMEM);
4969
}
4970
PROC_VMSPACE_LOCK(p);
4971
p->p_vmspace = newvmspace;
4972
PROC_VMSPACE_UNLOCK(p);
4973
if (p == curthread->td_proc)
4974
pmap_activate(curthread);
4975
vmspace_free(oldvmspace);
4976
return (0);
4977
}
4978
4979
/*
4980
* vm_map_lookup:
4981
*
4982
* Finds the VM object, offset, and
4983
* protection for a given virtual address in the
4984
* specified map, assuming a page fault of the
4985
* type specified.
4986
*
4987
* Leaves the map in question locked for read; return
4988
* values are guaranteed until a vm_map_lookup_done
4989
* call is performed. Note that the map argument
4990
* is in/out; the returned map must be used in
4991
* the call to vm_map_lookup_done.
4992
*
4993
* A handle (out_entry) is returned for use in
4994
* vm_map_lookup_done, to make that fast.
4995
*
4996
* If a lookup is requested with "write protection"
4997
* specified, the map may be changed to perform virtual
4998
* copying operations, although the data referenced will
4999
* remain the same.
5000
*/
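/*
 * Illustrative sketch of a hypothetical fault-handling caller:
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... operate on (object, pindex) under the returned protection ...
 *	vm_map_lookup_done(map, entry);
 *
 * Because "map" is in/out, submap traversal may hand back a different
 * map; that returned map is the one to pass to vm_map_lookup_done().
 */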
5001
int
5002
vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
5003
vm_offset_t vaddr,
5004
vm_prot_t fault_typea,
5005
vm_map_entry_t *out_entry, /* OUT */
5006
vm_object_t *object, /* OUT */
5007
vm_pindex_t *pindex, /* OUT */
5008
vm_prot_t *out_prot, /* OUT */
5009
boolean_t *wired) /* OUT */
5010
{
5011
vm_map_entry_t entry;
5012
vm_map_t map = *var_map;
5013
vm_prot_t prot;
5014
vm_prot_t fault_type;
5015
vm_object_t eobject;
5016
vm_size_t size;
5017
struct ucred *cred;
5018
5019
RetryLookup:
5020
5021
vm_map_lock_read(map);
5022
5023
RetryLookupLocked:
5024
/*
5025
* Lookup the faulting address.
5026
*/
5027
if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
5028
vm_map_unlock_read(map);
5029
return (KERN_INVALID_ADDRESS);
5030
}
5031
5032
entry = *out_entry;
5033
5034
/*
5035
* Handle submaps.
5036
*/
5037
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5038
vm_map_t old_map = map;
5039
5040
*var_map = map = entry->object.sub_map;
5041
vm_map_unlock_read(old_map);
5042
goto RetryLookup;
5043
}
5044
5045
/*
5046
* Check whether this task is allowed to have this page.
5047
*/
5048
prot = entry->protection;
5049
if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
5050
fault_typea &= ~VM_PROT_FAULT_LOOKUP;
5051
if (prot == VM_PROT_NONE && map != kernel_map &&
5052
(entry->eflags & MAP_ENTRY_GUARD) != 0 &&
5053
(entry->eflags & MAP_ENTRY_STACK_GAP) != 0 &&
5054
vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
5055
goto RetryLookupLocked;
5056
}
5057
fault_type = fault_typea & VM_PROT_ALL;
5058
if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
5059
vm_map_unlock_read(map);
5060
return (KERN_PROTECTION_FAILURE);
5061
}
5062
KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
5063
(MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
5064
(MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
5065
("entry %p flags %x", entry, entry->eflags));
5066
if ((fault_typea & VM_PROT_COPY) != 0 &&
5067
(entry->max_protection & VM_PROT_WRITE) == 0 &&
5068
(entry->eflags & MAP_ENTRY_COW) == 0) {
5069
vm_map_unlock_read(map);
5070
return (KERN_PROTECTION_FAILURE);
5071
}
5072
5073
/*
5074
* If this page is not pageable, we have to get it for all possible
5075
* accesses.
5076
*/
5077
*wired = (entry->wired_count != 0);
5078
if (*wired)
5079
fault_type = entry->protection;
5080
size = entry->end - entry->start;
5081
5082
/*
5083
* If the entry was copy-on-write, we either ...
5084
*/
5085
if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5086
/*
5087
* If we want to write the page, we may as well handle that
5088
* now since we've got the map locked.
5089
*
5090
* If we don't need to write the page, we just demote the
5091
* permissions allowed.
5092
*/
5093
if ((fault_type & VM_PROT_WRITE) != 0 ||
5094
(fault_typea & VM_PROT_COPY) != 0) {
5095
/*
5096
* Make a new object, and place it in the object
5097
* chain. Note that no new references have appeared
5098
* -- one just moved from the map to the new
5099
* object.
5100
*/
5101
if (vm_map_lock_upgrade(map))
5102
goto RetryLookup;
5103
5104
if (entry->cred == NULL) {
5105
/*
5106
* The debugger owner is charged for
5107
* the memory.
5108
*/
5109
cred = curthread->td_ucred;
5110
crhold(cred);
5111
if (!swap_reserve_by_cred(size, cred)) {
5112
crfree(cred);
5113
vm_map_unlock(map);
5114
return (KERN_RESOURCE_SHORTAGE);
5115
}
5116
entry->cred = cred;
5117
}
5118
eobject = entry->object.vm_object;
5119
vm_object_shadow(&entry->object.vm_object,
5120
&entry->offset, size, entry->cred, false);
5121
if (eobject == entry->object.vm_object) {
5122
/*
5123
* The object was not shadowed.
5124
*/
5125
swap_release_by_cred(size, entry->cred);
5126
crfree(entry->cred);
5127
}
5128
entry->cred = NULL;
5129
entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
5130
5131
vm_map_lock_downgrade(map);
5132
} else {
5133
/*
5134
* We're attempting to read a copy-on-write page --
5135
* don't allow writes.
5136
*/
5137
prot &= ~VM_PROT_WRITE;
5138
}
5139
}
5140
5141
/*
5142
* Create an object if necessary.
5143
*/
5144
if (entry->object.vm_object == NULL && !vm_map_is_system(map)) {
5145
if (vm_map_lock_upgrade(map))
5146
goto RetryLookup;
5147
entry->object.vm_object = vm_object_allocate_anon(atop(size),
5148
NULL, entry->cred);
5149
entry->offset = 0;
5150
entry->cred = NULL;
5151
vm_map_lock_downgrade(map);
5152
}
5153
5154
/*
5155
* Return the object/offset from this entry. If the entry was
5156
* copy-on-write or empty, it has been fixed up.
5157
*/
5158
*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5159
*object = entry->object.vm_object;
5160
5161
*out_prot = prot;
5162
return (KERN_SUCCESS);
5163
}
5164
5165
/*
5166
* vm_map_lookup_locked:
5167
*
5168
* Lookup the faulting address. A version of vm_map_lookup that returns
5169
* KERN_FAILURE instead of blocking on map lock or memory allocation.
5170
*/
5171
int
5172
vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
5173
vm_offset_t vaddr,
5174
vm_prot_t fault_typea,
5175
vm_map_entry_t *out_entry, /* OUT */
5176
vm_object_t *object, /* OUT */
5177
vm_pindex_t *pindex, /* OUT */
5178
vm_prot_t *out_prot, /* OUT */
5179
boolean_t *wired) /* OUT */
5180
{
5181
vm_map_entry_t entry;
5182
vm_map_t map = *var_map;
5183
vm_prot_t prot;
5184
vm_prot_t fault_type = fault_typea;
5185
5186
/*
5187
* Lookup the faulting address.
5188
*/
5189
if (!vm_map_lookup_entry(map, vaddr, out_entry))
5190
return (KERN_INVALID_ADDRESS);
5191
5192
entry = *out_entry;
5193
5194
/*
5195
* Fail if the entry refers to a submap.
5196
*/
5197
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
5198
return (KERN_FAILURE);
5199
5200
/*
5201
* Check whether this task is allowed to have this page.
5202
*/
5203
prot = entry->protection;
5204
fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
5205
if ((fault_type & prot) != fault_type)
5206
return (KERN_PROTECTION_FAILURE);
5207
5208
/*
5209
* If this page is not pageable, we have to get it for all possible
5210
* accesses.
5211
*/
5212
*wired = (entry->wired_count != 0);
5213
if (*wired)
5214
fault_type = entry->protection;
5215
5216
if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5217
/*
5218
* Fail if the entry was copy-on-write for a write fault.
5219
*/
5220
if (fault_type & VM_PROT_WRITE)
5221
return (KERN_FAILURE);
5222
/*
5223
* We're attempting to read a copy-on-write page --
5224
* don't allow writes.
5225
*/
5226
prot &= ~VM_PROT_WRITE;
5227
}
5228
5229
/*
5230
* Fail if an object should be created.
5231
*/
5232
if (entry->object.vm_object == NULL && !vm_map_is_system(map))
5233
return (KERN_FAILURE);
5234
5235
/*
5236
* Return the object/offset from this entry. If the entry was
5237
* copy-on-write or empty, it has been fixed up.
5238
*/
5239
*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5240
*object = entry->object.vm_object;
5241
5242
*out_prot = prot;
5243
return (KERN_SUCCESS);
5244
}
5245
5246
/*
5247
* vm_map_lookup_done:
5248
*
5249
* Releases locks acquired by a vm_map_lookup
5250
* (according to the handle returned by that lookup).
5251
*/
5252
void
5253
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
5254
{
5255
/*
5256
* Unlock the main-level map
5257
*/
5258
vm_map_unlock_read(map);
5259
}
5260
5261
vm_offset_t
5262
vm_map_max_KBI(const struct vm_map *map)
5263
{
5264
5265
return (vm_map_max(map));
5266
}
5267
5268
vm_offset_t
5269
vm_map_min_KBI(const struct vm_map *map)
5270
{
5271
5272
return (vm_map_min(map));
5273
}
5274
5275
pmap_t
5276
vm_map_pmap_KBI(vm_map_t map)
5277
{
5278
5279
return (map->pmap);
5280
}
5281
5282
bool
5283
vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
5284
{
5285
5286
return (vm_map_range_valid(map, start, end));
5287
}
5288
5289
#ifdef INVARIANTS
5290
static void
5291
_vm_map_assert_consistent(vm_map_t map, int check)
5292
{
5293
vm_map_entry_t entry, prev;
5294
vm_map_entry_t cur, header, lbound, ubound;
5295
vm_size_t max_left, max_right;
5296
5297
#ifdef DIAGNOSTIC
5298
++map->nupdates;
5299
#endif
5300
if (enable_vmmap_check != check)
5301
return;
5302
5303
header = prev = &map->header;
5304
VM_MAP_ENTRY_FOREACH(entry, map) {
5305
KASSERT(prev->end <= entry->start,
5306
("map %p prev->end = %jx, start = %jx", map,
5307
(uintmax_t)prev->end, (uintmax_t)entry->start));
5308
KASSERT(entry->start < entry->end,
5309
("map %p start = %jx, end = %jx", map,
5310
(uintmax_t)entry->start, (uintmax_t)entry->end));
5311
KASSERT(entry->left == header ||
5312
entry->left->start < entry->start,
5313
("map %p left->start = %jx, start = %jx", map,
5314
(uintmax_t)entry->left->start, (uintmax_t)entry->start));
5315
KASSERT(entry->right == header ||
5316
entry->start < entry->right->start,
5317
("map %p start = %jx, right->start = %jx", map,
5318
(uintmax_t)entry->start, (uintmax_t)entry->right->start));
5319
cur = map->root;
5320
lbound = ubound = header;
5321
for (;;) {
5322
if (entry->start < cur->start) {
5323
ubound = cur;
5324
cur = cur->left;
5325
KASSERT(cur != lbound,
5326
("map %p cannot find %jx",
5327
map, (uintmax_t)entry->start));
5328
} else if (cur->end <= entry->start) {
5329
lbound = cur;
5330
cur = cur->right;
5331
KASSERT(cur != ubound,
5332
("map %p cannot find %jx",
5333
map, (uintmax_t)entry->start));
5334
} else {
5335
KASSERT(cur == entry,
5336
("map %p cannot find %jx",
5337
map, (uintmax_t)entry->start));
5338
break;
5339
}
5340
}
5341
max_left = vm_map_entry_max_free_left(entry, lbound);
5342
max_right = vm_map_entry_max_free_right(entry, ubound);
5343
KASSERT(entry->max_free == vm_size_max(max_left, max_right),
5344
("map %p max = %jx, max_left = %jx, max_right = %jx", map,
5345
(uintmax_t)entry->max_free,
5346
(uintmax_t)max_left, (uintmax_t)max_right));
5347
prev = entry;
5348
}
5349
KASSERT(prev->end <= entry->start,
5350
("map %p prev->end = %jx, start = %jx", map,
5351
(uintmax_t)prev->end, (uintmax_t)entry->start));
5352
}
5353
#endif
5354
5355
#include "opt_ddb.h"
5356
#ifdef DDB
5357
#include <sys/kernel.h>
5358
5359
#include <ddb/ddb.h>
5360
5361
static void
5362
vm_map_print(vm_map_t map)
5363
{
5364
vm_map_entry_t entry, prev;
5365
5366
db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
5367
(void *)map,
5368
(void *)map->pmap, map->nentries, map->timestamp);
5369
5370
db_indent += 2;
5371
prev = &map->header;
5372
VM_MAP_ENTRY_FOREACH(entry, map) {
5373
db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
5374
(void *)entry, (void *)entry->start, (void *)entry->end,
5375
entry->eflags);
5376
{
5377
static const char * const inheritance_name[4] =
5378
{"share", "copy", "none", "donate_copy"};
5379
5380
db_iprintf(" prot=%x/%x/%s",
5381
entry->protection,
5382
entry->max_protection,
5383
inheritance_name[(int)(unsigned char)
5384
entry->inheritance]);
5385
if (entry->wired_count != 0)
5386
db_printf(", wired");
5387
}
5388
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5389
db_printf(", share=%p, offset=0x%jx\n",
5390
(void *)entry->object.sub_map,
5391
(uintmax_t)entry->offset);
5392
if (prev == &map->header ||
5393
prev->object.sub_map !=
5394
entry->object.sub_map) {
5395
db_indent += 2;
5396
vm_map_print((vm_map_t)entry->object.sub_map);
5397
db_indent -= 2;
5398
}
5399
} else {
5400
if (entry->cred != NULL)
5401
db_printf(", ruid %d", entry->cred->cr_ruid);
5402
db_printf(", object=%p, offset=0x%jx",
5403
(void *)entry->object.vm_object,
5404
(uintmax_t)entry->offset);
5405
if (entry->object.vm_object && entry->object.vm_object->cred)
5406
db_printf(", obj ruid %d ",
5407
entry->object.vm_object->cred->cr_ruid);
5408
if (entry->eflags & MAP_ENTRY_COW)
5409
db_printf(", copy (%s)",
5410
(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
5411
db_printf("\n");
5412
5413
if (prev == &map->header ||
5414
prev->object.vm_object !=
5415
entry->object.vm_object) {
5416
db_indent += 2;
5417
vm_object_print((db_expr_t)(intptr_t)
5418
entry->object.vm_object,
5419
0, 0, (char *)0);
5420
db_indent -= 2;
5421
}
5422
}
5423
prev = entry;
5424
}
5425
db_indent -= 2;
5426
}
5427
5428
DB_SHOW_COMMAND(map, map)
5429
{
5430
5431
if (!have_addr) {
5432
db_printf("usage: show map <addr>\n");
5433
return;
5434
}
5435
vm_map_print((vm_map_t)addr);
5436
}
5437
5438
DB_SHOW_COMMAND(procvm, procvm)
5439
{
5440
struct proc *p;
5441
5442
if (have_addr) {
5443
p = db_lookup_proc(addr);
5444
} else {
5445
p = curproc;
5446
}
5447
5448
db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
5449
(void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
5450
(void *)vmspace_pmap(p->p_vmspace));
5451
5452
vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
5453
}
5454
5455
#endif /* DDB */
5456
5457