GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_map.c
1
/*-
2
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3
*
4
* Copyright (c) 1991, 1993
5
* The Regents of the University of California. All rights reserved.
6
*
7
* This code is derived from software contributed to Berkeley by
8
* The Mach Operating System project at Carnegie-Mellon University.
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. Neither the name of the University nor the names of its contributors
19
* may be used to endorse or promote products derived from this software
20
* without specific prior written permission.
21
*
22
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
* SUCH DAMAGE.
33
*
34
*
35
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
36
* All rights reserved.
37
*
38
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
39
*
40
* Permission to use, copy, modify and distribute this software and
41
* its documentation is hereby granted, provided that both the copyright
42
* notice and this permission notice appear in all copies of the
43
* software, derivative works or modified versions, and any portions
44
* thereof, and that both notices appear in supporting documentation.
45
*
46
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49
*
50
* Carnegie Mellon requests users of this software to return to
51
*
52
* Software Distribution Coordinator or [email protected]
53
* School of Computer Science
54
* Carnegie Mellon University
55
* Pittsburgh PA 15213-3890
56
*
57
* any improvements or extensions that they make and grant Carnegie the
58
* rights to redistribute these changes.
59
*/
60
61
/*
62
* Virtual memory mapping module.
63
*/
64
65
#include <sys/param.h>
66
#include <sys/systm.h>
67
#include <sys/elf.h>
68
#include <sys/kernel.h>
69
#include <sys/ktr.h>
70
#include <sys/lock.h>
71
#include <sys/mutex.h>
72
#include <sys/proc.h>
73
#include <sys/vmmeter.h>
74
#include <sys/mman.h>
75
#include <sys/vnode.h>
76
#include <sys/racct.h>
77
#include <sys/resourcevar.h>
78
#include <sys/rwlock.h>
79
#include <sys/file.h>
80
#include <sys/sysctl.h>
81
#include <sys/sysent.h>
82
#include <sys/shm.h>
83
84
#include <vm/vm.h>
85
#include <vm/vm_param.h>
86
#include <vm/pmap.h>
87
#include <vm/vm_map.h>
88
#include <vm/vm_page.h>
89
#include <vm/vm_pageout.h>
90
#include <vm/vm_object.h>
91
#include <vm/vm_pager.h>
92
#include <vm/vm_radix.h>
93
#include <vm/vm_kern.h>
94
#include <vm/vm_extern.h>
95
#include <vm/vnode_pager.h>
96
#include <vm/swap_pager.h>
97
#include <vm/uma.h>
98
99
/*
100
* Virtual memory maps provide for the mapping, protection,
101
* and sharing of virtual memory objects. In addition,
102
* this module provides for an efficient virtual copy of
103
* memory from one map to another.
104
*
105
* Synchronization is required prior to most operations.
106
*
107
* Maps consist of an ordered doubly-linked list of simple
108
* entries; a self-adjusting binary search tree of these
109
* entries is used to speed up lookups.
110
*
111
* Since portions of maps are specified by start/end addresses,
112
* which may not align with existing map entries, all
113
* routines merely "clip" entries to these start/end values.
114
* [That is, an entry is split into two, bordering at a
115
* start or end value.] Note that these clippings may not
116
* always be necessary (as the two resulting entries are then
117
* not changed); however, the clipping is done for convenience.
118
*
119
* As mentioned above, virtual copy operations are performed
120
* by copying VM object references from one map to
121
* another, and then marking both regions as copy-on-write.
122
*/
123
124
static struct mtx map_sleep_mtx;
125
static uma_zone_t mapentzone;
126
static uma_zone_t kmapentzone;
127
static uma_zone_t vmspace_zone;
128
static int vmspace_zinit(void *mem, int size, int flags);
129
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
130
vm_offset_t max);
131
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
132
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
133
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
134
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
135
vm_map_entry_t gap_entry);
136
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
137
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
138
#ifdef INVARIANTS
139
static void vmspace_zdtor(void *mem, int size, void *arg);
140
#endif
141
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
142
vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
143
int cow);
144
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
145
vm_offset_t failed_addr);
146
147
#define CONTAINS_BITS(set, bits) ((~(set) & (bits)) == 0)
148
149
#define ENTRY_CHARGED(e) ((e)->cred != NULL || \
150
((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
151
!((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
152
153
/*
154
* PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
155
* stable.
156
*/
157
#define PROC_VMSPACE_LOCK(p) do { } while (0)
158
#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
159
160
/*
161
* VM_MAP_RANGE_CHECK: [ internal use only ]
162
*
163
* Asserts that the starting and ending region
164
* addresses fall within the valid range of the map.
165
*/
166
#define VM_MAP_RANGE_CHECK(map, start, end) \
167
{ \
168
if (start < vm_map_min(map)) \
169
start = vm_map_min(map); \
170
if (end > vm_map_max(map)) \
171
end = vm_map_max(map); \
172
if (start > end) \
173
start = end; \
174
}
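
/*
 * For example (illustrative values): if a map's valid range is
 * [PAGE_SIZE, 1024 * PAGE_SIZE), a caller passing start = 0 and
 * end = 2048 * PAGE_SIZE has both values clamped to the map bounds,
 * so the checked range becomes the whole map.
 */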
175
176
#ifndef UMA_USE_DMAP
177
178
/*
179
* Allocate a new slab for kernel map entries. The kernel map may be locked or
180
* unlocked, depending on whether the request is coming from the kernel map or a
181
* submap. This function allocates a virtual address range directly from the
182
* kernel map instead of the kmem_* layer to avoid recursion on the kernel map
183
* lock and also to avoid triggering allocator recursion in the vmem boundary
184
* tag allocator.
185
*/
186
static void *
187
kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
188
int wait)
189
{
190
vm_offset_t addr;
191
int error, locked;
192
193
*pflag = UMA_SLAB_PRIV;
194
195
if (!(locked = vm_map_locked(kernel_map)))
196
vm_map_lock(kernel_map);
197
addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
198
if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
199
panic("%s: kernel map is exhausted", __func__);
200
error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
201
VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
202
if (error != KERN_SUCCESS)
203
panic("%s: vm_map_insert() failed: %d", __func__, error);
204
if (!locked)
205
vm_map_unlock(kernel_map);
206
error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
207
M_USE_RESERVE | (wait & M_ZERO));
208
if (error == KERN_SUCCESS) {
209
return ((void *)addr);
210
} else {
211
if (!locked)
212
vm_map_lock(kernel_map);
213
vm_map_delete(kernel_map, addr, bytes);
214
if (!locked)
215
vm_map_unlock(kernel_map);
216
return (NULL);
217
}
218
}
219
220
static void
221
kmapent_free(void *item, vm_size_t size, uint8_t pflag)
222
{
223
vm_offset_t addr;
224
int error __diagused;
225
226
if ((pflag & UMA_SLAB_PRIV) == 0)
227
/* XXX leaked */
228
return;
229
230
addr = (vm_offset_t)item;
231
kmem_unback(kernel_object, addr, size);
232
error = vm_map_remove(kernel_map, addr, addr + size);
233
KASSERT(error == KERN_SUCCESS,
234
("%s: vm_map_remove failed: %d", __func__, error));
235
}
236
237
/*
238
* The worst-case upper bound on the number of kernel map entries that may be
239
* created before the zone must be replenished in _vm_map_unlock().
240
*/
241
#define KMAPENT_RESERVE 1
242
243
#endif /* !UMA_USE_DMAP */
244
245
/*
246
* vm_map_startup:
247
*
248
* Initialize the vm_map module. Must be called before any other vm_map
249
* routines.
250
*
251
* User map and entry structures are allocated from the general purpose
252
* memory pool. Kernel maps are statically defined. Kernel map entries
253
* require special handling to avoid recursion; see the comments above
254
* kmapent_alloc() and in vm_map_entry_create().
255
*/
256
void
257
vm_map_startup(void)
258
{
259
mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
260
261
/*
262
* Disable the use of per-CPU buckets: map entry allocation is
263
* serialized by the kernel map lock.
264
*/
265
kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
266
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
267
UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
268
#ifndef UMA_USE_DMAP
269
/* Reserve an extra map entry for use when replenishing the reserve. */
270
uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
271
uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
272
uma_zone_set_allocf(kmapentzone, kmapent_alloc);
273
uma_zone_set_freef(kmapentzone, kmapent_free);
274
#endif
275
276
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
277
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
278
vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
279
#ifdef INVARIANTS
280
vmspace_zdtor,
281
#else
282
NULL,
283
#endif
284
vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
285
}
286
287
static int
288
vmspace_zinit(void *mem, int size, int flags)
289
{
290
struct vmspace *vm;
291
vm_map_t map;
292
293
vm = (struct vmspace *)mem;
294
map = &vm->vm_map;
295
296
memset(map, 0, sizeof(*map)); /* set MAP_SYSTEM_MAP to false */
297
sx_init(&map->lock, "vm map (user)");
298
PMAP_LOCK_INIT(vmspace_pmap(vm));
299
return (0);
300
}
301
302
#ifdef INVARIANTS
303
static void
304
vmspace_zdtor(void *mem, int size, void *arg)
305
{
306
struct vmspace *vm;
307
308
vm = (struct vmspace *)mem;
309
KASSERT(vm->vm_map.nentries == 0,
310
("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
311
KASSERT(vm->vm_map.size == 0,
312
("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
313
}
314
#endif /* INVARIANTS */
315
316
/*
317
* Allocate a vmspace structure, including a vm_map and pmap,
318
* and initialize those structures. The refcnt is set to 1.
319
*/
320
struct vmspace *
321
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
322
{
323
struct vmspace *vm;
324
325
vm = uma_zalloc(vmspace_zone, M_WAITOK);
326
KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
327
if (!pinit(vmspace_pmap(vm))) {
328
uma_zfree(vmspace_zone, vm);
329
return (NULL);
330
}
331
CTR1(KTR_VM, "vmspace_alloc: %p", vm);
332
_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
333
refcount_init(&vm->vm_refcnt, 1);
334
vm->vm_shm = NULL;
335
vm->vm_swrss = 0;
336
vm->vm_tsize = 0;
337
vm->vm_dsize = 0;
338
vm->vm_ssize = 0;
339
vm->vm_taddr = 0;
340
vm->vm_daddr = 0;
341
vm->vm_maxsaddr = 0;
342
return (vm);
343
}
344
345
#ifdef RACCT
346
static void
347
vmspace_container_reset(struct proc *p)
348
{
349
350
PROC_LOCK(p);
351
racct_set(p, RACCT_DATA, 0);
352
racct_set(p, RACCT_STACK, 0);
353
racct_set(p, RACCT_RSS, 0);
354
racct_set(p, RACCT_MEMLOCK, 0);
355
racct_set(p, RACCT_VMEM, 0);
356
PROC_UNLOCK(p);
357
}
358
#endif
359
360
static inline void
361
vmspace_dofree(struct vmspace *vm)
362
{
363
364
CTR1(KTR_VM, "vmspace_free: %p", vm);
365
366
/*
367
* Make sure any SysV shm is freed, it might not have been in
368
* exit1().
369
*/
370
shmexit(vm);
371
372
/*
373
* Lock the map, to wait out all other references to it.
374
* Delete all of the mappings and pages they hold, then call
375
* the pmap module to reclaim anything left.
376
*/
377
(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
378
vm_map_max(&vm->vm_map));
379
380
pmap_release(vmspace_pmap(vm));
381
vm->vm_map.pmap = NULL;
382
uma_zfree(vmspace_zone, vm);
383
}
384
385
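/*
 * Drop a reference on a vmspace; the final release tears the vmspace
 * down, so callers must be able to sleep (see the WITNESS_WARN below).
 */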
void
386
vmspace_free(struct vmspace *vm)
387
{
388
389
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
390
"vmspace_free() called");
391
392
if (refcount_release(&vm->vm_refcnt))
393
vmspace_dofree(vm);
394
}
395
396
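/*
 * Clear the process's vmspace pointer and drop the reference it held;
 * the assertion below expects the process to already have switched to
 * vmspace0.
 */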
void
397
vmspace_exitfree(struct proc *p)
398
{
399
struct vmspace *vm;
400
401
PROC_VMSPACE_LOCK(p);
402
vm = p->p_vmspace;
403
p->p_vmspace = NULL;
404
PROC_VMSPACE_UNLOCK(p);
405
KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
406
vmspace_free(vm);
407
}
408
409
void
410
vmspace_exit(struct thread *td)
411
{
412
struct vmspace *vm;
413
struct proc *p;
414
bool released;
415
416
p = td->td_proc;
417
vm = p->p_vmspace;
418
419
/*
420
* Prepare to release the vmspace reference. The thread that releases
421
* the last reference is responsible for tearing down the vmspace.
422
* However, threads not releasing the final reference must switch to the
423
* kernel's vmspace0 before the decrement so that the subsequent pmap
424
* deactivation does not modify a freed vmspace.
425
*/
426
refcount_acquire(&vmspace0.vm_refcnt);
427
if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
428
if (p->p_vmspace != &vmspace0) {
429
PROC_VMSPACE_LOCK(p);
430
p->p_vmspace = &vmspace0;
431
PROC_VMSPACE_UNLOCK(p);
432
pmap_activate(td);
433
}
434
released = refcount_release(&vm->vm_refcnt);
435
}
436
if (released) {
437
/*
438
* pmap_remove_pages() expects the pmap to be active, so switch
439
* back first if necessary.
440
*/
441
if (p->p_vmspace != vm) {
442
PROC_VMSPACE_LOCK(p);
443
p->p_vmspace = vm;
444
PROC_VMSPACE_UNLOCK(p);
445
pmap_activate(td);
446
}
447
pmap_remove_pages(vmspace_pmap(vm));
448
PROC_VMSPACE_LOCK(p);
449
p->p_vmspace = &vmspace0;
450
PROC_VMSPACE_UNLOCK(p);
451
pmap_activate(td);
452
vmspace_dofree(vm);
453
}
454
#ifdef RACCT
455
if (racct_enable)
456
vmspace_container_reset(p);
457
#endif
458
}
459
460
/* Acquire reference to vmspace owned by another process. */
461
462
struct vmspace *
463
vmspace_acquire_ref(struct proc *p)
464
{
465
struct vmspace *vm;
466
467
PROC_VMSPACE_LOCK(p);
468
vm = p->p_vmspace;
469
if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
470
PROC_VMSPACE_UNLOCK(p);
471
return (NULL);
472
}
473
if (vm != p->p_vmspace) {
474
PROC_VMSPACE_UNLOCK(p);
475
vmspace_free(vm);
476
return (NULL);
477
}
478
PROC_VMSPACE_UNLOCK(p);
479
return (vm);
480
}
481
482
/*
483
* Switch between vmspaces in an AIO kernel process.
484
*
485
* The new vmspace is either the vmspace of a user process obtained
486
* from an active AIO request or the initial vmspace of the AIO kernel
487
* process (when it is idling). Because user processes will block to
488
* drain any active AIO requests before proceeding in exit() or
489
* execve(), the reference count for vmspaces from AIO requests can
490
* never be 0. Similarly, AIO kernel processes hold an extra
491
* reference on their initial vmspace for the life of the process. As
492
* a result, the 'newvm' vmspace always has a non-zero reference
493
* count. This permits an additional reference on 'newvm' to be
494
* acquired via a simple atomic increment rather than the loop in
495
* vmspace_acquire_ref() above.
496
*/
497
void
498
vmspace_switch_aio(struct vmspace *newvm)
499
{
500
struct vmspace *oldvm;
501
502
/* XXX: Need some way to assert that this is an aio daemon. */
503
504
KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
505
("vmspace_switch_aio: newvm unreferenced"));
506
507
oldvm = curproc->p_vmspace;
508
if (oldvm == newvm)
509
return;
510
511
/*
512
* Point to the new address space and refer to it.
513
*/
514
curproc->p_vmspace = newvm;
515
refcount_acquire(&newvm->vm_refcnt);
516
517
/* Activate the new mapping. */
518
pmap_activate(curthread);
519
520
vmspace_free(oldvm);
521
}
522
523
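/*
 * Acquire an exclusive lock on the map: system maps use a mutex, user
 * maps an sx lock. The timestamp is bumped so that a change made under
 * this lock can be detected by code that drops and reacquires it.
 */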
void
524
_vm_map_lock(vm_map_t map, const char *file, int line)
525
{
526
527
if (vm_map_is_system(map))
528
mtx_lock_flags_(&map->system_mtx, 0, file, line);
529
else
530
sx_xlock_(&map->lock, file, line);
531
map->timestamp++;
532
}
533
534
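/*
 * For an entry with MAP_ENTRY_VN_EXEC set, add or remove an executable
 * mapping ("text") reference on the vnode backing the entry's object.
 */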
void
535
vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
536
{
537
vm_object_t object;
538
struct vnode *vp;
539
bool vp_held;
540
541
if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
542
return;
543
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
544
("Submap with execs"));
545
object = entry->object.vm_object;
546
KASSERT(object != NULL, ("No object for text, entry %p", entry));
547
if ((object->flags & OBJ_ANON) != 0)
548
object = object->handle;
549
else
550
KASSERT(object->backing_object == NULL,
551
("non-anon object %p shadows", object));
552
KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
553
entry, entry->object.vm_object));
554
555
/*
556
* Mostly, we do not lock the backing object. It is
557
* referenced by the entry we are processing, so it cannot go
558
* away.
559
*/
560
vm_pager_getvp(object, &vp, &vp_held);
561
if (vp != NULL) {
562
if (add) {
563
VOP_SET_TEXT_CHECKED(vp);
564
} else {
565
vn_lock(vp, LK_SHARED | LK_RETRY);
566
VOP_UNSET_TEXT_CHECKED(vp);
567
VOP_UNLOCK(vp);
568
}
569
if (vp_held)
570
vdrop(vp);
571
}
572
}
573
574
/*
575
* Use a different name for this vm_map_entry field when it's use
576
* is not consistent with its use as part of an ordered search tree.
577
*/
578
#define defer_next right
579
580
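/*
 * Dispose of the map entries queued for deferred deallocation on the
 * current thread: release write counts and vnode text references, then
 * free each entry.
 */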
static void
581
vm_map_process_deferred(void)
582
{
583
struct thread *td;
584
vm_map_entry_t entry, next;
585
vm_object_t object;
586
587
td = curthread;
588
entry = td->td_map_def_user;
589
td->td_map_def_user = NULL;
590
while (entry != NULL) {
591
next = entry->defer_next;
592
MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
593
MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
594
MAP_ENTRY_VN_EXEC));
595
if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
596
/*
597
* Decrement the object's writemappings and
598
* possibly the vnode's v_writecount.
599
*/
600
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
601
("Submap with writecount"));
602
object = entry->object.vm_object;
603
KASSERT(object != NULL, ("No object for writecount"));
604
vm_pager_release_writecount(object, entry->start,
605
entry->end);
606
}
607
vm_map_entry_set_vnode_text(entry, false);
608
vm_map_entry_deallocate(entry, FALSE);
609
entry = next;
610
}
611
}
612
613
#ifdef INVARIANTS
614
static void
615
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
616
{
617
618
if (vm_map_is_system(map))
619
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
620
else
621
sx_assert_(&map->lock, SA_XLOCKED, file, line);
622
}
623
624
#define VM_MAP_ASSERT_LOCKED(map) \
625
_vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
626
627
enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
628
#ifdef DIAGNOSTIC
629
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
630
#else
631
static int enable_vmmap_check = VMMAP_CHECK_NONE;
632
#endif
633
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
634
&enable_vmmap_check, 0, "Enable vm map consistency checking");
635
636
static void _vm_map_assert_consistent(vm_map_t map, int check);
637
638
#define VM_MAP_ASSERT_CONSISTENT(map) \
639
_vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
640
#ifdef DIAGNOSTIC
641
#define VM_MAP_UNLOCK_CONSISTENT(map) do { \
642
if (map->nupdates > map->nentries) { \
643
_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
644
map->nupdates = 0; \
645
} \
646
} while (0)
647
#else
648
#define VM_MAP_UNLOCK_CONSISTENT(map)
649
#endif
650
#else
651
#define VM_MAP_ASSERT_LOCKED(map)
652
#define VM_MAP_ASSERT_CONSISTENT(map)
653
#define VM_MAP_UNLOCK_CONSISTENT(map)
654
#endif /* INVARIANTS */
655
656
void
657
_vm_map_unlock(vm_map_t map, const char *file, int line)
658
{
659
660
VM_MAP_UNLOCK_CONSISTENT(map);
661
if (vm_map_is_system(map)) {
662
#ifndef UMA_USE_DMAP
663
if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
664
uma_prealloc(kmapentzone, 1);
665
map->flags &= ~MAP_REPLENISH;
666
}
667
#endif
668
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
669
} else {
670
sx_xunlock_(&map->lock, file, line);
671
vm_map_process_deferred();
672
}
673
}
674
675
void
676
_vm_map_lock_read(vm_map_t map, const char *file, int line)
677
{
678
679
if (vm_map_is_system(map))
680
mtx_lock_flags_(&map->system_mtx, 0, file, line);
681
else
682
sx_slock_(&map->lock, file, line);
683
}
684
685
void
686
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
687
{
688
689
if (vm_map_is_system(map)) {
690
KASSERT((map->flags & MAP_REPLENISH) == 0,
691
("%s: MAP_REPLENISH leaked", __func__));
692
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
693
} else {
694
sx_sunlock_(&map->lock, file, line);
695
vm_map_process_deferred();
696
}
697
}
698
699
int
700
_vm_map_trylock(vm_map_t map, const char *file, int line)
701
{
702
int error;
703
704
error = vm_map_is_system(map) ?
705
!mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
706
!sx_try_xlock_(&map->lock, file, line);
707
if (error == 0)
708
map->timestamp++;
709
return (error == 0);
710
}
711
712
int
713
_vm_map_trylock_read(vm_map_t map, const char *file, int line)
714
{
715
int error;
716
717
error = vm_map_is_system(map) ?
718
!mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
719
!sx_try_slock_(&map->lock, file, line);
720
return (error == 0);
721
}
722
723
/*
724
* _vm_map_lock_upgrade: [ internal use only ]
725
*
726
* Tries to upgrade a read (shared) lock on the specified map to a write
727
* (exclusive) lock. Returns the value "0" if the upgrade succeeds and a
728
* non-zero value if the upgrade fails. If the upgrade fails, the map is
729
* returned without a read or write lock held.
730
*
731
* Requires that the map be read locked.
732
*/
733
int
734
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
735
{
736
unsigned int last_timestamp;
737
738
if (vm_map_is_system(map)) {
739
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
740
} else {
741
if (!sx_try_upgrade_(&map->lock, file, line)) {
742
last_timestamp = map->timestamp;
743
sx_sunlock_(&map->lock, file, line);
744
vm_map_process_deferred();
745
/*
746
* If the map's timestamp does not change while the
747
* map is unlocked, then the upgrade succeeds.
748
*/
749
sx_xlock_(&map->lock, file, line);
750
if (last_timestamp != map->timestamp) {
751
sx_xunlock_(&map->lock, file, line);
752
return (1);
753
}
754
}
755
}
756
map->timestamp++;
757
return (0);
758
}
759
760
void
761
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
762
{
763
764
if (vm_map_is_system(map)) {
765
KASSERT((map->flags & MAP_REPLENISH) == 0,
766
("%s: MAP_REPLENISH leaked", __func__));
767
mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
768
} else {
769
VM_MAP_UNLOCK_CONSISTENT(map);
770
sx_downgrade_(&map->lock, file, line);
771
}
772
}
773
774
/*
775
* vm_map_locked:
776
*
777
* Returns a non-zero value if the caller holds a write (exclusive) lock
778
* on the specified map and the value "0" otherwise.
779
*/
780
int
781
vm_map_locked(vm_map_t map)
782
{
783
784
if (vm_map_is_system(map))
785
return (mtx_owned(&map->system_mtx));
786
return (sx_xlocked(&map->lock));
787
}
788
789
/*
790
* _vm_map_unlock_and_wait:
791
*
792
* Atomically releases the lock on the specified map and puts the calling
793
* thread to sleep. The calling thread will remain asleep until either
794
* vm_map_wakeup() is performed on the map or the specified timeout is
795
* exceeded.
796
*
797
* WARNING! This function does not perform deferred deallocations of
798
* objects and map entries. Therefore, the calling thread is expected to
799
* reacquire the map lock after reawakening and later perform an ordinary
800
* unlock operation, such as vm_map_unlock(), before completing its
801
* operation on the map.
802
*/
803
int
804
_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
805
{
806
807
VM_MAP_UNLOCK_CONSISTENT(map);
808
mtx_lock(&map_sleep_mtx);
809
if (vm_map_is_system(map)) {
810
KASSERT((map->flags & MAP_REPLENISH) == 0,
811
("%s: MAP_REPLENISH leaked", __func__));
812
mtx_unlock_flags_(&map->system_mtx, 0, file, line);
813
} else {
814
sx_xunlock_(&map->lock, file, line);
815
}
816
return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
817
timo));
818
}
819
820
/*
821
* vm_map_wakeup:
822
*
823
* Awaken any threads that have slept on the map using
824
* vm_map_unlock_and_wait().
825
*/
826
void
827
vm_map_wakeup(vm_map_t map)
828
{
829
830
/*
831
* Acquire and release map_sleep_mtx to prevent a wakeup()
832
* from being performed (and lost) between the map unlock
833
* and the msleep() in _vm_map_unlock_and_wait().
834
*/
835
mtx_lock(&map_sleep_mtx);
836
mtx_unlock(&map_sleep_mtx);
837
wakeup(&map->root);
838
}
839
840
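/*
 * vm_map_busy / vm_map_unbusy / vm_map_wait_busy:
 *
 * Maintain the map's busy count under the map lock; vm_map_wait_busy()
 * sleeps until the count drains to zero and vm_map_unbusy() issues the
 * wakeup.
 */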
void
841
vm_map_busy(vm_map_t map)
842
{
843
844
VM_MAP_ASSERT_LOCKED(map);
845
map->busy++;
846
}
847
848
void
849
vm_map_unbusy(vm_map_t map)
850
{
851
852
VM_MAP_ASSERT_LOCKED(map);
853
KASSERT(map->busy, ("vm_map_unbusy: not busy"));
854
if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
855
vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
856
wakeup(&map->busy);
857
}
858
}
859
860
void
861
vm_map_wait_busy(vm_map_t map)
862
{
863
864
VM_MAP_ASSERT_LOCKED(map);
865
while (map->busy) {
866
vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
867
if (vm_map_is_system(map))
868
msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
869
else
870
sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
871
}
872
map->timestamp++;
873
}
874
875
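/*
 * Return the number of resident pages in the vmspace, as accounted by
 * its pmap.
 */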
long
876
vmspace_resident_count(struct vmspace *vmspace)
877
{
878
return pmap_resident_count(vmspace_pmap(vmspace));
879
}
880
881
/*
882
* Initialize an existing vm_map structure
883
* such as that in the vmspace structure.
884
*/
885
static void
886
_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
887
{
888
889
map->header.eflags = MAP_ENTRY_HEADER;
890
map->pmap = pmap;
891
map->header.end = min;
892
map->header.start = max;
893
map->flags = 0;
894
map->header.left = map->header.right = &map->header;
895
map->root = NULL;
896
map->timestamp = 0;
897
map->busy = 0;
898
map->anon_loc = 0;
899
#ifdef DIAGNOSTIC
900
map->nupdates = 0;
901
#endif
902
}
903
904
void
905
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
906
{
907
_vm_map_init(map, pmap, min, max);
908
sx_init(&map->lock, "vm map (user)");
909
}
910
911
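/*
 * Like vm_map_init(), but mark the map as a system map and initialize
 * the mutex used to lock it.
 */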
void
912
vm_map_init_system(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
913
{
914
_vm_map_init(map, pmap, min, max);
915
vm_map_modflags(map, MAP_SYSTEM_MAP, 0);
916
mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF |
917
MTX_DUPOK);
918
}
919
920
/*
921
* vm_map_entry_dispose: [ internal use only ]
922
*
923
* Inverse of vm_map_entry_create.
924
*/
925
static void
926
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
927
{
928
uma_zfree(vm_map_is_system(map) ? kmapentzone : mapentzone, entry);
929
}
930
931
/*
932
* vm_map_entry_create: [ internal use only ]
933
*
934
* Allocates a VM map entry for insertion.
935
* No entry fields are filled in.
936
*/
937
static vm_map_entry_t
938
vm_map_entry_create(vm_map_t map)
939
{
940
vm_map_entry_t new_entry;
941
942
#ifndef UMA_USE_DMAP
943
if (map == kernel_map) {
944
VM_MAP_ASSERT_LOCKED(map);
945
946
/*
947
* A new slab of kernel map entries cannot be allocated at this
948
* point because the kernel map has not yet been updated to
949
* reflect the caller's request. Therefore, we allocate a new
950
* map entry, dipping into the reserve if necessary, and set a
951
* flag indicating that the reserve must be replenished before
952
* the map is unlocked.
953
*/
954
new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
955
if (new_entry == NULL) {
956
new_entry = uma_zalloc(kmapentzone,
957
M_NOWAIT | M_NOVM | M_USE_RESERVE);
958
kernel_map->flags |= MAP_REPLENISH;
959
}
960
} else
961
#endif
962
if (vm_map_is_system(map)) {
963
new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
964
} else {
965
new_entry = uma_zalloc(mapentzone, M_WAITOK);
966
}
967
KASSERT(new_entry != NULL,
968
("vm_map_entry_create: kernel resources exhausted"));
969
return (new_entry);
970
}
971
972
/*
973
* vm_map_entry_set_behavior:
974
*
975
* Set the expected access behavior, either normal, random, or
976
* sequential.
977
*/
978
static inline void
979
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
980
{
981
entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
982
(behavior & MAP_ENTRY_BEHAV_MASK);
983
}
984
985
/*
986
* vm_map_entry_max_free_{left,right}:
987
*
988
* Compute the size of the largest free gap between two entries,
989
* one the root of a tree and the other the ancestor of that root
990
* that is the least or greatest ancestor found on the search path.
991
*/
992
static inline vm_size_t
993
vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
994
{
995
996
return (root->left != left_ancestor ?
997
root->left->max_free : root->start - left_ancestor->end);
998
}
999
1000
static inline vm_size_t
1001
vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1002
{
1003
1004
return (root->right != right_ancestor ?
1005
root->right->max_free : right_ancestor->start - root->end);
1006
}
1007
1008
/*
1009
* vm_map_entry_{pred,succ}:
1010
*
1011
* Find the {predecessor, successor} of the entry by taking one step
1012
* in the appropriate direction and backtracking as much as necessary.
1013
* vm_map_entry_succ is defined in vm_map.h.
1014
*/
1015
static inline vm_map_entry_t
1016
vm_map_entry_pred(vm_map_entry_t entry)
1017
{
1018
vm_map_entry_t prior;
1019
1020
prior = entry->left;
1021
if (prior->right->start < entry->start) {
1022
do
1023
prior = prior->right;
1024
while (prior->right != entry);
1025
}
1026
return (prior);
1027
}
1028
1029
static inline vm_size_t
1030
vm_size_max(vm_size_t a, vm_size_t b)
1031
{
1032
1033
return (a > b ? a : b);
1034
}
1035
1036
#define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
1037
vm_map_entry_t z; \
1038
vm_size_t max_free; \
1039
\
1040
/* \
1041
* Infer root->right->max_free == root->max_free when \
1042
* y->max_free < root->max_free || root->max_free == 0. \
1043
* Otherwise, look right to find it. \
1044
*/ \
1045
y = root->left; \
1046
max_free = root->max_free; \
1047
KASSERT(max_free == vm_size_max( \
1048
vm_map_entry_max_free_left(root, llist), \
1049
vm_map_entry_max_free_right(root, rlist)), \
1050
("%s: max_free invariant fails", __func__)); \
1051
if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
1052
max_free = vm_map_entry_max_free_right(root, rlist); \
1053
if (y != llist && (test)) { \
1054
/* Rotate right and make y root. */ \
1055
z = y->right; \
1056
if (z != root) { \
1057
root->left = z; \
1058
y->right = root; \
1059
if (max_free < y->max_free) \
1060
root->max_free = max_free = \
1061
vm_size_max(max_free, z->max_free); \
1062
} else if (max_free < y->max_free) \
1063
root->max_free = max_free = \
1064
vm_size_max(max_free, root->start - y->end);\
1065
root = y; \
1066
y = root->left; \
1067
} \
1068
/* Copy right->max_free. Put root on rlist. */ \
1069
root->max_free = max_free; \
1070
KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
1071
("%s: max_free not copied from right", __func__)); \
1072
root->left = rlist; \
1073
rlist = root; \
1074
root = y != llist ? y : NULL; \
1075
} while (0)
1076
1077
#define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
1078
vm_map_entry_t z; \
1079
vm_size_t max_free; \
1080
\
1081
/* \
1082
* Infer root->left->max_free == root->max_free when \
1083
* y->max_free < root->max_free || root->max_free == 0. \
1084
* Otherwise, look left to find it. \
1085
*/ \
1086
y = root->right; \
1087
max_free = root->max_free; \
1088
KASSERT(max_free == vm_size_max( \
1089
vm_map_entry_max_free_left(root, llist), \
1090
vm_map_entry_max_free_right(root, rlist)), \
1091
("%s: max_free invariant fails", __func__)); \
1092
if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
1093
max_free = vm_map_entry_max_free_left(root, llist); \
1094
if (y != rlist && (test)) { \
1095
/* Rotate left and make y root. */ \
1096
z = y->left; \
1097
if (z != root) { \
1098
root->right = z; \
1099
y->left = root; \
1100
if (max_free < y->max_free) \
1101
root->max_free = max_free = \
1102
vm_size_max(max_free, z->max_free); \
1103
} else if (max_free < y->max_free) \
1104
root->max_free = max_free = \
1105
vm_size_max(max_free, y->start - root->end);\
1106
root = y; \
1107
y = root->right; \
1108
} \
1109
/* Copy left->max_free. Put root on llist. */ \
1110
root->max_free = max_free; \
1111
KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
1112
("%s: max_free not copied from left", __func__)); \
1113
root->right = llist; \
1114
llist = root; \
1115
root = y != rlist ? y : NULL; \
1116
} while (0)
1117
1118
/*
1119
* Walk down the tree until we find addr or a gap where addr would go, breaking
1120
* off left and right subtrees of nodes less than, or greater than addr. Treat
1121
* subtrees with root->max_free < length as empty trees. llist and rlist are
1122
* the two sides in reverse order (bottom-up), with llist linked by the right
1123
* pointer and rlist linked by the left pointer in the vm_map_entry, and both
1124
* lists terminated by &map->header. This function, and the subsequent call to
1125
* vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1126
* values in &map->header.
1127
*/
1128
static __always_inline vm_map_entry_t
1129
vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1130
vm_map_entry_t *llist, vm_map_entry_t *rlist)
1131
{
1132
vm_map_entry_t left, right, root, y;
1133
1134
left = right = &map->header;
1135
root = map->root;
1136
while (root != NULL && root->max_free >= length) {
1137
KASSERT(left->end <= root->start &&
1138
root->end <= right->start,
1139
("%s: root not within tree bounds", __func__));
1140
if (addr < root->start) {
1141
SPLAY_LEFT_STEP(root, y, left, right,
1142
y->max_free >= length && addr < y->start);
1143
} else if (addr >= root->end) {
1144
SPLAY_RIGHT_STEP(root, y, left, right,
1145
y->max_free >= length && addr >= y->end);
1146
} else
1147
break;
1148
}
1149
*llist = left;
1150
*rlist = right;
1151
return (root);
1152
}
1153
1154
static __always_inline void
1155
vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1156
{
1157
vm_map_entry_t hi, right, y;
1158
1159
right = *rlist;
1160
hi = root->right == right ? NULL : root->right;
1161
if (hi == NULL)
1162
return;
1163
do
1164
SPLAY_LEFT_STEP(hi, y, root, right, true);
1165
while (hi != NULL);
1166
*rlist = right;
1167
}
1168
1169
static __always_inline void
1170
vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1171
{
1172
vm_map_entry_t left, lo, y;
1173
1174
left = *llist;
1175
lo = root->left == left ? NULL : root->left;
1176
if (lo == NULL)
1177
return;
1178
do
1179
SPLAY_RIGHT_STEP(lo, y, left, root, true);
1180
while (lo != NULL);
1181
*llist = left;
1182
}
1183
1184
static inline void
1185
vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1186
{
1187
vm_map_entry_t tmp;
1188
1189
tmp = *b;
1190
*b = *a;
1191
*a = tmp;
1192
}
1193
1194
/*
1195
* Walk back up the two spines, flip the pointers and set max_free. The
1196
* subtrees of the root go at the bottom of llist and rlist.
1197
*/
1198
static vm_size_t
1199
vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1200
vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1201
{
1202
do {
1203
/*
1204
* The max_free values of the children of llist are in
1205
* llist->max_free and max_free. Update with the
1206
* max value.
1207
*/
1208
llist->max_free = max_free =
1209
vm_size_max(llist->max_free, max_free);
1210
vm_map_entry_swap(&llist->right, &tail);
1211
vm_map_entry_swap(&tail, &llist);
1212
} while (llist != header);
1213
root->left = tail;
1214
return (max_free);
1215
}
1216
1217
/*
1218
* When llist is known to be the predecessor of root.
1219
*/
1220
static inline vm_size_t
1221
vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1222
vm_map_entry_t llist)
1223
{
1224
vm_size_t max_free;
1225
1226
max_free = root->start - llist->end;
1227
if (llist != header) {
1228
max_free = vm_map_splay_merge_left_walk(header, root,
1229
root, max_free, llist);
1230
} else {
1231
root->left = header;
1232
header->right = root;
1233
}
1234
return (max_free);
1235
}
1236
1237
/*
1238
* When llist may or may not be the predecessor of root.
1239
*/
1240
static inline vm_size_t
1241
vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1242
vm_map_entry_t llist)
1243
{
1244
vm_size_t max_free;
1245
1246
max_free = vm_map_entry_max_free_left(root, llist);
1247
if (llist != header) {
1248
max_free = vm_map_splay_merge_left_walk(header, root,
1249
root->left == llist ? root : root->left,
1250
max_free, llist);
1251
}
1252
return (max_free);
1253
}
1254
1255
static vm_size_t
1256
vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1257
vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1258
{
1259
do {
1260
/*
1261
* The max_free values of the children of rlist are in
1262
* rlist->max_free and max_free. Update with the
1263
* max value.
1264
*/
1265
rlist->max_free = max_free =
1266
vm_size_max(rlist->max_free, max_free);
1267
vm_map_entry_swap(&rlist->left, &tail);
1268
vm_map_entry_swap(&tail, &rlist);
1269
} while (rlist != header);
1270
root->right = tail;
1271
return (max_free);
1272
}
1273
1274
/*
1275
 * When rlist is known to be the successor of root.
1276
*/
1277
static inline vm_size_t
1278
vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1279
vm_map_entry_t rlist)
1280
{
1281
vm_size_t max_free;
1282
1283
max_free = rlist->start - root->end;
1284
if (rlist != header) {
1285
max_free = vm_map_splay_merge_right_walk(header, root,
1286
root, max_free, rlist);
1287
} else {
1288
root->right = header;
1289
header->left = root;
1290
}
1291
return (max_free);
1292
}
1293
1294
/*
1295
 * When rlist may or may not be the successor of root.
1296
*/
1297
static inline vm_size_t
1298
vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1299
vm_map_entry_t rlist)
1300
{
1301
vm_size_t max_free;
1302
1303
max_free = vm_map_entry_max_free_right(root, rlist);
1304
if (rlist != header) {
1305
max_free = vm_map_splay_merge_right_walk(header, root,
1306
root->right == rlist ? root : root->right,
1307
max_free, rlist);
1308
}
1309
return (max_free);
1310
}
1311
1312
/*
1313
* vm_map_splay:
1314
*
1315
* The Sleator and Tarjan top-down splay algorithm with the
1316
* following variation. Max_free must be computed bottom-up, so
1317
* on the downward pass, maintain the left and right spines in
1318
* reverse order. Then, make a second pass up each side to fix
1319
* the pointers and compute max_free. The time bound is O(log n)
1320
* amortized.
1321
*
1322
* The tree is threaded, which means that there are no null pointers.
1323
* When a node has no left child, its left pointer points to its
1324
 * predecessor, which is the last ancestor on the search path from the root
1325
* where the search branched right. Likewise, when a node has no right
1326
* child, its right pointer points to its successor. The map header node
1327
* is the predecessor of the first map entry, and the successor of the
1328
* last.
1329
*
1330
* The new root is the vm_map_entry containing "addr", or else an
1331
* adjacent entry (lower if possible) if addr is not in the tree.
1332
*
1333
* The map must be locked, and leaves it so.
1334
*
1335
* Returns: the new root.
1336
*/
1337
static vm_map_entry_t
1338
vm_map_splay(vm_map_t map, vm_offset_t addr)
1339
{
1340
vm_map_entry_t header, llist, rlist, root;
1341
vm_size_t max_free_left, max_free_right;
1342
1343
header = &map->header;
1344
root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1345
if (root != NULL) {
1346
max_free_left = vm_map_splay_merge_left(header, root, llist);
1347
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1348
} else if (llist != header) {
1349
/*
1350
* Recover the greatest node in the left
1351
* subtree and make it the root.
1352
*/
1353
root = llist;
1354
llist = root->right;
1355
max_free_left = vm_map_splay_merge_left(header, root, llist);
1356
max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1357
} else if (rlist != header) {
1358
/*
1359
* Recover the least node in the right
1360
* subtree and make it the root.
1361
*/
1362
root = rlist;
1363
rlist = root->left;
1364
max_free_left = vm_map_splay_merge_pred(header, root, llist);
1365
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1366
} else {
1367
/* There is no root. */
1368
return (NULL);
1369
}
1370
root->max_free = vm_size_max(max_free_left, max_free_right);
1371
map->root = root;
1372
VM_MAP_ASSERT_CONSISTENT(map);
1373
return (root);
1374
}
1375
1376
/*
1377
* vm_map_entry_{un,}link:
1378
*
1379
* Insert/remove entries from maps. On linking, if new entry clips
1380
* existing entry, trim existing entry to avoid overlap, and manage
1381
* offsets. On unlinking, merge disappearing entry with neighbor, if
1382
* called for, and manage offsets. Callers should not modify fields in
1383
* entries already mapped.
1384
*/
1385
static void
1386
vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1387
{
1388
vm_map_entry_t header, llist, rlist, root;
1389
vm_size_t max_free_left, max_free_right;
1390
1391
CTR3(KTR_VM,
1392
"vm_map_entry_link: map %p, nentries %d, entry %p", map,
1393
map->nentries, entry);
1394
VM_MAP_ASSERT_LOCKED(map);
1395
map->nentries++;
1396
header = &map->header;
1397
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1398
if (root == NULL) {
1399
/*
1400
* The new entry does not overlap any existing entry in the
1401
* map, so it becomes the new root of the map tree.
1402
*/
1403
max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1404
max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1405
} else if (entry->start == root->start) {
1406
/*
1407
* The new entry is a clone of root, with only the end field
1408
* changed. The root entry will be shrunk to abut the new
1409
* entry, and will be the right child of the new root entry in
1410
* the modified map.
1411
*/
1412
KASSERT(entry->end < root->end,
1413
("%s: clip_start not within entry", __func__));
1414
vm_map_splay_findprev(root, &llist);
1415
if ((root->eflags & MAP_ENTRY_STACK_GAP) == 0)
1416
root->offset += entry->end - root->start;
1417
root->start = entry->end;
1418
max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1419
max_free_right = root->max_free = vm_size_max(
1420
vm_map_splay_merge_pred(entry, root, entry),
1421
vm_map_splay_merge_right(header, root, rlist));
1422
} else {
1423
/*
1424
* The new entry is a clone of root, with only the start field
1425
* changed. The root entry will be shrunk to abut the new
1426
* entry, and will be the left child of the new root entry in
1427
* the modified map.
1428
*/
1429
KASSERT(entry->end == root->end,
1430
("%s: clip_start not within entry", __func__));
1431
vm_map_splay_findnext(root, &rlist);
1432
if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
1433
entry->offset += entry->start - root->start;
1434
root->end = entry->start;
1435
max_free_left = root->max_free = vm_size_max(
1436
vm_map_splay_merge_left(header, root, llist),
1437
vm_map_splay_merge_succ(entry, root, entry));
1438
max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1439
}
1440
entry->max_free = vm_size_max(max_free_left, max_free_right);
1441
map->root = entry;
1442
VM_MAP_ASSERT_CONSISTENT(map);
1443
}
1444
1445
enum unlink_merge_type {
1446
UNLINK_MERGE_NONE,
1447
UNLINK_MERGE_NEXT
1448
};
1449
1450
static void
1451
vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1452
enum unlink_merge_type op)
1453
{
1454
vm_map_entry_t header, llist, rlist, root;
1455
vm_size_t max_free_left, max_free_right;
1456
1457
VM_MAP_ASSERT_LOCKED(map);
1458
header = &map->header;
1459
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1460
KASSERT(root != NULL,
1461
("vm_map_entry_unlink: unlink object not mapped"));
1462
1463
vm_map_splay_findprev(root, &llist);
1464
vm_map_splay_findnext(root, &rlist);
1465
if (op == UNLINK_MERGE_NEXT) {
1466
rlist->start = root->start;
1467
MPASS((rlist->eflags & MAP_ENTRY_STACK_GAP) == 0);
1468
rlist->offset = root->offset;
1469
}
1470
if (llist != header) {
1471
root = llist;
1472
llist = root->right;
1473
max_free_left = vm_map_splay_merge_left(header, root, llist);
1474
max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1475
} else if (rlist != header) {
1476
root = rlist;
1477
rlist = root->left;
1478
max_free_left = vm_map_splay_merge_pred(header, root, llist);
1479
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1480
} else {
1481
header->left = header->right = header;
1482
root = NULL;
1483
}
1484
if (root != NULL)
1485
root->max_free = vm_size_max(max_free_left, max_free_right);
1486
map->root = root;
1487
VM_MAP_ASSERT_CONSISTENT(map);
1488
map->nentries--;
1489
CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1490
map->nentries, entry);
1491
}
1492
1493
/*
1494
* vm_map_entry_resize:
1495
*
1496
* Resize a vm_map_entry, recompute the amount of free space that
1497
* follows it and propagate that value up the tree.
1498
*
1499
* The map must be locked, and leaves it so.
1500
*/
1501
static void
1502
vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1503
{
1504
vm_map_entry_t header, llist, rlist, root;
1505
1506
VM_MAP_ASSERT_LOCKED(map);
1507
header = &map->header;
1508
root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1509
KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1510
vm_map_splay_findnext(root, &rlist);
1511
entry->end += grow_amount;
1512
root->max_free = vm_size_max(
1513
vm_map_splay_merge_left(header, root, llist),
1514
vm_map_splay_merge_succ(header, root, rlist));
1515
map->root = root;
1516
VM_MAP_ASSERT_CONSISTENT(map);
1517
CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1518
__func__, map, map->nentries, entry);
1519
}
1520
1521
/*
1522
* vm_map_lookup_entry: [ internal use only ]
1523
*
1524
* Finds the map entry containing (or
1525
* immediately preceding) the specified address
1526
* in the given map; the entry is returned
1527
* in the "entry" parameter. The boolean
1528
* result indicates whether the address is
1529
* actually contained in the map.
1530
*/
1531
boolean_t
1532
vm_map_lookup_entry(
1533
vm_map_t map,
1534
vm_offset_t address,
1535
vm_map_entry_t *entry) /* OUT */
1536
{
1537
vm_map_entry_t cur, header, lbound, ubound;
1538
boolean_t locked;
1539
1540
/*
1541
* If the map is empty, then the map entry immediately preceding
1542
* "address" is the map's header.
1543
*/
1544
header = &map->header;
1545
cur = map->root;
1546
if (cur == NULL) {
1547
*entry = header;
1548
return (FALSE);
1549
}
1550
if (address >= cur->start && cur->end > address) {
1551
*entry = cur;
1552
return (TRUE);
1553
}
1554
if ((locked = vm_map_locked(map)) ||
1555
sx_try_upgrade(&map->lock)) {
1556
/*
1557
* Splay requires a write lock on the map. However, it only
1558
* restructures the binary search tree; it does not otherwise
1559
* change the map. Thus, the map's timestamp need not change
1560
* on a temporary upgrade.
1561
*/
1562
cur = vm_map_splay(map, address);
1563
if (!locked) {
1564
VM_MAP_UNLOCK_CONSISTENT(map);
1565
sx_downgrade(&map->lock);
1566
}
1567
1568
/*
1569
* If "address" is contained within a map entry, the new root
1570
* is that map entry. Otherwise, the new root is a map entry
1571
* immediately before or after "address".
1572
*/
1573
if (address < cur->start) {
1574
*entry = header;
1575
return (FALSE);
1576
}
1577
*entry = cur;
1578
return (address < cur->end);
1579
}
1580
/*
1581
* Since the map is only locked for read access, perform a
1582
* standard binary search tree lookup for "address".
1583
*/
1584
lbound = ubound = header;
1585
for (;;) {
1586
if (address < cur->start) {
1587
ubound = cur;
1588
cur = cur->left;
1589
if (cur == lbound)
1590
break;
1591
} else if (cur->end <= address) {
1592
lbound = cur;
1593
cur = cur->right;
1594
if (cur == ubound)
1595
break;
1596
} else {
1597
*entry = cur;
1598
return (TRUE);
1599
}
1600
}
1601
*entry = lbound;
1602
return (FALSE);
1603
}
1604
1605
/*
1606
* vm_map_insert1() is identical to vm_map_insert() except that it
1607
* returns the newly inserted map entry in '*res'. In case the new
1608
* entry is coalesced with a neighbor or an existing entry was
1609
* resized, that entry is returned. In any case, the returned entry
1610
* covers the specified address range.
1611
*/
1612
static int
1613
vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1614
vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow,
1615
vm_map_entry_t *res)
1616
{
1617
vm_map_entry_t new_entry, next_entry, prev_entry;
1618
struct ucred *cred;
1619
vm_eflags_t protoeflags;
1620
vm_inherit_t inheritance;
1621
u_long bdry;
1622
u_int bidx;
1623
1624
VM_MAP_ASSERT_LOCKED(map);
1625
KASSERT(object != kernel_object ||
1626
(cow & MAP_COPY_ON_WRITE) == 0,
1627
("vm_map_insert: kernel object and COW"));
1628
KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
1629
(cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
1630
("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
1631
object, cow));
1632
KASSERT((prot & ~max) == 0,
1633
("prot %#x is not subset of max_prot %#x", prot, max));
1634
1635
/*
1636
* Check that the start and end points are not bogus.
1637
*/
1638
if (start == end || !vm_map_range_valid(map, start, end))
1639
return (KERN_INVALID_ADDRESS);
1640
1641
if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
1642
VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
1643
return (KERN_PROTECTION_FAILURE);
1644
1645
/*
1646
* Find the entry prior to the proposed starting address; if it's part
1647
* of an existing entry, this range is bogus.
1648
*/
1649
if (vm_map_lookup_entry(map, start, &prev_entry))
1650
return (KERN_NO_SPACE);
1651
1652
/*
1653
* Assert that the next entry doesn't overlap the end point.
1654
*/
1655
next_entry = vm_map_entry_succ(prev_entry);
1656
if (next_entry->start < end)
1657
return (KERN_NO_SPACE);
1658
1659
if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1660
max != VM_PROT_NONE))
1661
return (KERN_INVALID_ARGUMENT);
1662
1663
protoeflags = 0;
1664
if (cow & MAP_COPY_ON_WRITE)
1665
protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1666
if (cow & MAP_NOFAULT)
1667
protoeflags |= MAP_ENTRY_NOFAULT;
1668
if (cow & MAP_DISABLE_SYNCER)
1669
protoeflags |= MAP_ENTRY_NOSYNC;
1670
if (cow & MAP_DISABLE_COREDUMP)
1671
protoeflags |= MAP_ENTRY_NOCOREDUMP;
1672
if (cow & MAP_STACK_AREA)
1673
protoeflags |= MAP_ENTRY_GROWS_DOWN;
1674
if (cow & MAP_WRITECOUNT)
1675
protoeflags |= MAP_ENTRY_WRITECNT;
1676
if (cow & MAP_VN_EXEC)
1677
protoeflags |= MAP_ENTRY_VN_EXEC;
1678
if ((cow & MAP_CREATE_GUARD) != 0)
1679
protoeflags |= MAP_ENTRY_GUARD;
1680
if ((cow & MAP_CREATE_STACK_GAP) != 0)
1681
protoeflags |= MAP_ENTRY_STACK_GAP;
1682
if (cow & MAP_INHERIT_SHARE)
1683
inheritance = VM_INHERIT_SHARE;
1684
else
1685
inheritance = VM_INHERIT_DEFAULT;
1686
if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
1687
/* This magically ignores index 0, for usual page size. */
1688
bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
1689
MAP_SPLIT_BOUNDARY_SHIFT;
1690
if (bidx >= MAXPAGESIZES)
1691
return (KERN_INVALID_ARGUMENT);
1692
bdry = pagesizes[bidx] - 1;
1693
if ((start & bdry) != 0 || (end & bdry) != 0)
1694
return (KERN_INVALID_ARGUMENT);
1695
protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
1696
}
1697
1698
cred = NULL;
1699
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1700
goto charged;
1701
if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1702
((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1703
if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1704
return (KERN_RESOURCE_SHORTAGE);
1705
KASSERT(object == NULL ||
1706
(protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1707
object->cred == NULL,
1708
("overcommit: vm_map_insert o %p", object));
1709
cred = curthread->td_ucred;
1710
}
1711
1712
charged:
1713
/* Expand the kernel pmap, if necessary. */
1714
if (map == kernel_map && end > kernel_vm_end) {
1715
int rv;
1716
1717
rv = pmap_growkernel(end);
1718
if (rv != KERN_SUCCESS)
1719
return (rv);
1720
}
1721
if (object != NULL) {
1722
/*
1723
* OBJ_ONEMAPPING must be cleared unless this mapping
1724
* is trivially proven to be the only mapping for any
1725
* of the object's pages. (Object granularity
1726
* reference counting is insufficient to recognize
1727
* aliases with precision.)
1728
*/
1729
if ((object->flags & OBJ_ANON) != 0) {
1730
VM_OBJECT_WLOCK(object);
1731
if (object->ref_count > 1 || object->shadow_count != 0)
1732
vm_object_clear_flag(object, OBJ_ONEMAPPING);
1733
VM_OBJECT_WUNLOCK(object);
1734
}
1735
} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1736
protoeflags &&
1737
(cow & (MAP_STACK_AREA | MAP_VN_EXEC)) == 0 &&
1738
prev_entry->end == start && (prev_entry->cred == cred ||
1739
(prev_entry->object.vm_object != NULL &&
1740
prev_entry->object.vm_object->cred == cred)) &&
1741
vm_object_coalesce(prev_entry->object.vm_object,
1742
prev_entry->offset,
1743
(vm_size_t)(prev_entry->end - prev_entry->start),
1744
(vm_size_t)(end - prev_entry->end), cred != NULL &&
1745
(protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1746
/*
1747
* We were able to extend the object. Determine if we
1748
* can extend the previous map entry to include the
1749
* new range as well.
1750
*/
1751
if (prev_entry->inheritance == inheritance &&
1752
prev_entry->protection == prot &&
1753
prev_entry->max_protection == max &&
1754
prev_entry->wired_count == 0) {
1755
KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1756
0, ("prev_entry %p has incoherent wiring",
1757
prev_entry));
1758
if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1759
map->size += end - prev_entry->end;
1760
vm_map_entry_resize(map, prev_entry,
1761
end - prev_entry->end);
1762
*res = vm_map_try_merge_entries(map, prev_entry,
1763
next_entry);
1764
return (KERN_SUCCESS);
1765
}
1766
1767
/*
1768
* If we can extend the object but cannot extend the
1769
* map entry, we have to create a new map entry. We
1770
* must bump the ref count on the extended object to
1771
* account for it. object may be NULL.
1772
*/
1773
object = prev_entry->object.vm_object;
1774
offset = prev_entry->offset +
1775
(prev_entry->end - prev_entry->start);
1776
vm_object_reference(object);
1777
if (cred != NULL && object != NULL && object->cred != NULL &&
1778
!(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1779
/* Object already accounts for this uid. */
1780
cred = NULL;
1781
}
1782
}
1783
if (cred != NULL)
1784
crhold(cred);
1785
1786
/*
1787
* Create a new entry
1788
*/
1789
new_entry = vm_map_entry_create(map);
1790
new_entry->start = start;
1791
new_entry->end = end;
1792
new_entry->cred = NULL;
1793
1794
new_entry->eflags = protoeflags;
1795
new_entry->object.vm_object = object;
1796
new_entry->offset = offset;
1797
1798
new_entry->inheritance = inheritance;
1799
new_entry->protection = prot;
1800
new_entry->max_protection = max;
1801
new_entry->wired_count = 0;
1802
new_entry->wiring_thread = NULL;
1803
new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1804
new_entry->next_read = start;
1805
1806
KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1807
("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1808
new_entry->cred = cred;
1809
1810
/*
1811
* Insert the new entry into the list
1812
*/
1813
vm_map_entry_link(map, new_entry);
1814
if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1815
map->size += new_entry->end - new_entry->start;
1816
1817
/*
1818
* Try to coalesce the new entry with both the previous and next
1819
* entries in the list. Previously, we only attempted to coalesce
1820
* with the previous entry when object is NULL. Here, we handle the
1821
* other cases, which are less common.
1822
*/
1823
vm_map_try_merge_entries(map, prev_entry, new_entry);
1824
*res = vm_map_try_merge_entries(map, new_entry, next_entry);
1825
1826
if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1827
vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1828
end - start, cow & MAP_PREFAULT_PARTIAL);
1829
}
1830
1831
return (KERN_SUCCESS);
1832
}
1833
1834
/*
1835
* vm_map_insert:
1836
*
1837
* Inserts the given VM object into the target map at the
1838
* specified address range.
1839
*
1840
* Requires that the map be locked, and leaves it so.
1841
*
1842
* If object is non-NULL, ref count must be bumped by caller
1843
* prior to making call to account for the new entry.
1844
*/
1845
int
1846
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1847
vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1848
{
1849
vm_map_entry_t res;
1850
1851
return (vm_map_insert1(map, object, offset, start, end, prot, max,
1852
cow, &res));
1853
}
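/*
 * Editorial sketch (not part of the original file): the header comment
 * above requires the caller to bump the object's reference count before
 * calling vm_map_insert().  The hypothetical helper below shows that
 * contract; error handling is minimal.
 */
#if 0
static int
example_insert_object(vm_map_t map, vm_object_t obj, vm_offset_t start,
    vm_size_t size)
{
	int rv;

	vm_object_reference(obj);	/* account for the new map entry */
	vm_map_lock(map);
	rv = vm_map_insert(map, obj, 0, start, start + size,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
	vm_map_unlock(map);
	if (rv != KERN_SUCCESS)
		vm_object_deallocate(obj);	/* insert did not consume it */
	return (rv);
}
#endif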
1854
1855
/*
1856
* vm_map_findspace:
1857
*
1858
* Find the first fit (lowest VM address) for "length" free bytes
1859
* beginning at address >= start in the given map.
1860
*
1861
* In a vm_map_entry, "max_free" is the maximum amount of
1862
* contiguous free space between an entry in its subtree and a
1863
* neighbor of that entry. This allows finding a free region in
1864
* one path down the tree, so O(log n) amortized with splay
1865
* trees.
1866
*
1867
* Requires that the map be locked, and leaves it so.
1868
*
1869
* Returns: starting address if sufficient space,
1870
* vm_map_max(map)-length+1 if insufficient space.
1871
*/
1872
vm_offset_t
1873
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1874
{
1875
vm_map_entry_t header, llist, rlist, root, y;
1876
vm_size_t left_length, max_free_left, max_free_right;
1877
vm_offset_t gap_end;
1878
1879
VM_MAP_ASSERT_LOCKED(map);
1880
1881
/*
1882
* Request must fit within min/max VM address and must avoid
1883
* address wrap.
1884
*/
1885
start = MAX(start, vm_map_min(map));
1886
if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1887
return (vm_map_max(map) - length + 1);
1888
1889
/* Empty tree means wide open address space. */
1890
if (map->root == NULL)
1891
return (start);
1892
1893
/*
1894
* After splay_split, if start is within an entry, push it to the start
1895
* of the following gap. If rlist is at the end of the gap containing
1896
* start, save the end of that gap in gap_end to see if the gap is big
1897
* enough; otherwise set gap_end to start to skip gap-checking and move
1898
* directly to a search of the right subtree.
1899
*/
1900
header = &map->header;
1901
root = vm_map_splay_split(map, start, length, &llist, &rlist);
1902
gap_end = rlist->start;
1903
if (root != NULL) {
1904
start = root->end;
1905
if (root->right != rlist)
1906
gap_end = start;
1907
max_free_left = vm_map_splay_merge_left(header, root, llist);
1908
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1909
} else if (rlist != header) {
1910
root = rlist;
1911
rlist = root->left;
1912
max_free_left = vm_map_splay_merge_pred(header, root, llist);
1913
max_free_right = vm_map_splay_merge_right(header, root, rlist);
1914
} else {
1915
root = llist;
1916
llist = root->right;
1917
max_free_left = vm_map_splay_merge_left(header, root, llist);
1918
max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1919
}
1920
root->max_free = vm_size_max(max_free_left, max_free_right);
1921
map->root = root;
1922
VM_MAP_ASSERT_CONSISTENT(map);
1923
if (length <= gap_end - start)
1924
return (start);
1925
1926
/* With max_free, can immediately tell if no solution. */
1927
if (root->right == header || length > root->right->max_free)
1928
return (vm_map_max(map) - length + 1);
1929
1930
/*
1931
* Splay for the least large-enough gap in the right subtree.
1932
*/
1933
llist = rlist = header;
1934
for (left_length = 0;;
1935
left_length = vm_map_entry_max_free_left(root, llist)) {
1936
if (length <= left_length)
1937
SPLAY_LEFT_STEP(root, y, llist, rlist,
1938
length <= vm_map_entry_max_free_left(y, llist));
1939
else
1940
SPLAY_RIGHT_STEP(root, y, llist, rlist,
1941
length > vm_map_entry_max_free_left(y, root));
1942
if (root == NULL)
1943
break;
1944
}
1945
root = llist;
1946
llist = root->right;
1947
max_free_left = vm_map_splay_merge_left(header, root, llist);
1948
if (rlist == header) {
1949
root->max_free = vm_size_max(max_free_left,
1950
vm_map_splay_merge_succ(header, root, rlist));
1951
} else {
1952
y = rlist;
1953
rlist = y->left;
1954
y->max_free = vm_size_max(
1955
vm_map_splay_merge_pred(root, y, root),
1956
vm_map_splay_merge_right(header, y, rlist));
1957
root->max_free = vm_size_max(max_free_left, y->max_free);
1958
}
1959
map->root = root;
1960
VM_MAP_ASSERT_CONSISTENT(map);
1961
return (root->end);
1962
}
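/*
 * Editorial sketch (not part of the original file): vm_map_findspace()
 * reports failure by returning "vm_map_max(map) - length + 1", so callers
 * test whether the returned range still fits in the map instead of
 * comparing against zero.  The helper below is hypothetical and assumes
 * the map lock is held.
 */
#if 0
static bool
example_find_free_range(vm_map_t map, vm_offset_t start, vm_size_t length,
    vm_offset_t *addrp)
{
	vm_offset_t addr;

	VM_MAP_ASSERT_LOCKED(map);
	addr = vm_map_findspace(map, start, length);
	if (addr + length > vm_map_max(map))
		return (false);	/* no gap of "length" bytes at or above start */
	*addrp = addr;
	return (true);
}
#endif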
1963
1964
int
1965
vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1966
vm_offset_t start, vm_size_t length, vm_prot_t prot,
1967
vm_prot_t max, int cow)
1968
{
1969
vm_offset_t end;
1970
int result;
1971
1972
end = start + length;
1973
KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
1974
("vm_map_fixed: non-NULL backing object for stack"));
1975
vm_map_lock(map);
1976
VM_MAP_RANGE_CHECK(map, start, end);
1977
if ((cow & MAP_CHECK_EXCL) == 0) {
1978
result = vm_map_delete(map, start, end);
1979
if (result != KERN_SUCCESS)
1980
goto out;
1981
}
1982
if ((cow & MAP_STACK_AREA) != 0) {
1983
result = vm_map_stack_locked(map, start, length, sgrowsiz,
1984
prot, max, cow);
1985
} else {
1986
result = vm_map_insert(map, object, offset, start, end,
1987
prot, max, cow);
1988
}
1989
out:
1990
vm_map_unlock(map);
1991
return (result);
1992
}
1993
1994
#if VM_NRESERVLEVEL <= 1
1995
static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
1996
static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
1997
#elif VM_NRESERVLEVEL == 2
1998
static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10};
1999
static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4};
2000
#else
2001
#error "Unsupported VM_NRESERVLEVEL"
2002
#endif
2003
2004
static int cluster_anon = 1;
2005
SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
2006
&cluster_anon, 0,
2007
"Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
2008
2009
static bool
2010
clustering_anon_allowed(vm_offset_t addr, int cow)
2011
{
2012
2013
switch (cluster_anon) {
2014
case 0:
2015
return (false);
2016
case 1:
2017
return (addr == 0 || (cow & MAP_NO_HINT) != 0);
2018
case 2:
2019
default:
2020
return (true);
2021
}
2022
}
2023
2024
static long aslr_restarts;
2025
SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
2026
&aslr_restarts, 0,
2027
"Number of aslr failures");
2028
2029
/*
2030
* Searches for the specified amount of free space in the given map with the
2031
* specified alignment. Performs an address-ordered, first-fit search from
2032
* the given address "*addr", with an optional upper bound "max_addr". If the
2033
* parameter "alignment" is zero, then the alignment is computed from the
2034
* given (object, offset) pair so as to enable the greatest possible use of
2035
* superpage mappings. Returns KERN_SUCCESS and the address of the free space
2036
* in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
2037
*
2038
* The map must be locked. Initially, there must be at least "length" bytes
2039
* of free space at the given address.
2040
*/
2041
static int
2042
vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2043
vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
2044
vm_offset_t alignment)
2045
{
2046
vm_offset_t aligned_addr, free_addr;
2047
2048
VM_MAP_ASSERT_LOCKED(map);
2049
free_addr = *addr;
2050
KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
2051
("caller failed to provide space %#jx at address %p",
2052
(uintmax_t)length, (void *)free_addr));
2053
for (;;) {
2054
/*
2055
* At the start of every iteration, the free space at address
2056
* "*addr" is at least "length" bytes.
2057
*/
2058
if (alignment == 0)
2059
pmap_align_superpage(object, offset, addr, length);
2060
else
2061
*addr = roundup2(*addr, alignment);
2062
aligned_addr = *addr;
2063
if (aligned_addr == free_addr) {
2064
/*
2065
* Alignment did not change "*addr", so "*addr" must
2066
* still provide sufficient free space.
2067
*/
2068
return (KERN_SUCCESS);
2069
}
2070
2071
/*
2072
* Test for address wrap on "*addr". A wrapped "*addr" could
2073
* be a valid address, in which case vm_map_findspace() cannot
2074
* be relied upon to fail.
2075
*/
2076
if (aligned_addr < free_addr)
2077
return (KERN_NO_SPACE);
2078
*addr = vm_map_findspace(map, aligned_addr, length);
2079
if (*addr + length > vm_map_max(map) ||
2080
(max_addr != 0 && *addr + length > max_addr))
2081
return (KERN_NO_SPACE);
2082
free_addr = *addr;
2083
if (free_addr == aligned_addr) {
2084
/*
2085
* If a successful call to vm_map_findspace() did not
2086
* change "*addr", then "*addr" must still be aligned
2087
* and provide sufficient free space.
2088
*/
2089
return (KERN_SUCCESS);
2090
}
2091
}
2092
}
2093
2094
int
2095
vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
2096
vm_offset_t max_addr, vm_offset_t alignment)
2097
{
2098
/* XXXKIB ASLR eh ? */
2099
*addr = vm_map_findspace(map, *addr, length);
2100
if (*addr + length > vm_map_max(map) ||
2101
(max_addr != 0 && *addr + length > max_addr))
2102
return (KERN_NO_SPACE);
2103
return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
2104
alignment));
2105
}
2106
2107
/*
2108
* vm_map_find finds an unallocated region in the target address
2109
* map with the given length. The search is defined to be
2110
* first-fit from the specified address; the region found is
2111
* returned in the same parameter.
2112
*
2113
* If object is non-NULL, ref count must be bumped by caller
2114
* prior to making call to account for the new entry.
2115
*/
2116
int
2117
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2118
vm_offset_t *addr, /* IN/OUT */
2119
vm_size_t length, vm_offset_t max_addr, int find_space,
2120
vm_prot_t prot, vm_prot_t max, int cow)
2121
{
2122
int rv;
2123
2124
vm_map_lock(map);
2125
rv = vm_map_find_locked(map, object, offset, addr, length, max_addr,
2126
find_space, prot, max, cow);
2127
vm_map_unlock(map);
2128
return (rv);
2129
}
2130
2131
int
2132
vm_map_find_locked(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2133
vm_offset_t *addr, /* IN/OUT */
2134
vm_size_t length, vm_offset_t max_addr, int find_space,
2135
vm_prot_t prot, vm_prot_t max, int cow)
2136
{
2137
vm_offset_t alignment, curr_min_addr, min_addr;
2138
int gap, pidx, rv, try;
2139
bool cluster, en_aslr, update_anon;
2140
2141
KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
2142
("non-NULL backing object for stack"));
2143
MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
2144
(cow & MAP_STACK_AREA) == 0));
2145
if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
2146
(object->flags & OBJ_COLORED) == 0))
2147
find_space = VMFS_ANY_SPACE;
2148
if (find_space >> 8 != 0) {
2149
KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
2150
alignment = (vm_offset_t)1 << (find_space >> 8);
2151
} else
2152
alignment = 0;
2153
en_aslr = (map->flags & MAP_ASLR) != 0;
2154
update_anon = cluster = clustering_anon_allowed(*addr, cow) &&
2155
(map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
2156
find_space != VMFS_NO_SPACE && object == NULL &&
2157
(cow & (MAP_INHERIT_SHARE | MAP_STACK_AREA)) == 0 &&
2158
prot != PROT_NONE;
2159
curr_min_addr = min_addr = *addr;
2160
if (en_aslr && min_addr == 0 && !cluster &&
2161
find_space != VMFS_NO_SPACE &&
2162
(map->flags & MAP_ASLR_IGNSTART) != 0)
2163
curr_min_addr = min_addr = vm_map_min(map);
2164
try = 0;
2165
if (cluster) {
2166
curr_min_addr = map->anon_loc;
2167
if (curr_min_addr == 0)
2168
cluster = false;
2169
}
2170
if (find_space != VMFS_NO_SPACE) {
2171
KASSERT(find_space == VMFS_ANY_SPACE ||
2172
find_space == VMFS_OPTIMAL_SPACE ||
2173
find_space == VMFS_SUPER_SPACE ||
2174
alignment != 0, ("unexpected VMFS flag"));
2175
again:
2176
/*
2177
* When creating an anonymous mapping, try clustering
2178
* with an existing anonymous mapping first.
2179
*
2180
* We make up to two attempts to find address space
2181
* for a given find_space value. The first attempt may
2182
* apply randomization or may cluster with an existing
2183
* anonymous mapping. If this first attempt fails,
2184
* perform a first-fit search of the available address
2185
* space.
2186
*
2187
* If all tries failed, and find_space is
2188
* VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
2189
* Again enable clustering and randomization.
2190
*/
2191
try++;
2192
MPASS(try <= 2);
2193
2194
if (try == 2) {
2195
/*
2196
* Second try: we failed either to find a
2197
* suitable region for randomizing the
2198
* allocation, or to cluster with an existing
2199
* mapping. Retry with free run.
2200
*/
2201
curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
2202
vm_map_min(map) : min_addr;
2203
atomic_add_long(&aslr_restarts, 1);
2204
}
2205
2206
if (try == 1 && en_aslr && !cluster) {
2207
/*
2208
* Find space for allocation, including
2209
* gap needed for later randomization.
2210
*/
2211
pidx = 0;
2212
#if VM_NRESERVLEVEL > 0
2213
if ((find_space == VMFS_SUPER_SPACE ||
2214
find_space == VMFS_OPTIMAL_SPACE) &&
2215
pagesizes[VM_NRESERVLEVEL] != 0) {
2216
/*
2217
* Do not pointlessly increase the space that
2218
* is requested from vm_map_findspace().
2219
* pmap_align_superpage() will only change a
2220
* mapping's alignment if that mapping is at
2221
* least a superpage in size.
2222
*/
2223
pidx = VM_NRESERVLEVEL;
2224
while (pidx > 0 && length < pagesizes[pidx])
2225
pidx--;
2226
}
2227
#endif
2228
gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
2229
(max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
2230
aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
2231
*addr = vm_map_findspace(map, curr_min_addr,
2232
length + gap * pagesizes[pidx]);
2233
if (*addr + length + gap * pagesizes[pidx] >
2234
vm_map_max(map))
2235
goto again;
2236
/* And randomize the start address. */
2237
*addr += (arc4random() % gap) * pagesizes[pidx];
2238
if (max_addr != 0 && *addr + length > max_addr)
2239
goto again;
2240
} else {
2241
*addr = vm_map_findspace(map, curr_min_addr, length);
2242
if (*addr + length > vm_map_max(map) ||
2243
(max_addr != 0 && *addr + length > max_addr)) {
2244
if (cluster) {
2245
cluster = false;
2246
MPASS(try == 1);
2247
goto again;
2248
}
2249
return (KERN_NO_SPACE);
2250
}
2251
}
2252
2253
if (find_space != VMFS_ANY_SPACE &&
2254
(rv = vm_map_alignspace(map, object, offset, addr, length,
2255
max_addr, alignment)) != KERN_SUCCESS) {
2256
if (find_space == VMFS_OPTIMAL_SPACE) {
2257
find_space = VMFS_ANY_SPACE;
2258
curr_min_addr = min_addr;
2259
cluster = update_anon;
2260
try = 0;
2261
goto again;
2262
}
2263
return (rv);
2264
}
2265
} else if ((cow & MAP_REMAP) != 0) {
2266
if (!vm_map_range_valid(map, *addr, *addr + length))
2267
return (KERN_INVALID_ADDRESS);
2268
rv = vm_map_delete(map, *addr, *addr + length);
2269
if (rv != KERN_SUCCESS)
2270
return (rv);
2271
}
2272
if ((cow & MAP_STACK_AREA) != 0) {
2273
rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
2274
max, cow);
2275
} else {
2276
rv = vm_map_insert(map, object, offset, *addr, *addr + length,
2277
prot, max, cow);
2278
}
2279
2280
/*
2281
* Update the starting address for clustered anonymous memory mappings
2282
* if a starting address was not previously defined or an ASLR restart
2283
* placed an anonymous memory mapping at a lower address.
2284
*/
2285
if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 ||
2286
*addr < map->anon_loc))
2287
map->anon_loc = *addr;
2288
return (rv);
2289
}
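/*
 * Editorial sketch (not part of the original file): the ASLR branch above
 * reserves "length + gap * pagesizes[pidx]" bytes of free space and then
 * slides the start forward by a random whole number of pages smaller than
 * "gap".  The helper restates only that arithmetic; it is hypothetical and
 * assumes the caller already found a large-enough run at "base".
 */
#if 0
static vm_offset_t
example_aslr_slide(vm_offset_t base, vm_size_t length, int gap,
    vm_offset_t pagesize, vm_offset_t map_max)
{

	/* The reserved run, including the gap, must fit below map_max. */
	if (base + length + (vm_size_t)gap * pagesize > map_max)
		return (0);	/* caller would retry without randomization */
	return (base + (arc4random() % gap) * pagesize);
}
#endif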
2290
2291
/*
2292
* vm_map_find_min() is a variant of vm_map_find() that takes an
2293
* additional parameter ("default_addr") and treats the given address
2294
* ("*addr") differently. Specifically, it treats "*addr" as a hint
2295
* and not as the minimum address where the mapping is created.
2296
*
2297
* This function works in two phases. First, it tries to
2298
* allocate above the hint. If that fails and the hint is
2299
* greater than "default_addr", it performs a second pass, replacing
2300
* the hint with "default_addr" as the minimum address for the
2301
* allocation.
2302
*/
2303
int
2304
vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2305
vm_offset_t *addr, vm_size_t length, vm_offset_t default_addr,
2306
vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2307
int cow)
2308
{
2309
vm_offset_t hint;
2310
int rv;
2311
2312
hint = *addr;
2313
if (hint == 0) {
2314
cow |= MAP_NO_HINT;
2315
*addr = hint = default_addr;
2316
}
2317
for (;;) {
2318
rv = vm_map_find(map, object, offset, addr, length, max_addr,
2319
find_space, prot, max, cow);
2320
if (rv == KERN_SUCCESS || default_addr >= hint)
2321
return (rv);
2322
*addr = hint = default_addr;
2323
}
2324
}
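/*
 * Editorial sketch (not part of the original file): because of the
 * two-phase behavior above, a caller may pass a non-zero hint and still
 * fall back to "default_addr" once the space above the hint is exhausted.
 * The wrapper below is hypothetical and maps anonymous memory; the map
 * must not be locked, since vm_map_find() takes the lock itself.
 */
#if 0
static int
example_map_above_hint(vm_map_t map, vm_offset_t hint,
    vm_offset_t default_addr, vm_size_t length, vm_offset_t *addrp)
{

	*addrp = hint;		/* first pass searches upward from the hint */
	return (vm_map_find_min(map, NULL, 0, addrp, length, default_addr,
	    0, VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
	    0));
}
#endif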
2325
2326
/*
2327
* A map entry with any of the following flags set must not be merged with
2328
* another entry.
2329
*/
2330
#define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | \
2331
MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC | \
2332
MAP_ENTRY_STACK_GAP)
2333
2334
static bool
2335
vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2336
{
2337
2338
KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2339
(entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2340
("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2341
prev, entry));
2342
return (prev->end == entry->start &&
2343
prev->object.vm_object == entry->object.vm_object &&
2344
(prev->object.vm_object == NULL ||
2345
prev->offset + (prev->end - prev->start) == entry->offset) &&
2346
prev->eflags == entry->eflags &&
2347
prev->protection == entry->protection &&
2348
prev->max_protection == entry->max_protection &&
2349
prev->inheritance == entry->inheritance &&
2350
prev->wired_count == entry->wired_count &&
2351
prev->cred == entry->cred);
2352
}
2353
2354
static void
2355
vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2356
{
2357
2358
/*
2359
* If the backing object is a vnode object, vm_object_deallocate()
2360
* calls vrele(). However, vrele() does not lock the vnode because
2361
* the vnode has additional references. Thus, the map lock can be
2362
* kept without causing a lock-order reversal with the vnode lock.
2363
*
2364
* Since we count the number of virtual page mappings in
2365
* object->un_pager.vnp.writemappings, the writemappings value
2366
* should not be adjusted when the entry is disposed of.
2367
*/
2368
if (entry->object.vm_object != NULL)
2369
vm_object_deallocate(entry->object.vm_object);
2370
if (entry->cred != NULL)
2371
crfree(entry->cred);
2372
vm_map_entry_dispose(map, entry);
2373
}
2374
2375
/*
2376
* vm_map_try_merge_entries:
2377
*
2378
* Compare two map entries that represent consecutive ranges. If
2379
* the entries can be merged, expand the range of the second to
2380
* cover the range of the first and delete the first. Then return
2381
* the map entry that includes the first range.
2382
*
2383
* The map must be locked.
2384
*/
2385
vm_map_entry_t
2386
vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2387
vm_map_entry_t entry)
2388
{
2389
2390
VM_MAP_ASSERT_LOCKED(map);
2391
if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2392
vm_map_mergeable_neighbors(prev_entry, entry)) {
2393
vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2394
vm_map_merged_neighbor_dispose(map, prev_entry);
2395
return (entry);
2396
}
2397
return (prev_entry);
2398
}
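/*
 * Editorial sketch (not part of the original file): after an operation has
 * clipped and modified a run of entries, callers walk the range and invoke
 * vm_map_try_merge_entries() on each adjacent pair, exactly as the loops
 * later in this file do.  The helper below is hypothetical.
 */
#if 0
static void
example_remerge_range(vm_map_t map, vm_map_entry_t first_entry,
    vm_offset_t end)
{
	vm_map_entry_t entry, prev_entry;

	VM_MAP_ASSERT_LOCKED(map);
	prev_entry = vm_map_entry_pred(first_entry);
	for (entry = first_entry; entry->start < end;
	    prev_entry = entry, entry = vm_map_entry_succ(entry))
		vm_map_try_merge_entries(map, prev_entry, entry);
	/* Handle the pair that straddles "end". */
	vm_map_try_merge_entries(map, prev_entry, entry);
}
#endif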
2399
2400
/*
2401
* vm_map_entry_back:
2402
*
2403
* Allocate an object to back a map entry.
2404
*/
2405
static inline void
2406
vm_map_entry_back(vm_map_entry_t entry)
2407
{
2408
vm_object_t object;
2409
2410
KASSERT(entry->object.vm_object == NULL,
2411
("map entry %p has backing object", entry));
2412
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2413
("map entry %p is a submap", entry));
2414
object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2415
entry->cred, entry->end - entry->start);
2416
entry->object.vm_object = object;
2417
entry->offset = 0;
2418
entry->cred = NULL;
2419
}
2420
2421
/*
2422
* vm_map_entry_charge_object
2423
*
2424
* If there is no object backing this entry, create one. Otherwise, if
2425
* the entry has cred, give it to the backing object.
2426
*/
2427
static inline void
2428
vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2429
{
2430
2431
VM_MAP_ASSERT_LOCKED(map);
2432
KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2433
("map entry %p is a submap", entry));
2434
if (entry->object.vm_object == NULL && !vm_map_is_system(map) &&
2435
(entry->eflags & MAP_ENTRY_GUARD) == 0)
2436
vm_map_entry_back(entry);
2437
else if (entry->object.vm_object != NULL &&
2438
((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2439
entry->cred != NULL) {
2440
VM_OBJECT_WLOCK(entry->object.vm_object);
2441
KASSERT(entry->object.vm_object->cred == NULL,
2442
("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2443
entry->object.vm_object->cred = entry->cred;
2444
entry->object.vm_object->charge = entry->end - entry->start;
2445
VM_OBJECT_WUNLOCK(entry->object.vm_object);
2446
entry->cred = NULL;
2447
}
2448
}
2449
2450
/*
2451
* vm_map_entry_clone
2452
*
2453
* Create a duplicate map entry for clipping.
2454
*/
2455
static vm_map_entry_t
2456
vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2457
{
2458
vm_map_entry_t new_entry;
2459
2460
VM_MAP_ASSERT_LOCKED(map);
2461
2462
/*
2463
* Create a backing object now, if none exists, so that more individual
2464
* objects won't be created after the map entry is split.
2465
*/
2466
vm_map_entry_charge_object(map, entry);
2467
2468
/* Clone the entry. */
2469
new_entry = vm_map_entry_create(map);
2470
*new_entry = *entry;
2471
if (new_entry->cred != NULL)
2472
crhold(entry->cred);
2473
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2474
vm_object_reference(new_entry->object.vm_object);
2475
vm_map_entry_set_vnode_text(new_entry, true);
2476
/*
2477
* The object->un_pager.vnp.writemappings for the object of
2478
* MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2479
* virtual pages are re-distributed among the clipped entries,
2480
* so the sum is left the same.
2481
*/
2482
}
2483
return (new_entry);
2484
}
2485
2486
/*
2487
* vm_map_clip_start: [ internal use only ]
2488
*
2489
* Asserts that the given entry begins at or after
2490
* the specified address; if necessary,
2491
* it splits the entry into two.
2492
*/
2493
static int
2494
vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2495
{
2496
vm_map_entry_t new_entry;
2497
int bdry_idx;
2498
2499
if (!vm_map_is_system(map))
2500
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2501
"%s: map %p entry %p start 0x%jx", __func__, map, entry,
2502
(uintmax_t)startaddr);
2503
2504
if (startaddr <= entry->start)
2505
return (KERN_SUCCESS);
2506
2507
VM_MAP_ASSERT_LOCKED(map);
2508
KASSERT(entry->end > startaddr && entry->start < startaddr,
2509
("%s: invalid clip of entry %p", __func__, entry));
2510
2511
bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2512
if (bdry_idx != 0) {
2513
if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2514
return (KERN_INVALID_ARGUMENT);
2515
}
2516
2517
new_entry = vm_map_entry_clone(map, entry);
2518
2519
/*
2520
* Split off the front portion. Insert the new entry BEFORE this one,
2521
* so that this entry has the specified starting address.
2522
*/
2523
new_entry->end = startaddr;
2524
vm_map_entry_link(map, new_entry);
2525
return (KERN_SUCCESS);
2526
}
2527
2528
/*
2529
* vm_map_lookup_clip_start:
2530
*
2531
* Find the entry at or just after 'start', and clip it if 'start' is in
2532
* the interior of the entry. Return the entry at or after 'start' in
2533
* *res_entry, and set *prev_entry to the entry before 'start'.
2534
*/
2535
static int
2536
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2537
vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2538
{
2539
vm_map_entry_t entry;
2540
int rv;
2541
2542
if (!vm_map_is_system(map))
2543
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2544
"%s: map %p start 0x%jx prev %p", __func__, map,
2545
(uintmax_t)start, prev_entry);
2546
2547
if (vm_map_lookup_entry(map, start, prev_entry)) {
2548
entry = *prev_entry;
2549
rv = vm_map_clip_start(map, entry, start);
2550
if (rv != KERN_SUCCESS)
2551
return (rv);
2552
*prev_entry = vm_map_entry_pred(entry);
2553
} else
2554
entry = vm_map_entry_succ(*prev_entry);
2555
*res_entry = entry;
2556
return (KERN_SUCCESS);
2557
}
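/*
 * Editorial sketch (not part of the original file): a typical range
 * operation bounds itself to [start, end) by clipping the first and last
 * entries that it touches, following the pattern used by vm_map_protect()
 * and vm_map_madvise() below.  The helper is hypothetical and omits the
 * per-entry work that a real operation would do inside the loop.
 */
#if 0
static int
example_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
    vm_map_entry_t *first_entry, vm_map_entry_t *prev_entry)
{
	vm_map_entry_t entry;
	int rv;

	rv = vm_map_lookup_clip_start(map, start, first_entry, prev_entry);
	if (rv != KERN_SUCCESS)
		return (rv);
	for (entry = *first_entry; entry->start < end;
	    entry = vm_map_entry_succ(entry)) {
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS)
			return (rv);
	}
	return (KERN_SUCCESS);
}
#endif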
2558
2559
/*
2560
* vm_map_clip_end: [ internal use only ]
2561
*
2562
* Asserts that the given entry ends at or before
2563
* the specified address; if necessary,
2564
* it splits the entry into two.
2565
*/
2566
static int
2567
vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2568
{
2569
vm_map_entry_t new_entry;
2570
int bdry_idx;
2571
2572
if (!vm_map_is_system(map))
2573
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2574
"%s: map %p entry %p end 0x%jx", __func__, map, entry,
2575
(uintmax_t)endaddr);
2576
2577
if (endaddr >= entry->end)
2578
return (KERN_SUCCESS);
2579
2580
VM_MAP_ASSERT_LOCKED(map);
2581
KASSERT(entry->start < endaddr && entry->end > endaddr,
2582
("%s: invalid clip of entry %p", __func__, entry));
2583
2584
bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2585
if (bdry_idx != 0) {
2586
if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2587
return (KERN_INVALID_ARGUMENT);
2588
}
2589
2590
new_entry = vm_map_entry_clone(map, entry);
2591
2592
/*
2593
* Split off the back portion. Insert the new entry AFTER this one,
2594
* so that this entry has the specified ending address.
2595
*/
2596
new_entry->start = endaddr;
2597
vm_map_entry_link(map, new_entry);
2598
2599
return (KERN_SUCCESS);
2600
}
2601
2602
/*
2603
* vm_map_submap: [ kernel use only ]
2604
*
2605
* Mark the given range as handled by a subordinate map.
2606
*
2607
* This range must have been created with vm_map_find,
2608
* and no other operations may have been performed on this
2609
* range prior to calling vm_map_submap.
2610
*
2611
* Only a limited number of operations can be performed
2612
* within this range after calling vm_map_submap:
2613
* vm_fault
2614
* [Don't try vm_map_copy!]
2615
*
2616
* To remove a submapping, one must first remove the
2617
* range from the superior map, and then destroy the
2618
* submap (if desired). [Better yet, don't try it.]
2619
*/
2620
int
2621
vm_map_submap(
2622
vm_map_t map,
2623
vm_offset_t start,
2624
vm_offset_t end,
2625
vm_map_t submap)
2626
{
2627
vm_map_entry_t entry;
2628
int result;
2629
2630
result = KERN_INVALID_ARGUMENT;
2631
2632
vm_map_lock(submap);
2633
submap->flags |= MAP_IS_SUB_MAP;
2634
vm_map_unlock(submap);
2635
2636
vm_map_lock(map);
2637
VM_MAP_RANGE_CHECK(map, start, end);
2638
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2639
(entry->eflags & MAP_ENTRY_COW) == 0 &&
2640
entry->object.vm_object == NULL) {
2641
result = vm_map_clip_start(map, entry, start);
2642
if (result != KERN_SUCCESS)
2643
goto unlock;
2644
result = vm_map_clip_end(map, entry, end);
2645
if (result != KERN_SUCCESS)
2646
goto unlock;
2647
entry->object.sub_map = submap;
2648
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2649
result = KERN_SUCCESS;
2650
}
2651
unlock:
2652
vm_map_unlock(map);
2653
2654
if (result != KERN_SUCCESS) {
2655
vm_map_lock(submap);
2656
submap->flags &= ~MAP_IS_SUB_MAP;
2657
vm_map_unlock(submap);
2658
}
2659
return (result);
2660
}
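/*
 * Editorial sketch (not part of the original file): per the constraints
 * above, a submap range is first reserved in the parent map with
 * vm_map_find() and a NULL backing object, then handed to vm_map_submap().
 * The submap pointer and sizes below are hypothetical.
 */
#if 0
static int
example_install_submap(vm_map_t parent, vm_map_t submap, vm_offset_t *addr,
    vm_size_t size)
{
	int rv;

	rv = vm_map_find(parent, NULL, 0, addr, size, 0, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (rv != KERN_SUCCESS)
		return (rv);
	return (vm_map_submap(parent, *addr, *addr + size, submap));
}
#endif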
2661
2662
/*
2663
* The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2664
*/
2665
#define MAX_INIT_PT 96
2666
2667
/*
2668
* vm_map_pmap_enter:
2669
*
2670
* Preload the specified map's pmap with mappings to the specified
2671
* object's memory-resident pages. No further physical pages are
2672
* allocated, and no further virtual pages are retrieved from secondary
2673
* storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
2674
* limited number of page mappings are created at the low-end of the
2675
* specified address range. (For this purpose, a superpage mapping
2676
* counts as one page mapping.) Otherwise, all resident pages within
2677
* the specified address range are mapped.
2678
*/
2679
static void
2680
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2681
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2682
{
2683
struct pctrie_iter pages;
2684
vm_offset_t start;
2685
vm_page_t p, p_start;
2686
vm_pindex_t jump, mask, psize, threshold, tmpidx;
2687
int psind;
2688
2689
if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2690
return;
2691
if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2692
VM_OBJECT_WLOCK(object);
2693
if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2694
pmap_object_init_pt(map->pmap, addr, object, pindex,
2695
size);
2696
VM_OBJECT_WUNLOCK(object);
2697
return;
2698
}
2699
VM_OBJECT_LOCK_DOWNGRADE(object);
2700
} else
2701
VM_OBJECT_RLOCK(object);
2702
2703
psize = atop(size);
2704
if (psize + pindex > object->size) {
2705
if (pindex >= object->size) {
2706
VM_OBJECT_RUNLOCK(object);
2707
return;
2708
}
2709
psize = object->size - pindex;
2710
}
2711
2712
start = 0;
2713
p_start = NULL;
2714
threshold = MAX_INIT_PT;
2715
2716
vm_page_iter_limit_init(&pages, object, pindex + psize);
2717
for (p = vm_radix_iter_lookup_ge(&pages, pindex); p != NULL;
2718
p = vm_radix_iter_jump(&pages, jump)) {
2719
/*
2720
* don't let a prefault triggered by madvise consume the last
2721
* free pages by allocating pv entries.
2722
*/
2723
tmpidx = p->pindex - pindex;
2724
if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2725
vm_page_count_severe()) ||
2726
((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2727
tmpidx >= threshold)) {
2728
psize = tmpidx;
2729
break;
2730
}
2731
jump = 1;
2732
if (vm_page_all_valid(p)) {
2733
if (p_start == NULL) {
2734
start = addr + ptoa(tmpidx);
2735
p_start = p;
2736
}
2737
/* Jump ahead if a superpage mapping is possible. */
2738
for (psind = p->psind; psind > 0; psind--) {
2739
if (((addr + ptoa(tmpidx)) &
2740
(pagesizes[psind] - 1)) == 0) {
2741
mask = atop(pagesizes[psind]) - 1;
2742
if (tmpidx + mask < psize &&
2743
vm_page_ps_test(p, psind,
2744
PS_ALL_VALID, NULL)) {
2745
jump += mask;
2746
threshold += mask;
2747
break;
2748
}
2749
}
2750
}
2751
} else if (p_start != NULL) {
2752
pmap_enter_object(map->pmap, start, addr +
2753
ptoa(tmpidx), p_start, prot);
2754
p_start = NULL;
2755
}
2756
}
2757
if (p_start != NULL)
2758
pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2759
p_start, prot);
2760
VM_OBJECT_RUNLOCK(object);
2761
}
2762
2763
static void
2764
vm_map_protect_guard(vm_map_entry_t entry, vm_prot_t new_prot,
2765
vm_prot_t new_maxprot, int flags)
2766
{
2767
vm_prot_t old_prot;
2768
2769
MPASS((entry->eflags & MAP_ENTRY_GUARD) != 0);
2770
if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
2771
return;
2772
2773
old_prot = PROT_EXTRACT(entry->offset);
2774
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2775
entry->offset = PROT_MAX(new_maxprot) |
2776
(new_maxprot & old_prot);
2777
}
2778
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0) {
2779
entry->offset = new_prot | PROT_MAX(
2780
PROT_MAX_EXTRACT(entry->offset));
2781
}
2782
}
2783
2784
/*
2785
* vm_map_protect:
2786
*
2787
* Sets the protection and/or the maximum protection of the
2788
* specified address region in the target map.
2789
*/
2790
int
2791
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2792
vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2793
{
2794
vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2795
vm_object_t obj;
2796
struct ucred *cred;
2797
vm_offset_t orig_start;
2798
vm_prot_t check_prot, max_prot, old_prot;
2799
int rv;
2800
2801
if (start == end)
2802
return (KERN_SUCCESS);
2803
2804
if (CONTAINS_BITS(flags, VM_MAP_PROTECT_SET_PROT |
2805
VM_MAP_PROTECT_SET_MAXPROT) &&
2806
!CONTAINS_BITS(new_maxprot, new_prot))
2807
return (KERN_OUT_OF_BOUNDS);
2808
2809
orig_start = start;
2810
again:
2811
in_tran = NULL;
2812
start = orig_start;
2813
vm_map_lock(map);
2814
2815
if ((map->flags & MAP_WXORX) != 0 &&
2816
(flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2817
CONTAINS_BITS(new_prot, VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2818
vm_map_unlock(map);
2819
return (KERN_PROTECTION_FAILURE);
2820
}
2821
2822
/*
2823
* Ensure that we are not concurrently wiring pages. vm_map_wire() may
2824
* need to fault pages into the map and will drop the map lock while
2825
* doing so, and the VM object may end up in an inconsistent state if we
2826
* update the protection on the map entry in between faults.
2827
*/
2828
vm_map_wait_busy(map);
2829
2830
VM_MAP_RANGE_CHECK(map, start, end);
2831
2832
if (!vm_map_lookup_entry(map, start, &first_entry))
2833
first_entry = vm_map_entry_succ(first_entry);
2834
2835
if ((flags & VM_MAP_PROTECT_GROWSDOWN) != 0 &&
2836
(first_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
2837
/*
2838
* Handle Linux's PROT_GROWSDOWN flag.
2839
* It means that protection is applied down to the
2840
* whole stack, including the specified range of the
2841
* mapped region, and the grow down region (AKA
2842
* guard).
2843
*/
2844
while (!CONTAINS_BITS(first_entry->eflags,
2845
MAP_ENTRY_GUARD | MAP_ENTRY_STACK_GAP) &&
2846
first_entry != vm_map_entry_first(map))
2847
first_entry = vm_map_entry_pred(first_entry);
2848
start = first_entry->start;
2849
}
2850
2851
/*
2852
* Make a first pass to check for protection violations.
2853
*/
2854
check_prot = 0;
2855
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2856
check_prot |= new_prot;
2857
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0)
2858
check_prot |= new_maxprot;
2859
for (entry = first_entry; entry->start < end;
2860
entry = vm_map_entry_succ(entry)) {
2861
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2862
vm_map_unlock(map);
2863
return (KERN_INVALID_ARGUMENT);
2864
}
2865
if ((entry->eflags & (MAP_ENTRY_GUARD |
2866
MAP_ENTRY_STACK_GAP)) == MAP_ENTRY_GUARD)
2867
continue;
2868
max_prot = (entry->eflags & MAP_ENTRY_STACK_GAP) != 0 ?
2869
PROT_MAX_EXTRACT(entry->offset) : entry->max_protection;
2870
if (!CONTAINS_BITS(max_prot, check_prot)) {
2871
vm_map_unlock(map);
2872
return (KERN_PROTECTION_FAILURE);
2873
}
2874
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2875
in_tran = entry;
2876
}
2877
2878
/*
2879
* Postpone the operation until all in-transition map entries have
2880
* stabilized. An in-transition entry might already have its pages
2881
* wired and wired_count incremented, but not yet have its
2882
* MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
2883
* vm_fault_copy_entry() in the final loop below.
2884
*/
2885
if (in_tran != NULL) {
2886
in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2887
vm_map_unlock_and_wait(map, 0);
2888
goto again;
2889
}
2890
2891
/*
2892
* Before changing the protections, try to reserve swap space for any
2893
* private (i.e., copy-on-write) mappings that are transitioning from
2894
* read-only to read/write access. If a reservation fails, break out
2895
* of this loop early and let the next loop simplify the entries, since
2896
* some may now be mergeable.
2897
*/
2898
rv = vm_map_clip_start(map, first_entry, start);
2899
if (rv != KERN_SUCCESS) {
2900
vm_map_unlock(map);
2901
return (rv);
2902
}
2903
for (entry = first_entry; entry->start < end;
2904
entry = vm_map_entry_succ(entry)) {
2905
rv = vm_map_clip_end(map, entry, end);
2906
if (rv != KERN_SUCCESS) {
2907
vm_map_unlock(map);
2908
return (rv);
2909
}
2910
2911
if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2912
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2913
ENTRY_CHARGED(entry) ||
2914
(entry->eflags & MAP_ENTRY_GUARD) != 0)
2915
continue;
2916
2917
cred = curthread->td_ucred;
2918
obj = entry->object.vm_object;
2919
2920
if (obj == NULL ||
2921
(entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2922
if (!swap_reserve(entry->end - entry->start)) {
2923
rv = KERN_RESOURCE_SHORTAGE;
2924
end = entry->end;
2925
break;
2926
}
2927
crhold(cred);
2928
entry->cred = cred;
2929
continue;
2930
}
2931
2932
VM_OBJECT_WLOCK(obj);
2933
if ((obj->flags & OBJ_SWAP) == 0) {
2934
VM_OBJECT_WUNLOCK(obj);
2935
continue;
2936
}
2937
2938
/*
2939
* Charge for the whole object allocation now, since
2940
* we cannot distinguish between non-charged and
2941
* charged clipped mapping of the same object later.
2942
*/
2943
KASSERT(obj->charge == 0,
2944
("vm_map_protect: object %p overcharged (entry %p)",
2945
obj, entry));
2946
if (!swap_reserve(ptoa(obj->size))) {
2947
VM_OBJECT_WUNLOCK(obj);
2948
rv = KERN_RESOURCE_SHORTAGE;
2949
end = entry->end;
2950
break;
2951
}
2952
2953
crhold(cred);
2954
obj->cred = cred;
2955
obj->charge = ptoa(obj->size);
2956
VM_OBJECT_WUNLOCK(obj);
2957
}
2958
2959
/*
2960
* If enough swap space was available, go back and fix up protections.
2961
* Otherwise, just simplify entries, since some may have been modified.
2962
* [Note that clipping is not necessary the second time.]
2963
*/
2964
for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2965
entry->start < end;
2966
vm_map_try_merge_entries(map, prev_entry, entry),
2967
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2968
if (rv != KERN_SUCCESS)
2969
continue;
2970
2971
if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2972
vm_map_protect_guard(entry, new_prot, new_maxprot,
2973
flags);
2974
continue;
2975
}
2976
2977
old_prot = entry->protection;
2978
2979
if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2980
entry->max_protection = new_maxprot;
2981
entry->protection = new_maxprot & old_prot;
2982
}
2983
if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2984
entry->protection = new_prot;
2985
2986
/*
2987
* For user wired map entries, the normal lazy evaluation of
2988
* write access upgrades through soft page faults is
2989
* undesirable. Instead, immediately copy any pages that are
2990
* copy-on-write and enable write access in the physical map.
2991
*/
2992
if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2993
(entry->protection & VM_PROT_WRITE) != 0 &&
2994
(old_prot & VM_PROT_WRITE) == 0)
2995
vm_fault_copy_entry(map, map, entry, entry, NULL);
2996
2997
/*
2998
* When restricting access, update the physical map. Worry
2999
* about copy-on-write here.
3000
*/
3001
if ((old_prot & ~entry->protection) != 0) {
3002
#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
3003
VM_PROT_ALL)
3004
pmap_protect(map->pmap, entry->start,
3005
entry->end,
3006
entry->protection & MASK(entry));
3007
#undef MASK
3008
}
3009
}
3010
vm_map_try_merge_entries(map, prev_entry, entry);
3011
vm_map_unlock(map);
3012
return (rv);
3013
}
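/*
 * Editorial sketch (not part of the original file): callers choose what to
 * change with VM_MAP_PROTECT_SET_PROT and/or VM_MAP_PROTECT_SET_MAXPROT.
 * The hypothetical wrapper below only tightens the current protection to
 * read-only and leaves the maximum protection alone.
 */
#if 0
static int
example_make_readonly(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	/* new_maxprot is ignored because SET_MAXPROT is not passed. */
	return (vm_map_protect(map, start, end, VM_PROT_READ, 0,
	    VM_MAP_PROTECT_SET_PROT));
}
#endif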
3014
3015
/*
3016
* vm_map_madvise:
3017
*
3018
* This routine traverses a process's map, handling the madvise
3019
* system call. Advisories are classified as either those affecting
3020
* the vm_map_entry structure, or those affecting the underlying
3021
* objects.
3022
*/
3023
int
3024
vm_map_madvise(
3025
vm_map_t map,
3026
vm_offset_t start,
3027
vm_offset_t end,
3028
int behav)
3029
{
3030
vm_map_entry_t entry, prev_entry;
3031
int rv;
3032
bool modify_map;
3033
3034
/*
3035
* Some madvise calls directly modify the vm_map_entry, in which case
3036
* we need to use an exclusive lock on the map and we need to perform
3037
* various clipping operations. Otherwise we only need a read-lock
3038
* on the map.
3039
*/
3040
switch(behav) {
3041
case MADV_NORMAL:
3042
case MADV_SEQUENTIAL:
3043
case MADV_RANDOM:
3044
case MADV_NOSYNC:
3045
case MADV_AUTOSYNC:
3046
case MADV_NOCORE:
3047
case MADV_CORE:
3048
if (start == end)
3049
return (0);
3050
modify_map = true;
3051
vm_map_lock(map);
3052
break;
3053
case MADV_WILLNEED:
3054
case MADV_DONTNEED:
3055
case MADV_FREE:
3056
if (start == end)
3057
return (0);
3058
modify_map = false;
3059
vm_map_lock_read(map);
3060
break;
3061
default:
3062
return (EINVAL);
3063
}
3064
3065
/*
3066
* Locate starting entry and clip if necessary.
3067
*/
3068
VM_MAP_RANGE_CHECK(map, start, end);
3069
3070
if (modify_map) {
3071
/*
3072
* madvise behaviors that are implemented in the vm_map_entry.
3073
*
3074
* We clip the vm_map_entry so that behavioral changes are
3075
* limited to the specified address range.
3076
*/
3077
rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3078
if (rv != KERN_SUCCESS) {
3079
vm_map_unlock(map);
3080
return (vm_mmap_to_errno(rv));
3081
}
3082
3083
for (; entry->start < end; prev_entry = entry,
3084
entry = vm_map_entry_succ(entry)) {
3085
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3086
continue;
3087
3088
rv = vm_map_clip_end(map, entry, end);
3089
if (rv != KERN_SUCCESS) {
3090
vm_map_unlock(map);
3091
return (vm_mmap_to_errno(rv));
3092
}
3093
3094
switch (behav) {
3095
case MADV_NORMAL:
3096
vm_map_entry_set_behavior(entry,
3097
MAP_ENTRY_BEHAV_NORMAL);
3098
break;
3099
case MADV_SEQUENTIAL:
3100
vm_map_entry_set_behavior(entry,
3101
MAP_ENTRY_BEHAV_SEQUENTIAL);
3102
break;
3103
case MADV_RANDOM:
3104
vm_map_entry_set_behavior(entry,
3105
MAP_ENTRY_BEHAV_RANDOM);
3106
break;
3107
case MADV_NOSYNC:
3108
entry->eflags |= MAP_ENTRY_NOSYNC;
3109
break;
3110
case MADV_AUTOSYNC:
3111
entry->eflags &= ~MAP_ENTRY_NOSYNC;
3112
break;
3113
case MADV_NOCORE:
3114
entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3115
break;
3116
case MADV_CORE:
3117
entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3118
break;
3119
default:
3120
break;
3121
}
3122
vm_map_try_merge_entries(map, prev_entry, entry);
3123
}
3124
vm_map_try_merge_entries(map, prev_entry, entry);
3125
vm_map_unlock(map);
3126
} else {
3127
vm_pindex_t pstart, pend;
3128
3129
/*
3130
* madvise behaviors that are implemented in the underlying
3131
* vm_object.
3132
*
3133
* Since we don't clip the vm_map_entry, we have to clip
3134
* the vm_object pindex and count.
3135
*/
3136
if (!vm_map_lookup_entry(map, start, &entry))
3137
entry = vm_map_entry_succ(entry);
3138
for (; entry->start < end;
3139
entry = vm_map_entry_succ(entry)) {
3140
vm_offset_t useEnd, useStart;
3141
3142
if ((entry->eflags & (MAP_ENTRY_IS_SUB_MAP |
3143
MAP_ENTRY_GUARD)) != 0)
3144
continue;
3145
3146
/*
3147
* MADV_FREE would otherwise rewind time to
3148
* the creation of the shadow object. Because
3149
* we hold the VM map read-locked, neither the
3150
* entry's object nor the presence of a
3151
* backing object can change.
3152
*/
3153
if (behav == MADV_FREE &&
3154
entry->object.vm_object != NULL &&
3155
entry->object.vm_object->backing_object != NULL)
3156
continue;
3157
3158
pstart = OFF_TO_IDX(entry->offset);
3159
pend = pstart + atop(entry->end - entry->start);
3160
useStart = entry->start;
3161
useEnd = entry->end;
3162
3163
if (entry->start < start) {
3164
pstart += atop(start - entry->start);
3165
useStart = start;
3166
}
3167
if (entry->end > end) {
3168
pend -= atop(entry->end - end);
3169
useEnd = end;
3170
}
3171
3172
if (pstart >= pend)
3173
continue;
3174
3175
/*
3176
* Perform the pmap_advise() before clearing
3177
* PGA_REFERENCED in vm_page_advise(). Otherwise, a
3178
* concurrent pmap operation, such as pmap_remove(),
3179
* could clear a reference in the pmap and set
3180
* PGA_REFERENCED on the page before the pmap_advise()
3181
* had completed. Consequently, the page would appear
3182
* referenced based upon an old reference that
3183
* occurred before this pmap_advise() ran.
3184
*/
3185
if (behav == MADV_DONTNEED || behav == MADV_FREE)
3186
pmap_advise(map->pmap, useStart, useEnd,
3187
behav);
3188
3189
vm_object_madvise(entry->object.vm_object, pstart,
3190
pend, behav);
3191
3192
/*
3193
* Pre-populate paging structures in the
3194
* WILLNEED case. For wired entries, the
3195
* paging structures are already populated.
3196
*/
3197
if (behav == MADV_WILLNEED &&
3198
entry->wired_count == 0) {
3199
vm_map_pmap_enter(map,
3200
useStart,
3201
entry->protection,
3202
entry->object.vm_object,
3203
pstart,
3204
ptoa(pend - pstart),
3205
MAP_PREFAULT_MADVISE
3206
);
3207
}
3208
}
3209
vm_map_unlock_read(map);
3210
}
3211
return (0);
3212
}
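/*
 * Editorial sketch (not part of the original file): unlike most functions
 * in this file, vm_map_madvise() returns errno values (0, EINVAL, or a
 * translated KERN_* code) rather than KERN_* directly.  The hypothetical
 * wrapper below issues object-level advice, which only takes the map's
 * read lock.
 */
#if 0
static int
example_free_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_madvise(map, start, end, MADV_FREE));
}
#endif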
3213
3214
/*
3215
* vm_map_inherit:
3216
*
3217
* Sets the inheritance of the specified address
3218
* range in the target map. Inheritance
3219
* affects how the map will be shared with
3220
* child maps at the time of vmspace_fork.
3221
*/
3222
int
3223
vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3224
vm_inherit_t new_inheritance)
3225
{
3226
vm_map_entry_t entry, lentry, prev_entry, start_entry;
3227
int rv;
3228
3229
switch (new_inheritance) {
3230
case VM_INHERIT_NONE:
3231
case VM_INHERIT_COPY:
3232
case VM_INHERIT_SHARE:
3233
case VM_INHERIT_ZERO:
3234
break;
3235
default:
3236
return (KERN_INVALID_ARGUMENT);
3237
}
3238
if (start == end)
3239
return (KERN_SUCCESS);
3240
vm_map_lock(map);
3241
VM_MAP_RANGE_CHECK(map, start, end);
3242
rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3243
if (rv != KERN_SUCCESS)
3244
goto unlock;
3245
if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3246
rv = vm_map_clip_end(map, lentry, end);
3247
if (rv != KERN_SUCCESS)
3248
goto unlock;
3249
}
3250
if (new_inheritance == VM_INHERIT_COPY) {
3251
for (entry = start_entry; entry->start < end;
3252
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3253
if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3254
!= 0) {
3255
rv = KERN_INVALID_ARGUMENT;
3256
goto unlock;
3257
}
3258
}
3259
}
3260
for (entry = start_entry; entry->start < end; prev_entry = entry,
3261
entry = vm_map_entry_succ(entry)) {
3262
KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3263
entry, (uintmax_t)entry->end, (uintmax_t)end));
3264
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3265
new_inheritance != VM_INHERIT_ZERO)
3266
entry->inheritance = new_inheritance;
3267
vm_map_try_merge_entries(map, prev_entry, entry);
3268
}
3269
vm_map_try_merge_entries(map, prev_entry, entry);
3270
unlock:
3271
vm_map_unlock(map);
3272
return (rv);
3273
}
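/*
 * Editorial sketch (not part of the original file): marking a range
 * VM_INHERIT_SHARE makes vmspace_fork() share it with the child instead of
 * copying it.  Hypothetical wrapper:
 */
#if 0
static int
example_share_on_fork(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_inherit(map, start, end, VM_INHERIT_SHARE));
}
#endif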
3274
3275
/*
3276
* vm_map_entry_in_transition:
3277
*
3278
* Release the map lock, and sleep until the entry is no longer in
3279
* transition. Then reacquire the map lock. If the map changed while
3280
* another thread held the lock, look up a possibly-changed entry at or after the
3281
* 'start' position of the old entry.
3282
*/
3283
static vm_map_entry_t
3284
vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3285
vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3286
{
3287
vm_map_entry_t entry;
3288
vm_offset_t start;
3289
u_int last_timestamp;
3290
3291
VM_MAP_ASSERT_LOCKED(map);
3292
KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3293
("not in-tranition map entry %p", in_entry));
3294
/*
3295
* We have not yet clipped the entry.
3296
*/
3297
start = MAX(in_start, in_entry->start);
3298
in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3299
last_timestamp = map->timestamp;
3300
if (vm_map_unlock_and_wait(map, 0)) {
3301
/*
3302
* Allow interruption of user wiring/unwiring?
3303
*/
3304
}
3305
vm_map_lock(map);
3306
if (last_timestamp + 1 == map->timestamp)
3307
return (in_entry);
3308
3309
/*
3310
* Look again for the entry because the map was modified while it was
3311
* unlocked. Specifically, the entry may have been clipped, merged, or
3312
* deleted.
3313
*/
3314
if (!vm_map_lookup_entry(map, start, &entry)) {
3315
if (!holes_ok) {
3316
*io_end = start;
3317
return (NULL);
3318
}
3319
entry = vm_map_entry_succ(entry);
3320
}
3321
return (entry);
3322
}
3323
3324
/*
3325
* vm_map_unwire:
3326
*
3327
* Implements both kernel and user unwiring.
3328
*/
3329
int
3330
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3331
int flags)
3332
{
3333
vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3334
int rv;
3335
bool holes_ok, need_wakeup, user_unwire;
3336
3337
if (start == end)
3338
return (KERN_SUCCESS);
3339
holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3340
user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3341
vm_map_lock(map);
3342
VM_MAP_RANGE_CHECK(map, start, end);
3343
if (!vm_map_lookup_entry(map, start, &first_entry)) {
3344
if (holes_ok)
3345
first_entry = vm_map_entry_succ(first_entry);
3346
else {
3347
vm_map_unlock(map);
3348
return (KERN_INVALID_ADDRESS);
3349
}
3350
}
3351
rv = KERN_SUCCESS;
3352
for (entry = first_entry; entry->start < end; entry = next_entry) {
3353
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3354
/*
3355
* We have not yet clipped the entry.
3356
*/
3357
next_entry = vm_map_entry_in_transition(map, start,
3358
&end, holes_ok, entry);
3359
if (next_entry == NULL) {
3360
if (entry == first_entry) {
3361
vm_map_unlock(map);
3362
return (KERN_INVALID_ADDRESS);
3363
}
3364
rv = KERN_INVALID_ADDRESS;
3365
break;
3366
}
3367
first_entry = (entry == first_entry) ?
3368
next_entry : NULL;
3369
continue;
3370
}
3371
rv = vm_map_clip_start(map, entry, start);
3372
if (rv != KERN_SUCCESS)
3373
break;
3374
rv = vm_map_clip_end(map, entry, end);
3375
if (rv != KERN_SUCCESS)
3376
break;
3377
3378
/*
3379
* Mark the entry in case the map lock is released. (See
3380
* above.)
3381
*/
3382
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3383
entry->wiring_thread == NULL,
3384
("owned map entry %p", entry));
3385
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3386
entry->wiring_thread = curthread;
3387
next_entry = vm_map_entry_succ(entry);
3388
/*
3389
* Check the map for holes in the specified region.
3390
* If holes_ok, skip this check.
3391
*/
3392
if (!holes_ok &&
3393
entry->end < end && next_entry->start > entry->end) {
3394
end = entry->end;
3395
rv = KERN_INVALID_ADDRESS;
3396
break;
3397
}
3398
/*
3399
* If system unwiring, require that the entry is system wired.
3400
*/
3401
if (!user_unwire &&
3402
vm_map_entry_system_wired_count(entry) == 0) {
3403
end = entry->end;
3404
rv = KERN_INVALID_ARGUMENT;
3405
break;
3406
}
3407
}
3408
need_wakeup = false;
3409
if (first_entry == NULL &&
3410
!vm_map_lookup_entry(map, start, &first_entry)) {
3411
KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3412
prev_entry = first_entry;
3413
entry = vm_map_entry_succ(first_entry);
3414
} else {
3415
prev_entry = vm_map_entry_pred(first_entry);
3416
entry = first_entry;
3417
}
3418
for (; entry->start < end;
3419
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3420
/*
3421
* If holes_ok was specified, an empty
3422
* space in the unwired region could have been mapped
3423
* while the map lock was dropped for draining
3424
* MAP_ENTRY_IN_TRANSITION. Moreover, another thread
3425
* could be simultaneously wiring this new mapping
3426
* entry. Detect these cases and skip any entries
3427
* not marked as in transition by us.
3428
*/
3429
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3430
entry->wiring_thread != curthread) {
3431
KASSERT(holes_ok,
3432
("vm_map_unwire: !HOLESOK and new/changed entry"));
3433
continue;
3434
}
3435
3436
if (rv == KERN_SUCCESS && (!user_unwire ||
3437
(entry->eflags & MAP_ENTRY_USER_WIRED))) {
3438
if (entry->wired_count == 1)
3439
vm_map_entry_unwire(map, entry);
3440
else
3441
entry->wired_count--;
3442
if (user_unwire)
3443
entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3444
}
3445
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3446
("vm_map_unwire: in-transition flag missing %p", entry));
3447
KASSERT(entry->wiring_thread == curthread,
3448
("vm_map_unwire: alien wire %p", entry));
3449
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3450
entry->wiring_thread = NULL;
3451
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3452
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3453
need_wakeup = true;
3454
}
3455
vm_map_try_merge_entries(map, prev_entry, entry);
3456
}
3457
vm_map_try_merge_entries(map, prev_entry, entry);
3458
vm_map_unlock(map);
3459
if (need_wakeup)
3460
vm_map_wakeup(map);
3461
return (rv);
3462
}
3463
3464
static void
3465
vm_map_wire_user_count_sub(u_long npages)
3466
{
3467
3468
atomic_subtract_long(&vm_user_wire_count, npages);
3469
}
3470
3471
static bool
3472
vm_map_wire_user_count_add(u_long npages)
3473
{
3474
u_long wired;
3475
3476
wired = vm_user_wire_count;
3477
do {
3478
if (npages + wired > vm_page_max_user_wired)
3479
return (false);
3480
} while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3481
npages + wired));
3482
3483
return (true);
3484
}
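/*
 * Editorial sketch (not part of the original file):
 * vm_map_wire_user_count_add() above is the usual atomic(9) fcmpset retry
 * loop for a bounded increment.  The same shape works for any limited
 * global counter; the names below are hypothetical.
 */
#if 0
static bool
example_bounded_add(volatile u_long *counter, u_long limit, u_long n)
{
	u_long old;

	old = *counter;
	do {
		if (old + n > limit)
			return (false);	/* would exceed the limit */
	} while (!atomic_fcmpset_long(counter, &old, old + n));
	return (true);
}
#endif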
3485
3486
/*
3487
* vm_map_wire_entry_failure:
3488
*
3489
* Handle a wiring failure on the given entry.
3490
*
3491
* The map should be locked.
3492
*/
3493
static void
3494
vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3495
vm_offset_t failed_addr)
3496
{
3497
3498
VM_MAP_ASSERT_LOCKED(map);
3499
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3500
entry->wired_count == 1,
3501
("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3502
KASSERT(failed_addr < entry->end,
3503
("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3504
3505
/*
3506
* If any pages at the start of this entry were successfully wired,
3507
* then unwire them.
3508
*/
3509
if (failed_addr > entry->start) {
3510
pmap_unwire(map->pmap, entry->start, failed_addr);
3511
vm_object_unwire(entry->object.vm_object, entry->offset,
3512
failed_addr - entry->start, PQ_ACTIVE);
3513
}
3514
3515
/*
3516
* Assign an out-of-range value to represent the failure to wire this
3517
* entry.
3518
*/
3519
entry->wired_count = -1;
3520
}
3521
3522
int
3523
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3524
{
3525
int rv;
3526
3527
vm_map_lock(map);
3528
rv = vm_map_wire_locked(map, start, end, flags);
3529
vm_map_unlock(map);
3530
return (rv);
3531
}
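/*
 * Editorial sketch (not part of the original file): user (mlock-style)
 * wiring and the matching unwire are driven by the same flag word;
 * VM_MAP_WIRE_HOLESOK tolerates unmapped gaps inside the range.  The
 * wrapper below is hypothetical.
 */
#if 0
static int
example_user_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
    bool wire)
{
	int flags;

	flags = VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK;
	return (wire ? vm_map_wire(map, start, end, flags) :
	    vm_map_unwire(map, start, end, flags));
}
#endif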
3532
3533
/*
3534
* vm_map_wire_locked:
3535
*
3536
* Implements both kernel and user wiring. Returns with the map locked;
3538
* the map lock may be dropped and reacquired during the call.
3538
*/
3539
int
3540
vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3541
{
3542
vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3543
vm_offset_t faddr, saved_end, saved_start;
3544
u_long incr, npages;
3545
u_int bidx, last_timestamp;
3546
int rv;
3547
bool holes_ok, need_wakeup, user_wire;
3548
vm_prot_t prot;
3549
3550
VM_MAP_ASSERT_LOCKED(map);
3551
3552
if (start == end)
3553
return (KERN_SUCCESS);
3554
prot = 0;
3555
if (flags & VM_MAP_WIRE_WRITE)
3556
prot |= VM_PROT_WRITE;
3557
holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3558
user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3559
VM_MAP_RANGE_CHECK(map, start, end);
3560
if (!vm_map_lookup_entry(map, start, &first_entry)) {
3561
if (holes_ok)
3562
first_entry = vm_map_entry_succ(first_entry);
3563
else
3564
return (KERN_INVALID_ADDRESS);
3565
}
3566
for (entry = first_entry; entry->start < end; entry = next_entry) {
3567
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3568
/*
3569
* We have not yet clipped the entry.
3570
*/
3571
next_entry = vm_map_entry_in_transition(map, start,
3572
&end, holes_ok, entry);
3573
if (next_entry == NULL) {
3574
if (entry == first_entry)
3575
return (KERN_INVALID_ADDRESS);
3576
rv = KERN_INVALID_ADDRESS;
3577
goto done;
3578
}
3579
first_entry = (entry == first_entry) ?
3580
next_entry : NULL;
3581
continue;
3582
}
3583
rv = vm_map_clip_start(map, entry, start);
3584
if (rv != KERN_SUCCESS)
3585
goto done;
3586
rv = vm_map_clip_end(map, entry, end);
3587
if (rv != KERN_SUCCESS)
3588
goto done;
3589
3590
/*
3591
* Mark the entry in case the map lock is released. (See
3592
* above.)
3593
*/
3594
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3595
entry->wiring_thread == NULL,
3596
("owned map entry %p", entry));
3597
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3598
entry->wiring_thread = curthread;
3599
if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3600
|| (entry->protection & prot) != prot) {
3601
entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3602
if (!holes_ok) {
3603
end = entry->end;
3604
rv = KERN_INVALID_ADDRESS;
3605
goto done;
3606
}
3607
} else if (entry->wired_count == 0) {
3608
entry->wired_count++;
3609
3610
npages = atop(entry->end - entry->start);
3611
if (user_wire && !vm_map_wire_user_count_add(npages)) {
3612
vm_map_wire_entry_failure(map, entry,
3613
entry->start);
3614
end = entry->end;
3615
rv = KERN_RESOURCE_SHORTAGE;
3616
goto done;
3617
}
3618
3619
/*
3620
* Release the map lock, relying on the in-transition
3621
* mark. Mark the map busy for fork.
3622
*/
3623
saved_start = entry->start;
3624
saved_end = entry->end;
3625
last_timestamp = map->timestamp;
3626
bidx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3627
incr = pagesizes[bidx];
3628
vm_map_busy(map);
3629
vm_map_unlock(map);
3630
3631
for (faddr = saved_start; faddr < saved_end;
3632
faddr += incr) {
3633
/*
3634
* Simulate a fault to get the page and enter
3635
* it into the physical map.
3636
*/
3637
rv = vm_fault(map, faddr, VM_PROT_NONE,
3638
VM_FAULT_WIRE, NULL);
3639
if (rv != KERN_SUCCESS)
3640
break;
3641
}
3642
vm_map_lock(map);
3643
vm_map_unbusy(map);
3644
if (last_timestamp + 1 != map->timestamp) {
3645
/*
3646
* Look again for the entry because the map was
3647
* modified while it was unlocked. The entry
3648
* may have been clipped, but NOT merged or
3649
* deleted.
3650
*/
3651
if (!vm_map_lookup_entry(map, saved_start,
3652
&next_entry))
3653
KASSERT(false,
3654
("vm_map_wire: lookup failed"));
3655
first_entry = (entry == first_entry) ?
3656
next_entry : NULL;
3657
for (entry = next_entry; entry->end < saved_end;
3658
entry = vm_map_entry_succ(entry)) {
3659
/*
3660
* In case of failure, handle entries
3661
* that were not fully wired here;
3662
* fully wired entries are handled
3663
* later.
3664
*/
3665
if (rv != KERN_SUCCESS &&
3666
faddr < entry->end)
3667
vm_map_wire_entry_failure(map,
3668
entry, faddr);
3669
}
3670
}
3671
if (rv != KERN_SUCCESS) {
3672
vm_map_wire_entry_failure(map, entry, faddr);
3673
if (user_wire)
3674
vm_map_wire_user_count_sub(npages);
3675
end = entry->end;
3676
goto done;
3677
}
3678
} else if (!user_wire ||
3679
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3680
entry->wired_count++;
3681
}
3682
/*
3683
* Check the map for holes in the specified region.
3684
* If holes_ok was specified, skip this check.
3685
*/
3686
next_entry = vm_map_entry_succ(entry);
3687
if (!holes_ok &&
3688
entry->end < end && next_entry->start > entry->end) {
3689
end = entry->end;
3690
rv = KERN_INVALID_ADDRESS;
3691
goto done;
3692
}
3693
}
3694
rv = KERN_SUCCESS;
3695
done:
3696
need_wakeup = false;
3697
if (first_entry == NULL &&
3698
!vm_map_lookup_entry(map, start, &first_entry)) {
3699
KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3700
prev_entry = first_entry;
3701
entry = vm_map_entry_succ(first_entry);
3702
} else {
3703
prev_entry = vm_map_entry_pred(first_entry);
3704
entry = first_entry;
3705
}
3706
for (; entry->start < end;
3707
prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3708
/*
3709
* If holes_ok was specified, an empty
3710
* space in the unwired region could have been mapped
3711
* while the map lock was dropped for faulting in the
3712
* pages or draining MAP_ENTRY_IN_TRANSITION.
3713
* Moreover, another thread could be simultaneously
3714
* wiring this new mapping entry. Detect these cases
3715
* and skip any entries marked as in transition not by us.
3716
*
3717
* Another way to get an entry not marked with
3718
* MAP_ENTRY_IN_TRANSITION is after failed clipping,
3719
* which set rv to KERN_INVALID_ARGUMENT.
3720
*/
3721
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3722
entry->wiring_thread != curthread) {
3723
KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3724
("vm_map_wire: !HOLESOK and new/changed entry"));
3725
continue;
3726
}
3727
3728
if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3729
/* do nothing */
3730
} else if (rv == KERN_SUCCESS) {
3731
if (user_wire)
3732
entry->eflags |= MAP_ENTRY_USER_WIRED;
3733
} else if (entry->wired_count == -1) {
3734
/*
3735
* Wiring failed on this entry. Thus, unwiring is
3736
* unnecessary.
3737
*/
3738
entry->wired_count = 0;
3739
} else if (!user_wire ||
3740
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3741
/*
3742
* Undo the wiring. Wiring succeeded on this entry
3743
* but failed on a later entry.
3744
*/
3745
if (entry->wired_count == 1) {
3746
vm_map_entry_unwire(map, entry);
3747
if (user_wire)
3748
vm_map_wire_user_count_sub(
3749
atop(entry->end - entry->start));
3750
} else
3751
entry->wired_count--;
3752
}
3753
KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3754
("vm_map_wire: in-transition flag missing %p", entry));
3755
KASSERT(entry->wiring_thread == curthread,
3756
("vm_map_wire: alien wire %p", entry));
3757
entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3758
MAP_ENTRY_WIRE_SKIPPED);
3759
entry->wiring_thread = NULL;
3760
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3761
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3762
need_wakeup = true;
3763
}
3764
vm_map_try_merge_entries(map, prev_entry, entry);
3765
}
3766
vm_map_try_merge_entries(map, prev_entry, entry);
3767
if (need_wakeup)
3768
vm_map_wakeup(map);
3769
return (rv);
3770
}
3771
3772
/*
 * vm_map_sync
 *
 * Push any dirty cached pages in the address range to their pager.
 * If syncio is TRUE, dirty pages are written synchronously.
 * If invalidate is TRUE, any cached pages are freed as well.
 *
 * If the size of the region from start to end is zero, we are
 * supposed to flush all modified pages within the region containing
 * start. Unfortunately, a region can be split or coalesced with
 * neighboring regions, making it difficult to determine what the
 * original region was. Therefore, we approximate this requirement by
 * flushing the current region containing start.
 *
 * Returns an error if any part of the specified range is not mapped.
 */
int
vm_map_sync(
	vm_map_t map,
	vm_offset_t start,
	vm_offset_t end,
	boolean_t syncio,
	boolean_t invalidate)
{
	vm_map_entry_t entry, first_entry, next_entry;
	vm_size_t size;
	vm_object_t object;
	vm_ooffset_t offset;
	unsigned int last_timestamp;
	int bdry_idx;
	boolean_t failed;

	vm_map_lock_read(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if (!vm_map_lookup_entry(map, start, &first_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	} else if (start == end) {
		start = first_entry->start;
		end = first_entry->end;
	}

	/*
	 * Make a first pass to check for user-wired memory, holes,
	 * and partial invalidation of largepage mappings.
	 */
	for (entry = first_entry; entry->start < end; entry = next_entry) {
		if (invalidate) {
			if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
				vm_map_unlock_read(map);
				return (KERN_INVALID_ARGUMENT);
			}
			bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
			if (bdry_idx != 0 &&
			    ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
			    (end & (pagesizes[bdry_idx] - 1)) != 0)) {
				vm_map_unlock_read(map);
				return (KERN_INVALID_ARGUMENT);
			}
		}
		next_entry = vm_map_entry_succ(entry);
		if (end > entry->end &&
		    entry->end != next_entry->start) {
			vm_map_unlock_read(map);
			return (KERN_INVALID_ADDRESS);
		}
	}

	if (invalidate)
		pmap_remove(map->pmap, start, end);
	failed = FALSE;

	/*
	 * Make a second pass, cleaning/uncaching pages from the indicated
	 * objects as we go.
	 */
	for (entry = first_entry; entry->start < end;) {
		offset = entry->offset + (start - entry->start);
		size = (end <= entry->end ? end : entry->end) - start;
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
			vm_map_t smap;
			vm_map_entry_t tentry;
			vm_size_t tsize;

			smap = entry->object.sub_map;
			vm_map_lock_read(smap);
			(void) vm_map_lookup_entry(smap, offset, &tentry);
			tsize = tentry->end - offset;
			if (tsize < size)
				size = tsize;
			object = tentry->object.vm_object;
			offset = tentry->offset + (offset - tentry->start);
			vm_map_unlock_read(smap);
		} else {
			object = entry->object.vm_object;
		}
		vm_object_reference(object);
		last_timestamp = map->timestamp;
		vm_map_unlock_read(map);
		if (!vm_object_sync(object, offset, size, syncio, invalidate))
			failed = TRUE;
		start += size;
		vm_object_deallocate(object);
		vm_map_lock_read(map);
		if (last_timestamp == map->timestamp ||
		    !vm_map_lookup_entry(map, start, &entry))
			entry = vm_map_entry_succ(entry);
	}

	vm_map_unlock_read(map);
	return (failed ? KERN_FAILURE : KERN_SUCCESS);
}

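/*
 * Illustrative sketch only: roughly how an msync(2)-style caller is
 * expected to map its flags onto the syncio/invalidate arguments of
 * vm_map_sync().  The call site below is an assumption for illustration;
 * the real system call performs additional argument checking first.
 *
 *	syncio = (flags & MS_ASYNC) == 0;
 *	invalidate = (flags & MS_INVALIDATE) != 0;
 *	rv = vm_map_sync(map, start, end, syncio, invalidate);
 *
 * Passing start == end asks vm_map_sync() to flush the entire region
 * containing start, as described in the header comment above.
 */
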
/*
 * vm_map_entry_unwire: [ internal use only ]
 *
 * Make the region specified by this entry pageable.
 *
 * The map in question should be locked.
 * [This is the reason for this routine's existence.]
 */
static void
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
{
	vm_size_t size;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->wired_count > 0,
	    ("vm_map_entry_unwire: entry %p isn't wired", entry));

	size = entry->end - entry->start;
	if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
		vm_map_wire_user_count_sub(atop(size));
	pmap_unwire(map->pmap, entry->start, entry->end);
	vm_object_unwire(entry->object.vm_object, entry->offset, size,
	    PQ_ACTIVE);
	entry->wired_count = 0;
}

static void
vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
{

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
		vm_object_deallocate(entry->object.vm_object);
	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
}

/*
 * vm_map_entry_delete: [ internal use only ]
 *
 * Deallocate the given entry from the target map.
 */
static void
vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
{
	vm_object_t object;
	vm_pindex_t offidxstart, offidxend, size1;
	vm_size_t size;

	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
	object = entry->object.vm_object;

	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
		MPASS(entry->cred == NULL);
		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
		MPASS(object == NULL);
		vm_map_entry_deallocate(entry, vm_map_is_system(map));
		return;
	}

	size = entry->end - entry->start;
	map->size -= size;

	if (entry->cred != NULL) {
		swap_release_by_cred(size, entry->cred);
		crfree(entry->cred);
	}

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
		entry->object.vm_object = NULL;
	} else if ((object->flags & OBJ_ANON) != 0 ||
	    object == kernel_object) {
		KASSERT(entry->cred == NULL || object->cred == NULL ||
		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
		offidxstart = OFF_TO_IDX(entry->offset);
		offidxend = offidxstart + atop(size);
		VM_OBJECT_WLOCK(object);
		if (object->ref_count != 1 &&
		    ((object->flags & OBJ_ONEMAPPING) != 0 ||
		    object == kernel_object)) {
			vm_object_collapse(object);

			/*
			 * The option OBJPR_NOTMAPPED can be passed here
			 * because vm_map_delete() already performed
			 * pmap_remove() on the only mapping to this range
			 * of pages.
			 */
			vm_object_page_remove(object, offidxstart, offidxend,
			    OBJPR_NOTMAPPED);
			if (offidxend >= object->size &&
			    offidxstart < object->size) {
				size1 = object->size;
				object->size = offidxstart;
				if (object->cred != NULL) {
					size1 -= object->size;
					KASSERT(object->charge >= ptoa(size1),
					    ("object %p charge < 0", object));
					swap_release_by_cred(ptoa(size1),
					    object->cred);
					object->charge -= ptoa(size1);
				}
			}
		}
		VM_OBJECT_WUNLOCK(object);
	}
	if (vm_map_is_system(map))
		vm_map_entry_deallocate(entry, TRUE);
	else {
		entry->defer_next = curthread->td_map_def_user;
		curthread->td_map_def_user = entry;
	}
}

/*
 * vm_map_delete: [ internal use only ]
 *
 * Deallocates the given address range from the target
 * map.
 */
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t entry, next_entry, scratch_entry;
	int rv;

	VM_MAP_ASSERT_LOCKED(map);

	if (start == end)
		return (KERN_SUCCESS);

	/*
	 * Find the start of the region, and clip it.
	 * Step through all entries in this region.
	 */
	rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
	if (rv != KERN_SUCCESS)
		return (rv);
	for (; entry->start < end; entry = next_entry) {
		/*
		 * Wait for wiring or unwiring of an entry to complete.
		 * Also wait for any system wirings to disappear on
		 * user maps.
		 */
		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
		    (vm_map_pmap(map) != kernel_pmap &&
		    vm_map_entry_system_wired_count(entry) != 0)) {
			unsigned int last_timestamp;
			vm_offset_t saved_start;

			saved_start = entry->start;
			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			last_timestamp = map->timestamp;
			(void) vm_map_unlock_and_wait(map, 0);
			vm_map_lock(map);
			if (last_timestamp + 1 != map->timestamp) {
				/*
				 * Look again for the entry because the map was
				 * modified while it was unlocked.
				 * Specifically, the entry may have been
				 * clipped, merged, or deleted.
				 */
				rv = vm_map_lookup_clip_start(map, saved_start,
				    &next_entry, &scratch_entry);
				if (rv != KERN_SUCCESS)
					break;
			} else
				next_entry = entry;
			continue;
		}

		/* XXXKIB or delete to the upper superpage boundary ? */
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS)
			break;
		next_entry = vm_map_entry_succ(entry);

		/*
		 * Unwire before removing addresses from the pmap; otherwise,
		 * unwiring will put the entries back in the pmap.
		 */
		if (entry->wired_count != 0)
			vm_map_entry_unwire(map, entry);

		/*
		 * Remove mappings for the pages, but only if the
		 * mappings could exist. For instance, it does not
		 * make sense to call pmap_remove() for guard entries.
		 */
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
		    entry->object.vm_object != NULL)
			pmap_map_delete(map->pmap, entry->start, entry->end);

		/*
		 * Delete the entry only after removing all pmap
		 * entries pointing to its pages. (Otherwise, its
		 * page frames may be reallocated, and any modify bits
		 * will be set in the wrong object!)
		 */
		vm_map_entry_delete(map, entry);
	}
	return (rv);
}

/*
 * vm_map_remove:
 *
 * Remove the given address range from the target map.
 * This is the exported form of vm_map_delete.
 */
int
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int result;

	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	result = vm_map_delete(map, start, end);
	vm_map_unlock(map);
	return (result);
}

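/*
 * Illustrative sketch only: a hypothetical teardown helper built on
 * vm_map_remove().  The helper name and the errno mapping are assumptions
 * for illustration; the point is that callers pass page-aligned bounds and
 * receive a KERN_* status, with deferred entry disposal handled when the
 * map lock is dropped.
 *
 *	static int
 *	remove_range_sketch(vm_map_t map, vm_offset_t addr, vm_size_t size)
 *	{
 *		int rv;
 *
 *		rv = vm_map_remove(map, trunc_page(addr),
 *		    round_page(addr + size));
 *		return (rv == KERN_SUCCESS ? 0 : EINVAL);
 *	}
 */
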
/*
 * vm_map_check_protection:
 *
 * Assert that the target map allows the specified privilege on the
 * entire address region given. The entire region must be allocated.
 *
 * WARNING! This code does not and should not check whether the
 * contents of the region is accessible. For example a smaller file
 * might be mapped into a larger address space.
 *
 * NOTE! This code is also called by munmap().
 *
 * The map must be locked. A read lock is sufficient.
 */
boolean_t
vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
    vm_prot_t protection)
{
	vm_map_entry_t entry;
	vm_map_entry_t tmp_entry;

	if (!vm_map_lookup_entry(map, start, &tmp_entry))
		return (FALSE);
	entry = tmp_entry;

	while (start < end) {
		/*
		 * No holes allowed!
		 */
		if (start < entry->start)
			return (FALSE);
		/*
		 * Check protection associated with entry.
		 */
		if ((entry->protection & protection) != protection)
			return (FALSE);
		/* go to next entry */
		start = entry->end;
		entry = vm_map_entry_succ(entry);
	}
	return (TRUE);
}

/*
 *
 * vm_map_copy_swap_object:
 *
 * Copies a swap-backed object from an existing map entry to a
 * new one. Carries forward the swap charge. May change the
 * src object on return.
 */
static void
vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
    vm_offset_t size, vm_ooffset_t *fork_charge)
{
	vm_object_t src_object;
	struct ucred *cred;
	int charged;

	src_object = src_entry->object.vm_object;
	charged = ENTRY_CHARGED(src_entry);
	if ((src_object->flags & OBJ_ANON) != 0) {
		VM_OBJECT_WLOCK(src_object);
		vm_object_collapse(src_object);
		if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
			vm_object_split(src_entry);
			src_object = src_entry->object.vm_object;
		}
		vm_object_reference_locked(src_object);
		vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
		VM_OBJECT_WUNLOCK(src_object);
	} else
		vm_object_reference(src_object);
	if (src_entry->cred != NULL &&
	    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
		KASSERT(src_object->cred == NULL,
		    ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
		    src_object));
		src_object->cred = src_entry->cred;
		src_object->charge = size;
	}
	dst_entry->object.vm_object = src_object;
	if (charged) {
		cred = curthread->td_ucred;
		crhold(cred);
		dst_entry->cred = cred;
		*fork_charge += size;
		if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			crhold(cred);
			src_entry->cred = cred;
			*fork_charge += size;
		}
	}
}

/*
 * vm_map_copy_entry:
 *
 * Copies the contents of the source entry to the destination
 * entry. The entries *must* be aligned properly.
 */
static void
vm_map_copy_entry(
	vm_map_t src_map,
	vm_map_t dst_map,
	vm_map_entry_t src_entry,
	vm_map_entry_t dst_entry,
	vm_ooffset_t *fork_charge)
{
	vm_object_t src_object;
	vm_map_entry_t fake_entry;
	vm_offset_t size;

	VM_MAP_ASSERT_LOCKED(dst_map);

	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
		return;

	if (src_entry->wired_count == 0 ||
	    (src_entry->protection & VM_PROT_WRITE) == 0) {
		/*
		 * If the source entry is marked needs_copy, it is already
		 * write-protected.
		 */
		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
		    (src_entry->protection & VM_PROT_WRITE) != 0) {
			pmap_protect(src_map->pmap,
			    src_entry->start,
			    src_entry->end,
			    src_entry->protection & ~VM_PROT_WRITE);
		}

		/*
		 * Make a copy of the object.
		 */
		size = src_entry->end - src_entry->start;
		if ((src_object = src_entry->object.vm_object) != NULL) {
			if ((src_object->flags & OBJ_SWAP) != 0) {
				vm_map_copy_swap_object(src_entry, dst_entry,
				    size, fork_charge);
				/* May have split/collapsed, reload obj. */
				src_object = src_entry->object.vm_object;
			} else {
				vm_object_reference(src_object);
				dst_entry->object.vm_object = src_object;
			}
			src_entry->eflags |= MAP_ENTRY_COW |
			    MAP_ENTRY_NEEDS_COPY;
			dst_entry->eflags |= MAP_ENTRY_COW |
			    MAP_ENTRY_NEEDS_COPY;
			dst_entry->offset = src_entry->offset;
			if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
				/*
				 * MAP_ENTRY_WRITECNT cannot
				 * indicate write reference from
				 * src_entry, since the entry is
				 * marked as needs copy. Allocate a
				 * fake entry that is used to
				 * decrement object->un_pager writecount
				 * at the appropriate time. Attach
				 * fake_entry to the deferred list.
				 */
				fake_entry = vm_map_entry_create(dst_map);
				fake_entry->eflags = MAP_ENTRY_WRITECNT;
				src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
				vm_object_reference(src_object);
				fake_entry->object.vm_object = src_object;
				fake_entry->start = src_entry->start;
				fake_entry->end = src_entry->end;
				fake_entry->defer_next =
				    curthread->td_map_def_user;
				curthread->td_map_def_user = fake_entry;
			}

			pmap_copy(dst_map->pmap, src_map->pmap,
			    dst_entry->start, dst_entry->end - dst_entry->start,
			    src_entry->start);
		} else {
			dst_entry->object.vm_object = NULL;
			if ((dst_entry->eflags & MAP_ENTRY_GUARD) == 0)
				dst_entry->offset = 0;
			if (src_entry->cred != NULL) {
				dst_entry->cred = curthread->td_ucred;
				crhold(dst_entry->cred);
				*fork_charge += size;
			}
		}
	} else {
		/*
		 * We don't want to make writeable wired pages copy-on-write.
		 * Immediately copy these pages into the new map by simulating
		 * page faults. The new pages are pageable.
		 */
		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
		    fork_charge);
	}
}

/*
 * vmspace_map_entry_forked:
 * Update the newly-forked vmspace each time a map entry is inherited
 * or copied. The values for vm_dsize and vm_tsize are approximate
 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
 */
static void
vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
    vm_map_entry_t entry)
{
	vm_size_t entrysize;
	vm_offset_t newend;

	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
		return;
	entrysize = entry->end - entry->start;
	vm2->vm_map.size += entrysize;
	if ((entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
		vm2->vm_ssize += btoc(entrysize);
	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
		newend = MIN(entry->end,
		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
		vm2->vm_dsize += btoc(newend - entry->start);
	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
		newend = MIN(entry->end,
		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
		vm2->vm_tsize += btoc(newend - entry->start);
	}
}

/*
 * vmspace_fork:
 * Create a new process vmspace structure and vm_map
 * based on those of an existing process. The new map
 * is based on the old map, according to the inheritance
 * values on the regions in that map.
 *
 * XXX It might be worth coalescing the entries added to the new vmspace.
 *
 * The source map must not be locked.
 */
struct vmspace *
vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
{
	struct vmspace *vm2;
	vm_map_t new_map, old_map;
	vm_map_entry_t new_entry, old_entry;
	vm_object_t object;
	int error, locked __diagused;
	vm_inherit_t inh;

	old_map = &vm1->vm_map;
	/* Copy immutable fields of vm1 to vm2. */
	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
	    pmap_pinit);
	if (vm2 == NULL)
		return (NULL);

	vm2->vm_taddr = vm1->vm_taddr;
	vm2->vm_daddr = vm1->vm_daddr;
	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
	vm2->vm_stacktop = vm1->vm_stacktop;
	vm2->vm_shp_base = vm1->vm_shp_base;
	vm_map_lock(old_map);
	if (old_map->busy)
		vm_map_wait_busy(old_map);
	new_map = &vm2->vm_map;
	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
	KASSERT(locked, ("vmspace_fork: lock failed"));

	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
	if (error != 0) {
		sx_xunlock(&old_map->lock);
		sx_xunlock(&new_map->lock);
		vm_map_process_deferred();
		vmspace_free(vm2);
		return (NULL);
	}

	new_map->anon_loc = old_map->anon_loc;
	new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
	    MAP_ASLR_STACK | MAP_WXORX);

	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			panic("vm_map_fork: encountered a submap");

		inh = old_entry->inheritance;
		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    inh != VM_INHERIT_NONE)
			inh = VM_INHERIT_COPY;

		switch (inh) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			/*
			 * Clone the entry, creating the shared object if
			 * necessary.
			 */
			object = old_entry->object.vm_object;
			if (object == NULL) {
				vm_map_entry_back(old_entry);
				object = old_entry->object.vm_object;
			}

			/*
			 * Add the reference before calling vm_object_shadow
			 * to insure that a shadow object is created.
			 */
			vm_object_reference(object);
			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
				vm_object_shadow(&old_entry->object.vm_object,
				    &old_entry->offset,
				    old_entry->end - old_entry->start,
				    old_entry->cred,
				    /* Transfer the second reference too. */
				    true);
				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
				old_entry->cred = NULL;

				/*
				 * As in vm_map_merged_neighbor_dispose(),
				 * the vnode lock will not be acquired in
				 * this call to vm_object_deallocate().
				 */
				vm_object_deallocate(object);
				object = old_entry->object.vm_object;
			} else {
				VM_OBJECT_WLOCK(object);
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
				if (old_entry->cred != NULL) {
					KASSERT(object->cred == NULL,
					    ("vmspace_fork both cred"));
					object->cred = old_entry->cred;
					object->charge = old_entry->end -
					    old_entry->start;
					old_entry->cred = NULL;
				}

				/*
				 * Assert the correct state of the vnode
				 * v_writecount while the object is locked, to
				 * not relock it later for the assertion
				 * correctness.
				 */
				if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
				    object->type == OBJT_VNODE) {
					KASSERT(((struct vnode *)object->
					    handle)->v_writecount > 0,
					    ("vmspace_fork: v_writecount %p",
					    object));
					KASSERT(object->un_pager.vnp.
					    writemappings > 0,
					    ("vmspace_fork: vnp.writecount %p",
					    object));
				}
				VM_OBJECT_WUNLOCK(object);
			}

			/*
			 * Clone the entry, referencing the shared object.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
				vm_pager_update_writecount(object,
				    new_entry->start, new_entry->end);
			}
			vm_map_entry_set_vnode_text(new_entry, true);

			/*
			 * Insert the entry into the new map -- we know we're
			 * inserting at the end of the new map.
			 */
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			/*
			 * Update the physical map
			 */
			pmap_copy(new_map->pmap, old_map->pmap,
			    new_entry->start,
			    (old_entry->end - old_entry->start),
			    old_entry->start);
			break;

		case VM_INHERIT_COPY:
			/*
			 * Clone the entry and link into the map.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			/*
			 * Copied entry is COW over the old object.
			 */
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			new_entry->object.vm_object = NULL;
			new_entry->cred = NULL;
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);
			vm_map_copy_entry(old_map, new_map, old_entry,
			    new_entry, fork_charge);
			vm_map_entry_set_vnode_text(new_entry, true);
			break;

		case VM_INHERIT_ZERO:
			/*
			 * Create a new anonymous mapping entry modelled from
			 * the old one.
			 */
			new_entry = vm_map_entry_create(new_map);
			memset(new_entry, 0, sizeof(*new_entry));

			new_entry->start = old_entry->start;
			new_entry->end = old_entry->end;
			new_entry->eflags = old_entry->eflags &
			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
			    MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
			    MAP_ENTRY_SPLIT_BOUNDARY_MASK);
			new_entry->protection = old_entry->protection;
			new_entry->max_protection = old_entry->max_protection;
			new_entry->inheritance = VM_INHERIT_ZERO;

			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			new_entry->cred = curthread->td_ucred;
			crhold(new_entry->cred);
			*fork_charge += (new_entry->end - new_entry->start);

			break;
		}
	}
	/*
	 * Use inlined vm_map_unlock() to postpone handling the deferred
	 * map entries, which cannot be done until both old_map and
	 * new_map locks are released.
	 */
	sx_xunlock(&old_map->lock);
	sx_xunlock(&new_map->lock);
	vm_map_process_deferred();

	return (vm2);
}

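/*
 * Illustrative sketch only: the contract around *fork_charge.  The caller
 * supplies a counter that vmspace_fork() increments by the amount of swap
 * that must be reserved for the child's copied and zero-inherited entries,
 * and the caller is then responsible for actually reserving it.  The
 * snippet mirrors the pattern used by vmspace_unshare() later in this
 * file; the variable names and error handling are simplified assumptions.
 *
 *	vm_ooffset_t fork_charge = 0;
 *	struct vmspace *vm2;
 *
 *	vm2 = vmspace_fork(p1->p_vmspace, &fork_charge);
 *	if (vm2 == NULL)
 *		return (ENOMEM);
 *	if (!swap_reserve_by_cred(fork_charge, p1->p_ucred)) {
 *		vmspace_free(vm2);
 *		return (ENOMEM);
 *	}
 */
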
/*
 * Create a process's stack for exec_new_vmspace(). This function is never
 * asked to wire the newly created stack.
 */
int
vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_size_t growsize, init_ssize;
	rlim_t vmemlim;
	int rv;

	MPASS((map->flags & MAP_WIREFUTURE) == 0);
	growsize = sgrowsiz;
	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
	vm_map_lock(map);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
	/* If we would blow our VMEM resource limit, no go */
	if (map->size + init_ssize > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
	    max, cow);
out:
	vm_map_unlock(map);
	return (rv);
}

static int stack_guard_page = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
    &stack_guard_page, 0,
    "Specifies the number of guard pages for a stack that grows");

static int
vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t gap_entry, new_entry, prev_entry;
	vm_offset_t bot, gap_bot, gap_top, top;
	vm_size_t init_ssize, sgp;
	int rv;

	KASSERT((cow & MAP_STACK_AREA) != 0,
	    ("New mapping is not a stack"));

	if (max_ssize == 0 ||
	    !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
		return (KERN_INVALID_ADDRESS);
	sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    (vm_size_t)stack_guard_page * PAGE_SIZE;
	if (sgp >= max_ssize)
		return (KERN_INVALID_ARGUMENT);

	init_ssize = growsize;
	if (max_ssize < init_ssize + sgp)
		init_ssize = max_ssize - sgp;

	/* If addr is already mapped, no go */
	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
		return (KERN_NO_SPACE);

	/*
	 * If we can't accommodate max_ssize in the current mapping, no go.
	 */
	if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
		return (KERN_NO_SPACE);

	/*
	 * We initially map a stack of only init_ssize, at the top of
	 * the range. We will grow as needed later.
	 *
	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
	 * and cow to be 0. Possibly we should eliminate these as input
	 * parameters, and just pass these values here in the insert call.
	 */
	bot = addrbos + max_ssize - init_ssize;
	top = bot + init_ssize;
	gap_bot = addrbos;
	gap_top = bot;
	rv = vm_map_insert1(map, NULL, 0, bot, top, prot, max, cow,
	    &new_entry);
	if (rv != KERN_SUCCESS)
		return (rv);
	KASSERT(new_entry->end == top || new_entry->start == bot,
	    ("Bad entry start/end for new stack entry"));
	KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
	if (gap_bot == gap_top)
		return (KERN_SUCCESS);
	rv = vm_map_insert1(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
	    VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
	    &gap_entry);
	if (rv == KERN_SUCCESS) {
		KASSERT((gap_entry->eflags & MAP_ENTRY_GUARD) != 0,
		    ("entry %p not gap %#x", gap_entry, gap_entry->eflags));
		KASSERT((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0,
		    ("entry %p not stack gap %#x", gap_entry,
		    gap_entry->eflags));

		/*
		 * Gap can never successfully handle a fault, so
		 * read-ahead logic is never used for it. Re-use
		 * next_read of the gap entry to store
		 * stack_guard_page for vm_map_growstack().
		 * Similarly, since a gap cannot have a backing object,
		 * store the original stack protections in the
		 * object offset.
		 */
		gap_entry->next_read = sgp;
		gap_entry->offset = prot | PROT_MAX(max);
	} else {
		(void)vm_map_delete(map, bot, top);
	}
	return (rv);
}

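/*
 * Illustrative arithmetic only, with assumed values: PAGE_SIZE = 4K,
 * sgrowsiz (growsize) = 128K, stack_guard_page = 1, max_ssize = 8M.
 * Then sgp = 4K, init_ssize = 128K, and vm_map_stack_locked() lays the
 * range out as
 *
 *	gap_bot = addrbos
 *	gap_top = bot = addrbos + 8M - 128K
 *	top     = addrbos + 8M
 *
 * i.e. a guard/gap entry covering [addrbos, bot) and an initial stack
 * entry covering [bot, top).  vm_map_growstack() below later converts the
 * top of the gap into stack pages on demand, always keeping sgp bytes of
 * the gap unreachable.
 */
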
/*
 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
 * successfully grow the stack.
 */
static int
vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
{
	vm_map_entry_t stack_entry;
	struct proc *p;
	struct vmspace *vm;
	vm_offset_t gap_end, gap_start, grow_start;
	vm_size_t grow_amount, guard, max_grow, sgp;
	vm_prot_t prot, max;
	rlim_t lmemlim, stacklim, vmemlim;
	int rv, rv1 __diagused;
	bool gap_deleted, is_procstack;
#ifdef notyet
	uint64_t limit;
#endif
#ifdef RACCT
	int error __diagused;
#endif

	p = curproc;
	vm = p->p_vmspace;

	/*
	 * Disallow stack growth when the access is performed by a
	 * debugger or AIO daemon. The reason is that the wrong
	 * resource limits are applied.
	 */
	if (p != initproc && (map != &p->p_vmspace->vm_map ||
	    p->p_textvp == NULL))
		return (KERN_FAILURE);

	MPASS(!vm_map_is_system(map));

	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
	stacklim = lim_cur(curthread, RLIMIT_STACK);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
retry:
	/* If addr is not in a hole for a stack grow area, no need to grow. */
	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
		return (KERN_FAILURE);
	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
		return (KERN_SUCCESS);
	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0) {
		stack_entry = vm_map_entry_succ(gap_entry);
		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
		    stack_entry->start != gap_entry->end)
			return (KERN_FAILURE);
		grow_amount = round_page(stack_entry->start - addr);
	} else {
		return (KERN_FAILURE);
	}
	guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    gap_entry->next_read;
	max_grow = gap_entry->end - gap_entry->start;
	if (guard > max_grow)
		return (KERN_NO_SPACE);
	max_grow -= guard;
	if (grow_amount > max_grow)
		return (KERN_NO_SPACE);

	/*
	 * If this is the main process stack, see if we're over the stack
	 * limit.
	 */
	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
	    addr < (vm_offset_t)vm->vm_stacktop;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
		return (KERN_NO_SPACE);

#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (is_procstack && racct_set(p, RACCT_STACK,
		    ctob(vm->vm_ssize) + grow_amount)) {
			PROC_UNLOCK(p);
			return (KERN_NO_SPACE);
		}
		PROC_UNLOCK(p);
	}
#endif

	grow_amount = roundup(grow_amount, sgrowsiz);
	if (grow_amount > max_grow)
		grow_amount = max_grow;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
		grow_amount = trunc_page((vm_size_t)stacklim) -
		    ctob(vm->vm_ssize);
	}

#ifdef notyet
	PROC_LOCK(p);
	limit = racct_get_available(p, RACCT_STACK);
	PROC_UNLOCK(p);
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
		grow_amount = limit - ctob(vm->vm_ssize);
#endif

	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
			rv = KERN_NO_SPACE;
			goto out;
		}
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(p);
			if (racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
				PROC_UNLOCK(p);
				rv = KERN_NO_SPACE;
				goto out;
			}
			PROC_UNLOCK(p);
		}
#endif
	}

	/* If we would blow our VMEM resource limit, no go */
	if (map->size + grow_amount > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
			PROC_UNLOCK(p);
			rv = KERN_NO_SPACE;
			goto out;
		}
		PROC_UNLOCK(p);
	}
#endif

	if (vm_map_lock_upgrade(map)) {
		gap_entry = NULL;
		vm_map_lock_read(map);
		goto retry;
	}

	/*
	 * The gap_entry "offset" field is overloaded. See
	 * vm_map_stack_locked().
	 */
	prot = PROT_EXTRACT(gap_entry->offset);
	max = PROT_MAX_EXTRACT(gap_entry->offset);
	sgp = gap_entry->next_read;

	grow_start = gap_entry->end - grow_amount;
	if (gap_entry->start + grow_amount == gap_entry->end) {
		gap_start = gap_entry->start;
		gap_end = gap_entry->end;
		vm_map_entry_delete(map, gap_entry);
		gap_deleted = true;
	} else {
		MPASS(gap_entry->start < gap_entry->end - grow_amount);
		vm_map_entry_resize(map, gap_entry, -grow_amount);
		gap_deleted = false;
	}
	rv = vm_map_insert(map, NULL, 0, grow_start,
	    grow_start + grow_amount, prot, max, MAP_STACK_AREA);
	if (rv != KERN_SUCCESS) {
		if (gap_deleted) {
			rv1 = vm_map_insert1(map, NULL, 0, gap_start,
			    gap_end, VM_PROT_NONE, VM_PROT_NONE,
			    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
			    &gap_entry);
			MPASS(rv1 == KERN_SUCCESS);
			gap_entry->next_read = sgp;
			gap_entry->offset = prot | PROT_MAX(max);
		} else {
			vm_map_entry_resize(map, gap_entry,
			    grow_amount);
		}
	}
	if (rv == KERN_SUCCESS && is_procstack)
		vm->vm_ssize += btoc(grow_amount);

	/*
	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
	 */
	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
		rv = vm_map_wire_locked(map, grow_start,
		    grow_start + grow_amount,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	}
	vm_map_lock_downgrade(map);

out:
#ifdef RACCT
	if (racct_enable && rv != KERN_SUCCESS) {
		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, map->size);
		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
		if (!old_mlock) {
			error = racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)));
			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
		}
		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
		PROC_UNLOCK(p);
	}
#endif

	return (rv);
}

/*
 * Unshare the specified VM space for exec. If other processes are
 * mapped to it, then create a new one. The new vmspace is null.
 */
int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;

	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
	    ("vmspace_exec recursed"));
	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
	if (newvmspace == NULL)
		return (ENOMEM);
	newvmspace->vm_swrss = oldvmspace->vm_swrss;
	/*
	 * This code is written like this for prototype purposes. The
	 * goal is to avoid running down the vmspace here, but to let the
	 * other processes that are still using the vmspace finally run
	 * it down. Even though there is little or no chance of blocking
	 * here, it is a good idea to keep this form for future mods.
	 */
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	curthread->td_pflags |= TDP_EXECVMSPC;
	return (0);
}

/*
 * Unshare the specified VM space for forcing COW. This
 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
 */
int
vmspace_unshare(struct proc *p)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;
	vm_ooffset_t fork_charge;

	/*
	 * The caller is responsible for ensuring that the reference count
	 * cannot concurrently transition 1 -> 2.
	 */
	if (refcount_load(&oldvmspace->vm_refcnt) == 1)
		return (0);
	fork_charge = 0;
	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
	if (newvmspace == NULL)
		return (ENOMEM);
	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
		vmspace_free(newvmspace);
		return (ENOMEM);
	}
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	vmspace_free(oldvmspace);
	return (0);
}

/*
 * vm_map_lookup:
 *
 * Finds the VM object, offset, and
 * protection for a given virtual address in the
 * specified map, assuming a page fault of the
 * type specified.
 *
 * Leaves the map in question locked for read; return
 * values are guaranteed until a vm_map_lookup_done
 * call is performed. Note that the map argument
 * is in/out; the returned map must be used in
 * the call to vm_map_lookup_done.
 *
 * A handle (out_entry) is returned for use in
 * vm_map_lookup_done, to make that fast.
 *
 * If a lookup is requested with "write protection"
 * specified, the map may be changed to perform virtual
 * copying operations, although the data referenced will
 * remain the same.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
    vm_offset_t vaddr,
    vm_prot_t fault_typea,
    vm_map_entry_t *out_entry,			/* OUT */
    vm_object_t *object,			/* OUT */
    vm_pindex_t *pindex,			/* OUT */
    vm_prot_t *out_prot,			/* OUT */
    boolean_t *wired)				/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

RetryLookup:

	vm_map_lock_read(map);

RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & MAP_ENTRY_STACK_GAP) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	fault_type = fault_typea & VM_PROT_ALL;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;

	/*
	 * If the entry was copy-on-write, we either ...
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain. Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			eobject = entry->object.vm_object;
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size, entry->cred, false);
			if (eobject == entry->object.vm_object) {
				/*
				 * The object was not shadowed.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
			}
			entry->cred = NULL;
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.
	 */
	if (entry->object.vm_object == NULL && !vm_map_is_system(map)) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		entry->object.vm_object = vm_object_allocate_anon(atop(size),
		    NULL, entry->cred, size);
		entry->offset = 0;
		entry->cred = NULL;
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry. If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 * vm_map_lookup_locked:
 *
 * Lookup the faulting address. A version of vm_map_lookup that returns
 * KERN_FAILURE instead of blocking on map lock or memory allocation.
 */
int
vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
    vm_offset_t vaddr,
    vm_prot_t fault_typea,
    vm_map_entry_t *out_entry,			/* OUT */
    vm_object_t *object,			/* OUT */
    vm_pindex_t *pindex,			/* OUT */
    vm_prot_t *out_prot,			/* OUT */
    boolean_t *wired)				/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;

	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry))
		return (KERN_INVALID_ADDRESS);

	entry = *out_entry;

	/*
	 * Fail if the entry refers to a submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
		return (KERN_FAILURE);

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;

	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * Fail if the entry was copy-on-write for a write fault.
		 */
		if (fault_type & VM_PROT_WRITE)
			return (KERN_FAILURE);
		/*
		 * We're attempting to read a copy-on-write page --
		 * don't allow writes.
		 */
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Fail if an object should be created.
	 */
	if (entry->object.vm_object == NULL && !vm_map_is_system(map))
		return (KERN_FAILURE);

	/*
	 * Return the object/offset from this entry. If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 * vm_map_lookup_done:
 *
 * Releases locks acquired by a vm_map_lookup
 * (according to the handle returned by that lookup).
 */
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
	/*
	 * Unlock the main-level map
	 */
	vm_map_unlock_read(map);
}

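/*
 * Illustrative sketch only: the expected pairing of vm_map_lookup() with
 * vm_map_lookup_done(), roughly as the fault handler uses it.  The local
 * variable setup and the fault-type value are assumptions for illustration.
 *
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	int rv;
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... use object and pindex while the returned map stays read-locked ...
 *	vm_map_lookup_done(map, entry);
 */
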
vm_offset_t
vm_map_max_KBI(const struct vm_map *map)
{

	return (vm_map_max(map));
}

vm_offset_t
vm_map_min_KBI(const struct vm_map *map)
{

	return (vm_map_min(map));
}

pmap_t
vm_map_pmap_KBI(vm_map_t map)
{

	return (map->pmap);
}

bool
vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	return (vm_map_range_valid(map, start, end));
}

#ifdef INVARIANTS
static void
_vm_map_assert_consistent(vm_map_t map, int check)
{
	vm_map_entry_t entry, prev;
	vm_map_entry_t cur, header, lbound, ubound;
	vm_size_t max_left, max_right;

#ifdef DIAGNOSTIC
	++map->nupdates;
#endif
	if (enable_vmmap_check != check)
		return;

	header = prev = &map->header;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		KASSERT(prev->end <= entry->start,
		    ("map %p prev->end = %jx, start = %jx", map,
		    (uintmax_t)prev->end, (uintmax_t)entry->start));
		KASSERT(entry->start < entry->end,
		    ("map %p start = %jx, end = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->end));
		KASSERT(entry->left == header ||
		    entry->left->start < entry->start,
		    ("map %p left->start = %jx, start = %jx", map,
		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
		KASSERT(entry->right == header ||
		    entry->start < entry->right->start,
		    ("map %p start = %jx, right->start = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
		cur = map->root;
		lbound = ubound = header;
		for (;;) {
			if (entry->start < cur->start) {
				ubound = cur;
				cur = cur->left;
				KASSERT(cur != lbound,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			} else if (cur->end <= entry->start) {
				lbound = cur;
				cur = cur->right;
				KASSERT(cur != ubound,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			} else {
				KASSERT(cur == entry,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
				break;
			}
		}
		max_left = vm_map_entry_max_free_left(entry, lbound);
		max_right = vm_map_entry_max_free_right(entry, ubound);
		KASSERT(entry->max_free == vm_size_max(max_left, max_right),
		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
		    (uintmax_t)entry->max_free,
		    (uintmax_t)max_left, (uintmax_t)max_right));
		prev = entry;
	}
	KASSERT(prev->end <= entry->start,
	    ("map %p prev->end = %jx, start = %jx", map,
	    (uintmax_t)prev->end, (uintmax_t)entry->start));
}
#endif

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

static void
vm_map_print(vm_map_t map)
{
	vm_map_entry_t entry, prev;

	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	    (void *)map,
	    (void *)map->pmap, map->nentries, map->timestamp);

	db_indent += 2;
	prev = &map->header;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
		    (void *)entry, (void *)entry->start, (void *)entry->end,
		    entry->eflags);
		{
			static const char * const inheritance_name[4] =
			{"share", "copy", "none", "donate_copy"};

			db_iprintf(" prot=%x/%x/%s",
			    entry->protection,
			    entry->max_protection,
			    inheritance_name[(int)(unsigned char)
			    entry->inheritance]);
			if (entry->wired_count != 0)
				db_printf(", wired");
		}
		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
			db_printf(", share=%p, offset=0x%jx\n",
			    (void *)entry->object.sub_map,
			    (uintmax_t)entry->offset);
			if (prev == &map->header ||
			    prev->object.sub_map !=
				entry->object.sub_map) {
				db_indent += 2;
				vm_map_print((vm_map_t)entry->object.sub_map);
				db_indent -= 2;
			}
		} else {
			if (entry->cred != NULL)
				db_printf(", ruid %d", entry->cred->cr_ruid);
			db_printf(", object=%p, offset=0x%jx",
			    (void *)entry->object.vm_object,
			    (uintmax_t)entry->offset);
			if (entry->object.vm_object && entry->object.vm_object->cred)
				db_printf(", obj ruid %d charge %jx",
				    entry->object.vm_object->cred->cr_ruid,
				    (uintmax_t)entry->object.vm_object->charge);
			if (entry->eflags & MAP_ENTRY_COW)
				db_printf(", copy (%s)",
				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
			db_printf("\n");

			if (prev == &map->header ||
			    prev->object.vm_object !=
				entry->object.vm_object) {
				db_indent += 2;
				vm_object_print((db_expr_t)(intptr_t)
				    entry->object.vm_object,
				    0, 0, (char *)0);
				db_indent -= 2;
			}
		}
		prev = entry;
	}
	db_indent -= 2;
}

DB_SHOW_COMMAND(map, map)
{

	if (!have_addr) {
		db_printf("usage: show map <addr>\n");
		return;
	}
	vm_map_print((vm_map_t)addr);
}

DB_SHOW_COMMAND(procvm, procvm)
{
	struct proc *p;

	if (have_addr) {
		p = db_lookup_proc(addr);
	} else {
		p = curproc;
	}

	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	    (void *)vmspace_pmap(p->p_vmspace));

	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
}

#endif /* DDB */