GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/vm/vm_mmap.c
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1988 University of Utah.
5
* Copyright (c) 1991, 1993
6
* The Regents of the University of California. All rights reserved.
7
*
8
* This code is derived from software contributed to Berkeley by
9
* the Systems Programming Group of the University of Utah Computer
10
* Science Department.
11
*
12
* Redistribution and use in source and binary forms, with or without
13
* modification, are permitted provided that the following conditions
14
* are met:
15
* 1. Redistributions of source code must retain the above copyright
16
* notice, this list of conditions and the following disclaimer.
17
* 2. Redistributions in binary form must reproduce the above copyright
18
* notice, this list of conditions and the following disclaimer in the
19
* documentation and/or other materials provided with the distribution.
20
* 3. Neither the name of the University nor the names of its contributors
21
* may be used to endorse or promote products derived from this software
22
* without specific prior written permission.
23
*
24
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34
* SUCH DAMAGE.
35
*
36
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
37
*/
38
39
/*
40
* Mapped file (mmap) interface to VM
41
*/
42
43
#include "opt_hwpmc_hooks.h"
44
#include "opt_hwt_hooks.h"
45
#include "opt_vm.h"
46
47
#define EXTERR_CATEGORY EXTERR_CAT_MMAP
48
#include <sys/param.h>
49
#include <sys/systm.h>
50
#include <sys/capsicum.h>
51
#include <sys/exterrvar.h>
52
#include <sys/kernel.h>
53
#include <sys/lock.h>
54
#include <sys/mutex.h>
55
#include <sys/sysproto.h>
56
#include <sys/elf.h>
57
#include <sys/filedesc.h>
58
#include <sys/priv.h>
59
#include <sys/proc.h>
60
#include <sys/procctl.h>
61
#include <sys/racct.h>
62
#include <sys/resource.h>
63
#include <sys/resourcevar.h>
64
#include <sys/rwlock.h>
65
#include <sys/sysctl.h>
66
#include <sys/vnode.h>
67
#include <sys/fcntl.h>
68
#include <sys/file.h>
69
#include <sys/mman.h>
70
#include <sys/mount.h>
71
#include <sys/conf.h>
72
#include <sys/stat.h>
73
#include <sys/syscallsubr.h>
74
#include <sys/sysent.h>
75
#include <sys/vmmeter.h>
76
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
77
#include <machine/md_var.h>
78
#endif
79
80
#include <security/audit/audit.h>
81
#include <security/mac/mac_framework.h>
82
83
#include <vm/vm.h>
84
#include <vm/vm_param.h>
85
#include <vm/pmap.h>
86
#include <vm/vm_map.h>
87
#include <vm/vm_object.h>
88
#include <vm/vm_page.h>
89
#include <vm/vm_pager.h>
90
#include <vm/vm_pageout.h>
91
#include <vm/vm_extern.h>
92
93
#include <vm/vnode_pager.h>
94
95
#ifdef HWPMC_HOOKS
96
#include <sys/pmckern.h>
97
#endif
98
99
#ifdef HWT_HOOKS
100
#include <dev/hwt/hwt_hook.h>
101
#endif
102
103
int old_mlock = 0;
104
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
105
"Do not apply RLIMIT_MEMLOCK on mlockall");
106
static int mincore_mapped = 1;
107
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
108
"mincore reports mappings, not residency");
109
static int imply_prot_max = 0;
110
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
111
"Imply maximum page protections in mmap() when none are specified");
112
113
_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
114
115
#if defined(COMPAT_43)
116
int
117
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
118
{
119
120
td->td_retval[0] = PAGE_SIZE;
121
return (0);
122
}
123
#endif /* COMPAT_43 */
124
125
/*
126
* Memory Map (mmap) system call. Note that the file offset
127
* and address are allowed to be NOT page aligned, though if
128
* the MAP_FIXED flag is set, both must have the same remainder
129
* modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
130
* page-aligned, the actual mapping starts at trunc_page(addr)
131
* and the return value is adjusted up by the page offset.
132
*
133
* Generally speaking, only character devices which are themselves
134
* memory-based, such as a video framebuffer, can be mmap'd. Otherwise
135
* there would be no cache coherency between a descriptor and a VM mapping
136
* both to the same character device.
137
*/
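/*
 * Illustrative example (assuming 4 KiB pages): mmap(NULL, 100, PROT_READ,
 * MAP_SHARED, fd, 0x1234) has a page offset of 0x234, so the kernel maps
 * one page of the file starting at offset 0x1000 and returns the start
 * of that mapping plus 0x234, i.e. the returned pointer addresses file
 * offset 0x1234 exactly.
 */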
138
#ifndef _SYS_SYSPROTO_H_
139
struct mmap_args {
140
void *addr;
141
size_t len;
142
int prot;
143
int flags;
144
int fd;
145
long pad;
146
off_t pos;
147
};
148
#endif
149
150
int
151
sys_mmap(struct thread *td, struct mmap_args *uap)
152
{
153
154
return (kern_mmap(td, &(struct mmap_req){
155
.mr_hint = (uintptr_t)uap->addr,
156
.mr_len = uap->len,
157
.mr_prot = uap->prot,
158
.mr_flags = uap->flags,
159
.mr_fd = uap->fd,
160
.mr_pos = uap->pos,
161
}));
162
}
163
164
int
165
kern_mmap_maxprot(struct proc *p, int prot)
166
{
167
168
if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
169
(p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
170
return (_PROT_ALL);
171
if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
172
prot != PROT_NONE)
173
return (prot);
174
return (_PROT_ALL);
175
}
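/*
 * Illustrative use of PROT_MAX: mmap(..., PROT_READ |
 * PROT_MAX(PROT_READ | PROT_WRITE), ...) creates a read-only mapping
 * whose protection may later be raised to read/write with mprotect(),
 * but never to include PROT_EXEC.  When no PROT_MAX bits are supplied,
 * kern_mmap_maxprot() above decides whether to imply them from prot
 * (P2_PROTMAX_ENABLE or the vm.imply_prot_max knob) or to allow all
 * protections.
 */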
176
177
int
178
kern_mmap(struct thread *td, const struct mmap_req *mrp)
179
{
180
struct vmspace *vms;
181
struct file *fp;
182
struct proc *p;
183
off_t pos;
184
vm_offset_t addr, orig_addr;
185
vm_size_t len, pageoff, size;
186
vm_prot_t cap_maxprot;
187
int align, error, fd, flags, max_prot, prot;
188
cap_rights_t rights;
189
mmap_check_fp_fn check_fp_fn;
190
191
orig_addr = addr = mrp->mr_hint;
192
len = mrp->mr_len;
193
prot = mrp->mr_prot;
194
flags = mrp->mr_flags;
195
fd = mrp->mr_fd;
196
pos = mrp->mr_pos;
197
check_fp_fn = mrp->mr_check_fp_fn;
198
199
if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) {
200
return (EXTERROR(EINVAL, "unknown PROT bits %#jx", prot));
201
}
202
max_prot = PROT_MAX_EXTRACT(prot);
203
prot = PROT_EXTRACT(prot);
204
if (max_prot != 0 && (max_prot & prot) != prot) {
205
return (EXTERROR(ENOTSUP,
206
"prot %#jx is not subset of max_prot %#jx",
207
prot, max_prot));
208
}
209
210
p = td->td_proc;
211
212
/*
213
* Always honor PROT_MAX if set. If not, default to all
214
* permissions unless we're implying maximum permissions.
215
*/
216
if (max_prot == 0)
217
max_prot = kern_mmap_maxprot(p, prot);
218
219
vms = p->p_vmspace;
220
fp = NULL;
221
AUDIT_ARG_FD(fd);
222
223
/*
224
* Ignore old flags that used to be defined but did not do anything.
225
*/
226
flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
227
228
/*
229
* Enforce the constraints.
230
* Mapping of length 0 is only allowed for old binaries.
231
* Anonymous mapping shall specify -1 as the file descriptor and
232
* zero position for new code. Be nice to ancient a.out
233
* binaries and correct pos for anonymous mapping, since old
234
* ld.so sometimes issues anonymous map requests with non-zero
235
* pos.
236
*/
237
if (!SV_CURPROC_FLAG(SV_AOUT)) {
238
if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
239
((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) {
240
return (EXTERROR(EINVAL,
241
"offset %#jd not zero/fd %#jd not -1 for MAP_ANON",
242
fd, pos));
243
}
244
} else {
245
if ((flags & MAP_ANON) != 0)
246
pos = 0;
247
}
248
249
if (flags & MAP_STACK) {
250
if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) !=
251
(PROT_READ | PROT_WRITE))) {
252
return (EXTERROR(EINVAL,
253
"MAP_STACK with prot %#jx < rw", prot));
254
}
255
flags |= MAP_ANON;
256
pos = 0;
257
}
258
if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
259
MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
260
MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT |
261
MAP_ALIGNMENT_MASK)) != 0) {
262
return (EXTERROR(EINVAL, "reserved flag set (flags %#jx)",
263
flags));
264
}
265
if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) {
266
return (EXTERROR(EINVAL, "EXCL without FIXED (flags %#jx)",
267
flags));
268
}
269
if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED |
270
MAP_PRIVATE)) {
271
return (EXTERROR(EINVAL,
272
"both SHARED and PRIVATE set (flags %#jx)", flags));
273
}
274
if (prot != PROT_NONE &&
275
(prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) {
276
return (EXTERROR(EINVAL, "invalid prot %#jx", prot));
277
}
278
if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
279
pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
280
MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)) {
281
return (EXTERROR(EINVAL, "GUARD with wrong parameters"));
282
}
283
284
/*
285
* Align the file position to a page boundary,
286
* and save its page offset component.
287
*/
288
pageoff = (pos & PAGE_MASK);
289
pos -= pageoff;
290
291
/* Compute size from len by rounding (on both ends). */
292
size = len + pageoff; /* low end... */
293
size = round_page(size); /* hi end */
294
/* Check for rounding up to zero. */
295
if (len > size)
296
return (ENOMEM);
297
298
/* Ensure alignment is at least a page and fits in a pointer. */
299
align = flags & MAP_ALIGNMENT_MASK;
300
if (align != 0 && align != MAP_ALIGNED_SUPER &&
301
(align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
302
align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) {
303
return (EXTERROR(EINVAL, "bad alignment %#jx", align));
304
}
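/*
 * For example, MAP_ALIGNED(21) requests a 2 MiB (1 << 21) aligned
 * mapping.  The encoded log2 alignment must be at least PAGE_SHIFT and
 * smaller than the pointer width in bits; MAP_ALIGNED_SUPER instead asks
 * for an alignment suitable for superpages (VMFS_SUPER_SPACE).
 */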
305
306
/*
307
* Check for illegal addresses. Watch out for address wrap... Note
308
* that VM_*_ADDRESS are not constants due to casts (argh).
309
*/
310
if (flags & MAP_FIXED) {
311
/*
312
* The specified address must have the same remainder
313
* as the file offset taken modulo PAGE_SIZE, so it
314
* should be aligned after adjustment by pageoff.
315
*/
316
addr -= pageoff;
317
if ((addr & PAGE_MASK) != 0) {
318
return (EXTERROR(EINVAL,
319
"fixed mapping at %#jx not aligned", addr));
320
}
321
322
/* Address range must be all in user VM space. */
323
if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) {
324
EXTERROR(EINVAL, "mapping outside vm_map");
325
return (EINVAL);
326
}
327
if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) {
328
return (EXTERROR(EINVAL,
329
"fixed 32bit mapping of [%#jx %#jx] does not fit into 4G",
330
addr, addr + size));
331
}
332
} else if (flags & MAP_32BIT) {
333
/*
334
* For MAP_32BIT, override the hint if it is too high and
335
* do not bother moving the mapping past the heap (since
336
* the heap is usually above 2GB).
337
*/
338
if (addr + size > MAP_32BIT_MAX_ADDR)
339
addr = 0;
340
} else {
341
/*
342
* XXX for non-fixed mappings where no hint is provided or
343
* the hint would fall in the potential heap space,
344
* place it after the end of the largest possible heap.
345
*
346
* For anonymous mappings within the address space of the
347
* calling process, the absence of a hint is handled at a
348
* lower level in order to implement different clustering
349
* strategies for ASLR.
350
*/
351
if (((flags & MAP_ANON) == 0 && addr == 0) ||
352
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
353
addr < round_page((vm_offset_t)vms->vm_daddr +
354
lim_max(td, RLIMIT_DATA))))
355
addr = round_page((vm_offset_t)vms->vm_daddr +
356
lim_max(td, RLIMIT_DATA));
357
}
358
if (len == 0) {
359
/*
360
* Return success without mapping anything for old
361
* binaries that request a page-aligned mapping of
362
* length 0. For modern binaries, this function
363
* returns an error earlier.
364
*/
365
error = 0;
366
} else if ((flags & MAP_GUARD) != 0) {
367
error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
368
VM_PROT_NONE, flags, NULL, pos, FALSE, td);
369
} else if ((flags & MAP_ANON) != 0) {
370
/*
371
* Mapping blank space is trivial.
372
*
373
* This relies on VM_PROT_* matching PROT_*.
374
*/
375
error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
376
max_prot, flags, NULL, pos, FALSE, td);
377
} else {
378
/*
379
* Mapping file, get fp for validation and don't let the
380
* descriptor disappear on us if we block. Check capability
381
* rights, but also return the maximum rights to be combined
382
* with maxprot later.
383
*/
384
cap_rights_init_one(&rights, CAP_MMAP);
385
if (prot & PROT_READ)
386
cap_rights_set_one(&rights, CAP_MMAP_R);
387
if ((flags & MAP_SHARED) != 0) {
388
if (prot & PROT_WRITE)
389
cap_rights_set_one(&rights, CAP_MMAP_W);
390
}
391
if (prot & PROT_EXEC)
392
cap_rights_set_one(&rights, CAP_MMAP_X);
393
error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
394
if (error != 0)
395
goto done;
396
if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
397
p->p_osrel >= P_OSREL_MAP_FSTRICT) {
398
EXTERROR(EINVAL, "neither SHARED nor PRIVATE req");
399
error = EINVAL;
400
goto done;
401
}
402
if (check_fp_fn != NULL) {
403
error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
404
flags);
405
if (error != 0)
406
goto done;
407
}
408
if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
409
addr = orig_addr;
410
/* This relies on VM_PROT_* matching PROT_*. */
411
error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
412
max_prot & cap_maxprot, flags, pos, td);
413
}
414
415
if (error == 0)
416
td->td_retval[0] = addr + pageoff;
417
done:
418
if (fp)
419
fdrop(fp, td);
420
421
return (error);
422
}
423
424
#if defined(COMPAT_FREEBSD6)
425
int
426
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
427
{
428
return (kern_mmap(td, &(struct mmap_req){
429
.mr_hint = (uintptr_t)uap->addr,
430
.mr_len = uap->len,
431
.mr_prot = uap->prot,
432
.mr_flags = uap->flags,
433
.mr_fd = uap->fd,
434
.mr_pos = uap->pos,
435
}));
436
}
437
#endif
438
439
#ifdef COMPAT_43
440
#ifndef _SYS_SYSPROTO_H_
441
struct ommap_args {
442
caddr_t addr;
443
int len;
444
int prot;
445
int flags;
446
int fd;
447
long pos;
448
};
449
#endif
450
int
451
ommap(struct thread *td, struct ommap_args *uap)
452
{
453
return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
454
uap->flags, uap->fd, uap->pos));
455
}
456
457
int
458
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
459
int oflags, int fd, long pos)
460
{
461
static const char cvtbsdprot[8] = {
462
0,
463
PROT_EXEC,
464
PROT_WRITE,
465
PROT_EXEC | PROT_WRITE,
466
PROT_READ,
467
PROT_EXEC | PROT_READ,
468
PROT_WRITE | PROT_READ,
469
PROT_EXEC | PROT_WRITE | PROT_READ,
470
};
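/*
 * cvtbsdprot[] converts the historic 4.3BSD protection encoding, in
 * which bit 0 is execute, bit 1 is write and bit 2 is read, into the
 * modern PROT_* values; it is indexed by the low three bits of oprot.
 */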
471
int flags, prot;
472
473
if (len < 0)
474
return (EINVAL);
475
476
#define OMAP_ANON 0x0002
477
#define OMAP_COPY 0x0020
478
#define OMAP_SHARED 0x0010
479
#define OMAP_FIXED 0x0100
480
481
prot = cvtbsdprot[oprot & 0x7];
482
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
483
if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
484
prot != 0)
485
prot |= PROT_EXEC;
486
#endif
487
flags = 0;
488
if (oflags & OMAP_ANON)
489
flags |= MAP_ANON;
490
if (oflags & OMAP_COPY)
491
flags |= MAP_COPY;
492
if (oflags & OMAP_SHARED)
493
flags |= MAP_SHARED;
494
else
495
flags |= MAP_PRIVATE;
496
if (oflags & OMAP_FIXED)
497
flags |= MAP_FIXED;
498
return (kern_mmap(td, &(struct mmap_req){
499
.mr_hint = hint,
500
.mr_len = len,
501
.mr_prot = prot,
502
.mr_flags = flags,
503
.mr_fd = fd,
504
.mr_pos = pos,
505
}));
506
}
507
#endif /* COMPAT_43 */
508
509
#ifndef _SYS_SYSPROTO_H_
510
struct msync_args {
511
void *addr;
512
size_t len;
513
int flags;
514
};
515
#endif
516
int
517
sys_msync(struct thread *td, struct msync_args *uap)
518
{
519
520
return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
521
}
522
523
int
524
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
525
{
526
vm_offset_t addr;
527
vm_size_t pageoff;
528
vm_map_t map;
529
int rv;
530
531
addr = addr0;
532
pageoff = (addr & PAGE_MASK);
533
addr -= pageoff;
534
size += pageoff;
535
size = (vm_size_t) round_page(size);
536
if (addr + size < addr)
537
return (EINVAL);
538
539
if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
540
return (EINVAL);
541
542
map = &td->td_proc->p_vmspace->vm_map;
543
544
/*
545
* Clean the pages and interpret the return value.
546
*/
547
rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
548
(flags & MS_INVALIDATE) != 0);
549
switch (rv) {
550
case KERN_SUCCESS:
551
return (0);
552
case KERN_INVALID_ADDRESS:
553
return (ENOMEM);
554
case KERN_INVALID_ARGUMENT:
555
return (EBUSY);
556
case KERN_FAILURE:
557
return (EIO);
558
default:
559
return (EINVAL);
560
}
561
}
562
563
#ifndef _SYS_SYSPROTO_H_
564
struct munmap_args {
565
void *addr;
566
size_t len;
567
};
568
#endif
569
int
570
sys_munmap(struct thread *td, struct munmap_args *uap)
571
{
572
573
return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
574
}
575
576
int
577
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
578
{
579
#ifdef HWPMC_HOOKS
580
struct pmckern_map_out pkm;
581
vm_map_entry_t entry;
582
bool pmc_handled;
583
#endif
584
vm_offset_t addr, end;
585
vm_size_t pageoff;
586
vm_map_t map;
587
int rv;
588
589
if (size == 0)
590
return (EINVAL);
591
592
addr = addr0;
593
pageoff = (addr & PAGE_MASK);
594
addr -= pageoff;
595
size += pageoff;
596
size = (vm_size_t) round_page(size);
597
end = addr + size;
598
map = &td->td_proc->p_vmspace->vm_map;
599
if (!vm_map_range_valid(map, addr, end))
600
return (EINVAL);
601
602
vm_map_lock(map);
603
#ifdef HWPMC_HOOKS
604
pmc_handled = false;
605
if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
606
pmc_handled = true;
607
/*
608
* Inform hwpmc if the address range being unmapped contains
609
* an executable region.
610
*/
611
pkm.pm_address = (uintptr_t) NULL;
612
if (vm_map_lookup_entry(map, addr, &entry)) {
613
for (; entry->start < end;
614
entry = vm_map_entry_succ(entry)) {
615
if (vm_map_check_protection(map, entry->start,
616
entry->end, VM_PROT_EXECUTE) == TRUE) {
617
pkm.pm_address = (uintptr_t) addr;
618
pkm.pm_size = (size_t) size;
619
break;
620
}
621
}
622
}
623
}
624
#endif
625
rv = vm_map_delete(map, addr, end);
626
627
#ifdef HWT_HOOKS
628
if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) {
629
struct hwt_record_entry ent;
630
631
ent.addr = (uintptr_t) addr;
632
ent.fullpath = NULL;
633
ent.record_type = HWT_RECORD_MUNMAP;
634
HWT_CALL_HOOK(td, HWT_RECORD, &ent);
635
}
636
#endif
637
638
#ifdef HWPMC_HOOKS
639
if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
640
/* downgrade the lock to prevent a LOR with the pmc-sx lock */
641
vm_map_lock_downgrade(map);
642
if (pkm.pm_address != (uintptr_t) NULL)
643
PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
644
vm_map_unlock_read(map);
645
} else
646
#endif
647
vm_map_unlock(map);
648
649
return (vm_mmap_to_errno(rv));
650
}
651
652
#ifndef _SYS_SYSPROTO_H_
653
struct mprotect_args {
654
const void *addr;
655
size_t len;
656
int prot;
657
};
658
#endif
659
int
660
sys_mprotect(struct thread *td, struct mprotect_args *uap)
661
{
662
663
return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
664
uap->prot, 0));
665
}
666
667
int
668
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
669
int flags)
670
{
671
vm_offset_t addr;
672
vm_size_t pageoff;
673
int vm_error, max_prot;
674
675
addr = addr0;
676
if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
677
return (EINVAL);
678
max_prot = PROT_MAX_EXTRACT(prot);
679
prot = PROT_EXTRACT(prot);
680
pageoff = (addr & PAGE_MASK);
681
addr -= pageoff;
682
size += pageoff;
683
size = (vm_size_t) round_page(size);
684
#ifdef COMPAT_FREEBSD32
685
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
686
if (((addr + size) & 0xffffffff) < addr)
687
return (EINVAL);
688
} else
689
#endif
690
if (addr + size < addr)
691
return (EINVAL);
692
693
flags |= VM_MAP_PROTECT_SET_PROT;
694
if (max_prot != 0)
695
flags |= VM_MAP_PROTECT_SET_MAXPROT;
696
vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
697
addr, addr + size, prot, max_prot, flags);
698
699
switch (vm_error) {
700
case KERN_SUCCESS:
701
return (0);
702
case KERN_PROTECTION_FAILURE:
703
return (EACCES);
704
case KERN_RESOURCE_SHORTAGE:
705
return (ENOMEM);
706
case KERN_OUT_OF_BOUNDS:
707
return (ENOTSUP);
708
}
709
return (EINVAL);
710
}
711
712
#ifndef _SYS_SYSPROTO_H_
713
struct minherit_args {
714
void *addr;
715
size_t len;
716
int inherit;
717
};
718
#endif
719
int
720
sys_minherit(struct thread *td, struct minherit_args *uap)
721
{
722
723
return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
724
uap->inherit));
725
}
726
727
int
728
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
729
{
730
vm_offset_t addr;
731
vm_size_t size, pageoff;
732
vm_inherit_t inherit;
733
734
addr = (vm_offset_t)addr0;
735
size = len;
736
inherit = inherit0;
737
738
pageoff = (addr & PAGE_MASK);
739
addr -= pageoff;
740
size += pageoff;
741
size = (vm_size_t) round_page(size);
742
if (addr + size < addr)
743
return (EINVAL);
744
745
switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
746
addr + size, inherit)) {
747
case KERN_SUCCESS:
748
return (0);
749
case KERN_PROTECTION_FAILURE:
750
return (EACCES);
751
}
752
return (EINVAL);
753
}
754
755
#ifndef _SYS_SYSPROTO_H_
756
struct madvise_args {
757
void *addr;
758
size_t len;
759
int behav;
760
};
761
#endif
762
763
int
764
sys_madvise(struct thread *td, struct madvise_args *uap)
765
{
766
767
return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
768
}
769
770
int
771
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
772
{
773
vm_map_t map;
774
vm_offset_t addr, end, start;
775
int flags;
776
777
/*
778
* Check for our special case, advising the swap pager we are
779
* "immortal."
780
*/
781
if (behav == MADV_PROTECT) {
782
flags = PPROT_SET;
783
return (kern_procctl(td, P_PID, td->td_proc->p_pid,
784
PROC_SPROTECT, &flags));
785
}
786
787
/*
788
* Check for illegal addresses. Watch out for address wrap... Note
789
* that VM_*_ADDRESS are not constants due to casts (argh).
790
*/
791
map = &td->td_proc->p_vmspace->vm_map;
792
addr = addr0;
793
if (!vm_map_range_valid(map, addr, addr + len))
794
return (EINVAL);
795
796
/*
797
* Since this routine is only advisory, we default to conservative
798
* behavior.
799
*/
800
start = trunc_page(addr);
801
end = round_page(addr + len);
802
803
/*
804
* vm_map_madvise() checks for illegal values of behav.
805
*/
806
return (vm_map_madvise(map, start, end, behav));
807
}
808
809
#ifndef _SYS_SYSPROTO_H_
810
struct mincore_args {
811
const void *addr;
812
size_t len;
813
char *vec;
814
};
815
#endif
816
817
int
818
sys_mincore(struct thread *td, struct mincore_args *uap)
819
{
820
821
return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
822
}
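/*
 * Each byte of the user-supplied vector describes one page of the
 * request: MINCORE_INCORE reports an in-core page (subject to the
 * vm.mincore_mapped knob above), while the MINCORE_MODIFIED_OTHER /
 * MINCORE_REFERENCED_OTHER bits added below report modifications or
 * references made through any mapping, not just this pmap's.  A zero
 * byte means none of this applies.
 */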
823
824
int
825
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
826
{
827
pmap_t pmap;
828
vm_map_t map;
829
vm_map_entry_t current, entry;
830
vm_object_t object;
831
vm_offset_t addr, cend, end, first_addr;
832
vm_paddr_t pa;
833
vm_page_t m;
834
vm_pindex_t pindex;
835
int error, lastvecindex, mincoreinfo, vecindex;
836
unsigned int timestamp;
837
838
/*
839
* Make sure that the addresses presented are valid for user
840
* mode.
841
*/
842
first_addr = addr = trunc_page(addr0);
843
end = round_page(addr0 + len);
844
map = &td->td_proc->p_vmspace->vm_map;
845
if (end > vm_map_max(map) || end < addr)
846
return (ENOMEM);
847
848
pmap = vmspace_pmap(td->td_proc->p_vmspace);
849
850
vm_map_lock_read(map);
851
RestartScan:
852
timestamp = map->timestamp;
853
854
if (!vm_map_lookup_entry(map, addr, &entry)) {
855
vm_map_unlock_read(map);
856
return (ENOMEM);
857
}
858
859
/*
860
* Do this on a map entry basis so that if the pages are not
861
* in the current process's address space, we can easily look
862
* up the pages elsewhere.
863
*/
864
lastvecindex = -1;
865
while (entry->start < end) {
866
/*
867
* check for contiguity
868
*/
869
current = entry;
870
entry = vm_map_entry_succ(current);
871
if (current->end < end &&
872
entry->start > current->end) {
873
vm_map_unlock_read(map);
874
return (ENOMEM);
875
}
876
877
/*
878
* ignore submaps (for now) or null objects
879
*/
880
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
881
current->object.vm_object == NULL)
882
continue;
883
884
/*
885
* limit this scan to the current map entry and the
886
* limits for the mincore call
887
*/
888
if (addr < current->start)
889
addr = current->start;
890
cend = current->end;
891
if (cend > end)
892
cend = end;
893
894
for (; addr < cend; addr += PAGE_SIZE) {
895
/*
896
* Check pmap first; it is likely faster, and also
897
* it can provide info as to whether we are the
898
* one referencing or modifying the page.
899
*/
900
m = NULL;
901
object = NULL;
902
retry:
903
pa = 0;
904
mincoreinfo = pmap_mincore(pmap, addr, &pa);
905
if (mincore_mapped) {
906
/*
907
* We only care about this pmap's
908
* mapping of the page, if any.
909
*/
910
;
911
} else if (pa != 0) {
912
/*
913
* The page is mapped by this process but not
914
* both accessed and modified. It is also
915
* managed. Acquire the object lock so that
916
* other mappings might be examined. The page's
917
* identity may change at any point before its
918
* object lock is acquired, so re-validate if
919
* necessary.
920
*/
921
m = PHYS_TO_VM_PAGE(pa);
922
while (object == NULL || m->object != object) {
923
if (object != NULL)
924
VM_OBJECT_WUNLOCK(object);
925
object = atomic_load_ptr(&m->object);
926
if (object == NULL)
927
goto retry;
928
VM_OBJECT_WLOCK(object);
929
}
930
if (pa != pmap_extract(pmap, addr))
931
goto retry;
932
KASSERT(vm_page_all_valid(m),
933
("mincore: page %p is mapped but invalid",
934
m));
935
} else if (mincoreinfo == 0) {
936
/*
937
* The page is not mapped by this process. If
938
* the object implements managed pages, then
939
* determine if the page is resident so that
940
* the mappings might be examined.
941
*/
942
if (current->object.vm_object != object) {
943
if (object != NULL)
944
VM_OBJECT_WUNLOCK(object);
945
object = current->object.vm_object;
946
VM_OBJECT_WLOCK(object);
947
}
948
if ((object->flags & OBJ_SWAP) != 0 ||
949
object->type == OBJT_VNODE) {
950
pindex = OFF_TO_IDX(current->offset +
951
(addr - current->start));
952
m = vm_page_lookup(object, pindex);
953
if (m != NULL && vm_page_none_valid(m))
954
m = NULL;
955
if (m != NULL)
956
mincoreinfo = MINCORE_INCORE;
957
}
958
}
959
if (m != NULL) {
960
VM_OBJECT_ASSERT_WLOCKED(m->object);
961
962
/* Examine other mappings of the page. */
963
if (m->dirty == 0 && pmap_is_modified(m))
964
vm_page_dirty(m);
965
if (m->dirty != 0)
966
mincoreinfo |= MINCORE_MODIFIED_OTHER;
967
968
/*
969
* The first test for PGA_REFERENCED is an
970
* optimization. The second test is
971
* required because a concurrent pmap
972
* operation could clear the last reference
973
* and set PGA_REFERENCED before the call to
974
* pmap_is_referenced().
975
*/
976
if ((m->a.flags & PGA_REFERENCED) != 0 ||
977
pmap_is_referenced(m) ||
978
(m->a.flags & PGA_REFERENCED) != 0)
979
mincoreinfo |= MINCORE_REFERENCED_OTHER;
980
}
981
if (object != NULL)
982
VM_OBJECT_WUNLOCK(object);
983
984
/*
985
* subyte may page fault. In case it needs to modify
986
* the map, we release the lock.
987
*/
988
vm_map_unlock_read(map);
989
990
/*
991
* calculate index into user supplied byte vector
992
*/
993
vecindex = atop(addr - first_addr);
994
995
/*
996
* If we have skipped map entries, we need to make sure that
997
* the byte vector is zeroed for those skipped entries.
998
*/
999
while ((lastvecindex + 1) < vecindex) {
1000
++lastvecindex;
1001
error = subyte(vec + lastvecindex, 0);
1002
if (error) {
1003
error = EFAULT;
1004
goto done2;
1005
}
1006
}
1007
1008
/*
1009
* Pass the page information to the user
1010
*/
1011
error = subyte(vec + vecindex, mincoreinfo);
1012
if (error) {
1013
error = EFAULT;
1014
goto done2;
1015
}
1016
1017
/*
1018
* If the map has changed due to the subyte, the previous
1019
* output may be invalid.
1020
*/
1021
vm_map_lock_read(map);
1022
if (timestamp != map->timestamp)
1023
goto RestartScan;
1024
1025
lastvecindex = vecindex;
1026
}
1027
}
1028
1029
/*
1030
* subyte may page fault. In case it needs to modify
1031
* the map, we release the lock.
1032
*/
1033
vm_map_unlock_read(map);
1034
1035
/*
1036
* Zero the last entries in the byte vector.
1037
*/
1038
vecindex = atop(end - first_addr);
1039
while ((lastvecindex + 1) < vecindex) {
1040
++lastvecindex;
1041
error = subyte(vec + lastvecindex, 0);
1042
if (error) {
1043
error = EFAULT;
1044
goto done2;
1045
}
1046
}
1047
1048
/*
1049
* If the map has changed due to the subyte, the previous
1050
* output may be invalid.
1051
*/
1052
vm_map_lock_read(map);
1053
if (timestamp != map->timestamp)
1054
goto RestartScan;
1055
vm_map_unlock_read(map);
1056
done2:
1057
return (error);
1058
}
1059
1060
#ifndef _SYS_SYSPROTO_H_
1061
struct mlock_args {
1062
const void *addr;
1063
size_t len;
1064
};
1065
#endif
1066
int
1067
sys_mlock(struct thread *td, struct mlock_args *uap)
1068
{
1069
1070
return (kern_mlock(td->td_proc, td->td_ucred,
1071
__DECONST(uintptr_t, uap->addr), uap->len));
1072
}
1073
1074
int
1075
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
1076
{
1077
vm_offset_t addr, end, last, start;
1078
vm_size_t npages, size;
1079
vm_map_t map;
1080
unsigned long nsize;
1081
int error;
1082
1083
error = priv_check_cred(cred, PRIV_VM_MLOCK);
1084
if (error)
1085
return (error);
1086
addr = addr0;
1087
size = len;
1088
last = addr + size;
1089
start = trunc_page(addr);
1090
end = round_page(last);
1091
if (last < addr || end < addr)
1092
return (EINVAL);
1093
npages = atop(end - start);
1094
if (npages > vm_page_max_user_wired)
1095
return (ENOMEM);
1096
map = &proc->p_vmspace->vm_map;
1097
PROC_LOCK(proc);
1098
nsize = ptoa(npages + pmap_wired_count(map->pmap));
1099
if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1100
PROC_UNLOCK(proc);
1101
return (ENOMEM);
1102
}
1103
PROC_UNLOCK(proc);
1104
#ifdef RACCT
1105
if (racct_enable) {
1106
PROC_LOCK(proc);
1107
error = racct_set(proc, RACCT_MEMLOCK, nsize);
1108
PROC_UNLOCK(proc);
1109
if (error != 0)
1110
return (ENOMEM);
1111
}
1112
#endif
1113
error = vm_map_wire(map, start, end,
1114
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1115
#ifdef RACCT
1116
if (racct_enable && error != KERN_SUCCESS) {
1117
PROC_LOCK(proc);
1118
racct_set(proc, RACCT_MEMLOCK,
1119
ptoa(pmap_wired_count(map->pmap)));
1120
PROC_UNLOCK(proc);
1121
}
1122
#endif
1123
switch (error) {
1124
case KERN_SUCCESS:
1125
return (0);
1126
case KERN_INVALID_ARGUMENT:
1127
return (EINVAL);
1128
default:
1129
return (ENOMEM);
1130
}
1131
}
1132
1133
#ifndef _SYS_SYSPROTO_H_
1134
struct mlockall_args {
1135
int how;
1136
};
1137
#endif
1138
1139
int
1140
sys_mlockall(struct thread *td, struct mlockall_args *uap)
1141
{
1142
vm_map_t map;
1143
int error;
1144
1145
map = &td->td_proc->p_vmspace->vm_map;
1146
error = priv_check(td, PRIV_VM_MLOCK);
1147
if (error)
1148
return (error);
1149
1150
if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1151
return (EINVAL);
1152
1153
/*
1154
* If wiring all pages in the process would cause it to exceed
1155
* a hard resource limit, return ENOMEM.
1156
*/
1157
if (!old_mlock && uap->how & MCL_CURRENT) {
1158
if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
1159
return (ENOMEM);
1160
}
1161
#ifdef RACCT
1162
if (racct_enable) {
1163
PROC_LOCK(td->td_proc);
1164
error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1165
PROC_UNLOCK(td->td_proc);
1166
if (error != 0)
1167
return (ENOMEM);
1168
}
1169
#endif
1170
1171
if (uap->how & MCL_FUTURE) {
1172
vm_map_lock(map);
1173
vm_map_modflags(map, MAP_WIREFUTURE, 0);
1174
vm_map_unlock(map);
1175
error = 0;
1176
}
1177
1178
if (uap->how & MCL_CURRENT) {
1179
/*
1180
* P1003.1-2001 mandates that all currently mapped pages
1181
* will be memory resident and locked (wired) upon return
1182
* from mlockall(). vm_map_wire() will wire pages, by
1183
* calling vm_fault_wire() for each page in the region.
1184
*/
1185
error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1186
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1187
if (error == KERN_SUCCESS)
1188
error = 0;
1189
else if (error == KERN_RESOURCE_SHORTAGE)
1190
error = ENOMEM;
1191
else
1192
error = EAGAIN;
1193
}
1194
#ifdef RACCT
1195
if (racct_enable && error != KERN_SUCCESS) {
1196
PROC_LOCK(td->td_proc);
1197
racct_set(td->td_proc, RACCT_MEMLOCK,
1198
ptoa(pmap_wired_count(map->pmap)));
1199
PROC_UNLOCK(td->td_proc);
1200
}
1201
#endif
1202
1203
return (error);
1204
}
1205
1206
#ifndef _SYS_SYSPROTO_H_
1207
struct munlockall_args {
1208
register_t dummy;
1209
};
1210
#endif
1211
1212
int
1213
sys_munlockall(struct thread *td, struct munlockall_args *uap)
1214
{
1215
vm_map_t map;
1216
int error;
1217
1218
map = &td->td_proc->p_vmspace->vm_map;
1219
error = priv_check(td, PRIV_VM_MUNLOCK);
1220
if (error)
1221
return (error);
1222
1223
/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1224
vm_map_lock(map);
1225
vm_map_modflags(map, 0, MAP_WIREFUTURE);
1226
vm_map_unlock(map);
1227
1228
/* Forcibly unwire all pages. */
1229
error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1230
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1231
#ifdef RACCT
1232
if (racct_enable && error == KERN_SUCCESS) {
1233
PROC_LOCK(td->td_proc);
1234
racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1235
PROC_UNLOCK(td->td_proc);
1236
}
1237
#endif
1238
1239
return (error);
1240
}
1241
1242
#ifndef _SYS_SYSPROTO_H_
1243
struct munlock_args {
1244
const void *addr;
1245
size_t len;
1246
};
1247
#endif
1248
int
1249
sys_munlock(struct thread *td, struct munlock_args *uap)
1250
{
1251
1252
return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
1253
}
1254
1255
int
1256
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
1257
{
1258
vm_offset_t addr, end, last, start;
1259
#ifdef RACCT
1260
vm_map_t map;
1261
#endif
1262
int error;
1263
1264
error = priv_check(td, PRIV_VM_MUNLOCK);
1265
if (error)
1266
return (error);
1267
addr = addr0;
1268
last = addr + size;
1269
start = trunc_page(addr);
1270
end = round_page(last);
1271
if (last < addr || end < addr)
1272
return (EINVAL);
1273
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1274
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1275
#ifdef RACCT
1276
if (racct_enable && error == KERN_SUCCESS) {
1277
PROC_LOCK(td->td_proc);
1278
map = &td->td_proc->p_vmspace->vm_map;
1279
racct_set(td->td_proc, RACCT_MEMLOCK,
1280
ptoa(pmap_wired_count(map->pmap)));
1281
PROC_UNLOCK(td->td_proc);
1282
}
1283
#endif
1284
return (error == KERN_SUCCESS ? 0 : ENOMEM);
1285
}
1286
1287
/*
1288
* vm_mmap_vnode()
1289
*
1290
* Helper function for vm_mmap. Perform sanity checks specific to mmap
1291
* operations on vnodes.
1292
*/
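/*
 * On success the caller receives a referenced VM object in *objp.  If
 * the request is MAP_SHARED and the maximum protection allows writing,
 * *writecounted is set and the vnode's write count has been bumped via
 * vm_pager_update_writecount(); vm_mmap_object() later turns this into
 * MAP_WRITECOUNT on the map entry.
 */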
1293
int
1294
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1295
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1296
struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1297
boolean_t *writecounted)
1298
{
1299
struct vattr va;
1300
vm_object_t obj;
1301
vm_ooffset_t foff;
1302
struct ucred *cred;
1303
int error, flags;
1304
bool writex;
1305
1306
cred = td->td_ucred;
1307
writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
1308
(*flagsp & MAP_SHARED) != 0;
1309
if ((error = vget(vp, LK_SHARED)) != 0)
1310
return (error);
1311
AUDIT_ARG_VNODE1(vp);
1312
foff = *foffp;
1313
flags = *flagsp;
1314
obj = vp->v_object;
1315
if (vp->v_type == VREG) {
1316
/*
1317
* Get the proper underlying object
1318
*/
1319
if (obj == NULL) {
1320
error = EINVAL;
1321
goto done;
1322
}
1323
if (obj->type == OBJT_VNODE && obj->handle != vp) {
1324
vput(vp);
1325
vp = (struct vnode *)obj->handle;
1326
/*
1327
* Bypass filesystems obey the mpsafety of the
1328
* underlying fs. Tmpfs never bypasses.
1329
*/
1330
error = vget(vp, LK_SHARED);
1331
if (error != 0)
1332
return (error);
1333
}
1334
if (writex) {
1335
*writecounted = TRUE;
1336
vm_pager_update_writecount(obj, 0, objsize);
1337
}
1338
} else {
1339
error = EXTERROR(EINVAL, "non-reg file");
1340
goto done;
1341
}
1342
if ((error = VOP_GETATTR(vp, &va, cred)))
1343
goto done;
1344
#ifdef MAC
1345
/* This relies on VM_PROT_* matching PROT_*. */
1346
error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1347
if (error != 0)
1348
goto done;
1349
#endif
1350
if ((flags & MAP_SHARED) != 0) {
1351
if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1352
if (prot & VM_PROT_WRITE) {
1353
error = EPERM;
1354
goto done;
1355
}
1356
*maxprotp &= ~VM_PROT_WRITE;
1357
}
1358
}
1359
/*
1360
* If it is a regular file without any references
1361
* we do not need to sync it.
1362
* Adjust object size to be the size of the actual file.
1363
*/
1364
objsize = round_page(va.va_size);
1365
if (va.va_nlink == 0)
1366
flags |= MAP_NOSYNC;
1367
if (obj->type == OBJT_VNODE) {
1368
obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1369
cred);
1370
if (obj == NULL) {
1371
error = ENOMEM;
1372
goto done;
1373
}
1374
} else {
1375
KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
1376
vm_object_reference(obj);
1377
#if VM_NRESERVLEVEL > 0
1378
if ((obj->flags & OBJ_COLORED) == 0) {
1379
VM_OBJECT_WLOCK(obj);
1380
vm_object_color(obj, 0);
1381
VM_OBJECT_WUNLOCK(obj);
1382
}
1383
#endif
1384
}
1385
*objp = obj;
1386
*flagsp = flags;
1387
1388
VOP_MMAPPED(vp);
1389
1390
done:
1391
if (error != 0 && *writecounted) {
1392
*writecounted = FALSE;
1393
vm_pager_update_writecount(obj, objsize, 0);
1394
}
1395
vput(vp);
1396
return (error);
1397
}
1398
1399
/*
1400
* vm_mmap_cdev()
1401
*
1402
* Helper function for vm_mmap. Perform sanity checks specific to mmap
1403
* operations on cdevs.
1404
*/
1405
int
1406
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1407
vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1408
vm_ooffset_t *foff, vm_object_t *objp)
1409
{
1410
vm_object_t obj;
1411
int error, flags;
1412
1413
flags = *flagsp;
1414
1415
if (dsw->d_flags & D_MMAP_ANON) {
1416
*objp = NULL;
1417
*foff = 0;
1418
*maxprotp = VM_PROT_ALL;
1419
*flagsp |= MAP_ANON;
1420
return (0);
1421
}
1422
1423
/*
1424
* cdevs do not provide private mappings of any kind.
1425
*/
1426
if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1427
(prot & VM_PROT_WRITE) != 0)
1428
return (EACCES);
1429
if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) {
1430
return (EXTERROR(EINVAL, "cdev mapping must be shared"));
1431
}
1432
1433
/*
1434
* Force device mappings to be shared.
1435
*/
1436
flags |= MAP_SHARED;
1437
#ifdef MAC_XXX
1438
error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1439
if (error != 0)
1440
return (error);
1441
#endif
1442
/*
1443
* First, try d_mmap_single(). If that is not implemented
1444
* (returns ENODEV), fall back to using the device pager.
1445
* Note that d_mmap_single() must return a reference to the
1446
* object (it needs to bump the reference count of the object
1447
* it returns somehow).
1448
*
1449
* XXX assumes VM_PROT_* == PROT_*
1450
*/
1451
error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1452
if (error != ENODEV)
1453
return (error);
1454
obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1455
td->td_ucred);
1456
if (obj == NULL) {
1457
return (EXTERROR(EINVAL,
1458
"cdev driver does not support mmap"));
1459
}
1460
*objp = obj;
1461
*flagsp = flags;
1462
return (0);
1463
}
1464
1465
int
1466
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1467
vm_prot_t maxprot, int flags,
1468
objtype_t handle_type, void *handle,
1469
vm_ooffset_t foff)
1470
{
1471
vm_object_t object;
1472
struct thread *td = curthread;
1473
int error;
1474
boolean_t writecounted;
1475
1476
if (size == 0) {
1477
return (EXTERROR(EINVAL, "zero-sized req"));
1478
}
1479
1480
size = round_page(size);
1481
object = NULL;
1482
writecounted = FALSE;
1483
1484
switch (handle_type) {
1485
case OBJT_DEVICE: {
1486
struct cdevsw *dsw;
1487
struct cdev *cdev;
1488
int ref;
1489
1490
cdev = handle;
1491
dsw = dev_refthread(cdev, &ref);
1492
if (dsw == NULL)
1493
return (ENXIO);
1494
error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1495
dsw, &foff, &object);
1496
dev_relthread(cdev, ref);
1497
break;
1498
}
1499
case OBJT_VNODE:
1500
error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1501
handle, &foff, &object, &writecounted);
1502
break;
1503
default:
1504
error = EXTERROR(EINVAL, "unsupported backing obj type %jd",
1505
handle_type);
1506
break;
1507
}
1508
if (error)
1509
return (error);
1510
1511
error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1512
foff, writecounted, td);
1513
if (error != 0 && object != NULL) {
1514
/*
1515
* If this mapping was accounted for in the vnode's
1516
* writecount, then undo that now.
1517
*/
1518
if (writecounted)
1519
vm_pager_release_writecount(object, 0, size);
1520
vm_object_deallocate(object);
1521
}
1522
return (error);
1523
}
1524
1525
int
1526
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
1527
{
1528
int error;
1529
1530
RACCT_PROC_LOCK(td->td_proc);
1531
if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
1532
RACCT_PROC_UNLOCK(td->td_proc);
1533
return (ENOMEM);
1534
}
1535
if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1536
RACCT_PROC_UNLOCK(td->td_proc);
1537
return (ENOMEM);
1538
}
1539
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1540
if (ptoa(pmap_wired_count(map->pmap)) + size >
1541
lim_cur(td, RLIMIT_MEMLOCK)) {
1542
racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1543
RACCT_PROC_UNLOCK(td->td_proc);
1544
return (ENOMEM);
1545
}
1546
error = racct_set(td->td_proc, RACCT_MEMLOCK,
1547
ptoa(pmap_wired_count(map->pmap)) + size);
1548
if (error != 0) {
1549
racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1550
RACCT_PROC_UNLOCK(td->td_proc);
1551
return (error);
1552
}
1553
}
1554
RACCT_PROC_UNLOCK(td->td_proc);
1555
return (0);
1556
}
1557
1558
/*
1559
* Internal version of mmap that maps a specific VM object into an
1560
* map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1561
*/
1562
int
1563
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1564
vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1565
boolean_t writecounted, struct thread *td)
1566
{
1567
vm_offset_t default_addr, max_addr;
1568
int docow, error, findspace, rv;
1569
bool curmap, fitit;
1570
1571
curmap = map == &td->td_proc->p_vmspace->vm_map;
1572
if (curmap) {
1573
error = kern_mmap_racct_check(td, map, size);
1574
if (error != 0)
1575
return (error);
1576
}
1577
1578
/*
1579
* We currently can only deal with page aligned file offsets.
1580
* The mmap() system call already enforces this by subtracting
1581
* the page offset from the file offset, but checking here
1582
* catches errors in device drivers (e.g. d_mmap_single()
1583
* callbacks) and other internal mapping requests (such as in
1584
* exec).
1585
*/
1586
if ((foff & PAGE_MASK) != 0) {
1587
return (EXTERROR(EINVAL, "offset %#jx not page-aligned", foff));
1588
}
1589
1590
if ((flags & MAP_FIXED) == 0) {
1591
fitit = true;
1592
*addr = round_page(*addr);
1593
} else {
1594
if (*addr != trunc_page(*addr)) {
1595
return (EXTERROR(EINVAL,
1596
"non-fixed mapping address %#jx not aligned",
1597
*addr));
1598
}
1599
fitit = false;
1600
}
1601
1602
if (flags & MAP_ANON) {
1603
if (object != NULL) {
1604
return (EXTERROR(EINVAL,
1605
"anon mapping backed by an object"));
1606
}
1607
if (foff != 0) {
1608
return (EXTERROR(EINVAL,
1609
"anon mapping with non-zero offset %#jx", foff));
1610
}
1611
docow = 0;
1612
} else if (flags & MAP_PREFAULT_READ)
1613
docow = MAP_PREFAULT;
1614
else
1615
docow = MAP_PREFAULT_PARTIAL;
1616
1617
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1618
docow |= MAP_COPY_ON_WRITE;
1619
if (flags & MAP_NOSYNC)
1620
docow |= MAP_DISABLE_SYNCER;
1621
if (flags & MAP_NOCORE)
1622
docow |= MAP_DISABLE_COREDUMP;
1623
/* Shared memory is also shared with children. */
1624
if (flags & MAP_SHARED)
1625
docow |= MAP_INHERIT_SHARE;
1626
if (writecounted)
1627
docow |= MAP_WRITECOUNT;
1628
if (flags & MAP_STACK) {
1629
if (object != NULL) {
1630
return (EXTERROR(EINVAL,
1631
"stack mapping backed by an object"));
1632
}
1633
docow |= MAP_STACK_AREA;
1634
}
1635
if ((flags & MAP_EXCL) != 0)
1636
docow |= MAP_CHECK_EXCL;
1637
if ((flags & MAP_GUARD) != 0)
1638
docow |= MAP_CREATE_GUARD;
1639
1640
if (fitit) {
1641
if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1642
findspace = VMFS_SUPER_SPACE;
1643
else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1644
findspace = VMFS_ALIGNED_SPACE(flags >>
1645
MAP_ALIGNMENT_SHIFT);
1646
else
1647
findspace = VMFS_OPTIMAL_SPACE;
1648
max_addr = 0;
1649
if ((flags & MAP_32BIT) != 0)
1650
max_addr = MAP_32BIT_MAX_ADDR;
1651
if (curmap) {
1652
default_addr =
1653
round_page((vm_offset_t)td->td_proc->p_vmspace->
1654
vm_daddr + lim_max(td, RLIMIT_DATA));
1655
if ((flags & MAP_32BIT) != 0)
1656
default_addr = 0;
1657
rv = vm_map_find_min(map, object, foff, addr, size,
1658
default_addr, max_addr, findspace, prot, maxprot,
1659
docow);
1660
} else {
1661
rv = vm_map_find(map, object, foff, addr, size,
1662
max_addr, findspace, prot, maxprot, docow);
1663
}
1664
} else {
1665
rv = vm_map_fixed(map, object, foff, *addr, size,
1666
prot, maxprot, docow);
1667
}
1668
1669
if (rv == KERN_SUCCESS) {
1670
/*
1671
* If the process has requested that all future mappings
1672
* be wired, then heed this.
1673
*/
1674
if ((map->flags & MAP_WIREFUTURE) != 0) {
1675
vm_map_lock(map);
1676
if ((map->flags & MAP_WIREFUTURE) != 0)
1677
(void)vm_map_wire_locked(map, *addr,
1678
*addr + size, VM_MAP_WIRE_USER |
1679
((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
1680
VM_MAP_WIRE_NOHOLES));
1681
vm_map_unlock(map);
1682
}
1683
}
1684
return (vm_mmap_to_errno(rv));
1685
}
1686
1687
/*
1688
* Translate a Mach VM return code to zero on success or the appropriate errno
1689
* on failure.
1690
*/
1691
int
1692
vm_mmap_to_errno(int rv)
1693
{
1694
int error;
1695
1696
switch (rv) {
1697
case KERN_SUCCESS:
1698
return (0);
1699
case KERN_INVALID_ADDRESS:
1700
case KERN_NO_SPACE:
1701
error = ENOMEM;
1702
break;
1703
case KERN_PROTECTION_FAILURE:
1704
error = EACCES;
1705
break;
1706
default:
1707
error = EINVAL;
1708
break;
1709
}
1710
if ((curthread->td_pflags2 & (TDP2_UEXTERR | TDP2_EXTERR)) ==
1711
TDP2_UEXTERR)
1712
EXTERROR(error, "mach error %jd", rv);
1713
return (error);
1714
}
1715
1716