Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/usr.sbin/bhyve/block_if.c
105240 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2013 Peter Grehan <[email protected]>
5
* All rights reserved.
6
* Copyright 2020 Joyent, Inc.
7
*
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
10
* are met:
11
* 1. Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* 2. Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the distribution.
16
*
17
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
* SUCH DAMAGE.
28
*/
29
30
#include <sys/param.h>
31
#ifndef WITHOUT_CAPSICUM
32
#include <sys/capsicum.h>
33
#endif
34
#include <sys/queue.h>
35
#include <sys/errno.h>
36
#include <sys/stat.h>
37
#include <sys/ioctl.h>
38
#include <sys/disk.h>
39
40
#include <assert.h>
41
#ifndef WITHOUT_CAPSICUM
42
#include <capsicum_helpers.h>
43
#endif
44
#include <err.h>
45
#include <fcntl.h>
46
#include <stdio.h>
47
#include <stdlib.h>
48
#include <string.h>
49
#include <pthread.h>
50
#include <pthread_np.h>
51
#include <signal.h>
52
#include <sysexits.h>
53
#include <unistd.h>
54
55
#include <machine/atomic.h>
56
#include <machine/vmm_snapshot.h>
57
58
#include "bhyverun.h"
59
#include "config.h"
60
#include "debug.h"
61
#include "mevent.h"
62
#include "pci_emul.h"
63
#include "block_if.h"
64
65
/* Magic value stored in bc_magic; the public entry points assert on it. */
#define	BLOCKIF_SIG	0xb109b109

/* Number of worker threads started for every block interface context. */
#define	BLOCKIF_NUMTHR	8
/* Size of the per-context request pool: BLOCKIF_RING_MAX plus one per worker. */
#define	BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
69
70
/* Operation requested for a queued element. */
enum blockop {
	BOP_READ = 0,	/* read into the request's iovec */
	BOP_WRITE,	/* write from the request's iovec */
	BOP_FLUSH,	/* flush the backing store */
	BOP_DELETE	/* deallocate (trim) a byte range */
};
76
77
/* Life-cycle state of a request element. */
enum blockstat {
	BST_FREE = 0,	/* on the free queue, not in use */
	BST_BLOCK,	/* on the pending queue, held back behind another request */
	BST_PEND,	/* on the pending queue, eligible to be dequeued */
	BST_BUSY,	/* on the busy queue, being processed by a worker */
	BST_DONE	/* processing finished; awaiting blockif_complete() */
};
84
85
struct blockif_elem {
86
TAILQ_ENTRY(blockif_elem) be_link;
87
struct blockif_req *be_req;
88
enum blockop be_op;
89
enum blockstat be_status;
90
pthread_t be_tid;
91
off_t be_block;
92
};
93
94
struct blockif_ctxt {
95
unsigned int bc_magic;
96
int bc_fd;
97
int bc_ischr;
98
int bc_isgeom;
99
int bc_candelete;
100
int bc_rdonly;
101
off_t bc_size;
102
int bc_sectsz;
103
int bc_psectsz;
104
int bc_psectoff;
105
int bc_closing;
106
int bc_paused;
107
pthread_t bc_btid[BLOCKIF_NUMTHR];
108
pthread_mutex_t bc_mtx;
109
pthread_cond_t bc_cond;
110
pthread_cond_t bc_work_done_cond;
111
blockif_resize_cb *bc_resize_cb;
112
void *bc_resize_cb_arg;
113
struct mevent *bc_resize_event;
114
115
/* Request elements and free/pending/busy queues */
116
TAILQ_HEAD(, blockif_elem) bc_freeq;
117
TAILQ_HEAD(, blockif_elem) bc_pendq;
118
TAILQ_HEAD(, blockif_elem) bc_busyq;
119
struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
120
int bc_bootindex;
121
};
122
123
/* Ensures blockif_init() runs only once; used by blockif_open(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
124
125
/*
 * Record pushed by blockif_cancel() for each SIGCONT it sends.  The SIGCONT
 * handler pops the record, clears bse_pending and signals bse_cond.
 */
struct blockif_sig_elem {
	pthread_mutex_t	bse_mtx;	/* protects bse_pending */
	pthread_cond_t	bse_cond;	/* signalled once the handler has run */
	int		bse_pending;	/* non-zero until the handler clears it */
	struct blockif_sig_elem *bse_next;	/* next record on the list */
};
131
132
/* Head of the list of pending cancel records; updated with atomic_cmpset_ptr(). */
static struct blockif_sig_elem *blockif_bse_head;
133
134
static int
135
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
136
enum blockop op)
137
{
138
struct blockif_elem *be, *tbe;
139
off_t off;
140
int i;
141
142
be = TAILQ_FIRST(&bc->bc_freeq);
143
assert(be != NULL);
144
assert(be->be_status == BST_FREE);
145
TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
146
be->be_req = breq;
147
be->be_op = op;
148
switch (op) {
149
case BOP_READ:
150
case BOP_WRITE:
151
case BOP_DELETE:
152
off = breq->br_offset;
153
for (i = 0; i < breq->br_iovcnt; i++)
154
off += breq->br_iov[i].iov_len;
155
break;
156
default:
157
off = OFF_MAX;
158
}
159
be->be_block = off;
160
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
161
if (tbe->be_block == breq->br_offset)
162
break;
163
}
164
if (tbe == NULL) {
165
TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
166
if (tbe->be_block == breq->br_offset)
167
break;
168
}
169
}
170
if (tbe == NULL)
171
be->be_status = BST_PEND;
172
else
173
be->be_status = BST_BLOCK;
174
TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
175
return (be->be_status == BST_PEND);
176
}
177
178
static int
179
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
180
{
181
struct blockif_elem *be;
182
183
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
184
if (be->be_status == BST_PEND)
185
break;
186
assert(be->be_status == BST_BLOCK);
187
}
188
if (be == NULL)
189
return (0);
190
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
191
be->be_status = BST_BUSY;
192
be->be_tid = t;
193
TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
194
*bep = be;
195
return (1);
196
}
197
198
static void
199
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
200
{
201
struct blockif_elem *tbe;
202
203
if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
204
TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
205
else
206
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
207
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
208
if (tbe->be_req->br_offset == be->be_block)
209
tbe->be_status = BST_PEND;
210
}
211
be->be_tid = 0;
212
be->be_status = BST_FREE;
213
be->be_req = NULL;
214
TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
215
}
216
217
static int
218
blockif_flush_bc(struct blockif_ctxt *bc)
219
{
220
if (bc->bc_ischr) {
221
if (ioctl(bc->bc_fd, DIOCGFLUSH))
222
return (errno);
223
} else if (fsync(bc->bc_fd))
224
return (errno);
225
226
return (0);
227
}
228
229
static void
230
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
231
{
232
struct spacectl_range range;
233
struct blockif_req *br;
234
off_t arg[2];
235
ssize_t n;
236
size_t clen, len, off, boff, voff;
237
int i, err;
238
239
br = be->be_req;
240
assert(br->br_resid >= 0);
241
242
if (br->br_iovcnt <= 1)
243
buf = NULL;
244
err = 0;
245
switch (be->be_op) {
246
case BOP_READ:
247
if (buf == NULL) {
248
if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
249
br->br_offset)) < 0)
250
err = errno;
251
else
252
br->br_resid -= n;
253
break;
254
}
255
i = 0;
256
off = voff = 0;
257
while (br->br_resid > 0) {
258
len = MIN(br->br_resid, MAXPHYS);
259
n = pread(bc->bc_fd, buf, len, br->br_offset + off);
260
if (n < 0) {
261
err = errno;
262
break;
263
}
264
len = (size_t)n;
265
boff = 0;
266
do {
267
clen = MIN(len - boff, br->br_iov[i].iov_len -
268
voff);
269
memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
270
buf + boff, clen);
271
if (clen < br->br_iov[i].iov_len - voff)
272
voff += clen;
273
else {
274
i++;
275
voff = 0;
276
}
277
boff += clen;
278
} while (boff < len);
279
off += len;
280
br->br_resid -= len;
281
}
282
break;
283
case BOP_WRITE:
284
if (bc->bc_rdonly) {
285
err = EROFS;
286
break;
287
}
288
if (buf == NULL) {
289
if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
290
br->br_offset)) < 0)
291
err = errno;
292
else
293
br->br_resid -= n;
294
break;
295
}
296
i = 0;
297
off = voff = 0;
298
while (br->br_resid > 0) {
299
len = MIN(br->br_resid, MAXPHYS);
300
boff = 0;
301
do {
302
clen = MIN(len - boff, br->br_iov[i].iov_len -
303
voff);
304
memcpy(buf + boff,
305
(uint8_t *)br->br_iov[i].iov_base + voff,
306
clen);
307
if (clen < br->br_iov[i].iov_len - voff)
308
voff += clen;
309
else {
310
i++;
311
voff = 0;
312
}
313
boff += clen;
314
} while (boff < len);
315
316
n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
317
if (n < 0) {
318
err = errno;
319
break;
320
}
321
off += n;
322
br->br_resid -= n;
323
}
324
break;
325
case BOP_FLUSH:
326
err = blockif_flush_bc(bc);
327
break;
328
case BOP_DELETE:
329
if (!bc->bc_candelete)
330
err = EOPNOTSUPP;
331
else if (bc->bc_rdonly)
332
err = EROFS;
333
else if (bc->bc_ischr) {
334
arg[0] = br->br_offset;
335
arg[1] = br->br_resid;
336
if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
337
err = errno;
338
else
339
br->br_resid = 0;
340
} else {
341
range.r_offset = br->br_offset;
342
range.r_len = br->br_resid;
343
344
while (range.r_len > 0) {
345
if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
346
&range, 0, &range) != 0) {
347
err = errno;
348
break;
349
}
350
}
351
if (err == 0)
352
br->br_resid = 0;
353
}
354
break;
355
default:
356
err = EINVAL;
357
break;
358
}
359
360
be->be_status = BST_DONE;
361
362
(*br->br_callback)(br, err);
363
}
364
365
static inline bool
366
blockif_empty(const struct blockif_ctxt *bc)
367
{
368
return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
369
}
370
371
static void *
372
blockif_thr(void *arg)
373
{
374
struct blockif_ctxt *bc;
375
struct blockif_elem *be;
376
pthread_t t;
377
uint8_t *buf;
378
379
bc = arg;
380
if (bc->bc_isgeom)
381
buf = malloc(MAXPHYS);
382
else
383
buf = NULL;
384
t = pthread_self();
385
386
pthread_mutex_lock(&bc->bc_mtx);
387
for (;;) {
388
while (blockif_dequeue(bc, t, &be)) {
389
pthread_mutex_unlock(&bc->bc_mtx);
390
blockif_proc(bc, be, buf);
391
pthread_mutex_lock(&bc->bc_mtx);
392
blockif_complete(bc, be);
393
}
394
395
/* If none to work, notify the main thread */
396
if (blockif_empty(bc))
397
pthread_cond_broadcast(&bc->bc_work_done_cond);
398
399
/* Check ctxt status here to see if exit requested */
400
if (bc->bc_closing)
401
break;
402
403
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
404
}
405
pthread_mutex_unlock(&bc->bc_mtx);
406
407
if (buf)
408
free(buf);
409
pthread_exit(NULL);
410
return (NULL);
411
}
412
413
static void
414
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
415
void *arg __unused)
416
{
417
struct blockif_sig_elem *bse;
418
419
for (;;) {
420
/*
421
* Process the entire list even if not intended for
422
* this thread.
423
*/
424
do {
425
bse = blockif_bse_head;
426
if (bse == NULL)
427
return;
428
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
429
(uintptr_t)bse,
430
(uintptr_t)bse->bse_next));
431
432
pthread_mutex_lock(&bse->bse_mtx);
433
bse->bse_pending = 0;
434
pthread_cond_signal(&bse->bse_cond);
435
pthread_mutex_unlock(&bse->bse_mtx);
436
}
437
}
438
439
static void
440
blockif_init(void)
441
{
442
mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
443
(void) signal(SIGCONT, SIG_IGN);
444
}
445
446
int
447
blockif_legacy_config(nvlist_t *nvl, const char *opts)
448
{
449
char *cp, *path;
450
451
if (opts == NULL)
452
return (0);
453
454
cp = strchr(opts, ',');
455
if (cp == NULL) {
456
set_config_value_node(nvl, "path", opts);
457
return (0);
458
}
459
path = strndup(opts, cp - opts);
460
set_config_value_node(nvl, "path", path);
461
free(path);
462
return (pci_parse_legacy_config(nvl, cp + 1));
463
}
464
465
int
466
blockif_add_boot_device(struct pci_devinst *const pi,
467
struct blockif_ctxt *const bc)
468
{
469
if (bc->bc_bootindex < 0)
470
return (0);
471
472
return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
473
}
474
475
struct blockif_ctxt *
476
blockif_open(nvlist_t *nvl, const char *ident)
477
{
478
char tname[MAXCOMLEN + 1];
479
char name[MAXPATHLEN];
480
const char *path, *pssval, *ssval, *bootindex_val;
481
char *cp;
482
struct blockif_ctxt *bc;
483
struct stat sbuf;
484
struct diocgattr_arg arg;
485
off_t size, psectsz, psectoff;
486
int extra, fd, i, sectsz;
487
int ro, candelete, geom, ssopt, pssopt;
488
int nodelete;
489
int bootindex;
490
491
#ifndef WITHOUT_CAPSICUM
492
cap_rights_t rights;
493
cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
494
#endif
495
496
pthread_once(&blockif_once, blockif_init);
497
498
fd = -1;
499
extra = 0;
500
ssopt = 0;
501
ro = 0;
502
nodelete = 0;
503
bootindex = -1;
504
505
if (get_config_bool_node_default(nvl, "nocache", false))
506
extra |= O_DIRECT;
507
if (get_config_bool_node_default(nvl, "nodelete", false))
508
nodelete = 1;
509
if (get_config_bool_node_default(nvl, "sync", false) ||
510
get_config_bool_node_default(nvl, "direct", false))
511
extra |= O_SYNC;
512
if (get_config_bool_node_default(nvl, "ro", false))
513
ro = 1;
514
ssval = get_config_value_node(nvl, "sectorsize");
515
if (ssval != NULL) {
516
ssopt = strtol(ssval, &cp, 10);
517
if (cp == ssval) {
518
EPRINTLN("Invalid sector size \"%s\"", ssval);
519
goto err;
520
}
521
if (*cp == '\0') {
522
pssopt = ssopt;
523
} else if (*cp == '/') {
524
pssval = cp + 1;
525
pssopt = strtol(pssval, &cp, 10);
526
if (cp == pssval || *cp != '\0') {
527
EPRINTLN("Invalid sector size \"%s\"", ssval);
528
goto err;
529
}
530
} else {
531
EPRINTLN("Invalid sector size \"%s\"", ssval);
532
goto err;
533
}
534
}
535
536
bootindex_val = get_config_value_node(nvl, "bootindex");
537
if (bootindex_val != NULL) {
538
bootindex = atoi(bootindex_val);
539
}
540
541
path = get_config_value_node(nvl, "path");
542
if (path == NULL) {
543
EPRINTLN("Missing \"path\" for block device.");
544
goto err;
545
}
546
547
fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
548
if (fd < 0 && !ro) {
549
/* Attempt a r/w fail with a r/o open */
550
fd = open(path, O_RDONLY | extra);
551
ro = 1;
552
}
553
554
if (fd < 0) {
555
warn("Could not open backing file: %s", path);
556
goto err;
557
}
558
559
if (fstat(fd, &sbuf) < 0) {
560
warn("Could not stat backing file %s", path);
561
goto err;
562
}
563
564
#ifndef WITHOUT_CAPSICUM
565
cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
566
CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
567
if (ro)
568
cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
569
570
if (caph_rights_limit(fd, &rights) == -1)
571
errx(EX_OSERR, "Unable to apply rights for sandbox");
572
#endif
573
574
/*
575
* Deal with raw devices
576
*/
577
size = sbuf.st_size;
578
sectsz = DEV_BSIZE;
579
psectsz = psectoff = 0;
580
candelete = geom = 0;
581
if (S_ISCHR(sbuf.st_mode)) {
582
if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
583
ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
584
perror("Could not fetch dev blk/sector size");
585
goto err;
586
}
587
assert(size != 0);
588
assert(sectsz != 0);
589
if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
590
ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
591
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
592
arg.len = sizeof(arg.value.i);
593
if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
594
candelete = arg.value.i;
595
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
596
geom = 1;
597
} else {
598
psectsz = sbuf.st_blksize;
599
/* Avoid fallback implementation */
600
candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
601
}
602
603
#ifndef WITHOUT_CAPSICUM
604
if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
605
errx(EX_OSERR, "Unable to apply rights for sandbox");
606
#endif
607
608
if (ssopt != 0) {
609
if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
610
ssopt > pssopt) {
611
EPRINTLN("Invalid sector size %d/%d",
612
ssopt, pssopt);
613
goto err;
614
}
615
616
/*
617
* Some backend drivers (e.g. cd0, ada0) require that the I/O
618
* size be a multiple of the device's sector size.
619
*
620
* Validate that the emulated sector size complies with this
621
* requirement.
622
*/
623
if (S_ISCHR(sbuf.st_mode)) {
624
if (ssopt < sectsz || (ssopt % sectsz) != 0) {
625
EPRINTLN("Sector size %d incompatible "
626
"with underlying device sector size %d",
627
ssopt, sectsz);
628
goto err;
629
}
630
}
631
632
sectsz = ssopt;
633
psectsz = pssopt;
634
psectoff = 0;
635
}
636
637
bc = calloc(1, sizeof(struct blockif_ctxt));
638
if (bc == NULL) {
639
perror("calloc");
640
goto err;
641
}
642
643
bc->bc_magic = BLOCKIF_SIG;
644
bc->bc_fd = fd;
645
bc->bc_ischr = S_ISCHR(sbuf.st_mode);
646
bc->bc_isgeom = geom;
647
bc->bc_candelete = candelete;
648
bc->bc_rdonly = ro;
649
bc->bc_size = size;
650
bc->bc_sectsz = sectsz;
651
bc->bc_psectsz = psectsz;
652
bc->bc_psectoff = psectoff;
653
pthread_mutex_init(&bc->bc_mtx, NULL);
654
pthread_cond_init(&bc->bc_cond, NULL);
655
bc->bc_paused = 0;
656
pthread_cond_init(&bc->bc_work_done_cond, NULL);
657
TAILQ_INIT(&bc->bc_freeq);
658
TAILQ_INIT(&bc->bc_pendq);
659
TAILQ_INIT(&bc->bc_busyq);
660
bc->bc_bootindex = bootindex;
661
for (i = 0; i < BLOCKIF_MAXREQ; i++) {
662
bc->bc_reqs[i].be_status = BST_FREE;
663
TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
664
}
665
666
for (i = 0; i < BLOCKIF_NUMTHR; i++) {
667
pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
668
snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
669
pthread_set_name_np(bc->bc_btid[i], tname);
670
}
671
672
return (bc);
673
err:
674
if (fd >= 0)
675
close(fd);
676
return (NULL);
677
}
678
679
static void
680
blockif_resized(int fd, enum ev_type type __unused, void *arg)
681
{
682
struct blockif_ctxt *bc;
683
struct stat sb;
684
off_t mediasize;
685
686
if (fstat(fd, &sb) != 0)
687
return;
688
689
if (S_ISCHR(sb.st_mode)) {
690
if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
691
EPRINTLN("blockif_resized: get mediasize failed: %s",
692
strerror(errno));
693
return;
694
}
695
} else
696
mediasize = sb.st_size;
697
698
bc = arg;
699
pthread_mutex_lock(&bc->bc_mtx);
700
if (mediasize != bc->bc_size) {
701
bc->bc_size = mediasize;
702
bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
703
}
704
pthread_mutex_unlock(&bc->bc_mtx);
705
}
706
707
int
708
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
709
void *cb_arg)
710
{
711
struct stat sb;
712
int err;
713
714
if (cb == NULL)
715
return (EINVAL);
716
717
err = 0;
718
719
pthread_mutex_lock(&bc->bc_mtx);
720
if (bc->bc_resize_cb != NULL) {
721
err = EBUSY;
722
goto out;
723
}
724
725
assert(bc->bc_closing == 0);
726
727
if (fstat(bc->bc_fd, &sb) != 0) {
728
err = errno;
729
goto out;
730
}
731
732
bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
733
EVFF_ATTRIB, blockif_resized, bc);
734
if (bc->bc_resize_event == NULL) {
735
err = ENXIO;
736
goto out;
737
}
738
739
bc->bc_resize_cb = cb;
740
bc->bc_resize_cb_arg = cb_arg;
741
out:
742
pthread_mutex_unlock(&bc->bc_mtx);
743
744
return (err);
745
}
746
747
static int
748
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
749
enum blockop op)
750
{
751
int err;
752
753
err = 0;
754
755
pthread_mutex_lock(&bc->bc_mtx);
756
assert(!bc->bc_paused);
757
if (!TAILQ_EMPTY(&bc->bc_freeq)) {
758
/*
759
* Enqueue and inform the block i/o thread
760
* that there is work available
761
*/
762
if (blockif_enqueue(bc, breq, op))
763
pthread_cond_signal(&bc->bc_cond);
764
} else {
765
/*
766
* Callers are not allowed to enqueue more than
767
* the specified blockif queue limit. Return an
768
* error to indicate that the queue length has been
769
* exceeded.
770
*/
771
err = E2BIG;
772
}
773
pthread_mutex_unlock(&bc->bc_mtx);
774
775
return (err);
776
}
777
778
int
779
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
780
{
781
assert(bc->bc_magic == BLOCKIF_SIG);
782
return (blockif_request(bc, breq, BOP_READ));
783
}
784
785
int
786
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
787
{
788
assert(bc->bc_magic == BLOCKIF_SIG);
789
return (blockif_request(bc, breq, BOP_WRITE));
790
}
791
792
int
793
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
794
{
795
assert(bc->bc_magic == BLOCKIF_SIG);
796
return (blockif_request(bc, breq, BOP_FLUSH));
797
}
798
799
int
800
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
801
{
802
assert(bc->bc_magic == BLOCKIF_SIG);
803
return (blockif_request(bc, breq, BOP_DELETE));
804
}
805
806
int
807
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
808
{
809
struct blockif_elem *be;
810
811
assert(bc->bc_magic == BLOCKIF_SIG);
812
813
pthread_mutex_lock(&bc->bc_mtx);
814
/* XXX: not waiting while paused */
815
816
/*
817
* Check pending requests.
818
*/
819
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
820
if (be->be_req == breq)
821
break;
822
}
823
if (be != NULL) {
824
/*
825
* Found it.
826
*/
827
blockif_complete(bc, be);
828
pthread_mutex_unlock(&bc->bc_mtx);
829
830
return (0);
831
}
832
833
/*
834
* Check in-flight requests.
835
*/
836
TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
837
if (be->be_req == breq)
838
break;
839
}
840
if (be == NULL) {
841
/*
842
* Didn't find it.
843
*/
844
pthread_mutex_unlock(&bc->bc_mtx);
845
return (EINVAL);
846
}
847
848
/*
849
* Interrupt the processing thread to force it return
850
* prematurely via it's normal callback path.
851
*/
852
while (be->be_status == BST_BUSY) {
853
struct blockif_sig_elem bse, *old_head;
854
855
pthread_mutex_init(&bse.bse_mtx, NULL);
856
pthread_cond_init(&bse.bse_cond, NULL);
857
858
bse.bse_pending = 1;
859
860
do {
861
old_head = blockif_bse_head;
862
bse.bse_next = old_head;
863
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
864
(uintptr_t)old_head,
865
(uintptr_t)&bse));
866
867
pthread_kill(be->be_tid, SIGCONT);
868
869
pthread_mutex_lock(&bse.bse_mtx);
870
while (bse.bse_pending)
871
pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
872
pthread_mutex_unlock(&bse.bse_mtx);
873
}
874
875
pthread_mutex_unlock(&bc->bc_mtx);
876
877
/*
878
* The processing thread has been interrupted. Since it's not
879
* clear if the callback has been invoked yet, return EBUSY.
880
*/
881
return (EBUSY);
882
}
883
884
int
885
blockif_close(struct blockif_ctxt *bc)
886
{
887
void *jval;
888
int i;
889
890
assert(bc->bc_magic == BLOCKIF_SIG);
891
892
/*
893
* Stop the block i/o thread
894
*/
895
pthread_mutex_lock(&bc->bc_mtx);
896
bc->bc_closing = 1;
897
if (bc->bc_resize_event != NULL)
898
mevent_disable(bc->bc_resize_event);
899
pthread_mutex_unlock(&bc->bc_mtx);
900
pthread_cond_broadcast(&bc->bc_cond);
901
for (i = 0; i < BLOCKIF_NUMTHR; i++)
902
pthread_join(bc->bc_btid[i], &jval);
903
904
/* XXX Cancel queued i/o's ??? */
905
906
/*
907
* Release resources
908
*/
909
bc->bc_magic = 0;
910
close(bc->bc_fd);
911
free(bc);
912
913
return (0);
914
}
915
916
/*
917
* Return virtual C/H/S values for a given block. Use the algorithm
918
* outlined in the VHD specification to calculate values.
919
*/
920
void
921
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
922
{
923
off_t sectors; /* total sectors of the block dev */
924
off_t hcyl; /* cylinders times heads */
925
uint16_t secpt; /* sectors per track */
926
uint8_t heads;
927
928
assert(bc->bc_magic == BLOCKIF_SIG);
929
930
sectors = bc->bc_size / bc->bc_sectsz;
931
932
/* Clamp the size to the largest possible with CHS */
933
if (sectors > 65535L * 16 * 255)
934
sectors = 65535L * 16 * 255;
935
936
if (sectors >= 65536L * 16 * 63) {
937
secpt = 255;
938
heads = 16;
939
hcyl = sectors / secpt;
940
} else {
941
secpt = 17;
942
hcyl = sectors / secpt;
943
heads = (hcyl + 1023) / 1024;
944
945
if (heads < 4)
946
heads = 4;
947
948
if (hcyl >= (heads * 1024) || heads > 16) {
949
secpt = 31;
950
heads = 16;
951
hcyl = sectors / secpt;
952
}
953
if (hcyl >= (heads * 1024)) {
954
secpt = 63;
955
heads = 16;
956
hcyl = sectors / secpt;
957
}
958
}
959
960
*c = hcyl / heads;
961
*h = heads;
962
*s = secpt;
963
}
964
965
/*
966
* Accessors
967
*/
968
off_t
969
blockif_size(struct blockif_ctxt *bc)
970
{
971
assert(bc->bc_magic == BLOCKIF_SIG);
972
return (bc->bc_size);
973
}
974
975
int
976
blockif_sectsz(struct blockif_ctxt *bc)
977
{
978
assert(bc->bc_magic == BLOCKIF_SIG);
979
return (bc->bc_sectsz);
980
}
981
982
void
983
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
984
{
985
assert(bc->bc_magic == BLOCKIF_SIG);
986
*size = bc->bc_psectsz;
987
*off = bc->bc_psectoff;
988
}
989
990
int
991
blockif_queuesz(struct blockif_ctxt *bc)
992
{
993
assert(bc->bc_magic == BLOCKIF_SIG);
994
return (BLOCKIF_MAXREQ - 1);
995
}
996
997
int
998
blockif_is_ro(struct blockif_ctxt *bc)
999
{
1000
assert(bc->bc_magic == BLOCKIF_SIG);
1001
return (bc->bc_rdonly);
1002
}
1003
1004
int
1005
blockif_candelete(struct blockif_ctxt *bc)
1006
{
1007
assert(bc->bc_magic == BLOCKIF_SIG);
1008
return (bc->bc_candelete);
1009
}
1010
1011
#ifdef BHYVE_SNAPSHOT
1012
void
1013
blockif_pause(struct blockif_ctxt *bc)
1014
{
1015
assert(bc != NULL);
1016
assert(bc->bc_magic == BLOCKIF_SIG);
1017
1018
pthread_mutex_lock(&bc->bc_mtx);
1019
bc->bc_paused = 1;
1020
1021
/* The interface is paused. Wait for workers to finish their work */
1022
while (!blockif_empty(bc))
1023
pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
1024
pthread_mutex_unlock(&bc->bc_mtx);
1025
1026
if (!bc->bc_rdonly && blockif_flush_bc(bc))
1027
EPRINTLN("%s: [WARN] failed to flush backing file.",
1028
__func__);
1029
}
1030
1031
void
1032
blockif_resume(struct blockif_ctxt *bc)
1033
{
1034
assert(bc != NULL);
1035
assert(bc->bc_magic == BLOCKIF_SIG);
1036
1037
pthread_mutex_lock(&bc->bc_mtx);
1038
bc->bc_paused = 0;
1039
pthread_mutex_unlock(&bc->bc_mtx);
1040
}
1041
#endif /* BHYVE_SNAPSHOT */
1042
1043