GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/ufs/ffs/ffs_alloc.c
/*-
 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
 *
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_quota.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/gsb_crc32.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>

#include <security/audit/audit.h>

#include <geom/geom.h>
#include <geom/geom_vfs.h>

#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ffs/softdep.h>

typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg,
    ufs2_daddr_t bpref, int size, int rsize);

static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int,
    int);
static ufs2_daddr_t
    ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
    struct vnode *, ufs2_daddr_t, long, ino_t,
    struct workhead *);
#ifdef INVARIANTS
static int ffs_checkfreeblk(struct inode *, ufs2_daddr_t, long);
#endif
static void ffs_checkcgintegrity(struct fs *, uint64_t, int);
static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t,
    int);
static ino_t ffs_dirpref(struct inode *);
static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t,
    int, int);
static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t,
    int, int, allocfcn_t *);
static ufs2_daddr_t ffs_nodealloccg(struct inode *, uint64_t, ufs2_daddr_t, int,
    int);
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
static void ffs_ckhash_cg(struct buf *);

/*
 * Allocate a block in the filesystem.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *   1) allocate a block in the cylinder group that contains the
 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 */
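/*
 * Editor's illustrative note (an assumed numeric example, not part of the
 * original source): on a filesystem with fs_fsize = 4096 and fs_bsize =
 * 32768 (eight fragments per block), valid values for the "size" argument
 * of ffs_alloc() below are 4096, 8192, ..., 32768. A request of 12288
 * bytes (three fragments) satisfies both constraints, while 5000 (not a
 * multiple of fs_fsize) or 65536 (> fs_bsize) would trip the INVARIANTS
 * size check.
 */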
int
ffs_alloc(struct inode *ip,
	ufs2_daddr_t lbn,
	ufs2_daddr_t bpref,
	int size,
	int flags,
	struct ucred *cred,
	ufs2_daddr_t *bnp)
{
	struct fs *fs;
	struct ufsmount *ump;
	ufs2_daddr_t bno;
	uint64_t cg, reclaimed;
	int64_t delta;
#ifdef QUOTA
	int error;
#endif

	*bnp = 0;
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
	if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, size,
		    fs->fs_fsmnt);
		panic("ffs_alloc: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_alloc: missing credential");
#endif /* INVARIANTS */
	reclaimed = 0;
retry:
#ifdef QUOTA
	UFS_UNLOCK(ump);
	error = chkdq(ip, btodb(size), cred, 0);
	if (error)
		return (error);
	UFS_LOCK(ump);
#endif
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
		goto nospace;
	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
	if (bno > 0) {
		delta = btodb(size);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		*bnp = bno;
		return (0);
	}
nospace:
#ifdef QUOTA
	UFS_UNLOCK(ump);
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(size), cred, FORCE);
	UFS_LOCK(ump);
#endif
	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
		reclaimed = 1;
		softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
		goto retry;
	}
	if (ffs_fsfail_cleanup_locked(ump, 0)) {
		UFS_UNLOCK(ump);
		return (ENXIO);
	}
	if (reclaimed > 0 &&
	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
		UFS_UNLOCK(ump);
		ffs_fserr(fs, ip->i_number, "filesystem full");
		uprintf("\n%s: write failed, filesystem is full\n",
		    fs->fs_fsmnt);
	} else {
		UFS_UNLOCK(ump);
	}
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size
 *
 * The number and size of the old block are given, and a preference
 * and new size are also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 */
int
ffs_realloccg(struct inode *ip,
	ufs2_daddr_t lbprev,
	ufs2_daddr_t bprev,
	ufs2_daddr_t bpref,
	int osize,
	int nsize,
	int flags,
	struct ucred *cred,
	struct buf **bpp)
{
	struct vnode *vp;
	struct fs *fs;
	struct buf *bp;
	struct ufsmount *ump;
	uint64_t cg, request, reclaimed;
	int error, gbflags;
	ufs2_daddr_t bno;
	int64_t delta;

	vp = ITOV(ip);
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	bp = NULL;
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
#ifdef WITNESS
	gbflags |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
#endif

	mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
		panic("ffs_realloccg: allocation on suspended filesystem");
	if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
	    (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
		printf(
		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
		    nsize, fs->fs_fsmnt);
		panic("ffs_realloccg: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_realloccg: missing credential");
#endif /* INVARIANTS */
	reclaimed = 0;
retry:
	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
	    freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
		goto nospace;
	}
	if (bprev == 0) {
		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
		    fs->fs_fsmnt);
		panic("ffs_realloccg: bad bprev");
	}
	UFS_UNLOCK(ump);
	/*
	 * Allocate the extra space in the buffer.
	 */
	error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
	if (error) {
		return (error);
	}

	if (bp->b_blkno == bp->b_lblkno) {
		if (lbprev >= UFS_NDADDR)
			panic("ffs_realloccg: lbprev out of range");
		bp->b_blkno = fsbtodb(fs, bprev);
	}

#ifdef QUOTA
	error = chkdq(ip, btodb(nsize - osize), cred, 0);
	if (error) {
		brelse(bp);
		return (error);
	}
#endif
	/*
	 * Check for extension in the existing location.
	 */
	*bpp = NULL;
	cg = dtog(fs, bprev);
	UFS_LOCK(ump);
	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
	if (bno) {
		if (bp->b_blkno != fsbtodb(fs, bno))
			panic("ffs_realloccg: bad blockno");
		delta = btodb(nsize - osize);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		vfs_bio_bzero_buf(bp, osize, nsize - osize);
		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
			vfs_bio_set_valid(bp, osize, nsize - osize);
		*bpp = bp;
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
		request = nsize;
		if (fs->fs_minfree <= 5 ||
		    fs->fs_cstotal.cs_nffree >
		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;
		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
		    fs->fs_fsmnt);
		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;
		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
		    fs->fs_fsmnt);
		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		printf("dev = %s, optim = %ld, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
		panic("ffs_realloccg: bad optim");
		/* NOTREACHED */
	}
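	/*
	 * Editor's illustrative note (an assumed numeric example, not part
	 * of the original source): with fs_minfree = 8 (percent) and
	 * fs_dsize = 1,000,000 fragments, the switch above moves from
	 * SPACE to TIME optimization once cs_nffree falls to 40,000
	 * fragments or fewer (dsize * 8 / 200, half the reserve), and
	 * moves back from TIME to SPACE once cs_nffree reaches 60,000
	 * fragments (dsize * (8 - 2) / 100, within 2% of the reserve).
	 */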
	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
	if (bno > 0) {
		bp->b_blkno = fsbtodb(fs, bno);
		if (!DOINGSOFTDEP(vp))
			/*
			 * The usual case is that a smaller fragment that
			 * was just allocated has been replaced with a bigger
			 * fragment or a full-size block. If it is marked as
			 * B_DELWRI, the current contents have not been written
			 * to disk. It is possible that the block was written
			 * earlier, but very uncommon. If the block has never
			 * been written, there is no need to send a BIO_DELETE
			 * for it when it is freed. The gain from avoiding the
			 * TRIMs for the common case of unwritten blocks far
			 * exceeds the cost of the write amplification for the
			 * uncommon case of failing to send a TRIM for a block
			 * that had been written.
			 */
			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
			    ip->i_number, vp->v_type, NULL,
			    (bp->b_flags & B_DELWRI) != 0 ?
			    NOTRIM_KEY : SINGLETON_KEY);
		delta = btodb(nsize - osize);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		vfs_bio_bzero_buf(bp, osize, nsize - osize);
		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
			vfs_bio_set_valid(bp, osize, nsize - osize);
		*bpp = bp;
		return (0);
	}
#ifdef QUOTA
	UFS_UNLOCK(ump);
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
	UFS_LOCK(ump);
#endif
nospace:
	/*
	 * no space available
	 */
	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
		reclaimed = 1;
		UFS_UNLOCK(ump);
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		UFS_LOCK(ump);
		softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
		goto retry;
	}
	if (bp)
		brelse(bp);
	if (ffs_fsfail_cleanup_locked(ump, 0)) {
		UFS_UNLOCK(ump);
		return (ENXIO);
	}
	if (reclaimed > 0 &&
	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
		UFS_UNLOCK(ump);
		ffs_fserr(fs, ip->i_number, "filesystem full");
		uprintf("\n%s: write failed, filesystem is full\n",
		    fs->fs_fsmnt);
	} else {
		UFS_UNLOCK(ump);
	}
	return (ENOSPC);
}

/*
 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
 *
 * The vnode and an array of buffer pointers for a range of sequential
 * logical blocks to be made contiguous are given. The allocator attempts
 * to find a range of sequential blocks starting as close as possible
 * to the end of the allocation for the logical block immediately
 * preceding the current range. If successful, the physical block numbers
 * in the buffer pointers and in the inode are changed to reflect the new
 * allocation. If unsuccessful, the allocation is left unchanged. The
 * success in doing the reallocation is returned. Note that the error
 * return is not reflected back to the user. Rather the previous block
 * allocation will be used.
 */

SYSCTL_DECL(_vfs_ffs);

static int doasyncfree = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
"do not force synchronous writes when blocks are reallocated");

static int doreallocblks = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
"enable block reallocation");

static int dotrimcons = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0,
"enable BIO_DELETE / TRIM consolidation");

static int maxclustersearch = 10;
SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
0, "max number of cylinder groups to search for contiguous blocks");

#ifdef DIAGNOSTIC
static int prtrealloc = 0;
SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0,
"print out FFS filesystem block reallocation operations");
#endif

int
ffs_reallocblks(
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap)
{
	struct ufsmount *ump;
	int error;

	/*
	 * We used to skip reallocating the blocks of a file into a
	 * contiguous sequence if the underlying flash device requested
	 * BIO_DELETE notifications, because devices that benefit from
	 * BIO_DELETE also benefit from not moving the data. However,
	 * the destination for the data is usually moved before the data
	 * is written to the initially allocated location, so we rarely
	 * suffer the penalty of extra writes. With the addition of the
	 * consolidation of contiguous blocks into single BIO_DELETE
	 * operations, having fewer but larger contiguous blocks reduces
	 * the number of (slow and expensive) BIO_DELETE operations. So
	 * when doing BIO_DELETE consolidation, we do block reallocation.
	 *
	 * Skip if reallocblks has been disabled globally.
	 */
	ump = ap->a_vp->v_mount->mnt_data;
	if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) ||
	    doreallocblks == 0)
		return (ENOSPC);

	/*
	 * We can't wait in softdep prealloc as it may fsync and recurse
	 * here. Instead we simply fail to reallocate blocks if this
	 * rare condition arises.
	 */
	if (DOINGSUJ(ap->a_vp))
		if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
			return (ENOSPC);
	vn_seqc_write_begin(ap->a_vp);
	error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) :
	    ffs_reallocblks_ufs2(ap);
	vn_seqc_write_end(ap->a_vp);
	return (error);
}

static int
ffs_reallocblks_ufs1(
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	struct buf *sbp, *ebp, *bp;
	ufs1_daddr_t *bap, *sbap, *ebap;
	struct cluster_save *buflist;
	struct ufsmount *ump;
	ufs_lbn_t start_lbn, end_lbn;
	ufs1_daddr_t soff, newblk, blkno;
	ufs2_daddr_t pref;
	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
	int i, cg, len, start_lvl, end_lvl, ssize;

	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	/*
	 * If we are not tracking block clusters or if we have less than 4%
	 * free blocks left, then do not attempt to cluster. Running with
	 * less than 5% free block reserve is not recommended and those that
	 * choose to do so do not expect to have good file layout.
	 */
	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
		return (ENOSPC);
	buflist = ap->a_buflist;
	len = buflist->bs_nchildren;
	start_lbn = buflist->bs_children[0]->b_lblkno;
	end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
	for (i = 0; i < len; i++)
		if (!ffs_checkfreeblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 1");
	for (i = 1; i < len; i++)
		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
			panic("ffs_reallocblks: non-logical cluster");
	blkno = buflist->bs_children[0]->b_blkno;
	ssize = fsbtodb(fs, fs->fs_frag);
	for (i = 1; i < len - 1; i++)
		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
			panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
	/*
	 * If the cluster crosses the boundary for the first indirect
	 * block, leave space for the indirect block. Indirect blocks
	 * are initially laid out in a position after the last direct
	 * block. Block reallocation would usually destroy locality by
	 * moving the indirect block out of the way to make room for
	 * data blocks if we didn't compensate here. We should also do
	 * this for other indirect block boundaries, but it is only
	 * important for the first one.
	 */
	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
		return (ENOSPC);
	/*
	 * If the latest allocation is in a new cylinder group, assume that
	 * the filesystem has decided to move and do not force it back to
	 * the previous cylinder group.
	 */
	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
		return (ENOSPC);
	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
		return (ENOSPC);
	/*
	 * Get the starting offset and block map for the first block.
	 */
	if (start_lvl == 0) {
		sbap = &ip->i_din1->di_db[0];
		soff = start_lbn;
	} else {
		idp = &start_ap[start_lvl - 1];
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
			brelse(sbp);
			return (ENOSPC);
		}
		sbap = (ufs1_daddr_t *)sbp->b_data;
		soff = idp->in_off;
	}
	/*
	 * If the block range spans two block maps, get the second map.
	 */
	ebap = NULL;
	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
		ssize = len;
	} else {
#ifdef INVARIANTS
		if (start_lvl > 0 &&
		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
			panic("ffs_reallocblk: start == end");
#endif
		ssize = len - (idp->in_off + 1);
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
			goto fail;
		ebap = (ufs1_daddr_t *)ebp->b_data;
	}
	/*
	 * Find the preferred location for the cluster. If we have not
	 * previously failed at this endeavor, then follow our standard
	 * preference calculation. If we have failed at it, then pick up
	 * where we last ended our search.
	 */
	UFS_LOCK(ump);
	if (ip->i_nextclustercg == -1)
		pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
	else
		pref = cgdata(fs, ip->i_nextclustercg);
	/*
	 * Search the block map looking for an allocation of the desired size.
	 * To avoid wasting too much time, we limit the number of cylinder
	 * groups that we will search.
	 */
	cg = dtog(fs, pref);
	MPASS(cg < fs->fs_ncg);
	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
			break;
		cg += 1;
		if (cg >= fs->fs_ncg)
			cg = 0;
	}
	/*
	 * If we have failed in our search, record where we gave up for
	 * next time. Otherwise, fall back to our usual search criterion.
	 */
	if (newblk == 0) {
		ip->i_nextclustercg = cg;
		UFS_UNLOCK(ump);
		goto fail;
	}
	ip->i_nextclustercg = -1;
	/*
	 * We have found a new contiguous block.
	 *
	 * First we have to replace the old block pointers with the new
	 * block pointers in the inode and indirect blocks associated
	 * with the file.
	 */
#ifdef DIAGNOSTIC
	if (prtrealloc)
		printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
		    (uintmax_t)ip->i_number,
		    (intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
	blkno = newblk;
	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (i == ssize) {
			bap = ebap;
			soff = -i;
		}
#ifdef INVARIANTS
		if (!ffs_checkfreeblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 2");
		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
			panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DIAGNOSTIC
		if (prtrealloc)
			printf(" %d,", *bap);
#endif
		if (DOINGSOFTDEP(vp)) {
			if (sbap == &ip->i_din1->di_db[0] && i < ssize)
				softdep_setup_allocdirect(ip, start_lbn + i,
				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
				    buflist->bs_children[i]);
			else
				softdep_setup_allocindir_page(ip, start_lbn + i,
				    i < ssize ? sbp : ebp, soff + i, blkno,
				    *bap, buflist->bs_children[i]);
		}
		*bap++ = blkno;
	}
	/*
	 * Next we must write out the modified inode and indirect blocks.
	 * For strict correctness, the writes should be synchronous since
	 * the old block values may have been written to disk. In practice
	 * they are almost never written, but if we are concerned about
	 * strict correctness, the `doasyncfree' flag should be set to zero.
	 *
	 * The test on `doasyncfree' should be changed to test a flag
	 * that shows whether the associated buffers and inodes have
	 * been written. The flag should be set when the cluster is
	 * started and cleared whenever the buffer or inode is flushed.
	 * We can then check below to see if it is set, and do the
	 * synchronous write only when it has been cleared.
	 */
	if (sbap != &ip->i_din1->di_db[0]) {
		if (doasyncfree)
			bdwrite(sbp);
		else
			bwrite(sbp);
	} else {
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		if (!doasyncfree)
			ffs_update(vp, 1);
	}
	if (ssize < len) {
		if (doasyncfree)
			bdwrite(ebp);
		else
			bwrite(ebp);
	}
	/*
	 * Last, free the old blocks and assign the new blocks to the buffers.
	 */
#ifdef DIAGNOSTIC
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		bp = buflist->bs_children[i];
		if (!DOINGSOFTDEP(vp))
			/*
			 * The usual case is that a set of N-contiguous blocks
			 * that was just allocated has been replaced with a
			 * set of N+1-contiguous blocks. If they are marked as
			 * B_DELWRI, the current contents have not been written
			 * to disk. It is possible that the blocks were written
			 * earlier, but very uncommon. If the blocks have never
			 * been written, there is no need to send a BIO_DELETE
			 * for them when they are freed. The gain from avoiding
			 * the TRIMs for the common case of unwritten blocks
			 * far exceeds the cost of the write amplification for
			 * the uncommon case of failing to send a TRIM for the
			 * blocks that had been written.
			 */
			ffs_blkfree(ump, fs, ump->um_devvp,
			    dbtofsb(fs, bp->b_blkno),
			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
			    (bp->b_flags & B_DELWRI) != 0 ?
			    NOTRIM_KEY : SINGLETON_KEY);
		bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
		if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno),
		   fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DIAGNOSTIC
		if (prtrealloc)
			printf(" %d,", blkno);
#endif
	}
#ifdef DIAGNOSTIC
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din1->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}

static int
ffs_reallocblks_ufs2(
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	struct buf *sbp, *ebp, *bp;
	ufs2_daddr_t *bap, *sbap, *ebap;
	struct cluster_save *buflist;
	struct ufsmount *ump;
	ufs_lbn_t start_lbn, end_lbn;
	ufs2_daddr_t soff, newblk, blkno, pref;
	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
	int i, cg, len, start_lvl, end_lvl, ssize;

	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	/*
	 * If we are not tracking block clusters or if we have less than 4%
	 * free blocks left, then do not attempt to cluster. Running with
	 * less than 5% free block reserve is not recommended and those that
	 * choose to do so do not expect to have good file layout.
	 */
	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
		return (ENOSPC);
	buflist = ap->a_buflist;
	len = buflist->bs_nchildren;
	start_lbn = buflist->bs_children[0]->b_lblkno;
	end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
	for (i = 0; i < len; i++)
		if (!ffs_checkfreeblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 1");
	for (i = 1; i < len; i++)
		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
			panic("ffs_reallocblks: non-logical cluster");
	blkno = buflist->bs_children[0]->b_blkno;
	ssize = fsbtodb(fs, fs->fs_frag);
	for (i = 1; i < len - 1; i++)
		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
			panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
	/*
	 * If the cluster crosses the boundary for the first indirect
	 * block, do not move anything in it. Indirect blocks are
	 * usually initially laid out in a position between the data
	 * blocks. Block reallocation would usually destroy locality by
	 * moving the indirect block out of the way to make room for
	 * data blocks if we didn't compensate here. We should also do
	 * this for other indirect block boundaries, but it is only
	 * important for the first one.
	 */
	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
		return (ENOSPC);
	/*
	 * If the latest allocation is in a new cylinder group, assume that
	 * the filesystem has decided to move and do not force it back to
	 * the previous cylinder group.
	 */
	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
		return (ENOSPC);
	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
		return (ENOSPC);
	/*
	 * Get the starting offset and block map for the first block.
	 */
	if (start_lvl == 0) {
		sbap = &ip->i_din2->di_db[0];
		soff = start_lbn;
	} else {
		idp = &start_ap[start_lvl - 1];
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
			brelse(sbp);
			return (ENOSPC);
		}
		sbap = (ufs2_daddr_t *)sbp->b_data;
		soff = idp->in_off;
	}
	/*
	 * If the block range spans two block maps, get the second map.
	 */
	ebap = NULL;
	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
		ssize = len;
	} else {
#ifdef INVARIANTS
		if (start_lvl > 0 &&
		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
			panic("ffs_reallocblk: start == end");
#endif
		ssize = len - (idp->in_off + 1);
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
			goto fail;
		ebap = (ufs2_daddr_t *)ebp->b_data;
	}
	/*
	 * Find the preferred location for the cluster. If we have not
	 * previously failed at this endeavor, then follow our standard
	 * preference calculation. If we have failed at it, then pick up
	 * where we last ended our search.
	 */
	UFS_LOCK(ump);
	if (ip->i_nextclustercg == -1)
		pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
	else
		pref = cgdata(fs, ip->i_nextclustercg);
	/*
	 * Search the block map looking for an allocation of the desired size.
	 * To avoid wasting too much time, we limit the number of cylinder
	 * groups that we will search.
	 */
	cg = dtog(fs, pref);
	MPASS(cg < fs->fs_ncg);
	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
			break;
		cg += 1;
		if (cg >= fs->fs_ncg)
			cg = 0;
	}
	/*
	 * If we have failed in our search, record where we gave up for
	 * next time. Otherwise, fall back to our usual search criterion.
	 */
	if (newblk == 0) {
		ip->i_nextclustercg = cg;
		UFS_UNLOCK(ump);
		goto fail;
	}
	ip->i_nextclustercg = -1;
	/*
	 * We have found a new contiguous block.
	 *
	 * First we have to replace the old block pointers with the new
	 * block pointers in the inode and indirect blocks associated
	 * with the file.
	 */
#ifdef DIAGNOSTIC
	if (prtrealloc)
		printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
		    (uintmax_t)ip->i_number,
		    (intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
	blkno = newblk;
	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
		if (i == ssize) {
			bap = ebap;
			soff = -i;
		}
#ifdef INVARIANTS
		if (!ffs_checkfreeblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 2");
		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
			panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DIAGNOSTIC
		if (prtrealloc)
			printf(" %jd,", (intmax_t)*bap);
#endif
		if (DOINGSOFTDEP(vp)) {
			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
				softdep_setup_allocdirect(ip, start_lbn + i,
				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
				    buflist->bs_children[i]);
			else
				softdep_setup_allocindir_page(ip, start_lbn + i,
				    i < ssize ? sbp : ebp, soff + i, blkno,
				    *bap, buflist->bs_children[i]);
		}
		*bap++ = blkno;
	}
	/*
	 * Next we must write out the modified inode and indirect blocks.
	 * For strict correctness, the writes should be synchronous since
	 * the old block values may have been written to disk. In practice
	 * they are almost never written, but if we are concerned about
	 * strict correctness, the `doasyncfree' flag should be set to zero.
	 *
	 * The test on `doasyncfree' should be changed to test a flag
	 * that shows whether the associated buffers and inodes have
	 * been written. The flag should be set when the cluster is
	 * started and cleared whenever the buffer or inode is flushed.
	 * We can then check below to see if it is set, and do the
	 * synchronous write only when it has been cleared.
	 */
	if (sbap != &ip->i_din2->di_db[0]) {
		if (doasyncfree)
			bdwrite(sbp);
		else
			bwrite(sbp);
	} else {
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		if (!doasyncfree)
			ffs_update(vp, 1);
	}
	if (ssize < len) {
		if (doasyncfree)
			bdwrite(ebp);
		else
			bwrite(ebp);
	}
	/*
	 * Last, free the old blocks and assign the new blocks to the buffers.
	 */
#ifdef DIAGNOSTIC
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		bp = buflist->bs_children[i];
		if (!DOINGSOFTDEP(vp))
			/*
			 * The usual case is that a set of N-contiguous blocks
			 * that was just allocated has been replaced with a
			 * set of N+1-contiguous blocks. If they are marked as
			 * B_DELWRI, the current contents have not been written
			 * to disk. It is possible that the blocks were written
			 * earlier, but very uncommon. If the blocks have never
			 * been written, there is no need to send a BIO_DELETE
			 * for them when they are freed. The gain from avoiding
			 * the TRIMs for the common case of unwritten blocks
			 * far exceeds the cost of the write amplification for
			 * the uncommon case of failing to send a TRIM for the
			 * blocks that had been written.
			 */
			ffs_blkfree(ump, fs, ump->um_devvp,
			    dbtofsb(fs, bp->b_blkno),
			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
			    (bp->b_flags & B_DELWRI) != 0 ?
			    NOTRIM_KEY : SINGLETON_KEY);
		bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
		if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno),
		   fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DIAGNOSTIC
		if (prtrealloc)
			printf(" %jd,", (intmax_t)blkno);
#endif
	}
#ifdef DIAGNOSTIC
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din2->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}

/*
 * Allocate an inode in the filesystem.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 *   1) allocate the preferred inode.
 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 */
int
ffs_valloc(struct vnode *pvp,
	int mode,
	struct ucred *cred,
	struct vnode **vpp)
{
	struct inode *pip;
	struct fs *fs;
	struct inode *ip;
	struct timespec ts;
	struct ufsmount *ump;
	ino_t ino, ipref;
	uint64_t cg;
	int error, reclaimed;

	*vpp = NULL;
	pip = VTOI(pvp);
	ump = ITOUMP(pip);
	fs = ump->um_fs;

	UFS_LOCK(ump);
	reclaimed = 0;
retry:
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track the number of dirs created one after another
	 * in the same cg without intervening allocation of files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
	    (allocfcn_t *)ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;
	/*
	 * Get rid of the cached old vnode, force allocation of a new vnode
	 * for this inode. If this fails, release the allocated ino and
	 * return the error.
	 */
	if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
	    FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) {
		ffs_vfree(pvp, ino, mode);
		return (error);
	}
	/*
	 * We got an inode, so check mode and panic if it is already allocated.
	 */
	ip = VTOI(*vpp);
	if (ip->i_mode) {
		printf("mode = 0%o, inum = %ju, fs = %s\n",
		    ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
		panic("ffs_valloc: dup alloc");
	}
	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */
		printf("free inode %s/%ju had %ld blocks\n",
		    fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks));
		DIP_SET(ip, i_blocks, 0);
	}
	ip->i_flags = 0;
	DIP_SET(ip, i_flags, 0);
	if ((mode & IFMT) == IFDIR)
		DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1);
	/*
	 * Set up a new generation number for this inode.
	 */
	while (ip->i_gen == 0 || ++ip->i_gen == 0)
		ip->i_gen = arc4random();
	DIP_SET(ip, i_gen, ip->i_gen);
	if (fs->fs_magic == FS_UFS2_MAGIC) {
		vfs_timestamp(&ts);
		ip->i_din2->di_birthtime = ts.tv_sec;
		ip->i_din2->di_birthnsec = ts.tv_nsec;
	}
	ip->i_flag = 0;
	(*vpp)->v_vflag = 0;
	(*vpp)->v_type = VNON;
	if (fs->fs_magic == FS_UFS2_MAGIC) {
		(*vpp)->v_op = &ffs_vnodeops2;
		UFS_INODE_SET_FLAG(ip, IN_UFS2);
	} else {
		(*vpp)->v_op = &ffs_vnodeops1;
	}
	return (0);
noinodes:
	if (reclaimed == 0) {
		reclaimed = 1;
		softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
		goto retry;
	}
	if (ffs_fsfail_cleanup_locked(ump, 0)) {
		UFS_UNLOCK(ump);
		return (ENXIO);
	}
	if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
		UFS_UNLOCK(ump);
		ffs_fserr(fs, pip->i_number, "out of inodes");
		uprintf("\n%s: create/symlink failed, no inodes free\n",
		    fs->fs_fsmnt);
	} else {
		UFS_UNLOCK(ump);
	}
	return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 *
 * The policy implemented by this algorithm is to allocate a
 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for the inodes and data
 * of its files. Restrict the number of directories which may be
 * allocated one after another in the same cylinder group
 * without intervening allocation of files.
 *
 * If we allocate a first level directory then force allocation
 * in another cylinder group.
 */
static ino_t
ffs_dirpref(struct inode *pip)
{
	struct fs *fs;
	int cg, prefcg, curcg, dirsize, cgsize;
	int depth, range, start, end, numdirs, power, numerator, denominator;
	uint64_t avgifree, avgbfree, avgndir, curdirsize;
	uint64_t minifree, minbfree, maxndir;
	uint64_t maxcontigdirs;

	mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
	fs = ITOFS(pip);

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Select a preferred cylinder group to place a new directory.
	 * If we are near the root of the filesystem we aim to spread
	 * them out as much as possible. As we descend deeper from the
	 * root we cluster them closer together around their parent as
	 * we expect them to be more closely interactive. Higher-level
	 * directories like usr/src/sys and usr/src/bin should be
	 * separated while the directories in these areas are more
	 * likely to be accessed together so should be closer.
	 *
	 * We pick a range of cylinder groups around the cylinder group
	 * of the directory in which we are being created. The size of
	 * the range for our search is based on our depth from the root
	 * of our filesystem. We then probe that range based on how many
	 * directories are already present. The first new directory is at
	 * 1/2 (middle) of the range; the second is in the first 1/4 of the
	 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc.
	 */
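/*
 * Editor's illustrative note (a worked example, not part of the original
 * source): the fls()-based arithmetic below generates exactly the probe
 * sequence described above. With numdirs = 3 (three sibling directories
 * already exist): power = fls(3) = 2, numerator = (3 & ~2) * 2 + 1 = 3,
 * denominator = 1 << 2 = 4, so the probe lands 3/4 of the way through
 * the range; with numdirs = 4: power = 3, numerator = 1, denominator = 8,
 * i.e. the 1/8 position.
 */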
	depth = DIP(pip, i_dirdepth);
	range = fs->fs_ncg / (1 << depth);
	curcg = ino_to_cg(fs, pip->i_number);
	start = curcg - (range / 2);
	if (start < 0)
		start += fs->fs_ncg;
	end = curcg + (range / 2);
	if (end >= fs->fs_ncg)
		end -= fs->fs_ncg;
	numdirs = pip->i_effnlink - 1;
	power = fls(numdirs);
	numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1;
	denominator = 1 << power;
	prefcg = (curcg - (range / 2) + (range * numerator / denominator));
	if (prefcg < 0)
		prefcg += fs->fs_ncg;
	if (prefcg >= fs->fs_ncg)
		prefcg -= fs->fs_ncg;
	/*
	 * If this filesystem is not tracking directory depths,
	 * revert to the old algorithm.
	 */
	if (depth == 0 && pip->i_number != UFS_ROOTINO)
		prefcg = curcg;

	/*
	 * Count various limits which are used for
	 * optimal allocation of a directory inode.
	 */
	maxndir = min(avgndir + (1 << depth), fs->fs_ipg);
	minifree = avgifree - avgifree / 4;
	if (minifree < 1)
		minifree = 1;
	minbfree = avgbfree - avgbfree / 4;
	if (minbfree < 1)
		minbfree = 1;
	cgsize = fs->fs_fsize * fs->fs_fpg;
	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
	if (dirsize < curdirsize)
		dirsize = curdirsize;
	if (dirsize <= 0)
		maxcontigdirs = 0;		/* dirsize overflowed */
	else
		maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs,
		    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 *
	 * We are trying to find a suitable cylinder group nearby
	 * our preferred cylinder group to place a new directory.
	 * We scan from our preferred cylinder group forward looking
	 * for a cylinder group that meets our criterion. If we get
	 * to the final cylinder group and do not find anything,
	 * we start scanning forwards from the beginning of the
	 * filesystem. While it might seem sensible to start scanning
	 * backwards or even to alternate looking forward and backward,
	 * this approach fails badly when the filesystem is nearly full.
	 * Specifically, we first search all the areas that have no space
	 * and finally try the one preceding that. We repeat this on
	 * every request and in the case of the final block end up
	 * searching the entire filesystem. By jumping to the front
	 * of the filesystem, our future forward searches always look
	 * in new cylinder groups so find every possible block after
	 * one pass over the filesystem.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we have a deficit in space.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	return ((ino_t)(fs->fs_ipg * cg));
}

/*
 * Select the desired position for the next block in a file. The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks and the next fs_maxbpg blocks. Each additional section
 * contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. The first indirect is allocated immediately following the last
 * direct block and the data blocks for the first indirect immediately
 * follow it.
 *
 * If no blocks have been allocated in any other section, the indirect
 * block(s) are allocated in the same cylinder group as its inode in an
 * area reserved immediately following the inode blocks. The policy for
 * the data blocks is to place them in a cylinder group with a greater than
 * average number of free blocks. An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block or the previous block is a hole, then the information on
 * the previous allocation is unavailable; here a best guess is made based
 * on the logical block number being allocated.
 *
 * If a section is already partially allocated, the policy is to
 * allocate blocks contiguously within the section if possible.
 */
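/*
 * Editor's illustrative note (an assumed configuration, not part of the
 * original source): on a UFS2 filesystem with 32768-byte blocks,
 * NINDIR(fs) = fs_bsize / sizeof(ufs2_daddr_t) = 4096, so the
 * "lbn < UFS_NDADDR + NINDIR(fs)" tests in the functions below cover the
 * first 12 + 4096 logical blocks: the direct blocks plus everything
 * mapped by the first indirect block.
 */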
ufs2_daddr_t
ffs_blkpref_ufs1(struct inode *ip,
	ufs_lbn_t lbn,
	int indx,
	ufs1_daddr_t *bap)
{
	struct fs *fs;
	uint64_t cg, inocg;
	uint64_t avgbfree, startcg;
	ufs2_daddr_t pref, prevbn;

	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
	fs = ITOFS(ip);
	/*
	 * Allocation of indirect blocks is indicated by passing negative
	 * values in indx: -1 for single indirect, -2 for double indirect,
	 * -3 for triple indirect. As noted below, we attempt to allocate
	 * the first indirect inline with the file data. For all later
	 * indirect blocks, the data is often allocated in other cylinder
	 * groups. However to speed random file access and to speed up
	 * fsck, the filesystem reserves the first fs_metaspace blocks
	 * (typically half of fs_minfree) of the data area of each cylinder
	 * group to hold these later indirect blocks.
	 */
	inocg = ino_to_cg(fs, ip->i_number);
	if (indx < 0) {
		/*
		 * Our preference for indirect blocks is the zone at the
		 * beginning of the inode's cylinder group data area that
		 * we try to reserve for indirect blocks.
		 */
		pref = cgmeta(fs, inocg);
		/*
		 * If we are allocating the first indirect block, try to
		 * place it immediately following the last direct block.
		 */
		if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
		    ip->i_din1->di_db[UFS_NDADDR - 1] != 0) {
			pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag;
			if (dtog(fs, pref) >= fs->fs_ncg)
				pref = 0;
		}
		return (pref);
	}
	/*
	 * If we are allocating the first data block in the first indirect
	 * block and the indirect has been allocated in the data block area,
	 * try to place it immediately following the indirect block.
	 */
	if (lbn == UFS_NDADDR) {
		pref = ip->i_din1->di_ib[0];
		if (pref != 0 && pref >= cgdata(fs, inocg) &&
		    pref < cgbase(fs, inocg + 1)) {
			if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg)
				return (0);
			return (pref + fs->fs_frag);
		}
	}
	/*
	 * If we are at the beginning of a file, or we have already allocated
	 * the maximum number of blocks per cylinder group, or we do not
	 * have a block allocated immediately preceding us, then we need
	 * to decide where to start allocating new blocks.
	 */
	if (indx == 0) {
		prevbn = 0;
	} else {
		prevbn = bap[indx - 1];
		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn,
		    fs->fs_bsize) != 0)
			prevbn = 0;
	}
	if (indx % fs->fs_maxbpg == 0 || prevbn == 0) {
		/*
		 * If we are allocating a directory data block, we want
		 * to place it in the metadata area.
		 */
		if ((ip->i_mode & IFMT) == IFDIR)
			return (cgmeta(fs, inocg));
		/*
		 * Until we fill all the direct and all the first indirect's
		 * blocks, we try to allocate in the data area of the inode's
		 * cylinder group.
		 */
		if (lbn < UFS_NDADDR + NINDIR(fs))
			return (cgdata(fs, inocg));
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || prevbn == 0)
			startcg = inocg + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs, prevbn) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		return (0);
	}
	/*
	 * Otherwise, we just always try to lay things out contiguously.
	 */
	if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg)
		return (0);
	return (prevbn + fs->fs_frag);
}

/*
 * Same as above, but for UFS2
 */
ufs2_daddr_t
ffs_blkpref_ufs2(struct inode *ip,
	ufs_lbn_t lbn,
	int indx,
	ufs2_daddr_t *bap)
{
	struct fs *fs;
	uint64_t cg, inocg;
	uint64_t avgbfree, startcg;
	ufs2_daddr_t pref, prevbn;

	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
	fs = ITOFS(ip);
	/*
	 * Allocation of indirect blocks is indicated by passing negative
	 * values in indx: -1 for single indirect, -2 for double indirect,
	 * -3 for triple indirect. As noted below, we attempt to allocate
	 * the first indirect inline with the file data. For all later
	 * indirect blocks, the data is often allocated in other cylinder
	 * groups. However to speed random file access and to speed up
	 * fsck, the filesystem reserves the first fs_metaspace blocks
	 * (typically half of fs_minfree) of the data area of each cylinder
	 * group to hold these later indirect blocks.
	 */
	inocg = ino_to_cg(fs, ip->i_number);
	if (indx < 0) {
		/*
		 * Our preference for indirect blocks is the zone at the
		 * beginning of the inode's cylinder group data area that
		 * we try to reserve for indirect blocks.
		 */
		pref = cgmeta(fs, inocg);
		/*
		 * If we are allocating the first indirect block, try to
		 * place it immediately following the last direct block.
		 */
		if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
		    ip->i_din2->di_db[UFS_NDADDR - 1] != 0) {
			pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag;
			if (dtog(fs, pref) >= fs->fs_ncg)
				pref = 0;
		}
		return (pref);
	}
	/*
	 * If we are allocating the first data block in the first indirect
	 * block and the indirect has been allocated in the data block area,
	 * try to place it immediately following the indirect block.
	 */
	if (lbn == UFS_NDADDR) {
		pref = ip->i_din2->di_ib[0];
		if (pref != 0 && pref >= cgdata(fs, inocg) &&
		    pref < cgbase(fs, inocg + 1)) {
			if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg)
				return (0);
			return (pref + fs->fs_frag);
		}
	}
	/*
	 * If we are at the beginning of a file, or we have already allocated
	 * the maximum number of blocks per cylinder group, or we do not
	 * have a block allocated immediately preceding us, then we need
	 * to decide where to start allocating new blocks.
	 */
	if (indx == 0) {
		prevbn = 0;
	} else {
		prevbn = bap[indx - 1];
		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn,
		    fs->fs_bsize) != 0)
			prevbn = 0;
	}
	if (indx % fs->fs_maxbpg == 0 || prevbn == 0) {
		/*
		 * If we are allocating a directory data block, we want
		 * to place it in the metadata area.
		 */
		if ((ip->i_mode & IFMT) == IFDIR)
			return (cgmeta(fs, inocg));
		/*
		 * Until we fill all the direct and all the first indirect's
		 * blocks, we try to allocate in the data area of the inode's
		 * cylinder group.
		 */
		if (lbn < UFS_NDADDR + NINDIR(fs))
			return (cgdata(fs, inocg));
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || prevbn == 0)
			startcg = inocg + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs, prevbn) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		for (cg = 0; cg < startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		return (0);
	}
	/*
	 * Otherwise, we just always try to lay things out contiguously.
	 */
	if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg)
		return (0);
	return (prevbn + fs->fs_frag);
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
 *   3) brute force search for a free block.
 *
 * Must be called with the UFS lock held. Will release the lock on success
 * and return with it held on failure.
 */
1652
/*VARARGS5*/
1653
static ufs2_daddr_t
1654
ffs_hashalloc(struct inode *ip,
1655
uint64_t cg,
1656
ufs2_daddr_t pref,
1657
int size, /* Search size for data blocks, mode for inodes */
1658
int rsize, /* Real allocated size. */
1659
allocfcn_t *allocator)
1660
{
1661
struct fs *fs;
1662
ufs2_daddr_t result;
1663
uint64_t i, icg = cg;
1664
1665
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
1666
#ifdef INVARIANTS
1667
if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
1668
panic("ffs_hashalloc: allocation on suspended filesystem");
1669
#endif
1670
fs = ITOFS(ip);
1671
/*
1672
* 1: preferred cylinder group
1673
*/
1674
result = (*allocator)(ip, cg, pref, size, rsize);
1675
if (result)
1676
return (result);
1677
/*
1678
* 2: quadratic rehash
1679
*/
1680
for (i = 1; i < fs->fs_ncg; i *= 2) {
1681
cg += i;
1682
if (cg >= fs->fs_ncg)
1683
cg -= fs->fs_ncg;
1684
result = (*allocator)(ip, cg, 0, size, rsize);
1685
if (result)
1686
return (result);
1687
}
1688
/*
1689
* 3: brute force search
1690
* Note that we start at i == 2, since 0 was checked initially,
1691
* and 1 is always checked in the quadratic rehash.
1692
*/
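	/*
	 * Illustrative trace (hypothetical numbers): with fs_ncg = 8 and
	 * icg = 3, step 2 probed cgs 4, 6, and 2 (offsets 1, 2, then 4,
	 * wrapping mod 8), so this step sweeps cgs 5, 6, 7, 0, 1, 2.
	 */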
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size, rsize);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (0);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 */
static ufs2_daddr_t
ffs_fragextend(struct inode *ip,
	uint64_t cg,
	ufs2_daddr_t bprev,
	int osize,
	int nsize)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int nffree;
	long bno;
	int frags, bbase;
	int i, error;
	uint8_t *blksfree;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (0);
	frags = numfrags(fs, nsize);
	bbase = fragnum(fs, bprev);
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	UFS_UNLOCK(ump);
	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
		ffs_checkcgintegrity(fs, cg, error);
		goto fail;
	}
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp);
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
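	/*
	 * Example for illustration: extending a 2-frag piece to 3 frags
	 * in an 8-frag block with 4 free frags following it: the free run
	 * after the old piece shrinks from 4 to 3, so cg_frsum[4] is
	 * decremented, cg_frsum[3] is incremented, and one more frag is
	 * marked in use below.
	 */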
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		nffree++;
	}
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= nffree;
	fs->fs_cs(fs, cg).cs_nffree -= nffree;
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
		    frags, numfrags(fs, osize));
	bdwrite(bp);
	return (bprev);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 */
static ufs2_daddr_t
ffs_alloccg(struct inode *ip,
	uint64_t cg,
	ufs2_daddr_t bpref,
	int size,
	int rsize)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	int i, allocsiz, error, frags;
	uint8_t *blksfree;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	UFS_UNLOCK(ump);
	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
		ffs_checkcgintegrity(fs, cg, error);
		goto fail;
	}
	if (size == fs->fs_bsize) {
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
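	/*
	 * For instance (illustrative): a 2-frag request on an 8-frag
	 * filesystem first tries cg_frsum[2]; if the smallest available
	 * piece is 5 frags, allocsiz settles on 5 and the leftover 3
	 * frags are credited back to cg_frsum[3] further down.
	 */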
	blksfree = cg_blksfree(cgp);
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	KASSERT(size == rsize,
	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0)
		goto fail;
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	cgp->cg_frsum[allocsiz]--;
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cg) + bno;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
	bdwrite(bp);
	return (blkno);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static ufs2_daddr_t
ffs_alloccgblk(struct inode *ip,
	struct buf *bp,
	ufs2_daddr_t bpref,
	int size)
{
	struct fs *fs;
	struct cg *cgp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	uint8_t *blksfree;
	int i, cgbpref;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp);
	if (bpref == 0) {
		bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
	} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
		/* map bpref to correct zone in this cg */
		if (bpref < cgdata(fs, cgbpref))
			bpref = cgmeta(fs, cgp->cg_cgx);
		else
			bpref = cgdata(fs, cgp->cg_cgx);
	}
	/*
	 * if the requested block is available, use it
	 */
	bno = dtogd(fs, blknum(fs, bpref));
	if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
		goto gotit;
	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (0);
	/* Update cg_rotor only if allocated from the data zone */
	if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
		cgp->cg_rotor = bno;
gotit:
	blkno = fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, (long)blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cgp->cg_cgx) + bno;
	/*
	 * If the caller didn't want the whole block, free the frags here.
	 */
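	/*
	 * Sketch of the accounting (illustrative): with 8 frags per block
	 * and a 3-frag request, frags 3..7 of the block are marked free
	 * again, the fragment counts rise by 5, and cg_frsum[5] gains an
	 * entry for the new 5-frag piece.
	 */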
	size = numfrags(fs, size);
	if (size != fs->fs_frag) {
		bno = dtogd(fs, blkno);
		for (i = size; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - size;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
	}
	/* XXX Fixme. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
	UFS_LOCK(ump);
	return (blkno);
}

/*
 * Determine whether a cluster can be allocated.
 *
 * We do not currently check for optimal rotational layout if there
 * are multiple choices in the same cylinder group. Instead we just
 * take the first one that we find following bpref.
 */
static ufs2_daddr_t
ffs_clusteralloc(struct inode *ip,
	uint64_t cg,
	ufs2_daddr_t bpref,
	int len)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int i, run, bit, map, got, error;
	ufs2_daddr_t bno;
	uint8_t *mapp;
	int32_t *lp;
	uint8_t *blksfree;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
	MPASS(cg < fs->fs_ncg);
	if (fs->fs_maxcluster[cg] < len)
		return (0);
	UFS_UNLOCK(ump);
	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
		ffs_checkcgintegrity(fs, cg, error);
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		UFS_LOCK(ump);
		fs->fs_maxcluster[cg] = i;
		brelse(bp);
		return (0);
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
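	/*
	 * Illustrative walk of the scan below: with NBBY = 8 and bpref at
	 * cluster 10, mapp starts at byte 1 of the free-cluster map with
	 * bit = 1 << 2, advancing one bit per block and reloading map at
	 * each byte boundary until len consecutive set bits are found.
	 */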
	if (dtog(fs, bpref) != cg)
		bpref = cgdata(fs, cg);
	else
		bpref = blknum(fs, bpref);
	bpref = fragstoblks(fs, dtogd(fs, bpref));
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got >= cgp->cg_nclusterblks) {
		UFS_LOCK(ump);
		brelse(bp);
		return (0);
	}
	/*
	 * Allocate the cluster that we have found.
	 */
	blksfree = cg_blksfree(cgp);
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, blksfree, got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	len = blkstofrags(fs, len);
	UFS_LOCK(ump);
	for (i = 0; i < len; i += fs->fs_frag)
		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
			panic("ffs_clusteralloc: lost block");
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	bdwrite(bp);
	return (bno);
}

static inline struct buf *
getinobuf(struct inode *ip,
	uint64_t cg,
	uint32_t cginoblk,
	int gbflags)
{
	struct fs *fs;

	fs = ITOFS(ip);
	return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
	    gbflags));
}

/*
 * Synchronous inode initialization is needed only when barrier writes do not
 * work as advertised, and will impose a heavy cost on file creation in a newly
 * created filesystem.
 */
static int doasyncinodeinit = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
    &doasyncinodeinit, 0,
    "Perform inode block initialization using asynchronous writes");

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
 */
static ufs2_daddr_t
ffs_nodealloccg(struct inode *ip,
	uint64_t cg,
	ufs2_daddr_t ipref,
	int mode,
	int unused)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp, *ibp;
	struct ufsmount *ump;
	uint8_t *inosused, *loc;
	struct ufs2_dinode *dp2;
	int error, start, len, i;
	uint32_t old_initediblk;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
check_nifree:
	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (0);
	UFS_UNLOCK(ump);
	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
		ffs_checkcgintegrity(fs, cg, error);
		UFS_LOCK(ump);
		return (0);
	}
restart:
	if (cgp->cg_cs.cs_nifree == 0) {
		brelse(bp);
		UFS_LOCK(ump);
		return (0);
	}
	inosused = cg_inosused(cgp);
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(inosused, ipref))
			goto gotit;
	}
	start = cgp->cg_irotor / NBBY;
	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
	loc = memcchr(&inosused[start], 0xff, len);
	if (loc == NULL) {
		len = start + 1;
		start = 0;
		loc = memcchr(&inosused[start], 0xff, len);
		if (loc == NULL) {
			printf("cg = %ju, irotor = %ld, fs = %s\n",
			    (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
			panic("ffs_nodealloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
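	/*
	 * Example (illustrative): if memcchr() stops at byte 3 holding
	 * 0x3f (low six inodes used), ffs(~0x3f) returns 7, giving
	 * ipref = 3 * 8 + 7 - 1 = 30, the first free inode in the group.
	 */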
gotit:
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (fs->fs_magic == FS_UFS2_MAGIC &&
	    ipref + INOPB(fs) > cgp->cg_initediblk &&
	    cgp->cg_initediblk < cgp->cg_niblk) {
		old_initediblk = cgp->cg_initediblk;

		/*
		 * Free the cylinder group lock before writing the
		 * initialized inode block. Entering the
		 * babarrierwrite() with the cylinder group lock
		 * causes lock order violation between the lock and
		 * snaplk.
		 *
		 * Another thread can decide to initialize the same
		 * inode block, but whichever thread first gets the
		 * cylinder group lock after writing the newly
		 * allocated inode block will update it and the other
		 * will realize that it has lost and leave the
		 * cylinder group unchanged.
		 */
		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
		brelse(bp);
		if (ibp == NULL) {
			/*
			 * The inode block buffer is already owned by
			 * another thread, which must initialize it.
			 * Wait on the buffer to allow another thread
			 * to finish the updates, with dropped cg
			 * buffer lock, then retry.
			 */
			ibp = getinobuf(ip, cg, old_initediblk, 0);
			brelse(ibp);
			UFS_LOCK(ump);
			goto check_nifree;
		}
		bzero(ibp->b_data, (int)fs->fs_bsize);
		dp2 = (struct ufs2_dinode *)(ibp->b_data);
		for (i = 0; i < INOPB(fs); i++) {
			while (dp2->di_gen == 0)
				dp2->di_gen = arc4random();
			dp2++;
		}

		/*
		 * Rather than adding a soft updates dependency to ensure
		 * that the new inode block is written before it is claimed
		 * by the cylinder group map, we just do a barrier write
		 * here. The barrier write will ensure that the inode block
		 * gets written before the updated cylinder group map can be
		 * written. The barrier write should only slow down bulk
		 * loading of newly created filesystems.
		 */
		if (doasyncinodeinit)
			babarrierwrite(ibp);
		else
			bwrite(ibp);

		/*
		 * After the inode block is written, try to update the
		 * cg initediblk pointer. If another thread beat us
		 * to it, then leave it unchanged as the other thread
		 * has already set it correctly.
		 */
		error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp);
		UFS_LOCK(ump);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		if (error != 0)
			return (error);
		if (cgp->cg_initediblk == old_initediblk)
			cgp->cg_initediblk += INOPB(fs);
		goto restart;
	}
	cgp->cg_irotor = ipref;
	UFS_LOCK(ump);
	ACTIVECLEAR(fs, cg);
	setbit(inosused, ipref);
	cgp->cg_cs.cs_nifree--;
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir++;
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
	bdwrite(bp);
	return ((ino_t)(cg * fs->fs_ipg + ipref));
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 */
static void
ffs_blkfree_cg(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	struct workhead *dephd)
{
	struct mount *mp;
	struct cg *cgp;
	struct buf *bp;
	daddr_t dbn;
	ufs1_daddr_t fragno, cgbno;
	int i, blk, frags, bbase, error;
	uint64_t cg;
	uint8_t *blksfree;
	struct cdev *dev;

	cg = dtog(fs, bno);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		MPASS(devvp->v_mount->mnt_data == ump);
		dev = ump->um_devvp->v_rdev;
	} else if (devvp->v_type == VCHR) {
		/*
		 * devvp is a normal disk device
		 * XXXKIB: devvp is not locked there, v_rdev access depends on
		 * busy mount, which prevents mntfs devvp from reclamation.
		 */
		dev = devvp->v_rdev;
	} else
		return;
#ifdef INVARIANTS
	if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
		    size, fs->fs_fsmnt);
		panic("ffs_blkfree_cg: invalid size");
	}
#endif
	if ((uint64_t)bno >= fs->fs_size) {
		printf("bad block %jd, ino %ju\n", (intmax_t)bno,
		    (intmax_t)inum);
		ffs_fserr(fs, inum, "bad block");
		return;
	}
	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
		if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
			return;
		/*
		 * Would like to just downgrade to read-only. Until that
		 * capability is available, just toss the cylinder group
		 * update and mark the filesystem as needing to run fsck.
		 */
		fs->fs_flags |= FS_NEEDSFSCK;
		if (devvp->v_type == VREG)
			dbn = fragstoblks(fs, cgtod(fs, cg));
		else
			dbn = fsbtodb(fs, cgtod(fs, cg));
		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
		KASSERT(error == 0, ("getblkx failed"));
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd, true);
		bp->b_flags |= B_RELBUF | B_NOCACHE;
		bp->b_flags &= ~B_CACHE;
		bawrite(bp);
		return;
	}
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp);
	UFS_LOCK(ump);
	if (size == fs->fs_bsize) {
		fragno = fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp->v_type == VREG) {
				UFS_UNLOCK(ump);
				/* devvp is a snapshot */
				brelse(bp);
				return;
			}
			printf("dev = %s, block = %jd, fs = %s\n",
			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
			panic("ffs_blkfree_cg: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
	} else {
		bbase = cgbno - fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = %s, block = %jd, fs = %s\n",
				    devtoname(dev), (intmax_t)(bno + i),
				    fs->fs_fsmnt);
				panic("ffs_blkfree_cg: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
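		/*
		 * E.g. (illustrative): freeing the last 2 frags of an
		 * 8-frag block whose other 6 frags are already free makes
		 * ffs_isblock() true below, so fs_frag is subtracted back
		 * out of the fragment counts and a full block is credited
		 * instead.
		 */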
		fragno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
		}
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	mp = UFSTOVFS(ump);
	if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd, false);
	bdwrite(bp);
}

/*
 * Structures and routines associated with trim management.
 *
 * The following requests are passed to trim_lookup to indicate
 * the actions that should be taken.
 */
#define	NEW	1	/* if found, error else allocate and hash it */
#define	OLD	2	/* if not found, error, else return it */
#define	REPLACE	3	/* if not found, error else unhash and reallocate it */
#define	DONE	4	/* if not found, error else unhash and return it */
#define	SINGLE	5	/* don't look up, just allocate it and don't hash it */

MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");

#define	TRIMLIST_HASH(ump, key) \
	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
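
/*
 * Note: um_trimlisthashsize is assumed here to be the mask form produced
 * by hashinit(9), so TRIMLIST_HASH() reduces a key to its hash bucket
 * with a single AND.
 */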

/*
 * These structures describe each of the block free requests aggregated
 * together to make up a trim request.
 */
struct trim_blkreq {
	TAILQ_ENTRY(trim_blkreq) blkreqlist;
	ufs2_daddr_t bno;
	long size;
	struct workhead *pdephd;
	struct workhead dephd;
};

/*
 * Description of a trim request.
 */
struct ffs_blkfree_trim_params {
	TAILQ_HEAD(, trim_blkreq) blklist;
	LIST_ENTRY(ffs_blkfree_trim_params) hashlist;
	struct task task;
	struct ufsmount *ump;
	struct vnode *devvp;
	ino_t inum;
	ufs2_daddr_t bno;
	long size;
	long key;
};

static void ffs_blkfree_trim_completed(struct buf *);
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *,
	    struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int);
static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *);

/*
 * Called on trim completion to start a task to free the associated block(s).
 */
static void
ffs_blkfree_trim_completed(struct buf *bp)
{
	struct ffs_blkfree_trim_params *tp;

	tp = bp->b_fsprivate1;
	free(bp, M_TRIM);
	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
	taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
}

/*
 * Trim completion task that frees the associated block(s).
 */
static void
ffs_blkfree_trim_task(void *ctx, int pending)
{
	struct ffs_blkfree_trim_params *tp;
	struct trim_blkreq *blkelm;
	struct ufsmount *ump;

	tp = ctx;
	ump = tp->ump;
	while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) {
		ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno,
		    blkelm->size, tp->inum, blkelm->pdephd);
		TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist);
		free(blkelm, M_TRIM);
	}
	vn_finished_secondary_write(UFSTOVFS(ump));
	UFS_LOCK(ump);
	ump->um_trim_inflight -= 1;
	ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size);
	UFS_UNLOCK(ump);
	free(tp, M_TRIM);
}

/*
 * Lookup a trim request by its key.
 * Allocate if requested (NEW, REPLACE, SINGLE).
 */
static struct ffs_blkfree_trim_params *
trim_lookup(struct ufsmount *ump,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	uint64_t key,
	int alloctype)
{
	struct trimlist_hashhead *tphashhead;
	struct ffs_blkfree_trim_params *tp, *ntp;

	ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
	if (alloctype != SINGLE) {
		KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key"));
		UFS_LOCK(ump);
		tphashhead = TRIMLIST_HASH(ump, key);
		LIST_FOREACH(tp, tphashhead, hashlist)
			if (key == tp->key)
				break;
	}
	switch (alloctype) {
	case NEW:
		KASSERT(tp == NULL, ("trim_lookup: found trim"));
		break;
	case OLD:
		KASSERT(tp != NULL,
		    ("trim_lookup: missing call to ffs_blkrelease_start()"));
		UFS_UNLOCK(ump);
		free(ntp, M_TRIM);
		return (tp);
	case REPLACE:
		KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim"));
		LIST_REMOVE(tp, hashlist);
		/* tp will be freed by caller */
		break;
	case DONE:
		KASSERT(tp != NULL, ("trim_lookup: missing DONE trim"));
		LIST_REMOVE(tp, hashlist);
		UFS_UNLOCK(ump);
		free(ntp, M_TRIM);
		return (tp);
	}
	TAILQ_INIT(&ntp->blklist);
	ntp->ump = ump;
	ntp->devvp = devvp;
	ntp->bno = bno;
	ntp->size = size;
	ntp->inum = inum;
	ntp->key = key;
	if (alloctype != SINGLE) {
		LIST_INSERT_HEAD(tphashhead, ntp, hashlist);
		UFS_UNLOCK(ump);
	}
	return (ntp);
}

/*
 * Dispatch a trim request.
 */
static void
ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp)
{
	struct ufsmount *ump;
	struct mount *mp;
	struct buf *bp;

	/*
	 * Postpone the set of the free bit in the cg bitmap until the
	 * BIO_DELETE is completed. Otherwise, due to disk queue
	 * reordering, TRIM might be issued after we reuse the block
	 * and write some new data into it.
	 */
	ump = tp->ump;
	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
	bp->b_iocmd = BIO_DELETE;
	bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno));
	bp->b_iodone = ffs_blkfree_trim_completed;
	bp->b_bcount = tp->size;
	bp->b_fsprivate1 = tp;
	UFS_LOCK(ump);
	ump->um_trim_total += 1;
	ump->um_trim_inflight += 1;
	ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size);
	ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size);
	UFS_UNLOCK(ump);

	mp = UFSTOVFS(ump);
	vn_start_secondary_write(NULL, &mp, 0);
	g_vfs_strategy(ump->um_bo, bp);
}

/*
 * Allocate a new key to use to identify a range of blocks.
 */
uint64_t
ffs_blkrelease_start(struct ufsmount *ump,
	struct vnode *devvp,
	ino_t inum)
{
	static u_long masterkey;
	uint64_t key;

	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
		return (SINGLETON_KEY);
	do {
		key = atomic_fetchadd_long(&masterkey, 1);
	} while (key < FIRST_VALID_KEY);
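	/*
	 * The loop above simply discards the reserved sentinel values
	 * below FIRST_VALID_KEY (SINGLETON_KEY as returned above, and
	 * NOTRIM_KEY as checked in ffs_blkfree()), so every caller
	 * receives a unique key that trim_lookup() can safely hash.
	 */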
	(void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW);
	return (key);
}

/*
 * Deallocate a key that has been used to identify a range of blocks.
 */
void
ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key)
{
	struct ffs_blkfree_trim_params *tp;

	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
		return;
	/*
	 * If the vfs.ffs.dotrimcons sysctl option is enabled while
	 * a file deletion is active, specifically after a call
	 * to ffs_blkrelease_start() but before the call to
	 * ffs_blkrelease_finish(), ffs_blkrelease_start() will
	 * have handed out SINGLETON_KEY rather than starting a
	 * collection sequence. Thus if we get a SINGLETON_KEY
	 * passed to ffs_blkrelease_finish(), we just return rather
	 * than trying to finish the nonexistent sequence.
	 */
	if (key == SINGLETON_KEY) {
#ifdef INVARIANTS
		printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n",
		    ump->um_mountp->mnt_stat.f_mntonname);
#endif
		return;
	}
	/*
	 * We are done with sending blocks using this key. Look up the key
	 * using the DONE alloctype (in tp) to request that it be unhashed
	 * as we will not be adding to it. If the key has never been used,
	 * tp->size will be zero, so we can just free tp. Otherwise the call
	 * to ffs_blkfree_sendtrim(tp) causes the block range described by
	 * tp to be issued (and then tp to be freed).
	 */
	tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE);
	if (tp->size == 0)
		free(tp, M_TRIM);
	else
		ffs_blkfree_sendtrim(tp);
}

/*
 * Setup to free a block or fragment.
 *
 * Check for snapshots that might want to claim the block.
 * If trims are requested, prepare a trim request. Attempt to
 * aggregate consecutive blocks into a single trim request.
 */
void
ffs_blkfree(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	__enum_uint8(vtype) vtype,
	struct workhead *dephd,
	uint64_t key)
{
	struct ffs_blkfree_trim_params *tp, *ntp;
	struct trim_blkreq *blkelm;

	/*
	 * Check to see if a snapshot wants to claim the block.
	 * Check that devvp is a normal disk device, not a snapshot,
	 * it has a snapshot(s) associated with it, and one of the
	 * snapshots wants to claim the block.
	 */
	if (devvp->v_type == VCHR &&
	    (devvp->v_vflag & VV_COPYONWRITE) &&
	    ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
		return;
	}
	/*
	 * Nothing to delay if TRIM is not required for this block or TRIM
	 * is disabled or the operation is performed on a snapshot.
	 */
	if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
	    devvp->v_type == VREG) {
		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
		return;
	}
	blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK);
	blkelm->bno = bno;
	blkelm->size = size;
	if (dephd == NULL) {
		blkelm->pdephd = NULL;
	} else {
		LIST_INIT(&blkelm->dephd);
		LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list);
		blkelm->pdephd = &blkelm->dephd;
	}
	if (key == SINGLETON_KEY) {
		/*
		 * Just a single non-contiguous piece. Use the SINGLE
		 * alloctype to return a trim request that will not be
		 * hashed for future lookup.
		 */
		tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE);
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		ffs_blkfree_sendtrim(tp);
		return;
	}
	/*
	 * The callers of this function are not tracking whether or not
	 * the blocks are contiguous. They are just saying that they
	 * are freeing a set of blocks. It is this code that determines
	 * the pieces of that range that are actually contiguous.
	 *
	 * Calling ffs_blkrelease_start() will have created an entry
	 * that we will use.
	 */
	tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD);
	if (tp->size == 0) {
		/*
		 * First block of a potential range, set block and size
		 * for the trim block.
		 */
		tp->bno = bno;
		tp->size = size;
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		return;
	}
	/*
	 * If this block is a continuation of the range (either
	 * follows at the end or precedes in the front) then we
	 * add it to the front or back of the list and return.
	 *
	 * If it is not a continuation of the trim that we were
	 * building, using the REPLACE alloctype, we request that
	 * the old trim request (still in tp) be unhashed and a
	 * new range started (in ntp). The ffs_blkfree_sendtrim(tp)
	 * call causes the block range described by tp to be issued
	 * (and then tp to be freed).
	 */
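	/*
	 * Concretely (illustrative, full-block frees with 8 frags per
	 * block): if tp describes block 100, freeing block 92 prepends
	 * (92 + 8 == 100) and freeing block 108 appends, while freeing
	 * block 200 flushes the pending range and starts a new one.
	 */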
	if (bno + numfrags(fs, size) == tp->bno) {
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		tp->bno = bno;
		tp->size += size;
		return;
	} else if (bno == tp->bno + numfrags(fs, tp->size)) {
		TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist);
		tp->size += size;
		return;
	}
	ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE);
	TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist);
	ffs_blkfree_sendtrim(tp);
}

#ifdef INVARIANTS
/*
 * Verify allocation of a block or fragment.
 * Return 1 if block or fragment is free.
 */
static int
ffs_checkfreeblk(struct inode *ip,
	ufs2_daddr_t bno,
	long size)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t cgbno;
	int i, frags, blkalloced;
	uint8_t *blksfree;

	fs = ITOFS(ip);
	if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %ld, size = %ld, fs = %s\n",
		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_checkfreeblk: bad size");
	}
	if ((uint64_t)bno >= fs->fs_size)
		panic("ffs_checkfreeblk: too big block %jd", (intmax_t)bno);
	if (ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp) != 0)
		return (0);
	blksfree = cg_blksfree(cgp);
	cgbno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		blkalloced = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
	} else {
		frags = numfrags(fs, size);
		for (blkalloced = 0, i = 0; i < frags; i++)
			if (isset(blksfree, cgbno + i))
				blkalloced++;
		if (blkalloced != 0 && blkalloced != frags)
			panic("ffs_checkfreeblk: partially free fragment");
	}
	brelse(bp);
	return (blkalloced == 0);
}
#endif /* INVARIANTS */

/*
 * Free an inode.
 */
int
ffs_vfree(struct vnode *pvp,
	ino_t ino,
	int mode)
{
	struct ufsmount *ump;

	if (DOINGSOFTDEP(pvp)) {
		softdep_freefile(pvp, ino, mode);
		return (0);
	}
	ump = VFSTOUFS(pvp->v_mount);
	return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL));
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 */
int
ffs_freefile(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ino_t ino,
	int mode,
	struct workhead *wkhd)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t dbn;
	int error;
	uint64_t cg;
	uint8_t *inosused;
	struct cdev *dev;
	ino_t cgino;

	cg = ino_to_cg(fs, ino);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		MPASS(devvp->v_mount->mnt_data == ump);
		dev = ump->um_devvp->v_rdev;
	} else if (devvp->v_type == VCHR) {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
	} else {
		bp = NULL;
		return (0);
	}
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
		    devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
		if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
			return (error);
		/*
		 * Would like to just downgrade to read-only. Until that
		 * capability is available, just toss the cylinder group
		 * update and mark the filesystem as needing to run fsck.
		 */
		fs->fs_flags |= FS_NEEDSFSCK;
		if (devvp->v_type == VREG)
			dbn = fragstoblks(fs, cgtod(fs, cg));
		else
			dbn = fsbtodb(fs, cgtod(fs, cg));
		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
		KASSERT(error == 0, ("getblkx failed"));
		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true);
		bp->b_flags |= B_RELBUF | B_NOCACHE;
		bp->b_flags &= ~B_CACHE;
		bawrite(bp);
		return (error);
	}
	inosused = cg_inosused(cgp);
	cgino = ino % fs->fs_ipg;
	if (isclr(inosused, cgino)) {
		printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
		    (uintmax_t)ino, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("ffs_freefile: freeing free inode");
	}
	clrbit(inosused, cgino);
	if (cgino < cgp->cg_irotor)
		cgp->cg_irotor = cgino;
	cgp->cg_cs.cs_nifree++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false);
	bdwrite(bp);
	return (0);
}

/*
 * Check to see if a file is free.
 * Used to check for allocated files in snapshots.
 * Return 1 if file is free.
 */
int
ffs_checkfreefile(struct fs *fs,
	struct vnode *devvp,
	ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	int ret, error;
	uint64_t cg;
	uint8_t *inosused;

	cg = ino_to_cg(fs, ino);
	if ((devvp->v_type != VREG) && (devvp->v_type != VCHR))
		return (1);
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		return (1);
	if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0)
		return (1);
	inosused = cg_inosused(cgp);
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp);
	return (ret);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block if none are
 * available.
 */
static ufs1_daddr_t
ffs_mapsearch(struct fs *fs,
	struct cg *cgp,
	ufs2_daddr_t bpref,
	int allocsiz)
{
	ufs1_daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	uint8_t *blksfree;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
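	/*
	 * In outline: scanc() consults fragtbl[] as a per-byte summary of
	 * which free-fragment run lengths occur within each byte of the
	 * map, so whole bytes that cannot contain a run of allocsiz frags
	 * are skipped without inspecting individual bits.
	 */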
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	blksfree = cg_blksfree(cgp);
	len = howmany(fs->fs_fpg, NBBY) - start;
	loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start],
	    fragtbl[fs->fs_frag],
	    (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0],
		    fragtbl[fs->fs_frag],
		    (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}

/*
 * Fetch and verify a cylinder group.
 */
int
ffs_getcg(struct fs *fs,
	struct vnode *devvp,
	uint64_t cg,
	int flags,
	struct buf **bpp,
	struct cg **cgpp)
{
	struct buf *bp;
	struct cg *cgp;
	struct mount *mp;
	const struct statfs *sfs;
	daddr_t blkno;
	int error;

	*bpp = NULL;
	*cgpp = NULL;
	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
		flags |= GB_CKHASH;
	if (devvp->v_type == VCHR) {
		blkno = fsbtodb(fs, cgtod(fs, cg));
		mp = devvp->v_rdev->si_mountpt;
	} else {
		blkno = fragstoblks(fs, cgtod(fs, cg));
		mp = devvp->v_mount;
	}
	error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL,
	    NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp);
	if (error != 0)
		return (error);
	cgp = (struct cg *)bp->b_data;
	if ((fs->fs_metackhash & CK_CYLGRP) != 0 &&
	    (bp->b_flags & B_CKHASH) != 0 &&
	    cgp->cg_ckhash != bp->b_ckhash) {
		if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg,
		    &VFSTOUFS(mp)->um_secs_integritymsg, 1)) {
			sfs = &mp->mnt_stat;
			printf("UFS %s%s (%s) cylinder checkhash failed: "
			    "cg %ju, cgp: 0x%x != bp: 0x%jx\n",
			    devvp->v_type == VCHR ? "" : "snapshot of ",
			    sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg,
			    cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
		}
		bp->b_flags &= ~B_CKHASH;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
		return (EINTEGRITY);
	}
	if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
		if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg,
		    &VFSTOUFS(mp)->um_secs_integritymsg, 1)) {
			sfs = &mp->mnt_stat;
			printf("UFS %s%s (%s)",
			    devvp->v_type == VCHR ? "" : "snapshot of ",
			    sfs->f_mntfromname, sfs->f_mntonname);
			if (!cg_chkmagic(cgp))
				printf(" cg %ju: bad magic number 0x%x should "
				    "be 0x%x\n", (intmax_t)cg, cgp->cg_magic,
				    CG_MAGIC);
			else
				printf(": wrong cylinder group cg %ju != "
				    "cgx %u\n", (intmax_t)cg, cgp->cg_cgx);
		}
		bp->b_flags &= ~B_CKHASH;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
		return (EINTEGRITY);
	}
	bp->b_flags &= ~B_CKHASH;
	bp->b_xflags |= BX_BKGRDWRITE;
	/*
	 * If we are using check hashes on the cylinder group then we want
	 * to limit changing the cylinder group time to when we are actually
	 * going to write it to disk so that its check hash remains correct
	 * in memory. If the CK_CYLGRP flag is set the time is updated in
	 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we
	 * update the time here as we have done historically.
	 */
	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
		bp->b_xflags |= BX_CYLGRP;
	else
		cgp->cg_old_time = cgp->cg_time = time_second;
	*bpp = bp;
	*cgpp = cgp;
	return (0);
}

static void
ffs_ckhash_cg(struct buf *bp)
{
	uint32_t ckhash;
	struct cg *cgp;

	cgp = (struct cg *)bp->b_data;
	ckhash = cgp->cg_ckhash;
	cgp->cg_ckhash = 0;
	bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
	cgp->cg_ckhash = ckhash;
}

/*
 * Called when a cylinder group read has failed. If an integrity check
 * is the cause of failure then the cylinder group will not be usable
 * until the filesystem has been unmounted and fsck has been run to
 * repair it. To avoid future attempts to allocate resources from the
 * cylinder group, its available resources are set to zero in the
 * superblock summary information. Since it will appear to have no
 * resources available, no further calls will be made to allocate
 * resources from it. When resources are freed to the cylinder group
 * the resource free routines will find the cylinder group unusable so
 * the resource will simply be discarded and thus will not show up in
 * the superblock summary information until they are recovered by fsck.
 */
static void
ffs_checkcgintegrity(struct fs *fs,
	uint64_t cg,
	int error)
{

	if (error != EINTEGRITY)
		return;
	fs->fs_cstotal.cs_nffree -= fs->fs_cs(fs, cg).cs_nffree;
	fs->fs_cs(fs, cg).cs_nffree = 0;
	fs->fs_cstotal.cs_nbfree -= fs->fs_cs(fs, cg).cs_nbfree;
	fs->fs_cs(fs, cg).cs_nbfree = 0;
	fs->fs_cstotal.cs_nifree -= fs->fs_cs(fs, cg).cs_nifree;
	fs->fs_cs(fs, cg).cs_nifree = 0;
	fs->fs_maxcluster[cg] = 0;
	fs->fs_flags |= FS_NEEDSFSCK;
	fs->fs_fmod = 1;
}

/*
 * Fserr prints the name of a filesystem with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 */
void
ffs_fserr(struct fs *fs,
	ino_t inum,
	char *cp)
{
	struct thread *td = curthread;	/* XXX */
	struct proc *p = td->td_proc;

	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
	    fs->fs_fsmnt, cp);
}

/*
 * This function provides the capability for the fsck program to
 * update an active filesystem. Sixteen operations are provided:
 *
 * adjrefcnt(inode, amt) - adjusts the reference count on the
 *	specified inode by the specified amount. Under normal
 *	operation the count should always go down. Decrementing
 *	the count to zero will cause the inode to be freed.
 * adjblkcnt(inode, amt) - adjust the number of blocks used by the
 *	inode by the specified amount.
 * adjdepth(inode, amt) - adjust the depth of the specified directory
 *	inode by the specified amount.
 * setsize(inode, size) - set the size of the inode to the
 *	specified size.
 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) -
 *	adjust the superblock summary.
 * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
 *	are marked as free. Inodes should never have to be marked
 *	as in use.
 * freefiles(inode, count) - file inodes [inode..inode + count - 1]
 *	are marked as free. Inodes should never have to be marked
 *	as in use.
 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
 *	are marked as free. Blocks should never have to be marked
 *	as in use.
 * setflags(flags, set/clear) - the fs_flags field has the specified
 *	flags set (second parameter +1) or cleared (second parameter -1).
 * setcwd(dirinode) - set the current directory to dirinode in the
 *	filesystem associated with the snapshot.
 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
 *	in the current directory is oldvalue then change it to newvalue.
 * unlink(nameptr, oldvalue) - Verify that the inode number associated
 *	with nameptr in the current directory is oldvalue then unlink it.
 */

static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt,
    CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_ffs_fsck, "S,fsck",
    "Adjust Inode Reference Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust Inode Used Blocks Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust Directory Inode Depth");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Set the inode size");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of directories");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free frags");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free clusters");

static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of Directory Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of File Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of Blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Filesystem Flags");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Set Current Working Directory");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Value of .. Entry");

static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Unlink a Duplicate Name");

#ifdef DIAGNOSTIC
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0,
    "print out fsck_ffs-based filesystem update commands");
#endif /* DIAGNOSTIC */

static int
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct fsck_cmd cmd;
	struct ufsmount *ump;
	struct vnode *vp, *dvp, *fdvp;
	struct inode *ip, *dp;
	struct mount *mp;
	struct fs *fs;
	struct pwd *pwd;
	ufs2_daddr_t blkno;
	long blkcnt, blksize;
	uint64_t key;
	struct file *fp;
	cap_rights_t rights;
	int filetype, error;

	if (req->newptr == NULL || req->newlen > sizeof(cmd))
		return (EBADRPC);
	if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0)
		return (error);
	if (cmd.version != FFS_CMD_VERSION)
		return (ERPCMISMATCH);
	if ((error = getvnode(td, cmd.handle,
	    cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0)
		return (error);
	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VDIR) {
		fdrop(fp, td);
		return (EINVAL);
	}
	vn_start_write(vp, &mp, V_WAIT);
	if (mp == NULL ||
	    strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EINVAL);
	}
	ump = VFSTOUFS(mp);
	if (mp->mnt_flag & MNT_RDONLY) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EROFS);
	}
	fs = ump->um_fs;
	filetype = IFREG;

	switch (oidp->oid_number) {
	case FFS_SET_FLAGS:
#ifdef DIAGNOSTIC
		if (fsckcmds)
			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
			    cmd.size > 0 ? "set" : "clear");
#endif /* DIAGNOSTIC */
		if (cmd.size > 0)
			fs->fs_flags |= (long)cmd.value;
		else
			fs->fs_flags &= ~(long)cmd.value;
		break;

	case FFS_ADJ_REFCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd link count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		ip->i_nlink += cmd.size;
		DIP_SET_NLINK(ip, ip->i_nlink);
		ip->i_effnlink += cmd.size;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		if (DOINGSOFTDEP(vp))
			softdep_change_linkcnt(ip);
		vput(vp);
		break;

	case FFS_ADJ_BLKCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd block count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_ADJ_DEPTH:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust directory inode %jd depth by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		if (vp->v_type != VDIR) {
			vput(vp);
			error = ENOTDIR;
			break;
		}
		ip = VTOI(vp);
		DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_SET_SIZE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set inode %jd size to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_size, cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_DIR_FREE:
		filetype = IFDIR;
		/* fall through */

	case FFS_FILE_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free %s inode %ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value);
			else
				printf("%s: free %s inodes %ju-%ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value,
				    (uintmax_t)(cmd.value + cmd.size - 1));
		}
#endif /* DIAGNOSTIC */
		while (cmd.size > 0) {
			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
			    cmd.value, filetype, NULL)))
				break;
			cmd.size -= 1;
			cmd.value += 1;
		}
		break;

	case FFS_BLK_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free block %jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value);
			else
				printf("%s: free blocks %jd-%jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value,
				    (intmax_t)cmd.value + cmd.size - 1);
		}
#endif /* DIAGNOSTIC */
		blkno = cmd.value;
		blkcnt = cmd.size;
		blksize = fs->fs_frag - (blkno % fs->fs_frag);
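		/*
		 * Illustrative: with 8 frags per block and blkno = 21, the
		 * first pass frees 3 frags to reach the block boundary at
		 * 24; subsequent passes free fs_frag frags at a time.
		 */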
3462
key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
3463
while (blkcnt > 0) {
3464
if (blkcnt < blksize)
3465
blksize = blkcnt;
3466
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
3467
blksize * fs->fs_fsize, UFS_ROOTINO,
3468
VDIR, NULL, key);
3469
blkno += blksize;
3470
blkcnt -= blksize;
3471
blksize = fs->fs_frag;
3472
}
3473
ffs_blkrelease_finish(ump, key);
3474
break;
3475
3476
	/*
	 * Adjust superblock summaries. fsck(8) is expected to
	 * submit deltas when necessary.
	 */
	case FFS_ADJ_NDIR:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of directories by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_ndir += cmd.value;
		break;

	case FFS_ADJ_NBFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free blocks by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nbfree += cmd.value;
		break;

	case FFS_ADJ_NIFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free inodes by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nifree += cmd.value;
		break;

	case FFS_ADJ_NFFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free frags by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nffree += cmd.value;
		break;

	case FFS_ADJ_NUMCLUSTERS:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free clusters by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_numclusters += cmd.value;
		break;

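	/*
	 * Make the calling thread's current directory the named
	 * inode.  change_dir() performs the usual permission checks
	 * before the switch is committed with pwd_chdir().
	 */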
	case FFS_SET_CWD:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set current directory to inode %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
			break;
		AUDIT_ARG_VNODE1(vp);
		if ((error = change_dir(vp, td)) != 0) {
			vput(vp);
			break;
		}
		VOP_UNLOCK(vp);
		pwd_chdir(td, vp);
		break;

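	/*
	 * Rewrite the ".." entry in the current working directory so
	 * that it points at inode cmd.size rather than cmd.value.
	 * Both directories are locked for the rewrite and their name
	 * cache entries are purged afterwards.
	 */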
	case FFS_SET_DOTDOT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: change .. in cwd from %jd to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * First we have to get and lock the parent directory
		 * to which ".." points.
		 */
		error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
		if (error)
			break;
		/*
		 * Now we get and lock the child directory containing "..".
		 */
		pwd = pwd_hold(td);
		dvp = pwd->pwd_cdir;
		if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) {
			vput(fdvp);
			pwd_drop(pwd);
			break;
		}
		dp = VTOI(dvp);
		SET_I_OFFSET(dp, 12);	/* XXX mastertemplate.dot_reclen */
		error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
		    DT_DIR, 0);
		cache_purge(fdvp);
		cache_purge(dvp);
		vput(dvp);
		vput(fdvp);
		pwd_drop(pwd);
		break;

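	/*
	 * Remove the directory entry named by the user-space path in
	 * cmd.value, which is expected to reference inode cmd.size.
	 */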
	case FFS_UNLINK:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			char buf[32];

			if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
				strncpy(buf, "Name_too_long", 32);
			printf("%s: unlink %s (inode %jd)\n",
			    mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * kern_funlinkat will do its own start/finish writes and
		 * they do not nest, so drop ours here. Setting mp == NULL
		 * indicates that vn_finished_write is not needed down below.
		 */
		vn_finished_write(mp);
		mp = NULL;
		error = kern_funlinkat(td, AT_FDCWD,
		    (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE,
		    0, (ino_t)cmd.size);
		break;

	default:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("Invalid request %d from fsck\n",
			    oidp->oid_number);
		}
#endif /* DIAGNOSTIC */
		error = EINVAL;
		break;
	}
	fdrop(fp, td);
	vn_finished_write(mp);
	return (error);
}
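
/*
 * Usage sketch (illustrative only, not compiled here): a userland
 * checker such as fsck_ffs(8) reaches the cases above through the
 * writable vfs.ffs.* sysctl tree.  Assuming the struct fsck_cmd
 * layout and FFS_CMD_VERSION from <ufs/ffs/fs.h>, with "fd" a
 * descriptor open on a file inside the target filesystem, a link
 * count repair looks roughly like:
 *
 *	struct fsck_cmd cmd;
 *
 *	memset(&cmd, 0, sizeof(cmd));
 *	cmd.version = FFS_CMD_VERSION;
 *	cmd.handle = fd;	// identifies the mounted filesystem
 *	cmd.value = ino;	// inode whose link count is wrong
 *	cmd.size = 1;		// signed delta to apply
 *	if (sysctlbyname("vfs.ffs.adjrefcnt", NULL, NULL,
 *	    &cmd, sizeof(cmd)) == -1)
 *		warn("adjrefcnt");
 */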