CoCalc -- ffs_softdep.c

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/ufs/ffs/ffs_softdep.c
39478 views
1
/*-
2
 * SPDX-License-Identifier: BSD-2-Clause
3
 *
4
 * Copyright 1998, 2000 Marshall Kirk McKusick.
5
 * Copyright 2009, 2010 Jeffrey W. Roberson <[email protected]>
6
 * All rights reserved.
7
 *
8
 * The soft updates code is derived from the appendix of a University
9
 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
10
 * "Soft Updates: A Solution to the Metadata Update Problem in File
11
 * Systems", CSE-TR-254-95, August 1995).
12
 *
13
 * Further information about soft updates can be obtained from:
14
 *
15
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
16
 *	1614 Oxford Street		[email protected]
17
 *	Berkeley, CA 94709-1608		+1-510-843-9542
18
 *	USA
19
 *
20
 * Redistribution and use in source and binary forms, with or without
21
 * modification, are permitted provided that the following conditions
22
 * are met:
23
 *
24
 * 1. Redistributions of source code must retain the above copyright
25
 *    notice, this list of conditions and the following disclaimer.
26
 * 2. Redistributions in binary form must reproduce the above copyright
27
 *    notice, this list of conditions and the following disclaimer in the
28
 *    documentation and/or other materials provided with the distribution.
29
 *
30
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
31
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
33
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
36
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
37
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
38
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
39
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40
 */
41

42
#include <sys/cdefs.h>
43
#include "opt_ffs.h"
44
#include "opt_quota.h"
45
#include "opt_ddb.h"
46

47
#include <sys/param.h>
48
#include <sys/kernel.h>
49
#include <sys/systm.h>
50
#include <sys/bio.h>
51
#include <sys/buf.h>
52
#include <sys/kdb.h>
53
#include <sys/kthread.h>
54
#include <sys/ktr.h>
55
#include <sys/limits.h>
56
#include <sys/lock.h>
57
#include <sys/malloc.h>
58
#include <sys/mount.h>
59
#include <sys/mutex.h>
60
#include <sys/namei.h>
61
#include <sys/priv.h>
62
#include <sys/proc.h>
63
#include <sys/racct.h>
64
#include <sys/rwlock.h>
65
#include <sys/stat.h>
66
#include <sys/sysctl.h>
67
#include <sys/syslog.h>
68
#include <sys/vnode.h>
69
#include <sys/conf.h>
70

71
#include <ufs/ufs/dir.h>
72
#include <ufs/ufs/extattr.h>
73
#include <ufs/ufs/quota.h>
74
#include <ufs/ufs/inode.h>
75
#include <ufs/ufs/ufsmount.h>
76
#include <ufs/ffs/fs.h>
77
#include <ufs/ffs/softdep.h>
78
#include <ufs/ffs/ffs_extern.h>
79
#include <ufs/ufs/ufs_extern.h>
80

81
#include <vm/vm.h>
82
#include <vm/vm_extern.h>
83
#include <vm/vm_object.h>
84

85
#include <geom/geom.h>
86
#include <geom/geom_vfs.h>
87

88
#include <ddb/ddb.h>
89

90
/*
 * KTR class mask, 0 by default.  Define to KTR_SPARE instead (presumably
 * to enable the trace points that use it -- confirm against ktr(9)).
 */
#define	KTR_SUJ	0
91

92
#ifndef SOFTUPDATES
93

94
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_flushfiles(struct mount *oldmnt, int flags, struct thread *td)
{
	panic("softdep_flushfiles called");
}
102

103
/*
 * Stub for kernels built without SOFTUPDATES: ignores its arguments and
 * reports success.
 */
int
softdep_mount(struct vnode *devvp, struct mount *mp, struct fs *fs,
    struct ucred *cred)
{
	return (0);
}
112

113
/*
 * Stub for kernels built without SOFTUPDATES: does nothing.
 */
void
softdep_initialize(void)
{
}
119

120
/*
 * Stub for kernels built without SOFTUPDATES: does nothing.
 */
void
softdep_uninitialize(void)
{
}
126

127
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_unmount(struct mount *mp)
{
	panic("softdep_unmount called");
}
133

134
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_setup_sbupdate(struct ufsmount *ump, struct fs *fs, struct buf *bp)
{
	panic("softdep_setup_sbupdate called");
}
142

143
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_setup_inomapdep(struct buf *bp, struct inode *ip, ino_t newinum,
    int mode)
{
	panic("softdep_setup_inomapdep called");
}
152

153
void
154
softdep_setup_blkmapdep(struct buf *bp,
155
	struct mount *mp,
156
	ufs2_daddr_t newblkno,
157
	int frags,
158
	int oldfrags)
159
{
160

161
	panic("softdep_setup_blkmapdep called");
162
}
163

164
void
165
softdep_setup_allocdirect(struct inode *ip,
166
	ufs_lbn_t lbn,
167
	ufs2_daddr_t newblkno,
168
	ufs2_daddr_t oldblkno,
169
	long newsize,
170
	long oldsize,
171
	struct buf *bp)
172
{
173

174
	panic("softdep_setup_allocdirect called");
175
}
176

177
void
178
softdep_setup_allocext(struct inode *ip,
179
	ufs_lbn_t lbn,
180
	ufs2_daddr_t newblkno,
181
	ufs2_daddr_t oldblkno,
182
	long newsize,
183
	long oldsize,
184
	struct buf *bp)
185
{
186

187
	panic("softdep_setup_allocext called");
188
}
189

190
void
191
softdep_setup_allocindir_page(struct inode *ip,
192
	ufs_lbn_t lbn,
193
	struct buf *bp,
194
	int ptrno,
195
	ufs2_daddr_t newblkno,
196
	ufs2_daddr_t oldblkno,
197
	struct buf *nbp)
198
{
199

200
	panic("softdep_setup_allocindir_page called");
201
}
202

203
void
204
softdep_setup_allocindir_meta(struct buf *nbp,
205
	struct inode *ip,
206
	struct buf *bp,
207
	int ptrno,
208
	ufs2_daddr_t newblkno)
209
{
210

211
	panic("softdep_setup_allocindir_meta called");
212
}
213

214
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_journal_freeblocks(struct inode *ip, struct ucred *cred,
    off_t length, int flags)
{
	panic("softdep_journal_freeblocks called");
}
223

224
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_journal_fsync(struct inode *ip)
{
	panic("softdep_journal_fsync called");
}
230

231
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_setup_freeblocks(struct inode *ip, off_t length, int flags)
{
	panic("softdep_setup_freeblocks called");
}
239

240
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_freefile(struct vnode *pvp, ino_t ino, int mode)
{
	panic("softdep_freefile called");
}
248

249
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_setup_directory_add(struct buf *bp, struct inode *dp,
    off_t diroffset, ino_t newinum, struct buf *newdirbp, int isnewblk)
{
	panic("softdep_setup_directory_add called");
}
260

261
void
262
softdep_change_directoryentry_offset(struct buf *bp,
263
	struct inode *dp,
264
	caddr_t base,
265
	caddr_t oldloc,
266
	caddr_t newloc,
267
	int entrysize)
268
{
269

270
	panic("softdep_change_directoryentry_offset called");
271
}
272

273
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_setup_remove(struct buf *bp, struct inode *dp, struct inode *ip,
    bool isrmdir)
{
	panic("softdep_setup_remove called");
}
282

283
void
284
softdep_setup_directory_change(struct buf *bp,
285
	struct inode *dp,
286
	struct inode *ip,
287
	ino_t newinum,
288
	u_int newparent)
289
{
290

291
	panic("softdep_setup_directory_change called");
292
}
293

294
void
295
softdep_setup_blkfree(struct mount *mp,
296
	struct buf *bp,
297
	ufs2_daddr_t blkno,
298
	int frags,
299
	struct workhead *wkhd,
300
	bool doingrecovery)
301
{
302

303
	panic("%s called", __FUNCTION__);
304
}
305

306
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_inofree(struct mount *mp, struct buf *bp, ino_t ino,
    struct workhead *wkhd, bool doingrecovery)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
316

317
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_unlink(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
323

324
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_link(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
330

331
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_revert_link(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
337

338
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_rmdir(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
344

345
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_revert_rmdir(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
351

352
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_create(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
358

359
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_revert_create(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
365

366
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_mkdir(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
372

373
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_revert_mkdir(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
379

380
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
void
softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
386

387
/*
 * Stub for kernels built without SOFTUPDATES: panics, reporting its own
 * name, if it is ever called.
 */
int
softdep_prealloc(struct vnode *vp, int waitok)
{
	/* __func__ is the standard C spelling of GNU __FUNCTION__. */
	panic("%s called", __func__);
}
393

394
/*
 * Stub for kernels built without SOFTUPDATES: ignores its arguments and
 * always fails with ENOENT.
 */
int
softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
{
	return (ENOENT);
}
400

401
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_change_linkcnt(struct inode *ip)
{
	panic("softdep_change_linkcnt called");
}
407

408
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_load_inodeblock(struct inode *ip)
{
	panic("softdep_load_inodeblock called");
}
414

415
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_update_inodeblock(struct inode *ip, struct buf *bp, int waitfor)
{
	panic("softdep_update_inodeblock called");
}
423

424
/*
 * Stub for kernels built without SOFTUPDATES: ignores vp (the "in_core"
 * copy of the inode) and reports success.
 */
int
softdep_fsync(struct vnode *vp)
{
	return (0);
}
430

431
/*
 * Stub for kernels built without SOFTUPDATES: does nothing.
 */
void
softdep_fsync_mountdev(struct vnode *vp)
{
}
437

438
/*
 * Stub for kernels built without SOFTUPDATES: stores 0 through countp and
 * reports success.  The other arguments are ignored.
 */
int
softdep_flushworklist(struct mount *oldmnt, int *countp, struct thread *td)
{
	*countp = 0;
	return (0);
}
447

448
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_sync_metadata(struct vnode *vp)
{
	panic("softdep_sync_metadata called");
}
454

455
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
{
	panic("softdep_sync_buf called");
}
461

462
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_slowdown(struct vnode *vp)
{
	panic("softdep_slowdown called");
}
468

469
/*
 * Stub for kernels built without SOFTUPDATES: ignores its arguments and
 * returns 0.
 */
int
softdep_request_cleanup(struct fs *fs, struct vnode *vp, struct ucred *cred,
    int resource)
{
	return (0);
}
478

479
int
480
softdep_check_suspend(struct mount *mp,
481
		      struct vnode *devvp,
482
		      int softdep_depcnt,
483
		      int softdep_accdepcnt,
484
		      int secondary_writes,
485
		      int secondary_accwrites)
486
{
487
	struct bufobj *bo;
488
	int error;
489

490
	(void) softdep_depcnt,
491
	(void) softdep_accdepcnt;
492

493
	bo = &devvp->v_bufobj;
494
	ASSERT_BO_WLOCKED(bo);
495

496
	MNT_ILOCK(mp);
497
	while (mp->mnt_secondary_writes != 0) {
498
		BO_UNLOCK(bo);
499
		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
500
		    PRI_MAX_KERN | PDROP, "secwr", 0);
501
		BO_LOCK(bo);
502
		MNT_ILOCK(mp);
503
	}
504

505
	/*
506
	 * Reasons for needing more work before suspend:
507
	 * - Dirty buffers on devvp.
508
	 * - Secondary writes occurred after start of vnode sync loop
509
	 */
510
	error = 0;
511
	if (bo->bo_numoutput > 0 ||
512
	    bo->bo_dirty.bv_cnt > 0 ||
513
	    secondary_writes != 0 ||
514
	    mp->mnt_secondary_writes != 0 ||
515
	    secondary_accwrites != mp->mnt_secondary_accwrites)
516
		error = EAGAIN;
517
	BO_UNLOCK(bo);
518
	return (error);
519
}
520

521
/*
 * Stub for kernels built without SOFTUPDATES: reports zero for both
 * dependency counts.  mp is ignored.
 */
void
softdep_get_depcounts(struct mount *mp, int *softdepactivep,
    int *softdepactiveaccp)
{
	(void)mp;
	*softdepactiveaccp = 0;
	*softdepactivep = 0;
}
530

531
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_buf_append(struct buf *bp, struct workhead *wkhd)
{
	/*
	 * The message used to name "softdep_buf_appendwork", which is not
	 * this function; __func__ keeps the name in sync with the code.
	 */
	panic("%s called", __func__);
}
537

538
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_inode_append(struct inode *ip, struct ucred *cred,
    struct workhead *wkhd)
{
	/*
	 * The message used to name "softdep_inode_appendwork", which is not
	 * this function; __func__ keeps the name in sync with the code.
	 */
	panic("%s called", __func__);
}
546

547
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
void
softdep_freework(struct workhead *wkhd)
{
	panic("softdep_freework called");
}
553

554
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_prerename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
    struct vnode *tvp)
{
	panic("softdep_prerename called");
}
563

564
/*
 * Stub for kernels built without SOFTUPDATES: panics if it is ever called.
 */
int
softdep_prelink(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	panic("softdep_prelink called");
}
572

573
#else
574

575
FEATURE(softupdates, "FFS soft-updates support");
576

577
static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
578
    "soft updates stats");
579
static SYSCTL_NODE(_debug_softdep, OID_AUTO, total,
580
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
581
    "total dependencies allocated");
582
static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse,
583
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
584
    "high use dependencies allocated");
585
static SYSCTL_NODE(_debug_softdep, OID_AUTO, current,
586
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
587
    "current dependencies allocated");
588
static SYSCTL_NODE(_debug_softdep, OID_AUTO, write,
589
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
590
    "current dependencies written");
591

592
unsigned long dep_current[D_LAST + 1];
593
unsigned long dep_highuse[D_LAST + 1];
594
unsigned long dep_total[D_LAST + 1];
595
unsigned long dep_write[D_LAST + 1];
596

597
#define	SOFTDEP_TYPE(type, str, long)					\
598
    static MALLOC_DEFINE(M_ ## type, #str, long);			\
599
    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
600
	&dep_total[D_ ## type], 0, "");					\
601
    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
602
	&dep_current[D_ ## type], 0, "");				\
603
    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
604
	&dep_highuse[D_ ## type], 0, "");				\
605
    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
606
	&dep_write[D_ ## type], 0, "");
607

608
SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 
609
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
610
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
611
    "Block or frag allocated from cyl group map");
612
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
613
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
614
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
615
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
616
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
617
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
618
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
619
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
620
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
621
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
622
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
623
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
624
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
625
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
626
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
627
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
628
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
629
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
630
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
631
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
632
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
633
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
634
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
635
SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
636

637
/* Malloc types that are declared without SOFTDEP_TYPE(). */
static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");

/* Flag set defined for soft-updates allocations. */
#define	M_SOFTDEP_FLAGS	(M_WAITOK)
644

645
/* 
646
 * translate from workitem type to memory type
647
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
648
 */
649
static struct malloc_type *memtype[] = {
650
	NULL,
651
	M_PAGEDEP,
652
	M_INODEDEP,
653
	M_BMSAFEMAP,
654
	M_NEWBLK,
655
	M_ALLOCDIRECT,
656
	M_INDIRDEP,
657
	M_ALLOCINDIR,
658
	M_FREEFRAG,
659
	M_FREEBLKS,
660
	M_FREEFILE,
661
	M_DIRADD,
662
	M_MKDIR,
663
	M_DIRREM,
664
	M_NEWDIRBLK,
665
	M_FREEWORK,
666
	M_FREEDEP,
667
	M_JADDREF,
668
	M_JREMREF,
669
	M_JMVREF,
670
	M_JNEWBLK,
671
	M_JFREEBLK,
672
	M_JFREEFRAG,
673
	M_JSEG,
674
	M_JSEGDEP,
675
	M_SBDEP,
676
	M_JTRUNC,
677
	M_JFSYNC,
678
	M_SENTINEL
679
};
680

681
/* Look up the malloc type recorded for a workitem type. */
#define	DtoM(type)	(memtype[type])

/*
 * Short description of the malloc type for a workitem type, or "???" when
 * the type lies outside the range D_FIRST..D_LAST.
 */
#define	TYPENAME(type)							\
	((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ?	\
	memtype[type]->ks_shortdesc : "???")
/*
 * End system adaptation definitions.
 */

/* Offsets of the dotdot_ino and dot_ino members of struct dirtemplate. */
#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
695

696
/*
697
 * Internal function prototypes.
698
 */
699
static	void check_clear_deps(struct mount *);
700
static	void softdep_error(char *, int);
701
static	int softdep_prerename_vnode(struct ufsmount *, struct vnode *);
702
static	int softdep_process_worklist(struct mount *, int);
703
static	int softdep_waitidle(struct mount *, int);
704
static	void drain_output(struct vnode *);
705
static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
706
static	int check_inodedep_free(struct inodedep *);
707
static	void clear_remove(struct mount *);
708
static	void clear_inodedeps(struct mount *);
709
static	void unlinked_inodedep(struct mount *, struct inodedep *);
710
static	void clear_unlinked_inodedep(struct inodedep *);
711
static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
712
static	int flush_pagedep_deps(struct vnode *, struct mount *,
713
	    struct diraddhd *, struct buf *);
714
static	int free_pagedep(struct pagedep *);
715
static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
716
static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
717
static	int flush_deplist(struct allocdirectlst *, int, int *);
718
static	int sync_cgs(struct mount *, int);
719
static	int handle_written_filepage(struct pagedep *, struct buf *, int);
720
static	int handle_written_sbdep(struct sbdep *, struct buf *);
721
static	void initiate_write_sbdep(struct sbdep *);
722
static	void diradd_inode_written(struct diradd *, struct inodedep *);
723
static	int handle_written_indirdep(struct indirdep *, struct buf *,
724
	    struct buf**, int);
725
static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
726
static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
727
	    uint8_t *);
728
static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
729
static	void handle_written_jaddref(struct jaddref *);
730
static	void handle_written_jremref(struct jremref *);
731
static	void handle_written_jseg(struct jseg *, struct buf *);
732
static	void handle_written_jnewblk(struct jnewblk *);
733
static	void handle_written_jblkdep(struct jblkdep *);
734
static	void handle_written_jfreefrag(struct jfreefrag *);
735
static	void complete_jseg(struct jseg *);
736
static	void complete_jsegs(struct jseg *);
737
static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
738
static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
739
static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
740
static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
741
static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
742
static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
743
static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
744
static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
745
static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
746
static	inline void inoref_write(struct inoref *, struct jseg *,
747
	    struct jrefrec *);
748
static	void handle_allocdirect_partdone(struct allocdirect *,
749
	    struct workhead *);
750
static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
751
	    struct workhead *);
752
static	void indirdep_complete(struct indirdep *);
753
static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
754
static	void indirblk_insert(struct freework *);
755
static	void indirblk_remove(struct freework *);
756
static	void handle_allocindir_partdone(struct allocindir *);
757
static	void initiate_write_filepage(struct pagedep *, struct buf *);
758
static	void initiate_write_indirdep(struct indirdep*, struct buf *);
759
static	void handle_written_mkdir(struct mkdir *, int);
760
static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
761
	    uint8_t *);
762
static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
763
static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
764
static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
765
static	void handle_workitem_freefile(struct freefile *);
766
static	int handle_workitem_remove(struct dirrem *, int);
767
static	struct dirrem *newdirrem(struct buf *, struct inode *,
768
	    struct inode *, bool, struct dirrem **);
769
static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
770
	    struct buf *);
771
static	void cancel_indirdep(struct indirdep *, struct buf *,
772
	    struct freeblks *);
773
static	void free_indirdep(struct indirdep *);
774
static	void free_diradd(struct diradd *, struct workhead *);
775
static	void merge_diradd(struct inodedep *, struct diradd *);
776
static	void complete_diradd(struct diradd *);
777
static	struct diradd *diradd_lookup(struct pagedep *, int);
778
static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
779
	    struct jremref *);
780
static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
781
	    struct jremref *);
782
static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
783
	    struct jremref *, struct jremref *);
784
static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
785
	    struct jremref *);
786
static	void cancel_allocindir(struct allocindir *, struct buf *bp,
787
	    struct freeblks *, int);
788
static	int setup_trunc_indir(struct freeblks *, struct inode *,
789
	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
790
static	void complete_trunc_indir(struct freework *);
791
static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
792
	    int);
793
static	void complete_mkdir(struct mkdir *);
794
static	void free_newdirblk(struct newdirblk *);
795
static	void free_jremref(struct jremref *);
796
static	void free_jaddref(struct jaddref *);
797
static	void free_jsegdep(struct jsegdep *);
798
static	void free_jsegs(struct jblocks *);
799
static	void rele_jseg(struct jseg *);
800
static	void free_jseg(struct jseg *, struct jblocks *);
801
static	void free_jnewblk(struct jnewblk *);
802
static	void free_jblkdep(struct jblkdep *);
803
static	void free_jfreefrag(struct jfreefrag *);
804
static	void free_freedep(struct freedep *);
805
static	void journal_jremref(struct dirrem *, struct jremref *,
806
	    struct inodedep *);
807
static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
808
static	int cancel_jaddref(struct jaddref *, struct inodedep *,
809
	    struct workhead *);
810
static	void cancel_jfreefrag(struct jfreefrag *);
811
static	inline void setup_freedirect(struct freeblks *, struct inode *,
812
	    int, int);
813
static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
814
static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
815
	    ufs_lbn_t, int);
816
static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
817
static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
818
static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
819
static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
820
static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
821
static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
822
	    int, int);
823
static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
824
static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
825
static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
826
static	void newblk_freefrag(struct newblk*);
827
static	void free_newblk(struct newblk *);
828
static	void cancel_allocdirect(struct allocdirectlst *,
829
	    struct allocdirect *, struct freeblks *);
830
static	int check_inode_unwritten(struct inodedep *);
831
static	int free_inodedep(struct inodedep *);
832
static	void freework_freeblock(struct freework *, uint64_t);
833
static	void freework_enqueue(struct freework *);
834
static	int handle_workitem_freeblocks(struct freeblks *, int);
835
static	int handle_complete_freeblocks(struct freeblks *, int);
836
static	void handle_workitem_indirblk(struct freework *);
837
static	void handle_written_freework(struct freework *);
838
static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
839
static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
840
	    struct workhead *);
841
static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
842
	    struct inodedep *, struct allocindir *, ufs_lbn_t);
843
static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
844
	    ufs2_daddr_t, ufs_lbn_t);
845
static	void handle_workitem_freefrag(struct freefrag *);
846
static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
847
	    ufs_lbn_t, uint64_t);
848
static	void allocdirect_merge(struct allocdirectlst *,
849
	    struct allocdirect *, struct allocdirect *);
850
static	struct freefrag *allocindir_merge(struct allocindir *,
851
	    struct allocindir *);
852
static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
853
	    struct bmsafemap **);
854
static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
855
	    int cg, struct bmsafemap *);
856
static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
857
	    struct newblk **);
858
static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
859
static	int inodedep_find(struct inodedep_hashhead *, ino_t,
860
	    struct inodedep **);
861
static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
862
static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
863
	    int, struct pagedep **);
864
static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
865
	    struct pagedep **);
866
static	void pause_timer(void *);
867
static	int request_cleanup(struct mount *, int);
868
static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
869
static	void schedule_cleanup(struct mount *);
870
static void softdep_ast_cleanup_proc(struct thread *, int);
871
static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
872
static	int process_worklist_item(struct mount *, int, int);
873
static	void process_removes(struct vnode *);
874
static	void process_truncates(struct vnode *);
875
static	void jwork_move(struct workhead *, struct workhead *);
876
static	void jwork_insert(struct workhead *, struct jsegdep *);
877
static	void add_to_worklist(struct worklist *, int);
878
static	void wake_worklist(struct worklist *);
879
static	void wait_worklist(struct worklist *, char *);
880
static	void remove_from_worklist(struct worklist *);
881
static	void softdep_flush(void *);
882
static	void softdep_flushjournal(struct mount *);
883
static	int softdep_speedup(struct ufsmount *);
884
static	void worklist_speedup(struct mount *);
885
static	int journal_mount(struct mount *, struct fs *, struct ucred *);
886
static	void journal_unmount(struct ufsmount *);
887
static	int journal_space(struct ufsmount *, int);
888
static	void journal_suspend(struct ufsmount *);
889
static	int journal_unsuspend(struct ufsmount *ump);
890
static	void add_to_journal(struct worklist *);
891
static	void remove_from_journal(struct worklist *);
892
static	bool softdep_excess_items(struct ufsmount *, int);
893
static	void softdep_process_journal(struct mount *, struct worklist *, int);
894
static	struct jremref *newjremref(struct dirrem *, struct inode *,
895
	    struct inode *ip, off_t, nlink_t);
896
static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
897
	    uint16_t);
898
static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
899
	    uint16_t);
900
static	inline struct jsegdep *inoref_jseg(struct inoref *);
901
static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
902
static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
903
	    ufs2_daddr_t, int);
904
static	void adjust_newfreework(struct freeblks *, int);
905
static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
906
static	void move_newblock_dep(struct jaddref *, struct inodedep *);
907
static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
908
static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
909
	    ufs2_daddr_t, long, ufs_lbn_t);
910
static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
911
	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
912
static	int jwait(struct worklist *, int);
913
static	struct inodedep *inodedep_lookup_ip(struct inode *);
914
static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
915
static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
916
static	void handle_jwork(struct workhead *);
917
static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
918
	    struct mkdir **);
919
static	struct jblocks *jblocks_create(void);
920
static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
921
static	void jblocks_free(struct jblocks *, struct mount *, int);
922
static	void jblocks_destroy(struct jblocks *);
923
static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
924

925
/*
 * Exported softdep operations.  The functions themselves have internal
 * linkage; how they are handed to callers is not visible from here.
 */
static	void softdep_disk_io_initiation(struct buf *);
static	void softdep_disk_write_complete(struct buf *);
static	void softdep_deallocate_dependencies(struct buf *);
static	int softdep_count_dependencies(struct buf *bp, int);
932

933
/*
934
 * Global lock over all of soft updates.
935
 */
936
static struct mtx lk;
937
MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
938

939
#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
940
#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
941
#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
942

943
/*
944
 * Per-filesystem soft-updates locking.
945
 */
946
#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
947
#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
948
#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
949
#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
950
#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
951
				    RA_WLOCKED)
952

953
#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
954
#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
955

956
/*
 * Worklist queue management.
 * These routines require that the lock be held.
 *
 * Without INVARIANTS the macros update the ONWORKLIST state bit and the
 * list directly, and the _UNLOCKED names are plain aliases.  With
 * INVARIANTS they call checking functions instead, passing whether the
 * lock is expected to be held along with the caller's function and line.
 */
#ifndef INVARIANTS
#define	WORKLIST_INSERT(head, item) do {				\
	(item)->wk_state |= ONWORKLIST;					\
	LIST_INSERT_HEAD(head, item, wk_list);				\
} while (0)
#define	WORKLIST_REMOVE(item) do {					\
	(item)->wk_state &= ~ONWORKLIST;				\
	LIST_REMOVE(item, wk_list);					\
} while (0)
#define	WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
#define	WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE

#else /* INVARIANTS */
static	void worklist_insert(struct workhead *, struct worklist *, int,
	    const char *, int);
static	void worklist_remove(struct worklist *, int, const char *, int);

#define	WORKLIST_INSERT(head, item)					\
	worklist_insert(head, item, 1, __func__, __LINE__)
#define	WORKLIST_INSERT_UNLOCKED(head, item)				\
	worklist_insert(head, item, 0, __func__, __LINE__)
#define	WORKLIST_REMOVE(item)						\
	worklist_remove(item, 1, __func__, __LINE__)
#define	WORKLIST_REMOVE_UNLOCKED(item)					\
	worklist_remove(item, 0, __func__, __LINE__)
985

986
static void
987
worklist_insert(struct workhead *head,
988
	struct worklist *item,
989
	int locked,
990
	const char *func,
991
	int line)
992
{
993

994
	if (locked)
995
		LOCK_OWNED(VFSTOUFS(item->wk_mp));
996
	if (item->wk_state & ONWORKLIST)
997
		panic("worklist_insert: %p %s(0x%X) already on list, "
998
		    "added in function %s at line %d",
999
		    item, TYPENAME(item->wk_type), item->wk_state,
1000
		    item->wk_func, item->wk_line);
1001
	item->wk_state |= ONWORKLIST;
1002
	item->wk_func = func;
1003
	item->wk_line = line;
1004
	LIST_INSERT_HEAD(head, item, wk_list);
1005
}
1006

1007
static void
1008
worklist_remove(struct worklist *item,
1009
	int locked,
1010
	const char *func,
1011
	int line)
1012
{
1013

1014
	if (locked)
1015
		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1016
	if ((item->wk_state & ONWORKLIST) == 0)
1017
		panic("worklist_remove: %p %s(0x%X) not on list, "
1018
		    "removed in function %s at line %d",
1019
		    item, TYPENAME(item->wk_type), item->wk_state,
1020
		    item->wk_func, item->wk_line);
1021
	item->wk_state &= ~ONWORKLIST;
1022
	item->wk_func = func;
1023
	item->wk_line = line;
1024
	LIST_REMOVE(item, wk_list);
1025
}
1026
#endif /* INVARIANTS */
1027

1028
/*
1029
 * Merge two jsegdeps keeping only the oldest one as newer references
1030
 * can't be discarded until after older references.
1031
 */
1032
static inline struct jsegdep *
1033
jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1034
{
1035
	struct jsegdep *swp;
1036

1037
	if (two == NULL)
1038
		return (one);
1039

1040
	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1041
		swp = one;
1042
		one = two;
1043
		two = swp;
1044
	}
1045
	WORKLIST_REMOVE(&two->jd_list);
1046
	free_jsegdep(two);
1047

1048
	return (one);
1049
}
1050

1051
/*
1052
 * If two freedeps are compatible free one to reduce list size.
1053
 */
1054
static inline struct freedep *
1055
freedep_merge(struct freedep *one, struct freedep *two)
1056
{
1057
	if (two == NULL)
1058
		return (one);
1059

1060
	if (one->fd_freework == two->fd_freework) {
1061
		WORKLIST_REMOVE(&two->fd_list);
1062
		free_freedep(two);
1063
	}
1064
	return (one);
1065
}
1066

1067
/*
1068
 * Move journal work from one list to another.  Duplicate freedeps and
1069
 * jsegdeps are coalesced to keep the lists as small as possible.
1070
 */
1071
static void
1072
jwork_move(struct workhead *dst, struct workhead *src)
1073
{
1074
	struct freedep *freedep;
1075
	struct jsegdep *jsegdep;
1076
	struct worklist *wkn;
1077
	struct worklist *wk;
1078

1079
	KASSERT(dst != src,
1080
	    ("jwork_move: dst == src"));
1081
	freedep = NULL;
1082
	jsegdep = NULL;
1083
	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1084
		if (wk->wk_type == D_JSEGDEP)
1085
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1086
		else if (wk->wk_type == D_FREEDEP)
1087
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1088
	}
1089

1090
	while ((wk = LIST_FIRST(src)) != NULL) {
1091
		WORKLIST_REMOVE(wk);
1092
		WORKLIST_INSERT(dst, wk);
1093
		if (wk->wk_type == D_JSEGDEP) {
1094
			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1095
			continue;
1096
		}
1097
		if (wk->wk_type == D_FREEDEP)
1098
			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1099
	}
1100
}
1101

1102
static void
1103
jwork_insert(struct workhead *dst, struct jsegdep *jsegdep)
1104
{
1105
	struct jsegdep *jsegdepn;
1106
	struct worklist *wk;
1107

1108
	LIST_FOREACH(wk, dst, wk_list)
1109
		if (wk->wk_type == D_JSEGDEP)
1110
			break;
1111
	if (wk == NULL) {
1112
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1113
		return;
1114
	}
1115
	jsegdepn = WK_JSEGDEP(wk);
1116
	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1117
		WORKLIST_REMOVE(wk);
1118
		free_jsegdep(jsegdepn);
1119
		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1120
	} else
1121
		free_jsegdep(jsegdep);
1122
}
1123

1124
/*
1125
 * Routines for tracking and managing workitems.
1126
 */
1127
static	void workitem_free(struct worklist *, int);
1128
static	void workitem_alloc(struct worklist *, int, struct mount *);
1129
static	void workitem_reassign(struct worklist *, int);
1130

1131
#define	WORKITEM_FREE(item, type) \
1132
	workitem_free((struct worklist *)(item), (type))
1133
#define	WORKITEM_REASSIGN(item, type) \
1134
	workitem_reassign((struct worklist *)(item), (type))
1135

1136
static void
1137
workitem_free(struct worklist *item, int type)
1138
{
1139
	struct ufsmount *ump;
1140

1141
#ifdef INVARIANTS
1142
	if (item->wk_state & ONWORKLIST)
1143
		panic("workitem_free: %s(0x%X) still on list, "
1144
		    "added in function %s at line %d",
1145
		    TYPENAME(item->wk_type), item->wk_state,
1146
		    item->wk_func, item->wk_line);
1147
	if (item->wk_type != type && type != D_NEWBLK)
1148
		panic("workitem_free: type mismatch %s != %s",
1149
		    TYPENAME(item->wk_type), TYPENAME(type));
1150
#endif
1151
	if (item->wk_state & IOWAITING)
1152
		wakeup(item);
1153
	ump = VFSTOUFS(item->wk_mp);
1154
	LOCK_OWNED(ump);
1155
	KASSERT(ump->softdep_deps > 0,
1156
	    ("workitem_free: %s: softdep_deps going negative",
1157
	    ump->um_fs->fs_fsmnt));
1158
	if (--ump->softdep_deps == 0 && ump->softdep_req)
1159
		wakeup(&ump->softdep_deps);
1160
	KASSERT(dep_current[item->wk_type] > 0,
1161
	    ("workitem_free: %s: dep_current[%s] going negative",
1162
	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1163
	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1164
	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1165
	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1166
	atomic_subtract_long(&dep_current[item->wk_type], 1);
1167
	ump->softdep_curdeps[item->wk_type] -= 1;
1168
	LIST_REMOVE(item, wk_all);
1169
	free(item, DtoM(type));
1170
}
1171

1172
static void
1173
workitem_alloc(struct worklist *item,
1174
	int type,
1175
	struct mount *mp)
1176
{
1177
	struct ufsmount *ump;
1178

1179
	item->wk_type = type;
1180
	item->wk_mp = mp;
1181
	item->wk_state = 0;
1182

1183
	ump = VFSTOUFS(mp);
1184
	ACQUIRE_GBLLOCK(&lk);
1185
	dep_current[type]++;
1186
	if (dep_current[type] > dep_highuse[type])
1187
		dep_highuse[type] = dep_current[type];
1188
	dep_total[type]++;
1189
	FREE_GBLLOCK(&lk);
1190
	ACQUIRE_LOCK(ump);
1191
	ump->softdep_curdeps[type] += 1;
1192
	ump->softdep_deps++;
1193
	ump->softdep_accdeps++;
1194
	LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all);
1195
	FREE_LOCK(ump);
1196
}
1197

1198
static void
1199
workitem_reassign(struct worklist *item, int newtype)
1200
{
1201
	struct ufsmount *ump;
1202

1203
	ump = VFSTOUFS(item->wk_mp);
1204
	LOCK_OWNED(ump);
1205
	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1206
	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1207
	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1208
	ump->softdep_curdeps[item->wk_type] -= 1;
1209
	ump->softdep_curdeps[newtype] += 1;
1210
	KASSERT(dep_current[item->wk_type] > 0,
1211
	    ("workitem_reassign: %s: dep_current[%s] going negative",
1212
	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1213
	ACQUIRE_GBLLOCK(&lk);
1214
	dep_current[newtype]++;
1215
	dep_current[item->wk_type]--;
1216
	if (dep_current[newtype] > dep_highuse[newtype])
1217
		dep_highuse[newtype] = dep_current[newtype];
1218
	dep_total[newtype]++;
1219
	FREE_GBLLOCK(&lk);
1220
	item->wk_type = newtype;
1221
	LIST_REMOVE(item, wk_all);
1222
	LIST_INSERT_HEAD(&ump->softdep_alldeps[newtype], item, wk_all);
1223
}
1224

1225
/*
1226
 * Workitem queue management
1227
 */
1228
static int max_softdeps;	/* maximum number of structs before slowdown */
1229
static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1230
static int proc_waiting;	/* tracks whether we have a timeout posted */
1231
static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1232
static struct callout softdep_callout;
1233
static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1234
static int req_clear_remove;	/* syncer process flush some freeblks */
1235
static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1236

1237
/*
1238
 * runtime statistics
1239
 */
1240
static int stat_flush_threads;	/* number of softdep flushing threads */
1241
static int stat_worklist_push;	/* number of worklist cleanups */
1242
static int stat_delayed_inact;	/* number of delayed inactivation cleanups */
1243
static int stat_blk_limit_push;	/* number of times block limit neared */
1244
static int stat_ino_limit_push;	/* number of times inode limit neared */
1245
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1246
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1247
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1248
static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1249
static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1250
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1251
static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1252
static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1253
static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1254
static int stat_journal_min;	/* Times hit journal min threshold */
1255
static int stat_journal_low;	/* Times hit journal low threshold */
1256
static int stat_journal_wait;	/* Times blocked in jwait(). */
1257
static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1258
static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1259
static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1260
static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1261
static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1262
static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1263
static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1264
static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1265
static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1266
static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1267

1268
SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1269
    &max_softdeps, 0, "");
1270
SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1271
    &tickdelay, 0, "");
1272
SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1273
    &stat_flush_threads, 0, "");
1274
SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push,
1275
    CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,"");
1276
SYSCTL_INT(_debug_softdep, OID_AUTO, delayed_inactivations, CTLFLAG_RD,
1277
    &stat_delayed_inact, 0, "");
1278
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push,
1279
    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,"");
1280
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push,
1281
    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,"");
1282
SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit,
1283
    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, "");
1284
SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit,
1285
    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, "");
1286
SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit,
1287
    CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, "");
1288
SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs,
1289
    CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, "");
1290
SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap,
1291
    CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, "");
1292
SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs,
1293
    CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, "");
1294
SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry,
1295
    CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, "");
1296
SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback,
1297
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, "");
1298
SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback,
1299
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, "");
1300
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low,
1301
    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, "");
1302
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min,
1303
    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, "");
1304
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait,
1305
    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, "");
1306
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage,
1307
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, "");
1308
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks,
1309
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, "");
1310
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode,
1311
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, "");
1312
SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk,
1313
    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, "");
1314
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests,
1315
    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, "");
1316
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests,
1317
    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, "");
1318
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay,
1319
    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, "");
1320
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries,
1321
    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, "");
1322
SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures,
1323
    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, "");
1324

1325
SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1326
    &softdep_flushcache, 0, "");
1327
SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1328
    &stat_emptyjblocks, 0, "");
1329

1330
SYSCTL_DECL(_vfs_ffs);
1331

1332
/* Whether to recompute the summary at mount time */
1333
static int compute_summary_at_mount = 0;
1334
SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1335
	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1336
static int print_threads = 0;
1337
SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1338
    &print_threads, 0, "Notify flusher thread start/stop");
1339

1340
/* List of all filesystems mounted with soft updates */
1341
static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1342

1343
static void
1344
get_parent_vp_unlock_bp(struct mount *mp,
1345
	struct buf *bp,
1346
	struct diraddhd *diraddhdp,
1347
	struct diraddhd *unfinishedp)
1348
{
1349
	struct diradd *dap;
1350

1351
	/*
1352
	 * Requeue unfinished dependencies before
1353
	 * unlocking buffer, which could make
1354
	 * diraddhdp invalid.
1355
	 */
1356
	ACQUIRE_LOCK(VFSTOUFS(mp));
1357
	while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
1358
		LIST_REMOVE(dap, da_pdlist);
1359
		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
1360
	}
1361
	FREE_LOCK(VFSTOUFS(mp));
1362

1363
	bp->b_vflags &= ~BV_SCANNED;
1364
	BUF_NOREC(bp);
1365
	BUF_UNLOCK(bp);
1366
}
1367

1368
/*
1369
 * This function fetches inode inum on mount point mp.  We already
1370
 * hold a locked vnode vp, and might have a locked buffer bp belonging
1371
 * to vp.
1372

1373
 * We must not block on acquiring the new inode lock as we will get
1374
 * into a lock-order reversal with the buffer lock and possibly get a
1375
 * deadlock.  Thus if we cannot instantiate the requested vnode
1376
 * without sleeping on its lock, we must unlock the vnode and the
1377
 * buffer before doing a blocking on the vnode lock.  We return
1378
 * ERELOOKUP if we have had to unlock either the vnode or the buffer so
1379
 * that the caller can reassess its state.
1380
 *
1381
 * Top-level VFS code (for syscalls and other consumers, e.g. callers
1382
 * of VOP_FSYNC() in syncer) check for ERELOOKUP and restart at safe
1383
 * point.
1384
 *
1385
 * Since callers expect to operate on fully constructed vnode, we also
1386
 * recheck v_data after relock, and return ENOENT if NULL.
1387
 *
1388
 * If unlocking bp, we must unroll dequeueing its unfinished
1389
 * dependencies, and clear scan flag, before unlocking.  If unlocking
1390
 * vp while it is under deactivation, we re-queue deactivation.
1391
 */
1392
static int
1393
get_parent_vp(struct vnode *vp,
1394
	struct mount *mp,
1395
	ino_t inum,
1396
	struct buf *bp,
1397
	struct diraddhd *diraddhdp,
1398
	struct diraddhd *unfinishedp,
1399
	struct vnode **rvp)
1400
{
1401
	struct vnode *pvp;
1402
	int error;
1403
	bool bplocked;
1404

1405
	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
1406
	for (bplocked = true, pvp = NULL;;) {
1407
		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
1408
		    FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
1409
		if (error == 0) {
1410
			/*
1411
			 * Since we could have unlocked vp, the inode
1412
			 * number could no longer indicate a
1413
			 * constructed node.  In this case, we must
1414
			 * restart the syscall.
1415
			 */
1416
			if (VTOI(pvp)->i_mode == 0 || !bplocked) {
1417
				if (bp != NULL && bplocked)
1418
					get_parent_vp_unlock_bp(mp, bp,
1419
					    diraddhdp, unfinishedp);
1420
				if (VTOI(pvp)->i_mode == 0)
1421
					vgone(pvp);
1422
				error = ERELOOKUP;
1423
				goto out2;
1424
			}
1425
			goto out1;
1426
		}
1427
		if (bp != NULL && bplocked) {
1428
			get_parent_vp_unlock_bp(mp, bp, diraddhdp, unfinishedp);
1429
			bplocked = false;
1430
		}
1431

1432
		/*
1433
		 * Do not drop vnode lock while inactivating during
1434
		 * vunref.  This would result in leaks of the VI flags
1435
		 * and reclaiming of non-truncated vnode.  Instead,
1436
		 * re-schedule inactivation hoping that we would be
1437
		 * able to sync inode later.
1438
		 */
1439
		if ((vp->v_iflag & VI_DOINGINACT) != 0 &&
1440
		    (vp->v_vflag & VV_UNREF) != 0) {
1441
			VI_LOCK(vp);
1442
			vp->v_iflag |= VI_OWEINACT;
1443
			VI_UNLOCK(vp);
1444
			return (ERELOOKUP);
1445
		}
1446

1447
		VOP_UNLOCK(vp);
1448
		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
1449
		    FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
1450
		if (error != 0) {
1451
			MPASS(error != ERELOOKUP);
1452
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1453
			break;
1454
		}
1455
		if (VTOI(pvp)->i_mode == 0) {
1456
			vgone(pvp);
1457
			vput(pvp);
1458
			pvp = NULL;
1459
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1460
			error = ERELOOKUP;
1461
			break;
1462
		}
1463
		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
1464
		if (error == 0)
1465
			break;
1466
		vput(pvp);
1467
		pvp = NULL;
1468
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1469
		if (vp->v_data == NULL) {
1470
			error = ENOENT;
1471
			break;
1472
		}
1473
	}
1474
	if (bp != NULL) {
1475
		MPASS(!bplocked);
1476
		error = ERELOOKUP;
1477
	}
1478
out2:
1479
	if (error != 0 && pvp != NULL) {
1480
		vput(pvp);
1481
		pvp = NULL;
1482
	}
1483
out1:
1484
	*rvp = pvp;
1485
	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
1486
	return (error);
1487
}
1488

1489
/*
1490
 * This function cleans the worklist for a filesystem.
1491
 * Each filesystem running with soft dependencies gets its own
1492
 * thread to run in this function. The thread is started up in
1493
 * softdep_mount and shutdown in softdep_unmount. They show up
1494
 * as part of the kernel "bufdaemon" process whose process
1495
 * entry is available in bufdaemonproc.
1496
 */
1497
static int searchfailed;
1498
extern struct proc *bufdaemonproc;
1499
static void
1500
softdep_flush(void *addr)
1501
{
1502
	struct mount *mp;
1503
	struct thread *td;
1504
	struct ufsmount *ump;
1505
	int cleanups;
1506

1507
	td = curthread;
1508
	td->td_pflags |= TDP_NORUNNINGBUF;
1509
	mp = (struct mount *)addr;
1510
	ump = VFSTOUFS(mp);
1511
	atomic_add_int(&stat_flush_threads, 1);
1512
	ACQUIRE_LOCK(ump);
1513
	ump->softdep_flags &= ~FLUSH_STARTING;
1514
	wakeup(&ump->softdep_flushtd);
1515
	FREE_LOCK(ump);
1516
	if (print_threads) {
1517
		if (stat_flush_threads == 1)
1518
			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1519
			    bufdaemonproc->p_pid);
1520
		printf("Start thread %s\n", td->td_name);
1521
	}
1522
	for (;;) {	
1523
		while (softdep_process_worklist(mp, 0) > 0 ||
1524
		    (MOUNTEDSUJ(mp) &&
1525
		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1526
			kthread_suspend_check();
1527
		ACQUIRE_LOCK(ump);
1528
		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1529
			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1530
			    "sdflush", hz / 2);
1531
		ump->softdep_flags &= ~FLUSH_CLEANUP;
1532
		/*
1533
		 * Check to see if we are done and need to exit.
1534
		 */
1535
		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1536
			FREE_LOCK(ump);
1537
			continue;
1538
		}
1539
		ump->softdep_flags &= ~FLUSH_EXIT;
1540
		cleanups = ump->um_softdep->sd_cleanups;
1541
		FREE_LOCK(ump);
1542
		wakeup(&ump->softdep_flags);
1543
		if (print_threads) {
1544
			printf("Stop thread %s: searchfailed %d, "
1545
			    "did cleanups %d\n",
1546
			    td->td_name, searchfailed, cleanups);
1547
		}
1548
		atomic_subtract_int(&stat_flush_threads, 1);
1549
		kthread_exit();
1550
		panic("kthread_exit failed\n");
1551
	}
1552
}
1553

1554
static void
1555
worklist_speedup(struct mount *mp)
1556
{
1557
	struct ufsmount *ump;
1558

1559
	ump = VFSTOUFS(mp);
1560
	LOCK_OWNED(ump);
1561
	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1562
		ump->softdep_flags |= FLUSH_CLEANUP;
1563
	wakeup(&ump->softdep_flushtd);
1564
}
1565

1566
static void
1567
softdep_send_speedup(struct ufsmount *ump,
1568
	off_t shortage,
1569
	uint64_t flags)
1570
{
1571
	struct buf *bp;
1572

1573
	if ((ump->um_flags & UM_CANSPEEDUP) == 0)
1574
		return;
1575

1576
	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
1577
	bp->b_iocmd = BIO_SPEEDUP;
1578
	bp->b_ioflags = flags;
1579
	bp->b_bcount = omin(shortage, LONG_MAX);
1580
	g_vfs_strategy(ump->um_bo, bp);
1581
	bufwait(bp);
1582
	free(bp, M_TRIM);
1583
}
1584

1585
static int
1586
softdep_speedup(struct ufsmount *ump)
1587
{
1588
	struct ufsmount *altump;
1589
	struct mount_softdeps *sdp;
1590

1591
	LOCK_OWNED(ump);
1592
	worklist_speedup(ump->um_mountp);
1593
	bd_speedup();
1594
	/*
1595
	 * If we have global shortages, then we need other
1596
	 * filesystems to help with the cleanup. Here we wakeup a
1597
	 * flusher thread for a filesystem that is over its fair
1598
	 * share of resources.
1599
	 */
1600
	if (req_clear_inodedeps || req_clear_remove) {
1601
		ACQUIRE_GBLLOCK(&lk);
1602
		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1603
			if ((altump = sdp->sd_ump) == ump)
1604
				continue;
1605
			if (((req_clear_inodedeps &&
1606
			    altump->softdep_curdeps[D_INODEDEP] >
1607
			    max_softdeps / stat_flush_threads) ||
1608
			    (req_clear_remove &&
1609
			    altump->softdep_curdeps[D_DIRREM] >
1610
			    (max_softdeps / 2) / stat_flush_threads)) &&
1611
			    TRY_ACQUIRE_LOCK(altump))
1612
				break;
1613
		}
1614
		if (sdp == NULL) {
1615
			searchfailed++;
1616
			FREE_GBLLOCK(&lk);
1617
		} else {
1618
			/*
1619
			 * Move to the end of the list so we pick a
1620
			 * different one on out next try.
1621
			 */
1622
			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1623
			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1624
			FREE_GBLLOCK(&lk);
1625
			if ((altump->softdep_flags &
1626
			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1627
				altump->softdep_flags |= FLUSH_CLEANUP;
1628
			altump->um_softdep->sd_cleanups++;
1629
			wakeup(&altump->softdep_flushtd);
1630
			FREE_LOCK(altump);
1631
		}
1632
	}
1633
	return (speedup_syncer());
1634
}
1635

1636
/*
1637
 * Add an item to the end of the work queue.
1638
 * This routine requires that the lock be held.
1639
 * This is the only routine that adds items to the list.
1640
 * The following routine is the only one that removes items
1641
 * and does so in order from first to last.
1642
 */
1643

1644
#define	WK_HEAD		0x0001	/* Add to HEAD. */
1645
#define	WK_NODELAY	0x0002	/* Process immediately. */
1646

1647
static void
1648
add_to_worklist(struct worklist *wk, int flags)
1649
{
1650
	struct ufsmount *ump;
1651

1652
	ump = VFSTOUFS(wk->wk_mp);
1653
	LOCK_OWNED(ump);
1654
	if (wk->wk_state & ONWORKLIST)
1655
		panic("add_to_worklist: %s(0x%X) already on list",
1656
		    TYPENAME(wk->wk_type), wk->wk_state);
1657
	wk->wk_state |= ONWORKLIST;
1658
	if (ump->softdep_on_worklist == 0) {
1659
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1660
		ump->softdep_worklist_tail = wk;
1661
	} else if (flags & WK_HEAD) {
1662
		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1663
	} else {
1664
		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1665
		ump->softdep_worklist_tail = wk;
1666
	}
1667
	ump->softdep_on_worklist += 1;
1668
	if (flags & WK_NODELAY)
1669
		worklist_speedup(wk->wk_mp);
1670
}
1671

1672
/*
1673
 * Remove the item to be processed. If we are removing the last
1674
 * item on the list, we need to recalculate the tail pointer.
1675
 */
1676
static void
1677
remove_from_worklist(struct worklist *wk)
1678
{
1679
	struct ufsmount *ump;
1680

1681
	ump = VFSTOUFS(wk->wk_mp);
1682
	if (ump->softdep_worklist_tail == wk)
1683
		ump->softdep_worklist_tail =
1684
		    (struct worklist *)wk->wk_list.le_prev;
1685
	WORKLIST_REMOVE(wk);
1686
	ump->softdep_on_worklist -= 1;
1687
}
1688

1689
static void
1690
wake_worklist(struct worklist *wk)
1691
{
1692
	if (wk->wk_state & IOWAITING) {
1693
		wk->wk_state &= ~IOWAITING;
1694
		wakeup(wk);
1695
	}
1696
}
1697

1698
static void
1699
wait_worklist(struct worklist *wk, char *wmesg)
1700
{
1701
	struct ufsmount *ump;
1702

1703
	ump = VFSTOUFS(wk->wk_mp);
1704
	wk->wk_state |= IOWAITING;
1705
	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1706
}
1707

1708
/*
1709
 * Process that runs once per second to handle items in the background queue.
1710
 *
1711
 * Note that we ensure that everything is done in the order in which they
1712
 * appear in the queue. The code below depends on this property to ensure
1713
 * that blocks of a file are freed before the inode itself is freed. This
1714
 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1715
 * until all the old ones have been purged from the dependency lists.
1716
 */
1717
static int 
1718
softdep_process_worklist(struct mount *mp, int full)
1719
{
1720
	int cnt, matchcnt;
1721
	struct ufsmount *ump;
1722
	long starttime;
1723

1724
	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1725
	ump = VFSTOUFS(mp);
1726
	if (ump->um_softdep == NULL)
1727
		return (0);
1728
	matchcnt = 0;
1729
	ACQUIRE_LOCK(ump);
1730
	starttime = time_second;
1731
	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1732
	check_clear_deps(mp);
1733
	while (ump->softdep_on_worklist > 0) {
1734
		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1735
			break;
1736
		else
1737
			matchcnt += cnt;
1738
		check_clear_deps(mp);
1739
		/*
1740
		 * We do not generally want to stop for buffer space, but if
1741
		 * we are really being a buffer hog, we will stop and wait.
1742
		 */
1743
		if (should_yield()) {
1744
			FREE_LOCK(ump);
1745
			kern_yield(PRI_USER);
1746
			bwillwrite();
1747
			ACQUIRE_LOCK(ump);
1748
		}
1749
		/*
1750
		 * Never allow processing to run for more than one
1751
		 * second. This gives the syncer thread the opportunity
1752
		 * to pause if appropriate.
1753
		 */
1754
		if (!full && starttime != time_second)
1755
			break;
1756
	}
1757
	if (full == 0)
1758
		journal_unsuspend(ump);
1759
	FREE_LOCK(ump);
1760
	return (matchcnt);
1761
}
1762

1763
/*
1764
 * Process all removes associated with a vnode if we are running out of
1765
 * journal space.  Any other process which attempts to flush these will
1766
 * be unable as we have the vnodes locked.
1767
 */
1768
static void
1769
process_removes(struct vnode *vp)
1770
{
1771
	struct inodedep *inodedep;
1772
	struct dirrem *dirrem;
1773
	struct ufsmount *ump;
1774
	struct mount *mp;
1775
	ino_t inum;
1776

1777
	mp = vp->v_mount;
1778
	ump = VFSTOUFS(mp);
1779
	LOCK_OWNED(ump);
1780
	inum = VTOI(vp)->i_number;
1781
	for (;;) {
1782
top:
1783
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1784
			return;
1785
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1786
			/*
1787
			 * If another thread is trying to lock this vnode
1788
			 * it will fail but we must wait for it to do so
1789
			 * before we can proceed.
1790
			 */
1791
			if (dirrem->dm_state & INPROGRESS) {
1792
				wait_worklist(&dirrem->dm_list, "pwrwait");
1793
				goto top;
1794
			}
1795
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 
1796
			    (COMPLETE | ONWORKLIST))
1797
				break;
1798
		}
1799
		if (dirrem == NULL)
1800
			return;
1801
		remove_from_worklist(&dirrem->dm_list);
1802
		FREE_LOCK(ump);
1803
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1804
			panic("process_removes: suspended filesystem");
1805
		handle_workitem_remove(dirrem, 0);
1806
		vn_finished_secondary_write(mp);
1807
		ACQUIRE_LOCK(ump);
1808
	}
1809
}
1810

1811
/*
1812
 * Process all truncations associated with a vnode if we are running out
1813
 * of journal space.  This is called when the vnode lock is already held
1814
 * and no other process can clear the truncation.  This function returns
1815
 * a value greater than zero if it did any work.
1816
 */
1817
static void
1818
process_truncates(struct vnode *vp)
1819
{
1820
	struct inodedep *inodedep;
1821
	struct freeblks *freeblks;
1822
	struct ufsmount *ump;
1823
	struct mount *mp;
1824
	ino_t inum;
1825
	int cgwait;
1826

1827
	mp = vp->v_mount;
1828
	ump = VFSTOUFS(mp);
1829
	LOCK_OWNED(ump);
1830
	inum = VTOI(vp)->i_number;
1831
	for (;;) {
1832
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1833
			return;
1834
		cgwait = 0;
1835
		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1836
			/* Journal entries not yet written.  */
1837
			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1838
				jwait(&LIST_FIRST(
1839
				    &freeblks->fb_jblkdephd)->jb_list,
1840
				    MNT_WAIT);
1841
				break;
1842
			}
1843
			/* Another thread is executing this item. */
1844
			if (freeblks->fb_state & INPROGRESS) {
1845
				wait_worklist(&freeblks->fb_list, "ptrwait");
1846
				break;
1847
			}
1848
			/* Freeblks is waiting on a inode write. */
1849
			if ((freeblks->fb_state & COMPLETE) == 0) {
1850
				FREE_LOCK(ump);
1851
				ffs_update(vp, 1);
1852
				ACQUIRE_LOCK(ump);
1853
				break;
1854
			}
1855
			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1856
			    (ALLCOMPLETE | ONWORKLIST)) {
1857
				remove_from_worklist(&freeblks->fb_list);
1858
				freeblks->fb_state |= INPROGRESS;
1859
				FREE_LOCK(ump);
1860
				if (vn_start_secondary_write(NULL, &mp,
1861
				    V_NOWAIT))
1862
					panic("process_truncates: "
1863
					    "suspended filesystem");
1864
				handle_workitem_freeblocks(freeblks, 0);
1865
				vn_finished_secondary_write(mp);
1866
				ACQUIRE_LOCK(ump);
1867
				break;
1868
			}
1869
			if (freeblks->fb_cgwait)
1870
				cgwait++;
1871
		}
1872
		if (cgwait) {
1873
			FREE_LOCK(ump);
1874
			sync_cgs(mp, MNT_WAIT);
1875
			ffs_sync_snap(mp, MNT_WAIT);
1876
			ACQUIRE_LOCK(ump);
1877
			continue;
1878
		}
1879
		if (freeblks == NULL)
1880
			break;
1881
	}
1882
	return;
1883
}
1884

1885
/*
1886
 * Process one item on the worklist.
1887
 */
1888
static int
1889
process_worklist_item(struct mount *mp,
1890
	int target,
1891
	int flags)
1892
{
1893
	struct worklist sentinel;
1894
	struct worklist *wk;
1895
	struct ufsmount *ump;
1896
	int matchcnt;
1897
	int error;
1898

1899
	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1900
	/*
1901
	 * If we are being called because of a process doing a
1902
	 * copy-on-write, then it is not safe to write as we may
1903
	 * recurse into the copy-on-write routine.
1904
	 */
1905
	if (curthread->td_pflags & TDP_COWINPROGRESS)
1906
		return (-1);
1907
	ump = VFSTOUFS(mp);
1908
	LOCK_OWNED(ump);
1909
	matchcnt = 0;
1910
	sentinel.wk_mp = NULL;
1911
	sentinel.wk_type = D_SENTINEL;
1912
	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1913
	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1914
	    wk = LIST_NEXT(&sentinel, wk_list)) {
1915
		if (wk->wk_type == D_SENTINEL) {
1916
			LIST_REMOVE(&sentinel, wk_list);
1917
			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1918
			continue;
1919
		}
1920
		if (wk->wk_state & INPROGRESS)
1921
			panic("process_worklist_item: %p already in progress.",
1922
			    wk);
1923
		wk->wk_state |= INPROGRESS;
1924
		remove_from_worklist(wk);
1925
		FREE_LOCK(ump);
1926
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1927
			panic("process_worklist_item: suspended filesystem");
1928
		switch (wk->wk_type) {
1929
		case D_DIRREM:
1930
			/* removal of a directory entry */
1931
			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1932
			break;
1933

1934
		case D_FREEBLKS:
1935
			/* releasing blocks and/or fragments from a file */
1936
			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1937
			    flags);
1938
			break;
1939

1940
		case D_FREEFRAG:
1941
			/* releasing a fragment when replaced as a file grows */
1942
			handle_workitem_freefrag(WK_FREEFRAG(wk));
1943
			error = 0;
1944
			break;
1945

1946
		case D_FREEFILE:
1947
			/* releasing an inode when its link count drops to 0 */
1948
			handle_workitem_freefile(WK_FREEFILE(wk));
1949
			error = 0;
1950
			break;
1951

1952
		default:
1953
			panic("%s_process_worklist: Unknown type %s",
1954
			    "softdep", TYPENAME(wk->wk_type));
1955
			/* NOTREACHED */
1956
		}
1957
		vn_finished_secondary_write(mp);
1958
		ACQUIRE_LOCK(ump);
1959
		if (error == 0) {
1960
			if (++matchcnt == target)
1961
				break;
1962
			continue;
1963
		}
1964
		/*
1965
		 * We have to retry the worklist item later.  Wake up any
1966
		 * waiters who may be able to complete it immediately and
1967
		 * add the item back to the head so we don't try to execute
1968
		 * it again.
1969
		 */
1970
		wk->wk_state &= ~INPROGRESS;
1971
		wake_worklist(wk);
1972
		add_to_worklist(wk, WK_HEAD);
1973
	}
1974
	/* Sentinal could've become the tail from remove_from_worklist. */
1975
	if (ump->softdep_worklist_tail == &sentinel)
1976
		ump->softdep_worklist_tail =
1977
		    (struct worklist *)sentinel.wk_list.le_prev;
1978
	LIST_REMOVE(&sentinel, wk_list);
1979
	return (matchcnt);
1980
}
1981

1982
/*
1983
 * Move dependencies from one buffer to another.
1984
 */
1985
int
1986
softdep_move_dependencies(struct buf *oldbp, struct buf *newbp)
1987
{
1988
	struct worklist *wk, *wktail;
1989
	struct ufsmount *ump;
1990
	int dirty;
1991

1992
	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1993
		return (0);
1994
	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1995
	    ("softdep_move_dependencies called on non-softdep filesystem"));
1996
	dirty = 0;
1997
	wktail = NULL;
1998
	ump = VFSTOUFS(wk->wk_mp);
1999
	ACQUIRE_LOCK(ump);
2000
	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
2001
		LIST_REMOVE(wk, wk_list);
2002
		if (wk->wk_type == D_BMSAFEMAP &&
2003
		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
2004
			dirty = 1;
2005
		if (wktail == NULL)
2006
			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
2007
		else
2008
			LIST_INSERT_AFTER(wktail, wk, wk_list);
2009
		wktail = wk;
2010
	}
2011
	FREE_LOCK(ump);
2012

2013
	return (dirty);
2014
}
2015

2016
/*
2017
 * Purge the work list of all items associated with a particular mount point.
2018
 */
2019
int
2020
softdep_flushworklist(struct mount *oldmnt,
2021
	int *countp,
2022
	struct thread *td)
2023
{
2024
	struct vnode *devvp;
2025
	struct ufsmount *ump;
2026
	int count, error;
2027

2028
	/*
2029
	 * Alternately flush the block device associated with the mount
2030
	 * point and process any dependencies that the flushing
2031
	 * creates. We continue until no more worklist dependencies
2032
	 * are found.
2033
	 */
2034
	*countp = 0;
2035
	error = 0;
2036
	ump = VFSTOUFS(oldmnt);
2037
	devvp = ump->um_devvp;
2038
	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
2039
		*countp += count;
2040
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2041
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
2042
		VOP_UNLOCK(devvp);
2043
		if (error != 0)
2044
			break;
2045
	}
2046
	return (error);
2047
}
2048

2049
#define	SU_WAITIDLE_RETRIES	20
2050
static int
2051
softdep_waitidle(struct mount *mp, int flags __unused)
2052
{
2053
	struct ufsmount *ump;
2054
	struct vnode *devvp;
2055
	struct thread *td;
2056
	int error, i;
2057

2058
	ump = VFSTOUFS(mp);
2059
	KASSERT(ump->um_softdep != NULL,
2060
	    ("softdep_waitidle called on non-softdep filesystem"));
2061
	devvp = ump->um_devvp;
2062
	td = curthread;
2063
	error = 0;
2064
	ACQUIRE_LOCK(ump);
2065
	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
2066
		ump->softdep_req = 1;
2067
		KASSERT((flags & FORCECLOSE) == 0 ||
2068
		    ump->softdep_on_worklist == 0,
2069
		    ("softdep_waitidle: work added after flush"));
2070
		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
2071
		    "softdeps", 10 * hz);
2072
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2073
		error = VOP_FSYNC(devvp, MNT_WAIT, td);
2074
		VOP_UNLOCK(devvp);
2075
		ACQUIRE_LOCK(ump);
2076
		if (error != 0)
2077
			break;
2078
	}
2079
	ump->softdep_req = 0;
2080
	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
2081
		error = EBUSY;
2082
		printf("softdep_waitidle: Failed to flush worklist for %p\n",
2083
		    mp);
2084
	}
2085
	FREE_LOCK(ump);
2086
	return (error);
2087
}
2088

2089
/*
2090
 * Flush all vnodes and worklist items associated with a specified mount point.
2091
 */
2092
int
2093
softdep_flushfiles(struct mount *oldmnt,
2094
	int flags,
2095
	struct thread *td)
2096
{
2097
	struct ufsmount *ump __unused;
2098
#ifdef QUOTA
2099
	int i;
2100
#endif
2101
	int error, early, depcount, loopcnt, retry_flush_count, retry;
2102
	int morework;
2103

2104
	ump = VFSTOUFS(oldmnt);
2105
	KASSERT(ump->um_softdep != NULL,
2106
	    ("softdep_flushfiles called on non-softdep filesystem"));
2107
	loopcnt = 10;
2108
	retry_flush_count = 3;
2109
retry_flush:
2110
	error = 0;
2111

2112
	/*
2113
	 * Alternately flush the vnodes associated with the mount
2114
	 * point and process any dependencies that the flushing
2115
	 * creates. In theory, this loop can happen at most twice,
2116
	 * but we give it a few extra just to be sure.
2117
	 */
2118
	for (; loopcnt > 0; loopcnt--) {
2119
		/*
2120
		 * Do another flush in case any vnodes were brought in
2121
		 * as part of the cleanup operations.
2122
		 */
2123
		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
2124
		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
2125
		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
2126
			break;
2127
		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2128
		    depcount == 0)
2129
			break;
2130
	}
2131
	/*
2132
	 * If we are unmounting then it is an error to fail. If we
2133
	 * are simply trying to downgrade to read-only, then filesystem
2134
	 * activity can keep us busy forever, so we just fail with EBUSY.
2135
	 */
2136
	if (loopcnt == 0) {
2137
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2138
			panic("softdep_flushfiles: looping");
2139
		error = EBUSY;
2140
	}
2141
	if (!error)
2142
		error = softdep_waitidle(oldmnt, flags);
2143
	if (!error) {
2144
		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2145
			retry = 0;
2146
			MNT_ILOCK(oldmnt);
2147
			morework = oldmnt->mnt_nvnodelistsize > 0;
2148
#ifdef QUOTA
2149
			UFS_LOCK(ump);
2150
			for (i = 0; i < MAXQUOTAS; i++) {
2151
				if (ump->um_quotas[i] != NULL)
2152
					morework = 1;
2153
			}
2154
			UFS_UNLOCK(ump);
2155
#endif
2156
			if (morework) {
2157
				if (--retry_flush_count > 0) {
2158
					retry = 1;
2159
					loopcnt = 3;
2160
				} else
2161
					error = EBUSY;
2162
			}
2163
			MNT_IUNLOCK(oldmnt);
2164
			if (retry)
2165
				goto retry_flush;
2166
		}
2167
	}
2168
	return (error);
2169
}
2170

2171
/*
2172
 * Structure hashing.
2173
 * 
2174
 * There are four types of structures that can be looked up:
2175
 *	1) pagedep structures identified by mount point, inode number,
2176
 *	   and logical block.
2177
 *	2) inodedep structures identified by mount point and inode number.
2178
 *	3) newblk structures identified by mount point and
2179
 *	   physical block number.
2180
 *	4) bmsafemap structures identified by mount point and
2181
 *	   cylinder group number.
2182
 *
2183
 * The "pagedep" and "inodedep" dependency structures are hashed
2184
 * separately from the file blocks and inodes to which they correspond.
2185
 * This separation helps when the in-memory copy of an inode or
2186
 * file block must be replaced. It also obviates the need to access
2187
 * an inode or file page when simply updating (or de-allocating)
2188
 * dependency structures. Lookup of newblk structures is needed to
2189
 * find newly allocated blocks when trying to associate them with
2190
 * their allocdirect or allocindir structure.
2191
 *
2192
 * The lookup routines optionally create and hash a new instance when
2193
 * an existing entry is not found. The bmsafemap lookup routine always
2194
 * allocates a new structure if an existing one is not found.
2195
 */
2196
#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2197

2198
/*
2199
 * Structures and routines associated with pagedep caching.
2200
 */
2201
#define	PAGEDEP_HASH(ump, inum, lbn) \
2202
	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2203

2204
static int
2205
pagedep_find(struct pagedep_hashhead *pagedephd,
2206
	ino_t ino,
2207
	ufs_lbn_t lbn,
2208
	struct pagedep **pagedeppp)
2209
{
2210
	struct pagedep *pagedep;
2211

2212
	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2213
		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2214
			*pagedeppp = pagedep;
2215
			return (1);
2216
		}
2217
	}
2218
	*pagedeppp = NULL;
2219
	return (0);
2220
}
2221
/*
2222
 * Look up a pagedep. Return 1 if found, 0 otherwise.
2223
 * If not found, allocate if DEPALLOC flag is passed.
2224
 * Found or allocated entry is returned in pagedeppp.
2225
 */
2226
static int
2227
pagedep_lookup(struct mount *mp,
2228
	struct buf *bp,
2229
	ino_t ino,
2230
	ufs_lbn_t lbn,
2231
	int flags,
2232
	struct pagedep **pagedeppp)
2233
{
2234
	struct pagedep *pagedep;
2235
	struct pagedep_hashhead *pagedephd;
2236
	struct worklist *wk;
2237
	struct ufsmount *ump;
2238
	int ret;
2239
	int i;
2240

2241
	ump = VFSTOUFS(mp);
2242
	LOCK_OWNED(ump);
2243
	if (bp) {
2244
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2245
			if (wk->wk_type == D_PAGEDEP) {
2246
				*pagedeppp = WK_PAGEDEP(wk);
2247
				return (1);
2248
			}
2249
		}
2250
	}
2251
	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2252
	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2253
	if (ret) {
2254
		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2255
			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2256
		return (1);
2257
	}
2258
	if ((flags & DEPALLOC) == 0)
2259
		return (0);
2260
	FREE_LOCK(ump);
2261
	pagedep = malloc(sizeof(struct pagedep),
2262
	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2263
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2264
	ACQUIRE_LOCK(ump);
2265
	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2266
	if (*pagedeppp) {
2267
		/*
2268
		 * This should never happen since we only create pagedeps
2269
		 * with the vnode lock held.  Could be an assert.
2270
		 */
2271
		WORKITEM_FREE(pagedep, D_PAGEDEP);
2272
		return (ret);
2273
	}
2274
	pagedep->pd_ino = ino;
2275
	pagedep->pd_lbn = lbn;
2276
	LIST_INIT(&pagedep->pd_dirremhd);
2277
	LIST_INIT(&pagedep->pd_pendinghd);
2278
	for (i = 0; i < DAHASHSZ; i++)
2279
		LIST_INIT(&pagedep->pd_diraddhd[i]);
2280
	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2281
	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2282
	*pagedeppp = pagedep;
2283
	return (0);
2284
}
2285

2286
/*
2287
 * Structures and routines associated with inodedep caching.
2288
 */
2289
#define	INODEDEP_HASH(ump, inum) \
2290
      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2291

2292
static int
2293
inodedep_find(struct inodedep_hashhead *inodedephd,
2294
	ino_t inum,
2295
	struct inodedep **inodedeppp)
2296
{
2297
	struct inodedep *inodedep;
2298

2299
	LIST_FOREACH(inodedep, inodedephd, id_hash)
2300
		if (inum == inodedep->id_ino)
2301
			break;
2302
	if (inodedep) {
2303
		*inodedeppp = inodedep;
2304
		return (1);
2305
	}
2306
	*inodedeppp = NULL;
2307

2308
	return (0);
2309
}
2310
/*
2311
 * Look up an inodedep. Return 1 if found, 0 if not found.
2312
 * If not found, allocate if DEPALLOC flag is passed.
2313
 * Found or allocated entry is returned in inodedeppp.
2314
 */
2315
static int
2316
inodedep_lookup(struct mount *mp,
2317
	ino_t inum,
2318
	int flags,
2319
	struct inodedep **inodedeppp)
2320
{
2321
	struct inodedep *inodedep;
2322
	struct inodedep_hashhead *inodedephd;
2323
	struct ufsmount *ump;
2324
	struct fs *fs;
2325

2326
	ump = VFSTOUFS(mp);
2327
	LOCK_OWNED(ump);
2328
	fs = ump->um_fs;
2329
	inodedephd = INODEDEP_HASH(ump, inum);
2330

2331
	if (inodedep_find(inodedephd, inum, inodedeppp))
2332
		return (1);
2333
	if ((flags & DEPALLOC) == 0)
2334
		return (0);
2335
	/*
2336
	 * If the system is over its limit and our filesystem is
2337
	 * responsible for more than our share of that usage and
2338
	 * we are not in a rush, request some inodedep cleanup.
2339
	 */
2340
	if (softdep_excess_items(ump, D_INODEDEP))
2341
		schedule_cleanup(mp);
2342
	else
2343
		FREE_LOCK(ump);
2344
	inodedep = malloc(sizeof(struct inodedep),
2345
		M_INODEDEP, M_SOFTDEP_FLAGS);
2346
	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2347
	ACQUIRE_LOCK(ump);
2348
	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2349
		WORKITEM_FREE(inodedep, D_INODEDEP);
2350
		return (1);
2351
	}
2352
	inodedep->id_fs = fs;
2353
	inodedep->id_ino = inum;
2354
	inodedep->id_state = ALLCOMPLETE;
2355
	inodedep->id_nlinkdelta = 0;
2356
	inodedep->id_nlinkwrote = -1;
2357
	inodedep->id_savedino1 = NULL;
2358
	inodedep->id_savedsize = -1;
2359
	inodedep->id_savedextsize = -1;
2360
	inodedep->id_savednlink = -1;
2361
	inodedep->id_bmsafemap = NULL;
2362
	inodedep->id_mkdiradd = NULL;
2363
	LIST_INIT(&inodedep->id_dirremhd);
2364
	LIST_INIT(&inodedep->id_pendinghd);
2365
	LIST_INIT(&inodedep->id_inowait);
2366
	LIST_INIT(&inodedep->id_bufwait);
2367
	TAILQ_INIT(&inodedep->id_inoreflst);
2368
	TAILQ_INIT(&inodedep->id_inoupdt);
2369
	TAILQ_INIT(&inodedep->id_newinoupdt);
2370
	TAILQ_INIT(&inodedep->id_extupdt);
2371
	TAILQ_INIT(&inodedep->id_newextupdt);
2372
	TAILQ_INIT(&inodedep->id_freeblklst);
2373
	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2374
	*inodedeppp = inodedep;
2375
	return (0);
2376
}
2377

2378
/*
2379
 * Structures and routines associated with newblk caching.
2380
 */
2381
#define	NEWBLK_HASH(ump, inum) \
2382
	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2383

2384
static int
2385
newblk_find(struct newblk_hashhead *newblkhd,
2386
	ufs2_daddr_t newblkno,
2387
	int flags,
2388
	struct newblk **newblkpp)
2389
{
2390
	struct newblk *newblk;
2391

2392
	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2393
		if (newblkno != newblk->nb_newblkno)
2394
			continue;
2395
		/*
2396
		 * If we're creating a new dependency don't match those that
2397
		 * have already been converted to allocdirects.  This is for
2398
		 * a frag extend.
2399
		 */
2400
		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2401
			continue;
2402
		break;
2403
	}
2404
	if (newblk) {
2405
		*newblkpp = newblk;
2406
		return (1);
2407
	}
2408
	*newblkpp = NULL;
2409
	return (0);
2410
}
2411

2412
/*
2413
 * Look up a newblk. Return 1 if found, 0 if not found.
2414
 * If not found, allocate if DEPALLOC flag is passed.
2415
 * Found or allocated entry is returned in newblkpp.
2416
 */
2417
static int
2418
newblk_lookup(struct mount *mp,
2419
	ufs2_daddr_t newblkno,
2420
	int flags,
2421
	struct newblk **newblkpp)
2422
{
2423
	struct newblk *newblk;
2424
	struct newblk_hashhead *newblkhd;
2425
	struct ufsmount *ump;
2426

2427
	ump = VFSTOUFS(mp);
2428
	LOCK_OWNED(ump);
2429
	newblkhd = NEWBLK_HASH(ump, newblkno);
2430
	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2431
		return (1);
2432
	if ((flags & DEPALLOC) == 0)
2433
		return (0);
2434
	if (softdep_excess_items(ump, D_NEWBLK) ||
2435
	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
2436
	    softdep_excess_items(ump, D_ALLOCINDIR))
2437
		schedule_cleanup(mp);
2438
	else
2439
		FREE_LOCK(ump);
2440
	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2441
	    M_SOFTDEP_FLAGS | M_ZERO);
2442
	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2443
	ACQUIRE_LOCK(ump);
2444
	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2445
		WORKITEM_FREE(newblk, D_NEWBLK);
2446
		return (1);
2447
	}
2448
	newblk->nb_freefrag = NULL;
2449
	LIST_INIT(&newblk->nb_indirdeps);
2450
	LIST_INIT(&newblk->nb_newdirblk);
2451
	LIST_INIT(&newblk->nb_jwork);
2452
	newblk->nb_state = ATTACHED;
2453
	newblk->nb_newblkno = newblkno;
2454
	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2455
	*newblkpp = newblk;
2456
	return (0);
2457
}
2458

2459
/*
2460
 * Structures and routines associated with freed indirect block caching.
2461
 */
2462
#define	INDIR_HASH(ump, blkno) \
2463
	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2464

2465
/*
2466
 * Lookup an indirect block in the indir hash table.  The freework is
2467
 * removed and potentially freed.  The caller must do a blocking journal
2468
 * write before writing to the blkno.
2469
 */
2470
static int
2471
indirblk_lookup(struct mount *mp, ufs2_daddr_t blkno)
2472
{
2473
	struct freework *freework;
2474
	struct indir_hashhead *wkhd;
2475
	struct ufsmount *ump;
2476

2477
	ump = VFSTOUFS(mp);
2478
	wkhd = INDIR_HASH(ump, blkno);
2479
	TAILQ_FOREACH(freework, wkhd, fw_next) {
2480
		if (freework->fw_blkno != blkno)
2481
			continue;
2482
		indirblk_remove(freework);
2483
		return (1);
2484
	}
2485
	return (0);
2486
}
2487

2488
/*
2489
 * Insert an indirect block represented by freework into the indirblk
2490
 * hash table so that it may prevent the block from being re-used prior
2491
 * to the journal being written.
2492
 */
2493
static void
2494
indirblk_insert(struct freework *freework)
2495
{
2496
	struct jblocks *jblocks;
2497
	struct jseg *jseg;
2498
	struct ufsmount *ump;
2499

2500
	ump = VFSTOUFS(freework->fw_list.wk_mp);
2501
	jblocks = ump->softdep_jblocks;
2502
	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2503
	if (jseg == NULL)
2504
		return;
2505

2506
	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2507
	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2508
	    fw_next);
2509
	freework->fw_state &= ~DEPCOMPLETE;
2510
}
2511

2512
static void
2513
indirblk_remove(struct freework *freework)
2514
{
2515
	struct ufsmount *ump;
2516

2517
	ump = VFSTOUFS(freework->fw_list.wk_mp);
2518
	LIST_REMOVE(freework, fw_segs);
2519
	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2520
	freework->fw_state |= DEPCOMPLETE;
2521
	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2522
		WORKITEM_FREE(freework, D_FREEWORK);
2523
}
2524

2525
/*
2526
 * Executed during filesystem system initialization before
2527
 * mounting any filesystems.
2528
 */
2529
void 
2530
softdep_initialize(void)
2531
{
2532

2533
	TAILQ_INIT(&softdepmounts);
2534
#ifdef __LP64__
2535
	max_softdeps = desiredvnodes * 4;
2536
#else
2537
	max_softdeps = desiredvnodes * 2;
2538
#endif
2539

2540
	/* initialise bioops hack */
2541
	bioops.io_start = softdep_disk_io_initiation;
2542
	bioops.io_complete = softdep_disk_write_complete;
2543
	bioops.io_deallocate = softdep_deallocate_dependencies;
2544
	bioops.io_countdeps = softdep_count_dependencies;
2545
	ast_register(TDA_UFS, ASTR_KCLEAR | ASTR_ASTF_REQUIRED, 0,
2546
	    softdep_ast_cleanup_proc);
2547

2548
	/* Initialize the callout with an mtx. */
2549
	callout_init_mtx(&softdep_callout, &lk, 0);
2550
}
2551

2552
/*
2553
 * Executed after all filesystems have been unmounted during
2554
 * filesystem module unload.
2555
 */
2556
void
2557
softdep_uninitialize(void)
2558
{
2559

2560
	/* clear bioops hack */
2561
	bioops.io_start = NULL;
2562
	bioops.io_complete = NULL;
2563
	bioops.io_deallocate = NULL;
2564
	bioops.io_countdeps = NULL;
2565
	ast_deregister(TDA_UFS);
2566

2567
	callout_drain(&softdep_callout);
2568
}
2569

2570
/*
2571
 * Called at mount time to notify the dependency code that a
2572
 * filesystem wishes to use it.
2573
 */
2574
int
2575
softdep_mount(struct vnode *devvp,
2576
	struct mount *mp,
2577
	struct fs *fs,
2578
	struct ucred *cred)
2579
{
2580
	struct csum_total cstotal;
2581
	struct mount_softdeps *sdp;
2582
	struct ufsmount *ump;
2583
	struct cg *cgp;
2584
	struct buf *bp;
2585
	uint64_t cyl, i;
2586
	int error;
2587

2588
	ump = VFSTOUFS(mp);
2589

2590
	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2591
	    M_WAITOK | M_ZERO);
2592
	rw_init(&sdp->sd_fslock, "SUrw");
2593
	sdp->sd_ump = ump;
2594
	LIST_INIT(&sdp->sd_workitem_pending);
2595
	LIST_INIT(&sdp->sd_journal_pending);
2596
	TAILQ_INIT(&sdp->sd_unlinked);
2597
	LIST_INIT(&sdp->sd_dirtycg);
2598
	sdp->sd_worklist_tail = NULL;
2599
	sdp->sd_on_worklist = 0;
2600
	sdp->sd_deps = 0;
2601
	LIST_INIT(&sdp->sd_mkdirlisthd);
2602
	sdp->sd_pdhash = hashinit(desiredvnodes / 5, M_PAGEDEP,
2603
	    &sdp->sd_pdhashsize);
2604
	sdp->sd_pdnextclean = 0;
2605
	sdp->sd_idhash = hashinit(desiredvnodes, M_INODEDEP,
2606
	    &sdp->sd_idhashsize);
2607
	sdp->sd_idnextclean = 0;
2608
	sdp->sd_newblkhash = hashinit(max_softdeps / 2,  M_NEWBLK,
2609
	    &sdp->sd_newblkhashsize);
2610
	sdp->sd_bmhash = hashinit(1024, M_BMSAFEMAP, &sdp->sd_bmhashsize);
2611
	i = 1 << (ffs(desiredvnodes / 10) - 1);
2612
	sdp->sd_indirhash = malloc(i * sizeof(struct indir_hashhead),
2613
	    M_FREEWORK, M_WAITOK);
2614
	sdp->sd_indirhashsize = i - 1;
2615
	for (i = 0; i <= sdp->sd_indirhashsize; i++)
2616
		TAILQ_INIT(&sdp->sd_indirhash[i]);
2617
	for (i = 0; i <= D_LAST; i++)
2618
		LIST_INIT(&sdp->sd_alldeps[i]);
2619
	ACQUIRE_GBLLOCK(&lk);
2620
	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2621
	FREE_GBLLOCK(&lk);
2622

2623
	ump->um_softdep = sdp;
2624
	MNT_ILOCK(mp);
2625
	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2626
	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2627
		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2628
		    MNTK_SOFTDEP | MNTK_NOASYNC;
2629
	}
2630
	MNT_IUNLOCK(mp);
2631

2632
	if ((fs->fs_flags & FS_SUJ) &&
2633
	    (error = journal_mount(mp, fs, cred)) != 0) {
2634
		printf("%s: failed to start journal: %d\n",
2635
		    mp->mnt_stat.f_mntonname, error);
2636
		softdep_unmount(mp);
2637
		return (error);
2638
	}
2639
	/*
2640
	 * Start our flushing thread in the bufdaemon process.
2641
	 */
2642
	ACQUIRE_LOCK(ump);
2643
	ump->softdep_flags |= FLUSH_STARTING;
2644
	FREE_LOCK(ump);
2645
	error = kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2646
	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2647
	    mp->mnt_stat.f_mntonname);
2648
	ACQUIRE_LOCK(ump);
2649
	if (error != 0) {
2650
		printf("%s: failed to start softdepflush thread: %d\n",
2651
		    mp->mnt_stat.f_mntonname, error);
2652
		ump->softdep_flags &= ~FLUSH_STARTING;
2653
		FREE_LOCK(ump);
2654
		softdep_unmount(mp);
2655
		return (error);
2656
	}
2657
	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2658
		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2659
		    hz / 2);
2660
	}
2661
	FREE_LOCK(ump);
2662
	/*
2663
	 * When doing soft updates, the counters in the
2664
	 * superblock may have gotten out of sync. Recomputation
2665
	 * can take a long time and can be deferred for background
2666
	 * fsck.  However, the old behavior of scanning the cylinder
2667
	 * groups and recalculating them at mount time is available
2668
	 * by setting vfs.ffs.compute_summary_at_mount to one.
2669
	 */
2670
	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2671
		return (0);
2672
	bzero(&cstotal, sizeof cstotal);
2673
	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2674
		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2675
		    fs->fs_cgsize, cred, &bp)) != 0) {
2676
			brelse(bp);
2677
			softdep_unmount(mp);
2678
			return (error);
2679
		}
2680
		cgp = (struct cg *)bp->b_data;
2681
		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2682
		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2683
		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2684
		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2685
		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2686
		brelse(bp);
2687
	}
2688
#ifdef INVARIANTS
2689
	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2690
		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2691
#endif
2692
	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2693
	return (0);
2694
}
2695

2696
void
2697
softdep_unmount(struct mount *mp)
2698
{
2699
	struct ufsmount *ump;
2700
	struct mount_softdeps *ums;
2701

2702
	ump = VFSTOUFS(mp);
2703
	KASSERT(ump->um_softdep != NULL,
2704
	    ("softdep_unmount called on non-softdep filesystem"));
2705
	MNT_ILOCK(mp);
2706
	mp->mnt_flag &= ~MNT_SOFTDEP;
2707
	if ((mp->mnt_flag & MNT_SUJ) == 0) {
2708
		MNT_IUNLOCK(mp);
2709
	} else {
2710
		mp->mnt_flag &= ~MNT_SUJ;
2711
		MNT_IUNLOCK(mp);
2712
		journal_unmount(ump);
2713
	}
2714
	/*
2715
	 * Shut down our flushing thread. Check for NULL is if
2716
	 * softdep_mount errors out before the thread has been created.
2717
	 */
2718
	if (ump->softdep_flushtd != NULL) {
2719
		ACQUIRE_LOCK(ump);
2720
		ump->softdep_flags |= FLUSH_EXIT;
2721
		wakeup(&ump->softdep_flushtd);
2722
		while ((ump->softdep_flags & FLUSH_EXIT) != 0) {
2723
			msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM,
2724
			    "sdwait", 0);
2725
		}
2726
		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2727
		    ("Thread shutdown failed"));
2728
		FREE_LOCK(ump);
2729
	}
2730

2731
	/*
2732
	 * We are no longer have softdep structure attached to ump.
2733
	 */
2734
	ums = ump->um_softdep;
2735
	ACQUIRE_GBLLOCK(&lk);
2736
	TAILQ_REMOVE(&softdepmounts, ums, sd_next);
2737
	FREE_GBLLOCK(&lk);
2738
	ump->um_softdep = NULL;
2739

2740
	KASSERT(ums->sd_on_journal == 0,
2741
	    ("ump %p ums %p on_journal %d", ump, ums, ums->sd_on_journal));
2742
	KASSERT(ums->sd_on_worklist == 0,
2743
	    ("ump %p ums %p on_worklist %d", ump, ums, ums->sd_on_worklist));
2744
	KASSERT(ums->sd_deps == 0,
2745
	    ("ump %p ums %p deps %d", ump, ums, ums->sd_deps));
2746

2747
	/*
2748
	 * Free up our resources.
2749
	 */
2750
	rw_destroy(&ums->sd_fslock);
2751
	hashdestroy(ums->sd_pdhash, M_PAGEDEP, ums->sd_pdhashsize);
2752
	hashdestroy(ums->sd_idhash, M_INODEDEP, ums->sd_idhashsize);
2753
	hashdestroy(ums->sd_newblkhash, M_NEWBLK, ums->sd_newblkhashsize);
2754
	hashdestroy(ums->sd_bmhash, M_BMSAFEMAP, ums->sd_bmhashsize);
2755
	free(ums->sd_indirhash, M_FREEWORK);
2756
#ifdef INVARIANTS
2757
	for (int i = 0; i <= D_LAST; i++) {
2758
		KASSERT(ums->sd_curdeps[i] == 0,
2759
		    ("Unmount %s: Dep type %s != 0 (%jd)", ump->um_fs->fs_fsmnt,
2760
		    TYPENAME(i), (intmax_t)ums->sd_curdeps[i]));
2761
		KASSERT(LIST_EMPTY(&ums->sd_alldeps[i]),
2762
		    ("Unmount %s: Dep type %s not empty (%p)",
2763
		    ump->um_fs->fs_fsmnt,
2764
		    TYPENAME(i), LIST_FIRST(&ums->sd_alldeps[i])));
2765
	}
2766
#endif
2767
	free(ums, M_MOUNTDATA);
2768
}
2769

2770
static struct jblocks *
2771
jblocks_create(void)
2772
{
2773
	struct jblocks *jblocks;
2774

2775
	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2776
	TAILQ_INIT(&jblocks->jb_segs);
2777
	jblocks->jb_avail = 10;
2778
	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2779
	    M_JBLOCKS, M_WAITOK | M_ZERO);
2780

2781
	return (jblocks);
2782
}
2783

2784
static ufs2_daddr_t
2785
jblocks_alloc(struct jblocks *jblocks,
2786
	int bytes,
2787
	int *actual)
2788
{
2789
	ufs2_daddr_t daddr;
2790
	struct jextent *jext;
2791
	int freecnt;
2792
	int blocks;
2793

2794
	blocks = bytes / DEV_BSIZE;
2795
	jext = &jblocks->jb_extent[jblocks->jb_head];
2796
	freecnt = jext->je_blocks - jblocks->jb_off;
2797
	if (freecnt == 0) {
2798
		jblocks->jb_off = 0;
2799
		if (++jblocks->jb_head > jblocks->jb_used)
2800
			jblocks->jb_head = 0;
2801
		jext = &jblocks->jb_extent[jblocks->jb_head];
2802
		freecnt = jext->je_blocks;
2803
	}
2804
	if (freecnt > blocks)
2805
		freecnt = blocks;
2806
	*actual = freecnt * DEV_BSIZE;
2807
	daddr = jext->je_daddr + jblocks->jb_off;
2808
	jblocks->jb_off += freecnt;
2809
	jblocks->jb_free -= freecnt;
2810

2811
	return (daddr);
2812
}
2813

2814
static void
2815
jblocks_free(struct jblocks *jblocks,
2816
	struct mount *mp,
2817
	int bytes)
2818
{
2819

2820
	LOCK_OWNED(VFSTOUFS(mp));
2821
	jblocks->jb_free += bytes / DEV_BSIZE;
2822
	if (jblocks->jb_suspended)
2823
		worklist_speedup(mp);
2824
	wakeup(jblocks);
2825
}
2826

2827
static void
2828
jblocks_destroy(struct jblocks *jblocks)
2829
{
2830

2831
	if (jblocks->jb_extent)
2832
		free(jblocks->jb_extent, M_JBLOCKS);
2833
	free(jblocks, M_JBLOCKS);
2834
}
2835

2836
static void
2837
jblocks_add(struct jblocks *jblocks,
2838
	ufs2_daddr_t daddr,
2839
	int blocks)
2840
{
2841
	struct jextent *jext;
2842

2843
	jblocks->jb_blocks += blocks;
2844
	jblocks->jb_free += blocks;
2845
	jext = &jblocks->jb_extent[jblocks->jb_used];
2846
	/* Adding the first block. */
2847
	if (jext->je_daddr == 0) {
2848
		jext->je_daddr = daddr;
2849
		jext->je_blocks = blocks;
2850
		return;
2851
	}
2852
	/* Extending the last extent. */
2853
	if (jext->je_daddr + jext->je_blocks == daddr) {
2854
		jext->je_blocks += blocks;
2855
		return;
2856
	}
2857
	/* Adding a new extent. */
2858
	if (++jblocks->jb_used == jblocks->jb_avail) {
2859
		jblocks->jb_avail *= 2;
2860
		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2861
		    M_JBLOCKS, M_WAITOK | M_ZERO);
2862
		memcpy(jext, jblocks->jb_extent,
2863
		    sizeof(struct jextent) * jblocks->jb_used);
2864
		free(jblocks->jb_extent, M_JBLOCKS);
2865
		jblocks->jb_extent = jext;
2866
	}
2867
	jext = &jblocks->jb_extent[jblocks->jb_used];
2868
	jext->je_daddr = daddr;
2869
	jext->je_blocks = blocks;
2870
	return;
2871
}
2872

2873
int
2874
softdep_journal_lookup(struct mount *mp, struct vnode **vpp)
2875
{
2876
	struct componentname cnp;
2877
	struct vnode *dvp;
2878
	ino_t sujournal;
2879
	int error;
2880

2881
	error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2882
	if (error)
2883
		return (error);
2884
	bzero(&cnp, sizeof(cnp));
2885
	cnp.cn_nameiop = LOOKUP;
2886
	cnp.cn_flags = ISLASTCN;
2887
	cnp.cn_cred = curthread->td_ucred;
2888
	cnp.cn_pnbuf = SUJ_FILE;
2889
	cnp.cn_nameptr = SUJ_FILE;
2890
	cnp.cn_namelen = strlen(SUJ_FILE);
2891
	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2892
	vput(dvp);
2893
	if (error != 0)
2894
		return (error);
2895
	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2896
	return (error);
2897
}
2898

2899
/*
2900
 * Open and verify the journal file.
2901
 */
2902
static int
2903
journal_mount(struct mount *mp,
2904
	struct fs *fs,
2905
	struct ucred *cred)
2906
{
2907
	struct jblocks *jblocks;
2908
	struct ufsmount *ump;
2909
	struct vnode *vp;
2910
	struct inode *ip;
2911
	ufs2_daddr_t blkno;
2912
	int bcount;
2913
	int error;
2914
	int i;
2915

2916
	ump = VFSTOUFS(mp);
2917
	ump->softdep_journal_tail = NULL;
2918
	ump->softdep_on_journal = 0;
2919
	ump->softdep_accdeps = 0;
2920
	ump->softdep_req = 0;
2921
	ump->softdep_jblocks = NULL;
2922
	error = softdep_journal_lookup(mp, &vp);
2923
	if (error != 0) {
2924
		printf("Failed to find journal.  Use tunefs to create one\n");
2925
		return (error);
2926
	}
2927
	ip = VTOI(vp);
2928
	if (ip->i_size < SUJ_MIN) {
2929
		error = ENOSPC;
2930
		goto out;
2931
	}
2932
	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2933
	jblocks = jblocks_create();
2934
	for (i = 0; i < bcount; i++) {
2935
		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2936
		if (error)
2937
			break;
2938
		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2939
	}
2940
	if (error) {
2941
		jblocks_destroy(jblocks);
2942
		goto out;
2943
	}
2944
	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2945
	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2946
	ump->softdep_jblocks = jblocks;
2947

2948
	MNT_ILOCK(mp);
2949
	mp->mnt_flag |= MNT_SUJ;
2950
	MNT_IUNLOCK(mp);
2951

2952
	/*
2953
	 * Only validate the journal contents if the
2954
	 * filesystem is clean, otherwise we write the logs
2955
	 * but they'll never be used.  If the filesystem was
2956
	 * still dirty when we mounted it the journal is
2957
	 * invalid and a new journal can only be valid if it
2958
	 * starts from a clean mount.
2959
	 */
2960
	if (fs->fs_clean) {
2961
		DIP_SET(ip, i_modrev, fs->fs_mtime);
2962
		ip->i_flags |= IN_MODIFIED;
2963
		ffs_update(vp, 1);
2964
	}
2965
out:
2966
	vput(vp);
2967
	return (error);
2968
}
2969

2970
static void
2971
journal_unmount(struct ufsmount *ump)
2972
{
2973

2974
	if (ump->softdep_jblocks)
2975
		jblocks_destroy(ump->softdep_jblocks);
2976
	ump->softdep_jblocks = NULL;
2977
}
2978

2979
/*
2980
 * Called when a journal record is ready to be written.  Space is allocated
2981
 * and the journal entry is created when the journal is flushed to stable
2982
 * store.
2983
 */
2984
static void
2985
add_to_journal(struct worklist *wk)
2986
{
2987
	struct ufsmount *ump;
2988

2989
	ump = VFSTOUFS(wk->wk_mp);
2990
	LOCK_OWNED(ump);
2991
	if (wk->wk_state & ONWORKLIST)
2992
		panic("add_to_journal: %s(0x%X) already on list",
2993
		    TYPENAME(wk->wk_type), wk->wk_state);
2994
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2995
	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2996
		ump->softdep_jblocks->jb_age = ticks;
2997
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2998
	} else
2999
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
3000
	ump->softdep_journal_tail = wk;
3001
	ump->softdep_on_journal += 1;
3002
}
3003

3004
/*
3005
 * Remove an arbitrary item for the journal worklist maintain the tail
3006
 * pointer.  This happens when a new operation obviates the need to
3007
 * journal an old operation.
3008
 */
3009
static void
3010
remove_from_journal(struct worklist *wk)
3011
{
3012
	struct ufsmount *ump;
3013

3014
	ump = VFSTOUFS(wk->wk_mp);
3015
	LOCK_OWNED(ump);
3016
#ifdef INVARIANTS
3017
	{
3018
		struct worklist *wkn;
3019

3020
		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
3021
			if (wkn == wk)
3022
				break;
3023
		if (wkn == NULL)
3024
			panic("remove_from_journal: %p is not in journal", wk);
3025
	}
3026
#endif
3027
	/*
3028
	 * We emulate a TAILQ to save space in most structures which do not
3029
	 * require TAILQ semantics.  Here we must update the tail position
3030
	 * when removing the tail which is not the final entry. This works
3031
	 * only if the worklist linkage are at the beginning of the structure.
3032
	 */
3033
	if (ump->softdep_journal_tail == wk)
3034
		ump->softdep_journal_tail =
3035
		    (struct worklist *)wk->wk_list.le_prev;
3036
	WORKLIST_REMOVE(wk);
3037
	ump->softdep_on_journal -= 1;
3038
}
3039

3040
/*
3041
 * Check for journal space as well as dependency limits so the prelink
3042
 * code can throttle both journaled and non-journaled filesystems.
3043
 * Threshold is 0 for low and 1 for min.
3044
 */
3045
static int
3046
journal_space(struct ufsmount *ump, int thresh)
3047
{
3048
	struct jblocks *jblocks;
3049
	int limit, avail;
3050

3051
	jblocks = ump->softdep_jblocks;
3052
	if (jblocks == NULL)
3053
		return (1);
3054
	/*
3055
	 * We use a tighter restriction here to prevent request_cleanup()
3056
	 * running in threads from running into locks we currently hold.
3057
	 * We have to be over the limit and our filesystem has to be
3058
	 * responsible for more than our share of that usage.
3059
	 */
3060
	limit = (max_softdeps / 10) * 9;
3061
	if (dep_current[D_INODEDEP] > limit &&
3062
	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
3063
		return (0);
3064
	if (thresh)
3065
		thresh = jblocks->jb_min;
3066
	else
3067
		thresh = jblocks->jb_low;
3068
	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
3069
	avail = jblocks->jb_free - avail;
3070

3071
	return (avail > thresh);
3072
}
3073

3074
static void
3075
journal_suspend(struct ufsmount *ump)
3076
{
3077
	struct jblocks *jblocks;
3078
	struct mount *mp;
3079
	bool set;
3080

3081
	mp = UFSTOVFS(ump);
3082
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
3083
		return;
3084

3085
	jblocks = ump->softdep_jblocks;
3086
	vfs_op_enter(mp);
3087
	set = false;
3088
	MNT_ILOCK(mp);
3089
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
3090
		stat_journal_min++;
3091
		mp->mnt_kern_flag |= MNTK_SUSPEND;
3092
		mp->mnt_susp_owner = ump->softdep_flushtd;
3093
		set = true;
3094
	}
3095
	jblocks->jb_suspended = 1;
3096
	MNT_IUNLOCK(mp);
3097
	if (!set)
3098
		vfs_op_exit(mp);
3099
}
3100

3101
static int
3102
journal_unsuspend(struct ufsmount *ump)
3103
{
3104
	struct jblocks *jblocks;
3105
	struct mount *mp;
3106

3107
	mp = UFSTOVFS(ump);
3108
	jblocks = ump->softdep_jblocks;
3109

3110
	if (jblocks != NULL && jblocks->jb_suspended &&
3111
	    journal_space(ump, jblocks->jb_min)) {
3112
		jblocks->jb_suspended = 0;
3113
		FREE_LOCK(ump);
3114
		mp->mnt_susp_owner = curthread;
3115
		vfs_write_resume(mp, 0);
3116
		ACQUIRE_LOCK(ump);
3117
		return (1);
3118
	}
3119
	return (0);
3120
}
3121

3122
static void
3123
journal_check_space(struct ufsmount *ump)
3124
{
3125
	struct mount *mp;
3126

3127
	LOCK_OWNED(ump);
3128

3129
	if (journal_space(ump, 0) == 0) {
3130
		softdep_speedup(ump);
3131
		mp = UFSTOVFS(ump);
3132
		FREE_LOCK(ump);
3133
		VFS_SYNC(mp, MNT_NOWAIT);
3134
		ffs_sbupdate(ump, MNT_WAIT, 0);
3135
		ACQUIRE_LOCK(ump);
3136
		if (journal_space(ump, 1) == 0)
3137
			journal_suspend(ump);
3138
	}
3139
}
3140

3141
/*
3142
 * Called before any allocation function to be certain that there is
3143
 * sufficient space in the journal prior to creating any new records.
3144
 * Since in the case of block allocation we may have multiple locked
3145
 * buffers at the time of the actual allocation we can not block
3146
 * when the journal records are created.  Doing so would create a deadlock
3147
 * if any of these buffers needed to be flushed to reclaim space.  Instead
3148
 * we require a sufficiently large amount of available space such that
3149
 * each thread in the system could have passed this allocation check and
3150
 * still have sufficient free space.  With 20% of a minimum journal size
3151
 * of 1MB we have 6553 records available.
3152
 */
3153
int
3154
softdep_prealloc(struct vnode *vp, int waitok)
3155
{
3156
	struct ufsmount *ump;
3157

3158
	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
3159
	    ("softdep_prealloc called on non-softdep filesystem"));
3160
	/*
3161
	 * Nothing to do if we are not running journaled soft updates.
3162
	 * If we currently hold the snapshot lock, we must avoid
3163
	 * handling other resources that could cause deadlock.  Do not
3164
	 * touch quotas vnode since it is typically recursed with
3165
	 * other vnode locks held.
3166
	 */
3167
	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3168
	    (vp->v_vflag & VV_SYSTEM) != 0)
3169
		return (0);
3170
	ump = VFSTOUFS(vp->v_mount);
3171
	ACQUIRE_LOCK(ump);
3172
	if (journal_space(ump, 0)) {
3173
		FREE_LOCK(ump);
3174
		return (0);
3175
	}
3176
	stat_journal_low++;
3177
	FREE_LOCK(ump);
3178
	if (waitok == MNT_NOWAIT)
3179
		return (ENOSPC);
3180
	/*
3181
	 * Attempt to sync this vnode once to flush any journal
3182
	 * work attached to it.
3183
	 */
3184
	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3185
		ffs_syncvnode(vp, waitok, 0);
3186
	ACQUIRE_LOCK(ump);
3187
	process_removes(vp);
3188
	process_truncates(vp);
3189
	journal_check_space(ump);
3190
	FREE_LOCK(ump);
3191

3192
	return (0);
3193
}
3194

3195
/*
3196
 * Try hard to sync all data and metadata for the vnode, and workitems
3197
 * flushing which might conflict with the vnode lock.  This is a
3198
 * helper for softdep_prerename().
3199
 */
3200
static int
3201
softdep_prerename_vnode(struct ufsmount *ump, struct vnode *vp)
3202
{
3203
	int error;
3204

3205
	ASSERT_VOP_ELOCKED(vp, "prehandle");
3206
	if (vp->v_data == NULL)
3207
		return (0);
3208
	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
3209
	if (error != 0)
3210
		return (error);
3211
	ACQUIRE_LOCK(ump);
3212
	process_removes(vp);
3213
	process_truncates(vp);
3214
	FREE_LOCK(ump);
3215
	return (0);
3216
}
3217

3218
/*
3219
 * Must be called from VOP_RENAME() after all vnodes are locked.
3220
 * Ensures that there is enough journal space for rename.  It is
3221
 * sufficiently different from softdep_prelink() by having to handle
3222
 * four vnodes.
3223
 */
3224
int
3225
softdep_prerename(struct vnode *fdvp,
3226
	struct vnode *fvp,
3227
	struct vnode *tdvp,
3228
	struct vnode *tvp)
3229
{
3230
	struct ufsmount *ump;
3231
	int error;
3232

3233
	ump = VFSTOUFS(fdvp->v_mount);
3234

3235
	if (journal_space(ump, 0))
3236
		return (0);
3237

3238
	VOP_UNLOCK(tdvp);
3239
	VOP_UNLOCK(fvp);
3240
	if (tvp != NULL && tvp != tdvp)
3241
		VOP_UNLOCK(tvp);
3242

3243
	error = softdep_prerename_vnode(ump, fdvp);
3244
	VOP_UNLOCK(fdvp);
3245
	if (error != 0)
3246
		return (error);
3247

3248
	VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
3249
	error = softdep_prerename_vnode(ump, fvp);
3250
	VOP_UNLOCK(fvp);
3251
	if (error != 0)
3252
		return (error);
3253

3254
	if (tdvp != fdvp) {
3255
		VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
3256
		error = softdep_prerename_vnode(ump, tdvp);
3257
		VOP_UNLOCK(tdvp);
3258
		if (error != 0)
3259
			return (error);
3260
	}
3261

3262
	if (tvp != fvp && tvp != NULL) {
3263
		VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
3264
		error = softdep_prerename_vnode(ump, tvp);
3265
		VOP_UNLOCK(tvp);
3266
		if (error != 0)
3267
			return (error);
3268
	}
3269

3270
	ACQUIRE_LOCK(ump);
3271
	softdep_speedup(ump);
3272
	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3273
	journal_check_space(ump);
3274
	FREE_LOCK(ump);
3275
	return (ERELOOKUP);
3276
}
3277

3278
/*
3279
 * Before adjusting a link count on a vnode verify that we have sufficient
3280
 * journal space.  If not, process operations that depend on the currently
3281
 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3282
 * and softdep flush threads can not acquire these locks to reclaim space.
3283
 *
3284
 * Returns 0 if all owned locks are still valid and were not dropped
3285
 * in the process, in other case it returns either an error from sync,
3286
 * or ERELOOKUP if any of the locks were re-acquired.  In the later
3287
 * case, the state of the vnodes cannot be relied upon and our VFS
3288
 * syscall must be restarted at top level from the lookup.
3289
 */
3290
int
3291
softdep_prelink(struct vnode *dvp,
3292
	struct vnode *vp,
3293
	struct componentname *cnp)
3294
{
3295
	struct ufsmount *ump;
3296
	struct nameidata *ndp;
3297

3298
	ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
3299
	if (vp != NULL)
3300
		ASSERT_VOP_ELOCKED(vp, "prelink vp");
3301
	ump = VFSTOUFS(dvp->v_mount);
3302

3303
	/*
3304
	 * Nothing to do if we have sufficient journal space.  We skip
3305
	 * flushing when vp is a snapshot to avoid deadlock where
3306
	 * another thread is trying to update the inodeblock for dvp
3307
	 * and is waiting on snaplk that vp holds.
3308
	 */
3309
	if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp))))
3310
		return (0);
3311

3312
	/*
3313
	 * Check if the journal space consumption can in theory be
3314
	 * accounted on dvp and vp.  If the vnodes metadata was not
3315
	 * changed comparing with the previous round-trip into
3316
	 * softdep_prelink(), as indicated by the seqc generation
3317
	 * recorded in the nameidata, then there is no point in
3318
	 * starting the sync.
3319
	 */
3320
	ndp = __containerof(cnp, struct nameidata, ni_cnd);
3321
	if (!seqc_in_modify(ndp->ni_dvp_seqc) &&
3322
	    vn_seqc_consistent(dvp, ndp->ni_dvp_seqc) &&
3323
	    (vp == NULL || (!seqc_in_modify(ndp->ni_vp_seqc) &&
3324
	    vn_seqc_consistent(vp, ndp->ni_vp_seqc))))
3325
		return (0);
3326

3327
	stat_journal_low++;
3328
	if (vp != NULL) {
3329
		VOP_UNLOCK(dvp);
3330
		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3331
		vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, true, LK_EXCLUSIVE);
3332
		if (dvp->v_data == NULL)
3333
			goto out;
3334
	}
3335
	if (vp != NULL)
3336
		VOP_UNLOCK(vp);
3337
	ffs_syncvnode(dvp, MNT_WAIT, 0);
3338
	/* Process vp before dvp as it may create .. removes. */
3339
	if (vp != NULL) {
3340
		VOP_UNLOCK(dvp);
3341
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3342
		if (vp->v_data == NULL) {
3343
			vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, true,
3344
			    LK_EXCLUSIVE);
3345
			goto out;
3346
		}
3347
		ACQUIRE_LOCK(ump);
3348
		process_removes(vp);
3349
		process_truncates(vp);
3350
		FREE_LOCK(ump);
3351
		VOP_UNLOCK(vp);
3352
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
3353
		if (dvp->v_data == NULL) {
3354
			vn_lock_pair(dvp, true, LK_EXCLUSIVE, vp, false,
3355
			    LK_EXCLUSIVE);
3356
			goto out;
3357
		}
3358
	}
3359

3360
	ACQUIRE_LOCK(ump);
3361
	process_removes(dvp);
3362
	process_truncates(dvp);
3363
	VOP_UNLOCK(dvp);
3364
	softdep_speedup(ump);
3365

3366
	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3367
	journal_check_space(ump);
3368
	FREE_LOCK(ump);
3369

3370
	vn_lock_pair(dvp, false, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
3371
out:
3372
	ndp->ni_dvp_seqc = vn_seqc_read_any(dvp);
3373
	if (vp != NULL)
3374
		ndp->ni_vp_seqc = vn_seqc_read_any(vp);
3375
	return (ERELOOKUP);
3376
}
3377

3378
static void
3379
jseg_write(struct ufsmount *ump,
3380
	struct jseg *jseg,
3381
	uint8_t *data)
3382
{
3383
	struct jsegrec *rec;
3384

3385
	rec = (struct jsegrec *)data;
3386
	rec->jsr_seq = jseg->js_seq;
3387
	rec->jsr_oldest = jseg->js_oldseq;
3388
	rec->jsr_cnt = jseg->js_cnt;
3389
	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3390
	rec->jsr_crc = 0;
3391
	rec->jsr_time = ump->um_fs->fs_mtime;
3392
}
3393

3394
static inline void
3395
inoref_write(struct inoref *inoref,
3396
	struct jseg *jseg,
3397
	struct jrefrec *rec)
3398
{
3399

3400
	inoref->if_jsegdep->jd_seg = jseg;
3401
	rec->jr_ino = inoref->if_ino;
3402
	rec->jr_parent = inoref->if_parent;
3403
	rec->jr_nlink = inoref->if_nlink;
3404
	rec->jr_mode = inoref->if_mode;
3405
	rec->jr_diroff = inoref->if_diroff;
3406
}
3407

3408
static void
3409
jaddref_write(struct jaddref *jaddref,
3410
	struct jseg *jseg,
3411
	uint8_t *data)
3412
{
3413
	struct jrefrec *rec;
3414

3415
	rec = (struct jrefrec *)data;
3416
	rec->jr_op = JOP_ADDREF;
3417
	inoref_write(&jaddref->ja_ref, jseg, rec);
3418
}
3419

3420
static void
3421
jremref_write(struct jremref *jremref,
3422
	struct jseg *jseg,
3423
	uint8_t *data)
3424
{
3425
	struct jrefrec *rec;
3426

3427
	rec = (struct jrefrec *)data;
3428
	rec->jr_op = JOP_REMREF;
3429
	inoref_write(&jremref->jr_ref, jseg, rec);
3430
}
3431

3432
static void
3433
jmvref_write(struct jmvref *jmvref,
3434
	struct jseg *jseg,
3435
	uint8_t *data)
3436
{
3437
	struct jmvrec *rec;
3438

3439
	rec = (struct jmvrec *)data;
3440
	rec->jm_op = JOP_MVREF;
3441
	rec->jm_ino = jmvref->jm_ino;
3442
	rec->jm_parent = jmvref->jm_parent;
3443
	rec->jm_oldoff = jmvref->jm_oldoff;
3444
	rec->jm_newoff = jmvref->jm_newoff;
3445
}
3446

3447
static void
3448
jnewblk_write(struct jnewblk *jnewblk,
3449
	struct jseg *jseg,
3450
	uint8_t *data)
3451
{
3452
	struct jblkrec *rec;
3453

3454
	jnewblk->jn_jsegdep->jd_seg = jseg;
3455
	rec = (struct jblkrec *)data;
3456
	rec->jb_op = JOP_NEWBLK;
3457
	rec->jb_ino = jnewblk->jn_ino;
3458
	rec->jb_blkno = jnewblk->jn_blkno;
3459
	rec->jb_lbn = jnewblk->jn_lbn;
3460
	rec->jb_frags = jnewblk->jn_frags;
3461
	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3462
}
3463

3464
static void
3465
jfreeblk_write(struct jfreeblk *jfreeblk,
3466
	struct jseg *jseg,
3467
	uint8_t *data)
3468
{
3469
	struct jblkrec *rec;
3470

3471
	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3472
	rec = (struct jblkrec *)data;
3473
	rec->jb_op = JOP_FREEBLK;
3474
	rec->jb_ino = jfreeblk->jf_ino;
3475
	rec->jb_blkno = jfreeblk->jf_blkno;
3476
	rec->jb_lbn = jfreeblk->jf_lbn;
3477
	rec->jb_frags = jfreeblk->jf_frags;
3478
	rec->jb_oldfrags = 0;
3479
}
3480

3481
static void
3482
jfreefrag_write(struct jfreefrag *jfreefrag,
3483
	struct jseg *jseg,
3484
	uint8_t *data)
3485
{
3486
	struct jblkrec *rec;
3487

3488
	jfreefrag->fr_jsegdep->jd_seg = jseg;
3489
	rec = (struct jblkrec *)data;
3490
	rec->jb_op = JOP_FREEBLK;
3491
	rec->jb_ino = jfreefrag->fr_ino;
3492
	rec->jb_blkno = jfreefrag->fr_blkno;
3493
	rec->jb_lbn = jfreefrag->fr_lbn;
3494
	rec->jb_frags = jfreefrag->fr_frags;
3495
	rec->jb_oldfrags = 0;
3496
}
3497

3498
static void
3499
jtrunc_write(struct jtrunc *jtrunc,
3500
	struct jseg *jseg,
3501
	uint8_t *data)
3502
{
3503
	struct jtrncrec *rec;
3504

3505
	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3506
	rec = (struct jtrncrec *)data;
3507
	rec->jt_op = JOP_TRUNC;
3508
	rec->jt_ino = jtrunc->jt_ino;
3509
	rec->jt_size = jtrunc->jt_size;
3510
	rec->jt_extsize = jtrunc->jt_extsize;
3511
}
3512

3513
static void
3514
jfsync_write(struct jfsync *jfsync,
3515
	struct jseg *jseg,
3516
	uint8_t *data)
3517
{
3518
	struct jtrncrec *rec;
3519

3520
	rec = (struct jtrncrec *)data;
3521
	rec->jt_op = JOP_SYNC;
3522
	rec->jt_ino = jfsync->jfs_ino;
3523
	rec->jt_size = jfsync->jfs_size;
3524
	rec->jt_extsize = jfsync->jfs_extsize;
3525
}
3526

3527
static void
3528
softdep_flushjournal(struct mount *mp)
3529
{
3530
	struct jblocks *jblocks;
3531
	struct ufsmount *ump;
3532

3533
	if (MOUNTEDSUJ(mp) == 0)
3534
		return;
3535
	ump = VFSTOUFS(mp);
3536
	jblocks = ump->softdep_jblocks;
3537
	ACQUIRE_LOCK(ump);
3538
	while (ump->softdep_on_journal) {
3539
		jblocks->jb_needseg = 1;
3540
		softdep_process_journal(mp, NULL, MNT_WAIT);
3541
	}
3542
	FREE_LOCK(ump);
3543
}
3544

3545
static void softdep_synchronize_completed(struct bio *);
3546
static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3547

3548
static void
3549
softdep_synchronize_completed(struct bio *bp)
3550
{
3551
	struct jseg *oldest;
3552
	struct jseg *jseg;
3553
	struct ufsmount *ump;
3554

3555
	/*
3556
	 * caller1 marks the last segment written before we issued the
3557
	 * synchronize cache.
3558
	 */
3559
	jseg = bp->bio_caller1;
3560
	if (jseg == NULL) {
3561
		g_destroy_bio(bp);
3562
		return;
3563
	}
3564
	ump = VFSTOUFS(jseg->js_list.wk_mp);
3565
	ACQUIRE_LOCK(ump);
3566
	oldest = NULL;
3567
	/*
3568
	 * Mark all the journal entries waiting on the synchronize cache
3569
	 * as completed so they may continue on.
3570
	 */
3571
	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3572
		jseg->js_state |= COMPLETE;
3573
		oldest = jseg;
3574
		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3575
	}
3576
	/*
3577
	 * Restart deferred journal entry processing from the oldest
3578
	 * completed jseg.
3579
	 */
3580
	if (oldest)
3581
		complete_jsegs(oldest);
3582

3583
	FREE_LOCK(ump);
3584
	g_destroy_bio(bp);
3585
}
3586

3587
/*
3588
 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3589
 * barriers.  The journal must be written prior to any blocks that depend
3590
 * on it and the journal can not be released until the blocks have be
3591
 * written.  This code handles both barriers simultaneously.
3592
 */
3593
static void
3594
softdep_synchronize(struct bio *bp,
3595
	struct ufsmount *ump,
3596
	void *caller1)
3597
{
3598

3599
	bp->bio_cmd = BIO_FLUSH;
3600
	bp->bio_flags |= BIO_ORDERED;
3601
	bp->bio_data = NULL;
3602
	bp->bio_offset = ump->um_cp->provider->mediasize;
3603
	bp->bio_length = 0;
3604
	bp->bio_done = softdep_synchronize_completed;
3605
	bp->bio_caller1 = caller1;
3606
	g_io_request(bp, ump->um_cp);
3607
}
3608

3609
/*
3610
 * Flush some journal records to disk.
3611
 */
3612
static void
3613
softdep_process_journal(struct mount *mp,
3614
	struct worklist *needwk,
3615
	int flags)
3616
{
3617
	struct jblocks *jblocks;
3618
	struct ufsmount *ump;
3619
	struct worklist *wk;
3620
	struct jseg *jseg;
3621
	struct buf *bp;
3622
	struct bio *bio;
3623
	uint8_t *data;
3624
	struct fs *fs;
3625
	int shouldflush;
3626
	int segwritten;
3627
	int jrecmin;	/* Minimum records per block. */
3628
	int jrecmax;	/* Maximum records per block. */
3629
	int size;
3630
	int cnt;
3631
	int off;
3632
	int devbsize;
3633
	int savef;
3634

3635
	ump = VFSTOUFS(mp);
3636
	if (ump->um_softdep == NULL || ump->um_softdep->sd_jblocks == NULL)
3637
		return;
3638
	shouldflush = softdep_flushcache;
3639
	bio = NULL;
3640
	jseg = NULL;
3641
	LOCK_OWNED(ump);
3642
	fs = ump->um_fs;
3643
	jblocks = ump->softdep_jblocks;
3644
	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3645
	savef = curthread_pflags_set(TDP_NORUNNINGBUF);
3646

3647
	/*
3648
	 * We write anywhere between a disk block and fs block.  The upper
3649
	 * bound is picked to prevent buffer cache fragmentation and limit
3650
	 * processing time per I/O.
3651
	 */
3652
	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3653
	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3654
	segwritten = 0;
3655
	for (;;) {
3656
		cnt = ump->softdep_on_journal;
3657
		/*
3658
		 * Criteria for writing a segment:
3659
		 * 1) We have a full block.
3660
		 * 2) We're called from jwait() and haven't found the
3661
		 *    journal item yet.
3662
		 * 3) Always write if needseg is set.
3663
		 * 4) If we are called from process_worklist and have
3664
		 *    not yet written anything we write a partial block
3665
		 *    to enforce a 1 second maximum latency on journal
3666
		 *    entries.
3667
		 */
3668
		if (cnt < (jrecmax - 1) && needwk == NULL &&
3669
		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3670
			break;
3671
		cnt++;
3672
		/*
3673
		 * Verify some free journal space.  softdep_prealloc() should
3674
		 * guarantee that we don't run out so this is indicative of
3675
		 * a problem with the flow control.  Try to recover
3676
		 * gracefully in any event.
3677
		 */
3678
		while (jblocks->jb_free == 0) {
3679
			if (flags != MNT_WAIT)
3680
				break;
3681
			printf("softdep: Out of journal space!\n");
3682
			softdep_speedup(ump);
3683
			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3684
		}
3685
		FREE_LOCK(ump);
3686
		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3687
		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3688
		LIST_INIT(&jseg->js_entries);
3689
		LIST_INIT(&jseg->js_indirs);
3690
		jseg->js_state = ATTACHED;
3691
		if (shouldflush == 0)
3692
			jseg->js_state |= COMPLETE;
3693
		else if (bio == NULL)
3694
			bio = g_alloc_bio();
3695
		jseg->js_jblocks = jblocks;
3696
		bp = geteblk(fs->fs_bsize, 0);
3697
		ACQUIRE_LOCK(ump);
3698
		/*
3699
		 * If there was a race while we were allocating the block
3700
		 * and jseg the entry we care about was likely written.
3701
		 * We bail out in both the WAIT and NOWAIT case and assume
3702
		 * the caller will loop if the entry it cares about is
3703
		 * not written.
3704
		 */
3705
		cnt = ump->softdep_on_journal;
3706
		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3707
			bp->b_flags |= B_INVAL | B_NOCACHE;
3708
			WORKITEM_FREE(jseg, D_JSEG);
3709
			FREE_LOCK(ump);
3710
			brelse(bp);
3711
			ACQUIRE_LOCK(ump);
3712
			break;
3713
		}
3714
		/*
3715
		 * Calculate the disk block size required for the available
3716
		 * records rounded to the min size.
3717
		 */
3718
		if (cnt == 0)
3719
			size = devbsize;
3720
		else if (cnt < jrecmax)
3721
			size = howmany(cnt, jrecmin) * devbsize;
3722
		else
3723
			size = fs->fs_bsize;
3724
		/*
3725
		 * Allocate a disk block for this journal data and account
3726
		 * for truncation of the requested size if enough contiguous
3727
		 * space was not available.
3728
		 */
3729
		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3730
		bp->b_lblkno = bp->b_blkno;
3731
		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3732
		bp->b_bcount = size;
3733
		bp->b_flags &= ~B_INVAL;
3734
		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3735
		/*
3736
		 * Initialize our jseg with cnt records.  Assign the next
3737
		 * sequence number to it and link it in-order.
3738
		 */
3739
		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3740
		jseg->js_buf = bp;
3741
		jseg->js_cnt = cnt;
3742
		jseg->js_refs = cnt + 1;	/* Self ref. */
3743
		jseg->js_size = size;
3744
		jseg->js_seq = jblocks->jb_nextseq++;
3745
		if (jblocks->jb_oldestseg == NULL)
3746
			jblocks->jb_oldestseg = jseg;
3747
		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3748
		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3749
		if (jblocks->jb_writeseg == NULL)
3750
			jblocks->jb_writeseg = jseg;
3751
		/*
3752
		 * Start filling in records from the pending list.
3753
		 */
3754
		data = bp->b_data;
3755
		off = 0;
3756

3757
		/*
3758
		 * Always put a header on the first block.
3759
		 * XXX As with below, there might not be a chance to get
3760
		 * into the loop.  Ensure that something valid is written.
3761
		 */
3762
		jseg_write(ump, jseg, data);
3763
		off += JREC_SIZE;
3764
		data = bp->b_data + off;
3765

3766
		/*
3767
		 * XXX Something is wrong here.  There's no work to do,
3768
		 * but we need to perform and I/O and allow it to complete
3769
		 * anyways.
3770
		 */
3771
		if (LIST_EMPTY(&ump->softdep_journal_pending))
3772
			stat_emptyjblocks++;
3773

3774
		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3775
		    != NULL) {
3776
			if (cnt == 0)
3777
				break;
3778
			/* Place a segment header on every device block. */
3779
			if ((off % devbsize) == 0) {
3780
				jseg_write(ump, jseg, data);
3781
				off += JREC_SIZE;
3782
				data = bp->b_data + off;
3783
			}
3784
			if (wk == needwk)
3785
				needwk = NULL;
3786
			remove_from_journal(wk);
3787
			wk->wk_state |= INPROGRESS;
3788
			WORKLIST_INSERT(&jseg->js_entries, wk);
3789
			switch (wk->wk_type) {
3790
			case D_JADDREF:
3791
				jaddref_write(WK_JADDREF(wk), jseg, data);
3792
				break;
3793
			case D_JREMREF:
3794
				jremref_write(WK_JREMREF(wk), jseg, data);
3795
				break;
3796
			case D_JMVREF:
3797
				jmvref_write(WK_JMVREF(wk), jseg, data);
3798
				break;
3799
			case D_JNEWBLK:
3800
				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3801
				break;
3802
			case D_JFREEBLK:
3803
				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3804
				break;
3805
			case D_JFREEFRAG:
3806
				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3807
				break;
3808
			case D_JTRUNC:
3809
				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3810
				break;
3811
			case D_JFSYNC:
3812
				jfsync_write(WK_JFSYNC(wk), jseg, data);
3813
				break;
3814
			default:
3815
				panic("process_journal: Unknown type %s",
3816
				    TYPENAME(wk->wk_type));
3817
				/* NOTREACHED */
3818
			}
3819
			off += JREC_SIZE;
3820
			data = bp->b_data + off;
3821
			cnt--;
3822
		}
3823

3824
		/* Clear any remaining space so we don't leak kernel data */
3825
		if (size > off)
3826
			bzero(data, size - off);
3827

3828
		/*
3829
		 * Write this one buffer and continue.
3830
		 */
3831
		segwritten = 1;
3832
		jblocks->jb_needseg = 0;
3833
		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3834
		FREE_LOCK(ump);
3835
		bp->b_xflags |= BX_CVTENXIO;
3836
		pbgetvp(ump->um_devvp, bp);
3837
		/*
3838
		 * We only do the blocking wait once we find the journal
3839
		 * entry we're looking for.
3840
		 */
3841
		if (needwk == NULL && flags == MNT_WAIT)
3842
			bwrite(bp);
3843
		else
3844
			bawrite(bp);
3845
		ACQUIRE_LOCK(ump);
3846
	}
3847
	/*
3848
	 * If we wrote a segment issue a synchronize cache so the journal
3849
	 * is reflected on disk before the data is written.  Since reclaiming
3850
	 * journal space also requires writing a journal record this
3851
	 * process also enforces a barrier before reclamation.
3852
	 */
3853
	if (segwritten && shouldflush) {
3854
		softdep_synchronize(bio, ump, 
3855
		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3856
	} else if (bio)
3857
		g_destroy_bio(bio);
3858
	/*
3859
	 * If we've suspended the filesystem because we ran out of journal
3860
	 * space either try to sync it here to make some progress or
3861
	 * unsuspend it if we already have.
3862
	 */
3863
	if (flags == 0 && jblocks->jb_suspended) {
3864
		if (journal_unsuspend(ump))
3865
			goto out;
3866
		FREE_LOCK(ump);
3867
		VFS_SYNC(mp, MNT_NOWAIT);
3868
		ffs_sbupdate(ump, MNT_WAIT, 0);
3869
		ACQUIRE_LOCK(ump);
3870
	}
3871

3872
out:
3873
	curthread_pflags_restore(savef);
3874
}
3875

3876
/*
3877
 * Complete a jseg, allowing all dependencies awaiting journal writes
3878
 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3879
 * structures so that the journal segment can be freed to reclaim space.
3880
 */
3881
static void
3882
complete_jseg(struct jseg *jseg)
3883
{
3884
	struct worklist *wk;
3885
	struct jmvref *jmvref;
3886
#ifdef INVARIANTS
3887
	int i = 0;
3888
#endif
3889

3890
	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3891
		WORKLIST_REMOVE(wk);
3892
		wk->wk_state &= ~INPROGRESS;
3893
		wk->wk_state |= COMPLETE;
3894
		KASSERT(i++ < jseg->js_cnt,
3895
		    ("handle_written_jseg: overflow %d >= %d",
3896
		    i - 1, jseg->js_cnt));
3897
		switch (wk->wk_type) {
3898
		case D_JADDREF:
3899
			handle_written_jaddref(WK_JADDREF(wk));
3900
			break;
3901
		case D_JREMREF:
3902
			handle_written_jremref(WK_JREMREF(wk));
3903
			break;
3904
		case D_JMVREF:
3905
			rele_jseg(jseg);	/* No jsegdep. */
3906
			jmvref = WK_JMVREF(wk);
3907
			LIST_REMOVE(jmvref, jm_deps);
3908
			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3909
				free_pagedep(jmvref->jm_pagedep);
3910
			WORKITEM_FREE(jmvref, D_JMVREF);
3911
			break;
3912
		case D_JNEWBLK:
3913
			handle_written_jnewblk(WK_JNEWBLK(wk));
3914
			break;
3915
		case D_JFREEBLK:
3916
			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3917
			break;
3918
		case D_JTRUNC:
3919
			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3920
			break;
3921
		case D_JFSYNC:
3922
			rele_jseg(jseg);	/* No jsegdep. */
3923
			WORKITEM_FREE(wk, D_JFSYNC);
3924
			break;
3925
		case D_JFREEFRAG:
3926
			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3927
			break;
3928
		default:
3929
			panic("handle_written_jseg: Unknown type %s",
3930
			    TYPENAME(wk->wk_type));
3931
			/* NOTREACHED */
3932
		}
3933
	}
3934
	/* Release the self reference so the structure may be freed. */
3935
	rele_jseg(jseg);
3936
}
3937

3938
/*
3939
 * Determine which jsegs are ready for completion processing.  Waits for
3940
 * synchronize cache to complete as well as forcing in-order completion
3941
 * of journal entries.
3942
 */
3943
static void
3944
complete_jsegs(struct jseg *jseg)
3945
{
3946
	struct jblocks *jblocks;
3947
	struct jseg *jsegn;
3948

3949
	jblocks = jseg->js_jblocks;
3950
	/*
3951
	 * Don't allow out of order completions.  If this isn't the first
3952
	 * block wait for it to write before we're done.
3953
	 */
3954
	if (jseg != jblocks->jb_writeseg)
3955
		return;
3956
	/* Iterate through available jsegs processing their entries. */
3957
	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3958
		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3959
		jsegn = TAILQ_NEXT(jseg, js_next);
3960
		complete_jseg(jseg);
3961
		jseg = jsegn;
3962
	}
3963
	jblocks->jb_writeseg = jseg;
3964
	/*
3965
	 * Attempt to free jsegs now that oldestwrseq may have advanced. 
3966
	 */
3967
	free_jsegs(jblocks);
3968
}
3969

3970
/*
3971
 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3972
 * the final completions.
3973
 */
3974
static void
3975
handle_written_jseg(struct jseg *jseg, struct buf *bp)
3976
{
3977

3978
	if (jseg->js_refs == 0)
3979
		panic("handle_written_jseg: No self-reference on %p", jseg);
3980
	jseg->js_state |= DEPCOMPLETE;
3981
	/*
3982
	 * We'll never need this buffer again, set flags so it will be
3983
	 * discarded.
3984
	 */
3985
	bp->b_flags |= B_INVAL | B_NOCACHE;
3986
	pbrelvp(bp);
3987
	complete_jsegs(jseg);
3988
}
3989

3990
static inline struct jsegdep *
3991
inoref_jseg(struct inoref *inoref)
3992
{
3993
	struct jsegdep *jsegdep;
3994

3995
	jsegdep = inoref->if_jsegdep;
3996
	inoref->if_jsegdep = NULL;
3997

3998
	return (jsegdep);
3999
}
4000

4001
/*
4002
 * Called once a jremref has made it to stable store.  The jremref is marked
4003
 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
4004
 * for the jremref to complete will be awoken by free_jremref.
4005
 */
4006
static void
4007
handle_written_jremref(struct jremref *jremref)
4008
{
4009
	struct inodedep *inodedep;
4010
	struct jsegdep *jsegdep;
4011
	struct dirrem *dirrem;
4012

4013
	/* Grab the jsegdep. */
4014
	jsegdep = inoref_jseg(&jremref->jr_ref);
4015
	/*
4016
	 * Remove us from the inoref list.
4017
	 */
4018
	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
4019
	    0, &inodedep) == 0)
4020
		panic("handle_written_jremref: Lost inodedep");
4021
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
4022
	/*
4023
	 * Complete the dirrem.
4024
	 */
4025
	dirrem = jremref->jr_dirrem;
4026
	jremref->jr_dirrem = NULL;
4027
	LIST_REMOVE(jremref, jr_deps);
4028
	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
4029
	jwork_insert(&dirrem->dm_jwork, jsegdep);
4030
	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
4031
	    (dirrem->dm_state & COMPLETE) != 0)
4032
		add_to_worklist(&dirrem->dm_list, 0);
4033
	free_jremref(jremref);
4034
}
4035

4036
/*
4037
 * Called once a jaddref has made it to stable store.  The dependency is
4038
 * marked complete and any dependent structures are added to the inode
4039
 * bufwait list to be completed as soon as it is written.  If a bitmap write
4040
 * depends on this entry we move the inode into the inodedephd of the
4041
 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
4042
 */
4043
static void
4044
handle_written_jaddref(struct jaddref *jaddref)
4045
{
4046
	struct jsegdep *jsegdep;
4047
	struct inodedep *inodedep;
4048
	struct diradd *diradd;
4049
	struct mkdir *mkdir;
4050

4051
	/* Grab the jsegdep. */
4052
	jsegdep = inoref_jseg(&jaddref->ja_ref);
4053
	mkdir = NULL;
4054
	diradd = NULL;
4055
	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4056
	    0, &inodedep) == 0)
4057
		panic("handle_written_jaddref: Lost inodedep.");
4058
	if (jaddref->ja_diradd == NULL)
4059
		panic("handle_written_jaddref: No dependency");
4060
	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
4061
		diradd = jaddref->ja_diradd;
4062
		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
4063
	} else if (jaddref->ja_state & MKDIR_PARENT) {
4064
		mkdir = jaddref->ja_mkdir;
4065
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
4066
	} else if (jaddref->ja_state & MKDIR_BODY)
4067
		mkdir = jaddref->ja_mkdir;
4068
	else
4069
		panic("handle_written_jaddref: Unknown dependency %p",
4070
		    jaddref->ja_diradd);
4071
	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
4072
	/*
4073
	 * Remove us from the inode list.
4074
	 */
4075
	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
4076
	/*
4077
	 * The mkdir may be waiting on the jaddref to clear before freeing.
4078
	 */
4079
	if (mkdir) {
4080
		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
4081
		    ("handle_written_jaddref: Incorrect type for mkdir %s",
4082
		    TYPENAME(mkdir->md_list.wk_type)));
4083
		mkdir->md_jaddref = NULL;
4084
		diradd = mkdir->md_diradd;
4085
		mkdir->md_state |= DEPCOMPLETE;
4086
		complete_mkdir(mkdir);
4087
	}
4088
	jwork_insert(&diradd->da_jwork, jsegdep);
4089
	if (jaddref->ja_state & NEWBLOCK) {
4090
		inodedep->id_state |= ONDEPLIST;
4091
		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
4092
		    inodedep, id_deps);
4093
	}
4094
	free_jaddref(jaddref);
4095
}
4096

4097
/*
4098
 * Called once a jnewblk journal is written.  The allocdirect or allocindir
4099
 * is placed in the bmsafemap to await notification of a written bitmap.  If
4100
 * the operation was canceled we add the segdep to the appropriate
4101
 * dependency to free the journal space once the canceling operation
4102
 * completes.
4103
 */
4104
static void
4105
handle_written_jnewblk(struct jnewblk *jnewblk)
4106
{
4107
	struct bmsafemap *bmsafemap;
4108
	struct freefrag *freefrag;
4109
	struct freework *freework;
4110
	struct jsegdep *jsegdep;
4111
	struct newblk *newblk;
4112

4113
	/* Grab the jsegdep. */
4114
	jsegdep = jnewblk->jn_jsegdep;
4115
	jnewblk->jn_jsegdep = NULL;
4116
	if (jnewblk->jn_dep == NULL) 
4117
		panic("handle_written_jnewblk: No dependency for the segdep.");
4118
	switch (jnewblk->jn_dep->wk_type) {
4119
	case D_NEWBLK:
4120
	case D_ALLOCDIRECT:
4121
	case D_ALLOCINDIR:
4122
		/*
4123
		 * Add the written block to the bmsafemap so it can
4124
		 * be notified when the bitmap is on disk.
4125
		 */
4126
		newblk = WK_NEWBLK(jnewblk->jn_dep);
4127
		newblk->nb_jnewblk = NULL;
4128
		if ((newblk->nb_state & GOINGAWAY) == 0) {
4129
			bmsafemap = newblk->nb_bmsafemap;
4130
			newblk->nb_state |= ONDEPLIST;
4131
			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
4132
			    nb_deps);
4133
		}
4134
		jwork_insert(&newblk->nb_jwork, jsegdep);
4135
		break;
4136
	case D_FREEFRAG:
4137
		/*
4138
		 * A newblock being removed by a freefrag when replaced by
4139
		 * frag extension.
4140
		 */
4141
		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
4142
		freefrag->ff_jdep = NULL;
4143
		jwork_insert(&freefrag->ff_jwork, jsegdep);
4144
		break;
4145
	case D_FREEWORK:
4146
		/*
4147
		 * A direct block was removed by truncate.
4148
		 */
4149
		freework = WK_FREEWORK(jnewblk->jn_dep);
4150
		freework->fw_jnewblk = NULL;
4151
		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
4152
		break;
4153
	default:
4154
		panic("handle_written_jnewblk: Unknown type %d.",
4155
		    jnewblk->jn_dep->wk_type);
4156
	}
4157
	jnewblk->jn_dep = NULL;
4158
	free_jnewblk(jnewblk);
4159
}
4160

4161
/*
4162
 * Cancel a jfreefrag that won't be needed, probably due to colliding with
4163
 * an in-flight allocation that has not yet been committed.  Divorce us
4164
 * from the freefrag and mark it DEPCOMPLETE so that it may be added
4165
 * to the worklist.
4166
 */
4167
static void
4168
cancel_jfreefrag(struct jfreefrag *jfreefrag)
4169
{
4170
	struct freefrag *freefrag;
4171

4172
	if (jfreefrag->fr_jsegdep) {
4173
		free_jsegdep(jfreefrag->fr_jsegdep);
4174
		jfreefrag->fr_jsegdep = NULL;
4175
	}
4176
	freefrag = jfreefrag->fr_freefrag;
4177
	jfreefrag->fr_freefrag = NULL;
4178
	free_jfreefrag(jfreefrag);
4179
	freefrag->ff_state |= DEPCOMPLETE;
4180
	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
4181
}
4182

4183
/*
4184
 * Free a jfreefrag when the parent freefrag is rendered obsolete.
4185
 */
4186
static void
4187
free_jfreefrag(struct jfreefrag *jfreefrag)
4188
{
4189

4190
	if (jfreefrag->fr_state & INPROGRESS)
4191
		WORKLIST_REMOVE(&jfreefrag->fr_list);
4192
	else if (jfreefrag->fr_state & ONWORKLIST)
4193
		remove_from_journal(&jfreefrag->fr_list);
4194
	if (jfreefrag->fr_freefrag != NULL)
4195
		panic("free_jfreefrag:  Still attached to a freefrag.");
4196
	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
4197
}
4198

4199
/*
4200
 * Called when the journal write for a jfreefrag completes.  The parent
4201
 * freefrag is added to the worklist if this completes its dependencies.
4202
 */
4203
static void
4204
handle_written_jfreefrag(struct jfreefrag *jfreefrag)
4205
{
4206
	struct jsegdep *jsegdep;
4207
	struct freefrag *freefrag;
4208

4209
	/* Grab the jsegdep. */
4210
	jsegdep = jfreefrag->fr_jsegdep;
4211
	jfreefrag->fr_jsegdep = NULL;
4212
	freefrag = jfreefrag->fr_freefrag;
4213
	if (freefrag == NULL)
4214
		panic("handle_written_jfreefrag: No freefrag.");
4215
	freefrag->ff_state |= DEPCOMPLETE;
4216
	freefrag->ff_jdep = NULL;
4217
	jwork_insert(&freefrag->ff_jwork, jsegdep);
4218
	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
4219
		add_to_worklist(&freefrag->ff_list, 0);
4220
	jfreefrag->fr_freefrag = NULL;
4221
	free_jfreefrag(jfreefrag);
4222
}
4223

4224
/*
4225
 * Called when the journal write for a jfreeblk completes.  The jfreeblk
4226
 * is removed from the freeblks list of pending journal writes and the
4227
 * jsegdep is moved to the freeblks jwork to be completed when all blocks
4228
 * have been reclaimed.
4229
 */
4230
static void
4231
handle_written_jblkdep(struct jblkdep *jblkdep)
4232
{
4233
	struct freeblks *freeblks;
4234
	struct jsegdep *jsegdep;
4235

4236
	/* Grab the jsegdep. */
4237
	jsegdep = jblkdep->jb_jsegdep;
4238
	jblkdep->jb_jsegdep = NULL;
4239
	freeblks = jblkdep->jb_freeblks;
4240
	LIST_REMOVE(jblkdep, jb_deps);
4241
	jwork_insert(&freeblks->fb_jwork, jsegdep);
4242
	/*
4243
	 * If the freeblks is all journaled, we can add it to the worklist.
4244
	 */
4245
	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
4246
	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
4247
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
4248

4249
	free_jblkdep(jblkdep);
4250
}
4251

4252
static struct jsegdep *
4253
newjsegdep(struct worklist *wk)
4254
{
4255
	struct jsegdep *jsegdep;
4256

4257
	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
4258
	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
4259
	jsegdep->jd_seg = NULL;
4260

4261
	return (jsegdep);
4262
}
4263

4264
static struct jmvref *
4265
newjmvref(struct inode *dp,
4266
	ino_t ino,
4267
	off_t oldoff,
4268
	off_t newoff)
4269
{
4270
	struct jmvref *jmvref;
4271

4272
	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
4273
	workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
4274
	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4275
	jmvref->jm_parent = dp->i_number;
4276
	jmvref->jm_ino = ino;
4277
	jmvref->jm_oldoff = oldoff;
4278
	jmvref->jm_newoff = newoff;
4279

4280
	return (jmvref);
4281
}
4282

4283
/*
4284
 * Allocate a new jremref that tracks the removal of ip from dp with the
4285
 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4286
 * DEPCOMPLETE as we have all the information required for the journal write
4287
 * and the directory has already been removed from the buffer.  The caller
4288
 * is responsible for linking the jremref into the pagedep and adding it
4289
 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4290
 * a DOTDOT addition so handle_workitem_remove() can properly assign
4291
 * the jsegdep when we're done.
4292
 */
4293
static struct jremref *
4294
newjremref(struct dirrem *dirrem,
4295
	struct inode *dp,
4296
	struct inode *ip,
4297
	off_t diroff,
4298
	nlink_t nlink)
4299
{
4300
	struct jremref *jremref;
4301

4302
	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4303
	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4304
	jremref->jr_state = ATTACHED;
4305
	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4306
	   nlink, ip->i_mode);
4307
	jremref->jr_dirrem = dirrem;
4308

4309
	return (jremref);
4310
}
4311

4312
static inline void
4313
newinoref(struct inoref *inoref,
4314
	ino_t ino,
4315
	ino_t parent,
4316
	off_t diroff,
4317
	nlink_t nlink,
4318
	uint16_t mode)
4319
{
4320

4321
	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4322
	inoref->if_diroff = diroff;
4323
	inoref->if_ino = ino;
4324
	inoref->if_parent = parent;
4325
	inoref->if_nlink = nlink;
4326
	inoref->if_mode = mode;
4327
}
4328

4329
/*
4330
 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4331
 * directory offset may not be known until later.  The caller is responsible
4332
 * adding the entry to the journal when this information is available.  nlink
4333
 * should be the link count prior to the addition and mode is only required
4334
 * to have the correct FMT.
4335
 */
4336
static struct jaddref *
4337
newjaddref(struct inode *dp,
4338
	ino_t ino,
4339
	off_t diroff,
4340
	int16_t nlink,
4341
	uint16_t mode)
4342
{
4343
	struct jaddref *jaddref;
4344

4345
	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4346
	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4347
	jaddref->ja_state = ATTACHED;
4348
	jaddref->ja_mkdir = NULL;
4349
	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4350

4351
	return (jaddref);
4352
}
4353

4354
/*
4355
 * Create a new free dependency for a freework.  The caller is responsible
4356
 * for adjusting the reference count when it has the lock held.  The freedep
4357
 * will track an outstanding bitmap write that will ultimately clear the
4358
 * freework to continue.
4359
 */
4360
static struct freedep *
4361
newfreedep(struct freework *freework)
4362
{
4363
	struct freedep *freedep;
4364

4365
	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4366
	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4367
	freedep->fd_freework = freework;
4368

4369
	return (freedep);
4370
}
4371

4372
/*
4373
 * Free a freedep structure once the buffer it is linked to is written.  If
4374
 * this is the last reference to the freework schedule it for completion.
4375
 */
4376
static void
4377
free_freedep(struct freedep *freedep)
4378
{
4379
	struct freework *freework;
4380

4381
	freework = freedep->fd_freework;
4382
	freework->fw_freeblks->fb_cgwait--;
4383
	if (--freework->fw_ref == 0)
4384
		freework_enqueue(freework);
4385
	WORKITEM_FREE(freedep, D_FREEDEP);
4386
}
4387

4388
/*
4389
 * Allocate a new freework structure that may be a level in an indirect
4390
 * when parent is not NULL or a top level block when it is.  The top level
4391
 * freework structures are allocated without the per-filesystem lock held
4392
 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4393
 */
4394
static struct freework *
4395
newfreework(struct ufsmount *ump,
4396
	struct freeblks *freeblks,
4397
	struct freework *parent,
4398
	ufs_lbn_t lbn,
4399
	ufs2_daddr_t nb,
4400
	int frags,
4401
	int off,
4402
	int journal)
4403
{
4404
	struct freework *freework;
4405

4406
	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4407
	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4408
	freework->fw_state = ATTACHED;
4409
	freework->fw_jnewblk = NULL;
4410
	freework->fw_freeblks = freeblks;
4411
	freework->fw_parent = parent;
4412
	freework->fw_lbn = lbn;
4413
	freework->fw_blkno = nb;
4414
	freework->fw_frags = frags;
4415
	freework->fw_indir = NULL;
4416
	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
4417
	    lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
4418
	freework->fw_start = freework->fw_off = off;
4419
	if (journal)
4420
		newjfreeblk(freeblks, lbn, nb, frags);
4421
	if (parent == NULL) {
4422
		ACQUIRE_LOCK(ump);
4423
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4424
		freeblks->fb_ref++;
4425
		FREE_LOCK(ump);
4426
	}
4427

4428
	return (freework);
4429
}
4430

4431
/*
4432
 * Eliminate a jfreeblk for a block that does not need journaling.
4433
 */
4434
static void
4435
cancel_jfreeblk(struct freeblks *freeblks, ufs2_daddr_t blkno)
4436
{
4437
	struct jfreeblk *jfreeblk;
4438
	struct jblkdep *jblkdep;
4439

4440
	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4441
		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4442
			continue;
4443
		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4444
		if (jfreeblk->jf_blkno == blkno)
4445
			break;
4446
	}
4447
	if (jblkdep == NULL)
4448
		return;
4449
	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4450
	free_jsegdep(jblkdep->jb_jsegdep);
4451
	LIST_REMOVE(jblkdep, jb_deps);
4452
	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4453
}
4454

4455
/*
4456
 * Allocate a new jfreeblk to journal top level block pointer when truncating
4457
 * a file.  The caller must add this to the worklist when the per-filesystem
4458
 * lock is held.
4459
 */
4460
static struct jfreeblk *
4461
newjfreeblk(struct freeblks *freeblks,
4462
	ufs_lbn_t lbn,
4463
	ufs2_daddr_t blkno,
4464
	int frags)
4465
{
4466
	struct jfreeblk *jfreeblk;
4467

4468
	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4469
	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4470
	    freeblks->fb_list.wk_mp);
4471
	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4472
	jfreeblk->jf_dep.jb_freeblks = freeblks;
4473
	jfreeblk->jf_ino = freeblks->fb_inum;
4474
	jfreeblk->jf_lbn = lbn;
4475
	jfreeblk->jf_blkno = blkno;
4476
	jfreeblk->jf_frags = frags;
4477
	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4478

4479
	return (jfreeblk);
4480
}
4481

4482
/*
4483
 * The journal is only prepared to handle full-size block numbers, so we
4484
 * have to adjust the record to reflect the change to a full-size block.
4485
 * For example, suppose we have a block made up of fragments 8-15 and
4486
 * want to free its last two fragments. We are given a request that says:
4487
 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4488
 * where frags are the number of fragments to free and oldfrags are the
4489
 * number of fragments to keep. To block align it, we have to change it to
4490
 * have a valid full-size blkno, so it becomes:
4491
 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4492
 */
4493
static void
4494
adjust_newfreework(struct freeblks *freeblks, int frag_offset)
4495
{
4496
	struct jfreeblk *jfreeblk;
4497

4498
	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4499
	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4500
	    ("adjust_newfreework: Missing freeblks dependency"));
4501

4502
	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4503
	jfreeblk->jf_blkno -= frag_offset;
4504
	jfreeblk->jf_frags += frag_offset;
4505
}
4506

4507
/*
4508
 * Allocate a new jtrunc to track a partial truncation.
4509
 */
4510
static struct jtrunc *
4511
newjtrunc(struct freeblks *freeblks,
4512
	off_t size,
4513
	int extsize)
4514
{
4515
	struct jtrunc *jtrunc;
4516

4517
	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4518
	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4519
	    freeblks->fb_list.wk_mp);
4520
	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4521
	jtrunc->jt_dep.jb_freeblks = freeblks;
4522
	jtrunc->jt_ino = freeblks->fb_inum;
4523
	jtrunc->jt_size = size;
4524
	jtrunc->jt_extsize = extsize;
4525
	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4526

4527
	return (jtrunc);
4528
}
4529

4530
/*
4531
 * If we're canceling a new bitmap we have to search for another ref
4532
 * to move into the bmsafemap dep.  This might be better expressed
4533
 * with another structure.
4534
 */
4535
static void
4536
move_newblock_dep(struct jaddref *jaddref, struct inodedep *inodedep)
4537
{
4538
	struct inoref *inoref;
4539
	struct jaddref *jaddrefn;
4540

4541
	jaddrefn = NULL;
4542
	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4543
	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4544
		if ((jaddref->ja_state & NEWBLOCK) &&
4545
		    inoref->if_list.wk_type == D_JADDREF) {
4546
			jaddrefn = (struct jaddref *)inoref;
4547
			break;
4548
		}
4549
	}
4550
	if (jaddrefn == NULL)
4551
		return;
4552
	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4553
	jaddrefn->ja_state |= jaddref->ja_state &
4554
	    (ATTACHED | UNDONE | NEWBLOCK);
4555
	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4556
	jaddref->ja_state |= ATTACHED;
4557
	LIST_REMOVE(jaddref, ja_bmdeps);
4558
	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4559
	    ja_bmdeps);
4560
}
4561

4562
/*
4563
 * Cancel a jaddref either before it has been written or while it is being
4564
 * written.  This happens when a link is removed before the add reaches
4565
 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4566
 * and inode to prevent the link count or bitmap from reaching the disk
4567
 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4568
 * required.
4569
 *
4570
 * Returns 1 if the canceled addref requires journaling of the remove and
4571
 * 0 otherwise.
4572
 */
4573
static int
4574
cancel_jaddref(struct jaddref *jaddref,
4575
	struct inodedep *inodedep,
4576
	struct workhead *wkhd)
4577
{
4578
	struct inoref *inoref;
4579
	struct jsegdep *jsegdep;
4580
	int needsj;
4581

4582
	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4583
	    ("cancel_jaddref: Canceling complete jaddref"));
4584
	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4585
		needsj = 1;
4586
	else
4587
		needsj = 0;
4588
	if (inodedep == NULL)
4589
		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4590
		    0, &inodedep) == 0)
4591
			panic("cancel_jaddref: Lost inodedep");
4592
	/*
4593
	 * We must adjust the nlink of any reference operation that follows
4594
	 * us so that it is consistent with the in-memory reference.  This
4595
	 * ensures that inode nlink rollbacks always have the correct link.
4596
	 */
4597
	if (needsj == 0) {
4598
		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4599
		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4600
			if (inoref->if_state & GOINGAWAY)
4601
				break;
4602
			inoref->if_nlink--;
4603
		}
4604
	}
4605
	jsegdep = inoref_jseg(&jaddref->ja_ref);
4606
	if (jaddref->ja_state & NEWBLOCK)
4607
		move_newblock_dep(jaddref, inodedep);
4608
	wake_worklist(&jaddref->ja_list);
4609
	jaddref->ja_mkdir = NULL;
4610
	if (jaddref->ja_state & INPROGRESS) {
4611
		jaddref->ja_state &= ~INPROGRESS;
4612
		WORKLIST_REMOVE(&jaddref->ja_list);
4613
		jwork_insert(wkhd, jsegdep);
4614
	} else {
4615
		free_jsegdep(jsegdep);
4616
		if (jaddref->ja_state & DEPCOMPLETE)
4617
			remove_from_journal(&jaddref->ja_list);
4618
	}
4619
	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4620
	/*
4621
	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4622
	 * can arrange for them to be freed with the bitmap.  Otherwise we
4623
	 * no longer need this addref attached to the inoreflst and it
4624
	 * will incorrectly adjust nlink if we leave it.
4625
	 */
4626
	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4627
		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4628
		    if_deps);
4629
		jaddref->ja_state |= COMPLETE;
4630
		free_jaddref(jaddref);
4631
		return (needsj);
4632
	}
4633
	/*
4634
	 * Leave the head of the list for jsegdeps for fast merging.
4635
	 */
4636
	if (LIST_FIRST(wkhd) != NULL) {
4637
		jaddref->ja_state |= ONWORKLIST;
4638
		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4639
	} else
4640
		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4641

4642
	return (needsj);
4643
}
4644

4645
/* 
4646
 * Attempt to free a jaddref structure when some work completes.  This
4647
 * should only succeed once the entry is written and all dependencies have
4648
 * been notified.
4649
 */
4650
static void
4651
free_jaddref(struct jaddref *jaddref)
4652
{
4653

4654
	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4655
		return;
4656
	if (jaddref->ja_ref.if_jsegdep)
4657
		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4658
		    jaddref, jaddref->ja_state);
4659
	if (jaddref->ja_state & NEWBLOCK)
4660
		LIST_REMOVE(jaddref, ja_bmdeps);
4661
	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4662
		panic("free_jaddref: Bad state %p(0x%X)",
4663
		    jaddref, jaddref->ja_state);
4664
	if (jaddref->ja_mkdir != NULL)
4665
		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4666
	WORKITEM_FREE(jaddref, D_JADDREF);
4667
}
4668

4669
/*
4670
 * Free a jremref structure once it has been written or discarded.
4671
 */
4672
static void
4673
free_jremref(struct jremref *jremref)
4674
{
4675

4676
	if (jremref->jr_ref.if_jsegdep)
4677
		free_jsegdep(jremref->jr_ref.if_jsegdep);
4678
	if (jremref->jr_state & INPROGRESS)
4679
		panic("free_jremref: IO still pending");
4680
	WORKITEM_FREE(jremref, D_JREMREF);
4681
}
4682

4683
/*
4684
 * Free a jnewblk structure.
4685
 */
4686
static void
4687
free_jnewblk(struct jnewblk *jnewblk)
4688
{
4689

4690
	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4691
		return;
4692
	LIST_REMOVE(jnewblk, jn_deps);
4693
	if (jnewblk->jn_dep != NULL)
4694
		panic("free_jnewblk: Dependency still attached.");
4695
	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4696
}
4697

4698
/*
4699
 * Cancel a jnewblk which has been been made redundant by frag extension.
4700
 */
4701
static void
4702
cancel_jnewblk(struct jnewblk *jnewblk, struct workhead *wkhd)
4703
{
4704
	struct jsegdep *jsegdep;
4705

4706
	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4707
	jsegdep = jnewblk->jn_jsegdep;
4708
	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4709
		panic("cancel_jnewblk: Invalid state");
4710
	jnewblk->jn_jsegdep  = NULL;
4711
	jnewblk->jn_dep = NULL;
4712
	jnewblk->jn_state |= GOINGAWAY;
4713
	if (jnewblk->jn_state & INPROGRESS) {
4714
		jnewblk->jn_state &= ~INPROGRESS;
4715
		WORKLIST_REMOVE(&jnewblk->jn_list);
4716
		jwork_insert(wkhd, jsegdep);
4717
	} else {
4718
		free_jsegdep(jsegdep);
4719
		remove_from_journal(&jnewblk->jn_list);
4720
	}
4721
	wake_worklist(&jnewblk->jn_list);
4722
	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4723
}
4724

4725
static void
4726
free_jblkdep(struct jblkdep *jblkdep)
4727
{
4728

4729
	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4730
		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4731
	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4732
		WORKITEM_FREE(jblkdep, D_JTRUNC);
4733
	else
4734
		panic("free_jblkdep: Unexpected type %s",
4735
		    TYPENAME(jblkdep->jb_list.wk_type));
4736
}
4737

4738
/*
4739
 * Free a single jseg once it is no longer referenced in memory or on
4740
 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4741
 * to disappear.
4742
 */
4743
static void
4744
free_jseg(struct jseg *jseg, struct jblocks *jblocks)
4745
{
4746
	struct freework *freework;
4747

4748
	/*
4749
	 * Free freework structures that were lingering to indicate freed
4750
	 * indirect blocks that forced journal write ordering on reallocate.
4751
	 */
4752
	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4753
		indirblk_remove(freework);
4754
	if (jblocks->jb_oldestseg == jseg)
4755
		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4756
	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4757
	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4758
	KASSERT(LIST_EMPTY(&jseg->js_entries),
4759
	    ("free_jseg: Freed jseg has valid entries."));
4760
	WORKITEM_FREE(jseg, D_JSEG);
4761
}
4762

4763
/*
4764
 * Free all jsegs that meet the criteria for being reclaimed and update
4765
 * oldestseg.
4766
 */
4767
static void
4768
free_jsegs(struct jblocks *jblocks)
4769
{
4770
	struct jseg *jseg;
4771

4772
	/*
4773
	 * Free only those jsegs which have none allocated before them to
4774
	 * preserve the journal space ordering.
4775
	 */
4776
	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4777
		/*
4778
		 * Only reclaim space when nothing depends on this journal
4779
		 * set and another set has written that it is no longer
4780
		 * valid.
4781
		 */
4782
		if (jseg->js_refs != 0) {
4783
			jblocks->jb_oldestseg = jseg;
4784
			return;
4785
		}
4786
		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4787
			break;
4788
		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4789
			break;
4790
		/*
4791
		 * We can free jsegs that didn't write entries when
4792
		 * oldestwrseq == js_seq.
4793
		 */
4794
		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4795
		    jseg->js_cnt != 0)
4796
			break;
4797
		free_jseg(jseg, jblocks);
4798
	}
4799
	/*
4800
	 * If we exited the loop above we still must discover the
4801
	 * oldest valid segment.
4802
	 */
4803
	if (jseg)
4804
		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4805
		     jseg = TAILQ_NEXT(jseg, js_next))
4806
			if (jseg->js_refs != 0)
4807
				break;
4808
	jblocks->jb_oldestseg = jseg;
4809
	/*
4810
	 * The journal has no valid records but some jsegs may still be
4811
	 * waiting on oldestwrseq to advance.  We force a small record
4812
	 * out to permit these lingering records to be reclaimed.
4813
	 */
4814
	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4815
		jblocks->jb_needseg = 1;
4816
}
4817

4818
/*
4819
 * Release one reference to a jseg and free it if the count reaches 0.  This
4820
 * should eventually reclaim journal space as well.
4821
 */
4822
static void
4823
rele_jseg(struct jseg *jseg)
4824
{
4825

4826
	KASSERT(jseg->js_refs > 0,
4827
	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4828
	if (--jseg->js_refs != 0)
4829
		return;
4830
	free_jsegs(jseg->js_jblocks);
4831
}
4832

4833
/*
4834
 * Release a jsegdep and decrement the jseg count.
4835
 */
4836
static void
4837
free_jsegdep(struct jsegdep *jsegdep)
4838
{
4839

4840
	if (jsegdep->jd_seg)
4841
		rele_jseg(jsegdep->jd_seg);
4842
	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4843
}
4844

4845
/*
4846
 * Wait for a journal item to make it to disk.  Initiate journal processing
4847
 * if required.
4848
 */
4849
static int
4850
jwait(struct worklist *wk, int waitfor)
4851
{
4852

4853
	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4854
	/*
4855
	 * Blocking journal waits cause slow synchronous behavior.  Record
4856
	 * stats on the frequency of these blocking operations.
4857
	 */
4858
	if (waitfor == MNT_WAIT) {
4859
		stat_journal_wait++;
4860
		switch (wk->wk_type) {
4861
		case D_JREMREF:
4862
		case D_JMVREF:
4863
			stat_jwait_filepage++;
4864
			break;
4865
		case D_JTRUNC:
4866
		case D_JFREEBLK:
4867
			stat_jwait_freeblks++;
4868
			break;
4869
		case D_JNEWBLK:
4870
			stat_jwait_newblk++;
4871
			break;
4872
		case D_JADDREF:
4873
			stat_jwait_inode++;
4874
			break;
4875
		default:
4876
			break;
4877
		}
4878
	}
4879
	/*
4880
	 * If IO has not started we process the journal.  We can't mark the
4881
	 * worklist item as IOWAITING because we drop the lock while
4882
	 * processing the journal and the worklist entry may be freed after
4883
	 * this point.  The caller may call back in and re-issue the request.
4884
	 */
4885
	if ((wk->wk_state & INPROGRESS) == 0) {
4886
		softdep_process_journal(wk->wk_mp, wk, waitfor);
4887
		if (waitfor != MNT_WAIT)
4888
			return (EBUSY);
4889
		return (0);
4890
	}
4891
	if (waitfor != MNT_WAIT)
4892
		return (EBUSY);
4893
	wait_worklist(wk, "jwait");
4894
	return (0);
4895
}
4896

4897
/*
4898
 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4899
 * appropriate.  This is a convenience function to reduce duplicate code
4900
 * for the setup and revert functions below.
4901
 */
4902
static struct inodedep *
4903
inodedep_lookup_ip(struct inode *ip)
4904
{
4905
	struct inodedep *inodedep;
4906

4907
	KASSERT(ip->i_nlink >= ip->i_effnlink,
4908
	    ("inodedep_lookup_ip: bad delta"));
4909
	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4910
	    &inodedep);
4911
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4912
	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4913

4914
	return (inodedep);
4915
}
4916

4917
/*
4918
 * Called prior to creating a new inode and linking it to a directory.  The
4919
 * jaddref structure must already be allocated by softdep_setup_inomapdep
4920
 * and it is discovered here so we can initialize the mode and update
4921
 * nlinkdelta.
4922
 */
4923
void
4924
softdep_setup_create(struct inode *dp, struct inode *ip)
4925
{
4926
	struct inodedep *inodedep;
4927
	struct jaddref *jaddref __diagused;
4928
	struct vnode *dvp;
4929

4930
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4931
	    ("softdep_setup_create called on non-softdep filesystem"));
4932
	KASSERT(ip->i_nlink == 1,
4933
	    ("softdep_setup_create: Invalid link count."));
4934
	dvp = ITOV(dp);
4935
	ACQUIRE_LOCK(ITOUMP(dp));
4936
	inodedep = inodedep_lookup_ip(ip);
4937
	if (DOINGSUJ(dvp)) {
4938
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4939
		    inoreflst);
4940
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4941
		    ("softdep_setup_create: No addref structure present."));
4942
	}
4943
	FREE_LOCK(ITOUMP(dp));
4944
}
4945

4946
/*
4947
 * Create a jaddref structure to track the addition of a DOTDOT link when
4948
 * we are reparenting an inode as part of a rename.  This jaddref will be
4949
 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4950
 * non-journaling softdep.
4951
 */
4952
void
4953
softdep_setup_dotdot_link(struct inode *dp, struct inode *ip)
4954
{
4955
	struct inodedep *inodedep;
4956
	struct jaddref *jaddref;
4957
	struct vnode *dvp;
4958

4959
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4960
	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4961
	dvp = ITOV(dp);
4962
	jaddref = NULL;
4963
	/*
4964
	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4965
	 * is used as a normal link would be.
4966
	 */
4967
	if (DOINGSUJ(dvp))
4968
		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4969
		    dp->i_effnlink - 1, dp->i_mode);
4970
	ACQUIRE_LOCK(ITOUMP(dp));
4971
	inodedep = inodedep_lookup_ip(dp);
4972
	if (jaddref)
4973
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4974
		    if_deps);
4975
	FREE_LOCK(ITOUMP(dp));
4976
}
4977

4978
/*
4979
 * Create a jaddref structure to track a new link to an inode.  The directory
4980
 * offset is not known until softdep_setup_directory_add or
4981
 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4982
 * softdep.
4983
 */
4984
void
4985
softdep_setup_link(struct inode *dp, struct inode *ip)
4986
{
4987
	struct inodedep *inodedep;
4988
	struct jaddref *jaddref;
4989
	struct vnode *dvp;
4990

4991
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4992
	    ("softdep_setup_link called on non-softdep filesystem"));
4993
	dvp = ITOV(dp);
4994
	jaddref = NULL;
4995
	if (DOINGSUJ(dvp))
4996
		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4997
		    ip->i_mode);
4998
	ACQUIRE_LOCK(ITOUMP(dp));
4999
	inodedep = inodedep_lookup_ip(ip);
5000
	if (jaddref)
5001
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5002
		    if_deps);
5003
	FREE_LOCK(ITOUMP(dp));
5004
}
5005

5006
/*
5007
 * Called to create the jaddref structures to track . and .. references as
5008
 * well as lookup and further initialize the incomplete jaddref created
5009
 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
5010
 * nlinkdelta for non-journaling softdep.
5011
 */
5012
void
5013
softdep_setup_mkdir(struct inode *dp, struct inode *ip)
5014
{
5015
	struct inodedep *inodedep;
5016
	struct jaddref *dotdotaddref;
5017
	struct jaddref *dotaddref;
5018
	struct jaddref *jaddref;
5019
	struct vnode *dvp;
5020

5021
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5022
	    ("softdep_setup_mkdir called on non-softdep filesystem"));
5023
	dvp = ITOV(dp);
5024
	dotaddref = dotdotaddref = NULL;
5025
	if (DOINGSUJ(dvp)) {
5026
		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
5027
		    ip->i_mode);
5028
		dotaddref->ja_state |= MKDIR_BODY;
5029
		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
5030
		    dp->i_effnlink - 1, dp->i_mode);
5031
		dotdotaddref->ja_state |= MKDIR_PARENT;
5032
	}
5033
	ACQUIRE_LOCK(ITOUMP(dp));
5034
	inodedep = inodedep_lookup_ip(ip);
5035
	if (DOINGSUJ(dvp)) {
5036
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5037
		    inoreflst);
5038
		KASSERT(jaddref != NULL,
5039
		    ("softdep_setup_mkdir: No addref structure present."));
5040
		KASSERT(jaddref->ja_parent == dp->i_number, 
5041
		    ("softdep_setup_mkdir: bad parent %ju",
5042
		    (uintmax_t)jaddref->ja_parent));
5043
		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
5044
		    if_deps);
5045
	}
5046
	inodedep = inodedep_lookup_ip(dp);
5047
	if (DOINGSUJ(dvp))
5048
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
5049
		    &dotdotaddref->ja_ref, if_deps);
5050
	FREE_LOCK(ITOUMP(dp));
5051
}
5052

5053
/*
 * Called before a directory is unlinked.  Refreshes the nlinkdelta held
 * in the inodedeps of the directory being removed and of its parent.
 */
void
softdep_setup_rmdir(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_setup_rmdir called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the side effect on id_nlinkdelta is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5068

5069
/*
 * Called before an unlink.  Refreshes the nlinkdelta held in the
 * inodedeps of the inode being unlinked and of its parent directory.
 */
void
softdep_setup_unlink(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_setup_unlink called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the side effect on id_nlinkdelta is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5084

5085
/*
5086
 * Called to release the journal structures created by a failed non-directory
5087
 * creation.  Adjusts nlinkdelta for non-journaling softdep.
5088
 */
5089
void
5090
softdep_revert_create(struct inode *dp, struct inode *ip)
5091
{
5092
	struct inodedep *inodedep;
5093
	struct jaddref *jaddref;
5094
	struct vnode *dvp;
5095

5096
	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
5097
	    ("softdep_revert_create called on non-softdep filesystem"));
5098
	dvp = ITOV(dp);
5099
	ACQUIRE_LOCK(ITOUMP(dp));
5100
	inodedep = inodedep_lookup_ip(ip);
5101
	if (DOINGSUJ(dvp)) {
5102
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5103
		    inoreflst);
5104
		KASSERT(jaddref->ja_parent == dp->i_number,
5105
		    ("softdep_revert_create: addref parent mismatch"));
5106
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5107
	}
5108
	FREE_LOCK(ITOUMP(dp));
5109
}
5110

5111
/*
5112
 * Called to release the journal structures created by a failed link
5113
 * addition.  Adjusts nlinkdelta for non-journaling softdep.
5114
 */
5115
void
5116
softdep_revert_link(struct inode *dp, struct inode *ip)
5117
{
5118
	struct inodedep *inodedep;
5119
	struct jaddref *jaddref;
5120
	struct vnode *dvp;
5121

5122
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5123
	    ("softdep_revert_link called on non-softdep filesystem"));
5124
	dvp = ITOV(dp);
5125
	ACQUIRE_LOCK(ITOUMP(dp));
5126
	inodedep = inodedep_lookup_ip(ip);
5127
	if (DOINGSUJ(dvp)) {
5128
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5129
		    inoreflst);
5130
		KASSERT(jaddref->ja_parent == dp->i_number,
5131
		    ("softdep_revert_link: addref parent mismatch"));
5132
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5133
	}
5134
	FREE_LOCK(ITOUMP(dp));
5135
}
5136

5137
/*
5138
 * Called to release the journal structures created by a failed mkdir
5139
 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
5140
 */
5141
void
5142
softdep_revert_mkdir(struct inode *dp, struct inode *ip)
5143
{
5144
	struct inodedep *inodedep;
5145
	struct jaddref *jaddref;
5146
	struct jaddref *dotaddref;
5147
	struct vnode *dvp;
5148

5149
	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5150
	    ("softdep_revert_mkdir called on non-softdep filesystem"));
5151
	dvp = ITOV(dp);
5152

5153
	ACQUIRE_LOCK(ITOUMP(dp));
5154
	inodedep = inodedep_lookup_ip(dp);
5155
	if (DOINGSUJ(dvp)) {
5156
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5157
		    inoreflst);
5158
		KASSERT(jaddref->ja_parent == ip->i_number,
5159
		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
5160
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5161
	}
5162
	inodedep = inodedep_lookup_ip(ip);
5163
	if (DOINGSUJ(dvp)) {
5164
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5165
		    inoreflst);
5166
		KASSERT(jaddref->ja_parent == dp->i_number,
5167
		    ("softdep_revert_mkdir: addref parent mismatch"));
5168
		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
5169
		    inoreflst, if_deps);
5170
		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5171
		KASSERT(dotaddref->ja_parent == ip->i_number,
5172
		    ("softdep_revert_mkdir: dot addref parent mismatch"));
5173
		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
5174
	}
5175
	FREE_LOCK(ITOUMP(dp));
5176
}
5177

5178
/*
 * Called after a failed rmdir.  Refreshes the nlinkdelta held in the
 * inodedeps of the directory and of its parent so that they match the
 * current link counts again.
 */
void
softdep_revert_rmdir(struct inode *dp, struct inode *ip)
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_revert_rmdir called on non-softdep filesystem"));
	ump = ITOUMP(dp);
	ACQUIRE_LOCK(ump);
	/* Only the side effect on id_nlinkdelta is wanted here. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
5192

5193
/*
5194
 * Protecting the freemaps (or bitmaps).
5195
 * 
5196
 * To eliminate the need to execute fsck before mounting a filesystem
5197
 * after a power failure, one must (conservatively) guarantee that the
5198
 * on-disk copy of the bitmaps never indicates that a live inode or block is
5199
 * free.  So, when a block or inode is allocated, the bitmap should be
5200
 * updated (on disk) before any new pointers.  When a block or inode is
5201
 * freed, the bitmap should not be updated until all pointers have been
5202
 * reset.  The latter dependency is handled by the delayed de-allocation
5203
 * approach described below for block and inode de-allocation.  The former
5204
 * dependency is handled by calling the following procedure when a block or
5205
 * inode is allocated. When an inode is allocated an "inodedep" is created
5206
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
5207
 * Each "inodedep" is also inserted into the hash indexing structure so
5208
 * that any additional link additions can be made dependent on the inode
5209
 * allocation.
5210
 * 
5211
 * The ufs filesystem maintains a number of free block counts (e.g., per
5212
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
5213
 * in addition to the bitmaps.  These counts are used to improve efficiency
5214
 * during allocation and therefore must be consistent with the bitmaps.
5215
 * There is no convenient way to guarantee post-crash consistency of these
5216
 * counts with simple update ordering, for two main reasons: (1) The counts
5217
 * and bitmaps for a single cylinder group block are not in the same disk
5218
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
5219
 * be written and the other not.  (2) Some of the counts are located in the
5220
 * superblock rather than the cylinder group block. So, we focus our soft
5221
 * updates implementation on protecting the bitmaps. When mounting a
5222
 * filesystem, we recompute the auxiliary counts from the bitmaps.
5223
 */
5224

5225
/*
5226
 * Called just after updating the cylinder group block to allocate an inode.
5227
 */
5228
void
5229
softdep_setup_inomapdep(
5230
	struct buf *bp,		/* buffer for cylgroup block with inode map */
5231
	struct inode *ip,	/* inode related to allocation */
5232
	ino_t newinum,		/* new inode number being allocated */
5233
	int mode)
5234
{
5235
	struct inodedep *inodedep;
5236
	struct bmsafemap *bmsafemap;
5237
	struct jaddref *jaddref;
5238
	struct mount *mp;
5239
	struct fs *fs;
5240

5241
	mp = ITOVFS(ip);
5242
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5243
	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5244
	fs = VFSTOUFS(mp)->um_fs;
5245
	jaddref = NULL;
5246

5247
	/*
5248
	 * Allocate the journal reference add structure so that the bitmap
5249
	 * can be dependent on it.
5250
	 */
5251
	if (MOUNTEDSUJ(mp)) {
5252
		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5253
		jaddref->ja_state |= NEWBLOCK;
5254
	}
5255

5256
	/*
5257
	 * Create a dependency for the newly allocated inode.
5258
	 * Panic if it already exists as something is seriously wrong.
5259
	 * Otherwise add it to the dependency list for the buffer holding
5260
	 * the cylinder group map from which it was allocated.
5261
	 *
5262
	 * We have to preallocate a bmsafemap entry in case it is needed
5263
	 * in bmsafemap_lookup since once we allocate the inodedep, we
5264
	 * have to finish initializing it before we can FREE_LOCK().
5265
	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5266
	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5267
	 * creating the inodedep as it can be freed during the time
5268
	 * that we FREE_LOCK() while allocating the inodedep. We must
5269
	 * call workitem_alloc() before entering the locked section as
5270
	 * it also acquires the lock and we must avoid trying doing so
5271
	 * recursively.
5272
	 */
5273
	bmsafemap = malloc(sizeof(struct bmsafemap),
5274
	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5275
	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5276
	ACQUIRE_LOCK(ITOUMP(ip));
5277
	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5278
		panic("softdep_setup_inomapdep: dependency %p for new"
5279
		    "inode already exists", inodedep);
5280
	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5281
	if (jaddref) {
5282
		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5283
		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5284
		    if_deps);
5285
	} else {
5286
		inodedep->id_state |= ONDEPLIST;
5287
		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5288
	}
5289
	inodedep->id_bmsafemap = bmsafemap;
5290
	inodedep->id_state &= ~DEPCOMPLETE;
5291
	FREE_LOCK(ITOUMP(ip));
5292
}
5293

5294
/*
5295
 * Called just after updating the cylinder group block to
5296
 * allocate block or fragment.
5297
 */
5298
void
5299
softdep_setup_blkmapdep(
5300
	struct buf *bp,		/* buffer for cylgroup block with block map */
5301
	struct mount *mp,	/* filesystem doing allocation */
5302
	ufs2_daddr_t newblkno,	/* number of newly allocated block */
5303
	int frags,		/* Number of fragments. */
5304
	int oldfrags)		/* Previous number of fragments for extend. */
5305
{
5306
	struct newblk *newblk;
5307
	struct bmsafemap *bmsafemap;
5308
	struct jnewblk *jnewblk;
5309
	struct ufsmount *ump;
5310
	struct fs *fs;
5311

5312
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5313
	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5314
	ump = VFSTOUFS(mp);
5315
	fs = ump->um_fs;
5316
	jnewblk = NULL;
5317
	/*
5318
	 * Create a dependency for the newly allocated block.
5319
	 * Add it to the dependency list for the buffer holding
5320
	 * the cylinder group map from which it was allocated.
5321
	 */
5322
	if (MOUNTEDSUJ(mp)) {
5323
		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5324
		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5325
		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5326
		jnewblk->jn_state = ATTACHED;
5327
		jnewblk->jn_blkno = newblkno;
5328
		jnewblk->jn_frags = frags;
5329
		jnewblk->jn_oldfrags = oldfrags;
5330
#ifdef INVARIANTS
5331
		{
5332
			struct cg *cgp;
5333
			uint8_t *blksfree;
5334
			long bno;
5335
			int i;
5336

5337
			cgp = (struct cg *)bp->b_data;
5338
			blksfree = cg_blksfree(cgp);
5339
			bno = dtogd(fs, jnewblk->jn_blkno);
5340
			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5341
			    i++) {
5342
				if (isset(blksfree, bno + i))
5343
					panic("softdep_setup_blkmapdep: "
5344
					    "free fragment %d from %d-%d "
5345
					    "state 0x%X dep %p", i,
5346
					    jnewblk->jn_oldfrags,
5347
					    jnewblk->jn_frags,
5348
					    jnewblk->jn_state,
5349
					    jnewblk->jn_dep);
5350
			}
5351
		}
5352
#endif
5353
	}
5354

5355
	CTR3(KTR_SUJ,
5356
	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5357
	    newblkno, frags, oldfrags);
5358
	ACQUIRE_LOCK(ump);
5359
	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5360
		panic("softdep_setup_blkmapdep: found block");
5361
	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5362
	    dtog(fs, newblkno), NULL);
5363
	if (jnewblk) {
5364
		jnewblk->jn_dep = (struct worklist *)newblk;
5365
		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5366
	} else {
5367
		newblk->nb_state |= ONDEPLIST;
5368
		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5369
	}
5370
	newblk->nb_bmsafemap = bmsafemap;
5371
	newblk->nb_jnewblk = jnewblk;
5372
	FREE_LOCK(ump);
5373
}
5374

5375
/*
 * Address of the bmsafemap hash chain for cylinder group "cg": the entry
 * of (ump)->bmsafemap_hashtbl indexed by cg ANDed with
 * (ump)->bmsafemap_hash_size, which is used here as a bit mask.
 * Note that "ump" is evaluated twice.
 */
#define	BMSAFEMAP_HASH(ump, cg) \
	(&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5377

5378
static int
5379
bmsafemap_find(
5380
	struct bmsafemap_hashhead *bmsafemaphd,
5381
	int cg,
5382
	struct bmsafemap **bmsafemapp)
5383
{
5384
	struct bmsafemap *bmsafemap;
5385

5386
	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5387
		if (bmsafemap->sm_cg == cg)
5388
			break;
5389
	if (bmsafemap) {
5390
		*bmsafemapp = bmsafemap;
5391
		return (1);
5392
	}
5393
	*bmsafemapp = NULL;
5394

5395
	return (0);
5396
}
5397

5398
/*
5399
 * Find the bmsafemap associated with a cylinder group buffer.
5400
 * If none exists, create one. The buffer must be locked when
5401
 * this routine is called and this routine must be called with
5402
 * the softdep lock held. To avoid giving up the lock while
5403
 * allocating a new bmsafemap, a preallocated bmsafemap may be
5404
 * provided. If it is provided but not needed, it is freed.
5405
 */
5406
static struct bmsafemap *
5407
bmsafemap_lookup(struct mount *mp,
5408
	struct buf *bp,
5409
	int cg,
5410
	struct bmsafemap *newbmsafemap)
5411
{
5412
	struct bmsafemap_hashhead *bmsafemaphd;
5413
	struct bmsafemap *bmsafemap, *collision;
5414
	struct worklist *wk;
5415
	struct ufsmount *ump;
5416

5417
	ump = VFSTOUFS(mp);
5418
	LOCK_OWNED(ump);
5419
	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5420
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5421
		if (wk->wk_type == D_BMSAFEMAP) {
5422
			if (newbmsafemap)
5423
				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5424
			return (WK_BMSAFEMAP(wk));
5425
		}
5426
	}
5427
	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5428
	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5429
		if (newbmsafemap)
5430
			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5431
		return (bmsafemap);
5432
	}
5433
	if (newbmsafemap) {
5434
		bmsafemap = newbmsafemap;
5435
	} else {
5436
		FREE_LOCK(ump);
5437
		bmsafemap = malloc(sizeof(struct bmsafemap),
5438
			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5439
		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5440
		ACQUIRE_LOCK(ump);
5441
	}
5442
	bmsafemap->sm_buf = bp;
5443
	LIST_INIT(&bmsafemap->sm_inodedephd);
5444
	LIST_INIT(&bmsafemap->sm_inodedepwr);
5445
	LIST_INIT(&bmsafemap->sm_newblkhd);
5446
	LIST_INIT(&bmsafemap->sm_newblkwr);
5447
	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5448
	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5449
	LIST_INIT(&bmsafemap->sm_freehd);
5450
	LIST_INIT(&bmsafemap->sm_freewr);
5451
	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5452
		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5453
		return (collision);
5454
	}
5455
	bmsafemap->sm_cg = cg;
5456
	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5457
	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5458
	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5459
	return (bmsafemap);
5460
}
5461

5462
/*
5463
 * Direct block allocation dependencies.
5464
 * 
5465
 * When a new block is allocated, the corresponding disk locations must be
5466
 * initialized (with zeros or new data) before the on-disk inode points to
5467
 * them.  Also, the freemap from which the block was allocated must be
5468
 * updated (on disk) before the inode's pointer. These two dependencies are
5469
 * independent of each other and are needed for all file blocks and indirect
5470
 * blocks that are pointed to directly by the inode.  Just before the
5471
 * "in-core" version of the inode is updated with a newly allocated block
5472
 * number, a procedure (below) is called to setup allocation dependency
5473
 * structures.  These structures are removed when the corresponding
5474
 * dependencies are satisfied or when the block allocation becomes obsolete
5475
 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5476
 * fragment that gets upgraded).  All of these cases are handled in
5477
 * procedures described later.
5478
 * 
5479
 * When a file extension causes a fragment to be upgraded, either to a larger
5480
 * fragment or to a full block, the on-disk location may change (if the
5481
 * previous fragment could not simply be extended). In this case, the old
5482
 * fragment must be de-allocated, but not until after the inode's pointer has
5483
 * been updated. In most cases, this is handled by later procedures, which
5484
 * will construct a "freefrag" structure to be added to the workitem queue
5485
 * when the inode update is complete (or obsolete).  The main exception to
5486
 * this is when an allocation occurs while a pending allocation dependency
5487
 * (for the same block pointer) remains.  This case is handled in the main
5488
 * allocation dependency setup procedure by immediately freeing the
5489
 * unreferenced fragments.
5490
 */ 
5491
void 
5492
softdep_setup_allocdirect(
5493
	struct inode *ip,	/* inode to which block is being added */
5494
	ufs_lbn_t off,		/* block pointer within inode */
5495
	ufs2_daddr_t newblkno,	/* disk block number being added */
5496
	ufs2_daddr_t oldblkno,	/* previous block number, 0 unless frag */
5497
	long newsize,		/* size of new block */
5498
	long oldsize,		/* size of new block */
5499
	struct buf *bp)		/* bp for allocated block */
5500
{
5501
	struct allocdirect *adp, *oldadp;
5502
	struct allocdirectlst *adphead;
5503
	struct freefrag *freefrag;
5504
	struct inodedep *inodedep;
5505
	struct pagedep *pagedep;
5506
	struct jnewblk *jnewblk;
5507
	struct newblk *newblk;
5508
	struct mount *mp;
5509
	ufs_lbn_t lbn;
5510

5511
	lbn = bp->b_lblkno;
5512
	mp = ITOVFS(ip);
5513
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5514
	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5515
	if (oldblkno && oldblkno != newblkno)
5516
		/*
5517
		 * The usual case is that a smaller fragment that
5518
		 * was just allocated has been replaced with a bigger
5519
		 * fragment or a full-size block. If it is marked as
5520
		 * B_DELWRI, the current contents have not been written
5521
		 * to disk. It is possible that the block was written
5522
		 * earlier, but very uncommon. If the block has never
5523
		 * been written, there is no need to send a BIO_DELETE
5524
		 * for it when it is freed. The gain from avoiding the
5525
		 * TRIMs for the common case of unwritten blocks far
5526
		 * exceeds the cost of the write amplification for the
5527
		 * uncommon case of failing to send a TRIM for a block
5528
		 * that had been written.
5529
		 */
5530
		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5531
		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5532
	else
5533
		freefrag = NULL;
5534

5535
	CTR6(KTR_SUJ,
5536
	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5537
	    "off %jd newsize %ld oldsize %d",
5538
	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5539
	ACQUIRE_LOCK(ITOUMP(ip));
5540
	if (off >= UFS_NDADDR) {
5541
		if (lbn > 0)
5542
			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5543
			    lbn, off);
5544
		/* allocating an indirect block */
5545
		if (oldblkno != 0)
5546
			panic("softdep_setup_allocdirect: non-zero indir");
5547
	} else {
5548
		if (off != lbn)
5549
			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5550
			    lbn, off);
5551
		/*
5552
		 * Allocating a direct block.
5553
		 *
5554
		 * If we are allocating a directory block, then we must
5555
		 * allocate an associated pagedep to track additions and
5556
		 * deletions.
5557
		 */
5558
		if ((ip->i_mode & IFMT) == IFDIR)
5559
			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5560
			    &pagedep);
5561
	}
5562
	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5563
		panic("softdep_setup_allocdirect: lost block");
5564
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5565
	    ("softdep_setup_allocdirect: newblk already initialized"));
5566
	/*
5567
	 * Convert the newblk to an allocdirect.
5568
	 */
5569
	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5570
	adp = (struct allocdirect *)newblk;
5571
	newblk->nb_freefrag = freefrag;
5572
	adp->ad_offset = off;
5573
	adp->ad_oldblkno = oldblkno;
5574
	adp->ad_newsize = newsize;
5575
	adp->ad_oldsize = oldsize;
5576

5577
	/*
5578
	 * Finish initializing the journal.
5579
	 */
5580
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5581
		jnewblk->jn_ino = ip->i_number;
5582
		jnewblk->jn_lbn = lbn;
5583
		add_to_journal(&jnewblk->jn_list);
5584
	}
5585
	if (freefrag && freefrag->ff_jdep != NULL &&
5586
	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5587
		add_to_journal(freefrag->ff_jdep);
5588
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5589
	adp->ad_inodedep = inodedep;
5590

5591
	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5592
	/*
5593
	 * The list of allocdirects must be kept in sorted and ascending
5594
	 * order so that the rollback routines can quickly determine the
5595
	 * first uncommitted block (the size of the file stored on disk
5596
	 * ends at the end of the lowest committed fragment, or if there
5597
	 * are no fragments, at the end of the highest committed block).
5598
	 * Since files generally grow, the typical case is that the new
5599
	 * block is to be added at the end of the list. We speed this
5600
	 * special case by checking against the last allocdirect in the
5601
	 * list before laboriously traversing the list looking for the
5602
	 * insertion point.
5603
	 */
5604
	adphead = &inodedep->id_newinoupdt;
5605
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5606
	if (oldadp == NULL || oldadp->ad_offset <= off) {
5607
		/* insert at end of list */
5608
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5609
		if (oldadp != NULL && oldadp->ad_offset == off)
5610
			allocdirect_merge(adphead, adp, oldadp);
5611
		FREE_LOCK(ITOUMP(ip));
5612
		return;
5613
	}
5614
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5615
		if (oldadp->ad_offset >= off)
5616
			break;
5617
	}
5618
	if (oldadp == NULL)
5619
		panic("softdep_setup_allocdirect: lost entry");
5620
	/* insert in middle of list */
5621
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5622
	if (oldadp->ad_offset == off)
5623
		allocdirect_merge(adphead, adp, oldadp);
5624

5625
	FREE_LOCK(ITOUMP(ip));
5626
}
5627

5628
/*
5629
 * Merge a newer and older journal record to be stored either in a
5630
 * newblock or freefrag.  This handles aggregating journal records for
5631
 * fragment allocation into a second record as well as replacing a
5632
 * journal free with an aborted journal allocation.  A segment for the
5633
 * oldest record will be placed on wkhd if it has been written.  If not
5634
 * the segment for the newer record will suffice.
5635
 */
5636
static struct worklist *
5637
jnewblk_merge(struct worklist *new,
5638
	struct worklist *old,
5639
	struct workhead *wkhd)
5640
{
5641
	struct jnewblk *njnewblk;
5642
	struct jnewblk *jnewblk;
5643

5644
	/* Handle NULLs to simplify callers. */
5645
	if (new == NULL)
5646
		return (old);
5647
	if (old == NULL)
5648
		return (new);
5649
	/* Replace a jfreefrag with a jnewblk. */
5650
	if (new->wk_type == D_JFREEFRAG) {
5651
		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5652
			panic("jnewblk_merge: blkno mismatch: %p, %p",
5653
			    old, new);
5654
		cancel_jfreefrag(WK_JFREEFRAG(new));
5655
		return (old);
5656
	}
5657
	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5658
		panic("jnewblk_merge: Bad type: old %d new %d\n",
5659
		    old->wk_type, new->wk_type);
5660
	/*
5661
	 * Handle merging of two jnewblk records that describe
5662
	 * different sets of fragments in the same block.
5663
	 */
5664
	jnewblk = WK_JNEWBLK(old);
5665
	njnewblk = WK_JNEWBLK(new);
5666
	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5667
		panic("jnewblk_merge: Merging disparate blocks.");
5668
	/*
5669
	 * The record may be rolled back in the cg.
5670
	 */
5671
	if (jnewblk->jn_state & UNDONE) {
5672
		jnewblk->jn_state &= ~UNDONE;
5673
		njnewblk->jn_state |= UNDONE;
5674
		njnewblk->jn_state &= ~ATTACHED;
5675
	}
5676
	/*
5677
	 * We modify the newer addref and free the older so that if neither
5678
	 * has been written the most up-to-date copy will be on disk.  If
5679
	 * both have been written but rolled back we only temporarily need
5680
	 * one of them to fix the bits when the cg write completes.
5681
	 */
5682
	jnewblk->jn_state |= ATTACHED | COMPLETE;
5683
	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5684
	cancel_jnewblk(jnewblk, wkhd);
5685
	WORKLIST_REMOVE(&jnewblk->jn_list);
5686
	free_jnewblk(jnewblk);
5687
	return (new);
5688
}
5689

5690
/*
5691
 * Replace an old allocdirect dependency with a newer one.
5692
 */
5693
static void
5694
allocdirect_merge(
5695
	struct allocdirectlst *adphead,	/* head of list holding allocdirects */
5696
	struct allocdirect *newadp,	/* allocdirect being added */
5697
	struct allocdirect *oldadp)	/* existing allocdirect being checked */
5698
{
5699
	struct worklist *wk;
5700
	struct freefrag *freefrag;
5701

5702
	freefrag = NULL;
5703
	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5704
	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5705
	    newadp->ad_oldsize != oldadp->ad_newsize ||
5706
	    newadp->ad_offset >= UFS_NDADDR)
5707
		panic("%s %jd != new %jd || old size %ld != new %ld",
5708
		    "allocdirect_merge: old blkno",
5709
		    (intmax_t)newadp->ad_oldblkno,
5710
		    (intmax_t)oldadp->ad_newblkno,
5711
		    newadp->ad_oldsize, oldadp->ad_newsize);
5712
	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5713
	newadp->ad_oldsize = oldadp->ad_oldsize;
5714
	/*
5715
	 * If the old dependency had a fragment to free or had never
5716
	 * previously had a block allocated, then the new dependency
5717
	 * can immediately post its freefrag and adopt the old freefrag.
5718
	 * This action is done by swapping the freefrag dependencies.
5719
	 * The new dependency gains the old one's freefrag, and the
5720
	 * old one gets the new one and then immediately puts it on
5721
	 * the worklist when it is freed by free_newblk. It is
5722
	 * not possible to do this swap when the old dependency had a
5723
	 * non-zero size but no previous fragment to free. This condition
5724
	 * arises when the new block is an extension of the old block.
5725
	 * Here, the first part of the fragment allocated to the new
5726
	 * dependency is part of the block currently claimed on disk by
5727
	 * the old dependency, so cannot legitimately be freed until the
5728
	 * conditions for the new dependency are fulfilled.
5729
	 */
5730
	freefrag = newadp->ad_freefrag;
5731
	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5732
		newadp->ad_freefrag = oldadp->ad_freefrag;
5733
		oldadp->ad_freefrag = freefrag;
5734
	}
5735
	/*
5736
	 * If we are tracking a new directory-block allocation,
5737
	 * move it from the old allocdirect to the new allocdirect.
5738
	 */
5739
	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5740
		WORKLIST_REMOVE(wk);
5741
		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5742
			panic("allocdirect_merge: extra newdirblk");
5743
		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5744
	}
5745
	TAILQ_REMOVE(adphead, oldadp, ad_next);
5746
	/*
5747
	 * We need to move any journal dependencies over to the freefrag
5748
	 * that releases this block if it exists.  Otherwise we are
5749
	 * extending an existing block and we'll wait until that is
5750
	 * complete to release the journal space and extend the
5751
	 * new journal to cover this old space as well.
5752
	 */
5753
	if (freefrag == NULL) {
5754
		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5755
			panic("allocdirect_merge: %jd != %jd",
5756
			    oldadp->ad_newblkno, newadp->ad_newblkno);
5757
		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5758
		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 
5759
		    &oldadp->ad_block.nb_jnewblk->jn_list,
5760
		    &newadp->ad_block.nb_jwork);
5761
		oldadp->ad_block.nb_jnewblk = NULL;
5762
		cancel_newblk(&oldadp->ad_block, NULL,
5763
		    &newadp->ad_block.nb_jwork);
5764
	} else {
5765
		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5766
		    &freefrag->ff_list, &freefrag->ff_jwork);
5767
		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5768
		    &freefrag->ff_jwork);
5769
	}
5770
	free_newblk(&oldadp->ad_block);
5771
}
5772

5773
/*
5774
 * Allocate a jfreefrag structure to journal a single block free.
5775
 */
5776
static struct jfreefrag *
5777
newjfreefrag(struct freefrag *freefrag,
5778
	struct inode *ip,
5779
	ufs2_daddr_t blkno,
5780
	long size,
5781
	ufs_lbn_t lbn)
5782
{
5783
	struct jfreefrag *jfreefrag;
5784
	struct fs *fs;
5785

5786
	fs = ITOFS(ip);
5787
	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5788
	    M_SOFTDEP_FLAGS);
5789
	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5790
	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5791
	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5792
	jfreefrag->fr_ino = ip->i_number;
5793
	jfreefrag->fr_lbn = lbn;
5794
	jfreefrag->fr_blkno = blkno;
5795
	jfreefrag->fr_frags = numfrags(fs, size);
5796
	jfreefrag->fr_freefrag = freefrag;
5797

5798
	return (jfreefrag);
5799
}
5800

5801
/*
5802
 * Allocate a new freefrag structure.
5803
 */
5804
static struct freefrag *
5805
newfreefrag(struct inode *ip,
5806
	ufs2_daddr_t blkno,
5807
	long size,
5808
	ufs_lbn_t lbn,
5809
	uint64_t key)
5810
{
5811
	struct freefrag *freefrag;
5812
	struct ufsmount *ump;
5813
	struct fs *fs;
5814

5815
	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5816
	    ip->i_number, blkno, size, lbn);
5817
	ump = ITOUMP(ip);
5818
	fs = ump->um_fs;
5819
	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5820
		panic("newfreefrag: frag size");
5821
	freefrag = malloc(sizeof(struct freefrag),
5822
	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5823
	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5824
	freefrag->ff_state = ATTACHED;
5825
	LIST_INIT(&freefrag->ff_jwork);
5826
	freefrag->ff_inum = ip->i_number;
5827
	freefrag->ff_vtype = ITOV(ip)->v_type;
5828
	freefrag->ff_blkno = blkno;
5829
	freefrag->ff_fragsize = size;
5830
	freefrag->ff_key = key;
5831

5832
	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5833
		freefrag->ff_jdep = (struct worklist *)
5834
		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5835
	} else {
5836
		freefrag->ff_state |= DEPCOMPLETE;
5837
		freefrag->ff_jdep = NULL;
5838
	}
5839

5840
	return (freefrag);
5841
}
5842

5843
/*
5844
 * This workitem de-allocates fragments that were replaced during
5845
 * file block allocation.
5846
 */
5847
static void 
5848
handle_workitem_freefrag(struct freefrag *freefrag)
5849
{
5850
	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5851
	struct workhead wkhd;
5852

5853
	CTR3(KTR_SUJ,
5854
	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5855
	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5856
	/*
5857
	 * It would be illegal to add new completion items to the
5858
	 * freefrag after it was schedule to be done so it must be
5859
	 * safe to modify the list head here.
5860
	 */
5861
	LIST_INIT(&wkhd);
5862
	ACQUIRE_LOCK(ump);
5863
	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5864
	/*
5865
	 * If the journal has not been written we must cancel it here.
5866
	 */
5867
	if (freefrag->ff_jdep) {
5868
		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5869
			panic("handle_workitem_freefrag: Unexpected type %d\n",
5870
			    freefrag->ff_jdep->wk_type);
5871
		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5872
	}
5873
	FREE_LOCK(ump);
5874
	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5875
	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5876
	   &wkhd, freefrag->ff_key);
5877
	ACQUIRE_LOCK(ump);
5878
	WORKITEM_FREE(freefrag, D_FREEFRAG);
5879
	FREE_LOCK(ump);
5880
}
5881

5882
/*
5883
 * Set up a dependency structure for an external attributes data block.
5884
 * This routine follows much of the structure of softdep_setup_allocdirect.
5885
 * See the description of softdep_setup_allocdirect above for details.
5886
 */
5887
void 
5888
softdep_setup_allocext(
5889
	struct inode *ip,
5890
	ufs_lbn_t off,
5891
	ufs2_daddr_t newblkno,
5892
	ufs2_daddr_t oldblkno,
5893
	long newsize,
5894
	long oldsize,
5895
	struct buf *bp)
5896
{
5897
	struct allocdirect *adp, *oldadp;
5898
	struct allocdirectlst *adphead;
5899
	struct freefrag *freefrag;
5900
	struct inodedep *inodedep;
5901
	struct jnewblk *jnewblk;
5902
	struct newblk *newblk;
5903
	struct mount *mp;
5904
	struct ufsmount *ump;
5905
	ufs_lbn_t lbn;
5906

5907
	mp = ITOVFS(ip);
5908
	ump = VFSTOUFS(mp);
5909
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5910
	    ("softdep_setup_allocext called on non-softdep filesystem"));
5911
	KASSERT(off < UFS_NXADDR,
5912
	    ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
5913

5914
	lbn = bp->b_lblkno;
5915
	if (oldblkno && oldblkno != newblkno)
5916
		/*
5917
		 * The usual case is that a smaller fragment that
5918
		 * was just allocated has been replaced with a bigger
5919
		 * fragment or a full-size block. If it is marked as
5920
		 * B_DELWRI, the current contents have not been written
5921
		 * to disk. It is possible that the block was written
5922
		 * earlier, but very uncommon. If the block has never
5923
		 * been written, there is no need to send a BIO_DELETE
5924
		 * for it when it is freed. The gain from avoiding the
5925
		 * TRIMs for the common case of unwritten blocks far
5926
		 * exceeds the cost of the write amplification for the
5927
		 * uncommon case of failing to send a TRIM for a block
5928
		 * that had been written.
5929
		 */
5930
		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5931
		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5932
	else
5933
		freefrag = NULL;
5934

5935
	ACQUIRE_LOCK(ump);
5936
	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5937
		panic("softdep_setup_allocext: lost block");
5938
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5939
	    ("softdep_setup_allocext: newblk already initialized"));
5940
	/*
5941
	 * Convert the newblk to an allocdirect.
5942
	 */
5943
	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5944
	adp = (struct allocdirect *)newblk;
5945
	newblk->nb_freefrag = freefrag;
5946
	adp->ad_offset = off;
5947
	adp->ad_oldblkno = oldblkno;
5948
	adp->ad_newsize = newsize;
5949
	adp->ad_oldsize = oldsize;
5950
	adp->ad_state |=  EXTDATA;
5951

5952
	/*
5953
	 * Finish initializing the journal.
5954
	 */
5955
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5956
		jnewblk->jn_ino = ip->i_number;
5957
		jnewblk->jn_lbn = lbn;
5958
		add_to_journal(&jnewblk->jn_list);
5959
	}
5960
	if (freefrag && freefrag->ff_jdep != NULL &&
5961
	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5962
		add_to_journal(freefrag->ff_jdep);
5963
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5964
	adp->ad_inodedep = inodedep;
5965

5966
	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5967
	/*
5968
	 * The list of allocdirects must be kept in sorted and ascending
5969
	 * order so that the rollback routines can quickly determine the
5970
	 * first uncommitted block (the size of the file stored on disk
5971
	 * ends at the end of the lowest committed fragment, or if there
5972
	 * are no fragments, at the end of the highest committed block).
5973
	 * Since files generally grow, the typical case is that the new
5974
	 * block is to be added at the end of the list. We speed this
5975
	 * special case by checking against the last allocdirect in the
5976
	 * list before laboriously traversing the list looking for the
5977
	 * insertion point.
5978
	 */
5979
	adphead = &inodedep->id_newextupdt;
5980
	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5981
	if (oldadp == NULL || oldadp->ad_offset <= off) {
5982
		/* insert at end of list */
5983
		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5984
		if (oldadp != NULL && oldadp->ad_offset == off)
5985
			allocdirect_merge(adphead, adp, oldadp);
5986
		FREE_LOCK(ump);
5987
		return;
5988
	}
5989
	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5990
		if (oldadp->ad_offset >= off)
5991
			break;
5992
	}
5993
	if (oldadp == NULL)
5994
		panic("softdep_setup_allocext: lost entry");
5995
	/* insert in middle of list */
5996
	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5997
	if (oldadp->ad_offset == off)
5998
		allocdirect_merge(adphead, adp, oldadp);
5999
	FREE_LOCK(ump);
6000
}
6001

6002
/*
6003
 * Indirect block allocation dependencies.
6004
 * 
6005
 * The same dependencies that exist for a direct block also exist when
6006
 * a new block is allocated and pointed to by an entry in a block of
6007
 * indirect pointers. The undo/redo states described above are also
6008
 * used here. Because an indirect block contains many pointers that
6009
 * may have dependencies, a second copy of the entire in-memory indirect
6010
 * block is kept. The buffer cache copy is always completely up-to-date.
6011
 * The second copy, which is used only as a source for disk writes,
6012
 * contains only the safe pointers (i.e., those that have no remaining
6013
 * update dependencies). The second copy is freed when all pointers
6014
 * are safe. The cache is not allowed to replace indirect blocks with
6015
 * pending update dependencies. If a buffer containing an indirect
6016
 * block with dependencies is written, these routines will mark it
6017
 * dirty again. It can only be successfully written once all the
6018
 * dependencies are removed. The ffs_fsync routine in conjunction with
6019
 * softdep_sync_metadata work together to get all the dependencies
6020
 * removed so that a file can be successfully written to disk. Three
6021
 * procedures are used when setting up indirect block pointer
6022
 * dependencies. The division is necessary because of the organization
6023
 * of the "balloc" routine and because of the distinction between file
6024
 * pages and file metadata blocks.
6025
 */
6026

6027
/*
6028
 * Allocate a new allocindir structure.
6029
 */
6030
static struct allocindir *
6031
newallocindir(
6032
	struct inode *ip,	/* inode for file being extended */
6033
	int ptrno,		/* offset of pointer in indirect block */
6034
	ufs2_daddr_t newblkno,	/* disk block number being added */
6035
	ufs2_daddr_t oldblkno,	/* previous block number, 0 if none */
6036
	ufs_lbn_t lbn)
6037
{
6038
	struct newblk *newblk;
6039
	struct allocindir *aip;
6040
	struct freefrag *freefrag;
6041
	struct jnewblk *jnewblk;
6042

6043
	if (oldblkno)
6044
		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
6045
		    SINGLETON_KEY);
6046
	else
6047
		freefrag = NULL;
6048
	ACQUIRE_LOCK(ITOUMP(ip));
6049
	if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
6050
		panic("new_allocindir: lost block");
6051
	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
6052
	    ("newallocindir: newblk already initialized"));
6053
	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
6054
	newblk->nb_freefrag = freefrag;
6055
	aip = (struct allocindir *)newblk;
6056
	aip->ai_offset = ptrno;
6057
	aip->ai_oldblkno = oldblkno;
6058
	aip->ai_lbn = lbn;
6059
	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
6060
		jnewblk->jn_ino = ip->i_number;
6061
		jnewblk->jn_lbn = lbn;
6062
		add_to_journal(&jnewblk->jn_list);
6063
	}
6064
	if (freefrag && freefrag->ff_jdep != NULL &&
6065
	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
6066
		add_to_journal(freefrag->ff_jdep);
6067
	return (aip);
6068
}
6069

6070
/*
6071
 * Called just before setting an indirect block pointer
6072
 * to a newly allocated file page.
6073
 */
6074
void
6075
softdep_setup_allocindir_page(
6076
	struct inode *ip,	/* inode for file being extended */
6077
	ufs_lbn_t lbn,		/* allocated block number within file */
6078
	struct buf *bp,		/* buffer with indirect blk referencing page */
6079
	int ptrno,		/* offset of pointer in indirect block */
6080
	ufs2_daddr_t newblkno,	/* disk block number being added */
6081
	ufs2_daddr_t oldblkno,	/* previous block number, 0 if none */
6082
	struct buf *nbp)	/* buffer holding allocated page */
6083
{
6084
	struct inodedep *inodedep;
6085
	struct freefrag *freefrag;
6086
	struct allocindir *aip;
6087
	struct pagedep *pagedep;
6088
	struct mount *mp;
6089
	struct ufsmount *ump;
6090

6091
	mp = ITOVFS(ip);
6092
	ump = VFSTOUFS(mp);
6093
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6094
	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
6095
	KASSERT(lbn == nbp->b_lblkno,
6096
	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
6097
	    lbn, bp->b_lblkno));
6098
	CTR4(KTR_SUJ,
6099
	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
6100
	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
6101
	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
6102
	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
6103
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6104
	/*
6105
	 * If we are allocating a directory page, then we must
6106
	 * allocate an associated pagedep to track additions and
6107
	 * deletions.
6108
	 */
6109
	if ((ip->i_mode & IFMT) == IFDIR)
6110
		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
6111
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6112
	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
6113
	FREE_LOCK(ump);
6114
	if (freefrag)
6115
		handle_workitem_freefrag(freefrag);
6116
}
6117

6118
/*
6119
 * Called just before setting an indirect block pointer to a
6120
 * newly allocated indirect block.
6121
 */
6122
void
6123
softdep_setup_allocindir_meta(
6124
	struct buf *nbp,	/* newly allocated indirect block */
6125
	struct inode *ip,	/* inode for file being extended */
6126
	struct buf *bp,		/* indirect block referencing allocated block */
6127
	int ptrno,		/* offset of pointer in indirect block */
6128
	ufs2_daddr_t newblkno)	/* disk block number being added */
6129
{
6130
	struct inodedep *inodedep;
6131
	struct allocindir *aip;
6132
	struct ufsmount *ump;
6133
	ufs_lbn_t lbn;
6134

6135
	ump = ITOUMP(ip);
6136
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6137
	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
6138
	CTR3(KTR_SUJ,
6139
	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
6140
	    ip->i_number, newblkno, ptrno);
6141
	lbn = nbp->b_lblkno;
6142
	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
6143
	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
6144
	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
6145
	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6146
	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
6147
		panic("softdep_setup_allocindir_meta: Block already existed");
6148
	FREE_LOCK(ump);
6149
}
6150

6151
static void
6152
indirdep_complete(struct indirdep *indirdep)
6153
{
6154
	struct allocindir *aip;
6155

6156
	LIST_REMOVE(indirdep, ir_next);
6157
	indirdep->ir_state |= DEPCOMPLETE;
6158

6159
	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
6160
		LIST_REMOVE(aip, ai_next);
6161
		free_newblk(&aip->ai_block);
6162
	}
6163
	/*
6164
	 * If this indirdep is not attached to a buf it was simply waiting
6165
	 * on completion to clear completehd.  free_indirdep() asserts
6166
	 * that nothing is dangling.
6167
	 */
6168
	if ((indirdep->ir_state & ONWORKLIST) == 0)
6169
		free_indirdep(indirdep);
6170
}
6171

6172
static struct indirdep *
6173
indirdep_lookup(struct mount *mp,
6174
	struct inode *ip,
6175
	struct buf *bp)
6176
{
6177
	struct indirdep *indirdep, *newindirdep;
6178
	struct newblk *newblk;
6179
	struct ufsmount *ump;
6180
	struct worklist *wk;
6181
	struct fs *fs;
6182
	ufs2_daddr_t blkno;
6183

6184
	ump = VFSTOUFS(mp);
6185
	LOCK_OWNED(ump);
6186
	indirdep = NULL;
6187
	newindirdep = NULL;
6188
	fs = ump->um_fs;
6189
	for (;;) {
6190
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6191
			if (wk->wk_type != D_INDIRDEP)
6192
				continue;
6193
			indirdep = WK_INDIRDEP(wk);
6194
			break;
6195
		}
6196
		/* Found on the buffer worklist, no new structure to free. */
6197
		if (indirdep != NULL && newindirdep == NULL)
6198
			return (indirdep);
6199
		if (indirdep != NULL && newindirdep != NULL)
6200
			panic("indirdep_lookup: simultaneous create");
6201
		/* None found on the buffer and a new structure is ready. */
6202
		if (indirdep == NULL && newindirdep != NULL)
6203
			break;
6204
		/* None found and no new structure available. */
6205
		FREE_LOCK(ump);
6206
		newindirdep = malloc(sizeof(struct indirdep),
6207
		    M_INDIRDEP, M_SOFTDEP_FLAGS);
6208
		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
6209
		newindirdep->ir_state = ATTACHED;
6210
		if (I_IS_UFS1(ip))
6211
			newindirdep->ir_state |= UFS1FMT;
6212
		TAILQ_INIT(&newindirdep->ir_trunc);
6213
		newindirdep->ir_saveddata = NULL;
6214
		LIST_INIT(&newindirdep->ir_deplisthd);
6215
		LIST_INIT(&newindirdep->ir_donehd);
6216
		LIST_INIT(&newindirdep->ir_writehd);
6217
		LIST_INIT(&newindirdep->ir_completehd);
6218
		if (bp->b_blkno == bp->b_lblkno) {
6219
			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
6220
			    NULL, NULL);
6221
			bp->b_blkno = blkno;
6222
		}
6223
		newindirdep->ir_freeblks = NULL;
6224
		newindirdep->ir_savebp =
6225
		    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
6226
		newindirdep->ir_bp = bp;
6227
		BUF_KERNPROC(newindirdep->ir_savebp);
6228
		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
6229
		ACQUIRE_LOCK(ump);
6230
	}
6231
	indirdep = newindirdep;
6232
	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
6233
	/*
6234
	 * If the block is not yet allocated we don't set DEPCOMPLETE so
6235
	 * that we don't free dependencies until the pointers are valid.
6236
	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
6237
	 * than using the hash.
6238
	 */
6239
	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
6240
		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
6241
	else
6242
		indirdep->ir_state |= DEPCOMPLETE;
6243
	return (indirdep);
6244
}
6245

6246
/*
6247
 * Called to finish the allocation of the "aip" allocated
6248
 * by one of the two routines above.
6249
 */
6250
static struct freefrag *
6251
setup_allocindir_phase2(
6252
	struct buf *bp,		/* in-memory copy of the indirect block */
6253
	struct inode *ip,	/* inode for file being extended */
6254
	struct inodedep *inodedep, /* Inodedep for ip */
6255
	struct allocindir *aip,	/* allocindir allocated by the above routines */
6256
	ufs_lbn_t lbn)		/* Logical block number for this block. */
6257
{
6258
	struct fs *fs __diagused;
6259
	struct indirdep *indirdep;
6260
	struct allocindir *oldaip;
6261
	struct freefrag *freefrag;
6262
	struct mount *mp;
6263
	struct ufsmount *ump;
6264

6265
	mp = ITOVFS(ip);
6266
	ump = VFSTOUFS(mp);
6267
	LOCK_OWNED(ump);
6268
	fs = ump->um_fs;
6269
	if (bp->b_lblkno >= 0)
6270
		panic("setup_allocindir_phase2: not indir blk");
6271
	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6272
	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6273
	indirdep = indirdep_lookup(mp, ip, bp);
6274
	KASSERT(indirdep->ir_savebp != NULL,
6275
	    ("setup_allocindir_phase2 NULL ir_savebp"));
6276
	aip->ai_indirdep = indirdep;
6277
	/*
6278
	 * Check for an unwritten dependency for this indirect offset.  If
6279
	 * there is, merge the old dependency into the new one.  This happens
6280
	 * as a result of reallocblk only.
6281
	 */
6282
	freefrag = NULL;
6283
	if (aip->ai_oldblkno != 0) {
6284
		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6285
			if (oldaip->ai_offset == aip->ai_offset) {
6286
				freefrag = allocindir_merge(aip, oldaip);
6287
				goto done;
6288
			}
6289
		}
6290
		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6291
			if (oldaip->ai_offset == aip->ai_offset) {
6292
				freefrag = allocindir_merge(aip, oldaip);
6293
				goto done;
6294
			}
6295
		}
6296
	}
6297
done:
6298
	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6299
	return (freefrag);
6300
}
6301

6302
/*
6303
 * Merge two allocindirs which refer to the same block.  Move newblock
6304
 * dependencies and setup the freefrags appropriately.
6305
 */
6306
static struct freefrag *
6307
allocindir_merge(
6308
	struct allocindir *aip,
6309
	struct allocindir *oldaip)
6310
{
6311
	struct freefrag *freefrag;
6312
	struct worklist *wk;
6313

6314
	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6315
		panic("allocindir_merge: blkno");
6316
	aip->ai_oldblkno = oldaip->ai_oldblkno;
6317
	freefrag = aip->ai_freefrag;
6318
	aip->ai_freefrag = oldaip->ai_freefrag;
6319
	oldaip->ai_freefrag = NULL;
6320
	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6321
	/*
6322
	 * If we are tracking a new directory-block allocation,
6323
	 * move it from the old allocindir to the new allocindir.
6324
	 */
6325
	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6326
		WORKLIST_REMOVE(wk);
6327
		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6328
			panic("allocindir_merge: extra newdirblk");
6329
		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6330
	}
6331
	/*
6332
	 * We can skip journaling for this freefrag and just complete
6333
	 * any pending journal work for the allocindir that is being
6334
	 * removed after the freefrag completes.
6335
	 */
6336
	if (freefrag->ff_jdep)
6337
		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6338
	LIST_REMOVE(oldaip, ai_next);
6339
	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6340
	    &freefrag->ff_list, &freefrag->ff_jwork);
6341
	free_newblk(&oldaip->ai_block);
6342

6343
	return (freefrag);
6344
}
6345

6346
static inline void
6347
setup_freedirect(
6348
	struct freeblks *freeblks,
6349
	struct inode *ip,
6350
	int i,
6351
	int needj)
6352
{
6353
	struct ufsmount *ump;
6354
	ufs2_daddr_t blkno;
6355
	int frags;
6356

6357
	blkno = DIP(ip, i_db[i]);
6358
	if (blkno == 0)
6359
		return;
6360
	DIP_SET(ip, i_db[i], 0);
6361
	ump = ITOUMP(ip);
6362
	frags = sblksize(ump->um_fs, ip->i_size, i);
6363
	frags = numfrags(ump->um_fs, frags);
6364
	newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
6365
}
6366

6367
static inline void
6368
setup_freeext(
6369
	struct freeblks *freeblks,
6370
	struct inode *ip,
6371
	int i,
6372
	int needj)
6373
{
6374
	struct ufsmount *ump;
6375
	ufs2_daddr_t blkno;
6376
	int frags;
6377

6378
	blkno = ip->i_din2->di_extb[i];
6379
	if (blkno == 0)
6380
		return;
6381
	ip->i_din2->di_extb[i] = 0;
6382
	ump = ITOUMP(ip);
6383
	frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
6384
	frags = numfrags(ump->um_fs, frags);
6385
	newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6386
}
6387

6388
static inline void
6389
setup_freeindir(
6390
	struct freeblks *freeblks,
6391
	struct inode *ip,
6392
	int i,
6393
	ufs_lbn_t lbn,
6394
	int needj)
6395
{
6396
	struct ufsmount *ump;
6397
	ufs2_daddr_t blkno;
6398

6399
	blkno = DIP(ip, i_ib[i]);
6400
	if (blkno == 0)
6401
		return;
6402
	DIP_SET(ip, i_ib[i], 0);
6403
	ump = ITOUMP(ip);
6404
	newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
6405
	    0, needj);
6406
}
6407

6408
static inline struct freeblks *
6409
newfreeblks(struct mount *mp, struct inode *ip)
6410
{
6411
	struct freeblks *freeblks;
6412

6413
	freeblks = malloc(sizeof(struct freeblks),
6414
		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6415
	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6416
	LIST_INIT(&freeblks->fb_jblkdephd);
6417
	LIST_INIT(&freeblks->fb_jwork);
6418
	freeblks->fb_ref = 0;
6419
	freeblks->fb_cgwait = 0;
6420
	freeblks->fb_state = ATTACHED;
6421
	freeblks->fb_uid = ip->i_uid;
6422
	freeblks->fb_inum = ip->i_number;
6423
	freeblks->fb_vtype = ITOV(ip)->v_type;
6424
	freeblks->fb_modrev = DIP(ip, i_modrev);
6425
	freeblks->fb_devvp = ITODEVVP(ip);
6426
	freeblks->fb_chkcnt = 0;
6427
	freeblks->fb_len = 0;
6428

6429
	return (freeblks);
6430
}
6431

6432
static void
6433
trunc_indirdep(
6434
	struct indirdep *indirdep,
6435
	struct freeblks *freeblks,
6436
	struct buf *bp,
6437
	int off)
6438
{
6439
	struct allocindir *aip, *aipn;
6440

6441
	/*
6442
	 * The first set of allocindirs won't be in savedbp.
6443
	 */
6444
	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6445
		if (aip->ai_offset > off)
6446
			cancel_allocindir(aip, bp, freeblks, 1);
6447
	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6448
		if (aip->ai_offset > off)
6449
			cancel_allocindir(aip, bp, freeblks, 1);
6450
	/*
6451
	 * These will exist in savedbp.
6452
	 */
6453
	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6454
		if (aip->ai_offset > off)
6455
			cancel_allocindir(aip, NULL, freeblks, 0);
6456
	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6457
		if (aip->ai_offset > off)
6458
			cancel_allocindir(aip, NULL, freeblks, 0);
6459
}
6460

6461
/*
6462
 * Follow the chain of indirects down to lastlbn creating a freework
6463
 * structure for each.  This will be used to start indir_trunc() at
6464
 * the right offset and create the journal records for the parrtial
6465
 * truncation.  A second step will handle the truncated dependencies.
6466
 */
6467
static int
6468
setup_trunc_indir(
6469
	struct freeblks *freeblks,
6470
	struct inode *ip,
6471
	ufs_lbn_t lbn,
6472
	ufs_lbn_t lastlbn,
6473
	ufs2_daddr_t blkno)
6474
{
6475
	struct indirdep *indirdep;
6476
	struct indirdep *indirn;
6477
	struct freework *freework;
6478
	struct newblk *newblk;
6479
	struct mount *mp;
6480
	struct ufsmount *ump;
6481
	struct buf *bp;
6482
	uint8_t *start;
6483
	uint8_t *end;
6484
	ufs_lbn_t lbnadd;
6485
	int level;
6486
	int error;
6487
	int off;
6488

6489
	freework = NULL;
6490
	if (blkno == 0)
6491
		return (0);
6492
	mp = freeblks->fb_list.wk_mp;
6493
	ump = VFSTOUFS(mp);
6494
	/*
6495
	 * Here, calls to VOP_BMAP() will fail.  However, we already have
6496
	 * the on-disk address, so we just pass it to bread() instead of
6497
	 * having bread() attempt to calculate it using VOP_BMAP().
6498
	 */
6499
	error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno),
6500
	    (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
6501
	if (error)
6502
		return (error);
6503
	level = lbn_level(lbn);
6504
	lbnadd = lbn_offset(ump->um_fs, level);
6505
	/*
6506
	 * Compute the offset of the last block we want to keep.  Store
6507
	 * in the freework the first block we want to completely free.
6508
	 */
6509
	off = (lastlbn - -(lbn + level)) / lbnadd;
6510
	if (off + 1 == NINDIR(ump->um_fs))
6511
		goto nowork;
6512
	freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
6513
	/*
6514
	 * Link the freework into the indirdep.  This will prevent any new
6515
	 * allocations from proceeding until we are finished with the
6516
	 * truncate and the block is written.
6517
	 */
6518
	ACQUIRE_LOCK(ump);
6519
	indirdep = indirdep_lookup(mp, ip, bp);
6520
	if (indirdep->ir_freeblks)
6521
		panic("setup_trunc_indir: indirdep already truncated.");
6522
	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6523
	freework->fw_indir = indirdep;
6524
	/*
6525
	 * Cancel any allocindirs that will not make it to disk.
6526
	 * We have to do this for all copies of the indirdep that
6527
	 * live on this newblk.
6528
	 */
6529
	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6530
		if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
6531
		    &newblk) == 0)
6532
			panic("setup_trunc_indir: lost block");
6533
		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6534
			trunc_indirdep(indirn, freeblks, bp, off);
6535
	} else
6536
		trunc_indirdep(indirdep, freeblks, bp, off);
6537
	FREE_LOCK(ump);
6538
	/*
6539
	 * Creation is protected by the buf lock. The saveddata is only
6540
	 * needed if a full truncation follows a partial truncation but it
6541
	 * is difficult to allocate in that case so we fetch it anyway.
6542
	 */
6543
	if (indirdep->ir_saveddata == NULL)
6544
		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6545
		    M_SOFTDEP_FLAGS);
6546
nowork:
6547
	/* Fetch the blkno of the child and the zero start offset. */
6548
	if (I_IS_UFS1(ip)) {
6549
		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6550
		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6551
	} else {
6552
		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6553
		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6554
	}
6555
	if (freework) {
6556
		/* Zero the truncated pointers. */
6557
		end = bp->b_data + bp->b_bcount;
6558
		bzero(start, end - start);
6559
		bdwrite(bp);
6560
	} else
6561
		bqrelse(bp);
6562
	if (level == 0)
6563
		return (0);
6564
	lbn++; /* adjust level */
6565
	lbn -= (off * lbnadd);
6566
	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6567
}
6568

6569
/*
6570
 * Complete the partial truncation of an indirect block setup by
6571
 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6572
 * copy and writes them to disk before the freeblks is allowed to complete.
6573
 */
6574
static void
6575
complete_trunc_indir(struct freework *freework)
6576
{
6577
	struct freework *fwn;
6578
	struct indirdep *indirdep;
6579
	struct ufsmount *ump;
6580
	struct buf *bp;
6581
	uintptr_t start;
6582
	int count;
6583

6584
	ump = VFSTOUFS(freework->fw_list.wk_mp);
6585
	LOCK_OWNED(ump);
6586
	indirdep = freework->fw_indir;
6587
	for (;;) {
6588
		bp = indirdep->ir_bp;
6589
		/* See if the block was discarded. */
6590
		if (bp == NULL)
6591
			break;
6592
		/* Inline part of getdirtybuf().  We dont want bremfree. */
6593
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6594
			break;
6595
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6596
		    LOCK_PTR(ump)) == 0)
6597
			BUF_UNLOCK(bp);
6598
		ACQUIRE_LOCK(ump);
6599
	}
6600
	freework->fw_state |= DEPCOMPLETE;
6601
	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6602
	/*
6603
	 * Zero the pointers in the saved copy.
6604
	 */
6605
	if (indirdep->ir_state & UFS1FMT)
6606
		start = sizeof(ufs1_daddr_t);
6607
	else
6608
		start = sizeof(ufs2_daddr_t);
6609
	start *= freework->fw_start;
6610
	count = indirdep->ir_savebp->b_bcount - start;
6611
	start += (uintptr_t)indirdep->ir_savebp->b_data;
6612
	bzero((char *)start, count);
6613
	/*
6614
	 * We need to start the next truncation in the list if it has not
6615
	 * been started yet.
6616
	 */
6617
	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6618
	if (fwn != NULL) {
6619
		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6620
			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6621
		if ((fwn->fw_state & ONWORKLIST) == 0)
6622
			freework_enqueue(fwn);
6623
	}
6624
	/*
6625
	 * If bp is NULL the block was fully truncated, restore
6626
	 * the saved block list otherwise free it if it is no
6627
	 * longer needed.
6628
	 */
6629
	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6630
		if (bp == NULL)
6631
			bcopy(indirdep->ir_saveddata,
6632
			    indirdep->ir_savebp->b_data,
6633
			    indirdep->ir_savebp->b_bcount);
6634
		free(indirdep->ir_saveddata, M_INDIRDEP);
6635
		indirdep->ir_saveddata = NULL;
6636
	}
6637
	/*
6638
	 * When bp is NULL there is a full truncation pending.  We
6639
	 * must wait for this full truncation to be journaled before
6640
	 * we can release this freework because the disk pointers will
6641
	 * never be written as zero.
6642
	 */
6643
	if (bp == NULL)  {
6644
		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6645
			handle_written_freework(freework);
6646
		else
6647
			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6648
			   &freework->fw_list);
6649
		if (fwn == NULL) {
6650
			freework->fw_indir = (void *)0x0000deadbeef0000;
6651
			bp = indirdep->ir_savebp;
6652
			indirdep->ir_savebp = NULL;
6653
			free_indirdep(indirdep);
6654
			FREE_LOCK(ump);
6655
			brelse(bp);
6656
			ACQUIRE_LOCK(ump);
6657
		}
6658
	} else {
6659
		/* Complete when the real copy is written. */
6660
		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6661
		BUF_UNLOCK(bp);
6662
	}
6663
}
6664

6665
/*
6666
 * Calculate the number of blocks we are going to release where datablocks
6667
 * is the current total and length is the new file size.
6668
 */
6669
static ufs2_daddr_t
6670
blkcount(struct fs *fs,
6671
	ufs2_daddr_t datablocks,
6672
	off_t length)
6673
{
6674
	off_t totblks, numblks;
6675

6676
	totblks = 0;
6677
	numblks = howmany(length, fs->fs_bsize);
6678
	if (numblks <= UFS_NDADDR) {
6679
		totblks = howmany(length, fs->fs_fsize);
6680
		goto out;
6681
	}
6682
        totblks = blkstofrags(fs, numblks);
6683
	numblks -= UFS_NDADDR;
6684
	/*
6685
	 * Count all single, then double, then triple indirects required.
6686
	 * Subtracting one indirects worth of blocks for each pass
6687
	 * acknowledges one of each pointed to by the inode.
6688
	 */
6689
	for (;;) {
6690
		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6691
		numblks -= NINDIR(fs);
6692
		if (numblks <= 0)
6693
			break;
6694
		numblks = howmany(numblks, NINDIR(fs));
6695
	}
6696
out:
6697
	totblks = fsbtodb(fs, totblks);
6698
	/*
6699
	 * Handle sparse files.  We can't reclaim more blocks than the inode
6700
	 * references.  We will correct it later in handle_complete_freeblks()
6701
	 * when we know the real count.
6702
	 */
6703
	if (totblks > datablocks)
6704
		return (0);
6705
	return (datablocks - totblks);
6706
}
6707

6708
/*
6709
 * Handle freeblocks for journaled softupdate filesystems.
6710
 *
6711
 * Contrary to normal softupdates, we must preserve the block pointers in
6712
 * indirects until their subordinates are free.  This is to avoid journaling
6713
 * every block that is freed which may consume more space than the journal
6714
 * itself.  The recovery program will see the free block journals at the
6715
 * base of the truncated area and traverse them to reclaim space.  The
6716
 * pointers in the inode may be cleared immediately after the journal
6717
 * records are written because each direct and indirect pointer in the
6718
 * inode is recorded in a journal.  This permits full truncation to proceed
6719
 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6720
 *
6721
 * The algorithm is as follows:
6722
 * 1) Traverse the in-memory state and create journal entries to release
6723
 *    the relevant blocks and full indirect trees.
6724
 * 2) Traverse the indirect block chain adding partial truncation freework
6725
 *    records to indirects in the path to lastlbn.  The freework will
6726
 *    prevent new allocation dependencies from being satisfied in this
6727
 *    indirect until the truncation completes.
6728
 * 3) Read and lock the inode block, performing an update with the new size
6729
 *    and pointers.  This prevents truncated data from becoming valid on
6730
 *    disk through step 4.
6731
 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6732
 *    eliminate journal work for those records that do not require it.
6733
 * 5) Schedule the journal records to be written followed by the inode block.
6734
 * 6) Allocate any necessary frags for the end of file.
6735
 * 7) Zero any partially truncated blocks.
6736
 *
6737
 * From this truncation proceeds asynchronously using the freework and
6738
 * indir_trunc machinery.  The file will not be extended again into a
6739
 * partially truncated indirect block until all work is completed but
6740
 * the normal dependency mechanism ensures that it is rolled back/forward
6741
 * as appropriate.  Further truncation may occur without delay and is
6742
 * serialized in indir_trunc().
6743
 */
6744
void
6745
softdep_journal_freeblocks(
6746
	struct inode *ip,	/* The inode whose length is to be reduced */
6747
	struct ucred *cred,
6748
	off_t length,		/* The new length for the file */
6749
	int flags)		/* IO_EXT and/or IO_NORMAL */
6750
{
6751
	struct freeblks *freeblks, *fbn;
6752
	struct worklist *wk, *wkn;
6753
	struct inodedep *inodedep;
6754
	struct jblkdep *jblkdep;
6755
	struct allocdirect *adp, *adpn;
6756
	struct ufsmount *ump;
6757
	struct fs *fs;
6758
	struct buf *bp;
6759
	struct vnode *vp;
6760
	struct mount *mp;
6761
	daddr_t dbn;
6762
	ufs2_daddr_t extblocks, datablocks;
6763
	ufs_lbn_t tmpval, lbn, lastlbn;
6764
	int frags, lastoff, iboff, allocblock, needj, error, i;
6765

6766
	ump = ITOUMP(ip);
6767
	mp = UFSTOVFS(ump);
6768
	fs = ump->um_fs;
6769
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6770
	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6771
	vp = ITOV(ip);
6772
	needj = 1;
6773
	iboff = -1;
6774
	allocblock = 0;
6775
	extblocks = 0;
6776
	datablocks = 0;
6777
	frags = 0;
6778
	freeblks = newfreeblks(mp, ip);
6779
	ACQUIRE_LOCK(ump);
6780
	/*
6781
	 * If we're truncating a removed file that will never be written
6782
	 * we don't need to journal the block frees.  The canceled journals
6783
	 * for the allocations will suffice.
6784
	 */
6785
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6786
	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6787
	    length == 0)
6788
		needj = 0;
6789
	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6790
	    ip->i_number, length, needj);
6791
	FREE_LOCK(ump);
6792
	/*
6793
	 * Calculate the lbn that we are truncating to.  This results in -1
6794
	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6795
	 * to keep, not the first lbn we want to truncate.
6796
	 */
6797
	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6798
	lastoff = blkoff(fs, length);
6799
	/*
6800
	 * Compute frags we are keeping in lastlbn.  0 means all.
6801
	 */
6802
	if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
6803
		frags = fragroundup(fs, lastoff);
6804
		/* adp offset of last valid allocdirect. */
6805
		iboff = lastlbn;
6806
	} else if (lastlbn > 0)
6807
		iboff = UFS_NDADDR;
6808
	if (fs->fs_magic == FS_UFS2_MAGIC)
6809
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6810
	/*
6811
	 * Handle normal data blocks and indirects.  This section saves
6812
	 * values used after the inode update to complete frag and indirect
6813
	 * truncation.
6814
	 */
6815
	if ((flags & IO_NORMAL) != 0) {
6816
		/*
6817
		 * Handle truncation of whole direct and indirect blocks.
6818
		 */
6819
		for (i = iboff + 1; i < UFS_NDADDR; i++)
6820
			setup_freedirect(freeblks, ip, i, needj);
6821
		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6822
		    i < UFS_NIADDR;
6823
		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6824
			/* Release a whole indirect tree. */
6825
			if (lbn > lastlbn) {
6826
				setup_freeindir(freeblks, ip, i, -lbn -i,
6827
				    needj);
6828
				continue;
6829
			}
6830
			iboff = i + UFS_NDADDR;
6831
			/*
6832
			 * Traverse partially truncated indirect tree.
6833
			 */
6834
			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6835
				setup_trunc_indir(freeblks, ip, -lbn - i,
6836
				    lastlbn, DIP(ip, i_ib[i]));
6837
		}
6838
		/*
6839
		 * Handle partial truncation to a frag boundary.
6840
		 */
6841
		if (frags) {
6842
			ufs2_daddr_t blkno;
6843
			long oldfrags;
6844

6845
			oldfrags = blksize(fs, ip, lastlbn);
6846
			blkno = DIP(ip, i_db[lastlbn]);
6847
			if (blkno && oldfrags != frags) {
6848
				oldfrags -= frags;
6849
				oldfrags = numfrags(fs, oldfrags);
6850
				blkno += numfrags(fs, frags);
6851
				newfreework(ump, freeblks, NULL, lastlbn,
6852
				    blkno, oldfrags, 0, needj);
6853
				if (needj)
6854
					adjust_newfreework(freeblks,
6855
					    numfrags(fs, frags));
6856
			} else if (blkno == 0)
6857
				allocblock = 1;
6858
		}
6859
		/*
6860
		 * Add a journal record for partial truncate if we are
6861
		 * handling indirect blocks.  Non-indirects need no extra
6862
		 * journaling.
6863
		 */
6864
		if (length != 0 && lastlbn >= UFS_NDADDR) {
6865
			UFS_INODE_SET_FLAG(ip, IN_TRUNCATED);
6866
			newjtrunc(freeblks, length, 0);
6867
		}
6868
		ip->i_size = length;
6869
		DIP_SET(ip, i_size, ip->i_size);
6870
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
6871
		datablocks = DIP(ip, i_blocks) - extblocks;
6872
		if (length != 0)
6873
			datablocks = blkcount(fs, datablocks, length);
6874
		freeblks->fb_len = length;
6875
	}
6876
	if ((flags & IO_EXT) != 0) {
6877
		for (i = 0; i < UFS_NXADDR; i++)
6878
			setup_freeext(freeblks, ip, i, needj);
6879
		ip->i_din2->di_extsize = 0;
6880
		datablocks += extblocks;
6881
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
6882
	}
6883
#ifdef QUOTA
6884
	/* Reference the quotas in case the block count is wrong in the end. */
6885
	quotaref(vp, freeblks->fb_quota);
6886
	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
6887
#endif
6888
	freeblks->fb_chkcnt = -datablocks;
6889
	UFS_LOCK(ump);
6890
	fs->fs_pendingblocks += datablocks;
6891
	UFS_UNLOCK(ump);
6892
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6893
	/*
6894
	 * Handle truncation of incomplete alloc direct dependencies.  We
6895
	 * hold the inode block locked to prevent incomplete dependencies
6896
	 * from reaching the disk while we are eliminating those that
6897
	 * have been truncated.  This is a partially inlined ffs_update().
6898
	 */
6899
	ufs_itimes(vp);
6900
	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6901
	dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number));
6902
	error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
6903
	    NULL, NULL, 0, cred, 0, NULL, &bp);
6904
	if (error) {
6905
		softdep_error("softdep_journal_freeblocks", error);
6906
		return;
6907
	}
6908
	if (bp->b_bufsize == fs->fs_bsize)
6909
		bp->b_flags |= B_CLUSTEROK;
6910
	softdep_update_inodeblock(ip, bp, 0);
6911
	if (ump->um_fstype == UFS1) {
6912
		*((struct ufs1_dinode *)bp->b_data +
6913
		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6914
	} else {
6915
		ffs_update_dinode_ckhash(fs, ip->i_din2);
6916
		*((struct ufs2_dinode *)bp->b_data +
6917
		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6918
	}
6919
	ACQUIRE_LOCK(ump);
6920
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6921
	if ((inodedep->id_state & IOSTARTED) != 0)
6922
		panic("softdep_setup_freeblocks: inode busy");
6923
	/*
6924
	 * Add the freeblks structure to the list of operations that
6925
	 * must await the zero'ed inode being written to disk. If we
6926
	 * still have a bitmap dependency (needj), then the inode
6927
	 * has never been written to disk, so we can process the
6928
	 * freeblks below once we have deleted the dependencies.
6929
	 */
6930
	if (needj)
6931
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6932
	else
6933
		freeblks->fb_state |= COMPLETE;
6934
	if ((flags & IO_NORMAL) != 0) {
6935
		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6936
			if (adp->ad_offset > iboff)
6937
				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6938
				    freeblks);
6939
			/*
6940
			 * Truncate the allocdirect.  We could eliminate
6941
			 * or modify journal records as well.
6942
			 */
6943
			else if (adp->ad_offset == iboff && frags)
6944
				adp->ad_newsize = frags;
6945
		}
6946
	}
6947
	if ((flags & IO_EXT) != 0)
6948
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6949
			cancel_allocdirect(&inodedep->id_extupdt, adp,
6950
			    freeblks);
6951
	/*
6952
	 * Scan the bufwait list for newblock dependencies that will never
6953
	 * make it to disk.
6954
	 */
6955
	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6956
		if (wk->wk_type != D_ALLOCDIRECT)
6957
			continue;
6958
		adp = WK_ALLOCDIRECT(wk);
6959
		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6960
		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6961
			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6962
			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6963
			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6964
		}
6965
	}
6966
	/*
6967
	 * Add journal work.
6968
	 */
6969
	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6970
		add_to_journal(&jblkdep->jb_list);
6971
	FREE_LOCK(ump);
6972
	bdwrite(bp);
6973
	/*
6974
	 * Truncate dependency structures beyond length.
6975
	 */
6976
	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6977
	/*
6978
	 * This is only set when we need to allocate a fragment because
6979
	 * none existed at the end of a frag-sized file.  It handles only
6980
	 * allocating a new, zero filled block.
6981
	 */
6982
	if (allocblock) {
6983
		ip->i_size = length - lastoff;
6984
		DIP_SET(ip, i_size, ip->i_size);
6985
		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6986
		if (error != 0) {
6987
			softdep_error("softdep_journal_freeblks", error);
6988
			return;
6989
		}
6990
		ip->i_size = length;
6991
		DIP_SET(ip, i_size, length);
6992
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
6993
		allocbuf(bp, frags);
6994
		ffs_update(vp, 0);
6995
		bawrite(bp);
6996
	} else if (lastoff != 0 && vp->v_type != VDIR) {
6997
		int size;
6998

6999
		/*
7000
		 * Zero the end of a truncated frag or block.
7001
		 */
7002
		size = sblksize(fs, length, lastlbn);
7003
		error = bread(vp, lastlbn, size, cred, &bp);
7004
		if (error == 0) {
7005
			bzero((char *)bp->b_data + lastoff, size - lastoff);
7006
			bawrite(bp);
7007
		} else if (!ffs_fsfail_cleanup(ump, error)) {
7008
			softdep_error("softdep_journal_freeblks", error);
7009
			return;
7010
		}
7011
	}
7012
	ACQUIRE_LOCK(ump);
7013
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7014
	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
7015
	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
7016
	/*
7017
	 * We zero earlier truncations so they don't erroneously
7018
	 * update i_blocks.
7019
	 */
7020
	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
7021
		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
7022
			fbn->fb_len = 0;
7023
	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
7024
	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7025
		freeblks->fb_state |= INPROGRESS;
7026
	else
7027
		freeblks = NULL;
7028
	FREE_LOCK(ump);
7029
	if (freeblks)
7030
		handle_workitem_freeblocks(freeblks, 0);
7031
	trunc_pages(ip, length, extblocks, flags);
7032

7033
}
7034

7035
/*
7036
 * Flush a JOP_SYNC to the journal.
7037
 */
7038
void
7039
softdep_journal_fsync(struct inode *ip)
7040
{
7041
	struct jfsync *jfsync;
7042
	struct ufsmount *ump;
7043

7044
	ump = ITOUMP(ip);
7045
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7046
	    ("softdep_journal_fsync called on non-softdep filesystem"));
7047
	if ((ip->i_flag & IN_TRUNCATED) == 0)
7048
		return;
7049
	ip->i_flag &= ~IN_TRUNCATED;
7050
	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
7051
	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
7052
	jfsync->jfs_size = ip->i_size;
7053
	jfsync->jfs_ino = ip->i_number;
7054
	ACQUIRE_LOCK(ump);
7055
	add_to_journal(&jfsync->jfs_list);
7056
	jwait(&jfsync->jfs_list, MNT_WAIT);
7057
	FREE_LOCK(ump);
7058
}
7059

7060
/*
7061
 * Block de-allocation dependencies.
7062
 * 
7063
 * When blocks are de-allocated, the on-disk pointers must be nullified before
7064
 * the blocks are made available for use by other files.  (The true
7065
 * requirement is that old pointers must be nullified before new on-disk
7066
 * pointers are set.  We chose this slightly more stringent requirement to
7067
 * reduce complexity.) Our implementation handles this dependency by updating
7068
 * the inode (or indirect block) appropriately but delaying the actual block
7069
 * de-allocation (i.e., freemap and free space count manipulation) until
7070
 * after the updated versions reach stable storage.  After the disk is
7071
 * updated, the blocks can be safely de-allocated whenever it is convenient.
7072
 * This implementation handles only the common case of reducing a file's
7073
 * length to zero. Other cases are handled by the conventional synchronous
7074
 * write approach.
7075
 *
7076
 * The ffs implementation with which we worked double-checks
7077
 * the state of the block pointers and file size as it reduces
7078
 * a file's length.  Some of this code is replicated here in our
7079
 * soft updates implementation.  The freeblks->fb_chkcnt field is
7080
 * used to transfer a part of this information to the procedure
7081
 * that eventually de-allocates the blocks.
7082
 *
7083
 * This routine should be called from the routine that shortens
7084
 * a file's length, before the inode's size or block pointers
7085
 * are modified. It will save the block pointer information for
7086
 * later release and zero the inode so that the calling routine
7087
 * can release it.
7088
 */
7089
void
7090
softdep_setup_freeblocks(
7091
	struct inode *ip,	/* The inode whose length is to be reduced */
7092
	off_t length,		/* The new length for the file */
7093
	int flags)		/* IO_EXT and/or IO_NORMAL */
7094
{
7095
	struct ufs1_dinode *dp1;
7096
	struct ufs2_dinode *dp2;
7097
	struct freeblks *freeblks;
7098
	struct inodedep *inodedep;
7099
	struct allocdirect *adp;
7100
	struct ufsmount *ump;
7101
	struct buf *bp;
7102
	struct fs *fs;
7103
	ufs2_daddr_t extblocks, datablocks;
7104
	struct mount *mp;
7105
	int i, delay, error;
7106
	ufs_lbn_t tmpval;
7107
	ufs_lbn_t lbn;
7108

7109
	ump = ITOUMP(ip);
7110
	mp = UFSTOVFS(ump);
7111
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
7112
	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
7113
	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
7114
	    ip->i_number, length);
7115
	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
7116
	fs = ump->um_fs;
7117
	if ((error = bread(ump->um_devvp,
7118
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
7119
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
7120
		if (!ffs_fsfail_cleanup(ump, error))
7121
			softdep_error("softdep_setup_freeblocks", error);
7122
		return;
7123
	}
7124
	freeblks = newfreeblks(mp, ip);
7125
	extblocks = 0;
7126
	datablocks = 0;
7127
	if (fs->fs_magic == FS_UFS2_MAGIC)
7128
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
7129
	if ((flags & IO_NORMAL) != 0) {
7130
		for (i = 0; i < UFS_NDADDR; i++)
7131
			setup_freedirect(freeblks, ip, i, 0);
7132
		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
7133
		    i < UFS_NIADDR;
7134
		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
7135
			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
7136
		ip->i_size = 0;
7137
		DIP_SET(ip, i_size, 0);
7138
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7139
		datablocks = DIP(ip, i_blocks) - extblocks;
7140
	}
7141
	if ((flags & IO_EXT) != 0) {
7142
		for (i = 0; i < UFS_NXADDR; i++)
7143
			setup_freeext(freeblks, ip, i, 0);
7144
		ip->i_din2->di_extsize = 0;
7145
		datablocks += extblocks;
7146
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7147
	}
7148
#ifdef QUOTA
7149
	/* Reference the quotas in case the block count is wrong in the end. */
7150
	quotaref(ITOV(ip), freeblks->fb_quota);
7151
	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
7152
#endif
7153
	freeblks->fb_chkcnt = -datablocks;
7154
	UFS_LOCK(ump);
7155
	fs->fs_pendingblocks += datablocks;
7156
	UFS_UNLOCK(ump);
7157
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
7158
	/*
7159
	 * Push the zero'ed inode to its disk buffer so that we are free
7160
	 * to delete its dependencies below. Once the dependencies are gone
7161
	 * the buffer can be safely released.
7162
	 */
7163
	if (ump->um_fstype == UFS1) {
7164
		dp1 = ((struct ufs1_dinode *)bp->b_data +
7165
		    ino_to_fsbo(fs, ip->i_number));
7166
		ip->i_din1->di_freelink = dp1->di_freelink;
7167
		*dp1 = *ip->i_din1;
7168
	} else {
7169
		dp2 = ((struct ufs2_dinode *)bp->b_data +
7170
		    ino_to_fsbo(fs, ip->i_number));
7171
		ip->i_din2->di_freelink = dp2->di_freelink;
7172
		ffs_update_dinode_ckhash(fs, ip->i_din2);
7173
		*dp2 = *ip->i_din2;
7174
	}
7175
	/*
7176
	 * Find and eliminate any inode dependencies.
7177
	 */
7178
	ACQUIRE_LOCK(ump);
7179
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7180
	if ((inodedep->id_state & IOSTARTED) != 0)
7181
		panic("softdep_setup_freeblocks: inode busy");
7182
	/*
7183
	 * Add the freeblks structure to the list of operations that
7184
	 * must await the zero'ed inode being written to disk. If we
7185
	 * still have a bitmap dependency (delay == 0), then the inode
7186
	 * has never been written to disk, so we can process the
7187
	 * freeblks below once we have deleted the dependencies.
7188
	 */
7189
	delay = (inodedep->id_state & DEPCOMPLETE);
7190
	if (delay)
7191
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
7192
	else
7193
		freeblks->fb_state |= COMPLETE;
7194
	/*
7195
	 * Because the file length has been truncated to zero, any
7196
	 * pending block allocation dependency structures associated
7197
	 * with this inode are obsolete and can simply be de-allocated.
7198
	 * We must first merge the two dependency lists to get rid of
7199
	 * any duplicate freefrag structures, then purge the merged list.
7200
	 * If we still have a bitmap dependency, then the inode has never
7201
	 * been written to disk, so we can free any fragments without delay.
7202
	 */
7203
	if (flags & IO_NORMAL) {
7204
		merge_inode_lists(&inodedep->id_newinoupdt,
7205
		    &inodedep->id_inoupdt);
7206
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
7207
			cancel_allocdirect(&inodedep->id_inoupdt, adp,
7208
			    freeblks);
7209
	}
7210
	if (flags & IO_EXT) {
7211
		merge_inode_lists(&inodedep->id_newextupdt,
7212
		    &inodedep->id_extupdt);
7213
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
7214
			cancel_allocdirect(&inodedep->id_extupdt, adp,
7215
			    freeblks);
7216
	}
7217
	FREE_LOCK(ump);
7218
	bdwrite(bp);
7219
	trunc_dependencies(ip, freeblks, -1, 0, flags);
7220
	ACQUIRE_LOCK(ump);
7221
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
7222
		(void) free_inodedep(inodedep);
7223
	freeblks->fb_state |= DEPCOMPLETE;
7224
	/*
7225
	 * If the inode with zeroed block pointers is now on disk
7226
	 * we can start freeing blocks.
7227
	 */  
7228
	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
7229
		freeblks->fb_state |= INPROGRESS;
7230
	else
7231
		freeblks = NULL;
7232
	FREE_LOCK(ump);
7233
	if (freeblks)
7234
		handle_workitem_freeblocks(freeblks, 0);
7235
	trunc_pages(ip, length, extblocks, flags);
7236
}
7237

7238
/*
7239
 * Eliminate pages from the page cache that back parts of this inode and
7240
 * adjust the vnode pager's idea of our size.  This prevents stale data
7241
 * from hanging around in the page cache.
7242
 */
7243
static void
7244
trunc_pages(
7245
	struct inode *ip,
7246
	off_t length,
7247
	ufs2_daddr_t extblocks,
7248
	int flags)
7249
{
7250
	struct vnode *vp;
7251
	struct fs *fs;
7252
	ufs_lbn_t lbn;
7253
	off_t end, extend;
7254

7255
	vp = ITOV(ip);
7256
	fs = ITOFS(ip);
7257
	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
7258
	if ((flags & IO_EXT) != 0)
7259
		vn_pages_remove(vp, extend, 0);
7260
	if ((flags & IO_NORMAL) == 0)
7261
		return;
7262
	BO_LOCK(&vp->v_bufobj);
7263
	drain_output(vp);
7264
	BO_UNLOCK(&vp->v_bufobj);
7265
	/*
7266
	 * The vnode pager eliminates file pages we eliminate indirects
7267
	 * below.
7268
	 */
7269
	vnode_pager_setsize(vp, length);
7270
	/*
7271
	 * Calculate the end based on the last indirect we want to keep.  If
7272
	 * the block extends into indirects we can just use the negative of
7273
	 * its lbn.  Doubles and triples exist at lower numbers so we must
7274
	 * be careful not to remove those, if they exist.  double and triple
7275
	 * indirect lbns do not overlap with others so it is not important
7276
	 * to verify how many levels are required.
7277
	 */
7278
	lbn = lblkno(fs, length);
7279
	if (lbn >= UFS_NDADDR) {
7280
		/* Calculate the virtual lbn of the triple indirect. */
7281
		lbn = -lbn - (UFS_NIADDR - 1);
7282
		end = OFF_TO_IDX(lblktosize(fs, lbn));
7283
	} else
7284
		end = extend;
7285
	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7286
}
7287

7288
/*
7289
 * See if the buf bp is in the range eliminated by truncation.
7290
 */
7291
static int
7292
trunc_check_buf(
7293
	struct buf *bp,
7294
	int *blkoffp,
7295
	ufs_lbn_t lastlbn,
7296
	int lastoff,
7297
	int flags)
7298
{
7299
	ufs_lbn_t lbn;
7300

7301
	*blkoffp = 0;
7302
	/* Only match ext/normal blocks as appropriate. */
7303
	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7304
	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7305
		return (0);
7306
	/* ALTDATA is always a full truncation. */
7307
	if ((bp->b_xflags & BX_ALTDATA) != 0)
7308
		return (1);
7309
	/* -1 is full truncation. */
7310
	if (lastlbn == -1)
7311
		return (1);
7312
	/*
7313
	 * If this is a partial truncate we only want those
7314
	 * blocks and indirect blocks that cover the range
7315
	 * we're after.
7316
	 */
7317
	lbn = bp->b_lblkno;
7318
	if (lbn < 0)
7319
		lbn = -(lbn + lbn_level(lbn));
7320
	if (lbn < lastlbn)
7321
		return (0);
7322
	/* Here we only truncate lblkno if it's partial. */
7323
	if (lbn == lastlbn) {
7324
		if (lastoff == 0)
7325
			return (0);
7326
		*blkoffp = lastoff;
7327
	}
7328
	return (1);
7329
}
7330

7331
/*
7332
 * Eliminate any dependencies that exist in memory beyond lblkno:off
7333
 */
7334
static void
7335
trunc_dependencies(
7336
	struct inode *ip,
7337
	struct freeblks *freeblks,
7338
	ufs_lbn_t lastlbn,
7339
	int lastoff,
7340
	int flags)
7341
{
7342
	struct bufobj *bo;
7343
	struct vnode *vp;
7344
	struct buf *bp;
7345
	int blkoff;
7346

7347
	/*
7348
	 * We must wait for any I/O in progress to finish so that
7349
	 * all potential buffers on the dirty list will be visible.
7350
	 * Once they are all there, walk the list and get rid of
7351
	 * any dependencies.
7352
	 */
7353
	vp = ITOV(ip);
7354
	bo = &vp->v_bufobj;
7355
	BO_LOCK(bo);
7356
	drain_output(vp);
7357
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7358
		bp->b_vflags &= ~BV_SCANNED;
7359
restart:
7360
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7361
		if (bp->b_vflags & BV_SCANNED)
7362
			continue;
7363
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7364
			bp->b_vflags |= BV_SCANNED;
7365
			continue;
7366
		}
7367
		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7368
		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7369
			goto restart;
7370
		BO_UNLOCK(bo);
7371
		if (deallocate_dependencies(bp, freeblks, blkoff))
7372
			bqrelse(bp);
7373
		else
7374
			brelse(bp);
7375
		BO_LOCK(bo);
7376
		goto restart;
7377
	}
7378
	/*
7379
	 * Now do the work of vtruncbuf while also matching indirect blocks.
7380
	 */
7381
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7382
		bp->b_vflags &= ~BV_SCANNED;
7383
cleanrestart:
7384
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7385
		if (bp->b_vflags & BV_SCANNED)
7386
			continue;
7387
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7388
			bp->b_vflags |= BV_SCANNED;
7389
			continue;
7390
		}
7391
		if (BUF_LOCK(bp,
7392
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7393
		    BO_LOCKPTR(bo)) == ENOLCK) {
7394
			BO_LOCK(bo);
7395
			goto cleanrestart;
7396
		}
7397
		BO_LOCK(bo);
7398
		bp->b_vflags |= BV_SCANNED;
7399
		BO_UNLOCK(bo);
7400
		bremfree(bp);
7401
		if (blkoff != 0) {
7402
			allocbuf(bp, blkoff);
7403
			bqrelse(bp);
7404
		} else {
7405
			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7406
			brelse(bp);
7407
		}
7408
		BO_LOCK(bo);
7409
		goto cleanrestart;
7410
	}
7411
	drain_output(vp);
7412
	BO_UNLOCK(bo);
7413
}
7414

7415
static int
7416
cancel_pagedep(
7417
	struct pagedep *pagedep,
7418
	struct freeblks *freeblks,
7419
	int blkoff)
7420
{
7421
	struct jremref *jremref;
7422
	struct jmvref *jmvref;
7423
	struct dirrem *dirrem, *tmp;
7424
	int i;
7425

7426
	/*
7427
	 * Copy any directory remove dependencies to the list
7428
	 * to be processed after the freeblks proceeds.  If
7429
	 * directory entry never made it to disk they
7430
	 * can be dumped directly onto the work list.
7431
	 */
7432
	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7433
		/* Skip this directory removal if it is intended to remain. */
7434
		if (dirrem->dm_offset < blkoff)
7435
			continue;
7436
		/*
7437
		 * If there are any dirrems we wait for the journal write
7438
		 * to complete and then restart the buf scan as the lock
7439
		 * has been dropped.
7440
		 */
7441
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7442
			jwait(&jremref->jr_list, MNT_WAIT);
7443
			return (ERESTART);
7444
		}
7445
		LIST_REMOVE(dirrem, dm_next);
7446
		dirrem->dm_dirinum = pagedep->pd_ino;
7447
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7448
	}
7449
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7450
		jwait(&jmvref->jm_list, MNT_WAIT);
7451
		return (ERESTART);
7452
	}
7453
	/*
7454
	 * When we're partially truncating a pagedep we just want to flush
7455
	 * journal entries and return.  There can not be any adds in the
7456
	 * truncated portion of the directory and newblk must remain if
7457
	 * part of the block remains.
7458
	 */
7459
	if (blkoff != 0) {
7460
		struct diradd *dap;
7461

7462
		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7463
			if (dap->da_offset > blkoff)
7464
				panic("cancel_pagedep: diradd %p off %d > %d",
7465
				    dap, dap->da_offset, blkoff);
7466
		for (i = 0; i < DAHASHSZ; i++)
7467
			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7468
				if (dap->da_offset > blkoff)
7469
					panic("cancel_pagedep: diradd %p off %d > %d",
7470
					    dap, dap->da_offset, blkoff);
7471
		return (0);
7472
	}
7473
	/*
7474
	 * There should be no directory add dependencies present
7475
	 * as the directory could not be truncated until all
7476
	 * children were removed.
7477
	 */
7478
	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7479
	    ("deallocate_dependencies: pendinghd != NULL"));
7480
	for (i = 0; i < DAHASHSZ; i++)
7481
		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7482
		    ("deallocate_dependencies: diraddhd != NULL"));
7483
	if ((pagedep->pd_state & NEWBLOCK) != 0)
7484
		free_newdirblk(pagedep->pd_newdirblk);
7485
	if (free_pagedep(pagedep) == 0)
7486
		panic("Failed to free pagedep %p", pagedep);
7487
	return (0);
7488
}
7489

7490
/*
7491
 * Reclaim any dependency structures from a buffer that is about to
7492
 * be reallocated to a new vnode. The buffer must be locked, thus,
7493
 * no I/O completion operations can occur while we are manipulating
7494
 * its associated dependencies. The mutex is held so that other I/O's
7495
 * associated with related dependencies do not occur.
7496
 */
7497
static int
7498
deallocate_dependencies(
7499
	struct buf *bp,
7500
	struct freeblks *freeblks,
7501
	int off)
7502
{
7503
	struct indirdep *indirdep;
7504
	struct pagedep *pagedep;
7505
	struct worklist *wk, *wkn;
7506
	struct ufsmount *ump;
7507

7508
	ump = softdep_bp_to_mp(bp);
7509
	if (ump == NULL)
7510
		goto done;
7511
	ACQUIRE_LOCK(ump);
7512
	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7513
		switch (wk->wk_type) {
7514
		case D_INDIRDEP:
7515
			indirdep = WK_INDIRDEP(wk);
7516
			if (bp->b_lblkno >= 0 ||
7517
			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7518
				panic("deallocate_dependencies: not indir");
7519
			cancel_indirdep(indirdep, bp, freeblks);
7520
			continue;
7521

7522
		case D_PAGEDEP:
7523
			pagedep = WK_PAGEDEP(wk);
7524
			if (cancel_pagedep(pagedep, freeblks, off)) {
7525
				FREE_LOCK(ump);
7526
				return (ERESTART);
7527
			}
7528
			continue;
7529

7530
		case D_ALLOCINDIR:
7531
			/*
7532
			 * Simply remove the allocindir, we'll find it via
7533
			 * the indirdep where we can clear pointers if
7534
			 * needed.
7535
			 */
7536
			WORKLIST_REMOVE(wk);
7537
			continue;
7538

7539
		case D_FREEWORK:
7540
			/*
7541
			 * A truncation is waiting for the zero'd pointers
7542
			 * to be written.  It can be freed when the freeblks
7543
			 * is journaled.
7544
			 */
7545
			WORKLIST_REMOVE(wk);
7546
			wk->wk_state |= ONDEPLIST;
7547
			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7548
			break;
7549

7550
		case D_ALLOCDIRECT:
7551
			if (off != 0)
7552
				continue;
7553
			/* FALLTHROUGH */
7554
		default:
7555
			panic("deallocate_dependencies: Unexpected type %s",
7556
			    TYPENAME(wk->wk_type));
7557
			/* NOTREACHED */
7558
		}
7559
	}
7560
	FREE_LOCK(ump);
7561
done:
7562
	/*
7563
	 * Don't throw away this buf, we were partially truncating and
7564
	 * some deps may always remain.
7565
	 */
7566
	if (off) {
7567
		allocbuf(bp, off);
7568
		bp->b_vflags |= BV_SCANNED;
7569
		return (EBUSY);
7570
	}
7571
	bp->b_flags |= B_INVAL | B_NOCACHE;
7572

7573
	return (0);
7574
}
7575

7576
/*
7577
 * An allocdirect is being canceled due to a truncate.  We must make sure
7578
 * the journal entry is released in concert with the blkfree that releases
7579
 * the storage.  Completed journal entries must not be released until the
7580
 * space is no longer pointed to by the inode or in the bitmap.
7581
 */
7582
static void
7583
cancel_allocdirect(
7584
	struct allocdirectlst *adphead,
7585
	struct allocdirect *adp,
7586
	struct freeblks *freeblks)
7587
{
7588
	struct freework *freework;
7589
	struct newblk *newblk;
7590
	struct worklist *wk;
7591

7592
	TAILQ_REMOVE(adphead, adp, ad_next);
7593
	newblk = (struct newblk *)adp;
7594
	freework = NULL;
7595
	/*
7596
	 * Find the correct freework structure.
7597
	 */
7598
	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7599
		if (wk->wk_type != D_FREEWORK)
7600
			continue;
7601
		freework = WK_FREEWORK(wk);
7602
		if (freework->fw_blkno == newblk->nb_newblkno)
7603
			break;
7604
	}
7605
	if (freework == NULL)
7606
		panic("cancel_allocdirect: Freework not found");
7607
	/*
7608
	 * If a newblk exists at all we still have the journal entry that
7609
	 * initiated the allocation so we do not need to journal the free.
7610
	 */
7611
	cancel_jfreeblk(freeblks, freework->fw_blkno);
7612
	/*
7613
	 * If the journal hasn't been written the jnewblk must be passed
7614
	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7615
	 * this by linking the journal dependency into the freework to be
7616
	 * freed when freework_freeblock() is called.  If the journal has
7617
	 * been written we can simply reclaim the journal space when the
7618
	 * freeblks work is complete.
7619
	 */
7620
	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7621
	    &freeblks->fb_jwork);
7622
	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7623
}
7624

7625
/*
7626
 * Cancel a new block allocation.  May be an indirect or direct block.  We
7627
 * remove it from various lists and return any journal record that needs to
7628
 * be resolved by the caller.
7629
 *
7630
 * A special consideration is made for indirects which were never pointed
7631
 * at on disk and will never be found once this block is released.
7632
 */
7633
static struct jnewblk *
7634
cancel_newblk(
7635
	struct newblk *newblk,
7636
	struct worklist *wk,
7637
	struct workhead *wkhd)
7638
{
7639
	struct jnewblk *jnewblk;
7640

7641
	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7642
	    
7643
	newblk->nb_state |= GOINGAWAY;
7644
	/*
7645
	 * Previously we traversed the completedhd on each indirdep
7646
	 * attached to this newblk to cancel them and gather journal
7647
	 * work.  Since we need only the oldest journal segment and
7648
	 * the lowest point on the tree will always have the oldest
7649
	 * journal segment we are free to release the segments
7650
	 * of any subordinates and may leave the indirdep list to
7651
	 * indirdep_complete() when this newblk is freed.
7652
	 */
7653
	if (newblk->nb_state & ONDEPLIST) {
7654
		newblk->nb_state &= ~ONDEPLIST;
7655
		LIST_REMOVE(newblk, nb_deps);
7656
	}
7657
	if (newblk->nb_state & ONWORKLIST)
7658
		WORKLIST_REMOVE(&newblk->nb_list);
7659
	/*
7660
	 * If the journal entry hasn't been written we save a pointer to
7661
	 * the dependency that frees it until it is written or the
7662
	 * superseding operation completes.
7663
	 */
7664
	jnewblk = newblk->nb_jnewblk;
7665
	if (jnewblk != NULL && wk != NULL) {
7666
		newblk->nb_jnewblk = NULL;
7667
		jnewblk->jn_dep = wk;
7668
	}
7669
	if (!LIST_EMPTY(&newblk->nb_jwork))
7670
		jwork_move(wkhd, &newblk->nb_jwork);
7671
	/*
7672
	 * When truncating we must free the newdirblk early to remove
7673
	 * the pagedep from the hash before returning.
7674
	 */
7675
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7676
		free_newdirblk(WK_NEWDIRBLK(wk));
7677
	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7678
		panic("cancel_newblk: extra newdirblk");
7679

7680
	return (jnewblk);
7681
}
7682

7683
/*
7684
 * Schedule the freefrag associated with a newblk to be released once
7685
 * the pointers are written and the previous block is no longer needed.
7686
 */
7687
static void
7688
newblk_freefrag(struct newblk *newblk)
7689
{
7690
	struct freefrag *freefrag;
7691

7692
	if (newblk->nb_freefrag == NULL)
7693
		return;
7694
	freefrag = newblk->nb_freefrag;
7695
	newblk->nb_freefrag = NULL;
7696
	freefrag->ff_state |= COMPLETE;
7697
	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7698
		add_to_worklist(&freefrag->ff_list, 0);
7699
}
7700

7701
/*
7702
 * Free a newblk. Generate a new freefrag work request if appropriate.
7703
 * This must be called after the inode pointer and any direct block pointers
7704
 * are valid or fully removed via truncate or frag extension.
7705
 */
7706
static void
7707
free_newblk(struct newblk *newblk)
7708
{
7709
	struct indirdep *indirdep;
7710
	struct worklist *wk;
7711

7712
	KASSERT(newblk->nb_jnewblk == NULL,
7713
	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7714
	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7715
	    ("free_newblk: unclaimed newblk"));
7716
	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7717
	newblk_freefrag(newblk);
7718
	if (newblk->nb_state & ONDEPLIST)
7719
		LIST_REMOVE(newblk, nb_deps);
7720
	if (newblk->nb_state & ONWORKLIST)
7721
		WORKLIST_REMOVE(&newblk->nb_list);
7722
	LIST_REMOVE(newblk, nb_hash);
7723
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7724
		free_newdirblk(WK_NEWDIRBLK(wk));
7725
	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7726
		panic("free_newblk: extra newdirblk");
7727
	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7728
		indirdep_complete(indirdep);
7729
	handle_jwork(&newblk->nb_jwork);
7730
	WORKITEM_FREE(newblk, D_NEWBLK);
7731
}
7732

7733
/*
7734
 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7735
 */
7736
static void
7737
free_newdirblk(struct newdirblk *newdirblk)
7738
{
7739
	struct pagedep *pagedep;
7740
	struct diradd *dap;
7741
	struct worklist *wk;
7742

7743
	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7744
	WORKLIST_REMOVE(&newdirblk->db_list);
7745
	/*
7746
	 * If the pagedep is still linked onto the directory buffer
7747
	 * dependency chain, then some of the entries on the
7748
	 * pd_pendinghd list may not be committed to disk yet. In
7749
	 * this case, we will simply clear the NEWBLOCK flag and
7750
	 * let the pd_pendinghd list be processed when the pagedep
7751
	 * is next written. If the pagedep is no longer on the buffer
7752
	 * dependency chain, then all the entries on the pd_pending
7753
	 * list are committed to disk and we can free them here.
7754
	 */
7755
	pagedep = newdirblk->db_pagedep;
7756
	pagedep->pd_state &= ~NEWBLOCK;
7757
	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7758
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7759
			free_diradd(dap, NULL);
7760
		/*
7761
		 * If no dependencies remain, the pagedep will be freed.
7762
		 */
7763
		free_pagedep(pagedep);
7764
	}
7765
	/* Should only ever be one item in the list. */
7766
	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7767
		WORKLIST_REMOVE(wk);
7768
		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7769
	}
7770
	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7771
}
7772

7773
/*
7774
 * Prepare an inode to be freed. The actual free operation is not
7775
 * done until the zero'ed inode has been written to disk.
7776
 */
7777
void
7778
softdep_freefile(
7779
	struct vnode *pvp,
7780
	ino_t ino,
7781
	int mode)
7782
{
7783
	struct inode *ip = VTOI(pvp);
7784
	struct inodedep *inodedep;
7785
	struct freefile *freefile;
7786
	struct freeblks *freeblks;
7787
	struct ufsmount *ump;
7788

7789
	ump = ITOUMP(ip);
7790
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7791
	    ("softdep_freefile called on non-softdep filesystem"));
7792
	/*
7793
	 * This sets up the inode de-allocation dependency.
7794
	 */
7795
	freefile = malloc(sizeof(struct freefile),
7796
		M_FREEFILE, M_SOFTDEP_FLAGS);
7797
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7798
	freefile->fx_mode = mode;
7799
	freefile->fx_oldinum = ino;
7800
	freefile->fx_devvp = ump->um_devvp;
7801
	LIST_INIT(&freefile->fx_jwork);
7802
	UFS_LOCK(ump);
7803
	ump->um_fs->fs_pendinginodes += 1;
7804
	UFS_UNLOCK(ump);
7805

7806
	/*
7807
	 * If the inodedep does not exist, then the zero'ed inode has
7808
	 * been written to disk. If the allocated inode has never been
7809
	 * written to disk, then the on-disk inode is zero'ed. In either
7810
	 * case we can free the file immediately.  If the journal was
7811
	 * canceled before being written the inode will never make it to
7812
	 * disk and we must send the canceled journal entrys to
7813
	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7814
	 * Any blocks waiting on the inode to write can be safely freed
7815
	 * here as it will never been written.
7816
	 */
7817
	ACQUIRE_LOCK(ump);
7818
	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7819
	if (inodedep) {
7820
		/*
7821
		 * Clear out freeblks that no longer need to reference
7822
		 * this inode.
7823
		 */
7824
		while ((freeblks =
7825
		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7826
			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7827
			    fb_next);
7828
			freeblks->fb_state &= ~ONDEPLIST;
7829
		}
7830
		/*
7831
		 * Remove this inode from the unlinked list.
7832
		 */
7833
		if (inodedep->id_state & UNLINKED) {
7834
			/*
7835
			 * Save the journal work to be freed with the bitmap
7836
			 * before we clear UNLINKED.  Otherwise it can be lost
7837
			 * if the inode block is written.
7838
			 */
7839
			handle_bufwait(inodedep, &freefile->fx_jwork);
7840
			clear_unlinked_inodedep(inodedep);
7841
			/*
7842
			 * Re-acquire inodedep as we've dropped the
7843
			 * per-filesystem lock in clear_unlinked_inodedep().
7844
			 */
7845
			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7846
		}
7847
	}
7848
	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7849
		FREE_LOCK(ump);
7850
		handle_workitem_freefile(freefile);
7851
		return;
7852
	}
7853
	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7854
		inodedep->id_state |= GOINGAWAY;
7855
	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7856
	FREE_LOCK(ump);
7857
	if (ip->i_number == ino)
7858
		UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
7859
}
7860

7861
/*
7862
 * Check to see if an inode has never been written to disk. If
7863
 * so free the inodedep and return success, otherwise return failure.
7864
 *
7865
 * If we still have a bitmap dependency, then the inode has never
7866
 * been written to disk. Drop the dependency as it is no longer
7867
 * necessary since the inode is being deallocated. We set the
7868
 * ALLCOMPLETE flags since the bitmap now properly shows that the
7869
 * inode is not allocated. Even if the inode is actively being
7870
 * written, it has been rolled back to its zero'ed state, so we
7871
 * are ensured that a zero inode is what is on the disk. For short
7872
 * lived files, this change will usually result in removing all the
7873
 * dependencies from the inode so that it can be freed immediately.
7874
 */
7875
static int
7876
check_inode_unwritten(struct inodedep *inodedep)
7877
{
7878

7879
	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7880

7881
	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7882
	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7883
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7884
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7885
	    !LIST_EMPTY(&inodedep->id_inowait) ||
7886
	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7887
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7888
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7889
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7890
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7891
	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7892
	    inodedep->id_mkdiradd != NULL || 
7893
	    inodedep->id_nlinkdelta != 0)
7894
		return (0);
7895
	/*
7896
	 * Another process might be in initiate_write_inodeblock_ufs[12]
7897
	 * trying to allocate memory without holding "Softdep Lock".
7898
	 */
7899
	if ((inodedep->id_state & IOSTARTED) != 0 &&
7900
	    inodedep->id_savedino1 == NULL)
7901
		return (0);
7902

7903
	if (inodedep->id_state & ONDEPLIST)
7904
		LIST_REMOVE(inodedep, id_deps);
7905
	inodedep->id_state &= ~ONDEPLIST;
7906
	inodedep->id_state |= ALLCOMPLETE;
7907
	inodedep->id_bmsafemap = NULL;
7908
	if (inodedep->id_state & ONWORKLIST)
7909
		WORKLIST_REMOVE(&inodedep->id_list);
7910
	if (inodedep->id_savedino1 != NULL) {
7911
		free(inodedep->id_savedino1, M_SAVEDINO);
7912
		inodedep->id_savedino1 = NULL;
7913
	}
7914
	if (free_inodedep(inodedep) == 0)
7915
		panic("check_inode_unwritten: busy inode");
7916
	return (1);
7917
}
7918

7919
static int
7920
check_inodedep_free(struct inodedep *inodedep)
7921
{
7922

7923
	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7924
	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7925
	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7926
	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7927
	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7928
	    !LIST_EMPTY(&inodedep->id_inowait) ||
7929
	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7930
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7931
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7932
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7933
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7934
	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7935
	    inodedep->id_mkdiradd != NULL ||
7936
	    inodedep->id_nlinkdelta != 0 ||
7937
	    inodedep->id_savedino1 != NULL)
7938
		return (0);
7939
	return (1);
7940
}
7941

7942
/*
7943
 * Try to free an inodedep structure. Return 1 if it could be freed.
7944
 */
7945
static int
7946
free_inodedep(struct inodedep *inodedep)
7947
{
7948

7949
	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7950
	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7951
	    !check_inodedep_free(inodedep))
7952
		return (0);
7953
	if (inodedep->id_state & ONDEPLIST)
7954
		LIST_REMOVE(inodedep, id_deps);
7955
	LIST_REMOVE(inodedep, id_hash);
7956
	WORKITEM_FREE(inodedep, D_INODEDEP);
7957
	return (1);
7958
}
7959

7960
/*
7961
 * Free the block referenced by a freework structure.  The parent freeblks
7962
 * structure is released and completed when the final cg bitmap reaches
7963
 * the disk.  This routine may be freeing a jnewblk which never made it to
7964
 * disk in which case we do not have to wait as the operation is undone
7965
 * in memory immediately.
7966
 */
7967
static void
7968
freework_freeblock(struct freework *freework, uint64_t key)
7969
{
7970
	struct freeblks *freeblks;
7971
	struct jnewblk *jnewblk;
7972
	struct ufsmount *ump;
7973
	struct workhead wkhd;
7974
	struct fs *fs;
7975
	int bsize;
7976
	int needj;
7977

7978
	ump = VFSTOUFS(freework->fw_list.wk_mp);
7979
	LOCK_OWNED(ump);
7980
	/*
7981
	 * Handle partial truncate separately.
7982
	 */
7983
	if (freework->fw_indir) {
7984
		complete_trunc_indir(freework);
7985
		return;
7986
	}
7987
	freeblks = freework->fw_freeblks;
7988
	fs = ump->um_fs;
7989
	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7990
	bsize = lfragtosize(fs, freework->fw_frags);
7991
	LIST_INIT(&wkhd);
7992
	/*
7993
	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7994
	 * on the indirblk hashtable and prevents premature freeing.
7995
	 */
7996
	freework->fw_state |= DEPCOMPLETE;
7997
	/*
7998
	 * SUJ needs to wait for the segment referencing freed indirect
7999
	 * blocks to expire so that we know the checker will not confuse
8000
	 * a re-allocated indirect block with its old contents.
8001
	 */
8002
	if (needj && freework->fw_lbn <= -UFS_NDADDR)
8003
		indirblk_insert(freework);
8004
	/*
8005
	 * If we are canceling an existing jnewblk pass it to the free
8006
	 * routine, otherwise pass the freeblk which will ultimately
8007
	 * release the freeblks.  If we're not journaling, we can just
8008
	 * free the freeblks immediately.
8009
	 */
8010
	jnewblk = freework->fw_jnewblk;
8011
	if (jnewblk != NULL) {
8012
		cancel_jnewblk(jnewblk, &wkhd);
8013
		needj = 0;
8014
	} else if (needj) {
8015
		freework->fw_state |= DELAYEDFREE;
8016
		freeblks->fb_cgwait++;
8017
		WORKLIST_INSERT(&wkhd, &freework->fw_list);
8018
	}
8019
	FREE_LOCK(ump);
8020
	freeblks_free(ump, freeblks, btodb(bsize));
8021
	CTR4(KTR_SUJ,
8022
	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
8023
	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
8024
	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
8025
	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
8026
	ACQUIRE_LOCK(ump);
8027
	/*
8028
	 * The jnewblk will be discarded and the bits in the map never
8029
	 * made it to disk.  We can immediately free the freeblk.
8030
	 */
8031
	if (needj == 0)
8032
		handle_written_freework(freework);
8033
}
8034

8035
/*
8036
 * We enqueue freework items that need processing back on the freeblks and
8037
 * add the freeblks to the worklist.  This makes it easier to find all work
8038
 * required to flush a truncation in process_truncates().
8039
 */
8040
static void
8041
freework_enqueue(struct freework *freework)
8042
{
8043
	struct freeblks *freeblks;
8044

8045
	freeblks = freework->fw_freeblks;
8046
	if ((freework->fw_state & INPROGRESS) == 0)
8047
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
8048
	if ((freeblks->fb_state &
8049
	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
8050
	    LIST_EMPTY(&freeblks->fb_jblkdephd))
8051
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8052
}
8053

8054
/*
8055
 * Start, continue, or finish the process of freeing an indirect block tree.
8056
 * The free operation may be paused at any point with fw_off containing the
8057
 * offset to restart from.  This enables us to implement some flow control
8058
 * for large truncates which may fan out and generate a huge number of
8059
 * dependencies.
8060
 */
8061
static void
8062
handle_workitem_indirblk(struct freework *freework)
8063
{
8064
	struct freeblks *freeblks;
8065
	struct ufsmount *ump;
8066
	struct fs *fs;
8067

8068
	freeblks = freework->fw_freeblks;
8069
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8070
	fs = ump->um_fs;
8071
	if (freework->fw_state & DEPCOMPLETE) {
8072
		handle_written_freework(freework);
8073
		return;
8074
	}
8075
	if (freework->fw_off == NINDIR(fs)) {
8076
		freework_freeblock(freework, SINGLETON_KEY);
8077
		return;
8078
	}
8079
	freework->fw_state |= INPROGRESS;
8080
	FREE_LOCK(ump);
8081
	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
8082
	    freework->fw_lbn);
8083
	ACQUIRE_LOCK(ump);
8084
}
8085

8086
/*
8087
 * Called when a freework structure attached to a cg buf is written.  The
8088
 * ref on either the parent or the freeblks structure is released and
8089
 * the freeblks is added back to the worklist if there is more work to do.
8090
 */
8091
static void
8092
handle_written_freework(struct freework *freework)
8093
{
8094
	struct freeblks *freeblks;
8095
	struct freework *parent;
8096

8097
	freeblks = freework->fw_freeblks;
8098
	parent = freework->fw_parent;
8099
	if (freework->fw_state & DELAYEDFREE)
8100
		freeblks->fb_cgwait--;
8101
	freework->fw_state |= COMPLETE;
8102
	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
8103
		WORKITEM_FREE(freework, D_FREEWORK);
8104
	if (parent) {
8105
		if (--parent->fw_ref == 0)
8106
			freework_enqueue(parent);
8107
		return;
8108
	}
8109
	if (--freeblks->fb_ref != 0)
8110
		return;
8111
	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
8112
	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 
8113
		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8114
}
8115

8116
/*
8117
 * This workitem routine performs the block de-allocation.
8118
 * The workitem is added to the pending list after the updated
8119
 * inode block has been written to disk.  As mentioned above,
8120
 * checks regarding the number of blocks de-allocated (compared
8121
 * to the number of blocks allocated for the file) are also
8122
 * performed in this function.
8123
 */
8124
static int
8125
handle_workitem_freeblocks(struct freeblks *freeblks, int flags)
8126
{
8127
	struct freework *freework;
8128
	struct newblk *newblk;
8129
	struct allocindir *aip;
8130
	struct ufsmount *ump;
8131
	struct worklist *wk;
8132
	uint64_t key;
8133

8134
	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
8135
	    ("handle_workitem_freeblocks: Journal entries not written."));
8136
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8137
	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8138
	ACQUIRE_LOCK(ump);
8139
	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
8140
		WORKLIST_REMOVE(wk);
8141
		switch (wk->wk_type) {
8142
		case D_DIRREM:
8143
			wk->wk_state |= COMPLETE;
8144
			add_to_worklist(wk, 0);
8145
			continue;
8146

8147
		case D_ALLOCDIRECT:
8148
			free_newblk(WK_NEWBLK(wk));
8149
			continue;
8150

8151
		case D_ALLOCINDIR:
8152
			aip = WK_ALLOCINDIR(wk);
8153
			freework = NULL;
8154
			if (aip->ai_state & DELAYEDFREE) {
8155
				FREE_LOCK(ump);
8156
				freework = newfreework(ump, freeblks, NULL,
8157
				    aip->ai_lbn, aip->ai_newblkno,
8158
				    ump->um_fs->fs_frag, 0, 0);
8159
				ACQUIRE_LOCK(ump);
8160
			}
8161
			newblk = WK_NEWBLK(wk);
8162
			if (newblk->nb_jnewblk) {
8163
				freework->fw_jnewblk = newblk->nb_jnewblk;
8164
				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
8165
				newblk->nb_jnewblk = NULL;
8166
			}
8167
			free_newblk(newblk);
8168
			continue;
8169

8170
		case D_FREEWORK:
8171
			freework = WK_FREEWORK(wk);
8172
			if (freework->fw_lbn <= -UFS_NDADDR)
8173
				handle_workitem_indirblk(freework);
8174
			else
8175
				freework_freeblock(freework, key);
8176
			continue;
8177
		default:
8178
			panic("handle_workitem_freeblocks: Unknown type %s",
8179
			    TYPENAME(wk->wk_type));
8180
		}
8181
	}
8182
	if (freeblks->fb_ref != 0) {
8183
		freeblks->fb_state &= ~INPROGRESS;
8184
		wake_worklist(&freeblks->fb_list);
8185
		freeblks = NULL;
8186
	}
8187
	FREE_LOCK(ump);
8188
	ffs_blkrelease_finish(ump, key);
8189
	if (freeblks)
8190
		return handle_complete_freeblocks(freeblks, flags);
8191
	return (0);
8192
}
8193

8194
/*
8195
 * Handle completion of block free via truncate.  This allows fs_pending
8196
 * to track the actual free block count more closely than if we only updated
8197
 * it at the end.  We must be careful to handle cases where the block count
8198
 * on free was incorrect.
8199
 */
8200
static void
8201
freeblks_free(struct ufsmount *ump,
8202
	struct freeblks *freeblks,
8203
	int blocks)
8204
{
8205
	struct fs *fs;
8206
	ufs2_daddr_t remain;
8207

8208
	UFS_LOCK(ump);
8209
	remain = -freeblks->fb_chkcnt;
8210
	freeblks->fb_chkcnt += blocks;
8211
	if (remain > 0) {
8212
		if (remain < blocks)
8213
			blocks = remain;
8214
		fs = ump->um_fs;
8215
		fs->fs_pendingblocks -= blocks;
8216
	}
8217
	UFS_UNLOCK(ump);
8218
}
8219

8220
/*
8221
 * Once all of the freework workitems are complete we can retire the
8222
 * freeblocks dependency and any journal work awaiting completion.  This
8223
 * can not be called until all other dependencies are stable on disk.
8224
 */
8225
static int
8226
handle_complete_freeblocks(struct freeblks *freeblks, int flags)
8227
{
8228
	struct inodedep *inodedep;
8229
	struct inode *ip;
8230
	struct vnode *vp;
8231
	struct fs *fs;
8232
	struct ufsmount *ump;
8233
	ufs2_daddr_t spare;
8234

8235
	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8236
	fs = ump->um_fs;
8237
	flags = LK_EXCLUSIVE | flags;
8238
	spare = freeblks->fb_chkcnt;
8239

8240
	/*
8241
	 * If we did not release the expected number of blocks we may have
8242
	 * to adjust the inode block count here.  Only do so if it wasn't
8243
	 * a truncation to zero and the modrev still matches.
8244
	 */
8245
	if (spare && freeblks->fb_len != 0) {
8246
		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8247
		    flags, &vp, FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP) != 0)
8248
			return (EBUSY);
8249
		ip = VTOI(vp);
8250
		if (ip->i_mode == 0) {
8251
			vgone(vp);
8252
		} else if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
8253
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
8254
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
8255
			/*
8256
			 * We must wait so this happens before the
8257
			 * journal is reclaimed.
8258
			 */
8259
			ffs_update(vp, 1);
8260
		}
8261
		vput(vp);
8262
	}
8263
	if (spare < 0) {
8264
		UFS_LOCK(ump);
8265
		fs->fs_pendingblocks += spare;
8266
		UFS_UNLOCK(ump);
8267
	}
8268
#ifdef QUOTA
8269
	/* Handle spare. */
8270
	if (spare)
8271
		quotaadj(freeblks->fb_quota, ump, -spare);
8272
	quotarele(freeblks->fb_quota);
8273
#endif
8274
	ACQUIRE_LOCK(ump);
8275
	if (freeblks->fb_state & ONDEPLIST) {
8276
		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8277
		    0, &inodedep);
8278
		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8279
		freeblks->fb_state &= ~ONDEPLIST;
8280
		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8281
			free_inodedep(inodedep);
8282
	}
8283
	/*
8284
	 * All of the freeblock deps must be complete prior to this call
8285
	 * so it's now safe to complete earlier outstanding journal entries.
8286
	 */
8287
	handle_jwork(&freeblks->fb_jwork);
8288
	WORKITEM_FREE(freeblks, D_FREEBLKS);
8289
	FREE_LOCK(ump);
8290
	return (0);
8291
}
8292

8293
/*
8294
 * Release blocks associated with the freeblks and stored in the indirect
8295
 * block dbn. If level is greater than SINGLE, the block is an indirect block
8296
 * and recursive calls to indir_trunc() must be used to cleanse other indirect
8297
 * blocks.
8298
 *
8299
 * This handles partial and complete truncation of blocks.  Partial is noted
8300
 * with goingaway == 0.  In this case the freework is completed after the
8301
 * zero'd indirects are written to disk.  For full truncation the freework
8302
 * is completed after the block is freed.
8303
 */
8304
static void
8305
indir_trunc(struct freework *freework,
8306
	ufs2_daddr_t dbn,
8307
	ufs_lbn_t lbn)
8308
{
8309
	struct freework *nfreework;
8310
	struct workhead wkhd;
8311
	struct freeblks *freeblks;
8312
	struct buf *bp;
8313
	struct fs *fs;
8314
	struct indirdep *indirdep;
8315
	struct mount *mp;
8316
	struct ufsmount *ump;
8317
	ufs1_daddr_t *bap1;
8318
	ufs2_daddr_t nb, nnb, *bap2;
8319
	ufs_lbn_t lbnadd, nlbn;
8320
	uint64_t key;
8321
	int nblocks, ufs1fmt, freedblocks;
8322
	int goingaway, freedeps, needj, level, cnt, i, error;
8323

8324
	freeblks = freework->fw_freeblks;
8325
	mp = freeblks->fb_list.wk_mp;
8326
	ump = VFSTOUFS(mp);
8327
	fs = ump->um_fs;
8328
	/*
8329
	 * Get buffer of block pointers to be freed.  There are three cases:
8330
	 * 
8331
	 * 1) Partial truncate caches the indirdep pointer in the freework
8332
	 *    which provides us a back copy to the save bp which holds the
8333
	 *    pointers we want to clear.  When this completes the zero
8334
	 *    pointers are written to the real copy.
8335
	 * 2) The indirect is being completely truncated, cancel_indirdep()
8336
	 *    eliminated the real copy and placed the indirdep on the saved
8337
	 *    copy.  The indirdep and buf are discarded when this completes.
8338
	 * 3) The indirect was not in memory, we read a copy off of the disk
8339
	 *    using the devvp and drop and invalidate the buffer when we're
8340
	 *    done.
8341
	 */
8342
	goingaway = 1;
8343
	indirdep = NULL;
8344
	if (freework->fw_indir != NULL) {
8345
		goingaway = 0;
8346
		indirdep = freework->fw_indir;
8347
		bp = indirdep->ir_savebp;
8348
		if (bp == NULL || bp->b_blkno != dbn)
8349
			panic("indir_trunc: Bad saved buf %p blkno %jd",
8350
			    bp, (intmax_t)dbn);
8351
	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8352
		/*
8353
		 * The lock prevents the buf dep list from changing and
8354
	 	 * indirects on devvp should only ever have one dependency.
8355
		 */
8356
		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8357
		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8358
			panic("indir_trunc: Bad indirdep %p from buf %p",
8359
			    indirdep, bp);
8360
	} else {
8361
		error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn,
8362
		    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
8363
		if (error)
8364
			return;
8365
	}
8366
	ACQUIRE_LOCK(ump);
8367
	/* Protects against a race with complete_trunc_indir(). */
8368
	freework->fw_state &= ~INPROGRESS;
8369
	/*
8370
	 * If we have an indirdep we need to enforce the truncation order
8371
	 * and discard it when it is complete.
8372
	 */
8373
	if (indirdep) {
8374
		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8375
		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8376
			/*
8377
			 * Add the complete truncate to the list on the
8378
			 * indirdep to enforce in-order processing.
8379
			 */
8380
			if (freework->fw_indir == NULL)
8381
				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8382
				    freework, fw_next);
8383
			FREE_LOCK(ump);
8384
			return;
8385
		}
8386
		/*
8387
		 * If we're goingaway, free the indirdep.  Otherwise it will
8388
		 * linger until the write completes.
8389
		 */
8390
		if (goingaway) {
8391
			KASSERT(indirdep->ir_savebp == bp,
8392
			    ("indir_trunc: losing ir_savebp %p",
8393
			    indirdep->ir_savebp));
8394
			indirdep->ir_savebp = NULL;
8395
			free_indirdep(indirdep);
8396
		}
8397
	}
8398
	FREE_LOCK(ump);
8399
	/* Initialize pointers depending on block size. */
8400
	if (ump->um_fstype == UFS1) {
8401
		bap1 = (ufs1_daddr_t *)bp->b_data;
8402
		nb = bap1[freework->fw_off];
8403
		ufs1fmt = 1;
8404
		bap2 = NULL;
8405
	} else {
8406
		bap2 = (ufs2_daddr_t *)bp->b_data;
8407
		nb = bap2[freework->fw_off];
8408
		ufs1fmt = 0;
8409
		bap1 = NULL;
8410
	}
8411
	level = lbn_level(lbn);
8412
	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8413
	lbnadd = lbn_offset(fs, level);
8414
	nblocks = btodb(fs->fs_bsize);
8415
	nfreework = freework;
8416
	freedeps = 0;
8417
	cnt = 0;
8418
	/*
8419
	 * Reclaim blocks.  Traverses into nested indirect levels and
8420
	 * arranges for the current level to be freed when subordinates
8421
	 * are free when journaling.
8422
	 */
8423
	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8424
	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8425
		if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
8426
		    fs->fs_bsize) != 0)
8427
			nb = 0;
8428
		if (i != NINDIR(fs) - 1) {
8429
			if (ufs1fmt)
8430
				nnb = bap1[i+1];
8431
			else
8432
				nnb = bap2[i+1];
8433
		} else
8434
			nnb = 0;
8435
		if (nb == 0)
8436
			continue;
8437
		cnt++;
8438
		if (level != 0) {
8439
			nlbn = (lbn + 1) - (i * lbnadd);
8440
			if (needj != 0) {
8441
				nfreework = newfreework(ump, freeblks, freework,
8442
				    nlbn, nb, fs->fs_frag, 0, 0);
8443
				freedeps++;
8444
			}
8445
			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8446
		} else {
8447
			struct freedep *freedep;
8448

8449
			/*
8450
			 * Attempt to aggregate freedep dependencies for
8451
			 * all blocks being released to the same CG.
8452
			 */
8453
			LIST_INIT(&wkhd);
8454
			if (needj != 0 &&
8455
			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8456
				freedep = newfreedep(freework);
8457
				WORKLIST_INSERT_UNLOCKED(&wkhd,
8458
				    &freedep->fd_list);
8459
				freedeps++;
8460
			}
8461
			CTR3(KTR_SUJ,
8462
			    "indir_trunc: ino %jd blkno %jd size %d",
8463
			    freeblks->fb_inum, nb, fs->fs_bsize);
8464
			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8465
			    fs->fs_bsize, freeblks->fb_inum,
8466
			    freeblks->fb_vtype, &wkhd, key);
8467
		}
8468
	}
8469
	ffs_blkrelease_finish(ump, key);
8470
	if (goingaway) {
8471
		bp->b_flags |= B_INVAL | B_NOCACHE;
8472
		brelse(bp);
8473
	}
8474
	freedblocks = 0;
8475
	if (level == 0)
8476
		freedblocks = (nblocks * cnt);
8477
	if (needj == 0)
8478
		freedblocks += nblocks;
8479
	freeblks_free(ump, freeblks, freedblocks);
8480
	/*
8481
	 * If we are journaling set up the ref counts and offset so this
8482
	 * indirect can be completed when its children are free.
8483
	 */
8484
	if (needj) {
8485
		ACQUIRE_LOCK(ump);
8486
		freework->fw_off = i;
8487
		freework->fw_ref += freedeps;
8488
		freework->fw_ref -= NINDIR(fs) + 1;
8489
		if (level == 0)
8490
			freeblks->fb_cgwait += freedeps;
8491
		if (freework->fw_ref == 0)
8492
			freework_freeblock(freework, SINGLETON_KEY);
8493
		FREE_LOCK(ump);
8494
		return;
8495
	}
8496
	/*
8497
	 * If we're not journaling we can free the indirect now.
8498
	 */
8499
	dbn = dbtofsb(fs, dbn);
8500
	CTR3(KTR_SUJ,
8501
	    "indir_trunc 2: ino %jd blkno %jd size %d",
8502
	    freeblks->fb_inum, dbn, fs->fs_bsize);
8503
	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8504
	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
8505
	/* Non SUJ softdep does single-threaded truncations. */
8506
	if (freework->fw_blkno == dbn) {
8507
		freework->fw_state |= ALLCOMPLETE;
8508
		ACQUIRE_LOCK(ump);
8509
		handle_written_freework(freework);
8510
		FREE_LOCK(ump);
8511
	}
8512
	return;
8513
}
8514

8515
/*
8516
 * Cancel an allocindir when it is removed via truncation.  When bp is not
8517
 * NULL the indirect never appeared on disk and is scheduled to be freed
8518
 * independently of the indir so we can more easily track journal work.
8519
 */
8520
static void
8521
cancel_allocindir(
8522
	struct allocindir *aip,
8523
	struct buf *bp,
8524
	struct freeblks *freeblks,
8525
	int trunc)
8526
{
8527
	struct indirdep *indirdep;
8528
	struct freefrag *freefrag;
8529
	struct newblk *newblk;
8530

8531
	newblk = (struct newblk *)aip;
8532
	LIST_REMOVE(aip, ai_next);
8533
	/*
8534
	 * We must eliminate the pointer in bp if it must be freed on its
8535
	 * own due to partial truncate or pending journal work.
8536
	 */
8537
	if (bp && (trunc || newblk->nb_jnewblk)) {
8538
		/*
8539
		 * Clear the pointer and mark the aip to be freed
8540
		 * directly if it never existed on disk.
8541
		 */
8542
		aip->ai_state |= DELAYEDFREE;
8543
		indirdep = aip->ai_indirdep;
8544
		if (indirdep->ir_state & UFS1FMT)
8545
			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8546
		else
8547
			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8548
	}
8549
	/*
8550
	 * When truncating the previous pointer will be freed via
8551
	 * savedbp.  Eliminate the freefrag which would dup free.
8552
	 */
8553
	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8554
		newblk->nb_freefrag = NULL;
8555
		if (freefrag->ff_jdep)
8556
			cancel_jfreefrag(
8557
			    WK_JFREEFRAG(freefrag->ff_jdep));
8558
		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8559
		WORKITEM_FREE(freefrag, D_FREEFRAG);
8560
	}
8561
	/*
8562
	 * If the journal hasn't been written the jnewblk must be passed
8563
	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8564
	 * this by leaving the journal dependency on the newblk to be freed
8565
	 * when a freework is created in handle_workitem_freeblocks().
8566
	 */
8567
	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8568
	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8569
}
8570

8571
/*
8572
 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8573
 * in to a newdirblk so any subsequent additions are tracked properly.  The
8574
 * caller is responsible for adding the mkdir1 dependency to the journal
8575
 * and updating id_mkdiradd.  This function returns with the per-filesystem
8576
 * lock held.
8577
 */
8578
static struct mkdir *
8579
setup_newdir(
8580
	struct diradd *dap,
8581
	ino_t newinum,
8582
	ino_t dinum,
8583
	struct buf *newdirbp,
8584
	struct mkdir **mkdirp)
8585
{
8586
	struct newblk *newblk;
8587
	struct pagedep *pagedep;
8588
	struct inodedep *inodedep;
8589
	struct newdirblk *newdirblk;
8590
	struct mkdir *mkdir1, *mkdir2;
8591
	struct worklist *wk;
8592
	struct jaddref *jaddref;
8593
	struct ufsmount *ump;
8594
	struct mount *mp;
8595

8596
	mp = dap->da_list.wk_mp;
8597
	ump = VFSTOUFS(mp);
8598
	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8599
	    M_SOFTDEP_FLAGS);
8600
	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8601
	LIST_INIT(&newdirblk->db_mkdir);
8602
	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8603
	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8604
	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8605
	mkdir1->md_diradd = dap;
8606
	mkdir1->md_jaddref = NULL;
8607
	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8608
	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8609
	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8610
	mkdir2->md_diradd = dap;
8611
	mkdir2->md_jaddref = NULL;
8612
	if (MOUNTEDSUJ(mp) == 0) {
8613
		mkdir1->md_state |= DEPCOMPLETE;
8614
		mkdir2->md_state |= DEPCOMPLETE;
8615
	}
8616
	/*
8617
	 * Dependency on "." and ".." being written to disk.
8618
	 */
8619
	mkdir1->md_buf = newdirbp;
8620
	ACQUIRE_LOCK(VFSTOUFS(mp));
8621
	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8622
	/*
8623
	 * We must link the pagedep, allocdirect, and newdirblk for
8624
	 * the initial file page so the pointer to the new directory
8625
	 * is not written until the directory contents are live and
8626
	 * any subsequent additions are not marked live until the
8627
	 * block is reachable via the inode.
8628
	 */
8629
	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8630
		panic("setup_newdir: lost pagedep");
8631
	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8632
		if (wk->wk_type == D_ALLOCDIRECT)
8633
			break;
8634
	if (wk == NULL)
8635
		panic("setup_newdir: lost allocdirect");
8636
	if (pagedep->pd_state & NEWBLOCK)
8637
		panic("setup_newdir: NEWBLOCK already set");
8638
	newblk = WK_NEWBLK(wk);
8639
	pagedep->pd_state |= NEWBLOCK;
8640
	pagedep->pd_newdirblk = newdirblk;
8641
	newdirblk->db_pagedep = pagedep;
8642
	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8643
	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8644
	/*
8645
	 * Look up the inodedep for the parent directory so that we
8646
	 * can link mkdir2 into the pending dotdot jaddref or
8647
	 * the inode write if there is none.  If the inode is
8648
	 * ALLCOMPLETE and no jaddref is present all dependencies have
8649
	 * been satisfied and mkdir2 can be freed.
8650
	 */
8651
	inodedep_lookup(mp, dinum, 0, &inodedep);
8652
	if (MOUNTEDSUJ(mp)) {
8653
		if (inodedep == NULL)
8654
			panic("setup_newdir: Lost parent.");
8655
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8656
		    inoreflst);
8657
		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8658
		    (jaddref->ja_state & MKDIR_PARENT),
8659
		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8660
		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8661
		mkdir2->md_jaddref = jaddref;
8662
		jaddref->ja_mkdir = mkdir2;
8663
	} else if (inodedep == NULL ||
8664
	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8665
		dap->da_state &= ~MKDIR_PARENT;
8666
		WORKITEM_FREE(mkdir2, D_MKDIR);
8667
		mkdir2 = NULL;
8668
	} else {
8669
		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8670
		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8671
	}
8672
	*mkdirp = mkdir2;
8673

8674
	return (mkdir1);
8675
}
8676

8677
/*
8678
 * Directory entry addition dependencies.
8679
 * 
8680
 * When adding a new directory entry, the inode (with its incremented link
8681
 * count) must be written to disk before the directory entry's pointer to it.
8682
 * Also, if the inode is newly allocated, the corresponding freemap must be
8683
 * updated (on disk) before the directory entry's pointer. These requirements
8684
 * are met via undo/redo on the directory entry's pointer, which consists
8685
 * simply of the inode number.
8686
 * 
8687
 * As directory entries are added and deleted, the free space within a
8688
 * directory block can become fragmented.  The ufs filesystem will compact
8689
 * a fragmented directory block to make space for a new entry. When this
8690
 * occurs, the offsets of previously added entries change. Any "diradd"
8691
 * dependency structures corresponding to these entries must be updated with
8692
 * the new offsets.
8693
 */
8694

8695
/*
8696
 * This routine is called after the in-memory inode's link
8697
 * count has been incremented, but before the directory entry's
8698
 * pointer to the inode has been set.
8699
 */
8700
int
8701
softdep_setup_directory_add(
8702
	struct buf *bp,		/* buffer containing directory block */
8703
	struct inode *dp,	/* inode for directory */
8704
	off_t diroffset,	/* offset of new entry in directory */
8705
	ino_t newinum,		/* inode referenced by new directory entry */
8706
	struct buf *newdirbp,	/* non-NULL => contents of new mkdir */
8707
	int isnewblk)		/* entry is in a newly allocated block */
8708
{
8709
	int offset;		/* offset of new entry within directory block */
8710
	ufs_lbn_t lbn;		/* block in directory containing new entry */
8711
	struct fs *fs;
8712
	struct diradd *dap;
8713
	struct newblk *newblk;
8714
	struct pagedep *pagedep;
8715
	struct inodedep *inodedep;
8716
	struct newdirblk *newdirblk;
8717
	struct mkdir *mkdir1, *mkdir2;
8718
	struct jaddref *jaddref;
8719
	struct ufsmount *ump;
8720
	struct mount *mp;
8721
	int isindir;
8722

8723
	mp = ITOVFS(dp);
8724
	ump = VFSTOUFS(mp);
8725
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8726
	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8727
	/*
8728
	 * Whiteouts have no dependencies.
8729
	 */
8730
	if (newinum == UFS_WINO) {
8731
		if (newdirbp != NULL)
8732
			bdwrite(newdirbp);
8733
		return (0);
8734
	}
8735
	jaddref = NULL;
8736
	mkdir1 = mkdir2 = NULL;
8737
	fs = ump->um_fs;
8738
	lbn = lblkno(fs, diroffset);
8739
	offset = blkoff(fs, diroffset);
8740
	dap = malloc(sizeof(struct diradd), M_DIRADD,
8741
		M_SOFTDEP_FLAGS|M_ZERO);
8742
	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8743
	dap->da_offset = offset;
8744
	dap->da_newinum = newinum;
8745
	dap->da_state = ATTACHED;
8746
	LIST_INIT(&dap->da_jwork);
8747
	isindir = bp->b_lblkno >= UFS_NDADDR;
8748
	newdirblk = NULL;
8749
	if (isnewblk &&
8750
	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8751
		newdirblk = malloc(sizeof(struct newdirblk),
8752
		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8753
		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8754
		LIST_INIT(&newdirblk->db_mkdir);
8755
	}
8756
	/*
8757
	 * If we're creating a new directory setup the dependencies and set
8758
	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8759
	 * we can move on.
8760
	 */
8761
	if (newdirbp == NULL) {
8762
		dap->da_state |= DEPCOMPLETE;
8763
		ACQUIRE_LOCK(ump);
8764
	} else {
8765
		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8766
		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8767
		    &mkdir2);
8768
	}
8769
	/*
8770
	 * Link into parent directory pagedep to await its being written.
8771
	 */
8772
	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8773
#ifdef INVARIANTS
8774
	if (diradd_lookup(pagedep, offset) != NULL)
8775
		panic("softdep_setup_directory_add: %p already at off %d\n",
8776
		    diradd_lookup(pagedep, offset), offset);
8777
#endif
8778
	dap->da_pagedep = pagedep;
8779
	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8780
	    da_pdlist);
8781
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8782
	/*
8783
	 * If we're journaling, link the diradd into the jaddref so it
8784
	 * may be completed after the journal entry is written.  Otherwise,
8785
	 * link the diradd into its inodedep.  If the inode is not yet
8786
	 * written place it on the bufwait list, otherwise do the post-inode
8787
	 * write processing to put it on the id_pendinghd list.
8788
	 */
8789
	if (MOUNTEDSUJ(mp)) {
8790
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8791
		    inoreflst);
8792
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8793
		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8794
		jaddref->ja_diroff = diroffset;
8795
		jaddref->ja_diradd = dap;
8796
		add_to_journal(&jaddref->ja_list);
8797
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8798
		diradd_inode_written(dap, inodedep);
8799
	else
8800
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8801
	/*
8802
	 * Add the journal entries for . and .. links now that the primary
8803
	 * link is written.
8804
	 */
8805
	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8806
		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8807
		    inoreflst, if_deps);
8808
		KASSERT(jaddref != NULL &&
8809
		    jaddref->ja_ino == jaddref->ja_parent &&
8810
		    (jaddref->ja_state & MKDIR_BODY),
8811
		    ("softdep_setup_directory_add: bad dot jaddref %p",
8812
		    jaddref));
8813
		mkdir1->md_jaddref = jaddref;
8814
		jaddref->ja_mkdir = mkdir1;
8815
		/*
8816
		 * It is important that the dotdot journal entry
8817
		 * is added prior to the dot entry since dot writes
8818
		 * both the dot and dotdot links.  These both must
8819
		 * be added after the primary link for the journal
8820
		 * to remain consistent.
8821
		 */
8822
		add_to_journal(&mkdir2->md_jaddref->ja_list);
8823
		add_to_journal(&jaddref->ja_list);
8824
	}
8825
	/*
8826
	 * If we are adding a new directory remember this diradd so that if
8827
	 * we rename it we can keep the dot and dotdot dependencies.  If
8828
	 * we are adding a new name for an inode that has a mkdiradd we
8829
	 * must be in rename and we have to move the dot and dotdot
8830
	 * dependencies to this new name.  The old name is being orphaned
8831
	 * soon.
8832
	 */
8833
	if (mkdir1 != NULL) {
8834
		if (inodedep->id_mkdiradd != NULL)
8835
			panic("softdep_setup_directory_add: Existing mkdir");
8836
		inodedep->id_mkdiradd = dap;
8837
	} else if (inodedep->id_mkdiradd)
8838
		merge_diradd(inodedep, dap);
8839
	if (newdirblk != NULL) {
8840
		/*
8841
		 * There is nothing to do if we are already tracking
8842
		 * this block.
8843
		 */
8844
		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8845
			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8846
			FREE_LOCK(ump);
8847
			return (0);
8848
		}
8849
		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8850
		    == 0)
8851
			panic("softdep_setup_directory_add: lost entry");
8852
		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8853
		pagedep->pd_state |= NEWBLOCK;
8854
		pagedep->pd_newdirblk = newdirblk;
8855
		newdirblk->db_pagedep = pagedep;
8856
		FREE_LOCK(ump);
8857
		/*
8858
		 * If we extended into an indirect signal direnter to sync.
8859
		 */
8860
		if (isindir)
8861
			return (1);
8862
		return (0);
8863
	}
8864
	FREE_LOCK(ump);
8865
	return (0);
8866
}
8867

8868
/*
8869
 * This procedure is called to change the offset of a directory
8870
 * entry when compacting a directory block which must be owned
8871
 * exclusively by the caller. Note that the actual entry movement
8872
 * must be done in this procedure to ensure that no I/O completions
8873
 * occur while the move is in progress.
8874
 */
8875
void 
8876
softdep_change_directoryentry_offset(
8877
	struct buf *bp,		/* Buffer holding directory block. */
8878
	struct inode *dp,	/* inode for directory */
8879
	caddr_t base,		/* address of dp->i_offset */
8880
	caddr_t oldloc,		/* address of old directory location */
8881
	caddr_t newloc,		/* address of new directory location */
8882
	int entrysize)		/* size of directory entry */
8883
{
8884
	int offset, oldoffset, newoffset;
8885
	struct pagedep *pagedep;
8886
	struct jmvref *jmvref;
8887
	struct diradd *dap;
8888
	struct direct *de;
8889
	struct mount *mp;
8890
	struct ufsmount *ump;
8891
	ufs_lbn_t lbn;
8892
	int flags;
8893

8894
	mp = ITOVFS(dp);
8895
	ump = VFSTOUFS(mp);
8896
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8897
	    ("softdep_change_directoryentry_offset called on "
8898
	     "non-softdep filesystem"));
8899
	de = (struct direct *)oldloc;
8900
	jmvref = NULL;
8901
	flags = 0;
8902
	/*
8903
	 * Moves are always journaled as it would be too complex to
8904
	 * determine if any affected adds or removes are present in the
8905
	 * journal.
8906
	 */
8907
	if (MOUNTEDSUJ(mp)) {
8908
		flags = DEPALLOC;
8909
		jmvref = newjmvref(dp, de->d_ino,
8910
		    I_OFFSET(dp) + (oldloc - base),
8911
		    I_OFFSET(dp) + (newloc - base));
8912
	}
8913
	lbn = lblkno(ump->um_fs, I_OFFSET(dp));
8914
	offset = blkoff(ump->um_fs, I_OFFSET(dp));
8915
	oldoffset = offset + (oldloc - base);
8916
	newoffset = offset + (newloc - base);
8917
	ACQUIRE_LOCK(ump);
8918
	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8919
		goto done;
8920
	dap = diradd_lookup(pagedep, oldoffset);
8921
	if (dap) {
8922
		dap->da_offset = newoffset;
8923
		newoffset = DIRADDHASH(newoffset);
8924
		oldoffset = DIRADDHASH(oldoffset);
8925
		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8926
		    newoffset != oldoffset) {
8927
			LIST_REMOVE(dap, da_pdlist);
8928
			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8929
			    dap, da_pdlist);
8930
		}
8931
	}
8932
done:
8933
	if (jmvref) {
8934
		jmvref->jm_pagedep = pagedep;
8935
		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8936
		add_to_journal(&jmvref->jm_list);
8937
	}
8938
	bcopy(oldloc, newloc, entrysize);
8939
	FREE_LOCK(ump);
8940
}
8941

8942
/*
8943
 * Move the mkdir dependencies and journal work from one diradd to another
8944
 * when renaming a directory.  The new name must depend on the mkdir deps
8945
 * completing as the old name did.  Directories can only have one valid link
8946
 * at a time so one must be canonical.
8947
 */
8948
static void
8949
merge_diradd(struct inodedep *inodedep, struct diradd *newdap)
8950
{
8951
	struct diradd *olddap;
8952
	struct mkdir *mkdir, *nextmd;
8953
	struct ufsmount *ump;
8954
	short state;
8955

8956
	olddap = inodedep->id_mkdiradd;
8957
	inodedep->id_mkdiradd = newdap;
8958
	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8959
		newdap->da_state &= ~DEPCOMPLETE;
8960
		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8961
		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8962
		     mkdir = nextmd) {
8963
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8964
			if (mkdir->md_diradd != olddap)
8965
				continue;
8966
			mkdir->md_diradd = newdap;
8967
			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8968
			newdap->da_state |= state;
8969
			olddap->da_state &= ~state;
8970
			if ((olddap->da_state &
8971
			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8972
				break;
8973
		}
8974
		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8975
			panic("merge_diradd: unfound ref");
8976
	}
8977
	/*
8978
	 * Any mkdir related journal items are not safe to be freed until
8979
	 * the new name is stable.
8980
	 */
8981
	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8982
	olddap->da_state |= DEPCOMPLETE;
8983
	complete_diradd(olddap);
8984
}
8985

8986
/*
8987
 * Move the diradd to the pending list when all diradd dependencies are
8988
 * complete.
8989
 */
8990
static void
8991
complete_diradd(struct diradd *dap)
8992
{
8993
	struct pagedep *pagedep;
8994

8995
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8996
		if (dap->da_state & DIRCHG)
8997
			pagedep = dap->da_previous->dm_pagedep;
8998
		else
8999
			pagedep = dap->da_pagedep;
9000
		LIST_REMOVE(dap, da_pdlist);
9001
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9002
	}
9003
}
9004

9005
/*
9006
 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
9007
 * add entries and conditionally journal the remove.
9008
 */
9009
static void
9010
cancel_diradd(
9011
	struct diradd *dap,
9012
	struct dirrem *dirrem,
9013
	struct jremref *jremref,
9014
	struct jremref *dotremref,
9015
	struct jremref *dotdotremref)
9016
{
9017
	struct inodedep *inodedep;
9018
	struct jaddref *jaddref;
9019
	struct inoref *inoref;
9020
	struct ufsmount *ump;
9021
	struct mkdir *mkdir;
9022

9023
	/*
9024
	 * If no remove references were allocated we're on a non-journaled
9025
	 * filesystem and can skip the cancel step.
9026
	 */
9027
	if (jremref == NULL) {
9028
		free_diradd(dap, NULL);
9029
		return;
9030
	}
9031
	/*
9032
	 * Cancel the primary name an free it if it does not require
9033
	 * journaling.
9034
	 */
9035
	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
9036
	    0, &inodedep) != 0) {
9037
		/* Abort the addref that reference this diradd.  */
9038
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9039
			if (inoref->if_list.wk_type != D_JADDREF)
9040
				continue;
9041
			jaddref = (struct jaddref *)inoref;
9042
			if (jaddref->ja_diradd != dap)
9043
				continue;
9044
			if (cancel_jaddref(jaddref, inodedep,
9045
			    &dirrem->dm_jwork) == 0) {
9046
				free_jremref(jremref);
9047
				jremref = NULL;
9048
			}
9049
			break;
9050
		}
9051
	}
9052
	/*
9053
	 * Cancel subordinate names and free them if they do not require
9054
	 * journaling.
9055
	 */
9056
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9057
		ump = VFSTOUFS(dap->da_list.wk_mp);
9058
		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
9059
			if (mkdir->md_diradd != dap)
9060
				continue;
9061
			if ((jaddref = mkdir->md_jaddref) == NULL)
9062
				continue;
9063
			mkdir->md_jaddref = NULL;
9064
			if (mkdir->md_state & MKDIR_PARENT) {
9065
				if (cancel_jaddref(jaddref, NULL,
9066
				    &dirrem->dm_jwork) == 0) {
9067
					free_jremref(dotdotremref);
9068
					dotdotremref = NULL;
9069
				}
9070
			} else {
9071
				if (cancel_jaddref(jaddref, inodedep,
9072
				    &dirrem->dm_jwork) == 0) {
9073
					free_jremref(dotremref);
9074
					dotremref = NULL;
9075
				}
9076
			}
9077
		}
9078
	}
9079

9080
	if (jremref)
9081
		journal_jremref(dirrem, jremref, inodedep);
9082
	if (dotremref)
9083
		journal_jremref(dirrem, dotremref, inodedep);
9084
	if (dotdotremref)
9085
		journal_jremref(dirrem, dotdotremref, NULL);
9086
	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
9087
	free_diradd(dap, &dirrem->dm_jwork);
9088
}
9089

9090
/*
9091
 * Free a diradd dependency structure.
9092
 */
9093
static void
9094
free_diradd(struct diradd *dap, struct workhead *wkhd)
9095
{
9096
	struct dirrem *dirrem;
9097
	struct pagedep *pagedep;
9098
	struct inodedep *inodedep;
9099
	struct mkdir *mkdir, *nextmd;
9100
	struct ufsmount *ump;
9101

9102
	ump = VFSTOUFS(dap->da_list.wk_mp);
9103
	LOCK_OWNED(ump);
9104
	LIST_REMOVE(dap, da_pdlist);
9105
	if (dap->da_state & ONWORKLIST)
9106
		WORKLIST_REMOVE(&dap->da_list);
9107
	if ((dap->da_state & DIRCHG) == 0) {
9108
		pagedep = dap->da_pagedep;
9109
	} else {
9110
		dirrem = dap->da_previous;
9111
		pagedep = dirrem->dm_pagedep;
9112
		dirrem->dm_dirinum = pagedep->pd_ino;
9113
		dirrem->dm_state |= COMPLETE;
9114
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9115
			add_to_worklist(&dirrem->dm_list, 0);
9116
	}
9117
	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
9118
	    0, &inodedep) != 0)
9119
		if (inodedep->id_mkdiradd == dap)
9120
			inodedep->id_mkdiradd = NULL;
9121
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9122
		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9123
		     mkdir = nextmd) {
9124
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
9125
			if (mkdir->md_diradd != dap)
9126
				continue;
9127
			dap->da_state &=
9128
			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9129
			LIST_REMOVE(mkdir, md_mkdirs);
9130
			if (mkdir->md_state & ONWORKLIST)
9131
				WORKLIST_REMOVE(&mkdir->md_list);
9132
			if (mkdir->md_jaddref != NULL)
9133
				panic("free_diradd: Unexpected jaddref");
9134
			WORKITEM_FREE(mkdir, D_MKDIR);
9135
			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
9136
				break;
9137
		}
9138
		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
9139
			panic("free_diradd: unfound ref");
9140
	}
9141
	if (inodedep)
9142
		free_inodedep(inodedep);
9143
	/*
9144
	 * Free any journal segments waiting for the directory write.
9145
	 */
9146
	handle_jwork(&dap->da_jwork);
9147
	WORKITEM_FREE(dap, D_DIRADD);
9148
}
9149

9150
/*
9151
 * Directory entry removal dependencies.
9152
 * 
9153
 * When removing a directory entry, the entry's inode pointer must be
9154
 * zero'ed on disk before the corresponding inode's link count is decremented
9155
 * (possibly freeing the inode for re-use). This dependency is handled by
9156
 * updating the directory entry but delaying the inode count reduction until
9157
 * after the directory block has been written to disk. After this point, the
9158
 * inode count can be decremented whenever it is convenient.
9159
 */
9160

9161
/*
9162
 * This routine should be called immediately after removing
9163
 * a directory entry.  The inode's link count should not be
9164
 * decremented by the calling procedure -- the soft updates
9165
 * code will do this task when it is safe.
9166
 */
9167
void 
9168
softdep_setup_remove(
9169
	struct buf *bp,		/* buffer containing directory block */
9170
	struct inode *dp,	/* inode for the directory being modified */
9171
	struct inode *ip,	/* inode for directory entry being removed */
9172
	bool isrmdir)		/* indicates if doing RMDIR */
9173
{
9174
	struct dirrem *dirrem, *prevdirrem;
9175
	struct inodedep *inodedep;
9176
	struct ufsmount *ump;
9177
	int direct;
9178

9179
	ump = ITOUMP(ip);
9180
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9181
	    ("softdep_setup_remove called on non-softdep filesystem"));
9182
	/*
9183
	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
9184
	 * newdirrem() to setup the full directory remove which requires
9185
	 * isrmdir > 1.
9186
	 */
9187
	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9188
	/*
9189
	 * Add the dirrem to the inodedep's pending remove list for quick
9190
	 * discovery later.
9191
	 */
9192
	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
9193
		panic("softdep_setup_remove: Lost inodedep.");
9194
	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
9195
	dirrem->dm_state |= ONDEPLIST;
9196
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9197

9198
	/*
9199
	 * If the COMPLETE flag is clear, then there were no active
9200
	 * entries and we want to roll back to a zeroed entry until
9201
	 * the new inode is committed to disk. If the COMPLETE flag is
9202
	 * set then we have deleted an entry that never made it to
9203
	 * disk. If the entry we deleted resulted from a name change,
9204
	 * then the old name still resides on disk. We cannot delete
9205
	 * its inode (returned to us in prevdirrem) until the zeroed
9206
	 * directory entry gets to disk. The new inode has never been
9207
	 * referenced on the disk, so can be deleted immediately.
9208
	 */
9209
	if ((dirrem->dm_state & COMPLETE) == 0) {
9210
		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
9211
		    dm_next);
9212
		FREE_LOCK(ump);
9213
	} else {
9214
		if (prevdirrem != NULL)
9215
			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
9216
			    prevdirrem, dm_next);
9217
		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
9218
		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
9219
		FREE_LOCK(ump);
9220
		if (direct)
9221
			handle_workitem_remove(dirrem, 0);
9222
	}
9223
}
9224

9225
/*
9226
 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
9227
 * pd_pendinghd list of a pagedep.
9228
 */
9229
static struct diradd *
9230
diradd_lookup(struct pagedep *pagedep, int offset)
9231
{
9232
	struct diradd *dap;
9233

9234
	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
9235
		if (dap->da_offset == offset)
9236
			return (dap);
9237
	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
9238
		if (dap->da_offset == offset)
9239
			return (dap);
9240
	return (NULL);
9241
}
9242

9243
/*
9244
 * Search for a .. diradd dependency in a directory that is being removed.
9245
 * If the directory was renamed to a new parent we have a diradd rather
9246
 * than a mkdir for the .. entry.  We need to cancel it now before
9247
 * it is found in truncate().
9248
 */
9249
static struct jremref *
9250
cancel_diradd_dotdot(struct inode *ip,
9251
	struct dirrem *dirrem,
9252
	struct jremref *jremref)
9253
{
9254
	struct pagedep *pagedep;
9255
	struct diradd *dap;
9256
	struct worklist *wk;
9257

9258
	if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
9259
		return (jremref);
9260
	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9261
	if (dap == NULL)
9262
		return (jremref);
9263
	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9264
	/*
9265
	 * Mark any journal work as belonging to the parent so it is freed
9266
	 * with the .. reference.
9267
	 */
9268
	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9269
		wk->wk_state |= MKDIR_PARENT;
9270
	return (NULL);
9271
}
9272

9273
/*
9274
 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9275
 * replace it with a dirrem/diradd pair as a result of re-parenting a
9276
 * directory.  This ensures that we don't simultaneously have a mkdir and
9277
 * a diradd for the same .. entry.
9278
 */
9279
static struct jremref *
9280
cancel_mkdir_dotdot(struct inode *ip,
9281
	struct dirrem *dirrem,
9282
	struct jremref *jremref)
9283
{
9284
	struct inodedep *inodedep;
9285
	struct jaddref *jaddref;
9286
	struct ufsmount *ump;
9287
	struct mkdir *mkdir;
9288
	struct diradd *dap;
9289
	struct mount *mp;
9290

9291
	mp = ITOVFS(ip);
9292
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9293
		return (jremref);
9294
	dap = inodedep->id_mkdiradd;
9295
	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9296
		return (jremref);
9297
	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9298
	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9299
	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9300
		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9301
			break;
9302
	if (mkdir == NULL)
9303
		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9304
	if ((jaddref = mkdir->md_jaddref) != NULL) {
9305
		mkdir->md_jaddref = NULL;
9306
		jaddref->ja_state &= ~MKDIR_PARENT;
9307
		if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
9308
			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9309
		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9310
			journal_jremref(dirrem, jremref, inodedep);
9311
			jremref = NULL;
9312
		}
9313
	}
9314
	if (mkdir->md_state & ONWORKLIST)
9315
		WORKLIST_REMOVE(&mkdir->md_list);
9316
	mkdir->md_state |= ALLCOMPLETE;
9317
	complete_mkdir(mkdir);
9318
	return (jremref);
9319
}
9320

9321
static void
9322
journal_jremref(struct dirrem *dirrem,
9323
	struct jremref *jremref,
9324
	struct inodedep *inodedep)
9325
{
9326

9327
	if (inodedep == NULL)
9328
		if (inodedep_lookup(jremref->jr_list.wk_mp,
9329
		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9330
			panic("journal_jremref: Lost inodedep");
9331
	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9332
	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9333
	add_to_journal(&jremref->jr_list);
9334
}
9335

9336
static void
9337
dirrem_journal(
9338
	struct dirrem *dirrem,
9339
	struct jremref *jremref,
9340
	struct jremref *dotremref,
9341
	struct jremref *dotdotremref)
9342
{
9343
	struct inodedep *inodedep;
9344

9345
	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9346
	    &inodedep) == 0)
9347
		panic("dirrem_journal: Lost inodedep");
9348
	journal_jremref(dirrem, jremref, inodedep);
9349
	if (dotremref)
9350
		journal_jremref(dirrem, dotremref, inodedep);
9351
	if (dotdotremref)
9352
		journal_jremref(dirrem, dotdotremref, NULL);
9353
}
9354

9355
/*
9356
 * Allocate a new dirrem if appropriate and return it along with
9357
 * its associated pagedep. Called without a lock, returns with lock.
9358
 */
9359
static struct dirrem *
9360
newdirrem(
9361
	struct buf *bp,		/* buffer containing directory block */
9362
	struct inode *dp,	/* inode for the directory being modified */
9363
	struct inode *ip,	/* inode for directory entry being removed */
9364
	bool isrmdir,		/* indicates if doing RMDIR */
9365
	struct dirrem **prevdirremp) /* previously referenced inode, if any */
9366
{
9367
	int offset;
9368
	ufs_lbn_t lbn;
9369
	struct diradd *dap;
9370
	struct dirrem *dirrem;
9371
	struct pagedep *pagedep;
9372
	struct jremref *jremref;
9373
	struct jremref *dotremref;
9374
	struct jremref *dotdotremref;
9375
	struct vnode *dvp;
9376
	struct ufsmount *ump;
9377

9378
	/*
9379
	 * Whiteouts have no deletion dependencies.
9380
	 */
9381
	if (ip == NULL)
9382
		panic("newdirrem: whiteout");
9383
	dvp = ITOV(dp);
9384
	ump = ITOUMP(dp);
9385

9386
	/*
9387
	 * If the system is over its limit and our filesystem is
9388
	 * responsible for more than our share of that usage and
9389
	 * we are not a snapshot, request some inodedep cleanup.
9390
	 * Limiting the number of dirrem structures will also limit
9391
	 * the number of freefile and freeblks structures.
9392
	 */
9393
	ACQUIRE_LOCK(ump);
9394
	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
9395
		schedule_cleanup(UFSTOVFS(ump));
9396
	else
9397
		FREE_LOCK(ump);
9398
	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9399
	    M_ZERO);
9400
	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9401
	LIST_INIT(&dirrem->dm_jremrefhd);
9402
	LIST_INIT(&dirrem->dm_jwork);
9403
	dirrem->dm_state = isrmdir ? RMDIR : 0;
9404
	dirrem->dm_oldinum = ip->i_number;
9405
	*prevdirremp = NULL;
9406
	/*
9407
	 * Allocate remove reference structures to track journal write
9408
	 * dependencies.  We will always have one for the link and
9409
	 * when doing directories we will always have one more for dot.
9410
	 * When renaming a directory we skip the dotdot link change so
9411
	 * this is not needed.
9412
	 */
9413
	jremref = dotremref = dotdotremref = NULL;
9414
	if (DOINGSUJ(dvp)) {
9415
		if (isrmdir) {
9416
			jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9417
			    ip->i_effnlink + 2);
9418
			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9419
			    ip->i_effnlink + 1);
9420
			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9421
			    dp->i_effnlink + 1);
9422
			dotdotremref->jr_state |= MKDIR_PARENT;
9423
		} else
9424
			jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9425
			    ip->i_effnlink + 1);
9426
	}
9427
	ACQUIRE_LOCK(ump);
9428
	lbn = lblkno(ump->um_fs, I_OFFSET(dp));
9429
	offset = blkoff(ump->um_fs, I_OFFSET(dp));
9430
	pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
9431
	    &pagedep);
9432
	dirrem->dm_pagedep = pagedep;
9433
	dirrem->dm_offset = offset;
9434
	/*
9435
	 * If we're renaming a .. link to a new directory, cancel any
9436
	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9437
	 * the jremref is preserved for any potential diradd in this
9438
	 * location.  This can not coincide with a rmdir.
9439
	 */
9440
	if (I_OFFSET(dp) == DOTDOT_OFFSET) {
9441
		if (isrmdir)
9442
			panic("newdirrem: .. directory change during remove?");
9443
		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9444
	}
9445
	/*
9446
	 * If we're removing a directory search for the .. dependency now and
9447
	 * cancel it.  Any pending journal work will be added to the dirrem
9448
	 * to be completed when the workitem remove completes.
9449
	 */
9450
	if (isrmdir)
9451
		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9452
	/*
9453
	 * Check for a diradd dependency for the same directory entry.
9454
	 * If present, then both dependencies become obsolete and can
9455
	 * be de-allocated.
9456
	 */
9457
	dap = diradd_lookup(pagedep, offset);
9458
	if (dap == NULL) {
9459
		/*
9460
		 * Link the jremref structures into the dirrem so they are
9461
		 * written prior to the pagedep.
9462
		 */
9463
		if (jremref)
9464
			dirrem_journal(dirrem, jremref, dotremref,
9465
			    dotdotremref);
9466
		return (dirrem);
9467
	}
9468
	/*
9469
	 * Must be ATTACHED at this point.
9470
	 */
9471
	if ((dap->da_state & ATTACHED) == 0)
9472
		panic("newdirrem: not ATTACHED");
9473
	if (dap->da_newinum != ip->i_number)
9474
		panic("newdirrem: inum %ju should be %ju",
9475
		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9476
	/*
9477
	 * If we are deleting a changed name that never made it to disk,
9478
	 * then return the dirrem describing the previous inode (which
9479
	 * represents the inode currently referenced from this entry on disk).
9480
	 */
9481
	if ((dap->da_state & DIRCHG) != 0) {
9482
		*prevdirremp = dap->da_previous;
9483
		dap->da_state &= ~DIRCHG;
9484
		dap->da_pagedep = pagedep;
9485
	}
9486
	/*
9487
	 * We are deleting an entry that never made it to disk.
9488
	 * Mark it COMPLETE so we can delete its inode immediately.
9489
	 */
9490
	dirrem->dm_state |= COMPLETE;
9491
	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9492
#ifdef INVARIANTS
9493
	if (!isrmdir) {
9494
		struct worklist *wk;
9495

9496
		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9497
			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9498
				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9499
	}
9500
#endif
9501

9502
	return (dirrem);
9503
}
9504

9505
/*
9506
 * Directory entry change dependencies.
9507
 * 
9508
 * Changing an existing directory entry requires that an add operation
9509
 * be completed first followed by a deletion. The semantics for the addition
9510
 * are identical to the description of adding a new entry above except
9511
 * that the rollback is to the old inode number rather than zero. Once
9512
 * the addition dependency is completed, the removal is done as described
9513
 * in the removal routine above.
9514
 */
9515

9516
/*
9517
 * This routine should be called immediately after changing
9518
 * a directory entry.  The inode's link count should not be
9519
 * decremented by the calling procedure -- the soft updates
9520
 * code will perform this task when it is safe.
9521
 */
9522
void 
9523
softdep_setup_directory_change(
9524
	struct buf *bp,		/* buffer containing directory block */
9525
	struct inode *dp,	/* inode for the directory being modified */
9526
	struct inode *ip,	/* inode for directory entry being removed */
9527
	ino_t newinum,		/* new inode number for changed entry */
9528
	u_int newparent)	/* indicates if doing RMDIR */
9529
{
9530
	int offset;
9531
	struct diradd *dap = NULL;
9532
	struct dirrem *dirrem, *prevdirrem;
9533
	struct pagedep *pagedep;
9534
	struct inodedep *inodedep;
9535
	struct jaddref *jaddref;
9536
	struct mount *mp;
9537
	struct ufsmount *ump;
9538

9539
	mp = ITOVFS(dp);
9540
	ump = VFSTOUFS(mp);
9541
	offset = blkoff(ump->um_fs, I_OFFSET(dp));
9542
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9543
	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9544

9545
	/*
9546
	 * Whiteouts do not need diradd dependencies.
9547
	 */
9548
	if (newinum != UFS_WINO) {
9549
		dap = malloc(sizeof(struct diradd),
9550
		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9551
		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9552
		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9553
		dap->da_offset = offset;
9554
		dap->da_newinum = newinum;
9555
		LIST_INIT(&dap->da_jwork);
9556
	}
9557

9558
	/*
9559
	 * Allocate a new dirrem and ACQUIRE_LOCK.
9560
	 */
9561
	dirrem = newdirrem(bp, dp, ip, newparent != 0, &prevdirrem);
9562
	pagedep = dirrem->dm_pagedep;
9563
	/*
9564
	 * The possible values for newparent:
9565
	 *	0 - non-directory file rename
9566
	 *	1 - directory rename within same directory
9567
	 *   inum - directory rename to new directory of given inode number
9568
	 * When renaming to a new directory, we are both deleting and
9569
	 * creating a new directory entry, so the link count on the new
9570
	 * directory should not change. Thus we do not need the followup
9571
	 * dirrem which is usually done in handle_workitem_remove. We set
9572
	 * the DIRCHG flag to tell handle_workitem_remove to skip the 
9573
	 * followup dirrem.
9574
	 */
9575
	if (newparent > 1)
9576
		dirrem->dm_state |= DIRCHG;
9577

9578
	/*
9579
	 * Whiteouts have no additional dependencies,
9580
	 * so just put the dirrem on the correct list.
9581
	 */
9582
	if (newinum == UFS_WINO) {
9583
		if ((dirrem->dm_state & COMPLETE) == 0) {
9584
			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9585
			    dm_next);
9586
		} else {
9587
			dirrem->dm_dirinum = pagedep->pd_ino;
9588
			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9589
				add_to_worklist(&dirrem->dm_list, 0);
9590
		}
9591
		FREE_LOCK(ump);
9592
		return;
9593
	}
9594
	/*
9595
	 * Add the dirrem to the inodedep's pending remove list for quick
9596
	 * discovery later.  A valid nlinkdelta ensures that this lookup
9597
	 * will not fail.
9598
	 */
9599
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9600
		panic("softdep_setup_directory_change: Lost inodedep.");
9601
	dirrem->dm_state |= ONDEPLIST;
9602
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9603

9604
	/*
9605
	 * If the COMPLETE flag is clear, then there were no active
9606
	 * entries and we want to roll back to the previous inode until
9607
	 * the new inode is committed to disk. If the COMPLETE flag is
9608
	 * set, then we have deleted an entry that never made it to disk.
9609
	 * If the entry we deleted resulted from a name change, then the old
9610
	 * inode reference still resides on disk. Any rollback that we do
9611
	 * needs to be to that old inode (returned to us in prevdirrem). If
9612
	 * the entry we deleted resulted from a create, then there is
9613
	 * no entry on the disk, so we want to roll back to zero rather
9614
	 * than the uncommitted inode. In either of the COMPLETE cases we
9615
	 * want to immediately free the unwritten and unreferenced inode.
9616
	 */
9617
	if ((dirrem->dm_state & COMPLETE) == 0) {
9618
		dap->da_previous = dirrem;
9619
	} else {
9620
		if (prevdirrem != NULL) {
9621
			dap->da_previous = prevdirrem;
9622
		} else {
9623
			dap->da_state &= ~DIRCHG;
9624
			dap->da_pagedep = pagedep;
9625
		}
9626
		dirrem->dm_dirinum = pagedep->pd_ino;
9627
		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9628
			add_to_worklist(&dirrem->dm_list, 0);
9629
	}
9630
	/*
9631
	 * Lookup the jaddref for this journal entry.  We must finish
9632
	 * initializing it and make the diradd write dependent on it.
9633
	 * If we're not journaling, put it on the id_bufwait list if the
9634
	 * inode is not yet written. If it is written, do the post-inode
9635
	 * write processing to put it on the id_pendinghd list.
9636
	 */
9637
	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9638
	if (MOUNTEDSUJ(mp)) {
9639
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9640
		    inoreflst);
9641
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9642
		    ("softdep_setup_directory_change: bad jaddref %p",
9643
		    jaddref));
9644
		jaddref->ja_diroff = I_OFFSET(dp);
9645
		jaddref->ja_diradd = dap;
9646
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9647
		    dap, da_pdlist);
9648
		add_to_journal(&jaddref->ja_list);
9649
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9650
		dap->da_state |= COMPLETE;
9651
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9652
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9653
	} else {
9654
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9655
		    dap, da_pdlist);
9656
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9657
	}
9658
	/*
9659
	 * If we're making a new name for a directory that has not been
9660
	 * committed when need to move the dot and dotdot references to
9661
	 * this new name.
9662
	 */
9663
	if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET)
9664
		merge_diradd(inodedep, dap);
9665
	FREE_LOCK(ump);
9666
}
9667

9668
/*
9669
 * Called whenever the link count on an inode is changed.
9670
 * It creates an inode dependency so that the new reference(s)
9671
 * to the inode cannot be committed to disk until the updated
9672
 * inode has been written.
9673
 */
9674
void
9675
softdep_change_linkcnt(
9676
	struct inode *ip)	/* the inode with the increased link count */
9677
{
9678
	struct inodedep *inodedep;
9679
	struct ufsmount *ump;
9680

9681
	ump = ITOUMP(ip);
9682
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9683
	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9684
	ACQUIRE_LOCK(ump);
9685
	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
9686
	if (ip->i_nlink < ip->i_effnlink)
9687
		panic("softdep_change_linkcnt: bad delta");
9688
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9689
	FREE_LOCK(ump);
9690
}
9691

9692
/*
9693
 * Attach a sbdep dependency to the superblock buf so that we can keep
9694
 * track of the head of the linked list of referenced but unlinked inodes.
9695
 */
9696
void
9697
softdep_setup_sbupdate(
9698
	struct ufsmount *ump,
9699
	struct fs *fs,
9700
	struct buf *bp)
9701
{
9702
	struct sbdep *sbdep;
9703
	struct worklist *wk;
9704

9705
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9706
	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9707
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9708
		if (wk->wk_type == D_SBDEP)
9709
			break;
9710
	if (wk != NULL)
9711
		return;
9712
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9713
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9714
	sbdep->sb_fs = fs;
9715
	sbdep->sb_ump = ump;
9716
	ACQUIRE_LOCK(ump);
9717
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9718
	FREE_LOCK(ump);
9719
}
9720

9721
/*
9722
 * Return the first unlinked inodedep which is ready to be the head of the
9723
 * list.  The inodedep and all those after it must have valid next pointers.
9724
 */
9725
static struct inodedep *
9726
first_unlinked_inodedep(struct ufsmount *ump)
9727
{
9728
	struct inodedep *inodedep;
9729
	struct inodedep *idp;
9730

9731
	LOCK_OWNED(ump);
9732
	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9733
	    inodedep; inodedep = idp) {
9734
		if ((inodedep->id_state & UNLINKNEXT) == 0)
9735
			return (NULL);
9736
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9737
		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9738
			break;
9739
		if ((inodedep->id_state & UNLINKPREV) == 0)
9740
			break;
9741
	}
9742
	return (inodedep);
9743
}
9744

9745
/*
9746
 * Set the sujfree unlinked head pointer prior to writing a superblock.
9747
 */
9748
static void
9749
initiate_write_sbdep(struct sbdep *sbdep)
9750
{
9751
	struct inodedep *inodedep;
9752
	struct fs *bpfs;
9753
	struct fs *fs;
9754

9755
	bpfs = sbdep->sb_fs;
9756
	fs = sbdep->sb_ump->um_fs;
9757
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9758
	if (inodedep) {
9759
		fs->fs_sujfree = inodedep->id_ino;
9760
		inodedep->id_state |= UNLINKPREV;
9761
	} else
9762
		fs->fs_sujfree = 0;
9763
	bpfs->fs_sujfree = fs->fs_sujfree;
9764
	/*
9765
	 * Because we have made changes to the superblock, we need to
9766
	 * recompute its check-hash.
9767
	 */
9768
	bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9769
}
9770

9771
/*
9772
 * After a superblock is written determine whether it must be written again
9773
 * due to a changing unlinked list head.
9774
 */
9775
static int
9776
handle_written_sbdep(struct sbdep *sbdep, struct buf *bp)
9777
{
9778
	struct inodedep *inodedep;
9779
	struct fs *fs;
9780

9781
	LOCK_OWNED(sbdep->sb_ump);
9782
	fs = sbdep->sb_fs;
9783
	/*
9784
	 * If the superblock doesn't match the in-memory list start over.
9785
	 */
9786
	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9787
	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9788
	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9789
		bdirty(bp);
9790
		return (1);
9791
	}
9792
	WORKITEM_FREE(sbdep, D_SBDEP);
9793
	if (fs->fs_sujfree == 0)
9794
		return (0);
9795
	/*
9796
	 * Now that we have a record of this inode in stable store allow it
9797
	 * to be written to free up pending work.  Inodes may see a lot of
9798
	 * write activity after they are unlinked which we must not hold up.
9799
	 */
9800
	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9801
		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9802
			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9803
			    inodedep, inodedep->id_state);
9804
		if (inodedep->id_state & UNLINKONLIST)
9805
			break;
9806
		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9807
	}
9808

9809
	return (0);
9810
}
9811

9812
/*
9813
 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9814
 */
9815
static void
9816
unlinked_inodedep( struct mount *mp, struct inodedep *inodedep)
9817
{
9818
	struct ufsmount *ump;
9819

9820
	ump = VFSTOUFS(mp);
9821
	LOCK_OWNED(ump);
9822
	if (MOUNTEDSUJ(mp) == 0)
9823
		return;
9824
	ump->um_fs->fs_fmod = 1;
9825
	if (inodedep->id_state & UNLINKED)
9826
		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9827
	inodedep->id_state |= UNLINKED;
9828
	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9829
}
9830

9831
/*
9832
 * Remove an inodedep from the unlinked inodedep list.  This may require
9833
 * disk writes if the inode has made it that far.
9834
 */
9835
static void
9836
clear_unlinked_inodedep( struct inodedep *inodedep)
9837
{
9838
	struct ufs2_dinode *dip;
9839
	struct ufsmount *ump;
9840
	struct inodedep *idp;
9841
	struct inodedep *idn;
9842
	struct fs *fs, *bpfs;
9843
	struct buf *bp;
9844
	daddr_t dbn;
9845
	ino_t ino;
9846
	ino_t nino;
9847
	ino_t pino;
9848
	int error;
9849

9850
	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9851
	fs = ump->um_fs;
9852
	ino = inodedep->id_ino;
9853
	error = 0;
9854
	for (;;) {
9855
		LOCK_OWNED(ump);
9856
		KASSERT((inodedep->id_state & UNLINKED) != 0,
9857
		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9858
		    inodedep));
9859
		/*
9860
		 * If nothing has yet been written simply remove us from
9861
		 * the in memory list and return.  This is the most common
9862
		 * case where handle_workitem_remove() loses the final
9863
		 * reference.
9864
		 */
9865
		if ((inodedep->id_state & UNLINKLINKS) == 0)
9866
			break;
9867
		/*
9868
		 * If we have a NEXT pointer and no PREV pointer we can simply
9869
		 * clear NEXT's PREV and remove ourselves from the list.  Be
9870
		 * careful not to clear PREV if the superblock points at
9871
		 * next as well.
9872
		 */
9873
		idn = TAILQ_NEXT(inodedep, id_unlinked);
9874
		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9875
			if (idn && fs->fs_sujfree != idn->id_ino)
9876
				idn->id_state &= ~UNLINKPREV;
9877
			break;
9878
		}
9879
		/*
9880
		 * Here we have an inodedep which is actually linked into
9881
		 * the list.  We must remove it by forcing a write to the
9882
		 * link before us, whether it be the superblock or an inode.
9883
		 * Unfortunately the list may change while we're waiting
9884
		 * on the buf lock for either resource so we must loop until
9885
		 * we lock the right one.  If both the superblock and an
9886
		 * inode point to this inode we must clear the inode first
9887
		 * followed by the superblock.
9888
		 */
9889
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9890
		pino = 0;
9891
		if (idp && (idp->id_state & UNLINKNEXT))
9892
			pino = idp->id_ino;
9893
		FREE_LOCK(ump);
9894
		if (pino == 0) {
9895
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9896
			    (int)fs->fs_sbsize, 0, 0, 0);
9897
		} else {
9898
			dbn = fsbtodb(fs, ino_to_fsba(fs, pino));
9899
			error = ffs_breadz(ump, ump->um_devvp, dbn, dbn,
9900
			    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL,
9901
			    &bp);
9902
		}
9903
		ACQUIRE_LOCK(ump);
9904
		if (error)
9905
			break;
9906
		/* If the list has changed restart the loop. */
9907
		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9908
		nino = 0;
9909
		if (idp && (idp->id_state & UNLINKNEXT))
9910
			nino = idp->id_ino;
9911
		if (nino != pino ||
9912
		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9913
			FREE_LOCK(ump);
9914
			brelse(bp);
9915
			ACQUIRE_LOCK(ump);
9916
			continue;
9917
		}
9918
		nino = 0;
9919
		idn = TAILQ_NEXT(inodedep, id_unlinked);
9920
		if (idn)
9921
			nino = idn->id_ino;
9922
		/*
9923
		 * Remove us from the in memory list.  After this we cannot
9924
		 * access the inodedep.
9925
		 */
9926
		KASSERT((inodedep->id_state & UNLINKED) != 0,
9927
		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9928
		    inodedep));
9929
		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9930
		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9931
		FREE_LOCK(ump);
9932
		/*
9933
		 * The predecessor's next pointer is manually updated here
9934
		 * so that the NEXT flag is never cleared for an element
9935
		 * that is in the list.
9936
		 */
9937
		if (pino == 0) {
9938
			bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize);
9939
			bpfs = (struct fs *)bp->b_data;
9940
			ffs_oldfscompat_write(bpfs);
9941
			softdep_setup_sbupdate(ump, bpfs, bp);
9942
			/*
9943
			 * Because we may have made changes to the superblock,
9944
			 * we need to recompute its check-hash.
9945
			 */
9946
			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9947
		} else if (fs->fs_magic == FS_UFS1_MAGIC) {
9948
			((struct ufs1_dinode *)bp->b_data +
9949
			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9950
		} else {
9951
			dip = (struct ufs2_dinode *)bp->b_data +
9952
			    ino_to_fsbo(fs, pino);
9953
			dip->di_freelink = nino;
9954
			ffs_update_dinode_ckhash(fs, dip);
9955
		}
9956
		/*
9957
		 * If the bwrite fails we have no recourse to recover.  The
9958
		 * filesystem is corrupted already.
9959
		 */
9960
		bwrite(bp);
9961
		ACQUIRE_LOCK(ump);
9962
		/*
9963
		 * If the superblock pointer still needs to be cleared force
9964
		 * a write here.
9965
		 */
9966
		if (fs->fs_sujfree == ino) {
9967
			FREE_LOCK(ump);
9968
			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9969
			    (int)fs->fs_sbsize, 0, 0, 0);
9970
			bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize);
9971
			bpfs = (struct fs *)bp->b_data;
9972
			ffs_oldfscompat_write(bpfs);
9973
			softdep_setup_sbupdate(ump, bpfs, bp);
9974
			/*
9975
			 * Because we may have made changes to the superblock,
9976
			 * we need to recompute its check-hash.
9977
			 */
9978
			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9979
			bwrite(bp);
9980
			ACQUIRE_LOCK(ump);
9981
		}
9982

9983
		if (fs->fs_sujfree != ino)
9984
			return;
9985
		panic("clear_unlinked_inodedep: Failed to clear free head");
9986
	}
9987
	if (inodedep->id_ino == fs->fs_sujfree)
9988
		panic("clear_unlinked_inodedep: Freeing head of free list");
9989
	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9990
	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9991
	return;
9992
}
9993

9994
/*
9995
 * This workitem decrements the inode's link count.
9996
 * If the link count reaches zero, the file is removed.
9997
 */
9998
static int
9999
handle_workitem_remove(struct dirrem *dirrem, int flags)
10000
{
10001
	struct inodedep *inodedep;
10002
	struct workhead dotdotwk;
10003
	struct worklist *wk;
10004
	struct ufsmount *ump;
10005
	struct mount *mp;
10006
	struct vnode *vp;
10007
	struct inode *ip;
10008
	ino_t oldinum;
10009

10010
	if (dirrem->dm_state & ONWORKLIST)
10011
		panic("handle_workitem_remove: dirrem %p still on worklist",
10012
		    dirrem);
10013
	oldinum = dirrem->dm_oldinum;
10014
	mp = dirrem->dm_list.wk_mp;
10015
	ump = VFSTOUFS(mp);
10016
	flags |= LK_EXCLUSIVE;
10017
	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ |
10018
	    FFSV_FORCEINODEDEP) != 0)
10019
		return (EBUSY);
10020
	ip = VTOI(vp);
10021
	MPASS(ip->i_mode != 0);
10022
	ACQUIRE_LOCK(ump);
10023
	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
10024
		panic("handle_workitem_remove: lost inodedep");
10025
	if (dirrem->dm_state & ONDEPLIST)
10026
		LIST_REMOVE(dirrem, dm_inonext);
10027
	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
10028
	    ("handle_workitem_remove:  Journal entries not written."));
10029

10030
	/*
10031
	 * Move all dependencies waiting on the remove to complete
10032
	 * from the dirrem to the inode inowait list to be completed
10033
	 * after the inode has been updated and written to disk.
10034
	 *
10035
	 * Any marked MKDIR_PARENT are saved to be completed when the 
10036
	 * dotdot ref is removed unless DIRCHG is specified.  For
10037
	 * directory change operations there will be no further
10038
	 * directory writes and the jsegdeps need to be moved along
10039
	 * with the rest to be completed when the inode is free or
10040
	 * stable in the inode free list.
10041
	 */
10042
	LIST_INIT(&dotdotwk);
10043
	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
10044
		WORKLIST_REMOVE(wk);
10045
		if ((dirrem->dm_state & DIRCHG) == 0 &&
10046
		    wk->wk_state & MKDIR_PARENT) {
10047
			wk->wk_state &= ~MKDIR_PARENT;
10048
			WORKLIST_INSERT(&dotdotwk, wk);
10049
			continue;
10050
		}
10051
		WORKLIST_INSERT(&inodedep->id_inowait, wk);
10052
	}
10053
	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
10054
	/*
10055
	 * Normal file deletion.
10056
	 */
10057
	if ((dirrem->dm_state & RMDIR) == 0) {
10058
		ip->i_nlink--;
10059
		KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino "
10060
		    "%ju negative i_nlink %d", (intmax_t)ip->i_number,
10061
		    ip->i_nlink));
10062
		DIP_SET_NLINK(ip, ip->i_nlink);
10063
		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10064
		if (ip->i_nlink < ip->i_effnlink)
10065
			panic("handle_workitem_remove: bad file delta");
10066
		if (ip->i_nlink == 0) 
10067
			unlinked_inodedep(mp, inodedep);
10068
		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10069
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10070
		    ("handle_workitem_remove: worklist not empty. %s",
10071
		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
10072
		WORKITEM_FREE(dirrem, D_DIRREM);
10073
		FREE_LOCK(ump);
10074
		goto out;
10075
	}
10076
	/*
10077
	 * Directory deletion. Decrement reference count for both the
10078
	 * just deleted parent directory entry and the reference for ".".
10079
	 * Arrange to have the reference count on the parent decremented
10080
	 * to account for the loss of "..".
10081
	 */
10082
	ip->i_nlink -= 2;
10083
	KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino "
10084
	    "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink));
10085
	DIP_SET_NLINK(ip, ip->i_nlink);
10086
	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10087
	if (ip->i_nlink < ip->i_effnlink)
10088
		panic("handle_workitem_remove: bad dir delta");
10089
	if (ip->i_nlink == 0)
10090
		unlinked_inodedep(mp, inodedep);
10091
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10092
	/*
10093
	 * Rename a directory to a new parent. Since, we are both deleting
10094
	 * and creating a new directory entry, the link count on the new
10095
	 * directory should not change. Thus we skip the followup dirrem.
10096
	 */
10097
	if (dirrem->dm_state & DIRCHG) {
10098
		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10099
		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
10100
		WORKITEM_FREE(dirrem, D_DIRREM);
10101
		FREE_LOCK(ump);
10102
		goto out;
10103
	}
10104
	dirrem->dm_state = ONDEPLIST;
10105
	dirrem->dm_oldinum = dirrem->dm_dirinum;
10106
	/*
10107
	 * Place the dirrem on the parent's diremhd list.
10108
	 */
10109
	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
10110
		panic("handle_workitem_remove: lost dir inodedep");
10111
	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
10112
	/*
10113
	 * If the allocated inode has never been written to disk, then
10114
	 * the on-disk inode is zero'ed and we can remove the file
10115
	 * immediately.  When journaling if the inode has been marked
10116
	 * unlinked and not DEPCOMPLETE we know it can never be written.
10117
	 */
10118
	inodedep_lookup(mp, oldinum, 0, &inodedep);
10119
	if (inodedep == NULL ||
10120
	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
10121
	    check_inode_unwritten(inodedep)) {
10122
		FREE_LOCK(ump);
10123
		vput(vp);
10124
		return handle_workitem_remove(dirrem, flags);
10125
	}
10126
	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
10127
	FREE_LOCK(ump);
10128
	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10129
out:
10130
	ffs_update(vp, 0);
10131
	vput(vp);
10132
	return (0);
10133
}
10134

10135
/*
10136
 * Inode de-allocation dependencies.
10137
 * 
10138
 * When an inode's link count is reduced to zero, it can be de-allocated. We
10139
 * found it convenient to postpone de-allocation until after the inode is
10140
 * written to disk with its new link count (zero).  At this point, all of the
10141
 * on-disk inode's block pointers are nullified and, with careful dependency
10142
 * list ordering, all dependencies related to the inode will be satisfied and
10143
 * the corresponding dependency structures de-allocated.  So, if/when the
10144
 * inode is reused, there will be no mixing of old dependencies with new
10145
 * ones.  This artificial dependency is set up by the block de-allocation
10146
 * procedure above (softdep_setup_freeblocks) and completed by the
10147
 * following procedure.
10148
 */
10149
static void 
10150
handle_workitem_freefile(struct freefile *freefile)
10151
{
10152
	struct workhead wkhd;
10153
	struct fs *fs;
10154
	struct ufsmount *ump;
10155
	int error;
10156
#ifdef INVARIANTS
10157
	struct inodedep *idp;
10158
#endif
10159

10160
	ump = VFSTOUFS(freefile->fx_list.wk_mp);
10161
	fs = ump->um_fs;
10162
#ifdef INVARIANTS
10163
	ACQUIRE_LOCK(ump);
10164
	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
10165
	FREE_LOCK(ump);
10166
	if (error)
10167
		panic("handle_workitem_freefile: inodedep %p survived", idp);
10168
#endif
10169
	UFS_LOCK(ump);
10170
	fs->fs_pendinginodes -= 1;
10171
	UFS_UNLOCK(ump);
10172
	LIST_INIT(&wkhd);
10173
	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
10174
	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
10175
	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
10176
		softdep_error("handle_workitem_freefile", error);
10177
	ACQUIRE_LOCK(ump);
10178
	WORKITEM_FREE(freefile, D_FREEFILE);
10179
	FREE_LOCK(ump);
10180
}
10181

10182
/*
10183
 * Helper function which unlinks marker element from work list and returns
10184
 * the next element on the list.
10185
 */
10186
static __inline struct worklist *
10187
markernext(struct worklist *marker)
10188
{
10189
	struct worklist *next;
10190

10191
	next = LIST_NEXT(marker, wk_list);
10192
	LIST_REMOVE(marker, wk_list);
10193
	return next;
10194
}
10195

10196
/*
10197
 * Disk writes.
10198
 * 
10199
 * The dependency structures constructed above are most actively used when file
10200
 * system blocks are written to disk.  No constraints are placed on when a
10201
 * block can be written, but unsatisfied update dependencies are made safe by
10202
 * modifying (or replacing) the source memory for the duration of the disk
10203
 * write.  When the disk write completes, the memory block is again brought
10204
 * up-to-date.
10205
 *
10206
 * In-core inode structure reclamation.
10207
 * 
10208
 * Because there are a finite number of "in-core" inode structures, they are
10209
 * reused regularly.  By transferring all inode-related dependencies to the
10210
 * in-memory inode block and indexing them separately (via "inodedep"s), we
10211
 * can allow "in-core" inode structures to be reused at any time and avoid
10212
 * any increase in contention.
10213
 *
10214
 * Called just before entering the device driver to initiate a new disk I/O.
10215
 * The buffer must be locked, thus, no I/O completion operations can occur
10216
 * while we are manipulating its associated dependencies.
10217
 */
10218
static void 
10219
softdep_disk_io_initiation(
10220
	struct buf *bp)		/* structure describing disk write to occur */
10221
{
10222
	struct worklist *wk;
10223
	struct worklist marker;
10224
	struct inodedep *inodedep;
10225
	struct freeblks *freeblks;
10226
	struct jblkdep *jblkdep;
10227
	struct newblk *newblk;
10228
	struct ufsmount *ump;
10229

10230
	/*
10231
	 * We only care about write operations. There should never
10232
	 * be dependencies for reads.
10233
	 */
10234
	if (bp->b_iocmd != BIO_WRITE)
10235
		panic("softdep_disk_io_initiation: not write");
10236

10237
	if (bp->b_vflags & BV_BKGRDINPROG)
10238
		panic("softdep_disk_io_initiation: Writing buffer with "
10239
		    "background write in progress: %p", bp);
10240

10241
	ump = softdep_bp_to_mp(bp);
10242
	if (ump == NULL)
10243
		return;
10244

10245
	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
10246
	ACQUIRE_LOCK(ump);
10247
	/*
10248
	 * Do any necessary pre-I/O processing.
10249
	 */
10250
	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
10251
	     wk = markernext(&marker)) {
10252
		LIST_INSERT_AFTER(wk, &marker, wk_list);
10253
		switch (wk->wk_type) {
10254
		case D_PAGEDEP:
10255
			initiate_write_filepage(WK_PAGEDEP(wk), bp);
10256
			continue;
10257

10258
		case D_INODEDEP:
10259
			inodedep = WK_INODEDEP(wk);
10260
			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10261
				initiate_write_inodeblock_ufs1(inodedep, bp);
10262
			else
10263
				initiate_write_inodeblock_ufs2(inodedep, bp);
10264
			continue;
10265

10266
		case D_INDIRDEP:
10267
			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10268
			continue;
10269

10270
		case D_BMSAFEMAP:
10271
			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10272
			continue;
10273

10274
		case D_JSEG:
10275
			WK_JSEG(wk)->js_buf = NULL;
10276
			continue;
10277

10278
		case D_FREEBLKS:
10279
			freeblks = WK_FREEBLKS(wk);
10280
			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10281
			/*
10282
			 * We have to wait for the freeblks to be journaled
10283
			 * before we can write an inodeblock with updated
10284
			 * pointers.  Be careful to arrange the marker so
10285
			 * we revisit the freeblks if it's not removed by
10286
			 * the first jwait().
10287
			 */
10288
			if (jblkdep != NULL) {
10289
				LIST_REMOVE(&marker, wk_list);
10290
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10291
				jwait(&jblkdep->jb_list, MNT_WAIT);
10292
			}
10293
			continue;
10294
		case D_ALLOCDIRECT:
10295
		case D_ALLOCINDIR:
10296
			/*
10297
			 * We have to wait for the jnewblk to be journaled
10298
			 * before we can write to a block if the contents
10299
			 * may be confused with an earlier file's indirect
10300
			 * at recovery time.  Handle the marker as described
10301
			 * above.
10302
			 */
10303
			newblk = WK_NEWBLK(wk);
10304
			if (newblk->nb_jnewblk != NULL &&
10305
			    indirblk_lookup(newblk->nb_list.wk_mp,
10306
			    newblk->nb_newblkno)) {
10307
				LIST_REMOVE(&marker, wk_list);
10308
				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10309
				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10310
			}
10311
			continue;
10312

10313
		case D_SBDEP:
10314
			initiate_write_sbdep(WK_SBDEP(wk));
10315
			continue;
10316

10317
		case D_MKDIR:
10318
		case D_FREEWORK:
10319
		case D_FREEDEP:
10320
		case D_JSEGDEP:
10321
			continue;
10322

10323
		default:
10324
			panic("handle_disk_io_initiation: Unexpected type %s",
10325
			    TYPENAME(wk->wk_type));
10326
			/* NOTREACHED */
10327
		}
10328
	}
10329
	FREE_LOCK(ump);
10330
}
10331

10332
/*
10333
 * Called from within the procedure above to deal with unsatisfied
10334
 * allocation dependencies in a directory. The buffer must be locked,
10335
 * thus, no I/O completion operations can occur while we are
10336
 * manipulating its associated dependencies.
10337
 */
10338
static void
10339
initiate_write_filepage(struct pagedep *pagedep, struct buf *bp)
10340
{
10341
	struct jremref *jremref;
10342
	struct jmvref *jmvref;
10343
	struct dirrem *dirrem;
10344
	struct diradd *dap;
10345
	struct direct *ep;
10346
	int i;
10347

10348
	if (pagedep->pd_state & IOSTARTED) {
10349
		/*
10350
		 * This can only happen if there is a driver that does not
10351
		 * understand chaining. Here biodone will reissue the call
10352
		 * to strategy for the incomplete buffers.
10353
		 */
10354
		printf("initiate_write_filepage: already started\n");
10355
		return;
10356
	}
10357
	pagedep->pd_state |= IOSTARTED;
10358
	/*
10359
	 * Wait for all journal remove dependencies to hit the disk.
10360
	 * We can not allow any potentially conflicting directory adds
10361
	 * to be visible before removes and rollback is too difficult.
10362
	 * The per-filesystem lock may be dropped and re-acquired, however 
10363
	 * we hold the buf locked so the dependency can not go away.
10364
	 */
10365
	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10366
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10367
			jwait(&jremref->jr_list, MNT_WAIT);
10368
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10369
		jwait(&jmvref->jm_list, MNT_WAIT);
10370
	for (i = 0; i < DAHASHSZ; i++) {
10371
		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10372
			ep = (struct direct *)
10373
			    ((char *)bp->b_data + dap->da_offset);
10374
			if (ep->d_ino != dap->da_newinum)
10375
				panic("%s: dir inum %ju != new %ju",
10376
				    "initiate_write_filepage",
10377
				    (uintmax_t)ep->d_ino,
10378
				    (uintmax_t)dap->da_newinum);
10379
			if (dap->da_state & DIRCHG)
10380
				ep->d_ino = dap->da_previous->dm_oldinum;
10381
			else
10382
				ep->d_ino = 0;
10383
			dap->da_state &= ~ATTACHED;
10384
			dap->da_state |= UNDONE;
10385
		}
10386
	}
10387
}
10388

10389
/*
10390
 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10391
 * Note that any bug fixes made to this routine must be done in the
10392
 * version found below.
10393
 *
10394
 * Called from within the procedure above to deal with unsatisfied
10395
 * allocation dependencies in an inodeblock. The buffer must be
10396
 * locked, thus, no I/O completion operations can occur while we
10397
 * are manipulating its associated dependencies.
10398
 */
10399
static void 
10400
initiate_write_inodeblock_ufs1(
10401
	struct inodedep *inodedep,
10402
	struct buf *bp)			/* The inode block */
10403
{
10404
	struct allocdirect *adp, *lastadp;
10405
	struct ufs1_dinode *dp;
10406
	struct ufs1_dinode *sip;
10407
	struct inoref *inoref;
10408
	struct ufsmount *ump;
10409
	struct fs *fs;
10410
	ufs_lbn_t i;
10411
#ifdef INVARIANTS
10412
	ufs_lbn_t prevlbn = 0;
10413
#endif
10414
	int deplist __diagused;
10415

10416
	if (inodedep->id_state & IOSTARTED)
10417
		panic("initiate_write_inodeblock_ufs1: already started");
10418
	inodedep->id_state |= IOSTARTED;
10419
	fs = inodedep->id_fs;
10420
	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10421
	LOCK_OWNED(ump);
10422
	dp = (struct ufs1_dinode *)bp->b_data +
10423
	    ino_to_fsbo(fs, inodedep->id_ino);
10424

10425
	/*
10426
	 * If we're on the unlinked list but have not yet written our
10427
	 * next pointer initialize it here.
10428
	 */
10429
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10430
		struct inodedep *inon;
10431

10432
		inon = TAILQ_NEXT(inodedep, id_unlinked);
10433
		dp->di_freelink = inon ? inon->id_ino : 0;
10434
	}
10435
	/*
10436
	 * If the bitmap is not yet written, then the allocated
10437
	 * inode cannot be written to disk.
10438
	 */
10439
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10440
		if (inodedep->id_savedino1 != NULL)
10441
			panic("initiate_write_inodeblock_ufs1: I/O underway");
10442
		FREE_LOCK(ump);
10443
		sip = malloc(sizeof(struct ufs1_dinode),
10444
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10445
		ACQUIRE_LOCK(ump);
10446
		inodedep->id_savedino1 = sip;
10447
		*inodedep->id_savedino1 = *dp;
10448
		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10449
		dp->di_gen = inodedep->id_savedino1->di_gen;
10450
		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10451
		return;
10452
	}
10453
	/*
10454
	 * If no dependencies, then there is nothing to roll back.
10455
	 */
10456
	inodedep->id_savedsize = dp->di_size;
10457
	inodedep->id_savedextsize = 0;
10458
	inodedep->id_savednlink = dp->di_nlink;
10459
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10460
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10461
		return;
10462
	/*
10463
	 * Revert the link count to that of the first unwritten journal entry.
10464
	 */
10465
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10466
	if (inoref)
10467
		dp->di_nlink = inoref->if_nlink;
10468
	/*
10469
	 * Set the dependencies to busy.
10470
	 */
10471
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10472
	     adp = TAILQ_NEXT(adp, ad_next)) {
10473
#ifdef INVARIANTS
10474
		if (deplist != 0 && prevlbn >= adp->ad_offset)
10475
			panic("softdep_write_inodeblock: lbn order");
10476
		prevlbn = adp->ad_offset;
10477
		if (adp->ad_offset < UFS_NDADDR &&
10478
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10479
			panic("initiate_write_inodeblock_ufs1: "
10480
			    "direct pointer #%jd mismatch %d != %jd",
10481
			    (intmax_t)adp->ad_offset,
10482
			    dp->di_db[adp->ad_offset],
10483
			    (intmax_t)adp->ad_newblkno);
10484
		if (adp->ad_offset >= UFS_NDADDR &&
10485
		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10486
			panic("initiate_write_inodeblock_ufs1: "
10487
			    "indirect pointer #%jd mismatch %d != %jd",
10488
			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10489
			    dp->di_ib[adp->ad_offset - UFS_NDADDR],
10490
			    (intmax_t)adp->ad_newblkno);
10491
		deplist |= 1 << adp->ad_offset;
10492
		if ((adp->ad_state & ATTACHED) == 0)
10493
			panic("initiate_write_inodeblock_ufs1: "
10494
			    "Unknown state 0x%x", adp->ad_state);
10495
#endif /* INVARIANTS */
10496
		adp->ad_state &= ~ATTACHED;
10497
		adp->ad_state |= UNDONE;
10498
	}
10499
	/*
10500
	 * The on-disk inode cannot claim to be any larger than the last
10501
	 * fragment that has been written. Otherwise, the on-disk inode
10502
	 * might have fragments that were not the last block in the file
10503
	 * which would corrupt the filesystem.
10504
	 */
10505
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10506
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10507
		if (adp->ad_offset >= UFS_NDADDR)
10508
			break;
10509
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10510
		/* keep going until hitting a rollback to a frag */
10511
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10512
			continue;
10513
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10514
		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10515
#ifdef INVARIANTS
10516
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10517
				panic("initiate_write_inodeblock_ufs1: "
10518
				    "lost dep1");
10519
#endif /* INVARIANTS */
10520
			dp->di_db[i] = 0;
10521
		}
10522
		for (i = 0; i < UFS_NIADDR; i++) {
10523
#ifdef INVARIANTS
10524
			if (dp->di_ib[i] != 0 &&
10525
			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10526
				panic("initiate_write_inodeblock_ufs1: "
10527
				    "lost dep2");
10528
#endif /* INVARIANTS */
10529
			dp->di_ib[i] = 0;
10530
		}
10531
		return;
10532
	}
10533
	/*
10534
	 * If we have zero'ed out the last allocated block of the file,
10535
	 * roll back the size to the last currently allocated block.
10536
	 * We know that this last allocated block is a full-sized as
10537
	 * we already checked for fragments in the loop above.
10538
	 */
10539
	if (lastadp != NULL &&
10540
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10541
		for (i = lastadp->ad_offset; i >= 0; i--)
10542
			if (dp->di_db[i] != 0)
10543
				break;
10544
		dp->di_size = (i + 1) * fs->fs_bsize;
10545
	}
10546
	/*
10547
	 * The only dependencies are for indirect blocks.
10548
	 *
10549
	 * The file size for indirect block additions is not guaranteed.
10550
	 * Such a guarantee would be non-trivial to achieve. The conventional
10551
	 * synchronous write implementation also does not make this guarantee.
10552
	 * Fsck should catch and fix discrepancies. Arguably, the file size
10553
	 * can be over-estimated without destroying integrity when the file
10554
	 * moves into the indirect blocks (i.e., is large). If we want to
10555
	 * postpone fsck, we are stuck with this argument.
10556
	 */
10557
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10558
		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10559
}
10560
		
10561
/*
10562
 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10563
 * Note that any bug fixes made to this routine must be done in the
10564
 * version found above.
10565
 *
10566
 * Called from within the procedure above to deal with unsatisfied
10567
 * allocation dependencies in an inodeblock. The buffer must be
10568
 * locked, thus, no I/O completion operations can occur while we
10569
 * are manipulating its associated dependencies.
10570
 */
10571
static void 
10572
initiate_write_inodeblock_ufs2(
10573
	struct inodedep *inodedep,
10574
	struct buf *bp)			/* The inode block */
10575
{
10576
	struct allocdirect *adp, *lastadp;
10577
	struct ufs2_dinode *dp;
10578
	struct ufs2_dinode *sip;
10579
	struct inoref *inoref;
10580
	struct ufsmount *ump;
10581
	struct fs *fs;
10582
	ufs_lbn_t i;
10583
#ifdef INVARIANTS
10584
	ufs_lbn_t prevlbn = 0;
10585
#endif
10586
	int deplist __diagused;
10587

10588
	if (inodedep->id_state & IOSTARTED)
10589
		panic("initiate_write_inodeblock_ufs2: already started");
10590
	inodedep->id_state |= IOSTARTED;
10591
	fs = inodedep->id_fs;
10592
	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10593
	LOCK_OWNED(ump);
10594
	dp = (struct ufs2_dinode *)bp->b_data +
10595
	    ino_to_fsbo(fs, inodedep->id_ino);
10596

10597
	/*
10598
	 * If we're on the unlinked list but have not yet written our
10599
	 * next pointer initialize it here.
10600
	 */
10601
	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10602
		struct inodedep *inon;
10603

10604
		inon = TAILQ_NEXT(inodedep, id_unlinked);
10605
		dp->di_freelink = inon ? inon->id_ino : 0;
10606
		ffs_update_dinode_ckhash(fs, dp);
10607
	}
10608
	/*
10609
	 * If the bitmap is not yet written, then the allocated
10610
	 * inode cannot be written to disk.
10611
	 */
10612
	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10613
		if (inodedep->id_savedino2 != NULL)
10614
			panic("initiate_write_inodeblock_ufs2: I/O underway");
10615
		FREE_LOCK(ump);
10616
		sip = malloc(sizeof(struct ufs2_dinode),
10617
		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10618
		ACQUIRE_LOCK(ump);
10619
		inodedep->id_savedino2 = sip;
10620
		*inodedep->id_savedino2 = *dp;
10621
		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10622
		dp->di_gen = inodedep->id_savedino2->di_gen;
10623
		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10624
		return;
10625
	}
10626
	/*
10627
	 * If no dependencies, then there is nothing to roll back.
10628
	 */
10629
	inodedep->id_savedsize = dp->di_size;
10630
	inodedep->id_savedextsize = dp->di_extsize;
10631
	inodedep->id_savednlink = dp->di_nlink;
10632
	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10633
	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10634
	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10635
		return;
10636
	/*
10637
	 * Revert the link count to that of the first unwritten journal entry.
10638
	 */
10639
	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10640
	if (inoref)
10641
		dp->di_nlink = inoref->if_nlink;
10642

10643
	/*
10644
	 * Set the ext data dependencies to busy.
10645
	 */
10646
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10647
	     adp = TAILQ_NEXT(adp, ad_next)) {
10648
#ifdef INVARIANTS
10649
		if (deplist != 0 && prevlbn >= adp->ad_offset)
10650
			panic("initiate_write_inodeblock_ufs2: lbn order");
10651
		prevlbn = adp->ad_offset;
10652
		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10653
			panic("initiate_write_inodeblock_ufs2: "
10654
			    "ext pointer #%jd mismatch %jd != %jd",
10655
			    (intmax_t)adp->ad_offset,
10656
			    (intmax_t)dp->di_extb[adp->ad_offset],
10657
			    (intmax_t)adp->ad_newblkno);
10658
		deplist |= 1 << adp->ad_offset;
10659
		if ((adp->ad_state & ATTACHED) == 0)
10660
			panic("initiate_write_inodeblock_ufs2: Unknown "
10661
			    "state 0x%x", adp->ad_state);
10662
#endif /* INVARIANTS */
10663
		adp->ad_state &= ~ATTACHED;
10664
		adp->ad_state |= UNDONE;
10665
	}
10666
	/*
10667
	 * The on-disk inode cannot claim to be any larger than the last
10668
	 * fragment that has been written. Otherwise, the on-disk inode
10669
	 * might have fragments that were not the last block in the ext
10670
	 * data which would corrupt the filesystem.
10671
	 */
10672
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10673
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10674
		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10675
		/* keep going until hitting a rollback to a frag */
10676
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10677
			continue;
10678
		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10679
		for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
10680
#ifdef INVARIANTS
10681
			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10682
				panic("initiate_write_inodeblock_ufs2: "
10683
				    "lost dep1");
10684
#endif /* INVARIANTS */
10685
			dp->di_extb[i] = 0;
10686
		}
10687
		lastadp = NULL;
10688
		break;
10689
	}
10690
	/*
10691
	 * If we have zero'ed out the last allocated block of the ext
10692
	 * data, roll back the size to the last currently allocated block.
10693
	 * We know that this last allocated block is a full-sized as
10694
	 * we already checked for fragments in the loop above.
10695
	 */
10696
	if (lastadp != NULL &&
10697
	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10698
		for (i = lastadp->ad_offset; i >= 0; i--)
10699
			if (dp->di_extb[i] != 0)
10700
				break;
10701
		dp->di_extsize = (i + 1) * fs->fs_bsize;
10702
	}
10703
	/*
10704
	 * Set the file data dependencies to busy.
10705
	 */
10706
	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10707
	     adp = TAILQ_NEXT(adp, ad_next)) {
10708
#ifdef INVARIANTS
10709
		if (deplist != 0 && prevlbn >= adp->ad_offset)
10710
			panic("softdep_write_inodeblock: lbn order");
10711
		if ((adp->ad_state & ATTACHED) == 0)
10712
			panic("inodedep %p and adp %p not attached", inodedep, adp);
10713
		prevlbn = adp->ad_offset;
10714
		if (!ffs_fsfail_cleanup(ump, 0) &&
10715
		    adp->ad_offset < UFS_NDADDR &&
10716
		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10717
			panic("initiate_write_inodeblock_ufs2: "
10718
			    "direct pointer #%jd mismatch %jd != %jd",
10719
			    (intmax_t)adp->ad_offset,
10720
			    (intmax_t)dp->di_db[adp->ad_offset],
10721
			    (intmax_t)adp->ad_newblkno);
10722
		if (!ffs_fsfail_cleanup(ump, 0) &&
10723
		    adp->ad_offset >= UFS_NDADDR &&
10724
		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10725
			panic("initiate_write_inodeblock_ufs2: "
10726
			    "indirect pointer #%jd mismatch %jd != %jd",
10727
			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10728
			    (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
10729
			    (intmax_t)adp->ad_newblkno);
10730
		deplist |= 1 << adp->ad_offset;
10731
		if ((adp->ad_state & ATTACHED) == 0)
10732
			panic("initiate_write_inodeblock_ufs2: Unknown "
10733
			     "state 0x%x", adp->ad_state);
10734
#endif /* INVARIANTS */
10735
		adp->ad_state &= ~ATTACHED;
10736
		adp->ad_state |= UNDONE;
10737
	}
10738
	/*
10739
	 * The on-disk inode cannot claim to be any larger than the last
10740
	 * fragment that has been written. Otherwise, the on-disk inode
10741
	 * might have fragments that were not the last block in the file
10742
	 * which would corrupt the filesystem.
10743
	 */
10744
	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10745
	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10746
		if (adp->ad_offset >= UFS_NDADDR)
10747
			break;
10748
		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10749
		/* keep going until hitting a rollback to a frag */
10750
		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10751
			continue;
10752
		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10753
		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10754
#ifdef INVARIANTS
10755
			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10756
				panic("initiate_write_inodeblock_ufs2: "
10757
				    "lost dep2");
10758
#endif /* INVARIANTS */
10759
			dp->di_db[i] = 0;
10760
		}
10761
		for (i = 0; i < UFS_NIADDR; i++) {
10762
#ifdef INVARIANTS
10763
			if (dp->di_ib[i] != 0 &&
10764
			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10765
				panic("initiate_write_inodeblock_ufs2: "
10766
				    "lost dep3");
10767
#endif /* INVARIANTS */
10768
			dp->di_ib[i] = 0;
10769
		}
10770
		ffs_update_dinode_ckhash(fs, dp);
10771
		return;
10772
	}
10773
	/*
10774
	 * If we have zero'ed out the last allocated block of the file,
10775
	 * roll back the size to the last currently allocated block.
10776
	 * We know that this last allocated block is a full-sized as
10777
	 * we already checked for fragments in the loop above.
10778
	 */
10779
	if (lastadp != NULL &&
10780
	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10781
		for (i = lastadp->ad_offset; i >= 0; i--)
10782
			if (dp->di_db[i] != 0)
10783
				break;
10784
		dp->di_size = (i + 1) * fs->fs_bsize;
10785
	}
10786
	/*
10787
	 * The only dependencies are for indirect blocks.
10788
	 *
10789
	 * The file size for indirect block additions is not guaranteed.
10790
	 * Such a guarantee would be non-trivial to achieve. The conventional
10791
	 * synchronous write implementation also does not make this guarantee.
10792
	 * Fsck should catch and fix discrepancies. Arguably, the file size
10793
	 * can be over-estimated without destroying integrity when the file
10794
	 * moves into the indirect blocks (i.e., is large). If we want to
10795
	 * postpone fsck, we are stuck with this argument.
10796
	 */
10797
	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10798
		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10799
	ffs_update_dinode_ckhash(fs, dp);
10800
}
10801

10802
/*
10803
 * Cancel an indirdep as a result of truncation.  Release all of the
10804
 * children allocindirs and place their journal work on the appropriate
10805
 * list.
10806
 */
10807
static void
10808
cancel_indirdep(
10809
	struct indirdep *indirdep,
10810
	struct buf *bp,
10811
	struct freeblks *freeblks)
10812
{
10813
	struct allocindir *aip;
10814

10815
	/*
10816
	 * None of the indirect pointers will ever be visible,
10817
	 * so they can simply be tossed. GOINGAWAY ensures
10818
	 * that allocated pointers will be saved in the buffer
10819
	 * cache until they are freed. Note that they will
10820
	 * only be able to be found by their physical address
10821
	 * since the inode mapping the logical address will
10822
	 * be gone. The save buffer used for the safe copy
10823
	 * was allocated in setup_allocindir_phase2 using
10824
	 * the physical address so it could be used for this
10825
	 * purpose. Hence we swap the safe copy with the real
10826
	 * copy, allowing the safe copy to be freed and holding
10827
	 * on to the real copy for later use in indir_trunc.
10828
	 */
10829
	if (indirdep->ir_state & GOINGAWAY)
10830
		panic("cancel_indirdep: already gone");
10831
	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10832
		indirdep->ir_state |= DEPCOMPLETE;
10833
		LIST_REMOVE(indirdep, ir_next);
10834
	}
10835
	indirdep->ir_state |= GOINGAWAY;
10836
	/*
10837
	 * Pass in bp for blocks still have journal writes
10838
	 * pending so we can cancel them on their own.
10839
	 */
10840
	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10841
		cancel_allocindir(aip, bp, freeblks, 0);
10842
	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10843
		cancel_allocindir(aip, NULL, freeblks, 0);
10844
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10845
		cancel_allocindir(aip, NULL, freeblks, 0);
10846
	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10847
		cancel_allocindir(aip, NULL, freeblks, 0);
10848
	/*
10849
	 * If there are pending partial truncations we need to keep the
10850
	 * old block copy around until they complete.  This is because
10851
	 * the current b_data is not a perfect superset of the available
10852
	 * blocks.
10853
	 */
10854
	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10855
		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10856
	else
10857
		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10858
	WORKLIST_REMOVE(&indirdep->ir_list);
10859
	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10860
	indirdep->ir_bp = NULL;
10861
	indirdep->ir_freeblks = freeblks;
10862
}
10863

10864
/*
10865
 * Free an indirdep once it no longer has new pointers to track.
10866
 */
10867
static void
10868
free_indirdep(struct indirdep *indirdep)
10869
{
10870

10871
	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10872
	    ("free_indirdep: Indir trunc list not empty."));
10873
	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10874
	    ("free_indirdep: Complete head not empty."));
10875
	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10876
	    ("free_indirdep: write head not empty."));
10877
	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10878
	    ("free_indirdep: done head not empty."));
10879
	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10880
	    ("free_indirdep: deplist head not empty."));
10881
	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10882
	    ("free_indirdep: %p still on newblk list.", indirdep));
10883
	KASSERT(indirdep->ir_saveddata == NULL,
10884
	    ("free_indirdep: %p still has saved data.", indirdep));
10885
	KASSERT(indirdep->ir_savebp == NULL,
10886
	    ("free_indirdep: %p still has savebp buffer.", indirdep));
10887
	if (indirdep->ir_state & ONWORKLIST)
10888
		WORKLIST_REMOVE(&indirdep->ir_list);
10889
	WORKITEM_FREE(indirdep, D_INDIRDEP);
10890
}
10891

10892
/*
10893
 * Called before a write to an indirdep.  This routine is responsible for
10894
 * rolling back pointers to a safe state which includes only those
10895
 * allocindirs which have been completed.
10896
 */
10897
static void
10898
initiate_write_indirdep(struct indirdep *indirdep, struct buf *bp)
10899
{
10900
	struct ufsmount *ump;
10901

10902
	indirdep->ir_state |= IOSTARTED;
10903
	if (indirdep->ir_state & GOINGAWAY)
10904
		panic("disk_io_initiation: indirdep gone");
10905
	/*
10906
	 * If there are no remaining dependencies, this will be writing
10907
	 * the real pointers.
10908
	 */
10909
	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10910
	    TAILQ_EMPTY(&indirdep->ir_trunc))
10911
		return;
10912
	/*
10913
	 * Replace up-to-date version with safe version.
10914
	 */
10915
	if (indirdep->ir_saveddata == NULL) {
10916
		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10917
		LOCK_OWNED(ump);
10918
		FREE_LOCK(ump);
10919
		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10920
		    M_SOFTDEP_FLAGS);
10921
		ACQUIRE_LOCK(ump);
10922
	}
10923
	indirdep->ir_state &= ~ATTACHED;
10924
	indirdep->ir_state |= UNDONE;
10925
	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10926
	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10927
	    bp->b_bcount);
10928
}
10929

10930
/*
10931
 * Called when an inode has been cleared in a cg bitmap.  This finally
10932
 * eliminates any canceled jaddrefs
10933
 */
10934
void
10935
softdep_setup_inofree(struct mount *mp,
10936
	struct buf *bp,
10937
	ino_t ino,
10938
	struct workhead *wkhd,
10939
	bool doingrecovery)
10940
{
10941
	struct worklist *wk, *wkn;
10942
	struct ufsmount *ump;
10943
#ifdef INVARIANTS
10944
	struct inodedep *inodedep;
10945
#endif
10946

10947
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10948
	    ("softdep_setup_inofree called on non-softdep filesystem"));
10949
	ump = VFSTOUFS(mp);
10950
	ACQUIRE_LOCK(ump);
10951
	KASSERT(doingrecovery || ffs_fsfail_cleanup(ump, 0) ||
10952
	    isclr(cg_inosused((struct cg *)bp->b_data),
10953
	    ino % ump->um_fs->fs_ipg),
10954
	    ("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino));
10955
	KASSERT(inodedep_lookup(mp, ino, 0, &inodedep) == 0,
10956
	    ("softdep_setup_inofree: ino %ju has existing inodedep %p",
10957
	    (uintmax_t)ino, inodedep));
10958
	if (wkhd) {
10959
		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10960
			if (wk->wk_type != D_JADDREF)
10961
				continue;
10962
			WORKLIST_REMOVE(wk);
10963
			/*
10964
			 * We can free immediately even if the jaddref
10965
			 * isn't attached in a background write as now
10966
			 * the bitmaps are reconciled.
10967
			 */
10968
			wk->wk_state |= COMPLETE | ATTACHED;
10969
			free_jaddref(WK_JADDREF(wk));
10970
		}
10971
		jwork_move(&bp->b_dep, wkhd);
10972
	}
10973
	FREE_LOCK(ump);
10974
}
10975

10976
/*
10977
 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10978
 * map.  Any dependencies waiting for the write to clear are added to the
10979
 * buf's list and any jnewblks that are being canceled are discarded
10980
 * immediately.
10981
 */
10982
void
10983
softdep_setup_blkfree(
10984
	struct mount *mp,
10985
	struct buf *bp,
10986
	ufs2_daddr_t blkno,
10987
	int frags,
10988
	struct workhead *wkhd,
10989
	bool doingrecovery)
10990
{
10991
	struct bmsafemap *bmsafemap;
10992
	struct jnewblk *jnewblk;
10993
	struct ufsmount *ump;
10994
	struct worklist *wk;
10995
	struct fs *fs;
10996
#ifdef INVARIANTS
10997
	uint8_t *blksfree;
10998
	struct cg *cgp;
10999
	ufs2_daddr_t jstart;
11000
	ufs2_daddr_t jend;
11001
	ufs2_daddr_t end;
11002
	long bno;
11003
	int i;
11004
#endif
11005

11006
	CTR3(KTR_SUJ,
11007
	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
11008
	    blkno, frags, wkhd);
11009

11010
	ump = VFSTOUFS(mp);
11011
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
11012
	    ("softdep_setup_blkfree called on non-softdep filesystem"));
11013
	ACQUIRE_LOCK(ump);
11014
	/* Lookup the bmsafemap so we track when it is dirty. */
11015
	fs = ump->um_fs;
11016
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11017
	/*
11018
	 * Detach any jnewblks which have been canceled.  They must linger
11019
	 * until the bitmap is cleared again by ffs_blkfree() to prevent
11020
	 * an unjournaled allocation from hitting the disk.
11021
	 */
11022
	if (wkhd) {
11023
		while ((wk = LIST_FIRST(wkhd)) != NULL) {
11024
			CTR2(KTR_SUJ,
11025
			    "softdep_setup_blkfree: blkno %jd wk type %d",
11026
			    blkno, wk->wk_type);
11027
			WORKLIST_REMOVE(wk);
11028
			if (wk->wk_type != D_JNEWBLK) {
11029
				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
11030
				continue;
11031
			}
11032
			jnewblk = WK_JNEWBLK(wk);
11033
			KASSERT(jnewblk->jn_state & GOINGAWAY,
11034
			    ("softdep_setup_blkfree: jnewblk not canceled."));
11035
#ifdef INVARIANTS
11036
			if (!doingrecovery && !ffs_fsfail_cleanup(ump, 0)) {
11037
				/*
11038
				 * Assert that this block is free in the
11039
				 * bitmap before we discard the jnewblk.
11040
				 */
11041
				cgp = (struct cg *)bp->b_data;
11042
				blksfree = cg_blksfree(cgp);
11043
				bno = dtogd(fs, jnewblk->jn_blkno);
11044
				for (i = jnewblk->jn_oldfrags;
11045
				    i < jnewblk->jn_frags; i++) {
11046
					if (isset(blksfree, bno + i))
11047
						continue;
11048
					panic("softdep_setup_blkfree: block "
11049
					    "%ju not freed.",
11050
					    (uintmax_t)jnewblk->jn_blkno);
11051
				}
11052
			}
11053
#endif
11054
			/*
11055
			 * Even if it's not attached we can free immediately
11056
			 * as the new bitmap is correct.
11057
			 */
11058
			wk->wk_state |= COMPLETE | ATTACHED;
11059
			free_jnewblk(jnewblk);
11060
		}
11061
	}
11062

11063
#ifdef INVARIANTS
11064
	/*
11065
	 * Assert that we are not freeing a block which has an outstanding
11066
	 * allocation dependency.
11067
	 */
11068
	fs = VFSTOUFS(mp)->um_fs;
11069
	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11070
	end = blkno + frags;
11071
	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11072
		/*
11073
		 * Don't match against blocks that will be freed when the
11074
		 * background write is done.
11075
		 */
11076
		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
11077
		    (COMPLETE | DEPCOMPLETE))
11078
			continue;
11079
		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
11080
		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
11081
		if ((blkno >= jstart && blkno < jend) ||
11082
		    (end > jstart && end <= jend)) {
11083
			printf("state 0x%X %jd - %d %d dep %p\n",
11084
			    jnewblk->jn_state, jnewblk->jn_blkno,
11085
			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
11086
			    jnewblk->jn_dep);
11087
			panic("softdep_setup_blkfree: "
11088
			    "%jd-%jd(%d) overlaps with %jd-%jd",
11089
			    blkno, end, frags, jstart, jend);
11090
		}
11091
	}
11092
#endif
11093
	FREE_LOCK(ump);
11094
}
11095

11096
/*
11097
 * Revert a block allocation when the journal record that describes it
11098
 * is not yet written.
11099
 */
11100
static int
11101
jnewblk_rollback(
11102
	struct jnewblk *jnewblk,
11103
	struct fs *fs,
11104
	struct cg *cgp,
11105
	uint8_t *blksfree)
11106
{
11107
	ufs1_daddr_t fragno;
11108
	long cgbno, bbase;
11109
	int frags, blk;
11110
	int i;
11111

11112
	frags = 0;
11113
	cgbno = dtogd(fs, jnewblk->jn_blkno);
11114
	/*
11115
	 * We have to test which frags need to be rolled back.  We may
11116
	 * be operating on a stale copy when doing background writes.
11117
	 */
11118
	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
11119
		if (isclr(blksfree, cgbno + i))
11120
			frags++;
11121
	if (frags == 0)
11122
		return (0);
11123
	/*
11124
	 * This is mostly ffs_blkfree() sans some validation and
11125
	 * superblock updates.
11126
	 */
11127
	if (frags == fs->fs_frag) {
11128
		fragno = fragstoblks(fs, cgbno);
11129
		ffs_setblock(fs, blksfree, fragno);
11130
		ffs_clusteracct(fs, cgp, fragno, 1);
11131
		cgp->cg_cs.cs_nbfree++;
11132
	} else {
11133
		cgbno += jnewblk->jn_oldfrags;
11134
		bbase = cgbno - fragnum(fs, cgbno);
11135
		/* Decrement the old frags.  */
11136
		blk = blkmap(fs, blksfree, bbase);
11137
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11138
		/* Deallocate the fragment */
11139
		for (i = 0; i < frags; i++)
11140
			setbit(blksfree, cgbno + i);
11141
		cgp->cg_cs.cs_nffree += frags;
11142
		/* Add back in counts associated with the new frags */
11143
		blk = blkmap(fs, blksfree, bbase);
11144
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11145
		/* If a complete block has been reassembled, account for it. */
11146
		fragno = fragstoblks(fs, bbase);
11147
		if (ffs_isblock(fs, blksfree, fragno)) {
11148
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
11149
			ffs_clusteracct(fs, cgp, fragno, 1);
11150
			cgp->cg_cs.cs_nbfree++;
11151
		}
11152
	}
11153
	stat_jnewblk++;
11154
	jnewblk->jn_state &= ~ATTACHED;
11155
	jnewblk->jn_state |= UNDONE;
11156

11157
	return (frags);
11158
}
11159

11160
static void
11161
initiate_write_bmsafemap(
11162
	struct bmsafemap *bmsafemap,
11163
	struct buf *bp)			/* The cg block. */
11164
{
11165
	struct jaddref *jaddref;
11166
	struct jnewblk *jnewblk;
11167
	uint8_t *inosused;
11168
	uint8_t *blksfree;
11169
	struct cg *cgp;
11170
	struct fs *fs;
11171
	ino_t ino;
11172

11173
	/*
11174
	 * If this is a background write, we did this at the time that
11175
	 * the copy was made, so do not need to do it again.
11176
	 */
11177
	if (bmsafemap->sm_state & IOSTARTED)
11178
		return;
11179
	bmsafemap->sm_state |= IOSTARTED;
11180
	/*
11181
	 * Clear any inode allocations which are pending journal writes.
11182
	 */
11183
	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
11184
		cgp = (struct cg *)bp->b_data;
11185
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11186
		inosused = cg_inosused(cgp);
11187
		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
11188
			ino = jaddref->ja_ino % fs->fs_ipg;
11189
			if (isset(inosused, ino)) {
11190
				if ((jaddref->ja_mode & IFMT) == IFDIR)
11191
					cgp->cg_cs.cs_ndir--;
11192
				cgp->cg_cs.cs_nifree++;
11193
				clrbit(inosused, ino);
11194
				jaddref->ja_state &= ~ATTACHED;
11195
				jaddref->ja_state |= UNDONE;
11196
				stat_jaddref++;
11197
			} else
11198
				panic("initiate_write_bmsafemap: inode %ju "
11199
				    "marked free", (uintmax_t)jaddref->ja_ino);
11200
		}
11201
	}
11202
	/*
11203
	 * Clear any block allocations which are pending journal writes.
11204
	 */
11205
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11206
		cgp = (struct cg *)bp->b_data;
11207
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11208
		blksfree = cg_blksfree(cgp);
11209
		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11210
			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
11211
				continue;
11212
			panic("initiate_write_bmsafemap: block %jd "
11213
			    "marked free", jnewblk->jn_blkno);
11214
		}
11215
	}
11216
	/*
11217
	 * Move allocation lists to the written lists so they can be
11218
	 * cleared once the block write is complete.
11219
	 */
11220
	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
11221
	    inodedep, id_deps);
11222
	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11223
	    newblk, nb_deps);
11224
	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
11225
	    wk_list);
11226
}
11227

11228
void
11229
softdep_handle_error(struct buf *bp)
11230
{
11231
	struct ufsmount *ump;
11232

11233
	ump = softdep_bp_to_mp(bp);
11234
	if (ump == NULL)
11235
		return;
11236

11237
	if (ffs_fsfail_cleanup(ump, bp->b_error)) {
11238
		/*
11239
		 * No future writes will succeed, so the on-disk image is safe.
11240
		 * Pretend that this write succeeded so that the softdep state
11241
		 * will be cleaned up naturally.
11242
		 */
11243
		bp->b_ioflags &= ~BIO_ERROR;
11244
		bp->b_error = 0;
11245
	}
11246
}
11247

11248
/*
11249
 * This routine is called during the completion interrupt
11250
 * service routine for a disk write (from the procedure called
11251
 * by the device driver to inform the filesystem caches of
11252
 * a request completion).  It should be called early in this
11253
 * procedure, before the block is made available to other
11254
 * processes or other routines are called.
11255
 *
11256
 */
11257
static void 
11258
softdep_disk_write_complete(
11259
	struct buf *bp)		/* describes the completed disk write */
11260
{
11261
	struct worklist *wk;
11262
	struct worklist *owk;
11263
	struct ufsmount *ump;
11264
	struct workhead reattach;
11265
	struct freeblks *freeblks;
11266
	struct buf *sbp;
11267

11268
	ump = softdep_bp_to_mp(bp);
11269
	KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL,
11270
	    ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL "
11271
	     "with outstanding dependencies for buffer %p", bp));
11272
	if (ump == NULL)
11273
		return;
11274
	if ((bp->b_ioflags & BIO_ERROR) != 0)
11275
		softdep_handle_error(bp);
11276
	/*
11277
	 * If an error occurred while doing the write, then the data
11278
	 * has not hit the disk and the dependencies cannot be processed.
11279
	 * But we do have to go through and roll forward any dependencies
11280
	 * that were rolled back before the disk write.
11281
	 */
11282
	sbp = NULL;
11283
	ACQUIRE_LOCK(ump);
11284
	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11285
		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11286
			switch (wk->wk_type) {
11287
			case D_PAGEDEP:
11288
				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11289
				continue;
11290

11291
			case D_INODEDEP:
11292
				handle_written_inodeblock(WK_INODEDEP(wk),
11293
				    bp, 0);
11294
				continue;
11295

11296
			case D_BMSAFEMAP:
11297
				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11298
				    bp, 0);
11299
				continue;
11300

11301
			case D_INDIRDEP:
11302
				handle_written_indirdep(WK_INDIRDEP(wk),
11303
				    bp, &sbp, 0);
11304
				continue;
11305
			default:
11306
				/* nothing to roll forward */
11307
				continue;
11308
			}
11309
		}
11310
		FREE_LOCK(ump);
11311
		if (sbp)
11312
			brelse(sbp);
11313
		return;
11314
	}
11315
	LIST_INIT(&reattach);
11316

11317
	/*
11318
	 * Ump SU lock must not be released anywhere in this code segment.
11319
	 */
11320
	owk = NULL;
11321
	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11322
		WORKLIST_REMOVE(wk);
11323
		atomic_add_long(&dep_write[wk->wk_type], 1);
11324
		if (wk == owk)
11325
			panic("duplicate worklist: %p\n", wk);
11326
		owk = wk;
11327
		switch (wk->wk_type) {
11328
		case D_PAGEDEP:
11329
			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11330
			    WRITESUCCEEDED))
11331
				WORKLIST_INSERT(&reattach, wk);
11332
			continue;
11333

11334
		case D_INODEDEP:
11335
			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11336
			    WRITESUCCEEDED))
11337
				WORKLIST_INSERT(&reattach, wk);
11338
			continue;
11339

11340
		case D_BMSAFEMAP:
11341
			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11342
			    WRITESUCCEEDED))
11343
				WORKLIST_INSERT(&reattach, wk);
11344
			continue;
11345

11346
		case D_MKDIR:
11347
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11348
			continue;
11349

11350
		case D_ALLOCDIRECT:
11351
			wk->wk_state |= COMPLETE;
11352
			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11353
			continue;
11354

11355
		case D_ALLOCINDIR:
11356
			wk->wk_state |= COMPLETE;
11357
			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11358
			continue;
11359

11360
		case D_INDIRDEP:
11361
			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11362
			    WRITESUCCEEDED))
11363
				WORKLIST_INSERT(&reattach, wk);
11364
			continue;
11365

11366
		case D_FREEBLKS:
11367
			wk->wk_state |= COMPLETE;
11368
			freeblks = WK_FREEBLKS(wk);
11369
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11370
			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11371
				add_to_worklist(wk, WK_NODELAY);
11372
			continue;
11373

11374
		case D_FREEWORK:
11375
			handle_written_freework(WK_FREEWORK(wk));
11376
			break;
11377

11378
		case D_JSEGDEP:
11379
			free_jsegdep(WK_JSEGDEP(wk));
11380
			continue;
11381

11382
		case D_JSEG:
11383
			handle_written_jseg(WK_JSEG(wk), bp);
11384
			continue;
11385

11386
		case D_SBDEP:
11387
			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11388
				WORKLIST_INSERT(&reattach, wk);
11389
			continue;
11390

11391
		case D_FREEDEP:
11392
			free_freedep(WK_FREEDEP(wk));
11393
			continue;
11394

11395
		default:
11396
			panic("handle_disk_write_complete: Unknown type %s",
11397
			    TYPENAME(wk->wk_type));
11398
			/* NOTREACHED */
11399
		}
11400
	}
11401
	/*
11402
	 * Reattach any requests that must be redone.
11403
	 */
11404
	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11405
		WORKLIST_REMOVE(wk);
11406
		WORKLIST_INSERT(&bp->b_dep, wk);
11407
	}
11408
	FREE_LOCK(ump);
11409
	if (sbp)
11410
		brelse(sbp);
11411
}
11412

11413
/*
11414
 * Called from within softdep_disk_write_complete above.
11415
 */
11416
static void 
11417
handle_allocdirect_partdone(
11418
	struct allocdirect *adp,	/* the completed allocdirect */
11419
	struct workhead *wkhd)		/* Work to do when inode is writtne. */
11420
{
11421
	struct allocdirectlst *listhead;
11422
	struct allocdirect *listadp;
11423
	struct inodedep *inodedep;
11424
	long bsize;
11425

11426
	LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
11427
	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11428
		return;
11429
	/*
11430
	 * The on-disk inode cannot claim to be any larger than the last
11431
	 * fragment that has been written. Otherwise, the on-disk inode
11432
	 * might have fragments that were not the last block in the file
11433
	 * which would corrupt the filesystem. Thus, we cannot free any
11434
	 * allocdirects after one whose ad_oldblkno claims a fragment as
11435
	 * these blocks must be rolled back to zero before writing the inode.
11436
	 * We check the currently active set of allocdirects in id_inoupdt
11437
	 * or id_extupdt as appropriate.
11438
	 */
11439
	inodedep = adp->ad_inodedep;
11440
	bsize = inodedep->id_fs->fs_bsize;
11441
	if (adp->ad_state & EXTDATA)
11442
		listhead = &inodedep->id_extupdt;
11443
	else
11444
		listhead = &inodedep->id_inoupdt;
11445
	TAILQ_FOREACH(listadp, listhead, ad_next) {
11446
		/* found our block */
11447
		if (listadp == adp)
11448
			break;
11449
		/* continue if ad_oldlbn is not a fragment */
11450
		if (listadp->ad_oldsize == 0 ||
11451
		    listadp->ad_oldsize == bsize)
11452
			continue;
11453
		/* hit a fragment */
11454
		return;
11455
	}
11456
	/*
11457
	 * If we have reached the end of the current list without
11458
	 * finding the just finished dependency, then it must be
11459
	 * on the future dependency list. Future dependencies cannot
11460
	 * be freed until they are moved to the current list.
11461
	 */
11462
	if (listadp == NULL) {
11463
#ifdef INVARIANTS
11464
		if (adp->ad_state & EXTDATA)
11465
			listhead = &inodedep->id_newextupdt;
11466
		else
11467
			listhead = &inodedep->id_newinoupdt;
11468
		TAILQ_FOREACH(listadp, listhead, ad_next)
11469
			/* found our block */
11470
			if (listadp == adp)
11471
				break;
11472
		if (listadp == NULL)
11473
			panic("handle_allocdirect_partdone: lost dep");
11474
#endif /* INVARIANTS */
11475
		return;
11476
	}
11477
	/*
11478
	 * If we have found the just finished dependency, then queue
11479
	 * it along with anything that follows it that is complete.
11480
	 * Since the pointer has not yet been written in the inode
11481
	 * as the dependency prevents it, place the allocdirect on the
11482
	 * bufwait list where it will be freed once the pointer is
11483
	 * valid.
11484
	 */
11485
	if (wkhd == NULL)
11486
		wkhd = &inodedep->id_bufwait;
11487
	for (; adp; adp = listadp) {
11488
		listadp = TAILQ_NEXT(adp, ad_next);
11489
		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11490
			return;
11491
		TAILQ_REMOVE(listhead, adp, ad_next);
11492
		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11493
	}
11494
}
11495

11496
/*
11497
 * Called from within softdep_disk_write_complete above.  This routine
11498
 * completes successfully written allocindirs.
11499
 */
11500
static void
11501
handle_allocindir_partdone(
11502
	struct allocindir *aip)		/* the completed allocindir */
11503
{
11504
	struct indirdep *indirdep;
11505

11506
	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11507
		return;
11508
	indirdep = aip->ai_indirdep;
11509
	LIST_REMOVE(aip, ai_next);
11510
	/*
11511
	 * Don't set a pointer while the buffer is undergoing IO or while
11512
	 * we have active truncations.
11513
	 */
11514
	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11515
		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11516
		return;
11517
	}
11518
	if (indirdep->ir_state & UFS1FMT)
11519
		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11520
		    aip->ai_newblkno;
11521
	else
11522
		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11523
		    aip->ai_newblkno;
11524
	/*
11525
	 * Await the pointer write before freeing the allocindir.
11526
	 */
11527
	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11528
}
11529

11530
/*
11531
 * Release segments held on a jwork list.
11532
 */
11533
static void
11534
handle_jwork(struct workhead *wkhd)
11535
{
11536
	struct worklist *wk;
11537

11538
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11539
		WORKLIST_REMOVE(wk);
11540
		switch (wk->wk_type) {
11541
		case D_JSEGDEP:
11542
			free_jsegdep(WK_JSEGDEP(wk));
11543
			continue;
11544
		case D_FREEDEP:
11545
			free_freedep(WK_FREEDEP(wk));
11546
			continue;
11547
		case D_FREEFRAG:
11548
			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11549
			WORKITEM_FREE(wk, D_FREEFRAG);
11550
			continue;
11551
		case D_FREEWORK:
11552
			handle_written_freework(WK_FREEWORK(wk));
11553
			continue;
11554
		default:
11555
			panic("handle_jwork: Unknown type %s\n",
11556
			    TYPENAME(wk->wk_type));
11557
		}
11558
	}
11559
}
11560

11561
/*
11562
 * Handle the bufwait list on an inode when it is safe to release items
11563
 * held there.  This normally happens after an inode block is written but
11564
 * may be delayed and handled later if there are pending journal items that
11565
 * are not yet safe to be released.
11566
 */
11567
static struct freefile *
11568
handle_bufwait(
11569
	struct inodedep *inodedep,
11570
	struct workhead *refhd)
11571
{
11572
	struct jaddref *jaddref;
11573
	struct freefile *freefile;
11574
	struct worklist *wk;
11575

11576
	freefile = NULL;
11577
	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11578
		WORKLIST_REMOVE(wk);
11579
		switch (wk->wk_type) {
11580
		case D_FREEFILE:
11581
			/*
11582
			 * We defer adding freefile to the worklist
11583
			 * until all other additions have been made to
11584
			 * ensure that it will be done after all the
11585
			 * old blocks have been freed.
11586
			 */
11587
			if (freefile != NULL)
11588
				panic("handle_bufwait: freefile");
11589
			freefile = WK_FREEFILE(wk);
11590
			continue;
11591

11592
		case D_MKDIR:
11593
			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11594
			continue;
11595

11596
		case D_DIRADD:
11597
			diradd_inode_written(WK_DIRADD(wk), inodedep);
11598
			continue;
11599

11600
		case D_FREEFRAG:
11601
			wk->wk_state |= COMPLETE;
11602
			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11603
				add_to_worklist(wk, 0);
11604
			continue;
11605

11606
		case D_DIRREM:
11607
			wk->wk_state |= COMPLETE;
11608
			add_to_worklist(wk, 0);
11609
			continue;
11610

11611
		case D_ALLOCDIRECT:
11612
		case D_ALLOCINDIR:
11613
			free_newblk(WK_NEWBLK(wk));
11614
			continue;
11615

11616
		case D_JNEWBLK:
11617
			wk->wk_state |= COMPLETE;
11618
			free_jnewblk(WK_JNEWBLK(wk));
11619
			continue;
11620

11621
		/*
11622
		 * Save freed journal segments and add references on
11623
		 * the supplied list which will delay their release
11624
		 * until the cg bitmap is cleared on disk.
11625
		 */
11626
		case D_JSEGDEP:
11627
			if (refhd == NULL)
11628
				free_jsegdep(WK_JSEGDEP(wk));
11629
			else
11630
				WORKLIST_INSERT(refhd, wk);
11631
			continue;
11632

11633
		case D_JADDREF:
11634
			jaddref = WK_JADDREF(wk);
11635
			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11636
			    if_deps);
11637
			/*
11638
			 * Transfer any jaddrefs to the list to be freed with
11639
			 * the bitmap if we're handling a removed file.
11640
			 */
11641
			if (refhd == NULL) {
11642
				wk->wk_state |= COMPLETE;
11643
				free_jaddref(jaddref);
11644
			} else
11645
				WORKLIST_INSERT(refhd, wk);
11646
			continue;
11647

11648
		default:
11649
			panic("handle_bufwait: Unknown type %p(%s)",
11650
			    wk, TYPENAME(wk->wk_type));
11651
			/* NOTREACHED */
11652
		}
11653
	}
11654
	return (freefile);
11655
}
11656
/*
11657
 * Called from within softdep_disk_write_complete above to restore
11658
 * in-memory inode block contents to their most up-to-date state. Note
11659
 * that this routine is always called from interrupt level with further
11660
 * interrupts from this device blocked.
11661
 *
11662
 * If the write did not succeed, we will do all the roll-forward
11663
 * operations, but we will not take the actions that will allow its
11664
 * dependencies to be processed.
11665
 */
11666
static int 
11667
handle_written_inodeblock(
11668
	struct inodedep *inodedep,
11669
	struct buf *bp,		/* buffer containing the inode block */
11670
	int flags)
11671
{
11672
	struct freefile *freefile;
11673
	struct allocdirect *adp, *nextadp;
11674
	struct ufs1_dinode *dp1 = NULL;
11675
	struct ufs2_dinode *dp2 = NULL;
11676
	struct workhead wkhd;
11677
	int hadchanges, fstype;
11678
	ino_t freelink;
11679

11680
	LIST_INIT(&wkhd);
11681
	hadchanges = 0;
11682
	freefile = NULL;
11683
	if ((inodedep->id_state & IOSTARTED) == 0)
11684
		panic("handle_written_inodeblock: not started");
11685
	inodedep->id_state &= ~IOSTARTED;
11686
	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11687
		fstype = UFS1;
11688
		dp1 = (struct ufs1_dinode *)bp->b_data +
11689
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11690
		freelink = dp1->di_freelink;
11691
	} else {
11692
		fstype = UFS2;
11693
		dp2 = (struct ufs2_dinode *)bp->b_data +
11694
		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11695
		freelink = dp2->di_freelink;
11696
	}
11697
	/*
11698
	 * Leave this inodeblock dirty until it's in the list.
11699
	 */
11700
	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11701
	    (flags & WRITESUCCEEDED)) {
11702
		struct inodedep *inon;
11703

11704
		inon = TAILQ_NEXT(inodedep, id_unlinked);
11705
		if ((inon == NULL && freelink == 0) ||
11706
		    (inon && inon->id_ino == freelink)) {
11707
			if (inon)
11708
				inon->id_state |= UNLINKPREV;
11709
			inodedep->id_state |= UNLINKNEXT;
11710
		}
11711
		hadchanges = 1;
11712
	}
11713
	/*
11714
	 * If we had to rollback the inode allocation because of
11715
	 * bitmaps being incomplete, then simply restore it.
11716
	 * Keep the block dirty so that it will not be reclaimed until
11717
	 * all associated dependencies have been cleared and the
11718
	 * corresponding updates written to disk.
11719
	 */
11720
	if (inodedep->id_savedino1 != NULL) {
11721
		hadchanges = 1;
11722
		if (fstype == UFS1)
11723
			*dp1 = *inodedep->id_savedino1;
11724
		else
11725
			*dp2 = *inodedep->id_savedino2;
11726
		free(inodedep->id_savedino1, M_SAVEDINO);
11727
		inodedep->id_savedino1 = NULL;
11728
		if ((bp->b_flags & B_DELWRI) == 0)
11729
			stat_inode_bitmap++;
11730
		bdirty(bp);
11731
		/*
11732
		 * If the inode is clear here and GOINGAWAY it will never
11733
		 * be written.  Process the bufwait and clear any pending
11734
		 * work which may include the freefile.
11735
		 */
11736
		if (inodedep->id_state & GOINGAWAY)
11737
			goto bufwait;
11738
		return (1);
11739
	}
11740
	if (flags & WRITESUCCEEDED)
11741
		inodedep->id_state |= COMPLETE;
11742
	/*
11743
	 * Roll forward anything that had to be rolled back before 
11744
	 * the inode could be updated.
11745
	 */
11746
	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11747
		nextadp = TAILQ_NEXT(adp, ad_next);
11748
		if (adp->ad_state & ATTACHED)
11749
			panic("handle_written_inodeblock: new entry");
11750
		if (fstype == UFS1) {
11751
			if (adp->ad_offset < UFS_NDADDR) {
11752
				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11753
					panic("%s %s #%jd mismatch %d != %jd",
11754
					    "handle_written_inodeblock:",
11755
					    "direct pointer",
11756
					    (intmax_t)adp->ad_offset,
11757
					    dp1->di_db[adp->ad_offset],
11758
					    (intmax_t)adp->ad_oldblkno);
11759
				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11760
			} else {
11761
				if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
11762
				    0)
11763
					panic("%s: %s #%jd allocated as %d",
11764
					    "handle_written_inodeblock",
11765
					    "indirect pointer",
11766
					    (intmax_t)adp->ad_offset -
11767
					    UFS_NDADDR,
11768
					    dp1->di_ib[adp->ad_offset -
11769
					    UFS_NDADDR]);
11770
				dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
11771
				    adp->ad_newblkno;
11772
			}
11773
		} else {
11774
			if (adp->ad_offset < UFS_NDADDR) {
11775
				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11776
					panic("%s: %s #%jd %s %jd != %jd",
11777
					    "handle_written_inodeblock",
11778
					    "direct pointer",
11779
					    (intmax_t)adp->ad_offset, "mismatch",
11780
					    (intmax_t)dp2->di_db[adp->ad_offset],
11781
					    (intmax_t)adp->ad_oldblkno);
11782
				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11783
			} else {
11784
				if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
11785
				    0)
11786
					panic("%s: %s #%jd allocated as %jd",
11787
					    "handle_written_inodeblock",
11788
					    "indirect pointer",
11789
					    (intmax_t)adp->ad_offset -
11790
					    UFS_NDADDR,
11791
					    (intmax_t)
11792
					    dp2->di_ib[adp->ad_offset -
11793
					    UFS_NDADDR]);
11794
				dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
11795
				    adp->ad_newblkno;
11796
			}
11797
		}
11798
		adp->ad_state &= ~UNDONE;
11799
		adp->ad_state |= ATTACHED;
11800
		hadchanges = 1;
11801
	}
11802
	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11803
		nextadp = TAILQ_NEXT(adp, ad_next);
11804
		if (adp->ad_state & ATTACHED)
11805
			panic("handle_written_inodeblock: new entry");
11806
		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11807
			panic("%s: direct pointers #%jd %s %jd != %jd",
11808
			    "handle_written_inodeblock",
11809
			    (intmax_t)adp->ad_offset, "mismatch",
11810
			    (intmax_t)dp2->di_extb[adp->ad_offset],
11811
			    (intmax_t)adp->ad_oldblkno);
11812
		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11813
		adp->ad_state &= ~UNDONE;
11814
		adp->ad_state |= ATTACHED;
11815
		hadchanges = 1;
11816
	}
11817
	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11818
		stat_direct_blk_ptrs++;
11819
	/*
11820
	 * Reset the file size to its most up-to-date value.
11821
	 */
11822
	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11823
		panic("handle_written_inodeblock: bad size");
11824
	if (inodedep->id_savednlink > UFS_LINK_MAX)
11825
		panic("handle_written_inodeblock: Invalid link count "
11826
		    "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
11827
		    inodedep);
11828
	if (fstype == UFS1) {
11829
		if (dp1->di_nlink != inodedep->id_savednlink) { 
11830
			dp1->di_nlink = inodedep->id_savednlink;
11831
			hadchanges = 1;
11832
		}
11833
		if (dp1->di_size != inodedep->id_savedsize) {
11834
			dp1->di_size = inodedep->id_savedsize;
11835
			hadchanges = 1;
11836
		}
11837
	} else {
11838
		if (dp2->di_nlink != inodedep->id_savednlink) { 
11839
			dp2->di_nlink = inodedep->id_savednlink;
11840
			hadchanges = 1;
11841
		}
11842
		if (dp2->di_size != inodedep->id_savedsize) {
11843
			dp2->di_size = inodedep->id_savedsize;
11844
			hadchanges = 1;
11845
		}
11846
		if (dp2->di_extsize != inodedep->id_savedextsize) {
11847
			dp2->di_extsize = inodedep->id_savedextsize;
11848
			hadchanges = 1;
11849
		}
11850
	}
11851
	inodedep->id_savedsize = -1;
11852
	inodedep->id_savedextsize = -1;
11853
	inodedep->id_savednlink = -1;
11854
	/*
11855
	 * If there were any rollbacks in the inode block, then it must be
11856
	 * marked dirty so that its will eventually get written back in
11857
	 * its correct form.
11858
	 */
11859
	if (hadchanges) {
11860
		if (fstype == UFS2)
11861
			ffs_update_dinode_ckhash(inodedep->id_fs, dp2);
11862
		bdirty(bp);
11863
	}
11864
bufwait:
11865
	/*
11866
	 * If the write did not succeed, we have done all the roll-forward
11867
	 * operations, but we cannot take the actions that will allow its
11868
	 * dependencies to be processed.
11869
	 */
11870
	if ((flags & WRITESUCCEEDED) == 0)
11871
		return (hadchanges);
11872
	/*
11873
	 * Process any allocdirects that completed during the update.
11874
	 */
11875
	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11876
		handle_allocdirect_partdone(adp, &wkhd);
11877
	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11878
		handle_allocdirect_partdone(adp, &wkhd);
11879
	/*
11880
	 * Process deallocations that were held pending until the
11881
	 * inode had been written to disk. Freeing of the inode
11882
	 * is delayed until after all blocks have been freed to
11883
	 * avoid creation of new <vfsid, inum, lbn> triples
11884
	 * before the old ones have been deleted.  Completely
11885
	 * unlinked inodes are not processed until the unlinked
11886
	 * inode list is written or the last reference is removed.
11887
	 */
11888
	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11889
		freefile = handle_bufwait(inodedep, NULL);
11890
		if (freefile && !LIST_EMPTY(&wkhd)) {
11891
			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11892
			freefile = NULL;
11893
		}
11894
	}
11895
	/*
11896
	 * Move rolled forward dependency completions to the bufwait list
11897
	 * now that those that were already written have been processed.
11898
	 */
11899
	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11900
		panic("handle_written_inodeblock: bufwait but no changes");
11901
	jwork_move(&inodedep->id_bufwait, &wkhd);
11902

11903
	if (freefile != NULL) {
11904
		/*
11905
		 * If the inode is goingaway it was never written.  Fake up
11906
		 * the state here so free_inodedep() can succeed.
11907
		 */
11908
		if (inodedep->id_state & GOINGAWAY)
11909
			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11910
		if (free_inodedep(inodedep) == 0)
11911
			panic("handle_written_inodeblock: live inodedep %p",
11912
			    inodedep);
11913
		add_to_worklist(&freefile->fx_list, 0);
11914
		return (0);
11915
	}
11916

11917
	/*
11918
	 * If no outstanding dependencies, free it.
11919
	 */
11920
	if (free_inodedep(inodedep) ||
11921
	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11922
	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11923
	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11924
	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11925
		return (0);
11926
	return (hadchanges);
11927
}
11928

11929
/*
11930
 * Perform needed roll-forwards and kick off any dependencies that
11931
 * can now be processed.
11932
 *
11933
 * If the write did not succeed, we will do all the roll-forward
11934
 * operations, but we will not take the actions that will allow its
11935
 * dependencies to be processed.
11936
 */
11937
static int
11938
handle_written_indirdep(
11939
	struct indirdep *indirdep,
11940
	struct buf *bp,
11941
	struct buf **bpp,
11942
	int flags)
11943
{
11944
	struct allocindir *aip;
11945
	struct buf *sbp;
11946
	int chgs;
11947

11948
	if (indirdep->ir_state & GOINGAWAY)
11949
		panic("handle_written_indirdep: indirdep gone");
11950
	if ((indirdep->ir_state & IOSTARTED) == 0)
11951
		panic("handle_written_indirdep: IO not started");
11952
	chgs = 0;
11953
	/*
11954
	 * If there were rollbacks revert them here.
11955
	 */
11956
	if (indirdep->ir_saveddata) {
11957
		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11958
		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11959
			free(indirdep->ir_saveddata, M_INDIRDEP);
11960
			indirdep->ir_saveddata = NULL;
11961
		}
11962
		chgs = 1;
11963
	}
11964
	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11965
	indirdep->ir_state |= ATTACHED;
11966
	/*
11967
	 * If the write did not succeed, we have done all the roll-forward
11968
	 * operations, but we cannot take the actions that will allow its
11969
	 * dependencies to be processed.
11970
	 */
11971
	if ((flags & WRITESUCCEEDED) == 0) {
11972
		stat_indir_blk_ptrs++;
11973
		bdirty(bp);
11974
		return (1);
11975
	}
11976
	/*
11977
	 * Move allocindirs with written pointers to the completehd if
11978
	 * the indirdep's pointer is not yet written.  Otherwise
11979
	 * free them here.
11980
	 */
11981
	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11982
		LIST_REMOVE(aip, ai_next);
11983
		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11984
			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11985
			    ai_next);
11986
			newblk_freefrag(&aip->ai_block);
11987
			continue;
11988
		}
11989
		free_newblk(&aip->ai_block);
11990
	}
11991
	/*
11992
	 * Move allocindirs that have finished dependency processing from
11993
	 * the done list to the write list after updating the pointers.
11994
	 */
11995
	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11996
		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11997
			handle_allocindir_partdone(aip);
11998
			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11999
				panic("disk_write_complete: not gone");
12000
			chgs = 1;
12001
		}
12002
	}
12003
	/*
12004
	 * Preserve the indirdep if there were any changes or if it is not
12005
	 * yet valid on disk.
12006
	 */
12007
	if (chgs) {
12008
		stat_indir_blk_ptrs++;
12009
		bdirty(bp);
12010
		return (1);
12011
	}
12012
	/*
12013
	 * If there were no changes we can discard the savedbp and detach
12014
	 * ourselves from the buf.  We are only carrying completed pointers
12015
	 * in this case.
12016
	 */
12017
	sbp = indirdep->ir_savebp;
12018
	sbp->b_flags |= B_INVAL | B_NOCACHE;
12019
	indirdep->ir_savebp = NULL;
12020
	indirdep->ir_bp = NULL;
12021
	if (*bpp != NULL)
12022
		panic("handle_written_indirdep: bp already exists.");
12023
	*bpp = sbp;
12024
	/*
12025
	 * The indirdep may not be freed until its parent points at it.
12026
	 */
12027
	if (indirdep->ir_state & DEPCOMPLETE)
12028
		free_indirdep(indirdep);
12029

12030
	return (0);
12031
}
12032

12033
/*
12034
 * Process a diradd entry after its dependent inode has been written.
12035
 */
12036
static void
12037
diradd_inode_written(
12038
	struct diradd *dap,
12039
	struct inodedep *inodedep)
12040
{
12041

12042
	LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
12043
	dap->da_state |= COMPLETE;
12044
	complete_diradd(dap);
12045
	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
12046
}
12047

12048
/*
12049
 * Returns true if the bmsafemap will have rollbacks when written.  Must only
12050
 * be called with the per-filesystem lock and the buf lock on the cg held.
12051
 */
12052
static int
12053
bmsafemap_backgroundwrite(
12054
	struct bmsafemap *bmsafemap,
12055
	struct buf *bp)
12056
{
12057
	int dirty;
12058

12059
	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
12060
	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 
12061
	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
12062
	/*
12063
	 * If we're initiating a background write we need to process the
12064
	 * rollbacks as they exist now, not as they exist when IO starts.
12065
	 * No other consumers will look at the contents of the shadowed
12066
	 * buf so this is safe to do here.
12067
	 */
12068
	if (bp->b_xflags & BX_BKGRDMARKER)
12069
		initiate_write_bmsafemap(bmsafemap, bp);
12070

12071
	return (dirty);
12072
}
12073

12074
/*
12075
 * Re-apply an allocation when a cg write is complete.
12076
 */
12077
static int
12078
jnewblk_rollforward(
12079
	struct jnewblk *jnewblk,
12080
	struct fs *fs,
12081
	struct cg *cgp,
12082
	uint8_t *blksfree)
12083
{
12084
	ufs1_daddr_t fragno;
12085
	ufs2_daddr_t blkno;
12086
	long cgbno, bbase;
12087
	int frags, blk;
12088
	int i;
12089

12090
	frags = 0;
12091
	cgbno = dtogd(fs, jnewblk->jn_blkno);
12092
	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
12093
		if (isclr(blksfree, cgbno + i))
12094
			panic("jnewblk_rollforward: re-allocated fragment");
12095
		frags++;
12096
	}
12097
	if (frags == fs->fs_frag) {
12098
		blkno = fragstoblks(fs, cgbno);
12099
		ffs_clrblock(fs, blksfree, (long)blkno);
12100
		ffs_clusteracct(fs, cgp, blkno, -1);
12101
		cgp->cg_cs.cs_nbfree--;
12102
	} else {
12103
		bbase = cgbno - fragnum(fs, cgbno);
12104
		cgbno += jnewblk->jn_oldfrags;
12105
                /* If a complete block had been reassembled, account for it. */
12106
		fragno = fragstoblks(fs, bbase);
12107
		if (ffs_isblock(fs, blksfree, fragno)) {
12108
			cgp->cg_cs.cs_nffree += fs->fs_frag;
12109
			ffs_clusteracct(fs, cgp, fragno, -1);
12110
			cgp->cg_cs.cs_nbfree--;
12111
		}
12112
		/* Decrement the old frags.  */
12113
		blk = blkmap(fs, blksfree, bbase);
12114
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
12115
		/* Allocate the fragment */
12116
		for (i = 0; i < frags; i++)
12117
			clrbit(blksfree, cgbno + i);
12118
		cgp->cg_cs.cs_nffree -= frags;
12119
		/* Add back in counts associated with the new frags */
12120
		blk = blkmap(fs, blksfree, bbase);
12121
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
12122
	}
12123
	return (frags);
12124
}
12125

12126
/*
12127
 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
12128
 * changes if it's not a background write.  Set all written dependencies 
12129
 * to DEPCOMPLETE and free the structure if possible.
12130
 *
12131
 * If the write did not succeed, we will do all the roll-forward
12132
 * operations, but we will not take the actions that will allow its
12133
 * dependencies to be processed.
12134
 */
12135
static int
12136
handle_written_bmsafemap(
12137
	struct bmsafemap *bmsafemap,
12138
	struct buf *bp,
12139
	int flags)
12140
{
12141
	struct newblk *newblk;
12142
	struct inodedep *inodedep;
12143
	struct jaddref *jaddref, *jatmp;
12144
	struct jnewblk *jnewblk, *jntmp;
12145
	struct ufsmount *ump;
12146
	uint8_t *inosused;
12147
	uint8_t *blksfree;
12148
	struct cg *cgp;
12149
	struct fs *fs;
12150
	ino_t ino;
12151
	int foreground;
12152
	int chgs;
12153

12154
	if ((bmsafemap->sm_state & IOSTARTED) == 0)
12155
		panic("handle_written_bmsafemap: Not started\n");
12156
	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
12157
	chgs = 0;
12158
	bmsafemap->sm_state &= ~IOSTARTED;
12159
	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
12160
	/*
12161
	 * If write was successful, release journal work that was waiting
12162
	 * on the write. Otherwise move the work back.
12163
	 */
12164
	if (flags & WRITESUCCEEDED)
12165
		handle_jwork(&bmsafemap->sm_freewr);
12166
	else
12167
		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12168
		    worklist, wk_list);
12169

12170
	/*
12171
	 * Restore unwritten inode allocation pending jaddref writes.
12172
	 */
12173
	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
12174
		cgp = (struct cg *)bp->b_data;
12175
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12176
		inosused = cg_inosused(cgp);
12177
		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
12178
		    ja_bmdeps, jatmp) {
12179
			if ((jaddref->ja_state & UNDONE) == 0)
12180
				continue;
12181
			ino = jaddref->ja_ino % fs->fs_ipg;
12182
			if (isset(inosused, ino))
12183
				panic("handle_written_bmsafemap: "
12184
				    "re-allocated inode");
12185
			/* Do the roll-forward only if it's a real copy. */
12186
			if (foreground) {
12187
				if ((jaddref->ja_mode & IFMT) == IFDIR)
12188
					cgp->cg_cs.cs_ndir++;
12189
				cgp->cg_cs.cs_nifree--;
12190
				setbit(inosused, ino);
12191
				chgs = 1;
12192
			}
12193
			jaddref->ja_state &= ~UNDONE;
12194
			jaddref->ja_state |= ATTACHED;
12195
			free_jaddref(jaddref);
12196
		}
12197
	}
12198
	/*
12199
	 * Restore any block allocations which are pending journal writes.
12200
	 */
12201
	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
12202
		cgp = (struct cg *)bp->b_data;
12203
		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12204
		blksfree = cg_blksfree(cgp);
12205
		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
12206
		    jntmp) {
12207
			if ((jnewblk->jn_state & UNDONE) == 0)
12208
				continue;
12209
			/* Do the roll-forward only if it's a real copy. */
12210
			if (foreground &&
12211
			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
12212
				chgs = 1;
12213
			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
12214
			jnewblk->jn_state |= ATTACHED;
12215
			free_jnewblk(jnewblk);
12216
		}
12217
	}
12218
	/*
12219
	 * If the write did not succeed, we have done all the roll-forward
12220
	 * operations, but we cannot take the actions that will allow its
12221
	 * dependencies to be processed.
12222
	 */
12223
	if ((flags & WRITESUCCEEDED) == 0) {
12224
		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
12225
		    newblk, nb_deps);
12226
		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12227
		    worklist, wk_list);
12228
		if (foreground)
12229
			bdirty(bp);
12230
		return (1);
12231
	}
12232
	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
12233
		newblk->nb_state |= DEPCOMPLETE;
12234
		newblk->nb_state &= ~ONDEPLIST;
12235
		newblk->nb_bmsafemap = NULL;
12236
		LIST_REMOVE(newblk, nb_deps);
12237
		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
12238
			handle_allocdirect_partdone(
12239
			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
12240
		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
12241
			handle_allocindir_partdone(
12242
			    WK_ALLOCINDIR(&newblk->nb_list));
12243
		else if (newblk->nb_list.wk_type != D_NEWBLK)
12244
			panic("handle_written_bmsafemap: Unexpected type: %s",
12245
			    TYPENAME(newblk->nb_list.wk_type));
12246
	}
12247
	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
12248
		inodedep->id_state |= DEPCOMPLETE;
12249
		inodedep->id_state &= ~ONDEPLIST;
12250
		LIST_REMOVE(inodedep, id_deps);
12251
		inodedep->id_bmsafemap = NULL;
12252
	}
12253
	LIST_REMOVE(bmsafemap, sm_next);
12254
	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
12255
	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
12256
	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
12257
	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
12258
	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
12259
		LIST_REMOVE(bmsafemap, sm_hash);
12260
		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
12261
		return (0);
12262
	}
12263
	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
12264
	if (foreground)
12265
		bdirty(bp);
12266
	return (1);
12267
}
12268

12269
/*
12270
 * Try to free a mkdir dependency.
12271
 */
12272
static void
12273
complete_mkdir(struct mkdir *mkdir)
12274
{
12275
	struct diradd *dap;
12276

12277
	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
12278
		return;
12279
	LIST_REMOVE(mkdir, md_mkdirs);
12280
	dap = mkdir->md_diradd;
12281
	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
12282
	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
12283
		dap->da_state |= DEPCOMPLETE;
12284
		complete_diradd(dap);
12285
	}
12286
	WORKITEM_FREE(mkdir, D_MKDIR);
12287
}
12288

12289
/*
12290
 * Handle the completion of a mkdir dependency.
12291
 */
12292
static void
12293
handle_written_mkdir(struct mkdir *mkdir, int type)
12294
{
12295

12296
	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12297
		panic("handle_written_mkdir: bad type");
12298
	mkdir->md_state |= COMPLETE;
12299
	complete_mkdir(mkdir);
12300
}
12301

12302
static int
12303
free_pagedep(struct pagedep *pagedep)
12304
{
12305
	int i;
12306

12307
	if (pagedep->pd_state & NEWBLOCK)
12308
		return (0);
12309
	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12310
		return (0);
12311
	for (i = 0; i < DAHASHSZ; i++)
12312
		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12313
			return (0);
12314
	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12315
		return (0);
12316
	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12317
		return (0);
12318
	if (pagedep->pd_state & ONWORKLIST)
12319
		WORKLIST_REMOVE(&pagedep->pd_list);
12320
	LIST_REMOVE(pagedep, pd_hash);
12321
	WORKITEM_FREE(pagedep, D_PAGEDEP);
12322

12323
	return (1);
12324
}
12325

12326
/*
12327
 * Called from within softdep_disk_write_complete above.
12328
 * A write operation was just completed. Removed inodes can
12329
 * now be freed and associated block pointers may be committed.
12330
 * Note that this routine is always called from interrupt level
12331
 * with further interrupts from this device blocked.
12332
 *
12333
 * If the write did not succeed, we will do all the roll-forward
12334
 * operations, but we will not take the actions that will allow its
12335
 * dependencies to be processed.
12336
 */
12337
static int 
12338
handle_written_filepage(
12339
	struct pagedep *pagedep,
12340
	struct buf *bp,		/* buffer containing the written page */
12341
	int flags)
12342
{
12343
	struct dirrem *dirrem;
12344
	struct diradd *dap, *nextdap;
12345
	struct direct *ep;
12346
	int i, chgs;
12347

12348
	if ((pagedep->pd_state & IOSTARTED) == 0)
12349
		panic("handle_written_filepage: not started");
12350
	pagedep->pd_state &= ~IOSTARTED;
12351
	if ((flags & WRITESUCCEEDED) == 0)
12352
		goto rollforward;
12353
	/*
12354
	 * Process any directory removals that have been committed.
12355
	 */
12356
	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12357
		LIST_REMOVE(dirrem, dm_next);
12358
		dirrem->dm_state |= COMPLETE;
12359
		dirrem->dm_dirinum = pagedep->pd_ino;
12360
		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12361
		    ("handle_written_filepage: Journal entries not written."));
12362
		add_to_worklist(&dirrem->dm_list, 0);
12363
	}
12364
	/*
12365
	 * Free any directory additions that have been committed.
12366
	 * If it is a newly allocated block, we have to wait until
12367
	 * the on-disk directory inode claims the new block.
12368
	 */
12369
	if ((pagedep->pd_state & NEWBLOCK) == 0)
12370
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12371
			free_diradd(dap, NULL);
12372
rollforward:
12373
	/*
12374
	 * Uncommitted directory entries must be restored.
12375
	 */
12376
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12377
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12378
		     dap = nextdap) {
12379
			nextdap = LIST_NEXT(dap, da_pdlist);
12380
			if (dap->da_state & ATTACHED)
12381
				panic("handle_written_filepage: attached");
12382
			ep = (struct direct *)
12383
			    ((char *)bp->b_data + dap->da_offset);
12384
			ep->d_ino = dap->da_newinum;
12385
			dap->da_state &= ~UNDONE;
12386
			dap->da_state |= ATTACHED;
12387
			chgs = 1;
12388
			/*
12389
			 * If the inode referenced by the directory has
12390
			 * been written out, then the dependency can be
12391
			 * moved to the pending list.
12392
			 */
12393
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12394
				LIST_REMOVE(dap, da_pdlist);
12395
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12396
				    da_pdlist);
12397
			}
12398
		}
12399
	}
12400
	/*
12401
	 * If there were any rollbacks in the directory, then it must be
12402
	 * marked dirty so that its will eventually get written back in
12403
	 * its correct form.
12404
	 */
12405
	if (chgs || (flags & WRITESUCCEEDED) == 0) {
12406
		if ((bp->b_flags & B_DELWRI) == 0)
12407
			stat_dir_entry++;
12408
		bdirty(bp);
12409
		return (1);
12410
	}
12411
	/*
12412
	 * If we are not waiting for a new directory block to be
12413
	 * claimed by its inode, then the pagedep will be freed.
12414
	 * Otherwise it will remain to track any new entries on
12415
	 * the page in case they are fsync'ed.
12416
	 */
12417
	free_pagedep(pagedep);
12418
	return (0);
12419
}
12420

12421
/*
 * Writing back in-core inode structures.
 *
 * The filesystem only accesses an inode's contents when it occupies an
 * "in-core" inode structure.  These "in-core" structures are separate from
 * the page frames used to cache inode blocks.  Only the latter are
 * transferred to/from the disk.  So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred.  The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */
12432

12433
/*
12434
 * Called when an inode is loaded from disk. If the effective link count
12435
 * differed from the actual link count when it was last flushed, then we
12436
 * need to ensure that the correct effective link count is put back.
12437
 */
12438
void 
12439
softdep_load_inodeblock(
12440
	struct inode *ip)	/* the "in_core" copy of the inode */
12441
{
12442
	struct inodedep *inodedep;
12443
	struct ufsmount *ump;
12444

12445
	ump = ITOUMP(ip);
12446
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12447
	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12448
	/*
12449
	 * Check for alternate nlink count.
12450
	 */
12451
	ip->i_effnlink = ip->i_nlink;
12452
	ACQUIRE_LOCK(ump);
12453
	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12454
		FREE_LOCK(ump);
12455
		return;
12456
	}
12457
	if (ip->i_nlink != inodedep->id_nlinkwrote &&
12458
	    inodedep->id_nlinkwrote != -1) {
12459
		KASSERT(ip->i_nlink == 0 &&
12460
		    (ump->um_flags & UM_FSFAIL_CLEANUP) != 0,
12461
		    ("read bad i_nlink value"));
12462
		ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote;
12463
	}
12464
	ip->i_effnlink -= inodedep->id_nlinkdelta;
12465
	KASSERT(ip->i_effnlink >= 0,
12466
	    ("softdep_load_inodeblock: negative i_effnlink"));
12467
	FREE_LOCK(ump);
12468
}
12469

12470
/*
12471
 * This routine is called just before the "in-core" inode
12472
 * information is to be copied to the in-memory inode block.
12473
 * Recall that an inode block contains several inodes. If
12474
 * the force flag is set, then the dependencies will be
12475
 * cleared so that the update can always be made. Note that
12476
 * the buffer is locked when this routine is called, so we
12477
 * will never be in the middle of writing the inode block 
12478
 * to disk.
12479
 */
12480
void 
12481
softdep_update_inodeblock(
12482
	struct inode *ip,	/* the "in_core" copy of the inode */
12483
	struct buf *bp,		/* the buffer containing the inode block */
12484
	int waitfor)		/* nonzero => update must be allowed */
12485
{
12486
	struct inodedep *inodedep;
12487
	struct inoref *inoref;
12488
	struct ufsmount *ump;
12489
	struct worklist *wk;
12490
	struct mount *mp;
12491
	struct buf *ibp;
12492
	struct fs *fs;
12493
	int error;
12494

12495
	ump = ITOUMP(ip);
12496
	mp = UFSTOVFS(ump);
12497
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12498
	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12499
	fs = ump->um_fs;
12500
	/*
12501
	 * If the effective link count is not equal to the actual link
12502
	 * count, then we must track the difference in an inodedep while
12503
	 * the inode is (potentially) tossed out of the cache. Otherwise,
12504
	 * if there is no existing inodedep, then there are no dependencies
12505
	 * to track.
12506
	 */
12507
	ACQUIRE_LOCK(ump);
12508
again:
12509
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12510
		FREE_LOCK(ump);
12511
		if (ip->i_effnlink != ip->i_nlink)
12512
			panic("softdep_update_inodeblock: bad link count");
12513
		return;
12514
	}
12515
	/*
12516
	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12517
	 * does not have access to the in-core ip so must write directly into
12518
	 * the inode block buffer when setting freelink.
12519
	 */
12520
	if ((inodedep->id_state & UNLINKED) != 0) {
12521
		if (fs->fs_magic == FS_UFS1_MAGIC)
12522
			DIP_SET(ip, i_freelink,
12523
			    ((struct ufs1_dinode *)bp->b_data +
12524
			    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12525
		else
12526
			DIP_SET(ip, i_freelink,
12527
			    ((struct ufs2_dinode *)bp->b_data +
12528
			    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12529
	}
12530
	KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
12531
	    ("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
12532
	    "inodedep %p id_nlinkdelta %jd",
12533
	    ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta));
12534
	inodedep->id_nlinkwrote = ip->i_nlink;
12535
	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12536
		panic("softdep_update_inodeblock: bad delta");
12537
	/*
12538
	 * If we're flushing all dependencies we must also move any waiting
12539
	 * for journal writes onto the bufwait list prior to I/O.
12540
	 */
12541
	if (waitfor) {
12542
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12543
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12544
			    == DEPCOMPLETE) {
12545
				jwait(&inoref->if_list, MNT_WAIT);
12546
				goto again;
12547
			}
12548
		}
12549
	}
12550
	/*
12551
	 * Changes have been initiated. Anything depending on these
12552
	 * changes cannot occur until this inode has been written.
12553
	 */
12554
	inodedep->id_state &= ~COMPLETE;
12555
	if ((inodedep->id_state & ONWORKLIST) == 0)
12556
		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12557
	/*
12558
	 * Any new dependencies associated with the incore inode must 
12559
	 * now be moved to the list associated with the buffer holding
12560
	 * the in-memory copy of the inode. Once merged process any
12561
	 * allocdirects that are completed by the merger.
12562
	 */
12563
	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12564
	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12565
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12566
		    NULL);
12567
	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12568
	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12569
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12570
		    NULL);
12571
	/*
12572
	 * Now that the inode has been pushed into the buffer, the
12573
	 * operations dependent on the inode being written to disk
12574
	 * can be moved to the id_bufwait so that they will be
12575
	 * processed when the buffer I/O completes.
12576
	 */
12577
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12578
		WORKLIST_REMOVE(wk);
12579
		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12580
	}
12581
	/*
12582
	 * Newly allocated inodes cannot be written until the bitmap
12583
	 * that allocates them have been written (indicated by
12584
	 * DEPCOMPLETE being set in id_state). If we are doing a
12585
	 * forced sync (e.g., an fsync on a file), we force the bitmap
12586
	 * to be written so that the update can be done.
12587
	 */
12588
	if (waitfor == 0) {
12589
		FREE_LOCK(ump);
12590
		return;
12591
	}
12592
retry:
12593
	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12594
		FREE_LOCK(ump);
12595
		return;
12596
	}
12597
	ibp = inodedep->id_bmsafemap->sm_buf;
12598
	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12599
	if (ibp == NULL) {
12600
		/*
12601
		 * If ibp came back as NULL, the dependency could have been
12602
		 * freed while we slept.  Look it up again, and check to see
12603
		 * that it has completed.
12604
		 */
12605
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12606
			goto retry;
12607
		FREE_LOCK(ump);
12608
		return;
12609
	}
12610
	FREE_LOCK(ump);
12611
	if ((error = bwrite(ibp)) != 0)
12612
		softdep_error("softdep_update_inodeblock: bwrite", error);
12613
}
12614

12615
/*
12616
 * Merge the a new inode dependency list (such as id_newinoupdt) into an
12617
 * old inode dependency list (such as id_inoupdt).
12618
 */
12619
static void
12620
merge_inode_lists(
12621
	struct allocdirectlst *newlisthead,
12622
	struct allocdirectlst *oldlisthead)
12623
{
12624
	struct allocdirect *listadp, *newadp;
12625

12626
	newadp = TAILQ_FIRST(newlisthead);
12627
	if (newadp != NULL)
12628
		LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp));
12629
	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12630
		if (listadp->ad_offset < newadp->ad_offset) {
12631
			listadp = TAILQ_NEXT(listadp, ad_next);
12632
			continue;
12633
		}
12634
		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12635
		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12636
		if (listadp->ad_offset == newadp->ad_offset) {
12637
			allocdirect_merge(oldlisthead, newadp,
12638
			    listadp);
12639
			listadp = newadp;
12640
		}
12641
		newadp = TAILQ_FIRST(newlisthead);
12642
	}
12643
	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12644
		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12645
		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12646
	}
12647
}
12648

12649
/*
12650
 * If we are doing an fsync, then we must ensure that any directory
12651
 * entries for the inode have been written after the inode gets to disk.
12652
 */
12653
int
12654
softdep_fsync(
12655
	struct vnode *vp)	/* the "in_core" copy of the inode */
12656
{
12657
	struct inodedep *inodedep;
12658
	struct pagedep *pagedep;
12659
	struct inoref *inoref;
12660
	struct ufsmount *ump;
12661
	struct worklist *wk;
12662
	struct diradd *dap;
12663
	struct mount *mp;
12664
	struct vnode *pvp;
12665
	struct inode *ip;
12666
	struct buf *bp;
12667
	struct fs *fs;
12668
	struct thread *td = curthread;
12669
	int error, flushparent, pagedep_new_block;
12670
	ino_t parentino;
12671
	ufs_lbn_t lbn;
12672

12673
	ip = VTOI(vp);
12674
	mp = vp->v_mount;
12675
	ump = VFSTOUFS(mp);
12676
	fs = ump->um_fs;
12677
	if (MOUNTEDSOFTDEP(mp) == 0)
12678
		return (0);
12679
	ACQUIRE_LOCK(ump);
12680
restart:
12681
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12682
		FREE_LOCK(ump);
12683
		return (0);
12684
	}
12685
	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12686
		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12687
		    == DEPCOMPLETE) {
12688
			jwait(&inoref->if_list, MNT_WAIT);
12689
			goto restart;
12690
		}
12691
	}
12692
	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12693
	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12694
	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12695
	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12696
	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12697
		panic("softdep_fsync: pending ops %p", inodedep);
12698
	for (error = 0, flushparent = 0; ; ) {
12699
		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12700
			break;
12701
		if (wk->wk_type != D_DIRADD)
12702
			panic("softdep_fsync: Unexpected type %s",
12703
			    TYPENAME(wk->wk_type));
12704
		dap = WK_DIRADD(wk);
12705
		/*
12706
		 * Flush our parent if this directory entry has a MKDIR_PARENT
12707
		 * dependency or is contained in a newly allocated block.
12708
		 */
12709
		if (dap->da_state & DIRCHG)
12710
			pagedep = dap->da_previous->dm_pagedep;
12711
		else
12712
			pagedep = dap->da_pagedep;
12713
		parentino = pagedep->pd_ino;
12714
		lbn = pagedep->pd_lbn;
12715
		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12716
			panic("softdep_fsync: dirty");
12717
		if ((dap->da_state & MKDIR_PARENT) ||
12718
		    (pagedep->pd_state & NEWBLOCK))
12719
			flushparent = 1;
12720
		else
12721
			flushparent = 0;
12722
		/*
12723
		 * If we are being fsync'ed as part of vgone'ing this vnode,
12724
		 * then we will not be able to release and recover the
12725
		 * vnode below, so we just have to give up on writing its
12726
		 * directory entry out. It will eventually be written, just
12727
		 * not now, but then the user was not asking to have it
12728
		 * written, so we are not breaking any promises.
12729
		 */
12730
		if (VN_IS_DOOMED(vp))
12731
			break;
12732
		/*
12733
		 * We prevent deadlock by always fetching inodes from the
12734
		 * root, moving down the directory tree. Thus, when fetching
12735
		 * our parent directory, we first try to get the lock. If
12736
		 * that fails, we must unlock ourselves before requesting
12737
		 * the lock on our parent. See the comment in ufs_lookup
12738
		 * for details on possible races.
12739
		 */
12740
		FREE_LOCK(ump);
12741
		error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL,
12742
		    &pvp);
12743
		if (error == ERELOOKUP)
12744
			error = 0;
12745
		if (error != 0)
12746
			return (error);
12747
		/*
12748
		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12749
		 * that are contained in direct blocks will be resolved by 
12750
		 * doing a ffs_update. Pagedeps contained in indirect blocks
12751
		 * may require a complete sync'ing of the directory. So, we
12752
		 * try the cheap and fast ffs_update first, and if that fails,
12753
		 * then we do the slower ffs_syncvnode of the directory.
12754
		 */
12755
		if (flushparent) {
12756
			int locked;
12757

12758
			if ((error = ffs_update(pvp, 1)) != 0) {
12759
				vput(pvp);
12760
				return (error);
12761
			}
12762
			ACQUIRE_LOCK(ump);
12763
			locked = 1;
12764
			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12765
				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12766
					if (wk->wk_type != D_DIRADD)
12767
						panic("softdep_fsync: Unexpected type %s",
12768
						      TYPENAME(wk->wk_type));
12769
					dap = WK_DIRADD(wk);
12770
					if (dap->da_state & DIRCHG)
12771
						pagedep = dap->da_previous->dm_pagedep;
12772
					else
12773
						pagedep = dap->da_pagedep;
12774
					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12775
					FREE_LOCK(ump);
12776
					locked = 0;
12777
					if (pagedep_new_block) {
12778
						VOP_UNLOCK(vp);
12779
						error = ffs_syncvnode(pvp,
12780
						    MNT_WAIT, 0);
12781
						if (error == 0)
12782
							error = ERELOOKUP;
12783
						vput(pvp);
12784
						vn_lock(vp, LK_EXCLUSIVE |
12785
						    LK_RETRY);
12786
						return (error);
12787
					}
12788
				}
12789
			}
12790
			if (locked)
12791
				FREE_LOCK(ump);
12792
		}
12793
		/*
12794
		 * Flush directory page containing the inode's name.
12795
		 */
12796
		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12797
		    &bp);
12798
		if (error == 0)
12799
			error = bwrite(bp);
12800
		else
12801
			brelse(bp);
12802
		vput(pvp);
12803
		if (!ffs_fsfail_cleanup(ump, error))
12804
			return (error);
12805
		ACQUIRE_LOCK(ump);
12806
		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12807
			break;
12808
	}
12809
	FREE_LOCK(ump);
12810
	return (0);
12811
}
12812

12813
/*
12814
 * Flush all the dirty bitmaps associated with the block device
12815
 * before flushing the rest of the dirty blocks so as to reduce
12816
 * the number of dependencies that will have to be rolled back.
12817
 *
12818
 * XXX Unused?
12819
 */
12820
void
12821
softdep_fsync_mountdev(struct vnode *vp)
12822
{
12823
	struct buf *bp, *nbp;
12824
	struct worklist *wk;
12825
	struct bufobj *bo;
12826

12827
	if (!vn_isdisk(vp))
12828
		panic("softdep_fsync_mountdev: vnode not a disk");
12829
	bo = &vp->v_bufobj;
12830
restart:
12831
	BO_LOCK(bo);
12832
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12833
		/* 
12834
		 * If it is already scheduled, skip to the next buffer.
12835
		 */
12836
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12837
			continue;
12838

12839
		if ((bp->b_flags & B_DELWRI) == 0)
12840
			panic("softdep_fsync_mountdev: not dirty");
12841
		/*
12842
		 * We are only interested in bitmaps with outstanding
12843
		 * dependencies.
12844
		 */
12845
		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12846
		    wk->wk_type != D_BMSAFEMAP ||
12847
		    (bp->b_vflags & BV_BKGRDINPROG)) {
12848
			BUF_UNLOCK(bp);
12849
			continue;
12850
		}
12851
		BO_UNLOCK(bo);
12852
		bremfree(bp);
12853
		(void) bawrite(bp);
12854
		goto restart;
12855
	}
12856
	drain_output(vp);
12857
	BO_UNLOCK(bo);
12858
}
12859

12860
/*
12861
 * Sync all cylinder groups that were dirty at the time this function is
12862
 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12863
 * is used to flush freedep activity that may be holding up writes to a
12864
 * indirect block.
12865
 */
12866
static int
12867
sync_cgs(struct mount *mp, int waitfor)
12868
{
12869
	struct bmsafemap *bmsafemap;
12870
	struct bmsafemap *sentinel;
12871
	struct ufsmount *ump;
12872
	struct buf *bp;
12873
	int error;
12874

12875
	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12876
	sentinel->sm_cg = -1;
12877
	ump = VFSTOUFS(mp);
12878
	error = 0;
12879
	ACQUIRE_LOCK(ump);
12880
	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12881
	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12882
	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12883
		/* Skip sentinels and cgs with no work to release. */
12884
		if (bmsafemap->sm_cg == -1 ||
12885
		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12886
		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12887
			LIST_REMOVE(sentinel, sm_next);
12888
			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12889
			continue;
12890
		}
12891
		/*
12892
		 * If we don't get the lock and we're waiting try again, if
12893
		 * not move on to the next buf and try to sync it.
12894
		 */
12895
		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12896
		if (bp == NULL && waitfor == MNT_WAIT)
12897
			continue;
12898
		LIST_REMOVE(sentinel, sm_next);
12899
		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12900
		if (bp == NULL)
12901
			continue;
12902
		FREE_LOCK(ump);
12903
		if (waitfor == MNT_NOWAIT)
12904
			bawrite(bp);
12905
		else
12906
			error = bwrite(bp);
12907
		ACQUIRE_LOCK(ump);
12908
		if (error)
12909
			break;
12910
	}
12911
	LIST_REMOVE(sentinel, sm_next);
12912
	FREE_LOCK(ump);
12913
	free(sentinel, M_BMSAFEMAP);
12914
	return (error);
12915
}
12916

12917
/*
12918
 * This routine is called when we are trying to synchronously flush a
12919
 * file. This routine must eliminate any filesystem metadata dependencies
12920
 * so that the syncing routine can succeed.
12921
 */
12922
int
12923
softdep_sync_metadata(struct vnode *vp)
12924
{
12925
	struct inode *ip;
12926
	int error;
12927

12928
	ip = VTOI(vp);
12929
	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12930
	    ("softdep_sync_metadata called on non-softdep filesystem"));
12931
	/*
12932
	 * Ensure that any direct block dependencies have been cleared,
12933
	 * truncations are started, and inode references are journaled.
12934
	 */
12935
	ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
12936
	/*
12937
	 * Write all journal records to prevent rollbacks on devvp.
12938
	 */
12939
	if (vp->v_type == VCHR)
12940
		softdep_flushjournal(vp->v_mount);
12941
	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12942
	/*
12943
	 * Ensure that all truncates are written so we won't find deps on
12944
	 * indirect blocks.
12945
	 */
12946
	process_truncates(vp);
12947
	FREE_LOCK(VFSTOUFS(vp->v_mount));
12948

12949
	return (error);
12950
}
12951

12952
/*
12953
 * This routine is called when we are attempting to sync a buf with
12954
 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12955
 * other IO it can but returns EBUSY if the buffer is not yet able to
12956
 * be written.  Dependencies which will not cause rollbacks will always
12957
 * return 0.
12958
 */
12959
int
12960
softdep_sync_buf(struct vnode *vp,
12961
	struct buf *bp,
12962
	int waitfor)
12963
{
12964
	struct indirdep *indirdep;
12965
	struct pagedep *pagedep;
12966
	struct allocindir *aip;
12967
	struct newblk *newblk;
12968
	struct ufsmount *ump;
12969
	struct buf *nbp;
12970
	struct worklist *wk;
12971
	int i, error;
12972

12973
	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12974
	    ("softdep_sync_buf called on non-softdep filesystem"));
12975
	/*
12976
	 * For VCHR we just don't want to force flush any dependencies that
12977
	 * will cause rollbacks.
12978
	 */
12979
	if (vp->v_type == VCHR) {
12980
		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12981
			return (EBUSY);
12982
		return (0);
12983
	}
12984
	ump = VFSTOUFS(vp->v_mount);
12985
	ACQUIRE_LOCK(ump);
12986
	/*
12987
	 * As we hold the buffer locked, none of its dependencies
12988
	 * will disappear.
12989
	 */
12990
	error = 0;
12991
top:
12992
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12993
		switch (wk->wk_type) {
12994
		case D_ALLOCDIRECT:
12995
		case D_ALLOCINDIR:
12996
			newblk = WK_NEWBLK(wk);
12997
			if (newblk->nb_jnewblk != NULL) {
12998
				if (waitfor == MNT_NOWAIT) {
12999
					error = EBUSY;
13000
					goto out_unlock;
13001
				}
13002
				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
13003
				goto top;
13004
			}
13005
			if (newblk->nb_state & DEPCOMPLETE ||
13006
			    waitfor == MNT_NOWAIT)
13007
				continue;
13008
			nbp = newblk->nb_bmsafemap->sm_buf;
13009
			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13010
			if (nbp == NULL)
13011
				goto top;
13012
			FREE_LOCK(ump);
13013
			if ((error = bwrite(nbp)) != 0)
13014
				goto out;
13015
			ACQUIRE_LOCK(ump);
13016
			continue;
13017

13018
		case D_INDIRDEP:
13019
			indirdep = WK_INDIRDEP(wk);
13020
			if (waitfor == MNT_NOWAIT) {
13021
				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
13022
				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
13023
					error = EBUSY;
13024
					goto out_unlock;
13025
				}
13026
			}
13027
			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
13028
				panic("softdep_sync_buf: truncation pending.");
13029
		restart:
13030
			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13031
				newblk = (struct newblk *)aip;
13032
				if (newblk->nb_jnewblk != NULL) {
13033
					jwait(&newblk->nb_jnewblk->jn_list,
13034
					    waitfor);
13035
					goto restart;
13036
				}
13037
				if (newblk->nb_state & DEPCOMPLETE)
13038
					continue;
13039
				nbp = newblk->nb_bmsafemap->sm_buf;
13040
				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13041
				if (nbp == NULL)
13042
					goto restart;
13043
				FREE_LOCK(ump);
13044
				if ((error = bwrite(nbp)) != 0)
13045
					goto out;
13046
				ACQUIRE_LOCK(ump);
13047
				goto restart;
13048
			}
13049
			continue;
13050

13051
		case D_PAGEDEP:
13052
			/*
13053
			 * Only flush directory entries in synchronous passes.
13054
			 */
13055
			if (waitfor != MNT_WAIT) {
13056
				error = EBUSY;
13057
				goto out_unlock;
13058
			}
13059
			/*
13060
			 * While syncing snapshots, we must allow recursive
13061
			 * lookups.
13062
			 */
13063
			BUF_AREC(bp);
13064
			/*
13065
			 * We are trying to sync a directory that may
13066
			 * have dependencies on both its own metadata
13067
			 * and/or dependencies on the inodes of any
13068
			 * recently allocated files. We walk its diradd
13069
			 * lists pushing out the associated inode.
13070
			 */
13071
			pagedep = WK_PAGEDEP(wk);
13072
			for (i = 0; i < DAHASHSZ; i++) {
13073
				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
13074
					continue;
13075
				error = flush_pagedep_deps(vp, wk->wk_mp,
13076
				    &pagedep->pd_diraddhd[i], bp);
13077
				if (error != 0) {
13078
					if (error != ERELOOKUP)
13079
						BUF_NOREC(bp);
13080
					goto out_unlock;
13081
				}
13082
			}
13083
			BUF_NOREC(bp);
13084
			continue;
13085

13086
		case D_FREEWORK:
13087
		case D_FREEDEP:
13088
		case D_JSEGDEP:
13089
		case D_JNEWBLK:
13090
			continue;
13091

13092
		default:
13093
			panic("softdep_sync_buf: Unknown type %s",
13094
			    TYPENAME(wk->wk_type));
13095
			/* NOTREACHED */
13096
		}
13097
	}
13098
out_unlock:
13099
	FREE_LOCK(ump);
13100
out:
13101
	return (error);
13102
}
13103

13104
/*
13105
 * Flush the dependencies associated with an inodedep.
13106
 */
13107
static int
13108
flush_inodedep_deps(
13109
	struct vnode *vp,
13110
	struct mount *mp,
13111
	ino_t ino)
13112
{
13113
	struct inodedep *inodedep;
13114
	struct inoref *inoref;
13115
	struct ufsmount *ump;
13116
	int error, waitfor;
13117

13118
	/*
13119
	 * This work is done in two passes. The first pass grabs most
13120
	 * of the buffers and begins asynchronously writing them. The
13121
	 * only way to wait for these asynchronous writes is to sleep
13122
	 * on the filesystem vnode which may stay busy for a long time
13123
	 * if the filesystem is active. So, instead, we make a second
13124
	 * pass over the dependencies blocking on each write. In the
13125
	 * usual case we will be blocking against a write that we
13126
	 * initiated, so when it is done the dependency will have been
13127
	 * resolved. Thus the second pass is expected to end quickly.
13128
	 * We give a brief window at the top of the loop to allow
13129
	 * any pending I/O to complete.
13130
	 */
13131
	ump = VFSTOUFS(mp);
13132
	LOCK_OWNED(ump);
13133
	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
13134
		if (error)
13135
			return (error);
13136
		FREE_LOCK(ump);
13137
		ACQUIRE_LOCK(ump);
13138
restart:
13139
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13140
			return (0);
13141
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13142
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13143
			    == DEPCOMPLETE) {
13144
				jwait(&inoref->if_list, MNT_WAIT);
13145
				goto restart;
13146
			}
13147
		}
13148
		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
13149
		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
13150
		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
13151
		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
13152
			continue;
13153
		/*
13154
		 * If pass2, we are done, otherwise do pass 2.
13155
		 */
13156
		if (waitfor == MNT_WAIT)
13157
			break;
13158
		waitfor = MNT_WAIT;
13159
	}
13160
	/*
13161
	 * Try freeing inodedep in case all dependencies have been removed.
13162
	 */
13163
	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
13164
		(void) free_inodedep(inodedep);
13165
	return (0);
13166
}
13167

13168
/*
13169
 * Flush an inode dependency list.
13170
 */
13171
static int
13172
flush_deplist(
13173
	struct allocdirectlst *listhead,
13174
	int waitfor,
13175
	int *errorp)
13176
{
13177
	struct allocdirect *adp;
13178
	struct newblk *newblk;
13179
	struct ufsmount *ump;
13180
	struct buf *bp;
13181

13182
	if ((adp = TAILQ_FIRST(listhead)) == NULL)
13183
		return (0);
13184
	ump = VFSTOUFS(adp->ad_list.wk_mp);
13185
	LOCK_OWNED(ump);
13186
	TAILQ_FOREACH(adp, listhead, ad_next) {
13187
		newblk = (struct newblk *)adp;
13188
		if (newblk->nb_jnewblk != NULL) {
13189
			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13190
			return (1);
13191
		}
13192
		if (newblk->nb_state & DEPCOMPLETE)
13193
			continue;
13194
		bp = newblk->nb_bmsafemap->sm_buf;
13195
		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
13196
		if (bp == NULL) {
13197
			if (waitfor == MNT_NOWAIT)
13198
				continue;
13199
			return (1);
13200
		}
13201
		FREE_LOCK(ump);
13202
		if (waitfor == MNT_NOWAIT)
13203
			bawrite(bp);
13204
		else 
13205
			*errorp = bwrite(bp);
13206
		ACQUIRE_LOCK(ump);
13207
		return (1);
13208
	}
13209
	return (0);
13210
}
13211

13212
/*
13213
 * Flush dependencies associated with an allocdirect block.
13214
 */
13215
static int
13216
flush_newblk_dep(
13217
	struct vnode *vp,
13218
	struct mount *mp,
13219
	ufs_lbn_t lbn)
13220
{
13221
	struct newblk *newblk;
13222
	struct ufsmount *ump;
13223
	struct bufobj *bo;
13224
	struct inode *ip;
13225
	struct buf *bp;
13226
	ufs2_daddr_t blkno;
13227
	int error;
13228

13229
	error = 0;
13230
	bo = &vp->v_bufobj;
13231
	ip = VTOI(vp);
13232
	blkno = DIP(ip, i_db[lbn]);
13233
	if (blkno == 0)
13234
		panic("flush_newblk_dep: Missing block");
13235
	ump = VFSTOUFS(mp);
13236
	ACQUIRE_LOCK(ump);
13237
	/*
13238
	 * Loop until all dependencies related to this block are satisfied.
13239
	 * We must be careful to restart after each sleep in case a write
13240
	 * completes some part of this process for us.
13241
	 */
13242
	for (;;) {
13243
		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
13244
			FREE_LOCK(ump);
13245
			break;
13246
		}
13247
		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
13248
			panic("flush_newblk_dep: Bad newblk %p", newblk);
13249
		/*
13250
		 * Flush the journal.
13251
		 */
13252
		if (newblk->nb_jnewblk != NULL) {
13253
			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13254
			continue;
13255
		}
13256
		/*
13257
		 * Write the bitmap dependency.
13258
		 */
13259
		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
13260
			bp = newblk->nb_bmsafemap->sm_buf;
13261
			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13262
			if (bp == NULL)
13263
				continue;
13264
			FREE_LOCK(ump);
13265
			error = bwrite(bp);
13266
			if (error)
13267
				break;
13268
			ACQUIRE_LOCK(ump);
13269
			continue;
13270
		}
13271
		/*
13272
		 * Write the buffer.
13273
		 */
13274
		FREE_LOCK(ump);
13275
		BO_LOCK(bo);
13276
		bp = gbincore(bo, lbn);
13277
		if (bp != NULL) {
13278
			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
13279
			    LK_INTERLOCK, BO_LOCKPTR(bo));
13280
			if (error == ENOLCK) {
13281
				ACQUIRE_LOCK(ump);
13282
				error = 0;
13283
				continue; /* Slept, retry */
13284
			}
13285
			if (error != 0)
13286
				break;	/* Failed */
13287
			if (bp->b_flags & B_DELWRI) {
13288
				bremfree(bp);
13289
				error = bwrite(bp);
13290
				if (error)
13291
					break;
13292
			} else
13293
				BUF_UNLOCK(bp);
13294
		} else
13295
			BO_UNLOCK(bo);
13296
		/*
13297
		 * We have to wait for the direct pointers to
13298
		 * point at the newdirblk before the dependency
13299
		 * will go away.
13300
		 */
13301
		error = ffs_update(vp, 1);
13302
		if (error)
13303
			break;
13304
		ACQUIRE_LOCK(ump);
13305
	}
13306
	return (error);
13307
}
13308

13309
/*
13310
 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13311
 */
13312
static int
13313
flush_pagedep_deps(
13314
	struct vnode *pvp,
13315
	struct mount *mp,
13316
	struct diraddhd *diraddhdp,
13317
	struct buf *locked_bp)
13318
{
13319
	struct inodedep *inodedep;
13320
	struct inoref *inoref;
13321
	struct ufsmount *ump;
13322
	struct diradd *dap;
13323
	struct vnode *vp;
13324
	int error = 0;
13325
	struct buf *bp;
13326
	ino_t inum;
13327
	struct diraddhd unfinished;
13328

13329
	LIST_INIT(&unfinished);
13330
	ump = VFSTOUFS(mp);
13331
	LOCK_OWNED(ump);
13332
restart:
13333
	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13334
		/*
13335
		 * Flush ourselves if this directory entry
13336
		 * has a MKDIR_PARENT dependency.
13337
		 */
13338
		if (dap->da_state & MKDIR_PARENT) {
13339
			FREE_LOCK(ump);
13340
			if ((error = ffs_update(pvp, 1)) != 0)
13341
				break;
13342
			ACQUIRE_LOCK(ump);
13343
			/*
13344
			 * If that cleared dependencies, go on to next.
13345
			 */
13346
			if (dap != LIST_FIRST(diraddhdp))
13347
				continue;
13348
			/*
13349
			 * All MKDIR_PARENT dependencies and all the
13350
			 * NEWBLOCK pagedeps that are contained in direct
13351
			 * blocks were resolved by doing above ffs_update.
13352
			 * Pagedeps contained in indirect blocks may
13353
			 * require a complete sync'ing of the directory.
13354
			 * We are in the midst of doing a complete sync,
13355
			 * so if they are not resolved in this pass we
13356
			 * defer them for now as they will be sync'ed by
13357
			 * our caller shortly.
13358
			 */
13359
			LIST_REMOVE(dap, da_pdlist);
13360
			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13361
			continue;
13362
		}
13363
		/*
13364
		 * A newly allocated directory must have its "." and
13365
		 * ".." entries written out before its name can be
13366
		 * committed in its parent. 
13367
		 */
13368
		inum = dap->da_newinum;
13369
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13370
			panic("flush_pagedep_deps: lost inode1");
13371
		/*
13372
		 * Wait for any pending journal adds to complete so we don't
13373
		 * cause rollbacks while syncing.
13374
		 */
13375
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13376
			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13377
			    == DEPCOMPLETE) {
13378
				jwait(&inoref->if_list, MNT_WAIT);
13379
				goto restart;
13380
			}
13381
		}
13382
		if (dap->da_state & MKDIR_BODY) {
13383
			FREE_LOCK(ump);
13384
			error = get_parent_vp(pvp, mp, inum, locked_bp,
13385
			    diraddhdp, &unfinished, &vp);
13386
			if (error != 0)
13387
				break;
13388
			error = flush_newblk_dep(vp, mp, 0);
13389
			/*
13390
			 * If we still have the dependency we might need to
13391
			 * update the vnode to sync the new link count to
13392
			 * disk.
13393
			 */
13394
			if (error == 0 && dap == LIST_FIRST(diraddhdp))
13395
				error = ffs_update(vp, 1);
13396
			vput(vp);
13397
			if (error != 0)
13398
				break;
13399
			ACQUIRE_LOCK(ump);
13400
			/*
13401
			 * If that cleared dependencies, go on to next.
13402
			 */
13403
			if (dap != LIST_FIRST(diraddhdp))
13404
				continue;
13405
			if (dap->da_state & MKDIR_BODY) {
13406
				inodedep_lookup(UFSTOVFS(ump), inum, 0,
13407
				    &inodedep);
13408
				panic("flush_pagedep_deps: MKDIR_BODY "
13409
				    "inodedep %p dap %p vp %p",
13410
				    inodedep, dap, vp);
13411
			}
13412
		}
13413
		/*
13414
		 * Flush the inode on which the directory entry depends.
13415
		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13416
		 * the only remaining dependency is that the updated inode
13417
		 * count must get pushed to disk. The inode has already
13418
		 * been pushed into its inode buffer (via VOP_UPDATE) at
13419
		 * the time of the reference count change. So we need only
13420
		 * locate that buffer, ensure that there will be no rollback
13421
		 * caused by a bitmap dependency, then write the inode buffer.
13422
		 */
13423
retry:
13424
		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13425
			panic("flush_pagedep_deps: lost inode");
13426
		/*
13427
		 * If the inode still has bitmap dependencies,
13428
		 * push them to disk.
13429
		 */
13430
		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13431
			bp = inodedep->id_bmsafemap->sm_buf;
13432
			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13433
			if (bp == NULL)
13434
				goto retry;
13435
			FREE_LOCK(ump);
13436
			if ((error = bwrite(bp)) != 0)
13437
				break;
13438
			ACQUIRE_LOCK(ump);
13439
			if (dap != LIST_FIRST(diraddhdp))
13440
				continue;
13441
		}
13442
		/*
13443
		 * If the inode is still sitting in a buffer waiting
13444
		 * to be written or waiting for the link count to be
13445
		 * adjusted update it here to flush it to disk.
13446
		 */
13447
		if (dap == LIST_FIRST(diraddhdp)) {
13448
			FREE_LOCK(ump);
13449
			error = get_parent_vp(pvp, mp, inum, locked_bp,
13450
			    diraddhdp, &unfinished, &vp);
13451
			if (error != 0)
13452
				break;
13453
			error = ffs_update(vp, 1);
13454
			vput(vp);
13455
			if (error)
13456
				break;
13457
			ACQUIRE_LOCK(ump);
13458
		}
13459
		/*
13460
		 * If we have failed to get rid of all the dependencies
13461
		 * then something is seriously wrong.
13462
		 */
13463
		if (dap == LIST_FIRST(diraddhdp)) {
13464
			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13465
			panic("flush_pagedep_deps: failed to flush " 
13466
			    "inodedep %p ino %ju dap %p",
13467
			    inodedep, (uintmax_t)inum, dap);
13468
		}
13469
	}
13470
	if (error)
13471
		ACQUIRE_LOCK(ump);
13472
	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13473
		LIST_REMOVE(dap, da_pdlist);
13474
		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13475
	}
13476
	return (error);
13477
}
13478

13479
/*
13480
 * A large burst of file addition or deletion activity can drive the
13481
 * memory load excessively high. First attempt to slow things down
13482
 * using the techniques below. If that fails, this routine requests
13483
 * the offending operations to fall back to running synchronously
13484
 * until the memory load returns to a reasonable level.
13485
 */
13486
int
13487
softdep_slowdown(struct vnode *vp)
13488
{
13489
	struct ufsmount *ump;
13490
	int jlow;
13491
	int max_softdeps_hard;
13492

13493
	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13494
	    ("softdep_slowdown called on non-softdep filesystem"));
13495
	ump = VFSTOUFS(vp->v_mount);
13496
	ACQUIRE_LOCK(ump);
13497
	jlow = 0;
13498
	/*
13499
	 * Check for journal space if needed.
13500
	 */
13501
	if (DOINGSUJ(vp)) {
13502
		if (journal_space(ump, 0) == 0)
13503
			jlow = 1;
13504
	}
13505
	/*
13506
	 * If the system is under its limits and our filesystem is
13507
	 * not responsible for more than our share of the usage and
13508
	 * we are not low on journal space, then no need to slow down.
13509
	 */
13510
	max_softdeps_hard = max_softdeps * 11 / 10;
13511
	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13512
	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13513
	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13514
	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13515
	    ump->softdep_curdeps[D_DIRREM] <
13516
	    (max_softdeps_hard / 2) / stat_flush_threads &&
13517
	    ump->softdep_curdeps[D_INODEDEP] <
13518
	    max_softdeps_hard / stat_flush_threads &&
13519
	    ump->softdep_curdeps[D_INDIRDEP] <
13520
	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13521
	    ump->softdep_curdeps[D_FREEBLKS] <
13522
	    max_softdeps_hard / stat_flush_threads) {
13523
		FREE_LOCK(ump);
13524
  		return (0);
13525
	}
13526
	/*
13527
	 * If the journal is low or our filesystem is over its limit
13528
	 * then speedup the cleanup.
13529
	 */
13530
	if (ump->softdep_curdeps[D_INDIRDEP] <
13531
	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13532
		softdep_speedup(ump);
13533
	stat_sync_limit_hit += 1;
13534
	FREE_LOCK(ump);
13535
	/*
13536
	 * We only slow down the rate at which new dependencies are
13537
	 * generated if we are not using journaling. With journaling,
13538
	 * the cleanup should always be sufficient to keep things
13539
	 * under control.
13540
	 */
13541
	if (DOINGSUJ(vp))
13542
		return (0);
13543
	return (1);
13544
}
13545

13546
static int
13547
softdep_request_cleanup_filter(struct vnode *vp, void *arg __unused)
13548
{
13549
	return ((vp->v_iflag & VI_OWEINACT) != 0 && vp->v_usecount == 0 &&
13550
	    ((vp->v_vflag & VV_NOSYNC) != 0 || VTOI(vp)->i_effnlink == 0));
13551
}
13552

13553
static void
13554
softdep_request_cleanup_inactivate(struct mount *mp)
13555
{
13556
	struct vnode *vp, *mvp;
13557
	int error;
13558

13559
	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, softdep_request_cleanup_filter,
13560
	    NULL) {
13561
		vholdl(vp);
13562
		vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
13563
		VI_LOCK(vp);
13564
		if (IS_UFS(vp) && vp->v_usecount == 0) {
13565
			while ((vp->v_iflag & VI_OWEINACT) != 0) {
13566
				error = vinactive(vp);
13567
				if (error != 0 && error != ERELOOKUP)
13568
					break;
13569
			}
13570
			atomic_add_int(&stat_delayed_inact, 1);
13571
		}
13572
		VOP_UNLOCK(vp);
13573
		vdropl(vp);
13574
	}
13575
}
13576

13577
/*
13578
 * Called by the allocation routines when they are about to fail
13579
 * in the hope that we can free up the requested resource (inodes
13580
 * or disk space).
13581
 * 
13582
 * First check to see if the work list has anything on it. If it has,
13583
 * clean up entries until we successfully free the requested resource.
13584
 * Because this process holds inodes locked, we cannot handle any remove
13585
 * requests that might block on a locked inode as that could lead to
13586
 * deadlock. If the worklist yields none of the requested resource,
13587
 * start syncing out vnodes to free up the needed space.
13588
 */
13589
int
13590
softdep_request_cleanup(
13591
	struct fs *fs,
13592
	struct vnode *vp,
13593
	struct ucred *cred,
13594
	int resource)
13595
{
13596
	struct ufsmount *ump;
13597
	struct mount *mp;
13598
	long starttime;
13599
	ufs2_daddr_t needed;
13600
	int error, failed_vnode;
13601

13602
	/*
13603
	 * If we are being called because of a process doing a
13604
	 * copy-on-write, then it is not safe to process any
13605
	 * worklist items as we will recurse into the copyonwrite
13606
	 * routine.  This will result in an incoherent snapshot.
13607
	 * If the vnode that we hold is a snapshot, we must avoid
13608
	 * handling other resources that could cause deadlock.
13609
	 */
13610
	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13611
		return (0);
13612

13613
	if (resource == FLUSH_BLOCKS_WAIT)
13614
		stat_cleanup_blkrequests += 1;
13615
	else
13616
		stat_cleanup_inorequests += 1;
13617

13618
	mp = vp->v_mount;
13619
	ump = VFSTOUFS(mp);
13620
	mtx_assert(UFS_MTX(ump), MA_OWNED);
13621
	UFS_UNLOCK(ump);
13622
	error = ffs_update(vp, 1);
13623
	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13624
		UFS_LOCK(ump);
13625
		return (0);
13626
	}
13627
	/*
13628
	 * If we are in need of resources, start by cleaning up
13629
	 * any block removals associated with our inode.
13630
	 */
13631
	ACQUIRE_LOCK(ump);
13632
	process_removes(vp);
13633
	process_truncates(vp);
13634
	FREE_LOCK(ump);
13635
	/*
13636
	 * Now clean up at least as many resources as we will need.
13637
	 *
13638
	 * When requested to clean up inodes, the number that are needed
13639
	 * is set by the number of simultaneous writers (mnt_writeopcount)
13640
	 * plus a bit of slop (2) in case some more writers show up while
13641
	 * we are cleaning.
13642
	 *
13643
	 * When requested to free up space, the amount of space that
13644
	 * we need is enough blocks to allocate a full-sized segment
13645
	 * (fs_contigsumsize). The number of such segments that will
13646
	 * be needed is set by the number of simultaneous writers
13647
	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13648
	 * writers show up while we are cleaning.
13649
	 *
13650
	 * Additionally, if we are unpriviledged and allocating space,
13651
	 * we need to ensure that we clean up enough blocks to get the
13652
	 * needed number of blocks over the threshold of the minimum
13653
	 * number of blocks required to be kept free by the filesystem
13654
	 * (fs_minfree).
13655
	 */
13656
	if (resource == FLUSH_INODES_WAIT) {
13657
		needed = vfs_mount_fetch_counter(vp->v_mount,
13658
		    MNT_COUNT_WRITEOPCOUNT) + 2;
13659
	} else if (resource == FLUSH_BLOCKS_WAIT) {
13660
		needed = (vfs_mount_fetch_counter(vp->v_mount,
13661
		    MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
13662
		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
13663
			needed += fragstoblks(fs,
13664
			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13665
			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13666
	} else {
13667
		printf("softdep_request_cleanup: Unknown resource type %d\n",
13668
		    resource);
13669
		UFS_LOCK(ump);
13670
		return (0);
13671
	}
13672
	starttime = time_second;
13673
retry:
13674
	if (resource == FLUSH_BLOCKS_WAIT &&
13675
	    fs->fs_cstotal.cs_nbfree <= needed)
13676
		softdep_send_speedup(ump, needed * fs->fs_bsize,
13677
		    BIO_SPEEDUP_TRIM);
13678
	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13679
	    fs->fs_cstotal.cs_nbfree <= needed) ||
13680
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13681
	    fs->fs_cstotal.cs_nifree <= needed)) {
13682
		ACQUIRE_LOCK(ump);
13683
		if (ump->softdep_on_worklist > 0 &&
13684
		    process_worklist_item(UFSTOVFS(ump),
13685
		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13686
			stat_worklist_push += 1;
13687
		FREE_LOCK(ump);
13688
	}
13689

13690
	/*
13691
	 * Check that there are vnodes pending inactivation.  As they
13692
	 * have been unlinked, inactivating them will free up their
13693
	 * inodes.
13694
	 */
13695
	ACQUIRE_LOCK(ump);
13696
	if (resource == FLUSH_INODES_WAIT &&
13697
	    fs->fs_cstotal.cs_nifree <= needed &&
13698
	    fs->fs_pendinginodes <= needed) {
13699
		if ((ump->um_softdep->sd_flags & FLUSH_DI_ACTIVE) == 0) {
13700
			ump->um_softdep->sd_flags |= FLUSH_DI_ACTIVE;
13701
			FREE_LOCK(ump);
13702
			softdep_request_cleanup_inactivate(mp);
13703
			ACQUIRE_LOCK(ump);
13704
			ump->um_softdep->sd_flags &= ~FLUSH_DI_ACTIVE;
13705
			wakeup(&ump->um_softdep->sd_flags);
13706
		} else {
13707
			while ((ump->um_softdep->sd_flags &
13708
			    FLUSH_DI_ACTIVE) != 0) {
13709
				msleep(&ump->um_softdep->sd_flags,
13710
				    LOCK_PTR(ump), PVM, "ffsvina", hz);
13711
			}
13712
		}
13713
	}
13714
	FREE_LOCK(ump);
13715

13716
	/*
13717
	 * If we still need resources and there are no more worklist
13718
	 * entries to process to obtain them, we have to start flushing
13719
	 * the dirty vnodes to force the release of additional requests
13720
	 * to the worklist that we can then process to reap addition
13721
	 * resources. We walk the vnodes associated with the mount point
13722
	 * until we get the needed worklist requests that we can reap.
13723
	 *
13724
	 * If there are several threads all needing to clean the same
13725
	 * mount point, only one is allowed to walk the mount list.
13726
	 * When several threads all try to walk the same mount list,
13727
	 * they end up competing with each other and often end up in
13728
	 * livelock. This approach ensures that forward progress is
13729
	 * made at the cost of occational ENOSPC errors being returned
13730
	 * that might otherwise have been avoided.
13731
	 */
13732
	error = 1;
13733
	if ((resource == FLUSH_BLOCKS_WAIT && 
13734
	     fs->fs_cstotal.cs_nbfree <= needed) ||
13735
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13736
	     fs->fs_cstotal.cs_nifree <= needed)) {
13737
		ACQUIRE_LOCK(ump);
13738
		if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13739
			ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13740
			FREE_LOCK(ump);
13741
			failed_vnode = softdep_request_cleanup_flush(mp, ump);
13742
			ACQUIRE_LOCK(ump);
13743
			ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13744
			wakeup(&ump->um_softdep->sd_flags);
13745
			FREE_LOCK(ump);
13746
			if (ump->softdep_on_worklist > 0) {
13747
				stat_cleanup_retries += 1;
13748
				if (!failed_vnode)
13749
					goto retry;
13750
			}
13751
		} else {
13752
			while ((ump->um_softdep->sd_flags &
13753
			    FLUSH_RC_ACTIVE) != 0) {
13754
				msleep(&ump->um_softdep->sd_flags,
13755
				    LOCK_PTR(ump), PVM, "ffsrca", hz);
13756
			}
13757
			FREE_LOCK(ump);
13758
			error = 0;
13759
		}
13760
		stat_cleanup_failures += 1;
13761
	}
13762
	if (time_second - starttime > stat_cleanup_high_delay)
13763
		stat_cleanup_high_delay = time_second - starttime;
13764
	UFS_LOCK(ump);
13765
	return (error);
13766
}
13767

13768
/*
13769
 * Scan the vnodes for the specified mount point flushing out any
13770
 * vnodes that can be locked without waiting. Finally, try to flush
13771
 * the device associated with the mount point if it can be locked
13772
 * without waiting.
13773
 *
13774
 * We return 0 if we were able to lock every vnode in our scan.
13775
 * If we had to skip one or more vnodes, we return 1.
13776
 */
13777
static int
13778
softdep_request_cleanup_flush(struct mount *mp, struct ufsmount *ump)
13779
{
13780
	struct thread *td;
13781
	struct vnode *lvp, *mvp;
13782
	int failed_vnode;
13783

13784
	failed_vnode = 0;
13785
	td = curthread;
13786
	MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13787
		if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13788
			VI_UNLOCK(lvp);
13789
			continue;
13790
		}
13791
		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) {
13792
			failed_vnode = 1;
13793
			continue;
13794
		}
13795
		if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13796
			vput(lvp);
13797
			continue;
13798
		}
13799
		(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13800
		vput(lvp);
13801
	}
13802
	lvp = ump->um_devvp;
13803
	if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13804
		VOP_FSYNC(lvp, MNT_NOWAIT, td);
13805
		VOP_UNLOCK(lvp);
13806
	}
13807
	return (failed_vnode);
13808
}
13809

13810
static bool
13811
softdep_excess_items(struct ufsmount *ump, int item)
13812
{
13813

13814
	KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13815
	return (dep_current[item] > max_softdeps &&
13816
	    ump->softdep_curdeps[item] > max_softdeps /
13817
	    stat_flush_threads);
13818
}
13819

13820
static void
13821
schedule_cleanup(struct mount *mp)
13822
{
13823
	struct ufsmount *ump;
13824
	struct thread *td;
13825

13826
	ump = VFSTOUFS(mp);
13827
	LOCK_OWNED(ump);
13828
	FREE_LOCK(ump);
13829
	td = curthread;
13830
	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13831
	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13832
		/*
13833
		 * No ast is delivered to kernel threads, so nobody
13834
		 * would deref the mp.  Some kernel threads
13835
		 * explicitly check for AST, e.g. NFS daemon does
13836
		 * this in the serving loop.
13837
		 */
13838
		return;
13839
	}
13840
	if (td->td_su != NULL)
13841
		vfs_rel(td->td_su);
13842
	vfs_ref(mp);
13843
	td->td_su = mp;
13844
	ast_sched(td, TDA_UFS);
13845
}
13846

13847
static void
13848
softdep_ast_cleanup_proc(struct thread *td, int ast __unused)
13849
{
13850
	struct mount *mp;
13851
	struct ufsmount *ump;
13852
	int error;
13853
	bool req;
13854

13855
	while ((mp = td->td_su) != NULL) {
13856
		td->td_su = NULL;
13857
		error = vfs_busy(mp, MBF_NOWAIT);
13858
		vfs_rel(mp);
13859
		if (error != 0)
13860
			return;
13861
		if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13862
			ump = VFSTOUFS(mp);
13863
			for (;;) {
13864
				req = false;
13865
				ACQUIRE_LOCK(ump);
13866
				if (softdep_excess_items(ump, D_INODEDEP)) {
13867
					req = true;
13868
					request_cleanup(mp, FLUSH_INODES);
13869
				}
13870
				if (softdep_excess_items(ump, D_DIRREM)) {
13871
					req = true;
13872
					request_cleanup(mp, FLUSH_BLOCKS);
13873
				}
13874
				FREE_LOCK(ump);
13875
				if (softdep_excess_items(ump, D_NEWBLK) ||
13876
				    softdep_excess_items(ump, D_ALLOCDIRECT) ||
13877
				    softdep_excess_items(ump, D_ALLOCINDIR)) {
13878
					error = vn_start_write(NULL, &mp,
13879
					    V_WAIT);
13880
					if (error == 0) {
13881
						req = true;
13882
						VFS_SYNC(mp, MNT_WAIT);
13883
						vn_finished_write(mp);
13884
					}
13885
				}
13886
				if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13887
					break;
13888
			}
13889
		}
13890
		vfs_unbusy(mp);
13891
	}
13892
	if ((mp = td->td_su) != NULL) {
13893
		td->td_su = NULL;
13894
		vfs_rel(mp);
13895
	}
13896
}
13897

13898
/*
13899
 * If memory utilization has gotten too high, deliberately slow things
13900
 * down and speed up the I/O processing.
13901
 */
13902
static int
13903
request_cleanup(struct mount *mp, int resource)
13904
{
13905
	struct thread *td = curthread;
13906
	struct ufsmount *ump;
13907

13908
	ump = VFSTOUFS(mp);
13909
	LOCK_OWNED(ump);
13910
	/*
13911
	 * We never hold up the filesystem syncer or buf daemon.
13912
	 */
13913
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13914
		return (0);
13915
	/*
13916
	 * First check to see if the work list has gotten backlogged.
13917
	 * If it has, co-opt this process to help clean up two entries.
13918
	 * Because this process may hold inodes locked, we cannot
13919
	 * handle any remove requests that might block on a locked
13920
	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13921
	 * to avoid recursively processing the worklist.
13922
	 */
13923
	if (ump->softdep_on_worklist > max_softdeps / 10) {
13924
		td->td_pflags |= TDP_SOFTDEP;
13925
		process_worklist_item(mp, 2, LK_NOWAIT);
13926
		td->td_pflags &= ~TDP_SOFTDEP;
13927
		stat_worklist_push += 2;
13928
		return(1);
13929
	}
13930
	/*
13931
	 * Next, we attempt to speed up the syncer process. If that
13932
	 * is successful, then we allow the process to continue.
13933
	 */
13934
	if (softdep_speedup(ump) &&
13935
	    resource != FLUSH_BLOCKS_WAIT &&
13936
	    resource != FLUSH_INODES_WAIT)
13937
		return(0);
13938
	/*
13939
	 * If we are resource constrained on inode dependencies, try
13940
	 * flushing some dirty inodes. Otherwise, we are constrained
13941
	 * by file deletions, so try accelerating flushes of directories
13942
	 * with removal dependencies. We would like to do the cleanup
13943
	 * here, but we probably hold an inode locked at this point and 
13944
	 * that might deadlock against one that we try to clean. So,
13945
	 * the best that we can do is request the syncer daemon to do
13946
	 * the cleanup for us.
13947
	 */
13948
	switch (resource) {
13949
	case FLUSH_INODES:
13950
	case FLUSH_INODES_WAIT:
13951
		ACQUIRE_GBLLOCK(&lk);
13952
		stat_ino_limit_push += 1;
13953
		req_clear_inodedeps += 1;
13954
		FREE_GBLLOCK(&lk);
13955
		stat_countp = &stat_ino_limit_hit;
13956
		break;
13957

13958
	case FLUSH_BLOCKS:
13959
	case FLUSH_BLOCKS_WAIT:
13960
		ACQUIRE_GBLLOCK(&lk);
13961
		stat_blk_limit_push += 1;
13962
		req_clear_remove += 1;
13963
		FREE_GBLLOCK(&lk);
13964
		stat_countp = &stat_blk_limit_hit;
13965
		break;
13966

13967
	default:
13968
		panic("request_cleanup: unknown type");
13969
	}
13970
	/*
13971
	 * Hopefully the syncer daemon will catch up and awaken us.
13972
	 * We wait at most tickdelay before proceeding in any case.
13973
	 */
13974
	ACQUIRE_GBLLOCK(&lk);
13975
	FREE_LOCK(ump);
13976
	proc_waiting += 1;
13977
	if (callout_pending(&softdep_callout) == FALSE)
13978
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13979
		    pause_timer, 0);
13980

13981
	if ((td->td_pflags & TDP_KTHREAD) == 0)
13982
		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13983
	proc_waiting -= 1;
13984
	FREE_GBLLOCK(&lk);
13985
	ACQUIRE_LOCK(ump);
13986
	return (1);
13987
}
13988

13989
/*
13990
 * Awaken processes pausing in request_cleanup and clear proc_waiting
13991
 * to indicate that there is no longer a timer running. Pause_timer
13992
 * will be called with the global softdep mutex (&lk) locked.
13993
 */
13994
static void
13995
pause_timer(void *arg)
13996
{
13997

13998
	GBLLOCK_OWNED(&lk);
13999
	/*
14000
	 * The callout_ API has acquired mtx and will hold it around this
14001
	 * function call.
14002
	 */
14003
	*stat_countp += proc_waiting;
14004
	wakeup(&proc_waiting);
14005
}
14006

14007
/*
14008
 * If requested, try removing inode or removal dependencies.
14009
 */
14010
static void
14011
check_clear_deps(struct mount *mp)
14012
{
14013
	struct ufsmount *ump;
14014
	bool suj_susp;
14015

14016
	/*
14017
	 * Tell the lower layers that any TRIM or WRITE transactions that have
14018
	 * been delayed for performance reasons should proceed to help alleviate
14019
	 * the shortage faster. The race between checking req_* and the softdep
14020
	 * mutex (lk) is fine since this is an advisory operation that at most
14021
	 * causes deferred work to be done sooner.
14022
	 */
14023
	ump = VFSTOUFS(mp);
14024
	suj_susp = ump->um_softdep->sd_jblocks != NULL &&
14025
	    ump->softdep_jblocks->jb_suspended;
14026
	if (req_clear_remove || req_clear_inodedeps || suj_susp) {
14027
		FREE_LOCK(ump);
14028
		softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE);
14029
		ACQUIRE_LOCK(ump);
14030
	}
14031

14032
	/*
14033
	 * If we are suspended, it may be because of our using
14034
	 * too many inodedeps, so help clear them out.
14035
	 */
14036
	if (suj_susp)
14037
		clear_inodedeps(mp);
14038

14039
	/*
14040
	 * General requests for cleanup of backed up dependencies
14041
	 */
14042
	ACQUIRE_GBLLOCK(&lk);
14043
	if (req_clear_inodedeps) {
14044
		req_clear_inodedeps -= 1;
14045
		FREE_GBLLOCK(&lk);
14046
		clear_inodedeps(mp);
14047
		ACQUIRE_GBLLOCK(&lk);
14048
		wakeup(&proc_waiting);
14049
	}
14050
	if (req_clear_remove) {
14051
		req_clear_remove -= 1;
14052
		FREE_GBLLOCK(&lk);
14053
		clear_remove(mp);
14054
		ACQUIRE_GBLLOCK(&lk);
14055
		wakeup(&proc_waiting);
14056
	}
14057
	FREE_GBLLOCK(&lk);
14058
}
14059

14060
/*
14061
 * Flush out a directory with at least one removal dependency in an effort to
14062
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
14063
 */
14064
static void
14065
clear_remove(struct mount *mp)
14066
{
14067
	struct pagedep_hashhead *pagedephd;
14068
	struct pagedep *pagedep;
14069
	struct ufsmount *ump;
14070
	struct vnode *vp;
14071
	struct bufobj *bo;
14072
	int error, cnt;
14073
	ino_t ino;
14074

14075
	ump = VFSTOUFS(mp);
14076
	LOCK_OWNED(ump);
14077

14078
	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
14079
		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
14080
		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
14081
			ump->pagedep_nextclean = 0;
14082
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
14083
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
14084
				continue;
14085
			ino = pagedep->pd_ino;
14086
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14087
				continue;
14088
			FREE_LOCK(ump);
14089

14090
			/*
14091
			 * Let unmount clear deps
14092
			 */
14093
			error = vfs_busy(mp, MBF_NOWAIT);
14094
			if (error != 0)
14095
				goto finish_write;
14096
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14097
			     FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP);
14098
			vfs_unbusy(mp);
14099
			if (error != 0) {
14100
				softdep_error("clear_remove: vget", error);
14101
				goto finish_write;
14102
			}
14103
			MPASS(VTOI(vp)->i_mode != 0);
14104
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14105
				softdep_error("clear_remove: fsync", error);
14106
			bo = &vp->v_bufobj;
14107
			BO_LOCK(bo);
14108
			drain_output(vp);
14109
			BO_UNLOCK(bo);
14110
			vput(vp);
14111
		finish_write:
14112
			vn_finished_write(mp);
14113
			ACQUIRE_LOCK(ump);
14114
			return;
14115
		}
14116
	}
14117
}
14118

14119
/*
14120
 * Clear out a block of dirty inodes in an effort to reduce
14121
 * the number of inodedep dependency structures.
14122
 */
14123
static void
14124
clear_inodedeps(struct mount *mp)
14125
{
14126
	struct inodedep_hashhead *inodedephd;
14127
	struct inodedep *inodedep;
14128
	struct ufsmount *ump;
14129
	struct vnode *vp;
14130
	struct fs *fs;
14131
	int error, cnt;
14132
	ino_t firstino, lastino, ino;
14133

14134
	ump = VFSTOUFS(mp);
14135
	fs = ump->um_fs;
14136
	LOCK_OWNED(ump);
14137
	/*
14138
	 * Pick a random inode dependency to be cleared.
14139
	 * We will then gather up all the inodes in its block 
14140
	 * that have dependencies and flush them out.
14141
	 */
14142
	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
14143
		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
14144
		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
14145
			ump->inodedep_nextclean = 0;
14146
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
14147
			break;
14148
	}
14149
	if (inodedep == NULL)
14150
		return;
14151
	/*
14152
	 * Find the last inode in the block with dependencies.
14153
	 */
14154
	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
14155
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
14156
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
14157
			break;
14158
	/*
14159
	 * Asynchronously push all but the last inode with dependencies.
14160
	 * Synchronously push the last inode with dependencies to ensure
14161
	 * that the inode block gets written to free up the inodedeps.
14162
	 */
14163
	for (ino = firstino; ino <= lastino; ino++) {
14164
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
14165
			continue;
14166
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14167
			continue;
14168
		FREE_LOCK(ump);
14169
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
14170
		if (error != 0) {
14171
			vn_finished_write(mp);
14172
			ACQUIRE_LOCK(ump);
14173
			return;
14174
		}
14175
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14176
		    FFSV_FORCEINSMQ | FFSV_FORCEINODEDEP)) != 0) {
14177
			softdep_error("clear_inodedeps: vget", error);
14178
			vfs_unbusy(mp);
14179
			vn_finished_write(mp);
14180
			ACQUIRE_LOCK(ump);
14181
			return;
14182
		}
14183
		vfs_unbusy(mp);
14184
		if (VTOI(vp)->i_mode == 0) {
14185
			vgone(vp);
14186
		} else if (ino == lastino) {
14187
			do {
14188
				error = ffs_syncvnode(vp, MNT_WAIT, 0);
14189
			} while (error == ERELOOKUP);
14190
			if (error != 0)
14191
				softdep_error("clear_inodedeps: fsync1", error);
14192
		} else {
14193
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14194
				softdep_error("clear_inodedeps: fsync2", error);
14195
			BO_LOCK(&vp->v_bufobj);
14196
			drain_output(vp);
14197
			BO_UNLOCK(&vp->v_bufobj);
14198
		}
14199
		vput(vp);
14200
		vn_finished_write(mp);
14201
		ACQUIRE_LOCK(ump);
14202
	}
14203
}
14204

14205
void
14206
softdep_buf_append(struct buf *bp, struct workhead *wkhd)
14207
{
14208
	struct worklist *wk;
14209
	struct ufsmount *ump;
14210

14211
	if ((wk = LIST_FIRST(wkhd)) == NULL)
14212
		return;
14213
	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14214
	    ("softdep_buf_append called on non-softdep filesystem"));
14215
	ump = VFSTOUFS(wk->wk_mp);
14216
	ACQUIRE_LOCK(ump);
14217
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
14218
		WORKLIST_REMOVE(wk);
14219
		WORKLIST_INSERT(&bp->b_dep, wk);
14220
	}
14221
	FREE_LOCK(ump);
14222

14223
}
14224

14225
void
14226
softdep_inode_append(
14227
	struct inode *ip,
14228
	struct ucred *cred,
14229
	struct workhead *wkhd)
14230
{
14231
	struct buf *bp;
14232
	struct fs *fs;
14233
	struct ufsmount *ump;
14234
	int error;
14235

14236
	ump = ITOUMP(ip);
14237
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
14238
	    ("softdep_inode_append called on non-softdep filesystem"));
14239
	fs = ump->um_fs;
14240
	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
14241
	    (int)fs->fs_bsize, cred, &bp);
14242
	if (error) {
14243
		bqrelse(bp);
14244
		softdep_freework(wkhd);
14245
		return;
14246
	}
14247
	softdep_buf_append(bp, wkhd);
14248
	bqrelse(bp);
14249
}
14250

14251
void
14252
softdep_freework(struct workhead *wkhd)
14253
{
14254
	struct worklist *wk;
14255
	struct ufsmount *ump;
14256

14257
	if ((wk = LIST_FIRST(wkhd)) == NULL)
14258
		return;
14259
	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14260
	    ("softdep_freework called on non-softdep filesystem"));
14261
	ump = VFSTOUFS(wk->wk_mp);
14262
	ACQUIRE_LOCK(ump);
14263
	handle_jwork(wkhd);
14264
	FREE_LOCK(ump);
14265
}
14266

14267
static struct ufsmount *
14268
softdep_bp_to_mp(struct buf *bp)
14269
{
14270
	struct mount *mp;
14271
	struct vnode *vp;
14272

14273
	if (LIST_EMPTY(&bp->b_dep))
14274
		return (NULL);
14275
	vp = bp->b_vp;
14276
	KASSERT(vp != NULL,
14277
	    ("%s, buffer with dependencies lacks vnode", __func__));
14278

14279
	/*
14280
	 * The ump mount point is stable after we get a correct
14281
	 * pointer, since bp is locked and this prevents unmount from
14282
	 * proceeding.  But to get to it, we cannot dereference bp->b_dep
14283
	 * head wk_mp, because we do not yet own SU ump lock and
14284
	 * workitem might be freed while dereferenced.
14285
	 */
14286
retry:
14287
	switch (vp->v_type) {
14288
	case VCHR:
14289
		VI_LOCK(vp);
14290
		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
14291
		VI_UNLOCK(vp);
14292
		if (mp == NULL)
14293
			goto retry;
14294
		break;
14295
	case VREG:
14296
	case VDIR:
14297
	case VLNK:
14298
	case VFIFO:
14299
	case VSOCK:
14300
		mp = vp->v_mount;
14301
		break;
14302
	case VBLK:
14303
		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
14304
		/* FALLTHROUGH */
14305
	case VNON:
14306
	case VBAD:
14307
	case VMARKER:
14308
		mp = NULL;
14309
		break;
14310
	default:
14311
		vn_printf(vp, "unknown vnode type");
14312
		mp = NULL;
14313
		break;
14314
	}
14315
	return (VFSTOUFS(mp));
14316
}
14317

14318
/*
14319
 * Function to determine if the buffer has outstanding dependencies
14320
 * that will cause a roll-back if the buffer is written. If wantcount
14321
 * is set, return number of dependencies, otherwise just yes or no.
14322
 */
14323
static int
14324
softdep_count_dependencies(struct buf *bp, int wantcount)
14325
{
14326
	struct worklist *wk;
14327
	struct ufsmount *ump;
14328
	struct bmsafemap *bmsafemap;
14329
	struct freework *freework;
14330
	struct inodedep *inodedep;
14331
	struct indirdep *indirdep;
14332
	struct freeblks *freeblks;
14333
	struct allocindir *aip;
14334
	struct pagedep *pagedep;
14335
	struct dirrem *dirrem;
14336
	struct newblk *newblk;
14337
	struct mkdir *mkdir;
14338
	struct diradd *dap;
14339
	int i, retval;
14340

14341
	ump = softdep_bp_to_mp(bp);
14342
	if (ump == NULL)
14343
		return (0);
14344
	retval = 0;
14345
	ACQUIRE_LOCK(ump);
14346
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
14347
		switch (wk->wk_type) {
14348
		case D_INODEDEP:
14349
			inodedep = WK_INODEDEP(wk);
14350
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
14351
				/* bitmap allocation dependency */
14352
				retval += 1;
14353
				if (!wantcount)
14354
					goto out;
14355
			}
14356
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
14357
				/* direct block pointer dependency */
14358
				retval += 1;
14359
				if (!wantcount)
14360
					goto out;
14361
			}
14362
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
14363
				/* direct block pointer dependency */
14364
				retval += 1;
14365
				if (!wantcount)
14366
					goto out;
14367
			}
14368
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14369
				/* Add reference dependency. */
14370
				retval += 1;
14371
				if (!wantcount)
14372
					goto out;
14373
			}
14374
			continue;
14375

14376
		case D_INDIRDEP:
14377
			indirdep = WK_INDIRDEP(wk);
14378

14379
			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14380
				/* indirect truncation dependency */
14381
				retval += 1;
14382
				if (!wantcount)
14383
					goto out;
14384
			}
14385

14386
			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14387
				/* indirect block pointer dependency */
14388
				retval += 1;
14389
				if (!wantcount)
14390
					goto out;
14391
			}
14392
			continue;
14393

14394
		case D_PAGEDEP:
14395
			pagedep = WK_PAGEDEP(wk);
14396
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14397
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14398
					/* Journal remove ref dependency. */
14399
					retval += 1;
14400
					if (!wantcount)
14401
						goto out;
14402
				}
14403
			}
14404
			for (i = 0; i < DAHASHSZ; i++) {
14405
				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14406
					/* directory entry dependency */
14407
					retval += 1;
14408
					if (!wantcount)
14409
						goto out;
14410
				}
14411
			}
14412
			continue;
14413

14414
		case D_BMSAFEMAP:
14415
			bmsafemap = WK_BMSAFEMAP(wk);
14416
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14417
				/* Add reference dependency. */
14418
				retval += 1;
14419
				if (!wantcount)
14420
					goto out;
14421
			}
14422
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14423
				/* Allocate block dependency. */
14424
				retval += 1;
14425
				if (!wantcount)
14426
					goto out;
14427
			}
14428
			continue;
14429

14430
		case D_FREEBLKS:
14431
			freeblks = WK_FREEBLKS(wk);
14432
			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14433
				/* Freeblk journal dependency. */
14434
				retval += 1;
14435
				if (!wantcount)
14436
					goto out;
14437
			}
14438
			continue;
14439

14440
		case D_ALLOCDIRECT:
14441
		case D_ALLOCINDIR:
14442
			newblk = WK_NEWBLK(wk);
14443
			if (newblk->nb_jnewblk) {
14444
				/* Journal allocate dependency. */
14445
				retval += 1;
14446
				if (!wantcount)
14447
					goto out;
14448
			}
14449
			continue;
14450

14451
		case D_MKDIR:
14452
			mkdir = WK_MKDIR(wk);
14453
			if (mkdir->md_jaddref) {
14454
				/* Journal reference dependency. */
14455
				retval += 1;
14456
				if (!wantcount)
14457
					goto out;
14458
			}
14459
			continue;
14460

14461
		case D_FREEWORK:
14462
		case D_FREEDEP:
14463
		case D_JSEGDEP:
14464
		case D_JSEG:
14465
		case D_SBDEP:
14466
			/* never a dependency on these blocks */
14467
			continue;
14468

14469
		default:
14470
			panic("softdep_count_dependencies: Unexpected type %s",
14471
			    TYPENAME(wk->wk_type));
14472
			/* NOTREACHED */
14473
		}
14474
	}
14475
out:
14476
	FREE_LOCK(ump);
14477
	return (retval);
14478
}
14479

14480
/*
14481
 * Acquire exclusive access to a buffer.
14482
 * Must be called with a locked mtx parameter.
14483
 * Return acquired buffer or NULL on failure.
14484
 */
14485
static struct buf *
14486
getdirtybuf(struct buf *bp,
14487
	struct rwlock *lock,
14488
	int waitfor)
14489
{
14490
	int error;
14491

14492
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
14493
		if (waitfor != MNT_WAIT)
14494
			return (NULL);
14495
		error = BUF_LOCK(bp,
14496
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
14497
		/*
14498
		 * Even if we successfully acquire bp here, we have dropped
14499
		 * lock, which may violates our guarantee.
14500
		 */
14501
		if (error == 0)
14502
			BUF_UNLOCK(bp);
14503
		else if (error != ENOLCK)
14504
			panic("getdirtybuf: inconsistent lock: %d", error);
14505
		rw_wlock(lock);
14506
		return (NULL);
14507
	}
14508
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14509
		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
14510
			rw_wunlock(lock);
14511
			BO_LOCK(bp->b_bufobj);
14512
			BUF_UNLOCK(bp);
14513
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14514
				bp->b_vflags |= BV_BKGRDWAIT;
14515
				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14516
				       PRIBIO | PDROP, "getbuf", 0);
14517
			} else
14518
				BO_UNLOCK(bp->b_bufobj);
14519
			rw_wlock(lock);
14520
			return (NULL);
14521
		}
14522
		BUF_UNLOCK(bp);
14523
		if (waitfor != MNT_WAIT)
14524
			return (NULL);
14525
		if (bp->b_vp->v_type != VCHR)
14526
			ASSERT_BO_WLOCKED(bp->b_bufobj);
14527
		bp->b_vflags |= BV_BKGRDWAIT;
14528
		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14529
		return (NULL);
14530
	}
14531
	if ((bp->b_flags & B_DELWRI) == 0) {
14532
		BUF_UNLOCK(bp);
14533
		return (NULL);
14534
	}
14535
	bremfree(bp);
14536
	return (bp);
14537
}
14538

14539
/*
14540
 * Check if it is safe to suspend the file system now.  On entry,
14541
 * the vnode interlock for devvp should be held.  Return 0 with
14542
 * the mount interlock held if the file system can be suspended now,
14543
 * otherwise return EAGAIN with the mount interlock held.
14544
 */
14545
int
14546
softdep_check_suspend(struct mount *mp,
14547
		      struct vnode *devvp,
14548
		      int softdep_depcnt,
14549
		      int softdep_accdepcnt,
14550
		      int secondary_writes,
14551
		      int secondary_accwrites)
14552
{
14553
	struct buf *bp;
14554
	struct bufobj *bo;
14555
	struct ufsmount *ump;
14556
	struct inodedep *inodedep;
14557
	struct indirdep *indirdep;
14558
	struct worklist *wk, *nextwk;
14559
	int error, unlinked;
14560

14561
	bo = &devvp->v_bufobj;
14562
	ASSERT_BO_WLOCKED(bo);
14563

14564
	/*
14565
	 * If we are not running with soft updates, then we need only
14566
	 * deal with secondary writes as we try to suspend.
14567
	 */
14568
	if (MOUNTEDSOFTDEP(mp) == 0) {
14569
		MNT_ILOCK(mp);
14570
		while (mp->mnt_secondary_writes != 0) {
14571
			BO_UNLOCK(bo);
14572
			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14573
			    PRI_MAX_KERN | PDROP, "secwr", 0);
14574
			BO_LOCK(bo);
14575
			MNT_ILOCK(mp);
14576
		}
14577

14578
		/*
14579
		 * Reasons for needing more work before suspend:
14580
		 * - Dirty buffers on devvp.
14581
		 * - Secondary writes occurred after start of vnode sync loop
14582
		 */
14583
		error = 0;
14584
		if (bo->bo_numoutput > 0 ||
14585
		    bo->bo_dirty.bv_cnt > 0 ||
14586
		    secondary_writes != 0 ||
14587
		    mp->mnt_secondary_writes != 0 ||
14588
		    secondary_accwrites != mp->mnt_secondary_accwrites)
14589
			error = EAGAIN;
14590
		BO_UNLOCK(bo);
14591
		return (error);
14592
	}
14593

14594
	/*
14595
	 * If we are running with soft updates, then we need to coordinate
14596
	 * with them as we try to suspend.
14597
	 */
14598
	ump = VFSTOUFS(mp);
14599
	for (;;) {
14600
		if (!TRY_ACQUIRE_LOCK(ump)) {
14601
			BO_UNLOCK(bo);
14602
			ACQUIRE_LOCK(ump);
14603
			FREE_LOCK(ump);
14604
			BO_LOCK(bo);
14605
			continue;
14606
		}
14607
		MNT_ILOCK(mp);
14608
		if (mp->mnt_secondary_writes != 0) {
14609
			FREE_LOCK(ump);
14610
			BO_UNLOCK(bo);
14611
			msleep(&mp->mnt_secondary_writes,
14612
			       MNT_MTX(mp),
14613
			       PRI_MAX_KERN | PDROP, "secwr", 0);
14614
			BO_LOCK(bo);
14615
			continue;
14616
		}
14617
		break;
14618
	}
14619

14620
	unlinked = 0;
14621
	if (MOUNTEDSUJ(mp)) {
14622
		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14623
		    inodedep != NULL;
14624
		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14625
			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14626
			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14627
			    UNLINKONLIST) ||
14628
			    !check_inodedep_free(inodedep))
14629
				continue;
14630
			unlinked++;
14631
		}
14632
	}
14633

14634
	/*
14635
	 * XXX Check for orphaned indirdep dependency structures.
14636
	 *
14637
	 * During forcible unmount after a disk failure there is a
14638
	 * bug that causes one or more indirdep dependency structures
14639
	 * to fail to be deallocated. We check for them here and clean
14640
	 * them up so that the unmount can succeed.
14641
	 */
14642
	if ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0 && ump->softdep_deps > 0 &&
14643
	    ump->softdep_deps == ump->softdep_curdeps[D_INDIRDEP]) {
14644
		LIST_FOREACH_SAFE(wk, &ump->softdep_alldeps[D_INDIRDEP],
14645
		    wk_all, nextwk) {
14646
			indirdep = WK_INDIRDEP(wk);
14647
			if ((indirdep->ir_state & (GOINGAWAY | DEPCOMPLETE)) !=
14648
			    (GOINGAWAY | DEPCOMPLETE) ||
14649
			    !TAILQ_EMPTY(&indirdep->ir_trunc) ||
14650
			    !LIST_EMPTY(&indirdep->ir_completehd) ||
14651
			    !LIST_EMPTY(&indirdep->ir_writehd) ||
14652
			    !LIST_EMPTY(&indirdep->ir_donehd) ||
14653
			    !LIST_EMPTY(&indirdep->ir_deplisthd) ||
14654
			    indirdep->ir_saveddata != NULL ||
14655
			    indirdep->ir_savebp == NULL) {
14656
				printf("%s: skipping orphaned indirdep %p\n",
14657
				    __FUNCTION__, indirdep);
14658
				continue;
14659
			}
14660
			printf("%s: freeing orphaned indirdep %p\n",
14661
			    __FUNCTION__, indirdep);
14662
			bp = indirdep->ir_savebp;
14663
			indirdep->ir_savebp = NULL;
14664
			free_indirdep(indirdep);
14665
			FREE_LOCK(ump);
14666
			brelse(bp);
14667
			while (!TRY_ACQUIRE_LOCK(ump)) {
14668
				BO_UNLOCK(bo);
14669
				ACQUIRE_LOCK(ump);
14670
				FREE_LOCK(ump);
14671
				BO_LOCK(bo);
14672
			}
14673
		}
14674
	}
14675

14676
	/*
14677
	 * Reasons for needing more work before suspend:
14678
	 * - Dirty buffers on devvp.
14679
	 * - Dependency structures still exist
14680
	 * - Softdep activity occurred after start of vnode sync loop
14681
	 * - Secondary writes occurred after start of vnode sync loop
14682
	 */
14683
	error = 0;
14684
	if (bo->bo_numoutput > 0 ||
14685
	    bo->bo_dirty.bv_cnt > 0 ||
14686
	    softdep_depcnt != unlinked ||
14687
	    ump->softdep_deps != unlinked ||
14688
	    softdep_accdepcnt != ump->softdep_accdeps ||
14689
	    secondary_writes != 0 ||
14690
	    mp->mnt_secondary_writes != 0 ||
14691
	    secondary_accwrites != mp->mnt_secondary_accwrites)
14692
		error = EAGAIN;
14693
	FREE_LOCK(ump);
14694
	BO_UNLOCK(bo);
14695
	return (error);
14696
}
14697

14698
/*
14699
 * Get the number of dependency structures for the file system, both
14700
 * the current number and the total number allocated.  These will
14701
 * later be used to detect that softdep processing has occurred.
14702
 */
14703
void
14704
softdep_get_depcounts(struct mount *mp,
14705
		      int *softdep_depsp,
14706
		      int *softdep_accdepsp)
14707
{
14708
	struct ufsmount *ump;
14709

14710
	if (MOUNTEDSOFTDEP(mp) == 0) {
14711
		*softdep_depsp = 0;
14712
		*softdep_accdepsp = 0;
14713
		return;
14714
	}
14715
	ump = VFSTOUFS(mp);
14716
	ACQUIRE_LOCK(ump);
14717
	*softdep_depsp = ump->softdep_deps;
14718
	*softdep_accdepsp = ump->softdep_accdeps;
14719
	FREE_LOCK(ump);
14720
}
14721

14722
/*
14723
 * Wait for pending output on a vnode to complete.
14724
 */
14725
static void
14726
drain_output(struct vnode *vp)
14727
{
14728

14729
	ASSERT_VOP_LOCKED(vp, "drain_output");
14730
	(void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14731
}
14732

14733
/*
14734
 * Called whenever a buffer that is being invalidated or reallocated
14735
 * contains dependencies. This should only happen if an I/O error has
14736
 * occurred. The routine is called with the buffer locked.
14737
 */ 
14738
static void
14739
softdep_deallocate_dependencies(struct buf *bp)
14740
{
14741

14742
	if ((bp->b_ioflags & BIO_ERROR) == 0)
14743
		panic("softdep_deallocate_dependencies: dangling deps");
14744
	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14745
		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14746
	else
14747
		printf("softdep_deallocate_dependencies: "
14748
		    "got error %d while accessing filesystem\n", bp->b_error);
14749
	if (bp->b_error != ENXIO)
14750
		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14751
}
14752

14753
/*
 * Report an asynchronous write error in the filesystem.
 *
 * func is printed as the message prefix and error is the error number.
 * For now the error is only logged.
 */
static void
softdep_error(char *func, int error)
{

	/* XXX Logging is all we do; something better is needed. */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}
14763

14764
#ifdef DDB
14765

14766
/* exported to ffs_vfsops.c */
14767
extern void db_print_ffs(struct ufsmount *ump);
14768
void
14769
db_print_ffs(struct ufsmount *ump)
14770
{
14771
	db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
14772
	    ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
14773
	db_printf("    fs %p ", ump->um_fs);
14774

14775
	if (ump->um_softdep != NULL) {
14776
		db_printf("su_wl %d su_deps %d su_req %d\n",
14777
		    ump->softdep_on_worklist, ump->softdep_deps,
14778
		    ump->softdep_req);
14779
	} else {
14780
		db_printf("su disabled\n");
14781
	}
14782
}
14783

14784
static void
14785
worklist_print(struct worklist *wk, int verbose)
14786
{
14787

14788
	if (!verbose) {
14789
		db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk,
14790
		    wk->wk_state, PRINT_SOFTDEP_FLAGS);
14791
		return;
14792
	}
14793
	db_printf("worklist: %p type %s state 0x%b next %p\n    ", wk,
14794
	    TYPENAME(wk->wk_type), wk->wk_state, PRINT_SOFTDEP_FLAGS,
14795
	    LIST_NEXT(wk, wk_list));
14796
	db_print_ffs(VFSTOUFS(wk->wk_mp));
14797
}
14798

14799
static void
14800
inodedep_print(struct inodedep *inodedep, int verbose)
14801
{
14802

14803
	worklist_print(&inodedep->id_list, 0);
14804
	db_printf("    fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
14805
	    inodedep->id_fs,
14806
	    (intmax_t)inodedep->id_ino,
14807
	    (intmax_t)fsbtodb(inodedep->id_fs,
14808
	        ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14809
	    (intmax_t)inodedep->id_nlinkdelta,
14810
	    (intmax_t)inodedep->id_savednlink);
14811

14812
	if (verbose == 0)
14813
		return;
14814

14815
	db_printf("    bmsafemap %p, mkdiradd %p, inoreflst %p\n",
14816
	    inodedep->id_bmsafemap,
14817
	    inodedep->id_mkdiradd,
14818
	    TAILQ_FIRST(&inodedep->id_inoreflst));
14819
	db_printf("    dirremhd %p, pendinghd %p, bufwait %p\n",
14820
	    LIST_FIRST(&inodedep->id_dirremhd),
14821
	    LIST_FIRST(&inodedep->id_pendinghd),
14822
	    LIST_FIRST(&inodedep->id_bufwait));
14823
	db_printf("    inowait %p, inoupdt %p, newinoupdt %p\n",
14824
	    LIST_FIRST(&inodedep->id_inowait),
14825
	    TAILQ_FIRST(&inodedep->id_inoupdt),
14826
	    TAILQ_FIRST(&inodedep->id_newinoupdt));
14827
	db_printf("    extupdt %p, newextupdt %p, freeblklst %p\n",
14828
	    TAILQ_FIRST(&inodedep->id_extupdt),
14829
	    TAILQ_FIRST(&inodedep->id_newextupdt),
14830
	    TAILQ_FIRST(&inodedep->id_freeblklst));
14831
	db_printf("    saveino %p, savedsize %jd, savedextsize %jd\n",
14832
	    inodedep->id_savedino1,
14833
	    (intmax_t)inodedep->id_savedsize,
14834
	    (intmax_t)inodedep->id_savedextsize);
14835
}
14836

14837
static void
14838
newblk_print(struct newblk *nbp)
14839
{
14840

14841
	worklist_print(&nbp->nb_list, 0);
14842
	db_printf("    newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
14843
	db_printf("    jnewblk %p, bmsafemap %p, freefrag %p\n",
14844
	    &nbp->nb_jnewblk,
14845
	    &nbp->nb_bmsafemap,
14846
	    &nbp->nb_freefrag);
14847
	db_printf("    indirdeps %p, newdirblk %p, jwork %p\n",
14848
	    LIST_FIRST(&nbp->nb_indirdeps),
14849
	    LIST_FIRST(&nbp->nb_newdirblk),
14850
	    LIST_FIRST(&nbp->nb_jwork));
14851
}
14852

14853
static void
14854
allocdirect_print(struct allocdirect *adp)
14855
{
14856

14857
	newblk_print(&adp->ad_block);
14858
	db_printf("    oldblkno %jd, oldsize %ld, newsize %ld\n",
14859
	    adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
14860
	db_printf("    offset %d, inodedep %p\n",
14861
	    adp->ad_offset, adp->ad_inodedep);
14862
}
14863

14864
static void
14865
allocindir_print(struct allocindir *aip)
14866
{
14867

14868
	newblk_print(&aip->ai_block);
14869
	db_printf("    oldblkno %jd, lbn %jd\n",
14870
	    (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn);
14871
	db_printf("    offset %d, indirdep %p\n",
14872
	    aip->ai_offset, aip->ai_indirdep);
14873
}
14874

14875
static void
14876
mkdir_print(struct mkdir *mkdir)
14877
{
14878

14879
	worklist_print(&mkdir->md_list, 0);
14880
	db_printf("    diradd %p, jaddref %p, buf %p\n",
14881
		mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf);
14882
}
14883

14884
DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
14885
{
14886

14887
	if (have_addr == 0) {
14888
		db_printf("inodedep address required\n");
14889
		return;
14890
	}
14891
	inodedep_print((struct inodedep*)addr, 1);
14892
}
14893

14894
DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
14895
{
14896
	struct inodedep_hashhead *inodedephd;
14897
	struct inodedep *inodedep;
14898
	struct ufsmount *ump;
14899
	int cnt;
14900

14901
	if (have_addr == 0) {
14902
		db_printf("ufsmount address required\n");
14903
		return;
14904
	}
14905
	ump = (struct ufsmount *)addr;
14906
	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14907
		inodedephd = &ump->inodedep_hashtbl[cnt];
14908
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14909
			inodedep_print(inodedep, 0);
14910
		}
14911
	}
14912
}
14913

14914
DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
14915
{
14916

14917
	if (have_addr == 0) {
14918
		db_printf("worklist address required\n");
14919
		return;
14920
	}
14921
	worklist_print((struct worklist *)addr, 1);
14922
}
14923

14924
DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
14925
{
14926
	struct worklist *wk;
14927
	struct workhead *wkhd;
14928

14929
	if (have_addr == 0) {
14930
		db_printf("worklist address required "
14931
		    "(for example value in bp->b_dep)\n");
14932
		return;
14933
	}
14934
	/*
14935
	 * We often do not have the address of the worklist head but
14936
	 * instead a pointer to its first entry (e.g., we have the
14937
	 * contents of bp->b_dep rather than &bp->b_dep). But the back
14938
	 * pointer of bp->b_dep will point at the head of the list, so
14939
	 * we cheat and use that instead. If we are in the middle of
14940
	 * a list we will still get the same result, so nothing
14941
	 * unexpected will result.
14942
	 */
14943
	wk = (struct worklist *)addr;
14944
	if (wk == NULL)
14945
		return;
14946
	wkhd = (struct workhead *)wk->wk_list.le_prev;
14947
	LIST_FOREACH(wk, wkhd, wk_list) {
14948
		switch(wk->wk_type) {
14949
		case D_INODEDEP:
14950
			inodedep_print(WK_INODEDEP(wk), 0);
14951
			continue;
14952
		case D_ALLOCDIRECT:
14953
			allocdirect_print(WK_ALLOCDIRECT(wk));
14954
			continue;
14955
		case D_ALLOCINDIR:
14956
			allocindir_print(WK_ALLOCINDIR(wk));
14957
			continue;
14958
		case D_MKDIR:
14959
			mkdir_print(WK_MKDIR(wk));
14960
			continue;
14961
		default:
14962
			worklist_print(wk, 0);
14963
			continue;
14964
		}
14965
	}
14966
}
14967

14968
DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
14969
{
14970
	if (have_addr == 0) {
14971
		db_printf("mkdir address required\n");
14972
		return;
14973
	}
14974
	mkdir_print((struct mkdir *)addr);
14975
}
14976

14977
DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
14978
{
14979
	struct mkdirlist *mkdirlisthd;
14980
	struct mkdir *mkdir;
14981

14982
	if (have_addr == 0) {
14983
		db_printf("mkdir listhead address required\n");
14984
		return;
14985
	}
14986
	mkdirlisthd = (struct mkdirlist *)addr;
14987
	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14988
		mkdir_print(mkdir);
14989
		if (mkdir->md_diradd != NULL) {
14990
			db_printf("    ");
14991
			worklist_print(&mkdir->md_diradd->da_list, 0);
14992
		}
14993
		if (mkdir->md_jaddref != NULL) {
14994
			db_printf("    ");
14995
			worklist_print(&mkdir->md_jaddref->ja_list, 0);
14996
		}
14997
	}
14998
}
14999

15000
DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
15001
{
15002
	if (have_addr == 0) {
15003
		db_printf("allocdirect address required\n");
15004
		return;
15005
	}
15006
	allocdirect_print((struct allocdirect *)addr);
15007
}
15008

15009
DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
15010
{
15011
	if (have_addr == 0) {
15012
		db_printf("allocindir address required\n");
15013
		return;
15014
	}
15015
	allocindir_print((struct allocindir *)addr);
15016
}
15017

15018
#endif /* DDB */
15019

15020
#endif /* SOFTUPDATES */
15021

15022
Product

Resources

Company