/*-1* Copyright (c) 1998 Michael Smith <[email protected]>2* Copyright 2015 Toomas Soome <[email protected]>3* All rights reserved.4*5* Redistribution and use in source and binary forms, with or without6* modification, are permitted provided that the following conditions7* are met:8* 1. Redistributions of source code must retain the above copyright9* notice, this list of conditions and the following disclaimer.10* 2. Redistributions in binary form must reproduce the above copyright11* notice, this list of conditions and the following disclaimer in the12* documentation and/or other materials provided with the distribution.13*14* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND15* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE16* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE17* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE18* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL19* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS20* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)21* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT22* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY23* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF24* SUCH DAMAGE.25*/2627#include <sys/param.h>28/*29* Simple hashed block cache30*/3132#include <sys/stdint.h>3334#include <stand.h>35#include <string.h>36#include <strings.h>3738#include "bootstrap.h"3940/* #define BCACHE_DEBUG */4142#ifdef BCACHE_DEBUG43# define DPRINTF(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args)44#else45# define DPRINTF(fmt, args...) ((void)0)46#endif4748struct bcachectl49{50daddr_t bc_blkno;51int bc_count;52};5354/*55* bcache per device node. cache is allocated on device first open and freed56* on last close, to save memory. The issue there is the size; biosdisk57* supports up to 31 (0x1f) devices. Classic setup would use single disk58* to boot from, but this has changed with zfs.59*/60struct bcache {61struct bcachectl *bcache_ctl;62caddr_t bcache_data;63size_t bcache_nblks;64size_t ra;65daddr_t bcache_nextblkno;66size_t ralen;67};6869static u_int bcache_total_nblks; /* set by bcache_init */70static u_int bcache_blksize; /* set by bcache_init */71static u_int bcache_numdev; /* set by bcache_add_dev */72/* statistics */73static u_int bcache_units; /* number of devices with cache */74static u_int bcache_unit_nblks; /* nblocks per unit */75static u_int bcache_hits;76static u_int bcache_misses;77static u_int bcache_ops;78static u_int bcache_bypasses;79static u_int bcache_bcount;80static u_int bcache_rablks;8182#define BHASH(bc, blkno) ((blkno) & ((bc)->bcache_nblks - 1))83#define BCACHE_LOOKUP(bc, blkno) \84((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))85#define BCACHE_READAHEAD 51286#define BCACHE_MINREADAHEAD 3287#define BCACHE_MAXIOWRA 5128889static void bcache_invalidate(struct bcache *bc, daddr_t blkno);90static void bcache_insert(struct bcache *bc, daddr_t blkno);91static void bcache_free_instance(struct bcache *bc);9293/*94* Initialise the cache for (nblks) of (bsize).95*/96void97bcache_init(size_t nblks, size_t bsize)98{99/* set up control data */100bcache_total_nblks = nblks;101bcache_blksize = bsize;102}103104/*105* add number of devices to bcache. we have to divide cache space106* between the devices, so bcache_add_dev() can be used to set up the107* number. The issue is, we need to get the number before actual allocations.108* bcache_add_dev() is supposed to be called from device init() call, so the109* assumption is, devsw dv_init is called for plain devices first, and110* for zfs, last.111*/112void113bcache_add_dev(int devices)114{115bcache_numdev += devices;116}117118void *119bcache_allocate(void)120{121u_int i;122struct bcache *bc = malloc(sizeof (struct bcache));123int disks = bcache_numdev;124125if (disks == 0)126disks = 1; /* safe guard */127128if (bc == NULL) {129errno = ENOMEM;130return (bc);131}132133/*134* the bcache block count must be power of 2 for hash function135*/136i = fls(disks) - 1; /* highbit - 1 */137if (disks > (1 << i)) /* next power of 2 */138i++;139140bc->bcache_nblks = bcache_total_nblks >> i;141bcache_unit_nblks = bc->bcache_nblks;142bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize);143if (bc->bcache_data == NULL) {144/* dont error out yet. fall back to 32 blocks and try again */145bc->bcache_nblks = 32;146bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize +147sizeof(uint32_t));148}149150bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl));151152if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) {153bcache_free_instance(bc);154errno = ENOMEM;155return (NULL);156}157158/* Flush the cache */159for (i = 0; i < bc->bcache_nblks; i++) {160bc->bcache_ctl[i].bc_count = -1;161bc->bcache_ctl[i].bc_blkno = -1;162}163bcache_units++;164bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */165bc->bcache_nextblkno = -1;166return (bc);167}168169void170bcache_free(void *cache)171{172struct bcache *bc = cache;173174if (bc == NULL)175return;176177bcache_free_instance(bc);178bcache_units--;179}180181/*182* Handle a write request; write directly to the disk, and populate the183* cache with the new values.184*/185static int186write_strategy(void *devdata, int rw, daddr_t blk, size_t size,187char *buf, size_t *rsize)188{189struct bcache_devdata *dd = (struct bcache_devdata *)devdata;190struct bcache *bc = dd->dv_cache;191daddr_t i, nblk;192193nblk = size / bcache_blksize;194195/* Invalidate the blocks being written */196for (i = 0; i < nblk; i++) {197bcache_invalidate(bc, blk + i);198}199200/* Write the blocks */201return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));202}203204/*205* Handle a read request; fill in parts of the request that can206* be satisfied by the cache, use the supplied strategy routine to do207* device I/O and then use the I/O results to populate the cache.208*/209static int210read_strategy(void *devdata, int rw, daddr_t blk, size_t size,211char *buf, size_t *rsize)212{213struct bcache_devdata *dd = (struct bcache_devdata *)devdata;214struct bcache *bc = dd->dv_cache;215size_t i, nblk, p_size, r_size, complete, ra;216int result;217daddr_t p_blk;218caddr_t p_buf;219220if (bc == NULL) {221errno = ENODEV;222return (-1);223}224225if (rsize != NULL)226*rsize = 0;227228nblk = size / bcache_blksize;229if (nblk == 0 && size != 0)230nblk++;231result = 0;232complete = 1;233234/* Satisfy any cache hits up front, break on first miss */235for (i = 0; i < nblk; i++) {236if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) {237bcache_misses += (nblk - i);238complete = 0;239break;240} else {241bcache_hits++;242}243}244245/*246* Adjust read-ahead size if appropriate. Subject to the requirement247* that bc->ra must stay in between MINREADAHEAD and READAHEAD, we248* increase it when we notice that readahead was useful and decrease249* it when we notice that readahead was not useful.250*/251if (complete || (i == bc->ralen && bc->ralen > 0)) {252if (bc->ra < BCACHE_READAHEAD)253bc->ra <<= 1; /* increase read ahead */254} else {255if (nblk - i > BCACHE_MINREADAHEAD && bc->ralen > 0 &&256bc->ra > BCACHE_MINREADAHEAD)257bc->ra >>= 1; /* reduce read ahead */258}259260/* Adjust our "unconsumed readahead" value. */261if (blk == bc->bcache_nextblkno) {262if (nblk > bc->ralen)263bc->ralen = 0;264else265bc->ralen -= nblk;266}267268if (complete) { /* whole set was in cache, return it */269bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);270goto done;271}272273/*274* Fill in any misses. From check we have i pointing to first missing275* block, read in all remaining blocks + readahead.276* We have space at least for nblk - i before bcache wraps.277*/278p_blk = blk + i;279p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk));280r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */281282p_size = MIN(r_size, nblk - i); /* read at least those blocks */283284/*285* The read ahead size setup.286* While the read ahead can save us IO, it also can complicate things:287* 1. We do not want to read ahead by wrapping around the288* bcache end - this would complicate the cache management.289* 2. We are using bc->ra as dynamic hint for read ahead size,290* detected cache hits will increase the read-ahead block count, and291* misses will decrease, see the code above.292* 3. The bcache is sized by 512B blocks, however, the underlying device293* may have a larger sector size, and we should perform the IO by294* taking into account these larger sector sizes. We could solve this by295* passing the sector size to bcache_allocate(), or by using ioctl(), but296* in this version we are using the constant, 16 blocks, and are rounding297* read ahead block count down to multiple of 16.298* Using the constant has two reasons, we are not entirely sure if the299* BIOS disk interface is providing the correct value for sector size.300* And secondly, this way we get the most conservative setup for the ra.301*302* The selection of multiple of 16 blocks (8KB) is quite arbitrary, however,303* we want to cover CDs (2K) and 4K disks.304* bcache_allocate() will always fall back to a minimum of 32 blocks.305* Our choice of 16 read ahead blocks will always fit inside the bcache.306*/307308if ((rw & F_NORA) == F_NORA)309ra = 0;310else311ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size);312313/*314* Only trigger read-ahead if we detect two blocks being read315* sequentially.316*/317if ((bc->bcache_nextblkno != blk) && ra != 0) {318ra = 0;319}320321if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? */322ra = MIN(bc->ra, ra - 1);323ra = rounddown(ra, 16); /* multiple of 16 blocks */324if (ra + p_size > BCACHE_MAXIOWRA)325ra = BCACHE_MAXIOWRA - p_size;326bc->ralen = ra;327p_size += ra;328} else {329bc->ralen = 0;330}331332/* invalidate bcache */333for (i = 0; i < p_size; i++) {334bcache_invalidate(bc, p_blk + i);335}336337r_size = 0;338/*339* with read-ahead, it may happen we are attempting to read past340* disk end, as bcache has no information about disk size.341* in such case we should get partial read if some blocks can be342* read or error, if no blocks can be read.343* in either case we should return the data in bcache and only344* return error if there is no data.345*/346rw &= F_MASK;347result = dd->dv_strategy(dd->dv_devdata, rw, p_blk,348p_size * bcache_blksize, p_buf, &r_size);349350r_size /= bcache_blksize;351for (i = 0; i < r_size; i++)352bcache_insert(bc, p_blk + i);353354/* update ra statistics */355if (r_size != 0) {356if (r_size < p_size)357bcache_rablks += (p_size - r_size);358else359bcache_rablks += ra;360}361362/* check how much data can we copy */363for (i = 0; i < nblk; i++) {364if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i)))365break;366}367368if (size > i * bcache_blksize)369size = i * bcache_blksize;370371if (size != 0) {372bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size);373result = 0;374}375376done:377if (result == 0) {378if (rsize != NULL)379*rsize = size;380bc->bcache_nextblkno = blk + (size / DEV_BSIZE);381}382return(result);383}384385/*386* Requests larger than 1/2 cache size will be bypassed and go387* directly to the disk. XXX tune this.388*/389int390bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size,391char *buf, size_t *rsize)392{393struct bcache_devdata *dd = (struct bcache_devdata *)devdata;394struct bcache *bc = dd->dv_cache;395u_int bcache_nblks = 0;396int nblk, cblk, ret;397size_t csize, isize, total;398399bcache_ops++;400401if (bc != NULL)402bcache_nblks = bc->bcache_nblks;403404/* bypass large requests, or when the cache is inactive */405if (bc == NULL ||406((size * 2 / bcache_blksize) > bcache_nblks)) {407DPRINTF("bypass %zu from %jd", size / bcache_blksize, blk);408bcache_bypasses++;409rw &= F_MASK;410return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));411}412413switch (rw & F_MASK) {414case F_READ:415nblk = size / bcache_blksize;416if (size != 0 && nblk == 0)417nblk++; /* read at least one block */418419ret = 0;420total = 0;421while(size) {422cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */423cblk = MIN(cblk, nblk);424425if (size <= bcache_blksize)426csize = size;427else428csize = cblk * bcache_blksize;429430ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize);431432/*433* we may have error from read ahead, if we have read some data434* return partial read.435*/436if (ret != 0 || isize == 0) {437if (total != 0)438ret = 0;439break;440}441blk += isize / bcache_blksize;442total += isize;443size -= isize;444nblk = size / bcache_blksize;445}446447if (rsize)448*rsize = total;449450return (ret);451case F_WRITE:452return write_strategy(devdata, F_WRITE, blk, size, buf, rsize);453}454return -1;455}456457/*458* Free allocated bcache instance459*/460static void461bcache_free_instance(struct bcache *bc)462{463if (bc != NULL) {464free(bc->bcache_ctl);465free(bc->bcache_data);466free(bc);467}468}469470/*471* Insert a block into the cache.472*/473static void474bcache_insert(struct bcache *bc, daddr_t blkno)475{476u_int cand;477478cand = BHASH(bc, blkno);479480DPRINTF("insert blk %jd -> %u # %d", blkno, cand, bcache_bcount);481bc->bcache_ctl[cand].bc_blkno = blkno;482bc->bcache_ctl[cand].bc_count = bcache_bcount++;483}484485/*486* Invalidate a block from the cache.487*/488static void489bcache_invalidate(struct bcache *bc, daddr_t blkno)490{491u_int i;492493i = BHASH(bc, blkno);494if (bc->bcache_ctl[i].bc_blkno == blkno) {495bc->bcache_ctl[i].bc_count = -1;496bc->bcache_ctl[i].bc_blkno = -1;497DPRINTF("invalidate blk %ju", blkno);498}499}500501#ifndef BOOT2502COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache);503504static int505command_bcache(int argc, char *argv[] __unused)506{507if (argc != 1) {508command_errmsg = "wrong number of arguments";509return(CMD_ERROR);510}511512printf("\ncache blocks: %u\n", bcache_total_nblks);513printf("cache blocksz: %u\n", bcache_blksize);514printf("cache readahead: %u\n", bcache_rablks);515printf("unit cache blocks: %u\n", bcache_unit_nblks);516printf("cached units: %u\n", bcache_units);517printf("%u ops %d bypasses %u hits %u misses\n", bcache_ops,518bcache_bypasses, bcache_hits, bcache_misses);519return(CMD_OK);520}521#endif522523524