/*-1* SPDX-License-Identifier: BSD-3-Clause2*3* Copyright (c) 1990, 1993, 19944* The Regents of the University of California. All rights reserved.5*6* This code is derived from software contributed to Berkeley by7* Mike Olson.8*9* Redistribution and use in source and binary forms, with or without10* modification, are permitted provided that the following conditions11* are met:12* 1. Redistributions of source code must retain the above copyright13* notice, this list of conditions and the following disclaimer.14* 2. Redistributions in binary form must reproduce the above copyright15* notice, this list of conditions and the following disclaimer in the16* documentation and/or other materials provided with the distribution.17* 3. Neither the name of the University nor the names of its contributors18* may be used to endorse or promote products derived from this software19* without specific prior written permission.20*21* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE23* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE24* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE25* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL26* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS27* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)28* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT29* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY30* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF31* SUCH DAMAGE.32*/3334#include <sys/types.h>3536#include <errno.h>37#include <stdio.h>38#include <string.h>3940#include <db.h>41#include "btree.h"4243static int __bt_bdelete(BTREE *, const DBT *);44static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int);45static int __bt_pdelete(BTREE *, PAGE *);46static int __bt_relink(BTREE *, PAGE *);47static int __bt_stkacq(BTREE *, PAGE **, CURSOR *);4849/*50* __bt_delete51* Delete the item(s) referenced by a key.52*53* Return RET_SPECIAL if the key is not found.54*/55int56__bt_delete(const DB *dbp, const DBT *key, u_int flags)57{58BTREE *t;59CURSOR *c;60PAGE *h;61int status;6263t = dbp->internal;6465/* Toss any page pinned across calls. */66if (t->bt_pinned != NULL) {67mpool_put(t->bt_mp, t->bt_pinned, 0);68t->bt_pinned = NULL;69}7071/* Check for change to a read-only tree. */72if (F_ISSET(t, B_RDONLY)) {73errno = EPERM;74return (RET_ERROR);75}7677switch (flags) {78case 0:79status = __bt_bdelete(t, key);80break;81case R_CURSOR:82/*83* If flags is R_CURSOR, delete the cursor. Must already84* have started a scan and not have already deleted it.85*/86c = &t->bt_cursor;87if (F_ISSET(c, CURS_INIT)) {88if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))89return (RET_SPECIAL);90if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)91return (RET_ERROR);9293/*94* If the page is about to be emptied, we'll need to95* delete it, which means we have to acquire a stack.96*/97if (NEXTINDEX(h) == 1)98if (__bt_stkacq(t, &h, &t->bt_cursor))99return (RET_ERROR);100101status = __bt_dleaf(t, NULL, h, c->pg.index);102103if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {104if (__bt_pdelete(t, h))105return (RET_ERROR);106} else107mpool_put(t->bt_mp,108h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);109break;110}111/* FALLTHROUGH */112default:113errno = EINVAL;114return (RET_ERROR);115}116if (status == RET_SUCCESS)117F_SET(t, B_MODIFIED);118return (status);119}120121/*122* __bt_stkacq --123* Acquire a stack so we can delete a cursor entry.124*125* Parameters:126* t: tree127* hp: pointer to current, pinned PAGE pointer128* c: pointer to the cursor129*130* Returns:131* 0 on success, 1 on failure132*/133static int134__bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c)135{136BINTERNAL *bi;137EPG *e;138EPGNO *parent;139PAGE *h;140indx_t idx;141pgno_t pgno;142recno_t nextpg, prevpg;143int exact, level;144145/*146* Find the first occurrence of the key in the tree. Toss the147* currently locked page so we don't hit an already-locked page.148*/149h = *hp;150mpool_put(t->bt_mp, h, 0);151if ((e = __bt_search(t, &c->key, &exact)) == NULL)152return (1);153h = e->page;154155/* See if we got it in one shot. */156if (h->pgno == c->pg.pgno)157goto ret;158159/*160* Move right, looking for the page. At each move we have to move161* up the stack until we don't have to move to the next page. If162* we have to change pages at an internal level, we have to fix the163* stack back up.164*/165while (h->pgno != c->pg.pgno) {166if ((nextpg = h->nextpg) == P_INVALID)167break;168mpool_put(t->bt_mp, h, 0);169170/* Move up the stack. */171for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {172/* Get the parent page. */173if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)174return (1);175176/* Move to the next index. */177if (parent->index != NEXTINDEX(h) - 1) {178idx = parent->index + 1;179BT_PUSH(t, h->pgno, idx);180break;181}182mpool_put(t->bt_mp, h, 0);183}184185/* Restore the stack. */186while (level--) {187/* Push the next level down onto the stack. */188bi = GETBINTERNAL(h, idx);189pgno = bi->pgno;190BT_PUSH(t, pgno, 0);191192/* Lose the currently pinned page. */193mpool_put(t->bt_mp, h, 0);194195/* Get the next level down. */196if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)197return (1);198idx = 0;199}200mpool_put(t->bt_mp, h, 0);201if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)202return (1);203}204205if (h->pgno == c->pg.pgno)206goto ret;207208/* Reacquire the original stack. */209mpool_put(t->bt_mp, h, 0);210if ((e = __bt_search(t, &c->key, &exact)) == NULL)211return (1);212h = e->page;213214/*215* Move left, looking for the page. At each move we have to move216* up the stack until we don't have to change pages to move to the217* next page. If we have to change pages at an internal level, we218* have to fix the stack back up.219*/220while (h->pgno != c->pg.pgno) {221if ((prevpg = h->prevpg) == P_INVALID)222break;223mpool_put(t->bt_mp, h, 0);224225/* Move up the stack. */226for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {227/* Get the parent page. */228if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)229return (1);230231/* Move to the next index. */232if (parent->index != 0) {233idx = parent->index - 1;234BT_PUSH(t, h->pgno, idx);235break;236}237mpool_put(t->bt_mp, h, 0);238}239240/* Restore the stack. */241while (level--) {242/* Push the next level down onto the stack. */243bi = GETBINTERNAL(h, idx);244pgno = bi->pgno;245246/* Lose the currently pinned page. */247mpool_put(t->bt_mp, h, 0);248249/* Get the next level down. */250if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)251return (1);252253idx = NEXTINDEX(h) - 1;254BT_PUSH(t, pgno, idx);255}256mpool_put(t->bt_mp, h, 0);257if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)258return (1);259}260261262ret: mpool_put(t->bt_mp, h, 0);263return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);264}265266/*267* __bt_bdelete --268* Delete all key/data pairs matching the specified key.269*270* Parameters:271* t: tree272* key: key to delete273*274* Returns:275* RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.276*/277static int278__bt_bdelete(BTREE *t, const DBT *key)279{280EPG *e;281PAGE *h;282int deleted, exact, redo;283284deleted = 0;285286/* Find any matching record; __bt_search pins the page. */287loop: if ((e = __bt_search(t, key, &exact)) == NULL)288return (deleted ? RET_SUCCESS : RET_ERROR);289if (!exact) {290mpool_put(t->bt_mp, e->page, 0);291return (deleted ? RET_SUCCESS : RET_SPECIAL);292}293294/*295* Delete forward, then delete backward, from the found key. If296* there are duplicates and we reach either side of the page, do297* the key search again, so that we get them all.298*/299redo = 0;300h = e->page;301do {302if (__bt_dleaf(t, key, h, e->index)) {303mpool_put(t->bt_mp, h, 0);304return (RET_ERROR);305}306if (F_ISSET(t, B_NODUPS)) {307if (NEXTINDEX(h) == 0) {308if (__bt_pdelete(t, h))309return (RET_ERROR);310} else311mpool_put(t->bt_mp, h, MPOOL_DIRTY);312return (RET_SUCCESS);313}314deleted = 1;315} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);316317/* Check for right-hand edge of the page. */318if (e->index == NEXTINDEX(h))319redo = 1;320321/* Delete from the key to the beginning of the page. */322while (e->index-- > 0) {323if (__bt_cmp(t, key, e) != 0)324break;325if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {326mpool_put(t->bt_mp, h, 0);327return (RET_ERROR);328}329if (e->index == 0)330redo = 1;331}332333/* Check for an empty page. */334if (NEXTINDEX(h) == 0) {335if (__bt_pdelete(t, h))336return (RET_ERROR);337goto loop;338}339340/* Put the page. */341mpool_put(t->bt_mp, h, MPOOL_DIRTY);342343if (redo)344goto loop;345return (RET_SUCCESS);346}347348/*349* __bt_pdelete --350* Delete a single page from the tree.351*352* Parameters:353* t: tree354* h: leaf page355*356* Returns:357* RET_SUCCESS, RET_ERROR.358*359* Side-effects:360* mpool_put's the page361*/362static int363__bt_pdelete(BTREE *t, PAGE *h)364{365BINTERNAL *bi;366PAGE *pg;367EPGNO *parent;368indx_t cnt, idx, *ip, offset;369u_int32_t nksize;370char *from;371372/*373* Walk the parent page stack -- a LIFO stack of the pages that were374* traversed when we searched for the page where the delete occurred.375* Each stack entry is a page number and a page index offset. The376* offset is for the page traversed on the search. We've just deleted377* a page, so we have to delete the key from the parent page.378*379* If the delete from the parent page makes it empty, this process may380* continue all the way up the tree. We stop if we reach the root page381* (which is never deleted, it's just not worth the effort) or if the382* delete does not empty the page.383*/384while ((parent = BT_POP(t)) != NULL) {385/* Get the parent page. */386if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)387return (RET_ERROR);388389idx = parent->index;390bi = GETBINTERNAL(pg, idx);391392/* Free any overflow pages. */393if (bi->flags & P_BIGKEY &&394__ovfl_delete(t, bi->bytes) == RET_ERROR) {395mpool_put(t->bt_mp, pg, 0);396return (RET_ERROR);397}398399/*400* Free the parent if it has only the one key and it's not the401* root page. If it's the rootpage, turn it back into an empty402* leaf page.403*/404if (NEXTINDEX(pg) == 1) {405if (pg->pgno == P_ROOT) {406pg->lower = BTDATAOFF;407pg->upper = t->bt_psize;408pg->flags = P_BLEAF;409} else {410if (__bt_relink(t, pg) || __bt_free(t, pg))411return (RET_ERROR);412continue;413}414} else {415/* Pack remaining key items at the end of the page. */416nksize = NBINTERNAL(bi->ksize);417from = (char *)pg + pg->upper;418memmove(from + nksize, from, (char *)bi - from);419pg->upper += nksize;420421/* Adjust indices' offsets, shift the indices down. */422offset = pg->linp[idx];423for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip)424if (ip[0] < offset)425ip[0] += nksize;426for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip)427ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];428pg->lower -= sizeof(indx_t);429}430431mpool_put(t->bt_mp, pg, MPOOL_DIRTY);432break;433}434435/* Free the leaf page, as long as it wasn't the root. */436if (h->pgno == P_ROOT) {437mpool_put(t->bt_mp, h, MPOOL_DIRTY);438return (RET_SUCCESS);439}440return (__bt_relink(t, h) || __bt_free(t, h));441}442443/*444* __bt_dleaf --445* Delete a single record from a leaf page.446*447* Parameters:448* t: tree449* key: referenced key450* h: page451* idx: index on page to delete452*453* Returns:454* RET_SUCCESS, RET_ERROR.455*/456int457__bt_dleaf(BTREE *t, const DBT *key, PAGE *h, u_int idx)458{459BLEAF *bl;460indx_t cnt, *ip, offset;461u_int32_t nbytes;462void *to;463char *from;464465/* If this record is referenced by the cursor, delete the cursor. */466if (F_ISSET(&t->bt_cursor, CURS_INIT) &&467!F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&468t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx &&469__bt_curdel(t, key, h, idx))470return (RET_ERROR);471472/* If the entry uses overflow pages, make them available for reuse. */473to = bl = GETBLEAF(h, idx);474if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)475return (RET_ERROR);476if (bl->flags & P_BIGDATA &&477__ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)478return (RET_ERROR);479480/* Pack the remaining key/data items at the end of the page. */481nbytes = NBLEAF(bl);482from = (char *)h + h->upper;483memmove(from + nbytes, from, (char *)to - from);484h->upper += nbytes;485486/* Adjust the indices' offsets, shift the indices down. */487offset = h->linp[idx];488for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip)489if (ip[0] < offset)490ip[0] += nbytes;491for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip)492ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];493h->lower -= sizeof(indx_t);494495/* If the cursor is on this page, adjust it as necessary. */496if (F_ISSET(&t->bt_cursor, CURS_INIT) &&497!F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&498t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx)499--t->bt_cursor.pg.index;500501return (RET_SUCCESS);502}503504/*505* __bt_curdel --506* Delete the cursor.507*508* Parameters:509* t: tree510* key: referenced key (or NULL)511* h: page512* idx: index on page to delete513*514* Returns:515* RET_SUCCESS, RET_ERROR.516*/517static int518__bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx)519{520CURSOR *c;521EPG e;522PAGE *pg;523int curcopy, status;524525/*526* If there are duplicates, move forward or backward to one.527* Otherwise, copy the key into the cursor area.528*/529c = &t->bt_cursor;530F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);531532curcopy = 0;533if (!F_ISSET(t, B_NODUPS)) {534/*535* We're going to have to do comparisons. If we weren't536* provided a copy of the key, i.e. the user is deleting537* the current cursor position, get one.538*/539if (key == NULL) {540e.page = h;541e.index = idx;542if ((status = __bt_ret(t, &e,543&c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)544return (status);545curcopy = 1;546key = &c->key;547}548/* Check previous key, if not at the beginning of the page. */549if (idx > 0) {550e.page = h;551e.index = idx - 1;552if (__bt_cmp(t, key, &e) == 0) {553F_SET(c, CURS_BEFORE);554goto dup2;555}556}557/* Check next key, if not at the end of the page. */558if (idx < NEXTINDEX(h) - 1) {559e.page = h;560e.index = idx + 1;561if (__bt_cmp(t, key, &e) == 0) {562F_SET(c, CURS_AFTER);563goto dup2;564}565}566/* Check previous key if at the beginning of the page. */567if (idx == 0 && h->prevpg != P_INVALID) {568if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)569return (RET_ERROR);570e.page = pg;571e.index = NEXTINDEX(pg) - 1;572if (__bt_cmp(t, key, &e) == 0) {573F_SET(c, CURS_BEFORE);574goto dup1;575}576mpool_put(t->bt_mp, pg, 0);577}578/* Check next key if at the end of the page. */579if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {580if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)581return (RET_ERROR);582e.page = pg;583e.index = 0;584if (__bt_cmp(t, key, &e) == 0) {585F_SET(c, CURS_AFTER);586dup1: mpool_put(t->bt_mp, pg, 0);587dup2: c->pg.pgno = e.page->pgno;588c->pg.index = e.index;589return (RET_SUCCESS);590}591mpool_put(t->bt_mp, pg, 0);592}593}594e.page = h;595e.index = idx;596if (curcopy || (status =597__bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {598F_SET(c, CURS_ACQUIRE);599return (RET_SUCCESS);600}601return (status);602}603604/*605* __bt_relink --606* Link around a deleted page.607*608* Parameters:609* t: tree610* h: page to be deleted611*/612static int613__bt_relink(BTREE *t, PAGE *h)614{615PAGE *pg;616617if (h->nextpg != P_INVALID) {618if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)619return (RET_ERROR);620pg->prevpg = h->prevpg;621mpool_put(t->bt_mp, pg, MPOOL_DIRTY);622}623if (h->prevpg != P_INVALID) {624if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)625return (RET_ERROR);626pg->nextpg = h->nextpg;627mpool_put(t->bt_mp, pg, MPOOL_DIRTY);628}629return (0);630}631632633