}
/*
- * Returns true, if the given block has the half-dead flag set.
+ * Check that leftsib page (the btpo_prev of target page) is not marked with
+ * INCOMPLETE_SPLIT flag.
+ *
+ * Returning true indicates that page flag is set in leftsib (which is
+ * definitely still the left sibling of target). When that happens, the
+ * target doesn't have a downlink in parent, and the page deletion algorithm
+ * isn't prepared to handle that. Deletion of the target page (or the whole
+ * subtree that contains the target page) cannot take place.
+ */
+static bool
+_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ bool result;
+
+ /* Easy case: No left sibling */
+ if (leftsib == P_NONE)
+ return false;
+
+ buf = _bt_getbuf(rel, leftsib, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If the left sibling was concurrently split, so that its next-pointer
+ * doesn't point to the current page anymore, the split that created
+ * target must be completed. Caller can reasonably expect that there will
+ * be a downlink to the target page that it can relocate using its stack.
+ * (We don't allow splitting an incompletely split page again until the
+ * previous split has been completed.)
+ */
+ result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
+ _bt_relbuf(rel, buf);
+
+ return result;
+}
+
+/*
+ * Check that leafrightsib page (the btpo_next of target leaf page) is not
+ * marked with ISHALFDEAD flag.
+ *
+ * Returning true indicates that page flag is set in leafrightsib, so page
+ * deletion cannot go ahead. Our caller is not prepared to deal with the case
+ * where the parent page does not have a pivot tuples whose downlink points to
+ * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
+ * seem worth going to the trouble of teaching our caller to deal with it.
+ * The situation will be resolved after VACUUM finishes the deletion of the
+ * half-dead page (when a future VACUUM operation reaches the target page
+ * again).
+ *
+ * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
+ * okay because of the restriction on deleting pages that are the rightmost
+ * page of their parent (i.e. that such deletions can only take place when the
+ * entire subtree must be deleted). The leaf level check made here will apply
+ * to a right "cousin" leaf page rather than a simple right sibling leaf page
+ * in cases where caller actually goes on to attempt deleting pages that are
+ * above the leaf page. The right cousin leaf page is representative of the
+ * left edge of the subtree to the right of the to-be-deleted subtree as a
+ * whole, which is exactly the condition that our caller cares about.
+ * (Besides, internal pages are never marked half-dead, so it isn't even
+ * possible to _directly_ assess if an internal page is part of some other
+ * to-be-deleted subtree.)
*/
static bool
-_bt_is_page_halfdead(Relation rel, BlockNumber blk)
+_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
{
Buffer buf;
Page page;
BTPageOpaque opaque;
bool result;
- buf = _bt_getbuf(rel, blk, BT_READ);
+ Assert(leafrightsib != P_NONE);
+
+ buf = _bt_getbuf(rel, leafrightsib, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
result = P_ISHALFDEAD(opaque);
_bt_relbuf(rel, buf);
Buffer pbuf;
Page page;
BTPageOpaque opaque;
- BlockNumber leftsib;
/*
* Locate the downlink of "child" in the parent, updating the stack entry
* If the target is the rightmost child of its parent, then we can't
* delete, unless it's also the only child.
*/
+ Assert(poffset <= maxoff);
if (poffset >= maxoff)
{
/* It's rightmost child... */
if (poffset == P_FIRSTDATAKEY(opaque))
{
+ BlockNumber leftsibparent;
+
/*
* It's only child, so safe if parent would itself be removable.
* We have to check the parent itself, and then recurse to test
*target = parent;
*rightsib = opaque->btpo_next;
- leftsib = opaque->btpo_prev;
+ leftsibparent = opaque->btpo_prev;
_bt_relbuf(rel, pbuf);
/*
- * Like in _bt_pagedel, check that the left sibling is not marked
- * with INCOMPLETE_SPLIT flag. That would mean that there is no
- * downlink to the page to be deleted, and the page deletion
- * algorithm isn't prepared to handle that.
+ * Check that the left sibling of parent (if any) is not marked
+ * with INCOMPLETE_SPLIT flag before proceeding
*/
- if (leftsib != P_NONE)
- {
- Buffer lbuf;
- Page lpage;
- BTPageOpaque lopaque;
-
- lbuf = _bt_getbuf(rel, leftsib, BT_READ);
- lpage = BufferGetPage(lbuf);
- lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
-
- /*
- * If the left sibling was concurrently split, so that its
- * next-pointer doesn't point to the current page anymore, the
- * split that created the current page must be completed. (We
- * don't allow splitting an incompletely split page again
- * until the previous split has been completed)
- */
- if (lopaque->btpo_next == parent &&
- P_INCOMPLETE_SPLIT(lopaque))
- {
- _bt_relbuf(rel, lbuf);
- return false;
- }
- _bt_relbuf(rel, lbuf);
- }
+ if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
+ return false;
return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
topparent, topoff, target, rightsib);
*
* Also, when "stack" is not NULL, we have already checked that the
* current page is not the right half of an incomplete split, i.e. the
- * left sibling does not have its INCOMPLETE_SPLIT flag set.
+ * left sibling does not have its INCOMPLETE_SPLIT flag set, including
+ * when the current target page is to the right of caller's initial page
+ * (the scanblkno page).
*/
BTStack stack = NULL;
* The INCOMPLETE_SPLIT flag on the page tells us if the page is the
* left half of an incomplete split, but ensuring that it's not the
* right half is more complicated. For that, we have to check that
- * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
- * the first iteration, we temporarily release the lock on the current
- * page, and check the left sibling and also construct a search stack
- * to. On subsequent iterations, we know we stepped right from a page
- * that passed these tests, so it's OK.
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
+ * _bt_leftsib_splitflag(). On the first iteration, we temporarily
+ * release the lock on scanblkno/leafbuf, check the left sibling, and
+ * construct a search stack to scanblkno. On subsequent iterations,
+ * we know we stepped right from a page that passed these tests, so
+ * it's OK.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
BTScanInsert itup_key;
ItemId itemid;
IndexTuple targetkey;
+ BlockNumber leftsib, target;
Buffer lbuf;
- BlockNumber leftsib;
itemid = PageGetItemId(page, P_HIKEY);
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
leftsib = opaque->btpo_prev;
+ target = BufferGetBlockNumber(leafbuf);
/*
* To avoid deadlocks, we'd better drop the leaf page lock
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
/*
- * Fetch the left sibling, to check that it's not marked with
- * INCOMPLETE_SPLIT flag. That would mean that the page
- * to-be-deleted doesn't have a downlink, and the page
- * deletion algorithm isn't prepared to handle that.
+ * Check that the left sibling of leafbuf (if any) is not
+ * marked with INCOMPLETE_SPLIT flag before proceeding
*/
- if (leftsib != P_NONE)
+ Assert(target == scanblkno);
+ if (_bt_leftsib_splitflag(rel, leftsib, target))
{
- BTPageOpaque lopaque;
- Page lpage;
-
- lbuf = _bt_getbuf(rel, leftsib, BT_READ);
- lpage = BufferGetPage(lbuf);
- lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
-
- /*
- * If the left sibling is split again by another backend,
- * after we released the lock, we know that the first
- * split must have finished, because we don't allow an
- * incompletely-split page to be split again. So we don't
- * need to walk right here.
- */
- if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) &&
- P_INCOMPLETE_SPLIT(lopaque))
- {
- ReleaseBuffer(leafbuf);
- _bt_relbuf(rel, lbuf);
- return ndeleted;
- }
- _bt_relbuf(rel, lbuf);
+ ReleaseBuffer(leafbuf);
+ return ndeleted;
}
/* we need an insertion scan key for the search, so build one */
/* find the leftmost leaf page with matching pivot/high key */
itup_key->pivotsearch = true;
stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
- /* don't need a lock or second pin on the page */
+ /* won't need a second lock or pin on leafbuf */
_bt_relbuf(rel, lbuf);
/*
* Before attempting to lock the parent page, check that the right sibling
* is not in half-dead state. A half-dead right sibling would have no
* downlink in the parent, which would be highly confusing later when we
- * delete the downlink that follows the current page's downlink. (I
- * believe the deletion would work correctly, but it would fail the
- * cross-check we make that the following downlink points to the right
- * sibling of the delete page.)
+ * delete the downlink that follows the leafbuf page's downlink. It would
+ * fail the "right sibling of target page is also the next child in parent
+ * page" cross-check below.
*/
- if (_bt_is_page_halfdead(rel, leafrightsib))
+ if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
{
elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
leafblkno, leafrightsib);
* be deleted too, and the same condition applies recursively to it. We
* have to check this condition all the way up before trying to delete,
* and lock the final parent of the to-be-deleted subtree.
- *
- * However, we won't need to repeat the above _bt_is_page_halfdead() check
- * for parent/ancestor pages because of the rightmost restriction. The
- * leaf check will apply to a right "cousin" leaf page rather than a
- * simple right sibling leaf page in cases where we actually go on to
- * perform internal page deletion. The right cousin leaf page is
- * representative of the left edge of the subtree to the right of the
- * to-be-deleted subtree as a whole. (Besides, internal pages are never
- * marked half-dead, so it isn't even possible to directly assess if an
- * internal page is part of some other to-be-deleted subtree.)
*/
rightsib = leafrightsib;
target = leafblkno;