-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.12 2006/05/08 00:00:09 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.13 2006/07/25 19:13:00 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
possible that an indexscan will return items that are no longer stored on
the page it has a pin on, but rather somewhere to the right of that page.
To ensure that VACUUM can't prematurely remove such heap tuples, we require
-btbulkdelete to obtain super-exclusive lock on every leaf page in the index
-(even pages that don't contain any deletable tuples). This guarantees that
+btbulkdelete to obtain super-exclusive lock on every leaf page in the index,
+even pages that don't contain any deletable tuples. This guarantees that
the btbulkdelete call cannot return while any indexscan is still holding
a copy of a deleted index tuple. Note that this requirement does not say
-that btbulkdelete must visit the pages in any particular order.
+that btbulkdelete must visit the pages in any particular order. (See also
+on-the-fly deletion, below.)
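
For reference, the "super-exclusive" lock required of btbulkdelete is the
buffer cleanup lock: an exclusive lock that is granted only once no other
backend holds a pin on the page.  A minimal sketch using the existing
bufmgr calls (the helper name is invented for illustration):

    /* assumes storage/bufmgr.h and utils/rel.h */
    static Buffer
    _bt_getbuf_superexclusive(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);

        /* exclusive-lock the page, waiting until we hold the only pin */
        LockBufferForCleanup(buf);
        return buf;
    }
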
There is no such interlocking for deletion of items in internal pages,
since backends keep neither lock nor pin on a page they have descended past.
possible to implement the test with a small counter value stored on each
index page.
+On-the-fly deletion of index tuples
+-----------------------------------
+
+If a process visits a heap tuple and finds that it's dead and removable
+(ie, dead to all open transactions, not only that process), then we can
+return to the index and mark the corresponding index entry "known dead",
+allowing subsequent index scans to skip visiting the heap tuple. The
+"known dead" marking uses the LP_DELETE bit in ItemIds. This is currently
+only done in plain indexscans, not bitmap scans, because only plain scans
+visit the heap and index "in sync" and so there's not a convenient way
+to do it for bitmap scans.
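
A minimal sketch of the marking step (the real code is in _bt_killitems
and _bt_check_unique; the helper name here is illustrative only):

    static void
    _bt_mark_entry_known_dead(Buffer buf, OffsetNumber offnum)
    {
        Page            page = BufferGetPage(buf);
        BTPageOpaque    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        ItemId          itemid = PageGetItemId(page, offnum);

        /* hint-style update: a share lock suffices, and no WAL record is written */
        itemid->lp_flags |= LP_DELETE;
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(buf);
    }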
+
+Once an index tuple has been marked LP_DELETE it can actually be removed
+from the index immediately; since index scans only stop "between" pages,
+no scan can lose its place from such a deletion. We separate the steps
+because we allow LP_DELETE to be set with only a share lock (it's exactly
+like a hint bit for a heap tuple), but physically removing tuples requires
+exclusive lock. In the current code we try to remove LP_DELETE tuples when
+we are otherwise faced with having to split a page to do an insertion (and
+hence have exclusive lock on it already).
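
In outline, the insertion code now does the following when the target leaf
page looks too full (rel, buf, page, lpageop, and itemsz are the insertion
code's locals; the full loop is in the nbtinsert.c hunk later in this patch):

    if (PageGetFreeSpace(page) < itemsz &&
        P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
    {
        /* we already hold exclusive lock, so try reclaiming space first */
        _bt_vacuum_one_page(rel, buf);
        /* if enough space was freed, insert here instead of splitting */
    }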
+
+This leaves the index in a state where it has no entry for a dead tuple
+that still exists in the heap. This is not a problem for the current
+implementation of VACUUM, but it could be a problem for anything that
+explicitly tries to find index entries for dead tuples. (However, the
+same situation is created by REINDEX, since it doesn't enter dead
+tuples into the index.)
+
+It's sufficient to have an exclusive lock on the index page, not a
+super-exclusive lock, to do deletion of LP_DELETE items. It might seem
+that this breaks the interlock between VACUUM and indexscans, but that is
+not so: as long as an indexscanning process has a pin on the page where
+the index item used to be, VACUUM cannot complete its btbulkdelete scan
+and so cannot remove the heap tuple. This is another reason why
+btbulkdelete has to get super-exclusive lock on every leaf page, not only
+the ones where it actually sees items to delete.
+
WAL considerations
------------------
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.141 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.142 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
OffsetNumber itup_off, const char *where);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
+static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
/*
hbuffer) == HEAPTUPLE_DEAD)
{
curitemid->lp_flags |= LP_DELETE;
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+ /* be sure to mark the proper buffer dirty... */
if (nbuf != InvalidBuffer)
SetBufferCommitInfoNeedsSave(nbuf);
else
*/
bool movedright = false;
- while (PageGetFreeSpace(page) < itemsz &&
- !P_RIGHTMOST(lpageop) &&
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
- random() > (MAX_RANDOM_VALUE / 100))
+ while (PageGetFreeSpace(page) < itemsz)
{
+ Buffer rbuf;
+
+ /*
+ * before considering moving right, see if we can obtain enough
+ * space by erasing LP_DELETE items
+ */
+ if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, buf);
+ if (PageGetFreeSpace(page) >= itemsz)
+ break; /* OK, now we have enough space */
+ }
+
+ /*
+ * nope, so check conditions (b) and (c) enumerated above
+ */
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
/*
* step right to next non-dead page
*
* pages won't do because we don't know when they will get
* de-linked from the tree.
*/
- Buffer rbuf = InvalidBuffer;
+ rbuf = InvalidBuffer;
for (;;)
{
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
/* if we're splitting this page, it won't be the root when we're done */
- /* also, clear the SPLIT_END flag in both pages */
+ /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
lopaque->btpo_flags = oopaque->btpo_flags;
- lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END);
+ lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_flags = lopaque->btpo_flags;
lopaque->btpo_prev = oopaque->btpo_prev;
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
/* if we get here, the keys are equal */
return true;
}
+
+/*
+ * _bt_vacuum_one_page - vacuum just one index page.
+ *
+ * Try to remove LP_DELETE items from the given page. The passed buffer
+ * must be exclusive-locked, but unlike a real VACUUM, we don't need a
+ * super-exclusive "cleanup" lock (see nbtree/README).
+ */
+static void
+_bt_vacuum_one_page(Relation rel, Buffer buffer)
+{
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buffer);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Scan over all items to see which ones need to be deleted
+ * according to LP_DELETE flags.
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdDeleted(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ _bt_delitems(rel, buffer, deletable, ndeletable);
+
+ /*
+ * Note: if we didn't find any LP_DELETE items, then the page's
+ * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother
+ * expending a separate write to clear it, however. We will clear
+ * it when we split the page.
+ */
+}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.98 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.99 2006/07/25 19:13:00 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_cycleid = 0;
+ /*
+ * Mark the page as not containing any LP_DELETE items. This is not
+ * necessarily true (there might be some that have recently been marked,
+ * but weren't included in our target-item list), but it will almost
+ * always be true and it doesn't seem worth an additional page scan
+ * to check it. Remember that BTP_HAS_GARBAGE is only a hint anyway.
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
MarkBufferDirty(buf);
/* XLOG stuff */
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.77 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.78 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
* buffer dirty but don't make a WAL log entry.
+ *
+ * Whenever we mark anything LP_DELETEd, we also set the page's
+ * BTP_HAS_GARBAGE flag, which is likewise just a hint.
*/
if (killedsomething)
+ {
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
SetBufferCommitInfoNeedsSave(so->currPos.buf);
+ }
if (!haveLock)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.35 2006/07/14 14:52:17 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.36 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
Relation reln;
Buffer buffer;
Page page;
+ BTPageOpaque opaque;
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
PageIndexMultiDelete(page, unused, unend - unused);
}
+ /*
+ * Mark the page as not containing any LP_DELETE items --- see comments
+ * in _bt_delitems().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.101 2006/07/11 21:05:57 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.102 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#define BTP_META (1 << 3) /* meta-page */
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
+#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DELETEd tuples */
/*
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
+#define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost