-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.12 2006/05/08 00:00:09 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.13 2006/07/25 19:13:00 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
possible that an indexscan will return items that are no longer stored on
the page it has a pin on, but rather somewhere to the right of that page.
To ensure that VACUUM can't prematurely remove such heap tuples, we require
-btbulkdelete to obtain super-exclusive lock on every leaf page in the index
-(even pages that don't contain any deletable tuples). This guarantees that
+btbulkdelete to obtain super-exclusive lock on every leaf page in the index,
+even pages that don't contain any deletable tuples. This guarantees that
the btbulkdelete call cannot return while any indexscan is still holding
a copy of a deleted index tuple. Note that this requirement does not say
-that btbulkdelete must visit the pages in any particular order.
+that btbulkdelete must visit the pages in any particular order. (See also
+on-the-fly deletion, below.)
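
For reference, the "super-exclusive" lock required of btbulkdelete is the
buffer cleanup lock: an exclusive lock that is granted only once no other
backend holds a pin on the page.  A minimal sketch using the existing
bufmgr calls (the helper name is invented for illustration):

    /* assumes storage/bufmgr.h and utils/rel.h */
    static Buffer
    _bt_getbuf_superexclusive(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);

        /* exclusive-lock the page, waiting until we hold the only pin */
        LockBufferForCleanup(buf);
        return buf;
    }
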
There is no such interlocking for deletion of items in internal pages,
since backends keep neither lock nor pin on a page they have descended past.
possible to implement the test with a small counter value stored on each
index page.
+On-the-fly deletion of index tuples
+-----------------------------------
+
+If a process visits a heap tuple and finds that it's dead and removable
+(ie, dead to all open transactions, not only that process), then we can
+return to the index and mark the corresponding index entry "known dead",
+allowing subsequent index scans to skip visiting the heap tuple. The
+"known dead" marking uses the LP_DELETE bit in ItemIds. This is currently
+only done in plain indexscans, not bitmap scans, because only plain scans
+visit the heap and index "in sync" and so there's not a convenient way
+to do it for bitmap scans.
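
A minimal sketch of the marking step (the real code is in _bt_killitems
and _bt_check_unique; the helper name here is illustrative only):

    static void
    _bt_mark_entry_known_dead(Buffer buf, OffsetNumber offnum)
    {
        Page            page = BufferGetPage(buf);
        BTPageOpaque    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        ItemId          itemid = PageGetItemId(page, offnum);

        /* hint-style update: a share lock suffices, and no WAL record is written */
        itemid->lp_flags |= LP_DELETE;
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(buf);
    }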
+
+Once an index tuple has been marked LP_DELETE it can actually be removed
+from the index immediately; since index scans only stop "between" pages,
+no scan can lose its place from such a deletion. We separate the steps
+because we allow LP_DELETE to be set with only a share lock (it's exactly
+like a hint bit for a heap tuple), but physically removing tuples requires
+exclusive lock. In the current code we try to remove LP_DELETE tuples when
+we are otherwise faced with having to split a page to do an insertion (and
+hence have exclusive lock on it already).
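
In outline, the insertion code now does the following when the target leaf
page looks too full (rel, buf, page, lpageop, and itemsz are the insertion
code's locals; the full loop is in the nbtinsert.c hunk later in this patch):

    if (PageGetFreeSpace(page) < itemsz &&
        P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
    {
        /* we already hold exclusive lock, so try reclaiming space first */
        _bt_vacuum_one_page(rel, buf);
        /* if enough space was freed, insert here instead of splitting */
    }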
+
+This leaves the index in a state where it has no entry for a dead tuple
+that still exists in the heap. This is not a problem for the current
+implementation of VACUUM, but it could be a problem for anything that
+explicitly tries to find index entries for dead tuples. (However, the
+same situation is created by REINDEX, since it doesn't enter dead
+tuples into the index.)
+
+It's sufficient to have an exclusive lock on the index page, not a
+super-exclusive lock, to do deletion of LP_DELETE items. It might seem
+that this breaks the interlock between VACUUM and indexscans, but that is
+not so: as long as an indexscanning process has a pin on the page where
+the index item used to be, VACUUM cannot complete its btbulkdelete scan
+and so cannot remove the heap tuple. This is another reason why
+btbulkdelete has to get super-exclusive lock on every leaf page, not only
+the ones where it actually sees items to delete.
+
WAL considerations
------------------
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.141 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.142 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
OffsetNumber itup_off, const char *where);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
+static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
/*
hbuffer) == HEAPTUPLE_DEAD)
{
curitemid->lp_flags |= LP_DELETE;
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+ /* be sure to mark the proper buffer dirty... */
if (nbuf != InvalidBuffer)
SetBufferCommitInfoNeedsSave(nbuf);
else
*/
bool movedright = false;
- while (PageGetFreeSpace(page) < itemsz &&
- !P_RIGHTMOST(lpageop) &&
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
- random() > (MAX_RANDOM_VALUE / 100))
+ while (PageGetFreeSpace(page) < itemsz)
{
+ Buffer rbuf;
+
+ /*
+ * before considering moving right, see if we can obtain enough
+ * space by erasing LP_DELETE items
+ */
+ if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, buf);
+ if (PageGetFreeSpace(page) >= itemsz)
+ break; /* OK, now we have enough space */
+ }
+
+ /*
+ * nope, so check conditions (b) and (c) enumerated above
+ */
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
/*
* step right to next non-dead page
*
* pages won't do because we don't know when they will get
* de-linked from the tree.
*/
- Buffer rbuf = InvalidBuffer;
+ rbuf = InvalidBuffer;
for (;;)
{
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
/* if we're splitting this page, it won't be the root when we're done */
- /* also, clear the SPLIT_END flag in both pages */
+ /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
lopaque->btpo_flags = oopaque->btpo_flags;
- lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END);
+ lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_flags = lopaque->btpo_flags;
lopaque->btpo_prev = oopaque->btpo_prev;
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
/* if we get here, the keys are equal */
return true;
}
+
+/*
+ * _bt_vacuum_one_page - vacuum just one index page.
+ *
+ * Try to remove LP_DELETE items from the given page. The passed buffer
+ * must be exclusive-locked, but unlike a real VACUUM, we don't need a
+ * super-exclusive "cleanup" lock (see nbtree/README).
+ */
+static void
+_bt_vacuum_one_page(Relation rel, Buffer buffer)
+{
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buffer);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Scan over all items to see which ones need to be deleted
+ * according to LP_DELETE flags.
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdDeleted(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ _bt_delitems(rel, buffer, deletable, ndeletable);
+
+ /*
+ * Note: if we didn't find any LP_DELETE items, then the page's
+ * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother
+ * expending a separate write to clear it, however. We will clear
+ * it when we split the page.
+ */
+}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.98 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.99 2006/07/25 19:13:00 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_cycleid = 0;
+ /*
+ * Mark the page as not containing any LP_DELETE items. This is not
+ * necessarily true (there might be some that have recently been marked,
+ * but weren't included in our target-item list), but it will almost
+ * always be true and it doesn't seem worth an additional page scan
+ * to check it. Remember that BTP_HAS_GARBAGE is only a hint anyway.
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
MarkBufferDirty(buf);
/* XLOG stuff */
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.77 2006/07/13 16:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.78 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
* buffer dirty but don't make a WAL log entry.
+ *
+ * Whenever we mark anything LP_DELETEd, we also set the page's
+ * BTP_HAS_GARBAGE flag, which is likewise just a hint.
*/
if (killedsomething)
+ {
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
SetBufferCommitInfoNeedsSave(so->currPos.buf);
+ }
if (!haveLock)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.35 2006/07/14 14:52:17 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.36 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
Relation reln;
Buffer buffer;
Page page;
+ BTPageOpaque opaque;
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
PageIndexMultiDelete(page, unused, unend - unused);
}
+ /*
+ * Mark the page as not containing any LP_DELETE items --- see comments
+ * in _bt_delitems().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.101 2006/07/11 21:05:57 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.102 2006/07/25 19:13:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#define BTP_META (1 << 3) /* meta-page */
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
+#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DELETEd tuples */
/*
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
+#define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost