Skip full index scan during cleanup of B-tree indexes when possible
authorTeodor Sigaev <teodor@sigaev.ru>
Wed, 4 Apr 2018 16:29:00 +0000 (19:29 +0300)
committerTeodor Sigaev <teodor@sigaev.ru>
Wed, 4 Apr 2018 16:29:00 +0000 (19:29 +0300)
Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, autovacuum isn't intended to touch this table. However,
the user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such a table (because no heap tuples were deleted); only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: put recyclable pages into the free space map and update index
statistics.

This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition, we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then the statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both a reloption and a GUC (default).

This patch bumps B-tree meta-page version. Upgrade of meta-page is performed
"on the fly": during VACUUM meta-page is rewritten with new version. No special
handling in pg_upgrade is required.

Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com

23 files changed:
contrib/amcheck/verify_nbtree.c
contrib/pageinspect/Makefile
contrib/pageinspect/btreefuncs.c
contrib/pageinspect/expected/btree.out
contrib/pageinspect/pageinspect--1.6--1.7.sql [new file with mode: 0644]
contrib/pageinspect/pageinspect.control
contrib/pgstattuple/expected/pgstattuple.out
doc/src/sgml/config.sgml
doc/src/sgml/pageinspect.sgml
doc/src/sgml/ref/create_index.sgml
src/backend/access/common/reloptions.c
src/backend/access/nbtree/nbtinsert.c
src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtree.c
src/backend/access/nbtree/nbtxlog.c
src/backend/utils/init/globals.c
src/backend/utils/misc/guc.c
src/include/access/nbtree.h
src/include/access/nbtxlog.h
src/include/miscadmin.h
src/include/utils/rel.h
src/test/regress/expected/btree_index.out
src/test/regress/sql/btree_index.sql

index a15fe21933b9a5b8baefedaa8f38e517d6c91877..52aa633056b75dd929ac758493ebbca026e2e0c0 100644 (file)
@@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
                     errmsg("index \"%s\" meta page is corrupt",
                            RelationGetRelationName(state->rel))));
 
-       if (metad->btm_version != BTREE_VERSION)
+       if (metad->btm_version < BTREE_MIN_VERSION ||
+           metad->btm_version > BTREE_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
-                    errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+                    errmsg("version mismatch in index \"%s\": file version %d, "
+                           "current version %d, minimal supported version %d",
                            RelationGetRelationName(state->rel),
-                           metad->btm_version, BTREE_VERSION)));
+                           metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
    }
 
    /*
index 0a3cbeeb108ce3cb4ad6509c7db0b8395b905e9e..e5a581f141be7cb1db65dda1ca62bdbc885fdbfd 100644 (file)
@@ -5,7 +5,8 @@ OBJS        = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
          brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)
 
 EXTENSION = pageinspect
-DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
+DATA =  pageinspect--1.6--1.7.sql \
+   pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
    pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
    pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
    pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql
index 4f834676ea297f1ba4e2c5a8fde10382e4edc0e9..5133653791952a8e93b35a202352d20c3aac864d 100644 (file)
@@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS)
    BTMetaPageData *metad;
    TupleDesc   tupleDesc;
    int         j;
-   char       *values[6];
+   char       *values[8];
    Buffer      buffer;
    Page        page;
    HeapTuple   tuple;
@@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS)
    values[j++] = psprintf("%d", metad->btm_level);
    values[j++] = psprintf("%d", metad->btm_fastroot);
    values[j++] = psprintf("%d", metad->btm_fastlevel);
+   values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+   values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples);
 
    tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
                                   values);
index 67b103add3fef7c2dbb952716cbaa8d41bfacc1e..2aaa4df53b148919b8962057150c760714e9b954 100644 (file)
@@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
 CREATE INDEX test1_a_idx ON test1 USING btree (a);
 \x
 SELECT * FROM bt_metap('test1_a_idx');
--[ RECORD 1 ]-----
-magic     | 340322
-version   | 2
-root      | 1
-level     | 0
-fastroot  | 1
-fastlevel | 0
+-[ RECORD 1 ]-----------+-------
+magic                   | 340322
+version                 | 3
+root                    | 1
+level                   | 0
+fastroot                | 1
+fastlevel               | 0
+oldest_xact             | 0
+last_cleanup_num_tuples | -1
 
 SELECT * FROM bt_page_stats('test1_a_idx', 0);
 ERROR:  block 0 is a meta page
diff --git a/contrib/pageinspect/pageinspect--1.6--1.7.sql b/contrib/pageinspect/pageinspect--1.6--1.7.sql
new file mode 100644 (file)
index 0000000..2433a21
--- /dev/null
@@ -0,0 +1,26 @@
+/* contrib/pageinspect/pageinspect--1.6--1.7.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. \quit
+
+-- bt_metap(): re-created with two extra metapage columns.  oldest_xact is
+-- int8 because the C side prints an unsigned 32-bit xid ("%u"), which can
+-- exceed int4; last_cleanup_num_tuples is float8 like the value it reports.
+DROP FUNCTION bt_metap(IN relname text,
+    OUT magic int4,
+    OUT version int4,
+    OUT root int4,
+    OUT level int4,
+    OUT fastroot int4,
+    OUT fastlevel int4);
+CREATE FUNCTION bt_metap(IN relname text,
+    OUT magic int4,
+    OUT version int4,
+    OUT root int4,
+    OUT level int4,
+    OUT fastroot int4,
+    OUT fastlevel int4,
+    OUT oldest_xact int8,
+    OUT last_cleanup_num_tuples float8)
+AS 'MODULE_PATHNAME', 'bt_metap'
+LANGUAGE C STRICT PARALLEL SAFE;
index 1a61c9f5ad31b80b399f6779a0a729a5b56c16b9..dcfc61f22dc57cb5c72245b5e116074d8d3faf8e 100644 (file)
@@ -1,5 +1,5 @@
 # pageinspect extension
 comment = 'inspect the contents of database pages at a low level'
-default_version = '1.6'
+default_version = '1.7'
 module_pathname = '$libdir/pageinspect'
 relocatable = true
index 20b5585d03a2db6e6b8bd10ce42a979a3d038901..a7087f6d4573802a0923131f5b96aabb3a037327 100644 (file)
@@ -48,7 +48,7 @@ select version, tree_level,
     from pgstatindex('test_pkey');
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       2 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -58,7 +58,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::text);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       2 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -68,7 +68,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::name);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       2 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -78,7 +78,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::regclass);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       2 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select pg_relpages('test');
@@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
 select pgstatindex('test_partition_idx');
          pgstatindex          
 ------------------------------
- (2,0,8192,0,0,0,0,0,NaN,NaN)
+ (3,0,8192,0,0,0,0,0,NaN,NaN)
 (1 row)
 
 select pgstathashindex('test_partition_hash_idx');
index e7d408824e2a9adcb57b15e1b9bbab389d1cc66c..a189a8efc3fe52d99a06b44bd348318ea60413d7 100644 (file)
@@ -1882,6 +1882,31 @@ include_dir 'conf.d'
      </note>
     </sect2>
 
+    <sect2 id="runtime-config-index-vacuum">
+     <title>Index Vacuum</title>
+     <variablelist>
+     <varlistentry id="guc-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
+      <term><varname>vacuum_cleanup_index_scale_factor</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>vacuum_cleanup_index_scale_factor</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When no tuples were deleted from the heap, B-tree indexes might still
+        be scanned during the <command>VACUUM</command> cleanup stage for two
+        reasons.  The first reason is that the B-tree index contains deleted pages
+        which can be recycled during cleanup.  The second reason is that the B-tree
+        index statistics are stale.  Index statistics are considered stale when
+        the number of tuples inserted since the previous statistics collection
+        is greater than the <varname>vacuum_cleanup_index_scale_factor</varname>
+        fraction of the total number of heap tuples.
+       </para>
+      </listitem>
+     </varlistentry>
+     </variablelist>
+    </sect2>
+
     <sect2 id="runtime-config-resource-background-writer">
      <title>Background Writer</title>
 
index 23570af4bf82c1dd7c98e3ba8b10fa6100818154..4d5da186bb41303af79482c2c2e4ce3e40dd53b9 100644 (file)
@@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
       index's metapage.  For example:
 <screen>
 test=# SELECT * FROM bt_metap('pg_cast_oid_index');
--[ RECORD 1 ]-----
-magic     | 340322
-version   | 2
-root      | 1
-level     | 0
-fastroot  | 1
-fastlevel | 0
+-[ RECORD 1 ]-----------+-------
+magic                   | 340322
+version                 | 3
+root                    | 1
+level                   | 0
+fastroot                | 1
+fastlevel               | 0
+oldest_xact             | 582
+last_cleanup_num_tuples | 1000
 </screen>
      </para>
     </listitem>
index ba1c5d639253bc50fc416408dbc443ff6c5433ac..e9521fbfb91fce98d2836c30092aab7798726be7 100644 (file)
@@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
    </varlistentry>
    </variablelist>
 
+   <para>
+    B-tree indexes additionally accept this parameter:
+   </para>
+
+   <variablelist>
+   <varlistentry>
+    <term><literal>vacuum_cleanup_index_scale_factor</literal></term>
+    <listitem>
+    <para>
+      Per-index value for <xref linkend="guc-vacuum-cleanup-index-scale-factor"/>.
+    </para>
+    </listitem>
+   </varlistentry>
+   </variablelist>
+
    <para>
     GiST indexes additionally accept this parameter:
    </para>
index 35c09987adb783541b07dd98f7cbe4754cdd72a2..69ab2f101c7b15ac8886df323bc33c74ee4bef7d 100644 (file)
@@ -409,6 +409,15 @@ static relopt_real realRelOpts[] =
        },
        0, -1.0, DBL_MAX
    },
+   {
+       {
+           "vacuum_cleanup_index_scale_factor",
+           "Number of tuple inserts prior to index cleanup as a fraction of reltuples.",
+           RELOPT_KIND_BTREE,
+           ShareUpdateExclusiveLock
+       },
+       -1, 0.0, 100.0
+   },
    /* list terminator */
    {{NULL}}
 };
@@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
        {"user_catalog_table", RELOPT_TYPE_BOOL,
        offsetof(StdRdOptions, user_catalog_table)},
        {"parallel_workers", RELOPT_TYPE_INT,
-       offsetof(StdRdOptions, parallel_workers)}
+       offsetof(StdRdOptions, parallel_workers)},
+       {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
+       offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}
    };
 
    options = parseRelOptions(reloptions, validate, kind, &numoptions);
index 40111990c5ecb6a366d0554594c4198a22578765..fd7360278dbd302c6ff41f4b37eb2094900600fc 100644 (file)
@@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel,
 
        if (BufferIsValid(metabuf))
        {
+           /* upgrade meta-page if needed */
+           if (metad->btm_version < BTREE_VERSION)
+               _bt_upgrademetapage(metapg);
            metad->btm_fastroot = itup_blkno;
            metad->btm_fastlevel = lpageop->btpo.level;
            MarkBufferDirty(metabuf);
@@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel,
                xlmeta.level = metad->btm_level;
                xlmeta.fastroot = metad->btm_fastroot;
                xlmeta.fastlevel = metad->btm_fastlevel;
+               xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+               xlmeta.last_cleanup_num_heap_tuples =
+                   metad->btm_last_cleanup_num_heap_tuples;
 
                XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
                XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
@@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);
 
+   /* upgrade metapage if needed */
+   if (metad->btm_version < BTREE_VERSION)
+       _bt_upgrademetapage(metapg);
+
    /*
     * Create downlink item for left page (old root).  Since this will be the
     * first item in a non-leaf page, it implicitly has minus-infinity key
@@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
        md.level = metad->btm_level;
        md.fastroot = rootblknum;
        md.fastlevel = metad->btm_level;
+       md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+       md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 
        XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
 
index 92afe2de383e7bdce08d1ef217a359440bf090c1..505a67e6ed2a146b7bf6337926bed1b7e1c77c43 100644 (file)
@@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
+   metad->btm_oldest_btpo_xact = InvalidTransactionId;
+   metad->btm_last_cleanup_num_heap_tuples = -1.0;
 
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;
@@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
 }
 
+/*
+ * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
+ *
+ *     This routine does purely in-memory image upgrade.  Caller is
+ *     responsible for locking, WAL-logging etc.
+ *
+ *     "page" must be a B-tree meta page whose version is at least
+ *     BTREE_MIN_VERSION and below BTREE_VERSION (both are asserted).  On
+ *     return the version is BTREE_VERSION and the two cleanup-related
+ *     fields hold the same initial values that _bt_initmetapage() assigns.
+ */
+void
+_bt_upgrademetapage(Page page)
+{
+   BTMetaPageData *metad;
+   /* only read inside Assert(), so keep non-assert builds warning-free */
+   BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
+
+   metad = BTPageGetMeta(page);
+   metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+   /* It must be really a meta page of upgradable version */
+   Assert(metaopaque->btpo_flags & BTP_META);
+   Assert(metad->btm_version < BTREE_VERSION);
+   Assert(metad->btm_version >= BTREE_MIN_VERSION);
+
+   /* Set version number and fill extra fields added into version 3 */
+   metad->btm_version = BTREE_VERSION;
+   metad->btm_oldest_btpo_xact = InvalidTransactionId;
+   metad->btm_last_cleanup_num_heap_tuples = -1.0;
+
+   /* Adjust pd_lower (see _bt_initmetapage() for details) */
+   ((PageHeader) page)->pd_lower =
+       ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
+}
+
+/*
+ * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
+ *                                   the metapage.
+ *
+ *     This routine checks whether the provided cleanup-related information
+ *     matches what is written in the metapage.  On mismatch, or when the
+ *     metapage still has an older version, the metapage is overwritten.
+ *
+ *     rel             -- the B-tree index whose metapage is examined
+ *     oldestBtpoXact  -- value to store in btm_oldest_btpo_xact
+ *     numHeapTuples   -- value to store in btm_last_cleanup_num_heap_tuples
+ *
+ *     The metapage is first inspected under a read lock; if nothing needs
+ *     to change the buffer is released and the function returns.  Otherwise
+ *     the lock is swapped for a write lock, the page is upgraded if needed,
+ *     the two fields are stored, and an XLOG_BTREE_META_CLEANUP record is
+ *     emitted when the relation needs WAL.
+ */
+void
+_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
+                            float8 numHeapTuples)
+{
+   Buffer          metabuf;
+   Page            metapg;
+   BTMetaPageData *metad;
+   bool            needsRewrite = false;
+   XLogRecPtr      recptr;
+
+   /* read the metapage and check if it needs rewrite */
+   metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+   metapg = BufferGetPage(metabuf);
+   metad = BTPageGetMeta(metapg);
+
+   /* outdated version of metapage always needs rewrite */
+   if (metad->btm_version < BTREE_VERSION)
+       needsRewrite = true;
+   else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
+            metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
+       needsRewrite = true;
+
+   if (!needsRewrite)
+   {
+       _bt_relbuf(rel, metabuf);
+       return;
+   }
+
+   /* trade in our read lock for a write lock */
+   LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+   LockBuffer(metabuf, BT_WRITE);
+
+   START_CRIT_SECTION();
+
+   /* upgrade meta-page if needed */
+   if (metad->btm_version < BTREE_VERSION)
+       _bt_upgrademetapage(metapg);
+
+   /* update cleanup-related information */
+   metad->btm_oldest_btpo_xact = oldestBtpoXact;
+   metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
+   MarkBufferDirty(metabuf);
+
+   /* write wal record if needed */
+   if (RelationNeedsWAL(rel))
+   {
+       xl_btree_metadata md;
+
+       XLogBeginInsert();
+       XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+       /* the record carries the full metadata, not just the changed fields */
+       md.root = metad->btm_root;
+       md.level = metad->btm_level;
+       md.fastroot = metad->btm_fastroot;
+       md.fastlevel = metad->btm_fastlevel;
+       md.oldest_btpo_xact = oldestBtpoXact;
+       md.last_cleanup_num_heap_tuples = numHeapTuples;
+
+       XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
+
+       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
+
+       PageSetLSN(metapg, recptr);
+   }
+
+   END_CRIT_SECTION();
+   _bt_relbuf(rel, metabuf);
+}
+
 /*
  * _bt_getroot() -- Get the root page of the btree.
  *
@@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access)
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
-       Assert(metad->btm_version == BTREE_VERSION);
+       Assert(metad->btm_version >= BTREE_MIN_VERSION);
+       Assert(metad->btm_version <= BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);
 
        rootblkno = metad->btm_fastroot;
@@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access)
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));
 
-   if (metad->btm_version != BTREE_VERSION)
+   if (metad->btm_version < BTREE_MIN_VERSION ||
+       metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
-                errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+                errmsg("version mismatch in index \"%s\": file version %d, "
+                       "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
-                       metad->btm_version, BTREE_VERSION)));
+                       metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 
    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
@@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access)
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);
 
+       /* upgrade metapage if needed */
+       if (metad->btm_version < BTREE_VERSION)
+           _bt_upgrademetapage(metapg);
+
        /*
         * Race condition:  if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
@@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access)
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
+       metad->btm_oldest_btpo_xact = InvalidTransactionId;
+       metad->btm_last_cleanup_num_heap_tuples = -1.0;
 
        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);
@@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access)
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
+           md.oldest_btpo_xact = InvalidTransactionId;
+           md.last_cleanup_num_heap_tuples = -1.0;
 
            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
 
@@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel)
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));
 
-   if (metad->btm_version != BTREE_VERSION)
+   if (metad->btm_version < BTREE_MIN_VERSION ||
+       metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
-                errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+                errmsg("version mismatch in index \"%s\": file version %d, "
+                       "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
-                       metad->btm_version, BTREE_VERSION)));
+                       metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 
    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
@@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel)
                     errmsg("index \"%s\" is not a btree",
                            RelationGetRelationName(rel))));
 
-       if (metad->btm_version != BTREE_VERSION)
+       if (metad->btm_version < BTREE_MIN_VERSION ||
+           metad->btm_version > BTREE_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
-                    errmsg("version mismatch in index \"%s\": file version %d, code version %d",
+                    errmsg("version mismatch in index \"%s\": file version %d, "
+                           "current version %d, minimal supported version %d",
                            RelationGetRelationName(rel),
-                           metad->btm_version, BTREE_VERSION)));
+                           metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 
        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
@@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
+       /* upgrade metapage if needed */
+       if (metad->btm_version < BTREE_VERSION)
+           _bt_upgrademetapage(metapg);
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
        MarkBufferDirty(metabuf);
@@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;
+           xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+           xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
 
            XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
            xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
index 6fca8e358fe885852798023d4521bc5b16204cf0..06badc90ba133363d6ef8963b99715893e4f821e 100644 (file)
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "access/nbtxlog.h"
 #include "access/relscan.h"
 #include "access/xlog.h"
 #include "commands/vacuum.h"
+#include "miscadmin.h"
 #include "nodes/execnodes.h"
 #include "pgstat.h"
+#include "postmaster/autovacuum.h"
 #include "storage/condition_variable.h"
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
@@ -45,6 +48,7 @@ typedef struct
    BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
    BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
    BlockNumber totFreePages;   /* true total # of free pages */
+   TransactionId oldestBtpoXact;
    MemoryContext pagedelcontext;
 } BTVacState;
 
@@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
 
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
-            BTCycleId cycleid);
+            BTCycleId cycleid, TransactionId *oldestBtpoXact);
 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
             BlockNumber orig_blkno);
 
@@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
    SpinLockRelease(&btscan->btps_mutex);
 }
 
+/*
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
+ *         btbulkdelete() wasn't called.
+ *
+ * Reads the metapage under a read lock and returns true when any of the
+ * following holds:
+ *   - the metapage version is older than BTREE_VERSION;
+ *   - btm_oldest_btpo_xact is valid and precedes RecentGlobalXmin;
+ *   - the effective scale factor is negative, the stored heap tuple count
+ *     is negative, or info->num_heap_tuples exceeds the stored count by
+ *     more than the scale-factor fraction.
+ * Otherwise returns false.  The metapage buffer is released before return.
+ */
+static bool
+_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
+{
+   Buffer          metabuf;
+   Page            metapg;
+   BTMetaPageData *metad;
+   bool            result = false;
+
+   metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
+   metapg = BufferGetPage(metabuf);
+   metad = BTPageGetMeta(metapg);
+
+   if (metad->btm_version < BTREE_VERSION)
+   {
+       /*
+        * Do cleanup if metapage needs upgrade, because we don't have
+        * cleanup-related meta-information yet.
+        */
+       result = true;
+   }
+   else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
+            TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
+                                  RecentGlobalXmin))
+   {
+       /*
+        * If oldest btpo.xact in the deleted pages is older than
+        * RecentGlobalXmin, then at least one deleted page can be recycled.
+        */
+       result = true;
+   }
+   else
+   {
+       StdRdOptions   *relopts;
+       float8          cleanup_scale_factor;
+
+       /*
+        * If table receives large enough amount of insertions and no cleanup
+        * was performed, then index might appear to have stale statistics.
+        * In order to avoid that, we perform cleanup when table receives
+        * vacuum_cleanup_index_scale_factor fractions of insertions.
+        *
+        * A non-negative per-index reloption takes precedence; otherwise the
+        * vacuum_cleanup_index_scale_factor GUC value is used.
+        */
+       relopts = (StdRdOptions *) info->index->rd_options;
+       cleanup_scale_factor = (relopts &&
+           relopts->vacuum_cleanup_index_scale_factor >= 0)
+               ? relopts->vacuum_cleanup_index_scale_factor
+               : vacuum_cleanup_index_scale_factor;
+
+       if (cleanup_scale_factor < 0 ||
+           metad->btm_last_cleanup_num_heap_tuples < 0 ||
+           info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
+                                   metad->btm_last_cleanup_num_heap_tuples)
+           result = true;
+   }
+
+   _bt_relbuf(info->index, metabuf);
+   return result;
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
+       TransactionId   oldestBtpoXact;
+
        cycleid = _bt_start_vacuum(rel);
 
-       btvacuumscan(info, stats, callback, callback_state, cycleid);
+       btvacuumscan(info, stats, callback, callback_state, cycleid,
+                    &oldestBtpoXact);
+
+       /*
+        * Update cleanup-related information in metapage. This information
+        * is used only for cleanup, but keeping it up to date can avoid
+        * unnecessary cleanup even after bulkdelete.
+        */
+       _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
+                                    info->num_heap_tuples);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);
@@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
    /*
     * If btbulkdelete was called, we need not do anything, just return the
-    * stats from the latest btbulkdelete call.  If it wasn't called, we must
-    * still do a pass over the index, to recycle any newly-recyclable pages
-    * and to obtain index statistics.
+    * stats from the latest btbulkdelete call.  If it wasn't called, we might
+    * still need to do a pass over the index, to recycle any newly-recyclable
+    * pages and to obtain index statistics.  _bt_vacuum_needs_cleanup checks
+    * if there are newly-recyclable pages or stale index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
+       TransactionId   oldestBtpoXact;
+
+       /* Check if we need a cleanup */
+       if (!_bt_vacuum_needs_cleanup(info))
+           return NULL;
+
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-       btvacuumscan(info, stats, NULL, NULL, 0);
+       btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
+
+       /* Update cleanup-related information in the metapage */
+       _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
+                                    info->num_heap_tuples);
    }
 
    /*
@@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 static void
 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
-            BTCycleId cycleid)
+            BTCycleId cycleid, TransactionId *oldestBtpoXact)
 {
    Relation    rel = info->index;
    BTVacState  vstate;
@@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
    vstate.lastBlockLocked = BTREE_METAPAGE;
    vstate.totFreePages = 0;
+   vstate.oldestBtpoXact = InvalidTransactionId;
 
    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    /* update statistics */
    stats->num_pages = num_pages;
    stats->pages_free = vstate.totFreePages;
+
+   if (oldestBtpoXact)
+       *oldestBtpoXact = vstate.oldestBtpoXact;
 }
 
 /*
@@ -1070,6 +1164,11 @@ restart:
    {
        /* Already deleted, but can't recycle yet */
        stats->pages_deleted++;
+
+       /* Update the oldest btpo.xact */
+       if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
+           TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
+           vstate->oldestBtpoXact = opaque->btpo.xact;
    }
    else if (P_ISHALFDEAD(opaque))
    {
@@ -1238,7 +1337,12 @@ restart:
 
        /* count only this page, else may double-count parent */
        if (ndel)
+       {
            stats->pages_deleted++;
+           if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
+               TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
+               vstate->oldestBtpoXact = opaque->btpo.xact;
+       }
 
        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
index 233c3965d95a44956a6cc44545f0448b45df562d..b565bcb54017e0630f804d32f3f276f3945e8faf 100644 (file)
@@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
    md->btm_level = xlrec->level;
    md->btm_fastroot = xlrec->fastroot;
    md->btm_fastlevel = xlrec->fastlevel;
+   md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
+   md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
 
    pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
    pageop->btpo_flags = BTP_META;
@@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record)
    }
 }
 
-
 void
 btree_redo(XLogReaderState *record)
 {
@@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record)
        case XLOG_BTREE_REUSE_PAGE:
            btree_xlog_reuse_page(record);
            break;
+       case XLOG_BTREE_META_CLEANUP:
+           _bt_restore_meta(record, 0);
+           break;
        default:
            elog(PANIC, "btree_redo: unknown op code %u", info);
    }
index 446040d8160a9c73d7a08434ff39543367b177f2..c1f0441b0817f382eeb2174233b7a9c6f1cbf891 100644 (file)
@@ -138,3 +138,5 @@ int         VacuumPageDirty = 0;
 
 int            VacuumCostBalance = 0;  /* working state for vacuum */
 bool       VacuumCostActive = false;
+
+double     vacuum_cleanup_index_scale_factor;
index 4ffc8451ca4885ff2ff02c5510b1daf1ed0a130e..260ae264d88054a0d767430aaa056ce5d82dee37 100644 (file)
@@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM,
+           gettext_noop("Number of tuple inserts prior to index cleanup as a fraction of reltuples."),
+           NULL
+       },
+       &vacuum_cleanup_index_scale_factor,
+       0.1, 0.0, 100.0,
+       NULL, NULL, NULL
+   },
+
    /* End-of-list marker */
    {
        {NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
index 2b0b1da7636ce85ed0e5d54f2ad21957baf862e1..f532f3ffff3ca8d4bcca4f5dc60f5694a530e232 100644 (file)
@@ -102,6 +102,11 @@ typedef struct BTMetaPageData
    uint32      btm_level;      /* tree level of the root page */
    BlockNumber btm_fastroot;   /* current "fast" root location */
    uint32      btm_fastlevel;  /* tree level of the "fast" root page */
+   /* following fields are available since page version 3 */
+   TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among deleted
+                                        * pages */
+   float8      btm_last_cleanup_num_heap_tuples; /* number of heap tuples
+                                                  * during last cleanup */
 } BTMetaPageData;
 
 #define BTPageGetMeta(p) \
@@ -109,7 +114,8 @@ typedef struct BTMetaPageData
 
 #define BTREE_METAPAGE 0       /* first page is meta */
 #define BTREE_MAGIC        0x053162    /* magic number of btree pages */
-#define BTREE_VERSION  2       /* current version number */
+#define BTREE_VERSION  3       /* current version number */
+#define BTREE_MIN_VERSION  2       /* minimal supported version number */
 
 /*
  * Maximum size of a btree index entry, including its tuple header.
@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
  * prototypes for functions in nbtpage.c
  */
 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
+extern void _bt_update_meta_cleanup_info(Relation rel,
+                           TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_upgrademetapage(Page page);
 extern Buffer _bt_getroot(Relation rel, int access);
 extern Buffer _bt_gettrueroot(Relation rel);
 extern int _bt_getrootheight(Relation rel);
index 8297df75fe8db4faa97ac2800b2b98211b3e9d34..a8ccdcec426dde3c48282cadbaef2f97ffde7f53 100644 (file)
@@ -38,6 +38,8 @@
                                         * vacuum */
 #define XLOG_BTREE_REUSE_PAGE  0xD0    /* old page is about to be reused from
                                         * FSM */
+#define XLOG_BTREE_META_CLEANUP    0xE0    /* update cleanup-related data in the
+                                        * metapage */
 
 /*
  * All that we need to regenerate the meta-data page
@@ -48,6 +50,8 @@ typedef struct xl_btree_metadata
    uint32      level;
    BlockNumber fastroot;
    uint32      fastlevel;
+   TransactionId oldest_btpo_xact; /* restored into btm_oldest_btpo_xact */
+   float8      last_cleanup_num_heap_tuples;   /* explicit-width type */
 } xl_btree_metadata;
 
 /*
index a4574cd5331b370f8de1faf961435d1480dbd043..a429a19964e38c8586be9fa38f69927a6da8c178 100644 (file)
@@ -256,6 +256,8 @@ extern int  VacuumPageDirty;
 extern int VacuumCostBalance;
 extern bool VacuumCostActive;
 
+extern double vacuum_cleanup_index_scale_factor;
+
 
 /* in tcop/postgres.c */
 
index c26c395b0bd47baeb224ee1ebbdbbe5852377350..9826c67fc418a448b95f39e3e3182b9a9b1cfe2c 100644 (file)
@@ -287,6 +287,8 @@ typedef struct StdRdOptions
 {
    int32       vl_len_;        /* varlena header (do not touch directly!) */
    int         fillfactor;     /* page fill factor in percent (0..100) */
+   /* fraction of newly inserted tuples that triggers index cleanup */
+   float8      vacuum_cleanup_index_scale_factor;
    int         toast_tuple_target; /* target for tuple toasting */
    AutoVacOpts autovacuum;     /* autovacuum-related options */
    bool        user_catalog_table; /* use as an additional catalog relation */
index 755cd177925fc9c30df49fcb177deedd4c1f915f..4778ac14a4c136338af4184954bfb0fa3ebcda3d 100644 (file)
@@ -150,3 +150,34 @@ vacuum btree_tall_tbl;
 -- need to insert some rows to cause the fast root page to split.
 insert into btree_tall_tbl (id, t)
   select g, repeat('x', 100) from generate_series(1, 500) g;
+--
+-- Test vacuum_cleanup_index_scale_factor
+--
+-- Simple create
+create table btree_test(a int);
+create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
+select reloptions from pg_class where oid = 'btree_idx1'::regclass;
+                reloptions                
+------------------------------------------
+ {vacuum_cleanup_index_scale_factor=40.0}
+(1 row)
+
+-- Fail while setting improper values
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
+ERROR:  value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor"
+DETAIL:  Valid values are between "0.000000" and "100.000000".
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 101.0);
+ERROR:  value 101.0 out of bounds for option "vacuum_cleanup_index_scale_factor"
+DETAIL:  Valid values are between "0.000000" and "100.000000".
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
+ERROR:  invalid value for floating point option "vacuum_cleanup_index_scale_factor": string
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
+ERROR:  invalid value for floating point option "vacuum_cleanup_index_scale_factor": true
+-- Simple ALTER INDEX
+alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
+select reloptions from pg_class where oid = 'btree_idx1'::regclass;
+                reloptions                
+------------------------------------------
+ {vacuum_cleanup_index_scale_factor=70.0}
+(1 row)
+
index 65b08c828247b8699d35a3da40590bc6024b8a01..21171f776259b55ee51d4e666c46cc8e6bdb1424 100644 (file)
@@ -92,3 +92,22 @@ vacuum btree_tall_tbl;
 -- need to insert some rows to cause the fast root page to split.
 insert into btree_tall_tbl (id, t)
   select g, repeat('x', 100) from generate_series(1, 500) g;
+
+--
+-- Test vacuum_cleanup_index_scale_factor
+--
+
+-- Simple create
+create table btree_test(a int);
+create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
+select reloptions from pg_class where oid = 'btree_idx1'::regclass;
+
+-- Fail while setting improper values
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 101.0);
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
+
+-- Simple ALTER INDEX
+alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
+select reloptions from pg_class where oid = 'btree_idx1'::regclass;