Make parallel nbtree index scans use an LWLock.
author    Peter Geoghegan <pg@bowt.ie>
Sat, 8 Mar 2025 16:10:14 +0000 (11:10 -0500)
committer Peter Geoghegan <pg@bowt.ie>
Sat, 8 Mar 2025 16:10:14 +0000 (11:10 -0500)
Teach parallel nbtree index scans to use an LWLock (not a spinlock) to
protect the scan's shared descriptor state.

Preparation for an upcoming patch that will add skip scan optimizations
to nbtree.  That patch will create the need to occasionally allocate
memory while the scan descriptor is locked, in order to copy datums that
were serialized by another backend.
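
For context: a spinlock critical section must be a few straight-line
instructions and must not call palloc() or anything else that can elog(),
whereas a backend holding an LWLock may do both, since LWLocks are released
automatically during error recovery.  The sketch below is not code from this
commit; the SharedArrayKey struct and copy_shared_array_datum() are
hypothetical, and only the LWLock and datumCopy() APIs are real.  It shows
the kind of copy the skip scan patch will need to perform under the lock:

    #include "postgres.h"

    #include "storage/lwlock.h"
    #include "utils/datum.h"

    typedef struct SharedArrayKey
    {
        LWLock      lock;       /* LWLock, so the copy below may allocate */
        int         typlen;     /* type length of the serialized datum */
        bool        typbyval;   /* pass-by-value type? */
        Datum       value;      /* datum serialized by another backend */
    } SharedArrayKey;

    /*
     * Copy a serialized datum out of shared state while holding the lock.
     * datumCopy() calls palloc(), which can elog(ERROR) on out-of-memory;
     * that is acceptable under an LWLock (error recovery releases LWLocks)
     * but forbidden inside a spinlock critical section.
     */
    static Datum
    copy_shared_array_datum(SharedArrayKey *key)
    {
        Datum       result;

        LWLockAcquire(&key->lock, LW_EXCLUSIVE);
        result = datumCopy(key->value, key->typbyval, key->typlen);
        LWLockRelease(&key->lock);

        return result;
    }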

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=PKR6rB7qbx+Vnd7eqeB5VTcrW=iJvAsTsKbdG+kW_UA@mail.gmail.com

src/backend/access/nbtree/nbtpreprocesskeys.c
src/backend/access/nbtree/nbtree.c
src/backend/storage/lmgr/lwlock.c
src/backend/utils/activity/wait_event_names.txt
src/include/storage/lwlock.h

src/backend/access/nbtree/nbtpreprocesskeys.c
index 1fd1da5f18b31bf56909658ab4ad3917200feae5..38a87af1cc8b873ab5341012dd5b6055e09b919c 100644
@@ -1565,7 +1565,7 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
     * Parallel index scans require space in shared memory to store the
     * current array elements (for arrays kept by preprocessing) to schedule
     * the next primitive index scan.  The underlying structure is protected
-    * using a spinlock, so defensively limit its size.  In practice this can
+    * using an LWLock, so defensively limit its size.  In practice this can
     * only affect parallel scans that use an incomplete opfamily.
     */
    if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS)
src/backend/access/nbtree/nbtree.c
index 136e9408ae5fd024167b0546d66064b5e32e21d8..25188a644efec388225538b9f36fd69c330d65dd 100644
@@ -70,7 +70,7 @@ typedef struct BTParallelScanDescData
    BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                     * available for scan. see above for
                                     * possible states of parallel scan. */
-   slock_t     btps_mutex;     /* protects above variables, btps_arrElems */
+   LWLock      btps_lock;      /* protects shared parallel state */
    ConditionVariable btps_cv;  /* used to synchronize parallel scan */
 
    /*
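
For orientation, the resulting parallel scan descriptor looks roughly like
this.  This is an abridged sketch assembled from the hunks in this commit;
the comments and the trailing array-key member are simplified:

    typedef struct BTParallelScanDescData
    {
        BlockNumber btps_nextScanPage;  /* next page to be scanned */
        BlockNumber btps_lastCurrPage;  /* page whose sibling link was
                                         * copied into btps_nextScanPage */
        BTPS_State  btps_pageStatus;    /* state of the parallel scan */
        LWLock      btps_lock;          /* protects shared parallel state */
        ConditionVariable btps_cv;      /* used to synchronize parallel scan */
        int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];  /* array key state,
                                                             * declaration abridged */
    } BTParallelScanDescData;
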
@@ -554,7 +554,8 @@ btinitparallelscan(void *target)
 {
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 
-   SpinLockInit(&bt_target->btps_mutex);
+   LWLockInitialize(&bt_target->btps_lock,
+                    LWTRANCHE_PARALLEL_BTREE_SCAN);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
@@ -576,15 +577,15 @@ btparallelrescan(IndexScanDesc scan)
                                                  parallel_scan->ps_offset);
 
    /*
-    * In theory, we don't need to acquire the spinlock here, because there
+    * In theory, we don't need to acquire the LWLock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
-   SpinLockAcquire(&btscan->btps_mutex);
+   LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-   SpinLockRelease(&btscan->btps_mutex);
+   LWLockRelease(&btscan->btps_lock);
 }
 
 /*
@@ -655,7 +656,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
 
    while (1)
    {
-       SpinLockAcquire(&btscan->btps_mutex);
+       LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 
        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
@@ -717,7 +718,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
-       SpinLockRelease(&btscan->btps_mutex);
+       LWLockRelease(&btscan->btps_lock);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
@@ -761,11 +762,11 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset);
 
-   SpinLockAcquire(&btscan->btps_mutex);
+   LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = next_scan_page;
    btscan->btps_lastCurrPage = curr_page;
    btscan->btps_pageStatus = BTPARALLEL_IDLE;
-   SpinLockRelease(&btscan->btps_mutex);
+   LWLockRelease(&btscan->btps_lock);
    ConditionVariableSignal(&btscan->btps_cv);
 }
 
@@ -804,14 +805,14 @@ _bt_parallel_done(IndexScanDesc scan)
     * Mark the parallel scan as done, unless some other process did so
     * already
     */
-   SpinLockAcquire(&btscan->btps_mutex);
+   LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
    if (btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        status_changed = true;
    }
-   SpinLockRelease(&btscan->btps_mutex);
+   LWLockRelease(&btscan->btps_lock);
 
    /* wake up all the workers associated with this parallel scan */
    if (status_changed)
@@ -838,7 +839,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset);
 
-   SpinLockAcquire(&btscan->btps_mutex);
+   LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    if (btscan->btps_lastCurrPage == curr_page &&
        btscan->btps_pageStatus == BTPARALLEL_IDLE)
    {
@@ -854,7 +855,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
            btscan->btps_arrElems[i] = array->cur_elem;
        }
    }
-   SpinLockRelease(&btscan->btps_mutex);
+   LWLockRelease(&btscan->btps_lock);
 }
 
 /*
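
The seize/release/schedule functions above all follow the same LWLock plus
ConditionVariable handshake: update the shared state while holding
btps_lock, then signal btps_cv; a waiter re-checks the state under the lock
and sleeps on the CV if it still cannot proceed.  Below is a generic sketch
of that pattern, assuming a hypothetical SharedWork struct; only the LWLock,
ConditionVariable, and wait event APIs are real, and WAIT_EVENT_BTREE_PAGE
is reused from the hunks above purely for illustration:

    #include "postgres.h"

    #include "storage/condition_variable.h"
    #include "storage/lwlock.h"
    #include "utils/wait_event.h"

    typedef struct SharedWork
    {
        LWLock      lock;           /* protects work_available */
        ConditionVariable cv;       /* wakes waiters when the state changes */
        bool        work_available;
    } SharedWork;

    /* Producer: publish new state under the lock, then wake one waiter. */
    static void
    publish_work(SharedWork *shared)
    {
        LWLockAcquire(&shared->lock, LW_EXCLUSIVE);
        shared->work_available = true;
        LWLockRelease(&shared->lock);

        ConditionVariableSignal(&shared->cv);
    }

    /* Consumer: loop, re-checking the shared state under the lock. */
    static void
    wait_for_work(SharedWork *shared)
    {
        for (;;)
        {
            bool        ready;

            LWLockAcquire(&shared->lock, LW_EXCLUSIVE);
            ready = shared->work_available;
            if (ready)
                shared->work_available = false; /* claim the work */
            LWLockRelease(&shared->lock);

            if (ready)
                break;

            /* Returns when signaled (or spuriously); loop and re-check. */
            ConditionVariableSleep(&shared->cv, WAIT_EVENT_BTREE_PAGE);
        }
        ConditionVariableCancelSleep();
    }
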
src/backend/storage/lmgr/lwlock.c
index 8adf27302770975d49b85115d1922ff811a79fac..5702c35bb91a7b2c6f9d51e9bebfc62aabb9f235 100644
@@ -153,6 +153,7 @@ static const char *const BuiltinTrancheNames[] = {
    [LWTRANCHE_LOCK_MANAGER] = "LockManager",
    [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager",
    [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin",
+   [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan",
    [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA",
    [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA",
    [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType",
src/backend/utils/activity/wait_event_names.txt
index e199f071628987ec0626847d8c18632293dd3e89..3c594415bfdb82b003a73c230cc5002793a847fb 100644
@@ -371,6 +371,7 @@ BufferMapping   "Waiting to associate a data block with a buffer in the buffer poo
 LockManager    "Waiting to read or update information about <quote>heavyweight</quote> locks."
 PredicateLockManager   "Waiting to access predicate lock information used by serializable transactions."
 ParallelHashJoin   "Waiting to synchronize workers during Parallel Hash Join plan execution."
+ParallelBtreeScan  "Waiting to synchronize workers during Parallel B-tree scan plan execution."
 ParallelQueryDSA   "Waiting for parallel query dynamic shared memory allocation."
 PerSessionDSA  "Waiting for parallel query dynamic shared memory allocation."
 PerSessionRecordType   "Waiting to access a parallel query's information about composite types."
src/include/storage/lwlock.h
index 13a7dc89980490c49f2179cd77dc52d1bbfbf734..ffa03189e2d686217824281752233d5f871cc775 100644
@@ -194,6 +194,7 @@ typedef enum BuiltinTrancheIds
    LWTRANCHE_LOCK_MANAGER,
    LWTRANCHE_PREDICATE_LOCK_MANAGER,
    LWTRANCHE_PARALLEL_HASH_JOIN,
+   LWTRANCHE_PARALLEL_BTREE_SCAN,
    LWTRANCHE_PARALLEL_QUERY_DSA,
    LWTRANCHE_PER_SESSION_DSA,
    LWTRANCHE_PER_SESSION_RECORD_TYPE,