Fix risk of deadlock failure while dropping a partitioned index.
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 21 Mar 2022 16:22:13 +0000 (12:22 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 21 Mar 2022 16:22:13 +0000 (12:22 -0400)
DROP INDEX needs to lock the index's table before the index itself,
else it will deadlock against ordinary queries that acquire the
relation locks in that order.  This is correctly mechanized for
plain indexes by RangeVarCallbackForDropRelation; but in the case of
a partitioned index, we neglected to lock the child tables in advance
of locking the child indexes.  We can fix that by traversing the
inheritance tree and acquiring the needed locks in RemoveRelations,
after we have acquired our locks on the parent partitioned table and
index.

While at it, do some refactoring to eliminate confusion between
the actual and expected relkind in RangeVarCallbackForDropRelation.
We can save a couple of syscache lookups too, by having that function
pass back info that RemoveRelations will need.

Back-patch to v11 where partitioned indexes were added.

Jimmy Yih, Gaurab Dey, Tom Lane

Discussion: https://postgr.es/m/BYAPR05MB645402330042E17D91A70C12BD5F9@BYAPR05MB6454.namprd05.prod.outlook.com

src/backend/commands/tablecmds.c
src/test/isolation/expected/partition-drop-index-locking.out [new file with mode: 0644]
src/test/isolation/isolation_schedule
src/test/isolation/specs/partition-drop-index-locking.spec [new file with mode: 0644]

index dc5872f988c96cd76c7dcf01b7961457dc7f664f..ab9a53b27c65ad76c0e6969ae79cabcc9c1d9645 100644 (file)
@@ -295,12 +295,18 @@ static const struct dropmsgstrings dropmsgstringarray[] = {
    {'\0', 0, NULL, NULL, NULL, NULL}
 };
 
+/* communication between RemoveRelations and RangeVarCallbackForDropRelation */
 struct DropRelationCallbackState
 {
-   char        relkind;
+   /* These fields are set by RemoveRelations: */
+   char        expected_relkind;
+   LOCKMODE    heap_lockmode;
+   /* These fields are state to track which subsidiary locks are held: */
    Oid         heapOid;
    Oid         partParentOid;
-   bool        concurrent;
+   /* These fields are passed back by RangeVarCallbackForDropRelation: */
+   char        actual_relkind;
+   char        actual_relpersistence;
 };
 
 /* Alter table target-type flags for ATSimplePermissions */
@@ -1416,10 +1422,13 @@ RemoveRelations(DropStmt *drop)
        AcceptInvalidationMessages();
 
        /* Look up the appropriate relation using namespace search. */
-       state.relkind = relkind;
+       state.expected_relkind = relkind;
+       state.heap_lockmode = drop->concurrent ?
+           ShareUpdateExclusiveLock : AccessExclusiveLock;
+       /* We must initialize these fields to show that no locks are held: */
        state.heapOid = InvalidOid;
        state.partParentOid = InvalidOid;
-       state.concurrent = drop->concurrent;
+
        relOid = RangeVarGetRelidExtended(rel, lockmode, RVR_MISSING_OK,
                                          RangeVarCallbackForDropRelation,
                                          (void *) &state);
@@ -1433,10 +1442,10 @@ RemoveRelations(DropStmt *drop)
 
        /*
         * Decide if concurrent mode needs to be used here or not.  The
-        * relation persistence cannot be known without its OID.
+        * callback retrieved the rel's persistence for us.
         */
        if (drop->concurrent &&
-           get_rel_persistence(relOid) != RELPERSISTENCE_TEMP)
+           state.actual_relpersistence != RELPERSISTENCE_TEMP)
        {
            Assert(list_length(drop->objects) == 1 &&
                   drop->removeType == OBJECT_INDEX);
@@ -1448,12 +1457,24 @@ RemoveRelations(DropStmt *drop)
         * either.
         */
        if ((flags & PERFORM_DELETION_CONCURRENTLY) != 0 &&
-           get_rel_relkind(relOid) == RELKIND_PARTITIONED_INDEX)
+           state.actual_relkind == RELKIND_PARTITIONED_INDEX)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot drop partitioned index \"%s\" concurrently",
                            rel->relname)));
 
+       /*
+        * If we're told to drop a partitioned index, we must acquire lock on
+        * all the children of its parent partitioned table before proceeding.
+        * Otherwise we'd try to lock the child index partitions before their
+        * tables, leading to potential deadlock against other sessions that
+        * will lock those objects in the other order.
+        */
+       if (state.actual_relkind == RELKIND_PARTITIONED_INDEX)
+           (void) find_all_inheritors(state.heapOid,
+                                      state.heap_lockmode,
+                                      NULL);
+
        /* OK, we're ready to delete this one */
        obj.classId = RelationRelationId;
        obj.objectId = relOid;
@@ -1479,7 +1500,6 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 {
    HeapTuple   tuple;
    struct DropRelationCallbackState *state;
-   char        relkind;
    char        expected_relkind;
    bool        is_partition;
    Form_pg_class classform;
@@ -1487,9 +1507,7 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
    bool        invalid_system_index = false;
 
    state = (struct DropRelationCallbackState *) arg;
-   relkind = state->relkind;
-   heap_lockmode = state->concurrent ?
-       ShareUpdateExclusiveLock : AccessExclusiveLock;
+   heap_lockmode = state->heap_lockmode;
 
    /*
     * If we previously locked some other index's heap, and the name we're
@@ -1523,6 +1541,10 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
    classform = (Form_pg_class) GETSTRUCT(tuple);
    is_partition = classform->relispartition;
 
+   /* Pass back some data to save lookups in RemoveRelations */
+   state->actual_relkind = classform->relkind;
+   state->actual_relpersistence = classform->relpersistence;
+
    /*
     * Both RELKIND_RELATION and RELKIND_PARTITIONED_TABLE are OBJECT_TABLE,
     * but RemoveRelations() can only pass one relkind for a given relation.
@@ -1538,13 +1560,15 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
    else
        expected_relkind = classform->relkind;
 
-   if (relkind != expected_relkind)
-       DropErrorMsgWrongType(rel->relname, classform->relkind, relkind);
+   if (state->expected_relkind != expected_relkind)
+       DropErrorMsgWrongType(rel->relname, classform->relkind,
+                             state->expected_relkind);
 
    /* Allow DROP to either table owner or schema owner */
    if (!pg_class_ownercheck(relOid, GetUserId()) &&
        !pg_namespace_ownercheck(classform->relnamespace, GetUserId()))
-       aclcheck_error(ACLCHECK_NOT_OWNER, get_relkind_objtype(get_rel_relkind(relOid)),
+       aclcheck_error(ACLCHECK_NOT_OWNER,
+                      get_relkind_objtype(classform->relkind),
                       rel->relname);
 
    /*
@@ -1553,7 +1577,7 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
     * only concerns indexes of toast relations that became invalid during a
     * REINDEX CONCURRENTLY process.
     */
-   if (IsSystemClass(relOid, classform) && relkind == RELKIND_INDEX)
+   if (IsSystemClass(relOid, classform) && classform->relkind == RELKIND_INDEX)
    {
        HeapTuple   locTuple;
        Form_pg_index indexform;
@@ -1589,9 +1613,10 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
     * locking the index.  index_drop() will need this anyway, and since
     * regular queries lock tables before their indexes, we risk deadlock if
     * we do it the other way around.  No error if we don't find a pg_index
-    * entry, though --- the relation may have been dropped.
+    * entry, though --- the relation may have been dropped.  Note that this
+    * code will execute for either plain or partitioned indexes.
     */
-   if ((relkind == RELKIND_INDEX || relkind == RELKIND_PARTITIONED_INDEX) &&
+   if (expected_relkind == RELKIND_INDEX &&
        relOid != oldRelOid)
    {
        state->heapOid = IndexGetRelation(relOid, true);
@@ -1602,7 +1627,7 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
    /*
     * Similarly, if the relation is a partition, we must acquire lock on its
     * parent before locking the partition.  That's because queries lock the
-    * parent before its partitions, so we risk deadlock it we do it the other
+    * parent before its partitions, so we risk deadlock if we do it the other
     * way around.
     */
    if (is_partition && relOid != oldRelOid)
diff --git a/src/test/isolation/expected/partition-drop-index-locking.out b/src/test/isolation/expected/partition-drop-index-locking.out
new file mode 100644 (file)
index 0000000..9acd51d
--- /dev/null
@@ -0,0 +1,100 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1begin s1lock s2begin s2drop s1select s3getlocks s1commit s3getlocks s2commit
+step s1begin: BEGIN;
+step s1lock: LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE;
+step s2begin: BEGIN;
+step s2drop: DROP INDEX part_drop_index_locking_idx; <waiting ...>
+step s1select: SELECT * FROM part_drop_index_locking_subpart_child;
+id
+--
+(0 rows)
+
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                               |relname                                      |mode               |granted
+----------------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking                      |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_idx                  |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_subpart_child        |AccessExclusiveLock|f      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child        |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx1|AccessShareLock    |t      
+(7 rows)
+
+step s1commit: COMMIT;
+step s2drop: <... completed>
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                  |relname                                     |mode               |granted
+---------------------------------------+--------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking                     |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_idx                 |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart             |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_child       |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_child_id_idx|AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_id_idx      |AccessExclusiveLock|t      
+(6 rows)
+
+step s2commit: COMMIT;
+
+starting permutation: s1begin s1lock s2begin s2dropsub s1select s3getlocks s1commit s3getlocks s2commit
+step s1begin: BEGIN;
+step s1lock: LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE;
+step s2begin: BEGIN;
+step s2dropsub: DROP INDEX part_drop_index_locking_subpart_idx; <waiting ...>
+step s1select: SELECT * FROM part_drop_index_locking_subpart_child;
+id
+--
+(0 rows)
+
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                               |relname                                      |mode               |granted
+----------------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart_child        |AccessExclusiveLock|f      
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart_idx          |AccessExclusiveLock|t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child        |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx1|AccessShareLock    |t      
+(6 rows)
+
+step s1commit: COMMIT;
+step s2dropsub: <... completed>
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                          |relname                                      |mode               |granted
+-----------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_child        |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_child_id_idx1|AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_idx          |AccessExclusiveLock|t      
+(4 rows)
+
+step s2commit: COMMIT;
index 0dae483e827ab1d2185e2663c45407d02cc9d2fd..8e87098150112425f74002f6b829f747f882540a 100644 (file)
@@ -90,6 +90,7 @@ test: predicate-hash
 test: predicate-gist
 test: predicate-gin
 test: partition-concurrent-attach
+test: partition-drop-index-locking
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
diff --git a/src/test/isolation/specs/partition-drop-index-locking.spec b/src/test/isolation/specs/partition-drop-index-locking.spec
new file mode 100644 (file)
index 0000000..34e8b52
--- /dev/null
@@ -0,0 +1,47 @@
+# Verify that DROP INDEX properly locks all downward sub-partitions
+# and partitions before locking the indexes.
+
+setup
+{
+  CREATE TABLE part_drop_index_locking (id int) PARTITION BY RANGE(id);
+  CREATE TABLE part_drop_index_locking_subpart PARTITION OF part_drop_index_locking FOR VALUES FROM (1) TO (100) PARTITION BY RANGE(id);
+  CREATE TABLE part_drop_index_locking_subpart_child PARTITION OF part_drop_index_locking_subpart FOR VALUES FROM (1) TO (100);
+  CREATE INDEX part_drop_index_locking_idx ON part_drop_index_locking(id);
+  CREATE INDEX part_drop_index_locking_subpart_idx ON part_drop_index_locking_subpart(id);
+}
+
+teardown
+{
+  DROP TABLE part_drop_index_locking;
+}
+
+# SELECT will take AccessShare lock first on the table and then on its index.
+# We can simulate the case where DROP INDEX starts between those steps
+# by manually taking the table lock beforehand.
+session s1
+step s1begin    { BEGIN; }
+step s1lock     { LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE; }
+step s1select   { SELECT * FROM part_drop_index_locking_subpart_child; }
+step s1commit   { COMMIT; }
+
+session s2
+step s2begin    { BEGIN; }
+step s2drop     { DROP INDEX part_drop_index_locking_idx; }
+step s2dropsub  { DROP INDEX part_drop_index_locking_subpart_idx; }
+step s2commit   { COMMIT; }
+
+session s3
+step s3getlocks {
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+}
+
+# Run DROP INDEX on top partitioned table
+permutation s1begin s1lock s2begin s2drop(s1commit) s1select s3getlocks s1commit s3getlocks s2commit
+
+# Run DROP INDEX on top sub-partition table
+permutation s1begin s1lock s2begin s2dropsub(s1commit) s1select s3getlocks s1commit s3getlocks s2commit