#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "utils/memdebug.h"
#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
}
/* trade in our read lock for a write lock */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(metabuf, BT_WRITE);
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
START_CRIT_SECTION();
}
/* trade in our read lock for a write lock */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(metabuf, BT_WRITE);
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
/*
 * Race condition: if someone else initialized the metadata between
 * the time we released the read lock and acquired the write lock, we
 * must avoid doing it again.
 */
/*
 * swap root write lock for read lock.  There is no danger of anyone
 * else accessing the new root page while it's unlocked, since no one
 * else knows where it is yet.
 */
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_READ);
+ _bt_unlockbuf(rel, rootbuf);
+ _bt_lockbuf(rel, rootbuf, BT_READ);
/* okay, metadata is correct, release lock on it without caching */
_bt_relbuf(rel, metabuf);
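The unlock-then-relock sequences above are the standard nbtree idiom for upgrading a buffer lock: buffer LWLocks cannot be upgraded in place, so the read lock is dropped, the write lock is acquired, and the page state is re-verified because another backend may have changed it in between. A minimal sketch of the idiom using the new wrappers (the recheck predicate is hypothetical):

    /* Trade in our read lock for a write lock. */
    _bt_unlockbuf(rel, metabuf);
    _bt_lockbuf(rel, metabuf, BT_WRITE);

    /*
     * Re-verify: another backend may have done the work while no lock
     * was held (metadata_already_initialized() is a hypothetical check).
     */
    if (metadata_already_initialized(BufferGetPage(metabuf)))
    {
        _bt_relbuf(rel, metabuf);
        return;                 /* nothing left for us to do */
    }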
* blkno == P_NEW means to get an unallocated index page. The page
* will be initialized before returning it.
*
+ * The general rule in nbtree is that it's never okay to access a
+ * page without holding both a buffer pin and a buffer lock on
+ * the page's buffer.
+ *
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned"). Also, we apply
- * _bt_checkpage to sanity-check the page (except in P_NEW case).
+ * _bt_checkpage to sanity-check the page (except in P_NEW case),
+ * and perform Valgrind client requests that help Valgrind detect
+ * unsafe page accesses.
+ *
+ * Note: raw LockBuffer() calls are disallowed in nbtree; all
+ * buffer lock requests need to go through wrapper functions such
+ * as _bt_lockbuf().
*/
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
/* Read an existing block of the relation */
buf = ReadBuffer(rel, blkno);
- LockBuffer(buf, access);
+ _bt_lockbuf(rel, buf, access);
_bt_checkpage(rel, buf);
}
else
if (blkno == InvalidBlockNumber)
break;
buf = ReadBuffer(rel, blkno);
- if (ConditionalLockBuffer(buf))
+ if (_bt_conditionallockbuf(rel, buf))
{
page = BufferGetPage(buf);
if (_bt_page_recyclable(page))
buf = ReadBuffer(rel, P_NEW);
/* Acquire buffer lock on new page */
- LockBuffer(buf, BT_WRITE);
+ _bt_lockbuf(rel, buf, BT_WRITE);
/*
* Release the file-extension lock; it's now OK for someone else to
Assert(blkno != P_NEW);
if (BufferIsValid(obuf))
- LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, obuf);
buf = ReleaseAndReadBuffer(obuf, rel, blkno);
- LockBuffer(buf, access);
+ _bt_lockbuf(rel, buf, access);
+
_bt_checkpage(rel, buf);
return buf;
}
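With these wrappers in place, every nbtree page access follows one canonical shape: pin the buffer, lock it through a wrapper, use the page, then unlock and unpin. A minimal sketch of a read-only access, assuming rel is an open nbtree index and blkno is a valid existing block:

    Buffer      buf;
    Page        page;

    buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin + lock + _bt_checkpage() */
    page = BufferGetPage(buf);

    /* ... read the page while both pin and lock are held ... */

    _bt_relbuf(rel, buf);   /* _bt_unlockbuf() followed by ReleaseBuffer() */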
void
_bt_relbuf(Relation rel, Buffer buf)
{
- UnlockReleaseBuffer(buf);
+ _bt_unlockbuf(rel, buf);
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin. This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, access);
+
+ /*
+ * It doesn't matter that _bt_unlockbuf() won't get called in the
+ * event of an nbtree error (e.g. a unique violation error). That
+ * won't cause Valgrind false positives.
+ *
+ * The nbtree client requests are superimposed on top of the
+ * bufmgr.c buffer pin client requests. In the event of an nbtree
+ * error the buffer will certainly get marked as defined when the
+ * backend once again acquires its first pin on the buffer. (Of
+ * course, if the backend never touches the buffer again then it
+ * doesn't matter that it remains non-accessible to Valgrind.)
+ *
+ * Note: When an IndexTuple C pointer gets computed using an
+ * ItemId read from a page while a lock was held, the C pointer
+ * becomes unsafe to dereference forever as soon as the lock is
+ * released. Valgrind can only detect cases where the pointer
+ * gets dereferenced with no _current_ lock/pin held, though.
+ */
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
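The note above about IndexTuple pointers is worth spelling out: a pointer computed from the page image is only safe to dereference while the buffer remains locked, so a caller that needs the tuple after unlocking must copy it first. A sketch, assuming buf is locked and offnum points at a live item:

    ItemId      itemid = PageGetItemId(page, offnum);
    IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

    /* Copy the tuple while the buffer lock is still held. */
    IndexTuple  itupcopy = CopyIndexTuple(itup);

    _bt_unlockbuf(rel, buf);

    /*
     * 'itup' must not be dereferenced from here on; under Valgrind the
     * page is now marked NOACCESS, so a stray dereference gets reported.
     * 'itupcopy' is palloc'd and remains valid (pfree() it when done).
     */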
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
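For readers unfamiliar with the memcheck client requests paired across these two functions: VALGRIND_MAKE_MEM_NOACCESS() makes a range trap on any subsequent access, and VALGRIND_MAKE_MEM_DEFINED() lifts that again. A self-contained sketch of the same pairing outside PostgreSQL (the macros expand to no-ops when the program is not run under Valgrind):

    #include <stdlib.h>
    #include <valgrind/memcheck.h>

    int
    main(void)
    {
        char   *p = malloc(8192);

        p[0] = 'x';                             /* fine: addressable and defined */
        VALGRIND_MAKE_MEM_NOACCESS(p, 8192);    /* the "unlock" side */
        /* reading or writing p[0] here would be reported as invalid */
        VALGRIND_MAKE_MEM_DEFINED(p, 8192);     /* the "lock" side */
        p[0] = 'y';                             /* fine again */

        free(p);
        return 0;
    }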
+
+/*
+ * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ * buffer.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+ /* ConditionalLockBuffer() asserts that pin is held by this backend */
+ if (!ConditionalLockBuffer(buf))
+ return false;
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ return true;
+}
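The conditional variant exists for the free-page search in _bt_getbuf() shown earlier: if another backend already holds the lock, the candidate page is skipped rather than waited on. A condensed sketch of that loop body (WAL and error handling elided):

    buf = ReadBuffer(rel, blkno);
    if (_bt_conditionallockbuf(rel, buf))
    {
        page = BufferGetPage(buf);
        if (_bt_page_recyclable(page))
        {
            /* Okay to use the page.  Re-initialize and return it. */
            _bt_pageinit(page, BufferGetPageSize(buf));
            return buf;
        }
        _bt_unlockbuf(rel, buf);    /* locked it, but page is not recyclable */
    }
    ReleaseBuffer(buf);             /* in either case, drop the pin and move on */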
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
}
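A cleanup (super-exclusive) lock additionally waits until no other backend holds a pin on the buffer, which VACUUM needs before removing tuples that a concurrent index scan might still hold pointers into. A sketch of the caller's side, assuming buf is already pinned and write-locked; because the lock is briefly dropped inside the upgrade, page state must be re-verified afterwards (the recheck shown is hypothetical):

    _bt_upgradelockbufcleanup(rel, buf);
    page = BufferGetPage(buf);

    if (!page_still_needs_vacuum(page))     /* hypothetical recheck */
        return;

    /* No other backend holds a pin now: safe to remove index tuples. */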
/*
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
- LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, leafbuf);
/*
* Check that the left sibling of leafbuf (if any) is not
* (Page deletion can cope with the stack being to the left of
* leafbuf, but not to the right of leafbuf.)
*/
- LockBuffer(leafbuf, BT_WRITE);
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
continue;
}
leafleftsib = opaque->btpo_prev;
leafrightsib = opaque->btpo_next;
- LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, leafbuf);
/*
* Check here, as calling loops will have locks held, preventing
* To avoid deadlocks, we'd better drop the target page lock before
* going further.
*/
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, buf);
}
/*
* table.)
*/
if (target != leafblkno)
- LockBuffer(leafbuf, BT_WRITE);
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
if (leftsib != P_NONE)
{
lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
* rather than a superexclusive lock, since no scan will stop on an empty
* page.
*/
- LockBuffer(buf, BT_WRITE);
+ _bt_lockbuf(rel, buf, BT_WRITE);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
- LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, sp->buf);
if (IsMVCCSnapshot(scan->xs_snapshot) &&
RelationNeedsWAL(scan->indexRelation) &&
if (access == BT_WRITE && page_access == BT_READ)
{
/* trade in our read lock for a write lock */
- LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
- LockBuffer(*bufP, BT_WRITE);
+ _bt_unlockbuf(rel, *bufP);
+ _bt_lockbuf(rel, *bufP, BT_WRITE);
/*
- * If the page was split between the time that we surrendered our read
- * lock and acquired our write lock, then this page may no longer be
- * the right place for the key we want to insert. In this case, we
- * need to move right in the tree.
+ * Race -- the leaf page may have split after we dropped the read lock
+ * but before we acquired a write lock. If it has, we may need to
+ * move right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
snapshot);
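_bt_moveright() is what resolves the race: a page split moves the upper half of a page's keys to a new right sibling, so the routine keeps stepping right while the search key is at or beyond the current page's high key. Roughly, its core loop looks like this (a simplified sketch that ignores incomplete splits and dead pages):

    int     cmpval = key->nextkey ? 0 : 1;

    while (!P_RIGHTMOST(opaque) &&
           _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
    {
        /* step right one page, releasing the current one */
        buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }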
/* upgrade our lock if necessary */
if (access == BT_READ)
{
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockBuffer(buf, BT_WRITE);
+ _bt_unlockbuf(rel, buf);
+ _bt_lockbuf(rel, buf, BT_WRITE);
}
if (P_INCOMPLETE_SPLIT(opaque))
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
- LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}
* deleted.
*/
if (BTScanPosIsPinned(so->currPos))
- LockBuffer(so->currPos.buf, BT_READ);
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
else
so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
- LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}