Teach SLRU code to avoid replacing I/O-busy pages.
author: Robert Haas <rhaas@postgresql.org>
Mon, 9 Apr 2012 03:04:07 +0000 (23:04 -0400)
committer: Robert Haas <rhaas@postgresql.org>
Mon, 9 Apr 2012 03:05:55 +0000 (23:05 -0400)
Patch by me; review by Tom Lane and others.

src/backend/access/transam/slru.c

index 3049e01e1cd3063bccdfce26e67402dc6872db3e..a8e3f19119a568858e8107bf53196bffa5cfa2c5 100644 (file)
@@ -403,12 +403,6 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
        /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
        LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
 
-       /*
-        * Temporarily mark page as recently-used to discourage
-        * SlruSelectLRUPage from selecting it again for someone else.
-        */
-       SlruRecentlyUsed(shared, slotno);
-
        /* Release control lock while doing I/O */
        LWLockRelease(shared->ControlLock);
 
@@ -909,9 +903,12 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
    {
        int         slotno;
        int         cur_count;
-       int         bestslot;
-       int         best_delta;
-       int         best_page_number;
+       int         bestvalidslot = 0;              /* keep compiler quiet */
+       int         best_valid_delta = -1;
+       int         best_valid_page_number = 0;     /* keep compiler quiet */
+       int         bestinvalidslot = 0;            /* keep compiler quiet */
+       int         best_invalid_delta = -1;
+       int         best_invalid_page_number = 0;   /* keep compiler quiet */
 
        /* See if page already has a buffer assigned */
        for (slotno = 0; slotno < shared->num_slots; slotno++)
@@ -922,8 +919,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
        }
 
        /*
-        * If we find any EMPTY slot, just select that one. Else locate the
-        * least-recently-used slot to replace.
+        * If we find any EMPTY slot, just select that one. Else choose a
+        * victim page to replace.  We normally take the least recently used
+        * valid page, but we will never take the slot containing
+        * latest_page_number, even if it appears least recently used.  We
+        * will select a slot that is already I/O busy only if there is no
+        * other choice: a read-busy slot will not be least recently used once
+        * the read finishes, and waiting for an I/O on a write-busy slot is
+        * inferior to just picking some other slot.  Testing shows the slot
+        * we pick instead will often be clean, allowing us to begin a read
+        * at once.
         *
         * Normally the page_lru_count values will all be different and so
         * there will be a well-defined LRU page.  But since we allow
@@ -932,9 +937,6 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
         * acquire the same lru_count values.  In that case we break ties by
         * choosing the furthest-back page.
         *
-        * In no case will we select the slot containing latest_page_number
-        * for replacement, even if it appears least recently used.
-        *
         * Notice that this next line forcibly advances cur_lru_count to a
         * value that is certainly beyond any value that will be in the
         * page_lru_count array after the loop finishes.  This ensures that
@@ -944,9 +946,6 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
         * multiple pages with the same lru_count.
         */
        cur_count = (shared->cur_lru_count)++;
-       best_delta = -1;
-       bestslot = 0;           /* no-op, just keeps compiler quiet */
-       best_page_number = 0;   /* ditto */
        for (slotno = 0; slotno < shared->num_slots; slotno++)
        {
            int         this_delta;
@@ -968,34 +967,57 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
                this_delta = 0;
            }
            this_page_number = shared->page_number[slotno];
-           if ((this_delta > best_delta ||
-                (this_delta == best_delta &&
-                 ctl->PagePrecedes(this_page_number, best_page_number))) &&
-               this_page_number != shared->latest_page_number)
+           if (this_page_number == shared->latest_page_number)
+               continue;
+           if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+           {
+               if (this_delta > best_valid_delta ||
+                   (this_delta == best_valid_delta &&
+                    ctl->PagePrecedes(this_page_number,
+                                      best_valid_page_number)))
+               {
+                   bestvalidslot = slotno;
+                   best_valid_delta = this_delta;
+                   best_valid_page_number = this_page_number;
+               }
+           }
+           else
            {
-               bestslot = slotno;
-               best_delta = this_delta;
-               best_page_number = this_page_number;
+               if (this_delta > best_invalid_delta ||
+                   (this_delta == best_invalid_delta &&
+                    ctl->PagePrecedes(this_page_number,
+                                      best_invalid_page_number)))
+               {
+                   bestinvalidslot = slotno;
+                   best_invalid_delta = this_delta;
+                   best_invalid_page_number = this_page_number;
+               }
            }
        }
 
+       /*
+        * If all pages (except possibly the latest one) are I/O busy, we'll
+        * have to wait for an I/O to complete and then retry.  In that unhappy
+        * case, we choose to wait for the I/O on the least recently used slot,
+        * on the assumption that it was likely initiated first of all the I/Os
+        * in progress and may therefore finish first.
+        */
+       if (best_valid_delta < 0)
+       {
+           SimpleLruWaitIO(ctl, bestinvalidslot);
+           continue;
+       }
+
        /*
         * If the selected page is clean, we're set.
         */
-       if (shared->page_status[bestslot] == SLRU_PAGE_VALID &&
-           !shared->page_dirty[bestslot])
-           return bestslot;
+       if (!shared->page_dirty[bestvalidslot])
+           return bestvalidslot;
 
        /*
-        * We need to wait for I/O.  Normal case is that it's dirty and we
-        * must initiate a write, but it's possible that the page is already
-        * write-busy, or in the worst case still read-busy.  In those cases
-        * we wait for the existing I/O to complete.
+        * Write the page.
         */
-       if (shared->page_status[bestslot] == SLRU_PAGE_VALID)
-           SlruInternalWritePage(ctl, bestslot, NULL);
-       else
-           SimpleLruWaitIO(ctl, bestslot);
+       SlruInternalWritePage(ctl, bestvalidslot, NULL);
 
        /*
         * Now loop back and try again.  This is the easiest way of dealing