In psql \copy from, send data to server in larger chunks.
author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 14 Jul 2021 10:08:28 +0000 (13:08 +0300)
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 14 Jul 2021 10:08:28 +0000 (13:08 +0300)
Previously, we would send each line as a separate CopyData message.
That's pretty wasteful if the table is narrow, as each CopyData message
has 5 bytes of overhead. For efficiency, buffer up and pack 8 kB of
input data into each CopyData message.

The server also sends each line as a separate CopyData message in COPY TO
STDOUT, and that's similarly wasteful. But that's documented in the FE/BE
protocol description, so changing that would be a wire protocol break.

Reviewed-by: Aleksander Alekseev
Discussion: https://www.postgresql.org/message-id/40b2cec0-d0fb-3191-2ae1-9a3fe16a7e48%40iki.fi

src/bin/psql/copy.c

index e1fee8e0992ec16617d57d4eb4663cb04a54da87..64ab40c4f75feae9a52f81a9d464e9d81ed2dce3 100644 (file)
@@ -581,13 +581,21 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res)
    else
    {
        bool        copydone = false;
+       int         buflen;
+       bool        at_line_begin = true;
 
+       /*
+        * In text mode, we have to read the input one line at a time, so that
+        * we can stop reading at the EOF marker (\.).  We mustn't read beyond
+        * the EOF marker, because if the data was inlined in a SQL script, we
+        * would eat up the commands after the EOF marker.
+        */
+       buflen = 0;
        while (!copydone)
-       {                       /* for each input line ... */
-           bool        firstload;
-           bool        linedone;
+       {
+           char       *fgresult;
 
-           if (showprompt)
+           if (at_line_begin && showprompt)
            {
                const char *prompt = get_prompt(PROMPT_COPY, NULL);
 
@@ -595,63 +603,68 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res)
                fflush(stdout);
            }
 
-           firstload = true;
-           linedone = false;
-
-           while (!linedone)
-           {                   /* for each bufferload in line ... */
-               int         linelen;
-               char       *fgresult;
-
-               /* enable longjmp while waiting for input */
-               sigint_interrupt_enabled = true;
+           /* enable longjmp while waiting for input */
+           sigint_interrupt_enabled = true;
 
-               fgresult = fgets(buf, sizeof(buf), copystream);
+           fgresult = fgets(&buf[buflen], COPYBUFSIZ - buflen, copystream);
 
-               sigint_interrupt_enabled = false;
+           sigint_interrupt_enabled = false;
 
-               if (!fgresult)
-               {
-                   copydone = true;
-                   break;
-               }
+           if (!fgresult)
+               copydone = true;
+           else
+           {
+               int         linelen;
 
-               linelen = strlen(buf);
+               linelen = strlen(fgresult);
+               buflen += linelen;
 
                /* current line is done? */
-               if (linelen > 0 && buf[linelen - 1] == '\n')
-                   linedone = true;
-
-               /* check for EOF marker, but not on a partial line */
-               if (firstload)
+               if (buf[buflen - 1] == '\n')
                {
-                   /*
-                    * This code erroneously assumes '\.' on a line alone
-                    * inside a quoted CSV string terminates the \copy.
-                    * https://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org
-                    */
-                   if (strcmp(buf, "\\.\n") == 0 ||
-                       strcmp(buf, "\\.\r\n") == 0)
+                   /* check for EOF marker, but not on a partial line */
+                   if (at_line_begin)
                    {
-                       copydone = true;
-                       break;
+                       /*
+                        * This code erroneously assumes '\.' on a line alone
+                        * inside a quoted CSV string terminates the \copy.
+                        * https://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org
+                        */
+                       if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) ||
+                           (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0))
+                       {
+                           copydone = true;
+                       }
                    }
 
-                   firstload = false;
+                   if (copystream == pset.cur_cmd_source)
+                   {
+                       pset.lineno++;
+                       pset.stmt_lineno++;
+                   }
+                   at_line_begin = true;
                }
+               else
+                   at_line_begin = false;
+           }
 
-               if (PQputCopyData(conn, buf, linelen) <= 0)
+           /*
+            * If the buffer is full, or we've reached the EOF, flush it.
+            *
+            * Make sure there's always space for four more bytes in the
+            * buffer, plus a NUL terminator.  That way, an EOF marker is
+            * never split across two fgets() calls, which simplifies the logic.
+            */
+           if (buflen >= COPYBUFSIZ - 5 || (copydone && buflen > 0))
+           {
+               if (PQputCopyData(conn, buf, buflen) <= 0)
                {
                    OK = false;
                    copydone = true;
                    break;
                }
-           }
 
-           if (copystream == pset.cur_cmd_source)
-           {
-               pset.lineno++;
-               pset.stmt_lineno++;
+               buflen = 0;
            }
        }
    }