Support long distance matching for zstd compression
authorTomas Vondra <tomas.vondra@postgresql.org>
Thu, 6 Apr 2023 15:18:38 +0000 (17:18 +0200)
committerTomas Vondra <tomas.vondra@postgresql.org>
Thu, 6 Apr 2023 15:18:42 +0000 (17:18 +0200)
zstd compression supports a special mode for finding matches in the distant
past, which may result in a better compression ratio, at the expense of
using more memory (the window size is 128MB).

To enable this optional mode, use the "long" keyword when specifying the
compression method (--compress=zstd:long).

Author: Justin Pryzby
Reviewed-by: Tomas Vondra, Jacob Champion
Discussion: https://postgr.es/m/20230224191840.GD1653@telsasoft.com
Discussion: https://postgr.es/m/20220327205020.GM28503@telsasoft.com

12 files changed:
doc/src/sgml/protocol.sgml
doc/src/sgml/ref/pg_basebackup.sgml
doc/src/sgml/ref/pg_dump.sgml
src/backend/backup/basebackup_zstd.c
src/bin/pg_basebackup/bbstreamer_zstd.c
src/bin/pg_basebackup/t/010_pg_basebackup.pl
src/bin/pg_dump/compress_zstd.c
src/bin/pg_dump/t/002_pg_dump.pl
src/bin/pg_verifybackup/t/008_untar.pl
src/bin/pg_verifybackup/t/010_client_untar.pl
src/common/compression.c
src/include/common/compression.h

index 8b5e7b1ad7f21efd4f31779a83045e5da09bd009..b11d9a6ba355e66184d61e02d90ebeaa8907c5a5 100644 (file)
@@ -2729,7 +2729,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
            level.  Otherwise, it should be a comma-separated list of items,
            each of the form <replaceable>keyword</replaceable> or
            <replaceable>keyword=value</replaceable>. Currently, the supported
-           keywords are <literal>level</literal> and <literal>workers</literal>.
+           keywords are <literal>level</literal>, <literal>long</literal>, and
+           <literal>workers</literal>.
           </para>
 
           <para>
@@ -2746,6 +2747,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
            <literal>3</literal>).
           </para>
 
+          <para>
+           The <literal>long</literal> keyword enables long-distance matching
+           mode, for improved compression ratio, at the expense of higher memory
+           use.  Long-distance mode is supported only for
+           <literal>zstd</literal>.
+          </para>
+
           <para>
            The <literal>workers</literal> keyword sets the number of threads
            that should be used for parallel compression. Parallel compression
index db3ad9cd5eba8781bbe18beb196971e623512e77..79d3e657c32c245f9825c602d2760c0c3f49d9e1 100644 (file)
@@ -424,8 +424,8 @@ PostgreSQL documentation
         level.  Otherwise, it should be a comma-separated list of items,
         each of the form <literal>keyword</literal> or
         <literal>keyword=value</literal>.
-        Currently, the supported keywords are <literal>level</literal>
-        and <literal>workers</literal>.
+        Currently, the supported keywords are <literal>level</literal>,
+        <literal>long</literal>, and <literal>workers</literal>.
         The detail string cannot be used when the compression method
         is specified as a plain integer.
        </para>
index 8de38e0fd0dec3375e2d7391742c530c0c6d2e58..e81e35c13b35217d91b3f3819127af3f040db699 100644 (file)
@@ -681,6 +681,8 @@ PostgreSQL documentation
         as though it had been fed through <application>gzip</application>,
         <application>lz4</application>, or <application>zstd</application>;
         but the default is not to compress.
+        With zstd compression, <literal>long</literal> mode may improve the
+        compression ratio, at the cost of increased memory use.
        </para>
        <para>
         The tar archive format currently does not support compression at all.
index ac6cac178a00a54bdeb6b97d17e733350542c8b6..1bb5820c884f22fa57d14ee677d88eba75fb870a 100644 (file)
@@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
                           compress->workers, ZSTD_getErrorName(ret)));
    }
 
+   if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+   {
+       ret = ZSTD_CCtx_setParameter(mysink->cctx,
+                                    ZSTD_c_enableLongDistanceMatching,
+                                    compress->long_distance);
+       if (ZSTD_isError(ret))
+           ereport(ERROR,
+                   errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                   errmsg("could not set compression flag for %s: %s",
+                          "long", ZSTD_getErrorName(ret)));
+   }
+
    /*
     * We need our own buffer, because we're going to pass different data to
     * the next sink than what gets passed to us.
index fe17d6df4ef78c2034bb161fb952f8fe9ab45d3f..fba391e2a0f0c4eb35458ccb44e2511a3936f9af 100644 (file)
@@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
                     compress->workers, ZSTD_getErrorName(ret));
    }
 
+   if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
+   {
+       ret = ZSTD_CCtx_setParameter(streamer->cctx,
+                                    ZSTD_c_enableLongDistanceMatching,
+                                    compress->long_distance);
+       if (ZSTD_isError(ret))
+       {
+           pg_log_error("could not set compression flag for %s: %s",
+                        "long", ZSTD_getErrorName(ret));
+           exit(1);
+       }
+   }
+
    /* Initialize the ZSTD output buffer. */
    streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
    streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;
index b60cb78a0d57dff4c5d97ca254c55efffc8faae5..4d130a7f9446dfb34fc84b49d4913af81ca7686e 100644 (file)
@@ -139,7 +139,14 @@ SKIP:
            'gzip:workers=3',
            'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
            'failure on worker count for gzip'
-       ],);
+       ],
+       [
+           'gzip:long',
+           'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
+           'failure on long mode for gzip'
+       ],
+   );
+
    for my $cft (@compression_failure_tests)
    {
        my $cfail = quotemeta($client_fails . $cft->[1]);
index aa16822dffa0d0fbda1522f6869f533346742bbc..001b4f15130642266b4f75c32f598a31e99b654c 100644 (file)
@@ -80,6 +80,11 @@ _ZstdCStreamParams(pg_compress_specification compress)
    _Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
                               compress.level, "level");
 
+   if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
+       _Zstd_CCtx_setParam_or_die(cstream,
+                                 ZSTD_c_enableLongDistanceMatching,
+                                 compress.long_distance, "long");
+
    return cstream;
 }
 
index b5c97694e32cb1e45f354187d9bd7bccd011f726..93e24d5145727c9027285c146703586e0bbc0dc6 100644 (file)
@@ -267,11 +267,12 @@ my %pgdump_runs = (
        ],
    },
 
+   # Exercise long mode for test coverage
    compression_zstd_plain => {
        test_key       => 'compression',
        compile_option => 'zstd',
        dump_cmd       => [
-           'pg_dump', '--format=plain', '--compress=zstd',
+           'pg_dump', '--format=plain', '--compress=zstd:long',
            "--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
        ],
        # Decompress the generated file to run through the tests.
index 3007bbe8556a862e1869f6cb677d1c422cd28fdb..05754bc8ec72042dfa6d63f6b5dbfb0a97df6e3c 100644 (file)
@@ -49,6 +49,14 @@ my @test_configuration = (
        'decompress_program' => $ENV{'ZSTD'},
        'decompress_flags'   => ['-d'],
        'enabled'            => check_pg_config("#define USE_ZSTD 1")
+   },
+   {
+       'compression_method' => 'zstd',
+       'backup_flags'       => [ '--compress', 'server-zstd:level=1,long' ],
+       'backup_archive'     => 'base.tar.zst',
+       'decompress_program' => $ENV{'ZSTD'},
+       'decompress_flags'   => ['-d'],
+       'enabled'            => check_pg_config("#define USE_ZSTD 1")
    });
 
 for my $tc (@test_configuration)
index f3aa0f59e29a99ce15771b20f6a4b76ad9311d01..ac51a174d14edfe51fa247ba85c3ba4095a435b0 100644 (file)
@@ -50,6 +50,14 @@ my @test_configuration = (
        'decompress_flags'   => ['-d'],
        'enabled'            => check_pg_config("#define USE_ZSTD 1")
    },
+   {
+       'compression_method' => 'zstd',
+       'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
+       'backup_archive' => 'base.tar.zst',
+       'decompress_program' => $ENV{'ZSTD'},
+       'decompress_flags' => [ '-d' ],
+       'enabled' => check_pg_config("#define USE_ZSTD 1")
+   },
    {
        'compression_method' => 'parallel zstd',
        'backup_flags'       => [ '--compress', 'client-zstd:workers=3' ],
index 2d3e56b4d62d0dd584b7d2ddeb0dec3e5550c453..35a7cade645cc7d09fde1a858966d3b87a9c57c7 100644 (file)
@@ -12,7 +12,7 @@
  * Otherwise, a compression specification is a comma-separated list of items,
  * each having the form keyword or keyword=value.
  *
- * Currently, the only supported keywords are "level" and "workers".
+ * Currently, the supported keywords are "level", "long", and "workers".
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  *
@@ -38,6 +38,8 @@
 
 static int expect_integer_value(char *keyword, char *value,
                                 pg_compress_specification *result);
+static bool expect_boolean_value(char *keyword, char *value,
+                                pg_compress_specification *result);
 
 /*
  * Look up a compression algorithm by name. Returns true and sets *algorithm
@@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
            result->workers = expect_integer_value(keyword, value, result);
            result->options |= PG_COMPRESSION_OPTION_WORKERS;
        }
+       else if (strcmp(keyword, "long") == 0)
+       {
+           result->long_distance = expect_boolean_value(keyword, value, result);
+           result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
+       }
        else
            result->parse_error =
                psprintf(_("unrecognized compression option: \"%s\""), keyword);
@@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
    return ivalue;
 }
 
+/*
+ * Parse 'value' as a boolean and return the result.
+ *
+ * If parsing fails, set result->parse_error to an appropriate message
+ * and return false.  The caller must check result->parse_error to determine
+ * if the call was successful.
+ *
+ * Valid values are: yes, no, on, off, 1, 0.  A NULL value returns true.
+ *
+ * Inspired by ParseVariableBool().
+ */
+static bool
+expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
+{
+   if (value == NULL)
+       return true;
+
+   if (pg_strcasecmp(value, "yes") == 0)
+       return true;
+   if (pg_strcasecmp(value, "on") == 0)
+       return true;
+   if (pg_strcasecmp(value, "1") == 0)
+       return true;
+
+   if (pg_strcasecmp(value, "no") == 0)
+       return false;
+   if (pg_strcasecmp(value, "off") == 0)
+       return false;
+   if (pg_strcasecmp(value, "0") == 0)
+       return false;
+
+   result->parse_error =
+       psprintf(_("value for compression option \"%s\" must be a boolean"),
+                keyword);
+   return false;
+}
+
 /*
  * Returns NULL if the compression specification string was syntactically
  * valid and semantically sensible.  Otherwise, returns an error message.
@@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
                        get_compress_algorithm_name(spec->algorithm));
    }
 
+   /*
+    * Of the compression algorithms that we currently support, only zstd
+    * supports long-distance mode.
+    */
+   if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
+       (spec->algorithm != PG_COMPRESSION_ZSTD))
+   {
+       return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
+                       get_compress_algorithm_name(spec->algorithm));
+   }
+
    return NULL;
 }
 
index b48c173022e0507f551753535a85a1fce7453727..38aae9dd8739453002c7ab2f44ee7940be20687d 100644 (file)
@@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
 } pg_compress_algorithm;
 
 #define PG_COMPRESSION_OPTION_WORKERS      (1 << 0)
+#define PG_COMPRESSION_OPTION_LONG_DISTANCE    (1 << 1)
 
 typedef struct pg_compress_specification
 {
@@ -34,6 +35,7 @@ typedef struct pg_compress_specification
    unsigned    options;        /* OR of PG_COMPRESSION_OPTION constants */
    int         level;
    int         workers;
+   bool        long_distance;
    char       *parse_error;    /* NULL if parsing was OK, else message */
 } pg_compress_specification;