Add general purpose hasing functions to pgbench.
authorTeodor Sigaev <teodor@sigaev.ru>
Wed, 21 Mar 2018 15:01:23 +0000 (18:01 +0300)
committerTeodor Sigaev <teodor@sigaev.ru>
Wed, 21 Mar 2018 15:01:23 +0000 (18:01 +0300)
Hashing function is useful for simulating real-world workload in test like
WEB workload, as an example - YCSB benchmarks.

Author: Ildar Musin with minor editorization by me
Reviewed by: Fabien Coelho, me
Discussion: https://www.postgresql.org/message-id/flat/0e8bd39e-dfcd-2879-f88f-272799ad7ef2@postgrespro.ru

doc/src/sgml/ref/pgbench.sgml
src/bin/pgbench/exprparse.y
src/bin/pgbench/pgbench.c
src/bin/pgbench/pgbench.h
src/bin/pgbench/t/001_pgbench_with_server.pl

index 5f28023799121c3438705b7bf2ae41eb69ef2948..f07ddf1226e842ca1ddc6967318c9b7f5ee0d323 100644 (file)
@@ -874,13 +874,18 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
 
      <tbody>
       <row>
-       <entry> <literal>scale</literal> </entry>
-       <entry>current scale factor</entry>
+       <entry> <literal>client_id</literal> </entry>
+       <entry>unique number identifying the client session (starts from zero)</entry>
       </row>
 
       <row>
-       <entry> <literal>client_id</literal> </entry>
-       <entry>unique number identifying the client session (starts from zero)</entry>
+       <entry> <literal>default_seed</literal> </entry>
+       <entry>seed used in hash functions by default</entry>
+      </row>
+
+      <row>
+       <entry> <literal>scale</literal> </entry>
+       <entry>current scale factor</entry>
       </row>
      </tbody>
     </tgroup>
@@ -1245,6 +1250,27 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
        <entry><literal>greatest(5, 4, 3, 2)</literal></entry>
        <entry><literal>5</literal></entry>
       </row>
+      <row>
+       <entry><literal><function>hash(<replaceable>a</replaceable> [, <replaceable>seed</replaceable> ] )</function></literal></entry>
+       <entry>integer</entry>
+       <entry>alias for <literal>hash_murmur2()</literal></entry>
+       <entry><literal>hash(10, 5432)</literal></entry>
+       <entry><literal>-5817877081768721676</literal></entry>
+      </row>
+      <row>
+       <entry><literal><function>hash_fnv1a(<replaceable>a</replaceable> [, <replaceable>seed</replaceable> ] )</function></literal></entry>
+       <entry>integer</entry>
+       <entry><ulink url="https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function">FNV-1a hash</ulink></entry>
+       <entry><literal>hash_fnv1a(10, 5432)</literal></entry>
+       <entry><literal>-7793829335365542153</literal></entry>
+      </row>
+      <row>
+       <entry><literal><function>hash_murmur2(<replaceable>a</replaceable> [, <replaceable>seed</replaceable> ] )</function></literal></entry>
+       <entry>integer</entry>
+       <entry><ulink url="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash2 hash</ulink></entry>
+       <entry><literal>hash_murmur2(10, 5432)</literal></entry>
+       <entry><literal>-5817877081768721676</literal></entry>
+      </row>
       <row>
        <entry><literal><function>int(<replaceable>x</replaceable>)</function></literal></entry>
        <entry>integer</entry>
@@ -1423,6 +1449,31 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
     </listitem>
    </itemizedlist>
 
+  <para>
+    Hash functions <literal>hash</literal>, <literal>hash_murmur2</literal> and
+    <literal>hash_fnv1a</literal> accept an input value and an optional seed parameter.
+    In case the seed isn't provided the value of <literal>:default_seed</literal>
+    is used, which is initialized randomly unless set by the command-line
+    <literal>-D</literal> option. Hash functions can be used to scatter the
+    distribution of random functions such as <literal>random_zipfian</literal> or
+    <literal>random_exponential</literal>. For instance, the following pgbench
+    script simulates possible real world workload typical for social media and
+    blogging platforms where few accounts generate excessive load:
+
+<programlisting>
+\set r random_zipfian(0, 100000000, 1.07)
+\set k abs(hash(:r)) % 1000000
+</programlisting>
+
+    In some cases several distinct distributions are needed which don't correlate
+    with each other and this is when implicit seed parameter comes in handy:
+
+<programlisting>
+\set k1 abs(hash(:r), :default_seed + 123) % 1000000
+\set k2 abs(hash(:r), :default_seed + 321) % 1000000
+</programlisting>
+  </para>
+
   <para>
    As an example, the full definition of the built-in TPC-B-like
    transaction is:
index e23ca517864ef79395c2ae8fc832237ed2b64a97..8447e14d141215de853058ab038ea07cd8fe0052 100644 (file)
 
 #include "pgbench.h"
 
+#define PGBENCH_NARGS_VARIABLE (-1)
+#define PGBENCH_NARGS_CASE     (-2)
+#define PGBENCH_NARGS_HASH     (-3)
+
 PgBenchExpr *expr_parse_result;
 
 static PgBenchExprList *make_elist(PgBenchExpr *exp, PgBenchExprList *list);
@@ -226,9 +230,13 @@ make_uop(yyscan_t yyscanner, const char *operator, PgBenchExpr *expr)
 /*
  * List of available functions:
  * - fname: function name, "!..." for special internal functions
- * - nargs: number of arguments
- *         -1 is a special value for least & greatest meaning #args >= 1
- *         -2 is for the "CASE WHEN ..." function, which has #args >= 3 and odd
+ * - nargs: number of arguments. Special cases:
+ *         - PGBENCH_NARGS_VARIABLE is a special value for least & greatest
+ *           meaning #args >= 1;
+ *         - PGBENCH_NARGS_CASE is for the "CASE WHEN ..." function, which
+ *           has #args >= 3 and odd;
+ *             - PGBENCH_NARGS_HASH is for hash functions, which have one required
+ *           and one optional argument;
  * - tag: function identifier from PgBenchFunction enum
  */
 static const struct
@@ -259,10 +267,10 @@ static const struct
        "abs", 1, PGBENCH_ABS
    },
    {
-       "least", -1, PGBENCH_LEAST
+       "least", PGBENCH_NARGS_VARIABLE, PGBENCH_LEAST
    },
    {
-       "greatest", -1, PGBENCH_GREATEST
+       "greatest", PGBENCH_NARGS_VARIABLE, PGBENCH_GREATEST
    },
    {
        "debug", 1, PGBENCH_DEBUG
@@ -347,7 +355,25 @@ static const struct
    },
    /* "case when ... then ... else ... end" construction */
    {
-       "!case_end", -2, PGBENCH_CASE
+       "!case_end", PGBENCH_NARGS_CASE, PGBENCH_CASE
+   },
+   {
+       "hash", PGBENCH_NARGS_HASH, PGBENCH_HASH_MURMUR2
+   },
+   {
+       "hash_murmur2", PGBENCH_NARGS_HASH, PGBENCH_HASH_MURMUR2
+   },
+   {
+       "hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
+   },
+   {
+       "hash", PGBENCH_NARGS_HASH, PGBENCH_HASH_MURMUR2
+   },
+   {
+       "hash_murmur2", PGBENCH_NARGS_HASH, PGBENCH_HASH_MURMUR2
+   },
+   {
+       "hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
    },
    /* keep as last array element */
    {
@@ -423,29 +449,51 @@ elist_length(PgBenchExprList *list)
 static PgBenchExpr *
 make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args)
 {
+   int len = elist_length(args);
+
    PgBenchExpr *expr = pg_malloc(sizeof(PgBenchExpr));
 
    Assert(fnumber >= 0);
 
-   if (PGBENCH_FUNCTIONS[fnumber].nargs >= 0 &&
-       PGBENCH_FUNCTIONS[fnumber].nargs != elist_length(args))
-       expr_yyerror_more(yyscanner, "unexpected number of arguments",
-                         PGBENCH_FUNCTIONS[fnumber].fname);
-
-   /* check at least one arg for least & greatest */
-   if (PGBENCH_FUNCTIONS[fnumber].nargs == -1 &&
-       elist_length(args) == 0)
-       expr_yyerror_more(yyscanner, "at least one argument expected",
-                         PGBENCH_FUNCTIONS[fnumber].fname);
-   /* special case: case (when ... then ...)+ (else ...)? end */
-   if (PGBENCH_FUNCTIONS[fnumber].nargs == -2)
-   {
-       int len = elist_length(args);
-
-       /* 'else' branch is always present, but could be a NULL-constant */
-       if (len < 3 || len % 2 != 1)
-           expr_yyerror_more(yyscanner, "odd and >= 3 number of arguments expected",
-                             "case control structure");
+   /* validate arguments number including few special cases */
+   switch (PGBENCH_FUNCTIONS[fnumber].nargs)
+   {
+       /* check at least one arg for least & greatest */
+       case PGBENCH_NARGS_VARIABLE:
+           if (len == 0)
+               expr_yyerror_more(yyscanner, "at least one argument expected",
+                                 PGBENCH_FUNCTIONS[fnumber].fname);
+           break;
+
+       /* case (when ... then ...)+ (else ...)? end */
+       case PGBENCH_NARGS_CASE:
+           /* 'else' branch is always present, but could be a NULL-constant */
+           if (len < 3 || len % 2 != 1)
+               expr_yyerror_more(yyscanner,
+                                 "odd and >= 3 number of arguments expected",
+                                 "case control structure");
+           break;
+
+       /* hash functions with optional seed argument */
+       case PGBENCH_NARGS_HASH:
+           if (len > 2)
+               expr_yyerror_more(yyscanner, "unexpected number of arguments",
+                                 PGBENCH_FUNCTIONS[fnumber].fname);
+
+           if (len == 1)
+           {
+               PgBenchExpr *var = make_variable("default_seed");
+               args = make_elist(var, args);
+           }
+           break;
+
+       /* common case: positive arguments number */
+       default:
+           Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0);
+
+           if (PGBENCH_FUNCTIONS[fnumber].nargs != len)
+               expr_yyerror_more(yyscanner, "unexpected number of arguments",
+                                 PGBENCH_FUNCTIONS[fnumber].fname);
    }
 
    expr->etype = ENODE_FUNCTION;
index 29d69de4d13cfd43116747fee9907d3d3eb4ccec..a15aa06b194a592a960db0861c24c750e7bb0c19 100644 (file)
 
 #define ERRCODE_UNDEFINED_TABLE  "42P01"
 
+/*
+ * Hashing constants
+ */
+#define FNV_PRIME 0x100000001b3
+#define FNV_OFFSET_BASIS 0xcbf29ce484222325
+#define MM2_MUL 0xc6a4a7935bd1e995
+#define MM2_ROT 47
+
 /*
  * Multi-platform pthread implementations
  */
@@ -915,6 +923,54 @@ getZipfianRand(TState *thread, int64 min, int64 max, double s)
                      : computeHarmonicZipfian(thread, n, s));
 }
 
+/*
+ * FNV-1a hash function
+ */
+static int64
+getHashFnv1a(int64 val, uint64 seed)
+{
+   int64   result;
+   int     i;
+
+   result = FNV_OFFSET_BASIS ^ seed;
+   for (i = 0; i < 8; ++i)
+   {
+       int32 octet = val & 0xff;
+
+       val = val >> 8;
+       result = result ^ octet;
+       result = result * FNV_PRIME;
+   }
+
+   return result;
+}
+
+/*
+ * Murmur2 hash function
+ *
+ * Based on original work of Austin Appleby
+ * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
+ */
+static int64
+getHashMurmur2(int64 val, uint64 seed)
+{
+   uint64  result = seed ^ (sizeof(int64) * MM2_MUL);
+   uint64  k = (uint64) val;
+
+   k *= MM2_MUL;
+   k ^= k >> MM2_ROT;
+   k *= MM2_MUL;
+
+   result ^= k;
+   result *= MM2_MUL;
+
+   result ^= result >> MM2_ROT;
+   result *= MM2_MUL;
+   result ^= result >> MM2_ROT;
+
+   return (int64) result;
+}
+
 /*
  * Initialize the given SimpleStats struct to all zeroes
  */
@@ -2211,6 +2267,30 @@ evalStandardFunc(TState *thread, CState *st,
                return true;
            }
 
+           /* hashing */
+       case PGBENCH_HASH_FNV1A:
+       case PGBENCH_HASH_MURMUR2:
+           {
+               int64   val,
+                       seed;
+
+               Assert(nargs == 2);
+
+               if (!coerceToInt(&vargs[0], &val) ||
+                   !coerceToInt(&vargs[1], &seed))
+                   return false;
+
+               if (func == PGBENCH_HASH_MURMUR2)
+                   setIntValue(retval, getHashMurmur2(val, seed));
+               else if (func == PGBENCH_HASH_FNV1A)
+                   setIntValue(retval, getHashFnv1a(val, seed));
+               else
+                   /* cannot get here */
+                   Assert(0);
+
+               return true;
+           }
+
        default:
            /* cannot get here */
            Assert(0);
@@ -4963,6 +5043,10 @@ main(int argc, char **argv)
        exit(1);
    }
 
+   /* set random seed */
+   INSTR_TIME_SET_CURRENT(start_time);
+   srandom((unsigned int) INSTR_TIME_GET_MICROSEC(start_time));
+
    if (internal_script_used)
    {
        /*
@@ -5024,6 +5108,19 @@ main(int argc, char **argv)
        }
    }
 
+   /* set default seed for hash functions */
+   if (lookupVariable(&state[0], "default_seed") == NULL)
+   {
+       uint64  seed = ((uint64) (random() & 0xFFFF) << 48) |
+                      ((uint64) (random() & 0xFFFF) << 32) |
+                      ((uint64) (random() & 0xFFFF) << 16) |
+                      (uint64) (random() & 0xFFFF);
+
+       for (i = 0; i < nclients; i++)
+           if (!putVariableInt(&state[i], "startup", "default_seed", (int64) seed))
+               exit(1);
+   }
+
    if (!is_no_vacuum)
    {
        fprintf(stderr, "starting vacuum...");
@@ -5041,10 +5138,6 @@ main(int argc, char **argv)
    }
    PQfinish(con);
 
-   /* set random seed */
-   INSTR_TIME_SET_CURRENT(start_time);
-   srandom((unsigned int) INSTR_TIME_GET_MICROSEC(start_time));
-
    /* set up thread data structures */
    threads = (TState *) pg_malloc(sizeof(TState) * nthreads);
    nclients_dealt = 0;
index 0705ccdf0d66f79a0a2d64ce18d5ed82caae1035..6983865b9258d7239a030533a248b59570c2ab3a 100644 (file)
@@ -97,7 +97,9 @@ typedef enum PgBenchFunction
    PGBENCH_LE,
    PGBENCH_LT,
    PGBENCH_IS,
-   PGBENCH_CASE
+   PGBENCH_CASE,
+   PGBENCH_HASH_FNV1A,
+   PGBENCH_HASH_MURMUR2
 } PgBenchFunction;
 
 typedef struct PgBenchExpr PgBenchExpr;
index 0c23d2fc58358fa3021a016469e2c8310117692c..50cbb23f20bd060d1c6cfbbbb1058450aa2a0f36 100644 (file)
@@ -259,6 +259,11 @@ pgbench(
        qr{command=46.: int 46\b},
        qr{command=47.: boolean true\b},
        qr{command=48.: boolean true\b},
+       qr{command=49.: int -5817877081768721676\b},
+       qr{command=50.: boolean true\b},
+       qr{command=51.: int -7793829335365542153\b},
+       qr{command=52.: int -?\d+\b},
+       qr{command=53.: boolean true\b},
    ],
    'pgbench expressions',
    {   '001_pgbench_expressions' => q{-- integer functions
@@ -327,6 +332,12 @@ pgbench(
 \set n6 debug(:n IS NULL AND NOT :f AND :t)
 -- conditional truth
 \set cs debug(CASE WHEN 1 THEN TRUE END AND CASE WHEN 1.0 THEN TRUE END AND CASE WHEN :n THEN NULL ELSE TRUE END)
+-- hash functions
+\set h0 debug(hash(10, 5432))
+\set h1 debug(:h0 = hash_murmur2(10, 5432))
+\set h3 debug(hash_fnv1a(10, 5432))
+\set h4 debug(hash(10))
+\set h5 debug(hash(10) = hash(10, :default_seed))
 -- lazy evaluation
 \set zy 0
 \set yz debug(case when :zy = 0 then -1 else (1 / :zy) end)