Add Bloom filter implementation.

author Andres Freund <andres@anarazel.de>

Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)

committer Andres Freund <andres@anarazel.de>

Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)
author Andres Freund <andres@anarazel.de>
Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)
committer Andres Freund <andres@anarazel.de>
Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile

index d1fefe43f2a19723f640389617ac6b625f08442d..191ea9bca26860d65dd0c49ce53b5b6d22126d9b 100644 (file)
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/lib
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
-OBJS = binaryheap.o bipartite_match.o dshash.o hyperloglog.o ilist.o \
-      knapsack.o pairingheap.o rbtree.o stringinfo.o
+OBJS = binaryheap.o bipartite_match.o bloomfilter.o dshash.o hyperloglog.o \
+       ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
  
  include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/README b/src/backend/lib/README

index 5e5ba5e43725d3dcf4f735342d475ec10f2a3d89..376ae273a9013610e601b766c19ffa2046ca782d 100644 (file)
--- a/src/backend/lib/README
+++ b/src/backend/lib/README
@@ -3,6 +3,8 @@ in the backend:
  
  binaryheap.c - a binary heap
  
+bloomfilter.c - probabilistic, space-efficient set membership testing
+
  hyperloglog.c - a streaming cardinality estimator
  
  pairingheap.c - a pairing heap
diff --git a/src/backend/lib/bloomfilter.c b/src/backend/lib/bloomfilter.c

new file mode 100644 (file)

index 0000000..eb08f4a
--- /dev/null
+++ b/src/backend/lib/bloomfilter.c
@@ -0,0 +1,305 @@
+/*-------------------------------------------------------------------------
+ *
+ * bloomfilter.c
+ *     Space-efficient set membership testing
+ *
+ * A Bloom filter is a probabilistic data structure that is used to test an
+ * element's membership of a set.  False positives are possible, but false
+ * negatives are not; a test of membership of the set returns either "possibly
+ * in set" or "definitely not in set".  This is typically very space efficient,
+ * which can be a decisive advantage.
+ *
+ * Elements can be added to the set, but not removed.  The more elements that
+ * are added, the larger the probability of false positives.  Caller must hint
+ * an estimated total size of the set when the Bloom filter is initialized.
+ * This is used to balance the use of memory against the final false positive
+ * rate.
+ *
+ * The implementation is well suited to data synchronization problems between
+ * unordered sets, especially where predictable performance is important and
+ * some false positives are acceptable.  It's also well suited to cache
+ * filtering problems where a relatively small and/or low cardinality set is
+ * fingerprinted, especially when many subsequent membership tests end up
+ * indicating that values of interest are not present.  That should save the
+ * caller many authoritative lookups, such as expensive probes of a much larger
+ * on-disk structure.
+ *
+ * Copyright (c) 2018, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   src/backend/lib/bloomfilter.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/hash.h"
+#include "lib/bloomfilter.h"
+
+#define MAX_HASH_FUNCS     10
+
+/*
+ * State of one Bloom filter.  bloom_create() allocates the struct as a single
+ * chunk, with the bitset stored inline after the fixed-size fields.
+ */
+struct bloom_filter
+{
+   /* K hash functions are used, seeded by caller's seed */
+   int         k_hash_funcs;
+   /* Seed handed to the hash function for every element */
+   uint64      seed;
+   /* m is bitset size, in bits.  Must be a power of two <= 2^32.  */
+   uint64      m;
+   /* The bits themselves: m / BITS_PER_BYTE bytes, all zero at creation */
+   unsigned char bitset[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/* Private helpers; parameter names match the definitions further down */
+static int my_bloom_power(uint64 target_bitset_bits);
+static int optimal_k(uint64 bitset_bits, int64 total_elems);
+static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem,
+        size_t len);
+static inline uint32 mod_m(uint32 val, uint64 m);
+
+/*
+ * Build an empty Bloom filter, allocated in the caller's memory context.
+ *
+ * total_elems is the caller's estimate of how many elements will eventually
+ * be added.  It only needs to be roughly right: the implementation copes well
+ * with an estimate that is off by perhaps a factor of five or more (see
+ * "Bloom Filters in Probabilistic Verification", Dillinger & Manolios, 2004).
+ *
+ * bloom_work_mem is a memory budget for the bitset, expressed in KB like
+ * work_mem; trivial bookkeeping space is not counted against it.  The bitset
+ * always has a power of two number of bits, is never smaller than 1MB, and is
+ * never larger than 512MB (2^32 bits).  No more memory is used than is needed
+ * to target the standard false positive rate of 1% - 2%, so a generous budget
+ * does not by itself produce a large bitset.
+ *
+ * seed perturbs the hash functions.  Passing a different seed each time makes
+ * it unlikely that the same false positives recur when the same set is
+ * fingerprinted again.  Callers that don't care can pass a constant
+ * (typically 0); callers that do can use random(), which yields a value in
+ * the range 0 - INT_MAX.
+ */
+bloom_filter *
+bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
+{
+   bloom_filter *result;
+   uint64      budget_bytes;
+   uint64      wanted_bytes;
+   uint64      nbytes;
+   uint64      nbits;
+   int         power;
+
+   /*
+    * Target two bytes per element, which is enough for a false positive
+    * rate below 1% regardless of bitset size or element count.  Even when
+    * rounding down to a power of two (below) costs a significant amount of
+    * space, the rate still won't exceed 2% in almost all cases.  The target
+    * is capped by the caller's budget and has a floor of 1MB.
+    */
+   budget_bytes = bloom_work_mem * UINT64CONST(1024);
+   wanted_bytes = total_elems * 2;
+   nbytes = Min(budget_bytes, wanted_bytes);
+   nbytes = Max(1024 * 1024, nbytes);
+
+   /*
+    * Round the size in bits down to a power of two.  The bit count needs a
+    * uint64 because the maximum, 2^32, is one more than PG_UINT32_MAX.
+    */
+   power = my_bloom_power(nbytes * BITS_PER_BYTE);
+   nbits = UINT64CONST(1) << power;
+   nbytes = nbits / BITS_PER_BYTE;
+
+   /* palloc0 leaves every bit of the bitset clear */
+   result = palloc0(offsetof(bloom_filter, bitset) +
+                    sizeof(unsigned char) * nbytes);
+   result->m = nbits;
+   result->seed = seed;
+   result->k_hash_funcs = optimal_k(nbits, total_elems);
+
+   return result;
+}
+
+/*
+ * Free Bloom filter
+ *
+ * The filter, bitset included, is a single allocation made by bloom_create(),
+ * so one pfree() releases everything.
+ */
+void
+bloom_free(bloom_filter *filter)
+{
+   pfree(filter);
+}
+
+/*
+ * Record an element in the Bloom filter.
+ *
+ * elem points to len bytes identifying the element.  One bit of the bitset is
+ * set for each of the filter's hash functions.
+ */
+void
+bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+   uint32      bitpos[MAX_HASH_FUNCS];
+   int         nhashes = filter->k_hash_funcs;
+   int         h;
+
+   k_hashes(filter, bitpos, elem, len);
+
+   /* Each hash is a bit number: the high bits pick a byte, the low 3 a bit */
+   for (h = 0; h < nhashes; h++)
+   {
+       uint32      byteno = bitpos[h] >> 3;
+       int         bitno = bitpos[h] & 7;
+
+       filter->bitset[byteno] |= 1 << bitno;
+   }
+}
+
+/*
+ * Check whether an element is definitely absent from the Bloom filter.
+ *
+ * Returns true when the element cannot have been passed to
+ * bloom_add_element().  Returns false when every bit the element hashes to is
+ * set, meaning the element is probably (but not certainly) in the set.
+ */
+bool
+bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
+{
+   uint32      bitpos[MAX_HASH_FUNCS];
+   int         h = 0;
+
+   k_hashes(filter, bitpos, elem, len);
+
+   /* A single clear bit proves that the element was never added */
+   while (h < filter->k_hash_funcs)
+   {
+       uint32      byteno = bitpos[h] >> 3;
+       int         mask = 1 << (bitpos[h] & 7);
+
+       if ((filter->bitset[byteno] & mask) == 0)
+           return true;
+       h++;
+   }
+
+   return false;
+}
+
+/*
+ * Report the fraction of the bitset's bits that are currently set.
+ *
+ * The result is a multiplier of the filter size (0.0 - 1.0).  It should
+ * generally be close to 0.5, even when there is more than enough memory to
+ * stay within the target 1% to 2% false positive band, because more hash
+ * functions are used as more memory becomes available per element.
+ *
+ * This is the only instrumentation cheap enough to appear in debug traces.
+ * When debugging Bloom filter code, directly measuring the false positive
+ * rate is likely to be far more interesting.
+ */
+double
+bloom_prop_bits_set(bloom_filter *filter)
+{
+   int         nbytes = filter->m / BITS_PER_BYTE;
+   uint64      nset = 0;
+   int         pos = 0;
+
+   while (pos < nbytes)
+   {
+       unsigned char remaining;
+
+       /* Clear the lowest set bit until none remain, counting each one */
+       for (remaining = filter->bitset[pos]; remaining != 0;
+            remaining &= (remaining - 1))
+           nset++;
+
+       pos++;
+   }
+
+   return nset / (double) filter->m;
+}
+
+/*
+ * Return the exponent of the largest power of two that is less than or equal
+ * to target_bitset_bits, capped at 32.  Returns -1 if target_bitset_bits is
+ * zero.
+ *
+ * The value returned must be generally safe as the basis for the actual
+ * bitset size.
+ *
+ * The bitset is never allowed to exceed 2 ^ 32 bits (512MB).  That is enough
+ * for all current callers, and lets us use 32-bit hash functions.  It also
+ * makes it easy to stay under the MaxAllocSize restriction (the allocation
+ * must leave room for the fields that precede the flexible array member, so
+ * a 1GB bitset would need an allocation that just exceeds MaxAllocSize).
+ */
+static int
+my_bloom_power(uint64 target_bitset_bits)
+{
+   int         power;
+
+   /* Count how many times the target can be halved, stopping at 2^32 */
+   for (power = -1; target_bitset_bits > 0 && power < 32;
+        target_bitset_bits >>= 1)
+       power++;
+
+   return power;
+}
+
+/*
+ * Determine the number of hash functions to use, based on the size of the
+ * filter in bits and the projected total number of elements.  The optimal
+ * number is the one that minimizes the false positive rate, which is
+ * ln(2) * bitset_bits / total_elems rounded to the nearest integer.
+ *
+ * The result is always in the range 1 .. MAX_HASH_FUNCS.  An estimate of
+ * fewer than one element is treated as one element, which avoids dividing by
+ * zero.  Clamping happens before the conversion to int, because converting
+ * an out-of-range double to an integer type is undefined behavior.
+ */
+static int
+optimal_k(uint64 bitset_bits, int64 total_elems)
+{
+   double      elems = (total_elems < 1) ? 1.0 : (double) total_elems;
+   double      k = round(log(2.0) * bitset_bits / elems);
+
+   if (k < 1.0)
+       return 1;
+   if (k > MAX_HASH_FUNCS)
+       return MAX_HASH_FUNCS;
+
+   return (int) k;
+}
+
+/*
+ * Fill caller's array with the filter's k hash values for an element.
+ *
+ * hashes must have room for filter->k_hash_funcs entries.  Every value stored
+ * is a bit number within the bitset, that is, already reduced modulo m.
+ *
+ * Only 2 real independent hash functions are computed, even though up to
+ * MAX_HASH_FUNCS values may be produced; the rest are derived with enhanced
+ * double hashing.  Enhanced double hashing is preferred to classic double
+ * hashing mainly because the latter has an issue with collisions when using
+ * power of two sized bitsets.  See Dillinger & Manolios for full details.
+ */
+static void
+k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len)
+{
+   uint64      nbits = filter->m;
+   uint64      hash64;
+   uint32      base;
+   uint32      step;
+   int         n;
+
+   /* One 64-bit hash supplies two independent 32-bit hashes */
+   hash64 = DatumGetUInt64(hash_any_extended(elem, len, filter->seed));
+   base = mod_m((uint32) hash64, nbits);
+   step = mod_m((uint32) (hash64 >> 32), nbits);
+
+   hashes[0] = base;
+
+   /* Derive the remaining values; the step itself grows on each round */
+   for (n = 1; n < filter->k_hash_funcs; n++)
+   {
+       base = mod_m(base + step, nbits);
+       step = mod_m(step + n, nbits);
+       hashes[n] = base;
+   }
+}
+
+/*
+ * Cheaply compute "val MOD m".
+ *
+ * m is the bitset size in bits and is assumed to be a power of two.  That
+ * lets the modulo be computed with a bitwise AND against m - 1, and it is
+ * also a simple way of avoiding the modulo bias effect.
+ */
+static inline uint32
+mod_m(uint32 val, uint64 m)
+{
+   uint64      mask = m - 1;
+
+   Assert(m <= PG_UINT32_MAX + UINT64CONST(1));
+   Assert((m & mask) == 0);
+
+   return val & mask;
+}
diff --git a/src/include/lib/bloomfilter.h b/src/include/lib/bloomfilter.h

new file mode 100644 (file)

index 0000000..6cbdd9b
--- /dev/null
+++ b/src/include/lib/bloomfilter.h
@@ -0,0 +1,27 @@
+/*-------------------------------------------------------------------------
+ *
+ * bloomfilter.h
+ *   Space-efficient set membership testing
+ *
+ * Copyright (c) 2018, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/include/lib/bloomfilter.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BLOOMFILTER_H
+#define BLOOMFILTER_H
+
+/* Opaque handle; the struct itself is defined in bloomfilter.c */
+typedef struct bloom_filter bloom_filter;
+
+/*
+ * Create an empty filter.  total_elems is the expected number of elements,
+ * bloom_work_mem is the bitset memory budget in KB, and seed is passed to
+ * the hash function.
+ */
+extern bloom_filter *bloom_create(int64 total_elems, int bloom_work_mem,
+            uint64 seed);
+/* Free a filter returned by bloom_create() */
+extern void bloom_free(bloom_filter *filter);
+/* Add the len bytes at elem to the set */
+extern void bloom_add_element(bloom_filter *filter, unsigned char *elem,
+                 size_t len);
+/* Returns true if the element was definitely never added, else false */
+extern bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem,
+                   size_t len);
+/* Returns the proportion of bits in the bitset that are currently set */
+extern double bloom_prop_bits_set(bloom_filter *filter);
+
+#endif                         /* BLOOMFILTER_H */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile

index 7294b6958b0661c484417a5138c1abf78992ce56..a9b8377acfd026d569ea43d0e085ce0dc0ae2260 100644 (file)
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -9,6 +9,7 @@ SUBDIRS = \
           commit_ts \
           dummy_seclabel \
           snapshot_too_old \
+         test_bloomfilter \
           test_ddl_deparse \
           test_extensions \
           test_parser \
diff --git a/src/test/modules/test_bloomfilter/.gitignore b/src/test/modules/test_bloomfilter/.gitignore

new file mode 100644 (file)

index 0000000..5dcb3ff
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_bloomfilter/Makefile b/src/test/modules/test_bloomfilter/Makefile

new file mode 100644 (file)

index 0000000..808c931
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/Makefile
@@ -0,0 +1,21 @@
+# src/test/modules/test_bloomfilter/Makefile
+
+MODULE_big = test_bloomfilter
+OBJS = test_bloomfilter.o $(WIN32RES)
+PGFILEDESC = "test_bloomfilter - test code for Bloom filter library"
+
+EXTENSION = test_bloomfilter
+DATA = test_bloomfilter--1.0.sql
+
+REGRESS = test_bloomfilter
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_bloomfilter
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_bloomfilter/README b/src/test/modules/test_bloomfilter/README

new file mode 100644 (file)

index 0000000..4c05efe
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/README
@@ -0,0 +1,68 @@
+test_bloomfilter overview
+=========================
+
+test_bloomfilter is a test harness module for testing Bloom filter library set
+membership operations.  It consists of a single SQL-callable function,
+test_bloomfilter(), plus a regression test that calls test_bloomfilter().
+Membership tests are performed against a dataset that the test harness module
+generates.
+
+The test_bloomfilter() function displays instrumentation at DEBUG1 elog level
+(WARNING when the false positive rate exceeds a 1% threshold).  This can be
+used to get a sense of the performance characteristics of the Postgres Bloom
+filter implementation under varied conditions.
+
+Bitset size
+-----------
+
+The main bloomfilter.c criterion for sizing its bitset is that the false
+positive rate should not exceed 2% when sufficient bloom_work_mem is available
+(and the caller-supplied estimate of the number of elements turns out to have
+been accurate).  A 1% - 2% rate is currently assumed to be suitable for all
+Bloom filter callers.
+
+With an optimal K (number of hash functions), Bloom filters should only have a
+1% false positive rate with just 9.6 bits of memory per element.  The Postgres
+implementation's 2% worst case guarantee exists because there is a need for
+some slop due to implementation inflexibility in bitset sizing.  Since the
+bitset size is always actually kept to a power of two number of bits, callers
+can have their bloom_work_mem argument truncated down by almost half.
+In practice, callers that make a point of passing a bloom_work_mem that is an
+exact power of two bitset size (such as test_bloomfilter.c) will actually get
+the "9.6 bits per element" 1% false positive rate.
+
+Testing strategy
+----------------
+
+Our approach to regression testing is to test that a Bloom filter has only a 1%
+false positive rate for a single bitset size (2 ^ 23 bits, or 1MB).  We test a
+dataset with 838,861 elements, which works out at 10 bits of memory per
+element.  We round up from 9.6 bits to 10 bits to make sure that we reliably
+get under 1% for regression testing.  Note that a random seed is used in the
+regression tests because the exact false positive rate is inconsistent across
+platforms.  Inconsistent hash function behavior is something that the
+regression tests need to be tolerant of anyway.
+
+test_bloomfilter() SQL-callable function
+========================================
+
+The SQL-callable function test_bloomfilter() provides the following arguments:
+
+* "power" is the power of two used to size the Bloom filter's bitset.
+
+The minimum valid argument value is 23 (2^23 bits), or 1MB of memory.  The
+maximum valid argument value is 32, or 512MB of memory.
+
+* "nelements" is the number of elements to generate for testing purposes.
+
+* "seed" is a seed value for hashing.
+
+A value < 0 is interpreted as "use random seed".  Varying the seed value (or
+specifying -1) should result in small variations in the total number of false
+positives.
+
+* "tests" is the number of tests to run.
+
+This may be increased when it's useful to perform many tests in an interactive
+session.  It only makes sense to perform multiple tests when a random seed is
+used.
diff --git a/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out b/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out

new file mode 100644 (file)

index 0000000..21c0688
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/expected/test_bloomfilter.out
@@ -0,0 +1,22 @@
+CREATE EXTENSION test_bloomfilter;
+-- See README for explanation of arguments:
+SELECT test_bloomfilter(power => 23,
+    nelements => 838861,
+    seed => -1,
+    tests => 1);
+ test_bloomfilter 
+------------------
+ 
+(1 row)
+
+-- Equivalent "10 bits per element" tests for all possible bitset sizes:
+--
+-- SELECT test_bloomfilter(24, 1677722)
+-- SELECT test_bloomfilter(25, 3355443)
+-- SELECT test_bloomfilter(26, 6710886)
+-- SELECT test_bloomfilter(27, 13421773)
+-- SELECT test_bloomfilter(28, 26843546)
+-- SELECT test_bloomfilter(29, 53687091)
+-- SELECT test_bloomfilter(30, 107374182)
+-- SELECT test_bloomfilter(31, 214748365)
+-- SELECT test_bloomfilter(32, 429496730)
diff --git a/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql b/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql

new file mode 100644 (file)

index 0000000..9ec159c
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql
@@ -0,0 +1,19 @@
+CREATE EXTENSION test_bloomfilter;
+
+-- See README for explanation of arguments:
+SELECT test_bloomfilter(power => 23,
+    nelements => 838861,
+    seed => -1,
+    tests => 1);
+
+-- Equivalent "10 bits per element" tests for all possible bitset sizes:
+--
+-- SELECT test_bloomfilter(24, 1677722)
+-- SELECT test_bloomfilter(25, 3355443)
+-- SELECT test_bloomfilter(26, 6710886)
+-- SELECT test_bloomfilter(27, 13421773)
+-- SELECT test_bloomfilter(28, 26843546)
+-- SELECT test_bloomfilter(29, 53687091)
+-- SELECT test_bloomfilter(30, 107374182)
+-- SELECT test_bloomfilter(31, 214748365)
+-- SELECT test_bloomfilter(32, 429496730)
diff --git a/src/test/modules/test_bloomfilter/test_bloomfilter--1.0.sql b/src/test/modules/test_bloomfilter/test_bloomfilter--1.0.sql

new file mode 100644 (file)

index 0000000..7682318
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/test_bloomfilter--1.0.sql
@@ -0,0 +1,11 @@
+/* src/test/modules/test_bloomfilter/test_bloomfilter--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_bloomfilter" to load this file. \quit
+
+-- SQL entry point for the Bloom filter tests, implemented in C by this module.
+-- "seed" and "tests" may be omitted; they default to -1 and 1 respectively.
+CREATE FUNCTION test_bloomfilter(power integer,
+    nelements bigint,
+    seed integer DEFAULT -1,
+    tests integer DEFAULT 1)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_bloomfilter/test_bloomfilter.c b/src/test/modules/test_bloomfilter/test_bloomfilter.c

new file mode 100644 (file)

index 0000000..1691b0f
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/test_bloomfilter.c
@@ -0,0 +1,138 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_bloomfilter.c
+ *     Test false positive rate of Bloom filter.
+ *
+ * Copyright (c) 2018, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     src/test/modules/test_bloomfilter/test_bloomfilter.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "lib/bloomfilter.h"
+#include "miscadmin.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * Buffer size for one generated element.  Must fit a one-character prefix,
+ * the decimal representation of PG_INT64_MAX (19 digits), and the terminating
+ * NUL byte:
+ */
+#define MAX_ELEMENT_BYTES      21
+/* False positive rate WARNING threshold (1%): */
+#define FPOSITIVE_THRESHOLD        0.01
+
+
+/*
+ * Add "nelements" generated strings to an empty Bloom filter.
+ *
+ * The strings are "i0", "i1", and so on: the letter "i" followed by the
+ * element's ordinal number.
+ */
+static void
+populate_with_dummy_strings(bloom_filter *filter, int64 nelements)
+{
+   char        buf[MAX_ELEMENT_BYTES];
+   int64       n = 0;
+
+   while (n < nelements)
+   {
+       CHECK_FOR_INTERRUPTS();
+
+       snprintf(buf, sizeof(buf), "i" INT64_FORMAT, n);
+       bloom_add_element(filter, (unsigned char *) buf, strlen(buf));
+       n++;
+   }
+}
+
+/*
+ * Count false positives.
+ *
+ * Probes the Bloom filter with "nelements" strings of the form "M0", "M1",
+ * and so on.  populate_with_dummy_strings() never adds any of these, so each
+ * one that the filter reports as probably present is a false positive.
+ * Returns the number of such strings.
+ */
+static int64
+nfalsepos_for_missing_strings(bloom_filter *filter, int64 nelements)
+{
+   char        buf[MAX_ELEMENT_BYTES];
+   int64       hits = 0;
+   int64       n;
+
+   for (n = 0; n < nelements; n++)
+   {
+       bool        absent;
+
+       CHECK_FOR_INTERRUPTS();
+
+       snprintf(buf, sizeof(buf), "M" INT64_FORMAT, n);
+       absent = bloom_lacks_element(filter, (unsigned char *) buf,
+                                    strlen(buf));
+       if (!absent)
+           hits++;
+   }
+
+   return hits;
+}
+
+/*
+ * Run one test: create a Bloom filter with a bitset of 2^power bits, add
+ * "nelements" strings, probe with the same number of strings that were never
+ * added, and report the outcome.
+ *
+ * The report is emitted at WARNING level when the false positive rate exceeds
+ * FPOSITIVE_THRESHOLD, and at DEBUG1 level otherwise.  A negative callerseed
+ * requests a random seed.
+ */
+static void
+create_and_test_bloom(int power, int64 nelements, int callerseed)
+{
+   int         bloom_work_mem;
+   uint64      seed;
+   int64       nfalsepos;
+   double      fpos_percent;
+   bloom_filter *filter;
+
+   /*
+    * Convert the bitset size from bits to KB.  Shift a 64-bit value rather
+    * than a "long": long may be only 32 bits wide, and power can be as
+    * large as 32.
+    */
+   bloom_work_mem = (int) ((UINT64CONST(1) << power) / 8 / 1024);
+
+   elog(DEBUG1, "bloom_work_mem (KB): %d", bloom_work_mem);
+
+   /*
+    * Generate random seed, or use caller's.  Seed should always be a
+    * positive value less than or equal to PG_INT32_MAX, to ensure that any
+    * random seed can be recreated through callerseed if the need arises.
+    * (Don't assume that RAND_MAX cannot exceed PG_INT32_MAX.)
+    */
+   seed = callerseed < 0 ? random() % PG_INT32_MAX : callerseed;
+
+   /* Create Bloom filter, populate it, and report on false positive rate */
+   filter = bloom_create(nelements, bloom_work_mem, seed);
+   populate_with_dummy_strings(filter, nelements);
+   nfalsepos = nfalsepos_for_missing_strings(filter, nelements);
+
+   /* Express the rate as a percentage; avoid 0/0 when nothing was probed */
+   fpos_percent = (nelements > 0) ? 100.0 * nfalsepos / nelements : 0.0;
+
+   ereport((nfalsepos > nelements * FPOSITIVE_THRESHOLD) ? WARNING : DEBUG1,
+           (errmsg_internal("seed: " UINT64_FORMAT " false positives: " INT64_FORMAT " (%.6f%%) bitset %.2f%% set",
+                            seed, nfalsepos, fpos_percent,
+                            100.0 * bloom_prop_bits_set(filter))));
+
+   bloom_free(filter);
+}
+
+PG_FUNCTION_INFO_V1(test_bloomfilter);
+
+/*
+ * SQL-callable entry point to perform all tests.
+ *
+ * Arguments are, in order: power (int4), nelements (int8), seed (int4) and
+ * tests (int4).  Raises an error if power is outside 23 .. 32, if tests is
+ * not positive, or if nelements is negative.  Otherwise runs "tests" tests
+ * and returns void.
+ *
+ * If a 1% false positive threshold is not met, emits WARNINGs.
+ *
+ * See README for details of arguments.
+ */
+Datum
+test_bloomfilter(PG_FUNCTION_ARGS)
+{
+   int         power = PG_GETARG_INT32(0);
+   int64       nelements = PG_GETARG_INT64(1);
+   int         seed = PG_GETARG_INT32(2);
+   int         tests = PG_GETARG_INT32(3);
+   int         i;
+
+   if (power < 23 || power > 32)
+       elog(ERROR, "power argument must be between 23 and 32 inclusive");
+
+   if (tests <= 0)
+       elog(ERROR, "invalid number of tests: %d", tests);
+
+   /* Report the offending value itself, which is 64 bits wide */
+   if (nelements < 0)
+       elog(ERROR, "invalid number of elements: " INT64_FORMAT, nelements);
+
+   for (i = 0; i < tests; i++)
+   {
+       elog(DEBUG1, "beginning test #%d...", i + 1);
+
+       create_and_test_bloom(power, nelements, seed);
+   }
+
+   PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_bloomfilter/test_bloomfilter.control b/src/test/modules/test_bloomfilter/test_bloomfilter.control

new file mode 100644 (file)

index 0000000..99e56ee
--- /dev/null
+++ b/src/test/modules/test_bloomfilter/test_bloomfilter.control
@@ -0,0 +1,4 @@
+comment = 'Test code for Bloom filter library'
+default_version = '1.0'
+module_pathname = '$libdir/test_bloomfilter'
+relocatable = true
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 17bf55c1f55650bc26d593700c121a5cf64c5633..abc10a8ffd40bbd4134db11910959c7716cc02cd 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2590,6 +2590,7 @@ bitmapword
  bits16
  bits32
  bits8
+bloom_filter
  bool
  brin_column_state
  bytea
author	Andres Freund <andres@anarazel.de>
	Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)
committer	Andres Freund <andres@anarazel.de>
	Sun, 1 Apr 2018 00:49:41 +0000 (17:49 -0700)
src/backend/lib/Makefile		patch \| blob \| blame \| history
src/backend/lib/README		patch \| blob \| blame \| history
src/backend/lib/bloomfilter.c	[new file with mode: 0644]	patch \| blob
src/include/lib/bloomfilter.h	[new file with mode: 0644]	patch \| blob
src/test/modules/Makefile		patch \| blob \| blame \| history
src/test/modules/test_bloomfilter/.gitignore	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/Makefile	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/README	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/expected/test_bloomfilter.out	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/sql/test_bloomfilter.sql	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/test_bloomfilter--1.0.sql	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/test_bloomfilter.c	[new file with mode: 0644]	patch \| blob
src/test/modules/test_bloomfilter/test_bloomfilter.control	[new file with mode: 0644]	patch \| blob
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history