Improvements from Heikki Linnakangas <heikki@enterprisedb.com>

author Teodor Sigaev <teodor@sigaev.ru>

Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c

index 10b80dc9566d304ed5ab768f6bde609edf2b1523..974a1b7ae4e545e6023947d3cc7aa9319079a727 100644 (file)
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.2 2007/09/07 15:09:56 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.3 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -22,7 +22,7 @@ Datum
  gin_extract_tsvector(PG_FUNCTION_ARGS)
  {
     TSVector    vector = PG_GETARG_TSVECTOR(0);
-   uint32     *nentries = (uint32 *) PG_GETARG_POINTER(1);
+   int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
     Datum      *entries = NULL;
  
     *nentries = 0;
@@ -55,7 +55,7 @@ Datum
  gin_extract_query(PG_FUNCTION_ARGS)
  {
     TSQuery     query = PG_GETARG_TSQUERY(0);
-   uint32     *nentries = (uint32 *) PG_GETARG_POINTER(1);
+   int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
     StrategyNumber strategy = PG_GETARG_UINT16(2);
     Datum      *entries = NULL;
  
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c

index 83f2c602a81e17b5a8e3ff8dd7b878295617b359..c53d526bdd1694d4e219031333e98d706b21d906 100644 (file)
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.4 2007/09/07 15:35:10 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.5 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -21,7 +21,6 @@
  #include "tsearch/ts_utils.h"
  #include "utils/memutils.h"
  #include "utils/pg_crc.h"
-#include "nodes/bitmapset.h"
  
  
  struct TSQueryParserStateData
@@ -384,16 +383,15 @@ makepol(TSQueryParserState state,
     }
  }
  
-/*
- * Fills in the left-fields previously left unfilled. The input
- * QueryItems must be in polish (prefix) notation. 
- */
  static void
-findoprnd(QueryItem *ptr, uint32 *pos)
+findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
  {
     /* since this function recurses, it could be driven to stack overflow. */
     check_stack_depth();
  
+   if (*pos >= nnodes)
+       elog(ERROR, "malformed tsquery; operand not found");
+
     if (ptr[*pos].type == QI_VAL ||
         ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here,
                                        * they haven't been cleaned
@@ -410,7 +408,7 @@ findoprnd(QueryItem *ptr, uint32 *pos)
         {
             ptr[*pos].operator.left = 1;
             (*pos)++;
-           findoprnd(ptr, pos);
+           findoprnd_recurse(ptr, pos, nnodes);
         }
         else
         {
@@ -420,13 +418,31 @@ findoprnd(QueryItem *ptr, uint32 *pos)
             Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
  
             (*pos)++;
-           findoprnd(ptr, pos);
+           findoprnd_recurse(ptr, pos, nnodes);
             curitem->left = *pos - tmp;
-           findoprnd(ptr, pos);
+           findoprnd_recurse(ptr, pos, nnodes);
         }
     }
  }
  
+
+/*
+ * Fills in the left-fields previously left unfilled. The input
+ * QueryItems must be in polish (prefix) notation. 
+ */
+static void
+findoprnd(QueryItem *ptr, int size)
+{
+   uint32 pos;
+
+   pos = 0;
+   findoprnd_recurse(ptr, &pos, size);
+
+   if (pos != size)
+       elog(ERROR, "malformed tsquery; extra nodes");
+}
+
+
  /*
   * Each value (operand) in the query is be passed to pushval. pushval can
   * transform the simple value to an arbitrarily complex expression using
@@ -452,7 +468,6 @@ parse_tsquery(char *buf,
     TSQuery     query;
     int         commonlen;
     QueryItem  *ptr;
-   uint32      pos = 0;
     ListCell   *cell;
  
     /* init state */
@@ -522,8 +537,7 @@ parse_tsquery(char *buf,
     pfree(state.op);
  
     /* Set left operand pointers for every operator. */
-   pos = 0;
-   findoprnd(ptr, &pos);
+   findoprnd(ptr, query->size);
  
     return query;
  }
@@ -734,6 +748,22 @@ tsqueryout(PG_FUNCTION_ARGS)
     PG_RETURN_CSTRING(nrm.buf);
  }
  
+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32   number of operators/operands in the query
+ * 
+ * Followed by the operators and operands, in prefix notation. For each
+ * operand:
+ *
+ * uint8   type, QI_VAL
+ * uint8   weight
+ *             operand text in client encoding, null-terminated
+ *
+ * For each operator:
+ * uint8   type, QI_OPR
+ * uint8   operator, one of OP_AND, OP_OR, OP_NOT.
+ */
  Datum
  tsquerysend(PG_FUNCTION_ARGS)
  {
@@ -744,7 +774,7 @@ tsquerysend(PG_FUNCTION_ARGS)
  
     pq_begintypsend(&buf);
  
-   pq_sendint(&buf, query->size, sizeof(int32));
+   pq_sendint(&buf, query->size, sizeof(uint32));
     for (i = 0; i < query->size; i++)
     {
         pq_sendint(&buf, item->type, sizeof(item->type));
@@ -752,16 +782,13 @@ tsquerysend(PG_FUNCTION_ARGS)
         switch(item->type)
         {
             case QI_VAL:
-               pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight));
-               pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc));
-               pq_sendint(&buf, item->operand.length, sizeof(int16));
+               pq_sendint(&buf, item->operand.weight, sizeof(uint8));
+               pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
                 /* istrue flag is just for temporary use in tsrank.c/Cover,
                  * so we don't need to transfer that */
                 break;
             case QI_OPR:
                 pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
-               if (item->operator.oper != OP_NOT)
-                   pq_sendint(&buf, item->operator.left, sizeof(item->operator.left));
                 break;
             default:
                 elog(ERROR, "unknown tsquery node type %d", item->type);
@@ -769,14 +796,6 @@ tsquerysend(PG_FUNCTION_ARGS)
         item++;
     }
  
-   item = GETQUERY(query);
-   for (i = 0; i < query->size; i++)
-   {
-       if (item->type == QI_VAL)
-           pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length);
-       item++;
-   }
-
     PG_FREE_IF_COPY(query, 0);
  
     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
@@ -788,141 +807,113 @@ tsqueryrecv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     TSQuery     query;
     int         i,
-               size,
                 len;
     QueryItem  *item;
-   int         datalen = 0;
+   int         datalen;
     char       *ptr;
-   Bitmapset  *parentset = NULL;
+   uint32      size;
+   const char **operands;
  
     size = pq_getmsgint(buf, sizeof(uint32));
-   if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem)))
+   if (size > (MaxAllocSize / sizeof(QueryItem)))
         elog(ERROR, "invalid size of tsquery");
  
-   len = HDRSIZETQ + sizeof(QueryItem) * size;
+   /* Allocate space to temporarily hold operand strings */
+   operands = palloc(size * sizeof(char *));
  
+   /* Allocate space for all the QueryItems. */
+   len = HDRSIZETQ + sizeof(QueryItem) * size;
     query = (TSQuery) palloc0(len);
     query->size = size;
     item = GETQUERY(query);
  
+   datalen = 0;
     for (i = 0; i < size; i++)
     {
         item->type = (int8) pq_getmsgint(buf, sizeof(int8));
  
-       switch(item->type)
+       if (item->type == QI_VAL)
         {
-           case QI_VAL:
-               item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8));
-               item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32));
-               item->operand.length = pq_getmsgint(buf, sizeof(int16));
-
-               /* Check that the weight bitmap is valid */
-               if (item->operand.weight < 0 || item->operand.weight > 0xF)
-                   elog(ERROR, "invalid weight bitmap");
-
-               /* XXX: We don't check that the CRC is valid. Actually, if we
-                * bothered to calculate it to verify, there would be no need
-                * to transfer it.
-                */
-
-               /*
-                * Check that datalen doesn't grow too large. Without the
-                * check, a malicious client could induce a buffer overflow
-                * by sending a tsquery whose size exceeds 2GB. datalen
-                * would overflow, we would allocate a too small buffer below,
-                * and overflow the buffer. Because operand.length is a 20-bit
-                * field, adding one such value to datalen must exceed
-                * MaxAllocSize before wrapping over the 32-bit datalen field,
-                * so this check will protect from it.
-                */
-               if (datalen > MAXSTRLEN)
-                   elog(ERROR, "invalid tsquery; total operand length exceeded");
-
-               /* We can calculate distance from datalen, no need to send it
-                * across the wire. If we did, we would have to check that
-                * it's valid anyway.
-                */
-               item->operand.distance = datalen;
-
-               datalen += item->operand.length + 1;        /* \0 */
-
-               break;
-           case QI_OPR:
-               item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8));
-               if (item->operator.oper != OP_NOT &&
-                   item->operator.oper != OP_OR &&
-                   item->operator.oper != OP_AND)
-                   elog(ERROR, "unknown operator type %d", (int) item->operator.oper);
-
-               /*
-                * Check that no previous operator node points to the right
-                * operand. That would mean that the operand node
-                * has two parents.
-                */
-               if (bms_is_member(i + 1, parentset))
-                   elog(ERROR, "malformed query tree");
-
-               parentset = bms_add_member(parentset, i + 1);
-
-               if(item->operator.oper != OP_NOT)
-               {
-                   uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32));
-
-                   /*
-                    * Right operand is implicitly at "this+1". Don't allow
-                    * left to point to the right operand, or to self.
-                    */
-                   if (left <= 1 || i + left >= size)
-                       elog(ERROR, "invalid pointer to left operand");
-
-                   /*
-                    * Check that no previous operator node points to the left
-                    * operand.
-                    */
-                   if (bms_is_member(i + left, parentset))
-                       elog(ERROR, "malformed query tree");
-
-                   parentset = bms_add_member(parentset, i + left);
-
-                   item->operator.left = left;
-               }
-               else
-                   item->operator.left = 1; /* do not leave uninitialized fields */
-
-               if (i == size - 1)
-                   elog(ERROR, "invalid pointer to right operand");
-               break;
-           default:
-               elog(ERROR, "unknown tsquery node type %d", item->type);
+           size_t val_len; /* length after recoding to server encoding */
+           uint8 weight;
+           const char *val;
+           pg_crc32 valcrc;
+
+           weight   = (uint8) pq_getmsgint(buf, sizeof(uint8));
+           val = pq_getmsgstring(buf);
+           val_len = strlen(val);
+
+           /* Sanity checks */
+
+           if (weight > 0xF)
+               elog(ERROR, "invalid tsquery; invalid weight bitmap");
+
+           if (val_len > MAXSTRLEN)
+               elog(ERROR, "invalid tsquery; operand too long");
+               
+           if (datalen > MAXSTRPOS)
+               elog(ERROR, "invalid tsquery; total operand length exceeded");
+
+           /* Looks valid. */
+
+           INIT_CRC32(valcrc);
+           COMP_CRC32(valcrc, val, val_len);
+           FIN_CRC32(valcrc);
+
+           item->operand.weight = weight;
+           item->operand.valcrc = (int32) valcrc;
+           item->operand.length = val_len;
+           item->operand.distance = datalen;
+
+           /* 
+            * Operand strings are copied to the final struct after this loop;
+            * here we just collect them to an array
+            */
+           operands[i] = val;
+
+           datalen += val_len + 1;     /* + 1 for the '\0' terminator */
+       } 
+       else if (item->type == QI_OPR)
+       {
+           int8 oper;
+           oper = (int8) pq_getmsgint(buf, sizeof(int8));
+           if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
+               elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper);
+           if (i == size - 1)
+               elog(ERROR, "invalid pointer to right operand");
+
+           item->operator.oper = oper;
         }
+       else 
+           elog(ERROR, "unknown tsquery node type %d", item->type);
  
         item++;
     }
  
-   /* Now check that each node, except the root, has a parent. We
-    * already checked above that no node has more than one parent. */
-   if (bms_num_members(parentset) != size - 1 && size != 0)
-       elog(ERROR, "malformed query tree");
-
-   bms_free( parentset );
-
+   /* Enlarge buffer to make room for the operand values. */
     query = (TSQuery) repalloc(query, len + datalen);
-
     item = GETQUERY(query);
     ptr = GETOPERAND(query);
+
+   /* 
+    * Fill in the left-pointers. Checks that the tree is well-formed
+    * as a side-effect.
+    */
+   findoprnd(item, size);
+
+   /* Copy operands to output struct */
     for (i = 0; i < size; i++)
     {
         if (item->type == QI_VAL)
         {
-           memcpy(ptr,
-                  pq_getmsgbytes(buf, item->operand.length),
-                  item->operand.length);
-           ptr += item->operand.length;
-           *ptr++ = '\0';
+           memcpy(ptr, operands[i], item->operand.length + 1);
+           ptr += item->operand.length + 1;
         }
         item++;
     }
  
+   pfree(operands);
+
     Assert(ptr - GETOPERAND(query) == datalen);
  
     SET_VARSIZE(query, len + datalen);
diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c

index 6ab58228b59244eeed2f260eb1f3efa46cfe4678..bc123378e09aeaa22f6b7724d10a0f7345f4b6c1 100644 (file)
--- a/src/backend/utils/adt/tsquery_cleanup.c
+++ b/src/backend/utils/adt/tsquery_cleanup.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -17,6 +17,7 @@
  
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+#include "miscadmin.h"
  
  typedef struct NODE
  {
diff --git a/src/backend/utils/adt/tsquery_rewrite.c b/src/backend/utils/adt/tsquery_rewrite.c

index d733a51d8e378dc9154d6916678f8df0f4b81ad4..77e3a8b173b71fc8a184393235eb950944c9f7c6 100644 (file)
--- a/src/backend/utils/adt/tsquery_rewrite.c
+++ b/src/backend/utils/adt/tsquery_rewrite.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -17,6 +17,7 @@
  #include "executor/spi.h"
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+#include "miscadmin.h"
  
  
  static int
diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c

index 60de44cc6f68d67efc2dc073099c7f4181d87740..a4fd0937c4eb04104ef2d5a04c7043529e0be6ed 100644 (file)
--- a/src/backend/utils/adt/tsquery_util.c
+++ b/src/backend/utils/adt/tsquery_util.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -16,6 +16,7 @@
  
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+#include "miscadmin.h"
  
  QTNode *
  QT2QTN(QueryItem * in, char *operand)
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c

index 30ccfa3f6170ccc9236cb1c5095e8fbf3200a7e1..535a3541bf75cfced4baddb8e254bbcbf21e7321 100644 (file)
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -18,6 +18,7 @@
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
  #include "utils/array.h"
+#include "miscadmin.h"
  
  
  static float weights[] = {0.1, 0.2, 0.4, 1.0};
@@ -176,8 +177,9 @@ SortAndUniqItems(TSQuery q, int *size)
     return res;
  }
  
+/* A dummy WordEntryPos array to use when haspos is false */
  static WordEntryPos POSNULL[] = {
-   0,
+   1, /* Number of elements that follow */
     0
  };
  
@@ -207,7 +209,6 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
     }
     pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
     memset(pos, 0, sizeof(uint16 *) * q->size);
-   *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
     WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
  
     for (i = 0; i < size; i++)
@@ -265,7 +266,6 @@ calc_rank_or(float *w, TSVector t, TSQuery q)
     QueryOperand **item;
     int         size = q->size;
  
-   *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
     item = SortAndUniqItems(q, &size);
  
     for (i = 0; i < size; i++)
@@ -593,7 +593,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen)
     DocRepresentation *doc;
     char       *operand;
  
-   *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
     doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
     operand = GETOPERAND(query);
     reset_istrue_flag(query);
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c

index 2866e028da02b778c7592d909524b90413f2d612..992da4a9b4097a7a6dde10b6dab6c694d0efd382 100644 (file)
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -75,18 +75,20 @@ uniquePos(WordEntryPos * a, int l)
  }
  
  static int
-compareentry(const void *a, const void *b, void *arg)
+compareentry(const void *va, const void *vb, void *arg)
  {
     char       *BufferStr = (char *) arg;
+   WordEntryIN *a = (WordEntryIN *) va;
+   WordEntryIN *b = (WordEntryIN *) vb;
  
-   if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len)
+   if (a->entry.len == b->entry.len)
     {
-       return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos],
-                      &BufferStr[((WordEntryIN *) b)->entry.pos],
-                      ((WordEntryIN *) a)->entry.len);
+       return strncmp(&BufferStr[a->entry.pos],
+                      &BufferStr[b->entry.pos],
+                      a->entry.len);
     }
  
-   return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1;
+   return (a->entry.len > b->entry.len) ? 1 : -1;
  }
  
  static int
@@ -104,6 +106,9 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
             a->poslen = uniquePos(a->pos, a->poslen);
             *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
         }
+       else
+           *outbuflen = a->entry.len;
+
         return l;
     }
     res = a;
@@ -118,10 +123,12 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
         {
             if (res->entry.haspos)
             {
+               *outbuflen += SHORTALIGN(res->entry.len);
                 res->poslen = uniquePos(res->pos, res->poslen);
                 *outbuflen += res->poslen * sizeof(WordEntryPos);
             }
-           *outbuflen += SHORTALIGN(res->entry.len);
+           else
+               *outbuflen += res->entry.len;
             res++;
             memcpy(res, ptr, sizeof(WordEntryIN));
         }
@@ -147,12 +154,18 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
         }
         ptr++;
     }
+
+   /* add last item */
+
     if (res->entry.haspos)
     {
+       *outbuflen += SHORTALIGN(res->entry.len);
+
         res->poslen = uniquePos(res->pos, res->poslen);
         *outbuflen += res->poslen * sizeof(WordEntryPos);
     }
-   *outbuflen += SHORTALIGN(res->entry.len);
+   else
+       *outbuflen += res->entry.len;
  
     return res + 1 - a;
  }
@@ -367,6 +380,18 @@ tsvectorout(PG_FUNCTION_ARGS)
     PG_RETURN_CSTRING(outbuf);
  }
  
+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32  number of lexemes
+ * 
+ * for each lexeme:
+ *     lexeme text in client encoding, null-terminated
+ *         uint16  number of positions
+ *         for each position:
+ *         uint16 WordEntryPos
+ */
+
  Datum
  tsvectorsend(PG_FUNCTION_ARGS)
  {
@@ -381,18 +406,22 @@ tsvectorsend(PG_FUNCTION_ARGS)
     pq_sendint(&buf, vec->size, sizeof(int32));
     for (i = 0; i < vec->size; i++)
     {
-       /*
-        * We are sure that sizeof(WordEntry) == sizeof(int32)
+       uint16 npos;
+
+       /* the strings in the TSVector array are not null-terminated, so 
+        * we have to send the null-terminator separately
          */
-       pq_sendint(&buf, *(int32 *) weptr, sizeof(int32));
+       pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+       pq_sendbyte(&buf, '\0');
  
-       pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len);
-       if (weptr->haspos)
+       npos = POSDATALEN(vec, weptr);
+       pq_sendint(&buf, npos, sizeof(uint16));
+
+       if(npos > 0)
         {
             WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
  
-           pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos));
-           for (j = 0; j < POSDATALEN(vec, weptr); j++)
+           for (j = 0; j < npos; j++)
                 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
         }
         weptr++;
@@ -407,71 +436,92 @@ tsvectorrecv(PG_FUNCTION_ARGS)
     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     TSVector    vec;
     int         i;
-   uint32      size;
-   WordEntry  *weptr;
-   int         datalen = 0;
-   Size        len;
-
-   size = pq_getmsgint(buf, sizeof(uint32));
-   if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
+   int32       nentries;
+   int         datalen; /* number of bytes used in the variable size area
+                         * after fixed size TSVector header and WordEntries
+                         */
+   Size        hdrlen;
+   Size        len;  /* allocated size of vec */
+
+   nentries = pq_getmsgint(buf, sizeof(int32));
+   if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
         elog(ERROR, "invalid size of tsvector");
  
-   len = DATAHDRSIZE + sizeof(WordEntry) * size;
+   hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
  
-   len = len * 2; /* times two to make room for lexemes */
+   len = hdrlen * 2; /* times two to make room for lexemes */
     vec = (TSVector) palloc0(len);
-   vec->size = size;
+   vec->size = nentries;
  
-   weptr = ARRPTR(vec);
-   for (i = 0; i < size; i++)
+   datalen = 0;
+   for (i = 0; i < nentries; i++)
     {
-       int32 tmp;
+       const char *lexeme;
+       uint16 npos;
+       size_t lex_len;
+
+       lexeme = pq_getmsgstring(buf);
+       npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
+
+       /* sanity checks */
+
+       lex_len = strlen(lexeme);
+       if (lex_len < 0 || lex_len > MAXSTRLEN)
+           elog(ERROR, "invalid tsvector; lexeme too long");
+
+       if (datalen > MAXSTRPOS)
+           elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); 
  
-       weptr = ARRPTR(vec) + i;
+       if (npos > MAXNUMPOS)
+           elog(ERROR, "unexpected number of positions");
  
         /*
-        * We are sure that sizeof(WordEntry) == sizeof(int32)
+        * Looks valid. Fill the WordEntry struct, and copy lexeme.
+        *
+        * But make sure the buffer is large enough first.
          */
-       tmp = pq_getmsgint(buf, sizeof(int32));
-       *weptr = *(WordEntry *) & tmp;
-
-       while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
+       while (hdrlen + SHORTALIGN(datalen + lex_len) +
+              (npos + 1) * sizeof(WordEntryPos) >= len)
         {
             len *= 2;
             vec = (TSVector) repalloc(vec, len);
-           weptr = ARRPTR(vec) + i;
         }
  
-       memcpy(STRPTR(vec) + weptr->pos,
-              pq_getmsgbytes(buf, weptr->len),
-              weptr->len);
-       datalen += SHORTALIGN(weptr->len);
+       vec->entries[i].haspos = (npos > 0) ? 1 : 0;
+       vec->entries[i].len = lex_len;
+       vec->entries[i].pos = datalen;
+
+       memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
+
+       datalen += lex_len;
  
-       if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
+       if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
             elog(ERROR, "lexemes are unordered");
  
-       if (weptr->haspos)
+       /* Receive positions */
+
+       if (npos > 0)
         {
-           uint16      j,
-                       npos;
+           uint16      j;
             WordEntryPos *wepptr;
  
-           npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
-           if (npos > MAXNUMPOS)
-               elog(ERROR, "unexpected number of positions");
-
-           while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
+           /*
+            * Pad to 2-byte alignment if necessary. Though we used palloc0
+            * for the initial allocation, subsequent repalloc'd memory
+            * areas are not initialized to zero.
+            */
+           if (datalen != SHORTALIGN(datalen))
             {
-               len *= 2;
-               vec = (TSVector) repalloc(vec, len);
-               weptr = ARRPTR(vec) + i;
+               *(STRPTR(vec) + datalen) = '\0';
+               datalen = SHORTALIGN(datalen);
             }
  
-           memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
-           wepptr = POSDATAPTR(vec, weptr);
+           memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
+
+           wepptr = POSDATAPTR(vec, &vec->entries[i]);
             for (j = 0; j < npos; j++)
             {
-               wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
+               wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
                 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
                     elog(ERROR, "position information is unordered");
             }
@@ -480,7 +530,7 @@ tsvectorrecv(PG_FUNCTION_ARGS)
         }
     }
  
-   SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
+   SET_VARSIZE(vec, hdrlen + datalen);
  
     PG_RETURN_TSVECTOR(vec);
  }
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c

index d34ab1fcf0bd68872f76fb7043cf8b442f1c4921..8e7593513ff70baf7f365b5a723b00874a4437ff 100644 (file)
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.3 2007/09/07 15:09:56 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -165,7 +165,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
     char       *cur;
  
     for (i = 0; i < in->size; i++)
-       len += SHORTALIGN(arrin[i].len);
+       len += arrin[i].len;
  
     len = CALCDATASIZE(in->size, len);
     out = (TSVector) palloc0(len);
@@ -179,7 +179,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
         arrout[i].haspos = 0;
         arrout[i].len = arrin[i].len;
         arrout[i].pos = cur - STRPTR(out);
-       cur += SHORTALIGN(arrout[i].len);
+       cur += arrout[i].len;
     }
  
     PG_FREE_IF_COPY(in, 0);
@@ -351,12 +351,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
             ptr->len = ptr1->len;
             memcpy(cur, data1 + ptr1->pos, ptr1->len);
             ptr->pos = cur - data;
-           cur += SHORTALIGN(ptr1->len);
             if (ptr->haspos)
             {
+               cur += SHORTALIGN(ptr1->len);
                 memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
                 cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
             }
+           else
+               cur += ptr1->len;
+               
             ptr++;
             ptr1++;
             i1--;
@@ -367,16 +370,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
             ptr->len = ptr2->len;
             memcpy(cur, data2 + ptr2->pos, ptr2->len);
             ptr->pos = cur - data;
-           cur += SHORTALIGN(ptr2->len);
             if (ptr->haspos)
             {
                 int         addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
+               cur += SHORTALIGN(ptr2->len);
+
                 if (addlen == 0)
                     ptr->haspos = 0;
                 else
                     cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
             }
+           else
+               cur += ptr2->len;
+
             ptr++;
             ptr2++;
             i2--;
@@ -387,9 +394,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
             ptr->len = ptr1->len;
             memcpy(cur, data1 + ptr1->pos, ptr1->len);
             ptr->pos = cur - data;
-           cur += SHORTALIGN(ptr1->len);
             if (ptr->haspos)
             {
+               cur += SHORTALIGN(ptr1->len);
                 if (ptr1->haspos)
                 {
                     memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
@@ -407,6 +414,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
                         cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
                 }
             }
+           else
+               cur += ptr1->len;
+
             ptr++;
             ptr1++;
             ptr2++;
@@ -421,12 +431,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
         ptr->len = ptr1->len;
         memcpy(cur, data1 + ptr1->pos, ptr1->len);
         ptr->pos = cur - data;
-       cur += SHORTALIGN(ptr1->len);
         if (ptr->haspos)
         {
+           cur += SHORTALIGN(ptr1->len);
             memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
             cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
         }
+       else
+           cur += ptr1->len;
+
         ptr++;
         ptr1++;
         i1--;
@@ -438,16 +451,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
         ptr->len = ptr2->len;
         memcpy(cur, data2 + ptr2->pos, ptr2->len);
         ptr->pos = cur - data;
-       cur += SHORTALIGN(ptr2->len);
         if (ptr->haspos)
         {
             int         addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
+           cur += SHORTALIGN(ptr2->len);
+
             if (addlen == 0)
                 ptr->haspos = 0;
             else
                 cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
         }
+       else
+           cur += ptr2->len;
+
         ptr++;
         ptr2++;
         i2--;
@@ -484,8 +501,8 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item)
  static bool
  checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item)
  {
-   WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16));
-   uint16      len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len)));
+   WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16));
+   uint16      len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len)));
  
     while (len--)
     {
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h

index 15aa9f5c6e171cf24f9a9f4ea5529f3d3f1bd65d..0aa95e892cc00325624995039d6d8ebf72ee9f03 100644 (file)
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -5,7 +5,7 @@
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.3 2007/09/07 15:35:11 teodor Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.4 2007/09/07 16:03:40 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -62,26 +62,33 @@ typedef uint16 WordEntryPos;
   *                         bytes from end of WordEntry array to start of
   *                         corresponding lexeme.
   * 4) Lexeme's storage:
- *   SHORTALIGNED(lexeme) and position information if it exists
- *   Position information: first int2 - is a number of positions and it
- *   follows array of WordEntryPos
+ *   lexeme (without null-terminator)
+ *    if haspos is true:
+ *     padding byte if necessary to make the number of positions 2-byte aligned
+ *     uint16      number of positions that follow.
+ *     uint16[]    positions
+ *
+ * The positions must be sorted.
   */
  
  typedef struct
  {
     int32       vl_len_;        /* varlena header (do not touch directly!) */
-   uint32      size;
-   char        data[1];
+   int32       size;
+   WordEntry   entries[1]; /* var size */
+   /* lexemes follow */
  } TSVectorData;
  
  typedef TSVectorData *TSVector;
  
-#define DATAHDRSIZE (VARHDRSZ + sizeof(int4))
-#define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) )
-#define ARRPTR(x)  ( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) )
-#define STRPTR(x)  ( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
-#define STRSIZE(x) ( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
-#define _POSDATAPTR(x,e)   (STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len))
+#define DATAHDRSIZE (offsetof(TSVectorData, entries))
+#define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) )
+#define ARRPTR(x)  ( (x)->entries )
+
+/* returns a pointer to the beginning of lexemes */
+#define STRPTR(x)  ( (char *) &(x)->entries[x->size] )
+
+#define _POSDATAPTR(x,e)   (STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))
  #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 )
  #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) )
  
@@ -159,7 +166,7 @@ typedef int8 QueryItemType;
  typedef struct
  {
     QueryItemType       type;   /* operand or kind of operator (ts_tokentype) */
-   int8        weight;         /* weights of operand to search. It's a bitmask of allowed weights.
+   uint8       weight;         /* weights of operand to search. It's a bitmask of allowed weights.
                                  * if it =0 then any weight are allowed.
                                  * Weights and bit map:
                                  * A: 1<<3
author	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 7 Sep 2007 16:03:40 +0000 (16:03 +0000)
src/backend/utils/adt/tsginidx.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsquery.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsquery_cleanup.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsquery_rewrite.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsquery_util.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsrank.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsvector.c		patch \| blob \| blame \| history
src/backend/utils/adt/tsvector_op.c		patch \| blob \| blame \| history
src/include/tsearch/ts_type.h		patch \| blob \| blame \| history