Fix default text search parser's ts_headline code for phrase queries.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c

index 9a351ac9da4cce53c09a4cde32871c48aecbec06..e5c4d7cb04911a87cb834f82c453ecafb5f8b199 100644 (file)
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1942,9 +1942,10 @@ prsd_end(PG_FUNCTION_ARGS)
  #define INTERESTINGWORD(j) \
     (prs->words[j].item && !prs->words[j].repeated)
  
-/* Don't want to end at a non-word or a short word */
+/* Don't want to end at a non-word or a short word, unless interesting */
  #define BADENDPOINT(j) \
-   (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
+   ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
+    !INTERESTINGWORD(j))
  
  typedef struct
  {
@@ -2003,75 +2004,97 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
     return false;
  }
  
-
-static bool
-hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
+/*
+ * hlFirstIndex: find first index >= pos containing any word used in query
+ *
+ * Returns -1 if no such index
+ */
+static int
+hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
  {
-   int         i,
-               j;
-   QueryItem  *item = GETQUERY(query);
-   int         pos = *p;
-
-   *q = -1;
-   *p = INT_MAX;
+   int         i;
  
-   for (j = 0; j < query->size; j++)
+   /* For each word ... */
+   for (i = pos; i < prs->curwords; i++)
     {
-       if (item->type != QI_VAL)
+       /* ... scan the query to see if this word matches any operand */
+       QueryItem  *item = GETQUERY(query);
+       int         j;
+
+       for (j = 0; j < query->size; j++)
         {
+           if (item->type == QI_VAL &&
+               prs->words[i].item == &item->qoperand)
+               return i;
             item++;
-           continue;
-       }
-       for (i = pos; i < prs->curwords; i++)
-       {
-           if (prs->words[i].item == &item->qoperand)
-           {
-               if (i > *q)
-                   *q = i;
-               break;
-           }
         }
-       item++;
     }
+   return -1;
+}
  
-   if (*q < 0)
-       return false;
+/*
+ * hlCover: try to find a substring of prs' word list that satisfies query
+ *
+ * At entry, *p must be the first word index to consider (initialize this to
+ * zero, or to the next index after a previous successful search).
+ *
+ * On success, sets *p to first word index and *q to last word index of the
+ * cover substring, and returns true.
+ *
+ * The result is a minimal cover, in the sense that both *p and *q will be
+ * words used in the query.
+ */
+static bool
+hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
+{
+   int         pmin,
+               pmax,
+               nextpmin,
+               nextpmax;
+   hlCheck     ch;
  
-   item = GETQUERY(query);
-   for (j = 0; j < query->size; j++)
+   /*
+    * We look for the earliest, shortest substring of prs->words that
+    * satisfies the query.  Both the pmin and pmax indices must be words
+    * appearing in the query; there's no point in trying endpoints in between
+    * such points.
+    */
+   pmin = hlFirstIndex(prs, query, *p);
+   while (pmin >= 0)
     {
-       if (item->type != QI_VAL)
+       /* This useless assignment just keeps stupider compilers quiet */
+       nextpmin = -1;
+       /* Consider substrings starting at pmin */
+       ch.words = &(prs->words[pmin]);
+       /* Consider the length-one substring first, then longer substrings */
+       pmax = pmin;
+       do
         {
-           item++;
-           continue;
-       }
-       for (i = *q; i >= pos; i--)
-       {
-           if (prs->words[i].item == &item->qoperand)
+           /* Try to match query against pmin .. pmax substring */
+           ch.len = pmax - pmin + 1;
+           if (TS_execute(GETQUERY(query), &ch,
+                          TS_EXEC_EMPTY, checkcondition_HL))
             {
-               if (i < *p)
-                   *p = i;
-               break;
+               *p = pmin;
+               *q = pmax;
+               return true;
             }
-       }
-       item++;
-   }
-
-   if (*p <= *q)
-   {
-       hlCheck     ch;
+           /* Nope, so advance pmax to next feasible endpoint */
+           nextpmax = hlFirstIndex(prs, query, pmax + 1);
  
-       ch.words = &(prs->words[*p]);
-       ch.len = *q - *p + 1;
-       if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
-           return true;
-       else
-       {
-           (*p)++;
-           return hlCover(prs, query, p, q);
+           /*
+            * If this is our first advance past pmin, then the result is also
+            * the next feasible value of pmin; remember it to save a
+            * redundant search.
+            */
+           if (pmax == pmin)
+               nextpmin = nextpmax;
+           pmax = nextpmax;
         }
+       while (pmax >= 0);
+       /* No luck here, so try next feasible startpoint */
+       pmin = nextpmin;
     }
-
     return false;
  }
  
@@ -2357,11 +2380,12 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
     int         bestb = -1,
                 beste = -1;
     int         bestlen = -1;
+   bool        bestcover = false;
     int         pose,
                 posb,
                 poslen,
                 curlen;
-
+   bool        poscover;
     int         i;
  
     if (!highlightall)
@@ -2387,14 +2411,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
                 pose = i;
             }
  
-           /* XXX this optimization seems unnecessary and wrong */
-           if (poslen < bestlen && !BADENDPOINT(beste))
-           {
-               /* better cover already found, so try next cover */
-               p++;
-               continue;
-           }
-
             if (curlen < max_words)
             {
                 /*
@@ -2449,29 +2465,38 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
                     i = q;
                 for (; curlen > min_words; i--)
                 {
+                   if (!BADENDPOINT(i))
+                       break;
                     if (!NONWORDTOKEN(prs->words[i].type))
                         curlen--;
                     if (INTERESTINGWORD(i))
                         poslen--;
-                   pose = i;
-                   if (!BADENDPOINT(i))
-                       break;
+                   pose = i - 1;
                 }
             }
  
             /*
-            * Adopt this headline if it's the first, or if it has more
-            * interesting words and isn't ending at a bad endpoint, or if it
-            * replaces a bad endpoint with a good one (XXX even if it has
-            * fewer interesting words?  Really?)
+            * Check whether the proposed headline includes the original
+            * cover; it might not if we trimmed it due to max_words.
+            */
+           poscover = (posb <= p && pose >= q);
+
+           /*
+            * Adopt this headline if it's better than the last one, giving
+            * highest priority to headlines including the cover, then to
+            * headlines with more interesting words, then to headlines with
+            * good stopping points.  (Since bestlen is initially -1, we will
+            * certainly adopt the first headline.)
              */
-           if (bestlen < 0 ||
-               (poslen > bestlen && !BADENDPOINT(pose)) ||
-               (!BADENDPOINT(pose) && BADENDPOINT(beste)))
+           if (poscover > bestcover ||
+               (poscover == bestcover && poslen > bestlen) ||
+               (poscover == bestcover && poslen == bestlen &&
+                !BADENDPOINT(pose) && BADENDPOINT(beste)))
             {
                 bestb = posb;
                 beste = pose;
                 bestlen = poslen;
+               bestcover = poscover;
             }
  
             /* move p to generate the next cover */
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out

index 2bfd58b0b967778519a0fd10126120de0efb1e97..d6e0ba6f81b5c355806fece149960602e3225616 100644 (file)
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -1301,12 +1301,12 @@ Water, water, every where,
    Nor any drop to drink.
  S. T. Coleridge (1772-1834)
  ', phraseto_tsquery('english', 'painted Ocean'));
-           ts_headline            
-----------------------------------
- <b>painted</b> <b>Ocean</b>.    +
- Water, water, every where       +
-   And all the boards did shrink;+
- Water, water, every
+              ts_headline              
+---------------------------------------
+ <b>painted</b> Ship                  +
+   Upon a <b>painted</b> <b>Ocean</b>.+
+ Water, water, every where            +
+   And all the boards did shrink
  (1 row)
  
  SELECT ts_headline('english', '
@@ -1328,6 +1328,15 @@ S. T. Coleridge (1772-1834)
     And all the boards
  (1 row)
  
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxWords=100, MinWords=1');
+                                  ts_headline                                  
+-------------------------------------------------------------------------------
+ <b>Lorem</b> ipsum <b>urna</b>.  Nullam nullam <b>ullamcorper</b> <b>urna</b>
+(1 row)
+
  SELECT ts_headline('english', '
  <html>
  <!-- some comment -->
@@ -1364,15 +1373,15 @@ SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords
  (1 row)
  
  SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
-         ts_headline          
-------------------------------
- <b>1</b> 2 <b>3</b> <b>1</b>
+     ts_headline     
+---------------------
+ <b>1</b> 2 <b>3</b>
  (1 row)
  
  SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
-    ts_headline    
--------------------
- <b>1</b> <b>3</b>
+        ts_headline         
+----------------------------
+ <b>3</b> <b>1</b> <b>3</b>
  (1 row)
  
  --Check if headline fragments work
@@ -1467,6 +1476,16 @@ S. T. Coleridge (1772-1834)
   S. T. <b>Coleridge</b>
  (1 row)
  
+--Fragments with phrase search
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxFragments=100, MaxWords=100, MinWords=1');
+                                  ts_headline                                  
+-------------------------------------------------------------------------------
+ <b>Lorem</b> ipsum <b>urna</b>.  Nullam nullam <b>ullamcorper</b> <b>urna</b>
+(1 row)
+
  --Rewrite sub system
  CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
  \set ECHO none
diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql

index c71cda5cf92b7fbcd5ffb97d62d5088f4a1ec545..22c84a31342c6f4a8cddbfd8c2668189238a0f60 100644 (file)
--- a/src/test/regress/sql/tsearch.sql
+++ b/src/test/regress/sql/tsearch.sql
@@ -384,6 +384,11 @@ Water, water, every where,
  S. T. Coleridge (1772-1834)
  ', phraseto_tsquery('english', 'idle as a painted Ship'));
  
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxWords=100, MinWords=1');
+
  SELECT ts_headline('english', '
  <html>
  <!-- some comment -->
@@ -454,6 +459,12 @@ Water, water, every where,
  S. T. Coleridge (1772-1834)
  ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
  
+--Fragments with phrase search
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxFragments=100, MaxWords=100, MinWords=1');
+
  --Rewrite sub system
  
  CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 9 Apr 2020 17:19:23 +0000 (13:19 -0400)
src/backend/tsearch/wparser_def.c		patch \| blob \| blame \| history
src/test/regress/expected/tsearch.out		patch \| blob \| blame \| history
src/test/regress/sql/tsearch.sql		patch \| blob \| blame \| history