From eb086056fec44516efdd5db71244a079fed65c7f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 3 May 2021 03:58:03 +0300 Subject: [PATCH] Make websearch_to_tsquery() parse text in quotes as a single token websearch_to_tsquery() splits text in quotes into tokens and connects them with the phrase operator on its own. However, that leads to surprising results when a token contains no words. For instance, websearch_to_tsquery('"aaa: bbb"') is 'aaa <2> bbb', because it is equivalent to to_tsquery(E'aaa <-> \':\' <-> bbb'). But websearch_to_tsquery('"aaa: bbb"') has to be 'aaa <-> bbb' in order to match to_tsvector('aaa: bbb'). Since 0c4f355c6a, we connect lexemes of complex tokens with phrase operators anyway. Thus, let's just make websearch_to_tsquery() parse text in quotes as a single token. Therefore, websearch_to_tsquery() should process the quoted text in the same way phraseto_tsquery() does. This solution is exactly what we need and also simplifies the code. This commit is an incompatible change, so we don't backpatch it. 
Reported-by: Valentin Gatien-Baron Discussion: https://postgr.es/m/CA%2B0DEqiZs7gdOd4ikmg%3D0UWG%2BSwWOLxPsk_JW-sx9WNOyrb0KQ%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Tom Lane, Zhihong Yu --- src/backend/utils/adt/tsquery.c | 81 ++++++++------------------- src/test/regress/expected/tsearch.out | 24 +++++--- src/test/regress/sql/tsearch.sql | 1 + 3 files changed, 39 insertions(+), 67 deletions(-) diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index fe4470174f5..b2ca0d2f8a2 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -77,7 +77,6 @@ struct TSQueryParserStateData char *buf; /* current scan point */ int count; /* nesting count, incremented by (, * decremented by ) */ - bool in_quotes; /* phrase in quotes "" */ ts_parserstate state; /* polish (prefix) notation in list, filled in by push* functions */ @@ -235,9 +234,6 @@ parse_or_operator(TSQueryParserState pstate) { char *ptr = pstate->buf; - if (pstate->in_quotes) - return false; - /* it should begin with "OR" literal */ if (pg_strncasecmp(ptr, "or", 2) != 0) return false; @@ -398,38 +394,29 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, state->buf++; state->state = WAITOPERAND; - if (state->in_quotes) - continue; - *operator = OP_NOT; return PT_OPR; } else if (t_iseq(state->buf, '"')) { + /* Everything in quotes is processed as a single token */ + + /* skip opening quote */ state->buf++; + *strval = state->buf; - if (!state->in_quotes) - { - state->state = WAITOPERAND; + /* iterate to the closing quote or end of the string */ + while (*state->buf != '\0' && !t_iseq(state->buf, '"')) + state->buf++; + *lenval = state->buf - *strval; - if (strchr(state->buf, '"')) - { - /* quoted text should be ordered <-> */ - state->in_quotes = true; - return PT_OPEN; - } + /* skip closing quote if not end of the string */ + if (*state->buf != '\0') + state->buf++; - /* web search tolerates missing quotes */ - continue; - } - 
else - { - /* we have to provide an operand */ - state->in_quotes = false; - state->state = WAITOPERATOR; - pushStop(state); - return PT_CLOSE; - } + state->state = WAITOPERATOR; + state->count++; + return PT_VAL; } else if (ISOPERATOR(state->buf)) { @@ -467,24 +454,13 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, case WAITOPERATOR: if (t_iseq(state->buf, '"')) { - if (!state->in_quotes) - { - /* - * put implicit AND after an operand and handle this - * quote in WAITOPERAND - */ - state->state = WAITOPERAND; - *operator = OP_AND; - return PT_OPR; - } - else - { - state->buf++; - - /* just close quotes */ - state->in_quotes = false; - return PT_CLOSE; - } + /* + * put implicit AND after an operand and handle this quote + * in WAITOPERAND + */ + state->state = WAITOPERAND; + *operator = OP_AND; + return PT_OPR; } else if (parse_or_operator(state)) { @@ -498,18 +474,8 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, } else if (!t_isspace(state->buf)) { - if (state->in_quotes) - { - /* put implicit <-> after an operand */ - *operator = OP_PHRASE; - *weight = 1; - } - else - { - /* put implicit AND after an operand */ - *operator = OP_AND; - } - + /* put implicit AND after an operand */ + *operator = OP_AND; state->state = WAITOPERAND; return PT_OPR; } @@ -846,7 +812,6 @@ parse_tsquery(char *buf, state.buffer = buf; state.buf = buf; state.count = 0; - state.in_quotes = false; state.state = WAITFIRSTOPERAND; state.polstr = NIL; diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 4ae62320c9f..45b92a63388 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -2678,9 +2678,9 @@ select websearch_to_tsquery('simple', 'abc OR_abc'); -- test quotes select websearch_to_tsquery('english', '"pg_class pg'); - websearch_to_tsquery -------------------------- - 'pg' <-> 'class' & 'pg' + websearch_to_tsquery +--------------------------- + 'pg' <-> 
'class' <-> 'pg' (1 row) select websearch_to_tsquery('english', 'pg_class pg"'); @@ -2695,6 +2695,12 @@ select websearch_to_tsquery('english', '"pg_class pg"'); 'pg' <-> 'class' <-> 'pg' (1 row) +select websearch_to_tsquery('english', '"pg_class : pg"'); + websearch_to_tsquery +--------------------------- + 'pg' <-> 'class' <-> 'pg' +(1 row) + select websearch_to_tsquery('english', 'abc "pg_class pg"'); websearch_to_tsquery ----------------------------------- @@ -2708,15 +2714,15 @@ select websearch_to_tsquery('english', '"pg_class pg" def'); (1 row) select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); - websearch_to_tsquery --------------------------------------------------------- - 'abc' & 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' & 'def' + websearch_to_tsquery +---------------------------------------------------- + 'abc' & 'pg' <-> 'pg' <-> 'class' <-> 'pg' & 'def' (1 row) select websearch_to_tsquery('english', ' or "pg pg_class pg" or '); - websearch_to_tsquery ----------------------------------------- - 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' + websearch_to_tsquery +------------------------------------ + 'pg' <-> 'pg' <-> 'class' <-> 'pg' (1 row) select websearch_to_tsquery('english', '""pg pg_class pg""'); diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index b02ed73f6a8..d929210998a 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -759,6 +759,7 @@ select websearch_to_tsquery('simple', 'abc OR_abc'); select websearch_to_tsquery('english', '"pg_class pg'); select websearch_to_tsquery('english', 'pg_class pg"'); select websearch_to_tsquery('english', '"pg_class pg"'); +select websearch_to_tsquery('english', '"pg_class : pg"'); select websearch_to_tsquery('english', 'abc "pg_class pg"'); select websearch_to_tsquery('english', '"pg_class pg" def'); select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); -- 2.30.2