Improve worst-case performance of text_position_get_match_pos()
author: John Naylor <john.naylor@postgresql.org>
Fri, 17 Dec 2021 16:27:21 +0000 (12:27 -0400)
committer: John Naylor <john.naylor@postgresql.org>
Fri, 4 Feb 2022 15:53:24 +0000 (10:53 -0500)
This function converts a byte position to a character position after
a successful string match. Rather than calling pg_mblen() in a loop,
use pg_mbstrlen_with_len() since the latter can inline its own call to
pg_mblen(). When the string match is at the end of the haystack text, this
change results in a 10-20% performance improvement, depending on platform and
typical character length in bytes. This also simplifies the code a little.

Specializing for UTF-8 could result in further improvement, but the
performance gain was not found to be reliable between platforms. The modest
gain in this commit is stable between platforms and usable by all server
encodings.

Discussion:
https://www.postgresql.org/message-id/CAFBsxsH1Yutrmu+6LLHKK8iXY+vG--Do6zN+2900spHXQNNQKQ@mail.gmail.com

src/backend/utils/adt/varlena.c

index a8db8080e29e8ccc7d9762f9a24bb972af001efa..b73cebfdb5d0cac9281c9af43486d96b7a482cc7 100644 (file)
@@ -51,7 +51,6 @@ typedef struct varlena VarString;
  */
 typedef struct
 {
-   bool        is_multibyte;   /* T if multibyte encoding */
    bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
 
    char       *str1;           /* haystack string */
@@ -1221,20 +1220,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
     * and continue the search if it was a false match.
     */
    if (pg_database_encoding_max_length() == 1)
-   {
-       state->is_multibyte = false;
        state->is_multibyte_char_in_char = false;
-   }
    else if (GetDatabaseEncoding() == PG_UTF8)
-   {
-       state->is_multibyte = true;
        state->is_multibyte_char_in_char = false;
-   }
    else
-   {
-       state->is_multibyte = true;
        state->is_multibyte_char_in_char = true;
-   }
 
    state->str1 = VARDATA_ANY(t1);
    state->str2 = VARDATA_ANY(t2);
@@ -1466,19 +1456,11 @@ text_position_get_match_ptr(TextPositionState *state)
 static int
 text_position_get_match_pos(TextPositionState *state)
 {
-   if (!state->is_multibyte)
-       return state->last_match - state->str1 + 1;
-   else
-   {
-       /* Convert the byte position to char position. */
-       while (state->refpoint < state->last_match)
-       {
-           state->refpoint += pg_mblen(state->refpoint);
-           state->refpos++;
-       }
-       Assert(state->refpoint == state->last_match);
-       return state->refpos + 1;
-   }
+   /* Convert the byte position to char position. */
+   state->refpos += pg_mbstrlen_with_len(state->refpoint,
+                                         state->last_match - state->refpoint);
+   state->refpoint = state->last_match;
+   return state->refpos + 1;
 }
 
 /*