Change regex \D and \W shorthands to always match newlines.

author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 25 Feb 2021 18:29:06 +0000 (13:29 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 25 Feb 2021 18:29:06 +0000 (13:29 -0500)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 860ae118264279ad8baa333c03bcabec3b5eda96..c5048a199886ca8b6de3b58d1dfa061c38730e6b 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -6323,32 +6323,38 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
        <tbody>
         <row>
         <entry> <literal>\d</literal> </entry>
-       <entry> <literal>[[:digit:]]</literal> </entry>
+       <entry> matches any digit, like
+        <literal>[[:digit:]]</literal> </entry>
         </row>
  
         <row>
         <entry> <literal>\s</literal> </entry>
-       <entry> <literal>[[:space:]]</literal> </entry>
+       <entry> matches any whitespace character, like
+        <literal>[[:space:]]</literal> </entry>
         </row>
  
         <row>
         <entry> <literal>\w</literal> </entry>
-       <entry> <literal>[[:word:]]</literal> </entry>
+       <entry> matches any word character, like
+        <literal>[[:word:]]</literal> </entry>
         </row>
  
         <row>
         <entry> <literal>\D</literal> </entry>
-       <entry> <literal>[^[:digit:]]</literal> </entry>
+       <entry> matches any non-digit, like
+        <literal>[^[:digit:]]</literal> </entry>
         </row>
  
         <row>
         <entry> <literal>\S</literal> </entry>
-       <entry> <literal>[^[:space:]]</literal> </entry>
+       <entry> matches any non-whitespace character, like
+        <literal>[^[:space:]]</literal> </entry>
         </row>
  
         <row>
         <entry> <literal>\W</literal> </entry>
-       <entry> <literal>[^[:word:]]</literal> </entry>
+       <entry> matches any non-word character, like
+        <literal>[^[:word:]]</literal> </entry>
         </row>
        </tbody>
       </tgroup>
@@ -6813,14 +6819,20 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      If newline-sensitive matching is specified, <literal>.</literal>
      and bracket expressions using <literal>^</literal>
      will never match the newline character
-    (so that matches will never cross newlines unless the RE
-    explicitly arranges it)
+    (so that matches will not cross lines unless the RE
+    explicitly includes a newline)
      and <literal>^</literal> and <literal>$</literal>
      will match the empty string after and before a newline
      respectively, in addition to matching at beginning and end of string
      respectively.
      But the ARE escapes <literal>\A</literal> and <literal>\Z</literal>
      continue to match beginning or end of string <emphasis>only</emphasis>.
+    Also, the character class shorthands <literal>\D</literal>
+    and <literal>\W</literal> will match a newline regardless of this mode.
+    (Before <productname>PostgreSQL</productname> 14, they did not match
+    newlines when in newline-sensitive mode.
+    Write <literal>[^[:digit:]]</literal>
+    or <literal>[^[:word:]]</literal> to get the old behavior.)
     </para>
  
     <para>
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n

index 1afaa7cce7cad0ec1a761c76d99415515f306e77..93830fd1000b20ba5fa8948d288f79c103d18047 100644 (file)
--- a/src/backend/regex/re_syntax.n
+++ b/src/backend/regex/re_syntax.n
@@ -804,7 +804,7 @@ and bracket expressions using
  \fB^\fR
  will never match the newline character
  (so that matches will never cross newlines unless the RE
-explicitly arranges it)
+explicitly includes a newline)
  and
  \fB^\fR
  and
@@ -817,6 +817,11 @@ ARE
  and
  \fB\eZ\fR
  continue to match beginning or end of string \fIonly\fR.
+Also, the character class shorthands
+\fB\eD\fR
+and
+\fB\eW\fR
+will match a newline regardless of this mode.
  .PP
  If partial newline-sensitive matching is specified,
  this affects \fB.\fR
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 7b77a29136c8b0f518afb53da437fcb1916d57a6..d3540fdd0f384039462d680bc88d83461db67262 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1407,10 +1407,6 @@ charclasscomplement(struct vars *v,
  
     /* build arcs for char class; this may cause color splitting */
     subcolorcvec(v, cv, cstate, cstate);
-
-   /* in NLSTOP mode, ensure newline is not part of the result set */
-   if (v->cflags & REG_NLSTOP)
-       newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate);
     NOERR();
  
     /* clean up any subcolors in the arc set */
@@ -1612,6 +1608,8 @@ cbracket(struct vars *v,
  
     NOERR();
     bracket(v, left, right);
+
+   /* in NLSTOP mode, ensure newline is not part of the result set */
     if (v->cflags & REG_NLSTOP)
         newarc(v->nfa, PLAIN, v->nlcolor, left, right);
     NOERR();
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out

index 92154b6d28aa570424d571afaa7f3e6fc359f100..5d993f40c259752eb634afd4458360943ad09109 100644 (file)
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -2144,7 +2144,8 @@ select * from test_regex('\D+', E'abc\ndef345', 'nLP');
            test_regex           
  -------------------------------
   {0,REG_UNONPOSIX,REG_ULOCALE}
- {abc}
+ {"abc                        +
+ def"}
  (2 rows)
  
  select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
@@ -2159,7 +2160,8 @@ select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
                 test_regex               
  ----------------------------------------
   {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
- {abc}
+ {"abc                                 +
+ def"}
  (2 rows)
  
  select * from test_regex('\w+', E'abc_012\ndef', 'LP');
@@ -2202,7 +2204,8 @@ select * from test_regex('\W+', E'***\n@@@___', 'nLP');
            test_regex           
  -------------------------------
   {0,REG_UNONPOSIX,REG_ULOCALE}
- {***}
+ {"***                        +
+ @@@"}
  (2 rows)
  
  select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
@@ -2217,7 +2220,8 @@ select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
                 test_regex               
  ----------------------------------------
   {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
- {***}
+ {"***                                 +
+ @@@"}
  (2 rows)
  
  -- doing 13 "escapes"
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 25 Feb 2021 18:29:06 +0000 (13:29 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 25 Feb 2021 18:29:06 +0000 (13:29 -0500)
doc/src/sgml/func.sgml		patch | blob | blame | history
src/backend/regex/re_syntax.n		patch | blob | blame | history
src/backend/regex/regcomp.c		patch | blob | blame | history
src/test/modules/test_regex/expected/test_regex.out		patch | blob | blame | history