Rethink regexp engine's backref-related compilation state.

author    Tom Lane <tgl@sss.pgh.pa.us>
          Sun, 8 Aug 2021 15:56:29 +0000 (11:56 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
          Sun, 8 Aug 2021 15:56:29 +0000 (11:56 -0400)

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 4e160d54b8cf059c5d2deb78a56d57ac55ea47ef..60a220c57ab69df02decdc22a2e01b11f06fb499 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
  static int casecmp(const chr *, const chr *, size_t);
  
  
-/* info we need during compilation about a known capturing subexpression */
-struct subinfo
-{
-   struct state *left;         /* left end of its sub-NFA */
-   struct state *right;        /* right end of its sub-NFA */
-};
-
  /* internal variables, bundled for easy passing around */
  struct vars
  {
@@ -252,10 +245,10 @@ struct vars
     int         nexttype;       /* type of next token */
     chr         nextvalue;      /* value (if any) of next token */
     int         lexcon;         /* lexical context type (see regc_lex.c) */
-   int         nsubexp;        /* number of known capturing subexpressions */
-   struct subinfo *subs;       /* info about known capturing subexpressions */
-   size_t      nsubs;          /* allocated length of subs[] vector */
-   struct subinfo sub10[10];   /* initial vector, enough for most */
+   int         nsubexp;        /* subexpression count */
+   struct subre **subs;        /* subRE pointer vector */
+   size_t      nsubs;          /* length of vector */
+   struct subre *sub10[10];    /* initial vector, enough for most */
     struct nfa *nfa;            /* the NFA */
     struct colormap *cm;        /* character color map */
     color       nlcolor;        /* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
     v->subs = v->sub10;
     v->nsubs = 10;
     for (j = 0; j < v->nsubs; j++)
-       v->subs[j].left = v->subs[j].right = NULL;
+       v->subs[j] = NULL;
     v->nfa = NULL;
     v->cm = NULL;
     v->nlcolor = COLORLESS;
@@ -511,13 +504,13 @@ pg_regcomp(regex_t *re,
  }
  
  /*
- * moresubs - enlarge capturing-subexpressions vector
+ * moresubs - enlarge subRE vector
   */
  static void
  moresubs(struct vars *v,
          int wanted)            /* want enough room for this one */
  {
-   struct subinfo *p;
+   struct subre **p;
     size_t      n;
  
     assert(wanted > 0 && (size_t) wanted >= v->nsubs);
@@ -525,13 +518,13 @@ moresubs(struct vars *v,
  
     if (v->subs == v->sub10)
     {
-       p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
+       p = (struct subre **) MALLOC(n * sizeof(struct subre *));
         if (p != NULL)
             memcpy(VS(p), VS(v->subs),
-                  v->nsubs * sizeof(struct subinfo));
+                  v->nsubs * sizeof(struct subre *));
     }
     else
-       p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
+       p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
     if (p == NULL)
     {
         ERR(REG_ESPACE);
@@ -539,7 +532,7 @@ moresubs(struct vars *v,
     }
     v->subs = p;
     for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
-       p->left = p->right = NULL;
+       *p = NULL;
     assert(v->nsubs == n);
     assert((size_t) wanted < v->nsubs);
  }
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
             s = newstate(v->nfa);
             s2 = newstate(v->nfa);
             NOERRN();
+           /* We may not need these arcs, but keep things connected for now */
             EMPTYARC(lp, s);
             EMPTYARC(s2, rp);
             NOERRN();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
             NOERRN();
             if (cap)
             {
-               /* save the sub-NFA's endpoints for future backrefs to use */
-               assert(v->subs[subno].left == NULL);
-               v->subs[subno].left = s;
-               v->subs[subno].right = s2;
                 if (atom->capno == 0)
                 {
                     /* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
                     t->child = atom;
                     atom = t;
                 }
+               assert(v->subs[subno] == NULL);
+               v->subs[subno] = atom;
             }
             /* postpone everything else pending possible {0} */
             break;
         case BACKREF:           /* the Feature From The Black Lagoon */
             INSIST(type != LACON, REG_ESUBREG);
             INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-           INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
+           INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
             NOERRN();
             assert(v->nextvalue > 0);
             atom = subre(v, 'b', BACKR, lp, rp);
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
         if (atom != NULL)
             freesubre(v, atom);
         if (atomtype == '(')
-           v->subs[subno].left = v->subs[subno].right = NULL;
+           v->subs[subno] = NULL;
         delsub(v->nfa, lp, rp);
         EMPTYARC(lp, rp);
         return top;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
         NOERRN();
     }
  
+   /*
+    * For what follows, we need the atom to have its own begin/end states
+    * that are distinct from lp/rp, so that we can wrap iteration structure
+    * around it.  The parenthesized-atom case above already made suitable
+    * states (and we don't want to modify a capturing subre, since it's
+    * already recorded in v->subs[]).  Otherwise, we need more states.
+    */
+   if (atom->begin == lp || atom->end == rp)
+   {
+       s = newstate(v->nfa);
+       s2 = newstate(v->nfa);
+       NOERRN();
+       moveouts(v->nfa, lp, s);
+       moveins(v->nfa, rp, s2);
+       atom->begin = s;
+       atom->end = s2;
+   }
+   else
+   {
+       /* The atom's OK, but we must temporarily disconnect it from lp/rp */
+       /* (this removes the EMPTY arcs we made above) */
+       delsub(v->nfa, lp, atom->begin);
+       delsub(v->nfa, atom->end, rp);
+   }
+
     /*----------
      * Prepare a general-purpose state skeleton.
      *
      * In the no-backrefs case, we want this:
      *
-    * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+    * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
      *
-    * where prefix is some repetitions of atom.  In the general case we need
+    * where prefix is some repetitions of atom, and "rest" is the remainder
+    * of the branch.  In the general case we need:
      *
      * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
      *
-    * where the iterator wraps around [begin] ---atom---> [end]
+    * where the iterator wraps around the atom.
      *
      * We make the s state here for both cases; s2 is made below if needed
      *----------
      */
-   s = newstate(v->nfa);       /* first, new endpoints for the atom */
-   s2 = newstate(v->nfa);
-   NOERRN();
-   moveouts(v->nfa, lp, s);
-   moveins(v->nfa, rp, s2);
-   NOERRN();
-   atom->begin = s;
-   atom->end = s2;
     s = newstate(v->nfa);       /* set up starting state */
     NOERRN();
     EMPTYARC(lp, s);
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
     {
         assert(atom->begin->nouts == 1);    /* just the EMPTY */
         delsub(v->nfa, atom->begin, atom->end);
-       assert(v->subs[subno].left != NULL);
+       assert(v->subs[subno] != NULL);
  
         /*
          * And here's why the recursion got postponed: it must wait until the
          * skeleton is filled in, because it may hit a backref that wants to
          * copy the filled-in skeleton.
          */
-       dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
+       dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
                atom->begin, atom->end);
         NOERRN();
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 8 Aug 2021 15:56:29 +0000 (11:56 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 8 Aug 2021 15:56:29 +0000 (11:56 -0400)