Refactor string case conversion into provider-specific files.
author: Jeff Davis <jdavis@postgresql.org>
Mon, 16 Dec 2024 17:35:18 +0000 (09:35 -0800)
committer: Jeff Davis <jdavis@postgresql.org>
Mon, 16 Dec 2024 17:35:18 +0000 (09:35 -0800)
Create API entry points pg_strlower(), pg_strtitle(), and pg_strupper()
that work with any provider and give the caller control over the
destination buffer. Then, move provider-specific logic into
pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate.

Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com

src/backend/utils/adt/formatting.c
src/backend/utils/adt/pg_locale.c
src/backend/utils/adt/pg_locale_builtin.c
src/backend/utils/adt/pg_locale_icu.c
src/backend/utils/adt/pg_locale_libc.c
src/include/utils/pg_locale.h

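diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 0dcb5515119f40f1c17119dc227cbd1e38e8b2d3..30c06c8d0992e3ecec9eb9d2f89601278fb52b8f 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c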
@@ -1571,52 +1571,6 @@ str_numth(char *dest, char *num, int type)
  *         upper/lower/initcap functions
  *****************************************************************************/
 
-#ifdef USE_ICU
-
-typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
-                                    const UChar *src, int32_t srcLength,
-                                    const char *locale,
-                                    UErrorCode *pErrorCode);
-
-static int32_t
-icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
-                UChar **buff_dest, UChar *buff_source, int32_t len_source)
-{
-   UErrorCode  status;
-   int32_t     len_dest;
-
-   len_dest = len_source;      /* try first with same length */
-   *buff_dest = palloc(len_dest * sizeof(**buff_dest));
-   status = U_ZERO_ERROR;
-   len_dest = func(*buff_dest, len_dest, buff_source, len_source,
-                   mylocale->info.icu.locale, &status);
-   if (status == U_BUFFER_OVERFLOW_ERROR)
-   {
-       /* try again with adjusted length */
-       pfree(*buff_dest);
-       *buff_dest = palloc(len_dest * sizeof(**buff_dest));
-       status = U_ZERO_ERROR;
-       len_dest = func(*buff_dest, len_dest, buff_source, len_source,
-                       mylocale->info.icu.locale, &status);
-   }
-   if (U_FAILURE(status))
-       ereport(ERROR,
-               (errmsg("case conversion failed: %s", u_errorName(status))));
-   return len_dest;
-}
-
-static int32_t
-u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
-                       const UChar *src, int32_t srcLength,
-                       const char *locale,
-                       UErrorCode *pErrorCode)
-{
-   return u_strToTitle(dest, destCapacity, src, srcLength,
-                       NULL, locale, pErrorCode);
-}
-
-#endif                         /* USE_ICU */
-
 /*
  * If the system provides the needed functions for wide-character manipulation
  * (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1664,106 +1618,28 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
    }
    else
    {
-#ifdef USE_ICU
-       if (mylocale->provider == COLLPROVIDER_ICU)
-       {
-           int32_t     len_uchar;
-           int32_t     len_conv;
-           UChar      *buff_uchar;
-           UChar      *buff_conv;
-
-           len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
-           len_conv = icu_convert_case(u_strToLower, mylocale,
-                                       &buff_conv, buff_uchar, len_uchar);
-           icu_from_uchar(&result, buff_conv, len_conv);
-           pfree(buff_uchar);
-           pfree(buff_conv);
-       }
-       else
-#endif
-       if (mylocale->provider == COLLPROVIDER_BUILTIN)
+       const char *src = buff;
+       size_t      srclen = nbytes;
+       size_t      dstsize;
+       char       *dst;
+       size_t      needed;
+
+       /* first try buffer of equal size plus terminating NUL */
+       dstsize = srclen + 1;
+       dst = palloc(dstsize);
+
+       needed = pg_strlower(dst, dstsize, src, srclen, mylocale);
+       if (needed + 1 > dstsize)
        {
-           const char *src = buff;
-           size_t      srclen = nbytes;
-           size_t      dstsize;
-           char       *dst;
-           size_t      needed;
-
-           Assert(GetDatabaseEncoding() == PG_UTF8);
-
-           /* first try buffer of equal size plus terminating NUL */
-           dstsize = srclen + 1;
-           dst = palloc(dstsize);
-
-           needed = unicode_strlower(dst, dstsize, src, srclen);
-           if (needed + 1 > dstsize)
-           {
-               /* grow buffer if needed and retry */
-               dstsize = needed + 1;
-               dst = repalloc(dst, dstsize);
-               needed = unicode_strlower(dst, dstsize, src, srclen);
-               Assert(needed + 1 == dstsize);
-           }
-
-           Assert(dst[needed] == '\0');
-           result = dst;
+           /* grow buffer if needed and retry */
+           dstsize = needed + 1;
+           dst = repalloc(dst, dstsize);
+           needed = pg_strlower(dst, dstsize, src, srclen, mylocale);
+           Assert(needed + 1 <= dstsize);
        }
-       else
-       {
-           Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
-           if (pg_database_encoding_max_length() > 1)
-           {
-               wchar_t    *workspace;
-               size_t      curr_char;
-               size_t      result_size;
-
-               /* Overflow paranoia */
-               if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-                   ereport(ERROR,
-                           (errcode(ERRCODE_OUT_OF_MEMORY),
-                            errmsg("out of memory")));
-
-               /* Output workspace cannot have more codes than input bytes */
-               workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
-               char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
-               for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-                   workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
-
-               /*
-                * Make result large enough; case change might change number
-                * of bytes
-                */
-               result_size = curr_char * pg_database_encoding_max_length() + 1;
-               result = palloc(result_size);
 
-               wchar2char(result, workspace, result_size, mylocale);
-               pfree(workspace);
-           }
-           else
-           {
-               char       *p;
-
-               result = pnstrdup(buff, nbytes);
-
-               /*
-                * Note: we assume that tolower_l() will not be so broken as
-                * to need an isupper_l() guard test.  When using the default
-                * collation, we apply the traditional Postgres behavior that
-                * forces ASCII-style treatment of I/i, but in non-default
-                * collations you get exactly what the collation says.
-                */
-               for (p = result; *p; p++)
-               {
-                   if (mylocale->is_default)
-                       *p = pg_tolower((unsigned char) *p);
-                   else
-                       *p = tolower_l((unsigned char) *p, mylocale->info.lt);
-               }
-           }
-       }
+       Assert(dst[needed] == '\0');
+       result = dst;
    }
 
    return result;
@@ -1806,152 +1682,33 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
    }
    else
    {
-#ifdef USE_ICU
-       if (mylocale->provider == COLLPROVIDER_ICU)
+       const char *src = buff;
+       size_t      srclen = nbytes;
+       size_t      dstsize;
+       char       *dst;
+       size_t      needed;
+
+       /* first try buffer of equal size plus terminating NUL */
+       dstsize = srclen + 1;
+       dst = palloc(dstsize);
+
+       needed = pg_strupper(dst, dstsize, src, srclen, mylocale);
+       if (needed + 1 > dstsize)
        {
-           int32_t     len_uchar,
-                       len_conv;
-           UChar      *buff_uchar;
-           UChar      *buff_conv;
-
-           len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
-           len_conv = icu_convert_case(u_strToUpper, mylocale,
-                                       &buff_conv, buff_uchar, len_uchar);
-           icu_from_uchar(&result, buff_conv, len_conv);
-           pfree(buff_uchar);
-           pfree(buff_conv);
+           /* grow buffer if needed and retry */
+           dstsize = needed + 1;
+           dst = repalloc(dst, dstsize);
+           needed = pg_strupper(dst, dstsize, src, srclen, mylocale);
+           Assert(needed + 1 <= dstsize);
        }
-       else
-#endif
-       if (mylocale->provider == COLLPROVIDER_BUILTIN)
-       {
-           const char *src = buff;
-           size_t      srclen = nbytes;
-           size_t      dstsize;
-           char       *dst;
-           size_t      needed;
-
-           Assert(GetDatabaseEncoding() == PG_UTF8);
-
-           /* first try buffer of equal size plus terminating NUL */
-           dstsize = srclen + 1;
-           dst = palloc(dstsize);
-
-           needed = unicode_strupper(dst, dstsize, src, srclen);
-           if (needed + 1 > dstsize)
-           {
-               /* grow buffer if needed and retry */
-               dstsize = needed + 1;
-               dst = repalloc(dst, dstsize);
-               needed = unicode_strupper(dst, dstsize, src, srclen);
-               Assert(needed + 1 == dstsize);
-           }
-
-           Assert(dst[needed] == '\0');
-           result = dst;
-       }
-       else
-       {
-           Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
-           if (pg_database_encoding_max_length() > 1)
-           {
-               wchar_t    *workspace;
-               size_t      curr_char;
-               size_t      result_size;
-
-               /* Overflow paranoia */
-               if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-                   ereport(ERROR,
-                           (errcode(ERRCODE_OUT_OF_MEMORY),
-                            errmsg("out of memory")));
-
-               /* Output workspace cannot have more codes than input bytes */
-               workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
-               char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
-               for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-                   workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
-
-               /*
-                * Make result large enough; case change might change number
-                * of bytes
-                */
-               result_size = curr_char * pg_database_encoding_max_length() + 1;
-               result = palloc(result_size);
 
-               wchar2char(result, workspace, result_size, mylocale);
-               pfree(workspace);
-           }
-           else
-           {
-               char       *p;
-
-               result = pnstrdup(buff, nbytes);
-
-               /*
-                * Note: we assume that toupper_l() will not be so broken as
-                * to need an islower_l() guard test.  When using the default
-                * collation, we apply the traditional Postgres behavior that
-                * forces ASCII-style treatment of I/i, but in non-default
-                * collations you get exactly what the collation says.
-                */
-               for (p = result; *p; p++)
-               {
-                   if (mylocale->is_default)
-                       *p = pg_toupper((unsigned char) *p);
-                   else
-                       *p = toupper_l((unsigned char) *p, mylocale->info.lt);
-               }
-           }
-       }
+       Assert(dst[needed] == '\0');
+       result = dst;
    }
 
    return result;
 }
 
-struct WordBoundaryState
-{
-   const char *str;
-   size_t      len;
-   size_t      offset;
-   bool        init;
-   bool        prev_alnum;
-};
-
-/*
- * Simple word boundary iterator that draws boundaries each time the result of
- * pg_u_isalnum() changes.
- */
-static size_t
-initcap_wbnext(void *state)
-{
-   struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
-
-   while (wbstate->offset < wbstate->len &&
-          wbstate->str[wbstate->offset] != '\0')
-   {
-       pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
-                                       wbstate->offset);
-       bool        curr_alnum = pg_u_isalnum(u, true);
-
-       if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
-       {
-           size_t      prev_offset = wbstate->offset;
-
-           wbstate->init = true;
-           wbstate->offset += unicode_utf8len(u);
-           wbstate->prev_alnum = curr_alnum;
-           return prev_offset;
-       }
-
-       wbstate->offset += unicode_utf8len(u);
-   }
-
-   return wbstate->len;
-}
-
 /*
  * collation-aware, wide-character-aware initcap function
  *
@@ -1962,7 +1719,6 @@ char *
 str_initcap(const char *buff, size_t nbytes, Oid collid)
 {
    char       *result;
-   int         wasalnum = false;
    pg_locale_t mylocale;
 
    if (!buff)
@@ -1990,135 +1746,28 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
    }
    else
    {
-#ifdef USE_ICU
-       if (mylocale->provider == COLLPROVIDER_ICU)
+       const char *src = buff;
+       size_t      srclen = nbytes;
+       size_t      dstsize;
+       char       *dst;
+       size_t      needed;
+
+       /* first try buffer of equal size plus terminating NUL */
+       dstsize = srclen + 1;
+       dst = palloc(dstsize);
+
+       needed = pg_strtitle(dst, dstsize, src, srclen, mylocale);
+       if (needed + 1 > dstsize)
        {
-           int32_t     len_uchar,
-                       len_conv;
-           UChar      *buff_uchar;
-           UChar      *buff_conv;
-
-           len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
-           len_conv = icu_convert_case(u_strToTitle_default_BI, mylocale,
-                                       &buff_conv, buff_uchar, len_uchar);
-           icu_from_uchar(&result, buff_conv, len_conv);
-           pfree(buff_uchar);
-           pfree(buff_conv);
+           /* grow buffer if needed and retry */
+           dstsize = needed + 1;
+           dst = repalloc(dst, dstsize);
+           needed = pg_strtitle(dst, dstsize, src, srclen, mylocale);
+           Assert(needed + 1 <= dstsize);
        }
-       else
-#endif
-       if (mylocale->provider == COLLPROVIDER_BUILTIN)
-       {
-           const char *src = buff;
-           size_t      srclen = nbytes;
-           size_t      dstsize;
-           char       *dst;
-           size_t      needed;
-           struct WordBoundaryState wbstate = {
-               .str = src,
-               .len = srclen,
-               .offset = 0,
-               .init = false,
-               .prev_alnum = false,
-           };
-
-           Assert(GetDatabaseEncoding() == PG_UTF8);
-
-           /* first try buffer of equal size plus terminating NUL */
-           dstsize = srclen + 1;
-           dst = palloc(dstsize);
-
-           needed = unicode_strtitle(dst, dstsize, src, srclen,
-                                     initcap_wbnext, &wbstate);
-           if (needed + 1 > dstsize)
-           {
-               /* reset iterator */
-               wbstate.offset = 0;
-               wbstate.init = false;
-
-               /* grow buffer if needed and retry */
-               dstsize = needed + 1;
-               dst = repalloc(dst, dstsize);
-               needed = unicode_strtitle(dst, dstsize, src, srclen,
-                                         initcap_wbnext, &wbstate);
-               Assert(needed + 1 == dstsize);
-           }
 
-           result = dst;
-       }
-       else
-       {
-           Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
-           if (pg_database_encoding_max_length() > 1)
-           {
-               wchar_t    *workspace;
-               size_t      curr_char;
-               size_t      result_size;
-
-               /* Overflow paranoia */
-               if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-                   ereport(ERROR,
-                           (errcode(ERRCODE_OUT_OF_MEMORY),
-                            errmsg("out of memory")));
-
-               /* Output workspace cannot have more codes than input bytes */
-               workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
-               char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
-               for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-               {
-                   if (wasalnum)
-                       workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
-                   else
-                       workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
-                   wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
-               }
-
-               /*
-                * Make result large enough; case change might change number
-                * of bytes
-                */
-               result_size = curr_char * pg_database_encoding_max_length() + 1;
-               result = palloc(result_size);
-
-               wchar2char(result, workspace, result_size, mylocale);
-               pfree(workspace);
-           }
-           else
-           {
-               char       *p;
-
-               result = pnstrdup(buff, nbytes);
-
-               /*
-                * Note: we assume that toupper_l()/tolower_l() will not be so
-                * broken as to need guard tests.  When using the default
-                * collation, we apply the traditional Postgres behavior that
-                * forces ASCII-style treatment of I/i, but in non-default
-                * collations you get exactly what the collation says.
-                */
-               for (p = result; *p; p++)
-               {
-                   if (mylocale->is_default)
-                   {
-                       if (wasalnum)
-                           *p = pg_tolower((unsigned char) *p);
-                       else
-                           *p = pg_toupper((unsigned char) *p);
-                   }
-                   else
-                   {
-                       if (wasalnum)
-                           *p = tolower_l((unsigned char) *p, mylocale->info.lt);
-                       else
-                           *p = toupper_l((unsigned char) *p, mylocale->info.lt);
-                   }
-                   wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt);
-               }
-           }
-       }
+       Assert(dst[needed] == '\0');
+       result = dst;
    }
 
    return result;
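diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 4cb56126e97b4e793632b04877ad91be5f8ecd4a..d16f26f1705887e3172ae2830c080760e5f5bf39 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c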
@@ -116,6 +116,27 @@ extern size_t strnxfrm_libc(char *dest, size_t destsize,
                            const char *src, ssize_t srclen,
                            pg_locale_t locale);
 
+extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+
+extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
+
+extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+
 /* GUC settings */
 char      *locale_messages;
 char      *locale_monetary;
@@ -1468,6 +1489,63 @@ get_collation_actual_version(char collprovider, const char *collcollate)
    return collversion;
 }
 
+size_t
+pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+           pg_locale_t locale)
+{
+   if (locale->provider == COLLPROVIDER_BUILTIN)
+       return strlower_builtin(dst, dstsize, src, srclen, locale);
+#ifdef USE_ICU
+   else if (locale->provider == COLLPROVIDER_ICU)
+       return strlower_icu(dst, dstsize, src, srclen, locale);
+#endif
+   else if (locale->provider == COLLPROVIDER_LIBC)
+       return strlower_libc(dst, dstsize, src, srclen, locale);
+   else
+       /* shouldn't happen */
+       PGLOCALE_SUPPORT_ERROR(locale->provider);
+
+   return 0;                   /* keep compiler quiet */
+}
+
+size_t
+pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+           pg_locale_t locale)
+{
+   if (locale->provider == COLLPROVIDER_BUILTIN)
+       return strtitle_builtin(dst, dstsize, src, srclen, locale);
+#ifdef USE_ICU
+   else if (locale->provider == COLLPROVIDER_ICU)
+       return strtitle_icu(dst, dstsize, src, srclen, locale);
+#endif
+   else if (locale->provider == COLLPROVIDER_LIBC)
+       return strtitle_libc(dst, dstsize, src, srclen, locale);
+   else
+       /* shouldn't happen */
+       PGLOCALE_SUPPORT_ERROR(locale->provider);
+
+   return 0;                   /* keep compiler quiet */
+}
+
+size_t
+pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+           pg_locale_t locale)
+{
+   if (locale->provider == COLLPROVIDER_BUILTIN)
+       return strupper_builtin(dst, dstsize, src, srclen, locale);
+#ifdef USE_ICU
+   else if (locale->provider == COLLPROVIDER_ICU)
+       return strupper_icu(dst, dstsize, src, srclen, locale);
+#endif
+   else if (locale->provider == COLLPROVIDER_LIBC)
+       return strupper_libc(dst, dstsize, src, srclen, locale);
+   else
+       /* shouldn't happen */
+       PGLOCALE_SUPPORT_ERROR(locale->provider);
+
+   return 0;                   /* keep compiler quiet */
+}
+
 /*
  * pg_strcoll
  *
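diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 4246971a4d8d4495bfe002bacf6f6e81d607bb4f..d3aa7bceacd7d4c3b6a325354ac501e6f8c33fa2 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c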
@@ -13,6 +13,8 @@
 
 #include "catalog/pg_database.h"
 #include "catalog/pg_collation.h"
+#include "common/unicode_case.h"
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
 
 extern pg_locale_t create_pg_locale_builtin(Oid collid,
                                            MemoryContext context);
+extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
+                              ssize_t srclen, pg_locale_t locale);
+
+
+struct WordBoundaryState
+{
+   const char *str;
+   size_t      len;
+   size_t      offset;
+   bool        init;
+   bool        prev_alnum;
+};
+
+/*
+ * Simple word boundary iterator that draws boundaries each time the result of
+ * pg_u_isalnum() changes.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+   struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+   while (wbstate->offset < wbstate->len &&
+          wbstate->str[wbstate->offset] != '\0')
+   {
+       pg_wchar    u = utf8_to_unicode((unsigned char *) wbstate->str +
+                                       wbstate->offset);
+       bool        curr_alnum = pg_u_isalnum(u, true);
+
+       if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+       {
+           size_t      prev_offset = wbstate->offset;
+
+           wbstate->init = true;
+           wbstate->offset += unicode_utf8len(u);
+           wbstate->prev_alnum = curr_alnum;
+           return prev_offset;
+       }
+
+       wbstate->offset += unicode_utf8len(u);
+   }
+
+   return wbstate->len;
+}
+
+size_t
+strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   return unicode_strlower(dest, destsize, src, srclen);
+}
+
+size_t
+strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   struct WordBoundaryState wbstate = {
+       .str = src,
+       .len = srclen,
+       .offset = 0,
+       .init = false,
+       .prev_alnum = false,
+   };
+
+   return unicode_strtitle(dest, destsize, src, srclen,
+                           initcap_wbnext, &wbstate);
+}
+
+size_t
+strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   return unicode_strupper(dest, destsize, src, srclen);
+}
 
 pg_locale_t
 create_pg_locale_builtin(Oid collid, MemoryContext context)
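diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 2c6b950ec18afb16156562f935b798d326a88135..f0a77a767e7c7ca27bed9c4cae231cb6615377df 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c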
 #define        TEXTBUFLEN          1024
 
 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
+extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
+                          ssize_t srclen, pg_locale_t locale);
 
 #ifdef USE_ICU
 
@@ -62,6 +68,11 @@ extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
                                  const char *src, ssize_t srclen,
                                  pg_locale_t locale);
 
+typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
+                                    const UChar *src, int32_t srcLength,
+                                    const char *locale,
+                                    UErrorCode *pErrorCode);
+
 /*
  * Converter object for converting between ICU's UChar strings and C strings
  * in database encoding.  Since the database encoding doesn't change, we only
@@ -83,8 +94,19 @@ static size_t uchar_length(UConverter *converter,
 static int32_t uchar_convert(UConverter *converter,
                             UChar *dest, int32_t destlen,
                             const char *src, int32_t srclen);
+static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
+                           size_t nbytes);
+static size_t icu_from_uchar(char *dest, size_t destsize,
+                            const UChar *buff_uchar, int32_t len_uchar);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc,
                                         UErrorCode *status);
+static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
+                               UChar **buff_dest, UChar *buff_source,
+                               int32_t len_source);
+static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
+                                      const UChar *src, int32_t srcLength,
+                                      const char *locale,
+                                      UErrorCode *pErrorCode);
 #endif
 
 pg_locale_t
@@ -324,6 +346,66 @@ make_icu_collator(const char *iculocstr, const char *icurules)
    }
 }
 
+size_t
+strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+            pg_locale_t locale)
+{
+   int32_t     len_uchar;
+   int32_t     len_conv;
+   UChar      *buff_uchar;
+   UChar      *buff_conv;
+   size_t      result_len;
+
+   len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+   len_conv = icu_convert_case(u_strToLower, locale,
+                               &buff_conv, buff_uchar, len_uchar);
+   result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+   pfree(buff_uchar);
+   pfree(buff_conv);
+
+   return result_len;
+}
+
+size_t
+strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+            pg_locale_t locale)
+{
+   int32_t     len_uchar;
+   int32_t     len_conv;
+   UChar      *buff_uchar;
+   UChar      *buff_conv;
+   size_t      result_len;
+
+   len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+   len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
+                               &buff_conv, buff_uchar, len_uchar);
+   result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+   pfree(buff_uchar);
+   pfree(buff_conv);
+
+   return result_len;
+}
+
+size_t
+strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+            pg_locale_t locale)
+{
+   int32_t     len_uchar;
+   int32_t     len_conv;
+   UChar      *buff_uchar;
+   UChar      *buff_conv;
+   size_t      result_len;
+
+   len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+   len_conv = icu_convert_case(u_strToUpper, locale,
+                               &buff_conv, buff_uchar, len_uchar);
+   result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+   pfree(buff_uchar);
+   pfree(buff_conv);
+
+   return result_len;
+}
+
 /*
  * strncoll_icu
  *
@@ -458,7 +540,7 @@ strnxfrm_prefix_icu(char *dest, size_t destsize,
  * The result string is nul-terminated, though most callers rely on the
  * result length instead.
  */
-int32_t
+static int32_t
 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
 {
    int32_t     len_uchar;
@@ -485,8 +567,8 @@ icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
  *
  * The result string is nul-terminated.
  */
-int32_t
-icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
+static size_t
+icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
 {
    UErrorCode  status;
    int32_t     len_result;
@@ -501,10 +583,11 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
                (errmsg("%s failed: %s", "ucnv_fromUChars",
                        u_errorName(status))));
 
-   *result = palloc(len_result + 1);
+   if (len_result + 1 > destsize)
+       return len_result;
 
    status = U_ZERO_ERROR;
-   len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
+   len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
                                 buff_uchar, len_uchar, &status);
    if (U_FAILURE(status) ||
        status == U_STRING_NOT_TERMINATED_WARNING)
@@ -515,6 +598,43 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
    return len_result;
 }
 
+static int32_t
+icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
+                UChar **buff_dest, UChar *buff_source, int32_t len_source)
+{
+   UErrorCode  status;
+   int32_t     len_dest;
+
+   len_dest = len_source;      /* try first with same length */
+   *buff_dest = palloc(len_dest * sizeof(**buff_dest));
+   status = U_ZERO_ERROR;
+   len_dest = func(*buff_dest, len_dest, buff_source, len_source,
+                   mylocale->info.icu.locale, &status);
+   if (status == U_BUFFER_OVERFLOW_ERROR)
+   {
+       /* try again with adjusted length */
+       pfree(*buff_dest);
+       *buff_dest = palloc(len_dest * sizeof(**buff_dest));
+       status = U_ZERO_ERROR;
+       len_dest = func(*buff_dest, len_dest, buff_source, len_source,
+                       mylocale->info.icu.locale, &status);
+   }
+   if (U_FAILURE(status))
+       ereport(ERROR,
+               (errmsg("case conversion failed: %s", u_errorName(status))));
+   return len_dest;
+}
+
+static int32_t
+u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
+                       const UChar *src, int32_t srcLength,
+                       const char *locale,
+                       UErrorCode *pErrorCode)
+{
+   return u_strToTitle(dest, destCapacity, src, srcLength,
+                       NULL, locale, pErrorCode);
+}
+
 /*
  * strncoll_icu_no_utf8
  *
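diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 374ac37ba0aea997c1b11515398a3b7c175856a5..97ca5a28e66b0a382d087d5d4b76bb1722654f82 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c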
@@ -11,6 +11,9 @@
 
 #include "postgres.h"
 
+#include <limits.h>
+#include <wctype.h>
+
 #include "access/htup_details.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_collation.h"
 
 extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
 
+/*
+ * Case conversion entry points for the libc provider.  Each one dispatches
+ * to a single-byte or multibyte implementation depending on the database
+ * encoding.
+ */
+extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+extern size_t strupper_libc(char *dst, size_t dstsize, const char *src,
+                           ssize_t srclen, pg_locale_t locale);
+
 extern int strncoll_libc(const char *arg1, ssize_t len1,
                          const char *arg2, ssize_t len2,
                          pg_locale_t locale);
@@ -48,6 +58,323 @@ static int  strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
                                     pg_locale_t locale);
 #endif
 
+/*
+ * Per-encoding implementations of the libc case converters: the _sb
+ * variants handle single-byte encodings, the _mb variants handle multibyte
+ * encodings by going through wchar_t.
+ */
+static size_t strlower_libc_sb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+static size_t strlower_libc_mb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+static size_t strtitle_libc_sb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+static size_t strtitle_libc_mb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+static size_t strupper_libc_sb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+static size_t strupper_libc_mb(char *dest, size_t destsize,
+                              const char *src, ssize_t srclen,
+                              pg_locale_t locale);
+
+/*
+ * strlower_libc
+ *
+ * Lowercase src into dst with the libc provider.  Databases whose encoding
+ * has a maximum character length above one byte use the wide-character
+ * (_mb) implementation; all others use the byte-at-a-time (_sb) one.  The
+ * arguments and the result are passed through unchanged.
+ */
+size_t
+strlower_libc(char *dst, size_t dstsize, const char *src,
+             ssize_t srclen, pg_locale_t locale)
+{
+   /* single-byte encoding: convert byte by byte */
+   if (pg_database_encoding_max_length() <= 1)
+       return strlower_libc_sb(dst, dstsize, src, srclen, locale);
+
+   return strlower_libc_mb(dst, dstsize, src, srclen, locale);
+}
+
+/*
+ * strtitle_libc
+ *
+ * Titlecase src into dst with the libc provider.  Databases whose encoding
+ * has a maximum character length above one byte use the wide-character
+ * (_mb) implementation; all others use the byte-at-a-time (_sb) one.  The
+ * arguments and the result are passed through unchanged.
+ */
+size_t
+strtitle_libc(char *dst, size_t dstsize, const char *src,
+             ssize_t srclen, pg_locale_t locale)
+{
+   /* single-byte encoding: convert byte by byte */
+   if (pg_database_encoding_max_length() <= 1)
+       return strtitle_libc_sb(dst, dstsize, src, srclen, locale);
+
+   return strtitle_libc_mb(dst, dstsize, src, srclen, locale);
+}
+
+/*
+ * strupper_libc
+ *
+ * Uppercase src into dst with the libc provider.  Databases whose encoding
+ * has a maximum character length above one byte use the wide-character
+ * (_mb) implementation; all others use the byte-at-a-time (_sb) one.  The
+ * arguments and the result are passed through unchanged.
+ */
+size_t
+strupper_libc(char *dst, size_t dstsize, const char *src,
+             ssize_t srclen, pg_locale_t locale)
+{
+   /* single-byte encoding: convert byte by byte */
+   if (pg_database_encoding_max_length() <= 1)
+       return strupper_libc_sb(dst, dstsize, src, srclen, locale);
+
+   return strupper_libc_mb(dst, dstsize, src, srclen, locale);
+}
+
+/*
+ * strlower_libc_sb
+ *
+ * Lowercase a string in a single-byte encoding using libc.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * written to dest, with a terminating NUL, only if srclen + 1 bytes fit in
+ * destsize; otherwise dest is left untouched.  In both cases the return
+ * value is srclen (the result length, not counting the NUL), so a result
+ * >= destsize tells the caller the buffer was too small.
+ */
+static size_t
+strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   if (srclen + 1 <= destsize)
+   {
+       locale_t    loc = locale->info.lt;
+       char       *p;
+
+       memcpy(dest, src, srclen);
+       dest[srclen] = '\0';
+
+       /*
+        * Note: we assume that tolower_l() will not be so broken as to need
+        * an isupper_l() guard test.  When using the default collation, we
+        * apply the traditional Postgres behavior that forces ASCII-style
+        * treatment of I/i, but in non-default collations you get exactly
+        * what the collation says.
+        */
+       for (p = dest; *p; p++)
+       {
+           if (locale->is_default)
+               *p = pg_tolower((unsigned char) *p);
+           else
+               *p = tolower_l((unsigned char) *p, loc);
+       }
+   }
+
+   return srclen;
+}
+
+/*
+ * strlower_libc_mb
+ *
+ * Lowercase a string in a multibyte encoding using libc: convert to
+ * wchar_t, apply towlower_l() to each wide character, and convert back.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * copied to dest, with a terminating NUL, only if it fits in destsize
+ * bytes; otherwise dest is left untouched.  The return value is always the
+ * converted length in bytes, not counting the NUL.
+ */
+static size_t
+strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   locale_t    loc = locale->info.lt;
+   size_t      result_size;
+   wchar_t    *workspace;
+   char       *result;
+   size_t      curr_char;
+   size_t      max_size;
+
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   /* Overflow paranoia */
+   if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+       ereport(ERROR,
+               (errcode(ERRCODE_OUT_OF_MEMORY),
+                errmsg("out of memory")));
+
+   /* Output workspace cannot have more codes than input bytes */
+   workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+   char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+   for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+       workspace[curr_char] = towlower_l(workspace[curr_char], loc);
+
+   /*
+    * Make result large enough; case change might change number of bytes
+    */
+   max_size = curr_char * pg_database_encoding_max_length();
+   result = palloc(max_size + 1);
+
+   result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+   /*
+    * Copy out only if the caller's buffer can hold the result plus NUL.
+    * NOTE(review): this assumes wchar2char() did not return an error
+    * indicator such as (size_t) -1 -- confirm against its definition.
+    */
+   if (result_size + 1 <= destsize)
+   {
+       memcpy(dest, result, result_size);
+       dest[result_size] = '\0';
+   }
+
+   /* Free the temporaries on every path, including the too-small case */
+   pfree(workspace);
+   pfree(result);
+
+   return result_size;
+}
+
+/*
+ * strtitle_libc_sb
+ *
+ * Titlecase a string in a single-byte encoding using libc: a byte that
+ * follows an alphanumeric byte is lowercased, any other byte is uppercased.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * written to dest, with a terminating NUL, only if srclen + 1 bytes fit in
+ * destsize; otherwise dest is left untouched.  The return value is always
+ * srclen.
+ */
+static size_t
+strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   locale_t    loc;
+   int         prev_alnum = false;
+   char       *cp;
+
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   /* No room for the result plus NUL: just report the length */
+   if (srclen + 1 > destsize)
+       return srclen;
+
+   loc = locale->info.lt;
+
+   memcpy(dest, src, srclen);
+   dest[srclen] = '\0';
+
+   /*
+    * toupper_l()/tolower_l() are trusted not to need guard tests.  The
+    * default collation keeps the traditional Postgres ASCII-style handling
+    * of I/i via pg_toupper()/pg_tolower(); any other collation gets exactly
+    * what the locale says.
+    */
+   for (cp = dest; *cp != '\0'; cp++)
+   {
+       unsigned char ch = (unsigned char) *cp;
+
+       if (locale->is_default)
+           *cp = prev_alnum ? pg_tolower(ch) : pg_toupper(ch);
+       else
+           *cp = prev_alnum ? tolower_l(ch, loc) : toupper_l(ch, loc);
+
+       /* classify the converted byte to decide the next one's case */
+       prev_alnum = isalnum_l((unsigned char) *cp, loc);
+   }
+
+   return srclen;
+}
+
+/*
+ * strtitle_libc_mb
+ *
+ * Titlecase a string in a multibyte encoding using libc: convert to
+ * wchar_t, lowercase each wide character that follows an alphanumeric one
+ * and uppercase the others, then convert back.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * copied to dest, with a terminating NUL, only if it fits in destsize
+ * bytes; otherwise dest is left untouched.  The return value is always the
+ * converted length in bytes, not counting the NUL.
+ */
+static size_t
+strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   locale_t    loc = locale->info.lt;
+   int         wasalnum = false;
+   size_t      result_size;
+   wchar_t    *workspace;
+   char       *result;
+   size_t      curr_char;
+   size_t      max_size;
+
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   /* Overflow paranoia */
+   if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+       ereport(ERROR,
+               (errcode(ERRCODE_OUT_OF_MEMORY),
+                errmsg("out of memory")));
+
+   /* Output workspace cannot have more codes than input bytes */
+   workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+   char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+   for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+   {
+       if (wasalnum)
+           workspace[curr_char] = towlower_l(workspace[curr_char], loc);
+       else
+           workspace[curr_char] = towupper_l(workspace[curr_char], loc);
+       /* classify the converted character to decide the next one's case */
+       wasalnum = iswalnum_l(workspace[curr_char], loc);
+   }
+
+   /*
+    * Make result large enough; case change might change number of bytes
+    */
+   max_size = curr_char * pg_database_encoding_max_length();
+   result = palloc(max_size + 1);
+
+   result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+   /*
+    * Copy out only if the caller's buffer can hold the result plus NUL.
+    * NOTE(review): this assumes wchar2char() did not return an error
+    * indicator such as (size_t) -1 -- confirm against its definition.
+    */
+   if (result_size + 1 <= destsize)
+   {
+       memcpy(dest, result, result_size);
+       dest[result_size] = '\0';
+   }
+
+   /* Free the temporaries on every path, including the too-small case */
+   pfree(workspace);
+   pfree(result);
+
+   return result_size;
+}
+
+/*
+ * strupper_libc_sb
+ *
+ * Uppercase a string in a single-byte encoding using libc.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * written to dest, with a terminating NUL, only if srclen + 1 bytes fit in
+ * destsize; otherwise dest is left untouched.  The return value is always
+ * srclen.
+ */
+static size_t
+strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   locale_t    loc;
+   char       *cp;
+
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   /* No room for the result plus NUL: just report the length */
+   if (srclen + 1 > destsize)
+       return srclen;
+
+   loc = locale->info.lt;
+
+   memcpy(dest, src, srclen);
+   dest[srclen] = '\0';
+
+   /*
+    * toupper_l() is trusted not to need an islower_l() guard test.  The
+    * default collation keeps the traditional Postgres ASCII-style handling
+    * of I/i via pg_toupper(); any other collation gets exactly what the
+    * locale says.
+    */
+   for (cp = dest; *cp != '\0'; cp++)
+   {
+       unsigned char ch = (unsigned char) *cp;
+
+       *cp = locale->is_default ? pg_toupper(ch) : toupper_l(ch, loc);
+   }
+
+   return srclen;
+}
+
+/*
+ * strupper_libc_mb
+ *
+ * Uppercase a string in a multibyte encoding using libc: convert to
+ * wchar_t, apply towupper_l() to each wide character, and convert back.
+ *
+ * A negative srclen means src is NUL-terminated.  The converted string is
+ * copied to dest, with a terminating NUL, only if it fits in destsize
+ * bytes; otherwise dest is left untouched.  The return value is always the
+ * converted length in bytes, not counting the NUL.
+ */
+static size_t
+strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+                pg_locale_t locale)
+{
+   locale_t    loc = locale->info.lt;
+   size_t      result_size;
+   wchar_t    *workspace;
+   char       *result;
+   size_t      curr_char;
+   size_t      max_size;
+
+   if (srclen < 0)
+       srclen = strlen(src);
+
+   /* Overflow paranoia */
+   if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+       ereport(ERROR,
+               (errcode(ERRCODE_OUT_OF_MEMORY),
+                errmsg("out of memory")));
+
+   /* Output workspace cannot have more codes than input bytes */
+   workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+   char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+   for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+       workspace[curr_char] = towupper_l(workspace[curr_char], loc);
+
+   /*
+    * Make result large enough; case change might change number of bytes
+    */
+   max_size = curr_char * pg_database_encoding_max_length();
+   result = palloc(max_size + 1);
+
+   result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+   /*
+    * Copy out only if the caller's buffer can hold the result plus NUL.
+    * NOTE(review): this assumes wchar2char() did not return an error
+    * indicator such as (size_t) -1 -- confirm against its definition.
+    */
+   if (result_size + 1 <= destsize)
+   {
+       memcpy(dest, result, result_size);
+       dest[result_size] = '\0';
+   }
+
+   /* Free the temporaries on every path, including the too-small case */
+   pfree(workspace);
+   pfree(result);
+
+   return result_size;
+}
+
 pg_locale_t
 create_pg_locale_libc(Oid collid, MemoryContext context)
 {
index 776f8f6f2fee14f9b8a909c6396e9a489c062120..861df3ddd0502fc19032f23e4620ed686460fbc0 100644 (file)
@@ -93,6 +93,15 @@ extern void init_database_collation(void);
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
 extern char *get_collation_actual_version(char collprovider, const char *collcollate);
+/*
+ * Case conversion entry points that work with any collation provider and
+ * write into a destination buffer supplied by the caller.
+ */
+extern size_t pg_strlower(char *dest, size_t destsize,
+                         const char *src, ssize_t srclen,
+                         pg_locale_t locale);
+extern size_t pg_strtitle(char *dest, size_t destsize,
+                         const char *src, ssize_t srclen,
+                         pg_locale_t locale);
+extern size_t pg_strupper(char *dest, size_t destsize,
+                         const char *src, ssize_t srclen,
+                         pg_locale_t locale);
 extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int pg_strncoll(const char *arg1, ssize_t len1,
                        const char *arg2, ssize_t len2, pg_locale_t locale);
@@ -112,11 +121,6 @@ extern const char *builtin_validate_locale(int encoding, const char *locale);
 extern void icu_validate_locale(const char *loc_str);
 extern char *icu_language_tag(const char *loc_str, int elevel);
 
-#ifdef USE_ICU
-extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
-extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
-#endif
-
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
                         pg_locale_t locale);