From d3d0983169130a9b81e3fe48d5c2ca4931480956 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Fri, 17 Jan 2025 15:56:30 -0800 Subject: [PATCH] Support PG_UNICODE_FAST locale in the builtin collation provider. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single code point to multiple code points, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "Dž" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. 
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite --- doc/src/sgml/charset.sgml | 29 +++- doc/src/sgml/ref/create_collation.sgml | 3 +- doc/src/sgml/ref/create_database.sgml | 6 +- doc/src/sgml/ref/initdb.sgml | 4 +- src/backend/regex/regc_pg_locale.c | 6 +- src/backend/utils/adt/pg_locale.c | 7 +- src/backend/utils/adt/pg_locale_builtin.c | 12 +- src/bin/initdb/initdb.c | 6 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_collation.dat | 3 + src/include/utils/pg_locale.h | 1 + src/test/regress/expected/collate.utf8.out | 160 +++++++++++++++++++++ src/test/regress/sql/collate.utf8.sql | 60 ++++++++ 13 files changed, 283 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 6c633678790..99f01990004 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -377,8 +377,9 @@ initdb --locale-provider=icu --icu-locale=en The builtin provider uses built-in operations. Only - the C and C.UTF-8 locales are - supported for this provider. + the C, C.UTF-8, and + PG_UNICODE_FAST locales are supported for this + provider. The C locale behavior is identical to the @@ -392,6 +393,13 @@ initdb --locale-provider=icu --icu-locale=en regular expression character classes are based on the "POSIX Compatible" semantics, and the case mapping is the "simple" variant. + + The PG_UNICODE_FAST locale is available only when + the database encoding is UTF-8, and the behavior is + based on Unicode. The collation uses the code point values only. The + regular expression character classes are based on the "Standard" + semantics, and the case mapping is the "full" variant. 
+ @@ -886,6 +894,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; + + pg_unicode_fast + + + This collation sorts by Unicode code point values rather than natural + language order. For the functions lower, + initcap, and upper it uses + Unicode full case mapping. For pattern matching (including regular + expressions), it uses the Standard variant of Unicode Compatibility + Properties. Behavior is efficient and stable within a + Postgres major version. It is only + available for encoding UTF8. + + + + pg_c_utf8 diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index e34bfc97c3d..4af1836ae30 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -99,7 +99,8 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM If provider is builtin, then locale must be specified and set to - either C or C.UTF-8. + either C, C.UTF-8 or + PG_UNICODE_FAST. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 7653cb902ee..a4b052ba08b 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -168,7 +168,8 @@ CREATE DATABASE name If is builtin, then locale or builtin_locale must be specified and set to - either C or C.UTF-8. + either C, C.UTF-8, or + PG_UNICODE_FAST. @@ -233,7 +234,8 @@ CREATE DATABASE name The locales available for the builtin provider are - C and C.UTF-8. + C, C.UTF-8 and + PG_UNICODE_FAST. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 0c32114cf70..0026318485a 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -295,8 +295,8 @@ PostgreSQL documentation If is builtin, or must be - specified and set to C or - C.UTF-8. + specified and set to C, C.UTF-8 + or PG_UNICODE_FAST. 
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 2360d08efae..ed7411df83d 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -307,7 +307,7 @@ pg_wc_isdigit(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, true); + return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); @@ -361,7 +361,7 @@ pg_wc_isalnum(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, true); + return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); @@ -505,7 +505,7 @@ pg_wc_ispunct(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, true); + return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 875cca6efc8..94444acd2c5 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale) { if (strcmp(locale, "C") == 0) return -1; - if (strcmp(locale, "C.UTF-8") == 0) + else if (strcmp(locale, "C.UTF-8") == 0) return PG_UTF8; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + return PG_UTF8; + ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale) 
canonical_name = "C"; else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) canonical_name = "C.UTF-8"; + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + canonical_name = "PG_UNICODE_FAST"; if (!canonical_name) ereport(ERROR, diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index fef5b6e6d38..436e32c0ca0 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -78,7 +78,8 @@ size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - return unicode_strlower(dest, destsize, src, srclen, false); + return unicode_strlower(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); } size_t @@ -93,7 +94,8 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .prev_alnum = false, }; - return unicode_strtitle(dest, destsize, src, srclen, false, + return unicode_strtitle(dest, destsize, src, srclen, + locale->info.builtin.casemap_full, initcap_wbnext, &wbstate); } @@ -101,7 +103,8 @@ size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - return unicode_strupper(dest, destsize, src, srclen, false); + return unicode_strupper(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); } pg_locale_t @@ -142,6 +145,7 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.builtin.locale = MemoryContextStrdup(context, locstr); + result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; @@ -164,6 +168,8 @@ get_collation_actual_version_builtin(const char *collcollate) return "1"; else if (strcmp(collcollate, "C.UTF-8") == 0) return "1"; + else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0) + return "1"; 
else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ea4b66b3bf5..759672a9b97 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2489,6 +2489,8 @@ setlocales(void) else if (strcmp(datlocale, "C.UTF-8") == 0 || strcmp(datlocale, "C.UTF8") == 0) canonname = "C.UTF-8"; + else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0) + canonname = "PG_UNICODE_FAST"; else pg_fatal("invalid locale name \"%s\" for builtin provider", datlocale); @@ -2782,7 +2784,9 @@ setup_locale_encoding(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8) + if ((strcmp(datlocale, "C.UTF-8") == 0 || + strcmp(datlocale, "PG_UNICODE_FAST") == 0) && + encodingid != PG_UTF8) pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", datlocale, "UTF-8"); } diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 54856ab214d..28de0c83342 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202501162 +#define CATALOG_VERSION_NO 202501171 #endif diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 5fa2d33e94b..fb76c421931 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -33,5 +33,8 @@ descr => 'sorts by Unicode code point; Unicode and POSIX character semantics', collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', colllocale => 'C.UTF-8', collversion => '1' }, +{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics', + collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6', + colllocale => 'PG_UNICODE_FAST', collversion => '1' }, ] diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index ec42ca3da4c..2bc3a7df2d9 100644 --- 
a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -108,6 +108,7 @@ struct pg_locale_struct struct { const char *locale; + bool casemap_full; } builtin; locale_t lt; #ifdef USE_ICU diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 4558d2521a2..8b7176a2756 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -160,3 +160,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed t (1 row) +-- +-- Test PG_UNICODE_FAST +-- +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +ERROR: invalid locale name "unicode" for builtin provider +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP 
TABLE test_pg_unicode_fast; +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 + lower +------- + ας +(1 row) + +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 + lower +------- + ας0 +(1 row) + +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 + lower +------- + ἀς̓ +(1 row) + +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + lower +------- + ᾳςͅ +(1 row) + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 + lower +------- + σ +(1 row) + +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 + lower +------- + 0σ +(1 row) + +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 + lower +------- + ασα +(1 row) + +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 + lower +------- + ἀσ̓α +(1 row) + +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 + lower +------- + ᾳσͅα +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; + ?column? 
+---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 87fe06ddf1b..46e9c5232ad 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -80,3 +80,63 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + +-- +-- Test PG_UNICODE_FAST +-- + +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); + +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + +DROP TABLE test_pg_unicode_fast; + +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 
0391 + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed -- 2.30.2