Support C.UTF-8 locale in the new builtin collation provider.

The builtin C.UTF-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages over libc:

* faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some libc implementations
* available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.UTF-8" locale, as well as many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
author: Jeff Davis 2024-03-19 22:24:41 +0000
committer: Jeff Davis 2024-03-19 22:24:41 +0000
commit: f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch)
tree: 48077a7e6eb0309218b09a3be483aec37a6f204f /src/test/regress/sql/collate.utf8.sql
parent: fd0398fcb099980fbedbb7750356ef234408c1c9 (diff)
1 files changed, 67 insertions, 0 deletions
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
new file mode 100644
index 00000000000..1f5f9ef491d
--- /dev/null
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -0,0 +1,67 @@
+/*
+ * Tests collation creation, case mapping, and regular-expression character
+ * operations when using the builtin provider with the C.UTF-8 locale.
+ */
+
+/* skip test if not UTF8 server encoding: the script quits before running anything else */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+SET client_encoding TO UTF8; -- the string literals below contain non-ASCII characters
+
+--
+-- Test PG_C_UTF8: first check which locale spellings the builtin provider accepts
+--
+
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C_UTF8'); -- fails: the underscore spelling is rejected
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C.UTF8'); -- spelling without the hyphen
+DROP COLLATION regress_pg_c_utf8; -- free the name so it can be created again below
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C.UTF-8'); -- hyphenated spelling
+
+CREATE TABLE test_pg_c_utf8 (
+  t TEXT COLLATE PG_C_UTF8 -- presumably the predefined collation for the builtin C.UTF-8 locale
+);
+INSERT INTO test_pg_c_utf8 VALUES
+  ('abc DEF 123abc'), -- ASCII letters and digits
+  ('ábc sßs ßss DÉF'), -- accented letters and ß
+  ('ǄxxǄ ǆxxǅ ǅxxǆ'), -- digraph letters in their upper-, lower-, and titlecase forms
+  ('ȺȺȺ'), -- uppercase letters that take 2 bytes each in UTF-8
+  ('ⱥⱥⱥ'), -- their lowercase counterparts, which take 3 bytes each in UTF-8
+  ('ⱥȺ'); -- both cases mixed
+
+SELECT
+    t, lower(t), initcap(t), upper(t), -- original text and its three case conversions
+    length(convert_to(t, 'UTF8')) AS t_bytes, -- byte lengths: a case conversion can change the UTF-8 encoded size
+    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+  FROM test_pg_c_utf8; -- NOTE(review): no ORDER BY, so row order is whatever the scan returns
+
+DROP TABLE test_pg_c_utf8; -- the remaining tests use string literals only
+
+-- negative test: Final_Sigma not used for builtin locale C.UTF-8 (presumably Σ lowercases to σ, never final ς -- TODO confirm against expected output)
+SELECT lower('ΑΣ' COLLATE PG_C_UTF8); -- Σ is the last character of the string
+SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); -- each letter is followed by 'ͺ'
+SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); -- each letter is followed by '΄'
+
+-- properties: regex character classes; ~ and !~ are chosen so that each expression should be true
+
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; -- letters count as alphanumeric
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; -- the string has no uppercase letter
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; -- '@' is not alphanumeric
+SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; -- the string contains the ASCII digit 8
+SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- '൧' is a non-ASCII digit character; only 0-9 considered digits in posix
+
+-- case mapping: case-insensitive regex operators (~* and !~*); each expression should be true
+
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; -- same letters, opposite case in every position
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; -- lowercase 'x' against an uppercase range
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; -- no letter of the string is c or d in either case
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; -- uppercase Greek letter against a lowercase range
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
author	Jeff Davis	2024-03-19 22:24:41 +0000
committer	Jeff Davis	2024-03-19 22:24:41 +0000
commit	f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch)
tree	48077a7e6eb0309218b09a3be483aec37a6f204f /src/test/regress/sql/collate.utf8.sql
parent	fd0398fcb099980fbedbb7750356ef234408c1c9 (diff)