diff options
author | Jeff Davis | 2024-03-19 22:24:41 +0000 |
---|---|---|
committer | Jeff Davis | 2024-03-19 22:24:41 +0000 |
commit | f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch) | |
tree | 48077a7e6eb0309218b09a3be483aec37a6f204f /src/test/regress/sql/collate.utf8.sql | |
parent | fd0398fcb099980fbedbb7750356ef234408c1c9 (diff) |
Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.
The builtin provider's C.UTF-8 offers several important advantages
over libc:
* faster sorting -- benefits from additional optimizations such as
abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some
libc implementations
* available on all platforms with identical semantics, and the
semantics are stable, testable, and documentable within a given
Postgres major version
Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
Diffstat (limited to 'src/test/regress/sql/collate.utf8.sql')
-rw-r--r-- | src/test/regress/sql/collate.utf8.sql | 67 |
1 files changed, 67 insertions, 0 deletions
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
new file mode 100644
index 00000000000..1f5f9ef491d
--- /dev/null
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -0,0 +1,67 @@
+/*
+ * This test is for collations and character operations when using the
+ * builtin provider with the C.UTF-8 locale.
+ */
+
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+SET client_encoding TO UTF8;
+
+--
+-- Test PG_C_UTF8
+--
+
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C_UTF8'); -- fails
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C.UTF8');
+DROP COLLATION regress_pg_c_utf8;
+CREATE COLLATION regress_pg_c_utf8 (
+  provider = builtin, locale = 'C.UTF-8');
+
+CREATE TABLE test_pg_c_utf8 (
+  t TEXT COLLATE PG_C_UTF8
+);
+INSERT INTO test_pg_c_utf8 VALUES
+  ('abc DEF 123abc'),
+  ('ábc sßs ßss DÉF'),
+  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  ('ȺȺȺ'),
+  ('ⱥⱥⱥ'),
+  ('ⱥȺ');
+
+SELECT
+    t, lower(t), initcap(t), upper(t),
+    length(convert_to(t, 'UTF8')) AS t_bytes,
+    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+  FROM test_pg_c_utf8;
+
+DROP TABLE test_pg_c_utf8;
+
+-- negative test: Final_Sigma not used for builtin locale C.UTF-8
+SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
+SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
+SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
+
+-- properties
+
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
+SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
+SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
+
+-- case mapping
+
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed