diff options
author | Jeff Davis | 2025-01-24 22:56:22 +0000 |
---|---|---|
committer | Jeff Davis | 2025-01-24 22:56:22 +0000 |
commit | bfc5992069cf00b189af83d96a83ae5ebb65e938 (patch) | |
tree | 94332f38e12deb4a6dcfdc011c42848069190ec5 | |
parent | f15538cd27d4eeb7d665263a3d7b5700362d7eb0 (diff) |
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case
problems with using LOWER() for caseless matching.
For collations that support it, CASEFOLD() handles characters with
more than two case variations or multi-character case variations. Some
characters may fold to uppercase. The results of case folding are also
more stable across Unicode versions than LOWER() or UPPER().
Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Reviewed-by: Ian Lawrence Barwick
-rw-r--r-- | doc/src/sgml/func.sgml | 46 | ||||
-rw-r--r-- | src/backend/utils/adt/formatting.c | 69 | ||||
-rw-r--r-- | src/backend/utils/adt/oracle_compat.c | 16 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 24 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 10 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale_icu.c | 58 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.dat | 3 | ||||
-rw-r--r-- | src/include/utils/formatting.h | 1 | ||||
-rw-r--r-- | src/include/utils/pg_locale.h | 3 | ||||
-rw-r--r-- | src/test/regress/expected/collate.icu.utf8.out | 24 | ||||
-rw-r--r-- | src/test/regress/expected/collate.utf8.out | 14 | ||||
-rw-r--r-- | src/test/regress/sql/collate.icu.utf8.sql | 5 | ||||
-rw-r--r-- | src/test/regress/sql/collate.utf8.sql | 6 |
14 files changed, 278 insertions, 3 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 5678e7621a5..d2b0b059e65 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in <row> <entry role="func_table_entry"><para role="func_signature"> - <indexterm> + <indexterm id="function-lower"> <primary>lower</primary> </indexterm> <function>lower</function> ( <type>text</type> ) @@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in <row> <entry role="func_table_entry"><para role="func_signature"> - <indexterm> + <indexterm id="function-normalize"> <primary>normalize</primary> </indexterm> <indexterm> @@ -3112,6 +3112,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm> + <primary>casefold</primary> + </indexterm> + <function>casefold</function> ( <type>text</type> ) + <returnvalue>text</returnvalue> + </para> + <para> + Performs case folding of the input string according to the collation. + Case folding is similar to case conversion, but the purpose of case + folding is to facilitate case-insensitive comparison of strings, + whereas the purpose of case conversion is to convert to a particular + cased form. This function can only be used when the server encoding + is <literal>UTF8</literal>. + </para> + <para> + Ordinarily, case folding simply converts to lowercase, but there are a + few notable exceptions depending on the collation. For instance, the + character <literal>Σ</literal> (U+03A3) has two lowercase forms: + <literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case + folding in the <literal>PG_C_UTF8</literal> collation maps all three + forms to <literal>σ</literal>. Additionally, the result is not + necessarily lowercase; some characters may be folded to uppercase. 
+ </para> + <para> + Case folding may change the length of the string. For instance, in + the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal> + (U+00DF) folds to <literal>ss</literal>. + </para> + <para> + <function>casefold</function> can be used for Unicode Default Caseless + Matching. It does not always preserve the normalized form of the + input string (see <xref linkend="function-normalize"/>). + </para> + <para> + The <literal>libc</literal> provider doesn't support case folding, so + <function>casefold</function> is identical to <xref + linkend="function-lower"/>. + </para></entry> + </row> + + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> <primary>left</primary> </indexterm> <function>left</function> ( <parameter>string</parameter> <type>text</type>, diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7c4c4aa07d5..2720d3902ab 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1820,6 +1820,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } /* + * collation-aware, wide-character-aware case folding + * + * We pass the number of bytes so we can pass varlena and char* + * to this function. The result is a palloc'd, null-terminated string. + */ +char * +str_casefold(const char *buff, size_t nbytes, Oid collid) +{ + char *result; + pg_locale_t mylocale; + + if (!buff) + return NULL; + + if (!OidIsValid(collid)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. 
+ */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for %s function", + "lower()"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode case folding can only be performed if server encoding is UTF8"))); + + mylocale = pg_newlocale_from_collation(collid); + + /* C/POSIX collations use this path regardless of database encoding */ + if (mylocale->ctype_is_c) + { + result = asc_tolower(buff, nbytes); + } + else + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + + return result; +} + +/* * ASCII-only lower function * * We pass the number of bytes so we can pass varlena and char* diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 2cba7cd1621..a24a2d208fb 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } +Datum +casefold(PG_FUNCTION_ARGS) +{ + text *in_string = PG_GETARG_TEXT_PP(0); + char *out_string; + text *result; + + out_string = str_casefold(VARDATA_ANY(in_string), + VARSIZE_ANY_EXHDR(in_string), + PG_GET_COLLATION()); + result = cstring_to_text(out_string); + pfree(out_string); + + PG_RETURN_TEXT_P(result); +} + /******************************************************************** * diff --git 
a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 94444acd2c5..7d92f580a57 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, return 0; /* keep compiler quiet */ } +size_t +pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == COLLPROVIDER_BUILTIN) + return strfold_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strfold_icu(dst, dstsize, src, srclen, locale); +#endif + /* for libc, just use strlower */ + else if (locale->provider == COLLPROVIDER_LIBC) + return strlower_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + /* * pg_strcoll * diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 
436e32c0ca0..33ad20bbf07 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); struct WordBoundaryState @@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +size_t +strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + return unicode_strfold(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); +} + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 5185b0f7289..b0c73f2e43d 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU @@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); +static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, @@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const 
char *src, ssize_t srclen, return result_len; } +size_t +strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strFoldCase_default, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + /* * strncoll_icu_utf8 * @@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, NULL, locale, pErrorCode); } +static int32_t +u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode) +{ + uint32 options = U_FOLD_CASE_DEFAULT; + char lang[3]; + UErrorCode status; + + /* + * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case + * folding does not accept a locale. Instead it just supports a single + * option relevant to Turkic languages 'az' and 'tr'; check for those + * languages to enable the option. + */ + status = U_ZERO_ERROR; + uloc_getLanguage(locale, lang, 3, &status); + if (U_SUCCESS(status)) + { + /* + * The option name is confusing, but it causes u_strFoldCase to use + * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT. 
+ */ + if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0) + options = U_FOLD_CASE_EXCLUDE_SPECIAL_I; + } + + return u_strFoldCase(dest, destCapacity, src, srcLength, + options, pErrorCode); +} + /* * strncoll_icu * diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 586b83f2f4d..e3a308024de 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202501231 +#define CATALOG_VERSION_NO 202501232 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 18560755d26..2aafdbc3e93 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3623,6 +3623,9 @@ { oid => '872', descr => 'capitalize each word', proname => 'initcap', prorettype => 'text', proargtypes => 'text', prosrc => 'initcap' }, +{ oid => '9569', descr => 'fold case', + proname => 'casefold', prorettype => 'text', proargtypes => 'text', + prosrc => 'casefold' }, { oid => '873', descr => 'left-pad string to length', proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text', prosrc => 'lpad' }, diff --git a/src/include/utils/formatting.h b/src/include/utils/formatting.h index 5fa49539aaa..835307dac09 100644 --- a/src/include/utils/formatting.h +++ b/src/include/utils/formatting.h @@ -21,6 +21,7 @@ extern char *str_tolower(const char *buff, size_t nbytes, Oid collid); extern char *str_toupper(const char *buff, size_t nbytes, Oid collid); extern char *str_initcap(const char *buff, size_t nbytes, Oid collid); +extern char *str_casefold(const char *buff, size_t nbytes, Oid collid); extern char *asc_tolower(const char *buff, size_t nbytes); extern char *asc_toupper(const char *buff, size_t nbytes); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 2bc3a7df2d9..0d5f0513ceb 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -134,6 +134,9 @@ extern 
size_t pg_strtitle(char *dest, size_t destsize, extern size_t pg_strupper(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t pg_strfold(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); extern int pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index d4f327636fd..910de9120f2 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a; 1 | hij | hij (2 rows) +SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); + lower +------------------------------- + abcd 123 #$% ıiii̇ ß ß dždždž σσς +(1 row) + +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); + casefold +--------------------------------- + abcd 123 #$% ıiii̇ ss ss dždždž σσσ +(1 row) + +SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); + lower +------------------------------- + abcd 123 #$% ıiıi ß ß dždždž σσς +(1 row) + +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); + casefold +--------------------------------- + abcd 123 #$% ıiıi ss ss dždždž σσσ +(1 row) + -- LIKE/ILIKE SELECT * FROM collate_test1 WHERE b LIKE 'abc'; a | b diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 8b7176a2756..5508622b16d 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -160,6 +160,13 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed t (1 row) +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); + casefold 
+------------------------------- + abcd 123 #$% ıiiİ ß ß dždždž σσσ +(1 row) + -- -- Test PG_UNICODE_FAST -- @@ -320,3 +327,10 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re t (1 row) +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST); + casefold +--------------------------------- + abcd 123 #$% ıiii̇ ss ss dždždž σσσ +(1 row) + diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 5ee2da4e0e0..f99f186f2d6 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -116,6 +116,11 @@ SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10; SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a; +SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); +SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); + -- LIKE/ILIKE SELECT * FROM collate_test1 WHERE b LIKE 'abc'; diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 46e9c5232ad..6c7c7aec9ec 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -81,6 +81,9 @@ SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); + -- -- Test PG_UNICODE_FAST -- @@ -140,3 +143,6 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed + +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate 
PG_UNICODE_FAST); |