summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeff Davis2025-01-24 22:56:22 +0000
committerJeff Davis2025-01-24 22:56:22 +0000
commitbfc5992069cf00b189af83d96a83ae5ebb65e938 (patch)
tree94332f38e12deb4a6dcfdc011c42848069190ec5
parentf15538cd27d4eeb7d665263a3d7b5700362d7eb0 (diff)
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick
-rw-r--r--doc/src/sgml/func.sgml46
-rw-r--r--src/backend/utils/adt/formatting.c69
-rw-r--r--src/backend/utils/adt/oracle_compat.c16
-rw-r--r--src/backend/utils/adt/pg_locale.c24
-rw-r--r--src/backend/utils/adt/pg_locale_builtin.c10
-rw-r--r--src/backend/utils/adt/pg_locale_icu.c58
-rw-r--r--src/include/catalog/catversion.h2
-rw-r--r--src/include/catalog/pg_proc.dat3
-rw-r--r--src/include/utils/formatting.h1
-rw-r--r--src/include/utils/pg_locale.h3
-rw-r--r--src/test/regress/expected/collate.icu.utf8.out24
-rw-r--r--src/test/regress/expected/collate.utf8.out14
-rw-r--r--src/test/regress/sql/collate.icu.utf8.sql5
-rw-r--r--src/test/regress/sql/collate.utf8.sql6
14 files changed, 278 insertions, 3 deletions
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 5678e7621a5..d2b0b059e65 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
<row>
<entry role="func_table_entry"><para role="func_signature">
- <indexterm>
+ <indexterm id="function-lower">
<primary>lower</primary>
</indexterm>
<function>lower</function> ( <type>text</type> )
@@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
<row>
<entry role="func_table_entry"><para role="func_signature">
- <indexterm>
+ <indexterm id="function-normalize">
<primary>normalize</primary>
</indexterm>
<indexterm>
@@ -3112,6 +3112,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
+ <primary>casefold</primary>
+ </indexterm>
+ <function>casefold</function> ( <type>text</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para>
+ Performs case folding of the input string according to the collation.
+ Case folding is similar to case conversion, but the purpose of case
+ folding is to facilitate case-insensitive comparison of strings,
+ whereas the purpose of case conversion is to convert to a particular
+ cased form. This function can only be used when the server encoding
+ is <literal>UTF8</literal>.
+ </para>
+ <para>
+ Ordinarily, case folding simply converts to lowercase, but there are a
+ few notable exceptions depending on the collation. For instance, the
+ character <literal>Σ</literal> (U+03A3) has two lowercase forms:
+ <literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
+ folding in the <literal>PG_C_UTF8</literal> collation maps all three
+ forms to <literal>σ</literal>. Additionally, the result is not
+ necessarily lowercase; some characters may be folded to uppercase.
+ </para>
+ <para>
+ Case folding may change the length of the string. For instance, in
+ the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal>
+ (U+00DF) folds to <literal>ss</literal>.
+ </para>
+ <para>
+ <function>casefold</function> can be used for Unicode Default Caseless
+ Matching. It does not always preserve the normalized form of the
+ input string (see <xref linkend="function-normalize"/>).
+ </para>
+ <para>
+ The <literal>libc</literal> provider doesn't support case folding, so
+ <function>casefold</function> is identical to <xref
+ linkend="function-lower"/>.
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
<primary>left</primary>
</indexterm>
<function>left</function> ( <parameter>string</parameter> <type>text</type>,
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 7c4c4aa07d5..2720d3902ab 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1820,6 +1820,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
}
/*
+ * collation-aware, wide-character-aware case folding
+ *
+ * We pass the number of bytes so we can pass varlena and char*
+ * to this function. The result is a palloc'd, null-terminated string.
+ */
+char *
+str_casefold(const char *buff, size_t nbytes, Oid collid)
+{
+	char	   *result;
+	pg_locale_t mylocale;
+
+	/* NULL in, NULL out: there is nothing to fold */
+	if (!buff)
+		return NULL;
+
+	if (!OidIsValid(collid))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.  Name this function
+		 * (not lower()) so the user can tell which call needs a COLLATE
+		 * clause.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for %s function",
+						"casefold()"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+
+	/*
+	 * The encoding is checked before the collation is looked up, so the
+	 * restriction applies to every collation, including C/POSIX.
+	 */
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
+
+	mylocale = pg_newlocale_from_collation(collid);
+
+	if (mylocale->ctype_is_c)
+	{
+		/* C/POSIX ctype: use asc_tolower() rather than calling the provider */
+		result = asc_tolower(buff, nbytes);
+	}
+	else
+	{
+		size_t		dstsize;
+		char	   *dst;
+		size_t		needed;
+
+		/* first try buffer of equal size plus terminating NUL */
+		dstsize = nbytes + 1;
+		dst = palloc(dstsize);
+
+		/*
+		 * pg_strfold() reports the length it needs (excluding the NUL); if
+		 * that does not fit, the first attempt's output is discarded.
+		 */
+		needed = pg_strfold(dst, dstsize, buff, nbytes, mylocale);
+		if (needed + 1 > dstsize)
+		{
+			/* grow buffer if needed and retry */
+			dstsize = needed + 1;
+			dst = repalloc(dst, dstsize);
+			needed = pg_strfold(dst, dstsize, buff, nbytes, mylocale);
+			Assert(needed + 1 <= dstsize);
+		}
+
+		Assert(dst[needed] == '\0');
+		result = dst;
+	}
+
+	return result;
+}
+
+/*
* ASCII-only lower function
*
* We pass the number of bytes so we can pass varlena and char*
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index 2cba7cd1621..a24a2d208fb 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P(result);
}
+/*
+ * casefold(text) -- SQL-callable entry point.
+ *
+ * Hands the argument's payload bytes and the call's collation to
+ * str_casefold() and wraps the resulting C string in a new text value.
+ */
+Datum
+casefold(PG_FUNCTION_ARGS)
+{
+	text	   *arg = PG_GETARG_TEXT_PP(0);
+	char	   *folded;
+	text	   *folded_text;
+
+	folded = str_casefold(VARDATA_ANY(arg),
+						  VARSIZE_ANY_EXHDR(arg),
+						  PG_GET_COLLATION());
+
+	/* copy into a text datum, then release the intermediate C string */
+	folded_text = cstring_to_text(folded);
+	pfree(folded);
+
+	PG_RETURN_TEXT_P(folded_text);
+}
+
+
/********************************************************************
*
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 94444acd2c5..7d92f580a57 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen, pg_locale_t locale);
extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
@@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen, pg_locale_t locale);
extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
@@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
return 0; /* keep compiler quiet */
}
+/*
+ * pg_strfold
+ *
+ * Dispatch case folding of src (srclen bytes) into dest to the
+ * implementation matching the locale's provider, returning whatever that
+ * implementation returns.
+ */
+size_t
+pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+		   pg_locale_t locale)
+{
+	if (locale->provider == COLLPROVIDER_BUILTIN)
+		return strfold_builtin(dst, dstsize, src, srclen, locale);
+
+#ifdef USE_ICU
+	if (locale->provider == COLLPROVIDER_ICU)
+		return strfold_icu(dst, dstsize, src, srclen, locale);
+#endif
+
+	/* for libc, just use strlower */
+	if (locale->provider == COLLPROVIDER_LIBC)
+		return strlower_libc(dst, dstsize, src, srclen, locale);
+
+	/* shouldn't happen */
+	PGLOCALE_SUPPORT_ERROR(locale->provider);
+
+	return 0;					/* keep compiler quiet */
+}
+
+
/*
* pg_strcoll
*
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 436e32c0ca0..33ad20bbf07 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen, pg_locale_t locale);
struct WordBoundaryState
@@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
locale->info.builtin.casemap_full);
}
+/*
+ * Case folding for the builtin provider: forwards to unicode_strfold(),
+ * passing along the locale's casemap_full setting.
+ */
+size_t
+strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				pg_locale_t locale)
+{
+	size_t		folded_len;
+
+	folded_len = unicode_strfold(dest, destsize, src, srclen,
+								 locale->info.builtin.casemap_full);
+
+	return folded_len;
+}
+
+
pg_locale_t
create_pg_locale_builtin(Oid collid, MemoryContext context)
{
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 5185b0f7289..b0c73f2e43d 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen, pg_locale_t locale);
#ifdef USE_ICU
@@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode);
+static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ const char *locale,
+ UErrorCode *pErrorCode);
static const struct collate_methods collate_methods_icu = {
.strncoll = strncoll_icu,
@@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
return result_len;
}
+/*
+ * Case folding for the ICU provider.
+ *
+ * The input goes through icu_to_uchar(), is folded by icu_convert_case()
+ * using u_strFoldCase_default, and comes back through icu_from_uchar()
+ * into dest.  Returns the value reported by icu_from_uchar().
+ */
+size_t
+strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			pg_locale_t locale)
+{
+	UChar	   *usrc;
+	UChar	   *ufolded;
+	int32_t		usrc_len;
+	int32_t		ufolded_len;
+	size_t		nwritten;
+
+	usrc_len = icu_to_uchar(&usrc, src, srclen);
+	ufolded_len = icu_convert_case(u_strFoldCase_default, locale,
+								   &ufolded, usrc, usrc_len);
+	nwritten = icu_from_uchar(dest, destsize, ufolded, ufolded_len);
+
+	/* both intermediate UChar buffers are released here */
+	pfree(usrc);
+	pfree(ufolded);
+
+	return nwritten;
+}
+
+
/*
* strncoll_icu_utf8
*
@@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
NULL, locale, pErrorCode);
}
+/*
+ * Wrapper giving u_strFoldCase() the same signature as the locale-taking
+ * case-conversion callbacks, so it can be passed to icu_convert_case().
+ */
+static int32_t
+u_strFoldCase_default(UChar *dest, int32_t destCapacity,
+					  const UChar *src, int32_t srcLength,
+					  const char *locale,
+					  UErrorCode *pErrorCode)
+{
+	UErrorCode	lang_status = U_ZERO_ERROR;
+	char		language[3];
+	uint32		fold_options;
+
+	/*
+	 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
+	 * folding does not accept a locale. Instead it just supports a single
+	 * option relevant to Turkic languages 'az' and 'tr'; check for those
+	 * languages to enable the option.
+	 *
+	 * The buffer holds a two-letter code plus NUL.  The comparisons below are
+	 * against two-character literals, so strcmp() never looks past
+	 * language[2].
+	 */
+	uloc_getLanguage(locale, language, sizeof(language), &lang_status);
+
+	/*
+	 * The option name is confusing, but it causes u_strFoldCase to use the
+	 * 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
+	 */
+	if (U_SUCCESS(lang_status) &&
+		(strcmp(language, "tr") == 0 || strcmp(language, "az") == 0))
+		fold_options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
+	else
+		fold_options = U_FOLD_CASE_DEFAULT;
+
+	return u_strFoldCase(dest, destCapacity, src, srcLength,
+						 fold_options, pErrorCode);
+}
+
+
/*
* strncoll_icu
*
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 586b83f2f4d..e3a308024de 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202501231
+#define CATALOG_VERSION_NO 202501232
#endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 18560755d26..2aafdbc3e93 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3623,6 +3623,9 @@
{ oid => '872', descr => 'capitalize each word',
proname => 'initcap', prorettype => 'text', proargtypes => 'text',
prosrc => 'initcap' },
+{ oid => '9569', descr => 'fold case',
+ proname => 'casefold', prorettype => 'text', proargtypes => 'text',
+ prosrc => 'casefold' },
{ oid => '873', descr => 'left-pad string to length',
proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
prosrc => 'lpad' },
diff --git a/src/include/utils/formatting.h b/src/include/utils/formatting.h
index 5fa49539aaa..835307dac09 100644
--- a/src/include/utils/formatting.h
+++ b/src/include/utils/formatting.h
@@ -21,6 +21,7 @@
extern char *str_tolower(const char *buff, size_t nbytes, Oid collid);
extern char *str_toupper(const char *buff, size_t nbytes, Oid collid);
extern char *str_initcap(const char *buff, size_t nbytes, Oid collid);
+extern char *str_casefold(const char *buff, size_t nbytes, Oid collid);
extern char *asc_tolower(const char *buff, size_t nbytes);
extern char *asc_toupper(const char *buff, size_t nbytes);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 2bc3a7df2d9..0d5f0513ceb 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
extern size_t pg_strupper(char *dest, size_t destsize,
const char *src, ssize_t srclen,
pg_locale_t locale);
+extern size_t pg_strfold(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
extern int pg_strncoll(const char *arg1, ssize_t len1,
const char *arg2, ssize_t len2, pg_locale_t locale);
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index d4f327636fd..910de9120f2 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
1 | hij | hij
(2 rows)
+SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
+ lower
+-------------------------------
+ abcd 123 #$% ıiii̇ ß ß dždždž σσς
+(1 row)
+
+SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
+ casefold
+---------------------------------
+ abcd 123 #$% ıiii̇ ss ss dždždž σσσ
+(1 row)
+
+SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
+ lower
+-------------------------------
+ abcd 123 #$% ıiıi ß ß dždždž σσς
+(1 row)
+
+SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
+ casefold
+---------------------------------
+ abcd 123 #$% ıiıi ss ss dždždž σσσ
+(1 row)
+
-- LIKE/ILIKE
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
a | b
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 8b7176a2756..5508622b16d 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -160,6 +160,13 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
t
(1 row)
+-- case folding
+select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
+ casefold
+-------------------------------
+ abcd 123 #$% ıiiİ ß ß dždždž σσσ
+(1 row)
+
--
-- Test PG_UNICODE_FAST
--
@@ -320,3 +327,10 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
t
(1 row)
+-- case folding
+select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST);
+ casefold
+---------------------------------
+ abcd 123 #$% ıiii̇ ss ss dždždž σσσ
+(1 row)
+
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 5ee2da4e0e0..f99f186f2d6 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -116,6 +116,11 @@ SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10;
SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
+SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
+SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
+SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
+SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
+
-- LIKE/ILIKE
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index 46e9c5232ad..6c7c7aec9ec 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -81,6 +81,9 @@ SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
+-- case folding
+select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
+
--
-- Test PG_UNICODE_FAST
--
@@ -140,3 +143,6 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
+
+-- case folding
+select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST);