From 36cbe9f02e718071cdcb116ffb0fe0a8450dd1a4 Mon Sep 17 00:00:00 2001 From: Andreas Karlsson Date: Tue, 17 Dec 2024 22:47:00 +0100 Subject: [PATCH 1/2] Use optimized versions of ICU case conversion for UTF-8 Instead of converting to and from UChar when doing case conversions we use the UTF-8 versions of the functions. This can give a significant speedup, 15-20%, on short to medium length strings. --- src/backend/utils/adt/pg_locale_icu.c | 149 +++++++++++++++++++------- 1 file changed, 108 insertions(+), 41 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index b0c73f2e43d0..3b75a3525e53 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -12,6 +12,7 @@ #include "postgres.h" #ifdef USE_ICU +#include "unicode/ucasemap.h" #include #include @@ -112,9 +113,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar); static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); -static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, - int32_t len_source); +static int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, + UChar **buff_dest, UChar *buff_source, + int32_t len_source); static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, @@ -389,60 +390,126 @@ size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status = U_ZERO_ERROR; + UCaseMap *casemap; + int32_t needed; - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToLower, locale, - &buff_conv, buff_uchar, len_uchar); - result_len
= icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("casemap lookup failed: %s", u_errorName(status)))); - return result_len; + status = U_ZERO_ERROR; + needed = ucasemap_utf8ToLower(casemap, dest, destsize, src, srclen, &status); + ucasemap_close(casemap); + if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) + ereport(ERROR, + (errmsg("case conversion failed: %s", u_errorName(status)))); + return needed; + } + else + { + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case_uchar(u_strToLower, locale, &buff_conv, + buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; + } } size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status = U_ZERO_ERROR; + UCaseMap *casemap; + int32_t needed; - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToTitle_default_BI, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("casemap lookup failed: %s", u_errorName(status)))); - return result_len; + status = U_ZERO_ERROR; + needed = ucasemap_utf8ToTitle(casemap, dest, destsize, src, srclen, &status); + ucasemap_close(casemap); + if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) + 
ereport(ERROR, + (errmsg("case conversion failed: %s", u_errorName(status)))); + return needed; + } + else + { + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case_uchar(u_strToTitle_default_BI, locale, &buff_conv, + buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; + } } size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status = U_ZERO_ERROR; + UCaseMap *casemap; + int32_t needed; - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case(u_strToUpper, locale, - &buff_conv, buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("casemap lookup failed: %s", u_errorName(status)))); - return result_len; + status = U_ZERO_ERROR; + needed = ucasemap_utf8ToUpper(casemap, dest, destsize, src, srclen, &status); + ucasemap_close(casemap); + if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) + ereport(ERROR, + (errmsg("case conversion failed: %s", u_errorName(status)))); + return needed; + } + else + { + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case_uchar(u_strToUpper, locale, &buff_conv, + buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; + } 
} size_t @@ -663,8 +730,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len } static int32_t -icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, int32_t len_source) +icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, + UChar **buff_dest, UChar *buff_source, int32_t len_source) { UErrorCode status; int32_t len_dest; From a903b4dd587d7ca2000b0095288733f0241cc8d4 Mon Sep 17 00:00:00 2001 From: Andreas Karlsson Date: Fri, 20 Dec 2024 02:00:33 +0100 Subject: [PATCH 2/2] Reduce code duplication in ICU case mapping code --- src/backend/utils/adt/pg_locale_icu.c | 74 ++++++++++----------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 3b75a3525e53..9385987f7e82 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -113,6 +113,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar); static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); +static int32_t icu_convert_case_no_utf8(ICU_Convert_Func func, char *dest, + size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); static int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, UChar **buff_dest, UChar *buff_source, int32_t len_source); @@ -410,22 +413,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return needed; } else - { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case_uchar(u_strToLower, locale, &buff_conv, - buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; - } + 
return icu_convert_case_no_utf8(u_strToLower, dest, destsize, src, srclen, locale); } size_t @@ -452,22 +440,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return needed; } else - { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case_uchar(u_strToTitle_default_BI, locale, &buff_conv, - buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; - } + return icu_convert_case_no_utf8(u_strToTitle_default_BI, dest, destsize, src, srclen, locale); } size_t @@ -494,22 +467,7 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return needed; } else - { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - size_t result_len; - - len_uchar = icu_to_uchar(&buff_uchar, src, srclen); - len_conv = icu_convert_case_uchar(u_strToUpper, locale, &buff_conv, - buff_uchar, len_uchar); - result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - - return result_len; - } + return icu_convert_case_no_utf8(u_strToUpper, dest, destsize, src, srclen, locale); } size_t @@ -729,6 +687,26 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len return len_result; } +static int32_t +icu_convert_case_no_utf8(ICU_Convert_Func func, char *dest, size_t destsize, + const char *src, ssize_t srclen, pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case_uchar(func, locale, &buff_conv, + buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + static 
int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, UChar **buff_dest, UChar *buff_source, int32_t len_source)