diff options
| author | Tom Lane | 2025-12-06 01:10:33 +0000 |
|---|---|---|
| committer | Tom Lane | 2025-12-06 01:10:33 +0000 |
| commit | 6dfce8420e99d8cf41ffb7da698caee57fd73eb7 (patch) | |
| tree | fadeddc960089a51d8a257b854b4a84c591fbc7e | |
| parent | 7c2061bdfba7c738dac1e2c14db51faeef3f34b1 (diff) | |
Fix text substring search for non-deterministic collations.
Due to an off-by-one error, the code failed to find matches at the
end of the haystack. Fix by rewriting the loop.
While at it, fix a comment that claimed that the function could find
a zero-length match. Such a match could send a caller into an endless
loop. However, zero-length matches only make sense with an empty
search string, and that case is explicitly excluded by all callers.
To make sure it stays that way, add an Assert and a comment.
Bug: #19341
Reported-by: Adam Warland <[email protected]>
Author: Laurenz Albe <[email protected]>
Reviewed-by: Heikki Linnakangas <[email protected]>
Reviewed-by: Tom Lane <[email protected]>
Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 18
| -rw-r--r-- | src/backend/utils/adt/varlena.c | 25 | ||||
| -rw-r--r-- | src/test/regress/expected/collate.icu.utf8.out | 7 | ||||
| -rw-r--r-- | src/test/regress/sql/collate.icu.utf8.sql | 3 |
3 files changed, 28 insertions, 7 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3894457ab40..f202b8df4e2 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1111,6 +1111,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) const char *hptr; Assert(start_ptr >= haystack && start_ptr <= haystack_end); + Assert(needle_len > 0); state->last_match_len_tmp = needle_len; @@ -1123,19 +1124,26 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) * needle under the given collation. * * Note, the found substring could have a different length than the - * needle, including being empty. Callers that want to skip over the - * found string need to read the length of the found substring from - * last_match_len rather than just using the length of their needle. + * needle. Callers that want to skip over the found string need to + * read the length of the found substring from last_match_len rather + * than just using the length of their needle. * * Most callers will require "greedy" semantics, meaning that we need * to find the longest such substring, not the shortest. For callers * that don't need greedy semantics, we can finish on the first match. + * + * This loop depends on the assumption that the needle is nonempty and + * any matching substring must also be nonempty. (Even if the + * collation would accept an empty match, returning one would send + * callers that search for successive matches into an infinite loop.) */ const char *result_hptr = NULL; hptr = start_ptr; while (hptr < haystack_end) { + const char *test_end; + /* * First check the common case that there is a match in the * haystack of exactly the length of the needle. @@ -1146,11 +1154,13 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) return (char *) hptr; /* - * Else check if any of the possible substrings starting at hptr - * are equal to the needle. 
+ * Else check if any of the non-empty substrings starting at hptr + * compare equal to the needle. */ - for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end)) + test_end = hptr; + do { + test_end += pg_mblen(test_end); if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0) { state->last_match_len_tmp = (test_end - hptr); @@ -1158,7 +1168,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) if (!state->greedy) break; } - } + } while (test_end < haystack_end); + if (result_hptr) break; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index b8579a1efc6..8023014fe63 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1484,6 +1484,13 @@ SELECT array_sort('{a,B}'::text[] COLLATE "C"); {B,a} (1 row) +-- test replace() at the end of the string (bug #19341) +SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er'); + replace +--------- + tester +(1 row) + -- test language tags CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false); SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 6f5abac0dc0..b6c54503d21 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -568,6 +568,9 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive); SELECT array_sort('{a,B}'::text[] COLLATE "C"); +-- test replace() at the end of the string (bug #19341) +SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er'); + -- test language tags CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = 
false); SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive; |
