Fix text substring search for non-deterministic collations.

Due to an off-by-one error, the code failed to find matches at the end of the haystack. Fix by rewriting the loop.

While at it, fix a comment that claimed that the function could find a zero-length match. Such a match could send a caller into an endless loop. However, zero-length matches only make sense with an empty search string, and that case is explicitly excluded by all callers. To make sure it stays that way, add an Assert and a comment.

Bug: #19341
Reported-by: Adam Warland <[email protected]>
Author: Laurenz Albe <[email protected]>
Reviewed-by: Heikki Linnakangas <[email protected]>
Reviewed-by: Tom Lane <[email protected]>
Discussion: https://postgr.es/m/[email protected]
Backpatch-through: 18
author: Tom Lane 2025-12-06 01:10:33 +0000
committer: Tom Lane 2025-12-06 01:10:33 +0000
commit: 6dfce8420e99d8cf41ffb7da698caee57fd73eb7 (patch)
tree: fadeddc960089a51d8a257b854b4a84c591fbc7e
parent: 7c2061bdfba7c738dac1e2c14db51faeef3f34b1 (diff)
3 files changed, 28 insertions, 7 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 3894457ab40..f202b8df4e2 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1111,6 +1111,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 	const char *hptr;
 
 	Assert(start_ptr >= haystack && start_ptr <= haystack_end);
+	Assert(needle_len > 0);
 
 	state->last_match_len_tmp = needle_len;
 
@@ -1123,19 +1124,26 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 		 * needle under the given collation.
 		 *
 		 * Note, the found substring could have a different length than the
-		 * needle, including being empty.  Callers that want to skip over the
-		 * found string need to read the length of the found substring from
-		 * last_match_len rather than just using the length of their needle.
+		 * needle.  Callers that want to skip over the found string need to
+		 * read the length of the found substring from last_match_len rather
+		 * than just using the length of their needle.
 		 *
 		 * Most callers will require "greedy" semantics, meaning that we need
 		 * to find the longest such substring, not the shortest.  For callers
 		 * that don't need greedy semantics, we can finish on the first match.
+		 *
+		 * This loop depends on the assumption that the needle is nonempty and
+		 * any matching substring must also be nonempty.  (Even if the
+		 * collation would accept an empty match, returning one would send
+		 * callers that search for successive matches into an infinite loop.)
 		 */
 		const char *result_hptr = NULL;
 
 		hptr = start_ptr;
 		while (hptr < haystack_end)
 		{
+			const char *test_end;
+
 			/*
 			 * First check the common case that there is a match in the
 			 * haystack of exactly the length of the needle.
@@ -1146,11 +1154,13 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 				return (char *) hptr;
 
 			/*
-			 * Else check if any of the possible substrings starting at hptr
-			 * are equal to the needle.
+			 * Else check if any of the non-empty substrings starting at hptr
+			 * compare equal to the needle.
 			 */
-			for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
+			test_end = hptr;
+			do
 			{
+				test_end += pg_mblen(test_end);
 				if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
 				{
 					state->last_match_len_tmp = (test_end - hptr);
@@ -1158,7 +1168,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 					if (!state->greedy)
 						break;
 				}
-			}
+			} while (test_end < haystack_end);
+
 			if (result_hptr)
 				break;
 
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index b8579a1efc6..8023014fe63 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1484,6 +1484,13 @@ SELECT array_sort('{a,B}'::text[] COLLATE "C");
  {B,a}
 (1 row)
 
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
+ replace 
+---------
+ tester
+(1 row)
+
 -- test language tags
 CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
 SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 6f5abac0dc0..b6c54503d21 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -568,6 +568,9 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
 SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
 SELECT array_sort('{a,B}'::text[] COLLATE "C");
 
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
+
 -- test language tags
 CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
 SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
author	Tom Lane	2025-12-06 01:10:33 +0000
committer	Tom Lane	2025-12-06 01:10:33 +0000
commit	6dfce8420e99d8cf41ffb7da698caee57fd73eb7 (patch)
tree	fadeddc960089a51d8a257b854b4a84c591fbc7e
parent	7c2061bdfba7c738dac1e2c14db51faeef3f34b1 (diff)