Optimize strspn()

nielsdos · nielsdos · commit d0b29d828637 · 2023-10-14T21:24:55.000+02:00
The current implementation uses a nested loop (for + goto), which has complexity O(|s1| * |s2|). If we instead use a lookup table, the complexity drops to O(|s1| + |s2|). This is conceptually the same strategy that common C library implementations such as glibc and musl use. The variation with a bitvector instead of a table also gives a speed-up, but the table variation was about 1.34x faster. On microbenchmarks this easily gave a 5x speedup. This can bring a 1.4-1.5% performance improvement in the Symfony benchmark. Closes GH-12431.
diff --git a/UPGRADING b/UPGRADING
@@ -145,3 +145,6 @@ PHP 8.4 UPGRADE NOTES
 * The performance of DOMNode::C14N() is greatly improved for the case without
   an xpath query. This can give a time improvement of easily two order of
   magnitude for documents with tens of thousands of nodes.
+
+* The performance of strspn() is greatly improved. It now runs in linear time
+  instead of being bounded by quadratic time.
diff --git a/ext/standard/string.c b/ext/standard/string.c
@@ -1597,19 +1597,40 @@ PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len)
 /* }}} */
 
 /* {{{ php_strspn */
-PHPAPI size_t php_strspn(const char *s1, const char *s2, const char *s1_end, const char *s2_end)
+PHPAPI size_t php_strspn(const char *haystack, const char *characters, const char *haystack_end, const char *characters_end)
 {
-	const char *p = s1, *spanp;
-	char c = *p;
-
-cont:
-	for (spanp = s2; p != s1_end && spanp != s2_end;) {
-		if (*spanp++ == c) {
-			c = *(++p);
-			goto cont;
+	/* Fast path for a single-character set (characters is exactly one byte long).
+	 * The table lookup cannot be faster in this case because we would not only have to compare, but also build the table.
+	 * Here we only compare each haystack byte against that one character and return the length of the matching prefix.
+	 * Empirically tested: the table lookup approach is only beneficial if characters is longer than 1 character. */
+	if (characters_end - characters == 1) {
+		const char *ptr = haystack;
+		while (ptr < haystack_end && *ptr == *characters) {
+			ptr++;
 		}
+		return ptr - haystack;
+	}
+
+	/* Every byte that occurs in characters sets the boolean at its (unsigned char) index in this lookup table.
+	 * While looping over haystack, the table then answers "is this byte in characters?" with a single array read. */
+	bool table[256];
+	/* Zero the table with four small fixed-size memsets so the memset gets inlined with intrinsics, trick learned from glibc. NOTE: 4 x 64 bytes covers all 256 entries only when sizeof(bool) == 1. */
+	memset(table, 0, 64);
+	memset(table + 64, 0, 64);
+	memset(table + 128, 0, 64);
+	memset(table + 192, 0, 64);
+
+	while (characters < characters_end) {
+		table[(unsigned char) *characters] = true;
+		characters++;
+	}
+
+	const char *ptr = haystack;
+	while (ptr < haystack_end && table[(unsigned char) *ptr]) {
+		ptr++;
 	}
-	return (p - s1);
+
+	return ptr - haystack;
 }
 /* }}} */