Add SQL functions for Unicode normalization

This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and check Unicode normal forms, per SQL standard. To support fast IS NORMALIZED tests, we pull in a new data file DerivedNormalizationProps.txt from Unicode and build a lookup table from that, using techniques similar to ones already used for other Unicode data. make update-unicode will keep it up to date. We only build and use these tables for the NFC and NFKC forms, because they are too big for NFD and NFKD and the improvement is not significant enough there. Reviewed-by: Daniel Verite <[email protected]> Reviewed-by: Andreas Karlsson <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/[email protected]
author: Peter Eisentraut 2020-03-26 07:14:00 +0000
committer: Peter Eisentraut 2020-04-02 06:56:27 +0000
commit: 2991ac5fc9b3904ca4582be6d323497d7c3d17c9 (patch)
tree: d558847de39ee972b261026d4846f1f31e8dff12 /src/common/unicode_norm.c
parent: 070c3d3937e75e04d36405287353b7eca516555d (diff)
1 files changed, 110 insertions, 0 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ec5abea6bdd..4f4c029075b 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -20,6 +20,9 @@
 
 #include "common/unicode_norm.h"
 #include "common/unicode_norm_table.h"
+#ifndef FRONTEND
+#include "common/unicode_normprops_table.h"
+#endif
 
 #ifndef FRONTEND
 #define ALLOC(size) palloc(size)
@@ -442,3 +445,110 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 
 	return recomp_chars;
 }
+
+/*
+ * Normalization "quick check" algorithm; see
+ * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
+ */
+
+/* We only need this in the backend. */
+#ifndef FRONTEND
+
+static uint8
+get_canonical_class(pg_wchar ch)
+{
+	/* comb_class of ch's entry, or 0 when get_code_entry() finds none. */
+	pg_unicode_decomposition *dec = get_code_entry(ch);
+
+	if (dec != NULL)
+		return dec->comb_class;
+	return 0;
+}
+
+static int
+qc_compare(const void *p1, const void *p2)
+{
+	/* bsearch() comparator: orders pg_unicode_normprops by codepoint. */
+	uint32		v1 = ((const pg_unicode_normprops *) p1)->codepoint;
+	uint32		v2 = ((const pg_unicode_normprops *) p2)->codepoint;
+
+	/* Compare explicitly; an unsigned difference wraps and can flip sign. */
+	return (v1 > v2) - (v1 < v2);
+}
+
+/*
+ * Return the quick check property of "ch" for the given normalization form.
+ *
+ * Only UNICODE_NFC and UNICODE_NFKC have a table to search; any other form
+ * hits Assert(false).  A code point that is not found in the table is
+ * reported as UNICODE_NORM_QC_YES.
+ */
+static UnicodeNormalizationQC
+qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+{
+	const pg_unicode_normprops *table = NULL;
+	size_t		nentries = 0;
+	pg_unicode_normprops key;
+	const pg_unicode_normprops *hit;
+
+	if (form == UNICODE_NFC)
+	{
+		table = UnicodeNormProps_NFC_QC;
+		nentries = lengthof(UnicodeNormProps_NFC_QC);
+	}
+	else if (form == UNICODE_NFKC)
+	{
+		table = UnicodeNormProps_NFKC_QC;
+		nentries = lengthof(UnicodeNormProps_NFKC_QC);
+	}
+	else
+		Assert(false);
+
+	if (table == NULL)
+		return UNICODE_NORM_QC_YES;
+
+	key.codepoint = ch;
+	hit = bsearch(&key, table, nentries, sizeof(pg_unicode_normprops),
+				  qc_compare);
+
+	return hit ? hit->quickcheck : UNICODE_NORM_QC_YES;
+}
+
+UnicodeNormalizationQC
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+{
+	UnicodeNormalizationQC verdict = UNICODE_NORM_QC_YES;
+	uint8		prevClass = 0;
+	const pg_wchar *cur;
+
+	/*
+	 * The "D" forms get no quick check: their lookup tables would be huge,
+	 * such checks are less common, and the slow path costs less there than
+	 * for the "C" forms because no recomposition is needed.
+	 */
+	if (form == UNICODE_NFD || form == UNICODE_NFKD)
+		return UNICODE_NORM_QC_MAYBE;
+
+	for (cur = input; *cur != 0; cur++)
+	{
+		uint8		curClass = get_canonical_class(*cur);
+		/* A nonzero class below the previous one means wrong ordering. */
+		if (curClass != 0 && curClass < prevClass)
+			return UNICODE_NORM_QC_NO;
+
+		switch (qc_is_allowed(form, *cur))
+		{
+			case UNICODE_NORM_QC_NO:
+				return UNICODE_NORM_QC_NO;
+			case UNICODE_NORM_QC_MAYBE:
+				verdict = UNICODE_NORM_QC_MAYBE;
+				break;
+			default:
+				break;
+		}
+		prevClass = curClass;
+	}
+	return verdict;
+}
+
+#endif			/* !FRONTEND */
author	Peter Eisentraut	2020-03-26 07:14:00 +0000
committer	Peter Eisentraut	2020-04-02 06:56:27 +0000
commit	2991ac5fc9b3904ca4582be6d323497d7c3d17c9 (patch)
tree	d558847de39ee972b261026d4846f1f31e8dff12 /src/common/unicode_norm.c
parent	070c3d3937e75e04d36405287353b7eca516555d (diff)