summaryrefslogtreecommitdiff
path: root/contrib/unaccent
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/unaccent')
-rw-r--r-- contrib/unaccent/expected/unaccent.out 18
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 31
-rw-r--r-- contrib/unaccent/sql/unaccent.sql 3
-rw-r--r-- contrib/unaccent/unaccent.rules 106
4 files changed, 157 insertions, 1 deletions
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
index 69c2cf9bd7a..c1bd7cd897d 100644
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
>+-~
(1 row)
+SELECT unaccent('À'); -- Remove combining diacritical 0x0300
+ unaccent
+----------
+ A
+(1 row)
+
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
>+-~
(1 row)
+SELECT unaccent('unaccent', 'À');
+ unaccent
+----------
+ A
+(1 row)
+
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
{>+-~}
(1 row)
+SELECT ts_lexize('unaccent', 'À');
+ ts_lexize
+-----------
+ {A}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 4419a771edf..58b6e7deb74 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+# https://en.wikipedia.org/wiki/Combining_character
+# https://www.unicode.org/charts/PDF/U0300.pdf
+# https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA
+ (0x20dd, 0x20E0), # Me: Symbols
+ (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
def print_record(codepoint, letter):
- print (chr(codepoint) + "\t" + letter)
+ if letter:
+ output = chr(codepoint) + "\t" + letter
+ else:
+ output = chr(codepoint)
+
+ print(output)
class Codepoint:
def __init__(self, id, general_category, combining_ids):
@@ -70,6 +87,16 @@ class Codepoint:
self.general_category = general_category
self.combining_ids = combining_ids
+def is_mark_to_remove(codepoint):
+ """Return true if this is a combining mark to remove."""
+ if not is_mark(codepoint):
+ return False
+
+ for begin, end in COMBINING_MARK_RANGES:
+ if codepoint.id >= begin and codepoint.id <= end:
+ return True
+ return False
+
def is_plain_letter(codepoint):
"""Return true if codepoint represents a "plain letter"."""
for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
"".join(chr(combining_codepoint.id)
for combining_codepoint \
in get_plain_letters(codepoint, table))))
+ elif is_mark_to_remove(codepoint):
+ charactersSet.add((codepoint.id, None))
# add CLDR Latin-ASCII characters
if not args.noLigaturesExpansion:
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
index c671827caa5..2ae097ff2b8 100644
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜');
+SELECT unaccent('À'); -- Remove combining diacritical 0x0300
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜');
+SELECT unaccent('unaccent', 'À');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜');
+SELECT ts_lexize('unaccent', 'À');
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 7ce25eef03d..99826408ac1 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -414,6 +414,105 @@
˖ +
˗ -
˜ ~
+̿
Ά Α
Έ Ε
Ή Η
@@ -982,6 +1081,13 @@
₧ Pts
₹ Rs
₺ TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
℀ a/c
℁ a/s
ℂ C