summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 39
-rw-r--r-- contrib/unaccent/unaccent.rules 114
2 files changed, 145 insertions, 8 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index a5eb42f0b18..4b1b011861f 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -48,24 +48,47 @@ def is_mark(codepoint):
return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks(codepoint, table):
- """Returns true for plain letters combined with one or more marks."""
+ """Returns true for letters combined with one or more marks."""
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
- return len(codepoint.combining_ids) > 1 and \
- is_plain_letter(table[codepoint.combining_ids[0]]) and \
- all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+ # Letter may have no combining characters, in which case it has
+ # no marks.
+ if len(codepoint.combining_ids) == 1:
+ return False
+
+ # If none of the combining characters after the base is a mark, this is not a letter with marks.
+ if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
+ return False
+
+ # Check if the base letter of this letter has marks.
+ codepoint_base = codepoint.combining_ids[0]
+ if (is_plain_letter(table[codepoint_base]) is False and \
+ is_letter_with_marks(table[codepoint_base], table) is False):
+ return False
+
+ return True
def is_letter(codepoint, table):
"""Return true for letter with or without diacritical marks."""
return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
- """Return the base codepoint without marks."""
+ """Return the base codepoint without marks. If the base character is
+ itself composed of more than one combining character, do a recursive
+ lookup on the table to find its plain base letter."""
if is_letter_with_marks(codepoint, table):
- return table[codepoint.combining_ids[0]]
+ if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+ return get_plain_letter(table[codepoint.combining_ids[0]], table)
+ elif is_plain_letter(table[codepoint.combining_ids[0]]):
+ return table[codepoint.combining_ids[0]]
+
+ # Should not come here
+ assert(False)
elif is_plain_letter(codepoint):
return codepoint
- else:
- raise "mu"
+
+ # Should not come here
+ assert(False)
def is_ligature(codepoint, table):
"""Return true for letters combined with letters."""
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 84886da587a..97f9ed47cfa 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -254,6 +254,18 @@
ǒ o
Ǔ U
ǔ u
+Ǖ U
+ǖ u
+Ǘ U
+ǘ u
+Ǚ U
+ǚ u
+Ǜ U
+ǜ u
+Ǟ A
+ǟ a
+Ǡ A
+ǡ a
Ǥ G
ǥ g
Ǧ G
@@ -262,6 +274,8 @@
ǩ k
Ǫ O
ǫ o
+Ǭ O
+ǭ o
ǰ j
DZ DZ
Dz Dz
@@ -270,6 +284,8 @@
ǵ g
Ǹ N
ǹ n
+Ǻ A
+ǻ a
Ȁ A
ȁ a
Ȃ A
@@ -307,8 +323,14 @@
ȧ a
Ȩ E
ȩ e
+Ȫ O
+ȫ o
+Ȭ O
+ȭ o
Ȯ O
ȯ o
+Ȱ O
+ȱ o
Ȳ Y
ȳ y
ȴ l
@@ -441,6 +463,8 @@
ḅ b
Ḇ B
ḇ b
+Ḉ C
+ḉ c
Ḋ D
ḋ d
Ḍ D
@@ -451,10 +475,16 @@
ḑ d
Ḓ D
ḓ d
+Ḕ E
+ḕ e
+Ḗ E
+ḗ e
Ḙ E
ḙ e
Ḛ E
ḛ e
+Ḝ E
+ḝ e
Ḟ F
ḟ f
Ḡ G
@@ -471,6 +501,8 @@
ḫ h
Ḭ I
ḭ i
+Ḯ I
+ḯ i
Ḱ K
ḱ k
Ḳ K
@@ -479,6 +511,8 @@
ḵ k
Ḷ L
ḷ l
+Ḹ L
+ḹ l
Ḻ L
ḻ l
Ḽ L
@@ -497,6 +531,14 @@
ṉ n
Ṋ N
ṋ n
+Ṍ O
+ṍ o
+Ṏ O
+ṏ o
+Ṑ O
+ṑ o
+Ṓ O
+ṓ o
Ṕ P
ṕ p
Ṗ P
@@ -505,12 +547,20 @@
ṙ r
Ṛ R
ṛ r
+Ṝ R
+ṝ r
Ṟ R
ṟ r
Ṡ S
ṡ s
Ṣ S
ṣ s
+Ṥ S
+ṥ s
+Ṧ S
+ṧ s
+Ṩ S
+ṩ s
Ṫ T
ṫ t
Ṭ T
@@ -525,6 +575,10 @@
ṵ u
Ṷ U
ṷ u
+Ṹ U
+ṹ u
+Ṻ U
+ṻ u
Ṽ V
ṽ v
Ṿ V
@@ -563,12 +617,42 @@
ạ a
Ả A
ả a
+Ấ A
+ấ a
+Ầ A
+ầ a
+Ẩ A
+ẩ a
+Ẫ A
+ẫ a
+Ậ A
+ậ a
+Ắ A
+ắ a
+Ằ A
+ằ a
+Ẳ A
+ẳ a
+Ẵ A
+ẵ a
+Ặ A
+ặ a
Ẹ E
ẹ e
Ẻ E
ẻ e
Ẽ E
ẽ e
+Ế E
+ế e
+Ề E
+ề e
+Ể E
+ể e
+Ễ E
+ễ e
+Ệ E
+ệ e
Ỉ I
ỉ i
Ị I
@@ -577,10 +661,40 @@
ọ o
Ỏ O
ỏ o
+Ố O
+ố o
+Ồ O
+ồ o
+Ổ O
+ổ o
+Ỗ O
+ỗ o
+Ộ O
+ộ o
+Ớ O
+ớ o
+Ờ O
+ờ o
+Ở O
+ở o
+Ỡ O
+ỡ o
+Ợ O
+ợ o
Ụ U
ụ u
Ủ U
ủ u
+Ứ U
+ứ u
+Ừ U
+ừ u
+Ử U
+ử u
+Ữ U
+ữ u
+Ự U
+ự u
Ỳ Y
ỳ y
Ỵ Y