1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
# coding: utf-8
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst ([email protected])
# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...
# Constants for the input and output directories (relative paths).
# InputDataDir is where 'CompositionExclusions.txt' and 'UnicodeData.txt' are
# read from. OuputDataDir is where 'normalize_tables.rb' is written; its
# spelling is kept as-is because the constant is referenced by that name below.
InputDataDir = '../enc/unicode/data'
OuputDataDir = '../lib/unicode_normalize'
# convenience methods
class Integer
  # Renders this code point as text that can be placed between double quotes
  # in generated Ruby source, taking legibility into account:
  #   * above U+FFFF      -> \u{XXXXX}
  #   * U+0080 .. U+FFFF  -> \uXXXX (zero-padded to four hex digits)
  #   * U+0000 .. U+007F  -> the character itself, with a backslash put in
  #                          front of a backslash or a double quote
  def to_UTF8
    if self > 0xFFFF
      "\\u{#{to_s(16).upcase}}"
    elsif self > 0x7f
      "\\u#{to_s(16).upcase.rjust(4, '0')}"
    else
      # The block form of String#sub is used on purpose: in a replacement
      # *string*, a doubled backslash is itself an escape for one backslash,
      # which makes "backslash + matched text" easy to get wrong (the result
      # would be a literal \& instead of \" or \\).
      chr.sub(/[\\"]/) { |special| "\\#{special}" }
    end
  end
end
class Array
  # Concatenates the items in groups of 16 and glues the groups together
  # with new_line, so that generated source gets 16 items per line.
  def line_slice(new_line)
    each_slice(16).map { |group| group.join }.join(new_line)
  end

  # Concatenation of every element's own to_UTF8 rendering.
  def to_UTF8
    map { |item| item.to_UTF8 }.join
  end

  # Turns an array of Integers into the text of regexp character ranges.
  # The values are sorted and merged into runs of consecutive (or repeated)
  # values; a run is written as a single character, as two characters, or
  # as "first-last", depending on its length. The pieces are then laid out
  # 16 per line, each line closing and reopening a string literal.
  def to_regexp_chars
    runs = sort.each_with_object([]) do |code, merged|
      current = merged.last
      if current && code <= current[1] + 1
        current[1] = code # extend the run in progress
      else
        merged << [code, code] # start a new run
      end
    end

    pieces = runs.map do |low, high|
      span = high - low
      if span.zero?
        low.to_UTF8
      elsif span == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    pieces.line_slice("\" \\\n \"")
  end
end
class Hash
  # Renders the entries as the body of a Ruby hash literal: each pair becomes
  # "key"=>"value" followed by a comma and a space (key and value rendered
  # through their to_UTF8), with 16 pairs per output line.
  def to_hash_string
    entries = map do |key, value|
      '"' + key.to_UTF8 + '"=>"' + value.to_UTF8 + '", '
    end
    entries.line_slice("\n ")
  end
end
# Collect the code points listed in 'CompositionExclusions.txt'. Only lines
# that start with at least four characters from A-Z/0-9 are treated as data;
# the first whitespace-separated token of such a line is parsed as hex.
composition_exclusions =
  IO.readlines("#{InputDataDir}/CompositionExclusions.txt")
    .grep(/^[A-Z0-9]{4,5}/)
    .map { |entry| entry.split(' ').first.hex }
decomposition_table = {}
kompatible_table = {}
# A constant rather than a local variable (the original note on this line
# reads: "constant to allow use in Integer#to_UTF8").
CombiningClass = {}
# Scan 'UnicodeData.txt', one semicolon-separated record per line. Fields
# used: 0 = code point, 1 = name, 3 = combining class, 5 = decomposition.
IO.foreach("#{InputDataDir}/UnicodeData.txt") do |record|
  codepoint, name, char_class, decomposition = record.split(";").values_at(0, 1, 3, 5)
  code = codepoint.hex
  if decomposition =~ /^[0-9A-F]/
    # starts directly with a hex code point: goes into decomposition_table
    decomposition_table[code] = decomposition.split(' ').map(&:hex)
  elsif decomposition =~ /^</
    # starts with a "<tag>": goes into kompatible_table, the tag token is dropped
    kompatible_table[code] = decomposition.split(' ').drop(1).map(&:hex)
  end
  # only non-"0" combining classes are recorded
  CombiningClass[code] = char_class.to_i unless char_class == "0"
  range_marker = name =~ /(First|Last)>$/
  if range_marker && !(char_class == "0" && decomposition == "")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end
# Derive compositions from decompositions: keep only the decompositions that
# may be recomposed, then swap keys and values so that the decomposition
# (an array of code points) maps to the composed character.
composition_table = decomposition_table.select do |character, decomposition|
  !composition_exclusions.member?(character) && # not a predefined composition exclusion
    decomposition.length > 1 &&                 # not a Singleton Decomposition
    !CombiningClass[character] &&               # character is a Starter
    !CombiningClass[decomposition.first]        # decomposition begins with a Starter
end.invert
# recalculate composition_exclusions: every character that has a
# decomposition but is not the result of a composition
composition_exclusions = decomposition_table.keys - composition_table.values
# everything with a recorded combining class, plus the last element of each
# composable decomposition
accent_array = CombiningClass.keys + composition_table.keys.map(&:last)
# first element of each composable decomposition
composition_starters = composition_table.keys.map(&:first)
# every 28th code point from U+AC00 up to at most U+D7A3
hangul_no_trailing = (0xAC00..0xD7A3).step(28).to_a
# expand decomposition table values
# Each value is scanned left to right. An element that has its own entry in
# decomposition_table is replaced in place by that entry, and the same
# position is examined again, so nested decompositions end up fully expanded.
decomposition_table.each do |key, value|
position = 0
while position < value.length
if decomposition = decomposition_table[value[position]]
# The arrays stored here are the same objects that serve as keys of
# composition_table (built above with Hash#invert), so a copy is made and
# stored back before the splice below modifies it.
decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
value[position, 1] = decomposition
else
position += 1
end
end
end
# Relationship between canonical and kompatibility decompositions: when a
# canonical decomposition contains an element that has a kompatibility
# mapping, the character receives a kompatibility entry of its own, made of
# its canonical decomposition with those elements replaced.
decomposition_table.each do |character, canonical|
  expansion = canonical.dup # work on a copy; decomposition_table is left untouched
  changed = false
  index = 0
  while index < expansion.length
    substitute = kompatible_table[expansion[index]]
    if substitute
      # splice in the mapping and look at the same index again
      expansion[index, 1] = substitute
      changed = true
    else
      index += 1
    end
  end
  kompatible_table[character] = expansion if changed
end
# Text for the body of the generated class table: one "char"=>class pair per
# recorded combining class, 16 pairs per line.
class_table_str = CombiningClass.map do |code, combining_class|
  '"' + code.to_UTF8 + '"=>' + combining_class.to_s + ', '
end.line_slice("\n ")
# generate normalization tables file
# The file is written through the block form of File.open so that the handle
# is flushed and closed as soon as the text has been printed, rather than
# being left open until the interpreter exits.
# In the heredoc, "\#{" and "\\" are escaped so that they reach the generated
# file as "#{" and "\"; unescaped "#{...}" is expanded here, at generation time.
# The precomposed Hangul syllable range ends at U+D7A3, the same upper bound
# that hangul_no_trailing uses.
File.open("#{OuputDataDir}/normalize_tables.rb", "w") do |output|
  output.print <<MAPPING_TABLE_FILE_END
# coding: us-ascii
# automatically generated by tool/unicode_norm_gen.rb
module Normalize
accents = "" \\
"[#{accent_array.to_regexp_chars}]" \\
""
ACCENTS = accents
REGEXP_D_STRING = "\#{'' # composition starters and composition exclusions
}" \\
"[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{accents}*" \\
"|\#{'' # characters that can be the result of a composition, except composition starters
}" \\
"[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{accents}+" \\
"|\#{'' # precomposed Hangul syllables
}" \\
"[\\u{AC00}-\\u{D7A3}]" \\
""
REGEXP_C_STRING = "\#{'' # composition exclusions
}" \\
"[#{composition_exclusions.to_regexp_chars}]\#{accents}*" \\
"|\#{'' # composition starters and characters that can be the result of a composition
}" \\
"[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{accents}+" \\
"|\#{'' # Hangul syllables with separate trailer
}" \\
"[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]" \\
"|\#{'' # decomposed Hangul syllables
}" \\
"[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?" \\
""
REGEXP_K_STRING = "" \\
"[#{kompatible_table.keys.to_regexp_chars}]" \\
""
class_table = {
#{class_table_str}
}
class_table.default = 0
CLASS_TABLE = class_table
DECOMPOSITION_TABLE = {
#{decomposition_table.to_hash_string}
}
KOMPATIBLE_TABLE = {
#{kompatible_table.to_hash_string}
}
COMPOSITION_TABLE = {
#{composition_table.to_hash_string}
}
end
MAPPING_TABLE_FILE_END
end
|