diff options
author | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-10-06 01:27:34 +0000 |
---|---|---|
committer | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-10-06 01:27:34 +0000 |
commit | 33447b80d52f395b26c31a907648503129b1d077 (patch) | |
tree | fbc0175a4b9fdf64c62dbe27db1761017301e444 /tool | |
parent | 0fb67d59b2279540d99333ef1ef601e826fdf5d6 (diff) |
tool/unicode_norm_gen.rb: Data generation script imported from
https://github.com/duerst/eprun/blob/master/lib/generate.rb
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@47808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool')
-rw-r--r-- | tool/unicode_norm_gen.rb | 178 |
1 files changed, 178 insertions, 0 deletions
diff --git a/tool/unicode_norm_gen.rb b/tool/unicode_norm_gen.rb new file mode 100644 index 0000000000..90eba75f3a --- /dev/null +++ b/tool/unicode_norm_gen.rb @@ -0,0 +1,178 @@ +# coding: utf-8 + +# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst ([email protected]) +# available under the same licence as Ruby itself +# (see http://www.ruby-lang.org/en/LICENSE.txt) + +class Integer + def to_UTF8() + if self>0xFFFF + "\\u{#{to_s(16).upcase}}" + elsif CombiningClass[self] or self=='\\'.ord or self=='"'.ord + "\\u#{to_s(16).upcase.rjust(4, '0')}" + else + chr Encoding::UTF_8 + end + end +end + +class Array + def line_slice (new_line) # joins items, 16 items per line + each_slice(16).collect(&:join).join new_line + end + + def to_UTF8() collect(&:to_UTF8).join end + + def to_regexp_chars # converts an array of Integers to character ranges + sort.inject([]) do |ranges, value| + if ranges.last and ranges.last[1]+1>=value + ranges.last[1] = value + ranges + else + ranges << [value, value] + end + end.collect do |first, last| + case last-first + when 0 + first.to_UTF8 + when 1 + first.to_UTF8 + last.to_UTF8 + else + first.to_UTF8 + '-' + last.to_UTF8 + end + end.line_slice "\" +\n \"" + end +end + +class Hash + def to_hash_string + collect do |key, value| + "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\", " + end.line_slice "\n " + end +end + +# read the file 'CompositionExclusions.txt' +composition_exclusions = IO.readlines("../data/CompositionExclusions.txt") + .select { |line| line =~ /^[A-Z0-9]{4,5}/ } + .collect { |line| line.split(' ').first.hex } + +decomposition_table = {} +kompatible_table = {} +CombiningClass = {} # constant to allow use in Integer#to_UTF8 + +# read the file 'UnicodeData.txt' +IO.foreach("../data/UnicodeData.txt") do |line| + codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") + + case decomposition + when /^[0-9A-F]/ + decomposition_table[codepoint.hex] = decomposition.split(' 
').collect(&:hex) + when /^</ + kompatible_table[codepoint.hex] = decomposition.split(' ').drop(1).collect(&:hex) + end + CombiningClass[codepoint.hex] = char_class.to_i if char_class != "0" + + if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="") + warn "Unexpected: Character range with data relevant to normalization!" + end +end + +# calculate compositions from decompositions +composition_table = decomposition_table.reject do |character, decomposition| + composition_exclusions.member? character or # predefined composition exclusion + decomposition.length<=1 or # Singleton Decomposition + CombiningClass[character] or # character is not a Starter + CombiningClass[decomposition.first] # decomposition begins with a character that is not a Starter +end.invert + +# recalculate composition_exclusions +composition_exclusions = decomposition_table.keys - composition_table.values + +accent_array = CombiningClass.keys + composition_table.keys.collect(&:last) + +composition_starters = composition_table.keys.collect(&:first) + +hangul_no_trailing = 0xAC00.step(0xD7A3, 28).to_a + +# expand decomposition table values +decomposition_table.each do |key, value| + position = 0 + while position < value.length + if decomposition = decomposition_table[value[position]] + decomposition_table[key] = value = value.dup # avoid overwriting composition_table key + value[position, 1] = decomposition + else + position += 1 + end + end +end + +# deal with relationship between canonical and kompatibility decompositions +decomposition_table.each do |key, value| + value = value.dup + expanded = false + position = 0 + while position < value.length + if decomposition = kompatible_table[value[position]] + value[position, 1] = decomposition + expanded = true + else + position += 1 + end + end + kompatible_table[key] = value if expanded +end + +class_table_str = CombiningClass.collect do |key, value| + "\"#{key.to_UTF8}\"=>#{value}, " +end.line_slice "\n " + +# generate normalization 
tables file +open("normalize_tables.rb", "w").print <<MAPPING_TABLE_FILE_END +# coding: utf-8 + +# automatically generated by generate.rb + +module Normalize + ACCENTS = " + [#{accent_array.to_regexp_chars}] + " + REGEXP_D_STRING = " # composition starters and composition exclusions + [#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{ACCENTS}* + | # characters that can be the result of a composition, except composition starters + [#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{ACCENTS}+ + | # precomposed Hangul syllables + [\\u{AC00}-\\u{D7A4}] + " + REGEXP_C_STRING = " # composition exclusions + [#{composition_exclusions.to_regexp_chars}]\#{ACCENTS}* + | # composition starters and characters that can be the result of a composition + [#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{ACCENTS}+ + | # Hangul syllables with separate trailer + [#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2] + | # decomposed Hangul syllables + [\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]? + " + REGEXP_K_STRING = " + [#{kompatible_table.keys.to_regexp_chars}] + " + + CLASS_TABLE = { + #{class_table_str} + } + CLASS_TABLE.default = 0 + + DECOMPOSITION_TABLE = { + #{decomposition_table.to_hash_string} + } + + KOMPATIBLE_TABLE = { + #{kompatible_table.to_hash_string} + } + + COMPOSITION_TABLE = { + #{composition_table.to_hash_string} + } +end +MAPPING_TABLE_FILE_END |