diff options
author | nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-10-25 07:20:15 +0000 |
---|---|---|
committer | nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-10-25 07:20:15 +0000 |
commit | 9b581e0d0b41dccc8c15400f05ca5c763c6c41b9 (patch) | |
tree | a1f22b735e7cf00ff41d3acf463e66513e749dd2 /tool | |
parent | 67a19e7a59dccbc00daed2970350a20124926afb (diff) |
template/unicode_norm_gen.tmpl: from tool/unicode_norm_gen.rb
* template/unicode_norm_gen.tmpl: use generic_erb.rb to update the
generated file only if it changed and to manage its timestamp, so that
a source tree on a read-only filesystem works.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48129 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool')
-rw-r--r-- | tool/unicode_norm_gen.rb | 198 |
1 files changed, 0 insertions, 198 deletions
diff --git a/tool/unicode_norm_gen.rb b/tool/unicode_norm_gen.rb deleted file mode 100644 index 766be26dc4..0000000000 --- a/tool/unicode_norm_gen.rb +++ /dev/null @@ -1,198 +0,0 @@ -# coding: utf-8 - -# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst ([email protected]) - -# Script to generate Ruby data structures used in implementing -# String#unicode_normalize,... - -# Constants for input and ouput directory -InputDataDir = $input || 'enc/unicode/data' -OuputDataDir = $ouput || 'lib/unicode_normalize' - -# convenience methods -class Integer - def to_UTF8() # convert to string, taking legibility into account - if self>0xFFFF - "\\u{#{to_s(16).upcase}}" - elsif self>0x7f - "\\u#{to_s(16).upcase.rjust(4, '0')}" - else - chr.sub(/[\\\"]/, "\\\\\\\&") - end - end -end - -class Array - def line_slice(new_line) # joins items, 8 items per line - ary = [] - 0.step(size-1, 8) {|i| - ary << self[i, 8].join('') - } - ary.join(new_line).gsub(/ +$/, '') - end - - def to_UTF8() collect {|c| c.to_UTF8}.join('') end - - def to_regexp_chars # converts an array of Integers to character ranges - sort.inject([]) do |ranges, value| - if ranges.last and ranges.last[1]+1>=value - ranges.last[1] = value - ranges - else - ranges << [value, value] - end - end.collect do |first, last| - case last-first - when 0 - first.to_UTF8 - when 1 - first.to_UTF8 + last.to_UTF8 - else - first.to_UTF8 + '-' + last.to_UTF8 - end - end.line_slice "\" \\\n \"" - end -end - -class Hash - def to_hash_string - collect do |key, value| - "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\".freeze, " - end.line_slice "\n " - end -end - -# read the file 'CompositionExclusions.txt' -composition_exclusions = File.open("#{InputDataDir}/CompositionExclusions.txt") {|f| - f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex} -} - -decomposition_table = {} -kompatible_table = {} -CombiningClass = {} # constant to allow use in Integer#to_UTF8 - -# read the file 'UnicodeData.txt' -IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line| - 
codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") - - case decomposition - when /^[0-9A-F]/ - decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex} - when /^</ - kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex} - end - CombiningClass[codepoint.hex] = char_class.to_i if char_class != "0" - - if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="") - warn "Unexpected: Character range with data relevant to normalization!" - end -end - -# calculate compositions from decompositions -composition_table = decomposition_table.reject do |character, decomposition| - composition_exclusions.member? character or # predefined composition exclusion - decomposition.length<=1 or # Singleton Decomposition - CombiningClass[character] or # character is not a Starter - CombiningClass[decomposition.first] # decomposition begins with a character that is not a Starter -end.invert - -# recalculate composition_exclusions -composition_exclusions = decomposition_table.keys - composition_table.values - -accent_array = CombiningClass.keys + composition_table.keys.collect {|key| key.last} - -composition_starters = composition_table.keys.collect {|key| key.first} - -hangul_no_trailing = [] -0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c} - -# expand decomposition table values -decomposition_table.each do |key, value| - position = 0 - while position < value.length - if decomposition = decomposition_table[value[position]] - decomposition_table[key] = value = value.dup # avoid overwriting composition_table key - value[position, 1] = decomposition - else - position += 1 - end - end -end - -# deal with relationship between canonical and kompatibility decompositions -decomposition_table.each do |key, value| - value = value.dup - expanded = false - position = 0 - while position < value.length - if decomposition = kompatible_table[value[position]] - value[position, 1] = decomposition - expanded = 
true - else - position += 1 - end - end - kompatible_table[key] = value if expanded -end - -class_table_str = CombiningClass.collect do |key, value| - "\"#{key.to_UTF8}\"=>#{value}, " -end.line_slice "\n " - -# generate normalization tables file -open("#{OuputDataDir}/tables.rb", "w").print <<MAPPING_TABLE_FILE_END -# coding: us-ascii - -# automatically generated by tool/unicode_norm_gen.rb - -module UnicodeNormalize - accents = "" \\ - "[#{accent_array.to_regexp_chars}]" \\ - "".freeze - ACCENTS = accents - REGEXP_D_STRING = "\#{'' # composition starters and composition exclusions - }" \\ - "[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{accents}*" \\ - "|\#{'' # characters that can be the result of a composition, except composition starters - }" \\ - "[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{accents}+" \\ - "|\#{'' # precomposed Hangul syllables - }" \\ - "[\\u{AC00}-\\u{D7A4}]" \\ - "".freeze - REGEXP_C_STRING = "\#{'' # composition exclusions - }" \\ - "[#{composition_exclusions.to_regexp_chars}]\#{accents}*" \\ - "|\#{'' # composition starters and characters that can be the result of a composition - }" \\ - "[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{accents}+" \\ - "|\#{'' # Hangul syllables with separate trailer - }" \\ - "[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]" \\ - "|\#{'' # decomposed Hangul syllables - }" \\ - "[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?" \\ - "".freeze - REGEXP_K_STRING = "" \\ - "[#{kompatible_table.keys.to_regexp_chars}]" \\ - "".freeze - - class_table = { - #{class_table_str} - } - class_table.default = 0 - CLASS_TABLE = class_table.freeze - - DECOMPOSITION_TABLE = { - #{decomposition_table.to_hash_string} - }.freeze - - KOMPATIBLE_TABLE = { - #{kompatible_table.to_hash_string} - }.freeze - - COMPOSITION_TABLE = { - #{composition_table.to_hash_string} - }.freeze -end -MAPPING_TABLE_FILE_END |