-
Notifications
You must be signed in to change notification settings - Fork 401
/
Copy pathtext_normalizer.cc
232 lines (210 loc) · 8.32 KB
/
text_normalizer.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// Copyright 2010-2021, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "base/text_normalizer.h"
#include <array>
#include <cstdint>
#include <string>
#include <utility>
#include "absl/strings/string_view.h"
#include "base/util.h"
namespace mozc {
namespace {
// Unicode vendor-specific character table:
// http://hp.vector.co.jp/authors/VA010341/unicode/
// http://www.notoinsatu.co.jp/font/omake/OTF_other.pdf
//
// Example: WAVE DASH / FULLWIDTH TILDE
// http://ja.wikipedia.org/wiki/%E6%B3%A2%E3%83%80%E3%83%83%E3%82%B7%E3%83%A5
// Windows CP932 (Shift_JIS) maps WAVE DASH to FULLWIDTH TILDE.
// Since the font of WAVE DASH is ugly on Windows, here we convert WAVE DASH to
// FULLWIDTH TILDE as CP932 does.
//
// As Unicode has become the de facto default encoding, we have reduced
// the number of characters to be normalized.
//
// Returns the replacement code point for `c`: WAVE DASH (U+301C) becomes
// FULLWIDTH TILDE (U+FF5E) and MINUS SIGN (U+2212) becomes FULLWIDTH
// HYPHEN-MINUS (U+FF0D). Every other code point is returned unchanged.
inline char32_t NormalizeCharForWindows(char32_t c) {
  if (c == 0x301C) {  // WAVE DASH
    return 0xFF5E;    // FULLWIDTH TILDE
  }
  if (c == 0x2212) {  // MINUS SIGN
    return 0xFF0D;    // FULLWIDTH HYPHEN-MINUS
  }
  return c;
}
// Converts a CJK compatibility character to its standardized variation
// sequence (SVS): a base character followed by a variation selector.
//
// Returns {base codepoint, selector} where the selector is 0xFE00 or 0xFE01
// when `cjk_compat_char` is covered by the tables below. Returns {0, 0} when
// there is no conversion; callers detect that case with `first == 0`.
std::pair<int, int> ConvertJaCjkCompatToSvs(char32_t cjk_compat_char) {
  constexpr std::pair<int, int> no_value = {0, 0};

  // Characters that cannot be served by `conv_table`: they either sit below
  // 0xFA10 or need a base codepoint wider than 16 bits.
  struct Exception {
    char32_t compat;  // codepoint of CJK compatibility character.
    int base;         // codepoint of SVS base character.
  };
  constexpr std::array<Exception, 8> exceptions = {{
      {0xF91D, 0x6B04},   // {欄, 欄} defined in KS X 1001.
      {0xF928, 0x5ECA},   // {廊, 廊} defined in KS X 1001.
      {0xF929, 0x6717},   // {朗, 朗} defined in KS X 1001.
      {0xF936, 0x865C},   // {虜, 虜} defined in KS X 1001.
      {0xF970, 0x6BBA},   // {殺, 殺} defined in KS X 1001.
      {0xF9D0, 0x985E},   // {類, 類} defined in KS X 1001.
      {0xF9DC, 0x9686},   // {隆, 隆} defined in KS X 1001.
      {0xFA6C, 0x242EE},  // {𤋮, 𤋮} value is more than 16bits.
  }};

  // value: codepoint of CJK compatibility character
  //        to be converted to SVS character with FE01.
  // Each of these shares its base with another compatibility character that
  // already uses FE00 (e.g. FA25 → 9038 FE00, FA67 → 9038 FE01).
  constexpr std::array<uint16_t, 3> fe01_chars = {
      0xFA57,  // 練 → 7DF4 FE01 練
      0xFA5E,  // 艹 → 8279 FE01 艹
      0xFA67,  // 逸 → 9038 FE01 逸
  };

  // index: codepoint of CJK compatibility character - 0xFA10.
  // value: codepoint of SVS base character. 0x0000 means "no conversion".
  constexpr std::array<uint16_t, 94> conv_table = {
      // FA10
      // 塚 (﨑) 晴 (﨓) (﨔) 凞 猪 益
      0x585A, 0x0000, 0x6674, 0x0000, 0x0000, 0x51DE, 0x732A, 0x76CA,
      // FA18
      // 礼 神 祥 福 靖 精 羽 (﨟)
      0x793C, 0x795E, 0x7965, 0x798F, 0x9756, 0x7CBE, 0x7FBD, 0x0000,
      // FA20
      // 蘒 (﨡) 諸 (﨣) (﨤) 逸 都 (﨧)
      0x8612, 0x0000, 0x8AF8, 0x0000, 0x0000, 0x9038, 0x90FD, 0x0000,
      // FA28
      // (﨨) (﨩) 飯 飼 館 鶴 (郞) (隷)
      0x0000, 0x0000, 0x98EF, 0x98FC, 0x9928, 0x9DB4, 0x90DE, 0x96B7,
      // FA30
      // 侮 僧 免 勉 勤 卑 喝 嘆
      0x4FAE, 0x50E7, 0x514D, 0x52C9, 0x52E4, 0x5351, 0x559D, 0x5606,
      // FA38
      // 器 塀 墨 層 屮 悔 慨 憎
      0x5668, 0x5840, 0x58A8, 0x5C64, 0x5C6E, 0x6094, 0x6168, 0x618E,
      // FA40
      // 懲 敏 既 暑 梅 海 渚 漢
      0x61F2, 0x654F, 0x65E2, 0x6691, 0x6885, 0x6D77, 0x6E1A, 0x6F22,
      // FA48
      // 煮 爫 琢 碑 社 祉 祈 祐
      0x716E, 0x722B, 0x7422, 0x7891, 0x793E, 0x7949, 0x7948, 0x7950,
      // FA50
      // 祖 祝 禍 禎 穀 突 節 練
      0x7956, 0x795D, 0x798D, 0x798E, 0x7A40, 0x7A81, 0x7BC0, 0x7DF4,
      // FA58
      // 縉 繁 署 者 臭 艹 艹 著
      0x7E09, 0x7E41, 0x7F72, 0x8005, 0x81ED, 0x8279, 0x8279, 0x8457,
      // FA60
      // 褐 視 謁 謹 賓 贈 辶 逸
      0x8910, 0x8996, 0x8B01, 0x8B39, 0x8CD3, 0x8D08, 0x8FB6, 0x9038,
      // FA68 - FA6D
      // 難 響 頻 恵 (𤋮) 舘
      0x96E3, 0x97FF, 0x983B, 0x6075, 0x0000, 0x8218,
  };

  // If the char is out of all data ranges, return `no_value`.
  //   exceptions: 0xF91D - 0xFA6C
  //   fe01_chars: 0xFA57 - 0xFA67
  //   conv_table: 0xFA10 - 0xFA6D
  if (cjk_compat_char < 0xF91D || cjk_compat_char > 0xFA6D) {
    return no_value;
  }

  // Check the value in `exceptions` first; every entry there uses FE00.
  for (const Exception &entry : exceptions) {
    if (cjk_compat_char == entry.compat) {
      return std::make_pair(entry.base, 0xFE00);
    }
  }

  // Check if the char is below `conv_table`. The upper bound was already
  // checked above, so the index computed next is always within the table.
  if (cjk_compat_char < 0xFA10) {
    return no_value;
  }

  // Get the value from the table.
  const int svs_base = conv_table[cjk_compat_char - 0xFA10];
  if (svs_base == 0) {
    return no_value;
  }

  // Check if the SVS selector is FE01. In most cases, it's FE00.
  // There is no case of 0xFE02 or more for Japanese CJK compatibility chars.
  int svs_extend = 0xFE00;
  for (const uint16_t fe01_char : fe01_chars) {
    if (cjk_compat_char == fe01_char) {
      svs_extend = 0xFE01;
      break;
    }
  }
  return std::make_pair(svs_base, svs_extend);
}
// Walks `input` with ConstChar32Iterator, passes each value it yields through
// NormalizeCharForWindows(), and appends the result to the returned string
// via Util::CodepointToUtf8Append().
std::string NormalizeTextForWindows(absl::string_view input) {
  std::string normalized;
  ConstChar32Iterator it(input);
  while (!it.Done()) {
    Util::CodepointToUtf8Append(NormalizeCharForWindows(it.Get()),
                                &normalized);
    it.Next();
  }
  return normalized;
}
} // namespace
// Normalizes `input` according to `flag`.
//
// kDefault is resolved at compile time: it means kAll when _WIN32 is defined
// and kNone otherwise. Only kAll triggers NormalizeTextForWindows(); any
// other flag value returns a plain copy of `input`.
std::string TextNormalizer::NormalizeTextWithFlag(absl::string_view input,
                                                  TextNormalizer::Flag flag) {
#ifdef _WIN32
  const TextNormalizer::Flag platform_flag = TextNormalizer::kAll;
#else   // _WIN32
  const TextNormalizer::Flag platform_flag = TextNormalizer::kNone;
#endif  // _WIN32
  const TextNormalizer::Flag effective_flag =
      (flag == TextNormalizer::kDefault) ? platform_flag : flag;

  if (effective_flag == TextNormalizer::kAll) {
    return NormalizeTextForWindows(input);
  }
  return std::string(input.data(), input.size());
}
// Rewrites every codepoint of `input` for which ConvertJaCjkCompatToSvs()
// yields a non-zero base into the two-codepoint sequence {base, selector}.
//
// Returns true and stores the converted text in `*output` when at least one
// codepoint was replaced. Returns false and leaves `*output` untouched when
// nothing needed replacing.
bool TextNormalizer::NormalizeTextToSvs(absl::string_view input,
                                        std::string *output) {
  const std::u32string source = Util::Utf8ToUtf32(input);
  std::u32string converted;
  bool replaced_any = false;
  for (const char32_t codepoint : source) {
    const std::pair<int, int> sequence = ConvertJaCjkCompatToSvs(codepoint);
    if (sequence.first != 0) {
      // A conversion exists: emit the base followed by its selector.
      converted.push_back(static_cast<char32_t>(sequence.first));
      converted.push_back(static_cast<char32_t>(sequence.second));
      replaced_any = true;
      continue;
    }
    converted.push_back(codepoint);
  }
  if (replaced_any) {
    *output = Util::Utf32ToUtf8(converted);
  }
  return replaced_any;
}
// Convenience overload: returns the SVS-normalized text when the two-argument
// overload reports a change, and a copy of `input` otherwise.
std::string TextNormalizer::NormalizeTextToSvs(absl::string_view input) {
  std::string result;
  if (!NormalizeTextToSvs(input, &result)) {
    // Nothing was replaced, so hand back the input as-is.
    result.assign(input.data(), input.size());
  }
  return result;
}
} // namespace mozc