/* prism/encoding.h */

/**
 * @file encoding.h
 *
 * The encoding interface and implementations used by the parser.
 */
#ifndef PRISM_ENCODING_H
#define PRISM_ENCODING_H

#include "prism/defines.h"

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/**
 * This struct defines the functions necessary to implement the encoding
 * interface so we can determine how many bytes the subsequent character takes.
 * The width callbacks (char_width, alpha_char, alnum_char) should return the
 * number of bytes, or 0 if the next bytes are invalid for the encoding and
 * type. The isupper_char callback returns a boolean instead of a width.
 */
typedef struct {
    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding. Does not read more than n bytes. It is assumed that n is
     * at least 1.
     */
    size_t (*char_width)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphabetical. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphanumeric. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * Return true if the next character is valid in the encoding and is an
     * uppercase character. Does not read more than n bytes. It is assumed that
     * n is at least 1.
     */
    bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * The name of the encoding. This should correspond to a value that can be
     * passed to Encoding.find in Ruby.
     */
    const char *name;

    /**
     * True if the encoding is a multibyte encoding. This is a plain flag, not
     * a callback.
     */
    bool multibyte;
} pm_encoding_t;

/**
 * All of the lookup tables use the first bit of each embedded byte to indicate
 * whether the codepoint is alphabetical.
 *
 * The replacement list is parenthesized so that the mask keeps its value when
 * the macro is used next to operators that bind tighter than `<<` (for example
 * `+`, `-`, or `*`).
 */
#define PRISM_ENCODING_ALPHABETIC_BIT (1 << 0)

/**
 * All of the lookup tables use the second bit of each embedded byte to indicate
 * whether the codepoint is alphanumeric.
 *
 * The replacement list is parenthesized so that the mask keeps its value when
 * the macro is used next to operators that bind tighter than `<<` (for example
 * `+`, `-`, or `*`).
 */
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)

/**
 * All of the lookup tables use the third bit of each embedded byte to indicate
 * whether the codepoint is uppercase.
 *
 * The replacement list is parenthesized so that the mask keeps its value when
 * the macro is used next to operators that bind tighter than `<<` (for example
 * `+`, `-`, or `*`).
 */
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)

/**
 * Return the size of the next character in the ASCII encoding if it is an
 * alphabetical character. The signature matches the alpha_char callback slot
 * of pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read. Marked
 *     PRISM_ATTRIBUTE_UNUSED in this declaration; presumably the ASCII
 *     implementation only inspects the first byte (confirm against the
 *     definition).
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and alphabetical, or 0 if it is not.
 */
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

/**
 * Return the size of the next character in the ASCII encoding if it is an
 * alphanumeric character. The signature matches the alnum_char callback slot
 * of pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read. Marked
 *     PRISM_ATTRIBUTE_UNUSED in this declaration; presumably the ASCII
 *     implementation only inspects the first byte (confirm against the
 *     definition).
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and alphanumeric, or 0 if it is not.
 */
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

/**
 * Return true if the next character in the ASCII encoding is an uppercase
 * character. The signature matches the isupper_char callback slot of
 * pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read. Marked
 *     PRISM_ATTRIBUTE_UNUSED in this declaration; presumably the ASCII
 *     implementation only inspects the first byte (confirm against the
 *     definition).
 * @returns True if the next character is valid in the encoding and is an
 *     uppercase character, or false if it is not.
 */
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

/**
 * Return the size of the next character in the UTF-8 encoding if it is an
 * alphabetical character. The signature matches the alpha_char callback slot
 * of pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and alphabetical, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);

/**
 * Return the size of the next character in the UTF-8 encoding if it is an
 * alphanumeric character. The signature matches the alnum_char callback slot
 * of pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes that the next character takes if it is valid in
 *     the encoding and alphanumeric, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);

/**
 * Return true if the next character in the UTF-8 encoding is an uppercase
 * character. The signature matches the isupper_char callback slot of
 * pm_encoding_t.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns True if the next character is valid in the encoding and is an
 *     uppercase character, or false if it is not.
 */
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);

/**
 * This lookup table is referenced in both the UTF-8 encoding file and the
 * parser directly in order to speed up the default encoding processing. It is
 * used to indicate whether a character is alphabetical, alphanumeric, or
 * uppercase in unicode mappings.
 *
 * The table holds 256 uint8_t entries; presumably it is indexed by a single
 * byte value (confirm against the definition). Each entry is a set of flags
 * tested with PRISM_ENCODING_ALPHABETIC_BIT, PRISM_ENCODING_ALPHANUMERIC_BIT,
 * and PRISM_ENCODING_UPPERCASE_BIT.
 */
extern const uint8_t pm_encoding_unicode_table[256];

// Below are the encodings that are supported by the parser. They are defined in
// their own files in the src/enc directory. This header only declares them;
// each one is a pm_encoding_t object with external linkage.

extern pm_encoding_t pm_encoding_ascii;
extern pm_encoding_t pm_encoding_ascii_8bit;
extern pm_encoding_t pm_encoding_big5;
extern pm_encoding_t pm_encoding_big5_hkscs;
extern pm_encoding_t pm_encoding_big5_uao;
extern pm_encoding_t pm_encoding_cesu_8;
extern pm_encoding_t pm_encoding_cp51932;
extern pm_encoding_t pm_encoding_cp850;
extern pm_encoding_t pm_encoding_cp852;
extern pm_encoding_t pm_encoding_cp855;
extern pm_encoding_t pm_encoding_cp949;
extern pm_encoding_t pm_encoding_cp950;
extern pm_encoding_t pm_encoding_cp951;
extern pm_encoding_t pm_encoding_emacs_mule;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
extern pm_encoding_t pm_encoding_euc_kr;
extern pm_encoding_t pm_encoding_euc_tw;
extern pm_encoding_t pm_encoding_gb12345;
extern pm_encoding_t pm_encoding_gb18030;
extern pm_encoding_t pm_encoding_gb1988;
extern pm_encoding_t pm_encoding_gb2312;
extern pm_encoding_t pm_encoding_gbk;
extern pm_encoding_t pm_encoding_ibm437;
extern pm_encoding_t pm_encoding_ibm720;
extern pm_encoding_t pm_encoding_ibm737;
extern pm_encoding_t pm_encoding_ibm775;
extern pm_encoding_t pm_encoding_ibm852;
extern pm_encoding_t pm_encoding_ibm855;
extern pm_encoding_t pm_encoding_ibm857;
extern pm_encoding_t pm_encoding_ibm860;
extern pm_encoding_t pm_encoding_ibm861;
extern pm_encoding_t pm_encoding_ibm862;
extern pm_encoding_t pm_encoding_ibm863;
extern pm_encoding_t pm_encoding_ibm864;
extern pm_encoding_t pm_encoding_ibm865;
extern pm_encoding_t pm_encoding_ibm866;
extern pm_encoding_t pm_encoding_ibm869;
extern pm_encoding_t pm_encoding_iso_8859_1;
extern pm_encoding_t pm_encoding_iso_8859_2;
extern pm_encoding_t pm_encoding_iso_8859_3;
extern pm_encoding_t pm_encoding_iso_8859_4;
extern pm_encoding_t pm_encoding_iso_8859_5;
extern pm_encoding_t pm_encoding_iso_8859_6;
extern pm_encoding_t pm_encoding_iso_8859_7;
extern pm_encoding_t pm_encoding_iso_8859_8;
extern pm_encoding_t pm_encoding_iso_8859_9;
extern pm_encoding_t pm_encoding_iso_8859_10;
extern pm_encoding_t pm_encoding_iso_8859_11;
extern pm_encoding_t pm_encoding_iso_8859_13;
extern pm_encoding_t pm_encoding_iso_8859_14;
extern pm_encoding_t pm_encoding_iso_8859_15;
extern pm_encoding_t pm_encoding_iso_8859_16;
extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_koi8_u;
extern pm_encoding_t pm_encoding_mac_cent_euro;
extern pm_encoding_t pm_encoding_mac_croatian;
extern pm_encoding_t pm_encoding_mac_cyrillic;
extern pm_encoding_t pm_encoding_mac_greek;
extern pm_encoding_t pm_encoding_mac_iceland;
extern pm_encoding_t pm_encoding_mac_japanese;
extern pm_encoding_t pm_encoding_mac_roman;
extern pm_encoding_t pm_encoding_mac_romania;
extern pm_encoding_t pm_encoding_mac_thai;
extern pm_encoding_t pm_encoding_mac_turkish;
extern pm_encoding_t pm_encoding_mac_ukraine;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_sjis_docomo;
extern pm_encoding_t pm_encoding_sjis_kddi;
extern pm_encoding_t pm_encoding_sjis_softbank;
extern pm_encoding_t pm_encoding_stateless_iso_2022_jp;
extern pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi;
extern pm_encoding_t pm_encoding_tis_620;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
extern pm_encoding_t pm_encoding_utf8_docomo;
extern pm_encoding_t pm_encoding_utf8_kddi;
extern pm_encoding_t pm_encoding_utf8_softbank;
extern pm_encoding_t pm_encoding_windows_1250;
extern pm_encoding_t pm_encoding_windows_1251;
extern pm_encoding_t pm_encoding_windows_1252;
extern pm_encoding_t pm_encoding_windows_1253;
extern pm_encoding_t pm_encoding_windows_1254;
extern pm_encoding_t pm_encoding_windows_1255;
extern pm_encoding_t pm_encoding_windows_1256;
extern pm_encoding_t pm_encoding_windows_1257;
extern pm_encoding_t pm_encoding_windows_1258;
extern pm_encoding_t pm_encoding_windows_31j;
extern pm_encoding_t pm_encoding_windows_874;

#endif