-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8.c
230 lines (203 loc) · 5.99 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/* This file is part of the auxiliaries library.
Written by Dick Grune, [email protected]
$Id: utf8.c,v 1.13 2017-12-14 15:39:22 Gebruiker Exp $
*/
#include <stdio.h>
#include <stdlib.h>
#include "utf8.h"
/* Library utf8 source prelude */
#undef _UTF8_CODE_
#ifndef lint
#define _UTF8_CODE_
#endif
#ifdef LIB
#define _UTF8_CODE_
#endif
#ifdef _UTF8_CODE_
/* Library utf8 source code */
/* Although Unicode is usually given in hexadecimal, UTF-8 is easier
understood in octal: UTF-8 has 2 mark bit + 6 data bits to a byte,
octal has 2 data bits + 3 data bits + 3 data bits to a byte.
So we get: 03XX is a leading byte, 02xx is a continuation byte,
and 00xx and 01xx are ASCII characters.
*/
#define BYTE_MASK (0377)
#define MARK_MASK (0300)
#define CONTINUATION_BYTE_MARK (0200)
#define CONTINUATION_BYTE_VALUE (0077)
void
clear_utf8_box(utf8_box *u) {
u->_pos = 0;
u->_saved_pending = 0;
}
/* All returns to the user must either return 0 or go through
finished_sequence() or process_first_byte().
*/
static int
finished_sequence(utf8_box *u, int len) {
/* prepares the finished utf8_box for being passed to the user and then
returns the length of the recognized UTF-8 sequence, correct or not
*/
/* Note: since len may be u->_pos, this routine uses the call-by-value
mechanism in a sneaky way.
*/
if (u->text[0] == '\0') {
/* we implement Modified UTF-8, in which the NULL byte is not
a valid UTF-8 encoding
*/
len = -1;
}
u->_pos = 0;
return len;
}
static int
process_first_byte(utf8_box *u, int byte) {
/* processes the first byte of a UTF-8 sequence or a single ASCII char;
returns +1: done, OK; 0: wait for more; -1 done, not OK
*/
u->_pos = 0; u->text[u->_pos++] = byte; u->text[u->_pos] = '\0';
/* it must either be an ASCII char or a good leading UTF-8 byte */
switch (byte & MARK_MASK) {
case 0000: case 0100:
/* ASCII: we are done */
u->_size = 1;
u->codepoint = byte; /* no sign extension for ASCII */
return finished_sequence(u, 1);
case 0300:
/* leading byte: if it is correct we return 0 */
switch (byte & BYTE_MASK) {
case 0300: case 0301: case 0302: case 0303:
case 0304: case 0305: case 0306: case 0307:
case 0310: case 0311: case 0312: case 0313:
case 0314: case 0315: case 0316: case 0317:
case 0320: case 0321: case 0322: case 0323:
case 0324: case 0325: case 0326: case 0327:
case 0330: case 0331: case 0332: case 0333:
case 0334: case 0335: case 0336: case 0337:
u->_size = 2;
u->codepoint = byte & 0077;
return 0;
case 0340: case 0341: case 0342: case 0343:
case 0344: case 0345: case 0346: case 0347:
case 0350: case 0351: case 0352: case 0353:
case 0354: case 0355: case 0356: case 0357:
u->_size = 3;
u->codepoint = byte & 0037;
return 0;
case 0360: case 0361: case 0362: case 0363:
case 0364: case 0365: case 0366: case 0367:
u->_size = 4;
u->codepoint = byte & 0017;
return 0;
case 0370: case 0371: case 0372: case 0373:
u->_size = 5;
u->codepoint = byte & 0007;
return 0;
case 0374: case 0375:
u->_size = 6;
u->codepoint = byte & 0003;
return 0;
}
default:
/* a drop-out; it is either a continuation byte or
a bad leading byte; either way we return it as garbage
*/
return finished_sequence(u, -1);
}
}
int
box_utf8(char ch, utf8_box *u) {
int byte = ch & BYTE_MASK;
/* is there a byte left over from last time? */
if (u->_saved_pending) {
/* there seems to be a left-over byte but ... */
if (u->_pos != 0) {
/* that can't be; conclusion: uninitialized utf8_box */
clear_utf8_box(u);
} else {
/* the saved byte is genuine */
/* retrieve it from _saved to text */
int b = u->_saved; u->_saved_pending = 0;
/* box it */
int len = box_utf8(b, u);
if (len != 0) {
/* the box is done; save the incoming byte */
u->_saved = byte; u->_saved_pending = 1;
/* and return to user */
return finished_sequence(u, len);
} else {
/* there is more room in the box; continue */
}
}
}
if (u->_pos == 0) {
/* the box is still empty, so this is the first byte */
return process_first_byte(u, byte);
}
/* there is already a part of an UTF8 sequence present, so we need
a continuation byte
*/
switch (byte & MARK_MASK) {
case CONTINUATION_BYTE_MARK:
/* a good continuation byte; incorporate it */
u->text[u->_pos++] = byte; u->text[u->_pos] = '\0';
u->codepoint =
(u->codepoint << 6) | (byte & CONTINUATION_BYTE_VALUE);
if (u->_pos == u->_size) {
/* full UTF8 sequence seen */
return finished_sequence(u, u->_size);
} else {
/* sequence still incomplete; wait for more */
return 0;
}
default:
/* not a continuation byte, so the box cannot be completed;
we save the byte
*/
u->_saved = byte; u->_saved_pending = 1;
/* and return the box as garbage */
return finished_sequence(u, -u->_pos);
}
}
int
is_valid_utf8(utf8_box *u) {
if (u->_pos != 0) return 0; /* unfinished box */
/* since we have the code point, this is easy */
int cp = u->codepoint;
int sz = u->_size;
if (cp == 0) return sz == 2; /* modified UTF-8 exception */
if (cp < 0200) return sz == 1;
if (cp < 04000) return sz == 2;
if (cp < 0154000) return sz == 3;
if (cp < 0160000) return 0; /* UTF-16 surrogate halves (2048) */
if (cp < 0200000) return sz == 3;
if (cp < 04200000) return sz == 4;
return 0; /* too large for Unicode */
}
int
flush_utf8_box(utf8_box *u) {
if (u->_saved_pending) {
/* retrieve _saved to text */
int b = u->_saved; u->_saved_pending = 0;
switch (process_first_byte(u, b)) {
case 1: /* it's a real character */
return finished_sequence(u, 1);
case 0: /* it's a lonely leading byte */
case -1:/* it's garbage; return as garbage */
return finished_sequence(u, -1);
}
}
return finished_sequence(u, -u->_pos); /* u->_pos may be 0 here */
}
/* End library utf8 source code */
#endif /* _UTF8_CODE_ */
#ifdef lint
static void
satisfy_lint(void *x) {
/* calls of all external entries of utf8 */
clear_utf8_box(0);
box_utf8(0, 0);
flush_utf8_box(0);
satisfy_lint(x);
}
#endif /* lint */