This source file includes the following definitions.
- unicode_name_word
- unicode_name_word_lookup
- unicode_code_to_index
- unicode_index_to_code
- unicode_character_name
- unicode_name_character
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 #include <config.h>
26
27
28 #include "uniname.h"
29
30 #include <assert.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <string.h>
35
36 #include "attribute.h"
37
/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized so that any array-valued expression may be
   passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
39
40
41
42
43
44
45
46
47
48 #include "uninames.h"
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66 static const char *
67 unicode_name_word (unsigned int index, unsigned int *lengthp)
68 {
69 unsigned int i1;
70 unsigned int i2;
71
72 assert (index < UNICODE_CHARNAME_NUM_WORDS);
73
74
75
76
77
78
79
80 i1 = 0;
81 i2 = SIZEOF (unicode_name_by_length) - 1;
82 while (i2 - i1 > 1)
83 {
84 unsigned int i = (i1 + i2) >> 1;
85 if (unicode_name_by_length[i].ind_offset <= index)
86 i1 = i;
87 else
88 i2 = i;
89 }
90 unsigned int i = i1;
91 assert (unicode_name_by_length[i].ind_offset <= index
92 && index < unicode_name_by_length[i+1].ind_offset);
93 *lengthp = i;
94 return &unicode_name_words[unicode_name_by_length[i].extra_offset
95 + (index-unicode_name_by_length[i].ind_offset)*i];
96 }
97
98
99 static int
100 unicode_name_word_lookup (const char *word, size_t length)
101 {
102 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
103 {
104
105 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
106 unsigned int i0 = unicode_name_by_length[length].ind_offset;
107 unsigned int i1 = i0;
108 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
109 while (i2 - i1 > 0)
110 {
111 unsigned int i = (i1 + i2) >> 1;
112 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
113 const char *w = word;
114 unsigned int n = length;
115 for (;;)
116 {
117 if (*p < *w)
118 {
119 if (i1 == i)
120 return -1;
121
122 i1 = i;
123 break;
124 }
125 if (*p > *w)
126 {
127
128 i2 = i;
129 break;
130 }
131 p++; w++; n--;
132 if (n == 0)
133 return i;
134 }
135 }
136 }
137 return -1;
138 }
139
/* Value returned by unicode_code_to_index when the code point lies in none
   of the entries of unicode_ranges.  */
#define UNINAME_INVALID_INDEX UINT16_MAX
141
142
143 static uint16_t
144 unicode_code_to_index (ucs4_t c)
145 {
146
147 unsigned int i1 = 0;
148 unsigned int i2 = SIZEOF (unicode_ranges);
149
150 for (;;)
151 {
152 unsigned int i = (i1 + i2) >> 1;
153 ucs4_t start_code =
154 unicode_ranges[i].index + unicode_ranges[i].gap;
155 ucs4_t end_code =
156 start_code + unicode_ranges[i].length - 1;
157
158 if (start_code <= c && c <= end_code)
159 return c - unicode_ranges[i].gap;
160
161 if (end_code < c)
162 {
163 if (i1 == i)
164 break;
165
166 i1 = i;
167 }
168 else if (c < start_code)
169 {
170 if (i2 == i)
171 break;
172
173 i2 = i;
174 }
175 }
176 return UNINAME_INVALID_INDEX;
177 }
178
179
180
181 static ucs4_t
182 unicode_index_to_code (uint16_t index)
183 {
184
185 unsigned int i1 = 0;
186 unsigned int i2 = SIZEOF (unicode_ranges);
187
188 for (;;)
189 {
190 unsigned int i = (i1 + i2) >> 1;
191 uint16_t start_index = unicode_ranges[i].index;
192 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
193
194 if (start_index <= index && index <= end_index)
195 return index + unicode_ranges[i].gap;
196
197 if (end_index < index)
198 {
199 if (i1 == i)
200 break;
201
202 i1 = i;
203 }
204 else if (index < start_index)
205 {
206 if (i2 == i)
207 break;
208
209 i2 = i;
210 }
211 }
212 return UNINAME_INVALID;
213 }
214
215
216
217
/* Short names of the 19 initial jamo of a Hangul syllable, indexed by
   (c - 0xAC00) / (21 * 28).  Entry 11 is the empty string.  */
static const char jamo_initial_short_name[19][3] =
{
  /*  0 ..  4 */ "G", "GG", "N", "D", "DD",
  /*  5 ..  9 */ "R", "M", "B", "BB", "S",
  /* 10 .. 14 */ "SS", "", "J", "JJ", "C",
  /* 15 .. 18 */ "K", "T", "P", "H"
};
/* Short names of the 21 medial jamo of a Hangul syllable, indexed by
   ((c - 0xAC00) / 28) % 21.  */
static const char jamo_medial_short_name[21][4] =
{
  /*  0 ..  4 */ "A", "AE", "YA", "YAE", "EO",
  /*  5 ..  9 */ "E", "YEO", "YE", "O", "WA",
  /* 10 .. 14 */ "WAE", "OE", "YO", "U", "WEO",
  /* 15 .. 19 */ "WE", "WI", "YU", "EU", "YI",
  /* 20       */ "I"
};
/* Short names of the 28 final jamo of a Hangul syllable, indexed by
   (c - 0xAC00) % 28.  Entry 0 is the empty string (no final consonant).
   Entry 5 is "NJ", the short name that Unicode's Jamo.txt assigns to
   U+11AC, so that e.g. U+AC05 is named "HANGUL SYLLABLE GANJ".  */
static const char jamo_final_short_name[28][3] =
{
  /*  0 ..  4 */ "", "G", "GG", "GS", "N",
  /*  5 ..  9 */ "NJ", "NH", "D", "L", "LG",
  /* 10 .. 14 */ "LM", "LB", "LS", "LT", "LP",
  /* 15 .. 19 */ "LH", "M", "B", "BS", "S",
  /* 20 .. 24 */ "SS", "NG", "J", "C", "K",
  /* 25 .. 27 */ "T", "P", "H"
};
233
234
235
236 char *
237 unicode_character_name (ucs4_t c, char *buf)
238 {
239 if (c >= 0xAC00 && c <= 0xD7A3)
240 {
241
242 char *ptr;
243 unsigned int tmp;
244 unsigned int index1;
245 unsigned int index2;
246 unsigned int index3;
247 const char *q;
248
249
250 memcpy (buf, "HANGUL SYLLABLE ", 16);
251 ptr = buf + 16;
252
253 tmp = c - 0xAC00;
254 index3 = tmp % 28; tmp = tmp / 28;
255 index2 = tmp % 21; tmp = tmp / 21;
256 index1 = tmp;
257
258 q = jamo_initial_short_name[index1];
259 while (*q != '\0')
260 *ptr++ = *q++;
261 q = jamo_medial_short_name[index2];
262 while (*q != '\0')
263 *ptr++ = *q++;
264 q = jamo_final_short_name[index3];
265 while (*q != '\0')
266 *ptr++ = *q++;
267 *ptr = '\0';
268 return buf;
269 }
270 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
271 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
272 {
273
274
275 char *ptr;
276 int i;
277
278
279 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
280 ptr = buf + 28;
281
282 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
283 {
284 unsigned int x = (c >> i) & 0xf;
285 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
286 }
287 *ptr = '\0';
288 return buf;
289 }
290 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
291 {
292
293
294
295
296 sprintf (buf, "VARIATION SELECTOR-%d",
297 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
298 return buf;
299 }
300 else
301 {
302 uint16_t index = unicode_code_to_index (c);
303 const uint16_t *words = NULL;
304
305 if (index != UNINAME_INVALID_INDEX)
306 {
307
308 unsigned int i1 = 0;
309 unsigned int i2 = SIZEOF (unicode_index_to_name);
310 for (;;)
311 {
312 unsigned int i = (i1 + i2) >> 1;
313 if (unicode_index_to_name[i].index == index)
314 {
315 words = &unicode_names[unicode_index_to_name[i].name];
316 break;
317 }
318 else if (unicode_index_to_name[i].index < index)
319 {
320 if (i1 == i)
321 {
322 words = NULL;
323 break;
324 }
325
326 i1 = i;
327 }
328 else if (unicode_index_to_name[i].index > index)
329 {
330 if (i2 == i)
331 {
332 words = NULL;
333 break;
334 }
335
336 i2 = i;
337 }
338 }
339 }
340 if (words != NULL)
341 {
342
343
344 char *ptr = buf;
345 for (;;)
346 {
347 unsigned int wordlen;
348 const char *word = unicode_name_word (*words>>1, &wordlen);
349 do
350 *ptr++ = *word++;
351 while (--wordlen > 0);
352 if ((*words & 1) == 0)
353 break;
354 *ptr++ = ' ';
355 words++;
356 }
357 *ptr = '\0';
358 return buf;
359 }
360 return NULL;
361 }
362 }
363
364
365
366 ucs4_t
367 unicode_name_character (const char *name)
368 {
369 size_t len = strlen (name);
370 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
371 {
372
373 char buf[UNICODE_CHARNAME_MAX_LENGTH];
374 char *ptr = buf;
375 for (;;)
376 {
377 char c = *name++;
378 if (!(c >= ' ' && c <= '~'))
379 break;
380 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
381 if (--len == 0)
382 goto filled_buf;
383 }
384 if (false)
385 filled_buf:
386 {
387 {
388
389
390 const char *p1 = buf;
391 if (ptr >= buf + 3 && *p1++ == 'V')
392 {
393 if (*p1++ == 'S')
394 {
395 if (*p1 != '0')
396 {
397 unsigned int c = 0;
398 for (;;)
399 {
400 if (*p1 >= '0' && *p1 <= '9')
401 c += (*p1 - '0');
402 p1++;
403 if (p1 == ptr)
404 {
405 if (c >= 1 && c <= 16)
406 return c - 1 + 0xFE00;
407 else if (c >= 17 && c <= 256)
408 return c - 17 + 0xE0100;
409 else
410 break;
411 }
412 c = c * 10;
413 }
414 }
415 }
416 }
417 }
418 {
419
420 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
421 uint16_t *wordptr = words;
422 {
423 const char *p1 = buf;
424 for (;;)
425 {
426 {
427 int word;
428 const char *p2 = p1;
429 while (p2 < ptr && *p2 != ' ')
430 p2++;
431 word = unicode_name_word_lookup (p1, p2 - p1);
432 if (word < 0)
433 break;
434 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
435 break;
436 *wordptr++ = word;
437 if (p2 == ptr)
438 goto filled_words;
439 p1 = p2 + 1;
440 }
441
442 if (wordptr == &words[2]
443 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
444 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
445 {
446
447
448
449
450
451 const char *p2;
452 const char *p3;
453 const char *p4;
454
455 p2 = p1;
456 while (p2 < ptr
457 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
458 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
459 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
460 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
461 || *p2 == 'T'))
462 p2++;
463 p3 = p2;
464 while (p3 < ptr
465 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
466 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
467 || *p3 == 'Y'))
468 p3++;
469 p4 = p3;
470 while (p4 < ptr
471 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
472 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
473 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
474 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
475 || *p4 == 'S' || *p4 == 'T'))
476 p4++;
477 if (p4 == ptr)
478 {
479 size_t n1 = p2 - p1;
480 size_t n2 = p3 - p2;
481 size_t n3 = p4 - p3;
482
483 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
484 {
485 unsigned int index1;
486
487 for (index1 = 0; index1 < 19; index1++)
488 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
489 && jamo_initial_short_name[index1][n1] == '\0')
490 {
491 unsigned int index2;
492
493 for (index2 = 0; index2 < 21; index2++)
494 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
495 && jamo_medial_short_name[index2][n2] == '\0')
496 {
497 unsigned int index3;
498
499 for (index3 = 0; index3 < 28; index3++)
500 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
501 && jamo_final_short_name[index3][n3] == '\0')
502 {
503 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
504 }
505 break;
506 }
507 break;
508 }
509 }
510 }
511 }
512
513
514 if (wordptr == &words[2]
515 && words[0] == UNICODE_CHARNAME_WORD_CJK
516 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
517 && p1 + 14 <= ptr
518 && p1 + 15 >= ptr
519 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
520 {
521 const char *p2 = p1 + 10;
522
523 if (*p2 != '0')
524 {
525 unsigned int c = 0;
526
527 for (;;)
528 {
529 if (*p2 >= '0' && *p2 <= '9')
530 c += (*p2 - '0');
531 else if (*p2 >= 'A' && *p2 <= 'F')
532 c += (*p2 - 'A' + 10);
533 else
534 break;
535 p2++;
536 if (p2 == ptr)
537 {
538 if ((c >= 0xF900 && c <= 0xFA2D)
539 || (c >= 0xFA30 && c <= 0xFA6A)
540 || (c >= 0xFA70 && c <= 0xFAD9)
541 || (c >= 0x2F800 && c <= 0x2FA1D))
542 return c;
543 else
544 break;
545 }
546 c = c << 4;
547 }
548 }
549 }
550
551
552 if (wordptr == &words[1]
553 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
554 && p1 + 10 <= ptr
555 && p1 + 12 >= ptr
556 && memcmp (p1, "SELECTOR-", 9) == 0)
557 {
558 const char *p2 = p1 + 9;
559
560 if (*p2 != '0')
561 {
562 unsigned int c = 0;
563
564 for (;;)
565 {
566 if (*p2 >= '0' && *p2 <= '9')
567 c += (*p2 - '0');
568 p2++;
569 if (p2 == ptr)
570 {
571 if (c >= 1 && c <= 16)
572 return c - 1 + 0xFE00;
573 else if (c >= 17 && c <= 256)
574 return c - 17 + 0xE0100;
575 else
576 break;
577 }
578 c = c * 10;
579 }
580 }
581 }
582 }
583 }
584 if (false)
585 filled_words:
586 {
587
588 size_t words_length = wordptr - words;
589 {
590 size_t i = words_length - 1;
591 words[i] = 2 * words[i];
592 for (; i > 0; )
593 {
594 --i;
595 words[i] = 2 * words[i] + 1;
596 }
597 }
598
599 {
600 unsigned int i1 = 0;
601 unsigned int i2 = SIZEOF (unicode_name_to_index);
602 for (;;)
603 {
604 unsigned int i = (i1 + i2) >> 1;
605 const uint16_t *w = words;
606 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
607 size_t n = words_length;
608 for (;;)
609 {
610 if (*p < *w)
611 {
612 if (i1 == i)
613 goto name_not_found;
614
615 i1 = i;
616 break;
617 }
618 else if (*p > *w)
619 {
620 if (i2 == i)
621 goto name_not_found;
622
623 i2 = i;
624 break;
625 }
626 p++; w++; n--;
627 if (n == 0)
628 return unicode_index_to_code (unicode_name_to_index[i].index);
629 }
630 }
631 }
632 name_not_found: ;
633 }
634 }
635 }
636 }
637 return UNINAME_INVALID;
638 }