This source file includes the following definitions.
- str_unichar_iscombiningmark
- str_utf8_insert_replace_char
- str_utf8_is_valid_string
- str_utf8_is_valid_char
- str_utf8_cnext_char
- str_utf8_cprev_char
- str_utf8_cnext_char_safe
- str_utf8_cprev_char_safe
- str_utf8_fix_string
- str_utf8_isspace
- str_utf8_ispunct
- str_utf8_isalnum
- str_utf8_isdigit
- str_utf8_isprint
- str_utf8_iscombiningmark
- str_utf8_cnext_noncomb_char
- str_utf8_cprev_noncomb_char
- str_utf8_toupper
- str_utf8_tolower
- str_utf8_length
- str_utf8_length2
- str_utf8_length_noncomb
- str_utf8_questmark_sustb
- str_utf8_conv_gerror_message
- str_utf8_vfs_convert_to
- str_utf8_make_make_term_form
- str_utf8_term_form
- utf8_tool_copy_chars_to_end
- utf8_tool_copy_chars_to
- utf8_tool_insert_space
- utf8_tool_insert_char
- utf8_tool_skip_chars_to
- utf8_tool_compose
- str_utf8_fit_to_term
- str_utf8_term_trim
- str_utf8_term_width2
- str_utf8_term_width1
- str_utf8_term_char_width
- str_utf8_term_substring
- str_utf8_trunc
- str_utf8_offset_to_pos
- str_utf8_column_to_pos
- str_utf8_create_search_needle
- str_utf8_release_search_needle
- str_utf8_search_first
- str_utf8_search_last
- str_utf8_normalize
- str_utf8_casefold_normalize
- str_utf8_compare
- str_utf8_ncompare
- str_utf8_casecmp
- str_utf8_ncasecmp
- str_utf8_prefix
- str_utf8_caseprefix
- str_utf8_create_key_gen
- str_utf8_create_key
- str_utf8_create_key_for_filename
- str_utf8_key_collate
- str_utf8_release_key
- str_utf8_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <limits.h>
31 #include <string.h>
32
33 #include "lib/global.h"
34 #include "lib/strutil.h"
35
36
37
38
39
40
41
42
43
/* Cursor state shared by the utf8_tool_* helpers while they build an output string. */
struct utf8_tool
{
    char *actual;               /* next write position in the output buffer */
    size_t remain;              /* bytes still available at 'actual' */
    const char *checked;        /* next read position in the source text */
    int ident;                  /* column counter advanced by the copy/skip helpers */
    gboolean compose;           /* set to TRUE once a combining mark has been copied */
};
52
/* Terminal-ready rendition of a string, as produced by str_utf8_make_make_term_form(). */
struct term_form
{
    char text[BUF_MEDIUM * MB_LEN_MAX]; /* NUL-terminated converted text */
    size_t width;               /* number of terminal columns counted for 'text' */
    gboolean compose;           /* TRUE if 'text' contains a combining mark */
};
59
60
61
62
63
/* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, substituted for invalid input */
static const char replch[] = "\xEF\xBF\xBD";
65
66
67
68
69
70 static gboolean
71 str_unichar_iscombiningmark (gunichar uni)
72 {
73 GUnicodeType type;
74
75 type = g_unichar_type (uni);
76 return (type == G_UNICODE_SPACING_MARK) || (type == G_UNICODE_ENCLOSING_MARK)
77 || (type == G_UNICODE_NON_SPACING_MARK);
78 }
79
80
81
82 static void
83 str_utf8_insert_replace_char (GString *buffer)
84 {
85 g_string_append (buffer, replch);
86 }
87
88
89
90 static gboolean
91 str_utf8_is_valid_string (const char *text)
92 {
93 return g_utf8_validate (text, -1, NULL);
94 }
95
96
97
98 static int
99 str_utf8_is_valid_char (const char *ch, size_t size)
100 {
101 switch (g_utf8_get_char_validated (ch, size))
102 {
103 case (gunichar) (-2):
104 return (-2);
105 case (gunichar) (-1):
106 return (-1);
107 default:
108 return 1;
109 }
110 }
111
112
113
/* Advance *text to the next character; no validation is performed. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_next_char (current);
}
119
120
121
/* Move *text back to the previous character; no validation is performed. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *current = *text;

    *text = g_utf8_prev_char (current);
}
127
128
129
/**
 * Advance *text by one character if a valid one starts there,
 * otherwise by a single byte.
 */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *pos = *text;

    if (str_utf8_is_valid_char (pos, -1) != 1)
    {
        /* invalid sequence: skip just one byte */
        *text = pos + 1;
        return;
    }

    *text = g_utf8_next_char (pos);
}
138
139
140
/**
 * Move *text back by one character. The candidate found by
 * g_utf8_prev_char() is accepted only if stepping forward from it lands
 * exactly on the original position; otherwise go back a single byte.
 */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *candidate = g_utf8_prev_char (*text);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);
    *text = (probe == *text) ? candidate : *text - 1;
}
154
155
156
157 static void
158 str_utf8_fix_string (char *text)
159 {
160 while (text[0] != '\0')
161 {
162 gunichar uni;
163
164 uni = g_utf8_get_char_validated (text, -1);
165 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
166 text = g_utf8_next_char (text);
167 else
168 {
169 text[0] = '?';
170 text++;
171 }
172 }
173 }
174
175
176
177 static gboolean
178 str_utf8_isspace (const char *text)
179 {
180 gunichar uni;
181
182 uni = g_utf8_get_char_validated (text, -1);
183 return g_unichar_isspace (uni);
184 }
185
186
187
188 static gboolean
189 str_utf8_ispunct (const char *text)
190 {
191 gunichar uni;
192
193 uni = g_utf8_get_char_validated (text, -1);
194 return g_unichar_ispunct (uni);
195 }
196
197
198
199 static gboolean
200 str_utf8_isalnum (const char *text)
201 {
202 gunichar uni;
203
204 uni = g_utf8_get_char_validated (text, -1);
205 return g_unichar_isalnum (uni);
206 }
207
208
209
210 static gboolean
211 str_utf8_isdigit (const char *text)
212 {
213 gunichar uni;
214
215 uni = g_utf8_get_char_validated (text, -1);
216 return g_unichar_isdigit (uni);
217 }
218
219
220
221 static gboolean
222 str_utf8_isprint (const char *ch)
223 {
224 gunichar uni;
225
226 uni = g_utf8_get_char_validated (ch, -1);
227 return g_unichar_isprint (uni);
228 }
229
230
231
232 static gboolean
233 str_utf8_iscombiningmark (const char *ch)
234 {
235 gunichar uni;
236
237 uni = g_utf8_get_char_validated (ch, -1);
238 return str_unichar_iscombiningmark (uni);
239 }
240
241
242
/**
 * Advance *text past the current character and any combining marks that
 * follow it.
 *
 * @return number of single-character steps taken (0 at end of string)
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return steps;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
258
259
260
/**
 * Move *text backwards until it rests on a character that is not a
 * combining mark, never going past begin.
 *
 * @return number of single-character steps taken (0 if already at begin)
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return steps;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
276
277
278
279 static gboolean
280 str_utf8_toupper (const char *text, char **out, size_t *remain)
281 {
282 gunichar uni;
283 size_t left;
284
285 uni = g_utf8_get_char_validated (text, -1);
286 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
287 return FALSE;
288
289 uni = g_unichar_toupper (uni);
290 left = g_unichar_to_utf8 (uni, NULL);
291 if (left >= *remain)
292 return FALSE;
293
294 left = g_unichar_to_utf8 (uni, *out);
295 (*out) += left;
296 (*remain) -= left;
297 return TRUE;
298 }
299
300
301
302 static gboolean
303 str_utf8_tolower (const char *text, char **out, size_t *remain)
304 {
305 gunichar uni;
306 size_t left;
307
308 uni = g_utf8_get_char_validated (text, -1);
309 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
310 return FALSE;
311
312 uni = g_unichar_tolower (uni);
313 left = g_unichar_to_utf8 (uni, NULL);
314 if (left >= *remain)
315 return FALSE;
316
317 left = g_unichar_to_utf8 (uni, *out);
318 (*out) += left;
319 (*remain) -= left;
320 return TRUE;
321 }
322
323
324
/**
 * Count the characters in text. Valid stretches are counted with
 * g_utf8_strlen(); each byte at which validation fails counts as one.
 */
static int
str_utf8_length (const char *text)
{
    const char *segment = text;
    const char *stop;
    int count = 0;

    for (;;)
    {
        /* 'stop' receives the end of the valid prefix of 'segment' */
        if (g_utf8_validate (segment, -1, &stop) || segment[0] == '\0')
            break;

        if (stop != segment)
            count += g_utf8_strlen (segment, stop - segment);

        /* the offending byte itself */
        count++;
        segment = stop + 1;
    }

    /* nothing invalid was found: count the whole string in one go */
    if (segment == text)
        return g_utf8_strlen (text, -1);

    if (segment[0] != '\0' && segment != stop)
        count += g_utf8_strlen (segment, stop - segment);

    return count;
}
349
350
351
/**
 * Like str_utf8_length(), but with a byte budget: 'size' is passed as the
 * byte limit to g_utf8_strlen() and is decreased as the text is consumed.
 */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *segment = text;
    const char *stop;
    int count = 0;

    for (;;)
    {
        /* validation runs first so that 'stop' always matches 'segment' */
        if (g_utf8_validate (segment, -1, &stop) || segment[0] == '\0' || size <= 0)
            break;

        if (stop != segment)
        {
            count += g_utf8_strlen (segment, MIN (stop - segment, size));
            size -= stop - segment;
        }

        /* the invalid byte counts only while budget is left */
        count += (size > 0);
        size--;
        segment = stop + 1;
    }

    if (segment == text)
        return g_utf8_strlen (text, size);

    if (segment[0] != '\0' && segment != stop && size > 0)
        count += g_utf8_strlen (segment, MIN (stop - segment, size));

    return count;
}
379
380
381
/* Count the steps str_utf8_cnext_noncomb_char() needs to reach the end of text. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *pos = text;
    int count;

    for (count = 0; pos[0] != '\0'; count++)
        str_utf8_cnext_noncomb_char (&pos);

    return count;
}
396
397
398
#if 0
/**
 * Compiled out. Skip one character of *string, reduce *left by its byte
 * length and append '?' to buffer.
 */
static void
str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
{
    char *const following = g_utf8_next_char (*string);

    *left -= following - *string;
    *string = following;
    g_string_append_c (buffer, '?');
}
#endif
411
412
413
414 static gchar *
415 str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
416 {
417 if (mcerror != NULL)
418 return g_strdup (mcerror->message);
419
420 return g_strdup (def_msg != NULL ? def_msg : "");
421 }
422
423
424
425 static estr_t
426 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
427 {
428 estr_t result = ESTR_SUCCESS;
429
430 if (coder == str_cnv_not_convert)
431 g_string_append_len (buffer, string, size);
432 else
433 result = str_nconvert (coder, string, size, buffer);
434
435 return result;
436 }
437
438
439
440
441
442 static const struct term_form *
443 str_utf8_make_make_term_form (const char *text, const ssize_t width)
444 {
445 static struct term_form result;
446 size_t width1;
447 gunichar uni;
448 size_t left;
449 char *actual;
450
451 width1 = width < 0 ? SIZE_MAX : (size_t) width;
452
453 result.text[0] = '\0';
454 result.width = 0;
455 result.compose = FALSE;
456
457 if (width1 == 0 || text[0] == '\0')
458 return &result;
459
460 actual = result.text;
461
462
463
464 uni = g_utf8_get_char_validated (text, -1);
465 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)) && str_unichar_iscombiningmark (uni))
466 {
467 actual[0] = ' ';
468 actual++;
469 result.width++;
470 result.compose = TRUE;
471 }
472
473 for (; width1 != 0 && text[0] != '\0'; width1--)
474 {
475 uni = g_utf8_get_char_validated (text, -1);
476 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
477 {
478 if (g_unichar_isprint (uni))
479 {
480 left = g_unichar_to_utf8 (uni, actual);
481 actual += left;
482 if (str_unichar_iscombiningmark (uni))
483 result.compose = TRUE;
484 else
485 {
486 result.width++;
487 if (g_unichar_iswide (uni))
488 result.width++;
489 }
490 }
491 else
492 {
493 actual[0] = '.';
494 actual++;
495 result.width++;
496 }
497 text = g_utf8_next_char (text);
498 }
499 else
500 {
501 size_t repl_len;
502
503 text++;
504
505 repl_len = strlen (replch);
506 memcpy (actual, replch, repl_len);
507 actual += repl_len;
508 result.width++;
509 }
510 }
511 actual[0] = '\0';
512
513 return &result;
514 }
515
516
517
518 static const char *
519 str_utf8_term_form (const char *text)
520 {
521 static char result[BUF_MEDIUM * MB_LEN_MAX];
522 const struct term_form *pre_form;
523
524 pre_form = str_utf8_make_make_term_form (text, -1);
525 if (pre_form->compose)
526 {
527 char *composed;
528
529 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
530 g_strlcpy (result, composed, sizeof (result));
531 g_free (composed);
532 }
533 else
534 g_strlcpy (result, pre_form->text, sizeof (result));
535
536 return result;
537 }
538
539
540
541
542 static gboolean
543 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
544 {
545 tool->compose = FALSE;
546
547 while (tool->checked[0] != '\0')
548 {
549 gunichar uni;
550 size_t left;
551
552 uni = g_utf8_get_char (tool->checked);
553 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
554 left = g_unichar_to_utf8 (uni, NULL);
555 if (tool->remain <= left)
556 return FALSE;
557 left = g_unichar_to_utf8 (uni, tool->actual);
558 tool->actual += left;
559 tool->remain -= left;
560 tool->checked = g_utf8_next_char (tool->checked);
561 }
562
563 return TRUE;
564 }
565
566
567
568
569
570 static gboolean
571 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
572 {
573 tool->compose = FALSE;
574
575 while (tool->checked[0] != '\0')
576 {
577 gunichar uni;
578 size_t left;
579 int w = 0;
580
581 uni = g_utf8_get_char (tool->checked);
582 if (str_unichar_iscombiningmark (uni))
583 tool->compose = TRUE;
584 else
585 {
586 w = 1;
587 if (g_unichar_iswide (uni))
588 w++;
589 if (tool->ident + w > to_ident)
590 return TRUE;
591 }
592
593 left = g_unichar_to_utf8 (uni, NULL);
594 if (tool->remain <= left)
595 return FALSE;
596 left = g_unichar_to_utf8 (uni, tool->actual);
597 tool->actual += left;
598 tool->remain -= left;
599 tool->checked = g_utf8_next_char (tool->checked);
600 tool->ident += w;
601 }
602
603 return TRUE;
604 }
605
606
607
608
609 static int
610 utf8_tool_insert_space (struct utf8_tool *tool, int count)
611 {
612 if (count <= 0)
613 return 1;
614 if (tool->remain <= (gsize) count)
615 return 0;
616
617 memset (tool->actual, ' ', count);
618 tool->actual += count;
619 tool->remain -= count;
620 return 1;
621 }
622
623
624
625
626 static int
627 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
628 {
629 if (tool->remain <= 1)
630 return 0;
631
632 tool->actual[0] = ch;
633 tool->actual++;
634 tool->remain--;
635 return 1;
636 }
637
638
639
640
641
642 static gboolean
643 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
644 {
645 gunichar uni;
646
647 while (to_ident > tool->ident && tool->checked[0] != '\0')
648 {
649 uni = g_utf8_get_char (tool->checked);
650 if (!str_unichar_iscombiningmark (uni))
651 {
652 tool->ident++;
653 if (g_unichar_iswide (uni))
654 tool->ident++;
655 }
656 tool->checked = g_utf8_next_char (tool->checked);
657 }
658
659 uni = g_utf8_get_char (tool->checked);
660 while (str_unichar_iscombiningmark (uni))
661 {
662 tool->checked = g_utf8_next_char (tool->checked);
663 uni = g_utf8_get_char (tool->checked);
664 }
665
666 return TRUE;
667 }
668
669
670
671 static void
672 utf8_tool_compose (char *buffer, size_t size)
673 {
674 char *composed;
675
676 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
677 g_strlcpy (buffer, composed, size);
678 g_free (composed);
679 }
680
681
682
683 static const char *
684 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
685 {
686 static char result[BUF_MEDIUM * MB_LEN_MAX];
687 const struct term_form *pre_form;
688 struct utf8_tool tool;
689
690 pre_form = str_utf8_make_make_term_form (text, -1);
691
692 tool.checked = pre_form->text;
693 tool.actual = result;
694 tool.remain = sizeof (result);
695 tool.compose = FALSE;
696
697 if (pre_form->width <= (gsize) width)
698 {
699 switch (HIDE_FIT (just_mode))
700 {
701 case J_CENTER_LEFT:
702 case J_CENTER:
703 tool.ident = (width - pre_form->width) / 2;
704 break;
705 case J_RIGHT:
706 tool.ident = width - pre_form->width;
707 break;
708 default:
709 tool.ident = 0;
710 break;
711 }
712
713 utf8_tool_insert_space (&tool, tool.ident);
714 utf8_tool_copy_chars_to_end (&tool);
715 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
716 }
717 else if (IS_FIT (just_mode))
718 {
719 tool.ident = 0;
720 utf8_tool_copy_chars_to (&tool, width / 2);
721 utf8_tool_insert_char (&tool, '~');
722
723 tool.ident = 0;
724 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
725 utf8_tool_copy_chars_to_end (&tool);
726 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
727 }
728 else
729 {
730 switch (HIDE_FIT (just_mode))
731 {
732 case J_CENTER:
733 tool.ident = (width - pre_form->width) / 2;
734 break;
735 case J_RIGHT:
736 tool.ident = width - pre_form->width;
737 break;
738 default:
739 tool.ident = 0;
740 break;
741 }
742
743 utf8_tool_skip_chars_to (&tool, 0);
744 utf8_tool_insert_space (&tool, tool.ident);
745 utf8_tool_copy_chars_to (&tool, width);
746 utf8_tool_insert_space (&tool, width - tool.ident);
747 }
748
749 tool.actual[0] = '\0';
750 if (tool.compose)
751 utf8_tool_compose (result, sizeof (result));
752 return result;
753 }
754
755
756
757 static const char *
758 str_utf8_term_trim (const char *text, const ssize_t width)
759 {
760 static char result[BUF_MEDIUM * MB_LEN_MAX];
761 const struct term_form *pre_form;
762 struct utf8_tool tool;
763
764 if (width < 1)
765 {
766 result[0] = '\0';
767 return result;
768 }
769
770 const size_t width1 = (size_t) width;
771
772 pre_form = str_utf8_make_make_term_form (text, -1);
773
774 tool.checked = pre_form->text;
775 tool.actual = result;
776 tool.remain = sizeof (result);
777 tool.compose = FALSE;
778
779 if (width1 >= pre_form->width)
780 utf8_tool_copy_chars_to_end (&tool);
781 else if (width1 <= 3)
782 {
783 memset (tool.actual, '.', width1);
784 tool.actual += width1;
785 tool.remain -= width1;
786 }
787 else
788 {
789 memset (tool.actual, '.', 3);
790 tool.actual += 3;
791 tool.remain -= 3;
792
793 tool.ident = 0;
794 utf8_tool_skip_chars_to (&tool, pre_form->width - width1 + 3);
795 utf8_tool_copy_chars_to_end (&tool);
796 }
797
798 tool.actual[0] = '\0';
799 if (tool.compose)
800 utf8_tool_compose (result, sizeof (result));
801 return result;
802 }
803
804
805
806 static size_t
807 str_utf8_term_width2 (const char *text, const ssize_t width)
808 {
809 const struct term_form *result = str_utf8_make_make_term_form (text, width);
810
811 return result->width;
812 }
813
814
815
/* Return the terminal column width of the whole text. */
static size_t
str_utf8_term_width1 (const char *text)
{
    /* a negative width means "no limit" */
    const ssize_t unlimited = -1;

    return str_utf8_term_width2 (text, unlimited);
}
821
822
823
824 static int
825 str_utf8_term_char_width (const char *text)
826 {
827 gunichar uni;
828
829 uni = g_utf8_get_char_validated (text, -1);
830 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
831 }
832
833
834
835 static const char *
836 str_utf8_term_substring (const char *text, int start, int width)
837 {
838 static char result[BUF_MEDIUM * MB_LEN_MAX];
839 const struct term_form *pre_form;
840 struct utf8_tool tool;
841
842 pre_form = str_utf8_make_make_term_form (text, -1);
843
844 tool.checked = pre_form->text;
845 tool.actual = result;
846 tool.remain = sizeof (result);
847 tool.compose = FALSE;
848
849 tool.ident = -start;
850 utf8_tool_skip_chars_to (&tool, 0);
851 if (tool.ident < 0)
852 tool.ident = 0;
853 utf8_tool_insert_space (&tool, tool.ident);
854
855 utf8_tool_copy_chars_to (&tool, width);
856 utf8_tool_insert_space (&tool, width - tool.ident);
857
858 tool.actual[0] = '\0';
859 if (tool.compose)
860 utf8_tool_compose (result, sizeof (result));
861 return result;
862 }
863
864
865
866 static const char *
867 str_utf8_trunc (const char *text, const ssize_t width)
868 {
869 static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
870 const struct term_form *pre_form;
871 struct utf8_tool tool;
872
873 const size_t width1 = width < 0 ? SIZE_MAX : (size_t) width;
874
875 pre_form = str_utf8_make_make_term_form (text, -1);
876
877 tool.checked = pre_form->text;
878 tool.actual = result;
879 tool.remain = sizeof (result);
880 tool.compose = FALSE;
881
882 if (pre_form->width <= width1)
883 utf8_tool_copy_chars_to_end (&tool);
884 else
885 {
886 tool.ident = 0;
887 utf8_tool_copy_chars_to (&tool, width1 / 2);
888 utf8_tool_insert_char (&tool, '~');
889
890 tool.ident = 0;
891 utf8_tool_skip_chars_to (&tool, pre_form->width - width1 + 1);
892 utf8_tool_copy_chars_to_end (&tool);
893 }
894
895 tool.actual[0] = '\0';
896 if (tool.compose)
897 utf8_tool_compose (result, sizeof (result));
898 return result;
899 }
900
901
902
903 static int
904 str_utf8_offset_to_pos (const char *text, size_t length)
905 {
906 if (str_utf8_is_valid_string (text))
907 return g_utf8_offset_to_pointer (text, length) - text;
908 else
909 {
910 int result;
911 char *buffer;
912
913 buffer = g_strdup (text);
914 str_utf8_fix_string (buffer);
915 result = g_utf8_offset_to_pointer (buffer, length) - buffer;
916 g_free (buffer);
917 return result;
918 }
919 }
920
921
922
923 static int
924 str_utf8_column_to_pos (const char *text, size_t pos)
925 {
926 int result = 0;
927 int width = 0;
928
929 while (text[0] != '\0')
930 {
931 gunichar uni;
932
933 uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
934 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
935 {
936 if (g_unichar_isprint (uni))
937 {
938 if (!str_unichar_iscombiningmark (uni))
939 {
940 width++;
941 if (g_unichar_iswide (uni))
942 width++;
943 }
944 }
945 else
946 {
947 width++;
948 }
949 text = g_utf8_next_char (text);
950 }
951 else
952 {
953 text++;
954 width++;
955 }
956
957 if ((gsize) width > pos)
958 return result;
959
960 result++;
961 }
962
963 return result;
964 }
965
966
967
968 static char *
969 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
970 {
971 char *fold, *result;
972
973 if (needle == NULL)
974 return NULL;
975
976 if (case_sen)
977 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
978
979 fold = g_utf8_casefold (needle, -1);
980 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
981 g_free (fold);
982 return result;
983 }
984
985
986
987 static void
988 str_utf8_release_search_needle (char *needle, gboolean case_sen)
989 {
990 (void) case_sen;
991 g_free (needle);
992 }
993
994
995
996 static const char *
997 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
998 {
999 char *deco_text;
1000 const char *match;
1001 const char *result = NULL;
1002 size_t search_len;
1003
1004 if (case_sen)
1005 deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1006 else
1007 {
1008 char *fold_text;
1009
1010 fold_text = g_utf8_casefold (text, -1);
1011 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1012 g_free (fold_text);
1013 }
1014
1015 search_len = strlen (search);
1016
1017 match = deco_text;
1018 do
1019 {
1020 match = g_strstr_len (match, -1, search);
1021 if (match != NULL)
1022 {
1023 if ((!str_utf8_iscombiningmark (match) || (match == deco_text))
1024 && !str_utf8_iscombiningmark (match + search_len))
1025 {
1026 const char *m = deco_text;
1027
1028 result = text;
1029 while (m < match)
1030 {
1031 str_utf8_cnext_noncomb_char (&m);
1032 str_utf8_cnext_noncomb_char (&result);
1033 }
1034 }
1035 else
1036 str_utf8_cnext_char (&match);
1037 }
1038 }
1039 while (match != NULL && result == NULL);
1040
1041 g_free (deco_text);
1042
1043 return result;
1044 }
1045
1046
1047
1048 static const char *
1049 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1050 {
1051 char *deco_text;
1052 char *match;
1053 const char *result = NULL;
1054 size_t search_len;
1055
1056 if (case_sen)
1057 deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1058 else
1059 {
1060 char *fold_text;
1061
1062 fold_text = g_utf8_casefold (text, -1);
1063 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1064 g_free (fold_text);
1065 }
1066
1067 search_len = strlen (search);
1068
1069 do
1070 {
1071 match = g_strrstr_len (deco_text, -1, search);
1072 if (match != NULL)
1073 {
1074 if ((!str_utf8_iscombiningmark (match) || (match == deco_text))
1075 && !str_utf8_iscombiningmark (match + search_len))
1076 {
1077 const char *m = deco_text;
1078
1079 result = text;
1080 while (m < match)
1081 {
1082 str_utf8_cnext_noncomb_char (&m);
1083 str_utf8_cnext_noncomb_char (&result);
1084 }
1085 }
1086 else
1087 match[0] = '\0';
1088 }
1089 }
1090 while (match != NULL && result == NULL);
1091
1092 g_free (deco_text);
1093
1094 return result;
1095 }
1096
1097
1098
1099 static char *
1100 str_utf8_normalize (const char *text)
1101 {
1102 GString *fixed;
1103 char *tmp;
1104 char *result;
1105 const char *start;
1106 const char *end;
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116 for (end = text; *end != '\0'; end++)
1117 if ((*end & 0x80) != 0)
1118 {
1119
1120 break;
1121 }
1122
1123
1124 if (*end == '\0')
1125 return g_strndup (text, end - text);
1126
1127 fixed = g_string_sized_new (4);
1128
1129 start = text;
1130 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1131 {
1132 if (start != end)
1133 {
1134 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1135 g_string_append (fixed, tmp);
1136 g_free (tmp);
1137 }
1138 g_string_append_c (fixed, end[0]);
1139 start = end + 1;
1140 }
1141
1142 if (start == text)
1143 {
1144 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1145 g_string_free (fixed, TRUE);
1146 }
1147 else
1148 {
1149 if (start[0] != '\0' && start != end)
1150 {
1151 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1152 g_string_append (fixed, tmp);
1153 g_free (tmp);
1154 }
1155 result = g_string_free (fixed, FALSE);
1156 }
1157
1158 return result;
1159 }
1160
1161
1162
1163 static char *
1164 str_utf8_casefold_normalize (const char *text)
1165 {
1166 GString *fixed;
1167 char *tmp, *fold;
1168 char *result;
1169 const char *start;
1170 const char *end;
1171
1172 fixed = g_string_sized_new (4);
1173
1174 start = text;
1175 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1176 {
1177 if (start != end)
1178 {
1179 fold = g_utf8_casefold (start, end - start);
1180 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1181 g_string_append (fixed, tmp);
1182 g_free (tmp);
1183 g_free (fold);
1184 }
1185 g_string_append_c (fixed, end[0]);
1186 start = end + 1;
1187 }
1188
1189 if (start == text)
1190 {
1191 fold = g_utf8_casefold (text, -1);
1192 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1193 g_free (fold);
1194 g_string_free (fixed, TRUE);
1195 }
1196 else
1197 {
1198 if (start[0] != '\0' && start != end)
1199 {
1200 fold = g_utf8_casefold (start, end - start);
1201 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1202 g_string_append (fixed, tmp);
1203 g_free (tmp);
1204 g_free (fold);
1205 }
1206 result = g_string_free (fixed, FALSE);
1207 }
1208
1209 return result;
1210 }
1211
1212
1213
/* Compare the normalized forms of t1 and t2 with strcmp(). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1230
1231
1232
/**
 * Compare the normalized forms of t1 and t2 over the byte length of the
 * shorter of the two.
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const size_t common = left_len < right_len ? left_len : right_len;
    const int order = strncmp (left, right, common);

    g_free (left);
    g_free (right);

    return order;
}
1252
1253
1254
/* Compare the case-folded, normalized forms of t1 and t2 with strcmp(). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1271
1272
1273
/**
 * Compare the case-folded, normalized forms of t1 and t2 over the byte
 * length of the shorter of the two.
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const size_t common = left_len < right_len ? left_len : right_len;
    const int order = strncmp (left, right, common);

    g_free (left);
    g_free (right);

    return order;
}
1293
1294
1295
/**
 * Compare the normalized forms of text and prefix character by character.
 *
 * @return number of bytes of the normalized prefix that match the start of
 *         the normalized text
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *text_cur = norm_text;
    const char *prefix_cur = norm_prefix;
    const char *text_next = norm_text;
    const char *prefix_next = norm_prefix;
    int matched;

    while (*text_cur != '\0' && *prefix_cur != '\0')
    {
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ in byte length or in content */
        if (text_next - text_cur != prefix_next - prefix_cur
            || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1330
1331
1332
/**
 * Compare the case-folded, normalized forms of text and prefix character
 * by character.
 *
 * @return number of bytes of the converted prefix that match the start of
 *         the converted text
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_cur = norm_text;
    const char *prefix_cur = norm_prefix;
    const char *text_next = norm_text;
    const char *prefix_next = norm_prefix;
    int matched;

    while (*text_cur != '\0' && *prefix_cur != '\0')
    {
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ in byte length or in content */
        if (text_next - text_cur != prefix_next - prefix_cur
            || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1367
1368
1369
1370 static char *
1371 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1372 gchar *(*keygen) (const gchar *text, gssize size))
1373 {
1374 char *result;
1375
1376 if (case_sen)
1377 result = str_utf8_normalize (text);
1378 else
1379 {
1380 gboolean dot;
1381 GString *fixed;
1382 const char *start, *end;
1383 char *fold, *key;
1384
1385 dot = text[0] == '.';
1386 fixed = g_string_sized_new (16);
1387
1388 if (!dot)
1389 start = text;
1390 else
1391 {
1392 start = text + 1;
1393 g_string_append_c (fixed, '.');
1394 }
1395
1396 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1397 {
1398 if (start != end)
1399 {
1400 fold = g_utf8_casefold (start, end - start);
1401 key = keygen (fold, -1);
1402 g_string_append (fixed, key);
1403 g_free (key);
1404 g_free (fold);
1405 }
1406 g_string_append_c (fixed, end[0]);
1407 start = end + 1;
1408 }
1409
1410 if (start == text)
1411 {
1412 fold = g_utf8_casefold (start, -1);
1413 result = keygen (fold, -1);
1414 g_free (fold);
1415 g_string_free (fixed, TRUE);
1416 }
1417 else if (dot && (start == text + 1))
1418 {
1419 fold = g_utf8_casefold (start, -1);
1420 key = keygen (fold, -1);
1421 g_string_append (fixed, key);
1422 g_free (key);
1423 g_free (fold);
1424 result = g_string_free (fixed, FALSE);
1425 }
1426 else
1427 {
1428 if (start[0] != '\0' && start != end)
1429 {
1430 fold = g_utf8_casefold (start, end - start);
1431 key = keygen (fold, -1);
1432 g_string_append (fixed, key);
1433 g_free (key);
1434 g_free (fold);
1435 }
1436 result = g_string_free (fixed, FALSE);
1437 }
1438 }
1439 return result;
1440 }
1441
1442
1443
1444 static char *
1445 str_utf8_create_key (const char *text, gboolean case_sen)
1446 {
1447 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1448 }
1449
1450
1451
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Create a comparison key for text using g_utf8_collate_key_for_filename() as the generator. */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1459
1460
1461
1462 static int
1463 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1464 {
1465 (void) case_sen;
1466 return strcmp (t1, t2);
1467 }
1468
1469
1470
1471 static void
1472 str_utf8_release_key (char *key, gboolean case_sen)
1473 {
1474 (void) case_sen;
1475 g_free (key);
1476 }
1477
1478
1479
1480
1481
1482 struct str_class
1483 str_utf8_init (void)
1484 {
1485 struct str_class result;
1486
1487 result.conv_gerror_message = str_utf8_conv_gerror_message;
1488 result.vfs_convert_to = str_utf8_vfs_convert_to;
1489 result.insert_replace_char = str_utf8_insert_replace_char;
1490 result.is_valid_string = str_utf8_is_valid_string;
1491 result.is_valid_char = str_utf8_is_valid_char;
1492 result.cnext_char = str_utf8_cnext_char;
1493 result.cprev_char = str_utf8_cprev_char;
1494 result.cnext_char_safe = str_utf8_cnext_char_safe;
1495 result.cprev_char_safe = str_utf8_cprev_char_safe;
1496 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1497 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1498 result.char_isspace = str_utf8_isspace;
1499 result.char_ispunct = str_utf8_ispunct;
1500 result.char_isalnum = str_utf8_isalnum;
1501 result.char_isdigit = str_utf8_isdigit;
1502 result.char_isprint = str_utf8_isprint;
1503 result.char_iscombiningmark = str_utf8_iscombiningmark;
1504 result.char_toupper = str_utf8_toupper;
1505 result.char_tolower = str_utf8_tolower;
1506 result.length = str_utf8_length;
1507 result.length2 = str_utf8_length2;
1508 result.length_noncomb = str_utf8_length_noncomb;
1509 result.fix_string = str_utf8_fix_string;
1510 result.term_form = str_utf8_term_form;
1511 result.fit_to_term = str_utf8_fit_to_term;
1512 result.term_trim = str_utf8_term_trim;
1513 result.term_width2 = str_utf8_term_width2;
1514 result.term_width1 = str_utf8_term_width1;
1515 result.term_char_width = str_utf8_term_char_width;
1516 result.term_substring = str_utf8_term_substring;
1517 result.trunc = str_utf8_trunc;
1518 result.offset_to_pos = str_utf8_offset_to_pos;
1519 result.column_to_pos = str_utf8_column_to_pos;
1520 result.create_search_needle = str_utf8_create_search_needle;
1521 result.release_search_needle = str_utf8_release_search_needle;
1522 result.search_first = str_utf8_search_first;
1523 result.search_last = str_utf8_search_last;
1524 result.compare = str_utf8_compare;
1525 result.ncompare = str_utf8_ncompare;
1526 result.casecmp = str_utf8_casecmp;
1527 result.ncasecmp = str_utf8_ncasecmp;
1528 result.prefix = str_utf8_prefix;
1529 result.caseprefix = str_utf8_caseprefix;
1530 result.create_key = str_utf8_create_key;
1531 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1532
1533 result.create_key_for_filename = str_utf8_create_key_for_filename;
1534 #else
1535
1536 result.create_key_for_filename = str_utf8_create_key;
1537 #endif
1538 result.key_collate = str_utf8_key_collate;
1539 result.release_key = str_utf8_release_key;
1540
1541 return result;
1542 }
1543
1544