This source file includes the following definitions.
- str_unichar_iscombiningmark
- str_utf8_insert_replace_char
- str_utf8_is_valid_string
- str_utf8_is_valid_char
- str_utf8_cnext_char
- str_utf8_cprev_char
- str_utf8_cnext_char_safe
- str_utf8_cprev_char_safe
- str_utf8_fix_string
- str_utf8_isspace
- str_utf8_ispunct
- str_utf8_isalnum
- str_utf8_isdigit
- str_utf8_isprint
- str_utf8_iscombiningmark
- str_utf8_cnext_noncomb_char
- str_utf8_cprev_noncomb_char
- str_utf8_toupper
- str_utf8_tolower
- str_utf8_length
- str_utf8_length2
- str_utf8_length_noncomb
- str_utf8_questmark_sustb
- str_utf8_conv_gerror_message
- str_utf8_vfs_convert_to
- str_utf8_make_make_term_form
- str_utf8_term_form
- utf8_tool_copy_chars_to_end
- utf8_tool_copy_chars_to
- utf8_tool_insert_space
- utf8_tool_insert_char
- utf8_tool_skip_chars_to
- utf8_tool_compose
- str_utf8_fit_to_term
- str_utf8_term_trim
- str_utf8_term_width2
- str_utf8_term_width1
- str_utf8_term_char_width
- str_utf8_term_substring
- str_utf8_trunc
- str_utf8_offset_to_pos
- str_utf8_column_to_pos
- str_utf8_create_search_needle
- str_utf8_release_search_needle
- str_utf8_search_first
- str_utf8_search_last
- str_utf8_normalize
- str_utf8_casefold_normalize
- str_utf8_compare
- str_utf8_ncompare
- str_utf8_casecmp
- str_utf8_ncasecmp
- str_utf8_prefix
- str_utf8_caseprefix
- str_utf8_create_key_gen
- str_utf8_create_key
- str_utf8_create_key_for_filename
- str_utf8_key_collate
- str_utf8_release_key
- str_utf8_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <langinfo.h>
30 #include <limits.h>
31 #include <string.h>
32
33 #include "lib/global.h"
34 #include "lib/strutil.h"
35
36
37
38
39
40
41
42
43
44 struct utf8_tool
45 {
46 char *actual;
47 size_t remain;
48 const char *checked;
49 int ident;
50 gboolean compose;
51 };
52
53 struct term_form
54 {
55 char text[BUF_MEDIUM * MB_LEN_MAX];
56 size_t width;
57 gboolean compose;
58 };
59
60
61
62
63
/* Bytes EF BF BD: the UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER),
 * emitted in place of bytes that are not valid UTF-8. */
static const char replch[] = "\xEF\xBF\xBD";
65
66
67
68
69
70 static gboolean
71 str_unichar_iscombiningmark (gunichar uni)
72 {
73 GUnicodeType type;
74
75 type = g_unichar_type (uni);
76 return (type == G_UNICODE_SPACING_MARK)
77 || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
78 }
79
80
81
82 static void
83 str_utf8_insert_replace_char (GString *buffer)
84 {
85 g_string_append (buffer, replch);
86 }
87
88
89
90 static gboolean
91 str_utf8_is_valid_string (const char *text)
92 {
93 return g_utf8_validate (text, -1, NULL);
94 }
95
96
97
98 static int
99 str_utf8_is_valid_char (const char *ch, size_t size)
100 {
101 switch (g_utf8_get_char_validated (ch, size))
102 {
103 case (gunichar) (-2):
104 return (-2);
105 case (gunichar) (-1):
106 return (-1);
107 default:
108 return 1;
109 }
110 }
111
112
113
/* Advance *TEXT to the next character using g_utf8_next_char()
 * (no validation is performed). */
static void
str_utf8_cnext_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_next_char (cur);
}
119
120
121
/* Move *TEXT back to the previous character using g_utf8_prev_char()
 * (no validation is performed). */
static void
str_utf8_cprev_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_prev_char (cur);
}
127
128
129
/* Advance *TEXT by one character if a valid character starts there,
 * otherwise by exactly one byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *cur = *text;

    if (str_utf8_is_valid_char (cur, -1) != 1)
    {
        /* invalid or incomplete sequence: step over a single byte */
        *text = cur + 1;
        return;
    }

    *text = g_utf8_next_char (cur);
}
138
139
140
/* Move *TEXT one step backwards.  The candidate found by g_utf8_prev_char()
 * is accepted only if stepping forward from it (with the safe variant)
 * lands exactly on the starting position; otherwise go back a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *cur = *text;
    const char *candidate = g_utf8_prev_char (cur);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);

    if (probe != cur)
        *text = cur - 1;
    else
        *text = candidate;
}
154
155
156
157 static void
158 str_utf8_fix_string (char *text)
159 {
160 while (text[0] != '\0')
161 {
162 gunichar uni;
163
164 uni = g_utf8_get_char_validated (text, -1);
165 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
166 text = g_utf8_next_char (text);
167 else
168 {
169 text[0] = '?';
170 text++;
171 }
172 }
173 }
174
175
176
177 static gboolean
178 str_utf8_isspace (const char *text)
179 {
180 gunichar uni;
181
182 uni = g_utf8_get_char_validated (text, -1);
183 return g_unichar_isspace (uni);
184 }
185
186
187
188 static gboolean
189 str_utf8_ispunct (const char *text)
190 {
191 gunichar uni;
192
193 uni = g_utf8_get_char_validated (text, -1);
194 return g_unichar_ispunct (uni);
195 }
196
197
198
199 static gboolean
200 str_utf8_isalnum (const char *text)
201 {
202 gunichar uni;
203
204 uni = g_utf8_get_char_validated (text, -1);
205 return g_unichar_isalnum (uni);
206 }
207
208
209
210 static gboolean
211 str_utf8_isdigit (const char *text)
212 {
213 gunichar uni;
214
215 uni = g_utf8_get_char_validated (text, -1);
216 return g_unichar_isdigit (uni);
217 }
218
219
220
221 static gboolean
222 str_utf8_isprint (const char *ch)
223 {
224 gunichar uni;
225
226 uni = g_utf8_get_char_validated (ch, -1);
227 return g_unichar_isprint (uni);
228 }
229
230
231
232 static gboolean
233 str_utf8_iscombiningmark (const char *ch)
234 {
235 gunichar uni;
236
237 uni = g_utf8_get_char_validated (ch, -1);
238 return str_unichar_iscombiningmark (uni);
239 }
240
241
242
/* Advance *TEXT past the current character and past every combining mark
 * that follows it.  Returns the number of single steps taken (0 when *TEXT
 * already points at the terminating NUL). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
258
259
260
/* Move *TEXT backwards until it rests on a character that is not a
 * combining mark, never going before BEGIN.  Returns the number of single
 * steps taken (0 when *TEXT is already BEGIN). */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
276
277
278
279 static gboolean
280 str_utf8_toupper (const char *text, char **out, size_t *remain)
281 {
282 gunichar uni;
283 size_t left;
284
285 uni = g_utf8_get_char_validated (text, -1);
286 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
287 return FALSE;
288
289 uni = g_unichar_toupper (uni);
290 left = g_unichar_to_utf8 (uni, NULL);
291 if (left >= *remain)
292 return FALSE;
293
294 left = g_unichar_to_utf8 (uni, *out);
295 (*out) += left;
296 (*remain) -= left;
297 return TRUE;
298 }
299
300
301
302 static gboolean
303 str_utf8_tolower (const char *text, char **out, size_t *remain)
304 {
305 gunichar uni;
306 size_t left;
307
308 uni = g_utf8_get_char_validated (text, -1);
309 if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
310 return FALSE;
311
312 uni = g_unichar_tolower (uni);
313 left = g_unichar_to_utf8 (uni, NULL);
314 if (left >= *remain)
315 return FALSE;
316
317 left = g_unichar_to_utf8 (uni, *out);
318 (*out) += left;
319 (*remain) -= left;
320 return TRUE;
321 }
322
323
324
/* Count the characters in TEXT.  Valid stretches are counted with
 * g_utf8_strlen(); every byte at which validation stops counts as one
 * character of its own. */
static int
str_utf8_length (const char *text)
{
    int count = 0;
    const char *chunk = text;
    const char *bad;

    for (;;)
    {
        if (g_utf8_validate (chunk, -1, &bad) || chunk[0] == '\0')
            break;

        /* valid part in front of the offending byte */
        if (bad != chunk)
            count += g_utf8_strlen (chunk, bad - chunk);

        /* the offending byte itself */
        count++;
        chunk = bad + 1;
    }

    /* the loop body never ran: no invalid byte was skipped */
    if (chunk == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (chunk[0] != '\0' && chunk != bad)
        count += g_utf8_strlen (chunk, bad - chunk);

    return count;
}
349
350
351
352 static int
353 str_utf8_length2 (const char *text, int size)
354 {
355 int result = 0;
356 const char *start;
357 const char *end;
358
359 start = text;
360 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
361 {
362 if (start != end)
363 {
364 result += g_utf8_strlen (start, MIN (end - start, size));
365 size -= end - start;
366 }
367 result += (size > 0);
368 size--;
369 start = end + 1;
370 }
371
372 if (start == text)
373 result = g_utf8_strlen (text, size);
374 else if (start[0] != '\0' && start != end && size > 0)
375 result += g_utf8_strlen (start, MIN (end - start, size));
376
377 return result;
378 }
379
380
381
/* Count the characters of TEXT, treating a character together with the
 * combining marks that follow it as a single unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    int count = 0;
    const char *p;

    for (p = text; p[0] != '\0'; str_utf8_cnext_noncomb_char (&p))
        count++;

    return count;
}
396
397
398
#if 0
/* Disabled code: skip one character of *STRING, reduce *LEFT by the number
 * of bytes skipped and append '?' to BUFFER. */
static void
str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
{
    char *const cur = *string;
    char *const after = g_utf8_next_char (cur);

    *left -= after - cur;
    *string = after;
    g_string_append_c (buffer, '?');
}
#endif
411
412
413
414 static gchar *
415 str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
416 {
417 if (mcerror != NULL)
418 return g_strdup (mcerror->message);
419
420 return g_strdup (def_msg != NULL ? def_msg : "");
421 }
422
423
424
425 static estr_t
426 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
427 {
428 estr_t result = ESTR_SUCCESS;
429
430 if (coder == str_cnv_not_convert)
431 g_string_append_len (buffer, string, size);
432 else
433 result = str_nconvert (coder, string, size, buffer);
434
435 return result;
436 }
437
438
439
440
441
442 static const struct term_form *
443 str_utf8_make_make_term_form (const char *text, size_t length)
444 {
445 static struct term_form result;
446 gunichar uni;
447 size_t left;
448 char *actual;
449
450 result.text[0] = '\0';
451 result.width = 0;
452 result.compose = FALSE;
453 actual = result.text;
454
455
456
457 if (length != 0 && text[0] != '\0')
458 {
459 uni = g_utf8_get_char_validated (text, -1);
460 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
461 && str_unichar_iscombiningmark (uni))
462 {
463 actual[0] = ' ';
464 actual++;
465 result.width++;
466 result.compose = TRUE;
467 }
468 }
469
470 while (length != 0 && text[0] != '\0')
471 {
472 uni = g_utf8_get_char_validated (text, -1);
473 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
474 {
475 if (g_unichar_isprint (uni))
476 {
477 left = g_unichar_to_utf8 (uni, actual);
478 actual += left;
479 if (str_unichar_iscombiningmark (uni))
480 result.compose = TRUE;
481 else
482 {
483 result.width++;
484 if (g_unichar_iswide (uni))
485 result.width++;
486 }
487 }
488 else
489 {
490 actual[0] = '.';
491 actual++;
492 result.width++;
493 }
494 text = g_utf8_next_char (text);
495 }
496 else
497 {
498 size_t repl_len;
499
500 text++;
501
502 repl_len = strlen (replch);
503 memcpy (actual, replch, repl_len);
504 actual += repl_len;
505 result.width++;
506 }
507
508 if (length != (size_t) (-1))
509 length--;
510 }
511 actual[0] = '\0';
512
513 return &result;
514 }
515
516
517
518 static const char *
519 str_utf8_term_form (const char *text)
520 {
521 static char result[BUF_MEDIUM * MB_LEN_MAX];
522 const struct term_form *pre_form;
523
524 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
525 if (pre_form->compose)
526 {
527 char *composed;
528
529 composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
530 g_strlcpy (result, composed, sizeof (result));
531 g_free (composed);
532 }
533 else
534 g_strlcpy (result, pre_form->text, sizeof (result));
535
536 return result;
537 }
538
539
540
541
542 static gboolean
543 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
544 {
545 tool->compose = FALSE;
546
547 while (tool->checked[0] != '\0')
548 {
549 gunichar uni;
550 size_t left;
551
552 uni = g_utf8_get_char (tool->checked);
553 tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
554 left = g_unichar_to_utf8 (uni, NULL);
555 if (tool->remain <= left)
556 return FALSE;
557 left = g_unichar_to_utf8 (uni, tool->actual);
558 tool->actual += left;
559 tool->remain -= left;
560 tool->checked = g_utf8_next_char (tool->checked);
561 }
562
563 return TRUE;
564 }
565
566
567
568
569
570 static gboolean
571 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
572 {
573 tool->compose = FALSE;
574
575 while (tool->checked[0] != '\0')
576 {
577 gunichar uni;
578 size_t left;
579 int w = 0;
580
581 uni = g_utf8_get_char (tool->checked);
582 if (str_unichar_iscombiningmark (uni))
583 tool->compose = TRUE;
584 else
585 {
586 w = 1;
587 if (g_unichar_iswide (uni))
588 w++;
589 if (tool->ident + w > to_ident)
590 return TRUE;
591 }
592
593 left = g_unichar_to_utf8 (uni, NULL);
594 if (tool->remain <= left)
595 return FALSE;
596 left = g_unichar_to_utf8 (uni, tool->actual);
597 tool->actual += left;
598 tool->remain -= left;
599 tool->checked = g_utf8_next_char (tool->checked);
600 tool->ident += w;
601 }
602
603 return TRUE;
604 }
605
606
607
608
609 static int
610 utf8_tool_insert_space (struct utf8_tool *tool, int count)
611 {
612 if (count <= 0)
613 return 1;
614 if (tool->remain <= (gsize) count)
615 return 0;
616
617 memset (tool->actual, ' ', count);
618 tool->actual += count;
619 tool->remain -= count;
620 return 1;
621 }
622
623
624
625
626 static int
627 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
628 {
629 if (tool->remain <= 1)
630 return 0;
631
632 tool->actual[0] = ch;
633 tool->actual++;
634 tool->remain--;
635 return 1;
636 }
637
638
639
640
641
642 static gboolean
643 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
644 {
645 gunichar uni;
646
647 while (to_ident > tool->ident && tool->checked[0] != '\0')
648 {
649 uni = g_utf8_get_char (tool->checked);
650 if (!str_unichar_iscombiningmark (uni))
651 {
652 tool->ident++;
653 if (g_unichar_iswide (uni))
654 tool->ident++;
655 }
656 tool->checked = g_utf8_next_char (tool->checked);
657 }
658
659 uni = g_utf8_get_char (tool->checked);
660 while (str_unichar_iscombiningmark (uni))
661 {
662 tool->checked = g_utf8_next_char (tool->checked);
663 uni = g_utf8_get_char (tool->checked);
664 }
665
666 return TRUE;
667 }
668
669
670
671 static void
672 utf8_tool_compose (char *buffer, size_t size)
673 {
674 char *composed;
675
676 composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
677 g_strlcpy (buffer, composed, size);
678 g_free (composed);
679 }
680
681
682
683 static const char *
684 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
685 {
686 static char result[BUF_MEDIUM * MB_LEN_MAX];
687 const struct term_form *pre_form;
688 struct utf8_tool tool;
689
690 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
691 tool.checked = pre_form->text;
692 tool.actual = result;
693 tool.remain = sizeof (result);
694 tool.compose = FALSE;
695
696 if (pre_form->width <= (gsize) width)
697 {
698 switch (HIDE_FIT (just_mode))
699 {
700 case J_CENTER_LEFT:
701 case J_CENTER:
702 tool.ident = (width - pre_form->width) / 2;
703 break;
704 case J_RIGHT:
705 tool.ident = width - pre_form->width;
706 break;
707 default:
708 tool.ident = 0;
709 break;
710 }
711
712 utf8_tool_insert_space (&tool, tool.ident);
713 utf8_tool_copy_chars_to_end (&tool);
714 utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
715 }
716 else if (IS_FIT (just_mode))
717 {
718 tool.ident = 0;
719 utf8_tool_copy_chars_to (&tool, width / 2);
720 utf8_tool_insert_char (&tool, '~');
721
722 tool.ident = 0;
723 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
724 utf8_tool_copy_chars_to_end (&tool);
725 utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
726 }
727 else
728 {
729 switch (HIDE_FIT (just_mode))
730 {
731 case J_CENTER:
732 tool.ident = (width - pre_form->width) / 2;
733 break;
734 case J_RIGHT:
735 tool.ident = width - pre_form->width;
736 break;
737 default:
738 tool.ident = 0;
739 break;
740 }
741
742 utf8_tool_skip_chars_to (&tool, 0);
743 utf8_tool_insert_space (&tool, tool.ident);
744 utf8_tool_copy_chars_to (&tool, width);
745 utf8_tool_insert_space (&tool, width - tool.ident);
746 }
747
748 tool.actual[0] = '\0';
749 if (tool.compose)
750 utf8_tool_compose (result, sizeof (result));
751 return result;
752 }
753
754
755
756 static const char *
757 str_utf8_term_trim (const char *text, int width)
758 {
759 static char result[BUF_MEDIUM * MB_LEN_MAX];
760 const struct term_form *pre_form;
761 struct utf8_tool tool;
762
763 if (width < 1)
764 {
765 result[0] = '\0';
766 return result;
767 }
768
769 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
770
771 tool.checked = pre_form->text;
772 tool.actual = result;
773 tool.remain = sizeof (result);
774 tool.compose = FALSE;
775
776 if ((gsize) width >= pre_form->width)
777 utf8_tool_copy_chars_to_end (&tool);
778 else if (width <= 3)
779 {
780 memset (tool.actual, '.', width);
781 tool.actual += width;
782 tool.remain -= width;
783 }
784 else
785 {
786 memset (tool.actual, '.', 3);
787 tool.actual += 3;
788 tool.remain -= 3;
789
790 tool.ident = 0;
791 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
792 utf8_tool_copy_chars_to_end (&tool);
793 }
794
795 tool.actual[0] = '\0';
796 if (tool.compose)
797 utf8_tool_compose (result, sizeof (result));
798 return result;
799 }
800
801
802
803 static int
804 str_utf8_term_width2 (const char *text, size_t length)
805 {
806 const struct term_form *result;
807
808 result = str_utf8_make_make_term_form (text, length);
809 return result->width;
810 }
811
812
813
/* Return the column width of the whole TEXT. */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
819
820
821
822 static int
823 str_utf8_term_char_width (const char *text)
824 {
825 gunichar uni;
826
827 uni = g_utf8_get_char_validated (text, -1);
828 return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
829 }
830
831
832
833 static const char *
834 str_utf8_term_substring (const char *text, int start, int width)
835 {
836 static char result[BUF_MEDIUM * MB_LEN_MAX];
837 const struct term_form *pre_form;
838 struct utf8_tool tool;
839
840 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
841
842 tool.checked = pre_form->text;
843 tool.actual = result;
844 tool.remain = sizeof (result);
845 tool.compose = FALSE;
846
847 tool.ident = -start;
848 utf8_tool_skip_chars_to (&tool, 0);
849 if (tool.ident < 0)
850 tool.ident = 0;
851 utf8_tool_insert_space (&tool, tool.ident);
852
853 utf8_tool_copy_chars_to (&tool, width);
854 utf8_tool_insert_space (&tool, width - tool.ident);
855
856 tool.actual[0] = '\0';
857 if (tool.compose)
858 utf8_tool_compose (result, sizeof (result));
859 return result;
860 }
861
862
863
864 static const char *
865 str_utf8_trunc (const char *text, int width)
866 {
867 static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
868 const struct term_form *pre_form;
869 struct utf8_tool tool;
870
871 pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
872
873 tool.checked = pre_form->text;
874 tool.actual = result;
875 tool.remain = sizeof (result);
876 tool.compose = FALSE;
877
878 if (pre_form->width <= (gsize) width)
879 utf8_tool_copy_chars_to_end (&tool);
880 else
881 {
882 tool.ident = 0;
883 utf8_tool_copy_chars_to (&tool, width / 2);
884 utf8_tool_insert_char (&tool, '~');
885
886 tool.ident = 0;
887 utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
888 utf8_tool_copy_chars_to_end (&tool);
889 }
890
891 tool.actual[0] = '\0';
892 if (tool.compose)
893 utf8_tool_compose (result, sizeof (result));
894 return result;
895 }
896
897
898
899 static int
900 str_utf8_offset_to_pos (const char *text, size_t length)
901 {
902 if (str_utf8_is_valid_string (text))
903 return g_utf8_offset_to_pointer (text, length) - text;
904 else
905 {
906 int result;
907 char *buffer;
908
909 buffer = g_strdup (text);
910 str_utf8_fix_string (buffer);
911 result = g_utf8_offset_to_pointer (buffer, length) - buffer;
912 g_free (buffer);
913 return result;
914 }
915 }
916
917
918
919 static int
920 str_utf8_column_to_pos (const char *text, size_t pos)
921 {
922 int result = 0;
923 int width = 0;
924
925 while (text[0] != '\0')
926 {
927 gunichar uni;
928
929 uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
930 if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
931 {
932 if (g_unichar_isprint (uni))
933 {
934 if (!str_unichar_iscombiningmark (uni))
935 {
936 width++;
937 if (g_unichar_iswide (uni))
938 width++;
939 }
940 }
941 else
942 {
943 width++;
944 }
945 text = g_utf8_next_char (text);
946 }
947 else
948 {
949 text++;
950 width++;
951 }
952
953 if ((gsize) width > pos)
954 return result;
955
956 result++;
957 }
958
959 return result;
960 }
961
962
963
964 static char *
965 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
966 {
967 char *fold, *result;
968
969 if (needle == NULL)
970 return NULL;
971
972 if (case_sen)
973 return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
974
975 fold = g_utf8_casefold (needle, -1);
976 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
977 g_free (fold);
978 return result;
979 }
980
981
982
983 static void
984 str_utf8_release_search_needle (char *needle, gboolean case_sen)
985 {
986 (void) case_sen;
987 g_free (needle);
988 }
989
990
991
992 static const char *
993 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
994 {
995 char *deco_text;
996 const char *match;
997 const char *result = NULL;
998 size_t search_len;
999
1000 if (case_sen)
1001 deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1002 else
1003 {
1004 char *fold_text;
1005
1006 fold_text = g_utf8_casefold (text, -1);
1007 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1008 g_free (fold_text);
1009 }
1010
1011 search_len = strlen (search);
1012
1013 match = deco_text;
1014 do
1015 {
1016 match = g_strstr_len (match, -1, search);
1017 if (match != NULL)
1018 {
1019 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1020 !str_utf8_iscombiningmark (match + search_len))
1021 {
1022 const char *m = deco_text;
1023
1024 result = text;
1025 while (m < match)
1026 {
1027 str_utf8_cnext_noncomb_char (&m);
1028 str_utf8_cnext_noncomb_char (&result);
1029 }
1030 }
1031 else
1032 str_utf8_cnext_char (&match);
1033 }
1034 }
1035 while (match != NULL && result == NULL);
1036
1037 g_free (deco_text);
1038
1039 return result;
1040 }
1041
1042
1043
1044 static const char *
1045 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1046 {
1047 char *deco_text;
1048 char *match;
1049 const char *result = NULL;
1050 size_t search_len;
1051
1052 if (case_sen)
1053 deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1054 else
1055 {
1056 char *fold_text;
1057
1058 fold_text = g_utf8_casefold (text, -1);
1059 deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1060 g_free (fold_text);
1061 }
1062
1063 search_len = strlen (search);
1064
1065 do
1066 {
1067 match = g_strrstr_len (deco_text, -1, search);
1068 if (match != NULL)
1069 {
1070 if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1071 !str_utf8_iscombiningmark (match + search_len))
1072 {
1073 const char *m = deco_text;
1074
1075 result = text;
1076 while (m < match)
1077 {
1078 str_utf8_cnext_noncomb_char (&m);
1079 str_utf8_cnext_noncomb_char (&result);
1080 }
1081 }
1082 else
1083 match[0] = '\0';
1084 }
1085 }
1086 while (match != NULL && result == NULL);
1087
1088 g_free (deco_text);
1089
1090 return result;
1091 }
1092
1093
1094
1095 static char *
1096 str_utf8_normalize (const char *text)
1097 {
1098 GString *fixed;
1099 char *tmp;
1100 char *result;
1101 const char *start;
1102 const char *end;
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112 for (end = text; *end != '\0'; end++)
1113 if ((*end & 0x80) != 0)
1114 {
1115
1116 break;
1117 }
1118
1119
1120 if (*end == '\0')
1121 return g_strndup (text, end - text);
1122
1123 fixed = g_string_sized_new (4);
1124
1125 start = text;
1126 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1127 {
1128 if (start != end)
1129 {
1130 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1131 g_string_append (fixed, tmp);
1132 g_free (tmp);
1133 }
1134 g_string_append_c (fixed, end[0]);
1135 start = end + 1;
1136 }
1137
1138 if (start == text)
1139 {
1140 result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1141 g_string_free (fixed, TRUE);
1142 }
1143 else
1144 {
1145 if (start[0] != '\0' && start != end)
1146 {
1147 tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1148 g_string_append (fixed, tmp);
1149 g_free (tmp);
1150 }
1151 result = g_string_free (fixed, FALSE);
1152 }
1153
1154 return result;
1155 }
1156
1157
1158
1159 static char *
1160 str_utf8_casefold_normalize (const char *text)
1161 {
1162 GString *fixed;
1163 char *tmp, *fold;
1164 char *result;
1165 const char *start;
1166 const char *end;
1167
1168 fixed = g_string_sized_new (4);
1169
1170 start = text;
1171 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1172 {
1173 if (start != end)
1174 {
1175 fold = g_utf8_casefold (start, end - start);
1176 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1177 g_string_append (fixed, tmp);
1178 g_free (tmp);
1179 g_free (fold);
1180 }
1181 g_string_append_c (fixed, end[0]);
1182 start = end + 1;
1183 }
1184
1185 if (start == text)
1186 {
1187 fold = g_utf8_casefold (text, -1);
1188 result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1189 g_free (fold);
1190 g_string_free (fixed, TRUE);
1191 }
1192 else
1193 {
1194 if (start[0] != '\0' && start != end)
1195 {
1196 fold = g_utf8_casefold (start, end - start);
1197 tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1198 g_string_append (fixed, tmp);
1199 g_free (tmp);
1200 g_free (fold);
1201 }
1202 result = g_string_free (fixed, FALSE);
1203 }
1204
1205 return result;
1206 }
1207
1208
1209
/* Compare T1 and T2 with strcmp() after normalizing both with
 * str_utf8_normalize(). */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1226
1227
1228
/* Compare the normalized forms of T1 and T2 over the length of the shorter
 * of the two, so a string compares equal to any string it is a prefix of. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_normalize (t1);
    char *norm2 = str_utf8_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const size_t common = (len1 < len2) ? len1 : len2;
    const int order = strncmp (norm1, norm2, common);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1248
1249
1250
/* Compare T1 and T2 with strcmp() after converting both with
 * str_utf8_casefold_normalize(). */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const int order = strcmp (norm1, norm2);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1267
1268
1269
/* Compare the case-folded, normalized forms of T1 and T2 over the length
 * of the shorter of the two. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *norm1 = str_utf8_casefold_normalize (t1);
    char *norm2 = str_utf8_casefold_normalize (t2);
    const size_t len1 = strlen (norm1);
    const size_t len2 = strlen (norm2);
    const size_t common = (len1 < len2) ? len1 : len2;
    const int order = strncmp (norm1, norm2, common);

    g_free (norm1);
    g_free (norm2);

    return order;
}
1289
1290
1291
/* Return the length in bytes of the longest common beginning of the
 * normalized forms of TEXT and PREFIX, measured in the normalized PREFIX
 * and compared character by character. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *cur_t = norm_text, *cur_p = norm_prefix;
    const char *next_t = norm_text, *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in size or in content */
        if (next_t - cur_t != next_p - cur_p
            || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1326
1327
1328
/* Return the length in bytes of the longest common beginning of the
 * case-folded, normalized forms of TEXT and PREFIX, measured in the
 * converted PREFIX and compared character by character. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *cur_t = norm_text, *cur_p = norm_prefix;
    const char *next_t = norm_text, *next_p = norm_prefix;
    int matched;

    while (cur_t[0] != '\0' && cur_p[0] != '\0')
    {
        str_utf8_cnext_char_safe (&next_t);
        str_utf8_cnext_char_safe (&next_p);

        /* characters differ in size or in content */
        if (next_t - cur_t != next_p - cur_p
            || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
            break;

        cur_t = next_t;
        cur_p = next_p;
    }

    matched = cur_p - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1363
1364
1365
1366 static char *
1367 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1368 gchar *(*keygen) (const gchar *text, gssize size))
1369 {
1370 char *result;
1371
1372 if (case_sen)
1373 result = str_utf8_normalize (text);
1374 else
1375 {
1376 gboolean dot;
1377 GString *fixed;
1378 const char *start, *end;
1379 char *fold, *key;
1380
1381 dot = text[0] == '.';
1382 fixed = g_string_sized_new (16);
1383
1384 if (!dot)
1385 start = text;
1386 else
1387 {
1388 start = text + 1;
1389 g_string_append_c (fixed, '.');
1390 }
1391
1392 while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1393 {
1394 if (start != end)
1395 {
1396 fold = g_utf8_casefold (start, end - start);
1397 key = keygen (fold, -1);
1398 g_string_append (fixed, key);
1399 g_free (key);
1400 g_free (fold);
1401 }
1402 g_string_append_c (fixed, end[0]);
1403 start = end + 1;
1404 }
1405
1406 if (start == text)
1407 {
1408 fold = g_utf8_casefold (start, -1);
1409 result = keygen (fold, -1);
1410 g_free (fold);
1411 g_string_free (fixed, TRUE);
1412 }
1413 else if (dot && (start == text + 1))
1414 {
1415 fold = g_utf8_casefold (start, -1);
1416 key = keygen (fold, -1);
1417 g_string_append (fixed, key);
1418 g_free (key);
1419 g_free (fold);
1420 result = g_string_free (fixed, FALSE);
1421 }
1422 else
1423 {
1424 if (start[0] != '\0' && start != end)
1425 {
1426 fold = g_utf8_casefold (start, end - start);
1427 key = keygen (fold, -1);
1428 g_string_append (fixed, key);
1429 g_free (key);
1430 g_free (fold);
1431 }
1432 result = g_string_free (fixed, FALSE);
1433 }
1434 }
1435 return result;
1436 }
1437
1438
1439
1440 static char *
1441 str_utf8_create_key (const char *text, gboolean case_sen)
1442 {
1443 return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1444 }
1445
1446
1447
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Create a comparison key for TEXT using g_utf8_collate_key_for_filename(). */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    gchar *(*const keygen) (const gchar *, gssize) = g_utf8_collate_key_for_filename;

    return str_utf8_create_key_gen (text, case_sen, keygen);
}
#endif
1455
1456
1457
1458 static int
1459 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1460 {
1461 (void) case_sen;
1462 return strcmp (t1, t2);
1463 }
1464
1465
1466
1467 static void
1468 str_utf8_release_key (char *key, gboolean case_sen)
1469 {
1470 (void) case_sen;
1471 g_free (key);
1472 }
1473
1474
1475
1476
1477
1478 struct str_class
1479 str_utf8_init (void)
1480 {
1481 struct str_class result;
1482
1483 result.conv_gerror_message = str_utf8_conv_gerror_message;
1484 result.vfs_convert_to = str_utf8_vfs_convert_to;
1485 result.insert_replace_char = str_utf8_insert_replace_char;
1486 result.is_valid_string = str_utf8_is_valid_string;
1487 result.is_valid_char = str_utf8_is_valid_char;
1488 result.cnext_char = str_utf8_cnext_char;
1489 result.cprev_char = str_utf8_cprev_char;
1490 result.cnext_char_safe = str_utf8_cnext_char_safe;
1491 result.cprev_char_safe = str_utf8_cprev_char_safe;
1492 result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1493 result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1494 result.char_isspace = str_utf8_isspace;
1495 result.char_ispunct = str_utf8_ispunct;
1496 result.char_isalnum = str_utf8_isalnum;
1497 result.char_isdigit = str_utf8_isdigit;
1498 result.char_isprint = str_utf8_isprint;
1499 result.char_iscombiningmark = str_utf8_iscombiningmark;
1500 result.char_toupper = str_utf8_toupper;
1501 result.char_tolower = str_utf8_tolower;
1502 result.length = str_utf8_length;
1503 result.length2 = str_utf8_length2;
1504 result.length_noncomb = str_utf8_length_noncomb;
1505 result.fix_string = str_utf8_fix_string;
1506 result.term_form = str_utf8_term_form;
1507 result.fit_to_term = str_utf8_fit_to_term;
1508 result.term_trim = str_utf8_term_trim;
1509 result.term_width2 = str_utf8_term_width2;
1510 result.term_width1 = str_utf8_term_width1;
1511 result.term_char_width = str_utf8_term_char_width;
1512 result.term_substring = str_utf8_term_substring;
1513 result.trunc = str_utf8_trunc;
1514 result.offset_to_pos = str_utf8_offset_to_pos;
1515 result.column_to_pos = str_utf8_column_to_pos;
1516 result.create_search_needle = str_utf8_create_search_needle;
1517 result.release_search_needle = str_utf8_release_search_needle;
1518 result.search_first = str_utf8_search_first;
1519 result.search_last = str_utf8_search_last;
1520 result.compare = str_utf8_compare;
1521 result.ncompare = str_utf8_ncompare;
1522 result.casecmp = str_utf8_casecmp;
1523 result.ncasecmp = str_utf8_ncasecmp;
1524 result.prefix = str_utf8_prefix;
1525 result.caseprefix = str_utf8_caseprefix;
1526 result.create_key = str_utf8_create_key;
1527 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1528
1529 result.create_key_for_filename = str_utf8_create_key_for_filename;
1530 #else
1531
1532 result.create_key_for_filename = str_utf8_create_key;
1533 #endif
1534 result.key_collate = str_utf8_key_collate;
1535 result.release_key = str_utf8_release_key;
1536
1537 return result;
1538 }
1539
1540